Created
September 4, 2024 08:35
-
-
Save MarioZZJ/040ee8efb2d819cc0e17d3c6a1b4e0a0 to your computer and use it in GitHub Desktop.
Demo for extracting keywords via LLM: BigModel API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 50, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "\n", | |
| "# Prefer the ZHIPUAI_API_KEY environment variable so the real key never has to be\n", | |
| "# saved inside the notebook; the placeholder is kept as the fallback value.\n", | |
| "API_KEY = os.environ.get('ZHIPUAI_API_KEY', 'YOUR_API_KEY')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# 本地环境准备" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from zhipuai import ZhipuAI\n", | |
| "client = ZhipuAI(api_key=API_KEY) # Fill in your own API key" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CompletionMessage(content='\"智领变革,图创未来 —— 智谱AI,智慧加速每一步\"', role='assistant', tool_calls=None)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Multi-turn conversation: the earlier user/assistant turns are sent along as context\n", | |
| "# for the final user request.\n", | |
| "slogan_messages = [\n", | |
| "    {\"role\": \"user\", \"content\": \"作为一名营销专家,请为我的产品创作一个吸引人的slogan\"},\n", | |
| "    {\"role\": \"assistant\", \"content\": \"当然,为了创作一个吸引人的slogan,请告诉我一些关于您产品的信息\"},\n", | |
| "    {\"role\": \"user\", \"content\": \"智谱AI开放平台\"},\n", | |
| "    {\"role\": \"assistant\", \"content\": \"智启未来,谱绘无限一智谱AI,让创新触手可及!\"},\n", | |
| "    {\"role\": \"user\", \"content\": \"创造一个更精准、吸引人的slogan\"},\n", | |
| "]\n", | |
| "\n", | |
| "response = client.chat.completions.create(\n", | |
| "    model=\"glm-4-0520\",  # code of the model to call\n", | |
| "    messages=slogan_messages,\n", | |
| ")\n", | |
| "print(response.choices[0].message)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'\"智领变革,图创未来 —— 智谱AI,智慧加速每一步\"'" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Text content of the first returned choice.\n", | |
| "response.choices[0].message.content" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# 应用示例" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import json" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "text = [\n", | |
| " [\"test101\",\"生命周期视角下颠覆性技术的扩散特征研究\",\"颠覆性技术扩散特点的发现对于识别潜在颠覆性技术具有重要意义。本文基于E.M.罗杰斯提出的技术扩散理论,构建生命周期视角下颠覆性技术的扩散研究框架,实现颠覆性技术扩散特点的研究。首先利用大规模调研的方式,获取当前政府、产业界、学术界一致公认的颠覆性技术,利用文献计量和替代计量的分析方法,探究技术生命周期视角下颠覆性技术的扩散特点,最后获得颠覆性技术扩散的周期、速率、滞后性、无序性、主导性5个方面的结论。本研究为探索颠覆性技术的特征和规律以及潜在颠覆性技术的遴选提供了新的思路。\"],\n", | |
| " [\"test102\",\"面向语义信息分析的多层次技术演化轨迹识别方法研究\",\"[目的/意义]面向语义信息以层次渐进的方式识别技术演化轨迹,有助于加强对技术细节的理解并提升轨迹识别的准确性。[方法/过程]首先,提取专利和科技论文的SAO结构,依据语义信息确定研究主题,并利用S曲线分析技术生命周期。其次,借助机器学习算法与社会网络分析指标,分不同周期,通过多层次提取,筛选技术演化轨迹。最后,以造血干细胞领域为实证对象,发现该领域中与遗传病因技术主题相关的专利和科技论文的研究重点存在显著差异,该主题尚未形成统一的演化路径,且有关免疫系统疾病与糖尿病方面的研究是未来潜在的演化趋势。[结果/结论]所提方法通过客观的数值计算结果,逐步实现复杂技术演化路径的提取与凝练,在揭示技术主要发展历程的同时,能够客观预测技术演化趋势。\"],\n", | |
| " [\"test103\",\"突变视角下潜在颠覆性技术识别与分析方法研究\",\"[目的/意义]将社会网络分析与突变理论相结合,提出识别潜在颠覆性技术的新方法,帮助企业获得市场、经济的先行优势,同时有助于加强政府对重点技术的前期引导作用。[方法/过程]首先,基于文本三元组结构,构建语义信息层面的技术共现网络。其次,基于社区网络结构和颠覆性技术特征,提出社区网络关系强度和对外资源利用力两个指标,利用突变理论模型识别易突变社区。最后,对易突变社区内节点进行主题词突发性监测和词频异常监测确定技术突发可能性,以此识别潜在颠覆性技术。[结果/结论]以造血干细胞领域2001—2018年论文数据作为分析对象,验证了方法的有效性,发现“建立完善的关于免疫缺陷疾病及相关移植治疗手段的临床研究模型”是该领域的潜在颠覆性技术。\"]\n", | |
| "]\n", | |
| "# One row per paper; the three positional fields of each record become named columns.\n", | |
| "texts_df = pd.DataFrame(data=text, columns=[\"id\", \"title\", \"abstract\"])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "content_template = \"\"\"\n", | |
| "## Goals\n", | |
| "- 从提供的学术论文标题和摘要中提取五个与研究内容最相关的主题词。\n", | |
| "\n", | |
| "## Constrains\n", | |
| "- 输出必须仅包含关键词内容。\n", | |
| "- 输出格式必须是Markdown列表形式。\n", | |
| "- 按照相关性依次输出主题词。\n", | |
| "\n", | |
| "## Skills\n", | |
| "- 理解学术论文标题和摘要的能力。\n", | |
| "- 提取关键信息的能力。\n", | |
| "- 排序和筛选主题词的能力。\n", | |
| "\n", | |
| "## Output Format\n", | |
| "- 以Markdown列表形式输出五个主题词。\n", | |
| "\n", | |
| "## Workflow\n", | |
| "1. 读取并理解学术论文的标题和摘要。\n", | |
| "2. 提取与研究内容最相关的主题词。\n", | |
| "3. 按照相关性对主题词进行排序。\n", | |
| "4. 以Markdown列表形式输出五个主题词。\n", | |
| "\n", | |
| "## 标题内容\n", | |
| "{title}\n", | |
| "\n", | |
| "## 摘要内容\n", | |
| "{abstract}\n", | |
| "\"\"\"\n", | |
| "\n", | |
| "# JSON template for one batch request line; note that a specific model can be set here.\n", | |
| "# The \"xxx\" values are placeholders that are replaced for every paper.\n", | |
| "json_template = {\"custom_id\": \"xxx\", \"method\": \"POST\", \"url\": \"/v4/chat/completions\", \"body\": {\"model\": \"glm-4\", \"messages\": [{\"role\": \"system\", \"content\": \"你是一个专注于从学术论文标题和摘要中提取主题词的专家\"},{\"role\": \"user\", \"content\": \"xxx\"}]}}\n", | |
| "\n", | |
| "# Build one batch request per paper and write them all in a single pass.\n", | |
| "# Mode \"w\" (instead of \"a\") keeps re-runs of this cell from appending duplicate\n", | |
| "# requests, and UTF-8 is set explicitly because ensure_ascii=False emits raw\n", | |
| "# Chinese characters that the platform default encoding may not be able to write.\n", | |
| "with open(\"json_submit.jsonl\", \"w\", encoding=\"utf-8\") as f:\n", | |
| "    for index, row in texts_df.iterrows():\n", | |
| "        custom_id = row[\"id\"]\n", | |
| "        title = row[\"title\"]\n", | |
| "        abstract = row[\"abstract\"]\n", | |
| "        # Fill the title and abstract into the prompt template\n", | |
| "        content = content_template.format(title=title, abstract=abstract)\n", | |
| "        # dict.copy() is shallow, so assigning into the copy's nested \"messages\"\n", | |
| "        # list would overwrite json_template itself. Build fresh nested containers\n", | |
| "        # instead; the key order (and therefore the emitted JSON) matches the template.\n", | |
| "        system_message = json_template[\"body\"][\"messages\"][0]\n", | |
| "        json_row = {\n", | |
| "            **json_template,\n", | |
| "            \"custom_id\": custom_id,\n", | |
| "            \"body\": {\n", | |
| "                **json_template[\"body\"],\n", | |
| "                \"messages\": [system_message, {\"role\": \"user\", \"content\": content}],\n", | |
| "            },\n", | |
| "        }\n", | |
| "\n", | |
| "        # Write one request per line\n", | |
| "        f.write(json.dumps(json_row, ensure_ascii=False) + \"\\n\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "1725432608_4dc1a45595254515ba04d446f78e383c\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Upload the request file\n", | |
| "from zhipuai import ZhipuAI\n", | |
| "\n", | |
| "client = ZhipuAI(api_key=API_KEY)  # fill in your own API key\n", | |
| "\n", | |
| "# Open the file in a with-block so the handle is closed once the upload returns\n", | |
| "# (the bare open(...) argument used before was never closed).\n", | |
| "with open(\"json_submit.jsonl\", \"rb\") as batch_file:\n", | |
| "    result = client.files.create(\n", | |
| "        file=batch_file,\n", | |
| "        purpose=\"batch\"\n", | |
| "    )\n", | |
| "print(result.id)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Batch(id='batch_1831223675418845184', completion_window='24h', created_at=1725432721367, endpoint='/v4/chat/completions', input_file_id='1725432608_4dc1a45595254515ba04d446f78e383c', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=None, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': '主题词抽取'}, output_file_id=None, request_counts=BatchRequestCounts(completed=None, failed=None, total=3))\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Create the batch job\n", | |
| "from zhipuai import ZhipuAI\n", | |
| "\n", | |
| "client = ZhipuAI(api_key=API_KEY)  # fill in your own API key\n", | |
| "\n", | |
| "create = client.batches.create(\n", | |
| "    # Use the id returned by the upload cell; a literal id pasted from an earlier\n", | |
| "    # run points at a stale file as soon as the notebook is executed again.\n", | |
| "    input_file_id=result.id,\n", | |
| "    endpoint=\"/v4/chat/completions\",\n", | |
| "    completion_window=\"24h\",  # only a 24-hour completion window is supported\n", | |
| "    metadata={\n", | |
| "        \"description\": \"主题词抽取\"\n", | |
| "    }\n", | |
| ")\n", | |
| "print(create)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Batch(id='batch_1831223675418845184', completion_window='24h', created_at=1725432721000, endpoint='/v4/chat/completions', input_file_id='1725432608_4dc1a45595254515ba04d446f78e383c', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1725435402000, error_file_id='', errors=None, expired_at=None, expires_at=None, failed_at=None, finalizing_at=1725435350000, in_progress_at=1725435223000, metadata={'description': '主题词抽取'}, output_file_id='1725435402_e5bcfc2d00a34ce4a0c9ea58c27b1041', request_counts=BatchRequestCounts(completed=3, failed=0, total=3))\n", | |
| "1725435402_e5bcfc2d00a34ce4a0c9ea58c27b1041\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Check the batch status and get the output file id\n", | |
| "\n", | |
| "# Look the batch up by the id returned when it was created, rather than by a literal\n", | |
| "# id copied from a previous run. Re-run this cell until status is 'completed'.\n", | |
| "batch_job = client.batches.retrieve(create.id)\n", | |
| "print(batch_job)\n", | |
| "print(batch_job.output_file_id)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Download the batch results\n", | |
| "\n", | |
| "from zhipuai import ZhipuAI\n", | |
| "\n", | |
| "client = ZhipuAI(api_key=API_KEY)  # fill in your own API key\n", | |
| "\n", | |
| "# Take the output file id from the retrieved batch instead of a pasted literal, and\n", | |
| "# fail with a readable message when the batch has not produced any output yet.\n", | |
| "if not batch_job.output_file_id:\n", | |
| "    raise RuntimeError(\n", | |
| "        f\"Batch {batch_job.id} has no output file yet (status: {batch_job.status}); \"\n", | |
| "        \"re-run the status check cell and try again.\"\n", | |
| "    )\n", | |
| "\n", | |
| "# client.files.content returns a _legacy_response.HttpxBinaryResponseContent instance\n", | |
| "content = client.files.content(batch_job.output_file_id)\n", | |
| "\n", | |
| "# Use the write_to_file method to save the returned content to a file\n", | |
| "content.write_to_file(\"write_to_file_batchoutput.jsonl\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "test101: ['颠覆性技术', '技术扩散', '生命周期视角', '扩散特点', '遴选机制']\n", | |
| "test102: ['语义信息分析', '技术演化轨迹', '多层次提取', '造血干细胞领域', '专利与科技论文分析']\n", | |
| "test103: ['颠覆性技术识别', '社会网络分析', '突变理论', '技术共现网络', '语义信息分析']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "ids = []\n", | |
| "results = []\n", | |
| "\n", | |
| "# The downloaded file holds one JSON object per line. It is read as UTF-8 explicitly\n", | |
| "# so the Chinese keywords decode the same way on every platform\n", | |
| "# (assumes the API writes UTF-8 JSON -- TODO confirm).\n", | |
| "with open(\"write_to_file_batchoutput.jsonl\", encoding=\"utf-8\") as f:\n", | |
| "    for line in f:\n", | |
| "        if not line.strip():\n", | |
| "            continue  # skip blank lines instead of failing in json.loads\n", | |
| "        response = json.loads(line)[\"response\"][\"body\"]\n", | |
| "        request_id = response['request_id']\n", | |
| "        reply = response['choices'][0]['message']['content']\n", | |
| "\n", | |
| "        # Parse the Markdown list line by line: only a leading bullet marker is\n", | |
| "        # removed (a global replace of '- ' would also eat text inside a keyword),\n", | |
| "        # and empty lines are dropped rather than stored as '' keywords.\n", | |
| "        keywords = []\n", | |
| "        for item in reply.splitlines():\n", | |
| "            item = item.strip()\n", | |
| "            if item.startswith(('- ', '* ')):\n", | |
| "                item = item[2:].strip()\n", | |
| "            if item:\n", | |
| "                keywords.append(item)\n", | |
| "\n", | |
| "        print(f\"{request_id}: {keywords}\")\n", | |
| "        ids.append(request_id)\n", | |
| "        results.append(keywords)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# 循环调用 API 并解析结果" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 51, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "test101: ['颠覆性技术', '技术扩散', '生命周期', '扩散特点', '技术遴选']\n", | |
| "test102: ['语义信息分析', '技术演化轨迹', '多层次提取', '生命周期分析', '机器学习算法']\n", | |
| "test103: ['颠覆性技术识别', '社会网络分析', '突变理论', '技术共现网络', '易突变社区检测']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "client = ZhipuAI(api_key=API_KEY)  # fill in your own API key\n", | |
| "\n", | |
| "# The system prompt is identical for every request; it is read from the template\n", | |
| "# once and never modified.\n", | |
| "system_message = json_template['body']['messages'][0]\n", | |
| "\n", | |
| "for index, row in texts_df.iterrows():\n", | |
| "    paper_id = row[\"id\"]\n", | |
| "    title = row[\"title\"]\n", | |
| "    abstract = row[\"abstract\"]\n", | |
| "    # Fill the title and abstract into the prompt template\n", | |
| "    content = content_template.format(title=title, abstract=abstract)\n", | |
| "    # json_template.copy() is only a shallow copy, so writing the prompt into its\n", | |
| "    # nested messages list would overwrite json_template itself. A fresh message\n", | |
| "    # list is built per request instead.\n", | |
| "    response = client.chat.completions.create(\n", | |
| "        model=\"glm-4-0520\",  # model to call\n", | |
| "        messages=[system_message, {\"role\": \"user\", \"content\": content}]\n", | |
| "    )\n", | |
| "\n", | |
| "    # Parse the Markdown list line by line: only a leading bullet marker is removed\n", | |
| "    # (a global replace of '- ' would also eat text inside a keyword), and empty\n", | |
| "    # lines are dropped rather than kept as '' keywords.\n", | |
| "    keywords = []\n", | |
| "    for item in response.choices[0].message.content.splitlines():\n", | |
| "        item = item.strip()\n", | |
| "        if item.startswith(('- ', '* ')):\n", | |
| "            item = item[2:].strip()\n", | |
| "        if item:\n", | |
| "            keywords.append(item)\n", | |
| "    print(f\"{paper_id}: {keywords}\")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "zhipu", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.0" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment