概述
开发智能搜索平台:
1.auto_search_answer:输入检索内容
2.identify_model:调用判别模型
3.generate_random_key:生成随机密钥,作为判别模型表示“不知道答案”时返回的标记
4.run_conversation:function calling with get_answer
5.get_answer:智能助手函数,当你无法回答某个问题时,调用该函数,能够获得答案
6.convert_keyword:调用转化函数,将用户的问题转化为更适合在知乎上进行搜索的关键词
7.google_search:通过谷歌自定义搜索在知乎站内检索,函数默认返回10条结果(get_answer调用时只请求2条)
8.get_search_text:收集知乎内容
代码
import os
import json
import inspect
import requests
import tiktoken
import random
import string
from lxml import etree
from zhipuai import ZhipuAI
# Shared ZhipuAI client used by every model call in this file.
# The key is taken from the ZHIPUAI_API_KEY environment variable when it is
# set, so a real secret does not have to live in the source; otherwise the
# "****" placeholder is kept and must be replaced with your own API key.
client = ZhipuAI(api_key=os.environ.get("ZHIPUAI_API_KEY", "****"))
def google_search(query, num_results=10, site_url=None):
    """Query the Google Custom Search JSON API and return simplified hits.

    :param query: search string, sent as the ``q`` request parameter.
    :param num_results: number of results to request (``num`` parameter), default 10.
    :param site_url: optional site restriction, sent as ``siteSearch``; the
        parameter is omitted entirely when this is None.
    :return: list of dicts with the keys ``title``, ``link`` and ``snippet``;
        an empty list when the response carries no ``items``.
    :raises requests.HTTPError: when the API answers with an error status.
    :raises requests.Timeout: when the API does not answer within 30 seconds.
    """
    # Credentials come from the environment when available; the "****"
    # placeholders remain as the fallback and must be replaced otherwise.
    api_key = os.environ.get("GOOGLE_API_KEY", "****")
    cse_id = os.environ.get("GOOGLE_CSE_ID", "****")
    url = "https://www.googleapis.com/customsearch/v1"

    # Request parameters; the site filter is only added when requested.
    params = {
        'q': query,
        'key': api_key,
        'cx': cse_id,
        'num': num_results,
    }
    if site_url is not None:
        params['siteSearch'] = site_url

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()

    search_results = response.json().get('items', [])

    # A hit without a title or snippet is still usable through its link,
    # so those two fields fall back to an empty string instead of raising.
    return [
        {
            'title': item.get('title', ''),
            'link': item['link'],
            'snippet': item.get('snippet', ''),
        }
        for item in search_results
    ]
def windows_create_name(s, max_length=255):
    """
    Convert a string into a name that is safe for a Windows file or folder.

    Parameters:
    - s (str): input string.
    - max_length (int): maximum length of the returned name, default 255.

    Returns:
    - str: the sanitised name. Forbidden characters and ASCII control
      characters are replaced by underscores, trailing spaces/dots are
      removed, reserved device names (with or without an extension) get an
      underscore after the base name, and the result never exceeds
      ``max_length`` characters.
    """
    # Punctuation Windows rejects in names, plus the control characters 0-31.
    replacements = {ord(ch): '_' for ch in '<>:"/\\|?*'}
    replacements.update((code, '_') for code in range(32))
    s = s.translate(replacements)

    # Truncate first and strip afterwards: cutting the string can expose a
    # trailing space or dot that was previously in the middle of the name.
    s = s[:max_length].rstrip(' .')

    reserved_names = {
        "CON", "PRN", "AUX", "NUL",
        "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
        "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
    }
    # Device names stay reserved when an extension follows (e.g. "NUL.txt"),
    # so only the part before the first dot is compared.
    stem, dot, rest = s.partition('.')
    if stem.upper() in reserved_names:
        if len(stem) < max_length:
            # Insert the underscore after the base name, then re-apply the
            # length limit and the trailing-character rule.
            s = (stem + '_' + dot + rest)[:max_length].rstrip(' .')
        else:
            # No room to append: the name is exactly the reserved word, so
            # overwrite its last character instead.
            s = stem[:-1] + '_'
    return s
def get_search_text(q, url):
    """Download one Zhihu page, extract its title and text, and cache them as JSON.

    The page type is chosen from the URL: question pages, column ("zhuanlan")
    articles, and other URLs containing "answer". The extracted data is
    written to ``./auto_search/<q>/<name>.json`` as a one-element list holding
    ``link``, ``title``, ``content`` and ``tokens`` (the tiktoken token count
    of the content).

    :param q: name of the sub-folder below ``./auto_search`` that receives the file.
    :param url: address of the Zhihu page to read.
    :return: the file name (without ``.json``) that was written, i.e. the page
        title made file-system safe; None when the URL matches no known page
        type or no title could be found in the page.
    """
    cookie = "*****"  # has to be copied by hand from a logged-in browser session
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
    headers = {
        'authority': 'www.zhihu.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'cache-control': 'private, must-revalidate, no-cache, no-store, max-age=0',
        'cookie': cookie,
        'upgrade-insecure-requests': '1',
        'user-agent': user_agent,
    }

    question_title_xpath = '//div/div[1]/div/h1/text()'
    question_body_xpath = (
        '//*[@id="root"]/div/main/div/div/div[3]/div[1]/div/div[2]/div/div/div/div[2]/span[1]/div/div/span/p/text()'
    )
    answer_body_xpath = '//div[1]/div/div[3]/div/div/div/div[2]/span[1]/div/div/span/p/text()'

    def fetch_tree():
        # The timeout keeps a stalled connection from blocking the caller forever.
        page = requests.get(url, headers=headers, timeout=30).text
        return etree.HTML(page)

    title_nodes = []
    text_d = []
    code_ = []

    if 'zhihu.com/question' in url:
        # Ordinary question page.
        tree = fetch_tree()
        title_nodes = tree.xpath(question_title_xpath)
        text_d = tree.xpath(question_body_xpath)
        # A ".../question/<id>/answer/<id>" URL also lands in this branch, so
        # the dedicated answer branch below can never see it; fall back to the
        # answer layout when the question layout yields no text.
        if not text_d and 'answer' in url:
            text_d = tree.xpath(answer_body_xpath)
    elif 'zhuanlan' in url:
        # Column article.
        headers['authority'] = 'zhuanlan.zhihu.com'
        tree = fetch_tree()
        title_nodes = tree.xpath('//div[1]/div/main/div/article/header/h1/text()')
        text_d = tree.xpath('//div/main/div/article/div[1]/div/div/div/p/text()')
        code_ = tree.xpath('//div/main/div/article/div[1]/div/div/div//pre/code/text()')
    elif 'answer' in url:
        # Page of one specific answer.
        tree = fetch_tree()
        title_nodes = tree.xpath(question_title_xpath)
        text_d = tree.xpath(answer_body_xpath)

    # Unknown URL type, or the page did not contain the expected heading.
    if not title_nodes:
        return None
    title = str(title_nodes[0])

    # Body paragraphs first, code snippets (if any) appended at the very end;
    # line breaks inside a fragment become spaces.
    fragments = list(text_d) + list(code_)
    text = ''.join(str(fragment).replace('\n', ' ') for fragment in fragments)

    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    json_data = [
        {
            "link": url,
            "title": title,
            "content": text,
            "tokens": len(encoding.encode(text))
        }
    ]

    # Page titles may contain characters such as "?" or "/" that are invalid
    # in file names; 250 leaves room for the ".json" suffix.
    safe_title = windows_create_name(title, max_length=250)
    folder = './auto_search/%s' % q
    os.makedirs(folder, exist_ok=True)
    with open('%s/%s.json' % (folder, safe_title), 'w', encoding='utf-8') as f:
        json.dump(json_data, f)
    # Return the name actually used on disk so callers can reopen the file.
    return safe_title
def generate_random_key(length=30):
    """Return a random string of ``length`` ASCII letters and digits."""
    alphabet = string.ascii_letters + string.digits
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)
def identify_model(q):
    """Ask the model whether it can answer ``q`` without outside help.

    A fresh random key is placed in the system prompt as the reply the model
    should give when it does not know the answer.

    :param q: the user's question.
    :return: True when the reply contains the key or one of the refusal
        phrases checked below; otherwise the model's reply text.
    """
    sentinel = generate_random_key()
    system_prompt = (
        "你是一个用户问题判断器,专门用于判别你是否知道用户当前问题的答案。"
        "如果不知道,请回答“%s”,若知道,请正常回答" % sentinel
    )
    # One worked example shows the model how to answer with the key.
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "请问,GPT-3.5微调总共分为几步?"},
        {"role": "assistant", "content": sentinel},
        {"role": "user", "content": q},
    ]
    response = client.chat.completions.create(model="glm-4", messages=conversation)
    res = response.choices[0].message.content

    # The key, or any of these apology phrases, counts as "does not know".
    unknown_markers = (sentinel, '对不起', '抱歉', '超出知识库')
    if any(marker in res for marker in unknown_markers):
        return True
    return res
def convert_keyword(q):
    """
    Turn the user's question into a keyword suited to searching on Zhihu.

    :param q: the user's question.
    :return: the text content of the model's reply.
    """
    # System instruction plus one example pair, followed by the real question.
    conversation = [
        {"role": "system",
         "content": "你专门负责将用户的问题转化为知乎网站搜索关键词,只返回一个你认为最合适的搜索关键词即可"},
        {"role": "user", "content": "请问,GPT-3.5微调总共分为几步?"},
        {"role": "assistant", "content": "GPT-3.5微调流程"},
        {"role": "user", "content": q},
    ]
    response = client.chat.completions.create(model="glm-4", messages=conversation)
    return response.choices[0].message.content
def get_answer(q):
    """
    智能助手函数,当你无法回答某个问题时,调用该函数,能够获得答案
    :param q: 必选参数,询问的问题,字符串类型对象
    :return:某问题的答案,以字符串形式呈现
    """
    # NOTE: the docstring above is runtime data, not just documentation:
    # auto_functions() reads it with inspect.getdoc() and sends it to the model
    # to build the tool description, so its text is intentionally left as is.
    #
    # What the function does: converts the question into a search keyword,
    # searches Zhihu through google_search(), downloads each hit with
    # get_search_text(), and returns the concatenated page contents while the
    # running token count stays within 12000.

    # Turn the question into a keyword better suited to a Zhihu search.
    keyword = convert_keyword(q)

    # Only the first 2 hits are requested here.
    print('正在接入谷歌搜索,查找和问题相关的答案...')
    results = google_search(query=keyword, num_results=2, site_url='https://zhihu.com/')

    # Per-question sub-folder; the keyword is model output and may contain
    # characters that are invalid in a directory name.
    folder_name = windows_create_name(keyword)
    folder_path = './auto_search/%s' % folder_name
    os.makedirs(folder_path, exist_ok=True)

    print('正在读取搜索的到的相关答案...')
    num_tokens = 0
    parts = []
    for item in results:
        url = item['link']
        try:
            title = get_search_text(folder_name, url)
            if title is None:
                # Unsupported page type: nothing was written, nothing to read.
                continue
            with open('%s/%s.json' % (folder_path, title), 'r', encoding='utf-8') as f:
                jd = json.load(f)
        except (requests.RequestException, OSError, IndexError) as e:
            # One unreadable page (network failure, unexpected page layout,
            # file problem) must not discard the other search results.
            print('读取 %s 失败,已跳过: %s' % (url, e))
            continue
        num_tokens += jd[0]['tokens']
        # Stop as soon as adding this page would exceed the token budget.
        if num_tokens > 12000:
            break
        parts.append(jd[0]['content'])
    print('正在进行最后的整理...')
    return ''.join(parts)
def auto_functions(functions_list):
    """
    Build the ``tools`` description for the chat model from Python functions.

    For every function the model is asked to turn the function's docstring
    into a JSON dictionary (name, description, parameters); each dictionary is
    wrapped as ``{"type": "function", "function": ...}``. The whole generation
    is attempted up to 4 times; the last failure is re-raised.

    :param functions_list: list containing one or more function objects.
    :return: list of tool description dictionaries, one per function.
    """

    def build_specs(candidates):
        specs = []
        for func in candidates:
            # The docstring is the only information the model receives about
            # the function besides its name.
            doc_text = inspect.getdoc(func)
            func_name = func.__name__

            system_prompt = (
                '以下是某的函数说明:%s,输出结果必须是一个JSON格式的字典,只输出这个字典即可,前后不需要任何前后修饰或说明的语句'
                % doc_text
            )
            user_prompt = (
                '根据这个函数的函数说明,请帮我创建一个JSON格式的字典,这个字典有如下5点要求:'
                '1.字典总共有三个键值对;'
                '2.第一个键值对的Key是字符串name,value是该函数的名字:%s,也是字符串;'
                '3.第二个键值对的Key是字符串description,value是该函数的函数的功能说明,也是字符串;'
                '4.第三个键值对的Key是字符串parameters,value是一个JSON Schema对象,用于说明该函数的参数输入规范。'
                '5.输出结果必须是一个JSON格式的字典,只输出这个字典即可,前后不需要任何前后修饰或说明的语句'
                % func_name
            )

            response = client.chat.completions.create(
                model="glm-4",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
            )
            # Strip Markdown code fences the model may wrap around the JSON.
            raw = response.choices[0].message.content
            raw = raw.replace("```json", "").replace("```", "")
            specs.append({"type": "function", "function": json.loads(raw)})
        return specs

    # At most 4 attempts in total.
    max_attempts = 4
    for attempt in range(1, max_attempts + 1):
        try:
            functions = build_specs(functions_list)
        except Exception as e:
            print("发生错误:", e)
            if attempt == max_attempts:
                print("已达到最大尝试次数,程序终止。")
                raise  # propagate the last error unchanged
            print("正在重新运行...")
        else:
            return functions
def run_conversation(messages, functions_list=None, model="glm-4", function_call="auto"):
    """
    Run one chat exchange, executing external functions the model asks for.

    :param messages: required; list of message dictionaries passed as the
        ``messages`` argument of the chat model. When the model requests tool
        calls, the assistant message and the tool results are appended to this
        list in place.
    :param functions_list: optional, default None; list of all external
        function objects the model may call. None means a plain chat request.
    :param model: chat model name, default ``glm-4``.
    :param function_call: value passed as ``tool_choice`` on the first request,
        default ``"auto"``.
    :return: text content of the final model reply.
    """
    # No external functions: ordinary chat request.
    if functions_list is None:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
        )
        return response.choices[0].message.content

    # Tool descriptions for the model and a name -> callable lookup table.
    tools = auto_functions(functions_list)
    available_functions = {func.__name__: func for func in functions_list}

    # First request: the model decides whether to call a function.
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        tools=tools,
        tool_choice=function_call,
    )
    response_message = response.choices[0].message
    tool_calls = response_message.tool_calls
    if not tool_calls:
        return response_message.content

    messages.append(response_message.model_dump())
    for tool_call in tool_calls:
        function_to_call = available_functions[tool_call.function.name]
        function_args = json.loads(tool_call.function.arguments)
        # This is where the external function is actually executed.
        function_response = function_to_call(**function_args)
        # Message content is sent as text; serialise anything else so a
        # function returning a dict, list or None does not break the request.
        if not isinstance(function_response, str):
            function_response = json.dumps(function_response, ensure_ascii=False)
        messages.append(
            {
                "role": "tool",
                "content": function_response,
                "tool_call_id": tool_call.id,
            }
        )

    # Second request: the model answers using the tool results.
    second_response = client.chat.completions.create(
        model=model,
        messages=messages,
        tools=tools
    )
    print(second_response.choices[0].message)
    return second_response.choices[0].message.content
def auto_search_answer(q):
    """Answer ``q`` directly when the model can, otherwise through a web search.

    :param q: the user's question.
    :return: the reply from identify_model() when it is not True; otherwise
        the result of run_conversation() with the get_answer tool forced.
    """
    # Ask the judging model first; it returns either True or its own answer.
    res = identify_model(q)
    print(res)
    # Identity check: only the literal True means "unknown", any text is an answer.
    if res is not True:
        return res

    messages = [{"role": "user", "content": q}]
    return run_conversation(
        messages=messages,
        functions_list=[get_answer],
        model="glm-4",
        function_call={"type": "function", "function": {"name": "get_answer"}},
    )
if __name__ == "__main__":
    # Demo query; guarded so that importing this module does not start
    # model calls and web requests as a side effect.
    print(auto_search_answer("macd周期分离找拐点"))