LlamaFactory 微调 Text2SQL

Text2SQL（也称 NL2SQL）：把自然语言文本转换成 SQL 语句的任务。

# 创建文件 convert_sharegpt.py
import json
import argparse

def convert_to_sharegpt_format(input_file, output_file):
    """Convert a JSON list of dialogue records into ShareGPT-style conversations.

    Each input entry may carry a ``"final"`` object and an ``"interaction"``
    list; both hold ``"utterance"`` / ``"query"`` pairs. Every pair becomes a
    ``human`` turn followed by a ``gpt`` turn. The ``"final"`` pair (when
    present) is emitted first, followed by the interaction pairs in order.

    Args:
        input_file: Path to the source JSON file (a list of entries).
        output_file: Path the converted JSON list is written to. Each element
            has the shape ``{"conversations": [{"from": ..., "value": ...}]}``.

    Raises:
        KeyError: If a ``"final"`` object or an interaction item lacks
            ``"utterance"`` or ``"query"``.
    """
    # Read as UTF-8 explicitly: the default encoding is locale dependent and
    # would mis-decode non-ASCII utterances on some platforms.
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Collected conversations, one per input entry.
    sharegpt_format = []

    for entry in data:
        # Turns belonging to the current entry.
        conversation = []

        # Add the "final" utterance/query pair when the entry has one.
        if "final" in entry:
            conversation.append({
                "from": "human",
                "value": entry["final"]["utterance"]
            })
            conversation.append({
                "from": "gpt",
                "value": entry["final"]["query"]
            })

        # "interaction" is treated as optional, mirroring the "final" guard;
        # an entry with neither key produces an empty conversation list.
        for interaction in entry.get("interaction", []):
            # User instruction.
            conversation.append({
                "from": "human",
                "value": interaction["utterance"]
            })

            # Model response.
            conversation.append({
                "from": "gpt",
                "value": interaction["query"]
            })

        sharegpt_format.append({
            "conversations": conversation
        })

    # Write UTF-8 and keep non-ASCII characters readable instead of \uXXXX.
    with open(output_file, 'w', encoding='utf-8') as output_file_handle:
        json.dump(sharegpt_format, output_file_handle, indent=4, ensure_ascii=False)

if __name__ == "__main__":
    # Command-line entry point: two positional paths, input first, then output.
    cli = argparse.ArgumentParser(
        description="Convert CoSQL train data to ShareGPT format."
    )
    cli.add_argument(
        "input_file",
        type=str,
        help="Path to the input JSON file (e.g., cosql_train.json)",
    )
    cli.add_argument(
        "output_file",
        type=str,
        help="Path to the output JSON file (e.g., sharegpt_cosql_train.json)",
    )

    # Parse the arguments and run the conversion.
    options = cli.parse_args()
    convert_to_sharegpt_format(options.input_file, options.output_file)

# 创建文件 convert_alpaca.py 转换alpaca格式代码 单轮对话数据
# 使用bird数据https://bird-bench.github.io/

import json
import argparse

def convert_to_alpaca_format(input_file, output_file):
    """Convert a JSON list of single-turn records into Alpaca-style samples.

    Each input entry is mapped as ``question`` -> ``instruction``,
    ``evidence`` -> ``input`` and ``SQL`` -> ``output``.

    Args:
        input_file: Path to the source JSON file (a list of entries).
        output_file: Path the converted JSON list is written to.

    Raises:
        KeyError: If an entry lacks ``"question"`` or ``"SQL"``.
    """
    # Read as UTF-8 explicitly: the default encoding is locale dependent and
    # would mis-decode non-ASCII text on some platforms.
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    alpaca_format = [
        {
            "instruction": entry["question"],
            # The Alpaca "input" field is optional, so a record without
            # "evidence" falls back to an empty string instead of failing.
            "input": entry.get("evidence", ""),
            "output": entry["SQL"],
        }
        for entry in data
    ]

    # Write UTF-8 and keep non-ASCII characters readable instead of \uXXXX.
    with open(output_file, 'w', encoding='utf-8') as output_file_handle:
        json.dump(alpaca_format, output_file_handle, indent=4, ensure_ascii=False)

if __name__ == "__main__":
    # Command-line entry point for the Alpaca conversion.
    # The description and examples name the BIRD dataset this script is used
    # with; the previous text was copied from the CoSQL/ShareGPT script.
    parser = argparse.ArgumentParser(description="Convert BIRD data to Alpaca format.")

    # Positional path of the JSON file to convert.
    parser.add_argument("input_file", type=str, help="Path to the input JSON file (e.g., dev.json)")

    # Positional path of the JSON file to write.
    parser.add_argument("output_file", type=str, help="Path to the output JSON file (e.g., alpaca_dev.json)")

    # Parse the command-line arguments.
    args = parser.parse_args()

    # Run the conversion.
    convert_to_alpaca_format(args.input_file, args.output_file)

执行下转换

python /data/script/convert_sharegpt.py /data/cosql_train.json /data/nowdata/sharegpt_cosql_train.json

python /data/script/convert_alpaca.py /data/dev.json /data/nowdata/alpaca_dev.json

数据集准备好之后，发送一份到LlamaFactory/data

scp    /data/nowdata/alpaca_dev.json     /data/LlamaFactory-main/data
scp    /data/nowdata/sharegpt_cosql_train.json   /data/LlamaFactory-main/data

注册数据文件
路径：/data/LlamaFactory-main/data
vim dataset_info.json

  "alpaca_dev": {
    "file_name": "alpaca_dev.json"
  },
  "sharegpt_cosql_train": {
    "file_name": "sharegpt_cosql_train.json",
    "formatting": "sharegpt",
    "columns": {
      "messages": "conversations"
    }
  },

启动webui

export USE_MODELSCOPE_HUB=1
export CUDA_VISIBLE_DEVICES=0
如果需要修改端口（例如默认端口被占用），加上这一句：
export GRADIO_SERVER_PORT=8890
llamafactory-cli webui

训练 5 轮（epoch）大约要 1 个小时，有点慢。

image.png

效果验证

image.png

导出模型,先创建一个文件夹

mkdir  /data/models/Qwen/Qwen2.5-1.5B-Instruct_ft_text2sql

image.png

LlamaFactory 微调 Text2SQL

LlamaFactory 微调 Text2SQL

相关阅读更多精彩内容

友情链接更多精彩内容