以sentence-transformers/all-MiniLM-L12-v2为例:
使用用户的pdf材料fine-tune模型sentence-transformers/all-MiniLM-L12-v2以生成一个自定义的嵌入模型。
准备库
$ pip install pdfplumber nltk sentence-transformers datasets torch
python脚本
import pdfplumber
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import os
# nltk.download('punkt')      # uncomment on first run to fetch the Punkt sentence-tokenizer data
# nltk.download('punkt_tab')  # NOTE: newer NLTK releases look for 'punkt_tab' instead — confirm for your NLTK version
# === Step 1: Extract text from all PDFs in a folder ===
def extract_text_from_pdfs(folder_path):
    """Concatenate the text of every page of every ``.pdf`` file in *folder_path*.

    Args:
        folder_path: Directory scanned (non-recursively) for files ending in ``.pdf``.

    Returns:
        One string containing each extracted page followed by a newline.
        Pages for which ``pdfplumber`` returns no text are skipped.
    """
    # Accumulate pieces in a list and join once at the end: repeated
    # string += in a loop is quadratic in the total text size.
    parts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        parts.append(page_text + "\n")
    return "".join(parts)
# === Step 2: Split text into sentences ===
def split_into_sentences(text):
    """Tokenize *text* into a list of sentences with NLTK's Punkt tokenizer."""
    sentences = sent_tokenize(text)
    return sentences
# === Step 3: Create sentence pairs + heuristic similarity ===
def create_sentence_pairs(sentences, score=0.8):
    """Build training examples from consecutive sentence pairs.

    Adjacent sentences in the source text are assumed to be semantically
    related, so every pair receives the same heuristic similarity label.

    Args:
        sentences: Ordered list of sentences extracted from the PDFs.
        score: Heuristic similarity label for each adjacent pair
            (default ``0.8``, matching the original hard-coded value).

    Returns:
        List of ``InputExample`` objects suitable for a ``DataLoader``.
        Empty or single-sentence input yields an empty list.
    """
    # zip pairs each sentence with its successor; it stops naturally at the
    # end of the shorter sequence, so no explicit length arithmetic is needed.
    return [
        InputExample(texts=[first, second], label=score)
        for first, second in zip(sentences, sentences[1:])
    ]
# === Step 4: Fine-tune Sentence Transformer ===
def fine_tune_model(train_examples, model_name, output_dir):
    """Fine-tune *model_name* on *train_examples* and save it to *output_dir*.

    Trains for one epoch with cosine-similarity loss, batch size 4, and no
    warmup steps, then writes the fine-tuned model to disk.
    """
    base_model = SentenceTransformer(model_name)
    loader = DataLoader(train_examples, shuffle=True, batch_size=4)
    objective = losses.CosineSimilarityLoss(base_model)
    base_model.fit(
        train_objectives=[(loader, objective)],
        epochs=1,
        warmup_steps=0,
    )
    base_model.save(output_dir)
# === Run the Pipeline ===
pdf_folder = "./pdf_files"  # Folder containing your PDFs
output_dir = "fine_tuned_model"

# Extract raw text, sentence-split it, pair up neighbours, then fine-tune.
corpus_text = extract_text_from_pdfs(pdf_folder)
corpus_sentences = split_into_sentences(corpus_text)
examples = create_sentence_pairs(corpus_sentences)
fine_tune_model(examples, "sentence-transformers/all-MiniLM-L12-v2", output_dir)
print(f"Fine-tuning complete, model saved to '{output_dir}'")
测试
from sentence_transformers import SentenceTransformer, util
# Load the fine-tuned model saved by the training script (folder name above).
model = SentenceTransformer("fine_tuned_model")

# Three probe sentences: the first two are paraphrases, the third is unrelated.
test_sentences = [
    "A cat sits on the mat.",
    "There is a cat on the mat.",
    "The sun is shining.",
]

# Encode all sentences in a single batch.
embeddings = model.encode(test_sentences)
print(f"Embeddings: {embeddings}")

# Cosine similarity: the paraphrase pair should score higher than the unrelated pair.
similarity_1_2 = util.cos_sim(embeddings[0], embeddings[1])
similarity_1_3 = util.cos_sim(embeddings[0], embeddings[2])
print(f"Similarity between sentence 1 and 2: {similarity_1_2.item():.4f}")
print(f"Similarity between sentence 1 and 3: {similarity_1_3.item():.4f}")