1. Environment
!pip install "transformers==4.48.2"
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl
!pip install "trl==0.14.0"
!pip install "fsspec==2024.10.0"
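Since training later runs on the Apple-Silicon 'mps' device, it is worth confirming right after installation that the backend is actually usable. A minimal sanity check:

import torch
# Confirm the Apple-Silicon (MPS) backend is compiled in and usable,
# since the model is moved to device='mps' later in this walkthrough
print("MPS built:", torch.backends.mps.is_built())
print("MPS available:", torch.backends.mps.is_available())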
2. Imports
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

print(transformers.__version__)
print(torch.__version__)
3. Load the Dataset
dataset = load_dataset('json',data_files='/model/FreedomIntelligence:medical-o1-reasoning-SFT/medical_o1_sft_Chinese.json',split='train[0:1000]')
print(type(dataset))
print(dataset)
print(dataset[0].keys())
print(dataset[0]['Question'])
print(dataset[0]['Complex_CoT'])
print(dataset[0]['Response'])
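The merge step below assumes every record carries all three fields. If that is uncertain, an optional guard (a sketch using datasets' filter) can drop incomplete rows first:

def has_all_fields(ex):
    # Keep only records that contain all three expected fields
    return all(ex.get(k) for k in ('Question', 'Complex_CoT', 'Response'))

# dataset = dataset.filter(has_all_fields)  # optional; uncomment to apply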
def preprocess_data(example):
    # Merge question, chain of thought, and answer into a single 'text' field
    example['text'] = example['Question'] + " " + example['Complex_CoT'] + " " + example['Response']
    return example

dataset = dataset.map(preprocess_data)
# Print a sample to confirm the merge
print(dataset[0].keys())   # should now include a 'text' field
print(dataset[0]['text'])  # print the merged text
train_test_dataset = dataset.train_test_split(test_size=0.3)
print(train_test_dataset['test'][0])
print(train_test_dataset['train'][0])
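The plain space-joined 'text' field works, but DeepSeek-R1 distill models emit their reasoning inside <think> tags. A hypothetical alternative preprocessing step that preserves that convention could look like this (illustrative only, not used in the run below):

def preprocess_with_think_tags(example):
    # Hypothetical variant: mark the chain of thought with DeepSeek-R1 style
    # <think> tags so reasoning and final answer stay distinguishable
    example['text'] = (
        example['Question']
        + "\n<think>\n" + example['Complex_CoT'] + "\n</think>\n"
        + example['Response']
    )
    return example

# dataset = dataset.map(preprocess_with_think_tags)  # drop-in replacement for preprocess_data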
4. Load the Model
import time

device = 'mps'
time_start = time.time()
model_path = "model/DeepSeek-R1-Distill-Qwen-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, padding_side='right')
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).to(device)
tokenizer.pad_token = tokenizer.eos_token
time_end = time.time()
print(f"Load time: {time_end - time_start:.2f}s")
5. Evaluation Callback
from transformers import TrainerCallback,TrainingArguments,TrainerState,TrainerControl
import os
import matplotlib.pyplot as plt
import datetime
import torch
class EvaluationCallback(TrainerCallback):
    def __init__(self, test_dataset, tokenizer):
        self.test_dataset = test_dataset
        self.tokenizer = tokenizer
        self.epoch = 0
        self.eval_loss = []
        self.train_loss = []
        self.epochs = []

    def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        model = kwargs["model"]  # the model being trained is passed to callback hooks
        print(f"\nEvaluating model after epoch {self.epoch}")
        if state.log_history:
            latest_loss = state.log_history[-1].get('loss')
            if latest_loss is not None:
                self.train_loss.append(latest_loss)
                self.epochs.append(self.epoch)
        model.eval()
        total_eval_loss = 0
        num_eval_samples = 0
        with torch.no_grad():
            # Only one test sample is evaluated per epoch to keep this cheap
            for i in range(min(1, len(self.test_dataset))):
                pmt = self.test_dataset[i]['Question']
                message = [{"role": "user", "content": pmt}]
                chat_template_text = self.tokenizer.apply_chat_template(
                    message,
                    tokenize=False,
                    add_generation_prompt=True)  # append the assistant turn so generation starts a reply
                print(f"\nchat_template_text: {chat_template_text}")
                model_input = self.tokenizer([chat_template_text], return_tensors='pt').to(model.device)
                # Greedy decoding; sampling parameters are omitted since do_sample=False
                generated_ids = model.generate(**model_input,
                                               max_new_tokens=100,
                                               do_sample=False,
                                               pad_token_id=self.tokenizer.pad_token_id,
                                               eos_token_id=self.tokenizer.eos_token_id,
                                               repetition_penalty=1.2,
                                               no_repeat_ngram_size=3)
                outputs = model(**model_input, labels=model_input.input_ids)
                loss = outputs.loss.item()
                total_eval_loss += loss
                num_eval_samples += 1
                # Strip the prompt tokens so only the newly generated reply is decoded
                generated_text = self.tokenizer.batch_decode(
                    [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_input.input_ids, generated_ids)],
                    skip_special_tokens=True
                )[0]
                print(f"\nTest sample {i+1}:")
                print(f"input: {pmt}")
                print(f"output: {generated_text}")
                print(f"loss: {loss}")
                print("-" * 50)
        avg_eval_loss = total_eval_loss / num_eval_samples if num_eval_samples > 0 else 0
        self.eval_loss.append(avg_eval_loss)
        metrics = {
            'epochs': self.epochs,
            'train_loss': self.train_loss,
            'eval_loss': self.eval_loss,
            'current_epoch': self.epoch,
            'timestamp': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        os.makedirs('losses', exist_ok=True)
        # Save metrics to JSON
        with open('losses/training_metrics.json', 'w') as f:
            json.dump(metrics, f, indent=2)
        plt.figure(figsize=(10, 6))
        plt.plot(self.epochs, self.train_loss, 'b-', label='Training Loss')
        plt.plot(range(len(self.eval_loss)), self.eval_loss, 'r-', label='Evaluation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training and Evaluation Loss Over Time')
        plt.grid(True)
        plt.legend()
        plt.savefig('losses/training_progress.png')
        plt.close()
        self.epoch += 1
        model.train()
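Because the callback persists its metrics on every epoch, progress can also be inspected outside the run. A small sketch reading the file back (after at least one epoch has completed):

# Reload the metrics file written by the callback
with open('losses/training_metrics.json') as f:
    saved = json.load(f)
print(saved['epochs'], saved['train_loss'], saved['eval_loss'])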
6. LoRA Hyperparameters
peft_config = LoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0,
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
modules_to_save=["lm_head", "embed_token"],
task_type="CAUSAL_LM"
)
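For intuition on why LoRA is cheap: a rank-r adapter on a d_out x d_in weight adds only r * (d_in + d_out) parameters. A back-of-envelope estimate, assuming a hidden size of 1536 (a Qwen2-1.5B-style config; this number is an assumption, not read from the model):

# LoRA cost per adapted square d x d projection
r, d = 16, 1536
extra = r * (d + d)   # A is (r x d), B is (d x r)
full = d * d          # the frozen base weight
print(f"{extra:,} adapter params vs {full:,} frozen params "
      f"({100 * extra / full:.1f}%)")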
7. Configure the Trainer
import shutil
import os
output_dir = "./finetuned_model/deepseek_1.5b_wenan"
print(output_dir)
# Remove output directory if it exists
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
    print(f"Removed existing output directory: {output_dir}")
# Create fresh output directory
os.makedirs(output_dir)
print(f"\n dataset is {train_test_dataset['train'][0]}")
print(f"\n train_test_dataset is {train_test_dataset['test'][0]}")
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,  # trl 0.13+ renamed the old `tokenizer` argument
    train_dataset=train_test_dataset['train'],
    eval_dataset=train_test_dataset['test'],
    args=SFTConfig(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=2,   # small per-device batch
        gradient_accumulation_steps=4,   # effective batch size = 2 * 4 = 8
        learning_rate=1e-4,
        weight_decay=0.01,               # regularization
        logging_steps=1,                 # frequent logging for a small dataset
        eval_strategy="epoch",           # renamed from evaluation_strategy in transformers 4.46+
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        warmup_steps=10,
    ),
    peft_config=peft_config,
    callbacks=[EvaluationCallback(train_test_dataset['test'], tokenizer)]
)
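One detail worth calling out: gradient accumulation multiplies the per-device batch, so each optimizer step effectively sees eight samples:

# Effective batch size per optimizer step
per_device = 2
accum_steps = 4
print("effective batch size:", per_device * accum_steps)  # 8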
8. Show Trainable Parameter Count
trainable_params = 0
all_params = 0
for _, param in trainer.model.named_parameters():
    all_params += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
print(f"Trainable parameters: {trainable_params:,}")
print(f"All parameters: {all_params:,}")
print(f"Percentage of parameters being trained: {100 * trainable_params / all_params:.2f}%")
9. Train
train_output = trainer.train()
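trainer.train() returns a TrainOutput whose metrics dict summarizes the run, so the final loss and runtime can be printed directly:

# Inspect the summary returned by train()
print(train_output.metrics)  # e.g. train_runtime, train_loss, epoch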
10. Show Training Results
def generate_response(model, tokenizer, user_input, system_prompt):
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_input}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True  # append the assistant turn so the model starts a reply
    )
    print(f"\ntext: {text}")
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.1,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3
    )
    # Strip the prompt tokens so only the newly generated reply is decoded
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response
11. Print Test Results
print("\nTesting with examples from test dataset:")
for i in range(min(1, len(train_test_dataset['test']))):
system_prompt = "你是一个文案助手,你的任务是帮用户生成文案"
test_input = train_test_dataset['test'][i]
print(f"\nTest input {i+1}: {test_input}")
response = generate_response(trainer.model, trainer.processing_class, test_input, system_prompt)
print(f"Model response: {response}")
print("-" * 80)
12. Save the Model
new_model_local = "DeepSeek-R1-Medical-COT-zh"
tokenizer.save_pretrained(new_model_local)
# `save_pretrained_merged` is an Unsloth API; with plain PEFT, merge the LoRA
# adapter into the base weights first, then save the merged model
merged_model = trainer.model.merge_and_unload()
merged_model.save_pretrained(new_model_local)
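A quick way to verify the merged save is to load it back from disk (a sketch, assuming the save above succeeded):

# Reload the merged model and tokenizer from the local directory
check_model = AutoModelForCausalLM.from_pretrained(new_model_local)
check_tokenizer = AutoTokenizer.from_pretrained(new_model_local)
print(check_model.config.model_type)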
13. Upload to Hugging Face
new_model_online = "tain198127/DeepSeek-R1-Medical-COT-zh"
# Push the merged weights and tokenizer; `push_to_hub_merged` is Unsloth-specific
merged_model.push_to_hub(new_model_online)
tokenizer.push_to_hub(new_model_online)
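If the push fails with an authentication error, log in first. One way is with the huggingface_hub helper:

# Authenticate with your Hugging Face access token before pushing
from huggingface_hub import login
login()  # prompts for a token with write permission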