import functools
import json
import os

import numpy as np
import paddle
import yaml
from paddle.nn import CrossEntropyLoss
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import load_dataset
from paddlenlp.trainer import Trainer, TrainingArguments
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score

def load_config(config_path):
    """Load the YAML configuration file."""
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

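# A data.yaml along these lines is expected; the keys are inferred from how
# main() reads the config below, and the values here are illustrative only.
# Note that nc must match len(labels).
#
#   model_path: ernie-3.0-medium-zh
#   nc: 3
#   labels: ["negative", "neutral", "positive"]
#   train: ./data/train.json
#   test: ./data/test.json
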
def generate_label_mappings(labels):
    """Build label2id and id2label mappings from the label list."""
    label_id = {label: idx for idx, label in enumerate(labels)}
    id_label = {idx: label for label, idx in label_id.items()}
    return label_id, id_label

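# For example:
#   generate_label_mappings(["negative", "positive"])
#   -> ({"negative": 0, "positive": 1}, {0: "negative", 1: "positive"})
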
def preprocess_function(examples, tokenizer, max_length, is_test=False):
    """Tokenize a single example; attach its label unless running inference."""
    # Truncate only; DataCollatorWithPadding pads each batch dynamically,
    # so padding everything to max_length here would waste compute.
    result = tokenizer(examples["text"], max_length=max_length, truncation=True)
    if not is_test:
        result["labels"] = np.array([examples["label"]], dtype="int64")
    return result

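# Each transformed example is a dict along these lines (ids are illustrative,
# not real vocabulary ids):
#   {"input_ids": [1, 656, ..., 2], "token_type_ids": [0, 0, ...], "labels": [2]}
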
def read_local_dataset(path, label2id=None, is_test=False):
    """Read a local JSON dataset and yield one example per record."""
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    for item in data:
        if is_test:
            if "text" in item:
                yield {"text": item["text"]}
        else:
            if "text" in item and "label" in item:
                # Labels missing from label2id map to -1, which will break
                # CrossEntropyLoss; make sure every label appears in data.yaml.
                yield {"text": item["text"], "label": label2id.get(item["label"], -1)}

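# The train/test files referenced in data.yaml are expected to look like this
# (inferred from read_local_dataset above; these records are illustrative, and
# the label strings must match the "labels" list in the config):
#
#   [
#     {"text": "这家餐厅的服务很好", "label": "positive"},
#     {"text": "物流太慢了", "label": "negative"}
#   ]
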
def load_and_preprocess_dataset(path, label2id, tokenizer, max_length, is_test=False):
    """Load a dataset from disk and apply the tokenization transform."""
    dataset = load_dataset(read_local_dataset, path=path, label2id=label2id, lazy=False, is_test=is_test)
    trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_length=max_length, is_test=is_test)
    return dataset.map(trans_func)

def export_model(trainer, export_model_dir):
    """Export the trained model, tokenizer, and label mappings for inference."""
    os.makedirs(export_model_dir, exist_ok=True)
    model_to_export = trainer.model
    # Trace with input_ids only; ERNIE fills in token_type_ids with zeros when
    # it is omitted. Add a second InputSpec if inference feeds both tensors.
    input_spec = [paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")]
    paddle.jit.save(model_to_export, os.path.join(export_model_dir, "model"), input_spec=input_spec)
    trainer.tokenizer.save_pretrained(export_model_dir)

    # Persist the label mappings next to the model so predictions can be decoded.
    id2label_file = os.path.join(export_model_dir, "id2label.json")
    label2id_file = os.path.join(export_model_dir, "label2id.json")
    with open(id2label_file, "w", encoding="utf-8") as f:
        json.dump(trainer.model.id2label, f, ensure_ascii=False)
    with open(label2id_file, "w", encoding="utf-8") as f:
        json.dump(trainer.model.label2id, f, ensure_ascii=False)
    print(f"Model and tokenizer have been saved to {export_model_dir}")

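# A minimal inference sketch against the exported artifacts (paths assume the
# default export_model call in main(); the input text is illustrative):
#
#   static_model = paddle.jit.load("./output/export/model")
#   tokenizer = ErnieTokenizer.from_pretrained("./output/export")
#   encoded = tokenizer("这家餐厅的服务很好", max_length=256, truncation=True)
#   input_ids = paddle.to_tensor([encoded["input_ids"]], dtype="int64")
#   logits = static_model(input_ids)
#   idx = int(paddle.argmax(logits, axis=-1))
#   # Decode idx with the id2label.json written above.
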
def compute_metrics(p):
    """Compute accuracy and macro precision/recall/F1 for evaluation."""
    predictions, labels = p
    pred_labels = np.argmax(predictions, axis=1)
    # preprocess_function wraps every label in a list, so labels arrive with
    # shape (N, 1); flatten first or the equality below broadcasts to (N, N).
    labels = np.asarray(labels).reshape(-1)
    accuracy = np.mean(pred_labels == labels)
    precision = precision_score(labels, pred_labels, average="macro", zero_division=0)
    recall = recall_score(labels, pred_labels, average="macro", zero_division=0)
    f1 = f1_score(labels, pred_labels, average="macro", zero_division=0)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

def main():
    # Load configuration and derive the label mappings
    config = load_config("data.yaml")
    label_id, id_label = generate_label_mappings(config["labels"])

    # Load and preprocess the datasets
    tokenizer = ErnieTokenizer.from_pretrained(config["model_path"])
    train_ds = load_and_preprocess_dataset(config["train"], label_id, tokenizer, max_length=256)
    # Keep labels on the evaluation split (is_test defaults to False); passing
    # is_test=True would drop them and break compute_metrics during evaluation.
    test_ds = load_and_preprocess_dataset(config["test"], label_id, tokenizer, max_length=256)

    # Load the pretrained model with a classification head
    model = ErnieForSequenceClassification.from_pretrained(
        config["model_path"], num_classes=config["nc"], label2id=label_id, id2label=id_label
    )

    # Collator that pads each batch to its longest sequence
    data_collator = DataCollatorWithPadding(tokenizer)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./output",
        evaluation_strategy="steps",       # evaluate by step count
        eval_steps=100,                    # evaluate every 100 steps
        save_steps=500,                    # a multiple of eval_steps, as load_best_model_at_end requires
        logging_dir="./logs",
        logging_steps=50,                  # log every 50 steps
        num_train_epochs=10,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=1,
        learning_rate=5e-5,
        weight_decay=0.01,
        disable_tqdm=False,
        load_best_model_at_end=True,       # without this, metric_for_best_model has no effect
        metric_for_best_model="accuracy",  # pick the best checkpoint by accuracy
        greater_is_better=True,            # higher accuracy is better
    )

    # Build the Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        criterion=CrossEntropyLoss(),
        train_dataset=train_ds,
        eval_dataset=test_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,  # custom evaluation metrics
    )

    # Train
    trainer.train()

    # Export for deployment
    export_model(trainer, "./output/export")

if __name__ == "__main__":
    main()
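
# Typical invocation, assuming data.yaml and the dataset files are in place
# (the script name is whatever this file is saved as):
#
#   python train.py
#
# Checkpoints land in ./output; the deployable static model in ./output/export.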