import functools
import json
import os

import numpy as np
import paddle
import yaml
from paddle.nn import CrossEntropyLoss
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import load_dataset
from paddlenlp.trainer import Trainer, TrainingArguments
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
from sklearn.metrics import f1_score, precision_score, recall_score


def load_config(config_path):
    """Load a YAML configuration file and return its parsed content.

    Args:
        config_path: Path of the YAML file to read (opened as UTF-8).

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.

    Raises:
        ValueError: If the file cannot be opened or parsed; the original
            exception is attached as the cause.
    """
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            return yaml.safe_load(f)
    except Exception as e:
        raise ValueError(f"读取配置文件时出错: {str(e)}") from e


def generate_label_mappings(labels):
    """Build the ``label -> id`` and ``id -> label`` dictionaries.

    Ids are assigned by position in ``labels``, starting at 0.

    Args:
        labels: Iterable of label names.

    Returns:
        A ``(label2id, id2label)`` tuple of dictionaries.
    """
    label_id = {label: idx for idx, label in enumerate(labels)}
    id_label = {idx: label for label, idx in label_id.items()}
    return label_id, id_label


def preprocess_function(examples, tokenizer, max_length, is_test=False):
    """Tokenize one example and, unless ``is_test``, attach its label.

    Args:
        examples: Mapping with a ``"text"`` entry and, when ``is_test`` is
            false, a ``"label"`` entry.
        tokenizer: Callable tokenizer applied to the text.
        max_length: Length the text is truncated / padded to.
        is_test: When true, no ``"labels"`` entry is added.

    Returns:
        The tokenizer output, with ``"labels"`` set to a length-1 int64
        array when ``is_test`` is false.
    """
    result = tokenizer(
        examples["text"],
        max_length=max_length,
        truncation=True,
        padding='max_length',
    )
    if not is_test:
        result["labels"] = np.array([examples["label"]], dtype="int64")
    return result


def read_local_dataset(path, label2id=None, is_test=False):
    """Yield examples from a local JSON file containing a list of records.

    Records that lack the required keys are skipped silently.

    Args:
        path: Path of the JSON file (opened as UTF-8).
        label2id: Mapping from label name to integer id; needed when
            ``is_test`` is false.
        is_test: When true, only ``"text"`` is yielded.

    Yields:
        ``{"text": ...}`` when ``is_test`` is true, otherwise
        ``{"text": ..., "label": <id>}``.

    Raises:
        ValueError: If reading, parsing or iterating the file fails; the
            original exception is attached as the cause.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
            for item in data:
                if is_test:
                    if "text" in item:
                        yield {"text": item["text"]}
                else:
                    if "text" in item and "label" in item:
                        # NOTE(review): labels missing from label2id are
                        # mapped to -1 — confirm the loss / metrics in use
                        # tolerate that sentinel.
                        yield {
                            "text": item["text"],
                            "label": label2id.get(item["label"], -1),
                        }
    except Exception as e:
        raise ValueError(f"读取数据集时出错: {str(e)}") from e


def load_and_preprocess_dataset(path, label2id, tokenizer, max_length, is_test=False):
    """Load a local dataset and apply ``preprocess_function`` to it.

    Args:
        path: Path of the JSON data file.
        label2id: Mapping from label name to integer id.
        tokenizer: Tokenizer forwarded to ``preprocess_function``.
        max_length: Maximum sequence length forwarded to
            ``preprocess_function``.
        is_test: Forwarded to both the reader and the preprocessing step;
            when true no labels are produced.

    Returns:
        The result of ``dataset.map`` with the preprocessing function.

    Raises:
        ValueError: If loading or mapping fails; the original exception is
            attached as the cause.
    """
    try:
        dataset = load_dataset(
            read_local_dataset,
            path=path,
            label2id=label2id,
            lazy=False,
            is_test=is_test,
        )
        trans_func = functools.partial(
            preprocess_function,
            tokenizer=tokenizer,
            max_length=max_length,
            is_test=is_test,
        )
        return dataset.map(trans_func)
    except Exception as e:
        raise ValueError(f"加载和预处理数据集时出错: {str(e)}") from e


def export_model(trainer, export_model_dir):
    """Export the trainer's model, tokenizer and label mappings.

    Writes into ``export_model_dir`` (created if missing): the model saved
    through ``paddle.jit.save`` under the ``model`` prefix, the tokenizer
    files, and ``id2label.json`` / ``label2id.json``.

    Args:
        trainer: Object exposing ``model`` and ``tokenizer`` attributes.
        export_model_dir: Destination directory.
    """
    os.makedirs(export_model_dir, exist_ok=True)
    model_to_export = trainer.model
    # Only ``input_ids`` is declared as an input of the exported model.
    input_spec = [
        paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")
    ]
    paddle.jit.save(
        model_to_export,
        os.path.join(export_model_dir, 'model'),
        input_spec=input_spec,
    )
    trainer.tokenizer.save_pretrained(export_model_dir)

    # Save the id2label and label2id files.
    # NOTE(review): assumes the model object exposes ``id2label`` and
    # ``label2id`` attributes directly — confirm for the model class in use.
    id2label_file = os.path.join(export_model_dir, 'id2label.json')
    label2id_file = os.path.join(export_model_dir, 'label2id.json')
    with open(id2label_file, 'w', encoding='utf-8') as f:
        json.dump(trainer.model.id2label, f, ensure_ascii=False)
    with open(label2id_file, 'w', encoding='utf-8') as f:
        json.dump(trainer.model.label2id, f, ensure_ascii=False)
    print(f'Model and tokenizer have been saved to {export_model_dir}')


def compute_metrics(p):
    """Compute accuracy and macro precision / recall / F1.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-class
            scores with classes on axis 1, ``labels`` holds the true ids.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    predictions, labels = p
    # Label ids are 0-based (see generate_label_mappings), so the argmax
    # index is already the predicted id — no offset must be added.
    pred_labels = np.argmax(predictions, axis=1)
    # preprocess_function stores each label as a length-1 array, so batched
    # labels may arrive as (N, 1); flatten to avoid an (N, N) broadcast in
    # the comparison below.
    labels = np.asarray(labels).reshape(-1)
    accuracy = np.sum(pred_labels == labels) / len(labels)
    precision = precision_score(labels, pred_labels, average='macro')
    recall = recall_score(labels, pred_labels, average='macro')
    f1 = f1_score(labels, pred_labels, average='macro')
    metrics = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
    print("Computed metrics:", metrics)  # Print the computed metrics.
    return metrics


def main():
    """Train an ERNIE sequence classifier as configured in ``data.yaml``.

    Reads ``labels``, ``model_path``, ``train`` and ``val`` from the
    configuration, trains with the PaddleNLP ``Trainer`` and saves the model
    to ``./saved_model_static``.

    Raises:
        Exception: Any failure is printed and then re-raised so the process
            does not exit successfully after a failed run.
    """
    try:
        # Read the configuration.
        config = load_config("data.yaml")
        label_id, id_label = generate_label_mappings(config["labels"])

        # Load the datasets.
        tokenizer = ErnieTokenizer.from_pretrained(config["model_path"])
        train_ds = load_and_preprocess_dataset(
            config["train"], label_id, tokenizer, max_length=256
        )
        # The validation split must keep its labels: compute_metrics compares
        # predictions against them during evaluation.
        val_ds = load_and_preprocess_dataset(
            config["val"], label_id, tokenizer, max_length=256
        )

        # Load the model.
        model = ErnieForSequenceClassification.from_pretrained(
            config["model_path"],
            num_classes=len(label_id),
            label2id=label_id,
            id2label=id_label,
        )

        # Define the batch collator.
        data_collator = DataCollatorWithPadding(tokenizer)

        # Define the training arguments.
        training_args = TrainingArguments(
            output_dir="./output_temp",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            eval_steps=2000,  # Evaluate every 2000 steps; applies when evaluation_strategy="steps".
            save_steps=2000,  # Save every 2000 steps; applies when save_strategy="steps".
            logging_dir="./logs",
            logging_steps=100,  # Log every 100 steps.
            num_train_epochs=8,  # Number of training epochs.
            per_device_train_batch_size=64,
            per_device_eval_batch_size=64,
            gradient_accumulation_steps=1,
            learning_rate=5e-5,
            weight_decay=0.01,
            disable_tqdm=False,
            # Higher accuracy is better.
            # NOTE(review): no metric_for_best_model is set here — confirm
            # whether greater_is_better has any effect on its own.
            greater_is_better=True,
        )

        # Create the Trainer.
        trainer = Trainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            criterion=CrossEntropyLoss(),
            train_dataset=train_ds,
            eval_dataset=val_ds,
            data_collator=data_collator,
            compute_metrics=compute_metrics,  # Use the custom evaluation metrics.
        )

        # Train the model.
        trainer.train()

        # Save the model into the './saved_model_static' directory.
        trainer.save_model("./saved_model_static")

    except Exception as e:
        print(f"训练过程中出错: {str(e)}")
        # Re-raise so the failure is visible (traceback, non-zero exit).
        raise


if __name__ == "__main__":
    main()