Refactor model training

jiang 2025-02-21 16:52:03 +08:00
parent 3cea60d4fd
commit 1cb6c7f69c
19 changed files with 57622 additions and 0 deletions

ernie/data.yaml (new file, 8 lines)

@@ -0,0 +1,8 @@
model_path: E:\workingSpace\PycharmProjects\ernie\models\ernie-3.0-tiny-base-v2-zh # model path
train: E:\workingSpace\PycharmProjects\ernie\ernie\data\train.json # training set path
val: E:\workingSpace\PycharmProjects\ernie\ernie\data\val.json # validation set path
test: E:\workingSpace\PycharmProjects\ernie\ernie\data\test.json # (optional) test set path
# Class information
nc: 10 # number of target classes
labels: ["天气查询","互联网查询","页面切换","日计划数量查询","周计划数量查询","日计划作业内容","周计划作业内容","施工人数","作业考勤人数","知识问答"] # class names

ernie/data/test.json (new file, 3802 lines)
File diff suppressed because it is too large.

ernie/data/train.json (new file, 26574 lines)
File diff suppressed because it is too large.

ernie/data/val.json (new file, 7594 lines)
File diff suppressed because it is too large.
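
Note: the train/val/test JSON diffs are suppressed above. Judging from read_local_dataset in ernie/train.py and the txt-to-json converter later in this commit, each file is expected to be a JSON array of objects with a "text" and a "label" field, the label being one of the names listed in data.yaml. An illustrative record (the query text is a placeholder):

# Expected record shape for train.json / val.json / test.json (illustrative; label names come from data.yaml)
[
    {"text": "<query text>", "label": "天气查询"}
]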

ernie/load_model.py (new file, 27 lines)

@@ -0,0 +1,27 @@
import paddle
import numpy as np
from paddlenlp.transformers import ErnieTokenizer
import paddle.nn.functional as F  # for softmax

# Load the saved static-graph model and the tokenizer
model = paddle.jit.load("trained_model_static")
tokenizer = ErnieTokenizer.from_pretrained("E:/workingSpace/PycharmProjects/Intention/models/ernie-3.0-tiny-base-v2-zh")

# Build an example input
text = "今天送变电二公司有?"
inputs = tokenizer(text, max_length=256, truncation=True, padding='max_length', return_tensors="pd")

# Convert the input ids to a Paddle tensor
input_ids = paddle.to_tensor(inputs["input_ids"])

# Run inference
model.eval()  # make sure the model is in inference mode
logits = model(input_ids)  # the forward pass produces the logits

# Turn the logits into a probability distribution with softmax
probabilities = F.softmax(logits, axis=1)

# Take the index and value of the highest probability
max_prob_idx = np.argmax(probabilities.numpy(), axis=1)
max_prob_value = np.max(probabilities.numpy(), axis=1)

# Print the prediction
print(f"Predicted label index: {max_prob_idx}")
print(f"Predicted probability: {max_prob_value}")
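
load_model.py prints only the class index and its probability. A minimal follow-up sketch for mapping that index back to a label name, assuming the data.yaml from this commit is readable from the script's working directory:

import yaml

# Look up the label name for the predicted index via the labels list in data.yaml
# (assumed to be available next to this script; adjust the path otherwise)
with open("data.yaml", "r", encoding="utf-8") as f:
    labels = yaml.safe_load(f)["labels"]
print(f"Predicted label name: {labels[int(max_prob_idx[0])]}")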

Binary files not shown (6 files).

ernie/test_model.py (new file, 76 lines)

@@ -0,0 +1,76 @@
import paddle
import numpy as np
import yaml
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from paddlenlp.transformers import ErnieTokenizer
from paddle.io import DataLoader
from paddlenlp.data import DataCollatorWithPadding
import json
import functools
from paddlenlp.datasets import load_dataset

# Load the configuration
with open("data.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Load the saved static-graph model and the tokenizer
model = paddle.jit.load("trained_model_static")
tokenizer = ErnieTokenizer.from_pretrained("E:/workingSpace/PycharmProjects/Intention/models/ernie-3.0-tiny-base-v2-zh")

# Dataset reading function
def read_local_dataset(path, label2id=None, is_test=True):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)  # read the JSON data
    for item in data:
        if "text" in item:
            yield {"text": item["text"], "label": label2id.get(item["label"], -1)}  # labels not in label2id are mapped to -1

# Build label2id and id2label mappings
label_id = {label: idx for idx, label in enumerate(config["labels"])}
id_label = {idx: label for label, idx in label_id.items()}

# Data preprocessing function
def preprocess_function(examples, tokenizer, max_length, is_test=False):
    result = tokenizer(examples["text"], max_length=max_length, truncation=True, padding='max_length')
    if not is_test:
        result["labels"] = np.array([examples["label"]], dtype="int64")
    return result

# Load the evaluation dataset (the validation split from data.yaml)
test_ds = load_dataset(read_local_dataset, path=config["val"], label2id=label_id, lazy=False)

# Tokenize the dataset
trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_length=256)
test_ds = test_ds.map(trans_func)

# Batch the samples with DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)
test_dataloader = DataLoader(test_ds, batch_size=16, shuffle=False, collate_fn=data_collator)
# Evaluate the model
model.eval()  # switch to evaluation mode
all_preds = []
all_labels = []

# Iterate over the dataset and run inference
for batch in test_dataloader:
    input_ids = batch["input_ids"]  # model input ids
    labels = batch["labels"]  # ground-truth labels (added by the preprocessing function)
    # Forward pass; the static graph in this repo is exported with input_ids as its only input
    logits = model(input_ids)
    pred_labels = np.argmax(logits.numpy(), axis=1)  # pick the class with the highest score
    # Collect predictions and ground-truth labels
    all_preds.extend(pred_labels)
    all_labels.extend(labels.numpy().reshape(-1))  # flatten [batch, 1] labels to 1-D
# Compute the evaluation metrics
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

# Print the evaluation results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

ernie/train.py (new file, 125 lines)

@@ -0,0 +1,125 @@
import functools
import json
import os

import numpy as np
import paddle
import yaml
from paddle.io import DataLoader
from paddle.nn import CrossEntropyLoss
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.datasets import load_dataset
from paddlenlp.trainer import Trainer, TrainingArguments
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
# Read the YAML configuration
with open("data.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Build label2id and id2label mappings
label_id = {label: idx for idx, label in enumerate(config["labels"])}
id_label = {idx: label for label, idx in label_id.items()}

# Data preprocessing function
def preprocess_function(examples, tokenizer, max_length, is_test=False):
    result = tokenizer(examples["text"], max_length=max_length, truncation=True, padding='max_length')
    if not is_test:
        result["labels"] = np.array([examples["label"]], dtype="int64")
    return result

# Read the local dataset
def read_local_dataset(path, label2id=None, is_test=False):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)  # read the JSON data
    for item in data:
        if is_test:
            if "text" in item:
                yield {"text": item["text"]}  # the test set yields only the text
        else:
            if "text" in item and "label" in item:
                yield {"text": item["text"], "label": label2id.get(item["label"], -1)}  # labels not in label2id are mapped to -1

# Load the datasets
train_ds = load_dataset(read_local_dataset, path=config["train"], label2id=label_id, lazy=False)
test_ds = load_dataset(read_local_dataset, path=config["test"], label2id=label_id, lazy=False)

# Load the model and tokenizer
model = ErnieForSequenceClassification.from_pretrained(config["model_path"], num_classes=config["nc"],
                                                       label2id=label_id, id2label=id_label)
tokenizer = ErnieTokenizer.from_pretrained(config["model_path"])

# Tokenize the datasets
trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_length=256)
train_ds = train_ds.map(trans_func)
test_ds = test_ds.map(trans_func)

# Define DataLoaders with DataCollatorWithPadding
# (the Trainer below builds its own dataloaders from the datasets, so these are only for manual inspection)
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(train_ds, batch_size=16, shuffle=True, collate_fn=data_collator)
test_dataloader = DataLoader(test_ds, batch_size=16, shuffle=False, collate_fn=data_collator)
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./output",
    evaluation_strategy="steps",  # run evaluation on a step schedule
    save_steps=500,  # checkpoint saving interval
    logging_dir="./logs",  # log directory
    logging_steps=100,  # logging interval
    num_train_epochs=100,  # number of training epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,  # gradient accumulation steps
    learning_rate=5e-5,
    weight_decay=0.01,  # weight decay
    disable_tqdm=False,  # whether to disable the tqdm progress bar
)
# Define the evaluation metric
def compute_metrics(p):
    predictions, labels = p
    pred_labels = np.argmax(predictions, axis=1)
    labels = np.asarray(labels).reshape(-1)  # flatten possible [N, 1] labels before comparing
    accuracy = np.sum(pred_labels == labels) / len(labels)
    return {"accuracy": accuracy}

# Create the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    criterion=CrossEntropyLoss(),
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Export path
export_model_dir = './output/export'

# Make sure the directory exists
os.makedirs(export_model_dir, exist_ok=True)

# Export the trained model
model_to_export = trainer.model
input_spec = [paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")]

# Save the model as a static graph
paddle.jit.save(model_to_export, os.path.join(export_model_dir, 'model'), input_spec=input_spec)

# Save the tokenizer configuration
tokenizer.save_pretrained(export_model_dir)

# Save the label mapping files
id2label_file = os.path.join(export_model_dir, 'id2label.json')
with open(id2label_file, 'w', encoding='utf-8') as f:
    json.dump(id_label, f, ensure_ascii=False)
label2id_file = os.path.join(export_model_dir, 'label2id.json')
with open(label2id_file, 'w', encoding='utf-8') as f:
    json.dump(label_id, f, ensure_ascii=False)

print(f'Model and tokenizer have been saved to {export_model_dir}')
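
As a follow-up to the export step, a minimal inference sketch against the exported artifacts (paths exactly as written above; the query string is the example used in ernie/load_model.py, and json.dump stores the id2label keys as strings):

import json
import numpy as np
import paddle
import paddle.nn.functional as F
from paddlenlp.transformers import ErnieTokenizer

export_dir = "./output/export"
model = paddle.jit.load(f"{export_dir}/model")          # static graph written by paddle.jit.save above
model.eval()
tokenizer = ErnieTokenizer.from_pretrained(export_dir)  # tokenizer files written by save_pretrained above
with open(f"{export_dir}/id2label.json", encoding="utf-8") as f:
    id2label = json.load(f)                             # integer ids became string keys via json.dump

inputs = tokenizer("今天送变电二公司有?", max_length=256, truncation=True, return_tensors="pd")
probs = F.softmax(model(inputs["input_ids"]), axis=-1).numpy()
pred = int(np.argmax(probs, axis=-1)[0])
print(id2label[str(pred)], float(probs[0][pred]))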


@@ -0,0 +1,24 @@
import json

# Read the text file
with open("data/train.txt", "r", encoding="utf-8") as f:
    data = f.readlines()  # read line by line

# Parse the data
json_list = []
for line in data:
    parts = line.strip().split("\t")  # split on Tab
    if len(parts) == 2:  # make sure the line is well formed
        json_list.append({"text": parts[0], "label": parts[1]})
    else:
        print(f"Skipping malformed line: {line.strip()}")  # print the bad line to ease debugging

# Convert to a JSON string
json_output = json.dumps(json_list, ensure_ascii=False, indent=4)

# Save to a JSON file
with open("data/train.json", "w", encoding="utf-8") as f:
    f.write(json_output)

# Print the JSON output
print(json_output)

uie/data/dev.json (new file, 0 lines)

uie/data/dev.txt (new file, 17279 lines)
File diff suppressed because it is too large.

uie/data/train.json (new file, 0 lines)

uie/data/train.txt (new file, 2047 lines)
File diff suppressed because it is too large.

uie/train.py (new file, 66 lines)

@@ -0,0 +1,66 @@
import json

from paddlenlp.datasets import MapDataset
from paddlenlp.trainer import Trainer, TrainingArguments
from paddlenlp.transformers import UIE, ErnieTokenizer
# Read the data
def read_data(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

# Data conversion function
def convert_data(examples, tokenizer):
    results = []
    for example in examples:
        text = example["text"]
        # Request the attention mask explicitly; the Ernie tokenizer does not return it by default
        encoding = tokenizer(text, return_attention_mask=True)
        results.append({
            "input_ids": encoding["input_ids"],
            "token_type_ids": encoding["token_type_ids"],
            "attention_mask": encoding["attention_mask"],
            "labels": example["label"],
        })
    return results
# Load the data
train_data = read_data("data/train.json")
dev_data = read_data("data/dev.json")

# Select the model
model_name = "uie-base"
tokenizer = ErnieTokenizer.from_pretrained(model_name)
model = UIE.from_pretrained(model_name)

# Preprocess the data
train_dataset = MapDataset(convert_data(train_data, tokenizer))
dev_dataset = MapDataset(convert_data(dev_data, tokenizer))

# Training arguments
training_args = TrainingArguments(
    output_dir="./checkpoint",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,
    num_train_epochs=10,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="epoch",
    save_total_limit=2
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset
)

# Start training
trainer.train()