Refactor model training

This commit is contained in:
parent 3cea60d4fd
commit 1cb6c7f69c
@@ -0,0 +1,8 @@
model_path: E:\workingSpace\PycharmProjects\ernie\models\ernie-3.0-tiny-base-v2-zh  # model directory
train: E:\workingSpace\PycharmProjects\ernie\ernie\data\train.json  # training set path
val: E:\workingSpace\PycharmProjects\ernie\ernie\data\val.json  # validation set path
test: E:\workingSpace\PycharmProjects\ernie\ernie\data\test.json  # (optional) test set path

# Class information
nc: 10  # number of target classes
labels: ["天气查询","互联网查询","页面切换","日计划数量查询","周计划数量查询","日计划作业内容","周计划作业内容","施工人数","作业考勤人数","知识问答"]  # class names
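A quick consistency check on this config catches label drift early. A minimal sketch, assuming the file is saved as data.yaml as the scripts in this commit expect, verifying that nc agrees with the labels list:

import yaml

with open("data.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# nc must match the labels list, otherwise the classifier head and the
# label mapping built from config["labels"] will disagree silently
assert config["nc"] == len(config["labels"]), (
    f'nc={config["nc"]} but {len(config["labels"])} labels were listed'
)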
@@ -0,0 +1,27 @@
import paddle
import numpy as np
from paddlenlp.transformers import ErnieTokenizer
import paddle.nn.functional as F  # for softmax

# Load the model and tokenizer
model = paddle.jit.load("trained_model_static")  # load the saved static-graph model
tokenizer = ErnieTokenizer.from_pretrained("E:/workingSpace/PycharmProjects/Intention/models/ernie-3.0-tiny-base-v2-zh")

# Build an example input
text = "今天送变电二公司有?"
inputs = tokenizer(text, max_length=256, truncation=True, padding='max_length', return_tensors="pd")

# Make sure the input is a Paddle tensor
input_ids = paddle.to_tensor(inputs["input_ids"])

# Run inference
model.eval()  # switch the model to inference mode
logits = model(input_ids)  # forward pass returns logits

# Turn logits into probabilities with softmax
probabilities = F.softmax(logits, axis=1)  # normalize logits into a probability distribution

# Take the label with the highest probability
max_prob_idx = np.argmax(probabilities.numpy(), axis=1)
max_prob_value = np.max(probabilities.numpy(), axis=1)

# Print the prediction
print(f"Predicted label: {max_prob_idx}")
print(f"Predicted probability: {max_prob_value}")
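The index printed above is only a class id. A minimal sketch for turning it into a human-readable name, assuming a copy of the id2label.json written by the training script below is available in the working directory:

import json

# json.dump stringifies integer keys, so the argmax index must be cast to str
with open("id2label.json", "r", encoding="utf-8") as f:
    id2label = json.load(f)

label_name = id2label[str(int(max_prob_idx[0]))]
print(f"Predicted intent: {label_name} ({max_prob_value[0]:.4f})")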
@@ -0,0 +1,76 @@
import paddle
import numpy as np
import yaml
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from paddlenlp.transformers import ErnieTokenizer
from paddle.io import DataLoader
from paddlenlp.data import DataCollatorWithPadding
import json
import functools
from paddlenlp.datasets import load_dataset

# Load the config
with open("data.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Load the model and tokenizer
model = paddle.jit.load("trained_model_static")  # load the saved static-graph model
tokenizer = ErnieTokenizer.from_pretrained("E:/workingSpace/PycharmProjects/Intention/models/ernie-3.0-tiny-base-v2-zh")


# Dataset reading function
def read_local_dataset(path, label2id=None):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)  # read the JSON data
    for item in data:
        if "text" in item:
            yield {"text": item["text"], "label": label2id.get(item["label"], -1)}  # unknown labels are marked -1

# Build label2id and id2label
label_id = {label: idx for idx, label in enumerate(config["labels"])}
id_label = {idx: label for label, idx in label_id.items()}

# Data preprocessing function
def preprocess_function(examples, tokenizer, max_length, is_test=False):
    result = tokenizer(examples["text"], max_length=max_length, truncation=True, padding='max_length')
    if not is_test:
        result["labels"] = np.array([examples["label"]], dtype="int64")
    return result

# Load the dataset
test_ds = load_dataset(read_local_dataset, path=config["val"], label2id=label_id, lazy=False)
# Tokenize the dataset
trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_length=256)
test_ds = test_ds.map(trans_func)

# Batch with DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)
test_dataloader = DataLoader(test_ds, batch_size=16, shuffle=False, collate_fn=data_collator)

# Evaluate the model
model.eval()  # switch to evaluation mode
all_preds = []
all_labels = []

# Run inference over the dataset
for batch in test_dataloader:
    input_ids = batch["input_ids"]
    labels = batch["labels"]  # ground-truth labels (added by the preprocessing function)
    # The static graph was exported with input_ids as its only input
    # (see the training script), so no attention_mask is passed here
    logits = model(input_ids)
    pred_labels = np.argmax(logits.numpy(), axis=1)  # pick the most probable label

    # Collect predictions and ground truth
    all_preds.extend(pred_labels)
    all_labels.extend(labels.numpy().flatten())  # flatten (batch, 1) -> (batch,)

# Compute evaluation metrics
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

# Report the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
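Weighted averages can hide weak classes. For per-class numbers, sklearn's classification_report can reuse the same lists; a sketch, assuming it runs right after the loop above:

from sklearn.metrics import classification_report

# labels pins the class-id order so target_names lines up with data.yaml
print(classification_report(
    all_labels,
    all_preds,
    labels=sorted(id_label),
    target_names=[id_label[i] for i in sorted(id_label)],
))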
@@ -0,0 +1,125 @@
import paddle
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
import yaml
import json
import numpy as np
import functools
import os
from paddle.io import DataLoader
from paddle.nn import CrossEntropyLoss
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.trainer import Trainer, TrainingArguments

# Read the YAML config
with open("data.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Build label2id and id2label
label_id = {label: idx for idx, label in enumerate(config["labels"])}
id_label = {idx: label for label, idx in label_id.items()}

# Data preprocessing function
def preprocess_function(examples, tokenizer, max_length, is_test=False):
    result = tokenizer(examples["text"], max_length=max_length, truncation=True, padding='max_length')
    if not is_test:
        result["labels"] = np.array([examples["label"]], dtype="int64")
    return result

# Read the local dataset
def read_local_dataset(path, label2id=None, is_test=False):
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)  # read the JSON data
    for item in data:
        if is_test:
            if "text" in item:
                yield {"text": item["text"]}  # the test set only carries text
        else:
            if "text" in item and "label" in item:
                yield {"text": item["text"], "label": label2id.get(item["label"], -1)}  # missing labels default to -1

# Load the datasets
train_ds = load_dataset(read_local_dataset, path=config["train"], label2id=label_id, lazy=False)
test_ds = load_dataset(read_local_dataset, path=config["test"], label2id=label_id, lazy=False)

# Load the model
model = ErnieForSequenceClassification.from_pretrained(config["model_path"], num_classes=config["nc"],
                                                       label2id=label_id, id2label=id_label)
tokenizer = ErnieTokenizer.from_pretrained(config["model_path"])

# Tokenize the datasets
trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_length=256)
train_ds = train_ds.map(trans_func)
test_ds = test_ds.map(trans_func)

# Build the DataLoaders with DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(train_ds, batch_size=16, shuffle=True, collate_fn=data_collator)
test_dataloader = DataLoader(test_ds, batch_size=16, shuffle=False, collate_fn=data_collator)

# Training arguments
training_args = TrainingArguments(
    output_dir="./output",
    evaluation_strategy="steps",  # evaluation frequency
    save_steps=500,  # checkpoint frequency
    logging_dir="./logs",  # log directory
    logging_steps=100,  # logging frequency
    num_train_epochs=100,  # number of training epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,  # gradient accumulation
    learning_rate=5e-5,
    weight_decay=0.01,  # weight decay
    disable_tqdm=False,  # whether to disable the tqdm progress bar
)

# Evaluation metric
def compute_metrics(p):
    predictions, labels = p
    pred_labels = np.argmax(predictions, axis=1)
    labels = np.asarray(labels).reshape(-1)  # labels arrive as (N, 1); flatten before comparing
    accuracy = np.sum(pred_labels == labels) / len(labels)
    return {"accuracy": accuracy}

# Build the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    criterion=CrossEntropyLoss(),
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


# Export path
export_model_dir = './output/export'

# Make sure the directory exists
os.makedirs(export_model_dir, exist_ok=True)

# Export the model
model_to_export = trainer.model
model_to_export.eval()  # export in inference mode
input_spec = [paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")]

# Save the static graph
paddle.jit.save(model_to_export, os.path.join(export_model_dir, 'model'), input_spec=input_spec)

# Save the tokenizer config
tokenizer.save_pretrained(export_model_dir)

# Save the label mapping files
id2label_file = os.path.join(export_model_dir, 'id2label.json')
with open(id2label_file, 'w', encoding='utf-8') as f:
    json.dump(id_label, f, ensure_ascii=False)

label2id_file = os.path.join(export_model_dir, 'label2id.json')
with open(label2id_file, 'w', encoding='utf-8') as f:
    json.dump(label_id, f, ensure_ascii=False)

print(f'Model and tokenizer have been saved to {export_model_dir}')
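Because the input_spec above declares input_ids only, the exported graph rejects any other argument (which is why the evaluation script passes input_ids alone). If downstream code should also feed token_type_ids, the export has to declare it; a sketch of the wider signature, assuming the model's forward accepts token_type_ids as its second argument as in ErnieForSequenceClassification:

# Hypothetical wider export: declare every input the static graph should accept
input_spec = [
    paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"),
    paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"),
]
paddle.jit.save(model_to_export, os.path.join(export_model_dir, 'model'), input_spec=input_spec)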
@@ -0,0 +1,24 @@
import json

# Read the text file
with open("data/train.txt", "r", encoding="utf-8") as f:
    data = f.readlines()  # read line by line

# Parse the data
json_list = []
for line in data:
    parts = line.strip().split("\t")  # split on tabs
    if len(parts) == 2:  # make sure the line is well-formed
        json_list.append({"text": parts[0], "label": parts[1]})
    else:
        print(f"Skipping malformed line: {line.strip()}")  # print bad lines to ease debugging

# Convert to JSON
json_output = json.dumps(json_list, ensure_ascii=False, indent=4)

# Save to a JSON file
with open("data/train.json", "w", encoding="utf-8") as f:
    f.write(json_output)

# Print the JSON result
print(json_output)
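For reference, a sketch of the round trip this script performs, using a made-up query paired with a real label from data.yaml:

# Illustrative line in the tab-separated train.txt format (the query is invented)
line = "今天天气怎么样\t天气查询"
text, label = line.strip().split("\t")
print({"text": text, "label": label})  # -> {'text': '今天天气怎么样', 'label': '天气查询'}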
@@ -0,0 +1,66 @@
import json
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import UIE, ErnieTokenizer
from paddlenlp.trainer import TrainingArguments, Trainer

# Read the data
def read_data(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

# Data conversion function
def convert_data(examples, tokenizer):
    results = []
    for example in examples:
        text = example["text"]
        encoding = tokenizer(text, return_attention_mask=True)  # attention_mask must be requested explicitly
        results.append({
            "input_ids": encoding["input_ids"],
            "token_type_ids": encoding["token_type_ids"],
            "attention_mask": encoding["attention_mask"],
            "labels": example["label"],
        })
    return results

# Load the data
train_data = read_data("data/train.json")
dev_data = read_data("data/dev.json")

# Pick the model
model_name = "uie-base"
tokenizer = ErnieTokenizer.from_pretrained(model_name)
model = UIE.from_pretrained(model_name)

# Preprocess the data
train_dataset = MapDataset(convert_data(train_data, tokenizer))
dev_dataset = MapDataset(convert_data(dev_data, tokenizer))

# Training arguments
training_args = TrainingArguments(
    output_dir="./checkpoint",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=1e-5,
    num_train_epochs=10,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="epoch",
    save_total_limit=2
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset
)

# Start training
trainer.train()
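Worth noting: UIE is a span-extraction model, so its finetuning data normally carries prompt and span annotations rather than the single classification label fed in above. A sketch of one training example in the doccano-converted format the official UIE finetuning pipeline consumes (the sentence and span are illustrative, the prompt reuses a label from data.yaml):

# One illustrative UIE training example; start/end are character offsets
example = {
    "content": "今天送变电二公司施工人数为35人",
    "prompt": "施工人数",
    "result_list": [
        {"text": "35人", "start": 13, "end": 16}
    ],
}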