Refactor model training

This commit is contained in:
jiang 2025-02-25 09:27:14 +08:00
parent 1cb6c7f69c
commit 9e1182f766
19 changed files with 678835 additions and 125 deletions

108
api/mian.py Normal file

@@ -0,0 +1,108 @@
import json
from flask import Flask, jsonify, request
from werkzeug.exceptions import HTTPException
from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer
import paddle

# 1. Load the model and tokenizer
model_path = R"E:\workingSpace\PycharmProjects\Intention_dev\uie\uie_ner\checkpoint-4320"  # path to your trained checkpoint
model = ErnieForTokenClassification.from_pretrained(model_path)
tokenizer = ErnieTokenizer.from_pretrained(model_path)

# Label mapping: 0 is the O tag; 1-10 are B- tags, 11-20 the matching I- tags
label_map = {
    0: 'O', 1: 'B-date', 11: 'I-date',
    2: 'B-project_name', 12: 'I-project_name',
    3: 'B-project_type', 13: 'I-project_type',
    4: 'B-construction_unit', 14: 'I-construction_unit',
    5: 'B-implementation_organization', 15: 'I-implementation_organization',
    6: 'B-project_department', 16: 'I-project_department',
    7: 'B-project_manager', 17: 'I-project_manager',
    8: 'B-subcontractor', 18: 'I-subcontractor',
    9: 'B-team_leader', 19: 'I-team_leader',
    10: 'B-risk_level', 20: 'I-risk_level'
}
app = Flask(__name__)

# Unified exception handler
@app.errorhandler(Exception)
def handle_exception(e):
    """Return every error as JSON."""
    if isinstance(e, HTTPException):
        return jsonify({
            "error": {
                "type": e.name,
                "message": e.description,
                "status_code": e.code
            }
        }), e.code
    return jsonify({
        "error": {
            "type": "InternalServerError",
            "message": str(e)
        }
    }), 500

@app.route('/')
def hello_world():
    """Example route returning Hello World."""
    return jsonify({"message": "Hello, world!"})

@app.route('/predict', methods=['POST'])
def predict():
    """Handle a prediction request."""
    data = request.get_json()
    # Extract the input text
    text = data.get("text", "")
    if not text:
        return jsonify({"error": "No text provided"}), 400
    # Tokenize the input (max_length with truncation, not max_len, is the tokenizer argument)
    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pd")
    model.eval()
    with paddle.no_grad():
        logits = model(**inputs)
        predictions = paddle.argmax(logits, axis=-1)
    # Decode the predictions
    predicted_labels = predictions.numpy()[0]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].numpy())
    entities = {}
    current_entity = None
    current_label = None
    for token, label_id in zip(tokens, predicted_labels):
        label = label_map.get(label_id, "O")
        if label.startswith("B-"):  # start of a new entity
            if current_entity:
                entities[current_label] = "".join(current_entity)
            current_entity = [token]
            current_label = label[2:]  # strip the "B-" prefix
        elif label.startswith("I-") and current_entity and label[2:] == current_label:
            current_entity.append(token)  # continue the current entity
        else:  # non-entity token
            if current_entity:
                entities[current_label] = "".join(current_entity)
            current_entity = None
            current_label = None
    # Flush the last entity, if any
    if current_entity:
        entities[current_label] = "".join(current_entity)
    # Note: a dict keeps only the last entity per label; use a list of
    # {"text", "label"} dicts (as in uie/test_model.py) to keep them all.
    return jsonify(entities)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)  # debug mode; disable in production
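A quick way to exercise the endpoint once the service is running (a minimal sketch; assumes the default host and port above, and the query text is illustrative):

import requests

resp = requests.post(
    "http://127.0.0.1:5000/predict",
    json={"text": "5月24日金上-湖北线路工程川12标风险等级为3级的工程作业内容是什么"},
)
print(resp.json())  # e.g. {"date": "5月24日", "project_name": "金上-湖北线路工程川12标", "risk_level": "3"}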


@@ -1,125 +1,138 @@
import paddle
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
import yaml
import json
import numpy as np
import functools
from paddle.nn import CrossEntropyLoss
from paddlenlp.data import DataCollatorWithPadding
from paddlenlp.trainer import Trainer, TrainingArguments
import os
from sklearn.metrics import precision_score, recall_score, f1_score

def load_config(config_path):
    """Load the YAML config file."""
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

def generate_label_mappings(labels):
    """Build the label2id and id2label mappings."""
    label_id = {label: idx for idx, label in enumerate(labels)}
    id_label = {idx: label for label, idx in label_id.items()}
    return label_id, id_label

def preprocess_function(examples, tokenizer, max_length, is_test=False):
    """Tokenize one example; attach its label unless this is test data."""
    result = tokenizer(examples["text"], max_length=max_length, truncation=True, padding='max_length')
    if not is_test:
        result["labels"] = np.array([examples["label"]], dtype="int64")
    return result

def read_local_dataset(path, label2id=None, is_test=False):
    """Read a local JSON dataset."""
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    for item in data:
        if is_test:
            if "text" in item:
                yield {"text": item["text"]}
        else:
            if "text" in item and "label" in item:
                # A missing or unknown label is mapped to -1
                yield {"text": item["text"], "label": label2id.get(item["label"], -1)}

def load_and_preprocess_dataset(path, label2id, tokenizer, max_length, is_test=False):
    """Load and preprocess a dataset."""
    dataset = load_dataset(read_local_dataset, path=path, label2id=label2id, lazy=False, is_test=is_test)
    trans_func = functools.partial(preprocess_function, tokenizer=tokenizer, max_length=max_length, is_test=is_test)
    return dataset.map(trans_func)

def export_model(trainer, export_model_dir):
    """Export the model as a static graph, plus the tokenizer and label maps."""
    os.makedirs(export_model_dir, exist_ok=True)
    model_to_export = trainer.model
    input_spec = [paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")]
    paddle.jit.save(model_to_export, os.path.join(export_model_dir, 'model'), input_spec=input_spec)
    trainer.tokenizer.save_pretrained(export_model_dir)
    id2label_file = os.path.join(export_model_dir, 'id2label.json')
    label2id_file = os.path.join(export_model_dir, 'label2id.json')
    with open(id2label_file, 'w', encoding='utf-8') as f:
        json.dump(trainer.model.id2label, f, ensure_ascii=False)
    with open(label2id_file, 'w', encoding='utf-8') as f:
        json.dump(trainer.model.label2id, f, ensure_ascii=False)
    print(f'Model and tokenizer have been saved to {export_model_dir}')

def compute_metrics(p):
    """Compute evaluation metrics."""
    predictions, labels = p
    pred_labels = np.argmax(predictions, axis=1)
    labels = np.asarray(labels).reshape(-1)  # flatten (N, 1) label arrays to (N,)
    accuracy = np.sum(pred_labels == labels) / len(labels)
    precision = precision_score(labels, pred_labels, average='macro')
    recall = recall_score(labels, pred_labels, average='macro')
    f1 = f1_score(labels, pred_labels, average='macro')
    metrics = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
    print("Computed metrics:", metrics)  # debug output
    return metrics

def main():
    # Read the config
    config = load_config("data.yaml")
    label_id, id_label = generate_label_mappings(config["labels"])

    # Load the datasets (the eval set keeps its labels so compute_metrics can run)
    tokenizer = ErnieTokenizer.from_pretrained(config["model_path"])
    train_ds = load_and_preprocess_dataset(config["train"], label_id, tokenizer, max_length=256)
    test_ds = load_and_preprocess_dataset(config["test"], label_id, tokenizer, max_length=256)

    # Load the model
    model = ErnieForSequenceClassification.from_pretrained(config["model_path"], num_classes=config["nc"],
                                                           label2id=label_id, id2label=id_label)

    # Batch and pad dynamically
    data_collator = DataCollatorWithPadding(tokenizer)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./output",
        evaluation_strategy="steps",  # evaluate every eval_steps
        eval_steps=100,
        save_steps=500,
        logging_dir="./logs",
        logging_steps=50,
        num_train_epochs=10,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=1,
        learning_rate=5e-5,
        weight_decay=0.01,
        disable_tqdm=False,
        metric_for_best_model="accuracy",  # pick the best checkpoint by accuracy
        greater_is_better=True,
    )

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        criterion=CrossEntropyLoss(),
        train_dataset=train_ds,
        eval_dataset=test_ds,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train
    trainer.train()

    # Export the trained model
    export_model(trainer, './output/export')

if __name__ == "__main__":
    main()
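The script reads `labels`, `nc`, `train`, `test`, and `model_path` from `data.yaml`. A hypothetical config matching those keys (paths and checkpoint name are illustrative; the labels come from generated_data/generated.py below):

import yaml

config = {
    "labels": ["天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询",
               "日计划作业内容", "周计划作业内容", "施工人数", "作业考勤人数", "知识问答"],
    "nc": 10,                             # number of classes; must equal len(labels)
    "train": "data/train.json",           # illustrative paths
    "test": "data/test.json",
    "model_path": "ernie-3.0-medium-zh",  # any ERNIE checkpoint name or local directory
}
with open("data.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(config, f, allow_unicode=True)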

258
generated_data/generated.py Normal file

@@ -0,0 +1,258 @@
import json
from itertools import product
# Define the base data
implementation_organizations = ["送电一分公司", "送电二分公司", "变电分公司", "建筑分公司", "消防分公司",
"检修试验分公司", "安徽宏源电力建设有限公司", "安徽顺安电网建设有限公司"]
project_types = ["基建", "技改大修", "用户工程", "小型基建"]
project_names = [
"国网北京检修公司2024年±500kV延庆换流站直流主设备年度检修维护",
"合肥二电厂-彭郢π入长临河变电站220kV线路工程",
"杨柳四铺π入况楼变110kV电缆线路工程",
"安徽蚌埠濠州220kV变电站220千伏大唐凤阳红心镇光伏间隔扩建工程(电气安装)",
"合肥轨道7号线10kV杆线迁改工程",
"金上-湖北线路工程川12标",
"六安汤池 110kV 变电站新建工程",
"双港-独秀π入和平变电站220kV线路工程",
"茗南-熙湖T接城南变电站110kV架空线路工程",
"南屏-蓬莱路π入派河变电站110kV线路工程",
"藕池-漆园π入杨柳变电站220kV线路工程",
"芜湖站1000千伏1号主变A相局放配合项目",
"埇桥-灵泗500kV线路工程",
"月桥-火龙岗π入高村变电站220kV线路工程"
]
construction_units = ["国网安徽省电力有限公司建设分公司", "国网安徽省电力有限公司马鞍山供电公司",
"国网安徽省电力有限公司合肥供电公司", "国网安徽省电力有限公司阜阳供电公司",
"国网安徽省电力有限公司滁州供电公司", "国网安徽省电力有限公司安庆供电公司",
"国网安徽省电力有限公司黄山供电公司", "国网安徽省电力有限公司蚌埠供电公司",
"国网安徽省电力有限公司池州供电公司", "国网安徽省电力有限公司六安供电公司",
"国家电有限公司特高压建设分公司", "国网安徽省电力有限公司淮南供电公司",
"国网安徽省电力有限公司宣城供电公司", "国网北京市电力公司", "国网安徽省电力有限公司宿州供电公司",
"国网安徽省电力有限公司营销服务中心", "中国葛洲坝集团电力有限责任公司",
"银联黄山园区开发有限公司", "淮南交通控股(集团)有限公司", "国网安徽省电力有限公司舒城县供电公司",
"国网安徽省电力有限公司颍上县供电公司", "中铁二局集团电务工程有限公司",
"国网四川省电力公司建设分公司"]
project_departments = ["第九项目管理部(马鞍山)", "第十一项目管理部(马鞍山)", "第八项目管理部(芜湖)",
"第五项目管理部(阜阳)", "第六项目管理部(滁州)", "第十二项目管理部(陕皖)",
"第十三项目管理部(黄山)", "第四项目管理部(安庆)"]
project_managers = ["陈少平", "范文立", "何东洋", "胡彬", "黄东林", "姜松竺", "刘闩", "柳杰"]
subcontractors = ["安徽远宏电力工程有限公司", "安徽京硚建设有限公司", "武汉久林电力建设有限公司",
"安徽省鸿钢建设发展有限公司", "安徽星联建筑安装有限公司", "福建文港建设工程有限公司",
"芜湖冉电电力安装工程有限责任公司", "合肥市胜峰建筑安装有限公司", "安徽劦力建筑装饰有限责任公司",
"安徽苏亚建设集团有限公司"]
team_leaders = ["李元帅", "刘雨豪", "马新欣", "任家泉", "王海峰", "王书民"]
risk_levels = ["1", "2", "3", "4", "5"]
labels = ["天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询", "日计划作业内容", "周计划作业内容",
"施工人数", "作业考勤人数", "知识问答"]
def generate_data(template_variables, variable_values, filename, label):
    samples = []
    for template, variables in template_variables.items():
        for values in product(*[variable_values[var] for var in variables]):
            text = template.format(**dict(zip(variables, values)))
            # Build the entity annotations for this sample
            annotations = []
            for var, val in zip(variables, values):
                # Note: find() locates only the first occurrence, so a value
                # that appears twice in the text is annotated once
                start = text.find(val)
                if start != -1:
                    entity = {"text": val, "start": start, "end": start + len(val), "label": var}
                    annotations.append(entity)
            samples.append({
                "text": text,
                "label": label,  # record the intent label alongside the annotations
                "annotations": annotations
            })
    # Save to a JSON file
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False, indent=2)
    print(f"Generated {len(samples)} samples and saved them to {filename}")
for label in labels:
    if label in ["日计划作业内容", "周计划作业内容"]:
        if label == "日计划作业内容":
            dates = ["今天", "昨天", "2024年5月24日", "5月24日", "5月24日", "24日"]
        else:
            dates = ["本周", "上一周"]
        # Define templates and the variables each one uses
        template_variables = {
            "{date}{project_name}作业内容是什么?": ["date", "project_name"],
            "{project_name}{date}作业内容是什么?": ["project_name", "date"],
            "{date}工程性质为{project_type}的工程作业内容是什么?": ["date", "project_type"],
            "工程性质为{project_type}的工程{date}作业内容是什么?": ["project_type", "date"],
            "{date}{construction_unit}工程作业内容是什么?": ["date", "construction_unit"],
            "{construction_unit}{date}工程作业内容是什么?": ["construction_unit", "date"],
            "{date}{implementation_organization}作业内容是什么?": ["date", "implementation_organization"],
            "{implementation_organization}{date}作业内容是什么?": ["implementation_organization", "date"],
            "{date}{project_department}作业内容是什么?": ["date", "project_department"],
            "{project_department}{date}作业内容是什么?": ["project_department", "date"],
            "{date}{project_manager}项目经理作业内容是什么?": ["date", "project_manager"],
            "{project_manager}项目经理{date}作业内容是什么?": ["project_manager", "date"],
            "{date}{subcontractor}作业内容是什么?": ["date", "subcontractor"],
            "{subcontractor}{date}作业内容是什么?": ["subcontractor", "date"],
            "{date}{team_leader}班组长作业内容是什么?": ["date", "team_leader"],
            "{team_leader}班组长{date}作业内容是什么?": ["team_leader", "date"],
            "{date}风险等级为{risk_level}级的工程作业内容是什么?": ["date", "risk_level"],
            "风险等级为{risk_level}级的工程{date}作业内容是什么?": ["risk_level", "date"],
            "{date}{project_name}风险等级为{risk_level}级的工程作业内容是什么?": ["date", "project_name", "risk_level"],
            "{project_name}风险等级为{risk_level}级的工程{date}作业内容是什么?": ["project_name", "risk_level", "date"],
            "{date}工程性质为{project_type}风险等级为{risk_level}级的工程作业内容是什么?": ["date", "project_type", "risk_level"],
            "{project_type}工程风险等级为{risk_level}级的工程{date}作业内容是什么?": ["project_type", "risk_level", "date"],
        }
        variable_values = {
            "date": dates,
            "project_name": project_names,
            "project_type": project_types,
            "construction_unit": construction_units,
            "implementation_organization": implementation_organizations,
            "project_department": project_departments,
            "project_manager": project_managers,
            "subcontractor": subcontractors,
            "team_leader": team_leaders,
            "risk_level": risk_levels
        }
        generate_data(template_variables, variable_values, f"{label}.json", label)
    if label in ["日计划数量查询", "周计划数量查询"]:
        if label == "日计划数量查询":
            dates = ["今天", "昨天", "2024年5月24日", "5月24日", "5月24日", "24日"]
        else:
            dates = ["本周", "上一周"]
        # Define templates and the variables each one uses
        template_variables = {
            "{date}{project_name}有多少作业计划?": ["date", "project_name"],
            "{project_name}{date}有多少作业计划?": ["project_name", "date"],
            "{date}工程性质为{project_type}的工程有多少作业计划?": ["date", "project_type"],
            "工程性质为{project_type}的工程{date}有多少作业计划?": ["project_type", "date"],
            "{date}{construction_unit}有多少作业计划?": ["date", "construction_unit"],
            "{construction_unit}{date}有多少作业计划?": ["construction_unit", "date"],
            "{date}{implementation_organization}有多少作业计划?": ["date", "implementation_organization"],
            "{implementation_organization}{date}有多少作业计划?": ["implementation_organization", "date"],
            "{date}{project_department}有多少作业计划?": ["date", "project_department"],
            "{project_department}{date}有多少作业计划?": ["project_department", "date"],
            "{date}{project_manager}项目经理有多少作业计划?": ["date", "project_manager"],
            "{project_manager}项目经理{date}有多少作业计划?": ["project_manager", "date"],
            "{date}{subcontractor}有多少作业计划?": ["date", "subcontractor"],
            "{subcontractor}{date}有多少作业计划?": ["subcontractor", "date"],
            "{date}{team_leader}班组长有多少作业计划?": ["date", "team_leader"],
            "{team_leader}班组长{date}有多少作业计划?": ["team_leader", "date"],
            "{date}风险等级为{risk_level}级的工程有多少作业计划?": ["date", "risk_level"],
            "风险等级为{risk_level}级的工程{date}有多少作业计划?": ["risk_level", "date"],
            "{date}{project_name}风险等级为{risk_level}级的工程有多少作业计划?": ["date", "project_name", "risk_level"],
            "{project_name}风险等级为{risk_level}级的工程{date}有多少作业计划?": ["project_name", "risk_level", "date"],
            "{date}工程性质为{project_type}风险等级为{risk_level}级的工程有多少作业计划?": ["date", "project_type", "risk_level"],
            "{project_type}工程风险等级为{risk_level}级的工程{date}有多少作业计划?": ["project_type", "risk_level", "date"],
        }
        variable_values = {
            "date": dates,
            "project_name": project_names,
            "project_type": project_types,
            "construction_unit": construction_units,
            "implementation_organization": implementation_organizations,
            "project_department": project_departments,
            "project_manager": project_managers,
            "subcontractor": subcontractors,
            "team_leader": team_leaders,
            "risk_level": risk_levels
        }
        generate_data(template_variables, variable_values, f"{label}.json", label)
    if label == "施工人数":
        dates = ["今天", "昨天", "2024年5月24日", "5月24日", "5月24日", "24日"]
        # Define templates and the variables each one uses
        template_variables = {
            "{date}{project_name}有多少施工人数?": ["date", "project_name"],
            "{project_name}{date}有多少施工人数?": ["project_name", "date"],
            "{date}工程性质为{project_type}的工程有多少施工人数?": ["date", "project_type"],
            "工程性质为{project_type}的工程{date}有多少施工人数?": ["project_type", "date"],
            "{date}{construction_unit}有多少施工人数?": ["date", "construction_unit"],
            "{construction_unit}{date}有多少施工人数?": ["construction_unit", "date"],
            "{date}{implementation_organization}有多少施工人数?": ["date", "implementation_organization"],
            "{implementation_organization}{date}有多少施工人数?": ["implementation_organization", "date"],
            "{date}{project_department}有多少施工人数?": ["date", "project_department"],
            "{project_department}{date}有多少施工人数?": ["project_department", "date"],
            "{date}{project_manager}项目经理有多少施工人数?": ["date", "project_manager"],
            "{project_manager}项目经理{date}有多少施工人数?": ["project_manager", "date"],
            "{date}{subcontractor}有多少施工人数?": ["date", "subcontractor"],
            "{subcontractor}{date}有多少施工人数?": ["subcontractor", "date"],
            "{date}{team_leader}班组长有多少施工人数?": ["date", "team_leader"],
            "{team_leader}班组长{date}有多少施工人数?": ["team_leader", "date"],
            "{date}风险等级为{risk_level}级的工程有多少施工人数?": ["date", "risk_level"],
            "风险等级为{risk_level}级的工程{date}有多少施工人数?": ["risk_level", "date"],
            "{date}{project_name}风险等级为{risk_level}级的工程有多少施工人数?": ["date", "project_name", "risk_level"],
            "{project_name}风险等级为{risk_level}级的工程{date}有多少施工人数?": ["project_name", "risk_level", "date"],
            "{date}工程性质为{project_type}风险等级为{risk_level}级的工程有多少施工人数?": ["date", "project_type", "risk_level"],
            "{project_type}工程风险等级为{risk_level}级的工程{date}有多少施工人数?": ["project_type", "risk_level", "date"],
        }
        variable_values = {
            "date": dates,
            "project_name": project_names,
            "project_type": project_types,
            "construction_unit": construction_units,
            "implementation_organization": implementation_organizations,
            "project_department": project_departments,
            "project_manager": project_managers,
            "subcontractor": subcontractors,
            "team_leader": team_leaders,
            "risk_level": risk_levels
        }
        generate_data(template_variables, variable_values, f"{label}.json", label)
    if label == "作业考勤人数":
        dates = ["今天", "昨天", "2024年5月24日", "5月24日", "5月24日", "24日"]
        # Define templates and the variables each one uses
        template_variables = {
            "{date}{project_name}有多少作业考勤人数?": ["date", "project_name"],
            "{project_name}{date}有多少作业考勤人数?": ["project_name", "date"],
            "{date}工程性质为{project_type}的工程有多少作业考勤人数?": ["date", "project_type"],
            "工程性质为{project_type}的工程{date}有多少作业考勤人数?": ["project_type", "date"],
            "{date}{construction_unit}有多少作业考勤人数?": ["date", "construction_unit"],
            "{construction_unit}{date}有多少作业考勤人数?": ["construction_unit", "date"],
            "{date}{implementation_organization}有多少作业考勤人数?": ["date", "implementation_organization"],
            "{implementation_organization}{date}有多少作业考勤人数?": ["implementation_organization", "date"],
            "{date}{project_department}有多少作业考勤人数?": ["date", "project_department"],
            "{project_department}{date}有多少作业考勤人数?": ["project_department", "date"],
            "{date}{project_manager}项目经理有多少作业考勤人数?": ["date", "project_manager"],
            "{project_manager}项目经理{date}有多少作业考勤人数?": ["project_manager", "date"],
            "{date}{subcontractor}有多少作业考勤人数?": ["date", "subcontractor"],
            "{subcontractor}{date}有多少作业考勤人数?": ["subcontractor", "date"],
            "{date}{team_leader}班组长有多少作业考勤人数?": ["date", "team_leader"],
            "{team_leader}班组长{date}有多少作业考勤人数?": ["team_leader", "date"],
            "{date}风险等级为{risk_level}级的工程有多少作业考勤人数?": ["date", "risk_level"],
            "风险等级为{risk_level}级的工程{date}有多少作业考勤人数?": ["risk_level", "date"],
            "{date}{project_name}风险等级为{risk_level}级的工程有多少作业考勤人数?": ["date", "project_name", "risk_level"],
            "{project_name}风险等级为{risk_level}级的工程{date}有多少作业考勤人数?": ["project_name", "risk_level", "date"],
            "{date}工程性质为{project_type}风险等级为{risk_level}级的工程有多少作业考勤人数?": ["date", "project_type", "risk_level"],
            "{project_type}工程风险等级为{risk_level}级的工程{date}有多少作业考勤人数?": ["project_type", "risk_level", "date"],
        }
        variable_values = {
            "date": dates,
            "project_name": project_names,
            "project_type": project_types,
            "construction_unit": construction_units,
            "implementation_organization": implementation_organizations,
            "project_department": project_departments,
            "project_manager": project_managers,
            "subcontractor": subcontractors,
            "team_leader": team_leaders,
            "risk_level": risk_levels
        }
        generate_data(template_variables, variable_values, f"{label}.json", label)
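The per-label files presumably get concatenated into uie/merged_data.json before the 7:3 split in uie/1.py below; a minimal merge sketch (the file list mirrors the labels generated above):

import json

merged = []
for name in ["日计划作业内容.json", "周计划作业内容.json", "日计划数量查询.json",
             "周计划数量查询.json", "施工人数.json", "作业考勤人数.json"]:
    with open(name, "r", encoding="utf-8") as f:
        merged.extend(json.load(f))
with open("merged_data.json", "w", encoding="utf-8") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)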

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

39
uie/1.py Normal file

@@ -0,0 +1,39 @@
import json

# Read a JSON file
def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Split one JSON file into two at a 7:3 ratio
def split_json(input_file, output_file1, output_file2):
    # Read the data
    data = load_json(input_file)
    # Compute the split point
    split_point = int(len(data) * 0.7)
    # Split the data
    data_part1 = data[:split_point]  # first 70%
    data_part2 = data[split_point:]  # last 30%
    # Save the two parts
    with open(output_file1, 'w', encoding='utf-8') as f1:
        json.dump(data_part1, f1, ensure_ascii=False, indent=4)
    with open(output_file2, 'w', encoding='utf-8') as f2:
        json.dump(data_part2, f2, ensure_ascii=False, indent=4)
    print(f"Data split 7:3 and saved to {output_file1} and {output_file2}")

# Input JSON file
input_file = 'merged_data.json'
# Output files
output_file1 = 'data_part1.json'
output_file2 = 'data_part2.json'
# Split and save
split_json(input_file, output_file1, output_file2)
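Because the generator writes samples grouped by template, a sequential 70/30 cut can leave the two parts with very different template mixes; shuffling first avoids that (a small sketch with a fixed seed, applied before split_json):

import random

data = load_json('merged_data.json')
random.Random(42).shuffle(data)  # deterministic shuffle before the 7:3 split
with open('merged_data.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)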

135975
uie/data/data_part1.json Normal file

File diff suppressed because it is too large

61821
uie/data/data_part2.json Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

197794
uie/merged_data.json Normal file

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

Binary file not shown.

66
uie/test_model.py Normal file

@@ -0,0 +1,66 @@
from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer
import paddle

# 1. Load the model and tokenizer
model_path = R"E:\workingSpace\PycharmProjects\Intention_dev\uie\uie_ner\checkpoint-4320"  # path to your trained checkpoint
model = ErnieForTokenClassification.from_pretrained(model_path)
tokenizer = ErnieTokenizer.from_pretrained(model_path)

# 2. Tokenize the input text (max_length with truncation, not max_len, is the tokenizer argument)
text = "5月24日金上-湖北线路工程川12标风险等级为8级的工程作业内容是什么"
inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pd")

# 3. Predict
model.eval()
with paddle.no_grad():
    logits = model(**inputs)
    predictions = paddle.argmax(logits, axis=-1)

# 4. Label mapping: 0 is the O tag; 1-10 are B- tags, 11-20 the matching I- tags
label_map = {
    0: 'O',  # non-entity
    1: 'B-date', 11: 'I-date',
    2: 'B-project_name', 12: 'I-project_name',
    3: 'B-project_type', 13: 'I-project_type',
    4: 'B-construction_unit', 14: 'I-construction_unit',
    5: 'B-implementation_organization', 15: 'I-implementation_organization',
    6: 'B-project_department', 16: 'I-project_department',
    7: 'B-project_manager', 17: 'I-project_manager',
    8: 'B-subcontractor', 18: 'I-subcontractor',
    9: 'B-team_leader', 19: 'I-team_leader',
    10: 'B-risk_level', 20: 'I-risk_level'
}

# 5. Decode the predictions
predicted_labels = predictions.numpy()[0]
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].numpy())
entities = []
current_entity = None
current_label = None
for token, label_id in zip(tokens, predicted_labels):
    label = label_map.get(label_id, "O")
    if label.startswith("B-"):  # start of a new entity
        if current_entity:
            entities.append({"text": "".join(current_entity), "label": current_label})
        current_entity = [token]
        current_label = label[2:]  # strip the "B-" prefix
    elif label.startswith("I-") and current_entity and label[2:] == current_label:
        current_entity.append(token)  # continue the current entity
    else:  # non-entity token
        if current_entity:
            entities.append({"text": "".join(current_entity), "label": current_label})
        current_entity = None
        current_label = None
# Flush the last entity, if any
if current_entity:
    entities.append({"text": "".join(current_entity), "label": current_label})

# Print the extracted entities
for entity in entities:
    print(f"Entity: {entity['text']}, Label: {entity['label']}")


@@ -1,66 +1,125 @@
import os
import json
import paddle
from paddlenlp.datasets import MapDataset
from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer
from paddlenlp.trainer import Trainer, TrainingArguments
from paddlenlp.data import DataCollatorForTokenClassification

# === 1. Load the data ===
def load_dataset(data_path):
    with open(data_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return MapDataset(data)

# === 2. Preprocess the data ===
def preprocess_function(example, tokenizer):
    # The predefined list of entity types
    entity_types = [
        'date', 'project_name', 'project_type', 'construction_unit',
        'implementation_organization', 'project_department', 'project_manager',
        'subcontractor', 'team_leader', 'risk_level'
    ]

    # Tokenize the text, keeping character offsets for alignment
    inputs = tokenizer(example["text"], max_length=512, truncation=True, return_offsets_mapping=True)
    offset_mapping = inputs["offset_mapping"]

    # Initialize label_ids (0 means the O tag)
    label_ids = [0] * len(offset_mapping)  # 0: O, 1-10: B-XXX, 11-20: I-XXX

    # Map each annotated entity onto token-level BIO labels
    if "annotations" in example:
        for entity in example["annotations"]:
            start, end, entity_label = entity["start"], entity["end"], entity["label"]
            # Skip labels outside the predefined entity types
            if entity_label not in entity_types:
                continue
            # Map the entity type to its B-tag id (1..10)
            entity_class = entity_types.index(entity_label) + 1
            # Walk the tokens and mark the ones inside the entity span
            entity_started = False  # whether the entity's B tag has been emitted
            for idx, (char_start, char_end) in enumerate(offset_mapping):
                token = inputs['input_ids'][idx]
                # Skip the special [CLS] and [SEP] tokens
                if token == tokenizer.cls_token_id or token == tokenizer.sep_token_id:
                    continue
                if char_start >= start and char_end <= end:
                    if not entity_started:
                        label_ids[idx] = entity_class  # B-entity
                        entity_started = True
                    else:
                        label_ids[idx] = entity_class + len(entity_types)  # I-entity

    # Attach the labels and drop the offsets, which the model does not take
    inputs["labels"] = label_ids
    del inputs["offset_mapping"]
    return inputs
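A hand-worked check of the BIO id scheme used above (illustrative):

# entity_types.index('project_manager') == 6  ->  entity_class = 7
# first sub-token of the span  ->  label id 7            (B-project_manager)
# following sub-tokens         ->  label id 7 + 10 = 17  (I-project_manager)
# total classes: 1 ("O") + 10 B tags + 10 I tags = 21, matching num_classes=21 below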
# === 3. Load the pretrained model ===
model = ErnieForTokenClassification.from_pretrained("uie-base", num_classes=21)  # 21 classes: O + 10 B + 10 I tags
tokenizer = ErnieTokenizer.from_pretrained("uie-base")

# === 4. Load the datasets ===
train_dataset = load_dataset("data/data_part1.json")  # training set
dev_dataset = load_dataset("data/data_part2.json")    # validation set

# === 5. Preprocess ===
train_dataset = train_dataset.map(lambda x: preprocess_function(x, tokenizer), lazy=False)
dev_dataset = dev_dataset.map(lambda x: preprocess_function(x, tokenizer), lazy=False)

# === 6. Batch collation ===
data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)

# === 7. Training arguments ===
training_args = TrainingArguments(
    output_dir="./uie_ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,  # raise or lower to fit GPU memory
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,  # keep only the 2 most recent checkpoints
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
)

# === 8. Train ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()

# Input specs for static-graph export
input_spec = [
    paddle.static.InputSpec(shape=[None, 512], dtype="int64", name="input_ids"),
    paddle.static.InputSpec(shape=[None, 512], dtype="int64", name="token_type_ids"),
    paddle.static.InputSpec(shape=[None, 512], dtype="int64", name="position_ids"),
    paddle.static.InputSpec(shape=[None, 512], dtype="float32", name="attention_mask")
]
# === 9. Save the model as a static graph ===
paddle.jit.save(model, "./saved_model_static", input_spec=input_spec)

# === 10. Save the model weights ===
trainer.save_model("./saved_model_static")  # weights can be reloaded later from this directory
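A minimal sketch of running the exported static graph (assumes the InputSpec order above; since the spec fixes the sequence length at 512, inputs are padded to that length):

import paddle
from paddlenlp.transformers import ErnieTokenizer

static_model = paddle.jit.load("./saved_model_static")
tok = ErnieTokenizer.from_pretrained("uie-base")
enc = tok("5月24日金上-湖北线路工程川12标作业内容是什么",
          max_length=512, padding="max_length", truncation=True,
          return_attention_mask=True)
logits = static_model(
    paddle.to_tensor([enc["input_ids"]], dtype="int64"),
    paddle.to_tensor([enc["token_type_ids"]], dtype="int64"),
    paddle.to_tensor([list(range(512))], dtype="int64"),         # position_ids
    paddle.to_tensor([enc["attention_mask"]], dtype="float32"),  # attention_mask
)
pred_ids = paddle.argmax(logits, axis=-1)  # token-level label ids, decoded as in uie/test_model.py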