Intention/generated_data/3.py

259 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
from itertools import product
# Base vocabularies: each list holds the candidate values that can be
# substituted for one placeholder when question templates are expanded.

# Internal branch companies / subsidiaries carrying out the work.
implementation_organizations = [
    "送电一分公司",
    "送电二分公司",
    "变电分公司",
    "建筑分公司",
    "消防分公司",
    "检修试验分公司",
    "安徽宏源电力建设有限公司",
    "安徽顺安电网建设有限公司",
]

# Project categories.
project_types = [
    "基建",
    "技改大修",
    "用户工程",
    "小型基建",
]

# Full project names.
project_names = [
    "国网北京检修公司2024年±500kV延庆换流站直流主设备年度检修维护",
    "合肥二电厂-彭郢π入长临河变电站220kV线路工程",
    "杨柳四铺π入况楼变110kV电缆线路工程",
    "安徽蚌埠濠州220kV变电站220千伏大唐凤阳红心镇光伏间隔扩建工程(电气安装)",
    "合肥轨道7号线10kV杆线迁改工程",
    "金上-湖北线路工程川12标",
    "六安汤池 110kV 变电站新建工程",
    "双港-独秀π入和平变电站220kV线路工程",
    "茗南-熙湖T接城南变电站110kV架空线路工程",
    "南屏-蓬莱路π入派河变电站110kV线路工程",
    "藕池-漆园π入杨柳变电站220kV线路工程",
    "芜湖站1000千伏1号主变A相局放配合项目",
    "埇桥-灵泗500kV线路工程",
    "月桥-火龙岗π入高村变电站220kV线路工程",
]

# Construction (owner) units.
# NOTE(review): the entry starting with "国家电" may be missing a character
# (State Grid is normally written "国家电网") - confirm against the source system.
construction_units = [
    "国网安徽省电力有限公司建设分公司",
    "国网安徽省电力有限公司马鞍山供电公司",
    "国网安徽省电力有限公司合肥供电公司",
    "国网安徽省电力有限公司阜阳供电公司",
    "国网安徽省电力有限公司滁州供电公司",
    "国网安徽省电力有限公司安庆供电公司",
    "国网安徽省电力有限公司黄山供电公司",
    "国网安徽省电力有限公司蚌埠供电公司",
    "国网安徽省电力有限公司池州供电公司",
    "国网安徽省电力有限公司六安供电公司",
    "国家电有限公司特高压建设分公司",
    "国网安徽省电力有限公司淮南供电公司",
    "国网安徽省电力有限公司宣城供电公司",
    "国网北京市电力公司",
    "国网安徽省电力有限公司宿州供电公司",
    "国网安徽省电力有限公司营销服务中心",
    "中国葛洲坝集团电力有限责任公司",
    "银联黄山园区开发有限公司",
    "淮南交通控股(集团)有限公司",
    "国网安徽省电力有限公司舒城县供电公司",
    "国网安徽省电力有限公司颍上县供电公司",
    "中铁二局集团电务工程有限公司",
    "国网四川省电力公司建设分公司",
]

# Project management departments.
project_departments = [
    "第九项目管理部(马鞍山)",
    "第十一项目管理部(马鞍山)",
    "第八项目管理部(芜湖)",
    "第五项目管理部(阜阳)",
    "第六项目管理部(滁州)",
    "第十二项目管理部(陕皖)",
    "第十三项目管理部(黄山)",
    "第四项目管理部(安庆)",
]

# Project managers; every value already carries the "项目经理" title suffix.
project_managers = [
    "陈少平项目经理",
    "范文立项目经理",
    "何东洋项目经理",
    "胡彬项目经理",
    "黄东林项目经理",
    "姜松竺项目经理",
    "刘闩项目经理",
    "柳杰项目经理",
]

# Subcontracting companies.
subcontractors = [
    "安徽远宏电力工程有限公司",
    "安徽京硚建设有限公司",
    "武汉久林电力建设有限公司",
    "安徽省鸿钢建设发展有限公司",
    "安徽星联建筑安装有限公司",
    "福建文港建设工程有限公司",
    "芜湖冉电电力安装工程有限责任公司",
    "合肥市胜峰建筑安装有限公司",
    "安徽劦力建筑装饰有限责任公司",
    "安徽苏亚建设集团有限公司",
]

# Team leaders; every value already carries the "班组长" title suffix.
team_leaders = [
    "李元帅班组长",
    "刘雨豪班组长",
    "马新欣班组长",
    "任家泉班组长",
    "王海峰班组长",
    "王书民班组长",
]

# Risk grades.
risk_levels = [
    "1级",
    "2级",
    "3级",
    "4级",
    "5级",
]

# Intent labels; the generation loop later in the file iterates over these.
labels = [
    "天气查询",
    "通用对话",
    "页面切换",
    "日计划数量查询",
    "周计划数量查询",
    "日计划作业内容",
    "周计划作业内容",
    "施工人数",
    "作业考勤人数",
    "知识问答",
]
import json
from itertools import product
def _format_with_offsets(formatter, template, mapping):
    """Render *template* with *mapping* and record where each field starts.

    Args:
        formatter: A ``string.Formatter`` instance used to parse and render.
        template: A ``str.format``-style template with named placeholders.
        mapping: Placeholder name -> value.

    Returns:
        A ``(text, offsets)`` pair: ``text`` is the rendered string and
        ``offsets`` maps each placeholder's field name to the index in
        ``text`` where its first occurrence was rendered.
    """
    pieces = []
    offsets = {}
    cursor = 0
    for literal, field, spec, conversion in formatter.parse(template):
        pieces.append(literal)
        cursor += len(literal)
        if field is None:
            # Trailing literal text with no placeholder after it.
            continue
        obj, _ = formatter.get_field(field, (), mapping)
        obj = formatter.convert_field(obj, conversion)
        # A format spec may itself contain replacement fields; expand it first.
        spec = formatter.vformat(spec, (), mapping) if spec else ""
        rendered = formatter.format_field(obj, spec)
        # Only the first occurrence of a placeholder is recorded.
        offsets.setdefault(field, cursor)
        pieces.append(rendered)
        cursor += len(rendered)
    return "".join(pieces), offsets


def generate_data(template_variables, variable_values, filename, label):
    """Expand question templates into annotated samples and dump them as JSON.

    Every template is filled with the Cartesian product of the value lists of
    its variables. Each sample is ``{"text": ..., "annotations": [...]}``,
    where an annotation records the substituted value, its ``start``/``end``
    character offsets in ``text`` and the variable name as ``label``.

    Offsets are taken from the position at which the placeholder was rendered
    rather than from ``text.find(value)``: ``find`` returns the first
    occurrence of the value anywhere in the sentence, which is the wrong span
    when the value also appears in the template's literal text or inside an
    earlier substituted value.

    Args:
        template_variables: Mapping of template string -> list of the variable
            names used by that template.
        variable_values: Mapping of variable name -> list of candidate values.
        filename: Path of the JSON file to write (UTF-8, overwritten).
        label: Intent label of this batch. Currently unused; kept so existing
            callers keep working.

    Returns:
        None. The samples are written to *filename* and a summary is printed.
    """
    # Function-scope import so the file's import section needs no change.
    from string import Formatter

    formatter = Formatter()
    samples = []
    for template, variables in template_variables.items():
        value_lists = [variable_values[var] for var in variables]
        for values in product(*value_lists):
            text, offsets = _format_with_offsets(
                formatter, template, dict(zip(variables, values))
            )
            # Build the entity annotations for this sentence.
            annotations = []
            for var, val in zip(variables, values):
                start = offsets.get(var, -1)
                if start == -1 or not text.startswith(val, start):
                    # The placeholder could not be tracked exactly (variable
                    # not present as a plain field, or rendered differently
                    # from the raw value): fall back to a plain search.
                    start = text.find(val)
                if start != -1:
                    annotations.append(
                        {"text": val, "start": start, "end": start + len(val), "label": var}
                    )
            samples.append({
                "text": text,
                "annotations": annotations,
            })
    # Save everything to the JSON file.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False, indent=2)
    print(f"共生成 {len(samples)} 条数据,并已保存为 {filename}")
# Date expressions used by the daily and the weekly intents.
# NOTE(review): "5月24日" is listed twice, so every sample for that date is
# generated twice. Kept exactly as in the original list - confirm whether a
# different spelling was intended for one of the two entries.
_DAILY_DATES = ["今天", "昨天", "2024年5月24日", "5月24日", "5月24日", "24日"]
_WEEKLY_DATES = ["本周", "上一周"]

# Sentence openings shared by every intent, paired with the variables they
# use (in order of appearance). The intent-specific question is appended to
# each opening; the order here fixes the order of the generated samples.
_QUESTION_PREFIXES = [
    ("{date}{project_name}", ["date", "project_name"]),
    ("{project_name}{date}", ["project_name", "date"]),
    ("{date}工程性质为{project_type}的工程", ["date", "project_type"]),
    ("工程性质为{project_type}的工程{date}", ["project_type", "date"]),
    ("{date}{construction_unit}", ["date", "construction_unit"]),
    ("{construction_unit}{date}", ["construction_unit", "date"]),
    ("{date}{implementation_organization}", ["date", "implementation_organization"]),
    ("{implementation_organization}{date}", ["implementation_organization", "date"]),
    ("{date}{project_department}", ["date", "project_department"]),
    ("{project_department}{date}", ["project_department", "date"]),
    ("{date}{project_manager}", ["date", "project_manager"]),
    ("{project_manager}{date}", ["project_manager", "date"]),
    ("{date}{subcontractor}", ["date", "subcontractor"]),
    ("{subcontractor}{date}", ["subcontractor", "date"]),
    # team_leaders values already end in "班组长", so no title is appended.
    ("{date}{team_leader}", ["date", "team_leader"]),
    ("{team_leader}{date}", ["team_leader", "date"]),
    ("{date}风险等级为{risk_level}的工程", ["date", "risk_level"]),
    ("风险等级为{risk_level}的工程{date}", ["risk_level", "date"]),
    ("{date}{project_name}风险等级为{risk_level}的工程", ["date", "project_name", "risk_level"]),
    ("{project_name}风险等级为{risk_level}的工程{date}", ["project_name", "risk_level", "date"]),
    ("{date}工程性质为{project_type}风险等级为{risk_level}的工程", ["date", "project_type", "risk_level"]),
    ("{project_type}工程风险等级为{risk_level}的工程{date}", ["project_type", "risk_level", "date"]),
]

# Intent label -> (date expressions, question ending, optional question
# ending used only by the construction-unit templates).
_LABEL_SPECS = {
    "日计划作业内容": (_DAILY_DATES, "作业内容是什么?", "工程作业内容是什么?"),
    "周计划作业内容": (_WEEKLY_DATES, "作业内容是什么?", "工程作业内容是什么?"),
    "日计划数量查询": (_DAILY_DATES, "有多少作业计划?", None),
    "周计划数量查询": (_WEEKLY_DATES, "有多少作业计划?", None),
    "施工人数": (_DAILY_DATES, "有多少施工人数?", None),
    "作业考勤人数": (_DAILY_DATES, "有多少作业考勤人数?", None),
}


def _build_templates(question, construction_unit_question=None):
    """Return the template -> variables mapping for one intent.

    Args:
        question: Question ending appended to every sentence opening.
        construction_unit_question: Ending used instead of *question* for the
            openings that contain ``{construction_unit}``; ``None`` means
            those openings use *question* as well.

    Returns:
        A dict, in ``_QUESTION_PREFIXES`` order, mapping each complete
        template string to a fresh list of its variable names.
    """
    templates = {}
    for prefix, variables in _QUESTION_PREFIXES:
        ending = question
        if construction_unit_question is not None and "construction_unit" in variables:
            ending = construction_unit_question
        templates[prefix + ending] = list(variables)
    return templates


# Generate one "<label>.json" file per intent that has templates defined.
for label in labels:
    spec = _LABEL_SPECS.get(label)
    if spec is None:
        # Labels without a spec (e.g. "天气查询", "通用对话") produce no file.
        continue
    dates, question, construction_unit_question = spec
    template_variables = _build_templates(question, construction_unit_question)
    variable_values = {
        "date": dates,
        "project_name": project_names,
        "project_type": project_types,
        "construction_unit": construction_units,
        "implementation_organization": implementation_organizations,
        "project_department": project_departments,
        "project_manager": project_managers,
        "subcontractor": subcontractors,
        "team_leader": team_leaders,
        "risk_level": risk_levels,
    }
    generate_data(template_variables, variable_values, f"{label}.json", label)