Intention/generated_data/generated.py

429 lines
24 KiB
Python

import json
import os
from itertools import product
# Output directory for the generated JSON datasets.
directory = "data"
# Create it if needed. exist_ok=True avoids the check-then-create race of a
# separate os.path.exists() test and is a no-op when the directory exists.
os.makedirs(directory, exist_ok=True)
# Candidate slot values used to fill the sentence templates.
BASE_DATA = {
    # Implementing organizations
    "implementation_organizations": [
        "送电一分公司",
        "送电二分公司",
        "变电分公司",
        "消防分公司",
    ],
    # Project nature / type
    "project_types": [
        "基建",
        "技改大修",
        "用户工程",
        "小型基建",
    ],
    # Project names
    "project_names": [
        "1号工程",
        "淮南芦集改造工程",
        "第十号工程",
        "合肥二电厂220kV线路工程",
        "九号工程",
    ],
    # Construction management units
    "construction_units": [
        "国网安徽省电力有限公司建设分公司",
        "国网安徽省电力有限公司马鞍山供电公司",
        "中铁二局集团电务工程有限公司",
    ],
    # Project department names
    "project_departments": [
        "第9项目管理部",
        "第十一项目部",
        "第八项目管理部",
        "9号项目部",
    ],
    # Project managers
    "project_managers": [
        "陈少平项目经理",
        "范文立项目经理",
        "何东洋项目经理",
    ],
    # Subcontractors
    "subcontractors": [
        "安徽劦力建筑装饰有限责任公司",
        "安徽苏亚建设集团有限公司",
    ],
    # Team names
    "team_names": [
        "张朵班组",
        "刘梁玉班组",
        "魏玉龙班组",
    ],
    # Team leaders
    "team_leaders": [
        "李元帅班组长",
        "刘雨豪班组长",
    ],
    # Risk levels (both digit and Chinese-numeral spellings)
    "risk_levels": [
        "1级",
        "一级",
        "二级",
        "5级",
        "四级",
    ],
    # "8+2" operating condition, in two spellings
    "operatings": [
        "8+2工况",
        "8加2工况",
    ],
    # Pages that can be switched to
    "pages": [
        "风险管控",
        "日计划",
        "周风险",
        "日计划统计报表",
        "日计划推送",
    ],
}
# Natural-language template configuration.
#
# Maps each intent label to:
#   "date"      - candidate date expressions substituted for the {date} slot
#   "templates" - (template, variables) pairs; `variables` lists the slots of
#                 the template in the order they appear in the text
#
# Each template appears at most once per intent: a repeated template would
# only emit byte-identical duplicate samples for that intent.
TEMPLATE_CONFIG = {
    # Intent: number of daily work plans
    "日计划数量查询": {
        "date": ["今日", "昨日", "2024年5月24日", "5月24日", "今天", "昨天"],
        "templates": [
            ("{date}{project_name}有多少作业计划?", ["date", "project_name"]),
            ("{project_name}{date}有多少项作业计划?", ["project_name", "date"]),
            ("工程性质是{project_type}{date}有多少作业计划?", ["project_type", "date"]),
            ("{date}风险等级为{risk_level}的作业计划有多少?", ["date", "risk_level"]),
            # Count question; it belongs to this intent, not to the daily
            # work-content intent.
            ("{date}有多少项{risk_level}风险作业计划?", ["date", "risk_level"]),
            ("{date}工程性质为{project_type}的有多少作业计划?", ["date", "project_type"]),
            ("工程性质为{project_type}{date}有多少作业计划?", ["project_type", "date"]),
            ("{project_name}{date}的作业计划数量", ["project_name", "date"]),
            ("{date}{project_type}类作业计划有多少?", ["date", "project_type"]),
            ("{project_type}{date}作业计划有多少?", ["project_type", "date"]),
            ("{construction_unit}{date}有多少作业计划?", ["construction_unit", "date"]),
            ("{date}{construction_unit}有多少作业计划?", ["date", "construction_unit"]),
            ("{date}有多少作业计划?", ["date"]),
            ("公司{date}有多少作业计划?", ["date"]),
            ("{date}属于{operating}有多少作业计划?", ["date", "operating"]),
            ("{date}{implementation_organization}有多少作业计划?", ["date", "implementation_organization"]),
            ("{date}{project_department}有多少作业计划?", ["date", "project_department"]),
            ("{project_department}{date}有多少{risk_level}风险作业计划?", ["project_department", "date", "risk_level"]),
            ("{date}{project_manager}有多少作业计划?", ["date", "project_manager"]),
            ("{date}{subcontractor}有多少作业计划?", ["date", "subcontractor"]),
            ("{date}{team_leader}有多少作业计划?", ["date", "team_leader"]),
            ("{date}{project_department}有多少{risk_level}风险作业计划?", ["date", "project_department", "risk_level"]),
            ("{date}{project_type}类风险等级为{risk_level}的作业计划有多少?", ["date", "project_type", "risk_level"]),
            ("{date}{construction_unit}有多少{risk_level}风险作业计划?", ["date", "construction_unit", "risk_level"]),
            ("{date}{project_type}{construction_unit}负责的作业计划有多少?", ["date", "project_type", "construction_unit"]),
            ("{date}{project_type}{implementation_organization}组织实施的作业计划有多少?", ["date", "project_type", "implementation_organization"]),
            ("{date}{project_department}管理的{project_type}类作业计划有多少?", ["date", "project_department", "project_type"]),
            ("{date}{subcontractor}承包的{project_type}类作业计划有多少?", ["date", "subcontractor", "project_type"]),
            ("{date}{project_manager}负责的{project_type}类作业计划有多少?", ["date", "project_manager", "project_type"]),
            ("{date}{team_leader}带领的{project_type}类作业计划有多少?", ["date", "team_leader", "project_type"]),
            ("{date}{project_name}{project_manager}作业计划有多少?", ["date", "project_name", "project_manager"]),
            ("{date}{project_name}中,风险等级为{risk_level}的作业计划有多少?", ["date", "project_name", "risk_level"]),
            ("{date}{project_manager}作业计划有多少?", ["date", "project_manager"]),
            ("{project_manager}{date}作业计划有多少?", ["project_manager", "date"]),
            ("{date}{project_manager}的作业计划数量", ["date", "project_manager"]),
            ("{project_manager}{date}的作业计划数量", ["project_manager", "date"]),
            # Team
            ("{date}{team_name}有多少项作业计划?", ["date", "team_name"]),
            ("{team_name}{date}有多少作业计划?", ["team_name", "date"]),
            ("{team_name}{date}作业计划数量", ["team_name", "date"]),
            ("{date}{team_name}作业计划数量", ["date", "team_name"]),
        ],
    },
    # Intent: number of weekly work plans
    "周计划数量查询": {
        "date": ["本周", "上周", "上一周", "下周", "下一周", "最近一周", "本周内", "这一周"],
        "templates": [
            ("{date}{project_name}作业计划有多少?", ["date", "project_name"]),
            ("{project_name}{date}作业计划有多少?", ["project_name", "date"]),
            ("{construction_unit}{date}作业计划有多少?", ["construction_unit", "date"]),
            # date only
            ("{date}作业计划有多少?", ["date"]),
            # date + one other slot
            ("{date}{project_name}有多少项作业计划?", ["date", "project_name"]),
            ("{date}{construction_unit}作业计划有多少?", ["date", "construction_unit"]),
            ("{date}{implementation_organization}作业计划有多少?", ["date", "implementation_organization"]),
            ("{date}{project_department}作业计划有多少?", ["date", "project_department"]),
            ("{date}{project_manager}作业计划有多少?", ["date", "project_manager"]),
            ("{date}{subcontractor}作业计划有多少?", ["date", "subcontractor"]),
            ("{date}{team_leader}作业计划有多少?", ["date", "team_leader"]),
            ("{date}{project_department}作业计划数量", ["date", "project_department"]),
            ("{date}{subcontractor}作业计划数量?", ["date", "subcontractor"]),
            # date + risk_level
            ("{date}有多少{risk_level}风险作业计划?", ["date", "risk_level"]),
            # construction_unit + date + risk_level
            ("{construction_unit}{date}有多少项{risk_level}风险作业计划", ["construction_unit", "date", "risk_level"]),
            # date + implementation_organization + risk_level
            ("{date}{implementation_organization}风险等级为{risk_level}的作业计划有多少?", ["date", "implementation_organization", "risk_level"]),
            # date + project_name + project_manager
            ("{date}{project_name}{project_manager}负责的作业计划有多少?", ["date", "project_name", "project_manager"]),
            # date + project_name + risk_level
            ("{date}{project_name}有多少项{risk_level}风险作业计划?", ["date", "project_name", "risk_level"]),
            # project_manager first
            ("{project_manager}{date}作业计划数量?", ["project_manager", "date"]),
            ("{project_manager}{date}作业计划有多少?", ["project_manager", "date"]),
            ("{project_manager}{date}负责的风险等级为{risk_level}的作业计划有多少?", ["project_manager", "date", "risk_level"]),
            # Team
            ("{date}{team_name}有多少项作业计划?", ["date", "team_name"]),
            ("{team_name}{date}有多少作业计划?", ["team_name", "date"]),
            ("{team_name}{date}作业计划数量", ["team_name", "date"]),
            ("{date}{team_name}的作业计划数量", ["date", "team_name"]),
        ],
    },
    # Intent: content of daily work plans
    "日计划作业内容": {
        "date": ["今日", "昨日", "2024年5月24日", "5月24日", "今天", "昨天"],
        "templates": [
            ("{date}{project_name}作业内容是什么?", ["date", "project_name"]),
            ("{project_name}{date}作业内容是什么", ["project_name", "date"]),
            ("{date}{project_type}类作业内容是什么?", ["date", "project_type"]),
            ("{project_type}{date}作业内容是什么?", ["project_type", "date"]),
            ("{date}工程性质为{project_type}的作业内容是什么?", ["date", "project_type"]),
            ("工程性质为{project_type}{date}作业内容是什么?", ["project_type", "date"]),
            ("{construction_unit}{date}作业内容是什么?", ["construction_unit", "date"]),
            ("{date}属于{operating}作业内容是什么?", ["date", "operating"]),
            # date + project type, "plan" wording
            ("{date}{project_type}类计划作业内容是什么?", ["date", "project_type"]),
            ("{date}{construction_unit}{risk_level}风险的作业内容是什么?", ["date", "construction_unit", "risk_level"]),
            ("{date}{implementation_organization}{risk_level}风险的作业内容是什么?", ["date", "implementation_organization", "risk_level"]),
            # project manager + date
            ("{project_manager}{date}作业内容是什么?", ["project_manager", "date"]),
            # date + risk level
            ("{date}风险等级为{risk_level}的作业内容是什么?", ["date", "risk_level"]),
            # implementing organization + date
            ("{implementation_organization}{date}作业内容是什么?", ["implementation_organization", "date"]),
            # team leader + date
            ("{team_leader}{date}作业内容是什么?", ["team_leader", "date"]),
            # date + project type + risk level
            ("{date}{project_type}类风险等级为{risk_level}的作业内容是什么?", ["date", "project_type", "risk_level"]),
            # project manager + date + risk level
            ("{project_manager}{date}的风险等级为{risk_level}的作业内容是什么?", ["project_manager", "date", "risk_level"]),
            # date only
            ("{date}作业内容是什么?", ["date"]),
            # Team
            ("{date}{team_name}作业内容是什么?", ["date", "team_name"]),
            ("{team_name}{date}作业内容", ["team_name", "date"]),
        ],
    },
    # Intent: content of weekly work plans
    "周计划作业内容": {
        "date": ["本周", "上周", "上一周", "下周", "下一周", "最近一周", "本周内", "这一周"],
        "templates": [
            ("工程性质为{project_type}{date}作业内容是什么?", ["project_type", "date"]),
            ("{date}工程性质为{project_type}作业内容是什么?", ["date", "project_type"]),
            ("{date}{construction_unit}作业内容是什么?", ["date", "construction_unit"]),
            ("{implementation_organization}{date}作业内容是什么?", ["implementation_organization", "date"]),
            # project name + week
            ("{project_name}{date}作业内容是什么?", ["project_name", "date"]),
            # week + project type
            ("{date}{project_type}类作业内容是什么?", ["date", "project_type"]),
            # construction unit + week
            ("{construction_unit}{date}作业内容是什么?", ["construction_unit", "date"]),
            # project manager + week
            ("{project_manager}{date}作业内容是什么?", ["project_manager", "date"]),
            # team leader + week
            ("{team_leader}{date}作业内容是什么?", ["team_leader", "date"]),
            # week + project type + risk level
            ("{date}{project_type}类并且风险等级为{risk_level}的作业内容是什么?", ["date", "project_type", "risk_level"]),
            # week + risk level
            ("{date}风险等级为{risk_level}的作业内容是什么?", ["date", "risk_level"]),
            ("{date}{risk_level}风险的作业内容是什么?", ["date", "risk_level"]),
            # team led by a given leader + week
            ("{team_leader}领导的团队在{date}作业内容是什么?", ["team_leader", "date"]),
            # project department + week
            ("{project_department}{date}作业内容是什么?", ["project_department", "date"]),
            # Team
            ("{date}{team_name}作业内容是什么", ["date", "team_name"]),
            ("{team_name}{date}作业内容", ["team_name", "date"]),
        ],
    },
    # Intent: number of construction workers
    "施工人数": {
        "date": ["今日", "昨日", "2024年5月24日", "5月24日", "今天", "昨天"],
        "templates": [
            ("{date}{project_name}施工人员有多少?", ["date", "project_name"]),
            ("{date}{project_name}施工人数是多少?", ["date", "project_name"]),
            ("{construction_unit}{date}的施工人数是多少?", ["construction_unit", "date"]),
            # "统计" (tally) prefix + construction unit + date
            ("统计{construction_unit}{date}的施工人数是多少?", ["construction_unit", "date"]),
            ("{date}属于{operating}的施工人数是多少?", ["date", "operating"]),
            # date + project type
            ("{date}{project_type}类有多少施工人员?", ["date", "project_type"]),
            ("{date}工程性质为{project_type}有多少施工人员?", ["date", "project_type"]),
            ("{date}工程性质为{project_type}的施工人数是多少?", ["date", "project_type"]),
            ("工程性质为{project_type}{date}的施工人数是多少?", ["project_type", "date"]),
            ("工程性质为{project_type}{date}有多少施工人员?", ["project_type", "date"]),
            # project manager + date
            ("{project_manager}负责的项目在{date}的施工人数是多少?", ["project_manager", "date"]),
            ("{date}{project_manager}负责的项目的施工人数是多少?", ["date", "project_manager"]),
            # subcontractor + date
            ("{subcontractor}{date}的施工人员有多少?", ["subcontractor", "date"]),
            ("{subcontractor}{date}的施工人数是多少?", ["subcontractor", "date"]),
            ("{date}{subcontractor}的施工人员有多少?", ["date", "subcontractor"]),
            ("{date}{subcontractor}的施工人数是多少?", ["date", "subcontractor"]),
            # team leader + date
            ("{team_leader}{date}的施工人员有多少?", ["team_leader", "date"]),
            ("{team_leader}{date}的施工人数是多少?", ["team_leader", "date"]),
            ("{date}{team_leader}的施工人员有多少?", ["date", "team_leader"]),
            ("{date}{team_leader}的施工人数是多少?", ["date", "team_leader"]),
            # implementing organization + date
            ("{implementation_organization}{date}的施工人数是多少?", ["implementation_organization", "date"]),
            ("{implementation_organization}{date}的施工人员有多少?", ["implementation_organization", "date"]),
            # project department + date
            ("{project_department}{date}的施工人员有多少?", ["project_department", "date"]),
            ("{project_department}{date}的施工人数是多少?", ["project_department", "date"]),
            # date + risk level
            ("{date}{risk_level}风险的施工人数是多少?", ["date", "risk_level"]),
            # construction unit + date + risk level
            ("{construction_unit}{date}风险等级为{risk_level}的施工人数是多少?", ["construction_unit", "date", "risk_level"]),
            # Team
            ("{date}{team_name}施工人数是多少", ["date", "team_name"]),
            ("{date}{team_name}施工人数", ["date", "team_name"]),
            ("{team_name}{date}施工人数是多少", ["team_name", "date"]),
            ("{team_name}{date}施工人数", ["team_name", "date"]),
        ],
    },
    # Intent: work attendance head count
    "作业考勤人数": {
        "date": ["今日", "昨日", "2024年5月24日", "5月24日", "今天", "昨天"],
        "templates": [
            ("{date}{project_name}作业考勤人数是多少", ["date", "project_name"]),
            ("{project_name}{date}作业考勤人数是多少", ["project_name", "date"]),
            ("查询{subcontractor}{date}的考勤人数", ["subcontractor", "date"]),
            ("{subcontractor}{date}的作业考勤人数是多少", ["subcontractor", "date"]),
            ("{date}属于{operating}的作业考勤人数是多少?", ["date", "operating"]),
            ("{team_leader}{date}的作业考勤人数是多少", ["team_leader", "date"]),
            # "统计" (tally) prefix + construction unit + date
            ("统计{construction_unit}{date}的考勤人数", ["construction_unit", "date"]),
            # implementing organization + date
            ("{implementation_organization}{date}的考勤情况如何?", ["implementation_organization", "date"]),
            # date + risk level
            ("{date}{risk_level}风险项目考勤详情", ["date", "risk_level"]),
            # date + project type
            ("{date}{project_type}类出勤人数是多少?", ["date", "project_type"]),
            # project name + date
            ("{project_name}{date}的出勤人数是多少?", ["project_name", "date"]),
            # subcontractor + date
            ("{subcontractor}{date}的出勤情况如何?", ["subcontractor", "date"]),
            # Team
            ("{date}{team_name}考勤人数是多少", ["date", "team_name"]),
            ("{team_name}{date}考勤人数", ["team_name", "date"]),
        ],
    },
    # Intent: page switching
    "页面切换": {
        # No template here uses {date}, but generate_natural_samples reads
        # config["date"] unconditionally, so the key must be present.
        "date": ["今日", "昨日", "2024年5月24日", "5月24日", "今天", "昨天"],
        "templates": [
            ("打开{page}页面", ["page"]),
            ("打开{page}", ["page"]),
            ("打开{page}模块", ["page"]),
            ("进入{page}", ["page"]),
            ("进入{page}模块", ["page"]),
            ("进入{page}页面", ["page"]),
            ("跳转到{page}", ["page"]),
            ("跳转到{page}模块", ["page"]),
            ("跳转到{page}页面", ["page"]),
            ("访问{page}页面", ["page"]),
            ("访问{page}模块", ["page"]),
            ("访问{page}", ["page"]),
            ("请打开{page}模块", ["page"]),
            ("请打开{page}", ["page"]),
            ("显示{page}页面", ["page"]),
            ("加载{page}模块", ["page"]),
            ("加载{page}", ["page"]),
            ("加载{page}页面", ["page"]),
        ],
    },
}
def generate_natural_samples(config, label):
    """Generate annotated natural-language samples for one intent and save them.

    Every template in ``config["templates"]`` is expanded with the Cartesian
    product of the candidate values of its variables. Each rendered sentence
    is stored with the character span of every substituted value, and the
    whole list is written to ``<directory>/<label>.json``.

    Args:
        config: Mapping with a ``"date"`` list (candidate date expressions)
            and a ``"templates"`` list of ``(template, variables)`` pairs.
            ``template`` is a ``str.format`` string with simple named slots;
            ``variables`` names the slots whose candidates are combined.
        label: Intent label. Stored as each sample's ``"prompt"`` and used as
            the output file stem.

    Returns:
        The list of generated sample dicts, each with the keys ``"text"``,
        ``"annotations"`` and ``"prompt"``.

    Raises:
        KeyError: If ``variables`` names a slot with no candidate pool, or a
            template uses a slot that is missing from its ``variables``.
    """
    # Function-scope import so the file-level import block stays untouched.
    from string import Formatter

    variable_pool = {
        "project_name": BASE_DATA["project_names"],
        "project_type": BASE_DATA["project_types"],
        "construction_unit": BASE_DATA["construction_units"],
        "implementation_organization": BASE_DATA["implementation_organizations"],
        "subcontractor": BASE_DATA["subcontractors"],
        "team_leader": BASE_DATA["team_leaders"],
        "risk_level": BASE_DATA["risk_levels"],
        "date": config["date"],
        "project_department": BASE_DATA["project_departments"],
        "project_manager": BASE_DATA["project_managers"],
        "page": BASE_DATA["pages"],
        "operating": BASE_DATA["operatings"],
        "team_name": BASE_DATA["team_names"],
    }

    formatter = Formatter()
    samples = []
    for template, variables in config["templates"]:
        # Split the template once into (literal, slot, spec, conversion)
        # pieces; it is the same for every value combination.
        segments = list(formatter.parse(template))
        candidates = [variable_pool[var] for var in variables]
        for values in product(*candidates):
            bindings = dict(zip(variables, values))
            pieces = []
            annotations = []
            cursor = 0  # length of the text rendered so far
            for literal, slot, spec, conversion in segments:
                pieces.append(literal)
                cursor += len(literal)
                if slot is None:
                    continue  # trailing literal text, nothing substituted
                rendered = formatter.format_field(
                    formatter.convert_field(bindings[slot], conversion), spec
                )
                # The span is recorded while rendering, so it is exact even
                # when the value also occurs elsewhere in the sentence. A
                # str.find() search on the finished text could bind to the
                # wrong occurrence or miss the value entirely.
                annotations.append({
                    "text": rendered,
                    "start": cursor,
                    "end": cursor + len(rendered),
                    "label": slot,
                })
                pieces.append(rendered)
                cursor += len(rendered)
            samples.append({
                "text": "".join(pieces),
                "annotations": annotations,
                "prompt": label,
            })

    # Save next to the other datasets; make sure the directory is there even
    # if it was removed after the module was imported.
    os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, f"{label}.json")
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False, indent=2)
    print(f"已生成 {len(samples)} 条自然语言 {label} 数据")
    return samples
# Script entry point: write one JSON dataset per configured intent.
if __name__ == "__main__":
    for label in TEMPLATE_CONFIG:
        generate_natural_samples(TEMPLATE_CONFIG[label], label)