import json import os from itertools import product # 目录路径 directory = "data" # 确保目录存在 if not os.path.exists(directory): os.makedirs(directory) # 基础数据定义 BASE_DATA = { #实施组织 "implementation_organizations": ["送电一分公司", "送电二分公司", "变电分公司", "消防分公司"], #工程性质 "project_types": ["基建", "技改大修", "用户工程", "小型基建"], #工程名 "project_names": [ "1号工程", "淮南芦集改造工程", "第十号工程", "合肥二电厂220kV线路工程", "九号工程", ], #建管单位 "construction_units": ["国网安徽省电力有限公司建设分公司", "国网安徽省电力有限公司马鞍山供电公司", "中铁二局集团电务工程有限公司"], #项目部名称 "project_departments": ["第9项目管理部", "第十一项目部", "第八项目管理部","9号项目部"], #项目经理 "project_managers": ["陈少平项目经理", "范文立项目经理", "何东洋项目经理"], #分包单位 "subcontractors": ["安徽劦力建筑装饰有限责任公司", "安徽苏亚建设集团有限公司"], #班组名称 "team_names": ["张朵班组", "刘梁玉班组","魏玉龙班组"], #班组长 "team_leaders": ["李元帅班组长", "刘雨豪班组长"], #风险等级 "risk_levels": ["1级", "一级", "二级", "5级","四级"], #8+2工况 "operatings": ["8+2工况","8加2工况"], #页面切换 "pages": ["风险管控", "日计划", "周风险" ,"日计划统计报表","日计划推送"] } # 自然语言模板配置 TEMPLATE_CONFIG = { "日计划数量查询": { "date": ["今日", "昨日", "2024年5月24日", "5月24日","今天","昨天"], "templates": [ ("{date}{project_name}有多少作业计划?", ["date", "project_name"]), ("{project_name}{date}有多少项作业计划?", ["project_name","date"]), ("工程性质是{project_type}{date}有多少作业计划?", ["project_type", "date"]), ("{date}风险等级为{risk_level}的作业计划有多少?", ["date", "risk_level"]), ("{date}工程性质为{project_type}的有多少作业计划?", ["date", "project_type"]), ("工程性质为{project_type}{date}有多少作业计划?", ["project_type", "date"]), ("{project_name}在{date}的作业计划数量", ["project_name", "date"]), ("{date}{project_type}类作业计划有多少?", ["date", "project_type"]), ("{project_type}类{date}作业计划有多少?", ["project_type", "date"]), ("{construction_unit}在{date}有多少作业计划?", ["construction_unit", "date"]), ("{date}{construction_unit}有多少作业计划?", ["date", "construction_unit"]), ("{date}有多少作业计划?", ["date"]), ("公司{date}有多少作业计划?", ["date"]), ("{date}属于{operating}有多少作业计划?", ["date","operating"]), ("{date}{implementation_organization}有多少作业计划?", ["date", "implementation_organization"]), ("{date}{project_department}有多少作业计划?", ["date", "project_department"]), ("{project_department}{date}有多少{risk_level}风险作业计划?", ["project_department","date","risk_level"]), ("{date}{project_manager}有多少作业计划?", ["date", "project_manager"]), ("{date}{subcontractor}有多少作业计划?", ["date", "subcontractor"]), ("{date}{team_leader}有多少作业计划?", ["date", "team_leader"]), ("{date}风险等级为{risk_level}的作业计划有多少?", ["date", "risk_level"]), ("{date}{project_department}有多少{risk_level}风险作业计划?", ["date","project_department", "risk_level"]), ("{date}{project_type}类风险等级为{risk_level}的作业计划有多少?",["date", "project_type", "risk_level"]), ("{date}{construction_unit}有多少{risk_level}风险作业计划?",["date", "construction_unit", "risk_level"]), ("{date}{project_type}类{construction_unit}负责的作业计划有多少?",["date", "project_type", "construction_unit"]), ("{date}{project_type}类{implementation_organization}组织实施的作业计划有多少?",["date", "project_type", "implementation_organization"]), ("{date}{project_department}管理的{project_type}类作业计划有多少?",["date", "project_department", "project_type"]), ("{date}{subcontractor}承包的{project_type}类作业计划有多少?",["date", "subcontractor", "project_type"]), ("{date}{project_manager}负责的{project_type}类作业计划有多少?",["date", "project_manager", "project_type"]), ("{date}{team_leader}带领的{project_type}类作业计划有多少?", ["date", "team_leader", "project_type"]), ("{date}{project_name}由{project_manager}作业计划有多少?",["date", "project_name", "project_manager"]), ("{date}{project_name}中,风险等级为{risk_level}的作业计划有多少?", ["date", "project_name", "risk_level"]), ("{date}{project_manager}作业计划有多少?", ["date","project_manager"]), ("{project_manager}在{date}作业计划有多少?", ["project_manager", "date"]), ("{date}{project_manager}的作业计划数量", ["date","project_manager"]), ("{project_manager}在{date}的作业计划数量", ["project_manager", "date"]), #班组 ("{date}{team_name}有多少项作业计划?", ["date", "team_name"]), ("{team_name}{date}有多少作业计划?", ["team_name","date"]), ("{team_name}{date}作业计划数量", ["team_name","date"]), ("{date}{team_name}作业计划数量", ["date","team_name"]), ] }, "周计划数量查询": { "date": ["本周", "上周","上一周", "下周", "下一周", "最近一周", "本周内", "这一周"], "templates": [ ("{date}{project_name}作业计划有多少?", ["date", "project_name"]), ("{project_name}{date}作业计划有多少?", ["project_name", "date"]), ("{construction_unit}{date}作业计划有多少?", ["construction_unit", "date"]), # 🎯 仅 date 维度 ("{date}作业计划有多少?", ["date"]), # 🎯 date + 其他单个维度 ("{date}{project_name}有多少项作业计划?", ["date", "project_name"]), ("{date}{construction_unit}作业计划有多少?", ["date", "construction_unit"]), ("{date}{implementation_organization}作业计划有多少?", ["date", "implementation_organization"]), ("{date}{project_department}作业计划有多少?", ["date", "project_department"]), ("{date}{project_manager}作业计划有多少?", ["date", "project_manager"]), ("{date}{subcontractor}作业计划有多少?", ["date", "subcontractor"]), ("{date}{team_leader}作业计划有多少?", ["date", "team_leader"]), ("{date}{project_department}作业计划数量", ["date", "project_department"]), ("{date}{subcontractor}作业计划数量?", ["date", "subcontractor"]), # 🎯 date + 风险维度 ("{date}有多少{risk_level}风险作业计划?", ["date", "risk_level"]), # 🎯 date + construction_unit + risk_level ("{construction_unit}{date}有多少项{risk_level}风险作业计划", ["construction_unit", "date", "risk_level"]), # 🎯 date + implementation_organization + risk_level ("{date}{implementation_organization}风险等级为{risk_level}的作业计划有多少?",["date", "implementation_organization", "risk_level"]), # 🎯 date + project_name + project_manager ("{date}{project_name}{project_manager}负责的作业计划有多少?", ["date", "project_name", "project_manager"]), # 🎯 date + project_name + risk_level ("{date}{project_name}有多少项{risk_level}风险作业计划?", ["date", "project_name", "risk_level"]), # 🎯 project_manager 维度 ("{project_manager}{date}作业计划数量?", ["project_manager", "date"]), ("{project_manager}在{date}作业计划有多少?", ["project_manager", "date"]), ("{project_manager}在{date}负责的风险等级为{risk_level}的作业计划有多少?", ["project_manager", "date", "risk_level"]), ("{date}{team_name}有多少项作业计划?", ["date", "team_name"]), ("{team_name}{date}有多少作业计划?", ["team_name","date"]), ("{team_name}{date}作业计划数量", ["team_name","date"]), ("{date}{team_name}的作业计划数量", ["date","team_name"]), ] }, "日计划作业内容": { "date": ["今日", "昨日", "2024年5月24日", "5月24日","今天","昨天"], "templates": [ ("{date}{project_name}作业内容是什么?", ["date", "project_name"]), ("{project_name}在{date}作业内容是什么", ["project_name", "date"]), ("{date}{project_type}类作业内容是什么?", ["date", "project_type"]), ("{project_type}类{date}作业内容是什么?", ["project_type", "date"]), ("{date}工程性质为{project_type}的作业内容是什么?", ["date", "project_type"]), ("工程性质为{project_type}的{date}作业内容是什么?", ["project_type", "date"]), ("{construction_unit}在{date}作业内容是什么?", ["construction_unit", "date"]), # 1. 查询特定日期和项目的作业安排 ("{date}{project_name}作业内容是什么?", ["date", "project_name"]), ("{date}属于{operating}作业内容是什么?", ["date", "operating"]), # 3. 查询特定日期和项目类型的工程计划 ("{date}{project_type}类计划作业内容是什么?", ["date", "project_type"]), ("{date}{construction_unit}{risk_level}风险的作业内容是什么?",["date", "construction_unit", "risk_level"]), ("{date}{implementation_organization}{risk_level}风险的作业内容是什么?",["date", "implementation_organization", "risk_level"]), # 5. 查询特定日期和项目经理的任务安排 ("{project_manager}在{date}作业内容是什么?", ["project_manager", "date"]), # 6. 查询特定日期和风险等级的任务 ("{date}风险等级为{risk_level}的作业内容是什么?", ["date", "risk_level"]), # 7. 查询特定日期和实施单位的任务内容 ("{implementation_organization}在{date}作业内容是什么?", ["implementation_organization", "date"]), # 8. 查询特定日期和团队领导的任务安排 ("{team_leader}在{date}作业内容是什么?", ["team_leader", "date"]), # 9. 查询特定日期和项目类型下的高风险任务 ("{date}的{project_type}类风险等级为{risk_level}的作业内容是什么?", ["date", "project_type", "risk_level"]), # 10. 查询特定日期和风险等级的任务安排 ("{date}风险等级为{risk_level}的作业内容是什么?", ["date", "risk_level"]), ("{date}有多少项{risk_level}风险作业计划?", ["date", "risk_level"]), # 11. 查询特定日期和施工单位的任务进展 ("{construction_unit}在{date}作业内容是什么?", ["construction_unit", "date"]), # 12. 查询特定日期和项目经理完成的任务 ("{project_manager}在{date}作业内容是什么?", ["project_manager", "date"]), # 13. 查询特定日期和项目经理的高风险任务 ("{project_manager}在{date}的风险等级为{risk_level}的作业内容是什么?", ["project_manager", "date", "risk_level"]), # 15. 查询特定日期和所有任务安排 ("{date}作业内容是什么?", ["date"]), # 16. 查询特定日期和项目进度 ("{date}{project_name}作业内容是什么?", ["date", "project_name"]), #班组 ("{date}{team_name}作业内容是什么?", ["date", "team_name"]), ("{team_name}{date}作业内容", ["team_name","date"]), ] }, "周计划作业内容": { "date": ["本周", "上周","上一周", "下周", "下一周", "最近一周", "本周内", "这一周"], "templates": [ ("工程性质为{project_type}在{date}作业内容是什么?", ["project_type", "date"]), ("{date}工程性质为{project_type}作业内容是什么?", ["date", "project_type"]), ("{date}{construction_unit}作业内容是什么?", ["date", "construction_unit"]), ("{implementation_organization}在{date}作业内容是什么?", ["implementation_organization", "date"]), # 4. 查询某项目在指定周的所有作业计划 ("{project_name}在{date}作业内容是什么?", ["project_name", "date"]), # 5. 查询指定周的所有项目类型作业内容 ("{date}{project_type}类作业内容是什么?", ["date", "project_type"]), # 6. 查询某施工单位在指定周的作业任务 ("{construction_unit}在{date}作业内容是什么?", ["construction_unit", "date"]), # 7. 查询某项目经理在指定周负责的作业内容 ("{project_manager}在{date}作业内容是什么?", ["project_manager", "date"]), # 8. 查询某团队负责人在指定周的作业安排 ("{team_leader}在{date}作业内容是什么?", ["team_leader", "date"]), # 9. 查询某项目类型在指定周的高风险作业内容 ("{date}的{project_type}类并且风险等级为{risk_level}的作业内容是什么?", ["date", "project_type", "risk_level"]), # 10. 查询某风险等级在指定周的作业内容 ("{date}风险等级为{risk_level}的作业内容是什么?", ["date", "risk_level"]), ("{date}{risk_level}风险的作业内容是什么?", ["date", "risk_level"]), # 11. 查询某施工单位在指定周的作业进展 ("{construction_unit}在{date}作业内容是什么?", ["construction_unit", "date"]), # 13. 查询某团队在指定周的作业安排 ("{team_leader}领导的团队在{date}作业内容是什么?", ["team_leader", "date"]), # 15. 查询某项目部门在指定周的作业安排 ("{project_department}在{date}作业内容是什么?", ["project_department", "date"]), ("{date}{team_name}作业内容是什么", ["date", "team_name"]), ("{team_name}{date}作业内容", ["team_name","date"]), ] }, "施工人数": { "date": ["今日", "昨日", "2024年5月24日", "5月24日","今天","昨天"], "templates": [ ("{date}{project_name}施工人员有多少?", ["date", "project_name"]), ("{date}{project_name}施工人数是多少?", ["date", "project_name"]), ("{construction_unit}在{date}的施工人数是多少?", ["construction_unit", "date"]), # 2. 统计某施工单位在指定日期的施工总人数 ("统计{construction_unit}在{date}的施工人数是多少?", ["construction_unit", "date"]), ("{date}属于{operating}的施工人数是多少?", ["date", "operating"]), # 4. 查询某项目类型在指定日期的施工人员需求 ("{date}{project_type}类有多少施工人员?", ["date", "project_type"]), ("{date}工程性质为{project_type}有多少施工人员?", ["date", "project_type"]), ("{date}工程性质为{project_type}的施工人数是多少?", ["date", "project_type"]), ("工程性质为{project_type}{date}的施工人数是多少?", ["project_type", "date"]), ("工程性质为{project_type}{date}有多少施工人员?", ["project_type", "date"]), # 5. 统计某施工单位在指定日期的各项目施工人数 ("{construction_unit}在{date}的施工人数是多少?", ["construction_unit", "date"]), # 8. 统计某项目经理管理的项目在指定日期的施工总人数 ("{project_manager}负责的项目在{date}的施工人数是多少?", ["project_manager", "date"]), ("{date}{project_manager}负责的项目的施工人数是多少?", ["date", "project_manager"]), # 9. 查询某分包商在指定日期的施工人员投入 ("{subcontractor}{date}的施工人员有多少?", ["subcontractor", "date"]), ("{subcontractor}{date}的施工人数是多少?", ["subcontractor", "date"]), ("{date}{subcontractor}的施工人员有多少?", ["date", "subcontractor"]), ("{date}{subcontractor}的施工人数是多少?", ["date", "subcontractor"]), ("{team_leader}{date}的施工人员有多少?", ["team_leader", "date"]), ("{team_leader}{date}的施工人数是多少?", ["team_leader", "date"]), ("{date}{team_leader}的施工人员有多少?", ["date", "team_leader"]), ("{date}{team_leader}的施工人数是多少?", ["date", "team_leader"]), # 11. 查询某实施单位在指定日期的施工人员总数 ("{implementation_organization}{date}的施工人数是多少?", ["implementation_organization", "date"]), ("{implementation_organization}{date}的施工人员有多少?", ["implementation_organization", "date"]), ("{date}{team_leader}的施工人员有多少?", ["date", "team_leader"]), ("{date}{team_leader}的施工人数是多少?", ["date", "team_leader"]), # 16. 统计某项目部门在指定日期的施工人员数量 ("{project_department}{date}的施工人员有多少?", ["project_department", "date"]), ("{project_department}{date}的施工人数是多少?", ["project_department", "date"]), # 20. 统计某风险等级项目在指定日期的工种配置情况 ("{date}{risk_level}风险的施工人数是多少?", ["date", "risk_level"]), # 21. 查询某分包商在指定周的施工人员安排 ("{subcontractor}{date}的施工人数是多少?", ["subcontractor", "date"]), # 22. 统计某施工单位在指定周的高风险作业人员数量 ("{construction_unit}{date}风险等级为{risk_level}的施工人数是多少?", ["construction_unit", "date", "risk_level"]), ("{date}{team_name}施工人数是多少", ["date", "team_name"]), ("{date}{team_name}施工人数", ["date","team_name"]), ("{team_name}{date}施工人数是多少", ["team_name","date"]), ("{team_name}{date}施工人数", ["team_name","date"]), ] }, "作业考勤人数": { "date": ["今日", "昨日", "2024年5月24日", "5月24日", "今天", "昨天"], "templates": [ ("{date}{project_name}作业考勤人数是多少", ["date", "project_name"]), ("{project_name}{date}作业考勤人数是多少", ["project_name", "date"]), ("查询{subcontractor}{date}的考勤人数", ["subcontractor", "date"]), ("{subcontractor}{date}的作业考勤人数是多少", ["subcontractor", "date"]), ("{date}属于{operating}的作业考勤人数是多少?", ["date", "operating"]), ("{team_leader}{date}的作业考勤人数是多少", ["team_leader", "date"]), # 4. 统计某施工单位在指定日期的考勤人数 ("统计{construction_unit}{date}的考勤人数", ["construction_unit", "date"]), # 5. 查询某实施单位在指定日期的考勤情况 ("{implementation_organization}{date}的考勤情况如何?", ["implementation_organization", "date"]), # 6. 查询某风险等级项目在指定日期的考勤详情 ("{date}{risk_level}风险项目考勤详情", ["date", "risk_level"]), # 7. 统计某项目类型在指定日期的出勤人数 ("{date}{project_type}类出勤人数是多少?", ["date", "project_type"]), # 10. 统计某项目在指定周的出勤总人数 ("{project_name}{date}的出勤人数是多少?", ["project_name", "date"]), # 11. 查询某分包商在指定周的出勤情况 ("{subcontractor}在{date}的出勤情况如何?", ["subcontractor", "date"]), ("{date}{team_name}考勤人数是多少", ["date", "team_name"]), ("{team_name}{date}考勤人数", ["team_name","date"]), ] }, "页面切换": { "date": ["今日", "昨日", "2024年5月24日", "5月24日", "今天", "昨天"], "templates": [ ("打开{page}页面", ["page"]), ("打开{page}", ["page"]), ("打开{page}模块", ["page"]), ("进入{page}", ["page"]), ("进入{page}模块", ["page"]), ("进入{page}页面", ["page"]), ("跳转到{page}", ["page"]), ("跳转到{page}模块", ["page"]), ("跳转到{page}页面", ["page"]), ("访问{page}页面", ["page"]), ("访问{page}模块", ["page"]), ("访问{page}", ["page"]), ("请打开{page}模块", ["page"]), ("请打开{page}", ["page"]), ("显示{page}页面", ["page"]), ("加载{page}模块", ["page"]), ("加载{page}", ["page"]), ("加载{page}页面", ["page"]), ] } } def generate_natural_samples(config, label): """生成自然语言样本""" samples = [] variable_pool = { "project_name": BASE_DATA["project_names"], "project_type": BASE_DATA["project_types"], "construction_unit": BASE_DATA["construction_units"], "implementation_organization": BASE_DATA["implementation_organizations"], "subcontractor": BASE_DATA["subcontractors"], "team_leader": [f"{tl}" for tl in BASE_DATA["team_leaders"]], "risk_level": BASE_DATA["risk_levels"], "date": config["date"], "project_department": BASE_DATA["project_departments"], "project_manager": BASE_DATA["project_managers"], "page": BASE_DATA["pages"], "operating": BASE_DATA["operatings"], "team_name": BASE_DATA["team_names"] } for template, variables in config["templates"]: for values in product(*[variable_pool[var] for var in variables]): text = template.format(**dict(zip(variables, values))) # 生成标注信息 annotations = [] pos = 0 for var, val in zip(variables, values): start = text.find(val, pos) if start == -1: continue end = start + len(val) annotations.append({ "text": val, "start": start, "end": end, "label": var }) pos = end # 更新查找位置避免重复 samples.append({ "text": text, "annotations": annotations, "prompt": label }) # 保存文件 filename = f"data/{label}.json" with open(filename, "w", encoding="utf-8") as f: json.dump(samples, f, ensure_ascii=False, indent=2) print(f"已生成 {len(samples)} 条自然语言 {label} 数据") # 主执行流程 if __name__ == "__main__": for label, config in TEMPLATE_CONFIG.items(): generate_natural_samples(config, label)