# NOTE(review): leftover listing metadata, not part of the program: 429 lines, 24 KiB, Python
import json
import os
from itertools import product
from string import Formatter

# Directory that receives the generated sample files.
directory = "data"

# Create the directory up front. exist_ok=True replaces the separate
# "exists?" check, so there is no window between checking and creating.
os.makedirs(directory, exist_ok=True)

# Base entity values that are substituted into the natural-language templates.
BASE_DATA = {
    # Implementing organizations
    "implementation_organizations": ["送电一分公司", "送电二分公司", "变电分公司", "消防分公司"],
    # Project nature / type
    "project_types": ["基建", "技改大修", "用户工程", "小型基建"],
    # Project names
    "project_names": [
        "1号工程",
        "淮南芦集改造工程",
        "第十号工程",
        "合肥二电厂220kV线路工程",
        "九号工程",
    ],
    # Construction management units
    "construction_units": [
        "国网安徽省电力有限公司建设分公司",
        "国网安徽省电力有限公司马鞍山供电公司",
        "中铁二局集团电务工程有限公司",
    ],
    # Project department names
    "project_departments": ["第9项目管理部", "第十一项目部", "第八项目管理部", "9号项目部"],
    # Project managers
    "project_managers": ["陈少平项目经理", "范文立项目经理", "何东洋项目经理"],
    # Subcontractors
    "subcontractors": ["安徽劦力建筑装饰有限责任公司", "安徽苏亚建设集团有限公司"],
    # Team (crew) names
    "team_names": ["张朵班组", "刘梁玉班组", "魏玉龙班组"],
    # Team leaders
    "team_leaders": ["李元帅班组长", "刘雨豪班组长"],
    # Risk levels (several spellings of the level are listed on purpose as
    # separate surface forms)
    "risk_levels": ["1级", "一级", "二级", "5级", "四级"],
    # "8+2" operating conditions
    "operatings": ["8+2工况", "8加2工况"],
    # Pages that can be switched to
    "pages": ["风险管控", "日计划", "周风险", "日计划统计报表", "日计划推送"],
}

# Date expressions shared by the day-level and week-level intents.
_DAILY_DATES = ["今日", "昨日", "2024年5月24日", "5月24日", "今天", "昨天"]
_WEEKLY_DATES = ["本周", "上周", "上一周", "下周", "下一周", "最近一周", "本周内", "这一周"]

# Natural-language template configuration.
#
# Each key is an intent label. Its value holds:
#   "date"      - the date expressions substituted for {date} in that intent
#   "templates" - (template, variables) pairs; `variables` names the
#                 placeholders of `template`, in the order they appear
#
# NOTE(review): several templates are listed more than once within the same
# intent (for example the "{date}风险等级为{risk_level}的作业计划有多少?" entry
# of the first intent). Every repeat produces duplicate samples. They are kept
# so the generated data is unchanged - confirm whether this is intentional.
TEMPLATE_CONFIG = {
    # Intent: number of work plans on a given day
    "日计划数量查询": {
        "date": list(_DAILY_DATES),
        "templates": [
            ("{date}{project_name}有多少作业计划?", ["date", "project_name"]),
            ("{project_name}{date}有多少项作业计划?", ["project_name", "date"]),
            ("工程性质是{project_type}{date}有多少作业计划?", ["project_type", "date"]),
            ("{date}风险等级为{risk_level}的作业计划有多少?", ["date", "risk_level"]),
            ("{date}工程性质为{project_type}的有多少作业计划?", ["date", "project_type"]),
            ("工程性质为{project_type}{date}有多少作业计划?", ["project_type", "date"]),
            ("{project_name}在{date}的作业计划数量", ["project_name", "date"]),
            ("{date}{project_type}类作业计划有多少?", ["date", "project_type"]),
            ("{project_type}类{date}作业计划有多少?", ["project_type", "date"]),
            ("{construction_unit}在{date}有多少作业计划?", ["construction_unit", "date"]),
            ("{date}{construction_unit}有多少作业计划?", ["date", "construction_unit"]),
            ("{date}有多少作业计划?", ["date"]),
            ("公司{date}有多少作业计划?", ["date"]),

            ("{date}属于{operating}有多少作业计划?", ["date", "operating"]),

            ("{date}{implementation_organization}有多少作业计划?", ["date", "implementation_organization"]),
            ("{date}{project_department}有多少作业计划?", ["date", "project_department"]),
            ("{project_department}{date}有多少{risk_level}风险作业计划?", ["project_department", "date", "risk_level"]),

            ("{date}{project_manager}有多少作业计划?", ["date", "project_manager"]),
            ("{date}{subcontractor}有多少作业计划?", ["date", "subcontractor"]),
            ("{date}{team_leader}有多少作业计划?", ["date", "team_leader"]),
            ("{date}风险等级为{risk_level}的作业计划有多少?", ["date", "risk_level"]),
            ("{date}{project_department}有多少{risk_level}风险作业计划?", ["date", "project_department", "risk_level"]),

            ("{date}{project_type}类风险等级为{risk_level}的作业计划有多少?", ["date", "project_type", "risk_level"]),
            ("{date}{construction_unit}有多少{risk_level}风险作业计划?", ["date", "construction_unit", "risk_level"]),

            ("{date}{project_type}类{construction_unit}负责的作业计划有多少?", ["date", "project_type", "construction_unit"]),
            ("{date}{project_type}类{implementation_organization}组织实施的作业计划有多少?", ["date", "project_type", "implementation_organization"]),
            ("{date}{project_department}管理的{project_type}类作业计划有多少?", ["date", "project_department", "project_type"]),
            ("{date}{subcontractor}承包的{project_type}类作业计划有多少?", ["date", "subcontractor", "project_type"]),
            ("{date}{project_manager}负责的{project_type}类作业计划有多少?", ["date", "project_manager", "project_type"]),
            ("{date}{team_leader}带领的{project_type}类作业计划有多少?", ["date", "team_leader", "project_type"]),
            ("{date}{project_name}由{project_manager}作业计划有多少?", ["date", "project_name", "project_manager"]),
            ("{date}{project_name}中,风险等级为{risk_level}的作业计划有多少?", ["date", "project_name", "risk_level"]),
            ("{date}{project_manager}作业计划有多少?", ["date", "project_manager"]),
            ("{project_manager}在{date}作业计划有多少?", ["project_manager", "date"]),

            ("{date}{project_manager}的作业计划数量", ["date", "project_manager"]),
            ("{project_manager}在{date}的作业计划数量", ["project_manager", "date"]),

            # Teams
            ("{date}{team_name}有多少项作业计划?", ["date", "team_name"]),
            ("{team_name}{date}有多少作业计划?", ["team_name", "date"]),
            ("{team_name}{date}作业计划数量", ["team_name", "date"]),
            ("{date}{team_name}作业计划数量", ["date", "team_name"]),
        ],
    },
    # Intent: number of work plans in a given week
    "周计划数量查询": {
        "date": list(_WEEKLY_DATES),
        "templates": [
            ("{date}{project_name}作业计划有多少?", ["date", "project_name"]),
            ("{project_name}{date}作业计划有多少?", ["project_name", "date"]),
            ("{construction_unit}{date}作业计划有多少?", ["construction_unit", "date"]),
            # date only
            ("{date}作业计划有多少?", ["date"]),

            # date + one other dimension
            ("{date}{project_name}有多少项作业计划?", ["date", "project_name"]),

            ("{date}{construction_unit}作业计划有多少?", ["date", "construction_unit"]),
            ("{date}{implementation_organization}作业计划有多少?", ["date", "implementation_organization"]),
            ("{date}{project_department}作业计划有多少?", ["date", "project_department"]),
            ("{date}{project_manager}作业计划有多少?", ["date", "project_manager"]),
            ("{date}{subcontractor}作业计划有多少?", ["date", "subcontractor"]),
            ("{date}{team_leader}作业计划有多少?", ["date", "team_leader"]),

            ("{date}{project_department}作业计划数量", ["date", "project_department"]),
            ("{date}{subcontractor}作业计划数量?", ["date", "subcontractor"]),

            # date + risk level
            ("{date}有多少{risk_level}风险作业计划?", ["date", "risk_level"]),

            # date + construction_unit + risk_level
            ("{construction_unit}{date}有多少项{risk_level}风险作业计划", ["construction_unit", "date", "risk_level"]),

            # date + implementation_organization + risk_level
            ("{date}{implementation_organization}风险等级为{risk_level}的作业计划有多少?", ["date", "implementation_organization", "risk_level"]),

            # date + project_name + project_manager
            ("{date}{project_name}{project_manager}负责的作业计划有多少?", ["date", "project_name", "project_manager"]),

            # date + project_name + risk_level
            ("{date}{project_name}有多少项{risk_level}风险作业计划?", ["date", "project_name", "risk_level"]),

            # project_manager dimension
            ("{project_manager}{date}作业计划数量?", ["project_manager", "date"]),
            ("{project_manager}在{date}作业计划有多少?", ["project_manager", "date"]),
            ("{project_manager}在{date}负责的风险等级为{risk_level}的作业计划有多少?", ["project_manager", "date", "risk_level"]),

            ("{date}{team_name}有多少项作业计划?", ["date", "team_name"]),
            ("{team_name}{date}有多少作业计划?", ["team_name", "date"]),
            ("{team_name}{date}作业计划数量", ["team_name", "date"]),
            ("{date}{team_name}的作业计划数量", ["date", "team_name"]),
        ],
    },
    # Intent: work content on a given day
    "日计划作业内容": {
        "date": list(_DAILY_DATES),
        "templates": [
            ("{date}{project_name}作业内容是什么?", ["date", "project_name"]),
            ("{project_name}在{date}作业内容是什么", ["project_name", "date"]),
            ("{date}{project_type}类作业内容是什么?", ["date", "project_type"]),
            ("{project_type}类{date}作业内容是什么?", ["project_type", "date"]),
            ("{date}工程性质为{project_type}的作业内容是什么?", ["date", "project_type"]),
            ("工程性质为{project_type}的{date}作业内容是什么?", ["project_type", "date"]),
            ("{construction_unit}在{date}作业内容是什么?", ["construction_unit", "date"]),
            # 1. Work arrangements for a given date and project
            ("{date}{project_name}作业内容是什么?", ["date", "project_name"]),
            ("{date}属于{operating}作业内容是什么?", ["date", "operating"]),
            # 3. Project plans for a given date and project type
            ("{date}{project_type}类计划作业内容是什么?", ["date", "project_type"]),

            ("{date}{construction_unit}{risk_level}风险的作业内容是什么?", ["date", "construction_unit", "risk_level"]),

            ("{date}{implementation_organization}{risk_level}风险的作业内容是什么?", ["date", "implementation_organization", "risk_level"]),

            # 5. Task arrangements for a given date and project manager
            ("{project_manager}在{date}作业内容是什么?", ["project_manager", "date"]),

            # 6. Tasks for a given date and risk level
            ("{date}风险等级为{risk_level}的作业内容是什么?", ["date", "risk_level"]),

            # 7. Task content for a given date and implementing organization
            ("{implementation_organization}在{date}作业内容是什么?", ["implementation_organization", "date"]),

            # 8. Task arrangements for a given date and team leader
            ("{team_leader}在{date}作业内容是什么?", ["team_leader", "date"]),

            # 9. High-risk tasks for a given date and project type
            ("{date}的{project_type}类风险等级为{risk_level}的作业内容是什么?", ["date", "project_type", "risk_level"]),

            # 10. Task arrangements for a given date and risk level
            ("{date}风险等级为{risk_level}的作业内容是什么?", ["date", "risk_level"]),

            # NOTE(review): phrased as a "how many" question although it sits
            # under the work-content intent - confirm it belongs here.
            ("{date}有多少项{risk_level}风险作业计划?", ["date", "risk_level"]),

            # 11. Task progress for a given date and construction unit
            ("{construction_unit}在{date}作业内容是什么?", ["construction_unit", "date"]),

            # 12. Tasks completed by a project manager on a given date
            ("{project_manager}在{date}作业内容是什么?", ["project_manager", "date"]),

            # 13. High-risk tasks of a project manager on a given date
            ("{project_manager}在{date}的风险等级为{risk_level}的作业内容是什么?", ["project_manager", "date", "risk_level"]),

            # 15. All task arrangements on a given date
            ("{date}作业内容是什么?", ["date"]),

            # 16. Project progress on a given date
            ("{date}{project_name}作业内容是什么?", ["date", "project_name"]),
            # Teams
            ("{date}{team_name}作业内容是什么?", ["date", "team_name"]),
            ("{team_name}{date}作业内容", ["team_name", "date"]),
        ],
    },
    # Intent: work content in a given week
    "周计划作业内容": {
        "date": list(_WEEKLY_DATES),
        "templates": [
            ("工程性质为{project_type}在{date}作业内容是什么?", ["project_type", "date"]),
            ("{date}工程性质为{project_type}作业内容是什么?", ["date", "project_type"]),

            ("{date}{construction_unit}作业内容是什么?", ["date", "construction_unit"]),

            ("{implementation_organization}在{date}作业内容是什么?", ["implementation_organization", "date"]),

            # 4. All work plans of a project in the given week
            ("{project_name}在{date}作业内容是什么?", ["project_name", "date"]),

            # 5. Work content by project type in the given week
            ("{date}{project_type}类作业内容是什么?", ["date", "project_type"]),

            # 6. Work tasks of a construction unit in the given week
            ("{construction_unit}在{date}作业内容是什么?", ["construction_unit", "date"]),

            # 7. Work content a project manager is responsible for in the given week
            ("{project_manager}在{date}作业内容是什么?", ["project_manager", "date"]),

            # 8. Work arrangements of a team leader in the given week
            ("{team_leader}在{date}作业内容是什么?", ["team_leader", "date"]),

            # 9. High-risk work content of a project type in the given week
            ("{date}的{project_type}类并且风险等级为{risk_level}的作业内容是什么?", ["date", "project_type", "risk_level"]),

            # 10. Work content of a risk level in the given week
            ("{date}风险等级为{risk_level}的作业内容是什么?", ["date", "risk_level"]),
            ("{date}{risk_level}风险的作业内容是什么?", ["date", "risk_level"]),

            # 11. Work progress of a construction unit in the given week
            ("{construction_unit}在{date}作业内容是什么?", ["construction_unit", "date"]),

            # 13. Work arrangements of a team in the given week
            ("{team_leader}领导的团队在{date}作业内容是什么?", ["team_leader", "date"]),

            # 15. Work arrangements of a project department in the given week
            ("{project_department}在{date}作业内容是什么?", ["project_department", "date"]),

            ("{date}{team_name}作业内容是什么", ["date", "team_name"]),
            ("{team_name}{date}作业内容", ["team_name", "date"]),
        ],
    },
    # Intent: construction headcount
    "施工人数": {
        "date": list(_DAILY_DATES),
        "templates": [
            ("{date}{project_name}施工人员有多少?", ["date", "project_name"]),
            ("{date}{project_name}施工人数是多少?", ["date", "project_name"]),

            ("{construction_unit}在{date}的施工人数是多少?", ["construction_unit", "date"]),
            # 2. Total headcount of a construction unit on a given date
            ("统计{construction_unit}在{date}的施工人数是多少?", ["construction_unit", "date"]),
            ("{date}属于{operating}的施工人数是多少?", ["date", "operating"]),

            # 4. Headcount needed by a project type on a given date
            ("{date}{project_type}类有多少施工人员?", ["date", "project_type"]),
            ("{date}工程性质为{project_type}有多少施工人员?", ["date", "project_type"]),
            ("{date}工程性质为{project_type}的施工人数是多少?", ["date", "project_type"]),
            ("工程性质为{project_type}{date}的施工人数是多少?", ["project_type", "date"]),
            ("工程性质为{project_type}{date}有多少施工人员?", ["project_type", "date"]),
            # 5. Per-project headcount of a construction unit on a given date
            ("{construction_unit}在{date}的施工人数是多少?", ["construction_unit", "date"]),
            # 8. Total headcount of the projects managed by a project manager on a given date
            ("{project_manager}负责的项目在{date}的施工人数是多少?", ["project_manager", "date"]),
            ("{date}{project_manager}负责的项目的施工人数是多少?", ["date", "project_manager"]),

            # 9. Headcount committed by a subcontractor on a given date
            ("{subcontractor}{date}的施工人员有多少?", ["subcontractor", "date"]),
            ("{subcontractor}{date}的施工人数是多少?", ["subcontractor", "date"]),
            ("{date}{subcontractor}的施工人员有多少?", ["date", "subcontractor"]),
            ("{date}{subcontractor}的施工人数是多少?", ["date", "subcontractor"]),

            ("{team_leader}{date}的施工人员有多少?", ["team_leader", "date"]),
            ("{team_leader}{date}的施工人数是多少?", ["team_leader", "date"]),
            ("{date}{team_leader}的施工人员有多少?", ["date", "team_leader"]),
            ("{date}{team_leader}的施工人数是多少?", ["date", "team_leader"]),

            # 11. Total headcount of an implementing organization on a given date
            ("{implementation_organization}{date}的施工人数是多少?", ["implementation_organization", "date"]),
            ("{implementation_organization}{date}的施工人员有多少?", ["implementation_organization", "date"]),
            ("{date}{team_leader}的施工人员有多少?", ["date", "team_leader"]),
            ("{date}{team_leader}的施工人数是多少?", ["date", "team_leader"]),

            # 16. Headcount of a project department on a given date
            ("{project_department}{date}的施工人员有多少?", ["project_department", "date"]),
            ("{project_department}{date}的施工人数是多少?", ["project_department", "date"]),

            # 20. Staffing of projects at a risk level on a given date
            ("{date}{risk_level}风险的施工人数是多少?", ["date", "risk_level"]),

            # 21. Staffing arranged by a subcontractor in a given week
            ("{subcontractor}{date}的施工人数是多少?", ["subcontractor", "date"]),

            # 22. High-risk headcount of a construction unit in a given week
            ("{construction_unit}{date}风险等级为{risk_level}的施工人数是多少?", ["construction_unit", "date", "risk_level"]),

            ("{date}{team_name}施工人数是多少", ["date", "team_name"]),
            ("{date}{team_name}施工人数", ["date", "team_name"]),
            ("{team_name}{date}施工人数是多少", ["team_name", "date"]),
            ("{team_name}{date}施工人数", ["team_name", "date"]),
        ],
    },
    # Intent: work attendance headcount
    "作业考勤人数": {
        "date": list(_DAILY_DATES),
        "templates": [
            ("{date}{project_name}作业考勤人数是多少", ["date", "project_name"]),
            ("{project_name}{date}作业考勤人数是多少", ["project_name", "date"]),
            ("查询{subcontractor}{date}的考勤人数", ["subcontractor", "date"]),
            ("{subcontractor}{date}的作业考勤人数是多少", ["subcontractor", "date"]),
            ("{date}属于{operating}的作业考勤人数是多少?", ["date", "operating"]),
            ("{team_leader}{date}的作业考勤人数是多少", ["team_leader", "date"]),
            # 4. Attendance headcount of a construction unit on a given date
            ("统计{construction_unit}{date}的考勤人数", ["construction_unit", "date"]),

            # 5. Attendance of an implementing organization on a given date
            ("{implementation_organization}{date}的考勤情况如何?", ["implementation_organization", "date"]),

            # 6. Attendance details of projects at a risk level on a given date
            ("{date}{risk_level}风险项目考勤详情", ["date", "risk_level"]),

            # 7. Attendance headcount of a project type on a given date
            ("{date}{project_type}类出勤人数是多少?", ["date", "project_type"]),

            # 10. Total attendance of a project in a given week
            ("{project_name}{date}的出勤人数是多少?", ["project_name", "date"]),

            # 11. Attendance of a subcontractor in a given week
            ("{subcontractor}在{date}的出勤情况如何?", ["subcontractor", "date"]),

            ("{date}{team_name}考勤人数是多少", ["date", "team_name"]),
            ("{team_name}{date}考勤人数", ["team_name", "date"]),
        ],
    },
    # Intent: page switching
    "页面切换": {
        # No template below uses {date}; the key is kept because
        # generate_natural_samples reads config["date"] for every intent.
        "date": list(_DAILY_DATES),
        "templates": [
            ("打开{page}页面", ["page"]),
            ("打开{page}", ["page"]),
            ("打开{page}模块", ["page"]),
            ("进入{page}", ["page"]),
            ("进入{page}模块", ["page"]),
            ("进入{page}页面", ["page"]),
            ("跳转到{page}", ["page"]),
            ("跳转到{page}模块", ["page"]),
            ("跳转到{page}页面", ["page"]),
            ("访问{page}页面", ["page"]),
            ("访问{page}模块", ["page"]),
            ("访问{page}", ["page"]),
            ("请打开{page}模块", ["page"]),
            ("请打开{page}", ["page"]),
            ("显示{page}页面", ["page"]),
            ("加载{page}模块", ["page"]),
            ("加载{page}", ["page"]),
            ("加载{page}页面", ["page"]),
        ],
    },
}


def _render_with_spans(template, mapping):
    """Fill ``template`` from ``mapping`` and record where each value lands.

    The template is walked placeholder by placeholder, so every offset is
    known exactly at the moment the value is written. Nothing is searched
    for afterwards, which means a value that also occurs earlier in the text
    cannot be mislabelled, and the order of placeholders does not matter.

    Args:
        template: A ``str.format``-style template such as ``"{date}有多少?"``.
        mapping: Placeholder name -> value to substitute.

    Returns:
        ``(text, annotations)`` where ``annotations`` is a list of dicts with
        the keys ``text``, ``start``, ``end`` (exclusive) and ``label``, in
        order of appearance, such that ``text[start:end] == annotation["text"]``.

    Raises:
        KeyError: If the template names a placeholder missing from ``mapping``.
    """
    formatter = Formatter()
    pieces = []
    annotations = []
    cursor = 0
    for literal, field, spec, conversion in formatter.parse(template):
        pieces.append(literal)
        cursor += len(literal)
        if field is None:
            # Trailing literal text with no placeholder after it.
            continue
        value, _ = formatter.get_field(field, (), mapping)
        rendered = formatter.format_field(
            formatter.convert_field(value, conversion), spec or ""
        )
        annotations.append({
            "text": rendered,
            "start": cursor,
            "end": cursor + len(rendered),
            "label": field,
        })
        pieces.append(rendered)
        cursor += len(rendered)
    return "".join(pieces), annotations


def generate_natural_samples(config, label):
    """Generate annotated natural-language samples for one intent.

    Every template in ``config["templates"]`` is expanded with the Cartesian
    product of the candidate values of its variables. Each resulting sentence
    is stored together with the character spans of the substituted values and
    the intent label, and the full list is written to ``data/<label>.json``.

    Args:
        config: Dict with a ``"date"`` list (date expressions for this intent)
            and a ``"templates"`` list of ``(template, variables)`` pairs.
        label: Intent label; stored as ``"prompt"`` in every sample and used
            as the output file name.

    Raises:
        KeyError: If a template or its variable list names an unknown variable.
    """
    # Candidate values for every placeholder name a template may use.
    variable_pool = {
        "project_name": BASE_DATA["project_names"],
        "project_type": BASE_DATA["project_types"],
        "construction_unit": BASE_DATA["construction_units"],
        "implementation_organization": BASE_DATA["implementation_organizations"],
        "subcontractor": BASE_DATA["subcontractors"],
        "team_leader": BASE_DATA["team_leaders"],
        "risk_level": BASE_DATA["risk_levels"],
        "date": config["date"],
        "project_department": BASE_DATA["project_departments"],
        "project_manager": BASE_DATA["project_managers"],
        "page": BASE_DATA["pages"],
        "operating": BASE_DATA["operatings"],
        "team_name": BASE_DATA["team_names"],
    }

    samples = []
    for template, variables in config["templates"]:
        candidates = [variable_pool[name] for name in variables]
        for values in product(*candidates):
            text, annotations = _render_with_spans(
                template, dict(zip(variables, values))
            )
            samples.append({
                "text": text,
                "annotations": annotations,
                "prompt": label,
            })

    # Write the samples; ensure_ascii=False keeps the Chinese text readable.
    filename = f"data/{label}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False, indent=2)

    print(f"已生成 {len(samples)} 条自然语言 {label} 数据")


# Main flow: write one sample file per configured intent.
if __name__ == "__main__":
    for label, config in TEMPLATE_CONFIG.items():
        generate_natural_samples(config, label)