From 5413911e64f61cf3524edf0361a3086072a9fb70 Mon Sep 17 00:00:00 2001 From: weiweiw <14335254+weiweiw22@user.noreply.gitee.com> Date: Thu, 22 May 2025 18:33:10 +0800 Subject: [PATCH] =?UTF-8?q?=E6=89=93=E5=BC=80=E6=96=B9=E6=A1=88=EF=BC=8C?= =?UTF-8?q?=E8=A7=84=E7=A8=8B=E8=A7=84=E8=8C=83=E5=92=8C=E5=9B=BE=E7=BA=B8?= =?UTF-8?q?=E7=9A=84=E5=8A=9F=E8=83=BD=E7=9A=84=E8=AE=AD=E7=BB=83=E5=92=8C?= =?UTF-8?q?=E6=A0=87=E5=87=86=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/constants.py | 10 +- api/globalData.py | 105 +++++++++++++++++-- api/main.py | 7 +- api/main_temp.py | 40 ++------ api/slotRecognition.py | 18 ++-- api/utils.py | 198 +++++++++++++++++++++++++++++++++--- generated_data/generated.py | 97 +++++++++++------- 7 files changed, 373 insertions(+), 102 deletions(-) diff --git a/api/constants.py b/api/constants.py index 7a723fc..aef8cf4 100644 --- a/api/constants.py +++ b/api/constants.py @@ -13,7 +13,7 @@ USELESS_PROGRAM_DEPARTMENT_WORDS = {"项目管理部", "项目部"} #公司名标准化时需要过滤掉的词汇 USELESS_COMPANY_WORDS = ["公司","有限","责任","工程","科技"] - +USELESS_DESIGN_WORDS = {"方案", "措施"} #提取公司名热词需要过滤掉的词汇 # USELESS_COMPANY_WORDS = ["公司","有限","责任","工程","科技","安徽省","国网","四川省","安徽","集团","电力","建设","建筑","安装","股份" # "装饰","结构","能源","发展","装饰","电气","股份"] @@ -43,8 +43,14 @@ RISK_LEVEL = "riskLevel" TEAM_NAME = "teamName" PAGE = "page" -PROGRAM_NAVIGATION = "programNavigation" +# PROGRAM_NAVIGATION = "programNavigation" +DESIGN_SPECIFICATION = "designSpecificationName" +DESIGN = "designName" +SPECIFICATION = "specificationName" +PICTURE = "picName" +#方案的后缀 +design_suffix = ["措施","方案","规划","三措一案"] # 意图识别和槽位抽取服务返回的关键提示语列表 SLOT_KEYWORDS = [ diff --git a/api/globalData.py b/api/globalData.py index 71f46da..bf39754 100644 --- a/api/globalData.py +++ b/api/globalData.py @@ -9,6 +9,7 @@ from config import redis_url logger = setup_logger("GlobalData", level=logging.DEBUG) + class GlobalData: # 数据字段 standard_company_program = {} @@ -32,6 +33,8 @@ class GlobalData: simply_to_standard_team_leader_name_map = {} pinyin_simply_to_standard_team_leader_name_map = {} + standard_design_pic = {} + @classmethod def update_from_redis(cls): import sys @@ -50,20 +53,22 @@ class GlobalData: #建管单位 cls._update_list_data('SBD_QUERY_DATA:CONSTRUCTION_UNIT', './standard_data/construct_unit.txt', - cls.standard_construct_name_list, cls.simply_to_standard_construct_name_map, - cls.pinyin_simply_to_standard_construct_name_map, clean_useless_company_name) + cls.standard_construct_name_list, cls.simply_to_standard_construct_name_map, + cls.pinyin_simply_to_standard_construct_name_map, clean_useless_company_name) logger.info(f"建管单位数量:{len(cls.standard_construct_name_list)}") #分包单位 cls._update_list_data('SBD_QUERY_DATA:SUBCONTRACTOR', './standard_data/sub_contract.txt', - cls.standard_constractor_name_list, cls.simply_to_standard_constractor_name_map, - cls.pinyin_simply_to_standard_constractor_name_map, clean_useless_company_name) + cls.standard_constractor_name_list, cls.simply_to_standard_constractor_name_map, + cls.pinyin_simply_to_standard_constractor_name_map, clean_useless_company_name) logger.info(f"分包单位数量:{len(cls.standard_constractor_name_list)}") #班组名称 cls._update_list_data('SBD_QUERY_DATA:TEAM', './standard_data/team_leader.txt', - cls.standard_team_leader_name_list, cls.simply_to_standard_team_leader_name_map, - cls.pinyin_simply_to_standard_team_leader_name_map, clean_useless_team_leader_name) + cls.standard_team_leader_name_list, cls.simply_to_standard_team_leader_name_map, + cls.pinyin_simply_to_standard_team_leader_name_map, clean_useless_team_leader_name) + + cls.update_design_pic_info() logger.info(f"班组名称数量:{len(cls.standard_team_leader_name_list)}") @classmethod @@ -80,7 +85,7 @@ class GlobalData: json_str = r.get('SBD_QUERY_DATA:STANDARD_COMPANY_PROGRAM') if json_str: temp_data = json.loads(json_str) - save_dict_to_file(temp_data,"./standard_data/standard_company_program.json") + save_dict_to_file(temp_data, "./standard_data/standard_company_program.json") logger.info("[Info] Loaded STANDARD_COMPANY_PROGRAM from Redis") else: raise ValueError("Redis key not found") @@ -122,7 +127,7 @@ class GlobalData: if json_str: try: temp_list = json.loads(json_str) - save_standard_name_list_to_file(temp_list,local_path) + save_standard_name_list_to_file(temp_list, local_path) logger.info(f"[Info] Loaded {redis_key} from Redis") except json.JSONDecodeError as e: logger.info(f"[Warning] JSON decode error on key '{redis_key}': {e}") @@ -144,3 +149,87 @@ class GlobalData: pinyin_map.update({ text_to_pinyin(cleaner(kw)): kw for kw in temp_list }) + + @classmethod + def update_design_pic_info(cls): + + from utils import ( + load_standard_json_data, + save_dict_to_file, + clean_useless_company_name, + text_to_pinyin + ) + # 公司与工程关系数据 + try: + r = redis.from_url(redis_url, decode_responses=True) + json_str = r.get('SBD_QUERY_DATA:STANDARD_DESIGN_PIC_INFO') + if json_str: + temp_data = json.loads(json_str) + save_dict_to_file(temp_data, "./standard_data/standard_project_info.json") + logger.info("[Info] Loaded STANDARD_DESIGN_PIC_INFO from Redis") + else: + raise ValueError("Redis key not found") + except Exception as e: + logger.error(f"[Error] Error loading STANDARD_DESIGN_PIC_INFO: {e}") + temp_data = load_standard_json_data("./standard_data/standard_project_info.json") + + print(f"STANDARD_DESIGN_PIC_INFO:{temp_data}") + if temp_data != cls.standard_design_pic: + cls.standard_design_pic.clear() + cls.standard_design_pic.update(temp_data) + + @classmethod + def get_all_company_from_design_info(cls): + #获取所有分公司信息 + return list(cls.standard_design_pic.keys()) + + @classmethod + def get_project_from_design_info(cls): + #获取所有工程信息 + project_list = [] + + for company, projects in cls.standard_design_pic.items(): + project_list.extend(projects.keys()) + return project_list + + + @classmethod + def get_contents_by_company_proj(cls, company_name, project_name): + #根据分公司名和工程名 获取方案,图纸和规范规程 + global company_projects + if company_name and project_name: + return cls.standard_design_pic.get(company_name, {}).get(project_name, {}) + elif not company_name and project_name: + for company, projects in cls.standard_design_pic.items(): + if project_name in projects: + return projects[project_name] # 返回该工程下的“方案/图纸/规范规程” + elif company_name and not project_name: + result = { + "方案": [], + "图纸": [], + "规范规程": [] + } + company_projects = cls.standard_design_pic.get(company_name, {}) + # 忽略 "@type" 字段 + if "@type" in company_projects: + del company_projects["@type"] + + for proj_key, project in company_projects.items(): + if "@type" in project: + del project["@type"] + for key in result: + result[key].extend(project.get(key, [])) + + # print(f"最终result:{result}") + return result + else: + return None + + + @classmethod + def get_contents_by_proj(cls, project_name): + #根据工程名 获取方案,图纸和规范规程 + for company, projects in cls.standard_design_pic.items(): + if project_name in projects: + return projects[project_name] # 返回该工程下的“方案/图纸/规范规程” + return None # 没找到 diff --git a/api/main.py b/api/main.py index 258c9ee..029eb3b 100644 --- a/api/main.py +++ b/api/main.py @@ -16,7 +16,7 @@ from globalData import GlobalData from apscheduler.schedulers.background import BackgroundScheduler MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-14672" -MODEL_UIE_PATH = R"../uie/output/checkpoint-16380" +MODEL_UIE_PATH = R"../uie/output_temp/checkpoint-18774" # 类别名称列表 @@ -48,8 +48,8 @@ label_map = { 15: 'B-personName', 34: 'I-personName', 16: 'B-personQueryType', 35: 'I-personQueryType', 17: 'B-projectStatus', 36: 'I-projectStatus', - 18: 'B-skyNet', 37: 'I-skyNet', - 19: 'B-programNavigation', 38: 'I-programNavigation' + 18: 'B-picName', 37: 'I-picName', + 19: 'B-designSpecificationName', 38: 'I-designSpecificationName' } logger = setup_logger("main", level=logging.DEBUG) @@ -406,6 +406,7 @@ def extract_multi_chat(messages): 请你仅输出还原后的完整问题,不要输出任何变量、中间步骤或解释说明,确保结果自然通顺,语义完整。 ''' + message = [ {"role": "user", "content": prompt} ] diff --git a/api/main_temp.py b/api/main_temp.py index db1cc60..93d023f 100644 --- a/api/main_temp.py +++ b/api/main_temp.py @@ -16,7 +16,7 @@ from globalData import GlobalData from apscheduler.schedulers.background import BackgroundScheduler MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-14672" -MODEL_UIE_PATH = R"../uie/output/checkpoint-16380" +MODEL_UIE_PATH = R"../uie/output_temp/checkpoint-20860" # 类别名称列表 @@ -48,8 +48,8 @@ label_map = { 15: 'B-personName', 34: 'I-personName', 16: 'B-personQueryType', 35: 'I-personQueryType', 17: 'B-projectStatus', 36: 'I-projectStatus', - 18: 'B-skyNet', 37: 'I-skyNet', - 19: 'B-programNavigation', 38: 'I-programNavigation' + 18: 'B-picName', 37: 'I-picName', + 19: 'B-designSpecificationName', 38: 'I-designSpecificationName' } logger = setup_logger("main", level=logging.DEBUG) @@ -386,37 +386,16 @@ def extract_multi_chat(messages): 示例:补全模糊表达("今天送一分公司有多少作业计划", "具体是哪些") 返回 "今天送一分公司具体有哪些作业计划" 函数 是查询新属性(文本, 新问题): - 如果新问题中没有查询主体仅有查询对象 则返回TRUE - 如果新问题中仅有查询主体但没有查询对象 则返回TRUE + 如果新问题中提取不到主体 且仅能提取到查询属性 + 且这个查询属性和文本中提取到的查询属性不同 则返回TRUE 其他情况均返回FALSE 示例:是查询新属性("今天送一分公司有多少作业计划", "作业内容") 返回 True - 函数 删除数量词(文本): - 删除“有多少”、“多少”、“几条”、“几个”等数量问句词 - - 函数 替换查询属性(文本, 新查询属性): - 说明: - 本函数用于在删除数量词后,将原句中与“新查询属性”同类型的核心查询词替换为“新查询属性”,并确保其他内容保持不变且语义自然。 + 函数 替换新属性(文本,新查询属性): + 先删除文本中的"有多少"等类似的表达数量表达, + 再将文本里的查询属性替换为新查询属性,并保持其他内容不变并返回 且保持新查询属性的语气 + 示例:替换新属性("今天送一分公司有多少作业计划", "作业内容") 返回 "今天送一分公司的作业内容" - 处理步骤: - 1. 删除文本中的数量类词语,例如“有多少”、“多少”、“几个”、“几条”等。 - 2. 识别原句中的核心查询属性词,判断其与“新查询属性”是否属于相同类别(如均为对象、地点、组织等)。 - 3. 将原有核心查询词替换为“新查询属性”,保留句中其余上下文结构不变。 - 4. 保持句子语气自然,避免引入“是什么”、“有多少”等疑问表达。 - - 返回: - 返回替换后的文本,语义清晰、语气自然。 - - 示例: - 替换查询属性("今天送一分公司有多少作业计划", "作业内容") - → "今天送一分公司的作业内容" - - 替换查询属性("今天送一分公司的班组详情", "送二分公司") - → "今天送二分公司的班组详情" - - 替换查询属性("今天送一分公司的班组详情", "明天呢") - → "明天送二分公司的班组详情" - 函数 有完整的句意(新问题): 如果新问题里有主体同时有操作对象或查询对象则返回TRUE 其他情况均返回FALSE @@ -427,6 +406,7 @@ def extract_multi_chat(messages): 请你仅输出还原后的完整问题,不要输出任何变量、中间步骤或解释说明,确保结果自然通顺,语义完整。 ''' + message = [ {"role": "user", "content": prompt} ] diff --git a/api/slotRecognition.py b/api/slotRecognition.py index b88d885..543a934 100644 --- a/api/slotRecognition.py +++ b/api/slotRecognition.py @@ -2,8 +2,9 @@ import paddle from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer from globalData import GlobalData -from utils import standardize_name_only_high_score, clean_useless_company_name -from constants import SUBCONTRACTOR, CONSTRUCTION_UNIT, IMPLEMENTATION_ORG, PAGE, PROGRAM_NAVIGATION, PROJECT_DEPARTMENT +from utils import standardize_name_only_high_score, clean_useless_company_name, is_design_file +from constants import SUBCONTRACTOR, CONSTRUCTION_UNIT, IMPLEMENTATION_ORG, PAGE, \ + PROJECT_DEPARTMENT, DESIGN_SPECIFICATION, DESIGN, SPECIFICATION import paddle.nn.functional as F @@ -178,14 +179,13 @@ class SlotRecognition: else: updates[key] = value prob_updates[key] = slot_probabilities[key] - elif key == PROGRAM_NAVIGATION or key == PAGE: - if "施" in value: - updates[key] = "施工生产管理平台" + elif key == DESIGN_SPECIFICATION: + if is_design_file(value): + updates[DESIGN] = value + prob_updates[DESIGN] = 1 else: - updates[key] = value - prob_updates[key] = slot_probabilities[key] - - # 先不处理 PROJECT_DEPARTMENT,后续单独处理 + updates[SPECIFICATION] = value + prob_updates[SPECIFICATION] = 1 elif key != PROJECT_DEPARTMENT: updates[key] = value prob_updates[key] = slot_probabilities[key] diff --git a/api/utils.py b/api/utils.py index a4078a4..4392a57 100644 --- a/api/utils.py +++ b/api/utils.py @@ -13,7 +13,7 @@ import re from globalData import GlobalData from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS, CONSTRUCTION_UNIT, IMPLEMENTATION_ORG, \ SUBCONTRACTOR, PROJECT_NAME, PROJECT_DEPARTMENT, RISK_LEVEL, TEAM_NAME, USELESS_PROGRAM_DEPARTMENT_WORDS, \ - SLOT_KEYWORDS + SLOT_KEYWORDS, design_suffix, DESIGN, SPECIFICATION, PICTURE, USELESS_DESIGN_WORDS from logger_util import setup_logger @@ -64,6 +64,7 @@ def load_standard_json_data(path): # print(f"[Error] Failed to load local JSON file: {e}") return {} + #将字典序列的josn 存入本地文件 def save_dict_to_file(data: dict, file_path: str): """ @@ -83,6 +84,7 @@ def save_dict_to_file(data: dict, file_path: str): # print(f"[Error] 写入 JSON 文件失败:{e}") logger.error("[Error] 写入 JSON 文件失败:", exc_info=e) + #从指定文件中加载标准化的名称列表。 def load_standard_name_list(file_path: str): """ @@ -111,6 +113,7 @@ def load_standard_name_list(file_path: str): # print(f"读取文件时发生错误:{e}", flush=True) raise Exception(f"错误:文件 {file_path} 不存在") + #将标准化名称列表写入指定文件中,每行一个名称。 def save_standard_name_list_to_file(name_list, file_path): """ @@ -130,6 +133,7 @@ def save_standard_name_list_to_file(name_list, file_path): except Exception as e: logger.error(f"[Error] 写入文件失败:{e}") + def extract_number(text): """ 提取项目部中的数字(支持阿拉伯数字和中文数字),并转换为统一格式(中文数字)。 @@ -148,6 +152,7 @@ def replace_arabic_with_chinese(text): 将字符串中所有连续的阿拉伯数字转换为对应的中文数字。 示例:2024年25号 -> 二千零二十四年二十五号 """ + def convert(match): num_str = match.group() try: @@ -187,6 +192,7 @@ def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, else: return [mapping_dict[m[0]] for m in high_conf_matches[:top_k]] + def fuzzy_match_and_filter_only_high_score(input_key, match_pool, mapping_dict, high_score=90, top_k=3): """ 对输入字符串在候选池中执行模糊匹配,并返回匹配程度高的映射原始值。 @@ -212,6 +218,7 @@ def fuzzy_match_and_filter_only_high_score(input_key, match_pool, mapping_dict, else: return [mapping_dict[m[0]] for m in high_conf_matches[:top_k]] + def standardize_name(input_name, clean_func, simply_map, pinyin_map, lower_score=70, high_score=85): """ 通用名称标准化函数,按中文 → 清洗 → 简化匹配 → 拼音匹配 的顺序进行处理。 @@ -234,6 +241,7 @@ def standardize_name(input_name, clean_func, simply_map, pinyin_map, lower_score result = fuzzy_match_and_filter(pinyin_input, list(pinyin_map.keys()), pinyin_map, lower_score, high_score) return result + def standardize_name_only_high_score(input_name, clean_func, simply_map, pinyin_map, high_score=90): """ 通用名称标准化函数,按中文 → 清洗 → 简化匹配 → 拼音匹配 的顺序进行处理。 @@ -256,6 +264,7 @@ def standardize_name_only_high_score(input_name, clean_func, simply_map, pinyin_ result = fuzzy_match_and_filter_only_high_score(pinyin_input, list(pinyin_map.keys()), pinyin_map, high_score) return result + #标准化班组名称 def standardize_team_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90): """ @@ -283,7 +292,9 @@ def standardize_sub_company(input_name, simply_map, pinyin_map, lower_score=55, :return: 匹配的标准公司名列表 """ temp_input_name = replace_arabic_with_chinese(input_name) - return standardize_name(temp_input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score) + return standardize_name(temp_input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, + high_score) + def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90): """ @@ -365,7 +376,8 @@ def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin limit=len(origin_name_list)) # 找到所有相似度 > 80 的匹配项 original_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score] - logger.info(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}") + logger.info( + f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}") combined_low_confidence_matches = [] if original_high_confidence_matches: @@ -382,7 +394,7 @@ def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin return list(dict.fromkeys(combined_low_confidence_matches)) -def generate_project_prompt_with_key(matched_projects, original_name="", slot_key = IMPLEMENTATION_ORG): +def generate_project_prompt_with_key(matched_projects, original_name="", slot_key=IMPLEMENTATION_ORG): """ 生成提示信息,用于让用户确认匹配的项目名或分公司名或项目名。 @@ -393,7 +405,8 @@ def generate_project_prompt_with_key(matched_projects, original_name="", slot_ke 返回: str: 生成的提示信息。如果未找到匹配项,返回提示用户提供更准确信息的字符串。 """ - logger.info(f"generate_project_prompt_with_key slot_key:{slot_key},original_name:{original_name},matched_projects:{matched_projects} ") + logger.info( + f"generate_project_prompt_with_key slot_key:{slot_key},original_name:{original_name},matched_projects:{matched_projects} ") type = "" if slot_key == CONSTRUCTION_UNIT: type = "建管单位名" @@ -409,7 +422,7 @@ def generate_project_prompt_with_key(matched_projects, original_name="", slot_ke # print(f"generate_project_prompt_with_key type:{type} ") logger.info(f"generate_project_prompt_with_key type:{type} ") if not matched_projects: - if slot_key in (CONSTRUCTION_UNIT,IMPLEMENTATION_ORG,SUBCONTRACTOR): + if slot_key in (CONSTRUCTION_UNIT, IMPLEMENTATION_ORG, SUBCONTRACTOR): return f"

未找到匹配的公司名:{original_name},请提供更准确的公司名信息。

" else: return f"

未找到匹配的:{original_name},请提供更准确的信息。

" @@ -424,6 +437,7 @@ def generate_project_prompt_with_key(matched_projects, original_name="", slot_ke html_parts.append("

请确认您要选择哪一个?

") return "\n".join(html_parts) + def generate_project_prompt(matched_projects, original_name="", type="项目部名"): """ 生成提示信息,用于让用户确认匹配的项目名或分公司名或项目名。 @@ -448,6 +462,7 @@ def generate_project_prompt(matched_projects, original_name="", type="项目部 html_parts.append("

请确认您要选择哪一个?

") return "\n".join(html_parts) + def generate_confirm_prompt(matched_projects, original_name="", type="项目部名"): """ 生成提示信息,用于让用户确认匹配的项目名或分公司名或项目名。 @@ -495,6 +510,8 @@ company_symbols_pattern = re.compile(r"[\s\W_]+") useless_team_leader_words_pattern = re.compile("班组") +useless_design_words_pattern = re.compile("|".join(USELESS_DESIGN_WORDS)) + def clean_useless_project_name(name: str) -> str: # 去掉无意义词 @@ -510,11 +527,13 @@ def clean_useless_company_name(name: str) -> str: name = company_symbols_pattern.sub("", name) return name.strip() + def clean_useless_team_leader_name(name: str) -> str: # 去掉无意义词 name = useless_team_leader_words_pattern.sub("", name) return name.strip() + #去掉项目部里面的不重要词汇 def clean_useless_program_departement_name(name: str) -> str: # 去掉无意义词 @@ -523,6 +542,21 @@ def clean_useless_program_departement_name(name: str) -> str: name = project_symbols_pattern.sub("", name) return name.strip() + +def clean_useless_design_name(name: str) -> str: + # 去掉无意义词 + name = useless_design_words_pattern.sub("", name) + # 去掉数字、字母、符号 + name = project_symbols_pattern.sub("", name) + return name.strip() + + +def clean_useless_specification_name(name: str) -> str: + # 去掉数字、字母、符号 + name = project_symbols_pattern.sub("", name) + return name.strip() + + #槽位缺失检查 def check_lost(int_res, slot): #labels: ["天气查询","通用对话","页面切换","日计划数量查询","周计划数量查询","日计划作业内容","周计划作业内容","施工人数","作业考勤人数","知识问答"] @@ -572,7 +606,7 @@ def check_lost(int_res, slot): apologize_str = "非常抱歉," # if int_res == 2: # return CheckResult.NEEDS_MORE_ROUNDS, f"{apologize_str}请问你想查询哪个页面?" - if int_res in [3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15,16]: + if int_res in [3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16]: return CheckResult.NEEDS_MORE_ROUNDS, f"{apologize_str}请问你想查询什么时间的{intention_mapping[int_res]}?" @@ -607,7 +641,8 @@ def check_standard_name_slot_probability(int_res, slot) -> tuple: if match_results and len(match_results) == 1: slot[key] = match_results[0] else: - prompt = generate_project_prompt_with_key(match_results, original_name=slot[IMPLEMENTATION_ORG], slot_key= IMPLEMENTATION_ORG) + prompt = generate_project_prompt_with_key(match_results, original_name=slot[IMPLEMENTATION_ORG], + slot_key=IMPLEMENTATION_ORG) return CheckResult.NEEDS_MORE_ROUNDS, prompt if key == CONSTRUCTION_UNIT: @@ -618,7 +653,8 @@ def check_standard_name_slot_probability(int_res, slot) -> tuple: if match_results and len(match_results) == 1: slot[key] = match_results[0] else: - prompt = generate_project_prompt_with_key(match_results, original_name=slot[CONSTRUCTION_UNIT], slot_key= CONSTRUCTION_UNIT) + prompt = generate_project_prompt_with_key(match_results, original_name=slot[CONSTRUCTION_UNIT], + slot_key=CONSTRUCTION_UNIT) return CheckResult.NEEDS_MORE_ROUNDS, prompt if key == SUBCONTRACTOR: @@ -629,12 +665,14 @@ def check_standard_name_slot_probability(int_res, slot) -> tuple: if match_results and len(match_results) == 1: slot[key] = match_results[0] else: - prompt = generate_project_prompt_with_key(match_results, original_name=slot[SUBCONTRACTOR], slot_key= SUBCONTRACTOR) + prompt = generate_project_prompt_with_key(match_results, original_name=slot[SUBCONTRACTOR], + slot_key=SUBCONTRACTOR) return CheckResult.NEEDS_MORE_ROUNDS, prompt if key == PROJECT_DEPARTMENT: logger.info(f"check_standard_name_slot 原始项目部名 : {slot[PROJECT_DEPARTMENT]}") - match_results = standardize_projectDepartment(slot[IMPLEMENTATION_ORG], value, GlobalData.standard_company_program, + match_results = standardize_projectDepartment(slot[IMPLEMENTATION_ORG], value, + GlobalData.standard_company_program, high_score=95) logger.info(f"check_standard_name_slot 匹配后项目部名: result:{match_results}") if match_results and len(match_results) == 1: @@ -657,9 +695,139 @@ def check_standard_name_slot_probability(int_res, slot) -> tuple: "五级"]: return CheckResult.NEEDS_MORE_ROUNDS, "您查询的风险等级在系统中未找到,请确认风险等级后再次提问" + #前提已经做过公司名和工程名的标准化 + return standardize_specification_design_pic(slot) + # return CheckResult.NO_MATCH, "" + + +def standardize_implement_company(slot_item) -> tuple: + if IMPLEMENTATION_ORG in slot_item: + value = slot_item[IMPLEMENTATION_ORG] + logger.info(f"standardize_specification_design_pic 原始分公司名 : {value}") + match_results = standardize_sub_company(value, GlobalData.simply_to_standard_company_name_map, + GlobalData.pinyin_simply_to_standard_company_name_map, 70, 90) + logger.info(f"standardize_specification_design_pic 匹配后分公司名: result:{match_results}") + if match_results and len(match_results) == 1: + slot_item[IMPLEMENTATION_ORG] = match_results[0] + else: + prompt = generate_project_prompt_with_key(match_results, original_name=slot_item[IMPLEMENTATION_ORG], + slot_key=IMPLEMENTATION_ORG) + return CheckResult.NEEDS_MORE_ROUNDS, prompt return CheckResult.NO_MATCH, "" +def standardize_project(slot_item) -> tuple: + if PROJECT_NAME in slot_item: + value = slot_item[PROJECT_NAME] + logger.info(f"standardize_specification_design_pic 原始工程名 : {slot_item[PROJECT_NAME]}") + match_results = standardize_project_name(value, GlobalData.simply_to_standard_project_name_map, + GlobalData.pinyin_simply_to_standard_project_name_map, 70, 90) + logger.info(f"standardize_specification_design_pic 匹配后工程名 :result:{match_results}") + + if match_results and len(match_results) == 1: + slot_item[PROJECT_NAME] = match_results[0] + else: + prompt = generate_project_prompt(match_results, original_name=slot_item[PROJECT_NAME], type="工程名") + return CheckResult.NEEDS_MORE_ROUNDS, prompt + return CheckResult.NO_MATCH, "" + + +def standardize_design(slot_item) -> tuple: + if PROJECT_NAME in slot_item: + value = slot_item[PROJECT_NAME] + logger.info(f"standardize_specification_design_pic 原始工程名 : {slot_item[PROJECT_NAME]}") + match_results = standardize_project_name(value, GlobalData.simply_to_standard_project_name_map, + GlobalData.pinyin_simply_to_standard_project_name_map, 70, 90) + logger.info(f"standardize_specification_design_pic 匹配后工程名 :result:{match_results}") + + if match_results and len(match_results) == 1: + slot_item[PROJECT_NAME] = match_results[0] + else: + prompt = generate_project_prompt(match_results, original_name=slot_item[PROJECT_NAME], type="工程名") + return CheckResult.NEEDS_MORE_ROUNDS, prompt + return CheckResult.NO_MATCH, "" + + +def standardize_specification_design_pic(slot) -> tuple: + # #分公司名标准化 + # result_type, prompt = standardize_implement_company(slot) + # if CheckResult.NEEDS_MORE_ROUNDS == result_type: + # return result_type, prompt + + standard_implement_company = slot[IMPLEMENTATION_ORG] if IMPLEMENTATION_ORG in slot else "" + + # #工程名标准化 + # result_type, prompt = standardize_project(slot) + # if CheckResult.NEEDS_MORE_ROUNDS == result_type: + # return result_type, prompt + standard_project = slot[PROJECT_NAME] if PROJECT_NAME in slot else "" + simple_design_map = {} + + if DESIGN in slot: + value = slot[DESIGN] + temp_content = GlobalData.get_contents_by_company_proj(standard_implement_company, standard_project) + if temp_content: + design_list = temp_content["方案"] + simple_design_map.update({ + clean_useless_design_name(kw): kw for kw in design_list + }) + simply_input = clean_useless_design_name(value) + result = fuzzy_match_and_filter(simply_input, list(simple_design_map.keys()), simple_design_map, 70, 90) + if result and len(result) == 1: + slot[DESIGN] = result[0] + else: + prompt = generate_project_prompt(result, original_name=slot[DESIGN], type="方案名") + print(f"方案名标准化返回:{prompt}") + return CheckResult.NEEDS_MORE_ROUNDS, prompt + else: + prompt = generate_project_prompt([], original_name=slot[DESIGN], type="方案名") + return CheckResult.NEEDS_MORE_ROUNDS, prompt + return CheckResult.NO_MATCH, '' + + elif SPECIFICATION in slot: + value = slot[SPECIFICATION] + temp_content = GlobalData.get_contents_by_company_proj(standard_implement_company, standard_project) + if temp_content: + design_list = temp_content["规范规程"] + simple_design_map.update({ + clean_useless_specification_name(kw): kw for kw in design_list + }) + simply_input = clean_useless_specification_name(value) + result = fuzzy_match_and_filter(simply_input, list(simple_design_map.keys()), simple_design_map, 70, 90) + if result and len(result) == 1: + slot[SPECIFICATION] = result[0] + else: + prompt = generate_project_prompt(result, original_name=slot[SPECIFICATION], type="规范规程名") + print(f"规程规范名标准化返回:{prompt}") + return CheckResult.NEEDS_MORE_ROUNDS, prompt + else: + prompt = generate_project_prompt([], original_name=slot[DESIGN], type="规范规程名") + return CheckResult.NEEDS_MORE_ROUNDS, prompt + return CheckResult.NO_MATCH, '' + + elif PICTURE in slot: + value = slot[PICTURE] + temp_content = GlobalData.get_contents_by_company_proj(standard_implement_company, standard_project) + if temp_content: + design_list = temp_content["图纸"] + simple_design_map.update({ + kw: kw for kw in design_list + }) + result = fuzzy_match_and_filter(value, list(simple_design_map.keys()), simple_design_map, 70, 90) + if result and len(result) == 1: + slot[PICTURE] = result[0] + else: + prompt = generate_project_prompt(result, original_name=slot[PICTURE], type="图纸名") + print(f"图纸名标准化返回:{prompt}") + return CheckResult.NEEDS_MORE_ROUNDS, prompt + else: + prompt = generate_project_prompt([], original_name=slot[DESIGN], type="规范规程名") + return CheckResult.NEEDS_MORE_ROUNDS, prompt + return CheckResult.NO_MATCH, '' + + return CheckResult.NO_MATCH, '' + + def process_msg_content(content): if not any(keyword in content for keyword in SLOT_KEYWORDS): match = re.search(r"^.*?[。!?.!?::]", content) @@ -669,4 +837,10 @@ def process_msg_content(content): else: return content.strip() else: - return content \ No newline at end of file + return content + + +def is_design_file(file_name): + # 清除结尾的引号、空格、标点等 + text_clean = re.sub(r"[》〉》】】))>)>」』」》))》」』)】\s]+$", "", file_name) + return text_clean.endswith(tuple(design_suffix)) diff --git a/generated_data/generated.py b/generated_data/generated.py index 22220f3..0655a4d 100644 --- a/generated_data/generated.py +++ b/generated_data/generated.py @@ -50,7 +50,8 @@ BASE_DATA = { "宿州萧砀线路工程建筑部分", "1000kV淮芜Ⅰ线(PROJ-2020-0204-0003)", "35kV接地极线路雁淮线", - "110kV接地极线路(吉泉线)(PROJ-2020-0204-0002)" + "110kV接地极线路(吉泉线)(PROJ-2020-0204-0002)", + "国网安徽宣城供电公司500kV河沥变加装固定融冰装置项目工程" ], # 项目部名称 "project_departments": ["第一项目部金上","调试一队", "第9项目管理部","第9项目管理部门", "金上第十一项目部门", "第八项目管理部(合肥)", "肥东9号项目部", @@ -77,10 +78,11 @@ BASE_DATA = { "risk_levels": ["1级", "一级", "二级", "5级", "四级"], # 8+2工况 "operatings": ["8+2工况", "8加2工况"], - # 页面切换 + # 页面切换,不能有方案,图纸和规程规范这些数据集出现在pages,否则会冲突 "pages": ["风险管控", "日计划", "周风险", "日计划统计报表", "日计划推送", "生产管控中心", "考勤统计详情", "今日作业计划", "周风险统计报表", "周风险推送", "进度管理", "技术管理", "项目团队", "质量管理", - "云上会议", "项目巡航", "施工生产管理平台"], + "云上会议", "项目巡航", "施工生产管理平台", "数字化项目部","数字化项目部管理平台","施工生产管理平台", + "经营管理", "物资管理", "共享资料", "党建+", "党建加", "摄像头", "视频"], # 具体人名 "person_names": ["何东洋", "李东","王孙强林"], # 人名查询目标 @@ -89,10 +91,22 @@ BASE_DATA = { # 工程状态 "project_status_s": ["在建", "在作业", "在施工",""], + "pic_names": ["四号线施工图", "框架柱详图", "500kvgis室吊车梁布置图", "_站区道路及进站道路详图_A2", "_辅助用房建筑设计说明一_A2","平断面定位图目录", + "基础明细表","杆塔明细表","(500-SJC31151)_1-110 ","接地装置施工图","平断面定位图卷册说明"], + + "design_specification_names": [ + "《35kV电力电缆交流耐压试验方案》","220kV南蒙2753线拆线、拆塔施工方案","悬索封网实验方案","灌注桩承台基础施工方案" + "一般跨越施工措施", "省道专项施工方案","吊车组立角钢塔施工方案","承台基础及接地施工措施","项目管理实施规划","电力电缆方案","线路拆旧跨越110kV线路施工方案", + "断面悬浮抱杆组塔施工方案","灌注桩基础及接地施工措施" + + "110kV-750kV架空输电线路铁塔基础施工工艺导则","国网(基建2)112-2022 国家电网有限公司输变电工程建设质量管理规定","1000kV架空输电线路施工质量检验及评定规程","《国家电网有限公司施工项目部标准化管理手册线路工程分册》", + "国家电网有限公司输变电工程标准工艺(电缆工程分册)2022版","架空输电线路螺旋锚基础施工及质量验收规范","国家电网有限公司安全生产反违章工作管理办法"], + + #皖送天网 - "sky_nets": ["摄像头", "视频"], + # "sky_nets": ["摄像头", "视频"], #项目巡航 - "program_navigations": ["数字化项目部", "数字化项目部管理平台", "施工生产管理平台"], + # "program_navigations": ["数字化项目部", "数字化项目部管理平台", "施工生产管理平台"], } # 自然语言模板配置 @@ -748,39 +762,43 @@ TEMPLATE_CONFIG = { "date": ["今日", "昨日", "2024年5月24日", "5月24日", "今天", "昨天"], "templates": [ ("打开{page}页面", ["page"]), - ("打开{page}", ["page"]), - ("切换{page}模块", ["page"]), - ("切换到{page}页面", ["page"]), - ("跳转到{page}。", ["page"]), - ("跳转到{page}模块", ["page"]), + ("打开{page}<页面>", ["page"]), ("切换到{page}页面", ["page"]), ("切换{page}模块", ["page"]), - ("请打开{page}模块", ["page"]), - ("请打开{page}。", ["page"]), - ("请切换到{page}页面", ["page"]), - ("切换{page}", ["page"]), - #施工生产管理平台 - ("打开{program_navigation}", ["program_navigation"]), - ("打开{program_navigation}。", ["program_navigation"]), - #项目巡航:分公司 - ("打开{implementation_organization}{program_navigation}", ["implementation_organization", "program_navigation"]), - ("打开{implementation_organization}{program_navigation}。", ["implementation_organization", "program_navigation"]), - #项目巡航:分公司、项目部 - ("打开{implementation_organization}{project_department}{program_navigation}", - ["implementation_organization", "project_department", "program_navigation"]), - #项目巡航:分公司 - ("切换到{implementation_organization}{program_navigation}", - ["implementation_organization", "program_navigation"]), - #项目巡航,工程 - ("打开{project_name}{program_navigation}", ["project_name", "program_navigation"]), - #皖智天网,工程名摄像头 - ("打开{project_name}{sky_net}", ["project_name", "sky_net"]), - ("切换到{project_name}{sky_net}", ["project_name", "sky_net"]), - #皖智天网,班组名摄像头 - ("切换到{team_name}{sky_net}", ["team_name", "sky_net"]), - ("切换{team_name}{sky_net}", ["team_name", "sky_net"]), - ("打开{team_name}{sky_net}", ["team_name", "sky_net"]), - #施工生产管理平台 + ("切换{page}模块", ["page"]), + + ("打开{implementation_organization}{page}", ["implementation_organization", "page"]), + + ("打开{implementation_organization}{project_department}{page}", + ["implementation_organization", "project_department", "page"]), + + ("切换到{implementation_organization}{page}", + ["implementation_organization", "page"]), + + ("打开{project_name}{page}", ["project_name", "page"]), + ("切换到{project_name}{page}", ["project_name", "page"]), + + ("切换到{team_name}{page}", ["team_name", "page"]), + ("打开{team_name}{page}", ["team_name", "page"]), + + # design_names, pic_names,specification_names + #方案和规程规范 + ("打开{design_specification_name}<方案>", ["project_name", "design_specification_name"]), + ("打开{design_specification_name}", ["project_name", "design_specification_name"]), + ("打开{project_name}{design_specification_name}<方案>", ["project_name", "design_specification_name"]), + + ("打开{project_name}的{design_specification_name}", ["project_name", "design_specification_name"]), + + ("打开{implementation_organization}{project_name}的{design_specification_name}", + ["implementation_organization", "project_name", "design_specification_name"]), + + #图纸 + ("打开{pic_name}", ["pic_name"]), + ("打开{project_name}{pic_name}", ["project_name", "pic_name"]), + ("打开{project_name}{pic_name}<图纸>", ["project_name", "pic_name"]), + ("打开{project_name}的{pic_name}", ["project_name", "pic_name"]), + ("打开{implementation_organization}{project_name}的{pic_name}", + ["implementation_organization", "project_name", "pic_name"]), ] }, "作业面查询": { @@ -1348,8 +1366,11 @@ def generate_natural_samples(config, label): "person_name": BASE_DATA["person_names"], "person_query_type": BASE_DATA["person_query_types"], "project_status": BASE_DATA["project_status_s"], - "sky_net": BASE_DATA["sky_nets"], - "program_navigation": BASE_DATA["program_navigations"], + "pic_name": BASE_DATA["pic_names"], + "design_specification_name": BASE_DATA["design_specification_names"], + + # "sky_net": BASE_DATA["sky_nets"], + # "program_navigation": BASE_DATA["program_navigations"], } for template, variables in config["templates"]: