diff --git a/api/constants.py b/api/constants.py index fb20c56..8970043 100644 --- a/api/constants.py +++ b/api/constants.py @@ -1,6 +1,12 @@ # constants.py SIMILARITY_VALUE = 75 -# +#匹配工程名时,需要过滤掉的词汇 +USELESS_PROJECT_WORDS = ["项目", "工程", "变电站", "线路", "变电","千伏" ,"换流站","公司","直流"] + +#匹配公司名时,需要过滤掉的词汇 +USELESS_COMPANY_WORDS = ["公司","分公司"] + + COMPANYNAME_SHA = "顺安电网建设有限公司" #日期 DATE = "date" diff --git a/api/main.py b/api/main.py index 4458dce..102ad04 100644 --- a/api/main.py +++ b/api/main.py @@ -3,24 +3,26 @@ from pydantic import BaseModel, Field from werkzeug.exceptions import HTTPException from typing import List from pydantic import ValidationError +import time from intentRecognition import IntentRecognition from slotRecognition import SlotRecognition from utils import CheckResult, load_standard_name, generate_project_prompt, \ - load_standard_data, text_to_pinyin, multiple_standardize_single_name, \ - standardize_projectDepartment + load_standard_data, text_to_pinyin, \ + standardize_projectDepartment, standardize_project_name, clean_useless_project_name, \ + clean_useless_company_name, standardize_sub_company from constants import PROJECT_NAME, PROJECT_DEPARTMENT, SIMILARITY_VALUE, IMPLEMENTATION_ORG, RISK_LEVEL from config import * -MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-30750" +MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-31940" MODEL_UIE_PATH = R"../uie/output/checkpoint-31350" # 类别名称列表 labels = [ "天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询", "日计划作业内容", "周计划作业内容", "施工人数", "作业考勤人数", "知识问答", - "通用对话", "作业面查询","班组人数查询","班组数查询","作业面内容","班组详情" + "通用对话", "作业面查询", "班组人数查询", "班组数查询", "作业面内容", "班组详情" ] # 标签映射 @@ -41,13 +43,6 @@ label_map = { 13: 'B-teamName', 26: 'I-teamName', } -# # 初始化工具类 -intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels) - -# 初始化槽位识别工具类 -slot_recognizer = SlotRecognition(MODEL_UIE_PATH, label_map) -# 设置Flask应用 - #标准公司名和项目名 standard_company_program = load_standard_data("./standard_data/standard_company_program.json") @@ -61,12 +56,30 @@ standard_company_name_list = list(standard_company_program.keys()) pinyin_to_standard_company_name_map = {text_to_pinyin(kw): kw for kw in standard_company_name_list} standard_company_name_pinyin_list = list(pinyin_to_standard_company_name_map.keys()) +simply_to_standard_project_name_map = {clean_useless_project_name(kw): kw for kw in standard_project_name_list} + +pinyin_simply_to_standard_project_name_map = {text_to_pinyin(clean_useless_project_name(kw)): kw for kw in + standard_project_name_list} + +simply_to_standard_company_name_map = {clean_useless_company_name(kw): kw for kw in standard_company_name_list} + +pinyin_simply_to_standard_company_name_map = {text_to_pinyin(clean_useless_company_name(kw)): kw for kw in + standard_company_name_list} + +# 初始化工具类 +intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels) + +# 初始化槽位识别工具类 +slot_recognizer = SlotRecognition(MODEL_UIE_PATH, label_map) +# 设置Flask应用 + print(f"标准化的工程名是:{standard_project_name_list}", flush=True) print(f"pinyin标准化的工程名是 list:{standard_project_name_pinyin_list}", flush=True) -print(f"pinyin-工程名对应关系 map:{pinyin_to_standard_company_name_map}", flush=True) +print(f"pinyin-工程民对应关系 map:{pinyin_to_standard_company_name_map}", flush=True) app = Flask(__name__) + # 统一的异常处理函数 @app.errorhandler(Exception) def handle_exception(e): @@ -217,7 +230,8 @@ def agent(): entities = slot_recognizer.recognize(query) print( - f"第一轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",flush=True) + f"第一轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}", + flush=True) # 多轮 else: res = extract_multi_chat(messages) @@ -233,7 +247,8 @@ def agent(): }) entities = slot_recognizer.recognize(res) print( - f"多轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",flush=True) + f"多轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}", + flush=True) #必须槽位缺失检查 status, sk = check_lost(predicted_id, entities) @@ -261,11 +276,12 @@ def agent(): except Exception as e: return jsonify({"error": str(e)}), 500 # 捕捉其他错误并返回 + def extract_multi_chat(messages): from openai import OpenAI client = OpenAI(base_url=api_base_url, api_key=api_key) - latest_message = messages[-1] # 最后一条用户提问 + latest_message = messages[-1] # 最后一条用户提问 if latest_message.role == "user": latest_user_question = latest_message.content.strip() time_prefixes = ["今天", "昨天", "本周", "下周", "明天", "今日"] # 可扩展的时间前缀列表 @@ -359,7 +375,7 @@ def extract_multi_chat(messages): messages=message, model=model_name, max_tokens=100, - temperature=0.3, # 降低随机性,提高确定性 + temperature=0.1, # 降低随机性,提高确定性 stream=False ) @@ -367,6 +383,7 @@ def extract_multi_chat(messages): print(f"多轮意图后用户想要的问题是:{res}", flush=True) return res + def check_lost(int_res, slot): #labels: ["天气查询","通用对话","页面切换","日计划数量查询","周计划数量查询","日计划作业内容","周计划作业内容","施工人数","作业考勤人数","知识问答"] mapping = { @@ -386,7 +403,7 @@ def check_lost(int_res, slot): intention_mapping = {2: "页面切换", 3: "日计划数量查询", 4: "周计划数量查询", 5: "日计划作业内容", 6: "周计划作业内容", 7: "施工人数", 8: "作业考勤人数", 11: "作业面查询", - 12:"班组人数查询", 13:"班组数查询", 14:"作业面内容", 15:"班组详情"} + 12: "班组人数查询", 13: "班组数查询", 14: "作业面内容", 15: "班组详情"} if not mapping.__contains__(int_res): return 0, "" #提取的槽位信息 @@ -411,7 +428,7 @@ def check_lost(int_res, slot): return CheckResult.NO_MATCH, cur_k #符合当前意图的的必须槽位,但是不在提取的槽位信息里 left = [x for x in mapping[int_res][idx] if x not in cur_k] - print(f"符合当前意图的的必须槽位,但是不在提取的槽位信息里, {left}",flush=True) + print(f"符合当前意图的的必须槽位,但是不在提取的槽位信息里, {left}", flush=True) apologize_str = "非常抱歉," if int_res == 2: return CheckResult.NEEDS_MORE_ROUNDS, f"{apologize_str}请问你想查询哪个页面?" @@ -434,8 +451,9 @@ def check_standard_name_slot(int_res, slot) -> tuple: for key, value in slot.items(): if key == PROJECT_NAME: print(f"check_standard_name_slot 原始工程名 : {slot[PROJECT_NAME]}") - match_results = multiple_standardize_single_name(value, standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,80) - print(f"check_standard_name_slot 匹配后工程名 :result:{match_results}",flush=True) + match_results = standardize_project_name(value, simply_to_standard_project_name_map, + pinyin_simply_to_standard_project_name_map, 70, 90) + print(f"check_standard_name_slot 匹配后工程名 :result:{match_results}", flush=True) if match_results and len(match_results) == 1: slot[key] = match_results[0] else: @@ -444,8 +462,9 @@ def check_standard_name_slot(int_res, slot) -> tuple: if key == IMPLEMENTATION_ORG and slot[key] != "公司": print(f"check_standard_name_slot 原始分公司名 : {slot[IMPLEMENTATION_ORG]}") - match_results = multiple_standardize_single_name(value, standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map, lower_score=50, high_score=80, isArabicNumConv = True) - print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}",flush=True) + match_results = standardize_sub_company(value, simply_to_standard_company_name_map, + pinyin_simply_to_standard_company_name_map, 55, 80) + print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}", flush=True) if match_results and len(match_results) == 1: slot[key] = match_results[0] else: @@ -454,8 +473,9 @@ def check_standard_name_slot(int_res, slot) -> tuple: if key == PROJECT_DEPARTMENT: print(f"check_standard_name_slot 原始项目部名 : {slot[PROJECT_DEPARTMENT]}") - match_results = standardize_projectDepartment(slot[IMPLEMENTATION_ORG], value, standard_company_program, high_score=85) - print(f"check_standard_name_slot 匹配后项目部名: result:{match_results}",flush=True) + match_results = standardize_projectDepartment(slot[IMPLEMENTATION_ORG], value, standard_company_program, + high_score=90) + print(f"check_standard_name_slot 匹配后项目部名: result:{match_results}", flush=True) if match_results and len(match_results) == 1: slot[key] = match_results[0] else: @@ -463,92 +483,132 @@ def check_standard_name_slot(int_res, slot) -> tuple: return CheckResult.NEEDS_MORE_ROUNDS, prompt if key == RISK_LEVEL: - if slot[RISK_LEVEL] not in["2级","3级","4级","5级"] and slot[RISK_LEVEL] not in["二级","三级","四级","五级"]: + if slot[RISK_LEVEL] not in ["2级", "3级", "4级", "5级"] and slot[RISK_LEVEL] not in ["二级", "三级", "四级", + "五级"]: return CheckResult.NEEDS_MORE_ROUNDS, "您查询的风险等级在系统中未找到,请确认风险等级后再次提问" return CheckResult.NO_MATCH, "" + +# # test_cases = [ -# ("安徽宏源电力建设有限公司", "第三项目管理部"), # 期望返回所有"第三项目管理部" -# ("安徽宏源电力建设有限公司", "第九项目部"), # 期望返回 "第九项目管理部" -# ("顺安电网公司", "第二项目部"), # 期望匹配"顺安电网建设有限公司"下的"第二项目管理部" -# ("送电一公司", "第三项目部"), # 期望返回"第三项目管理部" -# ("送电2公司", "第三项目部"), # 期望返回"第三项目管理部" -# ("消防分公司", "第七项目部"), # 期望返回"第七项目管理部 -# ("建筑分公司", "第七项目部"), # 期望返回"第七项目管理部" -# ("建筑消防分公司", "第七项目部"), # 期望返回"第七项目管理部" -# ("建筑分公司消防分公司", "第七项目部") # 期望返回"第七项目管理部" +# ("送一分公司"), +# ("送二分公司"), +# ("变电分公司"), +# ("建筑分公司"), +# ("检修试验分公司"), +# ("宏源电力公司"), +# ("宏源电力限公司"), +# ("宏源电力限公司线路"), +# ("宏源电力限公司变电"), +# ("送一分"), +# ("送二分"), +# ("变电分"), +# ("建筑分"), +# ("检修试验分"), +# ("宏源电力"), +# ("红源电力"), +# ("宏源电力有限"), +# ("宏源电力限线路"), +# ("宏源电力限变电"), # ] # -# for company, project in test_cases: -# # result = standardize_company_and_project(company, project,standard_company_program) -# result = standardize_company_and_projectDepartment(company, project,standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map) -# # result = multiple_standardize_single_name("company", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,40,70) -# print(f"输入: {company}, {project} -> 输出: {result}") +# print(f"加权混合策略 分公司名匹配**********************") +# start = time.perf_counter() +# for item in test_cases: +# match_results = standardize_sub_company(item,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80) +# print(f"加权混合策略 分公司名匹配 输入: {item}-> 输出: {match_results}") +# end = time.perf_counter() +# print(f"加权混合策略 耗时: {end - start:.4f} 秒") # -# result = standardize_single_name("送电一公司", standard_company_name_list) -# print(f"输入: 送一分公司-> 输出: {result}") # -# prompt = generate_project_prompt(result, "分公司名") -# print(f"prompt:{prompt}") # -# result = standardize_single_name("合肥中心变", standard_project_name_list) -# print(f"输入: 合肥中心变-> 输出: {result}") +# test_cases = [ +# ("卢集"), +# ("芦集"), +# ("芦集变电站"), +# ("安庆四变电站"), +# ("锦绣变电站"), +# ("滁州护桥变电站"), +# ("合州换流站"), +# ("陕北合州换流站"), +# ("陕北安徽合州换流站"), +# ("金牛变电站"), +# ("香涧鹭岛工程"), +# ("延庆换流站"), +# ("国网延庆换流站"), +# ("国网北京延庆换流站"), +# ("陶楼广银线路工程"), +# ("紫蓬变电站"), +# ("宿州萧砀变电站"), +# ("冯井变电站"), +# ("富邦秋浦变电站"), +# ("包河玉龙变电站"), # -# prompt = generate_project_prompt(result, "工程名") -# print(f"prompt:{prompt}") +# ("绿雪莲塘工程"), +# ("合肥循环园工程"), +# ("合肥长临河工程"), +# ("合肥中心变"), +# ("锁库变电站工程"), +# ("槽坊工程"), +# +# ("安庆四500kV变电站新建工程(PROJ-2024-0862)"), +# ("锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)"), +# ("渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)"), +# ("先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)"), +# ("安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)"), +# ("合州士800千伏换流站电气安装A包(PROJ-2025-0056)"), +# ("卫田-陶楼T接首业变电站110kV电缆线路工程(PROJ-2024-1236)"), +# ("谯城(亳三)-希夷220kV线路工程(PROJ-2024-1205)"), +# ] +# print(f"去不重要词汇 工程名匹配******************************************") +# start = time.perf_counter() +# for item in test_cases: +# match_results = standardize_project_name(item, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90) +# print(f"工程名匹配 输入: {item}-> 输出: {match_results}") +# end = time.perf_counter() +# print(f"词集匹配 耗时: {end - start:.4f} 秒") +# +# print(f"项目名匹配******************************************") +# oral_program_name_list = [ +# ("第1项目部"), # 期望返回所有"第三项目管理部" +# ("第2项目部"), +# ("第3项目部"), +# ("第4项目部"), +# ("第5项目部"), +# ("第6项目部"), +# ("第7项目部"), +# ("第8项目部"), +# ("第9项目部"), +# ("第10项目部"), +# ("第11项目部"), +# ("第12项目部"), +# ("第13项目部"), +# ("电缆班"), +# ("调试1队"), +# ("调试2队"), +# ("调试3队"), +# ("调试4队"), +# ("调试5队"), +# ("第一项目管理部"), +# ("第二项目管理部"), +# ("第五项目管理部"), +# ("第十一项目管理部(萧砀线路)"), +# ("第三项目管理部(张店线路)"), +# ("第三项目管理部(岳西线路)"), +# ("第五项目管理部(蚌埠)"), +# ("第三项目管理部(六安线路)"), +# ("第十一项目管理部(宿州线路)"), +# ("调试一队"), +# ("调试二队"), +# ("调试三队"), +# ("电缆班"), +# ] +# +# for company in standard_company_name_list: +# for program in oral_program_name_list: +# match_results = standardize_projectDepartment(company, program, standard_company_program, high_score=90) +# print(f"加权混合策略 项目部名称 输入: 公司:{company},项目部:{program}-> 输出: {match_results}") -# result = standardize_single_name("合肥中心变", standard_project_name_list, 60, 75) -# print(f"输入: 合肥中心变-> 输出: {result}") -# -# result = standardize_single_name("阜阳阜四变电站工程", standard_project_name_list, 60, 75) -# print(f"输入: 阜阳阜四变电站工程-> 输出: {result}") -# -# result = standardize_single_name("芦集变电站", standard_project_name_list, 20, 50) -# print(f"输入: 芦集变电站-> 输出: {result}") -# -# match_results = multiple_standardize_single_name("宋轶分公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80) -# print(f"multiple_standardize_single_name 输入: 宋轶分公司-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("宏源电力公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80) -# print(f"multiple_standardize_single_name 输入: 宏源电力公司-> 输出: {match_results}") -# -# # -# match_results = multiple_standardize_single_name("合肥中心变", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 合肥中心变-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("淮南安丰", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 淮南安丰工程-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("宿州萧砀新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 宿州萧砀新建工程-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("芦集变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 芦集变电站-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("卢集变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 卢集变电站-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("芦集古沟变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 芦集古沟变电站新建工程-> 输出: {match_results}") -# # -# match_results = multiple_standardize_single_name("金牛变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 金牛变电站-> 输出: {match_results}") -# - -# -# company, project = standardize_company_and_projectDepartment("变电分公司","第一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map) -# print(f"company:{company}, project:{project}") -# -# company, project = standardize_company_and_projectDepartment("变电分公司","第十一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map) -# print(f"company:{company}, project:{project}") -# company, project = standardize_company_and_projectDepartment("试验分公司","电缆班", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map) -# print(f"company:{company}, project:{project}") -# company, project = standardize_company_and_projectDepartment("宏源电力投资有限公司","第三项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map) -# print(f"company:{company}, project:{project}") -# - -# match_results = standardize_projectDepartment("安徽宏源电力建设有限公司(变电)", "第3项目部", standard_company_program, high_score=85) -# print(f"match_results:{match_results}") if __name__ == '__main__': app.run(host='0.0.0.0', port=18074, debug=True) diff --git a/api/main_temp.py b/api/main_temp.py index 3aed8ed..a32281a 100644 --- a/api/main_temp.py +++ b/api/main_temp.py @@ -3,18 +3,21 @@ from pydantic import BaseModel, Field from werkzeug.exceptions import HTTPException from typing import List from pydantic import ValidationError +import time from intentRecognition import IntentRecognition from slotRecognition import SlotRecognition from utils import CheckResult, load_standard_name, generate_project_prompt, \ - load_standard_data, text_to_pinyin, multiple_standardize_single_name, \ - standardize_projectDepartment + load_standard_data, text_to_pinyin, \ + standardize_projectDepartment, standardize_project_name, clean_useless_project_name, \ + clean_useless_company_name, standardize_sub_company from constants import PROJECT_NAME, PROJECT_DEPARTMENT, SIMILARITY_VALUE, IMPLEMENTATION_ORG, RISK_LEVEL from config import * -MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-30750" -MODEL_UIE_PATH = R"../uie/output_temp/checkpoint-31350" +MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-17890" +MODEL_UIE_PATH = R"../uie/output/checkpoint-17290" + # 类别名称列表 labels = [ "天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询", @@ -54,6 +57,14 @@ standard_company_name_list = list(standard_company_program.keys()) pinyin_to_standard_company_name_map = {text_to_pinyin(kw): kw for kw in standard_company_name_list} standard_company_name_pinyin_list = list(pinyin_to_standard_company_name_map.keys()) +simply_to_standard_project_name_map = {clean_useless_project_name(kw): kw for kw in standard_project_name_list} + +pinyin_simply_to_standard_project_name_map = {text_to_pinyin(clean_useless_project_name(kw)): kw for kw in standard_project_name_list} + +simply_to_standard_company_name_map = {clean_useless_company_name(kw): kw for kw in standard_company_name_list} + +pinyin_simply_to_standard_company_name_map = {text_to_pinyin(clean_useless_company_name(kw)): kw for kw in standard_company_name_list} + # 初始化工具类 intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels) @@ -435,7 +446,7 @@ def check_standard_name_slot(int_res, slot) -> tuple: for key, value in slot.items(): if key == PROJECT_NAME: print(f"check_standard_name_slot 原始工程名 : {slot[PROJECT_NAME]}") - match_results = multiple_standardize_single_name(value, standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,80) + match_results = standardize_project_name(value, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90) print(f"check_standard_name_slot 匹配后工程名 :result:{match_results}",flush=True) if match_results and len(match_results) == 1: slot[key] = match_results[0] @@ -445,7 +456,7 @@ def check_standard_name_slot(int_res, slot) -> tuple: if key == IMPLEMENTATION_ORG and slot[key] != "公司": print(f"check_standard_name_slot 原始分公司名 : {slot[IMPLEMENTATION_ORG]}") - match_results = multiple_standardize_single_name(value, standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map, lower_score=50, high_score=80, isArabicNumConv = True) + match_results = standardize_sub_company(value,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80) print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}",flush=True) if match_results and len(match_results) == 1: slot[key] = match_results[0] @@ -469,85 +480,126 @@ def check_standard_name_slot(int_res, slot) -> tuple: return CheckResult.NO_MATCH, "" + +# # test_cases = [ -# ("安徽宏源电力建设有限公司(线路)", "第三项目管理部"), # 期望返回所有"第三项目管理部" -# ("送电一分公司", "第8项目管理部"), # 期望返回 "第九项目管理部" -# # ("顺安电网公司", "第二项目部"), # 期望匹配"顺安电网建设有限公司"下的"第二项目管理部" -# # ("送电一公司", "第三项目部"), # 期望返回"第三项目管理部" -# # ("送电2公司", "第三项目部"), # 期望返回"第三项目管理部" -# # ("消防分公司", "第七项目部"), # 期望返回"第七项目管理部 -# # ("建筑分公司", "第七项目部"), # 期望返回"第七项目管理部" -# # ("建筑消防分公司", "第七项目部"), # 期望返回"第七项目管理部" -# # ("建筑分公司消防分公司", "第七项目部") # 期望返回"第七项目管理部" +# ("送一分公司"), +# ("送二分公司"), +# ("变电分公司"), +# ("建筑分公司"), +# ("检修试验分公司"), +# ("宏源电力公司"), +# ("宏源电力限公司"), +# ("宏源电力限公司线路"), +# ("宏源电力限公司变电"), +# ("送一分"), +# ("送二分"), +# ("变电分"), +# ("建筑分"), +# ("检修试验分"), +# ("宏源电力"), +# ("红源电力"), +# ("宏源电力有限"), +# ("宏源电力限线路"), +# ("宏源电力限变电"), # ] +# +# print(f"加权混合策略 分公司名匹配**********************") +# start = time.perf_counter() +# for item in test_cases: +# match_results = standardize_sub_company(item,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80) +# print(f"加权混合策略 分公司名匹配 输入: {item}-> 输出: {match_results}") +# end = time.perf_counter() +# print(f"加权混合策略 耗时: {end - start:.4f} 秒") +# +# +# +# test_cases = [ +# ("卢集"), +# ("芦集"), +# ("芦集变电站"), +# ("安庆四变电站"), +# ("锦绣变电站"), +# ("滁州护桥变电站"), +# ("合州换流站"), +# ("陕北合州换流站"), +# ("陕北安徽合州换流站"), +# ("金牛变电站"), +# ("香涧鹭岛工程"), +# ("延庆换流站"), +# ("国网延庆换流站"), +# ("国网北京延庆换流站"), +# ("陶楼广银线路工程"), +# ("紫蓬变电站"), +# ("宿州萧砀变电站"), +# ("冯井变电站"), +# ("富邦秋浦变电站"), +# ("包河玉龙变电站"), +# +# ("绿雪莲塘工程"), +# ("合肥循环园工程"), +# ("合肥长临河工程"), +# ("合肥中心变"), +# ("锁库变电站工程"), +# ("槽坊工程"), +# +# ("安庆四500kV变电站新建工程(PROJ-2024-0862)"), +# ("锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)"), +# ("渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)"), +# ("先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)"), +# ("安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)"), +# ("合州士800千伏换流站电气安装A包(PROJ-2025-0056)"), +# ("卫田-陶楼T接首业变电站110kV电缆线路工程(PROJ-2024-1236)"), +# ("谯城(亳三)-希夷220kV线路工程(PROJ-2024-1205)"), +# ] +# print(f"去不重要词汇 工程名匹配******************************************") +# start = time.perf_counter() +# for item in test_cases: +# match_results = standardize_project_name(item, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90) +# print(f"工程名匹配 输入: {item}-> 输出: {match_results}") +# end = time.perf_counter() +# print(f"词集匹配 耗时: {end - start:.4f} 秒") +# +# print(f"项目名匹配******************************************") +# oral_program_name_list = [ +# ("第1项目部"), # 期望返回所有"第三项目管理部" +# ("第2项目部"), +# ("第3项目部"), +# ("第4项目部"), +# ("第5项目部"), +# ("第6项目部"), +# ("第7项目部"), +# ("第8项目部"), +# ("第9项目部"), +# ("第10项目部"), +# ("第11项目部"), +# ("第12项目部"), +# ("第13项目部"), +# ("电缆班"), +# ("调试1队"), +# ("调试2队"), +# ("调试3队"), +# ("调试4队"), +# ("调试5队"), +# ("第一项目管理部"), +# ("第二项目管理部"), +# ("第五项目管理部"), +# ("第十一项目管理部(萧砀线路)"), +# ("第三项目管理部(张店线路)"), +# ("第三项目管理部(岳西线路)"), +# ("第五项目管理部(蚌埠)"), +# ("第三项目管理部(六安线路)"), +# ("第十一项目管理部(宿州线路)"), +# ("调试一队"), +# ("调试二队"), +# ("调试三队"), +# ("电缆班"), +# ] +# +# for company in standard_company_name_list: +# for program in oral_program_name_list: +# match_results = standardize_projectDepartment(company, program, standard_company_program, high_score=90) +# print(f"加权混合策略 项目部名称 输入: 公司:{company},项目部:{program}-> 输出: {match_results}") -# for company, project in test_cases: -# result = standardize_projectDepartment(company, project,standard_company_program, high_score=90) -# print(f"输入: {company}, {project} -> 输出: {result}") -# -# result = standardize_single_name("送电一公司", standard_company_name_list) -# print(f"输入: 送一分公司-> 输出: {result}") -# -# prompt = generate_project_prompt(result, "分公司名") -# print(f"prompt:{prompt}") -# -# result = standardize_single_name("合肥中心变", standard_project_name_list) -# print(f"输入: 合肥中心变-> 输出: {result}") -# -# prompt = generate_project_prompt(result, "工程名") -# print(f"prompt:{prompt}") - -# result = standardize_single_name("合肥中心变", standard_project_name_list, 60, 75) -# print(f"输入: 合肥中心变-> 输出: {result}") -# -# result = standardize_single_name("阜阳阜四变电站工程", standard_project_name_list, 60, 75) -# print(f"输入: 阜阳阜四变电站工程-> 输出: {result}") -# -# result = standardize_single_name("芦集变电站", standard_project_name_list, 20, 50) -# print(f"输入: 芦集变电站-> 输出: {result}") -# -# match_results = multiple_standardize_single_name("宋轶分公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80) -# print(f"multiple_standardize_single_name 输入: 宋轶分公司-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("宏源电力公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80) -# print(f"multiple_standardize_single_name 输入: 宏源电力公司-> 输出: {match_results}") -# -# # -# match_results = multiple_standardize_single_name("合肥中心变", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 合肥中心变-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("淮南安丰", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 淮南安丰工程-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("宿州萧砀新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 宿州萧砀新建工程-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("芦集变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 芦集变电站-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("卢集变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 卢集变电站-> 输出: {match_results}") -# -# match_results = multiple_standardize_single_name("芦集古沟变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 芦集古沟变电站新建工程-> 输出: {match_results}") -# # -# match_results = multiple_standardize_single_name("金牛变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70) -# print(f"multiple_standardize_single_name 输入: 金牛变电站-> 输出: {match_results}") -# - -# -# company, project = standardize_company_and_projectDepartment("变电分公司","第一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map) -# print(f"company:{company}, project:{project}") -# -# company, project = standardize_company_and_projectDepartment("变电分公司","第十一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map) -# print(f"company:{company}, project:{project}") -# company, project = standardize_company_and_projectDepartment("试验分公司","电缆班", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map) -# print(f"company:{company}, project:{project}") -# company, project = standardize_company_and_projectDepartment("宏源电力投资有限公司","第三项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map) -# print(f"company:{company}, project:{project}") -# - -# match_results = standardize_projectDepartment("安徽宏源电力建设有限公司(变电)", "第3项目部", standard_company_program, high_score=85) -# print(f"match_results:{match_results}") if __name__ == '__main__': app.run(host='0.0.0.0', port=18073, debug=True) diff --git a/api/utils.py b/api/utils.py index 3f4e590..fcd536a 100644 --- a/api/utils.py +++ b/api/utils.py @@ -1,10 +1,13 @@ from enum import Enum +from typing import cast, Callable from rapidfuzz import process, fuzz -import re +from rapidfuzz.fuzz import WRatio import json from pypinyin import lazy_pinyin +from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS + # 数字转换表(1-20,常见数字) digit_to_chinese = { "1": "一", "2": "二", "3": "三", "4": "四", "5": "五", @@ -14,6 +17,7 @@ digit_to_chinese = { "19": "十九", "20": "二十" } + def arabic_to_chinese_number(text): """ 将文中阿拉伯数字转换为中文数字 @@ -28,10 +32,13 @@ def arabic_to_chinese_number(text): text = text.replace(num, cn) return text + + def text_to_pinyin(text): """将文本转换为拼音字符串""" return ''.join(lazy_pinyin(text)) + def load_standard_data(path): with open(path, "r", encoding="utf-8") as f: return json.load(f) @@ -50,23 +57,99 @@ def extract_number(text): return None +def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, high_score=85, top_k=3): + """ + 对输入字符串在候选池中执行模糊匹配,并返回匹配程度高的映射原始值。 + :param input_key: 清洗后的用于匹配的关键词(如简化名或拼音) + :param match_pool: 可用于匹配的候选集合(一般是映射表的 key) + :param mapping_dict: 匹配项到标准原始名的映射字典 + :param lower_score: 匹配分数的下限,低于该分数视为无效 + :param high_score: 高置信度匹配分数,超过则直接返回所有等分结果 + :param top_k: 如果不满足高置信度,则返回前 top_k 个结果 + :return: 标准原始名的列表,可能为空 + """ + match_results = process.extract(input_key, match_pool, scorer=cast(Callable, WRatio), limit=len(match_pool)) + high_conf_matches = [(m[0], m[1]) for m in match_results if m[1] >= lower_score] + + if not high_conf_matches: + return [] + + max_score = max(high_conf_matches, key=lambda x: x[1]) + best_matches = [m for m in high_conf_matches if m[1] == max_score[1]] + + if max_score[1] >= high_score: + return [mapping_dict[m[0]] for m in best_matches] + else: + return [mapping_dict[m[0]] for m in high_conf_matches[:top_k]] + + +def standardize_name(input_name, clean_func, simply_map, pinyin_map, lower_score=70, high_score=85): + """ + 通用名称标准化函数,按中文 → 清洗 → 简化匹配 → 拼音匹配 的顺序进行处理。 + + :param input_name: 用户输入的原始中文名 + :param clean_func: 清洗函数(针对不同实体类型,如工程名/公司名) + :param simply_map: 简化后的名 → 原始标准名 映射 + :param pinyin_map: 拼音名 → 原始标准名 映射 + :param lower_score: 模糊匹配最低置信分数 + :param high_score: 模糊匹配高置信分数阈值 + :return: 标准名列表,可能为空 + """ + simply_input = clean_func(input_name) + result = fuzzy_match_and_filter(simply_input, list(simply_map.keys()), simply_map, lower_score, high_score) + if result: + return result + + # 拼音匹配 + pinyin_input = text_to_pinyin(simply_input) + result = fuzzy_match_and_filter(pinyin_input, list(pinyin_map.keys()), pinyin_map, lower_score, high_score) + return result + + +def standardize_sub_company(input_name, simply_map, pinyin_map, lower_score=55, high_score=80): + """ + 对用户输入的子公司名称进行标准化,返回最匹配的标准公司名列表。 + + :param input_name: 原始中文子公司名 + :param simply_map: 清洗后的公司名 → 标准公司名映射 + :param pinyin_map: 洗后公司的拼音 → 标准公司名映射 + :param lower_score: 模糊匹配分数下限 + :param high_score: 高置信匹配分数阈值 + :return: 匹配的标准公司名列表 + """ + return standardize_name(input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score) + + +def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90): + """ + 对用户输入的项目名称进行标准化,返回最匹配的标准项目名列表。 + + :param input_name: 原始中文项目名 + :param simply_map: 清洗后的项目名 → 标准项目名映射 + :param pinyin_map: 清洗后项目的拼音 → 标准项目名映射 + :param lower_score: 模糊匹配分数下限 + :param high_score: 高置信匹配分数阈值 + :return: 匹配的标准项目名列表 + """ + return standardize_name(input_name, clean_useless_project_name, simply_map, pinyin_map, lower_score, high_score) + + #标准化项目部名 -def standardize_projectDepartment(standard_company, input_project , company_project_department_map, high_score=85): +def standardize_projectDepartment(standard_company, input_project, company_project_department_map, high_score=90): """ 将口语化的公司名和项目部名转换为标准化名称。 - 参数: standard_company (str): 标准化公司名。 input_project (str): 用户输入的项目部名(可能是口语化或不完整的名称)。 company_project_department_map (dict): 标准化的公司名和项目部名数据,格式为 {公司名: [项目部名1, 项目部名2, ...]}。 pinyin_to_original_map:分公司拼音和分公司原始名的映射 - 返回: tuple: (标准化公司名, 匹配的项目部名列表)。如果无法匹配,返回 (None, None)。 """ try: # **2. 先尝试直接匹配最相似的项目名** - project_match = process.extractOne(input_project, company_project_department_map[standard_company], scorer=fuzz.ratio) + project_match = process.extractOne(input_project, company_project_department_map[standard_company], + scorer=cast(Callable, WRatio)) print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True) if project_match and project_match[1] >= high_score: return [project_match[0]] # 直接返回匹配的项目名 @@ -86,11 +169,13 @@ def standardize_projectDepartment(standard_company, input_project , company_proj print(f"standardize_projectDepartment:{e}", flush=True) return None -def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin_name_list = None, pinyin_to_original_map = None, lower_score=70, high_score=85, isArabicNumConv = False): + +def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin_name_list=None, + pinyin_to_original_map=None, lower_score=70, high_score=85, isArabicNumConv=False): """ 使用拼音 + rapidfuzz 进行关键词模糊匹配,并返回原始的标准名 - :param input_name: 口语化的名称(中文) - :param name_list: 关键词列表(中文) + :param origin_input_name: 口语化的名称(中文) + :param origin_name_list: 关键词列表(中文) :pinyin_name_list: 关键词列表(拼音) :param pinyin_to_original_map: 拼音到原始标准名的映射 :param lower_score: 低匹配分数阈值(默认70) @@ -100,10 +185,12 @@ def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin #First round, 原始标准名的匹配性查找,能找到直接返回 if isArabicNumConv: origin_input_name = arabic_to_chinese_number(origin_input_name) - match_results = process.extract(origin_input_name, origin_name_list, scorer=fuzz.token_sort_ratio, limit=len(origin_name_list)) + match_results = process.extract(origin_input_name, origin_name_list, scorer=fuzz.token_sort_ratio, + limit=len(origin_name_list)) # 找到所有相似度 > 80 的匹配项 original_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score] - print(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}", flush=True) + print(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}", + flush=True) combined_low_confidence_matches = [] if original_high_confidence_matches: @@ -117,31 +204,10 @@ def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin if not pinyin_name_list or not pinyin_to_original_map: return None # - #第二轮, 拼音名的匹配性查找,能找到直接返回 - # pinyin_input_name = text_to_pinyin(origin_input_name) - # #fuzz.partial_ratio - # match_results = process.extract(pinyin_input_name, pinyin_name_list, scorer=fuzz.ratio, limit=len(pinyin_name_list)) - # - # # 筛选出匹配分数 > lower_score 的结果 - # pinyin_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score] - # print(f"standardize_pinyin_single_name 拼音匹配, input_name:{pinyin_input_name}, high_confidence_matches:{pinyin_high_confidence_matches[:3]}", flush=True) - # - # if not pinyin_high_confidence_matches: - # return combined_low_confidence_matches # 没有找到匹配项 - # - # # 选择最高相似度的匹配项 - # pinyin_best_match = max(pinyin_high_confidence_matches, key=lambda x: x[1], default=None) - # - # if pinyin_best_match and pinyin_best_match[1] > high_score: - # return [pinyin_to_original_map[pinyin_best_match[0]]] # 直接返回最高相似度的原始工程名 - # - # combined_low_confidence_matches.extend( - # [pinyin_to_original_map[match[0]] for match in pinyin_high_confidence_matches[:3]] - # ) - # 返回所有匹配项对应的原始名,最多返回最低匹配项的前5个 return list(dict.fromkeys(combined_low_confidence_matches)) -def generate_project_prompt(matched_projects, original_name = "", type="项目部名"): + +def generate_project_prompt(matched_projects, original_name="", type="项目部名"): """ 生成提示信息,用于让用户确认匹配的项目名或分公司名或项目名。 @@ -203,3 +269,28 @@ class StandardType(Enum): PROJECT_CHECK = 0 #项目名检查 PROGRAM_CHECK = 1 + + +import re + +# 构建一个用于替换的正则表达式 +useless_project_words_pattern = re.compile("|".join(USELESS_PROJECT_WORDS)) +useless_company_words_pattern = re.compile("|".join(USELESS_COMPANY_WORDS)) +# 匹配所有数字、字母(含大小写)、特殊字符(包括空格、标点) +project_symbols_pattern = re.compile(r"[A-Za-z0-9\s\W_]+") +company_symbols_pattern = re.compile(r"[\s\W_]+") + + +def clean_useless_project_name(name: str) -> str: + # 去掉无意义词 + name = useless_project_words_pattern.sub("", name) + # 去掉数字、字母、符号 + name = project_symbols_pattern.sub("", name) + return name.strip() + + +def clean_useless_company_name(name: str) -> str: + # 去掉无意义词 + name = useless_company_words_pattern.sub("", name) + name = company_symbols_pattern.sub("", name) + return name.strip()