标准化算法提优
This commit is contained in:
parent
cd00c7efae
commit
510e829382
|
|
@ -1,6 +1,12 @@
|
|||
# constants.py
|
||||
SIMILARITY_VALUE = 75
|
||||
#
|
||||
#匹配工程名时,需要过滤掉的词汇
|
||||
USELESS_PROJECT_WORDS = ["项目", "工程", "变电站", "线路", "变电","千伏" ,"换流站","公司","直流"]
|
||||
|
||||
#匹配公司名时,需要过滤掉的词汇
|
||||
USELESS_COMPANY_WORDS = ["公司","分公司"]
|
||||
|
||||
|
||||
COMPANYNAME_SHA = "顺安电网建设有限公司"
|
||||
#日期
|
||||
DATE = "date"
|
||||
|
|
|
|||
258
api/main.py
258
api/main.py
|
|
@ -3,24 +3,26 @@ from pydantic import BaseModel, Field
|
|||
from werkzeug.exceptions import HTTPException
|
||||
from typing import List
|
||||
from pydantic import ValidationError
|
||||
import time
|
||||
|
||||
from intentRecognition import IntentRecognition
|
||||
from slotRecognition import SlotRecognition
|
||||
from utils import CheckResult, load_standard_name, generate_project_prompt, \
|
||||
load_standard_data, text_to_pinyin, multiple_standardize_single_name, \
|
||||
standardize_projectDepartment
|
||||
load_standard_data, text_to_pinyin, \
|
||||
standardize_projectDepartment, standardize_project_name, clean_useless_project_name, \
|
||||
clean_useless_company_name, standardize_sub_company
|
||||
|
||||
from constants import PROJECT_NAME, PROJECT_DEPARTMENT, SIMILARITY_VALUE, IMPLEMENTATION_ORG, RISK_LEVEL
|
||||
from config import *
|
||||
|
||||
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-30750"
|
||||
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-31940"
|
||||
MODEL_UIE_PATH = R"../uie/output/checkpoint-31350"
|
||||
|
||||
# 类别名称列表
|
||||
labels = [
|
||||
"天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询",
|
||||
"日计划作业内容", "周计划作业内容", "施工人数", "作业考勤人数", "知识问答",
|
||||
"通用对话", "作业面查询","班组人数查询","班组数查询","作业面内容","班组详情"
|
||||
"通用对话", "作业面查询", "班组人数查询", "班组数查询", "作业面内容", "班组详情"
|
||||
]
|
||||
|
||||
# 标签映射
|
||||
|
|
@ -41,13 +43,6 @@ label_map = {
|
|||
13: 'B-teamName', 26: 'I-teamName',
|
||||
}
|
||||
|
||||
# # 初始化工具类
|
||||
intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels)
|
||||
|
||||
# 初始化槽位识别工具类
|
||||
slot_recognizer = SlotRecognition(MODEL_UIE_PATH, label_map)
|
||||
# 设置Flask应用
|
||||
|
||||
#标准公司名和项目名
|
||||
standard_company_program = load_standard_data("./standard_data/standard_company_program.json")
|
||||
|
||||
|
|
@ -61,12 +56,30 @@ standard_company_name_list = list(standard_company_program.keys())
|
|||
pinyin_to_standard_company_name_map = {text_to_pinyin(kw): kw for kw in standard_company_name_list}
|
||||
standard_company_name_pinyin_list = list(pinyin_to_standard_company_name_map.keys())
|
||||
|
||||
simply_to_standard_project_name_map = {clean_useless_project_name(kw): kw for kw in standard_project_name_list}
|
||||
|
||||
pinyin_simply_to_standard_project_name_map = {text_to_pinyin(clean_useless_project_name(kw)): kw for kw in
|
||||
standard_project_name_list}
|
||||
|
||||
simply_to_standard_company_name_map = {clean_useless_company_name(kw): kw for kw in standard_company_name_list}
|
||||
|
||||
pinyin_simply_to_standard_company_name_map = {text_to_pinyin(clean_useless_company_name(kw)): kw for kw in
|
||||
standard_company_name_list}
|
||||
|
||||
# 初始化工具类
|
||||
intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels)
|
||||
|
||||
# 初始化槽位识别工具类
|
||||
slot_recognizer = SlotRecognition(MODEL_UIE_PATH, label_map)
|
||||
# 设置Flask应用
|
||||
|
||||
print(f"标准化的工程名是:{standard_project_name_list}", flush=True)
|
||||
print(f"pinyin标准化的工程名是 list:{standard_project_name_pinyin_list}", flush=True)
|
||||
print(f"pinyin-工程名对应关系 map:{pinyin_to_standard_company_name_map}", flush=True)
|
||||
print(f"pinyin-工程民对应关系 map:{pinyin_to_standard_company_name_map}", flush=True)
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
# 统一的异常处理函数
|
||||
@app.errorhandler(Exception)
|
||||
def handle_exception(e):
|
||||
|
|
@ -217,7 +230,8 @@ def agent():
|
|||
entities = slot_recognizer.recognize(query)
|
||||
|
||||
print(
|
||||
f"第一轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",flush=True)
|
||||
f"第一轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",
|
||||
flush=True)
|
||||
# 多轮
|
||||
else:
|
||||
res = extract_multi_chat(messages)
|
||||
|
|
@ -233,7 +247,8 @@ def agent():
|
|||
})
|
||||
entities = slot_recognizer.recognize(res)
|
||||
print(
|
||||
f"多轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",flush=True)
|
||||
f"多轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",
|
||||
flush=True)
|
||||
|
||||
#必须槽位缺失检查
|
||||
status, sk = check_lost(predicted_id, entities)
|
||||
|
|
@ -261,11 +276,12 @@ def agent():
|
|||
except Exception as e:
|
||||
return jsonify({"error": str(e)}), 500 # 捕捉其他错误并返回
|
||||
|
||||
|
||||
def extract_multi_chat(messages):
|
||||
from openai import OpenAI
|
||||
client = OpenAI(base_url=api_base_url, api_key=api_key)
|
||||
|
||||
latest_message = messages[-1] # 最后一条用户提问
|
||||
latest_message = messages[-1] # 最后一条用户提问
|
||||
if latest_message.role == "user":
|
||||
latest_user_question = latest_message.content.strip()
|
||||
time_prefixes = ["今天", "昨天", "本周", "下周", "明天", "今日"] # 可扩展的时间前缀列表
|
||||
|
|
@ -359,7 +375,7 @@ def extract_multi_chat(messages):
|
|||
messages=message,
|
||||
model=model_name,
|
||||
max_tokens=100,
|
||||
temperature=0.3, # 降低随机性,提高确定性
|
||||
temperature=0.1, # 降低随机性,提高确定性
|
||||
stream=False
|
||||
)
|
||||
|
||||
|
|
@ -367,6 +383,7 @@ def extract_multi_chat(messages):
|
|||
print(f"多轮意图后用户想要的问题是:{res}", flush=True)
|
||||
return res
|
||||
|
||||
|
||||
def check_lost(int_res, slot):
|
||||
#labels: ["天气查询","通用对话","页面切换","日计划数量查询","周计划数量查询","日计划作业内容","周计划作业内容","施工人数","作业考勤人数","知识问答"]
|
||||
mapping = {
|
||||
|
|
@ -386,7 +403,7 @@ def check_lost(int_res, slot):
|
|||
|
||||
intention_mapping = {2: "页面切换", 3: "日计划数量查询", 4: "周计划数量查询", 5: "日计划作业内容",
|
||||
6: "周计划作业内容", 7: "施工人数", 8: "作业考勤人数", 11: "作业面查询",
|
||||
12:"班组人数查询", 13:"班组数查询", 14:"作业面内容", 15:"班组详情"}
|
||||
12: "班组人数查询", 13: "班组数查询", 14: "作业面内容", 15: "班组详情"}
|
||||
if not mapping.__contains__(int_res):
|
||||
return 0, ""
|
||||
#提取的槽位信息
|
||||
|
|
@ -411,7 +428,7 @@ def check_lost(int_res, slot):
|
|||
return CheckResult.NO_MATCH, cur_k
|
||||
#符合当前意图的的必须槽位,但是不在提取的槽位信息里
|
||||
left = [x for x in mapping[int_res][idx] if x not in cur_k]
|
||||
print(f"符合当前意图的的必须槽位,但是不在提取的槽位信息里, {left}",flush=True)
|
||||
print(f"符合当前意图的的必须槽位,但是不在提取的槽位信息里, {left}", flush=True)
|
||||
apologize_str = "非常抱歉,"
|
||||
if int_res == 2:
|
||||
return CheckResult.NEEDS_MORE_ROUNDS, f"{apologize_str}请问你想查询哪个页面?"
|
||||
|
|
@ -434,8 +451,9 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
|||
for key, value in slot.items():
|
||||
if key == PROJECT_NAME:
|
||||
print(f"check_standard_name_slot 原始工程名 : {slot[PROJECT_NAME]}")
|
||||
match_results = multiple_standardize_single_name(value, standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,80)
|
||||
print(f"check_standard_name_slot 匹配后工程名 :result:{match_results}",flush=True)
|
||||
match_results = standardize_project_name(value, simply_to_standard_project_name_map,
|
||||
pinyin_simply_to_standard_project_name_map, 70, 90)
|
||||
print(f"check_standard_name_slot 匹配后工程名 :result:{match_results}", flush=True)
|
||||
if match_results and len(match_results) == 1:
|
||||
slot[key] = match_results[0]
|
||||
else:
|
||||
|
|
@ -444,8 +462,9 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
|||
|
||||
if key == IMPLEMENTATION_ORG and slot[key] != "公司":
|
||||
print(f"check_standard_name_slot 原始分公司名 : {slot[IMPLEMENTATION_ORG]}")
|
||||
match_results = multiple_standardize_single_name(value, standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map, lower_score=50, high_score=80, isArabicNumConv = True)
|
||||
print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}",flush=True)
|
||||
match_results = standardize_sub_company(value, simply_to_standard_company_name_map,
|
||||
pinyin_simply_to_standard_company_name_map, 55, 80)
|
||||
print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}", flush=True)
|
||||
if match_results and len(match_results) == 1:
|
||||
slot[key] = match_results[0]
|
||||
else:
|
||||
|
|
@ -454,8 +473,9 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
|||
|
||||
if key == PROJECT_DEPARTMENT:
|
||||
print(f"check_standard_name_slot 原始项目部名 : {slot[PROJECT_DEPARTMENT]}")
|
||||
match_results = standardize_projectDepartment(slot[IMPLEMENTATION_ORG], value, standard_company_program, high_score=85)
|
||||
print(f"check_standard_name_slot 匹配后项目部名: result:{match_results}",flush=True)
|
||||
match_results = standardize_projectDepartment(slot[IMPLEMENTATION_ORG], value, standard_company_program,
|
||||
high_score=90)
|
||||
print(f"check_standard_name_slot 匹配后项目部名: result:{match_results}", flush=True)
|
||||
if match_results and len(match_results) == 1:
|
||||
slot[key] = match_results[0]
|
||||
else:
|
||||
|
|
@ -463,92 +483,132 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
|||
return CheckResult.NEEDS_MORE_ROUNDS, prompt
|
||||
|
||||
if key == RISK_LEVEL:
|
||||
if slot[RISK_LEVEL] not in["2级","3级","4级","5级"] and slot[RISK_LEVEL] not in["二级","三级","四级","五级"]:
|
||||
if slot[RISK_LEVEL] not in ["2级", "3级", "4级", "5级"] and slot[RISK_LEVEL] not in ["二级", "三级", "四级",
|
||||
"五级"]:
|
||||
return CheckResult.NEEDS_MORE_ROUNDS, "您查询的风险等级在系统中未找到,请确认风险等级后再次提问"
|
||||
|
||||
return CheckResult.NO_MATCH, ""
|
||||
|
||||
|
||||
#
|
||||
# test_cases = [
|
||||
# ("安徽宏源电力建设有限公司", "第三项目管理部"), # 期望返回所有"第三项目管理部"
|
||||
# ("安徽宏源电力建设有限公司", "第九项目部"), # 期望返回 "第九项目管理部"
|
||||
# ("顺安电网公司", "第二项目部"), # 期望匹配"顺安电网建设有限公司"下的"第二项目管理部"
|
||||
# ("送电一公司", "第三项目部"), # 期望返回"第三项目管理部"
|
||||
# ("送电2公司", "第三项目部"), # 期望返回"第三项目管理部"
|
||||
# ("消防分公司", "第七项目部"), # 期望返回"第七项目管理部
|
||||
# ("建筑分公司", "第七项目部"), # 期望返回"第七项目管理部"
|
||||
# ("建筑消防分公司", "第七项目部"), # 期望返回"第七项目管理部"
|
||||
# ("建筑分公司消防分公司", "第七项目部") # 期望返回"第七项目管理部"
|
||||
# ("送一分公司"),
|
||||
# ("送二分公司"),
|
||||
# ("变电分公司"),
|
||||
# ("建筑分公司"),
|
||||
# ("检修试验分公司"),
|
||||
# ("宏源电力公司"),
|
||||
# ("宏源电力限公司"),
|
||||
# ("宏源电力限公司线路"),
|
||||
# ("宏源电力限公司变电"),
|
||||
# ("送一分"),
|
||||
# ("送二分"),
|
||||
# ("变电分"),
|
||||
# ("建筑分"),
|
||||
# ("检修试验分"),
|
||||
# ("宏源电力"),
|
||||
# ("红源电力"),
|
||||
# ("宏源电力有限"),
|
||||
# ("宏源电力限线路"),
|
||||
# ("宏源电力限变电"),
|
||||
# ]
|
||||
#
|
||||
# for company, project in test_cases:
|
||||
# # result = standardize_company_and_project(company, project,standard_company_program)
|
||||
# result = standardize_company_and_projectDepartment(company, project,standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
||||
# # result = multiple_standardize_single_name("company", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,40,70)
|
||||
# print(f"输入: {company}, {project} -> 输出: {result}")
|
||||
# print(f"加权混合策略 分公司名匹配**********************")
|
||||
# start = time.perf_counter()
|
||||
# for item in test_cases:
|
||||
# match_results = standardize_sub_company(item,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80)
|
||||
# print(f"加权混合策略 分公司名匹配 输入: {item}-> 输出: {match_results}")
|
||||
# end = time.perf_counter()
|
||||
# print(f"加权混合策略 耗时: {end - start:.4f} 秒")
|
||||
#
|
||||
# result = standardize_single_name("送电一公司", standard_company_name_list)
|
||||
# print(f"输入: 送一分公司-> 输出: {result}")
|
||||
#
|
||||
# prompt = generate_project_prompt(result, "分公司名")
|
||||
# print(f"prompt:{prompt}")
|
||||
#
|
||||
# result = standardize_single_name("合肥中心变", standard_project_name_list)
|
||||
# print(f"输入: 合肥中心变-> 输出: {result}")
|
||||
# test_cases = [
|
||||
# ("卢集"),
|
||||
# ("芦集"),
|
||||
# ("芦集变电站"),
|
||||
# ("安庆四变电站"),
|
||||
# ("锦绣变电站"),
|
||||
# ("滁州护桥变电站"),
|
||||
# ("合州换流站"),
|
||||
# ("陕北合州换流站"),
|
||||
# ("陕北安徽合州换流站"),
|
||||
# ("金牛变电站"),
|
||||
# ("香涧鹭岛工程"),
|
||||
# ("延庆换流站"),
|
||||
# ("国网延庆换流站"),
|
||||
# ("国网北京延庆换流站"),
|
||||
# ("陶楼广银线路工程"),
|
||||
# ("紫蓬变电站"),
|
||||
# ("宿州萧砀变电站"),
|
||||
# ("冯井变电站"),
|
||||
# ("富邦秋浦变电站"),
|
||||
# ("包河玉龙变电站"),
|
||||
#
|
||||
# prompt = generate_project_prompt(result, "工程名")
|
||||
# print(f"prompt:{prompt}")
|
||||
# ("绿雪莲塘工程"),
|
||||
# ("合肥循环园工程"),
|
||||
# ("合肥长临河工程"),
|
||||
# ("合肥中心变"),
|
||||
# ("锁库变电站工程"),
|
||||
# ("槽坊工程"),
|
||||
#
|
||||
# ("安庆四500kV变电站新建工程(PROJ-2024-0862)"),
|
||||
# ("锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)"),
|
||||
# ("渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)"),
|
||||
# ("先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)"),
|
||||
# ("安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)"),
|
||||
# ("合州士800千伏换流站电气安装A包(PROJ-2025-0056)"),
|
||||
# ("卫田-陶楼T接首业变电站110kV电缆线路工程(PROJ-2024-1236)"),
|
||||
# ("谯城(亳三)-希夷220kV线路工程(PROJ-2024-1205)"),
|
||||
# ]
|
||||
# print(f"去不重要词汇 工程名匹配******************************************")
|
||||
# start = time.perf_counter()
|
||||
# for item in test_cases:
|
||||
# match_results = standardize_project_name(item, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90)
|
||||
# print(f"工程名匹配 输入: {item}-> 输出: {match_results}")
|
||||
# end = time.perf_counter()
|
||||
# print(f"词集匹配 耗时: {end - start:.4f} 秒")
|
||||
#
|
||||
# print(f"项目名匹配******************************************")
|
||||
# oral_program_name_list = [
|
||||
# ("第1项目部"), # 期望返回所有"第三项目管理部"
|
||||
# ("第2项目部"),
|
||||
# ("第3项目部"),
|
||||
# ("第4项目部"),
|
||||
# ("第5项目部"),
|
||||
# ("第6项目部"),
|
||||
# ("第7项目部"),
|
||||
# ("第8项目部"),
|
||||
# ("第9项目部"),
|
||||
# ("第10项目部"),
|
||||
# ("第11项目部"),
|
||||
# ("第12项目部"),
|
||||
# ("第13项目部"),
|
||||
# ("电缆班"),
|
||||
# ("调试1队"),
|
||||
# ("调试2队"),
|
||||
# ("调试3队"),
|
||||
# ("调试4队"),
|
||||
# ("调试5队"),
|
||||
# ("第一项目管理部"),
|
||||
# ("第二项目管理部"),
|
||||
# ("第五项目管理部"),
|
||||
# ("第十一项目管理部(萧砀线路)"),
|
||||
# ("第三项目管理部(张店线路)"),
|
||||
# ("第三项目管理部(岳西线路)"),
|
||||
# ("第五项目管理部(蚌埠)"),
|
||||
# ("第三项目管理部(六安线路)"),
|
||||
# ("第十一项目管理部(宿州线路)"),
|
||||
# ("调试一队"),
|
||||
# ("调试二队"),
|
||||
# ("调试三队"),
|
||||
# ("电缆班"),
|
||||
# ]
|
||||
#
|
||||
# for company in standard_company_name_list:
|
||||
# for program in oral_program_name_list:
|
||||
# match_results = standardize_projectDepartment(company, program, standard_company_program, high_score=90)
|
||||
# print(f"加权混合策略 项目部名称 输入: 公司:{company},项目部:{program}-> 输出: {match_results}")
|
||||
|
||||
# result = standardize_single_name("合肥中心变", standard_project_name_list, 60, 75)
|
||||
# print(f"输入: 合肥中心变-> 输出: {result}")
|
||||
#
|
||||
# result = standardize_single_name("阜阳阜四变电站工程", standard_project_name_list, 60, 75)
|
||||
# print(f"输入: 阜阳阜四变电站工程-> 输出: {result}")
|
||||
#
|
||||
# result = standardize_single_name("芦集变电站", standard_project_name_list, 20, 50)
|
||||
# print(f"输入: 芦集变电站-> 输出: {result}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("宋轶分公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
|
||||
# print(f"multiple_standardize_single_name 输入: 宋轶分公司-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("宏源电力公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
|
||||
# print(f"multiple_standardize_single_name 输入: 宏源电力公司-> 输出: {match_results}")
|
||||
#
|
||||
# #
|
||||
# match_results = multiple_standardize_single_name("合肥中心变", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 合肥中心变-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("淮南安丰", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 淮南安丰工程-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("宿州萧砀新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 宿州萧砀新建工程-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("芦集变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 芦集变电站-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("卢集变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 卢集变电站-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("芦集古沟变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 芦集古沟变电站新建工程-> 输出: {match_results}")
|
||||
# #
|
||||
# match_results = multiple_standardize_single_name("金牛变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 金牛变电站-> 输出: {match_results}")
|
||||
#
|
||||
|
||||
#
|
||||
# company, project = standardize_company_and_projectDepartment("变电分公司","第一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
||||
# print(f"company:{company}, project:{project}")
|
||||
#
|
||||
# company, project = standardize_company_and_projectDepartment("变电分公司","第十一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
||||
# print(f"company:{company}, project:{project}")
|
||||
# company, project = standardize_company_and_projectDepartment("试验分公司","电缆班", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
||||
# print(f"company:{company}, project:{project}")
|
||||
# company, project = standardize_company_and_projectDepartment("宏源电力投资有限公司","第三项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
||||
# print(f"company:{company}, project:{project}")
|
||||
#
|
||||
|
||||
# match_results = standardize_projectDepartment("安徽宏源电力建设有限公司(变电)", "第3项目部", standard_company_program, high_score=85)
|
||||
# print(f"match_results:{match_results}")
|
||||
if __name__ == '__main__':
|
||||
app.run(host='0.0.0.0', port=18074, debug=True)
|
||||
|
|
|
|||
218
api/main_temp.py
218
api/main_temp.py
|
|
@ -3,18 +3,21 @@ from pydantic import BaseModel, Field
|
|||
from werkzeug.exceptions import HTTPException
|
||||
from typing import List
|
||||
from pydantic import ValidationError
|
||||
import time
|
||||
|
||||
from intentRecognition import IntentRecognition
|
||||
from slotRecognition import SlotRecognition
|
||||
from utils import CheckResult, load_standard_name, generate_project_prompt, \
|
||||
load_standard_data, text_to_pinyin, multiple_standardize_single_name, \
|
||||
standardize_projectDepartment
|
||||
load_standard_data, text_to_pinyin, \
|
||||
standardize_projectDepartment, standardize_project_name, clean_useless_project_name, \
|
||||
clean_useless_company_name, standardize_sub_company
|
||||
|
||||
from constants import PROJECT_NAME, PROJECT_DEPARTMENT, SIMILARITY_VALUE, IMPLEMENTATION_ORG, RISK_LEVEL
|
||||
from config import *
|
||||
|
||||
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-30750"
|
||||
MODEL_UIE_PATH = R"../uie/output_temp/checkpoint-31350"
|
||||
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-17890"
|
||||
MODEL_UIE_PATH = R"../uie/output/checkpoint-17290"
|
||||
|
||||
# 类别名称列表
|
||||
labels = [
|
||||
"天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询",
|
||||
|
|
@ -54,6 +57,14 @@ standard_company_name_list = list(standard_company_program.keys())
|
|||
pinyin_to_standard_company_name_map = {text_to_pinyin(kw): kw for kw in standard_company_name_list}
|
||||
standard_company_name_pinyin_list = list(pinyin_to_standard_company_name_map.keys())
|
||||
|
||||
simply_to_standard_project_name_map = {clean_useless_project_name(kw): kw for kw in standard_project_name_list}
|
||||
|
||||
pinyin_simply_to_standard_project_name_map = {text_to_pinyin(clean_useless_project_name(kw)): kw for kw in standard_project_name_list}
|
||||
|
||||
simply_to_standard_company_name_map = {clean_useless_company_name(kw): kw for kw in standard_company_name_list}
|
||||
|
||||
pinyin_simply_to_standard_company_name_map = {text_to_pinyin(clean_useless_company_name(kw)): kw for kw in standard_company_name_list}
|
||||
|
||||
|
||||
# 初始化工具类
|
||||
intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels)
|
||||
|
|
@ -435,7 +446,7 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
|||
for key, value in slot.items():
|
||||
if key == PROJECT_NAME:
|
||||
print(f"check_standard_name_slot 原始工程名 : {slot[PROJECT_NAME]}")
|
||||
match_results = multiple_standardize_single_name(value, standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,80)
|
||||
match_results = standardize_project_name(value, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90)
|
||||
print(f"check_standard_name_slot 匹配后工程名 :result:{match_results}",flush=True)
|
||||
if match_results and len(match_results) == 1:
|
||||
slot[key] = match_results[0]
|
||||
|
|
@ -445,7 +456,7 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
|||
|
||||
if key == IMPLEMENTATION_ORG and slot[key] != "公司":
|
||||
print(f"check_standard_name_slot 原始分公司名 : {slot[IMPLEMENTATION_ORG]}")
|
||||
match_results = multiple_standardize_single_name(value, standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map, lower_score=50, high_score=80, isArabicNumConv = True)
|
||||
match_results = standardize_sub_company(value,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80)
|
||||
print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}",flush=True)
|
||||
if match_results and len(match_results) == 1:
|
||||
slot[key] = match_results[0]
|
||||
|
|
@ -469,85 +480,126 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
|||
|
||||
return CheckResult.NO_MATCH, ""
|
||||
|
||||
|
||||
#
|
||||
# test_cases = [
|
||||
# ("安徽宏源电力建设有限公司(线路)", "第三项目管理部"), # 期望返回所有"第三项目管理部"
|
||||
# ("送电一分公司", "第8项目管理部"), # 期望返回 "第九项目管理部"
|
||||
# # ("顺安电网公司", "第二项目部"), # 期望匹配"顺安电网建设有限公司"下的"第二项目管理部"
|
||||
# # ("送电一公司", "第三项目部"), # 期望返回"第三项目管理部"
|
||||
# # ("送电2公司", "第三项目部"), # 期望返回"第三项目管理部"
|
||||
# # ("消防分公司", "第七项目部"), # 期望返回"第七项目管理部
|
||||
# # ("建筑分公司", "第七项目部"), # 期望返回"第七项目管理部"
|
||||
# # ("建筑消防分公司", "第七项目部"), # 期望返回"第七项目管理部"
|
||||
# # ("建筑分公司消防分公司", "第七项目部") # 期望返回"第七项目管理部"
|
||||
# ("送一分公司"),
|
||||
# ("送二分公司"),
|
||||
# ("变电分公司"),
|
||||
# ("建筑分公司"),
|
||||
# ("检修试验分公司"),
|
||||
# ("宏源电力公司"),
|
||||
# ("宏源电力限公司"),
|
||||
# ("宏源电力限公司线路"),
|
||||
# ("宏源电力限公司变电"),
|
||||
# ("送一分"),
|
||||
# ("送二分"),
|
||||
# ("变电分"),
|
||||
# ("建筑分"),
|
||||
# ("检修试验分"),
|
||||
# ("宏源电力"),
|
||||
# ("红源电力"),
|
||||
# ("宏源电力有限"),
|
||||
# ("宏源电力限线路"),
|
||||
# ("宏源电力限变电"),
|
||||
# ]
|
||||
#
|
||||
# print(f"加权混合策略 分公司名匹配**********************")
|
||||
# start = time.perf_counter()
|
||||
# for item in test_cases:
|
||||
# match_results = standardize_sub_company(item,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80)
|
||||
# print(f"加权混合策略 分公司名匹配 输入: {item}-> 输出: {match_results}")
|
||||
# end = time.perf_counter()
|
||||
# print(f"加权混合策略 耗时: {end - start:.4f} 秒")
|
||||
#
|
||||
#
|
||||
#
|
||||
# test_cases = [
|
||||
# ("卢集"),
|
||||
# ("芦集"),
|
||||
# ("芦集变电站"),
|
||||
# ("安庆四变电站"),
|
||||
# ("锦绣变电站"),
|
||||
# ("滁州护桥变电站"),
|
||||
# ("合州换流站"),
|
||||
# ("陕北合州换流站"),
|
||||
# ("陕北安徽合州换流站"),
|
||||
# ("金牛变电站"),
|
||||
# ("香涧鹭岛工程"),
|
||||
# ("延庆换流站"),
|
||||
# ("国网延庆换流站"),
|
||||
# ("国网北京延庆换流站"),
|
||||
# ("陶楼广银线路工程"),
|
||||
# ("紫蓬变电站"),
|
||||
# ("宿州萧砀变电站"),
|
||||
# ("冯井变电站"),
|
||||
# ("富邦秋浦变电站"),
|
||||
# ("包河玉龙变电站"),
|
||||
#
|
||||
# ("绿雪莲塘工程"),
|
||||
# ("合肥循环园工程"),
|
||||
# ("合肥长临河工程"),
|
||||
# ("合肥中心变"),
|
||||
# ("锁库变电站工程"),
|
||||
# ("槽坊工程"),
|
||||
#
|
||||
# ("安庆四500kV变电站新建工程(PROJ-2024-0862)"),
|
||||
# ("锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)"),
|
||||
# ("渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)"),
|
||||
# ("先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)"),
|
||||
# ("安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)"),
|
||||
# ("合州士800千伏换流站电气安装A包(PROJ-2025-0056)"),
|
||||
# ("卫田-陶楼T接首业变电站110kV电缆线路工程(PROJ-2024-1236)"),
|
||||
# ("谯城(亳三)-希夷220kV线路工程(PROJ-2024-1205)"),
|
||||
# ]
|
||||
# print(f"去不重要词汇 工程名匹配******************************************")
|
||||
# start = time.perf_counter()
|
||||
# for item in test_cases:
|
||||
# match_results = standardize_project_name(item, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90)
|
||||
# print(f"工程名匹配 输入: {item}-> 输出: {match_results}")
|
||||
# end = time.perf_counter()
|
||||
# print(f"词集匹配 耗时: {end - start:.4f} 秒")
|
||||
#
|
||||
# print(f"项目名匹配******************************************")
|
||||
# oral_program_name_list = [
|
||||
# ("第1项目部"), # 期望返回所有"第三项目管理部"
|
||||
# ("第2项目部"),
|
||||
# ("第3项目部"),
|
||||
# ("第4项目部"),
|
||||
# ("第5项目部"),
|
||||
# ("第6项目部"),
|
||||
# ("第7项目部"),
|
||||
# ("第8项目部"),
|
||||
# ("第9项目部"),
|
||||
# ("第10项目部"),
|
||||
# ("第11项目部"),
|
||||
# ("第12项目部"),
|
||||
# ("第13项目部"),
|
||||
# ("电缆班"),
|
||||
# ("调试1队"),
|
||||
# ("调试2队"),
|
||||
# ("调试3队"),
|
||||
# ("调试4队"),
|
||||
# ("调试5队"),
|
||||
# ("第一项目管理部"),
|
||||
# ("第二项目管理部"),
|
||||
# ("第五项目管理部"),
|
||||
# ("第十一项目管理部(萧砀线路)"),
|
||||
# ("第三项目管理部(张店线路)"),
|
||||
# ("第三项目管理部(岳西线路)"),
|
||||
# ("第五项目管理部(蚌埠)"),
|
||||
# ("第三项目管理部(六安线路)"),
|
||||
# ("第十一项目管理部(宿州线路)"),
|
||||
# ("调试一队"),
|
||||
# ("调试二队"),
|
||||
# ("调试三队"),
|
||||
# ("电缆班"),
|
||||
# ]
|
||||
#
|
||||
# for company in standard_company_name_list:
|
||||
# for program in oral_program_name_list:
|
||||
# match_results = standardize_projectDepartment(company, program, standard_company_program, high_score=90)
|
||||
# print(f"加权混合策略 项目部名称 输入: 公司:{company},项目部:{program}-> 输出: {match_results}")
|
||||
|
||||
# for company, project in test_cases:
|
||||
# result = standardize_projectDepartment(company, project,standard_company_program, high_score=90)
|
||||
# print(f"输入: {company}, {project} -> 输出: {result}")
|
||||
#
|
||||
# result = standardize_single_name("送电一公司", standard_company_name_list)
|
||||
# print(f"输入: 送一分公司-> 输出: {result}")
|
||||
#
|
||||
# prompt = generate_project_prompt(result, "分公司名")
|
||||
# print(f"prompt:{prompt}")
|
||||
#
|
||||
# result = standardize_single_name("合肥中心变", standard_project_name_list)
|
||||
# print(f"输入: 合肥中心变-> 输出: {result}")
|
||||
#
|
||||
# prompt = generate_project_prompt(result, "工程名")
|
||||
# print(f"prompt:{prompt}")
|
||||
|
||||
# result = standardize_single_name("合肥中心变", standard_project_name_list, 60, 75)
|
||||
# print(f"输入: 合肥中心变-> 输出: {result}")
|
||||
#
|
||||
# result = standardize_single_name("阜阳阜四变电站工程", standard_project_name_list, 60, 75)
|
||||
# print(f"输入: 阜阳阜四变电站工程-> 输出: {result}")
|
||||
#
|
||||
# result = standardize_single_name("芦集变电站", standard_project_name_list, 20, 50)
|
||||
# print(f"输入: 芦集变电站-> 输出: {result}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("宋轶分公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
|
||||
# print(f"multiple_standardize_single_name 输入: 宋轶分公司-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("宏源电力公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
|
||||
# print(f"multiple_standardize_single_name 输入: 宏源电力公司-> 输出: {match_results}")
|
||||
#
|
||||
# #
|
||||
# match_results = multiple_standardize_single_name("合肥中心变", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 合肥中心变-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("淮南安丰", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 淮南安丰工程-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("宿州萧砀新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 宿州萧砀新建工程-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("芦集变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 芦集变电站-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("卢集变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 卢集变电站-> 输出: {match_results}")
|
||||
#
|
||||
# match_results = multiple_standardize_single_name("芦集古沟变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 芦集古沟变电站新建工程-> 输出: {match_results}")
|
||||
# #
|
||||
# match_results = multiple_standardize_single_name("金牛变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
||||
# print(f"multiple_standardize_single_name 输入: 金牛变电站-> 输出: {match_results}")
|
||||
#
|
||||
|
||||
#
|
||||
# company, project = standardize_company_and_projectDepartment("变电分公司","第一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
||||
# print(f"company:{company}, project:{project}")
|
||||
#
|
||||
# company, project = standardize_company_and_projectDepartment("变电分公司","第十一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
||||
# print(f"company:{company}, project:{project}")
|
||||
# company, project = standardize_company_and_projectDepartment("试验分公司","电缆班", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
||||
# print(f"company:{company}, project:{project}")
|
||||
# company, project = standardize_company_and_projectDepartment("宏源电力投资有限公司","第三项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
||||
# print(f"company:{company}, project:{project}")
|
||||
#
|
||||
|
||||
# match_results = standardize_projectDepartment("安徽宏源电力建设有限公司(变电)", "第3项目部", standard_company_program, high_score=85)
|
||||
# print(f"match_results:{match_results}")
|
||||
if __name__ == '__main__':
|
||||
app.run(host='0.0.0.0', port=18073, debug=True)
|
||||
|
|
|
|||
157
api/utils.py
157
api/utils.py
|
|
@ -1,10 +1,13 @@
|
|||
from enum import Enum
|
||||
from typing import cast, Callable
|
||||
|
||||
from rapidfuzz import process, fuzz
|
||||
import re
|
||||
from rapidfuzz.fuzz import WRatio
|
||||
import json
|
||||
from pypinyin import lazy_pinyin
|
||||
|
||||
from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS
|
||||
|
||||
# 数字转换表(1-20,常见数字)
|
||||
digit_to_chinese = {
|
||||
"1": "一", "2": "二", "3": "三", "4": "四", "5": "五",
|
||||
|
|
@ -14,6 +17,7 @@ digit_to_chinese = {
|
|||
"19": "十九", "20": "二十"
|
||||
}
|
||||
|
||||
|
||||
def arabic_to_chinese_number(text):
|
||||
"""
|
||||
将文中阿拉伯数字转换为中文数字
|
||||
|
|
@ -28,10 +32,13 @@ def arabic_to_chinese_number(text):
|
|||
text = text.replace(num, cn)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def text_to_pinyin(text):
    """
    Convert text to a single pinyin string.

    :param text: the text to convert
    :return: the items produced by ``lazy_pinyin(text)`` concatenated with no separator
    """
    syllables = lazy_pinyin(text)
    return "".join(syllables)
|
||||
|
||||
|
||||
def load_standard_data(path):
    """Open the file at ``path`` as UTF-8 text and return its decoded JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload
|
||||
|
|
@ -50,23 +57,99 @@ def extract_number(text):
|
|||
return None
|
||||
|
||||
|
||||
def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, high_score=85, top_k=3):
    """
    Fuzzy-match ``input_key`` against ``match_pool`` and map the hits back to standard names.

    :param input_key: cleaned keyword to match (e.g. a simplified name or its pinyin)
    :param match_pool: candidate strings to match against (normally the keys of ``mapping_dict``)
    :param mapping_dict: mapping from each candidate string to its original standard name
    :param lower_score: minimum score; candidates scoring below it are discarded
    :param high_score: high-confidence threshold; when the best score reaches it,
        only the candidates tied at the best score are returned
    :param top_k: when the best score is below ``high_score``, at most this many
        of the leading candidates are returned
    :return: list of standard names without duplicates, possibly empty
    :raises KeyError: if a matched candidate is missing from ``mapping_dict``
    """
    # An empty key (for example, the input consisted only of filtered-out words)
    # or an empty pool cannot yield a meaningful match, so skip the scorer.
    if not input_key or not match_pool:
        return []

    scored = process.extract(input_key, match_pool, scorer=cast(Callable, WRatio), limit=len(match_pool))
    candidates = [(hit[0], hit[1]) for hit in scored if hit[1] >= lower_score]
    if not candidates:
        return []

    best_score = max(score for _, score in candidates)
    if best_score >= high_score:
        # High confidence: keep every candidate tied at the best score.
        selected = [key for key, score in candidates if score == best_score]
    else:
        # Low confidence: keep the leading candidates in the order returned by
        # process.extract (rapidfuzz documents the result as sorted by similarity).
        selected = [key for key, _ in candidates[:top_k]]

    # Several keys may map to the same standard name; drop repeats while
    # preserving order, as multiple_standardize_single_name already does.
    return list(dict.fromkeys(mapping_dict[key] for key in selected))
|
||||
|
||||
|
||||
def standardize_name(input_name, clean_func, simply_map, pinyin_map, lower_score=70, high_score=85):
    """
    Generic name standardization: clean the input, match on simplified names, then fall back to pinyin.

    :param input_name: raw name as entered by the user
    :param clean_func: cleaning function for the entity type (e.g. project name or company name)
    :param simply_map: simplified name -> original standard name mapping
    :param pinyin_map: pinyin name -> original standard name mapping
    :param lower_score: minimum fuzzy-match score
    :param high_score: high-confidence fuzzy-match threshold
    :return: list of standard names, possibly empty
    """
    cleaned = clean_func(input_name)

    # First pass: match the cleaned text against the simplified-name keys.
    matches = fuzzy_match_and_filter(cleaned, list(simply_map.keys()), simply_map, lower_score, high_score)

    # Second pass: only when the first pass found nothing, retry on pinyin keys.
    if not matches:
        matches = fuzzy_match_and_filter(
            text_to_pinyin(cleaned), list(pinyin_map.keys()), pinyin_map, lower_score, high_score
        )
    return matches
|
||||
|
||||
|
||||
def standardize_sub_company(input_name, simply_map, pinyin_map, lower_score=55, high_score=80):
    """
    Standardize a user-supplied subsidiary company name.

    Delegates to ``standardize_name`` with ``clean_useless_company_name`` as the cleaning function.

    :param input_name: raw subsidiary company name
    :param simply_map: cleaned company name -> standard company name mapping
    :param pinyin_map: pinyin of the cleaned company name -> standard company name mapping
    :param lower_score: minimum fuzzy-match score
    :param high_score: high-confidence fuzzy-match threshold
    :return: list of matching standard company names
    """
    return standardize_name(
        input_name,
        clean_func=clean_useless_company_name,
        simply_map=simply_map,
        pinyin_map=pinyin_map,
        lower_score=lower_score,
        high_score=high_score,
    )
|
||||
|
||||
|
||||
def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90):
    """
    Standardize a user-supplied project name.

    Delegates to ``standardize_name`` with ``clean_useless_project_name`` as the cleaning function.

    :param input_name: raw project name
    :param simply_map: cleaned project name -> standard project name mapping
    :param pinyin_map: pinyin of the cleaned project name -> standard project name mapping
    :param lower_score: minimum fuzzy-match score
    :param high_score: high-confidence fuzzy-match threshold
    :return: list of matching standard project names
    """
    return standardize_name(
        input_name,
        clean_func=clean_useless_project_name,
        simply_map=simply_map,
        pinyin_map=pinyin_map,
        lower_score=lower_score,
        high_score=high_score,
    )
|
||||
|
||||
|
||||
#标准化项目部名
|
||||
def standardize_projectDepartment(standard_company, input_project , company_project_department_map, high_score=85):
|
||||
def standardize_projectDepartment(standard_company, input_project, company_project_department_map, high_score=90):
|
||||
"""
|
||||
将口语化的公司名和项目部名转换为标准化名称。
|
||||
|
||||
参数:
|
||||
standard_company (str): 标准化公司名。
|
||||
input_project (str): 用户输入的项目部名(可能是口语化或不完整的名称)。
|
||||
company_project_department_map (dict): 标准化的公司名和项目部名数据,格式为 {公司名: [项目部名1, 项目部名2, ...]}。
|
||||
pinyin_to_original_map:分公司拼音和分公司原始名的映射
|
||||
|
||||
返回:
|
||||
tuple: (标准化公司名, 匹配的项目部名列表)。如果无法匹配,返回 (None, None)。
|
||||
"""
|
||||
try:
|
||||
# **2. 先尝试直接匹配最相似的项目名**
|
||||
project_match = process.extractOne(input_project, company_project_department_map[standard_company], scorer=fuzz.ratio)
|
||||
project_match = process.extractOne(input_project, company_project_department_map[standard_company],
|
||||
scorer=cast(Callable, WRatio))
|
||||
print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True)
|
||||
if project_match and project_match[1] >= high_score:
|
||||
return [project_match[0]] # 直接返回匹配的项目名
|
||||
|
|
@ -86,11 +169,13 @@ def standardize_projectDepartment(standard_company, input_project , company_proj
|
|||
print(f"standardize_projectDepartment:{e}", flush=True)
|
||||
return None
|
||||
|
||||
def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin_name_list = None, pinyin_to_original_map = None, lower_score=70, high_score=85, isArabicNumConv = False):
|
||||
|
||||
def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin_name_list=None,
|
||||
pinyin_to_original_map=None, lower_score=70, high_score=85, isArabicNumConv=False):
|
||||
"""
|
||||
使用拼音 + rapidfuzz 进行关键词模糊匹配,并返回原始的标准名
|
||||
:param input_name: 口语化的名称(中文)
|
||||
:param name_list: 关键词列表(中文)
|
||||
:param origin_input_name: 口语化的名称(中文)
|
||||
:param origin_name_list: 关键词列表(中文)
|
||||
:pinyin_name_list: 关键词列表(拼音)
|
||||
:param pinyin_to_original_map: 拼音到原始标准名的映射
|
||||
:param lower_score: 低匹配分数阈值(默认70)
|
||||
|
|
@ -100,10 +185,12 @@ def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin
|
|||
#First round, 原始标准名的匹配性查找,能找到直接返回
|
||||
if isArabicNumConv:
|
||||
origin_input_name = arabic_to_chinese_number(origin_input_name)
|
||||
match_results = process.extract(origin_input_name, origin_name_list, scorer=fuzz.token_sort_ratio, limit=len(origin_name_list))
|
||||
match_results = process.extract(origin_input_name, origin_name_list, scorer=fuzz.token_sort_ratio,
|
||||
limit=len(origin_name_list))
|
||||
# 找到所有相似度 > 80 的匹配项
|
||||
original_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score]
|
||||
print(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}", flush=True)
|
||||
print(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}",
|
||||
flush=True)
|
||||
|
||||
combined_low_confidence_matches = []
|
||||
if original_high_confidence_matches:
|
||||
|
|
@ -117,31 +204,10 @@ def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin
|
|||
if not pinyin_name_list or not pinyin_to_original_map:
|
||||
return None #
|
||||
|
||||
#第二轮, 拼音名的匹配性查找,能找到直接返回
|
||||
# pinyin_input_name = text_to_pinyin(origin_input_name)
|
||||
# #fuzz.partial_ratio
|
||||
# match_results = process.extract(pinyin_input_name, pinyin_name_list, scorer=fuzz.ratio, limit=len(pinyin_name_list))
|
||||
#
|
||||
# # 筛选出匹配分数 > lower_score 的结果
|
||||
# pinyin_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score]
|
||||
# print(f"standardize_pinyin_single_name 拼音匹配, input_name:{pinyin_input_name}, high_confidence_matches:{pinyin_high_confidence_matches[:3]}", flush=True)
|
||||
#
|
||||
# if not pinyin_high_confidence_matches:
|
||||
# return combined_low_confidence_matches # 没有找到匹配项
|
||||
#
|
||||
# # 选择最高相似度的匹配项
|
||||
# pinyin_best_match = max(pinyin_high_confidence_matches, key=lambda x: x[1], default=None)
|
||||
#
|
||||
# if pinyin_best_match and pinyin_best_match[1] > high_score:
|
||||
# return [pinyin_to_original_map[pinyin_best_match[0]]] # 直接返回最高相似度的原始工程名
|
||||
#
|
||||
# combined_low_confidence_matches.extend(
|
||||
# [pinyin_to_original_map[match[0]] for match in pinyin_high_confidence_matches[:3]]
|
||||
# )
|
||||
# 返回所有匹配项对应的原始名,最多返回最低匹配项的前5个
|
||||
return list(dict.fromkeys(combined_low_confidence_matches))
|
||||
|
||||
def generate_project_prompt(matched_projects, original_name = "", type="项目部名"):
|
||||
|
||||
def generate_project_prompt(matched_projects, original_name="", type="项目部名"):
|
||||
"""
|
||||
生成提示信息,用于让用户确认匹配的项目名或分公司名或项目名。
|
||||
|
||||
|
|
@ -203,3 +269,28 @@ class StandardType(Enum):
|
|||
PROJECT_CHECK = 0
|
||||
#项目名检查
|
||||
PROGRAM_CHECK = 1
|
||||
|
||||
|
||||
import re
|
||||
|
||||
# Alternation patterns used to strip filler words from names.
# Each word is escaped so it is always matched literally, and the words are
# sorted longest-first so a short word can never shadow a longer word that
# starts with it, regardless of the order in the constants lists.
useless_project_words_pattern = re.compile(
    "|".join(re.escape(word) for word in sorted(USELESS_PROJECT_WORDS, key=len, reverse=True))
)
useless_company_words_pattern = re.compile(
    "|".join(re.escape(word) for word in sorted(USELESS_COMPANY_WORDS, key=len, reverse=True))
)
# Runs of ASCII letters, ASCII digits, whitespace, non-word characters and underscores.
project_symbols_pattern = re.compile(r"[A-Za-z0-9\s\W_]+")
# Runs of whitespace, non-word characters and underscores (letters and digits are kept).
company_symbols_pattern = re.compile(r"[\s\W_]+")
|
||||
|
||||
|
||||
def clean_useless_project_name(name: str) -> str:
    """Strip filler words, then everything matched by ``project_symbols_pattern``, from a project name."""
    # Inner call removes the filler words; outer call removes the characters
    # matched by project_symbols_pattern from what is left.
    return project_symbols_pattern.sub("", useless_project_words_pattern.sub("", name)).strip()
|
||||
|
||||
|
||||
def clean_useless_company_name(name: str) -> str:
    """Strip filler words, then everything matched by ``company_symbols_pattern``, from a company name."""
    # Inner call removes the filler words; outer call removes the characters
    # matched by company_symbols_pattern from what is left.
    return company_symbols_pattern.sub("", useless_company_words_pattern.sub("", name)).strip()
|
||||
|
|
|
|||
Loading…
Reference in New Issue