标准化算法提优

This commit is contained in:
weiweiw 2025-04-18 13:19:44 +08:00
parent cd00c7efae
commit 510e829382
4 changed files with 425 additions and 216 deletions

View File

@ -1,6 +1,12 @@
# constants.py
SIMILARITY_VALUE = 75
#
# Words to filter out of a project (engineering) name before matching it
USELESS_PROJECT_WORDS = ["项目", "工程", "变电站", "线路", "变电","千伏" ,"换流站","公司","直流"]
# Words to filter out of a company name before matching it
USELESS_COMPANY_WORDS = ["公司","分公司"]
COMPANYNAME_SHA = "顺安电网建设有限公司"
# Date
DATE = "date"

View File

@ -3,24 +3,26 @@ from pydantic import BaseModel, Field
from werkzeug.exceptions import HTTPException
from typing import List
from pydantic import ValidationError
import time
from intentRecognition import IntentRecognition
from slotRecognition import SlotRecognition
from utils import CheckResult, load_standard_name, generate_project_prompt, \
load_standard_data, text_to_pinyin, multiple_standardize_single_name, \
standardize_projectDepartment
load_standard_data, text_to_pinyin, \
standardize_projectDepartment, standardize_project_name, clean_useless_project_name, \
clean_useless_company_name, standardize_sub_company
from constants import PROJECT_NAME, PROJECT_DEPARTMENT, SIMILARITY_VALUE, IMPLEMENTATION_ORG, RISK_LEVEL
from config import *
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-30750"
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-31940"
MODEL_UIE_PATH = R"../uie/output/checkpoint-31350"
# 类别名称列表
labels = [
"天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询",
"日计划作业内容", "周计划作业内容", "施工人数", "作业考勤人数", "知识问答",
"通用对话", "作业面查询","班组人数查询","班组数查询","作业面内容","班组详情"
"通用对话", "作业面查询", "班组人数查询", "班组数查询", "作业面内容", "班组详情"
]
# 标签映射
@ -41,13 +43,6 @@ label_map = {
13: 'B-teamName', 26: 'I-teamName',
}
# # 初始化工具类
intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels)
# 初始化槽位识别工具类
slot_recognizer = SlotRecognition(MODEL_UIE_PATH, label_map)
# 设置Flask应用
#标准公司名和项目名
standard_company_program = load_standard_data("./standard_data/standard_company_program.json")
@ -61,12 +56,30 @@ standard_company_name_list = list(standard_company_program.keys())
pinyin_to_standard_company_name_map = {text_to_pinyin(kw): kw for kw in standard_company_name_list}
standard_company_name_pinyin_list = list(pinyin_to_standard_company_name_map.keys())
simply_to_standard_project_name_map = {clean_useless_project_name(kw): kw for kw in standard_project_name_list}
pinyin_simply_to_standard_project_name_map = {text_to_pinyin(clean_useless_project_name(kw)): kw for kw in
standard_project_name_list}
simply_to_standard_company_name_map = {clean_useless_company_name(kw): kw for kw in standard_company_name_list}
pinyin_simply_to_standard_company_name_map = {text_to_pinyin(clean_useless_company_name(kw)): kw for kw in
standard_company_name_list}
# 初始化工具类
intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels)
# 初始化槽位识别工具类
slot_recognizer = SlotRecognition(MODEL_UIE_PATH, label_map)
# 设置Flask应用
print(f"标准化的工程名是:{standard_project_name_list}", flush=True)
print(f"pinyin标准化的工程名是 list{standard_project_name_pinyin_list}", flush=True)
print(f"pinyin-工程名对应关系 map{pinyin_to_standard_company_name_map}", flush=True)
print(f"pinyin-工程对应关系 map{pinyin_to_standard_company_name_map}", flush=True)
app = Flask(__name__)
# 统一的异常处理函数
@app.errorhandler(Exception)
def handle_exception(e):
@ -217,7 +230,8 @@ def agent():
entities = slot_recognizer.recognize(query)
print(
f"第一轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",flush=True)
f"第一轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",
flush=True)
# 多轮
else:
res = extract_multi_chat(messages)
@ -233,7 +247,8 @@ def agent():
})
entities = slot_recognizer.recognize(res)
print(
f"多轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",flush=True)
f"多轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",
flush=True)
#必须槽位缺失检查
status, sk = check_lost(predicted_id, entities)
@ -261,11 +276,12 @@ def agent():
except Exception as e:
return jsonify({"error": str(e)}), 500 # 捕捉其他错误并返回
def extract_multi_chat(messages):
from openai import OpenAI
client = OpenAI(base_url=api_base_url, api_key=api_key)
latest_message = messages[-1] # 最后一条用户提问
latest_message = messages[-1] # 最后一条用户提问
if latest_message.role == "user":
latest_user_question = latest_message.content.strip()
time_prefixes = ["今天", "昨天", "本周", "下周", "明天", "今日"] # 可扩展的时间前缀列表
@ -359,7 +375,7 @@ def extract_multi_chat(messages):
messages=message,
model=model_name,
max_tokens=100,
temperature=0.3, # 降低随机性,提高确定性
temperature=0.1, # 降低随机性,提高确定性
stream=False
)
@ -367,6 +383,7 @@ def extract_multi_chat(messages):
print(f"多轮意图后用户想要的问题是:{res}", flush=True)
return res
def check_lost(int_res, slot):
#labels: ["天气查询","通用对话","页面切换","日计划数量查询","周计划数量查询","日计划作业内容","周计划作业内容","施工人数","作业考勤人数","知识问答"]
mapping = {
@ -386,7 +403,7 @@ def check_lost(int_res, slot):
intention_mapping = {2: "页面切换", 3: "日计划数量查询", 4: "周计划数量查询", 5: "日计划作业内容",
6: "周计划作业内容", 7: "施工人数", 8: "作业考勤人数", 11: "作业面查询",
12:"班组人数查询", 13:"班组数查询", 14:"作业面内容", 15:"班组详情"}
12: "班组人数查询", 13: "班组数查询", 14: "作业面内容", 15: "班组详情"}
if not mapping.__contains__(int_res):
return 0, ""
#提取的槽位信息
@ -411,7 +428,7 @@ def check_lost(int_res, slot):
return CheckResult.NO_MATCH, cur_k
#符合当前意图的的必须槽位,但是不在提取的槽位信息里
left = [x for x in mapping[int_res][idx] if x not in cur_k]
print(f"符合当前意图的的必须槽位,但是不在提取的槽位信息里, {left}",flush=True)
print(f"符合当前意图的的必须槽位,但是不在提取的槽位信息里, {left}", flush=True)
apologize_str = "非常抱歉,"
if int_res == 2:
return CheckResult.NEEDS_MORE_ROUNDS, f"{apologize_str}请问你想查询哪个页面?"
@ -434,8 +451,9 @@ def check_standard_name_slot(int_res, slot) -> tuple:
for key, value in slot.items():
if key == PROJECT_NAME:
print(f"check_standard_name_slot 原始工程名 : {slot[PROJECT_NAME]}")
match_results = multiple_standardize_single_name(value, standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,80)
print(f"check_standard_name_slot 匹配后工程名 result:{match_results}",flush=True)
match_results = standardize_project_name(value, simply_to_standard_project_name_map,
pinyin_simply_to_standard_project_name_map, 70, 90)
print(f"check_standard_name_slot 匹配后工程名 result:{match_results}", flush=True)
if match_results and len(match_results) == 1:
slot[key] = match_results[0]
else:
@ -444,8 +462,9 @@ def check_standard_name_slot(int_res, slot) -> tuple:
if key == IMPLEMENTATION_ORG and slot[key] != "公司":
print(f"check_standard_name_slot 原始分公司名 : {slot[IMPLEMENTATION_ORG]}")
match_results = multiple_standardize_single_name(value, standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map, lower_score=50, high_score=80, isArabicNumConv = True)
print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}",flush=True)
match_results = standardize_sub_company(value, simply_to_standard_company_name_map,
pinyin_simply_to_standard_company_name_map, 55, 80)
print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}", flush=True)
if match_results and len(match_results) == 1:
slot[key] = match_results[0]
else:
@ -454,8 +473,9 @@ def check_standard_name_slot(int_res, slot) -> tuple:
if key == PROJECT_DEPARTMENT:
print(f"check_standard_name_slot 原始项目部名 : {slot[PROJECT_DEPARTMENT]}")
match_results = standardize_projectDepartment(slot[IMPLEMENTATION_ORG], value, standard_company_program, high_score=85)
print(f"check_standard_name_slot 匹配后项目部名: result:{match_results}",flush=True)
match_results = standardize_projectDepartment(slot[IMPLEMENTATION_ORG], value, standard_company_program,
high_score=90)
print(f"check_standard_name_slot 匹配后项目部名: result:{match_results}", flush=True)
if match_results and len(match_results) == 1:
slot[key] = match_results[0]
else:
@ -463,92 +483,132 @@ def check_standard_name_slot(int_res, slot) -> tuple:
return CheckResult.NEEDS_MORE_ROUNDS, prompt
if key == RISK_LEVEL:
if slot[RISK_LEVEL] not in["2级","3级","4级","5级"] and slot[RISK_LEVEL] not in["二级","三级","四级","五级"]:
if slot[RISK_LEVEL] not in ["2级", "3级", "4级", "5级"] and slot[RISK_LEVEL] not in ["二级", "三级", "四级",
"五级"]:
return CheckResult.NEEDS_MORE_ROUNDS, "您查询的风险等级在系统中未找到,请确认风险等级后再次提问"
return CheckResult.NO_MATCH, ""
#
# test_cases = [
# ("安徽宏源电力建设有限公司", "第三项目管理部"), # 期望返回所有"第三项目管理部"
# ("安徽宏源电力建设有限公司", "第九项目部"), # 期望返回 "第九项目管理部"
# ("顺安电网公司", "第二项目部"), # 期望匹配"顺安电网建设有限公司"下的"第二项目管理部"
# ("送电一公司", "第三项目部"), # 期望返回"第三项目管理部"
# ("送电2公司", "第三项目部"), # 期望返回"第三项目管理部"
# ("消防分公司", "第七项目部"), # 期望返回"第七项目管理部
# ("建筑分公司", "第七项目部"), # 期望返回"第七项目管理部"
# ("建筑消防分公司", "第七项目部"), # 期望返回"第七项目管理部"
# ("建筑分公司消防分公司", "第七项目部") # 期望返回"第七项目管理部"
# ("送一分公司"),
# ("送二分公司"),
# ("变电分公司"),
# ("建筑分公司"),
# ("检修试验分公司"),
# ("宏源电力公司"),
# ("宏源电力限公司"),
# ("宏源电力限公司线路"),
# ("宏源电力限公司变电"),
# ("送一分"),
# ("送二分"),
# ("变电分"),
# ("建筑分"),
# ("检修试验分"),
# ("宏源电力"),
# ("红源电力"),
# ("宏源电力有限"),
# ("宏源电力限线路"),
# ("宏源电力限变电"),
# ]
#
# for company, project in test_cases:
# # result = standardize_company_and_project(company, project,standard_company_program)
# result = standardize_company_and_projectDepartment(company, project,standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
# # result = multiple_standardize_single_name("company", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,40,70)
# print(f"输入: {company}, {project} -> 输出: {result}")
# print(f"加权混合策略 分公司名匹配**********************")
# start = time.perf_counter()
# for item in test_cases:
# match_results = standardize_sub_company(item,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80)
# print(f"加权混合策略 分公司名匹配 输入: {item}-> 输出: {match_results}")
# end = time.perf_counter()
# print(f"加权混合策略 耗时: {end - start:.4f} 秒")
#
# result = standardize_single_name("送电一公司", standard_company_name_list)
# print(f"输入: 送一分公司-> 输出: {result}")
#
# prompt = generate_project_prompt(result, "分公司名")
# print(f"prompt:{prompt}")
#
# result = standardize_single_name("合肥中心变", standard_project_name_list)
# print(f"输入: 合肥中心变-> 输出: {result}")
# test_cases = [
# ("卢集"),
# ("芦集"),
# ("芦集变电站"),
# ("安庆四变电站"),
# ("锦绣变电站"),
# ("滁州护桥变电站"),
# ("合州换流站"),
# ("陕北合州换流站"),
# ("陕北安徽合州换流站"),
# ("金牛变电站"),
# ("香涧鹭岛工程"),
# ("延庆换流站"),
# ("国网延庆换流站"),
# ("国网北京延庆换流站"),
# ("陶楼广银线路工程"),
# ("紫蓬变电站"),
# ("宿州萧砀变电站"),
# ("冯井变电站"),
# ("富邦秋浦变电站"),
# ("包河玉龙变电站"),
#
# prompt = generate_project_prompt(result, "工程名")
# print(f"prompt:{prompt}")
# ("绿雪莲塘工程"),
# ("合肥循环园工程"),
# ("合肥长临河工程"),
# ("合肥中心变"),
# ("锁库变电站工程"),
# ("槽坊工程"),
#
# ("安庆四500kV变电站新建工程(PROJ-2024-0862)"),
# ("锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)"),
# ("渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)"),
# ("先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)"),
# ("安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)"),
# ("合州士800千伏换流站电气安装A包(PROJ-2025-0056)"),
# ("卫田-陶楼T接首业变电站110kV电缆线路工程(PROJ-2024-1236)"),
# ("谯城(亳三)-希夷220kV线路工程(PROJ-2024-1205)"),
# ]
# print(f"去不重要词汇 工程名匹配******************************************")
# start = time.perf_counter()
# for item in test_cases:
# match_results = standardize_project_name(item, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90)
# print(f"工程名匹配 输入: {item}-> 输出: {match_results}")
# end = time.perf_counter()
# print(f"词集匹配 耗时: {end - start:.4f} 秒")
#
# print(f"项目名匹配******************************************")
# oral_program_name_list = [
# ("第1项目部"), # 期望返回所有"第三项目管理部"
# ("第2项目部"),
# ("第3项目部"),
# ("第4项目部"),
# ("第5项目部"),
# ("第6项目部"),
# ("第7项目部"),
# ("第8项目部"),
# ("第9项目部"),
# ("第10项目部"),
# ("第11项目部"),
# ("第12项目部"),
# ("第13项目部"),
# ("电缆班"),
# ("调试1队"),
# ("调试2队"),
# ("调试3队"),
# ("调试4队"),
# ("调试5队"),
# ("第一项目管理部"),
# ("第二项目管理部"),
# ("第五项目管理部"),
# ("第十一项目管理部(萧砀线路)"),
# ("第三项目管理部(张店线路)"),
# ("第三项目管理部(岳西线路)"),
# ("第五项目管理部(蚌埠)"),
# ("第三项目管理部(六安线路)"),
# ("第十一项目管理部(宿州线路)"),
# ("调试一队"),
# ("调试二队"),
# ("调试三队"),
# ("电缆班"),
# ]
#
# for company in standard_company_name_list:
# for program in oral_program_name_list:
# match_results = standardize_projectDepartment(company, program, standard_company_program, high_score=90)
# print(f"加权混合策略 项目部名称 输入: 公司:{company},项目部:{program}-> 输出: {match_results}")
# result = standardize_single_name("合肥中心变", standard_project_name_list, 60, 75)
# print(f"输入: 合肥中心变-> 输出: {result}")
#
# result = standardize_single_name("阜阳阜四变电站工程", standard_project_name_list, 60, 75)
# print(f"输入: 阜阳阜四变电站工程-> 输出: {result}")
#
# result = standardize_single_name("芦集变电站", standard_project_name_list, 20, 50)
# print(f"输入: 芦集变电站-> 输出: {result}")
#
# match_results = multiple_standardize_single_name("宋轶分公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
# print(f"multiple_standardize_single_name 输入: 宋轶分公司-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("宏源电力公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
# print(f"multiple_standardize_single_name 输入: 宏源电力公司-> 输出: {match_results}")
#
# #
# match_results = multiple_standardize_single_name("合肥中心变", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 合肥中心变-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("淮南安丰", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 淮南安丰工程-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("宿州萧砀新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 宿州萧砀新建工程-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("芦集变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 芦集变电站-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("卢集变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 卢集变电站-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("芦集古沟变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 芦集古沟变电站新建工程-> 输出: {match_results}")
# #
# match_results = multiple_standardize_single_name("金牛变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 金牛变电站-> 输出: {match_results}")
#
#
# company, project = standardize_company_and_projectDepartment("变电分公司","第一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
# print(f"company:{company}, project:{project}")
#
# company, project = standardize_company_and_projectDepartment("变电分公司","第十一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
# print(f"company:{company}, project:{project}")
# company, project = standardize_company_and_projectDepartment("试验分公司","电缆班", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
# print(f"company:{company}, project:{project}")
# company, project = standardize_company_and_projectDepartment("宏源电力投资有限公司","第三项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
# print(f"company:{company}, project:{project}")
#
# match_results = standardize_projectDepartment("安徽宏源电力建设有限公司(变电)", "第3项目部", standard_company_program, high_score=85)
# print(f"match_results:{match_results}")
if __name__ == '__main__':
app.run(host='0.0.0.0', port=18074, debug=True)

View File

@ -3,18 +3,21 @@ from pydantic import BaseModel, Field
from werkzeug.exceptions import HTTPException
from typing import List
from pydantic import ValidationError
import time
from intentRecognition import IntentRecognition
from slotRecognition import SlotRecognition
from utils import CheckResult, load_standard_name, generate_project_prompt, \
load_standard_data, text_to_pinyin, multiple_standardize_single_name, \
standardize_projectDepartment
load_standard_data, text_to_pinyin, \
standardize_projectDepartment, standardize_project_name, clean_useless_project_name, \
clean_useless_company_name, standardize_sub_company
from constants import PROJECT_NAME, PROJECT_DEPARTMENT, SIMILARITY_VALUE, IMPLEMENTATION_ORG, RISK_LEVEL
from config import *
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-30750"
MODEL_UIE_PATH = R"../uie/output_temp/checkpoint-31350"
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-17890"
MODEL_UIE_PATH = R"../uie/output/checkpoint-17290"
# 类别名称列表
labels = [
"天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询",
@ -54,6 +57,14 @@ standard_company_name_list = list(standard_company_program.keys())
pinyin_to_standard_company_name_map = {text_to_pinyin(kw): kw for kw in standard_company_name_list}
standard_company_name_pinyin_list = list(pinyin_to_standard_company_name_map.keys())
simply_to_standard_project_name_map = {clean_useless_project_name(kw): kw for kw in standard_project_name_list}
pinyin_simply_to_standard_project_name_map = {text_to_pinyin(clean_useless_project_name(kw)): kw for kw in standard_project_name_list}
simply_to_standard_company_name_map = {clean_useless_company_name(kw): kw for kw in standard_company_name_list}
pinyin_simply_to_standard_company_name_map = {text_to_pinyin(clean_useless_company_name(kw)): kw for kw in standard_company_name_list}
# 初始化工具类
intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels)
@ -435,7 +446,7 @@ def check_standard_name_slot(int_res, slot) -> tuple:
for key, value in slot.items():
if key == PROJECT_NAME:
print(f"check_standard_name_slot 原始工程名 : {slot[PROJECT_NAME]}")
match_results = multiple_standardize_single_name(value, standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,80)
match_results = standardize_project_name(value, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90)
print(f"check_standard_name_slot 匹配后工程名 result:{match_results}",flush=True)
if match_results and len(match_results) == 1:
slot[key] = match_results[0]
@ -445,7 +456,7 @@ def check_standard_name_slot(int_res, slot) -> tuple:
if key == IMPLEMENTATION_ORG and slot[key] != "公司":
print(f"check_standard_name_slot 原始分公司名 : {slot[IMPLEMENTATION_ORG]}")
match_results = multiple_standardize_single_name(value, standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map, lower_score=50, high_score=80, isArabicNumConv = True)
match_results = standardize_sub_company(value,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80)
print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}",flush=True)
if match_results and len(match_results) == 1:
slot[key] = match_results[0]
@ -469,85 +480,126 @@ def check_standard_name_slot(int_res, slot) -> tuple:
return CheckResult.NO_MATCH, ""
#
# test_cases = [
# ("安徽宏源电力建设有限公司(线路)", "第三项目管理部"), # 期望返回所有"第三项目管理部"
# ("送电一分公司", "第8项目管理部"), # 期望返回 "第九项目管理部"
# # ("顺安电网公司", "第二项目部"), # 期望匹配"顺安电网建设有限公司"下的"第二项目管理部"
# # ("送电一公司", "第三项目部"), # 期望返回"第三项目管理部"
# # ("送电2公司", "第三项目部"), # 期望返回"第三项目管理部"
# # ("消防分公司", "第七项目部"), # 期望返回"第七项目管理部
# # ("建筑分公司", "第七项目部"), # 期望返回"第七项目管理部"
# # ("建筑消防分公司", "第七项目部"), # 期望返回"第七项目管理部"
# # ("建筑分公司消防分公司", "第七项目部") # 期望返回"第七项目管理部"
# ("送一分公司"),
# ("送二分公司"),
# ("变电分公司"),
# ("建筑分公司"),
# ("检修试验分公司"),
# ("宏源电力公司"),
# ("宏源电力限公司"),
# ("宏源电力限公司线路"),
# ("宏源电力限公司变电"),
# ("送一分"),
# ("送二分"),
# ("变电分"),
# ("建筑分"),
# ("检修试验分"),
# ("宏源电力"),
# ("红源电力"),
# ("宏源电力有限"),
# ("宏源电力限线路"),
# ("宏源电力限变电"),
# ]
#
# print(f"加权混合策略 分公司名匹配**********************")
# start = time.perf_counter()
# for item in test_cases:
# match_results = standardize_sub_company(item,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80)
# print(f"加权混合策略 分公司名匹配 输入: {item}-> 输出: {match_results}")
# end = time.perf_counter()
# print(f"加权混合策略 耗时: {end - start:.4f} 秒")
#
#
#
# test_cases = [
# ("卢集"),
# ("芦集"),
# ("芦集变电站"),
# ("安庆四变电站"),
# ("锦绣变电站"),
# ("滁州护桥变电站"),
# ("合州换流站"),
# ("陕北合州换流站"),
# ("陕北安徽合州换流站"),
# ("金牛变电站"),
# ("香涧鹭岛工程"),
# ("延庆换流站"),
# ("国网延庆换流站"),
# ("国网北京延庆换流站"),
# ("陶楼广银线路工程"),
# ("紫蓬变电站"),
# ("宿州萧砀变电站"),
# ("冯井变电站"),
# ("富邦秋浦变电站"),
# ("包河玉龙变电站"),
#
# ("绿雪莲塘工程"),
# ("合肥循环园工程"),
# ("合肥长临河工程"),
# ("合肥中心变"),
# ("锁库变电站工程"),
# ("槽坊工程"),
#
# ("安庆四500kV变电站新建工程(PROJ-2024-0862)"),
# ("锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)"),
# ("渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)"),
# ("先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)"),
# ("安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)"),
# ("合州士800千伏换流站电气安装A包(PROJ-2025-0056)"),
# ("卫田-陶楼T接首业变电站110kV电缆线路工程(PROJ-2024-1236)"),
# ("谯城(亳三)-希夷220kV线路工程(PROJ-2024-1205)"),
# ]
# print(f"去不重要词汇 工程名匹配******************************************")
# start = time.perf_counter()
# for item in test_cases:
# match_results = standardize_project_name(item, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90)
# print(f"工程名匹配 输入: {item}-> 输出: {match_results}")
# end = time.perf_counter()
# print(f"词集匹配 耗时: {end - start:.4f} 秒")
#
# print(f"项目名匹配******************************************")
# oral_program_name_list = [
# ("第1项目部"), # 期望返回所有"第三项目管理部"
# ("第2项目部"),
# ("第3项目部"),
# ("第4项目部"),
# ("第5项目部"),
# ("第6项目部"),
# ("第7项目部"),
# ("第8项目部"),
# ("第9项目部"),
# ("第10项目部"),
# ("第11项目部"),
# ("第12项目部"),
# ("第13项目部"),
# ("电缆班"),
# ("调试1队"),
# ("调试2队"),
# ("调试3队"),
# ("调试4队"),
# ("调试5队"),
# ("第一项目管理部"),
# ("第二项目管理部"),
# ("第五项目管理部"),
# ("第十一项目管理部(萧砀线路)"),
# ("第三项目管理部(张店线路)"),
# ("第三项目管理部(岳西线路)"),
# ("第五项目管理部(蚌埠)"),
# ("第三项目管理部(六安线路)"),
# ("第十一项目管理部(宿州线路)"),
# ("调试一队"),
# ("调试二队"),
# ("调试三队"),
# ("电缆班"),
# ]
#
# for company in standard_company_name_list:
# for program in oral_program_name_list:
# match_results = standardize_projectDepartment(company, program, standard_company_program, high_score=90)
# print(f"加权混合策略 项目部名称 输入: 公司:{company},项目部:{program}-> 输出: {match_results}")
# for company, project in test_cases:
# result = standardize_projectDepartment(company, project,standard_company_program, high_score=90)
# print(f"输入: {company}, {project} -> 输出: {result}")
#
# result = standardize_single_name("送电一公司", standard_company_name_list)
# print(f"输入: 送一分公司-> 输出: {result}")
#
# prompt = generate_project_prompt(result, "分公司名")
# print(f"prompt:{prompt}")
#
# result = standardize_single_name("合肥中心变", standard_project_name_list)
# print(f"输入: 合肥中心变-> 输出: {result}")
#
# prompt = generate_project_prompt(result, "工程名")
# print(f"prompt:{prompt}")
# result = standardize_single_name("合肥中心变", standard_project_name_list, 60, 75)
# print(f"输入: 合肥中心变-> 输出: {result}")
#
# result = standardize_single_name("阜阳阜四变电站工程", standard_project_name_list, 60, 75)
# print(f"输入: 阜阳阜四变电站工程-> 输出: {result}")
#
# result = standardize_single_name("芦集变电站", standard_project_name_list, 20, 50)
# print(f"输入: 芦集变电站-> 输出: {result}")
#
# match_results = multiple_standardize_single_name("宋轶分公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
# print(f"multiple_standardize_single_name 输入: 宋轶分公司-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("宏源电力公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
# print(f"multiple_standardize_single_name 输入: 宏源电力公司-> 输出: {match_results}")
#
# #
# match_results = multiple_standardize_single_name("合肥中心变", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 合肥中心变-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("淮南安丰", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 淮南安丰工程-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("宿州萧砀新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 宿州萧砀新建工程-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("芦集变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 芦集变电站-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("卢集变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 卢集变电站-> 输出: {match_results}")
#
# match_results = multiple_standardize_single_name("芦集古沟变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 芦集古沟变电站新建工程-> 输出: {match_results}")
# #
# match_results = multiple_standardize_single_name("金牛变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
# print(f"multiple_standardize_single_name 输入: 金牛变电站-> 输出: {match_results}")
#
#
# company, project = standardize_company_and_projectDepartment("变电分公司","第一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
# print(f"company:{company}, project:{project}")
#
# company, project = standardize_company_and_projectDepartment("变电分公司","第十一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
# print(f"company:{company}, project:{project}")
# company, project = standardize_company_and_projectDepartment("试验分公司","电缆班", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
# print(f"company:{company}, project:{project}")
# company, project = standardize_company_and_projectDepartment("宏源电力投资有限公司","第三项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
# print(f"company:{company}, project:{project}")
#
# match_results = standardize_projectDepartment("安徽宏源电力建设有限公司(变电)", "第3项目部", standard_company_program, high_score=85)
# print(f"match_results:{match_results}")
if __name__ == '__main__':
app.run(host='0.0.0.0', port=18073, debug=True)

View File

@ -1,10 +1,13 @@
from enum import Enum
from typing import cast, Callable
from rapidfuzz import process, fuzz
import re
from rapidfuzz.fuzz import WRatio
import json
from pypinyin import lazy_pinyin
from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS
# 数字转换表1-20常见数字
digit_to_chinese = {
"1": "", "2": "", "3": "", "4": "", "5": "",
@ -14,6 +17,7 @@ digit_to_chinese = {
"19": "十九", "20": "二十"
}
def arabic_to_chinese_number(text):
"""
将文中阿拉伯数字转换为中文数字
@ -28,10 +32,13 @@ def arabic_to_chinese_number(text):
text = text.replace(num, cn)
return text
def text_to_pinyin(text):
    """Convert ``text`` to one pinyin string.

    The pieces returned by ``lazy_pinyin`` are concatenated with no separator.
    """
    syllables = lazy_pinyin(text)
    return "".join(syllables)
def load_standard_data(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, "r", encoding="utf-8") as handle:
        decoded = json.load(handle)
    return decoded
@ -50,23 +57,99 @@ def extract_number(text):
return None
def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, high_score=85, top_k=3):
    """Fuzzy-match ``input_key`` against ``match_pool`` and map the hits to standard names.

    Every entry of ``match_pool`` is scored with rapidfuzz ``WRatio``; entries
    scoring below ``lower_score`` are dropped. If the best surviving score is
    at least ``high_score``, every entry tied at that best score is returned.
    Otherwise the first ``top_k`` surviving entries are returned, in the order
    ``process.extract`` produced them.

    :param input_key: cleaned lookup string (e.g. a simplified name or its pinyin)
    :param match_pool: candidate strings to score; each one must be a key of
        ``mapping_dict``
    :param mapping_dict: maps a matched candidate back to its original standard name
    :param lower_score: minimum score for a candidate to be kept at all
    :param high_score: score from which the best match is treated as conclusive
    :param top_k: how many candidates to return when no match is conclusive
    :return: list of standard names taken from ``mapping_dict`` (possibly empty)
    """
    scored = process.extract(input_key, match_pool, scorer=cast(Callable, WRatio), limit=len(match_pool))
    # Each result starts with (choice, score); any further fields are ignored.
    candidates = [(choice, score) for choice, score, *_ in scored if score >= lower_score]
    if not candidates:
        return []

    top_score = max(score for _, score in candidates)
    if top_score < high_score:
        # No conclusive winner: hand back the leading candidates for disambiguation.
        # NOTE(review): this relies on process.extract returning results best-first.
        return [mapping_dict[choice] for choice, _ in candidates[:top_k]]
    return [mapping_dict[choice] for choice, score in candidates if score == top_score]
def standardize_name(input_name, clean_func, simply_map, pinyin_map, lower_score=70, high_score=85):
    """Standardize a name: clean it, match on the simplified text, then fall back to pinyin.

    :param input_name: raw name as entered by the user
    :param clean_func: cleaning function applied to ``input_name`` (differs per
        entity type, e.g. project name vs. company name)
    :param simply_map: simplified name -> original standard name
    :param pinyin_map: pinyin of the simplified name -> original standard name
    :param lower_score: minimum fuzzy-match score for a candidate to count
    :param high_score: fuzzy-match score treated as a conclusive match
    :return: list of standard names (possibly empty)
    """
    cleaned = clean_func(input_name)

    # First pass: match the cleaned text against the simplified standard names.
    simplified_keys = list(simply_map.keys())
    direct_hits = fuzzy_match_and_filter(cleaned, simplified_keys, simply_map, lower_score, high_score)
    if direct_hits:
        return direct_hits

    # Second pass: match on pinyin; its result is returned even when it is empty.
    pinyin_keys = list(pinyin_map.keys())
    return fuzzy_match_and_filter(text_to_pinyin(cleaned), pinyin_keys, pinyin_map, lower_score, high_score)
def standardize_sub_company(input_name, simply_map, pinyin_map, lower_score=55, high_score=80):
    """Standardize a user-supplied sub-company name.

    Delegates to ``standardize_name`` with ``clean_useless_company_name`` as
    the cleaning function.

    :param input_name: raw sub-company name
    :param simply_map: cleaned company name -> standard company name
    :param pinyin_map: pinyin of the cleaned company name -> standard company name
    :param lower_score: minimum fuzzy-match score
    :param high_score: fuzzy-match score treated as conclusive
    :return: list of matching standard company names
    """
    return standardize_name(
        input_name,
        clean_useless_company_name,
        simply_map,
        pinyin_map,
        lower_score=lower_score,
        high_score=high_score,
    )
def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90):
    """Standardize a user-supplied project name.

    Delegates to ``standardize_name`` with ``clean_useless_project_name`` as
    the cleaning function.

    :param input_name: raw project name
    :param simply_map: cleaned project name -> standard project name
    :param pinyin_map: pinyin of the cleaned project name -> standard project name
    :param lower_score: minimum fuzzy-match score
    :param high_score: fuzzy-match score treated as conclusive
    :return: list of matching standard project names
    """
    return standardize_name(
        input_name,
        clean_useless_project_name,
        simply_map,
        pinyin_map,
        lower_score=lower_score,
        high_score=high_score,
    )
#标准化项目部名
def standardize_projectDepartment(standard_company, input_project , company_project_department_map, high_score=85):
def standardize_projectDepartment(standard_company, input_project, company_project_department_map, high_score=90):
"""
将口语化的公司名和项目部名转换为标准化名称
参数:
standard_company (str): 标准化公司名
input_project (str): 用户输入的项目部名可能是口语化或不完整的名称
company_project_department_map (dict): 标准化的公司名和项目部名数据格式为 {公司名: [项目部名1, 项目部名2, ...]}
pinyin_to_original_map:分公司拼音和分公司原始名的映射
返回:
tuple: (标准化公司名, 匹配的项目部名列表)如果无法匹配返回 (None, None)
"""
try:
# **2. 先尝试直接匹配最相似的项目名**
project_match = process.extractOne(input_project, company_project_department_map[standard_company], scorer=fuzz.ratio)
project_match = process.extractOne(input_project, company_project_department_map[standard_company],
scorer=cast(Callable, WRatio))
print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True)
if project_match and project_match[1] >= high_score:
return [project_match[0]] # 直接返回匹配的项目名
@ -86,11 +169,13 @@ def standardize_projectDepartment(standard_company, input_project , company_proj
print(f"standardize_projectDepartment{e}", flush=True)
return None
def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin_name_list = None, pinyin_to_original_map = None, lower_score=70, high_score=85, isArabicNumConv = False):
def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin_name_list=None,
pinyin_to_original_map=None, lower_score=70, high_score=85, isArabicNumConv=False):
"""
使用拼音 + rapidfuzz 进行关键词模糊匹配并返回原始的标准名
:param input_name: 口语化的名称中文
:param name_list: 关键词列表中文
:param origin_input_name: 口语化的名称中文
:param origin_name_list: 关键词列表中文
:pinyin_name_list: 关键词列表拼音
:param pinyin_to_original_map: 拼音到原始标准名的映射
:param lower_score: 低匹配分数阈值默认70
@ -100,10 +185,12 @@ def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin
#First round, 原始标准名的匹配性查找,能找到直接返回
if isArabicNumConv:
origin_input_name = arabic_to_chinese_number(origin_input_name)
match_results = process.extract(origin_input_name, origin_name_list, scorer=fuzz.token_sort_ratio, limit=len(origin_name_list))
match_results = process.extract(origin_input_name, origin_name_list, scorer=fuzz.token_sort_ratio,
limit=len(origin_name_list))
# 找到所有相似度 > 80 的匹配项
original_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score]
print(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}", flush=True)
print(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}",
flush=True)
combined_low_confidence_matches = []
if original_high_confidence_matches:
@ -117,31 +204,10 @@ def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin
if not pinyin_name_list or not pinyin_to_original_map:
return None #
#第二轮, 拼音名的匹配性查找,能找到直接返回
# pinyin_input_name = text_to_pinyin(origin_input_name)
# #fuzz.partial_ratio
# match_results = process.extract(pinyin_input_name, pinyin_name_list, scorer=fuzz.ratio, limit=len(pinyin_name_list))
#
# # 筛选出匹配分数 > lower_score 的结果
# pinyin_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score]
# print(f"standardize_pinyin_single_name 拼音匹配, input_name{pinyin_input_name}, high_confidence_matches:{pinyin_high_confidence_matches[:3]}", flush=True)
#
# if not pinyin_high_confidence_matches:
# return combined_low_confidence_matches # 没有找到匹配项
#
# # 选择最高相似度的匹配项
# pinyin_best_match = max(pinyin_high_confidence_matches, key=lambda x: x[1], default=None)
#
# if pinyin_best_match and pinyin_best_match[1] > high_score:
# return [pinyin_to_original_map[pinyin_best_match[0]]] # 直接返回最高相似度的原始工程名
#
# combined_low_confidence_matches.extend(
# [pinyin_to_original_map[match[0]] for match in pinyin_high_confidence_matches[:3]]
# )
# 返回所有匹配项对应的原始名最多返回最低匹配项的前5个
return list(dict.fromkeys(combined_low_confidence_matches))
def generate_project_prompt(matched_projects, original_name = "", type="项目部名"):
def generate_project_prompt(matched_projects, original_name="", type="项目部名"):
"""
生成提示信息用于让用户确认匹配的项目名或分公司名或项目名
@ -203,3 +269,28 @@ class StandardType(Enum):
PROJECT_CHECK = 0
#项目名检查
PROGRAM_CHECK = 1
import re


def _compile_removal_pattern(words):
    """Compile a regex that matches any of ``words`` literally.

    Each word is escaped so regex metacharacters in a word cannot alter the
    pattern. Alternatives are ordered longest-first because the regex engine
    takes the first alternative that matches at a position: a short word
    listed ahead of a longer word it prefixes would otherwise win and leave
    the rest of the longer word behind.
    """
    ordered = sorted(words, key=len, reverse=True)
    return re.compile("|".join(re.escape(word) for word in ordered))


# Regexes used to strip the filler words listed in constants.py
useless_project_words_pattern = _compile_removal_pattern(USELESS_PROJECT_WORDS)
useless_company_words_pattern = _compile_removal_pattern(USELESS_COMPANY_WORDS)
# Project names: runs of ASCII letters, ASCII digits, whitespace, underscores
# and any other non-word character (punctuation, symbols)
project_symbols_pattern = re.compile(r"[A-Za-z0-9\s\W_]+")
# Company names: runs of whitespace, underscores and non-word characters only
# (letters and digits are kept)
company_symbols_pattern = re.compile(r"[\s\W_]+")
def clean_useless_project_name(name: str) -> str:
    """Reduce a project name to its distinguishing characters.

    Filler words matched by ``useless_project_words_pattern`` are removed
    first, then everything matched by ``project_symbols_pattern`` (ASCII
    letters and digits, whitespace, underscores, other non-word characters).
    """
    without_filler = useless_project_words_pattern.sub("", name)
    return project_symbols_pattern.sub("", without_filler).strip()
def clean_useless_company_name(name: str) -> str:
    """Reduce a company name to its distinguishing characters.

    Filler words matched by ``useless_company_words_pattern`` are removed
    first, then everything matched by ``company_symbols_pattern``
    (whitespace, underscores, other non-word characters).
    """
    without_filler = useless_company_words_pattern.sub("", name)
    return company_symbols_pattern.sub("", without_filler).strip()