标准化算法提优
This commit is contained in:
parent
cd00c7efae
commit
510e829382
|
|
@ -1,6 +1,12 @@
|
||||||
# constants.py
|
# constants.py
|
||||||
SIMILARITY_VALUE = 75
|
SIMILARITY_VALUE = 75
|
||||||
#
|
#匹配工程名时,需要过滤掉的词汇
|
||||||
|
USELESS_PROJECT_WORDS = ["项目", "工程", "变电站", "线路", "变电","千伏" ,"换流站","公司","直流"]
|
||||||
|
|
||||||
|
#匹配公司名时,需要过滤掉的词汇
|
||||||
|
USELESS_COMPANY_WORDS = ["公司","分公司"]
|
||||||
|
|
||||||
|
|
||||||
COMPANYNAME_SHA = "顺安电网建设有限公司"
|
COMPANYNAME_SHA = "顺安电网建设有限公司"
|
||||||
#日期
|
#日期
|
||||||
DATE = "date"
|
DATE = "date"
|
||||||
|
|
|
||||||
256
api/main.py
256
api/main.py
|
|
@ -3,24 +3,26 @@ from pydantic import BaseModel, Field
|
||||||
from werkzeug.exceptions import HTTPException
|
from werkzeug.exceptions import HTTPException
|
||||||
from typing import List
|
from typing import List
|
||||||
from pydantic import ValidationError
|
from pydantic import ValidationError
|
||||||
|
import time
|
||||||
|
|
||||||
from intentRecognition import IntentRecognition
|
from intentRecognition import IntentRecognition
|
||||||
from slotRecognition import SlotRecognition
|
from slotRecognition import SlotRecognition
|
||||||
from utils import CheckResult, load_standard_name, generate_project_prompt, \
|
from utils import CheckResult, load_standard_name, generate_project_prompt, \
|
||||||
load_standard_data, text_to_pinyin, multiple_standardize_single_name, \
|
load_standard_data, text_to_pinyin, \
|
||||||
standardize_projectDepartment
|
standardize_projectDepartment, standardize_project_name, clean_useless_project_name, \
|
||||||
|
clean_useless_company_name, standardize_sub_company
|
||||||
|
|
||||||
from constants import PROJECT_NAME, PROJECT_DEPARTMENT, SIMILARITY_VALUE, IMPLEMENTATION_ORG, RISK_LEVEL
|
from constants import PROJECT_NAME, PROJECT_DEPARTMENT, SIMILARITY_VALUE, IMPLEMENTATION_ORG, RISK_LEVEL
|
||||||
from config import *
|
from config import *
|
||||||
|
|
||||||
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-30750"
|
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-31940"
|
||||||
MODEL_UIE_PATH = R"../uie/output/checkpoint-31350"
|
MODEL_UIE_PATH = R"../uie/output/checkpoint-31350"
|
||||||
|
|
||||||
# 类别名称列表
|
# 类别名称列表
|
||||||
labels = [
|
labels = [
|
||||||
"天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询",
|
"天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询",
|
||||||
"日计划作业内容", "周计划作业内容", "施工人数", "作业考勤人数", "知识问答",
|
"日计划作业内容", "周计划作业内容", "施工人数", "作业考勤人数", "知识问答",
|
||||||
"通用对话", "作业面查询","班组人数查询","班组数查询","作业面内容","班组详情"
|
"通用对话", "作业面查询", "班组人数查询", "班组数查询", "作业面内容", "班组详情"
|
||||||
]
|
]
|
||||||
|
|
||||||
# 标签映射
|
# 标签映射
|
||||||
|
|
@ -41,13 +43,6 @@ label_map = {
|
||||||
13: 'B-teamName', 26: 'I-teamName',
|
13: 'B-teamName', 26: 'I-teamName',
|
||||||
}
|
}
|
||||||
|
|
||||||
# # 初始化工具类
|
|
||||||
intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels)
|
|
||||||
|
|
||||||
# 初始化槽位识别工具类
|
|
||||||
slot_recognizer = SlotRecognition(MODEL_UIE_PATH, label_map)
|
|
||||||
# 设置Flask应用
|
|
||||||
|
|
||||||
#标准公司名和项目名
|
#标准公司名和项目名
|
||||||
standard_company_program = load_standard_data("./standard_data/standard_company_program.json")
|
standard_company_program = load_standard_data("./standard_data/standard_company_program.json")
|
||||||
|
|
||||||
|
|
@ -61,12 +56,30 @@ standard_company_name_list = list(standard_company_program.keys())
|
||||||
pinyin_to_standard_company_name_map = {text_to_pinyin(kw): kw for kw in standard_company_name_list}
|
pinyin_to_standard_company_name_map = {text_to_pinyin(kw): kw for kw in standard_company_name_list}
|
||||||
standard_company_name_pinyin_list = list(pinyin_to_standard_company_name_map.keys())
|
standard_company_name_pinyin_list = list(pinyin_to_standard_company_name_map.keys())
|
||||||
|
|
||||||
|
simply_to_standard_project_name_map = {clean_useless_project_name(kw): kw for kw in standard_project_name_list}
|
||||||
|
|
||||||
|
pinyin_simply_to_standard_project_name_map = {text_to_pinyin(clean_useless_project_name(kw)): kw for kw in
|
||||||
|
standard_project_name_list}
|
||||||
|
|
||||||
|
simply_to_standard_company_name_map = {clean_useless_company_name(kw): kw for kw in standard_company_name_list}
|
||||||
|
|
||||||
|
pinyin_simply_to_standard_company_name_map = {text_to_pinyin(clean_useless_company_name(kw)): kw for kw in
|
||||||
|
standard_company_name_list}
|
||||||
|
|
||||||
|
# 初始化工具类
|
||||||
|
intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels)
|
||||||
|
|
||||||
|
# 初始化槽位识别工具类
|
||||||
|
slot_recognizer = SlotRecognition(MODEL_UIE_PATH, label_map)
|
||||||
|
# 设置Flask应用
|
||||||
|
|
||||||
print(f"标准化的工程名是:{standard_project_name_list}", flush=True)
|
print(f"标准化的工程名是:{standard_project_name_list}", flush=True)
|
||||||
print(f"pinyin标准化的工程名是 list:{standard_project_name_pinyin_list}", flush=True)
|
print(f"pinyin标准化的工程名是 list:{standard_project_name_pinyin_list}", flush=True)
|
||||||
print(f"pinyin-工程名对应关系 map:{pinyin_to_standard_company_name_map}", flush=True)
|
print(f"pinyin-工程民对应关系 map:{pinyin_to_standard_company_name_map}", flush=True)
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
|
||||||
# 统一的异常处理函数
|
# 统一的异常处理函数
|
||||||
@app.errorhandler(Exception)
|
@app.errorhandler(Exception)
|
||||||
def handle_exception(e):
|
def handle_exception(e):
|
||||||
|
|
@ -217,7 +230,8 @@ def agent():
|
||||||
entities = slot_recognizer.recognize(query)
|
entities = slot_recognizer.recognize(query)
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f"第一轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",flush=True)
|
f"第一轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",
|
||||||
|
flush=True)
|
||||||
# 多轮
|
# 多轮
|
||||||
else:
|
else:
|
||||||
res = extract_multi_chat(messages)
|
res = extract_multi_chat(messages)
|
||||||
|
|
@ -233,7 +247,8 @@ def agent():
|
||||||
})
|
})
|
||||||
entities = slot_recognizer.recognize(res)
|
entities = slot_recognizer.recognize(res)
|
||||||
print(
|
print(
|
||||||
f"多轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",flush=True)
|
f"多轮意图识别后的label:{predicted_label}, id:{predicted_id},槽位抽取后的实体:{entities},message:{messages}",
|
||||||
|
flush=True)
|
||||||
|
|
||||||
#必须槽位缺失检查
|
#必须槽位缺失检查
|
||||||
status, sk = check_lost(predicted_id, entities)
|
status, sk = check_lost(predicted_id, entities)
|
||||||
|
|
@ -261,6 +276,7 @@ def agent():
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return jsonify({"error": str(e)}), 500 # 捕捉其他错误并返回
|
return jsonify({"error": str(e)}), 500 # 捕捉其他错误并返回
|
||||||
|
|
||||||
|
|
||||||
def extract_multi_chat(messages):
|
def extract_multi_chat(messages):
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
client = OpenAI(base_url=api_base_url, api_key=api_key)
|
client = OpenAI(base_url=api_base_url, api_key=api_key)
|
||||||
|
|
@ -359,7 +375,7 @@ def extract_multi_chat(messages):
|
||||||
messages=message,
|
messages=message,
|
||||||
model=model_name,
|
model=model_name,
|
||||||
max_tokens=100,
|
max_tokens=100,
|
||||||
temperature=0.3, # 降低随机性,提高确定性
|
temperature=0.1, # 降低随机性,提高确定性
|
||||||
stream=False
|
stream=False
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -367,6 +383,7 @@ def extract_multi_chat(messages):
|
||||||
print(f"多轮意图后用户想要的问题是:{res}", flush=True)
|
print(f"多轮意图后用户想要的问题是:{res}", flush=True)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
def check_lost(int_res, slot):
|
def check_lost(int_res, slot):
|
||||||
#labels: ["天气查询","互联网查询","页面切换","日计划数量查询","周计划数量查询","日计划作业内容","周计划作业内容","施工人数","作业考勤人数","知识问答","通用对话","作业面查询","班组人数查询","班组数查询","作业面内容","班组详情"]
|
#labels: ["天气查询","互联网查询","页面切换","日计划数量查询","周计划数量查询","日计划作业内容","周计划作业内容","施工人数","作业考勤人数","知识问答","通用对话","作业面查询","班组人数查询","班组数查询","作业面内容","班组详情"]
|
||||||
mapping = {
|
mapping = {
|
||||||
|
|
@ -386,7 +403,7 @@ def check_lost(int_res, slot):
|
||||||
|
|
||||||
intention_mapping = {2: "页面切换", 3: "日计划数量查询", 4: "周计划数量查询", 5: "日计划作业内容",
|
intention_mapping = {2: "页面切换", 3: "日计划数量查询", 4: "周计划数量查询", 5: "日计划作业内容",
|
||||||
6: "周计划作业内容", 7: "施工人数", 8: "作业考勤人数", 11: "作业面查询",
|
6: "周计划作业内容", 7: "施工人数", 8: "作业考勤人数", 11: "作业面查询",
|
||||||
12:"班组人数查询", 13:"班组数查询", 14:"作业面内容", 15:"班组详情"}
|
12: "班组人数查询", 13: "班组数查询", 14: "作业面内容", 15: "班组详情"}
|
||||||
if not mapping.__contains__(int_res):
|
if not mapping.__contains__(int_res):
|
||||||
return 0, ""
|
return 0, ""
|
||||||
#提取的槽位信息
|
#提取的槽位信息
|
||||||
|
|
@ -411,7 +428,7 @@ def check_lost(int_res, slot):
|
||||||
return CheckResult.NO_MATCH, cur_k
|
return CheckResult.NO_MATCH, cur_k
|
||||||
#符合当前意图的必须槽位,但是不在提取的槽位信息里
|
#符合当前意图的必须槽位,但是不在提取的槽位信息里
|
||||||
left = [x for x in mapping[int_res][idx] if x not in cur_k]
|
left = [x for x in mapping[int_res][idx] if x not in cur_k]
|
||||||
print(f"符合当前意图的的必须槽位,但是不在提取的槽位信息里, {left}",flush=True)
|
print(f"符合当前意图的的必须槽位,但是不在提取的槽位信息里, {left}", flush=True)
|
||||||
apologize_str = "非常抱歉,"
|
apologize_str = "非常抱歉,"
|
||||||
if int_res == 2:
|
if int_res == 2:
|
||||||
return CheckResult.NEEDS_MORE_ROUNDS, f"{apologize_str}请问你想查询哪个页面?"
|
return CheckResult.NEEDS_MORE_ROUNDS, f"{apologize_str}请问你想查询哪个页面?"
|
||||||
|
|
@ -434,8 +451,9 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
||||||
for key, value in slot.items():
|
for key, value in slot.items():
|
||||||
if key == PROJECT_NAME:
|
if key == PROJECT_NAME:
|
||||||
print(f"check_standard_name_slot 原始工程名 : {slot[PROJECT_NAME]}")
|
print(f"check_standard_name_slot 原始工程名 : {slot[PROJECT_NAME]}")
|
||||||
match_results = multiple_standardize_single_name(value, standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,80)
|
match_results = standardize_project_name(value, simply_to_standard_project_name_map,
|
||||||
print(f"check_standard_name_slot 匹配后工程名 :result:{match_results}",flush=True)
|
pinyin_simply_to_standard_project_name_map, 70, 90)
|
||||||
|
print(f"check_standard_name_slot 匹配后工程名 :result:{match_results}", flush=True)
|
||||||
if match_results and len(match_results) == 1:
|
if match_results and len(match_results) == 1:
|
||||||
slot[key] = match_results[0]
|
slot[key] = match_results[0]
|
||||||
else:
|
else:
|
||||||
|
|
@ -444,8 +462,9 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
||||||
|
|
||||||
if key == IMPLEMENTATION_ORG and slot[key] != "公司":
|
if key == IMPLEMENTATION_ORG and slot[key] != "公司":
|
||||||
print(f"check_standard_name_slot 原始分公司名 : {slot[IMPLEMENTATION_ORG]}")
|
print(f"check_standard_name_slot 原始分公司名 : {slot[IMPLEMENTATION_ORG]}")
|
||||||
match_results = multiple_standardize_single_name(value, standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map, lower_score=50, high_score=80, isArabicNumConv = True)
|
match_results = standardize_sub_company(value, simply_to_standard_company_name_map,
|
||||||
print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}",flush=True)
|
pinyin_simply_to_standard_company_name_map, 55, 80)
|
||||||
|
print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}", flush=True)
|
||||||
if match_results and len(match_results) == 1:
|
if match_results and len(match_results) == 1:
|
||||||
slot[key] = match_results[0]
|
slot[key] = match_results[0]
|
||||||
else:
|
else:
|
||||||
|
|
@ -454,8 +473,9 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
||||||
|
|
||||||
if key == PROJECT_DEPARTMENT:
|
if key == PROJECT_DEPARTMENT:
|
||||||
print(f"check_standard_name_slot 原始项目部名 : {slot[PROJECT_DEPARTMENT]}")
|
print(f"check_standard_name_slot 原始项目部名 : {slot[PROJECT_DEPARTMENT]}")
|
||||||
match_results = standardize_projectDepartment(slot[IMPLEMENTATION_ORG], value, standard_company_program, high_score=85)
|
match_results = standardize_projectDepartment(slot[IMPLEMENTATION_ORG], value, standard_company_program,
|
||||||
print(f"check_standard_name_slot 匹配后项目部名: result:{match_results}",flush=True)
|
high_score=90)
|
||||||
|
print(f"check_standard_name_slot 匹配后项目部名: result:{match_results}", flush=True)
|
||||||
if match_results and len(match_results) == 1:
|
if match_results and len(match_results) == 1:
|
||||||
slot[key] = match_results[0]
|
slot[key] = match_results[0]
|
||||||
else:
|
else:
|
||||||
|
|
@ -463,92 +483,132 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
||||||
return CheckResult.NEEDS_MORE_ROUNDS, prompt
|
return CheckResult.NEEDS_MORE_ROUNDS, prompt
|
||||||
|
|
||||||
if key == RISK_LEVEL:
|
if key == RISK_LEVEL:
|
||||||
if slot[RISK_LEVEL] not in["2级","3级","4级","5级"] and slot[RISK_LEVEL] not in["二级","三级","四级","五级"]:
|
if slot[RISK_LEVEL] not in ["2级", "3级", "4级", "5级"] and slot[RISK_LEVEL] not in ["二级", "三级", "四级",
|
||||||
|
"五级"]:
|
||||||
return CheckResult.NEEDS_MORE_ROUNDS, "您查询的风险等级在系统中未找到,请确认风险等级后再次提问"
|
return CheckResult.NEEDS_MORE_ROUNDS, "您查询的风险等级在系统中未找到,请确认风险等级后再次提问"
|
||||||
|
|
||||||
return CheckResult.NO_MATCH, ""
|
return CheckResult.NO_MATCH, ""
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
# test_cases = [
|
# test_cases = [
|
||||||
# ("安徽宏源电力建设有限公司", "第三项目管理部"), # 期望返回所有"第三项目管理部"
|
# ("送一分公司"),
|
||||||
# ("安徽宏源电力建设有限公司", "第九项目部"), # 期望返回 "第九项目管理部"
|
# ("送二分公司"),
|
||||||
# ("顺安电网公司", "第二项目部"), # 期望匹配"顺安电网建设有限公司"下的"第二项目管理部"
|
# ("变电分公司"),
|
||||||
# ("送电一公司", "第三项目部"), # 期望返回"第三项目管理部"
|
# ("建筑分公司"),
|
||||||
# ("送电2公司", "第三项目部"), # 期望返回"第三项目管理部"
|
# ("检修试验分公司"),
|
||||||
# ("消防分公司", "第七项目部"), # 期望返回"第七项目管理部"
|
# ("宏源电力公司"),
|
||||||
# ("建筑分公司", "第七项目部"), # 期望返回"第七项目管理部"
|
# ("宏源电力限公司"),
|
||||||
# ("建筑消防分公司", "第七项目部"), # 期望返回"第七项目管理部"
|
# ("宏源电力限公司线路"),
|
||||||
# ("建筑分公司消防分公司", "第七项目部") # 期望返回"第七项目管理部"
|
# ("宏源电力限公司变电"),
|
||||||
|
# ("送一分"),
|
||||||
|
# ("送二分"),
|
||||||
|
# ("变电分"),
|
||||||
|
# ("建筑分"),
|
||||||
|
# ("检修试验分"),
|
||||||
|
# ("宏源电力"),
|
||||||
|
# ("红源电力"),
|
||||||
|
# ("宏源电力有限"),
|
||||||
|
# ("宏源电力限线路"),
|
||||||
|
# ("宏源电力限变电"),
|
||||||
# ]
|
# ]
|
||||||
#
|
#
|
||||||
# for company, project in test_cases:
|
# print(f"加权混合策略 分公司名匹配**********************")
|
||||||
# # result = standardize_company_and_project(company, project,standard_company_program)
|
# start = time.perf_counter()
|
||||||
# result = standardize_company_and_projectDepartment(company, project,standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
# for item in test_cases:
|
||||||
# # result = multiple_standardize_single_name("company", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,40,70)
|
# match_results = standardize_sub_company(item,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80)
|
||||||
# print(f"输入: {company}, {project} -> 输出: {result}")
|
# print(f"加权混合策略 分公司名匹配 输入: {item}-> 输出: {match_results}")
|
||||||
|
# end = time.perf_counter()
|
||||||
|
# print(f"加权混合策略 耗时: {end - start:.4f} 秒")
|
||||||
#
|
#
|
||||||
# result = standardize_single_name("送电一公司", standard_company_name_list)
|
|
||||||
# print(f"输入: 送一分公司-> 输出: {result}")
|
|
||||||
#
|
#
|
||||||
# prompt = generate_project_prompt(result, "分公司名")
|
|
||||||
# print(f"prompt:{prompt}")
|
|
||||||
#
|
#
|
||||||
# result = standardize_single_name("合肥中心变", standard_project_name_list)
|
# test_cases = [
|
||||||
# print(f"输入: 合肥中心变-> 输出: {result}")
|
# ("卢集"),
|
||||||
|
# ("芦集"),
|
||||||
|
# ("芦集变电站"),
|
||||||
|
# ("安庆四变电站"),
|
||||||
|
# ("锦绣变电站"),
|
||||||
|
# ("滁州护桥变电站"),
|
||||||
|
# ("合州换流站"),
|
||||||
|
# ("陕北合州换流站"),
|
||||||
|
# ("陕北安徽合州换流站"),
|
||||||
|
# ("金牛变电站"),
|
||||||
|
# ("香涧鹭岛工程"),
|
||||||
|
# ("延庆换流站"),
|
||||||
|
# ("国网延庆换流站"),
|
||||||
|
# ("国网北京延庆换流站"),
|
||||||
|
# ("陶楼广银线路工程"),
|
||||||
|
# ("紫蓬变电站"),
|
||||||
|
# ("宿州萧砀变电站"),
|
||||||
|
# ("冯井变电站"),
|
||||||
|
# ("富邦秋浦变电站"),
|
||||||
|
# ("包河玉龙变电站"),
|
||||||
#
|
#
|
||||||
# prompt = generate_project_prompt(result, "工程名")
|
# ("绿雪莲塘工程"),
|
||||||
# print(f"prompt:{prompt}")
|
# ("合肥循环园工程"),
|
||||||
|
# ("合肥长临河工程"),
|
||||||
|
# ("合肥中心变"),
|
||||||
|
# ("锁库变电站工程"),
|
||||||
|
# ("槽坊工程"),
|
||||||
|
#
|
||||||
|
# ("安庆四500kV变电站新建工程(PROJ-2024-0862)"),
|
||||||
|
# ("锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)"),
|
||||||
|
# ("渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)"),
|
||||||
|
# ("先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)"),
|
||||||
|
# ("安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)"),
|
||||||
|
# ("合州士800千伏换流站电气安装A包(PROJ-2025-0056)"),
|
||||||
|
# ("卫田-陶楼T接首业变电站110kV电缆线路工程(PROJ-2024-1236)"),
|
||||||
|
# ("谯城(亳三)-希夷220kV线路工程(PROJ-2024-1205)"),
|
||||||
|
# ]
|
||||||
|
# print(f"去不重要词汇 工程名匹配******************************************")
|
||||||
|
# start = time.perf_counter()
|
||||||
|
# for item in test_cases:
|
||||||
|
# match_results = standardize_project_name(item, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90)
|
||||||
|
# print(f"工程名匹配 输入: {item}-> 输出: {match_results}")
|
||||||
|
# end = time.perf_counter()
|
||||||
|
# print(f"词集匹配 耗时: {end - start:.4f} 秒")
|
||||||
|
#
|
||||||
|
# print(f"项目名匹配******************************************")
|
||||||
|
# oral_program_name_list = [
|
||||||
|
# ("第1项目部"), # 期望返回所有"第一项目管理部"
|
||||||
|
# ("第2项目部"),
|
||||||
|
# ("第3项目部"),
|
||||||
|
# ("第4项目部"),
|
||||||
|
# ("第5项目部"),
|
||||||
|
# ("第6项目部"),
|
||||||
|
# ("第7项目部"),
|
||||||
|
# ("第8项目部"),
|
||||||
|
# ("第9项目部"),
|
||||||
|
# ("第10项目部"),
|
||||||
|
# ("第11项目部"),
|
||||||
|
# ("第12项目部"),
|
||||||
|
# ("第13项目部"),
|
||||||
|
# ("电缆班"),
|
||||||
|
# ("调试1队"),
|
||||||
|
# ("调试2队"),
|
||||||
|
# ("调试3队"),
|
||||||
|
# ("调试4队"),
|
||||||
|
# ("调试5队"),
|
||||||
|
# ("第一项目管理部"),
|
||||||
|
# ("第二项目管理部"),
|
||||||
|
# ("第五项目管理部"),
|
||||||
|
# ("第十一项目管理部(萧砀线路)"),
|
||||||
|
# ("第三项目管理部(张店线路)"),
|
||||||
|
# ("第三项目管理部(岳西线路)"),
|
||||||
|
# ("第五项目管理部(蚌埠)"),
|
||||||
|
# ("第三项目管理部(六安线路)"),
|
||||||
|
# ("第十一项目管理部(宿州线路)"),
|
||||||
|
# ("调试一队"),
|
||||||
|
# ("调试二队"),
|
||||||
|
# ("调试三队"),
|
||||||
|
# ("电缆班"),
|
||||||
|
# ]
|
||||||
|
#
|
||||||
|
# for company in standard_company_name_list:
|
||||||
|
# for program in oral_program_name_list:
|
||||||
|
# match_results = standardize_projectDepartment(company, program, standard_company_program, high_score=90)
|
||||||
|
# print(f"加权混合策略 项目部名称 输入: 公司:{company},项目部:{program}-> 输出: {match_results}")
|
||||||
|
|
||||||
# result = standardize_single_name("合肥中心变", standard_project_name_list, 60, 75)
|
|
||||||
# print(f"输入: 合肥中心变-> 输出: {result}")
|
|
||||||
#
|
|
||||||
# result = standardize_single_name("阜阳阜四变电站工程", standard_project_name_list, 60, 75)
|
|
||||||
# print(f"输入: 阜阳阜四变电站工程-> 输出: {result}")
|
|
||||||
#
|
|
||||||
# result = standardize_single_name("芦集变电站", standard_project_name_list, 20, 50)
|
|
||||||
# print(f"输入: 芦集变电站-> 输出: {result}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("宋轶分公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 宋轶分公司-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("宏源电力公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 宏源电力公司-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# #
|
|
||||||
# match_results = multiple_standardize_single_name("合肥中心变", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 合肥中心变-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("淮南安丰", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 淮南安丰工程-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("宿州萧砀新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 宿州萧砀新建工程-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("芦集变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 芦集变电站-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("卢集变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 卢集变电站-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("芦集古沟变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 芦集古沟变电站新建工程-> 输出: {match_results}")
|
|
||||||
# #
|
|
||||||
# match_results = multiple_standardize_single_name("金牛变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 金牛变电站-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
|
|
||||||
#
|
|
||||||
# company, project = standardize_company_and_projectDepartment("变电分公司","第一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
|
||||||
# print(f"company:{company}, project:{project}")
|
|
||||||
#
|
|
||||||
# company, project = standardize_company_and_projectDepartment("变电分公司","第十一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
|
||||||
# print(f"company:{company}, project:{project}")
|
|
||||||
# company, project = standardize_company_and_projectDepartment("试验分公司","电缆班", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
|
||||||
# print(f"company:{company}, project:{project}")
|
|
||||||
# company, project = standardize_company_and_projectDepartment("宏源电力投资有限公司","第三项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
|
||||||
# print(f"company:{company}, project:{project}")
|
|
||||||
#
|
|
||||||
|
|
||||||
# match_results = standardize_projectDepartment("安徽宏源电力建设有限公司(变电)", "第3项目部", standard_company_program, high_score=85)
|
|
||||||
# print(f"match_results:{match_results}")
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
app.run(host='0.0.0.0', port=18074, debug=True)
|
app.run(host='0.0.0.0', port=18074, debug=True)
|
||||||
|
|
|
||||||
218
api/main_temp.py
218
api/main_temp.py
|
|
@ -3,18 +3,21 @@ from pydantic import BaseModel, Field
|
||||||
from werkzeug.exceptions import HTTPException
|
from werkzeug.exceptions import HTTPException
|
||||||
from typing import List
|
from typing import List
|
||||||
from pydantic import ValidationError
|
from pydantic import ValidationError
|
||||||
|
import time
|
||||||
|
|
||||||
from intentRecognition import IntentRecognition
|
from intentRecognition import IntentRecognition
|
||||||
from slotRecognition import SlotRecognition
|
from slotRecognition import SlotRecognition
|
||||||
from utils import CheckResult, load_standard_name, generate_project_prompt, \
|
from utils import CheckResult, load_standard_name, generate_project_prompt, \
|
||||||
load_standard_data, text_to_pinyin, multiple_standardize_single_name, \
|
load_standard_data, text_to_pinyin, \
|
||||||
standardize_projectDepartment
|
standardize_projectDepartment, standardize_project_name, clean_useless_project_name, \
|
||||||
|
clean_useless_company_name, standardize_sub_company
|
||||||
|
|
||||||
from constants import PROJECT_NAME, PROJECT_DEPARTMENT, SIMILARITY_VALUE, IMPLEMENTATION_ORG, RISK_LEVEL
|
from constants import PROJECT_NAME, PROJECT_DEPARTMENT, SIMILARITY_VALUE, IMPLEMENTATION_ORG, RISK_LEVEL
|
||||||
from config import *
|
from config import *
|
||||||
|
|
||||||
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-30750"
|
MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-17890"
|
||||||
MODEL_UIE_PATH = R"../uie/output_temp/checkpoint-31350"
|
MODEL_UIE_PATH = R"../uie/output/checkpoint-17290"
|
||||||
|
|
||||||
# 类别名称列表
|
# 类别名称列表
|
||||||
labels = [
|
labels = [
|
||||||
"天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询",
|
"天气查询", "互联网查询", "页面切换", "日计划数量查询", "周计划数量查询",
|
||||||
|
|
@ -54,6 +57,14 @@ standard_company_name_list = list(standard_company_program.keys())
|
||||||
pinyin_to_standard_company_name_map = {text_to_pinyin(kw): kw for kw in standard_company_name_list}
|
pinyin_to_standard_company_name_map = {text_to_pinyin(kw): kw for kw in standard_company_name_list}
|
||||||
standard_company_name_pinyin_list = list(pinyin_to_standard_company_name_map.keys())
|
standard_company_name_pinyin_list = list(pinyin_to_standard_company_name_map.keys())
|
||||||
|
|
||||||
|
simply_to_standard_project_name_map = {clean_useless_project_name(kw): kw for kw in standard_project_name_list}
|
||||||
|
|
||||||
|
pinyin_simply_to_standard_project_name_map = {text_to_pinyin(clean_useless_project_name(kw)): kw for kw in standard_project_name_list}
|
||||||
|
|
||||||
|
simply_to_standard_company_name_map = {clean_useless_company_name(kw): kw for kw in standard_company_name_list}
|
||||||
|
|
||||||
|
pinyin_simply_to_standard_company_name_map = {text_to_pinyin(clean_useless_company_name(kw)): kw for kw in standard_company_name_list}
|
||||||
|
|
||||||
|
|
||||||
# 初始化工具类
|
# 初始化工具类
|
||||||
intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels)
|
intent_recognizer = IntentRecognition(MODEL_ERNIE_PATH, labels)
|
||||||
|
|
@ -435,7 +446,7 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
||||||
for key, value in slot.items():
|
for key, value in slot.items():
|
||||||
if key == PROJECT_NAME:
|
if key == PROJECT_NAME:
|
||||||
print(f"check_standard_name_slot 原始工程名 : {slot[PROJECT_NAME]}")
|
print(f"check_standard_name_slot 原始工程名 : {slot[PROJECT_NAME]}")
|
||||||
match_results = multiple_standardize_single_name(value, standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,80)
|
match_results = standardize_project_name(value, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90)
|
||||||
print(f"check_standard_name_slot 匹配后工程名 :result:{match_results}",flush=True)
|
print(f"check_standard_name_slot 匹配后工程名 :result:{match_results}",flush=True)
|
||||||
if match_results and len(match_results) == 1:
|
if match_results and len(match_results) == 1:
|
||||||
slot[key] = match_results[0]
|
slot[key] = match_results[0]
|
||||||
|
|
@ -445,7 +456,7 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
||||||
|
|
||||||
if key == IMPLEMENTATION_ORG and slot[key] != "公司":
|
if key == IMPLEMENTATION_ORG and slot[key] != "公司":
|
||||||
print(f"check_standard_name_slot 原始分公司名 : {slot[IMPLEMENTATION_ORG]}")
|
print(f"check_standard_name_slot 原始分公司名 : {slot[IMPLEMENTATION_ORG]}")
|
||||||
match_results = multiple_standardize_single_name(value, standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map, lower_score=50, high_score=80, isArabicNumConv = True)
|
match_results = standardize_sub_company(value,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80)
|
||||||
print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}",flush=True)
|
print(f"check_standard_name_slot 匹配后分公司名: result:{match_results}",flush=True)
|
||||||
if match_results and len(match_results) == 1:
|
if match_results and len(match_results) == 1:
|
||||||
slot[key] = match_results[0]
|
slot[key] = match_results[0]
|
||||||
|
|
@ -469,85 +480,126 @@ def check_standard_name_slot(int_res, slot) -> tuple:
|
||||||
|
|
||||||
return CheckResult.NO_MATCH, ""
|
return CheckResult.NO_MATCH, ""
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
# test_cases = [
|
# test_cases = [
|
||||||
# ("安徽宏源电力建设有限公司(线路)", "第三项目管理部"), # 期望返回所有"第三项目管理部"
|
# ("送一分公司"),
|
||||||
# ("送电一分公司", "第8项目管理部"), # 期望返回 "第九项目管理部"
|
# ("送二分公司"),
|
||||||
# # ("顺安电网公司", "第二项目部"), # 期望匹配"顺安电网建设有限公司"下的"第二项目管理部"
|
# ("变电分公司"),
|
||||||
# # ("送电一公司", "第三项目部"), # 期望返回"第三项目管理部"
|
# ("建筑分公司"),
|
||||||
# # ("送电2公司", "第三项目部"), # 期望返回"第三项目管理部"
|
# ("检修试验分公司"),
|
||||||
# # ("消防分公司", "第七项目部"), # 期望返回"第七项目管理部"
|
# ("宏源电力公司"),
|
||||||
# # ("建筑分公司", "第七项目部"), # 期望返回"第七项目管理部"
|
# ("宏源电力限公司"),
|
||||||
# # ("建筑消防分公司", "第七项目部"), # 期望返回"第七项目管理部"
|
# ("宏源电力限公司线路"),
|
||||||
# # ("建筑分公司消防分公司", "第七项目部") # 期望返回"第七项目管理部"
|
# ("宏源电力限公司变电"),
|
||||||
|
# ("送一分"),
|
||||||
|
# ("送二分"),
|
||||||
|
# ("变电分"),
|
||||||
|
# ("建筑分"),
|
||||||
|
# ("检修试验分"),
|
||||||
|
# ("宏源电力"),
|
||||||
|
# ("红源电力"),
|
||||||
|
# ("宏源电力有限"),
|
||||||
|
# ("宏源电力限线路"),
|
||||||
|
# ("宏源电力限变电"),
|
||||||
# ]
|
# ]
|
||||||
|
#
|
||||||
|
# print(f"加权混合策略 分公司名匹配**********************")
|
||||||
|
# start = time.perf_counter()
|
||||||
|
# for item in test_cases:
|
||||||
|
# match_results = standardize_sub_company(item,simply_to_standard_company_name_map, pinyin_simply_to_standard_company_name_map,55,80)
|
||||||
|
# print(f"加权混合策略 分公司名匹配 输入: {item}-> 输出: {match_results}")
|
||||||
|
# end = time.perf_counter()
|
||||||
|
# print(f"加权混合策略 耗时: {end - start:.4f} 秒")
|
||||||
|
#
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# test_cases = [
|
||||||
|
# ("卢集"),
|
||||||
|
# ("芦集"),
|
||||||
|
# ("芦集变电站"),
|
||||||
|
# ("安庆四变电站"),
|
||||||
|
# ("锦绣变电站"),
|
||||||
|
# ("滁州护桥变电站"),
|
||||||
|
# ("合州换流站"),
|
||||||
|
# ("陕北合州换流站"),
|
||||||
|
# ("陕北安徽合州换流站"),
|
||||||
|
# ("金牛变电站"),
|
||||||
|
# ("香涧鹭岛工程"),
|
||||||
|
# ("延庆换流站"),
|
||||||
|
# ("国网延庆换流站"),
|
||||||
|
# ("国网北京延庆换流站"),
|
||||||
|
# ("陶楼广银线路工程"),
|
||||||
|
# ("紫蓬变电站"),
|
||||||
|
# ("宿州萧砀变电站"),
|
||||||
|
# ("冯井变电站"),
|
||||||
|
# ("富邦秋浦变电站"),
|
||||||
|
# ("包河玉龙变电站"),
|
||||||
|
#
|
||||||
|
# ("绿雪莲塘工程"),
|
||||||
|
# ("合肥循环园工程"),
|
||||||
|
# ("合肥长临河工程"),
|
||||||
|
# ("合肥中心变"),
|
||||||
|
# ("锁库变电站工程"),
|
||||||
|
# ("槽坊工程"),
|
||||||
|
#
|
||||||
|
# ("安庆四500kV变电站新建工程(PROJ-2024-0862)"),
|
||||||
|
# ("锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)"),
|
||||||
|
# ("渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)"),
|
||||||
|
# ("先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)"),
|
||||||
|
# ("安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)"),
|
||||||
|
# ("合州士800千伏换流站电气安装A包(PROJ-2025-0056)"),
|
||||||
|
# ("卫田-陶楼T接首业变电站110kV电缆线路工程(PROJ-2024-1236)"),
|
||||||
|
# ("谯城(亳三)-希夷220kV线路工程(PROJ-2024-1205)"),
|
||||||
|
# ]
|
||||||
|
# print(f"去不重要词汇 工程名匹配******************************************")
|
||||||
|
# start = time.perf_counter()
|
||||||
|
# for item in test_cases:
|
||||||
|
# match_results = standardize_project_name(item, simply_to_standard_project_name_map, pinyin_simply_to_standard_project_name_map,70,90)
|
||||||
|
# print(f"工程名匹配 输入: {item}-> 输出: {match_results}")
|
||||||
|
# end = time.perf_counter()
|
||||||
|
# print(f"词集匹配 耗时: {end - start:.4f} 秒")
|
||||||
|
#
|
||||||
|
# print(f"项目名匹配******************************************")
|
||||||
|
# oral_program_name_list = [
|
||||||
|
# ("第1项目部"), # 期望返回所有"第一项目管理部"
|
||||||
|
# ("第2项目部"),
|
||||||
|
# ("第3项目部"),
|
||||||
|
# ("第4项目部"),
|
||||||
|
# ("第5项目部"),
|
||||||
|
# ("第6项目部"),
|
||||||
|
# ("第7项目部"),
|
||||||
|
# ("第8项目部"),
|
||||||
|
# ("第9项目部"),
|
||||||
|
# ("第10项目部"),
|
||||||
|
# ("第11项目部"),
|
||||||
|
# ("第12项目部"),
|
||||||
|
# ("第13项目部"),
|
||||||
|
# ("电缆班"),
|
||||||
|
# ("调试1队"),
|
||||||
|
# ("调试2队"),
|
||||||
|
# ("调试3队"),
|
||||||
|
# ("调试4队"),
|
||||||
|
# ("调试5队"),
|
||||||
|
# ("第一项目管理部"),
|
||||||
|
# ("第二项目管理部"),
|
||||||
|
# ("第五项目管理部"),
|
||||||
|
# ("第十一项目管理部(萧砀线路)"),
|
||||||
|
# ("第三项目管理部(张店线路)"),
|
||||||
|
# ("第三项目管理部(岳西线路)"),
|
||||||
|
# ("第五项目管理部(蚌埠)"),
|
||||||
|
# ("第三项目管理部(六安线路)"),
|
||||||
|
# ("第十一项目管理部(宿州线路)"),
|
||||||
|
# ("调试一队"),
|
||||||
|
# ("调试二队"),
|
||||||
|
# ("调试三队"),
|
||||||
|
# ("电缆班"),
|
||||||
|
# ]
|
||||||
|
#
|
||||||
|
# for company in standard_company_name_list:
|
||||||
|
# for program in oral_program_name_list:
|
||||||
|
# match_results = standardize_projectDepartment(company, program, standard_company_program, high_score=90)
|
||||||
|
# print(f"加权混合策略 项目部名称 输入: 公司:{company},项目部:{program}-> 输出: {match_results}")
|
||||||
|
|
||||||
# for company, project in test_cases:
|
|
||||||
# result = standardize_projectDepartment(company, project,standard_company_program, high_score=90)
|
|
||||||
# print(f"输入: {company}, {project} -> 输出: {result}")
|
|
||||||
#
|
|
||||||
# result = standardize_single_name("送电一公司", standard_company_name_list)
|
|
||||||
# print(f"输入: 送一分公司-> 输出: {result}")
|
|
||||||
#
|
|
||||||
# prompt = generate_project_prompt(result, "分公司名")
|
|
||||||
# print(f"prompt:{prompt}")
|
|
||||||
#
|
|
||||||
# result = standardize_single_name("合肥中心变", standard_project_name_list)
|
|
||||||
# print(f"输入: 合肥中心变-> 输出: {result}")
|
|
||||||
#
|
|
||||||
# prompt = generate_project_prompt(result, "工程名")
|
|
||||||
# print(f"prompt:{prompt}")
|
|
||||||
|
|
||||||
# result = standardize_single_name("合肥中心变", standard_project_name_list, 60, 75)
|
|
||||||
# print(f"输入: 合肥中心变-> 输出: {result}")
|
|
||||||
#
|
|
||||||
# result = standardize_single_name("阜阳阜四变电站工程", standard_project_name_list, 60, 75)
|
|
||||||
# print(f"输入: 阜阳阜四变电站工程-> 输出: {result}")
|
|
||||||
#
|
|
||||||
# result = standardize_single_name("芦集变电站", standard_project_name_list, 20, 50)
|
|
||||||
# print(f"输入: 芦集变电站-> 输出: {result}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("宋轶分公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 宋轶分公司-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("宏源电力公司", standard_company_name_list, standard_company_name_pinyin_list, pinyin_to_standard_company_name_map,75,80)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 宏源电力公司-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# #
|
|
||||||
# match_results = multiple_standardize_single_name("合肥中心变", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 合肥中心变-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("淮南安丰", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 淮南安丰工程-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("宿州萧砀新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 宿州萧砀新建工程-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("芦集变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 芦集变电站-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("卢集变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 卢集变电站-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
# match_results = multiple_standardize_single_name("芦集古沟变电站新建工程", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 芦集古沟变电站新建工程-> 输出: {match_results}")
|
|
||||||
# #
|
|
||||||
# match_results = multiple_standardize_single_name("金牛变电站", standard_project_name_list, standard_project_name_pinyin_list, pinyin_to_standard_project_name_map,20,70)
|
|
||||||
# print(f"multiple_standardize_single_name 输入: 金牛变电站-> 输出: {match_results}")
|
|
||||||
#
|
|
||||||
|
|
||||||
#
|
|
||||||
# company, project = standardize_company_and_projectDepartment("变电分公司","第一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
|
||||||
# print(f"company:{company}, project:{project}")
|
|
||||||
#
|
|
||||||
# company, project = standardize_company_and_projectDepartment("变电分公司","第十一项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
|
||||||
# print(f"company:{company}, project:{project}")
|
|
||||||
# company, project = standardize_company_and_projectDepartment("试验分公司","电缆班", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
|
||||||
# print(f"company:{company}, project:{project}")
|
|
||||||
# company, project = standardize_company_and_projectDepartment("宏源电力投资有限公司","第三项目部", standard_company_name_list, standard_company_program, pinyin_to_standard_company_name_map)
|
|
||||||
# print(f"company:{company}, project:{project}")
|
|
||||||
#
|
|
||||||
|
|
||||||
# match_results = standardize_projectDepartment("安徽宏源电力建设有限公司(变电)", "第3项目部", standard_company_program, high_score=85)
|
|
||||||
# print(f"match_results:{match_results}")
|
|
||||||
if __name__ == '__main__':
    import os

    # Entry point when this module is run directly as a script.
    # The server listens on every interface (0.0.0.0), so debug mode must not
    # be switched on unconditionally: with debug enabled the framework's
    # interactive debugger can execute arbitrary code for whoever reaches the
    # port. Debug is therefore opt-in through the FLASK_DEBUG environment
    # variable and stays off when the variable is unset.
    # NOTE(review): assumes `app` is a Flask application - confirm.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in ("1", "true", "yes", "on")
    app.run(host='0.0.0.0', port=18073, debug=debug_enabled)
|
||||||
|
|
|
||||||
157
api/utils.py
157
api/utils.py
|
|
@ -1,10 +1,13 @@
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
from typing import cast, Callable
|
||||||
|
|
||||||
from rapidfuzz import process, fuzz
|
from rapidfuzz import process, fuzz
|
||||||
import re
|
from rapidfuzz.fuzz import WRatio
|
||||||
import json
|
import json
|
||||||
from pypinyin import lazy_pinyin
|
from pypinyin import lazy_pinyin
|
||||||
|
|
||||||
|
from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS
|
||||||
|
|
||||||
# 数字转换表(1-20,常见数字)
|
# 数字转换表(1-20,常见数字)
|
||||||
digit_to_chinese = {
|
digit_to_chinese = {
|
||||||
"1": "一", "2": "二", "3": "三", "4": "四", "5": "五",
|
"1": "一", "2": "二", "3": "三", "4": "四", "5": "五",
|
||||||
|
|
@ -14,6 +17,7 @@ digit_to_chinese = {
|
||||||
"19": "十九", "20": "二十"
|
"19": "十九", "20": "二十"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def arabic_to_chinese_number(text):
|
def arabic_to_chinese_number(text):
|
||||||
"""
|
"""
|
||||||
将文中阿拉伯数字转换为中文数字
|
将文中阿拉伯数字转换为中文数字
|
||||||
|
|
@ -28,10 +32,13 @@ def arabic_to_chinese_number(text):
|
||||||
text = text.replace(num, cn)
|
text = text.replace(num, cn)
|
||||||
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def text_to_pinyin(text):
    """Return *text* as a single pinyin string.

    The pieces produced by ``lazy_pinyin(text)`` are concatenated with no
    separator between them.
    """
    syllables = lazy_pinyin(text)
    return ''.join(syllables)
|
||||||
|
|
||||||
|
|
||||||
def load_standard_data(path):
    """Read the UTF-8 encoded JSON file at *path* and return the parsed object.

    :param path: location of the JSON file to load
    :return: whatever ``json.load`` produces for the file's content
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed
|
||||||
|
|
@ -50,23 +57,99 @@ def extract_number(text):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, high_score=85, top_k=3):
    """Fuzzy-match *input_key* against *match_pool* and map the hits to standard names.

    Every candidate in the pool is scored with ``WRatio``. Candidates scoring
    below ``lower_score`` are discarded. If the best remaining score reaches
    ``high_score``, only the candidates tied at that best score are returned;
    otherwise the first ``top_k`` remaining candidates are returned, in the
    order ``process.extract`` produced them.

    :param input_key: cleaned key to look up (e.g. a simplified name or its pinyin)
    :param match_pool: candidate keys to score against (normally the keys of ``mapping_dict``)
    :param mapping_dict: candidate key -> original standard name
    :param lower_score: minimum score for a candidate to be considered at all
    :param high_score: score from which the best match is treated as confident
    :param top_k: number of candidates returned when no match is confident
    :return: list of standard names, possibly empty
    """
    scored = process.extract(
        input_key,
        match_pool,
        scorer=cast(Callable, WRatio),
        limit=len(match_pool),
    )

    # Keep only (candidate, score) pairs that clear the lower bound.
    candidates = []
    for hit in scored:
        choice, score = hit[0], hit[1]
        if score >= lower_score:
            candidates.append((choice, score))

    if len(candidates) == 0:
        return []

    top_score = max(score for _, score in candidates)

    if top_score < high_score:
        # No confident winner: hand back a short list for the caller to choose from.
        return [mapping_dict[choice] for choice, _ in candidates[:top_k]]

    # Confident: return every candidate tied at the best score.
    return [mapping_dict[choice] for choice, score in candidates if score == top_score]
|
||||||
|
|
||||||
|
|
||||||
|
def standardize_name(input_name, clean_func, simply_map, pinyin_map, lower_score=70, high_score=85):
    """Map a user-supplied name to standard names: clean it, match text, then match pinyin.

    The input is first reduced with ``clean_func``. The cleaned text is
    fuzzy-matched against the keys of ``simply_map``; if that yields anything
    it is returned. Otherwise the cleaned text is converted to pinyin and
    fuzzy-matched against the keys of ``pinyin_map``.

    :param input_name: raw name as entered by the user
    :param clean_func: cleaning function for this entity type (project name / company name)
    :param simply_map: simplified name -> original standard name
    :param pinyin_map: pinyin of the simplified name -> original standard name
    :param lower_score: minimum fuzzy score for a candidate to count
    :param high_score: fuzzy score from which a match is treated as confident
    :return: list of standard names; empty when nothing matches or when the
        input is empty / consists only of filtered-out content
    """
    # Nothing to match on: skip the cleaner (which expects a string) entirely.
    if not input_name:
        return []

    simply_input = clean_func(input_name)
    # The name consisted solely of filtered words/symbols. An empty key carries
    # no information, so avoid two full scans of the candidate pools.
    if not simply_input:
        return []

    result = fuzzy_match_and_filter(simply_input, list(simply_map), simply_map, lower_score, high_score)
    if result:
        return result

    # Fall back to pinyin matching when the text itself did not match.
    pinyin_input = text_to_pinyin(simply_input)
    return fuzzy_match_and_filter(pinyin_input, list(pinyin_map), pinyin_map, lower_score, high_score)
|
||||||
|
|
||||||
|
|
||||||
|
def standardize_sub_company(input_name, simply_map, pinyin_map, lower_score=55, high_score=80):
    """Standardize a user-supplied sub-company name.

    Delegates to ``standardize_name`` with ``clean_useless_company_name`` as
    the cleaning function.

    :param input_name: raw sub-company name as entered by the user
    :param simply_map: cleaned company name -> standard company name
    :param pinyin_map: pinyin of the cleaned company name -> standard company name
    :param lower_score: minimum fuzzy score for a candidate to count
    :param high_score: fuzzy score from which a match is treated as confident
    :return: list of matching standard company names
    """
    matches = standardize_name(
        input_name,
        clean_useless_company_name,
        simply_map,
        pinyin_map,
        lower_score=lower_score,
        high_score=high_score,
    )
    return matches
|
||||||
|
|
||||||
|
|
||||||
|
def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90):
    """Standardize a user-supplied project name.

    Delegates to ``standardize_name`` with ``clean_useless_project_name`` as
    the cleaning function.

    :param input_name: raw project name as entered by the user
    :param simply_map: cleaned project name -> standard project name
    :param pinyin_map: pinyin of the cleaned project name -> standard project name
    :param lower_score: minimum fuzzy score for a candidate to count
    :param high_score: fuzzy score from which a match is treated as confident
    :return: list of matching standard project names
    """
    matches = standardize_name(
        input_name,
        clean_useless_project_name,
        simply_map,
        pinyin_map,
        lower_score=lower_score,
        high_score=high_score,
    )
    return matches
|
||||||
|
|
||||||
|
|
||||||
#标准化项目部名
|
#标准化项目部名
|
||||||
def standardize_projectDepartment(standard_company, input_project , company_project_department_map, high_score=85):
|
def standardize_projectDepartment(standard_company, input_project, company_project_department_map, high_score=90):
|
||||||
"""
|
"""
|
||||||
将口语化的公司名和项目部名转换为标准化名称。
|
将口语化的公司名和项目部名转换为标准化名称。
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
standard_company (str): 标准化公司名。
|
standard_company (str): 标准化公司名。
|
||||||
input_project (str): 用户输入的项目部名(可能是口语化或不完整的名称)。
|
input_project (str): 用户输入的项目部名(可能是口语化或不完整的名称)。
|
||||||
company_project_department_map (dict): 标准化的公司名和项目部名数据,格式为 {公司名: [项目部名1, 项目部名2, ...]}。
|
company_project_department_map (dict): 标准化的公司名和项目部名数据,格式为 {公司名: [项目部名1, 项目部名2, ...]}。
|
||||||
pinyin_to_original_map:分公司拼音和分公司原始名的映射
|
pinyin_to_original_map:分公司拼音和分公司原始名的映射
|
||||||
|
|
||||||
返回:
|
返回:
|
||||||
tuple: (标准化公司名, 匹配的项目部名列表)。如果无法匹配,返回 (None, None)。
|
tuple: (标准化公司名, 匹配的项目部名列表)。如果无法匹配,返回 (None, None)。
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# **2. 先尝试直接匹配最相似的项目名**
|
# **2. 先尝试直接匹配最相似的项目名**
|
||||||
project_match = process.extractOne(input_project, company_project_department_map[standard_company], scorer=fuzz.ratio)
|
project_match = process.extractOne(input_project, company_project_department_map[standard_company],
|
||||||
|
scorer=cast(Callable, WRatio))
|
||||||
print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True)
|
print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True)
|
||||||
if project_match and project_match[1] >= high_score:
|
if project_match and project_match[1] >= high_score:
|
||||||
return [project_match[0]] # 直接返回匹配的项目名
|
return [project_match[0]] # 直接返回匹配的项目名
|
||||||
|
|
@ -86,11 +169,13 @@ def standardize_projectDepartment(standard_company, input_project , company_proj
|
||||||
print(f"standardize_projectDepartment:{e}", flush=True)
|
print(f"standardize_projectDepartment:{e}", flush=True)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin_name_list = None, pinyin_to_original_map = None, lower_score=70, high_score=85, isArabicNumConv = False):
|
|
||||||
|
def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin_name_list=None,
|
||||||
|
pinyin_to_original_map=None, lower_score=70, high_score=85, isArabicNumConv=False):
|
||||||
"""
|
"""
|
||||||
使用拼音 + rapidfuzz 进行关键词模糊匹配,并返回原始的标准名
|
使用拼音 + rapidfuzz 进行关键词模糊匹配,并返回原始的标准名
|
||||||
:param input_name: 口语化的名称(中文)
|
:param origin_input_name: 口语化的名称(中文)
|
||||||
:param name_list: 关键词列表(中文)
|
:param origin_name_list: 关键词列表(中文)
|
||||||
:pinyin_name_list: 关键词列表(拼音)
|
:pinyin_name_list: 关键词列表(拼音)
|
||||||
:param pinyin_to_original_map: 拼音到原始标准名的映射
|
:param pinyin_to_original_map: 拼音到原始标准名的映射
|
||||||
:param lower_score: 低匹配分数阈值(默认70)
|
:param lower_score: 低匹配分数阈值(默认70)
|
||||||
|
|
@ -100,10 +185,12 @@ def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin
|
||||||
#First round, 原始标准名的匹配性查找,能找到直接返回
|
#First round, 原始标准名的匹配性查找,能找到直接返回
|
||||||
if isArabicNumConv:
|
if isArabicNumConv:
|
||||||
origin_input_name = arabic_to_chinese_number(origin_input_name)
|
origin_input_name = arabic_to_chinese_number(origin_input_name)
|
||||||
match_results = process.extract(origin_input_name, origin_name_list, scorer=fuzz.token_sort_ratio, limit=len(origin_name_list))
|
match_results = process.extract(origin_input_name, origin_name_list, scorer=fuzz.token_sort_ratio,
|
||||||
|
limit=len(origin_name_list))
|
||||||
# 找到所有相似度 > 80 的匹配项
|
# 找到所有相似度 > 80 的匹配项
|
||||||
original_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score]
|
original_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score]
|
||||||
print(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}", flush=True)
|
print(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}",
|
||||||
|
flush=True)
|
||||||
|
|
||||||
combined_low_confidence_matches = []
|
combined_low_confidence_matches = []
|
||||||
if original_high_confidence_matches:
|
if original_high_confidence_matches:
|
||||||
|
|
@ -117,31 +204,10 @@ def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin
|
||||||
if not pinyin_name_list or not pinyin_to_original_map:
|
if not pinyin_name_list or not pinyin_to_original_map:
|
||||||
return None #
|
return None #
|
||||||
|
|
||||||
#第二轮, 拼音名的匹配性查找,能找到直接返回
|
|
||||||
# pinyin_input_name = text_to_pinyin(origin_input_name)
|
|
||||||
# #fuzz.partial_ratio
|
|
||||||
# match_results = process.extract(pinyin_input_name, pinyin_name_list, scorer=fuzz.ratio, limit=len(pinyin_name_list))
|
|
||||||
#
|
|
||||||
# # 筛选出匹配分数 > lower_score 的结果
|
|
||||||
# pinyin_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score]
|
|
||||||
# print(f"standardize_pinyin_single_name 拼音匹配, input_name:{pinyin_input_name}, high_confidence_matches:{pinyin_high_confidence_matches[:3]}", flush=True)
|
|
||||||
#
|
|
||||||
# if not pinyin_high_confidence_matches:
|
|
||||||
# return combined_low_confidence_matches # 没有找到匹配项
|
|
||||||
#
|
|
||||||
# # 选择最高相似度的匹配项
|
|
||||||
# pinyin_best_match = max(pinyin_high_confidence_matches, key=lambda x: x[1], default=None)
|
|
||||||
#
|
|
||||||
# if pinyin_best_match and pinyin_best_match[1] > high_score:
|
|
||||||
# return [pinyin_to_original_map[pinyin_best_match[0]]] # 直接返回最高相似度的原始工程名
|
|
||||||
#
|
|
||||||
# combined_low_confidence_matches.extend(
|
|
||||||
# [pinyin_to_original_map[match[0]] for match in pinyin_high_confidence_matches[:3]]
|
|
||||||
# )
|
|
||||||
# 返回所有匹配项对应的原始名,最多返回最低匹配项的前5个
|
|
||||||
return list(dict.fromkeys(combined_low_confidence_matches))
|
return list(dict.fromkeys(combined_low_confidence_matches))
|
||||||
|
|
||||||
def generate_project_prompt(matched_projects, original_name = "", type="项目部名"):
|
|
||||||
|
def generate_project_prompt(matched_projects, original_name="", type="项目部名"):
|
||||||
"""
|
"""
|
||||||
生成提示信息,用于让用户确认匹配的项目名或分公司名或项目名。
|
生成提示信息,用于让用户确认匹配的项目名或分公司名或项目名。
|
||||||
|
|
||||||
|
|
@ -203,3 +269,28 @@ class StandardType(Enum):
|
||||||
PROJECT_CHECK = 0
|
PROJECT_CHECK = 0
|
||||||
#项目名检查
|
#项目名检查
|
||||||
PROGRAM_CHECK = 1
|
PROGRAM_CHECK = 1
|
||||||
|
|
||||||
|
|
||||||
|
import re


def _build_useless_words_pattern(words):
    """Compile one alternation that matches any entry of *words* literally."""
    # Longest first, so a word can never be shadowed by a shorter entry that
    # is its prefix; escaped, so an entry containing regex metacharacters is
    # still matched as plain text.
    ordered = sorted(words, key=len, reverse=True)
    return re.compile("|".join(re.escape(word) for word in ordered))


# Patterns that strip filler words before fuzzy matching.
useless_project_words_pattern = _build_useless_words_pattern(USELESS_PROJECT_WORDS)
useless_company_words_pattern = _build_useless_words_pattern(USELESS_COMPANY_WORDS)
# Project names: runs of ASCII letters, ASCII digits, whitespace, underscores
# and any other non-word character (punctuation, symbols).
project_symbols_pattern = re.compile(r"[A-Za-z0-9\s\W_]+")
# Company names: runs of whitespace, underscores and non-word characters only;
# letters and digits are kept.
company_symbols_pattern = re.compile(r"[\s\W_]+")
|
||||||
|
|
||||||
|
|
||||||
|
def clean_useless_project_name(name: str) -> str:
    """Reduce a project name to the part used for fuzzy matching.

    Removes every match of ``useless_project_words_pattern`` first, then every
    match of ``project_symbols_pattern`` (ASCII letters, digits, whitespace and
    non-word characters), and strips the result.
    """
    without_filler = useless_project_words_pattern.sub("", name)
    without_symbols = project_symbols_pattern.sub("", without_filler)
    return without_symbols.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def clean_useless_company_name(name: str) -> str:
    """Reduce a company name to the part used for fuzzy matching.

    Removes every match of ``useless_company_words_pattern`` first, then every
    match of ``company_symbols_pattern`` (whitespace, underscores and non-word
    characters), and strips the result.
    """
    without_filler = useless_company_words_pattern.sub("", name)
    without_symbols = company_symbols_pattern.sub("", without_filler)
    return without_symbols.strip()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue