完善项目名标准化
This commit is contained in:
parent
2f4dff403d
commit
a37b06d473
|
|
@ -1,12 +1,24 @@
|
||||||
# constants.py
|
# constants.py
|
||||||
#匹配工程名时,需要过滤掉的词汇,线路, "变电站","换流站","变电",","调试部分"
|
|
||||||
USELESS_PROJECT_WORDS = ["项目", "工程", "变电站", "线路", "变电","千伏" ,"换流站","公司","直流","部分","施工"]
|
|
||||||
# USELESS_PROJECT_WORDS = ["项目", "工程", "千伏" ,"公司","直流"]
|
|
||||||
|
|
||||||
#匹配公司名时,需要过滤掉的词汇
|
#提取工程名热词中的地名时需要用的到过滤词列表
|
||||||
|
# USELESS_PROJECT_WORDS = {"项目", "工程", "变电站", "线路", "变电", "千伏", "换流站", "公司", "直流", "部分", "施工",
|
||||||
|
# "电厂", "牵引站","改造","间隔","维修","中心","主设备","应急","抢修","服务","扩建","完善","新建",
|
||||||
|
# "配电","装置","调试","显示器","维护","设备","检修","电缆","光伏","保护","站","国网","安徽","阜阳",
|
||||||
|
# "架空","电气安装包","安装包","主变"}
|
||||||
|
#工程名标准化时需要过滤掉的词汇
|
||||||
|
USELESS_PROJECT_WORDS = ["项目", "工程", "千伏", "公司", "直流"]
|
||||||
|
|
||||||
|
#项目部名称标准化时需要过滤掉的词汇
|
||||||
|
USELESS_PROGRAM_DEPARTMENT_WORDS = {"项目管理部","项目部", "项目", "管理"}
|
||||||
|
|
||||||
|
#公司名标准化时需要过滤掉的词汇
|
||||||
USELESS_COMPANY_WORDS = ["公司","有限","责任","工程","科技"]
|
USELESS_COMPANY_WORDS = ["公司","有限","责任","工程","科技"]
|
||||||
|
|
||||||
COMPANYNAME_SHA = "顺安电网建设有限公司"
|
#提取公司名热词需要过滤掉的词汇
|
||||||
|
# USELESS_COMPANY_WORDS = ["公司","有限","责任","工程","科技","安徽省","国网","四川省","安徽","集团","电力","建设","建筑","安装","股份"
|
||||||
|
# "装饰","结构","能源","发展","装饰","电气","股份"]
|
||||||
|
|
||||||
|
#槽位抽取时各槽位字段名
|
||||||
#日期
|
#日期
|
||||||
DATE = "date"
|
DATE = "date"
|
||||||
#工程名称
|
#工程名称
|
||||||
|
|
|
||||||
|
|
@ -18,8 +18,8 @@ from apscheduler.schedulers.background import BackgroundScheduler
|
||||||
# MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-25910"
|
# MODEL_ERNIE_PATH = R"../ernie/output/checkpoint-25910"
|
||||||
# MODEL_UIE_PATH = R"../uie/output/checkpoint-32750"
|
# MODEL_UIE_PATH = R"../uie/output/checkpoint-32750"
|
||||||
|
|
||||||
MODEL_ERNIE_PATH = R"../ernie/output_temp/checkpoint-33510"
|
MODEL_ERNIE_PATH = R"../ernie/output_temp/checkpoint-34340"
|
||||||
MODEL_UIE_PATH = R"../uie/output_temp/checkpoint-33220"
|
MODEL_UIE_PATH = R"../uie/output_temp/checkpoint-34050"
|
||||||
|
|
||||||
# 类别名称列表
|
# 类别名称列表
|
||||||
labels = [
|
labels = [
|
||||||
|
|
@ -283,7 +283,7 @@ def extract_multi_chat(messages):
|
||||||
|
|
||||||
logger.info(f"len(history_messages):{len(history_messages)}")
|
logger.info(f"len(history_messages):{len(history_messages)}")
|
||||||
|
|
||||||
#最新问题的上一个问题里如何含有时间,则清空最老的历史对话
|
#最新问题的上一个问题里如果含有时间,则清空最老的历史对话
|
||||||
last_two_messages = history_messages[-2:]
|
last_two_messages = history_messages[-2:]
|
||||||
has_time_prefix = any(
|
has_time_prefix = any(
|
||||||
msg.role == "user" and any(prefix in msg.content and prefix != msg.content for prefix in time_prefixes)
|
msg.role == "user" and any(prefix in msg.content and prefix != msg.content for prefix in time_prefixes)
|
||||||
|
|
|
||||||
34
api/utils.py
34
api/utils.py
|
|
@ -12,7 +12,7 @@ import re
|
||||||
|
|
||||||
from globalData import GlobalData
|
from globalData import GlobalData
|
||||||
from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS, CONSTRUCTION_UNIT, IMPLEMENTATION_ORG, \
|
from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS, CONSTRUCTION_UNIT, IMPLEMENTATION_ORG, \
|
||||||
SUBCONTRACTOR, PROJECT_NAME, PROJECT_DEPARTMENT, RISK_LEVEL, TEAM_NAME
|
SUBCONTRACTOR, PROJECT_NAME, PROJECT_DEPARTMENT, RISK_LEVEL, TEAM_NAME, USELESS_PROGRAM_DEPARTMENT_WORDS
|
||||||
|
|
||||||
from logger_util import setup_logger
|
from logger_util import setup_logger
|
||||||
|
|
||||||
|
|
@ -312,16 +312,23 @@ def standardize_projectDepartment(standard_company, input_project, company_proje
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# **2. 先尝试直接匹配最相似的项目名**
|
# **2. 先尝试直接匹配最相似的项目名**
|
||||||
input_project = replace_arabic_with_chinese(input_project)
|
temp_input_project = replace_arabic_with_chinese(input_project)
|
||||||
project_match = process.extractOne(input_project, company_project_department_map[standard_company],
|
|
||||||
|
temp_input_project = clean_useless_program_departement_name(temp_input_project)
|
||||||
|
|
||||||
|
program_list = company_project_department_map.get(standard_company, [])
|
||||||
|
cleaned_map = {clean_useless_program_departement_name(p): p for p in program_list}
|
||||||
|
|
||||||
|
project_match = process.extractOne(temp_input_project, list(cleaned_map.keys()),
|
||||||
scorer=cast(Callable, WRatio))
|
scorer=cast(Callable, WRatio))
|
||||||
logger.info(f"项目部名称最相似:{project_match[0]},{project_match[1]}")
|
|
||||||
# print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True)
|
logger.info(f"{input_project} 名称最相似:{project_match[0]}, {project_match[1]}")
|
||||||
if project_match and project_match[1] >= high_score:
|
if project_match and project_match[1] >= high_score:
|
||||||
return [project_match[0]] # 直接返回匹配的项目名
|
matched_raw = cleaned_map[project_match[0]]
|
||||||
|
return [matched_raw] # 返回原始名称
|
||||||
|
|
||||||
# **3. 提取项目部的数字部分**
|
# **3. 提取项目部的数字部分**
|
||||||
query_number = extract_number(input_project)
|
query_number = extract_number(temp_input_project)
|
||||||
|
|
||||||
# **4. 过滤所有符合数字的项目部**
|
# **4. 过滤所有符合数字的项目部**
|
||||||
matched_projects = []
|
matched_projects = []
|
||||||
|
|
@ -356,8 +363,6 @@ def multiple_standardize_single_name(origin_input_name, origin_name_list, pinyin
|
||||||
limit=len(origin_name_list))
|
limit=len(origin_name_list))
|
||||||
# 找到所有相似度 > 80 的匹配项
|
# 找到所有相似度 > 80 的匹配项
|
||||||
original_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score]
|
original_high_confidence_matches = [(match[0], match[1]) for match in match_results if match[1] >= lower_score]
|
||||||
# print(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}",
|
|
||||||
# flush=True)
|
|
||||||
logger.info(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}")
|
logger.info(f"standardize_pinyin_single_name 原始名匹配, high_confidence_matches:{original_high_confidence_matches[:3]}")
|
||||||
|
|
||||||
combined_low_confidence_matches = []
|
combined_low_confidence_matches = []
|
||||||
|
|
@ -386,7 +391,6 @@ def generate_project_prompt_with_key(matched_projects, original_name="", slot_ke
|
||||||
返回:
|
返回:
|
||||||
str: 生成的提示信息。如果未找到匹配项,返回提示用户提供更准确信息的字符串。
|
str: 生成的提示信息。如果未找到匹配项,返回提示用户提供更准确信息的字符串。
|
||||||
"""
|
"""
|
||||||
# print(f"generate_project_prompt_with_key slot_key:{slot_key},original_name:{original_name},matched_projects:{matched_projects} ")
|
|
||||||
logger.info(f"generate_project_prompt_with_key slot_key:{slot_key},original_name:{original_name},matched_projects:{matched_projects} ")
|
logger.info(f"generate_project_prompt_with_key slot_key:{slot_key},original_name:{original_name},matched_projects:{matched_projects} ")
|
||||||
type = ""
|
type = ""
|
||||||
if slot_key == CONSTRUCTION_UNIT:
|
if slot_key == CONSTRUCTION_UNIT:
|
||||||
|
|
@ -480,6 +484,8 @@ class StandardType(Enum):
|
||||||
# 构建一个用于替换的正则表达式
|
# 构建一个用于替换的正则表达式
|
||||||
useless_project_words_pattern = re.compile("|".join(USELESS_PROJECT_WORDS))
|
useless_project_words_pattern = re.compile("|".join(USELESS_PROJECT_WORDS))
|
||||||
useless_company_words_pattern = re.compile("|".join(USELESS_COMPANY_WORDS))
|
useless_company_words_pattern = re.compile("|".join(USELESS_COMPANY_WORDS))
|
||||||
|
# Pattern that strips filler words from project-department names.
# Regex alternation is tried left to right, so longer words must come first:
# otherwise "项目" would match inside "项目管理部" and leave a stray "部" behind.
# USELESS_PROGRAM_DEPARTMENT_WORDS is declared as a set in constants.py, whose
# iteration order is not stable between runs, hence the explicit sort
# (longest first, ties broken alphabetically). re.escape keeps every word a
# literal match even if a future entry contains a regex metacharacter.
useless_program_department_words_pattern = re.compile(
    "|".join(
        re.escape(word)
        for word in sorted(
            USELESS_PROGRAM_DEPARTMENT_WORDS,
            key=lambda w: (-len(w), w),
        )
    )
)
|
||||||
|
|
||||||
# 匹配所有数字、字母(含大小写)、特殊字符(包括空格、标点)
|
# 匹配所有数字、字母(含大小写)、特殊字符(包括空格、标点)
|
||||||
project_symbols_pattern = re.compile(r"[A-Za-z0-9\s\W_ⅰ-ⅷⅠ-Ⅻⅸⅹⅺⅻ]+", re.UNICODE)
|
project_symbols_pattern = re.compile(r"[A-Za-z0-9\s\W_ⅰ-ⅷⅠ-Ⅻⅸⅹⅺⅻ]+", re.UNICODE)
|
||||||
#特殊字符
|
#特殊字符
|
||||||
|
|
@ -507,6 +513,14 @@ def clean_useless_team_leader_name(name: str) -> str:
|
||||||
name = useless_team_leader_words_pattern.sub("", name)
|
name = useless_team_leader_words_pattern.sub("", name)
|
||||||
return name.strip()
|
return name.strip()
|
||||||
|
|
||||||
|
#去掉项目部里面的不重要词汇
|
||||||
|
def clean_useless_program_departement_name(name: str) -> str:
    """Reduce a project-department name to its distinguishing core text.

    Two substitutions are applied in order:

    1. every match of ``useless_program_department_words_pattern`` (the
       filler words for department names) is deleted;
    2. every match of ``project_symbols_pattern`` (digits, Latin letters,
       whitespace and other symbols) is deleted.

    Args:
        name: Raw project-department name.

    Returns:
        The remaining text with leading and trailing whitespace removed.
    """
    without_filler = useless_program_department_words_pattern.sub("", name)
    core_text = project_symbols_pattern.sub("", without_filler)
    return core_text.strip()
|
||||||
|
|
||||||
#槽位缺失检查
|
#槽位缺失检查
|
||||||
def check_lost(int_res, slot):
|
def check_lost(int_res, slot):
|
||||||
#labels: ["天气查询","通用对话","页面切换","日计划数量查询","周计划数量查询","日计划作业内容","周计划作业内容","施工人数","作业考勤人数","知识问答"]
|
#labels: ["天气查询","通用对话","页面切换","日计划数量查询","周计划数量查询","日计划作业内容","周计划作业内容","施工人数","作业考勤人数","知识问答"]
|
||||||
|
|
|
||||||
|
|
@ -57,7 +57,7 @@ BASE_DATA = {
|
||||||
"第三项目管理部(谯城变、亳州楼)","第十项目管理部(亳州变电)","第三项目管理部香鹭东段","第三项目部六安线路","第八项目管理部(宿州分部)",
|
"第三项目管理部(谯城变、亳州楼)","第十项目管理部(亳州变电)","第三项目管理部香鹭东段","第三项目部六安线路","第八项目管理部(宿州分部)",
|
||||||
"第八项目管理部宿州分部"],
|
"第八项目管理部宿州分部"],
|
||||||
# 项目经理
|
# 项目经理
|
||||||
"project_managers": ["陈少平项目经理", "范文立项目经理", "何东洋项目经理"],
|
"project_managers": ["陈少平项目经理", "范文立项目经理", "何东洋项目经理","周易开项目经理"],
|
||||||
|
|
||||||
# 建管地区,"国网安徽省电力有限公司建设分公司", "国网安徽省电力有限公司马鞍山供电公司","马鞍山供电公司",
|
# 建管地区,"国网安徽省电力有限公司建设分公司", "国网安徽省电力有限公司马鞍山供电公司","马鞍山供电公司",
|
||||||
|
|
||||||
|
|
@ -69,7 +69,7 @@ BASE_DATA = {
|
||||||
"subcontractors": ["劦力建筑责任公司","安徽劦力建筑装饰有限责任公司", "安徽苏亚建设集团有限公司","大信电力建设有限公司","优越电力公司",
|
"subcontractors": ["劦力建筑责任公司","安徽劦力建筑装饰有限责任公司", "安徽苏亚建设集团有限公司","大信电力建设有限公司","优越电力公司",
|
||||||
"安徽国腾电力工程有限公司","安徽京硚建设有限公司","中国能源建设集团安徽省电力设计院有限公司"],
|
"安徽国腾电力工程有限公司","安徽京硚建设有限公司","中国能源建设集团安徽省电力设计院有限公司"],
|
||||||
# 班组名称
|
# 班组名称
|
||||||
"team_names": ["张朵班组", "刘梁玉班组", "魏玉龙班组"],
|
"team_names": ["张朵班组", "刘梁玉班组", "魏玉龙班组","周可富班组"],
|
||||||
# 班组长
|
# 班组长
|
||||||
"team_leaders": ["李元帅班组长", "刘雨豪班组长"],
|
"team_leaders": ["李元帅班组长", "刘雨豪班组长"],
|
||||||
# 风险等级
|
# 风险等级
|
||||||
|
|
@ -744,21 +744,21 @@ TEMPLATE_CONFIG = {
|
||||||
"templates": [
|
"templates": [
|
||||||
("打开{page}页面", ["page"]),
|
("打开{page}页面", ["page"]),
|
||||||
("打开{page}", ["page"]),
|
("打开{page}", ["page"]),
|
||||||
("打开{page}模块", ["page"]),
|
("切换{page}模块", ["page"]),
|
||||||
("进入{page}", ["page"]),
|
("进入{page}", ["page"]),
|
||||||
("进入{page}模块", ["page"]),
|
("进入{page}模块", ["page"]),
|
||||||
("进入{page}页面", ["page"]),
|
("切换到{page}页面", ["page"]),
|
||||||
("跳转到{page}", ["page"]),
|
("跳转到{page}", ["page"]),
|
||||||
("跳转到{page}模块", ["page"]),
|
("跳转到{page}模块", ["page"]),
|
||||||
("跳转到{page}页面", ["page"]),
|
("切换到{page}页面", ["page"]),
|
||||||
("访问{page}页面", ["page"]),
|
("访问{page}页面", ["page"]),
|
||||||
("访问{page}模块", ["page"]),
|
("切换{page}模块", ["page"]),
|
||||||
("访问{page}", ["page"]),
|
("访问{page}", ["page"]),
|
||||||
("请打开{page}模块", ["page"]),
|
("请打开{page}模块", ["page"]),
|
||||||
("请打开{page}", ["page"]),
|
("请打开{page}", ["page"]),
|
||||||
("请打开{page}页面", ["page"]),
|
("请切换到{page}页面", ["page"]),
|
||||||
("加载{page}模块", ["page"]),
|
("加载{page}模块", ["page"]),
|
||||||
("加载{page}", ["page"]),
|
("切换{page}", ["page"]),
|
||||||
("加载{page}页面", ["page"]),
|
("加载{page}页面", ["page"]),
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue