Intention/api/standard_test.py

591 lines
25 KiB
Python
Raw Normal View History

import logging
from logger_util import setup_logger
from globalData import GlobalData
from utils import standardize_name, clean_useless_team_leader_name, standardize_sub_company, standardize_project_name, \
standardize_projectDepartment, standardize_team_name, check_standard_name_slot_probability, \
2025-05-04 15:29:03 +08:00
clean_useless_project_name, save_standard_name_list_to_file, load_standard_name_list, save_dict_to_file, \
load_standard_json_data
import time
from apscheduler.schedulers.blocking import BlockingScheduler
from globalData import GlobalData
# def job():
# print("[Info] Executing update_from_redis...")
# GlobalData.update_from_redis()
#
#
# Module-wide logger used by every helper below. It is created through the
# project's setup_logger helper under the name "utils" at DEBUG level.
logger = setup_logger("utils", level=logging.DEBUG)
2025-05-04 15:29:03 +08:00
# GlobalData.update_from_redis()
def check_standard_name_slot_probability_test():
    """Smoke-test ``check_standard_name_slot_probability`` on sample slot dicts.

    Every sample slot is passed together with the fixed first argument ``12``
    and the returned value is logged at INFO level next to the input slot.
    """
    # The first sample carries only a unit name; the remaining ones also carry
    # a "date" entry placed before the name field.
    dated_fields = (
        ("constructionUnit", "芜湖供电公司"),
        ("implementationOrganization", "送电一分公司"),
        ("subcontractor", "百瑞建设发展有限公司"),
        ("subcontractor", "安徽宝德电力建设工程有限"),
        ("teamName", "徐局班组"),
    )
    samples = [{"constructionUnit": "合肥供电公司"}]
    samples.extend({"date": "今天", field: value} for field, value in dated_fields)

    for sample in samples:
        outcome = check_standard_name_slot_probability(12, sample)
        logger.info(f"加权混合策略 项目部名称 输入: 原始槽位:{sample},输出: {outcome}")
def standardize_team_leader_test():
    """Run ``standardize_team_name`` over a fixed list of spoken team names.

    Each name is matched against the team-leader lookup maps held by
    ``GlobalData`` with score thresholds 70 (lower) and 90 (high); the result
    is logged at INFO level.
    """
    spoken_team_names = (
        "李东班组",
        "磐基班组",
        "章永班组",
        "张勇班组",
        "王治国班组",
        "代贵华班组",
        "黄安印班组",
        "刘闩班组",
        "王虎班组",
        "周勇勇班组",
        "魏在华班组",
        "王礼良班组",
        "林学刚班组",
        "崔新荣班组",
        "江军班组",
        "笪淦班组",
        "杨海平班组",
        "蔡来云班组",
        "贺中林班组",
        "何勇班组",
        "韦幸朝班组",
        "刘文虎班组",
        "金生班组",
        "段宝强班组",
        "何计划班组",
        "刘兆班组",
        "徐南班组",
        "贺广飞班组",
        "孙泽栋班组",
        "钱小林班组",
        "朱锋东班组",
    )
    name_map = GlobalData.simply_to_standard_team_leader_name_map
    pinyin_map = GlobalData.pinyin_simply_to_standard_team_leader_name_map

    for spoken in spoken_team_names:
        # Alternative entry point that was tried previously:
        # standardize_name(spoken, clean_useless_team_leader_name, name_map,
        #                  pinyin_map, lower_score=70, high_score=90)
        matched = standardize_team_name(
            spoken, name_map, pinyin_map, lower_score=70, high_score=90
        )
        logger.info(f"班组长名匹配 输入: {spoken}-> 输出: {matched}")
def standardize_company_test():
    """Match sample sub-company names and print each result plus the total time.

    Uses ``standardize_sub_company`` with the company lookup maps from
    ``GlobalData`` and score thresholds 70 / 90. Output goes to stdout.
    """
    spoken_names = [
        "宋轶分公司",
        # Further candidate inputs, currently disabled:
        # "送二分公司",
        # "变电分公司",
        # "建筑分公司",
        # "检修试验分公司",
        # "宏源电力公司",
        # "宏源电力限公司",
        # "宏源电力限公司线路",
        # "宏源电力限公司变电",
        # "送一分",
        # "送二分",
        # "变电分",
        # "建筑分",
        # "检修试验分",
        # "宏源电力",
        # "红源电力",
        # "宏源电力有限",
        # "宏源电力限线路",
        # "宏源电力限变电",
    ]
    name_map = GlobalData.simply_to_standard_company_name_map
    pinyin_map = GlobalData.pinyin_simply_to_standard_company_name_map

    print("加权混合策略 分公司名匹配**********************")
    started = time.perf_counter()
    for spoken in spoken_names:
        matched = standardize_sub_company(spoken, name_map, pinyin_map, 70, 90)
        print(f"加权混合策略 分公司名匹配 输入: {spoken}-> 输出: {matched}")
    # Elapsed wall-clock time for the whole batch, in seconds.
    elapsed = time.perf_counter() - started
    print(f"加权混合策略 耗时: {elapsed:.4f}")
def standardize_construction_test():
    """Match sample construction-management unit names and log each result.

    Uses ``standardize_sub_company`` with the construction-unit lookup maps
    from ``GlobalData`` and score thresholds 70 / 90. Results are logged at
    INFO level. No timing is reported, so no timer is started.
    """
    test_cases = [
        "合肥供电公司",
        "淮北供电公司",
        "六安市城郊供电公司",
    ]
    logger.info("加权混合策略 建管单位名匹配**********************")
    for item in test_cases:
        match_results = standardize_sub_company(
            item,
            GlobalData.simply_to_standard_construct_name_map,
            GlobalData.pinyin_simply_to_standard_construct_name_map,
            70,
            90,
        )
        logger.info(f"加权混合策略 建管单位名匹配 输入: {item}-> 输出: {match_results}")
def standardize_project_test():
    """Match sample spoken project names and print each result plus the total time.

    Uses ``standardize_project_name`` with the project lookup maps from
    ``GlobalData`` and score thresholds 70 / 90. Output goes to stdout.
    """
    spoken_projects = [
        "陶楼夏塘工程",
        "宗阳黄桥工程",
        # Further candidate inputs, currently disabled:
        # ("合肥卫田变电站工程").
        # ("金牛变电站新建建筑"),
        # ("金牛变电站建筑工程"),
        # ("金牛新建工程"),
        # ("金牛新建工程调试"),
        # ("金牛新建调试工程"),
        # ("金牛变电站工程"),
        # ("芦集"),
        # ("芦集变电站"),
        # ("安庆四变电站"),
        # ("锦绣变电站"),
        # ("滁州护桥变电站"),
        # ("合州换流站"),
        # ("陕北合州换流站"),
        # ("陕北安徽合州换流站"),
        # ("金牛变电站"),
        # ("香涧鹭岛工程"),
        # ("延庆换流站"),
        # ("国网延庆换流站"),
        # ("国网北京延庆换流站"),
        # ("陶楼广银线路工程"),
        # ("紫蓬变电站"),
        # ("宿州萧砀变电站"),
        # ("冯井变电站"),
        # ("富邦秋浦变电站"),
        # ("包河玉龙变电站"),
        # ("绿雪莲塘工程"),
        # ("合肥循环园工程"),
        # ("合肥长临河工程"),
        # ("合肥中心变"),
        # ("锁库变电站工程"),
        # ("槽坊工程"),
        # ("富东2798线"),
        # ("安庆四500kV变电站新建工程(PROJ-2024-0862)"),
        # ("锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)"),
        # ("渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)"),
        # ("先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)"),
        # ("安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)"),
        # ("合州士800千伏换流站电气安装A包(PROJ-2025-0056)"),
        # ("卫田-陶楼T接首业变电站110kV电缆线路工程(PROJ-2024-1236)"),
        # ("谯城(亳三)-希夷220kV线路工程(PROJ-2024-1205)"),
    ]
    name_map = GlobalData.simply_to_standard_project_name_map
    pinyin_map = GlobalData.pinyin_simply_to_standard_project_name_map

    print("去不重要词汇 工程名匹配******************************************")
    started = time.perf_counter()
    for spoken in spoken_projects:
        matched = standardize_project_name(spoken, name_map, pinyin_map, 70, 90)
        print(f"***************工程名匹配 输入: {spoken}-> 输出: {matched}")
    # Elapsed wall-clock time for the whole batch, in seconds.
    elapsed = time.perf_counter() - started
    print(f"词集匹配 耗时: {elapsed:.4f}")
def standardize_program_test():
    """Match spoken project-department names against every standard company.

    For each company in ``GlobalData.standard_company_name_list`` and each
    sample department name, ``standardize_projectDepartment`` is called with
    ``high_score=90`` and the result is logged at INFO level.
    """
    logger.info("项目名匹配******************************************")
    spoken_departments = [
        "金上第一项目部",
        "第一项目部金上",
        # Further candidate inputs, currently disabled:
        # ("第1项目部"),  # expected to return every "第三项目管理部"
        # ("第2项目部"),
        # ("第3项目部"),
        # ("第4项目部"),
        # ("第5项目部"),
        # ("第6项目部"),
        # ("第7项目部"),
        # ("第8项目部"),
        # ("第9项目部"),
        # ("第10项目部"),
        # ("第11项目部"),
        # ("第12项目部"),
        # ("第13项目部"),
        # ("电缆班"),
        # ("调试1队"),
        # ("调试2队"),
        # ("调试3队"),
        # ("调试4队"),
        # ("调试5队"),
        # ("第一项目管理部"),
        # ("第二项目管理部"),
        # ("第五项目管理部"),
        # ("第十一项目管理部(萧砀线路)"),
        # ("第三项目管理部(张店线路)"),
        # ("第三项目管理部(岳西线路)"),
        # ("第五项目管理部(蚌埠)"),
        # ("第三项目管理部(六安线路)"),
        # ("第十一项目管理部(宿州线路)"),
        # ("调试一队"),
        # ("调试二队"),
        # ("调试三队"),
        # ("电缆班"),
    ]
    for company in GlobalData.standard_company_name_list:
        for program in spoken_departments:
            outcome = standardize_projectDepartment(
                company, program, GlobalData.standard_company_program, high_score=90
            )
            logger.info(f"加权混合策略 项目部名称 输入: 公司:{company},项目部:{program}-> 输出: {outcome}")
def standardize_sub_constractor_test():
    """Match sample subcontractor names and log each result.

    Uses ``standardize_sub_company`` with the subcontractor lookup maps from
    ``GlobalData`` and score thresholds 70 / 90. Results are logged at INFO
    level. No timing is reported, so no timer is started.
    """
    test_cases = [
        "怀电能源科技",
        "泰央建设有限责任公司",
        "泓源电力建设有限公司",
        "怀电能源科技公司",
        "宝德电力公司",
        "亿甲建筑公司",
    ]
    logger.info("加权混合策略 分包单位名匹配**********************")
    for item in test_cases:
        match_results = standardize_sub_company(
            item,
            GlobalData.simply_to_standard_constractor_name_map,
            GlobalData.pinyin_simply_to_standard_constractor_name_map,
            70,
            90,
        )
        logger.info(f"分包单位名匹配 输入: {item}-> 输出: {match_results}")
def get_file_name():
    """Collect every file name under a fixed directory tree into ``name.txt``.

    Walks the hard-coded target directory recursively, gathers the bare file
    names (without their directory part), writes them one per line to
    ``name.txt`` in the current working directory, and logs how many were
    written.
    """
    import os

    # Directory to scan -- replace with the directory you actually want.
    target_dir = '/Users/wangvivi/Desktop/Work/2025项目材料/送变电大模型/知识库文档/送变电文档合并V5'
    # Created relative to the current working directory.
    output_path = 'name.txt'

    # Flatten the walk: one entry per file, in os.walk order.
    all_file_names = [
        file_name
        for _root, _dirs, file_names in os.walk(target_dir)
        for file_name in file_names
    ]

    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.writelines(file_name + '\n' for file_name in all_file_names)

    logger.info(f"共收集了 {len(all_file_names)} 个文件名,已写入 {output_path}")
class Message:
    """A single chat turn.

    Attributes are plain and public:
        role: who produced the turn (the samples in this file use "user"
            and "assistant").
        content: the text of the turn.
    """

    def __init__(self, role, content):
        self.role = role
        self.content = content

    def __repr__(self):
        # Readable form for debugging and log output.
        return f"{type(self).__name__}(role={self.role!r}, content={self.content!r})"
def history_message():
    """Demonstrate how chat history is trimmed around the latest user question.

    Builds a small hard-coded conversation, takes the last message as the
    latest user question, and drops all earlier history when that question
    contains a time word (e.g. "今天"). The first two and last two remaining
    history turns are then rendered as text and logged at INFO level.
    """
    from collections import namedtuple

    # Local lightweight record type; shadows any module-level Message here.
    Message = namedtuple("Message", ["role", "content"])
    messages = [
        # Earlier turns of the sample conversation, currently disabled:
        # Message("user", "延庆换流站今天有多少作业计划"),
        # Message("assistant", "2025-04-23 ±500KV延庆换流站备用换流变安装(PROJ-2025-0162)风险等级为2级的有0项3级的有0项4级的有1项5级的有0项一共有1项作业计划"),
        # Message("user", "河州换流站今天有多少作业计划"),
        # Message("assistant", "您说的工程名可能是: 第1个合州±800千伏换流站电气安装A包(PROJ-2025-0056) 第2个合州换流站-文都500千伏线路工程(PROJ-2024-1089) 第3个陕北-安徽直流工程合州±800千伏换流站土建A包(PROJ-2024-0312) 第4个文都-官山改接入合州换流站500千伏线路工程(PROJ-2024-1090) 请确认您要选择哪一个"),
        Message("user", "第一个"),
    ]
    latest_message = messages[-1]
    # Only a user turn counts as a question; anything else yields "".
    latest_user_question = latest_message.content if latest_message.role == "user" else ""
    time_prefixes = ["今天", "昨天", "本周", "下周", "明天", "今日"]
    # A question mentioning a time word discards all earlier history.
    mentions_time = any(prefix in latest_user_question for prefix in time_prefixes)
    history_messages = [] if mentions_time else messages[:-1]

    oldest_chat_history = "\n".join(f"{msg.role}: {msg.content}" for msg in history_messages[:2])
    last_chat_history = "\n".join(f"{msg.role}: {msg.content}" for msg in history_messages[-2:])

    # The logging API formats with ``msg % args``; each value therefore needs
    # a %s placeholder, otherwise the record fails to format and is lost.
    logger.info("len(history_messages):\n%s", len(history_messages))
    logger.info("oldest_chat_history:\n%s", oldest_chat_history)
    logger.info("last_chat_history:\n%s", last_chat_history)
    logger.info("latest_user_question:\n%s", latest_user_question)
def standardize_program():
    """Try rapidfuzz's ``WRatio`` scorer on simplified project-department names.

    Picks the best candidate for a fixed query with ``process.extractOne`` and
    logs whatever it returns at INFO level.
    """
    from rapidfuzz import fuzz, process

    # Earlier experiment with the full department names:
    # query = "金上第一项目"
    # choices = [
    #     "第五项目管理部(阜阳)",
    #     "第一项目管理部(金上)",
    #     "第二项目管理部(香鹭西段)",
    #     "第十一项目管理部(宣城)",
    #     "第八项目管理部(芜湖)",
    #     "第十三项目管理部(黄山)",
    #     "第六项目管理部(滁州)",
    #     "第四项目管理部(甘浙)",
    #     "第九项目管理部(马鞍山)",
    #     "第三项目管理部(香鹭东段)",
    #     "第一项目管理部(天津)"
    # ]
    spoken = "第一金上"
    candidates = [
        "第五阜阳",
        "第一金上",
        "第二香鹭西段",
        "第十一宣城",
        "第八芜湖",
        "第十三黄山",
        "第六滁州",
        "第四甘浙",
        "第九马鞍山)",
        "第三香鹭东段",
        "第一天津",
    ]
    logger.info(process.extractOne(spoken, candidates, scorer=fuzz.WRatio))
def get_size():
    """Log the combined ``sys.getsizeof`` of each group of GlobalData tables.

    One INFO line is emitted per group (company, project, construction unit,
    subcontractor, team). ``sys.getsizeof`` is shallow: it measures only the
    container object itself, not the objects it refers to, so the figures are
    a lower bound rather than the real memory footprint.
    """
    import sys

    groups = (
        ("standard_company", (
            GlobalData.standard_company_program,
            GlobalData.standard_company_name_list,
            GlobalData.simply_to_standard_company_name_map,
            GlobalData.pinyin_simply_to_standard_company_name_map,
        )),
        ("standard_project", (
            GlobalData.standard_project_name_list,
            GlobalData.simply_to_standard_project_name_map,
            GlobalData.pinyin_simply_to_standard_project_name_map,
        )),
        ("standard_construct", (
            GlobalData.standard_construct_name_list,
            GlobalData.simply_to_standard_construct_name_map,
            GlobalData.pinyin_simply_to_standard_construct_name_map,
        )),
        ("standard_constractor", (
            GlobalData.standard_constractor_name_list,
            GlobalData.simply_to_standard_constractor_name_map,
            GlobalData.pinyin_simply_to_standard_constractor_name_map,
        )),
        ("standard_team", (
            GlobalData.standard_team_leader_name_list,
            GlobalData.simply_to_standard_team_leader_name_map,
            # Team total must use the team-leader pinyin map, not the
            # subcontractor one that the other group already counts.
            GlobalData.pinyin_simply_to_standard_team_leader_name_map,
        )),
    )
    for label, members in groups:
        total_size = sum(sys.getsizeof(member) for member in members)
        logger.info(f"{label} size: {total_size} bytes")
# Module-level invocation: the project-name matching smoke test runs as soon
# as this statement is reached when the file is executed or imported.
standardize_project_test()
2025-05-04 15:29:03 +08:00
def exact_hot_words():
    """Write the keys of several GlobalData lookup maps to hot-word text files.

    The company, subcontractor and construction-unit maps are exported, in
    that order, through ``save_standard_name_list_to_file``.
    """
    exports = (
        (GlobalData.simply_to_standard_company_name_map, "./hot_word/company.txt"),
        # Project export is disabled here:
        # (GlobalData.simply_to_standard_project_name_map, "./hot_word/project.txt"),
        (GlobalData.simply_to_standard_constractor_name_map, "./hot_word/constractor.txt"),
        (GlobalData.simply_to_standard_construct_name_map, "./hot_word/construct.txt"),
    )
    for name_map, destination in exports:
        save_standard_name_list_to_file(list(name_map.keys()), destination)
# def exact_project_hot_words():
# import re
#
# # 示例数据,换成 GlobalData.simply_to_standard_project_name_map.keys()
# project_names = [
# "安庆四500kV变电站新建工程(PROJ-2024-0862)",
# "淮南芦集 220 千伏变电站 220 千伏配电装置改造工程(调试部分)(PROJ-2025-0022)",
# "屏显220kV变电站220kV杜岗Ⅱ间隔改造工程调试部分(PROJ-2025-0169)",
# "漆园220kV变电站220kV杨柳间隔改造工程调试部分(PROJ-2025-0042)",
# "宝桥220kV变电站220kV红桥间隔保护改造工程(调试部分)(PROJ-2025-0088)",
# "蟠龙220kV变电站220kV灵泗间隔改造工程调试部分(PROJ-2025-0018)",
# "锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)",
# "安庆和平220kV变电站新建工程调试部分(PROJ-2024-1238)",
# "渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)",
# "安徽亳州华佗220kV变电站220kV新华风电间隔扩建工程调试部分(PROJ-2024-1171)",
# "先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)",
# "安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)",
# "双岭500kV变电站间隔改造工程(PROJ-2024-0863)",
# "合州±800千伏换流站电气安装A包(PROJ-2025-0056)",
# "金牛500kV变电站新建工程(PROJ-2024-0866)",
# "况楼220kV变电站间隔扩建工程调试部分(PROJ-2025-0144)",
# "国网安徽合肥供电公司2023年GIS带电显示器维护(PROJ-2024-1260)",
# "亳州木兰220kV变电站220kV改造工程安徽亳州木兰200kV变电站GIS设备检修及调试技术服务(PROJ-2024-1256)",
# "香涧-鹭岛500kV线路工程淮河大跨越段(PROJ-2024-0722)",
# "安徽蚌埠濠州220kV变电站220千伏大唐凤阳红心镇光伏间隔扩建工程调试部分)(PROJ-2025-0164)",
# "陶楼-广银T接智迪改接首业变电站110kV电缆线路工程(PROJ-2024-1233)",
# "国网北京检修公司2024年±500kV延庆换流站直流主设备及辅助设备不停电检修维护(PROJ-2024-0841)"
# ]
#
#
# # 用来存关键词
# keywords = []
# #
# for name in GlobalData.simply_to_standard_project_name_map.keys():
# # 去掉括号和括号里的内容
# cleaned_name = re.sub(r"\(.*?\)", "", name)
#
# # 提取“-”连接的词
# if "-" in cleaned_name:
# parts = cleaned_name.split("-")
# first = parts[0].strip()
# second = re.split(r"[^\u4e00-\u9fa5]", parts[1])[0].strip() # 只取中文部分
# if first:
# keywords.append(first)
# if second:
# keywords.append(second)
# if first and second:
# keywords.append(first + second)
# else:
# # 正常提取,取第一个连续的中文词组
# match = re.match(r"([\u4e00-\u9fa5]+)", cleaned_name)
# if match:
# keywords.append(match.group(1))
#
# # 去重,且保持原顺序
# seen = set()
# unique_keywords = []
# for kw in keywords:
# if kw not in seen:
# seen.add(kw)
# unique_keywords.append(kw)
#
# # 写入到文件
# with open("new_project.txt", "w", encoding="utf-8") as f:
# for kw in unique_keywords:
# f.write(kw + "\n")
#
# print("提取完成,已写入 new_project.txt")
# # 去重且保持顺序
# seen = set()
# unique_keywords = []
# for kw in keywords:
# if kw not in seen:
# seen.add(kw)
# unique_keywords.append(kw)
#
# # 写入到文件
# with open("new_project.txt", "w", encoding="utf-8") as f:
# for kw in unique_keywords:
# f.write(kw + "\n")
#
# print("提取完成,已写入 new_project.txt")
def _extract_project_hot_words(names, exclude_keywords=("国网", "公司")):
    """Return the ordered, de-duplicated hot words extracted from project names."""
    import re

    # Remove bracketed groups, both full-width (...) and ASCII (...). The
    # previous pattern's first alternative matched the empty string, so no
    # text was ever removed and codes such as "(PROJ-2024-0862)" leaked their
    # "-" into the split below.
    bracket_re = re.compile(r"(.*?)|\(.*?\)")
    # A leading run of CJK characters in the \u4e00-\u9fa5 range.
    leading_chinese_re = re.compile(r"[\u4e00-\u9fa5]+")

    keywords = []
    for name in names:
        cleaned_name = bracket_re.sub("", name)
        # Skip names that mention any excluded word.
        if any(ek in cleaned_name for ek in exclude_keywords):
            continue
        if "-" in cleaned_name:
            # "A-B..." style: keep A, B and the concatenation AB. Only the
            # first two dash-separated segments are considered.
            parts = cleaned_name.split("-")
            first = leading_chinese_re.match(parts[0])
            second = leading_chinese_re.match(parts[1])
            first_word = first.group(0) if first else ""
            second_word = second.group(0) if second else ""
            if first_word:
                keywords.append(first_word)
            if second_word:
                keywords.append(second_word)
            if first_word and second_word:
                keywords.append(first_word + second_word)
        else:
            # No dash: keep the leading run of Chinese characters, if any.
            match = leading_chinese_re.match(cleaned_name)
            if match:
                keywords.append(match.group(0))
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(keywords))


def exact_project_hot_words():
    """Extract hot words from the standard project names into ``new_project.txt``.

    Reads the values of ``GlobalData.simply_to_standard_project_name_map``,
    prints each name, strips bracketed text, skips names containing "国网" or
    "公司", and keeps the leading Chinese word of each name (for "A-B" names:
    A, B and AB). The unique words are written one per line, in first-seen
    order, to ``new_project.txt`` in the current working directory.
    """
    names = list(GlobalData.simply_to_standard_project_name_map.values())
    for name in names:
        print(f"name:{name}")

    unique_keywords = _extract_project_hot_words(names)

    with open("new_project.txt", "w", encoding="utf-8") as f:
        f.writelines(kw + "\n" for kw in unique_keywords)
    print("提取完成,已写入 new_project.txt")
def removte_reduant_list():
    """De-duplicate ``new_project.txt`` and refresh the project hot-word file.

    Loads the entries of ``./new_project.txt``, drops repeated entries while
    keeping first-seen order, writes them one per line to
    ``hot_word/final_new_project.txt``, and finally exports the keys of the
    project lookup map to ``./hot_word/project.txt``.
    """
    loaded_entries = load_standard_name_list("./new_project.txt")
    # dict.fromkeys keeps only the first occurrence of each entry, in order.
    distinct_entries = list(dict.fromkeys(loaded_entries))

    with open("hot_word/final_new_project.txt", "w", encoding="utf-8") as sink:
        sink.writelines(entry + "\n" for entry in distinct_entries)

    project_keys = list(GlobalData.simply_to_standard_project_name_map.keys())
    save_standard_name_list_to_file(project_keys, "./hot_word/project.txt")
def list_to_json():
    """Wrap the final project hot-word list in a JSON document and save it.

    The entries of ``./hot_word/final_new_project.txt`` are stored under the
    key ``hotwordList``, serialised as a JSON string, and handed to
    ``save_dict_to_file`` with the target ``./hot_word/final_new_project.json``.
    """
    import json

    hot_words = load_standard_name_list("./hot_word/final_new_project.txt")
    # ensure_ascii=False keeps Chinese text readable instead of \u escapes.
    payload = json.dumps({"hotwordList": hot_words}, ensure_ascii=False, indent=4)
    save_dict_to_file(payload, './hot_word/final_new_project.json')
# Module-level invocation: regenerates ./hot_word/final_new_project.json as
# soon as this statement is reached when the file is executed or imported.
list_to_json()
# exact_hot_words()
# exact_project_hot_words()
# unuselessStr = clean_useless_project_name("众兴杜岗ⅱ间隔改造")
# print(f"众兴杜岗ⅱ间隔改造:{unuselessStr}")
# unuselessStr = clean_useless_project_name("众兴杜岗Ⅱ间隔改造")
# print(f"众兴杜岗Ⅱ间隔改造:{unuselessStr}")
# print("今天的长度:",len("今天"))
# standardize_program()
# history_message()
2025-05-04 15:29:03 +08:00
# standardize_project_test()
# standardize_company_test()
# standardize_team_leader_test()
#
# standardize_sub_constractor_test()
#
# check_standard_name_slot_probability_test()
#
# standardize_construction_test()
# standardize_program_test()