Intention/api/standard_test.py

591 lines
25 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
from logger_util import setup_logger
from globalData import GlobalData
from utils import standardize_name, clean_useless_team_leader_name, standardize_sub_company, standardize_project_name, \
standardize_projectDepartment, standardize_team_name, check_standard_name_slot_probability, \
clean_useless_project_name, save_standard_name_list_to_file, load_standard_name_list, save_dict_to_file, \
load_standard_json_data
import time
from apscheduler.schedulers.blocking import BlockingScheduler
from globalData import GlobalData
# def job():
# print("[Info] Executing update_from_redis...")
# GlobalData.update_from_redis()
#
#
# Module-wide logger shared by every helper below; it is built by the project's
# setup_logger helper, called with the name "utils" and the DEBUG level.
logger = setup_logger("utils", level=logging.DEBUG)
# GlobalData.update_from_redis()
def check_standard_name_slot_probability_test():
    """Feed sample slot dicts to ``check_standard_name_slot_probability`` and log the results.

    Each sample is passed together with the fixed first argument ``12``; the
    value returned by the helper is logged next to the slot that produced it.
    """
    today = "今天"
    sample_slots = [
        {"constructionUnit": "合肥供电公司"},
        {"date": today, "constructionUnit": "芜湖供电公司"},
        {"date": today, "implementationOrganization": "送电一分公司"},
        {"date": today, "subcontractor": "百瑞建设发展有限公司"},
        {"date": today, "subcontractor": "安徽宝德电力建设工程有限"},
        {"date": today, "teamName": "徐局班组"},
    ]
    for sample in sample_slots:
        outcome = check_standard_name_slot_probability(12, sample)
        logger.info(f"加权混合策略 项目部名称 输入: 原始槽位:{sample},输出: {outcome}")
def standardize_team_leader_test():
    """Run ``standardize_team_name`` over a fixed list of spoken team names and log each match.

    Every name is matched against the team-leader lookup maps held on
    ``GlobalData`` with ``lower_score=70`` and ``high_score=90``.
    """
    spoken_team_names = (
        "李东班组", "磐基班组", "章永班组", "张勇班组", "王治国班组",
        "代贵华班组", "黄安印班组", "刘闩班组", "王虎班组", "周勇勇班组",
        "魏在华班组", "王礼良班组", "林学刚班组", "崔新荣班组", "江军班组",
        "笪淦班组", "杨海平班组", "蔡来云班组", "贺中林班组", "何勇班组",
        "韦幸朝班组", "刘文虎班组", "金生班组", "段宝强班组", "何计划班组",
        "刘兆班组", "徐南班组", "贺广飞班组", "孙泽栋班组", "钱小林班组",
        "朱锋东班组",
    )
    name_map = GlobalData.simply_to_standard_team_leader_name_map
    pinyin_map = GlobalData.pinyin_simply_to_standard_team_leader_name_map
    for spoken in spoken_team_names:
        # Disabled alternative that goes through the generic entry point:
        # matched = standardize_name(spoken, clean_useless_team_leader_name, name_map,
        #                            pinyin_map, lower_score=70, high_score=90)
        matched = standardize_team_name(spoken, name_map, pinyin_map, lower_score=70, high_score=90)
        logger.info(f"班组长名匹配 输入: {spoken}-> 输出: {matched}")
def standardize_company_test():
    """Match sample sub-company names with ``standardize_sub_company`` and print timing.

    Each input is matched against the company lookup maps on ``GlobalData``
    with the positional thresholds 70 and 90. The result of every match and
    the elapsed time for the whole loop are printed to stdout.
    """
    spoken_company_names = [
        "宋轶分公司",
        # Disabled inputs, kept for manual experiments:
        # "送二分公司",
        # "变电分公司",
        # "建筑分公司",
        # "检修试验分公司",
        # "宏源电力公司",
        # "宏源电力限公司",
        # "宏源电力限公司线路",
        # "宏源电力限公司变电",
        # "送一分",
        # "送二分",
        # "变电分",
        # "建筑分",
        # "检修试验分",
        # "宏源电力",
        # "红源电力",
        # "宏源电力有限",
        # "宏源电力限线路",
        # "宏源电力限变电",
    ]
    print("加权混合策略 分公司名匹配**********************")
    started_at = time.perf_counter()
    for spoken in spoken_company_names:
        matched = standardize_sub_company(
            spoken,
            GlobalData.simply_to_standard_company_name_map,
            GlobalData.pinyin_simply_to_standard_company_name_map,
            70,
            90,
        )
        print(f"加权混合策略 分公司名匹配 输入: {spoken}-> 输出: {matched}")
    finished_at = time.perf_counter()
    print(f"加权混合策略 耗时: {finished_at - started_at:.4f}")
def standardize_construction_test():
    """Match sample construction-management unit names and log results plus elapsed time.

    Each input is passed to ``standardize_sub_company`` together with the
    construct-unit lookup maps on ``GlobalData`` and the positional thresholds
    70 and 90. The start timestamp was previously taken but never reported;
    the elapsed time is now logged after the loop, in the same format the
    sibling company test prints.
    """
    test_cases = [
        "合肥供电公司",
        "淮北供电公司",
        "六安市城郊供电公司",
    ]
    logger.info("加权混合策略 建管单位名匹配**********************")
    start = time.perf_counter()
    for item in test_cases:
        match_results = standardize_sub_company(
            item,
            GlobalData.simply_to_standard_construct_name_map,
            GlobalData.pinyin_simply_to_standard_construct_name_map,
            70,
            90,
        )
        logger.info(f"加权混合策略 建管单位名匹配 输入: {item}-> 输出: {match_results}")
    elapsed = time.perf_counter() - start
    logger.info(f"加权混合策略 耗时: {elapsed:.4f}")
def standardize_project_test():
    """Match sample spoken project names with ``standardize_project_name`` and print timing.

    Inputs are matched against the project lookup maps on ``GlobalData`` with
    the positional thresholds 70 and 90; every result and the total elapsed
    time of the loop are printed to stdout.
    """
    spoken_project_names = [
        "陶楼夏塘工程",
        "宗阳黄桥工程",
        # Disabled inputs, kept for manual experiments:
        # "合肥卫田变电站工程",
        # "金牛变电站新建建筑",
        # "金牛变电站建筑工程",
        # "金牛新建工程",
        # "金牛新建工程调试",
        # "金牛新建调试工程",
        # "金牛变电站工程",
        # "芦集",
        # "芦集变电站",
        # "安庆四变电站",
        # "锦绣变电站",
        # "滁州护桥变电站",
        # "合州换流站",
        # "陕北合州换流站",
        # "陕北安徽合州换流站",
        # "金牛变电站",
        # "香涧鹭岛工程",
        # "延庆换流站",
        # "国网延庆换流站",
        # "国网北京延庆换流站",
        # "陶楼广银线路工程",
        # "紫蓬变电站",
        # "宿州萧砀变电站",
        # "冯井变电站",
        # "富邦秋浦变电站",
        # "包河玉龙变电站",
        # "绿雪莲塘工程",
        # "合肥循环园工程",
        # "合肥长临河工程",
        # "合肥中心变",
        # "锁库变电站工程",
        # "槽坊工程",
        # "富东2798线",
        # "安庆四500kV变电站新建工程(PROJ-2024-0862)",
        # "锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)",
        # "渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)",
        # "先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)",
        # "安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)",
        # "合州士800千伏换流站电气安装A包(PROJ-2025-0056)",
        # "卫田-陶楼T接首业变电站110kV电缆线路工程(PROJ-2024-1236)",
        # "谯城(亳三)-希夷220kV线路工程(PROJ-2024-1205)",
    ]
    print("去不重要词汇 工程名匹配******************************************")
    started_at = time.perf_counter()
    for spoken in spoken_project_names:
        matched = standardize_project_name(
            spoken,
            GlobalData.simply_to_standard_project_name_map,
            GlobalData.pinyin_simply_to_standard_project_name_map,
            70,
            90,
        )
        print(f"***************工程名匹配 输入: {spoken}-> 输出: {matched}")
    finished_at = time.perf_counter()
    print(f"词集匹配 耗时: {finished_at - started_at:.4f}")
def standardize_program_test():
    """Match spoken project-department names for every standard company and log the results.

    For each entry of ``GlobalData.standard_company_name_list`` every sample
    department name is passed to ``standardize_projectDepartment`` together
    with ``GlobalData.standard_company_program`` and ``high_score=90``.
    """
    logger.info("项目名匹配******************************************")
    spoken_departments = [
        "金上第一项目部",
        "第一项目部金上",
        # Disabled inputs, kept for manual experiments:
        # "第1项目部",  # expected to return every "第三项目管理部"
        # "第2项目部",
        # "第3项目部",
        # "第4项目部",
        # "第5项目部",
        # "第6项目部",
        # "第7项目部",
        # "第8项目部",
        # "第9项目部",
        # "第10项目部",
        # "第11项目部",
        # "第12项目部",
        # "第13项目部",
        # "电缆班",
        # "调试1队",
        # "调试2队",
        # "调试3队",
        # "调试4队",
        # "调试5队",
        # "第一项目管理部",
        # "第二项目管理部",
        # "第五项目管理部",
        # "第十一项目管理部(萧砀线路)",
        # "第三项目管理部(张店线路)",
        # "第三项目管理部(岳西线路)",
        # "第五项目管理部(蚌埠)",
        # "第三项目管理部(六安线路)",
        # "第十一项目管理部(宿州线路)",
        # "调试一队",
        # "调试二队",
        # "调试三队",
        # "电缆班",
    ]
    for company_name in GlobalData.standard_company_name_list:
        for department in spoken_departments:
            matched = standardize_projectDepartment(
                company_name, department, GlobalData.standard_company_program, high_score=90
            )
            logger.info(f"加权混合策略 项目部名称 输入: 公司:{company_name},项目部:{department}-> 输出: {matched}")
def standardize_sub_constractor_test():
    """Match sample subcontractor names and log results plus elapsed time.

    Each input is passed to ``standardize_sub_company`` together with the
    subcontractor lookup maps on ``GlobalData`` and the positional thresholds
    70 and 90. The start timestamp was previously taken but never reported;
    the elapsed time is now logged after the loop, in the same format the
    sibling company test prints.
    """
    test_cases = [
        "怀电能源科技",
        "泰央建设有限责任公司",
        "泓源电力建设有限公司",
        "怀电能源科技公司",
        "宝德电力公司",
        "亿甲建筑公司",
    ]
    logger.info("加权混合策略 分包单位名匹配**********************")
    start = time.perf_counter()
    for item in test_cases:
        match_results = standardize_sub_company(
            item,
            GlobalData.simply_to_standard_constractor_name_map,
            GlobalData.pinyin_simply_to_standard_constractor_name_map,
            70,
            90,
        )
        logger.info(f"分包单位名匹配 输入: {item}-> 输出: {match_results}")
    elapsed = time.perf_counter() - start
    logger.info(f"加权混合策略 耗时: {elapsed:.4f}")
def get_file_name(
    target_dir='/Users/wangvivi/Desktop/Work/2025项目材料/送变电大模型/知识库文档/送变电文档合并V5',
    output_path='name.txt',
):
    """Collect the file names found under a directory tree and write them to a text file.

    Args:
        target_dir: Root directory to walk recursively. The default is the
            developer-machine path the function was originally hard-wired to.
        output_path: File that receives one file name per line. A relative
            path is resolved against the current working directory.

    Only base names are recorded (no directory part), so files with the same
    name in different sub-directories appear more than once. If ``target_dir``
    does not exist, ``os.walk`` yields nothing and an empty file is written.
    """
    import os

    # Flatten the per-directory file lists produced by os.walk.
    all_file_names = [
        file_name
        for _root, _dirs, files in os.walk(target_dir)
        for file_name in files
    ]
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(name + '\n' for name in all_file_names)
    logger.info(f"共收集了 {len(all_file_names)} 个文件名,已写入 {output_path}")
class Message:
    """A single chat message: who said it (``role``) and what was said (``content``).

    The class used to be defined twice in a row with identical bodies; the
    second definition simply replaced the first, so one definition is kept.
    """

    def __init__(self, role, content):
        self.role = role
        self.content = content

    def __repr__(self):
        # Readable form for log output and debugging sessions.
        return f"{type(self).__name__}(role={self.role!r}, content={self.content!r})"
def history_message():
    """Demonstrate how chat history is trimmed around the latest user question, logging each piece.

    The last message is taken as the latest question (empty string if it was
    not sent by the user). When that question contains one of the time words
    in ``time_prefixes`` the earlier history is discarded; otherwise all
    earlier messages are kept. The first two and the last two kept messages
    are then rendered as ``role: content`` lines.

    The log calls previously passed the value as an extra positional argument
    without a ``%s`` placeholder in the message, which makes the standard
    ``logging`` module report a formatting error instead of logging the value.
    They now use lazy ``%s`` arguments.
    """
    from collections import namedtuple

    # Local record type; inside this function it shadows the module-level Message class.
    Message = namedtuple("Message", ["role", "content"])
    messages = [
        # Disabled earlier turns, kept for manual experiments:
        # Message("user", "延庆换流站今天有多少作业计划"),
        # Message("assistant", "2025-04-23 ±500KV延庆换流站备用换流变安装(PROJ-2025-0162)风险等级为2级的有0项3级的有0项4级的有1项5级的有0项一共有1项作业计划"),
        # Message("user", "河州换流站今天有多少作业计划"),
        # Message("assistant", "您说的工程名可能是: 第1个合州±800千伏换流站电气安装A包(PROJ-2025-0056) 第2个合州换流站-文都500千伏线路工程(PROJ-2024-1089) 第3个陕北-安徽直流工程合州±800千伏换流站土建A包(PROJ-2024-0312) 第4个文都-官山改接入合州换流站500千伏线路工程(PROJ-2024-1090) 请确认您要选择哪一个"),
        Message("user", "第一个"),
    ]
    latest_message = messages[-1]
    latest_user_question = latest_message.content if latest_message.role == "user" else ""
    time_prefixes = ["今天", "昨天", "本周", "下周", "明天", "今日"]
    # A question that names a time span starts a fresh topic, so history is dropped.
    if any(prefix in latest_user_question for prefix in time_prefixes):
        history_messages = []
    else:
        history_messages = messages[:-1]
    logger.info("len(history_messages):\n%s", len(history_messages))
    oldest_chat_history = "\n".join(f"{msg.role}: {msg.content}" for msg in history_messages[:2])
    last_chat_history = "\n".join(f"{msg.role}: {msg.content}" for msg in history_messages[-2:])
    logger.info("oldest_chat_history:\n%s", oldest_chat_history)
    logger.info("last_chat_history:\n%s", last_chat_history)
    logger.info("latest_user_question:\n%s", latest_user_question)
def standardize_program():
    """Log the best rapidfuzz ``WRatio`` match of one sample query against sample choices."""
    from rapidfuzz import process, fuzz

    # Disabled variant that uses the full department names:
    # spoken_query = "金上第一项目"
    # candidates = [
    #     "第五项目管理部(阜阳)",
    #     "第一项目管理部(金上)",
    #     "第二项目管理部(香鹭西段)",
    #     "第十一项目管理部(宣城)",
    #     "第八项目管理部(芜湖)",
    #     "第十三项目管理部(黄山)",
    #     "第六项目管理部(滁州)",
    #     "第四项目管理部(甘浙)",
    #     "第九项目管理部(马鞍山)",
    #     "第三项目管理部(香鹭东段)",
    #     "第一项目管理部(天津)",
    # ]
    candidates = [
        "第五阜阳",
        "第一金上",
        "第二香鹭西段",
        "第十一宣城",
        "第八芜湖",
        "第十三黄山",
        "第六滁州",
        "第四甘浙",
        "第九马鞍山)",
        "第三香鹭东段",
        "第一天津",
    ]
    spoken_query = "第一金上"
    best = process.extractOne(spoken_query, candidates, scorer=fuzz.WRatio)
    logger.info(best)
def get_size():
    """Log the summed ``sys.getsizeof`` of each group of lookup structures on ``GlobalData``.

    ``sys.getsizeof`` is shallow: it measures each container object itself and
    does not follow references into the contained items, so the logged numbers
    are a lower bound rather than the full memory footprint.

    The team-leader total previously added the subcontractor pinyin map
    (a copy-paste slip); it now uses the team-leader pinyin map.
    """
    import sys

    def shallow_total(*containers):
        # Sum of the shallow sizes of the given objects, in bytes.
        return sum(sys.getsizeof(container) for container in containers)

    total_size = shallow_total(
        GlobalData.standard_company_program,
        GlobalData.standard_company_name_list,
        GlobalData.simply_to_standard_company_name_map,
        GlobalData.pinyin_simply_to_standard_company_name_map,
    )
    logger.info(f"standard_company size: {total_size} bytes")

    total_size = shallow_total(
        GlobalData.standard_project_name_list,
        GlobalData.simply_to_standard_project_name_map,
        GlobalData.pinyin_simply_to_standard_project_name_map,
    )
    logger.info(f"standard_project size: {total_size} bytes")

    total_size = shallow_total(
        GlobalData.standard_construct_name_list,
        GlobalData.simply_to_standard_construct_name_map,
        GlobalData.pinyin_simply_to_standard_construct_name_map,
    )
    logger.info(f"standard_construct size: {total_size} bytes")

    total_size = shallow_total(
        GlobalData.standard_constractor_name_list,
        GlobalData.simply_to_standard_constractor_name_map,
        GlobalData.pinyin_simply_to_standard_constractor_name_map,
    )
    logger.info(f"standard_constractor size: {total_size} bytes")

    total_size = shallow_total(
        GlobalData.standard_team_leader_name_list,
        GlobalData.simply_to_standard_team_leader_name_map,
        GlobalData.pinyin_simply_to_standard_team_leader_name_map,
    )
    logger.info(f"standard_team size: {total_size} bytes")
# NOTE(review): this call sits between two function definitions. The file's indentation
# was lost, so it is unclear whether it was meant to be the last statement of get_size()
# or a module-level call that runs whenever the file is executed/imported - confirm.
standardize_project_test()
def exact_hot_words():
    """Write the keys of the company, subcontractor and construct-unit maps to hot-word files.

    For each listed ``GlobalData`` map, its keys are turned into a list and
    handed to ``save_standard_name_list_to_file`` with the matching target path.
    """
    exports = (
        ("simply_to_standard_company_name_map", "./hot_word/company.txt"),
        # Disabled export:
        # ("simply_to_standard_project_name_map", "./hot_word/project.txt"),
        ("simply_to_standard_constractor_name_map", "./hot_word/constractor.txt"),
        ("simply_to_standard_construct_name_map", "./hot_word/construct.txt"),
    )
    for map_attr, target_path in exports:
        name_map = getattr(GlobalData, map_attr)
        save_standard_name_list_to_file(list(name_map.keys()), target_path)
# def exact_project_hot_words():
# import re
#
# # 示例数据,换成 GlobalData.simply_to_standard_project_name_map.keys()
# project_names = [
# "安庆四500kV变电站新建工程(PROJ-2024-0862)",
# "淮南芦集 220 千伏变电站 220 千伏配电装置改造工程(调试部分)(PROJ-2025-0022)",
# "屏显220kV变电站220kV杜岗Ⅱ间隔改造工程调试部分(PROJ-2025-0169)",
# "漆园220kV变电站220kV杨柳间隔改造工程调试部分(PROJ-2025-0042)",
# "宝桥220kV变电站220kV红桥间隔保护改造工程(调试部分)(PROJ-2025-0088)",
# "蟠龙220kV变电站220kV灵泗间隔改造工程调试部分(PROJ-2025-0018)",
# "锦绣-常青π入中心变电站220kV架空线路工程(PROJ-2024-1206)",
# "安庆和平220kV变电站新建工程调试部分(PROJ-2024-1238)",
# "渝北±800千伏换流站电气安装A包(调试部分)(PROJ-2024-1192)",
# "安徽亳州华佗220kV变电站220kV新华风电间隔扩建工程调试部分(PROJ-2024-1171)",
# "先锋-泉河π入安庆四变电站220kV线路工程(PROJ-2024-0834)",
# "安徽滁州护桥220kV变电站2号主变扩建工程(PROJ-2024-0821)",
# "双岭500kV变电站间隔改造工程(PROJ-2024-0863)",
# "合州±800千伏换流站电气安装A包(PROJ-2025-0056)",
# "金牛500kV变电站新建工程(PROJ-2024-0866)",
# "况楼220kV变电站间隔扩建工程调试部分(PROJ-2025-0144)",
# "国网安徽合肥供电公司2023年GIS带电显示器维护(PROJ-2024-1260)",
# "亳州木兰220kV变电站220kV改造工程安徽亳州木兰200kV变电站GIS设备检修及调试技术服务(PROJ-2024-1256)",
# "香涧-鹭岛500kV线路工程淮河大跨越段(PROJ-2024-0722)",
# "安徽蚌埠濠州220kV变电站220千伏大唐凤阳红心镇光伏间隔扩建工程调试部分)(PROJ-2025-0164)",
# "陶楼-广银T接智迪改接首业变电站110kV电缆线路工程(PROJ-2024-1233)",
# "国网北京检修公司2024年±500kV延庆换流站直流主设备及辅助设备不停电检修维护(PROJ-2024-0841)"
# ]
#
#
# # 用来存关键词
# keywords = []
# #
# for name in GlobalData.simply_to_standard_project_name_map.keys():
# # 去掉括号和括号里的内容
# cleaned_name = re.sub(r"\(.*?\)", "", name)
#
# # 提取“-”连接的词
# if "-" in cleaned_name:
# parts = cleaned_name.split("-")
# first = parts[0].strip()
# second = re.split(r"[^\u4e00-\u9fa5]", parts[1])[0].strip() # 只取中文部分
# if first:
# keywords.append(first)
# if second:
# keywords.append(second)
# if first and second:
# keywords.append(first + second)
# else:
# # 正常提取,取第一个连续的中文词组
# match = re.match(r"([\u4e00-\u9fa5]+)", cleaned_name)
# if match:
# keywords.append(match.group(1))
#
# # 去重,且保持原顺序
# seen = set()
# unique_keywords = []
# for kw in keywords:
# if kw not in seen:
# seen.add(kw)
# unique_keywords.append(kw)
#
# # 写入到文件
# with open("new_project.txt", "w", encoding="utf-8") as f:
# for kw in unique_keywords:
# f.write(kw + "\n")
#
# print("提取完成,已写入 new_project.txt")
# # 去重且保持顺序
# seen = set()
# unique_keywords = []
# for kw in keywords:
# if kw not in seen:
# seen.add(kw)
# unique_keywords.append(kw)
#
# # 写入到文件
# with open("new_project.txt", "w", encoding="utf-8") as f:
# for kw in unique_keywords:
# f.write(kw + "\n")
#
# print("提取完成,已写入 new_project.txt")
def exact_project_hot_words(project_names=None, output_path="new_project.txt"):
    """Extract short hot-word keywords from project names and write them to a file.

    For every project name:

    1. Bracketed segments are removed, both full-width and ASCII parentheses.
    2. Names that still contain one of the excluded words are skipped.
    3. If the name contains ``-``, the leading run of Chinese characters of the
       first two ``-``-separated parts is taken; each non-empty run is a
       keyword, and when both exist their concatenation is a third keyword.
    4. Otherwise the leading run of Chinese characters of the name is the keyword.

    Keywords are de-duplicated in first-seen order and written one per line.

    Args:
        project_names: Iterable of project-name strings. When ``None`` (the
            default) the values of
            ``GlobalData.simply_to_standard_project_name_map`` are used.
        output_path: File that receives the keywords (UTF-8, one per line).

    The bracket pattern previously read ``r".*?|\\(.*?\\)"``. Its first
    alternative matches at every position, so ``re.sub`` never removed just the
    bracketed text (depending on the Python version it returns the input
    unchanged or an empty string). The full-width parentheses are now written
    as explicit ``\\uff08`` / ``\\uff09`` escapes so they cannot be lost or
    confused with look-alike characters.
    """
    import re

    # "(...)" in full-width or ASCII form, non-greedy so adjacent groups stay separate.
    bracketed = re.compile(r"\uff08.*?\uff09|\(.*?\)")
    # Leading run of CJK ideographs in the range used throughout this file.
    leading_chinese = re.compile(r"[\u4e00-\u9fa5]+")
    # Project names that still contain one of these words are skipped.
    exclude_keywords = ("国网", "公司")

    if project_names is None:
        project_names = list(GlobalData.simply_to_standard_project_name_map.values())

    keywords = []
    for name in project_names:
        print(f"name:{name}")
        cleaned_name = bracketed.sub("", name)
        if any(ek in cleaned_name for ek in exclude_keywords):
            continue
        if "-" in cleaned_name:
            # Only the first two segments are considered; brackets are already gone.
            parts = cleaned_name.split("-")
            first = leading_chinese.match(parts[0])
            second = leading_chinese.match(parts[1])
            first_word = first.group(0) if first else ""
            second_word = second.group(0) if second else ""
            if first_word:
                keywords.append(first_word)
            if second_word:
                keywords.append(second_word)
            if first_word and second_word:
                keywords.append(first_word + second_word)
        else:
            match = leading_chinese.match(cleaned_name)
            if match:
                keywords.append(match.group(0))

    # dict keys keep insertion order, which gives an order-preserving de-duplication.
    unique_keywords = list(dict.fromkeys(keywords))

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(kw + "\n" for kw in unique_keywords)
    print(f"提取完成,已写入 {output_path}")
def removte_reduant_list():
    """De-duplicate the extracted project keywords and export the project map keys.

    Reads ``./new_project.txt`` through ``load_standard_name_list``, drops
    repeated entries while keeping first-seen order, and writes the result to
    ``hot_word/final_new_project.txt``. Afterwards the keys of
    ``GlobalData.simply_to_standard_project_name_map`` are saved to
    ``./hot_word/project.txt``.

    NOTE(review): the file's indentation was lost; the final export is assumed
    to belong to this function - confirm.
    """
    loaded_keywords = load_standard_name_list("./new_project.txt")
    # dict keys keep insertion order, so this is an order-preserving de-duplication.
    ordered_unique = list(dict.fromkeys(loaded_keywords))
    with open("hot_word/final_new_project.txt", "w", encoding="utf-8") as out_file:
        for keyword in ordered_unique:
            out_file.write(keyword + "\n")
    project_keys = list(GlobalData.simply_to_standard_project_name_map.keys())
    save_standard_name_list_to_file(project_keys, "./hot_word/project.txt")
def list_to_json():
    """Wrap the final project hot-word list in a ``hotwordList`` JSON document and save it.

    The list is loaded from ``./hot_word/final_new_project.txt``, serialised
    with ``json.dumps`` and the resulting string is handed to
    ``save_dict_to_file`` for ``./hot_word/final_new_project.json``.

    NOTE(review): ``save_dict_to_file`` receives an already-serialised string,
    not a dict - confirm the helper does not JSON-encode it a second time.
    """
    import json

    hot_words = load_standard_name_list("./hot_word/final_new_project.txt")
    # ensure_ascii=False keeps Chinese characters readable instead of \uXXXX escapes.
    serialised = json.dumps({"hotwordList": hot_words}, ensure_ascii=False, indent=4)
    save_dict_to_file(serialised, './hot_word/final_new_project.json')
# Currently enabled driver step: this module-level call runs whenever the file is executed
# or imported. The commented-out calls that follow are the other manual entry points.
list_to_json()
# exact_hot_words()
# exact_project_hot_words()
# unuselessStr = clean_useless_project_name("众兴杜岗ⅱ间隔改造")
# print(f"众兴杜岗ⅱ间隔改造:{unuselessStr}")
# unuselessStr = clean_useless_project_name("众兴杜岗Ⅱ间隔改造")
# print(f"众兴杜岗Ⅱ间隔改造:{unuselessStr}")
# print("今天的长度:",len("今天"))
# standardize_program()
# history_message()
# standardize_project_test()
# standardize_company_test()
# standardize_team_leader_test()
#
# standardize_sub_constractor_test()
#
# check_standard_name_slot_probability_test()
#
# standardize_construction_test()
# standardize_program_test()