"""Ad-hoc test / utility harness for the name-standardization helpers.

Each ``*_test`` function feeds sample (mostly Chinese) names into one of the
standardizers from ``utils`` and logs the match results; the ``exact_*`` /
``*_hot_words`` functions extract hot-word lists from the standard-name maps
and write them to files under ``./hot_word``.  Only ``list_to_json()`` is
executed when the script runs; the remaining calls at the bottom are toggles.
"""

import logging
import time

from apscheduler.schedulers.blocking import BlockingScheduler  # used by the commented-out scheduler job below

from logger_util import setup_logger
from globalData import GlobalData
from utils import (
    standardize_name,
    clean_useless_team_leader_name,
    standardize_sub_company,
    standardize_project_name,
    standardize_projectDepartment,
    standardize_team_name,
    check_standard_name_slot_probability,
    clean_useless_project_name,
    save_standard_name_list_to_file,
    load_standard_name_list,
    save_dict_to_file,
    load_standard_json_data,
)

# def job():
#     print("[Info] Executing update_from_redis...")
#     GlobalData.update_from_redis()

# NOTE(review): every helper below logs through `logger`; leaving this line
# commented out made each of them raise NameError as soon as it was
# re-enabled, so the logger is now set up unconditionally.
logger = setup_logger("utils", level=logging.DEBUG)
# GlobalData.update_from_redis()


def check_standard_name_slot_probability_test():
    """Probe check_standard_name_slot_probability() with sample slot dicts."""
    slot_list = [
        {"constructionUnit": "合肥供电公司"},
        {"date": "今天", "constructionUnit": "芜湖供电公司"},
        {"date": "今天", "implementationOrganization": "送电一分公司"},
        {"date": "今天", "subcontractor": "百瑞建设发展有限公司"},
        {"date": "今天", "subcontractor": "安徽宝德电力建设工程有限"},
        {"date": "今天", "teamName": "徐局班组"},
    ]
    for slot in slot_list:
        # 12 is the slot-type id used by the weighted-mix strategy — TODO confirm meaning.
        match_results = check_standard_name_slot_probability(12, slot)
        logger.info(f"加权混合策略 项目部名称 输入: 原始槽位:{slot},输出: {match_results}")


def standardize_team_leader_test():
    """Run a batch of team-leader ("xx班组") names through standardize_team_name()."""
    team_leader_list = [
        "李东班组", "磐基班组", "章永班组", "张勇班组", "王治国班组", "代贵华班组",
        "黄安印班组", "刘闩班组", "王虎班组", "周勇勇班组", "魏在华班组", "王礼良班组",
        "林学刚班组", "崔新荣班组", "江军班组", "笪淦班组", "杨海平班组", "蔡来云班组",
        "贺中林班组", "何勇班组", "韦幸朝班组", "刘文虎班组", "金生班组", "段宝强班组",
        "何计划班组", "刘兆班组", "徐南班组", "贺广飞班组", "孙泽栋班组", "钱小林班组",
        "朱锋东班组",
    ]
    for item in team_leader_list:
        match_results = standardize_team_name(
            item,
            GlobalData.simply_to_standard_team_leader_name_map,
            GlobalData.pinyin_simply_to_standard_team_leader_name_map,
            lower_score=70,
            high_score=90,
        )
        # match_results = standardize_name(item, clean_useless_team_leader_name,
        #                                  GlobalData.simply_to_standard_team_leader_name_map,
        #                                  GlobalData.pinyin_simply_to_standard_team_leader_name_map,
        #                                  lower_score=70, high_score=90)
        logger.info(f"班组长名匹配 输入: {item}-> 输出: {match_results}")


def standardize_company_test():
    """Exercise sub-company name standardization and report wall-clock timing."""
    test_cases = [
        "宋轶分公司",
    ]
    print(f"加权混合策略 分公司名匹配**********************")
    start = time.perf_counter()
    for item in test_cases:
        match_results = standardize_sub_company(
            item,
            GlobalData.simply_to_standard_company_name_map,
            GlobalData.pinyin_simply_to_standard_company_name_map,
            70,
            90,
        )
        print(f"加权混合策略 分公司名匹配 输入: {item}-> 输出: {match_results}")
    end = time.perf_counter()
    print(f"加权混合策略 耗时: {end - start:.4f} 秒")


def standardize_construction_test():
    """Exercise construction-management-unit name standardization."""
    test_cases = [
        "合肥供电公司",
        "淮北供电公司",
        "六安市城郊供电公司",
    ]
    logger.info(f"加权混合策略 建管单位名匹配**********************")
    start = time.perf_counter()
    for item in test_cases:
        match_results = standardize_sub_company(
            item,
            GlobalData.simply_to_standard_construct_name_map,
            GlobalData.pinyin_simply_to_standard_construct_name_map,
            70,
            90,
        )
        logger.info(f"加权混合策略 建管单位名匹配 输入: {item}-> 输出: {match_results}")


def standardize_project_test():
    """Exercise project-name standardization and report wall-clock timing."""
    test_cases = [
        "陶楼夏塘工程",
        "宗阳黄桥工程",
    ]
    print(f"去不重要词汇 工程名匹配******************************************")
    start = time.perf_counter()
    for item in test_cases:
        match_results = standardize_project_name(
            item,
            GlobalData.simply_to_standard_project_name_map,
            GlobalData.pinyin_simply_to_standard_project_name_map,
            70,
            90,
        )
        print(f"***************工程名匹配 输入: {item}-> 输出: {match_results}")
    end = time.perf_counter()
    print(f"词集匹配 耗时: {end - start:.4f} 秒")


def standardize_program_test():
    """Match oral project-department names against every standard company."""
    logger.info(f"项目名匹配******************************************")
    oral_program_name_list = [
        "金上第一项目部",
        "第一项目部金上",
    ]
    for company in GlobalData.standard_company_name_list:
        for program in oral_program_name_list:
            match_results = standardize_projectDepartment(
                company, program, GlobalData.standard_company_program, high_score=90
            )
            logger.info(f"加权混合策略 项目部名称 输入: 公司:{company},项目部:{program}-> 输出: {match_results}")


def standardize_sub_constractor_test():
    """Exercise subcontractor name standardization."""
    test_cases = [
        "怀电能源科技",
        "泰央建设有限责任公司",
        "泓源电力建设有限公司",
        "怀电能源科技公司",
        "宝德电力公司",
        "亿甲建筑公司",
    ]
    logger.info(f"加权混合策略 分包单位名匹配**********************")
    start = time.perf_counter()
    for item in test_cases:
        match_results = standardize_sub_company(
            item,
            GlobalData.simply_to_standard_constractor_name_map,
            GlobalData.pinyin_simply_to_standard_constractor_name_map,
            70,
            90,
        )
        logger.info(f"分包单位名匹配 输入: {item}-> 输出: {match_results}")


def get_file_name():
    """Collect every file name under a hard-coded directory tree into name.txt."""
    import os

    # 你想要遍历的目录路径
    target_dir = '/Users/wangvivi/Desktop/Work/2025项目材料/送变电大模型/知识库文档/送变电文档合并V5'  # ← 请替换为你的目标目录路径

    # 存储文件名的列表
    all_file_names = []

    # 遍历目录及其子目录
    for root, dirs, files in os.walk(target_dir):
        for file in files:
            all_file_names.append(file)

    # 写入 name.txt 文件(生成在当前运行目录下)
    output_path = 'name.txt'
    with open(output_path, 'w', encoding='utf-8') as f:
        for name in all_file_names:
            f.write(name + '\n')

    logger.info(f"共收集了 {len(all_file_names)} 个文件名,已写入 {output_path}")


class Message:
    """Simple role/content chat-message holder.

    NOTE(review): this class was previously defined twice, byte-identically;
    the duplicate has been removed.  history_message() shadows it locally
    with a namedtuple of the same shape.
    """

    def __init__(self, role, content):
        self.role = role
        self.content = content


def history_message():
    """Demonstrate how chat history is pruned when the latest question has a time prefix."""
    from collections import namedtuple
    Message = namedtuple("Message", ["role", "content"])

    messages = [
        # Message("user", "延庆换流站今天有多少作业计划"),
        # Message("assistant", "2025-04-23 ±500KV延庆换流站备用换流变安装(PROJ-2025-0162)风险等级为2级的有0项,3级的有0项,4级的有1项,5级的有0项,一共有1项作业计划"),
        # Message("user", "河州换流站今天有多少作业计划"),
        # Message("assistant", "您说的工程名可能是: 第1个:合州±800千伏换流站电气安装A包(PROJ-2025-0056) 第2个:合州换流站-文都500千伏线路工程(PROJ-2024-1089) 第3个:陕北-安徽直流工程合州±800千伏换流站土建A包(PROJ-2024-0312) 第4个:文都-官山改接入合州换流站500千伏线路工程(PROJ-2024-1090) 请确认您要选择哪一个"),
        Message("user", "第一个"),
    ]

    latest_message = messages[-1]
    latest_user_question = latest_message.content if latest_message.role == "user" else ""

    # If the latest question is self-contained (starts a new time-scoped query),
    # drop the history entirely; otherwise keep everything before it.
    time_prefixes = ["今天", "昨天", "本周", "下周", "明天", "今日"]
    history_messages = [] if any(prefix in latest_user_question for prefix in time_prefixes) else messages[:-1]

    # BUG FIX: these calls previously passed the value as a second positional
    # argument with no %s placeholder, which makes logging raise an internal
    # formatting error and drop the value; use lazy %-style arguments instead.
    logger.info("len(history_messages):\n%s", len(history_messages))
    oldest_chat_history = "\n".join([f"{msg.role}: {msg.content}" for msg in history_messages[:2]])
    last_chat_history = "\n".join([f"{msg.role}: {msg.content}" for msg in history_messages[-2:]])
    logger.info("oldest_chat_history:\n%s", oldest_chat_history)
    logger.info("last_chat_history:\n%s", last_chat_history)
    logger.info("latest_user_question:\n%s", latest_user_question)


def standardize_program():
    """One-off rapidfuzz WRatio probe for project-department matching."""
    from rapidfuzz import process, fuzz

    query = "第一金上"
    choices = [
        "第五阜阳",
        "第一金上",
        "第二香鹭西段",
        "第十一宣城",
        "第八芜湖",
        "第十三黄山",
        "第六滁州",
        "第四甘浙",
        "第九马鞍山)",
        "第三香鹭东段",
        "第一天津",
    ]

    match = process.extractOne(query, choices, scorer=fuzz.WRatio)
    logger.info(match)


def get_size():
    """Log rough (shallow) memory footprints of the GlobalData lookup tables.

    NOTE: sys.getsizeof is shallow — it does not follow references into the
    contained strings, so these numbers understate the real footprint.
    """
    import sys

    total_size = (
        sys.getsizeof(GlobalData.standard_company_program)
        + sys.getsizeof(GlobalData.standard_company_name_list)
        + sys.getsizeof(GlobalData.simply_to_standard_company_name_map)
        + sys.getsizeof(GlobalData.pinyin_simply_to_standard_company_name_map)
    )
    logger.info(f"standard_company size: {total_size} bytes")

    total_size = (
        sys.getsizeof(GlobalData.standard_project_name_list)
        + sys.getsizeof(GlobalData.simply_to_standard_project_name_map)
        + sys.getsizeof(GlobalData.pinyin_simply_to_standard_project_name_map)
    )
    logger.info(f"standard_project size: {total_size} bytes")

    total_size = (
        sys.getsizeof(GlobalData.standard_construct_name_list)
        + sys.getsizeof(GlobalData.simply_to_standard_construct_name_map)
        + sys.getsizeof(GlobalData.pinyin_simply_to_standard_construct_name_map)
    )
    logger.info(f"standard_construct size: {total_size} bytes")

    total_size = (
        sys.getsizeof(GlobalData.standard_constractor_name_list)
        + sys.getsizeof(GlobalData.simply_to_standard_constractor_name_map)
        + sys.getsizeof(GlobalData.pinyin_simply_to_standard_constractor_name_map)
    )
    logger.info(f"standard_constractor size: {total_size} bytes")

    # BUG FIX: the team total previously summed the constractor pinyin map
    # (copy-paste error) instead of the team-leader pinyin map.
    total_size = (
        sys.getsizeof(GlobalData.standard_team_leader_name_list)
        + sys.getsizeof(GlobalData.simply_to_standard_team_leader_name_map)
        + sys.getsizeof(GlobalData.pinyin_simply_to_standard_team_leader_name_map)
    )
    logger.info(f"standard_team size: {total_size} bytes")

    # NOTE(review): this call looks like leftover debugging unrelated to size
    # reporting — confirm before removing.
    standardize_project_test()


def exact_hot_words():
    """Dump the simplified-name keys of each map as hot-word files."""
    save_standard_name_list_to_file(list(GlobalData.simply_to_standard_company_name_map.keys()), "./hot_word/company.txt")
    # save_standard_name_list_to_file(list(GlobalData.simply_to_standard_project_name_map.keys()), "./hot_word/project.txt")
    save_standard_name_list_to_file(list(GlobalData.simply_to_standard_constractor_name_map.keys()), "./hot_word/constractor.txt")
    save_standard_name_list_to_file(list(GlobalData.simply_to_standard_construct_name_map.keys()), "./hot_word/construct.txt")


def exact_project_hot_words():
    """Extract Chinese hot-word keywords from standard project names into new_project.txt.

    Strips parenthesized segments (both full-width and ASCII parentheses),
    skips names containing excluded keywords, and for "A-B" style names emits
    A, B, and A+B; otherwise the leading run of Chinese characters.
    """
    import re

    # 排除含有这些词的项目名
    exclude_keywords = ["国网", "公司"]

    keywords = []

    for name in list(GlobalData.simply_to_standard_project_name_map.values()):
        print(f"name:{name}")

        # 去掉括号及里面内容(全角和半角)
        cleaned_name = re.sub(r"(.*?)|\(.*?\)", "", name)

        # 如果包含排除关键词,跳过
        if any(ek in cleaned_name for ek in exclude_keywords):
            continue

        # 处理有"-"连接的情况
        if "-" in cleaned_name:
            parts = cleaned_name.split("-")

            # 切出来的每一段,也要去括号内容
            part0 = re.sub(r"(.*?)|\(.*?\)", "", parts[0])
            part1 = re.sub(r"(.*?)|\(.*?\)", "", parts[1])

            first = re.match(r"[\u4e00-\u9fa5]+", part0)
            second = re.match(r"[\u4e00-\u9fa5]+", part1)

            first_word = first.group(0) if first else ""
            second_word = second.group(0) if second else ""

            if first_word:
                keywords.append(first_word)
            if second_word:
                keywords.append(second_word)
            if first_word and second_word:
                keywords.append(first_word + second_word)
        else:
            # 没有"-",提取第一个连续中文
            match = re.match(r"([\u4e00-\u9fa5]+)", cleaned_name)
            if match:
                word = match.group(1)
                if word:
                    keywords.append(word)

    # 去重且保持顺序
    seen = set()
    unique_keywords = []
    for kw in keywords:
        if kw not in seen:
            seen.add(kw)
            unique_keywords.append(kw)

    # 写入文件
    with open("new_project.txt", "w", encoding="utf-8") as f:
        for kw in unique_keywords:
            f.write(kw + "\n")

    print("提取完成,已写入 new_project.txt")


def removte_reduant_list():
    # NOTE(review): name keeps its original typos ("removte", "reduant") for
    # backward compatibility; it means "remove redundant entries".
    """De-duplicate new_project.txt (order-preserving) into hot_word/final_new_project.txt."""
    temp_list = load_standard_name_list("./new_project.txt")

    # 去重且保持顺序
    seen = set()
    unique_keywords = []
    for kw in temp_list:
        if kw not in seen:
            seen.add(kw)
            unique_keywords.append(kw)

    with open("hot_word/final_new_project.txt", "w", encoding="utf-8") as f:
        for kw in unique_keywords:
            f.write(kw + "\n")

    save_standard_name_list_to_file(list(GlobalData.simply_to_standard_project_name_map.keys()), "./hot_word/project.txt")


def list_to_json():
    """Wrap hot_word/final_new_project.txt entries in a {"hotwordList": [...]} JSON file."""
    import json

    my_list = load_standard_name_list("./hot_word/final_new_project.txt")

    data = {
        "hotwordList": my_list
    }

    # 转换为 JSON 格式字符串(ensure_ascii=False 确保中文正常显示)
    json_str = json.dumps(data, ensure_ascii=False, indent=4)
    save_dict_to_file(json_str, './hot_word/final_new_project.json')


list_to_json()
# exact_hot_words()
# exact_project_hot_words()
# unuselessStr = clean_useless_project_name("众兴杜岗ⅱ间隔改造")
# print(f"众兴杜岗ⅱ间隔改造:{unuselessStr}")
# unuselessStr = clean_useless_project_name("众兴杜岗Ⅱ间隔改造")
# print(f"众兴杜岗Ⅱ间隔改造:{unuselessStr}")
# print("今天的长度:", len("今天"))
# standardize_program()
# history_message()
# standardize_project_test()
# standardize_company_test()
# standardize_team_leader_test()
# standardize_sub_constractor_test()
# check_standard_name_slot_probability_test()
# standardize_construction_test()
# standardize_program_test()