diff --git a/api/utils.py b/api/utils.py index f1bc530..52669af 100644 --- a/api/utils.py +++ b/api/utils.py @@ -1,10 +1,12 @@ from enum import Enum from typing import cast, Callable +import cn2an from rapidfuzz import process, fuzz from rapidfuzz.fuzz import WRatio import json from pypinyin import lazy_pinyin, Style +import re from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS @@ -57,6 +59,22 @@ def extract_number(text): return None +def replace_arabic_with_chinese(text): + """ + 将字符串中所有连续的阿拉伯数字转换为对应的中文数字。 + 示例:2024年25号 -> 二千零二十四年二十五号 + """ + def convert(match): + num_str = match.group() + try: + return cn2an.an2cn(num_str, "low") # "low" 使用小写中文数字 + except Exception as e: + print(f"转换失败,异常信息: {e}") # 打印异常信息 + return num_str # 转换失败时保留原数字 + + return re.sub(r'\d+', convert, text) + + def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, high_score=85, top_k=3): """ 对输入字符串在候选池中执行模糊匹配,并返回匹配程度高的映射原始值。 @@ -117,7 +135,8 @@ def standardize_sub_company(input_name, simply_map, pinyin_map, lower_score=55, :param high_score: 高置信匹配分数阈值 :return: 匹配的标准公司名列表 """ - return standardize_name(input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score) + temp_input_name = replace_arabic_with_chinese(input_name) + return standardize_name(temp_input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score) def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90): @@ -148,6 +167,7 @@ def standardize_projectDepartment(standard_company, input_project, company_proje """ try: # **2. 先尝试直接匹配最相似的项目名** + input_project = replace_arabic_with_chinese(input_project) project_match = process.extractOne(input_project, company_project_department_map[standard_company], scorer=cast(Callable, WRatio)) print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True) @@ -271,8 +291,6 @@ class StandardType(Enum): PROGRAM_CHECK = 1 -import re - # 构建一个用于替换的正则表达式 useless_project_words_pattern = re.compile("|".join(USELESS_PROJECT_WORDS)) useless_company_words_pattern = re.compile("|".join(USELESS_COMPANY_WORDS))