项目名和分公司名匹配前，先将阿拉伯数字替换为中文数字

2025-04-20 13:43:51 +08:00 · 2025-04-20 13:43:51 +08:00 · e92205f8e7
parent f38aa8ed01
commit e92205f8e7
1 changed files with 21 additions and 3 deletions
--- a/api/utils.py
+++ b/api/utils.py
@ -1,10 +1,12 @@
 from enum import Enum
 from typing import cast, Callable

+import cn2an
 from rapidfuzz import process, fuzz
 from rapidfuzz.fuzz import WRatio
 import json
 from pypinyin import lazy_pinyin, Style
+import re

 from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS

@ -57,6 +59,22 @@ def extract_number(text):
    return None


+def replace_arabic_with_chinese(text):
+    """
+    Convert every run of consecutive digits (``\d+``) in ``text`` to Chinese numerals via cn2an.
+    Example: 2024年25号 -> 二千零二十四年二十五号
+    """
+    def convert(match):
+        num_str = match.group()
+        try:
+            return cn2an.an2cn(num_str, "low")  # "low" selects lowercase Chinese numerals
+        except Exception as e:
+            print(f"转换失败，异常信息: {e}")  # report the conversion error on stdout
+            return num_str  # keep the original digits when conversion fails
+
+    return re.sub(r'\d+', convert, text)
+
+
 def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, high_score=85, top_k=3):
    """
    对输入字符串在候选池中执行模糊匹配，并返回匹配程度高的映射原始值。
@ -117,7 +135,8 @@ def standardize_sub_company(input_name, simply_map, pinyin_map, lower_score=55,
    :param high_score: 高置信匹配分数阈值
    :return: 匹配的标准公司名列表
    """
-    return standardize_name(input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score)
+    temp_input_name = replace_arabic_with_chinese(input_name)
+    return standardize_name(temp_input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score)


 def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90):
@ -148,6 +167,7 @@ def standardize_projectDepartment(standard_company, input_project, company_proje
    """
    try:
        # **2. 先尝试直接匹配最相似的项目名**
+        input_project = replace_arabic_with_chinese(input_project)
        project_match = process.extractOne(input_project, company_project_department_map[standard_company],
                                           scorer=cast(Callable, WRatio))
        print(f"项目部名称最相似：{project_match[0]},{project_match[1]}", flush=True)
@ -271,8 +291,6 @@ class StandardType(Enum):
    PROGRAM_CHECK = 1


-import re
-
 # Build the regular expressions used for replacement: each is an alternation of the listed words
 useless_project_words_pattern = re.compile("|".join(USELESS_PROJECT_WORDS))
 useless_company_words_pattern = re.compile("|".join(USELESS_COMPANY_WORDS))