# Compiled once at import time. For str patterns, \d matches any Unicode
# decimal digit, not only ASCII 0-9.
_DIGIT_RUN_PATTERN = re.compile(r'\d+')


def _digit_run_to_chinese(match):
    """Return the Chinese-numeral spelling of one matched run of digits.

    :param match: regex match object whose whole match is a run of digits
    :return: the text produced by ``cn2an.an2cn`` in "low" mode, or the
        matched text unchanged when the conversion raises
    """
    num_str = match.group()
    try:
        # "low" selects the lowercase (everyday) Chinese numerals.
        return cn2an.an2cn(num_str, "low")
    except Exception as e:
        # Deliberately broad: this is a best-effort normalisation step that
        # runs before fuzzy matching, so a failed conversion must not abort
        # the lookup. Report it and keep the original digits.
        print(f"转换失败,异常信息: {e}")
        return num_str


def replace_arabic_with_chinese(text):
    """
    Convert every run of consecutive Arabic digits in a string to Chinese numerals.

    Example: 2024年25号 -> 二千零二十四年二十五号

    Each run is converted as one whole number, not digit by digit. Runs that
    cannot be converted are left as they are.

    :param text: the string to normalise
    :return: the string with digit runs replaced; ``text`` itself is returned
        unchanged when it is empty or not a ``str``
    """
    # Guard first: re.sub raises TypeError for None and other non-str values,
    # which would make this pre-processing step crash its callers.
    if not isinstance(text, str) or not text:
        return text

    return _DIGIT_RUN_PATTERN.sub(_digit_run_to_chinese, text)
standardize_name(temp_input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score) def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90): @@ -148,6 +167,7 @@ def standardize_projectDepartment(standard_company, input_project, company_proje """ try: # **2. 先尝试直接匹配最相似的项目名** + input_project = replace_arabic_with_chinese(input_project) project_match = process.extractOne(input_project, company_project_department_map[standard_company], scorer=cast(Callable, WRatio)) print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True) @@ -271,8 +291,6 @@ class StandardType(Enum): PROGRAM_CHECK = 1 -import re - # 构建一个用于替换的正则表达式 useless_project_words_pattern = re.compile("|".join(USELESS_PROJECT_WORDS)) useless_company_words_pattern = re.compile("|".join(USELESS_COMPANY_WORDS))