项目名和分公司名匹配前,先将阿拉伯数字替换为中文数字

This commit is contained in:
weiweiw 2025-04-20 13:43:51 +08:00
parent f38aa8ed01
commit e92205f8e7
1 changed files with 21 additions and 3 deletions

View File

@ -1,10 +1,12 @@
from enum import Enum from enum import Enum
from typing import cast, Callable from typing import cast, Callable
import cn2an
from rapidfuzz import process, fuzz from rapidfuzz import process, fuzz
from rapidfuzz.fuzz import WRatio from rapidfuzz.fuzz import WRatio
import json import json
from pypinyin import lazy_pinyin, Style from pypinyin import lazy_pinyin, Style
import re
from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS
@ -57,6 +59,22 @@ def extract_number(text):
return None return None
def replace_arabic_with_chinese(text):
    """
    Convert every run of consecutive Arabic digits in ``text`` to Chinese numerals.

    Each digit run is handed to ``cn2an.an2cn`` in "low" mode (lowercase
    Chinese numerals). If the conversion raises, the error is printed and
    that digit run is left unchanged, so the call is best-effort.

    Example: 2024年25号 -> 二千零二十四年二十五号

    :param text: input string that may contain Arabic digits
    :return: the string with each digit run replaced by its Chinese form
    """
    def _digits_to_chinese(digit_match):
        digits = digit_match.group()
        try:
            chinese = cn2an.an2cn(digits, "low")
        except Exception as e:
            # Best-effort: report the failure and keep the original digits.
            print(f"转换失败,异常信息: {e}")
            return digits
        return chinese

    return re.sub(r'\d+', _digits_to_chinese, text)
def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, high_score=85, top_k=3): def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, high_score=85, top_k=3):
""" """
对输入字符串在候选池中执行模糊匹配并返回匹配程度高的映射原始值 对输入字符串在候选池中执行模糊匹配并返回匹配程度高的映射原始值
@ -117,7 +135,8 @@ def standardize_sub_company(input_name, simply_map, pinyin_map, lower_score=55,
:param high_score: 高置信匹配分数阈值 :param high_score: 高置信匹配分数阈值
:return: 匹配的标准公司名列表 :return: 匹配的标准公司名列表
""" """
return standardize_name(input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score) temp_input_name = replace_arabic_with_chinese(input_name)
return standardize_name(temp_input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score)
def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90): def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90):
@ -148,6 +167,7 @@ def standardize_projectDepartment(standard_company, input_project, company_proje
""" """
try: try:
# **2. 先尝试直接匹配最相似的项目名** # **2. 先尝试直接匹配最相似的项目名**
input_project = replace_arabic_with_chinese(input_project)
project_match = process.extractOne(input_project, company_project_department_map[standard_company], project_match = process.extractOne(input_project, company_project_department_map[standard_company],
scorer=cast(Callable, WRatio)) scorer=cast(Callable, WRatio))
print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True) print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True)
@ -271,8 +291,6 @@ class StandardType(Enum):
PROGRAM_CHECK = 1 PROGRAM_CHECK = 1
import re
# 构建一个用于替换的正则表达式 # 构建一个用于替换的正则表达式
useless_project_words_pattern = re.compile("|".join(USELESS_PROJECT_WORDS)) useless_project_words_pattern = re.compile("|".join(USELESS_PROJECT_WORDS))
useless_company_words_pattern = re.compile("|".join(USELESS_COMPANY_WORDS)) useless_company_words_pattern = re.compile("|".join(USELESS_COMPANY_WORDS))