项目名和分公司名匹配前,先将阿拉伯数字替换为中文数字
This commit is contained in:
parent
f38aa8ed01
commit
e92205f8e7
24
api/utils.py
24
api/utils.py
|
|
@ -1,10 +1,12 @@
|
|||
from enum import Enum
|
||||
from typing import cast, Callable
|
||||
|
||||
import cn2an
|
||||
from rapidfuzz import process, fuzz
|
||||
from rapidfuzz.fuzz import WRatio
|
||||
import json
|
||||
from pypinyin import lazy_pinyin, Style
|
||||
import re
|
||||
|
||||
from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS
|
||||
|
||||
|
|
@ -57,6 +59,22 @@ def extract_number(text):
|
|||
return None
|
||||
|
||||
|
||||
def replace_arabic_with_chinese(text):
    """Rewrite every run of consecutive ASCII/Unicode digits in ``text`` as a Chinese numeral.

    Each digit run found by the regular expression below is handed to
    ``cn2an.an2cn`` in ``"low"`` mode (lowercase Chinese numerals, per the
    original author's note). Example from the original documentation:
    ``2024年25号 -> 二千零二十四年二十五号``.

    Conversion is best-effort: if ``cn2an.an2cn`` raises for a given run, the
    error is printed and that run is left in the output unchanged.

    :param text: the string to process
    :return: ``text`` with each digit run replaced by its converted form
    """
    def _to_chinese(digit_run):
        digits = digit_run.group()
        try:
            converted = cn2an.an2cn(digits, "low")
        except Exception as err:
            # Report the failure, then fall back to the untouched digits so
            # one bad run does not abort the whole substitution.
            print(f"转换失败,异常信息: {err}")
            converted = digits
        return converted

    digit_runs = re.compile(r'\d+')
    return digit_runs.sub(_to_chinese, text)
|
||||
|
||||
|
||||
def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, high_score=85, top_k=3):
|
||||
"""
|
||||
对输入字符串在候选池中执行模糊匹配,并返回匹配程度高的映射原始值。
|
||||
|
|
@ -117,7 +135,8 @@ def standardize_sub_company(input_name, simply_map, pinyin_map, lower_score=55,
|
|||
:param high_score: 高置信匹配分数阈值
|
||||
:return: 匹配的标准公司名列表
|
||||
"""
|
||||
return standardize_name(input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score)
|
||||
temp_input_name = replace_arabic_with_chinese(input_name)
|
||||
return standardize_name(temp_input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score)
|
||||
|
||||
|
||||
def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90):
|
||||
|
|
@ -148,6 +167,7 @@ def standardize_projectDepartment(standard_company, input_project, company_proje
|
|||
"""
|
||||
try:
|
||||
# **2. 先尝试直接匹配最相似的项目名**
|
||||
input_project = replace_arabic_with_chinese(input_project)
|
||||
project_match = process.extractOne(input_project, company_project_department_map[standard_company],
|
||||
scorer=cast(Callable, WRatio))
|
||||
print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True)
|
||||
|
|
@ -271,8 +291,6 @@ class StandardType(Enum):
|
|||
PROGRAM_CHECK = 1
|
||||
|
||||
|
||||
import re
|
||||
|
||||
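# Build the regular expressions used for replacement.
# NOTE(review): the words are joined with "|" without escaping — confirm that
# none of them contain regex metacharacters.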
|
||||
useless_project_words_pattern = re.compile("|".join(USELESS_PROJECT_WORDS))
|
||||
useless_company_words_pattern = re.compile("|".join(USELESS_COMPANY_WORDS))
|
||||
|
|
|
|||
Loading…
Reference in New Issue