项目名和分公司名匹配前,先将阿拉伯数字替换为中文数字
This commit is contained in:
parent
f38aa8ed01
commit
e92205f8e7
24
api/utils.py
24
api/utils.py
|
|
@ -1,10 +1,12 @@
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import cast, Callable
|
from typing import cast, Callable
|
||||||
|
|
||||||
|
import cn2an
|
||||||
from rapidfuzz import process, fuzz
|
from rapidfuzz import process, fuzz
|
||||||
from rapidfuzz.fuzz import WRatio
|
from rapidfuzz.fuzz import WRatio
|
||||||
import json
|
import json
|
||||||
from pypinyin import lazy_pinyin, Style
|
from pypinyin import lazy_pinyin, Style
|
||||||
|
import re
|
||||||
|
|
||||||
from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS
|
from constants import USELESS_COMPANY_WORDS, USELESS_PROJECT_WORDS
|
||||||
|
|
||||||
|
|
@ -57,6 +59,22 @@ def extract_number(text):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def replace_arabic_with_chinese(text):
    """
    Convert every run of consecutive Arabic digits in ``text`` to Chinese numerals.

    Example: "2024年25号" -> "二千零二十四年二十五号"

    :param text: string to convert; ``None`` or an empty string is returned unchanged
    :return: ``text`` with each digit run replaced by its lowercase Chinese numeral form;
             a digit run that cannot be converted is left as-is
    """
    # re.sub raises TypeError on None. Names are passed straight through from
    # callers, so a missing or empty name is handed back untouched instead.
    if not text:
        return text

    def convert(match):
        num_str = match.group()
        try:
            return cn2an.an2cn(num_str, "low")  # "low" selects lowercase Chinese numerals
        except Exception as e:
            # Best-effort conversion: report the failure and keep the original digits.
            print(f"转换失败,异常信息: {e}")
            return num_str

    return re.sub(r'\d+', convert, text)
|
||||||
|
|
||||||
|
|
||||||
def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, high_score=85, top_k=3):
|
def fuzzy_match_and_filter(input_key, match_pool, mapping_dict, lower_score=70, high_score=85, top_k=3):
|
||||||
"""
|
"""
|
||||||
对输入字符串在候选池中执行模糊匹配,并返回匹配程度高的映射原始值。
|
对输入字符串在候选池中执行模糊匹配,并返回匹配程度高的映射原始值。
|
||||||
|
|
@ -117,7 +135,8 @@ def standardize_sub_company(input_name, simply_map, pinyin_map, lower_score=55,
|
||||||
:param high_score: 高置信匹配分数阈值
|
:param high_score: 高置信匹配分数阈值
|
||||||
:return: 匹配的标准公司名列表
|
:return: 匹配的标准公司名列表
|
||||||
"""
|
"""
|
||||||
return standardize_name(input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score)
|
temp_input_name = replace_arabic_with_chinese(input_name)
|
||||||
|
return standardize_name(temp_input_name, clean_useless_company_name, simply_map, pinyin_map, lower_score, high_score)
|
||||||
|
|
||||||
|
|
||||||
def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90):
|
def standardize_project_name(input_name, simply_map, pinyin_map, lower_score=70, high_score=90):
|
||||||
|
|
@ -148,6 +167,7 @@ def standardize_projectDepartment(standard_company, input_project, company_proje
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# **2. 先尝试直接匹配最相似的项目名**
|
# **2. 先尝试直接匹配最相似的项目名**
|
||||||
|
input_project = replace_arabic_with_chinese(input_project)
|
||||||
project_match = process.extractOne(input_project, company_project_department_map[standard_company],
|
project_match = process.extractOne(input_project, company_project_department_map[standard_company],
|
||||||
scorer=cast(Callable, WRatio))
|
scorer=cast(Callable, WRatio))
|
||||||
print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True)
|
print(f"项目部名称最相似:{project_match[0]},{project_match[1]}", flush=True)
|
||||||
|
|
@ -271,8 +291,6 @@ class StandardType(Enum):
|
||||||
PROGRAM_CHECK = 1
|
PROGRAM_CHECK = 1
|
||||||
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
# 构建一个用于替换的正则表达式
|
# 构建一个用于替换的正则表达式
|
||||||
useless_project_words_pattern = re.compile("|".join(USELESS_PROJECT_WORDS))
|
useless_project_words_pattern = re.compile("|".join(USELESS_PROJECT_WORDS))
|
||||||
useless_company_words_pattern = re.compile("|".join(USELESS_COMPANY_WORDS))
|
useless_company_words_pattern = re.compile("|".join(USELESS_COMPANY_WORDS))
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue