修复:知识库json文件的中文被转为unicode码,导致无法匹配 (#2128)

This commit is contained in:
liunux4odoo 2023-11-21 21:00:46 +08:00 committed by GitHub
parent a9cf191532
commit 3cf7422c21
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 11 additions and 1 deletions

View File

@ -17,7 +17,7 @@ from langchain.docstore.document import Document
from langchain.text_splitter import TextSplitter
from pathlib import Path
from server.utils import run_in_thread_pool, get_model_worker_config
import io
import json
from typing import List, Union,Dict, Tuple, Generator
import chardet
@ -101,6 +101,16 @@ LOADER_DICT = {"UnstructuredHTMLLoader": ['.html'],
# Flat list of every file extension listed in LOADER_DICT's values,
# in the mapping's iteration order.
SUPPORTED_EXTS = [
    suffix
    for suffixes in LOADER_DICT.values()
    for suffix in suffixes
]
# Patch json.dumps process-wide so that non-ASCII text (e.g. Chinese) is
# written verbatim instead of as \uXXXX escapes.
def _new_json_dumps(obj, **kwargs):
    """Serialize *obj* like ``json.dumps`` but always with ``ensure_ascii=False``.

    Any ``ensure_ascii`` value supplied by the caller is overridden; all other
    keyword arguments are forwarded unchanged to the original ``json.dumps``.
    """
    kwargs["ensure_ascii"] = False
    return _origin_json_dumps(obj, **kwargs)


# Marker used to recognise a wrapper installed by an earlier execution of this
# module body. A plain identity check against ``_new_json_dumps`` cannot work:
# the ``def`` above creates a new function object on every execution.
_new_json_dumps._ensure_ascii_patched = True

if getattr(json.dumps, "_ensure_ascii_patched", False):
    # Already patched (module reloaded or executed a second time): recover the
    # pristine function from the installed wrapper. Wrapping the wrapper would
    # make it resolve ``_origin_json_dumps`` to itself and recurse forever.
    _origin_json_dumps = json.dumps._origin_json_dumps
else:
    _origin_json_dumps = json.dumps

# Keep the pristine function reachable from the wrapper for later executions.
_new_json_dumps._origin_json_dumps = _origin_json_dumps
json.dumps = _new_json_dumps
class JSONLinesLoader(langchain.document_loaders.JSONLoader):
'''
行式 Json 加载器要求文件扩展名为 .jsonl