增加日志:在 ChineseRecursiveTextSplitter 的日志中记录待切分文本的字数,并将切分前的原文写入 *_source.txt、切分后的各段写入 *_split.txt 以便排查

This commit is contained in:
weiweiw 2025-01-14 13:48:27 +08:00
parent ae7e658c1a
commit 4f579027c6
2 changed files with 19 additions and 1 deletion

View File

@ -59,7 +59,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
def _split_text(self, text: str, separators: List[str]) -> List[str]:
"""Split incoming text and return chunks."""
logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************")
logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************,文字个数:{len(text)}")
final_chunks = []
# Get appropriate separator to use
separator = separators[-1]
@ -145,6 +145,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
if temp_sencond !="":
return_chunks.append(temp_sencond)
self.is_recursive = False
return return_chunks
if __name__ == "__main__":

View File

@ -386,6 +386,8 @@ class KnowledgeFile:
if doc.page_content.strip() != "":
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
file_name_without_extension, file_extension = os.path.splitext(self.filepath)
if not docs:
return []
if self.ext not in [".csv"]:
@ -402,6 +404,11 @@ class KnowledgeFile:
if self.text_splitter_name == "MarkdownHeaderTextSplitter":
docs = text_splitter.split_text(docs[0].page_content)
else:
# print(f"**********************docs2texts: text_splitter.split_documents(docs)")
outputfile = file_name_without_extension + "_source.txt"
with open(outputfile, 'w') as file:
for doc in docs:
file.write(doc.page_content)
docs = text_splitter.split_documents(docs)
if not docs:
@ -415,6 +422,16 @@ class KnowledgeFile:
docs = zh_first_title_enhance(docs)
docs = customize_zh_title_enhance(docs)
i = 1
outputfile = file_name_without_extension + "_split.txt"
# 打开文件以写入模式
with open(outputfile, 'w') as file:
for doc in docs:
#print(f"**********切分段{i}{doc}")
file.write(f"\n**********切分段{i}")
file.write(doc.page_content)
i = i+1
self.splited_docs = docs
return self.splited_docs