diff --git a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py index 6b6426d..9ddd525 100644 --- a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py +++ b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py @@ -59,7 +59,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): def _split_text(self, text: str, separators: List[str]) -> List[str]: """Split incoming text and return chunks.""" - logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************") + logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************,文字个数:{len(text)}") final_chunks = [] # Get appropriate separator to use separator = separators[-1] @@ -145,6 +145,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): if temp_sencond !="": return_chunks.append(temp_sencond) + self.is_recursive = False return return_chunks if __name__ == "__main__": diff --git a/libs/chatchat-server/chatchat/server/knowledge_base/utils.py b/libs/chatchat-server/chatchat/server/knowledge_base/utils.py index 2216295..8ee6603 100644 --- a/libs/chatchat-server/chatchat/server/knowledge_base/utils.py +++ b/libs/chatchat-server/chatchat/server/knowledge_base/utils.py @@ -386,6 +386,8 @@ class KnowledgeFile: if doc.page_content.strip() != "": doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip()) + file_name_without_extension, file_extension = os.path.splitext(self.filepath) + if not docs: return [] if self.ext not in [".csv"]: @@ -402,6 +404,11 @@ class KnowledgeFile: if self.text_splitter_name == "MarkdownHeaderTextSplitter": docs = text_splitter.split_text(docs[0].page_content) else: + # print(f"**********************docs2texts: text_splitter.split_documents(docs)") + outputfile = file_name_without_extension + "_source.txt" + with open(outputfile, 'w') as file: + for doc in docs: + file.write(doc.page_content) docs = text_splitter.split_documents(docs) if not docs: @@ -415,6 +422,16 @@ class KnowledgeFile: docs = zh_first_title_enhance(docs) docs = customize_zh_title_enhance(docs) + i = 1 + outputfile = file_name_without_extension + "_split.txt" + # 打开文件以写入模式 + with open(outputfile, 'w') as file: + for doc in docs: + #print(f"**********切分段{i}:{doc}") + file.write(f"\n**********切分段{i}") + file.write(doc.page_content) + i = i+1 + self.splited_docs = docs return self.splited_docs