增加日志

2025-01-14 13:48:27 +08:00 · 2025-01-14 13:48:27 +08:00 · 4f579027c6
parent ae7e658c1a
commit 4f579027c6
2 changed files with 19 additions and 1 deletion
--- a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py
+++ b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py
@@ -59,7 +59,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split incoming text and return chunks."""
-        logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************")
+        logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************,文字个数：{len(text)}")
        final_chunks = []
        # Get appropriate separator to use
        separator = separators[-1]
@@ -145,6 +145,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
        if temp_sencond !="":
            return_chunks.append(temp_sencond)

+        self.is_recursive = False
        return return_chunks

 if __name__ == "__main__":
--- a/libs/chatchat-server/chatchat/server/knowledge_base/utils.py
+++ b/libs/chatchat-server/chatchat/server/knowledge_base/utils.py
@@ -386,6 +386,8 @@ class KnowledgeFile:
            if doc.page_content.strip() != "":
                doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())

+        file_name_without_extension, file_extension = os.path.splitext(self.filepath)
+
        if not docs:
            return []
        if self.ext not in [".csv"]:
@@ -402,6 +404,11 @@ class KnowledgeFile:
            if self.text_splitter_name == "MarkdownHeaderTextSplitter":
                docs = text_splitter.split_text(docs[0].page_content)
            else:
+                # print(f"**********************docs2texts: text_splitter.split_documents(docs)")
+                outputfile = file_name_without_extension + "_source.txt"
+                with open(outputfile, 'w') as file:
+                    for doc in docs:
+                        file.write(doc.page_content)
                docs = text_splitter.split_documents(docs)

        if not docs:
@@ -415,6 +422,16 @@ class KnowledgeFile:
            docs = zh_first_title_enhance(docs)
            docs = customize_zh_title_enhance(docs)

+            i = 1
+            outputfile = file_name_without_extension + "_split.txt"
+            # 打开文件以写入模式
+            with open(outputfile, 'w') as file:
+                for doc in docs:
+                    #print(f"**********切分段{i}：{doc}")
+                    file.write(f"\n**********切分段{i}")
+                    file.write(doc.page_content)
+                    i = i+1
+
        self.splited_docs = docs
        return self.splited_docs