From 32202f3334c8f941d0575c6539bdc308b1888968 Mon Sep 17 00:00:00 2001 From: weiweiw <14335254+weiweiw22@user.noreply.gitee.com> Date: Tue, 14 Jan 2025 11:15:54 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=97=A5=E5=BF=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../chinese_recursive_text_splitter.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py index ab66d99..6b6426d 100644 --- a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py +++ b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py @@ -45,6 +45,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): **kwargs: Any, ) -> None: """Create a new TextSplitter.""" + logger.info(f"Create a new ChineseRecursiveTextSplitter") super().__init__(keep_separator=keep_separator, **kwargs) self._separators = separators or [ First_SEPARATOE, @@ -58,7 +59,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): def _split_text(self, text: str, separators: List[str]) -> List[str]: """Split incoming text and return chunks.""" - #print(f"***********************************ChineseRecursiveTextSplitter***********************************") + logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************") final_chunks = [] # Get appropriate separator to use separator = separators[-1] @@ -103,28 +104,28 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): _good_splits = [] _separator = "" if self._keep_separator else separator for s in splits: - #print(f"***s:{s},len:{self._length_function(s)}") + logger.info(f"***s:{s},len:{self._length_function(s)}") if self._length_function(s) < self._chunk_size: _good_splits.append(s) - #print(f"***_good_splits.append(s):{s}") + logger.info(f"***_good_splits.append(s):{s}") else: if _good_splits: - #print(f"***_merge_splits(s):{s}") + logger.info(f"***_merge_splits(s):{s}") merged_text = self._merge_splits(_good_splits, _separator) - #print(f"***after _merge_splits,merged_text:{merged_text}") + logger.info(f"***after _merge_splits,merged_text:{merged_text}") final_chunks.extend(merged_text) _good_splits = [] if not new_separators: final_chunks.append(s) - #print(f"***final_chunks.append(s)") + logger.info(f"***final_chunks.append(s)") else: - #print(f"***下一级_split_text(s)") + logger.info(f"***下一级_split_text(s)") other_info = self._split_text(s, new_separators) final_chunks.extend(other_info) if _good_splits: - #print(f"***22_merge_splits(s):{s}") + logger.info(f"***22_merge_splits(s):{s}") merged_text = self._merge_splits(_good_splits, _separator) - #print(f"***22after _merge_splits,merged_text:{merged_text}") + logger.info(f"***22after _merge_splits,merged_text:{merged_text}") final_chunks.extend(merged_text) final_chunks = [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]