diff --git a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py index ab66d99..6b6426d 100644 --- a/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py +++ b/libs/chatchat-server/chatchat/server/file_rag/text_splitter/chinese_recursive_text_splitter.py @@ -45,6 +45,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): **kwargs: Any, ) -> None: """Create a new TextSplitter.""" + logger.info(f"Create a new ChineseRecursiveTextSplitter") super().__init__(keep_separator=keep_separator, **kwargs) self._separators = separators or [ First_SEPARATOE, @@ -58,7 +59,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): def _split_text(self, text: str, separators: List[str]) -> List[str]: """Split incoming text and return chunks.""" - #print(f"***********************************ChineseRecursiveTextSplitter***********************************") + logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************") final_chunks = [] # Get appropriate separator to use separator = separators[-1] @@ -103,28 +104,28 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter): _good_splits = [] _separator = "" if self._keep_separator else separator for s in splits: - #print(f"***s:{s},len:{self._length_function(s)}") + logger.info(f"***s:{s},len:{self._length_function(s)}") if self._length_function(s) < self._chunk_size: _good_splits.append(s) - #print(f"***_good_splits.append(s):{s}") + logger.info(f"***_good_splits.append(s):{s}") else: if _good_splits: - #print(f"***_merge_splits(s):{s}") + logger.info(f"***_merge_splits(s):{s}") merged_text = self._merge_splits(_good_splits, _separator) - #print(f"***after _merge_splits,merged_text:{merged_text}") + logger.info(f"***after _merge_splits,merged_text:{merged_text}") final_chunks.extend(merged_text) _good_splits = [] if not new_separators: final_chunks.append(s) - #print(f"***final_chunks.append(s)") + logger.info(f"***final_chunks.append(s)") else: - #print(f"***下一级_split_text(s)") + logger.info(f"***下一级_split_text(s)") other_info = self._split_text(s, new_separators) final_chunks.extend(other_info) if _good_splits: - #print(f"***22_merge_splits(s):{s}") + logger.info(f"***22_merge_splits(s):{s}") merged_text = self._merge_splits(_good_splits, _separator) - #print(f"***22after _merge_splits,merged_text:{merged_text}") + logger.info(f"***22after _merge_splits,merged_text:{merged_text}") final_chunks.extend(merged_text) final_chunks = [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]