增加日志

This commit is contained in:
weiweiw 2025-01-14 11:15:54 +08:00
parent 464436dd20
commit 32202f3334
1 changed file with 10 additions and 9 deletions

View File

@@ -45,6 +45,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
**kwargs: Any,
) -> None:
"""Create a new TextSplitter."""
logger.info(f"Create a new ChineseRecursiveTextSplitter")
super().__init__(keep_separator=keep_separator, **kwargs)
self._separators = separators or [
First_SEPARATOE,
@@ -58,7 +59,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
def _split_text(self, text: str, separators: List[str]) -> List[str]:
"""Split incoming text and return chunks."""
#print(f"***********************************ChineseRecursiveTextSplitter***********************************")
logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************")
final_chunks = []
# Get appropriate separator to use
separator = separators[-1]
@@ -103,28 +104,28 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
_good_splits = []
_separator = "" if self._keep_separator else separator
for s in splits:
#print(f"***s:{s},len:{self._length_function(s)}")
logger.info(f"***s:{s},len:{self._length_function(s)}")
if self._length_function(s) < self._chunk_size:
_good_splits.append(s)
#print(f"***_good_splits.append(s):{s}")
logger.info(f"***_good_splits.append(s):{s}")
else:
if _good_splits:
#print(f"***_merge_splits(s):{s}")
logger.info(f"***_merge_splits(s):{s}")
merged_text = self._merge_splits(_good_splits, _separator)
#print(f"***after _merge_splits,merged_text:{merged_text}")
logger.info(f"***after _merge_splits,merged_text:{merged_text}")
final_chunks.extend(merged_text)
_good_splits = []
if not new_separators:
final_chunks.append(s)
#print(f"***final_chunks.append(s)")
logger.info(f"***final_chunks.append(s)")
else:
#print(f"***下一级_split_text(s)")
logger.info(f"***下一级_split_text(s)")
other_info = self._split_text(s, new_separators)
final_chunks.extend(other_info)
if _good_splits:
#print(f"***22_merge_splits(s):{s}")
logger.info(f"***22_merge_splits(s):{s}")
merged_text = self._merge_splits(_good_splits, _separator)
#print(f"***22after _merge_splits,merged_text:{merged_text}")
logger.info(f"***22after _merge_splits,merged_text:{merged_text}")
final_chunks.extend(merged_text)
final_chunks = [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]