增加日志
This commit is contained in:
parent
464436dd20
commit
32202f3334
|
|
@ -45,6 +45,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Create a new TextSplitter."""
|
||||
logger.info(f"Create a new ChineseRecursiveTextSplitter")
|
||||
super().__init__(keep_separator=keep_separator, **kwargs)
|
||||
self._separators = separators or [
|
||||
First_SEPARATOE,
|
||||
|
|
@ -58,7 +59,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
|
||||
def _split_text(self, text: str, separators: List[str]) -> List[str]:
|
||||
"""Split incoming text and return chunks."""
|
||||
#print(f"***********************************ChineseRecursiveTextSplitter***********************************")
|
||||
logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************")
|
||||
final_chunks = []
|
||||
# Get appropriate separator to use
|
||||
separator = separators[-1]
|
||||
|
|
@ -103,28 +104,28 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
_good_splits = []
|
||||
_separator = "" if self._keep_separator else separator
|
||||
for s in splits:
|
||||
#print(f"***s:{s},len:{self._length_function(s)}")
|
||||
logger.info(f"***s:{s},len:{self._length_function(s)}")
|
||||
if self._length_function(s) < self._chunk_size:
|
||||
_good_splits.append(s)
|
||||
#print(f"***_good_splits.append(s):{s}")
|
||||
logger.info(f"***_good_splits.append(s):{s}")
|
||||
else:
|
||||
if _good_splits:
|
||||
#print(f"***_merge_splits(s):{s}")
|
||||
logger.info(f"***_merge_splits(s):{s}")
|
||||
merged_text = self._merge_splits(_good_splits, _separator)
|
||||
#print(f"***after _merge_splits,merged_text:{merged_text}")
|
||||
logger.info(f"***after _merge_splits,merged_text:{merged_text}")
|
||||
final_chunks.extend(merged_text)
|
||||
_good_splits = []
|
||||
if not new_separators:
|
||||
final_chunks.append(s)
|
||||
#print(f"***final_chunks.append(s)")
|
||||
logger.info(f"***final_chunks.append(s)")
|
||||
else:
|
||||
#print(f"***下一级_split_text(s)")
|
||||
logger.info(f"***下一级_split_text(s)")
|
||||
other_info = self._split_text(s, new_separators)
|
||||
final_chunks.extend(other_info)
|
||||
if _good_splits:
|
||||
#print(f"***22_merge_splits(s):{s}")
|
||||
logger.info(f"***22_merge_splits(s):{s}")
|
||||
merged_text = self._merge_splits(_good_splits, _separator)
|
||||
#print(f"***22after _merge_splits,merged_text:{merged_text}")
|
||||
logger.info(f"***22after _merge_splits,merged_text:{merged_text}")
|
||||
final_chunks.extend(merged_text)
|
||||
|
||||
final_chunks = [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
|
||||
|
|
|
|||
Loading…
Reference in New Issue