增加日志
This commit is contained in:
parent
464436dd20
commit
32202f3334
|
|
@ -45,6 +45,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Create a new TextSplitter."""
|
"""Create a new TextSplitter."""
|
||||||
|
logger.info(f"Create a new ChineseRecursiveTextSplitter")
|
||||||
super().__init__(keep_separator=keep_separator, **kwargs)
|
super().__init__(keep_separator=keep_separator, **kwargs)
|
||||||
self._separators = separators or [
|
self._separators = separators or [
|
||||||
First_SEPARATOE,
|
First_SEPARATOE,
|
||||||
|
|
@ -58,7 +59,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
||||||
|
|
||||||
def _split_text(self, text: str, separators: List[str]) -> List[str]:
|
def _split_text(self, text: str, separators: List[str]) -> List[str]:
|
||||||
"""Split incoming text and return chunks."""
|
"""Split incoming text and return chunks."""
|
||||||
#print(f"***********************************ChineseRecursiveTextSplitter***********************************")
|
logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************")
|
||||||
final_chunks = []
|
final_chunks = []
|
||||||
# Get appropriate separator to use
|
# Get appropriate separator to use
|
||||||
separator = separators[-1]
|
separator = separators[-1]
|
||||||
|
|
@ -103,28 +104,28 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
||||||
_good_splits = []
|
_good_splits = []
|
||||||
_separator = "" if self._keep_separator else separator
|
_separator = "" if self._keep_separator else separator
|
||||||
for s in splits:
|
for s in splits:
|
||||||
#print(f"***s:{s},len:{self._length_function(s)}")
|
logger.info(f"***s:{s},len:{self._length_function(s)}")
|
||||||
if self._length_function(s) < self._chunk_size:
|
if self._length_function(s) < self._chunk_size:
|
||||||
_good_splits.append(s)
|
_good_splits.append(s)
|
||||||
#print(f"***_good_splits.append(s):{s}")
|
logger.info(f"***_good_splits.append(s):{s}")
|
||||||
else:
|
else:
|
||||||
if _good_splits:
|
if _good_splits:
|
||||||
#print(f"***_merge_splits(s):{s}")
|
logger.info(f"***_merge_splits(s):{s}")
|
||||||
merged_text = self._merge_splits(_good_splits, _separator)
|
merged_text = self._merge_splits(_good_splits, _separator)
|
||||||
#print(f"***after _merge_splits,merged_text:{merged_text}")
|
logger.info(f"***after _merge_splits,merged_text:{merged_text}")
|
||||||
final_chunks.extend(merged_text)
|
final_chunks.extend(merged_text)
|
||||||
_good_splits = []
|
_good_splits = []
|
||||||
if not new_separators:
|
if not new_separators:
|
||||||
final_chunks.append(s)
|
final_chunks.append(s)
|
||||||
#print(f"***final_chunks.append(s)")
|
logger.info(f"***final_chunks.append(s)")
|
||||||
else:
|
else:
|
||||||
#print(f"***下一级_split_text(s)")
|
logger.info(f"***下一级_split_text(s)")
|
||||||
other_info = self._split_text(s, new_separators)
|
other_info = self._split_text(s, new_separators)
|
||||||
final_chunks.extend(other_info)
|
final_chunks.extend(other_info)
|
||||||
if _good_splits:
|
if _good_splits:
|
||||||
#print(f"***22_merge_splits(s):{s}")
|
logger.info(f"***22_merge_splits(s):{s}")
|
||||||
merged_text = self._merge_splits(_good_splits, _separator)
|
merged_text = self._merge_splits(_good_splits, _separator)
|
||||||
#print(f"***22after _merge_splits,merged_text:{merged_text}")
|
logger.info(f"***22after _merge_splits,merged_text:{merged_text}")
|
||||||
final_chunks.extend(merged_text)
|
final_chunks.extend(merged_text)
|
||||||
|
|
||||||
final_chunks = [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
|
final_chunks = [re.sub(r"\n{2,}", "\n", chunk.strip()) for chunk in final_chunks if chunk.strip()!=""]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue