增加日志
This commit is contained in:
parent
ae7e658c1a
commit
4f579027c6
|
|
@ -59,7 +59,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
||||||
|
|
||||||
def _split_text(self, text: str, separators: List[str]) -> List[str]:
|
def _split_text(self, text: str, separators: List[str]) -> List[str]:
|
||||||
"""Split incoming text and return chunks."""
|
"""Split incoming text and return chunks."""
|
||||||
logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************")
|
logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************,文字个数:{len(text)}")
|
||||||
final_chunks = []
|
final_chunks = []
|
||||||
# Get appropriate separator to use
|
# Get appropriate separator to use
|
||||||
separator = separators[-1]
|
separator = separators[-1]
|
||||||
|
|
@ -145,6 +145,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
||||||
if temp_sencond !="":
|
if temp_sencond !="":
|
||||||
return_chunks.append(temp_sencond)
|
return_chunks.append(temp_sencond)
|
||||||
|
|
||||||
|
self.is_recursive = False
|
||||||
return return_chunks
|
return return_chunks
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -386,6 +386,8 @@ class KnowledgeFile:
|
||||||
if doc.page_content.strip() != "":
|
if doc.page_content.strip() != "":
|
||||||
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
|
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
|
||||||
|
|
||||||
|
file_name_without_extension, file_extension = os.path.splitext(self.filepath)
|
||||||
|
|
||||||
if not docs:
|
if not docs:
|
||||||
return []
|
return []
|
||||||
if self.ext not in [".csv"]:
|
if self.ext not in [".csv"]:
|
||||||
|
|
@ -402,6 +404,11 @@ class KnowledgeFile:
|
||||||
if self.text_splitter_name == "MarkdownHeaderTextSplitter":
|
if self.text_splitter_name == "MarkdownHeaderTextSplitter":
|
||||||
docs = text_splitter.split_text(docs[0].page_content)
|
docs = text_splitter.split_text(docs[0].page_content)
|
||||||
else:
|
else:
|
||||||
|
# print(f"**********************docs2texts: text_splitter.split_documents(docs)")
|
||||||
|
outputfile = file_name_without_extension + "_source.txt"
|
||||||
|
with open(outputfile, 'w') as file:
|
||||||
|
for doc in docs:
|
||||||
|
file.write(doc.page_content)
|
||||||
docs = text_splitter.split_documents(docs)
|
docs = text_splitter.split_documents(docs)
|
||||||
|
|
||||||
if not docs:
|
if not docs:
|
||||||
|
|
@ -415,6 +422,16 @@ class KnowledgeFile:
|
||||||
docs = zh_first_title_enhance(docs)
|
docs = zh_first_title_enhance(docs)
|
||||||
docs = customize_zh_title_enhance(docs)
|
docs = customize_zh_title_enhance(docs)
|
||||||
|
|
||||||
|
i = 1
|
||||||
|
outputfile = file_name_without_extension + "_split.txt"
|
||||||
|
# 打开文件以写入模式
|
||||||
|
with open(outputfile, 'w') as file:
|
||||||
|
for doc in docs:
|
||||||
|
#print(f"**********切分段{i}:{doc}")
|
||||||
|
file.write(f"\n**********切分段{i}")
|
||||||
|
file.write(doc.page_content)
|
||||||
|
i = i+1
|
||||||
|
|
||||||
self.splited_docs = docs
|
self.splited_docs = docs
|
||||||
return self.splited_docs
|
return self.splited_docs
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue