增加日志:记录待切分文本的字数,并将切分前的原文与切分后的分段分别写入 *_source.txt 和 *_split.txt 以便调试
This commit is contained in:
parent
ae7e658c1a
commit
4f579027c6
|
|
@ -59,7 +59,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
|
||||
def _split_text(self, text: str, separators: List[str]) -> List[str]:
|
||||
"""Split incoming text and return chunks."""
|
||||
logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************")
|
||||
logger.info(f"***********************************ChineseRecursiveTextSplitter***********************************,文字个数:{len(text)}")
|
||||
final_chunks = []
|
||||
# Get appropriate separator to use
|
||||
separator = separators[-1]
|
||||
|
|
@ -145,6 +145,7 @@ class ChineseRecursiveTextSplitter(RecursiveCharacterTextSplitter):
|
|||
if temp_sencond !="":
|
||||
return_chunks.append(temp_sencond)
|
||||
|
||||
self.is_recursive = False
|
||||
return return_chunks
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -386,6 +386,8 @@ class KnowledgeFile:
|
|||
if doc.page_content.strip() != "":
|
||||
doc.page_content = re.sub(r"\n{2,}", "\n", doc.page_content.strip())
|
||||
|
||||
file_name_without_extension, file_extension = os.path.splitext(self.filepath)
|
||||
|
||||
if not docs:
|
||||
return []
|
||||
if self.ext not in [".csv"]:
|
||||
|
|
@ -402,6 +404,11 @@ class KnowledgeFile:
|
|||
if self.text_splitter_name == "MarkdownHeaderTextSplitter":
|
||||
docs = text_splitter.split_text(docs[0].page_content)
|
||||
else:
|
||||
# print(f"**********************docs2texts: text_splitter.split_documents(docs)")
|
||||
outputfile = file_name_without_extension + "_source.txt"
|
||||
with open(outputfile, 'w') as file:
|
||||
for doc in docs:
|
||||
file.write(doc.page_content)
|
||||
docs = text_splitter.split_documents(docs)
|
||||
|
||||
if not docs:
|
||||
|
|
@ -415,6 +422,16 @@ class KnowledgeFile:
|
|||
docs = zh_first_title_enhance(docs)
|
||||
docs = customize_zh_title_enhance(docs)
|
||||
|
||||
i = 1
|
||||
outputfile = file_name_without_extension + "_split.txt"
|
||||
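# Open the dump file in write mode, replacing any previous dump. NOTE(review): no explicit encoding is passed although the text written includes Chinese; consider encoding="utf-8" — confirm the target platform's default.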
|
||||
with open(outputfile, 'w') as file:
|
||||
for doc in docs:
|
||||
#print(f"**********切分段{i}:{doc}")
|
||||
file.write(f"\n**********切分段{i}")
|
||||
file.write(doc.page_content)
|
||||
i = i+1
|
||||
|
||||
self.splited_docs = docs
|
||||
return self.splited_docs
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue