update loader.py

2023-05-20 01:24:35 +08:00 · 2023-05-20 01:24:35 +08:00 · e8a37ff4c7
parent d5ffdaa281
commit e8a37ff4c7
2 changed files with 10 additions and 7 deletions
--- a/chains/local_doc_qa.py
+++ b/chains/local_doc_qa.py
@@ -26,6 +26,9 @@ def load_file(filepath, sentence_size=SENTENCE_SIZE):
    if filepath.lower().endswith(".md"):
        loader = UnstructuredFileLoader(filepath, mode="elements")
        docs = loader.load()
+    elif filepath.lower().endswith(".txt"):
+        loader = UnstructuredFileLoader(filepath, mode="elements")
+        docs = loader.load()
    elif filepath.lower().endswith(".pdf"):
        loader = UnstructuredPaddlePDFLoader(filepath)
        textsplitter = ChineseTextSplitter(pdf=True, sentence_size=sentence_size)
@@ -47,13 +50,13 @@ def write_check_file(filepath, docs):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    fp = os.path.join(folder_path, 'load_file.txt')
-    fout = open(fp, 'a')
-    fout.write("filepath=%s,len=%s" % (filepath, len(docs)))
-    fout.write('\n')
-    for i in docs:
-        fout.write(str(i))
+    with open(fp, 'a+', encoding='utf-8') as fout:
+        fout.write("filepath=%s,len=%s" % (filepath, len(docs)))
        fout.write('\n')
-    fout.close()
+        for i in docs:
+            fout.write(str(i))
+            fout.write('\n')
+        fout.close()


 def generate_prompt(related_docs: List[str], query: str,
--- a/loader/pdf_loader.py
+++ b/loader/pdf_loader.py
@@ -19,7 +19,7 @@ class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
            ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
            doc = fitz.open(filepath)
            txt_file_path = os.path.join(full_dir_path, "%s.txt" % (filename))
-            img_name = os.path.join(full_dir_path, '.tmp.png')
+            img_name = os.path.join(full_dir_path, 'tmp.png')
            with open(txt_file_path, 'w', encoding='utf-8') as fout:

                for i in range(doc.page_count):