"""Load RSS/Atom feed entries as langchain ``Document`` objects.

Third-party requirements: ``feedparser`` (listed in ``requirements.txt``),
``html2text`` and ``langchain``. They are imported lazily inside
``RSS_Url_loader.load`` so that importing this module and configuring a
loader does not require them to be installed.
"""
import contextlib
import ssl
import time
import warnings


@contextlib.contextmanager
def _https_verification(verify):
    """Temporarily disable HTTPS certificate verification unless *verify*.

    The override is process-wide while the ``with`` body runs (it swaps the
    default HTTPS context factory used by ``urllib``), and the previous
    factory is always restored on exit.
    """
    unverified = getattr(ssl, "_create_unverified_context", None)
    if verify or unverified is None:
        yield
        return
    previous = ssl._create_default_https_context
    ssl._create_default_https_context = unverified
    try:
        yield
    finally:
        ssl._create_default_https_context = previous


class RSS_Url_loader:
    """Fetch one or more RSS feeds and convert their entries to Documents."""

    def __init__(self, urls=None, interval=60, verify_ssl=False):
        """Create a loader.

        Args:
            urls: A single feed URL (``str``) or a ``list`` of feed URLs.
                Any other type is ignored with a warning and the loader
                starts with no URLs.
            interval: Seconds to wait between two polls in ``_poll``.
            verify_ssl: Whether HTTPS certificates are verified while
                fetching. Defaults to ``False`` for backward compatibility.
                NOTE(security): pass ``True`` unless a feed is known to
                serve a certificate that cannot be validated.
        """
        self.urls = []
        self.interval = interval
        self.verify_ssl = verify_ssl
        if urls is None:
            return
        if isinstance(urls, str):
            self.urls = [urls]
        elif isinstance(urls, list):
            self.urls = urls
        else:
            # Best effort: keep the loader usable (with no URLs) but make
            # the misuse visible instead of silently swallowing it.
            warnings.warn('urls must be a list or a string.', stacklevel=2)

    # Scheduling may later move to a dedicated class; not part of the
    # public API for now.
    def scheduled_execution(self):
        """Run one load cycle and return its documents.

        Returns:
            The list of Documents produced by the first poll. Periodic
            re-loading is available internally through ``_poll``.
        """
        return next(self._poll())

    def _poll(self):
        """Yield a fresh list of Documents, pausing ``interval`` s between polls."""
        while True:
            yield self.load()
            time.sleep(self.interval)

    @staticmethod
    def _entry_html(entry):
        """Return the raw HTML body of a feed entry, or ``""`` if it has none."""
        content = entry.get("content")
        if content:
            return content[0].get("value", "")
        return entry.get("description") or entry.get("summary") or ""

    def load(self):
        """Fetch every configured feed and return its entries as Documents.

        Returns:
            A list of ``Document`` objects whose ``page_content`` is the
            entry body converted from HTML to text by ``html2text`` and
            whose metadata holds the entry ``title`` and ``link`` (empty
            strings when the feed omits them).
        """
        # Lazy imports: only needed when feeds are actually fetched.
        import feedparser
        import html2text
        from langchain.docstore.document import Document

        with _https_verification(self.verify_ssl):
            feeds = [feedparser.parse(url) for url in self.urls]

        documents = []
        for parsed in feeds:
            for entry in parsed.entries:
                text = html2text.html2text(self._entry_html(entry))
                metadata = {
                    "title": entry.get("title", ""),
                    "link": entry.get("link", ""),
                }
                documents.append(Document(page_content=text, metadata=metadata))
        return documents


if __name__ == "__main__":
    # The feed URLs should eventually come from the configuration file or
    # from the user interface.
    urls = ["https://www.zhihu.com/rss",
            "https://www.36kr.com/feed"]
    loader = RSS_Url_loader(urls)
    docs = loader.load()
    for doc in docs:
        print(doc)