# Provenance: reconstructed from a git format-patch e-mail whose line breaks
# had been collapsed.
#   Commit:  8d4ef39c6ccaa7d5924469063a7aeac3b74a385d
#   Author:  keenzhu
#   Date:    Fri, 26 May 2023 18:43:55 +0800
#   Subject: Add RSS_loader.py, used to subscribe a knowledge base to RSS
#   Path:    loader/RSS_loader.py (new file, blob 89ff37e)
"""Load the entries of RSS/Atom feeds as langchain ``Document`` objects."""
import logging
import ssl
import time
import warnings
from typing import Callable, List, Optional, Sequence, Union

import feedparser
import html2text
from langchain.docstore.document import Document

logger = logging.getLogger(__name__)


class RSS_Url_loader:
    """Fetch a set of feeds and convert every entry into a ``Document``.

    Args:
        urls: A single feed URL (``str``) or a list/tuple of feed URLs.
            ``None`` (the default) creates a loader without feeds. Any other
            type emits a ``UserWarning`` and also leaves the loader without
            feeds, so construction never fails.
        interval: Seconds to sleep between two polls in
            ``scheduled_execution``.
        verify_ssl: When ``False`` (the default, which keeps the historical
            behaviour) HTTPS certificates are NOT verified while ``load`` is
            fetching. This is insecure; pass ``True`` unless a feed is known
            to present a broken certificate.
    """

    def __init__(
        self,
        urls: Optional[Union[str, Sequence[str]]] = None,
        interval: float = 60,
        verify_ssl: bool = False,
    ) -> None:
        self.urls: List[str] = []
        self.interval = interval
        self.verify_ssl = verify_ssl
        if urls is None:
            return
        if isinstance(urls, str):
            # A bare string is one URL, not a sequence of characters.
            self.urls = [urls]
        elif isinstance(urls, (list, tuple)):
            self.urls = list(urls)
        else:
            # Best effort: tell the caller, but keep an (empty) usable loader.
            warnings.warn('urls must be a list or a string.', stacklevel=2)

    # NOTE: the scheduling design is still open (it may move into a dedicated
    # class), so this method is not exposed to users yet.
    def scheduled_execution(
        self,
        callback: Optional[Callable[[List[Document]], None]] = None,
        max_runs: Optional[int] = None,
    ) -> List[Document]:
        """Load the feeds once, or repeatedly when a callback is supplied.

        Args:
            callback: Called with the freshly loaded documents after every
                poll. When omitted, the feeds are loaded exactly once and the
                result is returned immediately.
            max_runs: Maximum number of polls when ``callback`` is given;
                ``None`` polls forever.

        Returns:
            The documents produced by the most recent poll.
        """
        if callback is None:
            return self.load()

        docs: List[Document] = []
        runs = 0
        while max_runs is None or runs < max_runs:
            if runs:
                # Sleep between polls only, never before the first one.
                time.sleep(self.interval)
            docs = self.load()
            callback(docs)
            runs += 1
        return docs

    def load(self) -> List[Document]:
        """Fetch every configured feed and return one ``Document`` per entry.

        ``page_content`` is the entry body converted from HTML by html2text;
        ``metadata`` carries the entry's ``title`` and ``link`` (empty strings
        when the feed omits them).

        Returns:
            All documents of all feeds, in feed order then entry order.
        """
        # SECURITY: swapping the default HTTPS context factory turns off
        # certificate verification. It is a process-wide setting, so it is
        # restored afterwards instead of being left disabled for good. Other
        # threads opening HTTPS connections during this call are still
        # affected.
        unverified = (
            not self.verify_ssl and hasattr(ssl, '_create_unverified_context')
        )
        if unverified:
            saved_factory = ssl._create_default_https_context
            ssl._create_default_https_context = ssl._create_unverified_context
        documents: List[Document] = []
        try:
            for url in self.urls:
                documents.extend(self._load_feed(url))
        finally:
            if unverified:
                ssl._create_default_https_context = saved_factory
        return documents

    def _load_feed(self, url: str) -> List[Document]:
        """Parse one feed URL and convert each of its entries."""
        parsed = feedparser.parse(url)
        entries = parsed.get("entries", [])
        if parsed.get("bozo") and not entries:
            # feedparser reports fetch/parse failures through the bozo flag.
            logger.warning(
                "Could not read feed %s: %s", url, parsed.get("bozo_exception")
            )
        return [self._entry_to_document(entry) for entry in entries]

    @staticmethod
    def _entry_to_document(entry) -> Document:
        """Build a ``Document`` from one feed entry, tolerating missing fields."""
        contents = entry.get("content")
        if contents:
            data = contents[0].get("value")
        else:
            data = entry.get("description") or entry.get("summary")
        text = html2text.html2text(data or "")
        metadata = {
            "title": entry.get("title", ""),
            "link": entry.get("link", ""),
        }
        return Document(page_content=text, metadata=metadata)


if __name__ == "__main__":
    # TODO: read the feed URLs from the configuration file or from the UI
    # instead of hard-coding them here.
    urls = ["https://www.zhihu.com/rss", "https://www.36kr.com/feed"]
    loader = RSS_Url_loader(urls)
    docs = loader.load()
    for doc in docs:
        print(doc)