Langchain-Chatchat/loader/RSS_loader.py

54 lines
1.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import ssl
import time
import warnings

import feedparser
import html2text
from langchain.docstore.document import Document
class RSS_Url_loader:
    """Load the entries of RSS feeds as langchain ``Document`` objects.

    Every entry of every configured feed becomes one ``Document`` whose
    ``page_content`` is the entry body passed through ``html2text`` and
    whose metadata carries the entry's ``title`` and ``link``.
    """

    def __init__(self, urls=None, interval=60):
        """Store the feed urls and the polling interval.

        Args:
            urls: A single feed url as a string, a list of feed urls, or
                ``None`` for no feeds. A list is stored as-is (not copied).
            interval: Value stored on ``self.interval``; intended as the
                number of seconds between scheduled loads.

        Any other type for ``urls`` is rejected with a ``UserWarning`` and
        the loader is left with an empty url list (no exception is raised,
        so construction stays best-effort).
        """
        self.urls = []
        self.interval = interval
        if urls is None:
            return
        if isinstance(urls, str):
            urls = [urls]
        elif not isinstance(urls, list):
            # Actually emit the warning: merely instantiating ``Warning``
            # is a no-op and hides the misconfiguration from the caller.
            warnings.warn('urls must be a list or a string.', stacklevel=2)
            return
        self.urls = urls

    # The scheduling code still has to settle whether other classes should be
    # brought in, so it is not exposed externally for now.
    def scheduled_execution(self):
        """Run a single load and return its documents.

        Periodic polling every ``self.interval`` seconds is not implemented
        yet; the method performs exactly one load per call.

        Returns:
            The list of documents produced by ``load``.
        """
        return self.load()

    def load(self):
        """Fetch every configured feed and convert its entries to documents.

        Returns:
            A list with one ``Document`` per feed entry, in feed order. The
            list is empty when no urls are configured.
        """
        # SECURITY NOTE(review): this swaps the ``ssl`` module's default HTTPS
        # context factory for the unverified one, and the assignment is
        # module-global, so it is not limited to the feed requests made here.
        # Kept so feeds that currently load keep loading; consider making it
        # opt-in.
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context
        documents = []
        for url in self.urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                content = entry.get("content")
                if content:
                    data = content[0].value
                else:
                    # Not every entry carries a description/summary; fall
                    # back to empty text instead of raising AttributeError.
                    data = entry.get("description") or entry.get("summary") or ""
                data = html2text.html2text(data)
                # Title and link are optional in feeds, so read them leniently.
                metadata = {
                    "title": entry.get("title", ""),
                    "link": entry.get("link", ""),
                }
                documents.append(Document(page_content=data, metadata=metadata))
        return documents
if __name__ == "__main__":
    # The feed urls still need to come from the configuration file or from
    # the user interface; they are hard-coded here for a manual run.
    loader = RSS_Url_loader(["https://www.zhihu.com/rss", "https://www.36kr.com/feed"])
    for document in loader.load():
        print(document)