# Python source file (listed by the original viewer as 54 lines, 1.9 KiB).
import ssl
import time
import warnings

import feedparser
import html2text
from langchain.docstore.document import Document
class RSS_Url_loader:
    """Load the entries of one or more RSS/Atom feeds as ``Document`` objects.

    Every feed entry is converted from HTML to text with ``html2text`` and
    wrapped in a ``Document`` whose metadata carries the entry title and link.
    """

    def __init__(self, urls=None, interval=60):
        """Store the feed URLs and the polling interval.

        Args:
            urls: A single feed URL string, a list of feed URLs, or ``None``
                for no feeds. Any other type is ignored with a warning and
                the loader is left with an empty URL list.
            interval: Number of seconds meant to separate two scheduled
                loads; only stored on the instance.
        """
        self.urls = []
        self.interval = interval
        if urls is None:
            return
        if isinstance(urls, str):
            self.urls = [urls]
        elif isinstance(urls, list):
            self.urls = urls
        else:
            # Best effort: keep the empty list, but actually emit the warning
            # instead of building a Warning object and discarding it.
            warnings.warn('urls must be a list or a string.', stacklevel=2)

    # The scheduling code may need to live in a separate class; it is not
    # exposed publicly for now.
    def scheduled_execution(self):
        """Run one load and return its documents.

        Periodic polling is not implemented: a caller that wants it has to
        call this method repeatedly and wait ``self.interval`` seconds
        between calls.

        Returns:
            The list returned by :meth:`load`.
        """
        return self.load()

    def load(self):
        """Fetch every feed in ``self.urls`` and convert its entries.

        The text comes from the first ``content`` item of an entry when it
        has one, otherwise from its description/summary. Missing text, title
        or link fields become empty strings instead of raising.

        Returns:
            A list with one ``Document`` per feed entry, in feed order.
        """
        # NOTE(security): this swaps the process-wide default HTTPS context
        # for an unverified one, so certificate checks are disabled for every
        # later request that relies on that default, not only for these feeds.
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context

        documents = []
        for url in self.urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                contents = entry.get("content")
                if contents:
                    data = contents[0].value
                else:
                    data = entry.get("description") or entry.get("summary") or ""
                data = html2text.html2text(data)
                metadata = {
                    "title": entry.get("title", ""),
                    "link": entry.get("link", ""),
                }
                documents.append(Document(page_content=data, metadata=metadata))
        return documents
if __name__ == "__main__":
    # The feed URLs should eventually come from the configuration file or be
    # entered through the user interface.
    feed_urls = ["https://www.zhihu.com/rss", "https://www.36kr.com/feed"]
    for document in RSS_Url_loader(feed_urls).load():
        print(document)