添加RSS_loader.py用作从RSS订阅知识库
This commit is contained in:
parent
1c5f71beee
commit
8d4ef39c6c
|
|
@ -0,0 +1,54 @@
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
import feedparser
|
||||||
|
import html2text
|
||||||
|
import ssl
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
class RSS_Url_loader:
    """Load the entries of one or more RSS/Atom feeds as ``Document`` objects.

    Each feed entry becomes one ``Document`` whose ``page_content`` is the
    entry body converted from HTML to text by ``html2text`` and whose
    ``metadata`` carries the entry's ``title`` and ``link``.
    """

    def __init__(self, urls=None,interval=60):
        """Store the feed URLs and the polling interval.

        Args:
            urls: A single feed URL as a string, or a list of feed URLs.
                ``None`` leaves the loader with no URLs.
            interval: Number of seconds intended to separate scheduled
                loads. It is only stored here; nothing in this class
                sleeps on it at the moment.

        An argument of any other type is not fatal: a warning is issued and
        the loader is left with an empty URL list.
        """
        self.urls = []
        self.interval = interval
        if urls is None:
            return
        if isinstance(urls, str):
            # A bare string is treated as one URL, not as a sequence of chars.
            self.urls = [urls]
        elif isinstance(urls, list):
            self.urls = urls
        else:
            # Imported locally so the block does not depend on a new
            # file-level import.
            import warnings

            # The earlier code built a Warning instance without emitting it,
            # so invalid input passed unnoticed; actually emit it here.
            warnings.warn('urls must be a list or a string.', stacklevel=2)

    # The scheduling code may need to be moved into a separate class; it is
    # not exposed publicly for now.
    def scheduled_execution(self):
        """Run a single load and return its documents.

        NOTE(review): the previous implementation wrapped this in
        ``while True`` with a trailing ``time.sleep(self.interval)``, but it
        returned on the first iteration, so the sleep was never reached.
        The observable behaviour (one load, then return) is kept as is;
        periodic polling still has to be designed.
        """
        return self.load()

    def load(self):
        """Fetch every configured feed and return its entries as documents.

        Returns:
            A list of ``Document`` objects, one per feed entry, in feed
            order. The list is empty when no URLs are configured.
        """
        # NOTE(security): this swaps the process-wide default HTTPS context
        # for an unverified one, i.e. certificate verification is turned off
        # for later HTTPS requests that rely on that default, not just for
        # the feeds fetched here. Kept because existing callers may depend
        # on feeds with broken certificates loading; review before using
        # this in a security-sensitive process.
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context
        documents = []
        for url in self.urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                # Feed entries are not guaranteed to carry every field, so
                # read them tolerantly instead of letting one incomplete
                # entry abort the whole load.
                contents = entry.get("content")
                if contents:
                    data = contents[0].get("value") or ""
                else:
                    data = entry.get("description") or entry.get("summary") or ""
                data = html2text.html2text(data)
                metadata = {
                    "title": entry.get("title", ""),
                    "link": entry.get("link", ""),
                }
                documents.append(Document(page_content=data, metadata=metadata))
        return documents
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke run: the feed URLs still need to come from the
    # configuration file or from the user interface instead of this list.
    feed_urls = ["https://www.zhihu.com/rss", "https://www.36kr.com/feed"]
    for document in RSS_Url_loader(feed_urls).load():
        print(document)
|
||||||
Loading…
Reference in New Issue