Merge remote-tracking branch 'origin/dev' into dev
This commit is contained in:
commit
561c40afee
|
|
@ -0,0 +1,54 @@
|
|||
from langchain.docstore.document import Document
|
||||
import feedparser
|
||||
import html2text
|
||||
import ssl
|
||||
import time
|
||||
|
||||
|
||||
class RSS_Url_loader:
    """Load RSS/Atom feed entries as langchain ``Document`` objects.

    Each feed entry becomes one ``Document`` whose ``page_content`` is the
    entry body converted from HTML to plain text with ``html2text`` and whose
    ``metadata`` carries the entry's ``title`` and ``link``.
    """

    def __init__(self, urls=None, interval=60):
        """Create a loader.

        Args:
            urls: A single feed URL string, or a list (or tuple) of feed URL
                strings. ``None`` means "no feeds".
            interval: Stored on ``self.interval``; it is not consumed by any
                method of this class at the moment.

        An argument of any other type is not fatal: a warning is emitted and
        the loader starts with an empty URL list.
        """
        self.urls = []
        self.interval = interval
        if urls is None:
            return
        if isinstance(urls, str):
            self.urls = [urls]
        elif isinstance(urls, list):
            self.urls = urls
        elif isinstance(urls, tuple):
            self.urls = list(urls)
        else:
            # The original code built a Warning object and discarded it, so
            # the caller never learned the argument was rejected. Emit a real
            # warning, but keep the non-raising behaviour callers rely on.
            import warnings

            warnings.warn('urls must be a list or a string.', stacklevel=2)

    # The scheduling code may need to bring in another class; it is not
    # exposed publicly for now.
    def scheduled_execution(self):
        """Run one load and return its documents.

        NOTE: periodic polling is not implemented. The previous body returned
        from inside its ``while True`` loop on the first iteration, so its
        ``time.sleep(self.interval)`` was unreachable. The observable
        behaviour (a single ``load()``) is kept; only the dead code is gone.

        Returns:
            The list returned by ``load()``.
        """
        return self.load()

    def load(self):
        """Fetch every configured feed and convert its entries to Documents.

        Returns:
            A list of ``Document`` objects, in feed order and then entry
            order. Empty when no URLs are configured.
        """
        # SECURITY NOTE(review): this replaces the process-wide default HTTPS
        # context factory with the *unverified* one, i.e. TLS certificate
        # checks are disabled for every later default-context HTTPS request in
        # this interpreter, not only for feed downloads. Kept as-is so feeds
        # served with broken certificates keep loading; consider removing it.
        if hasattr(ssl, '_create_unverified_context'):
            ssl._create_default_https_context = ssl._create_unverified_context

        documents = []
        for url in self.urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                documents.append(self._entry_to_document(entry))
        return documents

    @staticmethod
    def _entry_to_document(entry):
        """Convert one parsed feed entry into a ``Document``."""
        # Use dict-style .get() instead of attribute access so that an entry
        # lacking a field yields an empty value rather than raising
        # AttributeError and aborting the whole load.
        content = entry.get("content")
        if content:
            raw_html = content[0].get("value") or ""
        else:
            raw_html = entry.get("description") or entry.get("summary") or ""
        text = html2text.html2text(raw_html)
        metadata = {
            "title": entry.get("title", ""),
            "link": entry.get("link", ""),
        }
        return Document(page_content=text, metadata=metadata)
|
||||
|
||||
if __name__ == "__main__":
    # Demo entry point. The feed URLs should eventually come from the
    # configuration file or from the user interface instead of being
    # hard-coded here.
    feed_urls = ["https://www.zhihu.com/rss", "https://www.36kr.com/feed"]
    for document in RSS_Url_loader(feed_urls).load():
        print(document)
|
||||
|
|
@ -17,6 +17,7 @@ uvicorn~=0.21.1
|
|||
pypinyin~=0.48.0
|
||||
click~=8.1.3
|
||||
tabulate
|
||||
feedparser
|
||||
azure-core
|
||||
#accelerate~=0.18.0
|
||||
#peft~=0.3.0
|
||||
|
|
|
|||
Loading…
Reference in New Issue