1 Answer

Then it is a simple case of looping over the RSS feeds.
import requests
import feedparser
from bs4 import BeautifulSoup
import urllib.parse, xml.sax
import pandas as pd

# get some RSS feeds....
resp = requests.get("https://blog.feedspot.com/world_news_rss_feeds/")
soup = BeautifulSoup(resp.content.decode(), "html.parser")
rawfeeds = soup.find_all("h2")
feeds = {}
for rf in rawfeeds:
    a = rf.find("a")
    if a is not None:
        # the href carries the actual feed URL in its "q" query parameter
        feeds[a.string.replace("RSS Feed", "").strip()] = urllib.parse.parse_qs(a['href'])["q"][0].replace("site:", "")

# now source them all into a dataframe
df = pd.DataFrame()
for k, url in feeds.items():
    try:
        df = pd.concat([df, pd.json_normalize(feedparser.parse(url)["entries"]).assign(Source=k)])
    except (Exception, xml.sax.SAXParseException):
        print(f"invalid xml: {url}")
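To sanity-check what the simple loop collected, something along these lines works; "title" and "link" are standard entry fields that feedparser exposes, while the remaining columns vary from feed to feed:

# quick look at the combined dataframe; the exact columns depend on each feed's schema
print(df.shape)
print(df[["Source", "title", "link"]].head())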
To make it re-entrant:
- use the etag and modified capabilities of feedparser
- persist the dataframes so that when you run it again it picks up from where it left off
- I would use threading so that it is not purely sequential. Obviously with threading you need to think about synchronising your save points. Then it is just a case of running it in a scheduler to periodically pick up new items from the RSS feeds and fetch the associated articles (see the threading/scheduler sketch after the full script below).
import feedparser, requests, newspaper
from bs4 import BeautifulSoup
import urllib.parse, xml.sax
from pathlib import Path
import pandas as pd

# working directory for the persisted pickles
if not Path.cwd().joinpath("news").is_dir(): Path.cwd().joinpath("news").mkdir()
p = Path.cwd().joinpath("news")

# get some RSS feeds....
if p.joinpath("rss.pickle").is_file():
    dfrss = pd.read_pickle(p.joinpath("rss.pickle"))
else:
    resp = requests.get("https://blog.feedspot.com/world_news_rss_feeds/")
    soup = BeautifulSoup(resp.content.decode(), "html.parser")
    rawfeeds = soup.find_all("h2")
    feeds = []
    for rf in rawfeeds:
        a = rf.find("a")
        if a is not None:
            feeds.append({"name": a.string.replace("RSS Feed", "").strip(),
                          "url": urllib.parse.parse_qs(a['href'])["q"][0].replace("site:", ""),
                          "etag": "", "status": 0, "debug_msg": "", "modified": ""})
    dfrss = pd.DataFrame(feeds).set_index("url")
if p.joinpath("rssdata.pickle").is_file():
    df = pd.read_pickle(p.joinpath("rssdata.pickle"))
else:
    df = pd.DataFrame({"id": [], "link": []})

# now source them all into a dataframe. head() is there for testing purposes
for r in dfrss.head(5).itertuples():
    try:
        fp = feedparser.parse(r.Index, etag=r.etag, modified=r.modified)
        if fp.bozo == 1: raise Exception(fp.bozo_exception)
    except Exception as e:
        fp = feedparser.FeedParserDict(**{"etag": r.etag, "entries": [], "status": 500, "debug_message": str(e)})
    # keep meta information of what has already been sourced from an RSS feed
    if "etag" in fp.keys(): dfrss.loc[r.Index, "etag"] = fp.etag
    dfrss.loc[r.Index, "status"] = fp.status
    if "debug_message" in fp.keys(): dfrss.loc[r.Index, "debug_msg"] = fp.debug_message
    # 304 means up to date... getting 301 and entries hence test len...
    if len(fp["entries"]) > 0:
        dft = pd.json_normalize(fp["entries"]).assign(Source=r.Index)
        # don't capture items that have already been captured...
        df = pd.concat([df, dft[~dft["link"].isin(df["link"])]])

# save to make re-entrant...
dfrss.to_pickle(p.joinpath("rss.pickle"))
df.to_pickle(p.joinpath("rssdata.pickle"))

# finally get the text...
if p.joinpath("text.pickle").is_file():
    dftext = pd.read_pickle(p.joinpath("text.pickle"))
else:
    dftext = pd.DataFrame({"link": [], "text": []})

# head() is there for testing purposes
for r in df[~df["link"].isin(dftext["link"])].head(5).itertuples():
    a = newspaper.Article(r.link)
    a.download()
    a.parse()
    # DataFrame.append was removed in pandas 2.0, so build the new row with concat instead
    dftext = pd.concat([dftext, pd.DataFrame([{"link": r.link, "text": a.text}])], ignore_index=True)

dftext.to_pickle(p.joinpath("text.pickle"))
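For the threading and scheduler points above, a rough sketch could look like this. It reuses dfrss, df and p from the script above, guards the pickle write with a lock, and omits the etag/modified handling for brevity; the fetch_feed/run_once helpers and the 15-minute interval are my own illustration, not anything the libraries provide:

import time, threading
from concurrent.futures import ThreadPoolExecutor

save_lock = threading.Lock()   # guard the pickle write if several workers/schedulers share the files

def fetch_feed(url):
    # parse one feed and return its entries as a dataframe (empty frame on failure)
    try:
        return pd.json_normalize(feedparser.parse(url)["entries"]).assign(Source=url)
    except Exception:
        return pd.DataFrame()

def run_once(df):
    # fetch all feeds in parallel rather than sequentially
    with ThreadPoolExecutor(max_workers=8) as ex:
        frames = [f for f in ex.map(fetch_feed, dfrss.index) if not f.empty]
    if frames:
        new = pd.concat(frames)
        # only keep entries that have not been captured yet
        df = pd.concat([df, new[~new["link"].isin(df["link"])]])
    with save_lock:
        df.to_pickle(p.joinpath("rssdata.pickle"))
    return df

# crude scheduler: poll the feeds every 15 minutes (cron or APScheduler would also work)
while True:
    df = run_once(df)
    time.sleep(15 * 60)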
Then do your analytics on the retrieved data.
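What that analysis looks like depends entirely on the use case; as a trivial illustration, a word-frequency count over the article text in dftext could be done along these lines:

# toy analysis: most common longer words across the downloaded article text
from collections import Counter
from string import punctuation

words = Counter()
for text in dftext["text"].dropna():
    words.update(w.lower().strip(punctuation) for w in text.split() if len(w) > 4)
print(words.most_common(20))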