1 回答

TA貢獻1840條經驗 獲得超5個贊
下面的代碼能夠遍歷所有類別並提取數據。該代碼肯定需要更多的測試和一些增強的錯誤處理。
PS:祝你在這個編碼項目中好運。
import requests
import time
from random import randint
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from newspaper.utils import BeautifulSoup
from newspaper import Article
# Chrome launch flags, applied to the options object in the order listed.
chrome_options = Options()
for flag in (
    "--test-type",
    '--ignore-certificate-errors',
    '--disable-extensions',
    'disable-infobars',
    "--incognito",
):
    chrome_options.add_argument(flag)
# Headless mode is left disabled. When enabling it, a window size must be
# passed as an argument as well:
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('window-size=1920x1080')

# Single browser session shared by the whole script.
driver = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)

# Module-level containers for harvested articles and already-seen URLs.
papers = []
urls_set = set()
def _format_publish_date(publish_date):
    """Return *publish_date* formatted as ``YYYY-MM-DD``, or ``None`` if unset."""
    # Formatting the datetime directly avoids the str() -> strptime round
    # trip, which raises ValueError for a missing date and for
    # timezone-aware values (their str() carries a UTC offset suffix).
    if not publish_date:
        return None
    return publish_date.strftime('%Y-%m-%d')


def get_articles(link):
    """Harvest every article of the category currently open in ``driver``.

    Each result page is parsed from the browser's own page source, so the
    articles collected always match the page the pagination has reached.
    The page is scraped *before* the "Suivant" (next) link is looked up,
    which means the last page is harvested too.

    Every new article is appended to the module-level ``papers`` list as a
    dict with the keys ``url``, ``title``, ``publish_date`` and ``text``;
    ``urls_set`` remembers the URLs already handled so an article linked
    several times is downloaded only once.

    Args:
        link: Unused. Kept so existing callers keep working; the page to
            scrape is whatever ``driver`` currently displays.

    Returns:
        None. The function returns once no "Suivant" link is found.

    Note:
        Errors raised by ``Article.download()`` / ``Article.parse()`` are
        not caught here and propagate to the caller.
    """
    while True:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for articles_tag in soup.find_all('div', {'class': 'articles'}):
            for anchor in articles_tag.find_all('a', href=True):
                article_url = str(anchor['href'])
                # Skip links to the comment section and repeated links.
                if article_url.endswith('#commentaires'):
                    continue
                if article_url in urls_set:
                    continue
                urls_set.add(article_url)

                article = Article(article_url)
                article.download()
                article.parse()
                papers.append({
                    'url': article_url,
                    'title': article.title,
                    'publish_date': _format_publish_date(article.publish_date),
                    'text': article.text.replace('\n', ''),
                })

        try:
            # Generic locator form; "link text" is the By.LINK_TEXT strategy.
            next_link = driver.find_element("link text", "Suivant")
        except NoSuchElementException:
            # No further page: the category is exhausted.
            return

        driver.execute_script("arguments[0].scrollIntoView(true);", next_link)
        next_link.click()
        # Random pause so the next page has time to load before it is
        # parsed on the following iteration.
        time.sleep(randint(2, 4))
# Category name -> category listing URL, in crawl order. Every URL shares
# the same prefix; only 'ledito' carries a trailing slash.
legorafi_urls = {
    name: 'http://www.legorafi.fr/category/' + path
    for name, path in (
        ('monde-libre', 'monde-libre'),
        ('politique', 'france/politique'),
        ('societe', 'france/societe'),
        ('economie', 'france/economie'),
        ('culture', 'culture'),
        ('people', 'people'),
        ('sports', 'sports'),
        ('hi-tech', 'hi-tech'),
        ('sciences', 'sciences'),
        ('ledito', 'ledito/'),
    )
}
# Crawl every category in turn. The implicit wait is a driver-wide setting,
# so it is configured once up front instead of on every iteration.
driver.implicitly_wait(30)
try:
    for url in legorafi_urls.values():
        if not url:
            continue
        driver.get(url)
        # Pass the category URL itself: driver.get() has no useful return
        # value to hand over.
        get_articles(url)
finally:
    # Always release the browser process, even when a page fails mid-crawl.
    driver.quit()
添加回答
舉報