1 回答

TA貢獻1871條經驗 獲得超8個贊
您可以使用 findNext 搭配 recursive=False(目的是保持在頂層)來查找靜態標簽列表 ["p","ul","h2","div"]。(注意:findNext 並沒有 recursive 參數,多餘的關鍵字參數會被當作屬性過濾條件;若只想遍歷同層元素,可改用 find_next_sibling。)程式碼如下:
import requests
from bs4 import BeautifulSoup
import json
# Download the advisory page. The timeout keeps the script from hanging
# forever on an unresponsive server, and raise_for_status() stops an HTTP
# error page from being parsed as if it were the requested document.
resp = requests.get(
    "https://www.us-cert.gov/ics/advisories/icsma-20-079-01",
    timeout=30,
)
resp.raise_for_status()
soup = BeautifulSoup(resp.content, "html.parser")

# Headings are searched inside the first <div class="content">; fail with a
# clear message instead of an AttributeError on None when it is missing.
content_div = soup.find('div', {"class": "content"})
if content_div is None:
    raise RuntimeError('no <div class="content"> element found in the page')

# Plain list of every <h2> found inside that div, in document order.
h2_list = list(content_div.find_all("h2"))
result = []
# Element names the sibling walk in getChildren() stops on or collects.
search_tags = ["p", "ul", "h2", "div"]


def getChildren(tag):
    """Collect the text of the same-level elements that follow a heading.

    Walks the later siblings of ``tag`` whose name is listed in
    ``search_tags`` and gathers their stripped text until the next ``h2``
    or ``div`` sibling is reached, or until there are no more siblings.

    Args:
        tag: The element to start from (the element itself is not
            included). ``None`` yields an empty string.

    Returns:
        The stripped texts of the collected elements, concatenated without
        any separator.
    """
    text = []
    while tag is not None:
        # find_next() walks the whole document in order and has no
        # ``recursive`` option (extra keyword arguments are treated as
        # attribute filters), so it would also return elements nested
        # inside ones that were already collected. find_next_sibling()
        # stays on the starting element's own level.
        tag = tag.find_next_sibling(search_tags)
        if tag is None or tag.name in ("div", "h2"):
            break
        text.append(tag.text.strip())
    return "".join(text)
# Build one entry per heading: its own stripped text plus the text that
# getChildren() gathers for it.
for heading in h2_list:
    section = {
        "name": heading.text.strip(),
        "children": getChildren(heading),
    }
    result.append(section)

# Emit the collected sections as pretty-printed JSON with sorted keys.
report = json.dumps(result, indent=4, sort_keys=True)
print(report)
- 1 回答
- 0 關注
- 170 瀏覽
添加回答
舉報