2 回答

TA貢獻1878條經驗 獲得超4個贊
您可以搭配 BeautifulSoup 使用遞迴,產生包含所有標籤(tag)與文字內容的列表,然後再用這個列表來比對標註(label):
from bs4 import BeautifulSoup as soup
import re
# Sample markup used by the rest of the snippet; the label offsets below index into this string.
content = '<body><text>Hello world!</text><text>This is my code. And this is a number 42</text></body>'
def tokenize(d):
    """Yield a flat token stream for the parsed element *d*.

    The stream starts with ``<name>``, ends with ``</name>``, and in between
    contains, in document order, the whitespace-separated words of every text
    child and the recursively produced tokens of every element child.

    Args:
        d: An element exposing ``name`` and ``contents`` (e.g. a
            BeautifulSoup ``Tag``).

    Yields:
        str: Opening-tag markers, words, and closing-tag markers.
    """
    yield f'<{d.name}>'
    for child in d.contents:
        if isinstance(child, str):
            # Text nodes are str instances (bs4's NavigableString subclasses
            # str); split() drops surrounding and repeated whitespace.
            yield from child.split()
        else:
            yield from tokenize(child)
    yield f'</{d.name}>'
# Parse `content` with BeautifulSoup's 'html.parser' backend and flatten its <body> element into a token list.
data = list(tokenize(soup(content, 'html.parser').body))
輸出:
['<body>', '<text>', 'Hello', 'world!', '</text>', '<text>', 'This', 'is', 'my', 'code.', 'And', 'this', 'is', 'a', 'number', '42', '</text>', '</body>']
然後,將每個 token 與標註(label)進行比對:
# Character-span annotations over `content`. In this snippet `end` is one past
# the exclusive slice end: content[start:end - 1] is the annotated text.
labels = [{'label':'salutation', 'start':12, 'end':25}, {'label':'verb', 'start':42, 'end':45}, {'label':'size', 'start':75, 'end':78}]

# Each annotation enriched with the words it covers.
tokens = [{**i, 'word': content[i['start']:i['end'] - 1].split()} for i in labels]

# For every distinct token, an iterator over the [start, end] span of each of
# its occurrences in `content`, left to right. An occurrence must not be
# preceded by a word character. The token is escaped so that characters such
# as '.' or '!' are matched literally, and the lookbehind (unlike slicing from
# c - 1) also accepts an occurrence at offset 0. The lookahead keeps
# overlapping occurrences. `end` uses the same "+ 1" convention as `labels`.
# The spans are materialised in a list so `i` is bound per token.
indices = {
    i: iter([
        [m.start(), m.start() + len(i) + 1]
        for m in re.finditer(r'(?<!\w)(?=' + re.escape(i) + r')', content)
    ])
    for i in set(data)
}

# Pair each token, in order, with its next unused span (None when exhausted).
new_data = [[i, next(indices[i], None)] for i in data]

# Label of the first annotation that fully contains the token's span, else 'NONE'.
result = [
    next(
        (c['label'] for c in tokens if b and c['start'] <= b[0] and b[-1] <= c['end']),
        'NONE',
    )
    for _, b in new_data
]
輸出:
['NONE', 'NONE', 'salutation', 'salutation', 'NONE', 'NONE', 'NONE', 'verb', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'size', 'NONE', 'NONE']

TA貢獻1804條經驗 獲得超8個贊
目前我已經使用 HTMLParser 完成了這項工作:
from html.parser import HTMLParser
from tensorflow.keras.preprocessing.text import text_to_word_sequence
class HTML_tokenizer_labeller(HTMLParser):
    """Tokenize HTML and assign a label to every token.

    Start and end tags become ``<tag>`` / ``</tag>`` tokens labelled
    ``'OTHER'``. Text is split into words with ``text_to_word_sequence``;
    every word of a text chunk receives the ``'tag'`` of the first annotation
    whose inclusive ``[start, end]`` range contains the position reported by
    ``getpos()`` for that chunk, or ``'OTHER'`` when no annotation matches.

    After ``feed()``, ``tokens`` and ``labels`` are parallel lists.

    Args:
        annotations: Iterable of mappings with ``'start'``, ``'end'`` and
            ``'tag'`` keys.
        *args, **kwargs: Forwarded to ``HTMLParser``.
    """

    def __init__(self, annotations, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tokens = []
        self.labels = []
        self.annotations = annotations

    def handle_starttag(self, tag, attrs):
        """Record an opening tag as an ``'OTHER'`` token (attributes are ignored)."""
        self.tokens.append(f'<{tag}>')
        self.labels.append('OTHER')

    def handle_endtag(self, tag):
        """Record a closing tag as an ``'OTHER'`` token."""
        self.tokens.append(f'</{tag}>')
        self.labels.append('OTHER')

    def _label_at(self, pos):
        """Return the tag of the first annotation containing *pos*, else ``'OTHER'``."""
        # Read the instance's annotations, not a module-level global.
        for annotation in self.annotations:
            if annotation['start'] <= pos <= annotation['end']:
                return annotation['tag']
        return 'OTHER'

    def handle_data(self, data):
        """Split a text chunk into words and label them all by the chunk's position."""
        words = text_to_word_sequence(data)
        # getpos() returns (lineno, offset); only the offset is used.
        # NOTE(review): the offset is relative to the current line, so this
        # presumably assumes single-line input with annotation offsets into
        # that line - confirm against the annotation source.
        label = self._label_at(self.getpos()[1])
        self.tokens.extend(words)
        self.labels.extend([label] * len(words))
添加回答
舉報