3 回答

TA貢獻1725條經驗 獲得超8個贊
我發現了問題所在。線
start = start + 1
應該在最后一個 else 語句中的位置。
所以我的代碼看起來像這樣,并為我提供了上面指定的所需輸入:
def tokenize(lines):
words = []
for line in lines:
start = 0
while start < len(line):
while line[start].isspace():
start = start + 1
end = start
if line[start].isalpha():
while line[end].isalpha():
end = end + 1
word = line[start:end]
word = word.lower()
words.append(word)
start = end
elif line[start].isdigit():
while line[end].isdigit():
end = end + 1
word = line[start:end]
words.append(word)
start = end
else:
word = line[start]
words.append(word)
start = start + 1
return words
但是,當我使用下面的測試腳本來確保沒有遺漏函數“tokenize”的極端情況時;...
import io
import sys
import importlib.util
def test(fun,x,y):
global pass_tests, fail_tests
if type(x) == tuple:
z = fun(*x)
else:
z = fun(x)
if y == z:
pass_tests = pass_tests + 1
else:
if type(x) == tuple:
s = repr(x)
else:
s = "("+repr(x)+")"
print("Condition failed:")
print(" "+fun.__name__+s+" == "+repr(y))
print(fun.__name__+" returned/printed:")
print(str(z))
fail_tests = fail_tests + 1
def run(src_path=None):
global pass_tests, fail_tests
if src_path == None:
import wordfreq
else:
spec = importlib.util.spec_from_file_location("wordfreq", src_path+"/wordfreq.py")
wordfreq = importlib.util.module_from_spec(spec)
spec.loader.exec_module(wordfreq)
pass_tests = 0
fail_tests = 0
fun_count = 0
def printTopMost(freq,n):
saved = sys.stdout
sys.stdout = io.StringIO()
wordfreq.printTopMost(freq,n)
out = sys.stdout.getvalue()
sys.stdout = saved
return out
if hasattr(wordfreq, "tokenize"):
fun_count = fun_count + 1
test(wordfreq.tokenize, [], [])
test(wordfreq.tokenize, [""], [])
test(wordfreq.tokenize, [" "], [])
test(wordfreq.tokenize, ["This is a simple sentence"], ["this","is","a","simple","sentence"])
test(wordfreq.tokenize, ["I told you!"], ["i","told","you","!"])
test(wordfreq.tokenize, ["The 10 little chicks"], ["the","10","little","chicks"])
test(wordfreq.tokenize, ["15th anniversary"], ["15","th","anniversary"])
test(wordfreq.tokenize, ["He is in the room, she said."], ["he","is","in","the","room",",","she","said","."])
else:
print("tokenize is not implemented yet!")
if hasattr(wordfreq, "countWords"):
fun_count = fun_count + 1
test(wordfreq.countWords, ([],[]), {})
test(wordfreq.countWords, (["clean","water"],[]), {"clean":1,"water":1})
test(wordfreq.countWords, (["clean","water","is","drinkable","water"],[]), {"clean":1,"water":2,"is":1,"drinkable":1})
test(wordfreq.countWords, (["clean","water","is","drinkable","water"],["is"]), {"clean":1,"water":2,"drinkable":1})
else:
print("countWords is not implemented yet!")
if hasattr(wordfreq, "printTopMost"):
fun_count = fun_count + 1
test(printTopMost,({},10),"")
test(printTopMost,({"horror": 5, "happiness": 15},0),"")
test(printTopMost,({"C": 3, "python": 5, "haskell": 2, "java": 1},3),"python 5\nC 3\nhaskell 2\n")
else:
print("printTopMost is not implemented yet!")
print(str(pass_tests)+" out of "+str(pass_tests+fail_tests)+" passed.")
return (fun_count == 3 and fail_tests == 0)
if __name__ == "__main__":
run()
...我得到以下輸出:
/usr/local/bin/python3.7 "/Users/ericjohannesson/Documents/Frist?ende kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py"
Traceback (most recent call last):
File "/Users/ericjohannesson/Documents/Frist?ende kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py", line 81, in <module>
run()
File "/Users/ericjohannesson/Documents/Frist?ende kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py", line 50, in run
test(wordfreq.tokenize, [" "], [])
File "/Users/ericjohannesson/Documents/Frist?ende kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/test.py", line 10, in test
z = fun(x)
File "/Users/ericjohannesson/Documents/Frista?ende kurser/DAT455 – Introduktion till programmering med Python/lab1/Laborations/Laboration_1/wordfreq.py", line 44, in tokenize
while line[start].isspace():
IndexError: string index out of range
為什么說字符串索引超出范圍?我該如何解決這個問題?

TA貢獻1847條經驗 獲得超11個贊
我不確定你為什么要上下做,但這是你如何拆分它的方法:
input = ['15, delicious& Tarts.']
line = input[0]
words = line.split(' ')
words = [word for word in words if word]
out:
['15,', 'delicious&', 'Tarts.']
編輯,看到你編輯了你想要的輸出方式。只需跳過這一行即可獲得該輸出:
words = [word for word in words if word]

TA貢獻1827條經驗 獲得超9個贊
itertools.groupby可以大大簡化這一點?;旧?,您根據字符的類別或類型(字母、數字或標點符號)對字符串中的字符進行分組。在此示例中,我只定義了這三個類別,但您可以根據需要定義任意數量的類別。任何不匹配任何類別的字符(本例中為空格)將被忽略:
def get_tokens(string):
from itertools import groupby
from string import ascii_lowercase, ascii_uppercase, digits, punctuation as punct
alpha = ascii_lowercase + ascii_uppercase
yield from ("".join(group) for key, group in groupby(string, key=lambda char: next((category for category in (alpha, digits, punct) if char in category), "")) if key)
print(list(get_tokens("15, delicious& Tarts.")))
輸出:
['15', ',', 'delicious', '&', 'Tarts', '.']
>>>
添加回答
舉報