1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
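"""
You have a directory holding one month of your diary entries, all stored as
.txt files. To sidestep word-segmentation issues, assume the content is
entirely in English. Report the word you consider most important in each
diary entry.
"""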
import re from collections import Counter import os
def most_common(filename):
    """Print the most frequent non-filler word found in a text file.

    The text is lower-cased, the punctuation characters ``, . ! ? : "`` are
    replaced by spaces and hyphens are deleted (so ``well-known`` is counted
    as ``wellknown``). The result is split on whitespace and the words are
    counted, leaving out a fixed set of common filler words.

    Args:
        filename: Path of the text file to analyse.

    Output:
        Prints ``"<filename> the most word is <word>"`` for the word with the
        highest count. When the file contains no countable word (it is empty,
        or holds nothing but filler words) it prints
        ``"<filename> has no significant words"`` instead.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # A frozenset gives O(1) membership tests while filtering the words.
    ignore_words = frozenset((
        'a', 'at', 'an', 'and', 'as', 'by', 'be', 'of', 'said', 'for', 'i',
        'it', 'after', "it's", 'in', 'on', 'is', 'she', 'us', 'to', 'not',
        'has', 'the', 'that', 'this', 'with', 'have',
    ))

    # Decode explicitly so the result does not depend on the platform locale;
    # errors="replace" keeps one stray byte from aborting the whole run.
    with open(filename, "r", encoding="utf-8", errors="replace") as f:
        text = f.read().lower()

    text = re.sub(r'[,.!?:"]', ' ', text)
    text = re.sub(r'-', '', text)

    # Drop filler words before counting. Zeroing their counts afterwards would
    # leave them in the Counter, so they could still be reported as the "most
    # common" word whenever nothing else is present.
    counts = Counter(
        word for word in text.split() if word not in ignore_words
    )

    if not counts:
        # Nothing left to rank; indexing most_common(1)[0] would raise.
        print("{} has no significant words".format(filename))
        return

    print("{} the most word is {}".format(filename, counts.most_common(1)[0][0]))
if __name__ == '__main__':
    # Directory holding the diary text files, relative to the working directory.
    dirary_dir = 'dirary'
    # os.listdir() returns entries in arbitrary order; sort them so the report
    # comes out in the same order on every run.
    for file in sorted(os.listdir(dirary_dir)):
        filename = os.path.join(dirary_dir, file)
        # Skip sub-directories and other non-regular entries: open() cannot
        # read them and one such entry would abort the whole run.
        if not os.path.isfile(filename):
            continue
        most_common(filename)
|