38. ヒストグラム
単語の出現頻度のヒストグラム(横軸に出現頻度,縦軸に出現頻度をとる単語の種類数を棒グラフで表したもの)を描け.
%matplotlib inline
import re
from collections import Counter
# Parse the MeCab output file into `sentences`: a list of sentences, where each
# sentence is a list of morpheme dicts with the keys "surface", "base", "pos"
# and "pos1". Each token line is read as "<surface>\t<comma-separated features>"
# and a line consisting solely of "EOS" closes the current sentence.
sentences = []
with open("D:\\nlp100\\neko.txt.mecab", encoding="UTF-8") as fr:
    keitaiso = []
    for line in fr:
        # Strip only the newline: a surface form may itself be whitespace,
        # and without this the last feature field would keep its "\n".
        line = line.rstrip("\n")
        if not line:
            continue
        if line == "EOS":
            # Exact comparison, so a token whose text merely contains "EOS"
            # is not mistaken for a sentence boundary.
            if keitaiso:
                sentences.append(keitaiso)
                keitaiso = []
            continue
        # Split off the surface at the tab first; splitting on commas and tabs
        # in one pass would shift every field when the surface is itself ",".
        surface, _tab, feature_text = line.partition("\t")
        features = feature_text.split(",")
        keitaiso.append({
            "surface": surface,
            "base": features[6],
            "pos": features[0],
            "pos1": features[1],
        })
    # Keep trailing morphemes if the file does not end with an "EOS" line.
    if keitaiso:
        sentences.append(keitaiso)
# Count how many times each surface form occurs across all sentences.
# Counter is a dict subclass, so `word_count` still supports plain dict access.
word_count = Counter()
for sentence in sentences:
    word_count.update(morpheme["surface"] for morpheme in sentence)
# (surface, count) pairs ordered by descending count; words with equal counts
# stay in first-seen order, matching a stable sort on the count.
# NOTE(review): `list` shadows the builtin. The name is kept because the
# plotting code that follows reads it; rename both together when possible.
list = word_count.most_common()
import matplotlib.pyplot as plt
# Y holds the occurrence count of every distinct word, taken from the second
# element of each (word, count) pair.
Y = [pair[1] for pair in list]
# Horizontal axis: occurrence frequency; vertical axis: number of distinct
# words with that frequency. Only counts inside the given range are binned.
plt.hist(Y, range=(1, 20), bins=20)
plt.show()
<結果>