ファイル内の単語頻度をしらべる
def freqdist_count(path_name):
    """Read each file in *path_name* and print the 10 most frequent words.

    C-style block comments, braces/parentheses, numeric literals, commas and
    semicolons are stripped before counting; words listed in the module-level
    ``c_lang_keyword`` collection are excluded. Prints the top-10 words and
    their combined count.

    Parameters
    ----------
    path_name : iterable of str
        Paths of the (C source) files to scan.
    """
    count_dic = {}
    # Iterate directly over the paths; the original `path_name[:]` copy was
    # unnecessary since the sequence is never mutated here.
    for path in path_name:
        with open(path, "r") as f:
            text = f.read()
        # Remove C block comments, including multi-line /* ... */ spans.
        text = re.sub(r"/\*([^*]|\*[^/])*\*/", "", text)
        # Replace structural punctuation with spaces so split() isolates words
        # (collapses the four single-character substitutions of the original).
        text = re.sub(r"[(){}]", " ", text)
        # Drop numeric literals.
        text = re.sub(r"[0-9]+", " ", text)
        # Commas and semicolons are also word separators.
        text = re.sub(r"[,;]", " ", text)
        for w in text.split():
            # Skip C language keywords; assumes c_lang_keyword is defined
            # elsewhere in this module — TODO confirm.
            if w in c_lang_keyword:
                continue
            count_dic[w] = count_dic.get(w, 0) + 1
    # Sort by descending frequency.
    count_list = sorted(count_dic.items(), key=lambda x: x[1], reverse=True)
    print("--freqdist---------------")
    print("上位10位まで表示")
    s = 0
    for k, v in count_list[:10]:
        print(k, ":", v)
        s += v
    # BUG FIX: the original line lacked the closing parenthesis, which is a
    # SyntaxError. Prints the top-10 total with thousands separators.
    print("{:,}".format(s))
# Driver: build the list of target file paths, then report word frequencies.
# count_file_func and c_file_list are presumably defined earlier in the
# full document — TODO confirm; they are not visible in this excerpt.
path_name = count_file_func(c_file_list)
freqdist_count(path_name)
<結果>
上位10位まで表示
PyObject : 12235
\ : 3648
Py_DECREF : 3485
Py_ssize_t : 3052
self : 3048
c : 2699
res : 2572
len : 1922
" : 1815
value : 1693
36,169
最頻度はPyObjectで、これが最重要データ構造と思われる。