11.1 探索の開始
import pandas as pd import numpy as np import matplotlib.pyplot as plt import json import seaborn as sb %matplotlib inline
# Default figure size (width, height) applied to the plots that follow.
plt.rcParams['figure.figsize'] = (8, 4)
# The MongoDB instance was not working, so the cleaned winners data is loaded
# directly from the JSON file into a DataFrame.
# read_json() already returns a DataFrame, so no pd.DataFrame() wrapper is
# needed. info() prints its report itself and returns None, so it is not
# wrapped in print() (which only added a stray "None" to the output).
df = pd.read_json('nobel_winners_cleaned.json')
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 858 entries, 0 to 857
Data columns (total 12 columns):
award_age 858 non-null int64
category 858 non-null object
country 858 non-null object
date_of_birth 858 non-null object
date_of_death 559 non-null object
gender 858 non-null object
link 858 non-null object
name 858 non-null object
place_of_birth 831 non-null object
place_of_death 524 non-null object
text 858 non-null object
year 858 non-null int64
dtypes: int64(2), object(10)
memory usage: 87.1+ KB
None
date_of_birthとdate_of_deathをobject型からdatetime型に変換する
# Parse the two date columns (loaded as generic objects) into datetime64.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
df['date_of_death'] = pd.to_datetime(df['date_of_death'])
# The first positional parameter of info() is `verbose`, not a column name,
# so the (misspelled) 'data_of_death' argument only acted as a truthy flag.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 858 entries, 0 to 857
Data columns (total 12 columns):
award_age 858 non-null int64
category 858 non-null object
country 858 non-null object
date_of_birth 858 non-null datetime64[ns]
date_of_death 559 non-null datetime64[ns]
gender 858 non-null object
link 858 non-null object
name 858 non-null object
place_of_birth 831 non-null object
place_of_death 524 non-null object
text 858 non-null object
year 858 non-null int64
dtypes: datetime64[ns](2), int64(2), object(8)
memory usage: 87.1+ KB
11.2 pandasを使ったプロット
by_gender = df.groupby('gender')
gender_counts = by_gender.size()
print(gender_counts)
print(type(gender_counts))
# size() returns a Series, so this is the Series plot method.
gender_counts.plot(kind='bar')
gender
female 47
male 811
dtype: int64
[f:id:bitop:20171008101748p:plain]
11.3 男女間の格差
by_cat_gen = df.groupby(['category', 'gender'])
physics_women = by_cat_gen.get_group(('Physics', 'female'))
print(type(physics_women))
# Names and award years of the women who won the Physics prize.
physics_women[['name', 'year']]
<class 'pandas.core.frame.DataFrame'>
name | year | |
---|---|---|
267 | Maria Goeppert-Mayer | 1963 |
614 | Marie Skłodowska-Curie | 1903 |
# Women laureates are most numerous in Peace, Literature and
# Physiology or Medicine.
cat_gen_counts = by_cat_gen.size()
print(cat_gen_counts)
cat_gen_counts.plot(kind="barh")
plt.show()
# The same counts again, this time as vertical bars.
cat_gen_counts.plot(kind="bar")
category gender
Chemistry female 4
male 167
Economics female 1
male 74
Literature female 13
male 93
Peace female 16
male 87
Physics female 2
male 199
Physiology or Medicine female 11
male 191
dtype: int64
[f:id:bitop:20171008101837p:plain] [f:id:bitop:20171008101816p:plain]
<matplotlib.axes._subplots.AxesSubplot at 0x7efce45467f0>
11.3.1 グループのアンスタック
# Unstack the gender level into columns so each category gets one bar per
# gender, then draw horizontal bars.
by_cat_gen.size().unstack().plot.barh()
<matplotlib.axes._subplots.AxesSubplot at 0x7efce1f9bcf8>
性別グループの並び替えと合計
cat_gen_sz = by_cat_gen.size().unstack()
print(cat_gen_sz, "\n", type(cat_gen_sz))
# Row-wise sum over the gender columns gives the per-category total.
cat_gen_sz['total'] = cat_gen_sz.sum(axis=1)
# Order the categories by the number of female winners (ascending).
cat_gen_sz = cat_gen_sz.sort_values('female')
bar_columns = ['female', 'total', 'male']
cat_gen_sz[bar_columns].plot.barh()
gender female male
category
Chemistry 4 167
Economics 1 74
Literature 13 93
Peace 16 87
Physics 2 199
Physiology or Medicine 11 191
<class 'pandas.core.frame.DataFrame'>
<matplotlib.axes._subplots.AxesSubplot at 0x7efce1e67588>
11.3.2 歴史的傾向
# Winners per year, split into one column per gender.
by_year_gender = df.groupby(['year', 'gender'])
year_gen_sz = by_year_gender.size().unstack()
year_gen_sz.plot.bar(figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce1e77278>
x軸ラベルの削減
def thin_xticks(ax, tick_gap=10, rotation=45):
    """Thin out the x-axis ticks of ``ax`` and rotate the remaining labels.

    Args:
        ax: Axes whose x axis is modified in place.
        tick_gap: Keep every ``tick_gap``-th tick (and its label).
        rotation: Rotation passed to ``set_ticklabels`` for the kept labels.

    The figure is not shown explicitly: with the inline (non-GUI) backend
    ``ax.figure.show()`` only produced a UserWarning, and the notebook renders
    the figure on its own.
    """
    ticks = ax.xaxis.get_ticklocs()
    ticklabels = [label.get_text() for label in ax.xaxis.get_ticklabels()]
    ax.xaxis.set_ticks(ticks[::tick_gap])
    ax.xaxis.set_ticklabels(ticklabels[::tick_gap], rotation=rotation)


# Reindex over 1901..2014 so years without any prize appear as gaps.
new_index = pd.Index(np.arange(1901, 2015), name='year')
by_year_gender = df.groupby(['year', 'gender'])
year_gen_sz = by_year_gender.size().unstack().reindex(new_index)
# Plot once and thin that plot's ticks; plotting a second time beforehand
# only produced an extra figure with unreadable tick labels.
thin_xticks(year_gen_sz.plot(kind='bar', figsize=(16, 4)))
/home/beetle/anaconda3/lib/python3.6/site-packages/matplotlib/figure.py:403: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
"matplotlib is currently using a non-GUI backend, "
上下に並べた年ごとの性別での受賞者数
new_index = pd.Index(np.arange(1901, 2015), name='year')
by_year_gender = df.groupby(['year', 'gender'])
year_gen_sz = by_year_gender.size().unstack().reindex(new_index)

# Two stacked panels sharing both axes: female on top, male below.
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True)
ax_f, ax_m = axes[0], axes[1]
fig.suptitle('Nobel Prize-winners by gender', fontsize=16)

award_years = year_gen_sz.index
ax_f.bar(award_years, year_gen_sz['female'])
ax_f.set_ylabel('Female winner')
ax_m.bar(award_years, year_gen_sz['male'])
ax_m.set_ylabel('male winner')
<matplotlib.text.Text at 0x7efce0a5edd8>
11.4 国の傾向
# Calling order() fails because no such method exists, so sort_values() is
# used instead; ascending=False requests a descending sort.
country_counts = df.groupby('country').size()
country_counts.sort_values(ascending=False).plot(kind='bar', figsize=(12, 4))
# Number of countries with at least one laureate: 56. According to Wikipedia
# there are 206 states in the world, so the remaining 150 have produced none.
print(len(country_counts))
56
ノーベル賞データ可視化のための国データの取得
MongoDBがうまく動かないので、winning_country_data.jsonファイルから直接DataFrame化する
# read_json() already returns a DataFrame; info() prints its own report and
# returns None, so neither pd.DataFrame() nor print() is wrapped around them.
df_countries = pd.read_json('winning_country_data.json')
df_countries.info()
print(df_countries['Argentina'])
# Rows and columns are the other way round compared with the book, so the
# frame is transposed to get one row per country.
df_countries = df_countries.T
df_countries.info()
# .ix is deprecated; .iloc is the positional indexer for "first row".
print(df_countries.iloc[0])
<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, alpha3Code to population
Data columns (total 57 columns):
Argentina 7 non-null object
Australia 7 non-null object
Austria 7 non-null object
Azerbaijan 7 non-null object
Bangladesh 7 non-null object
Belgium 7 non-null object
Canada 7 non-null object
Chile 7 non-null object
China 7 non-null object
Colombia 7 non-null object
Costa Rica 7 non-null object
Cyprus 6 non-null object
Czech Republic 7 non-null object
Denmark 7 non-null object
East Timor 7 non-null object
Egypt 7 non-null object
Finland 7 non-null object
France 7 non-null object
Germany 7 non-null object
Ghana 7 non-null object
Greece 7 non-null object
Guatemala 7 non-null object
Hungary 7 non-null object
Iceland 6 non-null object
India 7 non-null object
Iran 7 non-null object
Ireland 7 non-null object
Israel 7 non-null object
Italy 7 non-null object
Japan 7 non-null object
Kenya 7 non-null object
Korea, South 7 non-null object
Liberia 7 non-null object
Macedonia 7 non-null object
Mexico 7 non-null object
Myanmar (Burma) 6 non-null object
Netherlands 7 non-null object
Nigeria 7 non-null object
Norway 7 non-null object
Pakistan 7 non-null object
Palestinian Territory 6 non-null object
Poland 7 non-null object
Portugal 7 non-null object
Russia 7 non-null object
Saint Lucia 7 non-null object
South Africa 7 non-null object
Spain 7 non-null object
Sweden 7 non-null object
Switzerland 7 non-null object
Taiwan 6 non-null object
Turkey 7 non-null object
United Kingdom 7 non-null object
United States 7 non-null object
Venezuela 7 non-null object
Vietnam 7 non-null object
Yemen 7 non-null object
Yugoslavia 7 non-null object
dtypes: object(57)
memory usage: 3.2+ KB
None
alpha3Code ARG
area 2.7804e+06
capital Buenos Aires
gini 44.5
latlng [-34.0, -64.0]
name Argentina
population 42669500
Name: Argentina, dtype: object
<class 'pandas.core.frame.DataFrame'>
Index: 57 entries, Argentina to Yugoslavia
Data columns (total 7 columns):
alpha3Code 57 non-null object
area 56 non-null object
capital 57 non-null object
gini 53 non-null object
latlng 57 non-null object
name 57 non-null object
population 57 non-null object
dtypes: object(7)
memory usage: 6.1+ KB
None
alpha3Code ARG
area 2.7804e+06
capital Buenos Aires
gini 44.5
latlng [-34.0, -64.0]
name Argentina
population 42669500
Name: Argentina, dtype: object
/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:7: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
import sys
1人当たりの国別のノーベル受賞者数
# The book's listing (9th line from the top) uses df.countries, which raises
# an error; df_countries is used instead.
nat_group = df.groupby('country')
ngsz = nat_group.size()  # number of laureates per country

# No set_index('name') is needed: after the transpose the index already holds
# the country names, so the counts align on it.
df_countries['nobel_wins'] = ngsz
df_countries['nobel_wins_per_capita'] = (
    df_countries['nobel_wins'] / df_countries['population']
)

ranked = df_countries.sort_values(by='nobel_wins_per_capita', ascending=False)
ranked['nobel_wins_per_capita'].plot(kind='bar', figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce14de908>
ノーベル賞3個以上受賞している国限定
# Same per-capita ranking, restricted to countries with more than two wins.
multi_winners = df_countries[df_countries['nobel_wins'] > 2]
multi_ranked = multi_winners.sort_values(by='nobel_wins_per_capita', ascending=False)
multi_ranked['nobel_wins_per_capita'].plot(kind='bar', figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce1033f98>
11.4.1 分野別の受賞数
# Count winners per (country, category) and pivot the categories into columns;
# combinations that never occur end up as NaN.
country_category = df.groupby(['country', 'category'])
nat_cat_sz = country_category.size().unstack()
print(nat_cat_sz)
category Chemistry Economics Literature Peace Physics \
country
Argentina 1.0 NaN NaN 2.0 NaN
Australia NaN 1.0 1.0 NaN 1.0
Austria 3.0 1.0 1.0 2.0 4.0
Azerbaijan NaN NaN NaN NaN 1.0
Bangladesh NaN NaN NaN 1.0 NaN
Belgium 1.0 NaN 1.0 3.0 1.0
Canada 4.0 1.0 1.0 1.0 2.0
Chile NaN NaN 2.0 NaN NaN
China NaN NaN 1.0 2.0 2.0
Colombia NaN NaN 1.0 NaN NaN
Costa Rica NaN NaN NaN 1.0 NaN
Cyprus NaN 1.0 NaN NaN NaN
Czech Republic 1.0 NaN 1.0 NaN NaN
Denmark 1.0 NaN 3.0 1.0 3.0
East Timor NaN NaN NaN 2.0 NaN
Egypt 1.0 NaN 1.0 2.0 NaN
Finland NaN NaN NaN 1.0 NaN
France 8.0 2.0 16.0 9.0 12.0
Germany 28.0 1.0 8.0 4.0 23.0
Ghana NaN NaN NaN 1.0 NaN
Greece NaN NaN 2.0 NaN NaN
Guatemala NaN NaN 1.0 1.0 NaN
Hungary 1.0 NaN 1.0 NaN NaN
Iceland NaN NaN 1.0 NaN NaN
India NaN NaN 1.0 2.0 1.0
Iran NaN NaN NaN 1.0 NaN
Ireland NaN NaN 2.0 3.0 1.0
Israel 5.0 1.0 1.0 3.0 NaN
Italy 1.0 NaN 6.0 1.0 4.0
Japan 5.0 NaN 2.0 1.0 8.0
Kenya NaN NaN NaN 1.0 NaN
Korea, South NaN NaN NaN 1.0 NaN
Liberia NaN NaN NaN 2.0 NaN
Mexico NaN NaN 1.0 1.0 NaN
Myanmar (Burma) NaN NaN NaN 1.0 NaN
Netherlands 3.0 2.0 NaN 1.0 9.0
Nigeria NaN NaN 1.0 NaN NaN
Norway 1.0 3.0 3.0 2.0 NaN
Pakistan NaN NaN NaN 1.0 1.0
Palestinian Territory NaN NaN NaN 1.0 NaN
Poland NaN NaN 3.0 1.0 1.0
Portugal NaN NaN 1.0 NaN NaN
Russia 1.0 1.0 3.0 2.0 9.0
Saint Lucia NaN NaN 1.0 NaN NaN
South Africa NaN NaN 2.0 4.0 NaN
Spain NaN NaN 5.0 NaN NaN
Sweden 4.0 2.0 8.0 5.0 4.0
Switzerland 6.0 NaN 2.0 3.0 3.0
Taiwan 1.0 NaN NaN NaN NaN
Turkey NaN NaN 1.0 NaN NaN
United Kingdom 26.0 6.0 9.0 10.0 22.0
United States 69.0 53.0 11.0 21.0 89.0
Venezuela NaN NaN NaN NaN NaN
Vietnam NaN NaN NaN 1.0 NaN
Yemen NaN NaN NaN 1.0 NaN
Yugoslavia NaN NaN 1.0 NaN NaN
category Physiology or Medicine
country
Argentina 2.0
Australia 6.0
Austria 4.0
Azerbaijan NaN
Bangladesh NaN
Belgium 4.0
Canada 2.0
Chile NaN
China NaN
Colombia NaN
Costa Rica NaN
Cyprus NaN
Czech Republic NaN
Denmark 5.0
East Timor NaN
Egypt NaN
Finland NaN
France 12.0
Germany 16.0
Ghana NaN
Greece NaN
Guatemala NaN
Hungary 1.0
Iceland NaN
India NaN
Iran NaN
Ireland NaN
Israel NaN
Italy 1.0
Japan 2.0
Kenya NaN
Korea, South NaN
Liberia NaN
Mexico NaN
Myanmar (Burma) NaN
Netherlands 2.0
Nigeria NaN
Norway 2.0
Pakistan NaN
Palestinian Territory NaN
Poland NaN
Portugal 1.0
Russia 2.0
Saint Lucia NaN
South Africa 1.0
Spain 1.0
Sweden 6.0
Switzerland 9.0
Taiwan NaN
Turkey NaN
United Kingdom 27.0
United States 95.0
Venezuela 1.0
Vietnam NaN
Yemen NaN
Yugoslavia NaN
# In Python 3 "/" yields a float, so "//" is used to compute the grid row.
# Series.order() does not exist, so sort_values() is used instead.
COL_NUM = 2
ROW_NUM = 3
fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize=(12, 12))
# DataFrame.iteritems() was deprecated and removed in pandas 2.0; items()
# yields the same (column label, Series) pairs.
for i, (label, col) in enumerate(nat_cat_sz.items()):
    ax = axes[i // COL_NUM, i % COL_NUM]
    # Ten countries with the most winners in this category.
    col = col.sort_values(ascending=False)[:10]
    col.plot(kind='barh', ax=ax)
    ax.set_title(label)
plt.tight_layout()
11.4.3 受賞分布の歴史的傾向
# "Nation" could also be rendered as state, country, homeland, sovereign state
# or kingdom.
plt.rcParams['font.size'] = 20
new_index = pd.Index(np.arange(1901, 2015), name='year')
year_country_counts = df.groupby(['year', 'country']).size()
by_year_nat_sz = year_country_counts.unstack().reindex(new_index)
# Running total of United States winners over the years.
us_running_total = by_year_nat_sz['United States'].cumsum()
us_running_total.plot(figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce0bf8780>
日本の受賞者の歴史的傾向を見てみる
new_index = pd.Index(np.arange(1901, 2015), name='year')
year_country_counts = df.groupby(['year', 'country']).size()
by_year_nat_sz = year_country_counts.unstack().reindex(new_index)
# Same running-total plot, with the column key switched to Japan.
japan_running_total = by_year_nat_sz['Japan'].cumsum()
japan_running_total.plot(figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce0010b38>
NaNを0に置換する
# fillna() replaces missing values with the constant passed as its argument.
us_yearly = by_year_nat_sz['United States'].fillna(0)
us_yearly.cumsum().plot(figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce1465f28>
日本も0に置換してみる
new_index = pd.Index(np.arange(1901, 2015), name='year')
year_country_counts = df.groupby(['year', 'country']).size()
by_year_nat_sz = year_country_counts.unstack().reindex(new_index)

fig, axes = plt.subplots(2, 1, figsize=(16, 4))
japan_yearly = by_year_nat_sz['Japan']
# The upper panel seems to draw only part of the line - presumably because
# the years without a winner are still NaN there; TODO confirm.
japan_yearly.cumsum().plot(ax=axes[0])
# Lower panel: the same series with missing years filled with 0 first.
japan_yearly.fillna(0).cumsum().plot(ax=axes[1])
<matplotlib.axes._subplots.AxesSubplot at 0x7efcdf9a2e10>
生データの表示
import math as m

# Print every non-missing yearly count for Japan and accumulate the total.
# The accumulator is named `total` so it does not shadow the builtin sum().
total = 0
for item in by_year_nat_sz['Japan']:
    if not m.isnan(item):
        print(item)
        total += item
print('sum:', total)
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
2.0
2.0
1.0
1.0
2.0
sum: 18.0
米国を除いた他の国の推移
# World War II ended in 1945.
new_index = pd.Index(np.arange(1901, 2015), name='year')
year_country_counts = df.groupby(['year', 'country']).size()
by_year_nat_sz = year_country_counts.unstack().reindex(new_index)

# Every country column except the United States.
not_US = [name for name in by_year_nat_sz.columns.tolist() if name != 'United States']
print(type(not_US))
by_year_nat_sz['Not_US'] = by_year_nat_sz[not_US].sum(axis=1)

us_vs_rest = by_year_nat_sz[['United States', 'Not_US']].fillna(0)
ax = us_vs_rest.cumsum().plot(figsize=(16, 4))
<class 'list'>
地域差の詳細
year_country_counts = df.groupby(['year', 'country']).size()
by_year_nat_sz = year_country_counts.unstack().reindex(new_index).fillna(0)

regions = [
    {'label': 'N.America', 'countries': ['United States', 'Canada']},
    {'label': 'Europe', 'countries': ['United Kingdom', 'Germany', 'France']},
    # Is it right to count Russia as Asia? And is India Asia as well?
    # According to Wikipedia this is apparently fine: the term seems to cover
    # every country of the Eurasian continent outside Europe.
    {'label': 'Asia', 'countries': ['Japan', 'Russia', 'India']},
]

# Add one column per region holding the yearly sum of its member countries.
for region in regions:
    member_counts = by_year_nat_sz[region['countries']]
    by_year_nat_sz[region['label']] = member_counts.sum(axis=1)

region_labels = [r['label'] for r in regions]
by_year_nat_sz[region_labels].cumsum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7efce453e940>
受賞数上位16カ国(米国除く)の詳細
# The book (p.266, 9th line from the top) has by_nat.index(1:17]; by_nat_sz
# is presumably what was meant.
COL_NUM = 4
ROW_NUM = 4
# Countries ordered by total number of winners, largest first.
by_nat_sz = df.groupby('country').size().sort_values(ascending=False)
# plt.subplots takes (nrows, ncols); passing rows first keeps the grid
# consistent with the axes[i // COL_NUM, i % COL_NUM] indexing below.
fig, axes = plt.subplots(ROW_NUM, COL_NUM, sharex=True, sharey=True, figsize=(12, 12))
# index[0] is the top-ranked country and is skipped; the next 16 are plotted.
for i, nat in enumerate(by_nat_sz.index[1:17]):
    ax = axes[i // COL_NUM, i % COL_NUM]
    by_year_nat_sz[nat].cumsum().plot(ax=ax)
    ax.set_title(nat)
ヒートマップ
import seaborn as sns

# pd.cut builds right-closed intervals and leaves values outside the edges
# unbinned. Edges running from min() up to (but excluding) max() therefore
# dropped the first award year and every year after the last edge. Starting
# one year earlier and extending past max() puts every year in a decade bin.
bins = np.arange(df.year.min() - 1, df.year.max() + 10, 10)
decade = pd.cut(df.year, bins, precision=0)
by_year_nat_binned = df.groupby([decade, 'country']).size().unstack().fillna(0)

plt.figure(figsize=(16, 16))
# NOTE(review): the rows here are the decade bins, so this mask keeps decades
# with more than two awards rather than countries - confirm that is intended.
sns.heatmap(by_year_nat_binned[by_year_nat_binned.sum(axis=1) > 2])
<matplotlib.axes._subplots.AxesSubplot at 0x7efcdebe5048>