「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

11.1 探索の開始

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import seaborn as sb

%matplotlib inline

# Default figure size used by every plot that does not pass its own figsize.
plt.rcParams['figure.figsize'] = (8, 4)

# The MongoDB database would not run properly, so the cleaned dataset is
# loaded straight from its JSON file. read_json already returns a DataFrame,
# so no extra pd.DataFrame(...) wrapper is needed.
df = pd.read_json('nobel_winners_cleaned.json')
# info() prints its report itself and returns None; wrapping it in print()
# would only add a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 858 entries, 0 to 857
Data columns (total 12 columns):
award_age         858 non-null int64
category          858 non-null object
country           858 non-null object
date_of_birth     858 non-null object
date_of_death     559 non-null object
gender            858 non-null object
link              858 non-null object
name              858 non-null object
place_of_birth    831 non-null object
place_of_death    524 non-null object
text              858 non-null object
year              858 non-null int64
dtypes: int64(2), object(10)
memory usage: 87.1+ KB
None

date_of_birthとdate_of_deathをobject型からdatetime型に変換する

# Convert both date columns from object dtype to datetime64.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
df['date_of_death'] = pd.to_datetime(df['date_of_death'])
# The first positional parameter of info() is `verbose`, not a column name,
# so the (misspelled) 'data_of_death' argument only acted as a truthy flag.
# Calling info() without it reports the dtypes of all columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 858 entries, 0 to 857
Data columns (total 12 columns):
award_age         858 non-null int64
category          858 non-null object
country           858 non-null object
date_of_birth     858 non-null datetime64[ns]
date_of_death     559 non-null datetime64[ns]
gender            858 non-null object
link              858 non-null object
name              858 non-null object
place_of_birth    831 non-null object
place_of_death    524 non-null object
text              858 non-null object
year              858 non-null int64
dtypes: datetime64[ns](2), int64(2), object(8)
memory usage: 87.1+ KB

11.2 pandasを使ったプロット

# Count the laureates per gender once and reuse the resulting Series.
by_gender = df.groupby('gender')
gender_counts = by_gender.size()
print(gender_counts)
print(type(gender_counts))
# size() returns a Series, so plot is called on a Series here.
gender_counts.plot.bar()

gender
female     47
male      811
dtype: int64

[f:id:bitop:20171008101748p:plain]

11.3 男女間の格差

by_cat_gen = df.groupby(['category', 'gender'])
# Pull out the rows of the (Physics, female) group.
physics_women = by_cat_gen.get_group(('Physics', 'female'))
print(type(physics_women))
# Names and award years of the women who won the Physics prize.
physics_women[['name', 'year']]

<class 'pandas.core.frame.DataFrame'>

	name	year
267	Maria Goeppert-Mayer	1963
614	Marie Skłodowska-Curie	1903

# Female laureates are concentrated in Peace, Literature and
# Physiology or Medicine.
cat_gen_counts = by_cat_gen.size()
print(cat_gen_counts)
# Horizontal bars first ...
cat_gen_counts.plot.barh()
plt.show()
# ... then the same data as vertical bars.
cat_gen_counts.plot.bar()

category                gender
Chemistry               female      4
                        male      167
Economics               female      1
                        male       74
Literature              female     13
                        male       93
Peace                   female     16
                        male       87
Physics                 female      2
                        male      199
Physiology or Medicine  female     11
                        male      191
dtype: int64

[f:id:bitop:20171008101837p:plain] [f:id:bitop:20171008101816p:plain]

<matplotlib.axes._subplots.AxesSubplot at 0x7efce45467f0>

png

11.3.1 グループのアンスタック

# Unstack the gender level into columns so each category gets one bar per gender.
cat_gen_table = by_cat_gen.size().unstack()
cat_gen_table.plot.barh()

<matplotlib.axes._subplots.AxesSubplot at 0x7efce1f9bcf8>

png

性別グループの並び替えと合計

cat_gen_sz = by_cat_gen.size().unstack()
print(cat_gen_sz,"\n",type(cat_gen_sz))
# Sum across the gender columns of each category row and store it as 'total'.
row_totals = cat_gen_sz.sum(axis=1)
cat_gen_sz['total'] = row_totals
# Order the categories by their number of female laureates, ascending.
cat_gen_sz = cat_gen_sz.sort_values(by='female')
cat_gen_sz.loc[:, ['female', 'total', 'male']].plot.barh()

gender                  female  male
category                            
Chemistry                    4   167
Economics                    1    74
Literature                  13    93
Peace                       16    87
Physics                      2   199
Physiology or Medicine      11   191 
 <class 'pandas.core.frame.DataFrame'>





<matplotlib.axes._subplots.AxesSubplot at 0x7efce1e67588>

png

11.3.2 歴史的傾向

# Laureate counts per (year, gender), with gender unstacked into columns.
by_year_gender = df.groupby(['year', 'gender'])
year_gender_counts = by_year_gender.size()
year_gen_sz = year_gender_counts.unstack()
year_gen_sz.plot.bar(figsize=(16, 4))

<matplotlib.axes._subplots.AxesSubplot at 0x7efce1e77278>

png

x軸ラベルの削減

def thin_xticks(ax,tick_gap=10,rotation=45):
    """Keep every ``tick_gap``-th x tick of ``ax`` and rotate the kept labels.

    Args:
        ax: Axes whose x axis is modified in place.
        tick_gap: Step between the ticks (and labels) that are kept.
        rotation: Rotation passed on to the kept tick labels.
    """
    # xaxis is the object that holds the tick locations and labels.
    ticks = ax.xaxis.get_ticklocs()
    ticklabels = [label.get_text() for label in ax.xaxis.get_ticklabels()]
    ax.xaxis.set_ticks(ticks[::tick_gap])
    ax.xaxis.set_ticklabels(ticklabels[::tick_gap], rotation=rotation)
    # warn=False: with a non-GUI backend (such as the notebook's inline one)
    # show() cannot open a window and would otherwise emit a UserWarning.
    ax.figure.show(warn=False)
    
# Reindex over every year from 1901 to 2014 so that years missing from the
# grouped data still get a slot on the x axis.
new_index = pd.Index(np.arange(1901,2015),name='year')
by_year_gender = df.groupby(['year','gender'])
year_gen_sz = by_year_gender.size().unstack().reindex(new_index)
# Plot once and thin the ticks of that same Axes. The earlier extra
# year_gen_sz.plot(...) call drew an identical, un-thinned chart first.
thin_xticks(year_gen_sz.plot(kind="bar",figsize=(16,4)))

/home/beetle/anaconda3/lib/python3.6/site-packages/matplotlib/figure.py:403: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
  "matplotlib is currently using a non-GUI backend, "

png

上下に並べた年ごとの性別での受賞者数

# Yearly counts per gender, reindexed over 1901-2014.
new_index = pd.Index(np.arange(1901, 2015), name='year')
by_year_gender = df.groupby(['year', 'gender'])
year_gen_sz = by_year_gender.size().unstack().reindex(new_index)

# Two vertically stacked panels that share both axes.
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True)
ax_f, ax_m = axes

fig.suptitle('Nobel Prize-winners by gender', fontsize=16)
years = year_gen_sz.index
# Upper panel: female laureates per year.
ax_f.bar(years, year_gen_sz['female'])
ax_f.set_ylabel('Female winner')
# Lower panel: male laureates per year.
ax_m.bar(years, year_gen_sz['male'])
ax_m.set_ylabel('male winner')

<matplotlib.text.Text at 0x7efce0a5edd8>

png

11.4 国の傾向

# Calling the `order` method fails because it does not exist, so
# sort_values is used; ascending=False sorts in descending order.
country_groups = df.groupby('country')
wins_by_country = df.groupby('country').size().sort_values(ascending=False)
wins_by_country.plot(kind='bar', figsize=(12, 4))
# Number of countries with at least one laureate. The author's note: 56
# countries; per Wikipedia there are 206 states in the world, so the
# remaining 150 have produced no Nobel laureate.
print(len(country_groups))

png

ノーベル賞データ可視化のための国データの取得

MongoDBがうまく動かないのでwinning_country_data.jsonファイルから直接DataFrame化する

# read_json already returns a DataFrame, so no pd.DataFrame(...) wrapper is needed.
df_countries = pd.read_json('winning_country_data.json')
# info() prints its report itself and returns None, so it is not passed to print().
df_countries.info()
print(df_countries['Argentina'])
# Rows and columns come out swapped compared with the book, so transpose
# the frame to get one row per country.
df_countries = df_countries.T
df_countries.info()
# .ix is deprecated; .iloc is the positional indexer for "the first row".
print(df_countries.iloc[0])

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, alpha3Code to population
Data columns (total 57 columns):
Argentina                7 non-null object
Australia                7 non-null object
Austria                  7 non-null object
Azerbaijan               7 non-null object
Bangladesh               7 non-null object
Belgium                  7 non-null object
Canada                   7 non-null object
Chile                    7 non-null object
China                    7 non-null object
Colombia                 7 non-null object
Costa Rica               7 non-null object
Cyprus                   6 non-null object
Czech Republic           7 non-null object
Denmark                  7 non-null object
East Timor               7 non-null object
Egypt                    7 non-null object
Finland                  7 non-null object
France                   7 non-null object
Germany                  7 non-null object
Ghana                    7 non-null object
Greece                   7 non-null object
Guatemala                7 non-null object
Hungary                  7 non-null object
Iceland                  6 non-null object
India                    7 non-null object
Iran                     7 non-null object
Ireland                  7 non-null object
Israel                   7 non-null object
Italy                    7 non-null object
Japan                    7 non-null object
Kenya                    7 non-null object
Korea, South             7 non-null object
Liberia                  7 non-null object
Macedonia                7 non-null object
Mexico                   7 non-null object
Myanmar (Burma)          6 non-null object
Netherlands              7 non-null object
Nigeria                  7 non-null object
Norway                   7 non-null object
Pakistan                 7 non-null object
Palestinian Territory    6 non-null object
Poland                   7 non-null object
Portugal                 7 non-null object
Russia                   7 non-null object
Saint Lucia              7 non-null object
South Africa             7 non-null object
Spain                    7 non-null object
Sweden                   7 non-null object
Switzerland              7 non-null object
Taiwan                   6 non-null object
Turkey                   7 non-null object
United Kingdom           7 non-null object
United States            7 non-null object
Venezuela                7 non-null object
Vietnam                  7 non-null object
Yemen                    7 non-null object
Yugoslavia               7 non-null object
dtypes: object(57)
memory usage: 3.2+ KB
None
alpha3Code               ARG
area              2.7804e+06
capital         Buenos Aires
gini                    44.5
latlng        [-34.0, -64.0]
name               Argentina
population          42669500
Name: Argentina, dtype: object
<class 'pandas.core.frame.DataFrame'>
Index: 57 entries, Argentina to Yugoslavia
Data columns (total 7 columns):
alpha3Code    57 non-null object
area          56 non-null object
capital       57 non-null object
gini          53 non-null object
latlng        57 non-null object
name          57 non-null object
population    57 non-null object
dtypes: object(7)
memory usage: 6.1+ KB
None
alpha3Code               ARG
area              2.7804e+06
capital         Buenos Aires
gini                    44.5
latlng        [-34.0, -64.0]
name               Argentina
population          42669500
Name: Argentina, dtype: object


/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:7: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  import sys

1人当たりの国別のノーベル受賞者数

# The book's listing writes df.countries on its 9th line, which raises an
# error; the frame is named df_countries.
nat_group = df.groupby('country')
# Number of laureates per country.
ngsz = nat_group.size()
# The transposed frame is already indexed by country name, so the book's
# set_index('name') step is not applied here.
df_countries['nobel_wins'] = ngsz
df_countries['nobel_wins_per_capita'] = (
    df_countries['nobel_wins'] / df_countries['population']
)
# Rank the countries by wins per capita, highest first, and plot that column.
ranked_countries = df_countries.sort_values(by='nobel_wins_per_capita', ascending=False)
ranked_countries['nobel_wins_per_capita'].plot(kind='bar', figsize=(16, 4))

<matplotlib.axes._subplots.AxesSubplot at 0x7efce14de908>

png

ノーベル賞３個以上受賞している国限定

# Restrict the per-capita ranking to countries with more than two wins.
multi_winners = df_countries[df_countries['nobel_wins'] > 2]
ranked_multi_winners = multi_winners.sort_values(by='nobel_wins_per_capita', ascending=False)
ranked_multi_winners['nobel_wins_per_capita'].plot(kind='bar', figsize=(16, 4))

<matplotlib.axes._subplots.AxesSubplot at 0x7efce1033f98>

png

11.4.1 分野別の受賞数

# Laureate counts per (country, category); unstack turns the categories into columns.
country_category_counts = df.groupby(['country', 'category']).size()
nat_cat_sz = country_category_counts.unstack()
print(nat_cat_sz)

category               Chemistry  Economics  Literature  Peace  Physics  \
country                                                                   
Argentina                    1.0        NaN         NaN    2.0      NaN   
Australia                    NaN        1.0         1.0    NaN      1.0   
Austria                      3.0        1.0         1.0    2.0      4.0   
Azerbaijan                   NaN        NaN         NaN    NaN      1.0   
Bangladesh                   NaN        NaN         NaN    1.0      NaN   
Belgium                      1.0        NaN         1.0    3.0      1.0   
Canada                       4.0        1.0         1.0    1.0      2.0   
Chile                        NaN        NaN         2.0    NaN      NaN   
China                        NaN        NaN         1.0    2.0      2.0   
Colombia                     NaN        NaN         1.0    NaN      NaN   
Costa Rica                   NaN        NaN         NaN    1.0      NaN   
Cyprus                       NaN        1.0         NaN    NaN      NaN   
Czech Republic               1.0        NaN         1.0    NaN      NaN   
Denmark                      1.0        NaN         3.0    1.0      3.0   
East Timor                   NaN        NaN         NaN    2.0      NaN   
Egypt                        1.0        NaN         1.0    2.0      NaN   
Finland                      NaN        NaN         NaN    1.0      NaN   
France                       8.0        2.0        16.0    9.0     12.0   
Germany                     28.0        1.0         8.0    4.0     23.0   
Ghana                        NaN        NaN         NaN    1.0      NaN   
Greece                       NaN        NaN         2.0    NaN      NaN   
Guatemala                    NaN        NaN         1.0    1.0      NaN   
Hungary                      1.0        NaN         1.0    NaN      NaN   
Iceland                      NaN        NaN         1.0    NaN      NaN   
India                        NaN        NaN         1.0    2.0      1.0   
Iran                         NaN        NaN         NaN    1.0      NaN   
Ireland                      NaN        NaN         2.0    3.0      1.0   
Israel                       5.0        1.0         1.0    3.0      NaN   
Italy                        1.0        NaN         6.0    1.0      4.0   
Japan                        5.0        NaN         2.0    1.0      8.0   
Kenya                        NaN        NaN         NaN    1.0      NaN   
Korea, South                 NaN        NaN         NaN    1.0      NaN   
Liberia                      NaN        NaN         NaN    2.0      NaN   
Mexico                       NaN        NaN         1.0    1.0      NaN   
Myanmar (Burma)              NaN        NaN         NaN    1.0      NaN   
Netherlands                  3.0        2.0         NaN    1.0      9.0   
Nigeria                      NaN        NaN         1.0    NaN      NaN   
Norway                       1.0        3.0         3.0    2.0      NaN   
Pakistan                     NaN        NaN         NaN    1.0      1.0   
Palestinian Territory        NaN        NaN         NaN    1.0      NaN   
Poland                       NaN        NaN         3.0    1.0      1.0   
Portugal                     NaN        NaN         1.0    NaN      NaN   
Russia                       1.0        1.0         3.0    2.0      9.0   
Saint Lucia                  NaN        NaN         1.0    NaN      NaN   
South Africa                 NaN        NaN         2.0    4.0      NaN   
Spain                        NaN        NaN         5.0    NaN      NaN   
Sweden                       4.0        2.0         8.0    5.0      4.0   
Switzerland                  6.0        NaN         2.0    3.0      3.0   
Taiwan                       1.0        NaN         NaN    NaN      NaN   
Turkey                       NaN        NaN         1.0    NaN      NaN   
United Kingdom              26.0        6.0         9.0   10.0     22.0   
United States               69.0       53.0        11.0   21.0     89.0   
Venezuela                    NaN        NaN         NaN    NaN      NaN   
Vietnam                      NaN        NaN         NaN    1.0      NaN   
Yemen                        NaN        NaN         NaN    1.0      NaN   
Yugoslavia                   NaN        NaN         1.0    NaN      NaN   

category               Physiology or Medicine  
country                                        
Argentina                                 2.0  
Australia                                 6.0  
Austria                                   4.0  
Azerbaijan                                NaN  
Bangladesh                                NaN  
Belgium                                   4.0  
Canada                                    2.0  
Chile                                     NaN  
China                                     NaN  
Colombia                                  NaN  
Costa Rica                                NaN  
Cyprus                                    NaN  
Czech Republic                            NaN  
Denmark                                   5.0  
East Timor                                NaN  
Egypt                                     NaN  
Finland                                   NaN  
France                                   12.0  
Germany                                  16.0  
Ghana                                     NaN  
Greece                                    NaN  
Guatemala                                 NaN  
Hungary                                   1.0  
Iceland                                   NaN  
India                                     NaN  
Iran                                      NaN  
Ireland                                   NaN  
Israel                                    NaN  
Italy                                     1.0  
Japan                                     2.0  
Kenya                                     NaN  
Korea, South                              NaN  
Liberia                                   NaN  
Mexico                                    NaN  
Myanmar (Burma)                           NaN  
Netherlands                               2.0  
Nigeria                                   NaN  
Norway                                    2.0  
Pakistan                                  NaN  
Palestinian Territory                     NaN  
Poland                                    NaN  
Portugal                                  1.0  
Russia                                    2.0  
Saint Lucia                               NaN  
South Africa                              1.0  
Spain                                     1.0  
Sweden                                    6.0  
Switzerland                               9.0  
Taiwan                                    NaN  
Turkey                                    NaN  
United Kingdom                           27.0  
United States                            95.0  
Venezuela                                 1.0  
Vietnam                                   NaN  
Yemen                                     NaN  
Yugoslavia                                NaN

# sort_values is used because the `order` method from the book does not exist.
COL_NUM = 2
ROW_NUM = 3
fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize=(12, 12))
# axes.flat walks the grid row by row, so no integer division is needed to
# turn a running index into (row, column). items() replaces iteritems(),
# which newer pandas versions no longer provide.
for ax, (label, col) in zip(axes.flat, nat_cat_sz.items()):
    # Ten countries with the most wins in this category.
    top_ten = col.sort_values(ascending=False)[:10]
    top_ten.plot(kind='barh', ax=ax)
    ax.set_title(label)
# Lay the figure out once, after all panels have been drawn.
plt.tight_layout()

png

11.4.3 受賞分布の歴史的傾向

# Vocabulary note: the Japanese word for "nation" can also be translated as
# state, country, homeland, sovereign state or kingdom.
plt.rcParams['font.size'] = 20
new_index = pd.Index(np.arange(1901, 2015), name='year')
# Laureate counts per (year, country), one column per country, reindexed
# over 1901-2014.
year_country_counts = df.groupby(['year', 'country']).size()
by_year_nat_sz = year_country_counts.unstack().reindex(new_index)
us_running_total = by_year_nat_sz['United States'].cumsum()
us_running_total.plot(figsize=(16, 4))

<matplotlib.axes._subplots.AxesSubplot at 0x7efce0bf8780>

png

日本の受賞者の歴史的傾向を見てみる

new_index = pd.Index(np.arange(1901, 2015), name='year')
year_country_counts = df.groupby(['year', 'country']).size()
by_year_nat_sz = year_country_counts.unstack().reindex(new_index)
# Same plot as for the United States, with the column key changed to Japan.
japan_running_total = by_year_nat_sz['Japan'].cumsum()
japan_running_total.plot(figsize=(16, 4))

<matplotlib.axes._subplots.AxesSubplot at 0x7efce0010b38>

png

NaNを0に置換する

# fillna replaces the missing values with the constant passed as argument.
us_yearly = by_year_nat_sz['United States'].fillna(0)
us_yearly.cumsum().plot(figsize=(16, 4))

<matplotlib.axes._subplots.AxesSubplot at 0x7efce1465f28>

png

日本も0に置換してみる

new_index = pd.Index(np.arange(1901, 2015), name='year')
year_country_counts = df.groupby(['year', 'country']).size()
by_year_nat_sz = year_country_counts.unstack().reindex(new_index)

fig, axes = plt.subplots(2, 1, figsize=(16, 4))
japan_yearly = by_year_nat_sz['Japan']
# The upper panel seems to draw only fragments of the line. Without fillna,
# cumsum leaves NaN in every year that has no entry, and a line is only
# drawn between neighbouring non-NaN points.
japan_yearly.cumsum().plot(ax=axes[0])
# The lower panel fills the missing years with 0 before accumulating.
japan_yearly.fillna(0).cumsum().plot(ax=axes[1])

<matplotlib.axes._subplots.AxesSubplot at 0x7efcdf9a2e10>

png

生データの表示

import math as m

# Print every non-NaN yearly count for Japan and add them up. The running
# total is called `total` so that the builtin sum() is not shadowed.
total = 0
for count in by_year_nat_sz['Japan']:
    if m.isnan(count):
        # Years without a Japanese laureate are NaN after the reindex.
        continue
    print(count)
    total += count
print('sum:', total)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
2.0
2.0
1.0
1.0
2.0
sum: 18.0

米国を除いた他の国の推移

# World War II ended in 1945.
new_index = pd.Index(np.arange(1901, 2015), name='year')
year_country_counts = df.groupby(['year', 'country']).size()
by_year_nat_sz = year_country_counts.unstack().reindex(new_index)

# Every country column except the United States.
not_US = list(by_year_nat_sz.columns)
print(type(not_US))
not_US.remove('United States')
# Yearly total of all non-US countries.
rest_of_world = by_year_nat_sz[not_US].sum(axis=1)
by_year_nat_sz['Not_US'] = rest_of_world
us_vs_rest = by_year_nat_sz[['United States', 'Not_US']].fillna(0)
ax = us_vs_rest.cumsum().plot(figsize=(16, 4))

<class 'list'>

png

地域差の詳細

year_country_counts = df.groupby(['year', 'country']).size()
by_year_nat_sz = year_country_counts.unstack().reindex(new_index).fillna(0)
# Author's note: is it right to count Russia and India as Asia? According to
# Wikipedia it is acceptable; "Asia" apparently covers every country of the
# Eurasian continent outside Europe.
regions = [
    {'label': 'N.America', 'countries': ['United States', 'Canada']},
    {'label': 'Europe', 'countries': ['United Kingdom', 'Germany', 'France']},
    {'label': 'Asia', 'countries': ['Japan', 'Russia', 'India']},
]
# Add one column per region holding the yearly sum of its member countries.
for region in regions:
    label, members = region['label'], region['countries']
    by_year_nat_sz[label] = by_year_nat_sz[members].sum(axis=1)
region_labels = [region['label'] for region in regions]
by_year_nat_sz[region_labels].cumsum().plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7efce453e940>

png

受賞数上位16カ国（米国除く）の詳細

# The book (p.266, 9th line from the top) writes by_nat.index(1:17]; the
# Series is named by_nat_sz here.
COL_NUM = 4
ROW_NUM = 4
# Countries ordered by their number of laureates, highest first.
by_nat_sz = df.groupby('country').size().sort_values(ascending=False)
# subplots takes (nrows, ncols) in that order.
fig, axes = plt.subplots(ROW_NUM, COL_NUM, sharex=True, sharey=True, figsize=(12, 12))
# index[1:17] skips the top entry (assumed to be the United States, which the
# section heading excludes) and takes the next 16 countries; axes.flat walks
# the grid row by row.
for ax, nat in zip(axes.flat, by_nat_sz.index[1:17]):
    by_year_nat_sz[nat].cumsum().plot(ax=ax)
    ax.set_title(nat)

png

ヒートマップ

import seaborn as sns

# Ten-year bin edges starting at the first award year. The upper bound is
# extended by one step so the last edge is not below the final year, and
# include_lowest=True keeps the first year itself; otherwise both ends fall
# outside every bin and are silently dropped by the groupby.
bins = np.arange(df.year.min(), df.year.max() + 10, 10)
decades = pd.cut(df.year, bins, precision=0, include_lowest=True)
by_year_nat_binned = df.groupby([decades, 'country']).size().unstack().fillna(0)
plt.figure(figsize=(16,16))
# Rows are decades and columns are countries, so the "more than two prizes"
# filter has to sum down each country column (axis=0) and select columns.
frequent_winners = by_year_nat_binned.sum(axis=0) > 2
sns.heatmap(by_year_nat_binned.loc[:, frequent_winners])

<matplotlib.axes._subplots.AxesSubplot at 0x7efcdebe5048>

png