11.1 探索の開始
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import seaborn as sb
%matplotlib inline
# Default figure size (width, height) for the plots below.
plt.rcParams['figure.figsize'] = (8, 4)

# Load the winners data from nobel_winners_cleaned.json and print a column
# summary.  DataFrame.info() prints the summary itself and returns None,
# which is why the output ends with a "None" line.
winners = pd.read_json('nobel_winners_cleaned.json')
df = pd.DataFrame(winners)
print(df.info())
<class 'pandas.core.frame.DataFrame'>
Int64Index: 858 entries, 0 to 857
Data columns (total 12 columns):
award_age 858 non-null int64
category 858 non-null object
country 858 non-null object
date_of_birth 858 non-null object
date_of_death 559 non-null object
gender 858 non-null object
link 858 non-null object
name 858 non-null object
place_of_birth 831 non-null object
place_of_death 524 non-null object
text 858 non-null object
year 858 non-null int64
dtypes: int64(2), object(10)
memory usage: 87.1+ KB
None
date_of_birthとdate_of_deathをobject型からdatetime型に変換する
# Convert the two date columns from object dtype to datetime64.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
df['date_of_death'] = pd.to_datetime(df['date_of_death'])

# Re-print the summary to confirm the new dtypes.  info() does not take a
# column name (its first positional parameter is `verbose`), so it is
# called without arguments.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 858 entries, 0 to 857
Data columns (total 12 columns):
award_age 858 non-null int64
category 858 non-null object
country 858 non-null object
date_of_birth 858 non-null datetime64[ns]
date_of_death 559 non-null datetime64[ns]
gender 858 non-null object
link 858 non-null object
name 858 non-null object
place_of_birth 831 non-null object
place_of_death 524 non-null object
text 858 non-null object
year 858 non-null int64
dtypes: datetime64[ns](2), int64(2), object(8)
memory usage: 87.1+ KB
11.2 pandasを使ったプロット
# Count winners per gender, print the counts and the type of the result,
# then draw the counts as a bar chart.
by_gender = df.groupby('gender')
gender_counts = by_gender.size()
print(gender_counts)
print(type(gender_counts))
gender_counts.plot(kind='bar')
gender
female 47
male 811
dtype: int64
[f:id:bitop:20171008101748p:plain]
11.3 男女間の格差
# Group by (category, gender) and pull out a single group -- the female
# Physics winners -- printing its type and then showing name and year only.
by_cat_gen = df.groupby(['category', 'gender'])
physics_women = by_cat_gen.get_group(('Physics', 'female'))
print(type(physics_women))
physics_women[['name', 'year']]
<class 'pandas.core.frame.DataFrame'>
|
name |
year |
267 |
Maria Goeppert-Mayer |
1963 |
614 |
Marie Skłodowska-Curie |
1903 |
# Winner counts per (category, gender): printed, then drawn as horizontal
# bars and, after that figure has been shown, again as vertical bars.
cat_gen_counts = by_cat_gen.size()
print(cat_gen_counts)
cat_gen_counts.plot(kind='barh')
plt.show()
cat_gen_counts.plot(kind='bar')
category gender
Chemistry female 4
male 167
Economics female 1
male 74
Literature female 13
male 93
Peace female 16
male 87
Physics female 2
male 199
Physiology or Medicine female 11
male 191
dtype: int64
[f:id:bitop:20171008101837p:plain]
[f:id:bitop:20171008101816p:plain]
<matplotlib.axes._subplots.AxesSubplot at 0x7efce45467f0>
11.3.1 グループのアンスタック
# Move the gender level into columns so every category gets its own
# female/male pair of horizontal bars.
cat_gen_table = by_cat_gen.size().unstack()
cat_gen_table.plot(kind='barh')
<matplotlib.axes._subplots.AxesSubplot at 0x7efce1f9bcf8>
性別グループの並び替えと合計
# Category x gender count table, shown together with its type.
cat_gen_sz = by_cat_gen.size().unstack()
print(cat_gen_sz, "\n", type(cat_gen_sz))

# Add a per-category total, order the rows by the number of female
# winners (ascending) and plot the three columns side by side.
cat_gen_sz = cat_gen_sz.assign(total=cat_gen_sz.sum(axis=1))
cat_gen_sz = cat_gen_sz.sort_values(by='female')
plot_columns = ['female', 'total', 'male']
cat_gen_sz[plot_columns].plot(kind='barh')
gender female male
category
Chemistry 4 167
Economics 1 74
Literature 13 93
Peace 16 87
Physics 2 199
Physiology or Medicine 11 191
<class 'pandas.core.frame.DataFrame'>
<matplotlib.axes._subplots.AxesSubplot at 0x7efce1e67588>
11.3.2 歴史的傾向
# Winners per year and gender, one column per gender, as a wide bar chart.
by_year_gender = df.groupby(['year', 'gender'])
year_gen_sz = by_year_gender.size().unstack(level='gender')
year_gen_sz.plot(kind='bar', figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce1e77278>
x軸ラベルの削減
def thin_xticks(ax, tick_gap=10, rotation=45):
    """Keep only every ``tick_gap``-th x tick of ``ax`` and rotate the labels.

    Args:
        ax: Axes whose x axis already carries ticks and text labels
            (for example the return value of a pandas bar plot).
        tick_gap: Step between the ticks that are kept.
        rotation: Rotation passed on for the kept tick labels.
    """
    ticks = ax.xaxis.get_ticklocs()
    ticklabels = [label.get_text() for label in ax.xaxis.get_ticklabels()]
    ax.xaxis.set_ticks(ticks[::tick_gap])
    ax.xaxis.set_ticklabels(ticklabels[::tick_gap], rotation=rotation)
    # Request a redraw rather than calling figure.show(): show() needs a GUI
    # backend and only raises a UserWarning under a non-GUI backend.
    ax.figure.canvas.draw_idle()
# Reindex on the full 1901-2014 range so that years missing from the
# grouped data still get a row (filled with NaN) and a slot on the x axis.
new_index = pd.Index(np.arange(1901, 2015), name='year')
by_year_gender = df.groupby(['year', 'gender'])
year_gen_sz = by_year_gender.size().unstack().reindex(new_index)
# Plot once and thin the labels of that same axes; a second plot call
# would only add a duplicate figure whose labels are not thinned.
thin_xticks(year_gen_sz.plot(kind='bar', figsize=(16, 4)))
/home/beetle/anaconda3/lib/python3.6/site-packages/matplotlib/figure.py:403: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
"matplotlib is currently using a non-GUI backend, "
上下に並べた年ごとの性別での受賞者数
# Yearly winner counts per gender on the full 1901-2014 index.
new_index = pd.Index(np.arange(1901, 2015), name='year')
by_year_gender = df.groupby(['year', 'gender'])
year_gen_sz = by_year_gender.size().unstack().reindex(new_index)

# Two stacked axes sharing both scales: female winners on top, male below.
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True)
ax_f, ax_m = axes
fig.suptitle('Nobel Prize-winners by gender', fontsize=16)

years = year_gen_sz.index
ax_f.bar(years, year_gen_sz['female'])
ax_f.set_ylabel('Female winner')
ax_m.bar(years, year_gen_sz['male'])
ax_m.set_ylabel('male winner')
<matplotlib.text.Text at 0x7efce0a5edd8>
11.4 国の傾向
# Winners per country as a bar chart, largest first.
country_counts = df.groupby('country').size()
country_counts.sort_values(ascending=False).plot(kind='bar', figsize=(12, 4))
# Number of country groups in the data.
print(len(df.groupby('country')))
56
ノーベル賞データ可視化のための国データの取得
MongoDBがうまく動かないので、winning_country_data.jsonファイルから直接DataFrame化する
# Read the country data directly from winning_country_data.json
# (read_json already returns a DataFrame).
df_countries = pd.read_json('winning_country_data.json')
print(df_countries.info())
print(df_countries['Argentina'])
# As loaded, countries are the columns and the attributes are the rows;
# transpose so that each country becomes one row.
df_countries = df_countries.T
print(df_countries.info())
# First row by position.  .ix is deprecated (and gone from pandas 1.0 on);
# .iloc is the positional indexer.
print(df_countries.iloc[0])
<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, alpha3Code to population
Data columns (total 57 columns):
Argentina 7 non-null object
Australia 7 non-null object
Austria 7 non-null object
Azerbaijan 7 non-null object
Bangladesh 7 non-null object
Belgium 7 non-null object
Canada 7 non-null object
Chile 7 non-null object
China 7 non-null object
Colombia 7 non-null object
Costa Rica 7 non-null object
Cyprus 6 non-null object
Czech Republic 7 non-null object
Denmark 7 non-null object
East Timor 7 non-null object
Egypt 7 non-null object
Finland 7 non-null object
France 7 non-null object
Germany 7 non-null object
Ghana 7 non-null object
Greece 7 non-null object
Guatemala 7 non-null object
Hungary 7 non-null object
Iceland 6 non-null object
India 7 non-null object
Iran 7 non-null object
Ireland 7 non-null object
Israel 7 non-null object
Italy 7 non-null object
Japan 7 non-null object
Kenya 7 non-null object
Korea, South 7 non-null object
Liberia 7 non-null object
Macedonia 7 non-null object
Mexico 7 non-null object
Myanmar (Burma) 6 non-null object
Netherlands 7 non-null object
Nigeria 7 non-null object
Norway 7 non-null object
Pakistan 7 non-null object
Palestinian Territory 6 non-null object
Poland 7 non-null object
Portugal 7 non-null object
Russia 7 non-null object
Saint Lucia 7 non-null object
South Africa 7 non-null object
Spain 7 non-null object
Sweden 7 non-null object
Switzerland 7 non-null object
Taiwan 6 non-null object
Turkey 7 non-null object
United Kingdom 7 non-null object
United States 7 non-null object
Venezuela 7 non-null object
Vietnam 7 non-null object
Yemen 7 non-null object
Yugoslavia 7 non-null object
dtypes: object(57)
memory usage: 3.2+ KB
None
alpha3Code ARG
area 2.7804e+06
capital Buenos Aires
gini 44.5
latlng [-34.0, -64.0]
name Argentina
population 42669500
Name: Argentina, dtype: object
<class 'pandas.core.frame.DataFrame'>
Index: 57 entries, Argentina to Yugoslavia
Data columns (total 7 columns):
alpha3Code 57 non-null object
area 56 non-null object
capital 57 non-null object
gini 53 non-null object
latlng 57 non-null object
name 57 non-null object
population 57 non-null object
dtypes: object(7)
memory usage: 6.1+ KB
None
alpha3Code ARG
area 2.7804e+06
capital Buenos Aires
gini 44.5
latlng [-34.0, -64.0]
name Argentina
population 42669500
Name: Argentina, dtype: object
/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:7: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
import sys
1人当たりの国別のノーベル受賞者数
# Winner count per country, attached to the country table (the assignment
# aligns on the country-name index).
nat_group = df.groupby('country')
ngsz = nat_group.size()
df_countries['nobel_wins'] = ngsz

# Wins divided by population, plotted from the highest ratio down.
df_countries['nobel_wins_per_capita'] = df_countries['nobel_wins'] / df_countries['population']
ranked = df_countries.sort_values(by='nobel_wins_per_capita', ascending=False)
ranked['nobel_wins_per_capita'].plot(kind='bar', figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce14de908>
ノーベル賞3個以上受賞している国限定
# Same ranking, restricted to countries with more than two wins.
multi_winners = df_countries[df_countries['nobel_wins'] > 2]
ranked_multi = multi_winners.sort_values(by='nobel_wins_per_capita', ascending=False)
ranked_multi['nobel_wins_per_capita'].plot(kind='bar', figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce1033f98>
11.4.1 分野別の受賞数
# Country x category table of winner counts; combinations without any
# winner appear as NaN after unstacking.
counts_by_nat_cat = df.groupby(['country', 'category']).size()
nat_cat_sz = counts_by_nat_cat.unstack()
print(nat_cat_sz)
category Chemistry Economics Literature Peace Physics \
country
Argentina 1.0 NaN NaN 2.0 NaN
Australia NaN 1.0 1.0 NaN 1.0
Austria 3.0 1.0 1.0 2.0 4.0
Azerbaijan NaN NaN NaN NaN 1.0
Bangladesh NaN NaN NaN 1.0 NaN
Belgium 1.0 NaN 1.0 3.0 1.0
Canada 4.0 1.0 1.0 1.0 2.0
Chile NaN NaN 2.0 NaN NaN
China NaN NaN 1.0 2.0 2.0
Colombia NaN NaN 1.0 NaN NaN
Costa Rica NaN NaN NaN 1.0 NaN
Cyprus NaN 1.0 NaN NaN NaN
Czech Republic 1.0 NaN 1.0 NaN NaN
Denmark 1.0 NaN 3.0 1.0 3.0
East Timor NaN NaN NaN 2.0 NaN
Egypt 1.0 NaN 1.0 2.0 NaN
Finland NaN NaN NaN 1.0 NaN
France 8.0 2.0 16.0 9.0 12.0
Germany 28.0 1.0 8.0 4.0 23.0
Ghana NaN NaN NaN 1.0 NaN
Greece NaN NaN 2.0 NaN NaN
Guatemala NaN NaN 1.0 1.0 NaN
Hungary 1.0 NaN 1.0 NaN NaN
Iceland NaN NaN 1.0 NaN NaN
India NaN NaN 1.0 2.0 1.0
Iran NaN NaN NaN 1.0 NaN
Ireland NaN NaN 2.0 3.0 1.0
Israel 5.0 1.0 1.0 3.0 NaN
Italy 1.0 NaN 6.0 1.0 4.0
Japan 5.0 NaN 2.0 1.0 8.0
Kenya NaN NaN NaN 1.0 NaN
Korea, South NaN NaN NaN 1.0 NaN
Liberia NaN NaN NaN 2.0 NaN
Mexico NaN NaN 1.0 1.0 NaN
Myanmar (Burma) NaN NaN NaN 1.0 NaN
Netherlands 3.0 2.0 NaN 1.0 9.0
Nigeria NaN NaN 1.0 NaN NaN
Norway 1.0 3.0 3.0 2.0 NaN
Pakistan NaN NaN NaN 1.0 1.0
Palestinian Territory NaN NaN NaN 1.0 NaN
Poland NaN NaN 3.0 1.0 1.0
Portugal NaN NaN 1.0 NaN NaN
Russia 1.0 1.0 3.0 2.0 9.0
Saint Lucia NaN NaN 1.0 NaN NaN
South Africa NaN NaN 2.0 4.0 NaN
Spain NaN NaN 5.0 NaN NaN
Sweden 4.0 2.0 8.0 5.0 4.0
Switzerland 6.0 NaN 2.0 3.0 3.0
Taiwan 1.0 NaN NaN NaN NaN
Turkey NaN NaN 1.0 NaN NaN
United Kingdom 26.0 6.0 9.0 10.0 22.0
United States 69.0 53.0 11.0 21.0 89.0
Venezuela NaN NaN NaN NaN NaN
Vietnam NaN NaN NaN 1.0 NaN
Yemen NaN NaN NaN 1.0 NaN
Yugoslavia NaN NaN 1.0 NaN NaN
category Physiology or Medicine
country
Argentina 2.0
Australia 6.0
Austria 4.0
Azerbaijan NaN
Bangladesh NaN
Belgium 4.0
Canada 2.0
Chile NaN
China NaN
Colombia NaN
Costa Rica NaN
Cyprus NaN
Czech Republic NaN
Denmark 5.0
East Timor NaN
Egypt NaN
Finland NaN
France 12.0
Germany 16.0
Ghana NaN
Greece NaN
Guatemala NaN
Hungary 1.0
Iceland NaN
India NaN
Iran NaN
Ireland NaN
Israel NaN
Italy 1.0
Japan 2.0
Kenya NaN
Korea, South NaN
Liberia NaN
Mexico NaN
Myanmar (Burma) NaN
Netherlands 2.0
Nigeria NaN
Norway 2.0
Pakistan NaN
Palestinian Territory NaN
Poland NaN
Portugal 1.0
Russia 2.0
Saint Lucia NaN
South Africa 1.0
Spain 1.0
Sweden 6.0
Switzerland 9.0
Taiwan NaN
Turkey NaN
United Kingdom 27.0
United States 95.0
Venezuela 1.0
Vietnam NaN
Yemen NaN
Yugoslavia NaN
# One panel per column of nat_cat_sz (i.e. per prize category), each
# showing the ten countries with the highest count in that category.
COL_NUM = 2
ROW_NUM = 3
fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize=(12, 12))
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column label, Series) pairs.
for i, (label, col) in enumerate(nat_cat_sz.items()):
    # Fill the grid row by row.
    ax = axes[i // COL_NUM, i % COL_NUM]
    col = col.sort_values(ascending=False)[:10]
    col.plot(kind='barh', ax=ax)
    ax.set_title(label)
plt.tight_layout()
11.4.3 受賞分布の歴史的傾向
# Larger font from here on.
plt.rcParams['font.size'] = 20

# Year x country count table on the full 1901-2014 index, then the
# running total of United States winners.
new_index = pd.Index(np.arange(1901, 2015), name='year')
counts_by_year_nat = df.groupby(['year', 'country']).size()
by_year_nat_sz = counts_by_year_nat.unstack().reindex(new_index)
us_running_total = by_year_nat_sz['United States'].cumsum()
us_running_total.plot(figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce0bf8780>
日本の受賞者の歴史的傾向を見てみる
# Rebuild the year x country table and plot the running total for Japan.
new_index = pd.Index(np.arange(1901, 2015), name='year')
counts_by_year_nat = df.groupby(['year', 'country']).size()
by_year_nat_sz = counts_by_year_nat.unstack().reindex(new_index)
japan_running_total = by_year_nat_sz['Japan'].cumsum()
japan_running_total.plot(figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce0010b38>
NaNを0に置換する
# Replace NaN with 0 in the United States column before accumulating.
us_counts = by_year_nat_sz['United States'].fillna(0)
us_counts.cumsum().plot(figsize=(16, 4))
<matplotlib.axes._subplots.AxesSubplot at 0x7efce1465f28>
日本も0に置換してみる
# Japan's running total twice, one panel each: first with the NaN values
# left in place, then with NaN replaced by 0.
new_index = pd.Index(np.arange(1901, 2015), name='year')
by_year_nat_sz = df.groupby(['year', 'country']).size().unstack().reindex(new_index)
fig, axes = plt.subplots(2, 1, figsize=(16, 4))
ax_raw, ax_filled = axes
japan_counts = by_year_nat_sz['Japan']
japan_counts.cumsum().plot(ax=ax_raw)
japan_counts.fillna(0).cumsum().plot(ax=ax_filled)
<matplotlib.axes._subplots.AxesSubplot at 0x7efcdf9a2e10>
生データの表示
import math as m

# Print every non-NaN yearly count for Japan, followed by their sum.
# The accumulator is called `total` so the builtin sum() is not shadowed.
total = 0
for item in by_year_nat_sz['Japan']:
    if not m.isnan(item):
        print(item)
        total += item
print('sum:', total)
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
2.0
2.0
1.0
1.0
2.0
sum: 18.0
米国を除いた他の国の推移
# Year x country count table on the full 1901-2014 index.
new_index = pd.Index(np.arange(1901, 2015), name='year')
counts_by_year_nat = df.groupby(['year', 'country']).size()
by_year_nat_sz = counts_by_year_nat.unstack().reindex(new_index)

# Every country column except the United States, summed per year into a
# single 'Not_US' column.
not_US = by_year_nat_sz.columns.tolist()
print(type(not_US))
not_US.remove('United States')
by_year_nat_sz['Not_US'] = by_year_nat_sz[not_US].sum(axis=1)

# Running totals of the two columns, with NaN counted as 0.
us_vs_rest = by_year_nat_sz[['United States', 'Not_US']].fillna(0)
ax = us_vs_rest.cumsum().plot(figsize=(16, 4))
<class 'list'>
地域差の詳細
# Year x country count table with NaN replaced by 0.
counts_by_year_nat = df.groupby(['year', 'country']).size()
by_year_nat_sz = counts_by_year_nat.unstack().reindex(new_index).fillna(0)

# Each region is a label plus the country columns summed into it.
regions = [
    {'label': 'N.America', 'countries': ['United States', 'Canada']},
    {'label': 'Europe', 'countries': ['United Kingdom', 'Germany', 'France']},
    {'label': 'Asia', 'countries': ['Japan', 'Russia', 'India']},
]
for region in regions:
    member_counts = by_year_nat_sz[region['countries']]
    by_year_nat_sz[region['label']] = member_counts.sum(axis=1)

# Running totals of the region columns on one chart.
region_labels = [r['label'] for r in regions]
by_year_nat_sz[region_labels].cumsum().plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7efce453e940>
受賞数上位16カ国(米国除く)の詳細
# Grid of running totals, one small panel per country.
COL_NUM = 4
ROW_NUM = 4
by_nat_sz = df.groupby('country').size()
by_nat_sz.sort_values(ascending=False, inplace=True)
# plt.subplots takes (nrows, ncols) in that order.
fig, axes = plt.subplots(ROW_NUM, COL_NUM, sharex=True, sharey=True, figsize=(12, 12))
# Position 0 of the sorted counts is skipped (presumably the United States,
# which the section heading says is excluded -- confirm); the next
# ROW_NUM * COL_NUM countries fill the grid row by row.
for i, nat in enumerate(by_nat_sz.index[1:1 + ROW_NUM * COL_NUM]):
    ax = axes[i // COL_NUM, i % COL_NUM]
    by_year_nat_sz[nat].cumsum().plot(ax=ax)
    ax.set_title(nat)
ヒートマップ
import seaborn as sns

# Ten-year bins that cover every award year.  The edges run past the last
# year and the intervals are closed on the left, so neither the first year
# nor the years after the last full decade fall outside the bins.
bins = np.arange(df.year.min(), df.year.max() + 11, 10)
year_bins = pd.cut(df.year, bins, right=False, precision=0)
by_year_nat_binned = df.groupby([year_bins, 'country']).size().unstack().fillna(0)
plt.figure(figsize=(16, 16))
# NOTE(review): after unstack() the rows are year bins and the columns are
# countries, so this mask keeps the bins with more than two winners.  If
# the intent was to keep countries with more than two wins, the columns
# would have to be filtered instead -- confirm.
sns.heatmap(by_year_nat_binned[by_year_nat_binned.sum(axis=1) > 2])
<matplotlib.axes._subplots.AxesSubplot at 0x7efcdebe5048>