「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

import pandas as pd
import numpy as np
np.random.seed(0)

9.2データの調査

def reload_data(name='nobel_winners_dirty.json'):
    """Load a JSON dataset from the ``data/`` directory into a DataFrame.

    Args:
        name: File name of the JSON file, resolved relative to ``data/``
            under the current working directory.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_json``.

    Raises:
        FileNotFoundError: If ``data/<name>`` does not exist.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the bare ``open()`` call previously leaked it). The encoding is pinned
    # to UTF-8, the standard JSON encoding, so non-ASCII text does not depend
    # on the locale default.
    with open('data/' + name, encoding='utf-8') as f:
        return pd.read_json(f)
df = reload_data()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1052 entries, 0 to 1051
Data columns (total 12 columns):
born_in           1052 non-null object
category          1052 non-null object
country           1052 non-null object
date_of_birth     1044 non-null object
date_of_death     1044 non-null object
gender            1040 non-null object
link              1052 non-null object
name              1052 non-null object
place_of_birth    1044 non-null object
place_of_death    1044 non-null object
text              1052 non-null object
year              1052 non-null int64
dtypes: int64(1), object(11)
memory usage: 106.8+ KB
df.describe()

describe()は要約統計量を返す

year
count 1052.000000
mean 1968.729087
std 33.155829
min 1809.000000
25% 1947.000000
50% 1975.000000
75% 1996.000000
max 2014.000000
df.describe(include=['object'])
born_in category country date_of_birth date_of_death gender link name place_of_birth place_of_death text
count 1052 1052 1052 1044 1044 1040 1052 1052 1044 1044 1052
unique 40 7 59 853 563 2 893 998 735 410 1043
top Physiology or Medicine United States 7 November 1867 male http://en.wikipedia.org/wiki/Michael_Levitt Felix Bloch Henry Dunant , Peace, 1901
freq 910 250 350 4 362 982 4 2 29 409 2
df.head()
born_in category country date_of_birth date_of_death gender link name place_of_birth place_of_death text year
0 Physiology or Medicine Argentina 8 October 1927 24 March 2002 male http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein César Milstein Bahía Blanca , Argentina Cambridge , England César Milstein , Physiology or Medicine, 1984 1984
1 Bosnia and Herzegovina Literature 9 October 1892 13 March 1975 male http://en.wikipedia.org/wiki/Ivo_Andric Ivo Andric * Dolac (village near Travnik), Austria-Hungary ... Belgrade, SR Serbia, SFR Yugoslavia (present-d... Ivo Andric *, born in then Austria–Hungary ,... 1961
2 Bosnia and Herzegovina Chemistry July 23, 1906 1998-01-07 male http://en.wikipedia.org/wiki/Vladimir_Prelog Vladimir Prelog * Sarajevo , Bosnia and Herzegovina , then part... Zürich , Switzerland Vladimir Prelog *, born in then Austria–Hung... 1975
3 Peace Belgium None None None http://en.wikipedia.org/wiki/Institut_de_Droit... Institut de Droit International None None Institut de Droit International , Peace, 1904 1904
4 Peace Belgium 26 July 1829 6 October 1912 male http://en.wikipedia.org/wiki/Auguste_Marie_Fra... Auguste Beernaert Ostend , Netherlands (now Belgium ) Lucerne , Switzerland Auguste Beernaert , Peace, 1909 1909

「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

8.7 パネル

df1 = pd.DataFrame({'foo':[1,2,3],'bar':['a','b','c']})
df2 = pd.DataFrame({'baz':[7,8,9,11],'qux':['p','q','r','t']})
print(df1);print(df2)
  bar  foo
0   a    1
1   b    2
2   c    3
   baz qux
0    7   p
1    8   q
2    9   r
3   11   t
pn = pd.Panel({'item1':df1,'item2':df2})
print(pn)
print(pn['item1'])
print(" ")
print(pn['item2'])
<class 'pandas.core.panel.Panel'>
Dimensions: 2 (items) x 4 (major_axis) x 4 (minor_axis)
Items axis: item1 to item2
Major_axis axis: 0 to 3
Minor_axis axis: bar to qux
   bar  baz  foo  qux
0    a  NaN    1  NaN
1    b  NaN    2  NaN
2    c  NaN    3  NaN
3  NaN  NaN  NaN  NaN

   bar baz  foo qux
0  NaN   7  NaN   p
1  NaN   8  NaN   q
2  NaN   9  NaN   r
3  NaN  11  NaN   t

「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

8.6 SeriesからDataFrameを作成する

s = pd.Series([1,2,3,4])
print(s)
0    1
1    2
2    3
3    4
dtype: int64
s = pd.Series([1,2,3,4],index=['a','b','c','d'])
print(s)
a    1
b    2
c    3
d    4
dtype: int64
s = pd.Series({'a':1,'b':2,'c':3,'d':4})
print(s)
a    1
b    2
c    3
d    4
dtype: int64
s = pd.Series({'a':1,'b':2},index=['a','b','c'])
print(s)
a    1.0
b    2.0
c    NaN
dtype: float64
s = pd.Series({'a':1,'b':2,'c':3},index=['a','b'])
print(s)
a    1
b    2
dtype: int64
s = pd.Series(9,{'a','b','c'})
print(s)
b    9
a    9
c    9
dtype: int64
s = pd.Series([1,2,3,4],['a','b','c','d'])
print(np.sqrt(s))
a    1.000000
b    1.414214
c    1.732051
d    2.000000
dtype: float64
print(s[1:3])
b    2
c    3
dtype: int64
s = pd.Series([1,2.1,'foo']) + pd.Series([2,3,'bar'])
print(s)
0         3
1       5.1
2    foobar
dtype: object
names = pd.Series(['Albert Einstein','Marie Curie'],name='name')
categorys = pd.Series(['Physics','Chemistry'],name='category')
df = pd.concat([names,categorys],axis=1)
print(df.head())
#2人しか追加していないのに、本の表では3人表示されている(William Faulknerが余分)
              name   category
0  Albert Einstein    Physics
1      Marie Curie  Chemistry

「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

8.5 DataFrameの作成と保存

df = pd.DataFrame({
    'name':['Albert Einstein','Narie Curie','William Faulkner'],
    'category':['Physics','Chemistry','Literature']
})
print(df.head())
     category              name
0     Physics   Albert Einstein
1   Chemistry       Narie Curie
2  Literature  William Faulkner
df = pd.DataFrame.from_dict({
    'name':['Albert Einstein','Narie Curie','William Faulkner'],
    'category':['Physics','Chemistry','Literature']
})
print(df.head())
     category              name
0     Physics   Albert Einstein
1   Chemistry       Narie Curie
2  Literature  William Faulkner
8.5.2 CSV
#data.csvの中身
#name,category
#'Albert Einstein',Physics
#'Marie Curie',Chemistry

df = pd.read_csv('data.csv')
print(df)
                name   category
0  'Albert Einstein'    Physics
1      'Marie Curie'  Chemistry
from io import StringIO
data = "`Albert Einstein` | Physics \n`Marie Curie` | Chemistry"
df = pd.read_csv(StringIO(data),sep='|',names=['name','category'],skipinitialspace=True,quotechar="`")
print(df)
               name   category
0  Albert Einstein    Physics 
1      Marie Curie   Chemistry
8.5.3 Excelファイル

Excelがないのでパス
p187の
名前をキーとしたディクショナリにすべてのシートを読み込む
dfs = read_excel(...)
にpd.がないような気がする

8.5.4 SQL

パス

8.5.5 MongoDB

installは https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu/
にいってUbuntu 16.04用のパッケージをinstallしsystemctlコマンドでスタートさせた
(MongoDBはUbuntuの長期サポート(LTS)版のみをサポートしている)

1番目
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 0C49F3730359A14518585931BC711F9BA15703C6
2番目
echo "deb [ arch=amd64,arm64 ] http://repo.mongodb.org/apt/ubuntu xenial/mongodb-org/3.4 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.4.list
3番目
sudo apt-get update
4番目
sudo apt-get install -y mongodb-org
5番目
sudo service mongod start
6番目
起動時に自動起動する設定
systemctl enable mongod.service
7番目
状態を確認
systemctl list-unit-files -t service | grep mongo
mongod.service enabled

「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

jupyter notebookから直接MarkDownファイルに変換してここに貼り付ける方式に変更した

8.4DataFrame

import pandas as pd 
PATH = "/home/beetle/myproject/DataVisualization/nobel_winners/dataviz-with-python-and-js/Ch06_Heavyweight_Scraping_with_Scrapy/winner_list.json"
df = pd.read_json(PATH)
df.head(3)
born_in category country date_of_birth date_of_death gender link name place_of_birth place_of_death text year
0 Economics Austria 8 May 1899 23 March 1992 male http://en.wikipedia.org/wiki/Friedrich_Hayek Friedrich Hayek Vienna Freiburg im Breisgau Friedrich Hayek , Economics, 1974 1974
1 Physiology or Medicine Austria 7 November 1903 27 February 1989 male http://en.wikipedia.org/wiki/Konrad_Lorenz Konrad Lorenz Vienna Vienna Konrad Lorenz , Physiology or Medicine, 1973 1973
2 Austria Physiology or Medicine 20 November 1886 12 June 1982 male http://en.wikipedia.org/wiki/Karl_von_Frisch Karl von Frisch * Vienna Munich Karl von Frisch *, Physiology or Medicine, 1973 1973
8.4.1インデックス
print(df.columns)
print(df.index)
df =df.set_index('name')
print(df.loc['Albert Einstein'])
df = df.reset_index()
Index(['born_in', 'category', 'country', 'date_of_birth', 'date_of_death',
       'gender', 'link', 'name', 'place_of_birth', 'place_of_death', 'text',
       'year'],
      dtype='object')
Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057],
           dtype='int64', length=1058)
                born_in category      country  date_of_birth  date_of_death  \
name                                                                          
Albert Einstein          Physics  Switzerland  14 March 1879  18 April 1955   
Albert Einstein          Physics      Germany  14 March 1879  18 April 1955   

                gender                                          link  \
name                                                                   
Albert Einstein   male  http://en.wikipedia.org/wiki/Albert_Einstein   
Albert Einstein   male  http://en.wikipedia.org/wiki/Albert_Einstein   

                place_of_birth place_of_death  \
name                                            
Albert Einstein            Ulm      Princeton   
Albert Einstein            Ulm      Princeton   

                                                              text  year  
name                                                                      
Albert Einstein  Albert Einstein ,  born in Germany , Physics, ...  1921  
Albert Einstein                    Albert Einstein , Physics, 1921  1921  
8.4.2 行と列
print(df.iloc[2])
print(df.ix[2])
df = df.set_index('name')
print(df.ix['Albert Einstein'])
print(df.ix[2])
name                                            Karl von Frisch *
born_in                                                   Austria
category                                   Physiology or Medicine
country                                                          
date_of_birth                                    20 November 1886
date_of_death                                        12 June 1982
gender                                                       male
link                 http://en.wikipedia.org/wiki/Karl_von_Frisch
place_of_birth                                             Vienna
place_of_death                                             Munich
text              Karl von Frisch *, Physiology or Medicine, 1973
year                                                         1973
Name: 2, dtype: object
name                                            Karl von Frisch *
born_in                                                   Austria
category                                   Physiology or Medicine
country                                                          
date_of_birth                                    20 November 1886
date_of_death                                        12 June 1982
gender                                                       male
link                 http://en.wikipedia.org/wiki/Karl_von_Frisch
place_of_birth                                             Vienna
place_of_death                                             Munich
text              Karl von Frisch *, Physiology or Medicine, 1973
year                                                         1973
Name: 2, dtype: object
                born_in category      country  date_of_birth  date_of_death  \
name                                                                          
Albert Einstein          Physics  Switzerland  14 March 1879  18 April 1955   
Albert Einstein          Physics      Germany  14 March 1879  18 April 1955   

                gender                                          link  \
name                                                                   
Albert Einstein   male  http://en.wikipedia.org/wiki/Albert_Einstein   
Albert Einstein   male  http://en.wikipedia.org/wiki/Albert_Einstein   

                place_of_birth place_of_death  \
name                                            
Albert Einstein            Ulm      Princeton   
Albert Einstein            Ulm      Princeton   

                                                              text  year  
name                                                                      
Albert Einstein  Albert Einstein ,  born in Germany , Physics, ...  1921  
Albert Einstein                    Albert Einstein , Physics, 1921  1921  
born_in                                                   Austria
category                                   Physiology or Medicine
country                                                          
date_of_birth                                    20 November 1886
date_of_death                                        12 June 1982
gender                                                       male
link                 http://en.wikipedia.org/wiki/Karl_von_Frisch
place_of_birth                                             Vienna
place_of_death                                             Munich
text              Karl von Frisch *, Physiology or Medicine, 1973
year                                                         1973
Name: Karl von Frisch *, dtype: object


/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix

/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  after removing the cwd from sys.path.
8.4.3グループ
df = df.groupby('category')
print(df.groups.keys())
#物理学受賞者のみピックアップ
phy_group = df.get_group('Physics')
print(phy_group.head(3))
dict_keys(['', 'Chemistry', 'Economics', 'Literature', 'Peace', 'Physics', 'Physiology or Medicine'])
                  born_in category        country     date_of_birth  \
name                                                                  
Brian Schmidt              Physics      Australia  24 February 1967   
Percy W. Bridgman          Physics  United States     21 April 1882   
Isidor Isaac Rabi          Physics  United States      29 July 1898   

                     date_of_death gender  \
name                                        
Brian Schmidt                  NaN   male   
Percy W. Bridgman   20 August 1961   male   
Isidor Isaac Rabi  11 January 1988   male   

                                                             link  \
name                                                                
Brian Schmidt          http://en.wikipedia.org/wiki/Brian_Schmidt   
Percy W. Bridgman  http://en.wikipedia.org/wiki/Percy_W._Bridgman   
Isidor Isaac Rabi  http://en.wikipedia.org/wiki/Isidor_Isaac_Rabi   

                  place_of_birth place_of_death  \
name                                              
Brian Schmidt           Missoula            NaN   
Percy W. Bridgman      Cambridge       Randolph   
Isidor Isaac Rabi        Rymanów  New York City   

                                                                text  year  
name                                                                        
Brian Schmidt      Brian Schmidt ,  born in the United States , P...  2011  
Percy W. Bridgman                  Percy W. Bridgman , Physics, 1946  1946  
Isidor Isaac Rabi  Isidor Isaac Rabi ,  born in Austria , Physics...  1944  

「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

配列関数の作成

移動平均の計算
def moving_average(a, n=3):
    """Return the simple moving average of ``a`` over a window of ``n``.

    The average is computed from a cumulative sum, so each output element
    costs one subtraction and one division regardless of the window size.

    Args:
        a: Array-like of numbers. ``np.cumsum`` is called without an axis,
            so multi-dimensional input is flattened first.
        n: Window length; must be a positive integer.

    Returns:
        A float ``numpy.ndarray`` with ``len(a) - n + 1`` elements, or an
        empty array when ``n`` exceeds the number of elements.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # Without this guard a negative n slices from the wrong end and silently
    # returns meaningless values, and n == 0 fails with an obscure
    # broadcasting error.
    if n < 1:
        raise ValueError("window size n must be >= 1, got %r" % (n,))
    ret = np.cumsum(a, dtype=float)
    # Subtracting the cumulative sum shifted by n turns ret[i] (for i >= n)
    # into the sum of the n most recent elements ending at index i.
    ret[n:] = ret[n:] - ret[:-n]
    # The first n - 1 entries cover fewer than n elements, so drop them.
    return ret[n - 1:] / n
a = np.arange(10)
print(a)
print(moving_average(a,4))
結果

[0 1 2 3 4 5 6 7 8 9]
[ 1.5  2.5  3.5  4.5  5.5  6.5  7.5]

「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

7.1.2配列のインデックス指定とスライス
a = np.array([1,2,3,4,5,6])
print(a[2])
print(a[3:5])
b = a[:4:2] = 0
print(b)
a = np.array([1,2,3,4,5,6])
print(a[::-1])
結果
3
[4 5]  
0  
[6 5 4 3 2 1]   

a[:4:2]の結果だけ違う?  
`b = a[:4:2] = 0` は連鎖代入なので、a[:4:2]とbの両方に0が代入され、print(b)は0になる。  
ちなみに0の代入をやめる(`b = a[:4:2]`とする)と  
[1 3]となる  
図7-2 2次元NumPy配列に対する基本的な数学演算
a = np.array([1,2,3,4,5,6])
a = a.reshape([2,3])
print(a)
print(a * 2)
print(a - 2)
print(a / 2.0)
結果
[[1 2 3]
[4 5 6]]

[[ 2  4  6]
 [ 8 10 12]] #ここが本と異なる

[[-1  0  1]
 [ 2  3  4]]

[[ 0.5  1.   1.5]
 [ 2.   2.5  3. ]]

a = np.array([45,65,76,32,99,22])
print(a < 50)
結果
[ True False False  True False  True]
図7-2 NumPy配列の数学関数
pi = np.pi
a = np.array([pi,pi/2,pi/4,pi/6])
print(np.degrees(a))
結果
[ 180.   90.   45.   30.]

sin_a = np.sin(a)
print(sin_a)
結果
[  1.22464680e-16   1.00000000e+00   7.07106781e-01   5.00000000e-01]

np.round(sin_a,7)
結果
array([ 0.       ,  1.       ,  0.7071068,  0.5      ])

a = np.arange(8).reshape(2,4)
print(a)
print(np.cumsum(a,axis=0))
print(np.cumsum(a,axis=1))
print(np.cumsum(a))
結果
[[0 1 2 3]
 [4 5 6 7]]
axis=0(行方向:行をまたいで縦に累積する)
[[ 0  1  2  3]
 [ 4  6  8 10]]
axis=1(列方向:各行の中で横に累積する)
[[ 0  1  3  6]
 [ 4  9 15 22]]

[ 0  1  3  6 10 15 21 28]