「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

9.4 データのクリーニング

df.born_in.describe()
count     1052
unique      40
top           
freq       910
Name: born_in, dtype: object
9.4.1 混合した型の検出
#applyはseriesのメソッドでseriesの要素にtype関数を適用させている
set(df.born_in.apply(type))
{str}
9.4.2 文字列の置換
bi_col.replace('', np.nan, inplace=True)
bi_col
0                          NaN
1       Bosnia and Herzegovina
2       Bosnia and Herzegovina
3                          NaN
4                          NaN
5                          NaN
6                          NaN
7                          NaN
8                          NaN
9                          NaN
10                         NaN
11                         NaN
12                         NaN
13                         NaN
14                     Belarus
15                     Belarus
16                     Belarus
17                         NaN
18                         NaN
19                         NaN
20                         NaN
21                         NaN
22                         NaN
23                         NaN
24                         NaN
25                         NaN
26                         NaN
27              Czech Republic
28              Czech Republic
29              Czech Republic
                 ...          
1022                       NaN
1023                   Austria
1024                   Austria
1025                       NaN
1026                       NaN
1027                   Austria
1028                       NaN
1029                       NaN
1030                   Austria
1031                   Austria
1032                       NaN
1033                       NaN
1034                   Austria
1035                 Australia
1036                       NaN
1037                       NaN
1038                       NaN
1039                 Australia
1040                       NaN
1041                 Australia
1042                       NaN
1043                       NaN
1044                       NaN
1045                       NaN
1046                 Australia
1047                       NaN
1048                       NaN
1049                       NaN
1050                       NaN
1051                       NaN
Name: born_in, Length: 1052, dtype: object
bi_col.count()
142
df.replace('', np.nan, inplace=True)
df.head()
name born_in category country date_of_birth date_of_death gender link place_of_birth place_of_death text year
0 César Milstein NaN Physiology or Medicine Argentina 8 October 1927 24 March 2002 male http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein Bahía Blanca , Argentina Cambridge , England César Milstein , Physiology or Medicine, 1984 1984
1 Ivo Andric * Bosnia and Herzegovina Literature NaN 9 October 1892 13 March 1975 male http://en.wikipedia.org/wiki/Ivo_Andric Dolac (village near Travnik), Austria-Hungary ... Belgrade, SR Serbia, SFR Yugoslavia (present-d... Ivo Andric *, born in then Austria–Hungary ,... 1961
2 Vladimir Prelog * Bosnia and Herzegovina Chemistry NaN July 23, 1906 1998-01-07 male http://en.wikipedia.org/wiki/Vladimir_Prelog Sarajevo , Bosnia and Herzegovina , then part... Zürich , Switzerland Vladimir Prelog *, born in then Austria–Hung... 1975
3 Institut de Droit International NaN Peace Belgium None None None http://en.wikipedia.org/wiki/Institut_de_Droit... None None Institut de Droit International , Peace, 1904 1904
4 Auguste Beernaert NaN Peace Belgium 26 July 1829 6 October 1912 male http://en.wikipedia.org/wiki/Auguste_Marie_Fra... Ostend , Netherlands (now Belgium ) Lucerne , Switzerland Auguste Beernaert , Peace, 1909 1909
#contains関数は指定した文字列があればTrueなければFalseを返す
dfa = df[df.name.str.contains('\*')]['name']
print(dfa)
1                        Ivo Andric *
2                   Vladimir Prelog *
14                    Simon Kuznets *
15                   Menachem Begin *
16                     Shimon Peres *
27               Bertha von Suttner *
28                       Gerty Cori *
29              Carl Ferdinand Cori *
50                  Henry Kissinger *
51                     Arno Penzias *
53              Georges J.F. Köhler *
58                 Jack Steinberger *
63                  Hans G. Dehmelt *
82                  Renato Dulbecco *
87                Riccardo Giacconi *
88                   Mario Capecchi *
101     Mario José Molina Henríquez *
102                Gabriel Lippmann *
103               Jules A. Hoffmann *
104                  Andrew Schally *
105                  Czesław Miłosz *
106                      Aaron Klug *
109                 Wilhelm Ostwald *
115                    Severo Ochoa *
120                Allan M. Cormack *
126                  Sydney Brenner *
128                  Michael Levitt *
135                 Niels Kaj Jerne *
138                   Michael Smith *
268                     T. S. Eliot *
                    ...              
935            Luis Federico Leloir *
937                   Seán MacBride *
938                 Roger Guillemin *
962             Niels Ryberg Finsen *
972                 Leopold Ružička *
978                  Daniel C. Tsui *
979                    Gao Xingjian *
980                  Charles K. Kao *
987                 William Giauque *
989              Charles B. Huggins *
991                     Saul Bellow *
992                  David H. Hubel *
993                     Henry Taube *
997               Rudolph A. Marcus *
1001                William Vickrey *
1002                  Myron Scholes *
1005               Willard S. Boyle *
1008                  Elias Canetti *
1009                  Peter Medawar *
1010       Zhores Ivanovich Alferov *
1023                     Otto Loewi *
1024                   Richard Kuhn *
1027                Karl von Frisch *
1030                    Walter Kohn *
1031                 Eric R. Kandel *
1034                 Martin Karplus *
1035         William Lawrence Bragg *
1039         Aleksandr M. Prokhorov *
1041          John Warcup Cornforth *
1046         Elizabeth H. Blackburn *
Name: name, Length: 142, dtype: object
df.name = df.name.str.replace('*', '')
df.name = df.name.str.strip()
df[df.name.str.contains('\*')]
name born_in category country date_of_birth date_of_death gender link place_of_birth place_of_death text year
9.4.3 行の削除
np.nan == np.nan
False
df = df[df.born_in.isnull()]
df.count()
name              910
born_in             0
category          909
country           910
date_of_birth     901
date_of_death     589
gender            900
link              910
place_of_birth    875
place_of_death    546
text              910
year              910
dtype: int64
df = df.drop('born_in', axis=1)
df.head()
name category country date_of_birth date_of_death gender link place_of_birth place_of_death text year
0 César Milstein Physiology or Medicine Argentina 8 October 1927 24 March 2002 male http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein Bahía Blanca , Argentina Cambridge , England César Milstein , Physiology or Medicine, 1984 1984
3 Institut de Droit International Peace Belgium None None None http://en.wikipedia.org/wiki/Institut_de_Droit... None None Institut de Droit International , Peace, 1904 1904
4 Auguste Beernaert Peace Belgium 26 July 1829 6 October 1912 male http://en.wikipedia.org/wiki/Auguste_Marie_Fra... Ostend , Netherlands (now Belgium ) Lucerne , Switzerland Auguste Beernaert , Peace, 1909 1909
5 Maurice Maeterlinck Literature Belgium 29 August 1862 6 May 1949 male http://en.wikipedia.org/wiki/Maurice_Maeterlinck Ghent , Belgium Nice , France Maurice Maeterlinck , Literature, 1911 1911
6 Henri La Fontaine Peace Belgium 22 April 1854 14 May 1943 male http://en.wikipedia.org/wiki/Henri_La_Fontaine Brussels Belgium Henri La Fontaine , Peace, 1913 1913
9.4.4 重複の検出
#duplicated関数は名前列で重複する行(一致する行)をみつけるとTrueを返す。そうでなければFalse
dupes_by_name = df[df.duplicated('name')]
dupes_by_name.count()
name              46
category          46
country           46
date_of_birth     45
date_of_death     24
gender            44
link              46
place_of_birth    45
place_of_death    23
text              46
year              46
dtype: int64

duplicated関数の挙動 通常のduplicated関数とkeepに'lastオプションをつけたものを|で結合すると
全く重複していないデータ以外全てTrueになる。
(ここではindex3の'&&&'が唯一重複を持たないデータ)

dfa = pd.DataFrame({'name':['###','***','$$$','%%%','&&&','###','%%%','$$$','###','$$$','***','%%%']})
print(dfa)
first_dupes = dfa.duplicated('name')
print(first_dupes)
last_dupes = dfa.duplicated('name',keep='last')
print(last_dupes)
dfa[dfa.duplicated('name') | dfa.duplicated('name',keep='last')]
   name
0   ###
1   ***
2   $$$
3   %%%
4   &&&
5   ###
6   %%%
7   $$$
8   ###
9   $$$
10  ***
11  %%%
0     False
1     False
2     False
3     False
4     False
5      True
6      True
7      True
8      True
9      True
10     True
11     True
dtype: bool
0      True
1      True
2      True
3      True
4     False
5      True
6      True
7      True
8     False
9     False
10    False
11    False
dtype: bool
name
0 ###
1 ***
2 $$$
3 %%%
5 ###
6 %%%
7 $$$
8 ###
9 $$$
10 ***
11 %%%
all_dupes = df[df.duplicated('name')\
| df.duplicated('name', keep='last')]
all_dupes.count()
name              92
category          92
country           92
date_of_birth     90
date_of_death     48
gender            88
link              92
place_of_birth    90
place_of_death    46
text              92
year              92
dtype: int64
all_dupes = df[df.name.isin(dupes_by_name.name)]
all_dupes.count()
name              92
category          92
country           92
date_of_birth     90
date_of_death     48
gender            88
link              92
place_of_birth    90
place_of_death    46
text              92
year              92
dtype: int64
pd.concat([g for _,g in df.groupby('name')\
if len(g) > 1])['name']
121                   Aaron Klug
131                   Aaron Klug
615              Albert Einstein
844              Albert Einstein
176                Arieh Warshel
798                Arieh Warshel
94                 Avram Hershko
830                Avram Hershko
228             Baruj Benacerraf
366             Baruj Benacerraf
573               Betty Williams
805               Betty Williams
162             Brian P. Schmidt
1047            Brian P. Schmidt
498               Charles K. Kao
831               Charles K. Kao
295               Chen Ning Yang
976               Chen Ning Yang
0                 César Milstein
134               César Milstein
623                 Daniel Bovet
790                 Daniel Bovet
93               Daniel Kahneman
457              Daniel Kahneman
407            Edmond H. Fischer
630            Edmond H. Fischer
505              Ei-ichi Negishi
778              Ei-ichi Negishi
524            Ernest Rutherford
985            Ernest Rutherford
                  ...           
632     Médecins Sans Frontières
947     Médecins Sans Frontières
490              Osamu Shimomura
776              Osamu Shimomura
72                Philipp Lenard
1013              Philipp Lenard
650                Ragnar Granit
960                Ragnar Granit
510            Ralph M. Steinman
1006           Ralph M. Steinman
85          Rita Levi-Montalcini
376         Rita Levi-Montalcini
96                 Robert Aumann
476                Robert Aumann
137                 Ronald Coase
405                 Ronald Coase
515               Shuji Nakamura
780               Shuji Nakamura
396                Sidney Altman
995                Sidney Altman
451               Sydney Brenner
586               Sydney Brenner
172             Thomas C. Südhof
905             Thomas C. Südhof
294                Tsung-Dao Lee
975                Tsung-Dao Lee
333             Wassily Leontief
684             Wassily Leontief
489               Yoichiro Nambu
773               Yoichiro Nambu
Name: name, Length: 92, dtype: object
9.4.5 データのソート
df2 = pd.DataFrame(\
{'name':['zak', 'alice', 'bob', 'mike', 'bob', 'bob'],\
'score':[4, 3, 5, 2, 3, 7]})
df2.sort_values(['name', 'score'],\
ascending=[1,0])
name score
1 alice 3
5 bob 7
2 bob 5
4 bob 3
3 mike 2
0 zak 4
all_dupes.sort_values('name')[['name', 'country', 'year']]
name country year
121 Aaron Klug South Africa 1982
131 Aaron Klug United Kingdom 1982
844 Albert Einstein Germany 1921
615 Albert Einstein Switzerland 1921
176 Arieh Warshel United States 2013
798 Arieh Warshel Israel 2013
830 Avram Hershko Hungary 2004
94 Avram Hershko Israel 2004
366 Baruj Benacerraf United States 1980
228 Baruj Benacerraf Venezuela 1980
805 Betty Williams Ireland 1976
573 Betty Williams United Kingdom 1976
162 Brian P. Schmidt United States 2011
1047 Brian P. Schmidt Australia 2011
498 Charles K. Kao United States 2009
831 Charles K. Kao Hong Kong 2009
976 Chen Ning Yang China 1957
295 Chen Ning Yang United States 1957
0 César Milstein Argentina 1984
134 César Milstein United Kingdom 1984
623 Daniel Bovet Switzerland 1957
790 Daniel Bovet Italy 1957
93 Daniel Kahneman Israel 2002
457 Daniel Kahneman United States 2002
630 Edmond H. Fischer Switzerland 1992
407 Edmond H. Fischer United States 1992
778 Ei-ichi Negishi Japan 2010
505 Ei-ichi Negishi United States 2010
985 Ernest Rutherford Canada 1908
524 Ernest Rutherford United Kingdom 1908
... ... ... ...
947 Médecins Sans Frontières France 1999
632 Médecins Sans Frontières Switzerland 1999
776 Osamu Shimomura Japan 2008
490 Osamu Shimomura United States 2008
1013 Philipp Lenard Austria 1905
72 Philipp Lenard Germany 1905
650 Ragnar Granit Sweden 1967
960 Ragnar Granit Finland 1809
1006 Ralph M. Steinman Canada 2011
510 Ralph M. Steinman United States 2011
85 Rita Levi-Montalcini Italy 1986
376 Rita Levi-Montalcini United States 1986
96 Robert Aumann Israel 2005
476 Robert Aumann United States 2005
405 Ronald Coase United States 1991
137 Ronald Coase United Kingdom 1991
515 Shuji Nakamura United States 2014
780 Shuji Nakamura Japan 2014
995 Sidney Altman Canada 1989
396 Sidney Altman United States 1990
451 Sydney Brenner United States 2002
586 Sydney Brenner United Kingdom 2002
905 Thomas C. Südhof Germany 2013
172 Thomas C. Südhof United States 2013
975 Tsung-Dao Lee China 1957
294 Tsung-Dao Lee United States 1957
333 Wassily Leontief United States 1973
684 Wassily Leontief Russia 1973
773 Yoichiro Nambu Japan 2008
489 Yoichiro Nambu United States 2008

92 rows × 3 columns

9.4.6重複の削除
df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
(df.year == 1911), 'country'] = 'France'
df.drop(df[(df.name == 'Sidney Altman') &\
(df.year == 1990)].index,
inplace=True)
def clean_data(df):
    df = df.replace('', np.nan)
    df = df[df.born_in.isnull()]
    df = df.drop('born_in', axis=1)
    df.drop(df[df.year == 1809].index, inplace=True)
    df = df[~(df.name == 'Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
    (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name == 'Sidney Altman') &\
    (df.year == 1990))]
    return df
# Apply our clean_data function to the reloaded dirty data
df = reload_data()
df = clean_data(df)
df = df.reindex(np.random.permutation(df.index))
df = df.drop_duplicates(['name', 'year'])
df = df.sort_index()
df.count()
category          864
country           865
date_of_birth     857
date_of_death     566
gender            857
link              865
name              865
place_of_birth    831
place_of_death    524
text              865
year              865
dtype: int64
df[df.duplicated('name') |
df.duplicated('name', keep='last')]\
.sort_values(by='name')\
[['name', 'country', 'year', 'category']]
name country year category
548 Frederick Sanger United Kingdom 1958 Chemistry
580 Frederick Sanger United Kingdom 1980 Chemistry
292 John Bardeen United States 1956 Physics
326 John Bardeen United States 1972 Physics
285 Linus C. Pauling United States 1954 Chemistry
309 Linus C. Pauling United States 1962 Peace
706 Marie Skłodowska-Curie Poland 1903 Physics
709 Marie Skłodowska-Curie France 1911 Chemistry
9.4.7 欠損フィールドの処理
df.count()
category          864
country           865
date_of_birth     857
date_of_death     566
gender            857
link              865
name              865
place_of_birth    831
place_of_death    524
text              865
year              865
dtype: int64
df[df.category.isnull()][['name', 'text']]
name text
922 Alexis Carrel Alexis Carrel , Medicine, 1912
df.ix[df.name == 'Alexis Carrel', 'category'] =\
'Physiology or Medicine'
/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.
df[df.gender.isnull()]['name']
3                         Institut de Droit International
156                               Friends Service Council
267     American Friends Service Committee  (The Quakers)
574                                 Amnesty International
650                                         Ragnar Granit
947                              Médecins Sans Frontières
1000     Pugwash Conferences on Science and World Affairs
1033                   International Atomic Energy Agency
Name: name, dtype: object
df = df[df.gender.notnull()]
df.ix[df.name == 'Ragnar Granit', 'gender'] = 'male'
/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
df[df.date_of_birth.isnull()]['name']
782    Hiroshi Amano
Name: name, dtype: object
df.ix[df.name == 'Hiroshi Amano', 'date_of_birth'] =\
'11 September 1960'
# Note that the example in the book uses the original DataFrame, not our newly cleaned one, used here
# Row 2 (Vladamir Prelog) is therefore missing
df[['name', 'date_of_birth']]
name date_of_birth
0 César Milstein 8 October 1927
4 Auguste Beernaert 26 July 1829
5 Maurice Maeterlinck 29 August 1862
6 Henri La Fontaine 22 April 1854
7 Jules Bordet 13 June 1870
8 Corneille Heymans 28 March 1892
9 Georges Pire 1910-02-10
10 Albert Claude 24 August 1899
11 Christian de Duve 2 October 1917
12 Ilya Prigogine 25 January 1917
13 François Englert 6 November 1932
17 Karl Adolph Gjellerup June 2, 1857
18 August Krogh November 15, 1874
19 Niels Bohr 7 October 1885
20 Johannes Andreas Grib Fibiger 23 April 1867
21 Henrik Dam 21 February 1895
22 Johannes Vilhelm Jensen 1873-01-20
23 Ben Roy Mottelson July 9, 1926
24 Aage Bohr 19 June 1922
25 Niels Kaj Jerne December 23, 1911
26 Jens Christian Skou October 8, 1918
30 Jaroslav Heyrovský December 20, 1890
31 Jaroslav Seifert 23 September 1901
32 Christopher A. Pissarides 1948-02-20
33 Irène Joliot-Curie 12 September 1897
34 Frédéric Joliot 19 March 1900
35 Roger Martin du Gard 23 March 1881
36 André Gide 1869-11-22
37 Léon Jouhaux July 1, 1879
38 Albert Schweitzer 14 January 1875
... ... ...
1011 Muhammad Yunus 28 June 1940
1012 Lev Landau January 22, 1908
1013 Philipp Lenard June 7, 1862
1014 Bertha von Suttner June 9, 1843
1015 Alfred Hermann Fried 11 November 1864
1016 Robert Bárány 22 April 1876
1017 Friderik Pregl 3 September 1869
1018 Richard Adolf Zsigmondy 1 April 1865
1019 Julius Wagner-Jauregg 7 March 1857
1020 Karl Landsteiner June 14, 1868
1021 Erwin Schrödinger 12 August 1887
1022 Victor Francis Hess 24 June 1883
1025 Wolfgang Pauli 25 April 1900
1026 Max F. Perutz 19 May 1914
1028 Konrad Lorenz November 7, 1903
1029 Friedrich Hayek 8 May 1899
1032 Elfriede Jelinek 20 October 1946
1036 Sir Howard Florey 24 September 1898
1037 Sir Frank Macfarlane Burnet 3 September 1899
1038 John Carew Eccles 27 January 1903
1040 Patrick White 28 May 1912
1042 John Harsanyi May 29, 1920
1043 Peter C. Doherty & Professor Rolf Zinkernagel 15 October 1940
1044 J. Robin Warren 11 June 1937
1045 Barry Marshall 30 September 1951
1047 Brian P. Schmidt February 24, 1967
1048 Carlos Saavedra Lamas November 1, 1878
1049 Bernardo Houssay 1887-04-10
1050 Luis Federico Leloir 1906-9-6
1051 Adolfo Pérez Esquivel November 26, 1931

857 rows × 2 columns

9.4.8 時刻と日付の処理
pd.to_datetime(df.date_of_birth, errors='raise')
0      1927-10-08
4      1829-07-26
5      1862-08-29
6      1854-04-22
7      1870-06-13
8      1892-03-28
9      1910-02-10
10     1899-08-24
11     1917-10-02
12     1917-01-25
13     1932-11-06
17     1857-06-02
18     1874-11-15
19     1885-10-07
20     1867-04-23
21     1895-02-21
22     1873-01-20
23     1926-07-09
24     1922-06-19
25     1911-12-23
26     1918-10-08
30     1890-12-20
31     1901-09-23
32     1948-02-20
33     1897-09-12
34     1900-03-19
35     1881-03-23
36     1869-11-22
37     1879-07-01
38     1875-01-14
          ...    
1011   1940-06-28
1012   1908-01-22
1013   1862-06-07
1014   1843-06-09
1015   1864-11-11
1016   1876-04-22
1017   1869-09-03
1018   1865-04-01
1019   1857-03-07
1020   1868-06-14
1021   1887-08-12
1022   1883-06-24
1025   1900-04-25
1026   1914-05-19
1028   1903-11-07
1029   1899-05-08
1032   1946-10-20
1036   1898-09-24
1037   1899-09-03
1038   1903-01-27
1040   1912-05-28
1042   1920-05-29
1043   1940-10-15
1044   1937-06-11
1045   1951-09-30
1047   1967-02-24
1048   1878-11-01
1049   1887-04-10
1050   1906-09-06
1051   1931-11-26
Name: date_of_birth, Length: 857, dtype: datetime64[ns]
#date_of_deathでは例外でた
pd.to_datetime(df.date_of_death, errors='raise')
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
    443             try:
--> 444                 values, tz = tslib.datetime_to_datetime64(arg)
    445                 return DatetimeIndex._simple_new(values, name=name, tz=tz)


pandas/_libs/tslib.pyx in pandas._libs.tslib.datetime_to_datetime64 (pandas/_libs/tslib.c:33275)()


TypeError: Unrecognized value type: <class 'str'>


During handling of the above exception, another exception occurred:


ValueError                                Traceback (most recent call last)

<ipython-input-56-2a87872c34e6> in <module>()
----> 1 pd.to_datetime(df.date_of_death, errors='raise')


/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin)
    507     elif isinstance(arg, ABCSeries):
    508         from pandas import Series
--> 509         values = _convert_listlike(arg._values, False, format)
    510         result = Series(values, index=arg.index, name=arg.name)
    511     elif isinstance(arg, (ABCDataFrame, MutableMapping)):


/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
    445                 return DatetimeIndex._simple_new(values, name=name, tz=tz)
    446             except (ValueError, TypeError):
--> 447                 raise e
    448 
    449     if arg is None:


/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
    433                     dayfirst=dayfirst,
    434                     yearfirst=yearfirst,
--> 435                     require_iso8601=require_iso8601
    436                 )
    437 


pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46617)()


pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46233)()


pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46122)()


pandas/_libs/tslib.pyx in pandas._libs.tslib.parse_datetime_string (pandas/_libs/tslib.c:35351)()


/home/beetle/anaconda3/lib/python3.6/site-packages/dateutil/parser.py in parse(timestr, parserinfo, **kwargs)
   1166         return parser(parserinfo).parse(timestr, **kwargs)
   1167     else:
-> 1168         return DEFAULTPARSER.parse(timestr, **kwargs)
   1169 
   1170 


/home/beetle/anaconda3/lib/python3.6/site-packages/dateutil/parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
    579                 repl['day'] = monthrange(cyear, cmonth)[1]
    580 
--> 581         ret = default.replace(**repl)
    582 
    583         if res.weekday is not None and not res.day:


ValueError: month must be in 1..12
for i,row in df.iterrows():
    try:
        pd.to_datetime(row.date_of_death,errors='raise')
    except:
        print('%s(%s, %d)'%(row.date_of_death.ljust(30),row['name'],i))
1968-23-07                    (Henry Hallett Dale, 150)
May 30, 2011 (aged 89)        (Rosalyn Yalow, 349)
living                        (David Trimble, 581)
Diederik Korteweg             (Johannes Diderik van der Waals, 746)
living                        (Shirin Ebadi, 809)
living                        (Rigoberta Menchú, 833)
1 February 1976, age 74       (Werner Karl Heisenberg, 858)
with_death_dates = df[df.date_of_death.notnull()]
bad_dates = pd.isnull(pd.to_datetime(\
with_death_dates.date_of_death, errors='coerce'))
with_death_dates[bad_dates][['category', 'date_of_death',\
'name']]
category date_of_death name
150 Physiology or Medicine 1968-23-07 Henry Hallett Dale
349 Physiology or Medicine May 30, 2011 (aged 89) Rosalyn Yalow
581 Peace living David Trimble
746 Physics Diederik Korteweg Johannes Diderik van der Waals
809 Peace living Shirin Ebadi
833 Peace living Rigoberta Menchú
858 Physics 1 February 1976, age 74 Werner Karl Heisenberg
df.date_of_death = pd.to_datetime(df.date_of_death,\
errors='coerce')
df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth)\
.year
df.sort_values('award_age').iloc[:10]\
[['name', 'award_age', 'category', 'year']]
name award_age category year
725 Malala Yousafzai 17 Peace 2014
525 William Lawrence Bragg 25 Physics 1915
626 Georges J. F. Köhler 30 Physiology or Medicine 1976
858 Werner Karl Heisenberg 31 Physics 1932
975 Tsung-Dao Lee 31 Physics 1957
146 Paul Dirac 31 Physics 1933
247 Carl Anderson 31 Physics 1936
877 Rudolf Mössbauer 32 Physics 1961
226 Tawakkol Karman 32 Peace 2011
804 Mairéad Corrigan 32 Peace 1976

9.5 完成したclean_data関数

def clean_data(df):
    """The full clean data function, which returns both the cleaned Nobel data (df) and a DataFrame 
    containing those winners with a born_in field."""
    df = df.replace('', np.nan)
    df_born_in = df[df.born_in.notnull()] 
    df = df[df.born_in.isnull()]
    df = df.drop('born_in', axis=1) 
    df.drop(df[df.year == 1809].index, inplace=True) 
    df = df[~(df.name == 'Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
           (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name == 'Sidney Altman') & (df.year == 1990))]
    df = df.reindex(np.random.permutation(df.index)) 
    df = df.drop_duplicates(['name', 'year'])         
    df = df.sort_index()
    df.ix[df.name == 'Alexis Carrel', 'category'] =\
        'Physiology or Medicine' 
    df.ix[df.name == 'Ragnar Granit', 'gender'] = 'male'
    df = df[df.gender.notnull()] # remove institutional prizes
    df.ix[df.name == 'Hiroshi Amano', 'date_of_birth'] =\
    '11 September 1960'
    df.date_of_birth = pd.to_datetime(df.date_of_birth) 
    df.date_of_death = pd.to_datetime(df.date_of_death, errors='coerce') 
    df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth).year 
    return df, df_born_in

9.6 クリーニングしたデータ・セットの保存

省略