「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

9.4 データのクリーニング

df.born_in.describe()

count     1052
unique      40
top           
freq       910
Name: born_in, dtype: object

9.4.1 混合した型の検出

#applyはseriesのメソッドでseriesの要素にtype関数を適用させている
set(df.born_in.apply(type))

{str}

9.4.2 文字列の置換

bi_col.replace('', np.nan, inplace=True)
bi_col

0                          NaN
1       Bosnia and Herzegovina
2       Bosnia and Herzegovina
3                          NaN
4                          NaN
5                          NaN
6                          NaN
7                          NaN
8                          NaN
9                          NaN
10                         NaN
11                         NaN
12                         NaN
13                         NaN
14                     Belarus
15                     Belarus
16                     Belarus
17                         NaN
18                         NaN
19                         NaN
20                         NaN
21                         NaN
22                         NaN
23                         NaN
24                         NaN
25                         NaN
26                         NaN
27              Czech Republic
28              Czech Republic
29              Czech Republic
                 ...          
1022                       NaN
1023                   Austria
1024                   Austria
1025                       NaN
1026                       NaN
1027                   Austria
1028                       NaN
1029                       NaN
1030                   Austria
1031                   Austria
1032                       NaN
1033                       NaN
1034                   Austria
1035                 Australia
1036                       NaN
1037                       NaN
1038                       NaN
1039                 Australia
1040                       NaN
1041                 Australia
1042                       NaN
1043                       NaN
1044                       NaN
1045                       NaN
1046                 Australia
1047                       NaN
1048                       NaN
1049                       NaN
1050                       NaN
1051                       NaN
Name: born_in, Length: 1052, dtype: object

bi_col.count()

df.replace('', np.nan, inplace=True)

df.head()

	name	born_in	category	country	date_of_birth	date_of_death	gender	link	place_of_birth	place_of_death	text	year
0	César Milstein	NaN	Physiology or Medicine	Argentina	8 October 1927	24 March 2002	male	http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein	Bahía Blanca , Argentina	Cambridge , England	César Milstein , Physiology or Medicine, 1984	1984
1	Ivo Andric *	Bosnia and Herzegovina	Literature	NaN	9 October 1892	13 March 1975	male	http://en.wikipedia.org/wiki/Ivo_Andric	Dolac (village near Travnik), Austria-Hungary ...	Belgrade, SR Serbia, SFR Yugoslavia (present-d...	Ivo Andric *, born in then Austria–Hungary ,...	1961
2	Vladimir Prelog *	Bosnia and Herzegovina	Chemistry	NaN	July 23, 1906	1998-01-07	male	http://en.wikipedia.org/wiki/Vladimir_Prelog	Sarajevo , Bosnia and Herzegovina , then part...	Zürich , Switzerland	Vladimir Prelog *, born in then Austria–Hung...	1975
3	Institut de Droit International	NaN	Peace	Belgium	None	None	None	http://en.wikipedia.org/wiki/Institut_de_Droit...	None	None	Institut de Droit International , Peace, 1904	1904
4	Auguste Beernaert	NaN	Peace	Belgium	26 July 1829	6 October 1912	male	http://en.wikipedia.org/wiki/Auguste_Marie_Fra...	Ostend , Netherlands (now Belgium )	Lucerne , Switzerland	Auguste Beernaert , Peace, 1909	1909

#contains関数は指定した文字列があればTrueなければFalseを返す
dfa = df[df.name.str.contains('\*')]['name']
print(dfa)

1                        Ivo Andric *
2                   Vladimir Prelog *
14                    Simon Kuznets *
15                   Menachem Begin *
16                     Shimon Peres *
27               Bertha von Suttner *
28                       Gerty Cori *
29              Carl Ferdinand Cori *
50                  Henry Kissinger *
51                     Arno Penzias *
53              Georges J.F. Köhler *
58                 Jack Steinberger *
63                  Hans G. Dehmelt *
82                  Renato Dulbecco *
87                Riccardo Giacconi *
88                   Mario Capecchi *
101     Mario José Molina Henríquez *
102                Gabriel Lippmann *
103               Jules A. Hoffmann *
104                  Andrew Schally *
105                  Czesław Miłosz *
106                      Aaron Klug *
109                 Wilhelm Ostwald *
115                    Severo Ochoa *
120                Allan M. Cormack *
126                  Sydney Brenner *
128                  Michael Levitt *
135                 Niels Kaj Jerne *
138                   Michael Smith *
268                     T. S. Eliot *
                    ...              
935            Luis Federico Leloir *
937                   Seán MacBride *
938                 Roger Guillemin *
962             Niels Ryberg Finsen *
972                 Leopold Ružička *
978                  Daniel C. Tsui *
979                    Gao Xingjian *
980                  Charles K. Kao *
987                 William Giauque *
989              Charles B. Huggins *
991                     Saul Bellow *
992                  David H. Hubel *
993                     Henry Taube *
997               Rudolph A. Marcus *
1001                William Vickrey *
1002                  Myron Scholes *
1005               Willard S. Boyle *
1008                  Elias Canetti *
1009                  Peter Medawar *
1010       Zhores Ivanovich Alferov *
1023                     Otto Loewi *
1024                   Richard Kuhn *
1027                Karl von Frisch *
1030                    Walter Kohn *
1031                 Eric R. Kandel *
1034                 Martin Karplus *
1035         William Lawrence Bragg *
1039         Aleksandr M. Prokhorov *
1041          John Warcup Cornforth *
1046         Elizabeth H. Blackburn *
Name: name, Length: 142, dtype: object

df.name = df.name.str.replace('*', '')
df.name = df.name.str.strip()

df[df.name.str.contains('\*')]

	name	born_in	category	country	date_of_birth	date_of_death	gender	link	place_of_birth	place_of_death	text	year

9.4.3 行の削除

np.nan == np.nan

False

df = df[df.born_in.isnull()]
df.count()

name              910
born_in             0
category          909
country           910
date_of_birth     901
date_of_death     589
gender            900
link              910
place_of_birth    875
place_of_death    546
text              910
year              910
dtype: int64

df = df.drop('born_in', axis=1)

df.head()

	name	category	country	date_of_birth	date_of_death	gender	link	place_of_birth	place_of_death	text	year
0	César Milstein	Physiology or Medicine	Argentina	8 October 1927	24 March 2002	male	http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein	Bahía Blanca , Argentina	Cambridge , England	César Milstein , Physiology or Medicine, 1984	1984
3	Institut de Droit International	Peace	Belgium	None	None	None	http://en.wikipedia.org/wiki/Institut_de_Droit...	None	None	Institut de Droit International , Peace, 1904	1904
4	Auguste Beernaert	Peace	Belgium	26 July 1829	6 October 1912	male	http://en.wikipedia.org/wiki/Auguste_Marie_Fra...	Ostend , Netherlands (now Belgium )	Lucerne , Switzerland	Auguste Beernaert , Peace, 1909	1909
5	Maurice Maeterlinck	Literature	Belgium	29 August 1862	6 May 1949	male	http://en.wikipedia.org/wiki/Maurice_Maeterlinck	Ghent , Belgium	Nice , France	Maurice Maeterlinck , Literature, 1911	1911
6	Henri La Fontaine	Peace	Belgium	22 April 1854	14 May 1943	male	http://en.wikipedia.org/wiki/Henri_La_Fontaine	Brussels	Belgium	Henri La Fontaine , Peace, 1913	1913

9.4.4 重複の検出

#duplicated関数は名前列で重複する行（一致する行）をみつけるとTrueを返す。そうでなければFalse
dupes_by_name = df[df.duplicated('name')]
dupes_by_name.count()

name              46
category          46
country           46
date_of_birth     45
date_of_death     24
gender            44
link              46
place_of_birth    45
place_of_death    23
text              46
year              46
dtype: int64

duplicated関数の挙動通常のduplicated関数とkeepに'lastオプションをつけたものを|で結合すると
全く重複していないデータ以外全てTrueになる。
（ここではindex3の'&&&'が唯一重複を持たないデータ）

dfa = pd.DataFrame({'name':['###','***','$$$','%%%','&&&','###','%%%','$$$','###','$$$','***','%%%']})
print(dfa)
first_dupes = dfa.duplicated('name')
print(first_dupes)
last_dupes = dfa.duplicated('name',keep='last')
print(last_dupes)
dfa[dfa.duplicated('name') | dfa.duplicated('name',keep='last')]

   name
0   ###
1   ***
2   $$$
3   %%%
4   &&&
5   ###
6   %%%
7   $$$
8   ###
9   $$$
10  ***
11  %%%
0     False
1     False
2     False
3     False
4     False
5      True
6      True
7      True
8      True
9      True
10     True
11     True
dtype: bool
0      True
1      True
2      True
3      True
4     False
5      True
6      True
7      True
8     False
9     False
10    False
11    False
dtype: bool

	name
0	###
1	***
2	$$$
3	%%%
5	###
6	%%%
7	$$$
8	###
9	$$$
10	***
11	%%%

all_dupes = df[df.duplicated('name')\
| df.duplicated('name', keep='last')]
all_dupes.count()

name              92
category          92
country           92
date_of_birth     90
date_of_death     48
gender            88
link              92
place_of_birth    90
place_of_death    46
text              92
year              92
dtype: int64

all_dupes = df[df.name.isin(dupes_by_name.name)]
all_dupes.count()

name              92
category          92
country           92
date_of_birth     90
date_of_death     48
gender            88
link              92
place_of_birth    90
place_of_death    46
text              92
year              92
dtype: int64

pd.concat([g for _,g in df.groupby('name')\
if len(g) > 1])['name']

121                   Aaron Klug
131                   Aaron Klug
615              Albert Einstein
844              Albert Einstein
176                Arieh Warshel
798                Arieh Warshel
94                 Avram Hershko
830                Avram Hershko
228             Baruj Benacerraf
366             Baruj Benacerraf
573               Betty Williams
805               Betty Williams
162             Brian P. Schmidt
1047            Brian P. Schmidt
498               Charles K. Kao
831               Charles K. Kao
295               Chen Ning Yang
976               Chen Ning Yang
0                 César Milstein
134               César Milstein
623                 Daniel Bovet
790                 Daniel Bovet
93               Daniel Kahneman
457              Daniel Kahneman
407            Edmond H. Fischer
630            Edmond H. Fischer
505              Ei-ichi Negishi
778              Ei-ichi Negishi
524            Ernest Rutherford
985            Ernest Rutherford
                  ...           
632     Médecins Sans Frontières
947     Médecins Sans Frontières
490              Osamu Shimomura
776              Osamu Shimomura
72                Philipp Lenard
1013              Philipp Lenard
650                Ragnar Granit
960                Ragnar Granit
510            Ralph M. Steinman
1006           Ralph M. Steinman
85          Rita Levi-Montalcini
376         Rita Levi-Montalcini
96                 Robert Aumann
476                Robert Aumann
137                 Ronald Coase
405                 Ronald Coase
515               Shuji Nakamura
780               Shuji Nakamura
396                Sidney Altman
995                Sidney Altman
451               Sydney Brenner
586               Sydney Brenner
172             Thomas C. Südhof
905             Thomas C. Südhof
294                Tsung-Dao Lee
975                Tsung-Dao Lee
333             Wassily Leontief
684             Wassily Leontief
489               Yoichiro Nambu
773               Yoichiro Nambu
Name: name, Length: 92, dtype: object

9.4.5 データのソート

df2 = pd.DataFrame(\
{'name':['zak', 'alice', 'bob', 'mike', 'bob', 'bob'],\
'score':[4, 3, 5, 2, 3, 7]})
df2.sort_values(['name', 'score'],\
ascending=[1,0])

	name	score
1	alice	3
5	bob	7
2	bob	5
4	bob	3
3	mike	2
0	zak	4

all_dupes.sort_values('name')[['name', 'country', 'year']]

	name	country	year
121	Aaron Klug	South Africa	1982
131	Aaron Klug	United Kingdom	1982
844	Albert Einstein	Germany	1921
615	Albert Einstein	Switzerland	1921
176	Arieh Warshel	United States	2013
798	Arieh Warshel	Israel	2013
830	Avram Hershko	Hungary	2004
94	Avram Hershko	Israel	2004
366	Baruj Benacerraf	United States	1980
228	Baruj Benacerraf	Venezuela	1980
805	Betty Williams	Ireland	1976
573	Betty Williams	United Kingdom	1976
162	Brian P. Schmidt	United States	2011
1047	Brian P. Schmidt	Australia	2011
498	Charles K. Kao	United States	2009
831	Charles K. Kao	Hong Kong	2009
976	Chen Ning Yang	China	1957
295	Chen Ning Yang	United States	1957
0	César Milstein	Argentina	1984
134	César Milstein	United Kingdom	1984
623	Daniel Bovet	Switzerland	1957
790	Daniel Bovet	Italy	1957
93	Daniel Kahneman	Israel	2002
457	Daniel Kahneman	United States	2002
630	Edmond H. Fischer	Switzerland	1992
407	Edmond H. Fischer	United States	1992
778	Ei-ichi Negishi	Japan	2010
505	Ei-ichi Negishi	United States	2010
985	Ernest Rutherford	Canada	1908
524	Ernest Rutherford	United Kingdom	1908
...	...	...	...
947	Médecins Sans Frontières	France	1999
632	Médecins Sans Frontières	Switzerland	1999
776	Osamu Shimomura	Japan	2008
490	Osamu Shimomura	United States	2008
1013	Philipp Lenard	Austria	1905
72	Philipp Lenard	Germany	1905
650	Ragnar Granit	Sweden	1967
960	Ragnar Granit	Finland	1809
1006	Ralph M. Steinman	Canada	2011
510	Ralph M. Steinman	United States	2011
85	Rita Levi-Montalcini	Italy	1986
376	Rita Levi-Montalcini	United States	1986
96	Robert Aumann	Israel	2005
476	Robert Aumann	United States	2005
405	Ronald Coase	United States	1991
137	Ronald Coase	United Kingdom	1991
515	Shuji Nakamura	United States	2014
780	Shuji Nakamura	Japan	2014
995	Sidney Altman	Canada	1989
396	Sidney Altman	United States	1990
451	Sydney Brenner	United States	2002
586	Sydney Brenner	United Kingdom	2002
905	Thomas C. Südhof	Germany	2013
172	Thomas C. Südhof	United States	2013
975	Tsung-Dao Lee	China	1957
294	Tsung-Dao Lee	United States	1957
333	Wassily Leontief	United States	1973
684	Wassily Leontief	Russia	1973
773	Yoichiro Nambu	Japan	2008
489	Yoichiro Nambu	United States	2008

92 rows × 3 columns

9.4.6重複の削除

df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
(df.year == 1911), 'country'] = 'France'

df.drop(df[(df.name == 'Sidney Altman') &\
(df.year == 1990)].index,
inplace=True)

def clean_data(df):
    df = df.replace('', np.nan)
    df = df[df.born_in.isnull()]
    df = df.drop('born_in', axis=1)
    df.drop(df[df.year == 1809].index, inplace=True)
    df = df[~(df.name == 'Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
    (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name == 'Sidney Altman') &\
    (df.year == 1990))]
    return df

# Apply our clean_data function to the reloaded dirty data
df = reload_data()
df = clean_data(df)

df = df.reindex(np.random.permutation(df.index))
df = df.drop_duplicates(['name', 'year'])
df = df.sort_index()
df.count()

category          864
country           865
date_of_birth     857
date_of_death     566
gender            857
link              865
name              865
place_of_birth    831
place_of_death    524
text              865
year              865
dtype: int64

df[df.duplicated('name') |
df.duplicated('name', keep='last')]\
.sort_values(by='name')\
[['name', 'country', 'year', 'category']]

	name	country	year	category
548	Frederick Sanger	United Kingdom	1958	Chemistry
580	Frederick Sanger	United Kingdom	1980	Chemistry
292	John Bardeen	United States	1956	Physics
326	John Bardeen	United States	1972	Physics
285	Linus C. Pauling	United States	1954	Chemistry
309	Linus C. Pauling	United States	1962	Peace
706	Marie Skłodowska-Curie	Poland	1903	Physics
709	Marie Skłodowska-Curie	France	1911	Chemistry

9.4.7 欠損フィールドの処理

df.count()

category          864
country           865
date_of_birth     857
date_of_death     566
gender            857
link              865
name              865
place_of_birth    831
place_of_death    524
text              865
year              865
dtype: int64

df[df.category.isnull()][['name', 'text']]

	name	text
922	Alexis Carrel	Alexis Carrel , Medicine, 1912

df.ix[df.name == 'Alexis Carrel', 'category'] =\
'Physiology or Medicine'

/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.

df[df.gender.isnull()]['name']

3                         Institut de Droit International
156                               Friends Service Council
267     American Friends Service Committee  (The Quakers)
574                                 Amnesty International
650                                         Ragnar Granit
947                              Médecins Sans Frontières
1000     Pugwash Conferences on Science and World Affairs
1033                   International Atomic Energy Agency
Name: name, dtype: object

df = df[df.gender.notnull()]
df.ix[df.name == 'Ragnar Granit', 'gender'] = 'male'

/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix

df[df.date_of_birth.isnull()]['name']

782    Hiroshi Amano
Name: name, dtype: object

df.ix[df.name == 'Hiroshi Amano', 'date_of_birth'] =\
'11 September 1960'

# Note that the example in the book uses the original DataFrame, not our newly cleaned one, used here
# Row 2 (Vladamir Prelog) is therefore missing
df[['name', 'date_of_birth']]

	name	date_of_birth
0	César Milstein	8 October 1927
4	Auguste Beernaert	26 July 1829
5	Maurice Maeterlinck	29 August 1862
6	Henri La Fontaine	22 April 1854
7	Jules Bordet	13 June 1870
8	Corneille Heymans	28 March 1892
9	Georges Pire	1910-02-10
10	Albert Claude	24 August 1899
11	Christian de Duve	2 October 1917
12	Ilya Prigogine	25 January 1917
13	François Englert	6 November 1932
17	Karl Adolph Gjellerup	June 2, 1857
18	August Krogh	November 15, 1874
19	Niels Bohr	7 October 1885
20	Johannes Andreas Grib Fibiger	23 April 1867
21	Henrik Dam	21 February 1895
22	Johannes Vilhelm Jensen	1873-01-20
23	Ben Roy Mottelson	July 9, 1926
24	Aage Bohr	19 June 1922
25	Niels Kaj Jerne	December 23, 1911
26	Jens Christian Skou	October 8, 1918
30	Jaroslav Heyrovský	December 20, 1890
31	Jaroslav Seifert	23 September 1901
32	Christopher A. Pissarides	1948-02-20
33	Irène Joliot-Curie	12 September 1897
34	Frédéric Joliot	19 March 1900
35	Roger Martin du Gard	23 March 1881
36	André Gide	1869-11-22
37	Léon Jouhaux	July 1, 1879
38	Albert Schweitzer	14 January 1875
...	...	...
1011	Muhammad Yunus	28 June 1940
1012	Lev Landau	January 22, 1908
1013	Philipp Lenard	June 7, 1862
1014	Bertha von Suttner	June 9, 1843
1015	Alfred Hermann Fried	11 November 1864
1016	Robert Bárány	22 April 1876
1017	Friderik Pregl	3 September 1869
1018	Richard Adolf Zsigmondy	1 April 1865
1019	Julius Wagner-Jauregg	7 March 1857
1020	Karl Landsteiner	June 14, 1868
1021	Erwin Schrödinger	12 August 1887
1022	Victor Francis Hess	24 June 1883
1025	Wolfgang Pauli	25 April 1900
1026	Max F. Perutz	19 May 1914
1028	Konrad Lorenz	November 7, 1903
1029	Friedrich Hayek	8 May 1899
1032	Elfriede Jelinek	20 October 1946
1036	Sir Howard Florey	24 September 1898
1037	Sir Frank Macfarlane Burnet	3 September 1899
1038	John Carew Eccles	27 January 1903
1040	Patrick White	28 May 1912
1042	John Harsanyi	May 29, 1920
1043	Peter C. Doherty & Professor Rolf Zinkernagel	15 October 1940
1044	J. Robin Warren	11 June 1937
1045	Barry Marshall	30 September 1951
1047	Brian P. Schmidt	February 24, 1967
1048	Carlos Saavedra Lamas	November 1, 1878
1049	Bernardo Houssay	1887-04-10
1050	Luis Federico Leloir	1906-9-6
1051	Adolfo Pérez Esquivel	November 26, 1931

857 rows × 2 columns

9.4.8 時刻と日付の処理

pd.to_datetime(df.date_of_birth, errors='raise')

0      1927-10-08
4      1829-07-26
5      1862-08-29
6      1854-04-22
7      1870-06-13
8      1892-03-28
9      1910-02-10
10     1899-08-24
11     1917-10-02
12     1917-01-25
13     1932-11-06
17     1857-06-02
18     1874-11-15
19     1885-10-07
20     1867-04-23
21     1895-02-21
22     1873-01-20
23     1926-07-09
24     1922-06-19
25     1911-12-23
26     1918-10-08
30     1890-12-20
31     1901-09-23
32     1948-02-20
33     1897-09-12
34     1900-03-19
35     1881-03-23
36     1869-11-22
37     1879-07-01
38     1875-01-14
          ...    
1011   1940-06-28
1012   1908-01-22
1013   1862-06-07
1014   1843-06-09
1015   1864-11-11
1016   1876-04-22
1017   1869-09-03
1018   1865-04-01
1019   1857-03-07
1020   1868-06-14
1021   1887-08-12
1022   1883-06-24
1025   1900-04-25
1026   1914-05-19
1028   1903-11-07
1029   1899-05-08
1032   1946-10-20
1036   1898-09-24
1037   1899-09-03
1038   1903-01-27
1040   1912-05-28
1042   1920-05-29
1043   1940-10-15
1044   1937-06-11
1045   1951-09-30
1047   1967-02-24
1048   1878-11-01
1049   1887-04-10
1050   1906-09-06
1051   1931-11-26
Name: date_of_birth, Length: 857, dtype: datetime64[ns]

#date_of_deathでは例外でた
pd.to_datetime(df.date_of_death, errors='raise')

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
    443             try:
--> 444                 values, tz = tslib.datetime_to_datetime64(arg)
    445                 return DatetimeIndex._simple_new(values, name=name, tz=tz)


pandas/_libs/tslib.pyx in pandas._libs.tslib.datetime_to_datetime64 (pandas/_libs/tslib.c:33275)()


TypeError: Unrecognized value type: <class 'str'>


During handling of the above exception, another exception occurred:


ValueError                                Traceback (most recent call last)

<ipython-input-56-2a87872c34e6> in <module>()
----> 1 pd.to_datetime(df.date_of_death, errors='raise')


/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin)
    507     elif isinstance(arg, ABCSeries):
    508         from pandas import Series
--> 509         values = _convert_listlike(arg._values, False, format)
    510         result = Series(values, index=arg.index, name=arg.name)
    511     elif isinstance(arg, (ABCDataFrame, MutableMapping)):


/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
    445                 return DatetimeIndex._simple_new(values, name=name, tz=tz)
    446             except (ValueError, TypeError):
--> 447                 raise e
    448 
    449     if arg is None:


/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
    433                     dayfirst=dayfirst,
    434                     yearfirst=yearfirst,
--> 435                     require_iso8601=require_iso8601
    436                 )
    437 


pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46617)()


pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46233)()


pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46122)()


pandas/_libs/tslib.pyx in pandas._libs.tslib.parse_datetime_string (pandas/_libs/tslib.c:35351)()


/home/beetle/anaconda3/lib/python3.6/site-packages/dateutil/parser.py in parse(timestr, parserinfo, **kwargs)
   1166         return parser(parserinfo).parse(timestr, **kwargs)
   1167     else:
-> 1168         return DEFAULTPARSER.parse(timestr, **kwargs)
   1169 
   1170 


/home/beetle/anaconda3/lib/python3.6/site-packages/dateutil/parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
    579                 repl['day'] = monthrange(cyear, cmonth)[1]
    580 
--> 581         ret = default.replace(**repl)
    582 
    583         if res.weekday is not None and not res.day:


ValueError: month must be in 1..12

for i,row in df.iterrows():
    try:
        pd.to_datetime(row.date_of_death,errors='raise')
    except:
        print('%s(%s, %d)'%(row.date_of_death.ljust(30),row['name'],i))

1968-23-07                    (Henry Hallett Dale, 150)
May 30, 2011 (aged 89)        (Rosalyn Yalow, 349)
living                        (David Trimble, 581)
Diederik Korteweg             (Johannes Diderik van der Waals, 746)
living                        (Shirin Ebadi, 809)
living                        (Rigoberta Menchú, 833)
1 February 1976, age 74       (Werner Karl Heisenberg, 858)

with_death_dates = df[df.date_of_death.notnull()]
bad_dates = pd.isnull(pd.to_datetime(\
with_death_dates.date_of_death, errors='coerce'))
with_death_dates[bad_dates][['category', 'date_of_death',\
'name']]

	category	date_of_death	name
150	Physiology or Medicine	1968-23-07	Henry Hallett Dale
349	Physiology or Medicine	May 30, 2011 (aged 89)	Rosalyn Yalow
581	Peace	living	David Trimble
746	Physics	Diederik Korteweg	Johannes Diderik van der Waals
809	Peace	living	Shirin Ebadi
833	Peace	living	Rigoberta Menchú
858	Physics	1 February 1976, age 74	Werner Karl Heisenberg

df.date_of_death = pd.to_datetime(df.date_of_death,\
errors='coerce')

df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth)\
.year

df.sort_values('award_age').iloc[:10]\
[['name', 'award_age', 'category', 'year']]

	name	award_age	category	year
725	Malala Yousafzai	17	Peace	2014
525	William Lawrence Bragg	25	Physics	1915
626	Georges J. F. Köhler	30	Physiology or Medicine	1976
858	Werner Karl Heisenberg	31	Physics	1932
975	Tsung-Dao Lee	31	Physics	1957
146	Paul Dirac	31	Physics	1933
247	Carl Anderson	31	Physics	1936
877	Rudolf Mössbauer	32	Physics	1961
226	Tawakkol Karman	32	Peace	2011
804	Mairéad Corrigan	32	Peace	1976

9.5 完成したclean_data関数

def clean_data(df):
    """The full clean data function, which returns both the cleaned Nobel data (df) and a DataFrame 
    containing those winners with a born_in field."""
    df = df.replace('', np.nan)
    df_born_in = df[df.born_in.notnull()] 
    df = df[df.born_in.isnull()]
    df = df.drop('born_in', axis=1) 
    df.drop(df[df.year == 1809].index, inplace=True) 
    df = df[~(df.name == 'Marie Curie')]
    df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
           (df.year == 1911), 'country'] = 'France'
    df = df[~((df.name == 'Sidney Altman') & (df.year == 1990))]
    df = df.reindex(np.random.permutation(df.index)) 
    df = df.drop_duplicates(['name', 'year'])         
    df = df.sort_index()
    df.ix[df.name == 'Alexis Carrel', 'category'] =\
        'Physiology or Medicine' 
    df.ix[df.name == 'Ragnar Granit', 'gender'] = 'male'
    df = df[df.gender.notnull()] # remove institutional prizes
    df.ix[df.name == 'Hiroshi Amano', 'date_of_birth'] =\
    '11 September 1960'
    df.date_of_birth = pd.to_datetime(df.date_of_birth) 
    df.date_of_death = pd.to_datetime(df.date_of_death, errors='coerce') 
    df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth).year 
    return df, df_born_in

9.6 クリーニングしたデータ・セットの保存

省略