9.4 データのクリーニング
df.born_in.describe()
count 1052
unique 40
top
freq 910
Name: born_in, dtype: object
9.4.1 混合した型の検出
set(df.born_in.apply(type))
{str}
9.4.2 文字列の置換
bi_col.replace('', np.nan, inplace=True)
bi_col
0 NaN
1 Bosnia and Herzegovina
2 Bosnia and Herzegovina
3 NaN
4 NaN
5 NaN
6 NaN
7 NaN
8 NaN
9 NaN
10 NaN
11 NaN
12 NaN
13 NaN
14 Belarus
15 Belarus
16 Belarus
17 NaN
18 NaN
19 NaN
20 NaN
21 NaN
22 NaN
23 NaN
24 NaN
25 NaN
26 NaN
27 Czech Republic
28 Czech Republic
29 Czech Republic
...
1022 NaN
1023 Austria
1024 Austria
1025 NaN
1026 NaN
1027 Austria
1028 NaN
1029 NaN
1030 Austria
1031 Austria
1032 NaN
1033 NaN
1034 Austria
1035 Australia
1036 NaN
1037 NaN
1038 NaN
1039 Australia
1040 NaN
1041 Australia
1042 NaN
1043 NaN
1044 NaN
1045 NaN
1046 Australia
1047 NaN
1048 NaN
1049 NaN
1050 NaN
1051 NaN
Name: born_in, Length: 1052, dtype: object
bi_col.count()
142
df.replace('', np.nan, inplace=True)
df.head()
|
name |
born_in |
category |
country |
date_of_birth |
date_of_death |
gender |
link |
place_of_birth |
place_of_death |
text |
year |
0 |
César Milstein |
NaN |
Physiology or Medicine |
Argentina |
8 October 1927 |
24 March 2002 |
male |
http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein |
Bahía Blanca , Argentina |
Cambridge , England |
César Milstein , Physiology or Medicine, 1984 |
1984 |
1 |
Ivo Andric * |
Bosnia and Herzegovina |
Literature |
NaN |
9 October 1892 |
13 March 1975 |
male |
http://en.wikipedia.org/wiki/Ivo_Andric |
Dolac (village near Travnik), Austria-Hungary ... |
Belgrade, SR Serbia, SFR Yugoslavia (present-d... |
Ivo Andric *, born in then Austria–Hungary ,... |
1961 |
2 |
Vladimir Prelog * |
Bosnia and Herzegovina |
Chemistry |
NaN |
July 23, 1906 |
1998-01-07 |
male |
http://en.wikipedia.org/wiki/Vladimir_Prelog |
Sarajevo , Bosnia and Herzegovina , then part... |
Zürich , Switzerland |
Vladimir Prelog *, born in then Austria–Hung... |
1975 |
3 |
Institut de Droit International |
NaN |
Peace |
Belgium |
None |
None |
None |
http://en.wikipedia.org/wiki/Institut_de_Droit... |
None |
None |
Institut de Droit International , Peace, 1904 |
1904 |
4 |
Auguste Beernaert |
NaN |
Peace |
Belgium |
26 July 1829 |
6 October 1912 |
male |
http://en.wikipedia.org/wiki/Auguste_Marie_Fra... |
Ostend , Netherlands (now Belgium ) |
Lucerne , Switzerland |
Auguste Beernaert , Peace, 1909 |
1909 |
dfa = df[df.name.str.contains('\*')]['name']
print(dfa)
1 Ivo Andric *
2 Vladimir Prelog *
14 Simon Kuznets *
15 Menachem Begin *
16 Shimon Peres *
27 Bertha von Suttner *
28 Gerty Cori *
29 Carl Ferdinand Cori *
50 Henry Kissinger *
51 Arno Penzias *
53 Georges J.F. Köhler *
58 Jack Steinberger *
63 Hans G. Dehmelt *
82 Renato Dulbecco *
87 Riccardo Giacconi *
88 Mario Capecchi *
101 Mario José Molina Henríquez *
102 Gabriel Lippmann *
103 Jules A. Hoffmann *
104 Andrew Schally *
105 Czesław Miłosz *
106 Aaron Klug *
109 Wilhelm Ostwald *
115 Severo Ochoa *
120 Allan M. Cormack *
126 Sydney Brenner *
128 Michael Levitt *
135 Niels Kaj Jerne *
138 Michael Smith *
268 T. S. Eliot *
...
935 Luis Federico Leloir *
937 Seán MacBride *
938 Roger Guillemin *
962 Niels Ryberg Finsen *
972 Leopold Ružička *
978 Daniel C. Tsui *
979 Gao Xingjian *
980 Charles K. Kao *
987 William Giauque *
989 Charles B. Huggins *
991 Saul Bellow *
992 David H. Hubel *
993 Henry Taube *
997 Rudolph A. Marcus *
1001 William Vickrey *
1002 Myron Scholes *
1005 Willard S. Boyle *
1008 Elias Canetti *
1009 Peter Medawar *
1010 Zhores Ivanovich Alferov *
1023 Otto Loewi *
1024 Richard Kuhn *
1027 Karl von Frisch *
1030 Walter Kohn *
1031 Eric R. Kandel *
1034 Martin Karplus *
1035 William Lawrence Bragg *
1039 Aleksandr M. Prokhorov *
1041 John Warcup Cornforth *
1046 Elizabeth H. Blackburn *
Name: name, Length: 142, dtype: object
df.name = df.name.str.replace('*', '')
df.name = df.name.str.strip()
df[df.name.str.contains('\*')]
|
name |
born_in |
category |
country |
date_of_birth |
date_of_death |
gender |
link |
place_of_birth |
place_of_death |
text |
year |
9.4.3 行の削除
np.nan == np.nan
False
df = df[df.born_in.isnull()]
df.count()
name 910
born_in 0
category 909
country 910
date_of_birth 901
date_of_death 589
gender 900
link 910
place_of_birth 875
place_of_death 546
text 910
year 910
dtype: int64
df = df.drop('born_in', axis=1)
df.head()
|
name |
category |
country |
date_of_birth |
date_of_death |
gender |
link |
place_of_birth |
place_of_death |
text |
year |
0 |
César Milstein |
Physiology or Medicine |
Argentina |
8 October 1927 |
24 March 2002 |
male |
http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein |
Bahía Blanca , Argentina |
Cambridge , England |
César Milstein , Physiology or Medicine, 1984 |
1984 |
3 |
Institut de Droit International |
Peace |
Belgium |
None |
None |
None |
http://en.wikipedia.org/wiki/Institut_de_Droit... |
None |
None |
Institut de Droit International , Peace, 1904 |
1904 |
4 |
Auguste Beernaert |
Peace |
Belgium |
26 July 1829 |
6 October 1912 |
male |
http://en.wikipedia.org/wiki/Auguste_Marie_Fra... |
Ostend , Netherlands (now Belgium ) |
Lucerne , Switzerland |
Auguste Beernaert , Peace, 1909 |
1909 |
5 |
Maurice Maeterlinck |
Literature |
Belgium |
29 August 1862 |
6 May 1949 |
male |
http://en.wikipedia.org/wiki/Maurice_Maeterlinck |
Ghent , Belgium |
Nice , France |
Maurice Maeterlinck , Literature, 1911 |
1911 |
6 |
Henri La Fontaine |
Peace |
Belgium |
22 April 1854 |
14 May 1943 |
male |
http://en.wikipedia.org/wiki/Henri_La_Fontaine |
Brussels |
Belgium |
Henri La Fontaine , Peace, 1913 |
1913 |
9.4.4 重複の検出
dupes_by_name = df[df.duplicated('name')]
dupes_by_name.count()
name 46
category 46
country 46
date_of_birth 45
date_of_death 24
gender 44
link 46
place_of_birth 45
place_of_death 23
text 46
year 46
dtype: int64
duplicated関数の挙動
通常のduplicated関数とkeepに'lastオプションをつけたものを|で結合すると
全く重複していないデータ以外全てTrueになる。
(ここではindex3の'&&&'が唯一重複を持たないデータ)
dfa = pd.DataFrame({'name':['###','***','$$$','%%%','&&&','###','%%%','$$$','###','$$$','***','%%%']})
print(dfa)
first_dupes = dfa.duplicated('name')
print(first_dupes)
last_dupes = dfa.duplicated('name',keep='last')
print(last_dupes)
dfa[dfa.duplicated('name') | dfa.duplicated('name',keep='last')]
name
0 ###
1 ***
2 $$$
3 %%%
4 &&&
5 ###
6 %%%
7 $$$
8 ###
9 $$$
10 ***
11 %%%
0 False
1 False
2 False
3 False
4 False
5 True
6 True
7 True
8 True
9 True
10 True
11 True
dtype: bool
0 True
1 True
2 True
3 True
4 False
5 True
6 True
7 True
8 False
9 False
10 False
11 False
dtype: bool
|
name |
0 |
### |
1 |
*** |
2 |
$$$ |
3 |
%%% |
5 |
### |
6 |
%%% |
7 |
$$$ |
8 |
### |
9 |
$$$ |
10 |
*** |
11 |
%%% |
all_dupes = df[df.duplicated('name')\
| df.duplicated('name', keep='last')]
all_dupes.count()
name 92
category 92
country 92
date_of_birth 90
date_of_death 48
gender 88
link 92
place_of_birth 90
place_of_death 46
text 92
year 92
dtype: int64
all_dupes = df[df.name.isin(dupes_by_name.name)]
all_dupes.count()
name 92
category 92
country 92
date_of_birth 90
date_of_death 48
gender 88
link 92
place_of_birth 90
place_of_death 46
text 92
year 92
dtype: int64
pd.concat([g for _,g in df.groupby('name')\
if len(g) > 1])['name']
121 Aaron Klug
131 Aaron Klug
615 Albert Einstein
844 Albert Einstein
176 Arieh Warshel
798 Arieh Warshel
94 Avram Hershko
830 Avram Hershko
228 Baruj Benacerraf
366 Baruj Benacerraf
573 Betty Williams
805 Betty Williams
162 Brian P. Schmidt
1047 Brian P. Schmidt
498 Charles K. Kao
831 Charles K. Kao
295 Chen Ning Yang
976 Chen Ning Yang
0 César Milstein
134 César Milstein
623 Daniel Bovet
790 Daniel Bovet
93 Daniel Kahneman
457 Daniel Kahneman
407 Edmond H. Fischer
630 Edmond H. Fischer
505 Ei-ichi Negishi
778 Ei-ichi Negishi
524 Ernest Rutherford
985 Ernest Rutherford
...
632 Médecins Sans Frontières
947 Médecins Sans Frontières
490 Osamu Shimomura
776 Osamu Shimomura
72 Philipp Lenard
1013 Philipp Lenard
650 Ragnar Granit
960 Ragnar Granit
510 Ralph M. Steinman
1006 Ralph M. Steinman
85 Rita Levi-Montalcini
376 Rita Levi-Montalcini
96 Robert Aumann
476 Robert Aumann
137 Ronald Coase
405 Ronald Coase
515 Shuji Nakamura
780 Shuji Nakamura
396 Sidney Altman
995 Sidney Altman
451 Sydney Brenner
586 Sydney Brenner
172 Thomas C. Südhof
905 Thomas C. Südhof
294 Tsung-Dao Lee
975 Tsung-Dao Lee
333 Wassily Leontief
684 Wassily Leontief
489 Yoichiro Nambu
773 Yoichiro Nambu
Name: name, Length: 92, dtype: object
9.4.5 データのソート
df2 = pd.DataFrame(\
{'name':['zak', 'alice', 'bob', 'mike', 'bob', 'bob'],\
'score':[4, 3, 5, 2, 3, 7]})
df2.sort_values(['name', 'score'],\
ascending=[1,0])
|
name |
score |
1 |
alice |
3 |
5 |
bob |
7 |
2 |
bob |
5 |
4 |
bob |
3 |
3 |
mike |
2 |
0 |
zak |
4 |
all_dupes.sort_values('name')[['name', 'country', 'year']]
|
name |
country |
year |
121 |
Aaron Klug |
South Africa |
1982 |
131 |
Aaron Klug |
United Kingdom |
1982 |
844 |
Albert Einstein |
Germany |
1921 |
615 |
Albert Einstein |
Switzerland |
1921 |
176 |
Arieh Warshel |
United States |
2013 |
798 |
Arieh Warshel |
Israel |
2013 |
830 |
Avram Hershko |
Hungary |
2004 |
94 |
Avram Hershko |
Israel |
2004 |
366 |
Baruj Benacerraf |
United States |
1980 |
228 |
Baruj Benacerraf |
Venezuela |
1980 |
805 |
Betty Williams |
Ireland |
1976 |
573 |
Betty Williams |
United Kingdom |
1976 |
162 |
Brian P. Schmidt |
United States |
2011 |
1047 |
Brian P. Schmidt |
Australia |
2011 |
498 |
Charles K. Kao |
United States |
2009 |
831 |
Charles K. Kao |
Hong Kong |
2009 |
976 |
Chen Ning Yang |
China |
1957 |
295 |
Chen Ning Yang |
United States |
1957 |
0 |
César Milstein |
Argentina |
1984 |
134 |
César Milstein |
United Kingdom |
1984 |
623 |
Daniel Bovet |
Switzerland |
1957 |
790 |
Daniel Bovet |
Italy |
1957 |
93 |
Daniel Kahneman |
Israel |
2002 |
457 |
Daniel Kahneman |
United States |
2002 |
630 |
Edmond H. Fischer |
Switzerland |
1992 |
407 |
Edmond H. Fischer |
United States |
1992 |
778 |
Ei-ichi Negishi |
Japan |
2010 |
505 |
Ei-ichi Negishi |
United States |
2010 |
985 |
Ernest Rutherford |
Canada |
1908 |
524 |
Ernest Rutherford |
United Kingdom |
1908 |
... |
... |
... |
... |
947 |
Médecins Sans Frontières |
France |
1999 |
632 |
Médecins Sans Frontières |
Switzerland |
1999 |
776 |
Osamu Shimomura |
Japan |
2008 |
490 |
Osamu Shimomura |
United States |
2008 |
1013 |
Philipp Lenard |
Austria |
1905 |
72 |
Philipp Lenard |
Germany |
1905 |
650 |
Ragnar Granit |
Sweden |
1967 |
960 |
Ragnar Granit |
Finland |
1809 |
1006 |
Ralph M. Steinman |
Canada |
2011 |
510 |
Ralph M. Steinman |
United States |
2011 |
85 |
Rita Levi-Montalcini |
Italy |
1986 |
376 |
Rita Levi-Montalcini |
United States |
1986 |
96 |
Robert Aumann |
Israel |
2005 |
476 |
Robert Aumann |
United States |
2005 |
405 |
Ronald Coase |
United States |
1991 |
137 |
Ronald Coase |
United Kingdom |
1991 |
515 |
Shuji Nakamura |
United States |
2014 |
780 |
Shuji Nakamura |
Japan |
2014 |
995 |
Sidney Altman |
Canada |
1989 |
396 |
Sidney Altman |
United States |
1990 |
451 |
Sydney Brenner |
United States |
2002 |
586 |
Sydney Brenner |
United Kingdom |
2002 |
905 |
Thomas C. Südhof |
Germany |
2013 |
172 |
Thomas C. Südhof |
United States |
2013 |
975 |
Tsung-Dao Lee |
China |
1957 |
294 |
Tsung-Dao Lee |
United States |
1957 |
333 |
Wassily Leontief |
United States |
1973 |
684 |
Wassily Leontief |
Russia |
1973 |
773 |
Yoichiro Nambu |
Japan |
2008 |
489 |
Yoichiro Nambu |
United States |
2008 |
92 rows × 3 columns
9.4.6重複の削除
df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
(df.year == 1911), 'country'] = 'France'
df.drop(df[(df.name == 'Sidney Altman') &\
(df.year == 1990)].index,
inplace=True)
def clean_data(df):
df = df.replace('', np.nan)
df = df[df.born_in.isnull()]
df = df.drop('born_in', axis=1)
df.drop(df[df.year == 1809].index, inplace=True)
df = df[~(df.name == 'Marie Curie')]
df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
(df.year == 1911), 'country'] = 'France'
df = df[~((df.name == 'Sidney Altman') &\
(df.year == 1990))]
return df
df = reload_data()
df = clean_data(df)
df = df.reindex(np.random.permutation(df.index))
df = df.drop_duplicates(['name', 'year'])
df = df.sort_index()
df.count()
category 864
country 865
date_of_birth 857
date_of_death 566
gender 857
link 865
name 865
place_of_birth 831
place_of_death 524
text 865
year 865
dtype: int64
df[df.duplicated('name') |
df.duplicated('name', keep='last')]\
.sort_values(by='name')\
[['name', 'country', 'year', 'category']]
|
name |
country |
year |
category |
548 |
Frederick Sanger |
United Kingdom |
1958 |
Chemistry |
580 |
Frederick Sanger |
United Kingdom |
1980 |
Chemistry |
292 |
John Bardeen |
United States |
1956 |
Physics |
326 |
John Bardeen |
United States |
1972 |
Physics |
285 |
Linus C. Pauling |
United States |
1954 |
Chemistry |
309 |
Linus C. Pauling |
United States |
1962 |
Peace |
706 |
Marie Skłodowska-Curie |
Poland |
1903 |
Physics |
709 |
Marie Skłodowska-Curie |
France |
1911 |
Chemistry |
9.4.7 欠損フィールドの処理
df.count()
category 864
country 865
date_of_birth 857
date_of_death 566
gender 857
link 865
name 865
place_of_birth 831
place_of_death 524
text 865
year 865
dtype: int64
df[df.category.isnull()][['name', 'text']]
|
name |
text |
922 |
Alexis Carrel |
Alexis Carrel , Medicine, 1912 |
df.ix[df.name == 'Alexis Carrel', 'category'] =\
'Physiology or Medicine'
/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
"""Entry point for launching an IPython kernel.
df[df.gender.isnull()]['name']
3 Institut de Droit International
156 Friends Service Council
267 American Friends Service Committee (The Quakers)
574 Amnesty International
650 Ragnar Granit
947 Médecins Sans Frontières
1000 Pugwash Conferences on Science and World Affairs
1033 International Atomic Energy Agency
Name: name, dtype: object
df = df[df.gender.notnull()]
df.ix[df.name == 'Ragnar Granit', 'gender'] = 'male'
/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
df[df.date_of_birth.isnull()]['name']
782 Hiroshi Amano
Name: name, dtype: object
df.ix[df.name == 'Hiroshi Amano', 'date_of_birth'] =\
'11 September 1960'
df[['name', 'date_of_birth']]
|
name |
date_of_birth |
0 |
César Milstein |
8 October 1927 |
4 |
Auguste Beernaert |
26 July 1829 |
5 |
Maurice Maeterlinck |
29 August 1862 |
6 |
Henri La Fontaine |
22 April 1854 |
7 |
Jules Bordet |
13 June 1870 |
8 |
Corneille Heymans |
28 March 1892 |
9 |
Georges Pire |
1910-02-10 |
10 |
Albert Claude |
24 August 1899 |
11 |
Christian de Duve |
2 October 1917 |
12 |
Ilya Prigogine |
25 January 1917 |
13 |
François Englert |
6 November 1932 |
17 |
Karl Adolph Gjellerup |
June 2, 1857 |
18 |
August Krogh |
November 15, 1874 |
19 |
Niels Bohr |
7 October 1885 |
20 |
Johannes Andreas Grib Fibiger |
23 April 1867 |
21 |
Henrik Dam |
21 February 1895 |
22 |
Johannes Vilhelm Jensen |
1873-01-20 |
23 |
Ben Roy Mottelson |
July 9, 1926 |
24 |
Aage Bohr |
19 June 1922 |
25 |
Niels Kaj Jerne |
December 23, 1911 |
26 |
Jens Christian Skou |
October 8, 1918 |
30 |
Jaroslav Heyrovský |
December 20, 1890 |
31 |
Jaroslav Seifert |
23 September 1901 |
32 |
Christopher A. Pissarides |
1948-02-20 |
33 |
Irène Joliot-Curie |
12 September 1897 |
34 |
Frédéric Joliot |
19 March 1900 |
35 |
Roger Martin du Gard |
23 March 1881 |
36 |
André Gide |
1869-11-22 |
37 |
Léon Jouhaux |
July 1, 1879 |
38 |
Albert Schweitzer |
14 January 1875 |
... |
... |
... |
1011 |
Muhammad Yunus |
28 June 1940 |
1012 |
Lev Landau |
January 22, 1908 |
1013 |
Philipp Lenard |
June 7, 1862 |
1014 |
Bertha von Suttner |
June 9, 1843 |
1015 |
Alfred Hermann Fried |
11 November 1864 |
1016 |
Robert Bárány |
22 April 1876 |
1017 |
Friderik Pregl |
3 September 1869 |
1018 |
Richard Adolf Zsigmondy |
1 April 1865 |
1019 |
Julius Wagner-Jauregg |
7 March 1857 |
1020 |
Karl Landsteiner |
June 14, 1868 |
1021 |
Erwin Schrödinger |
12 August 1887 |
1022 |
Victor Francis Hess |
24 June 1883 |
1025 |
Wolfgang Pauli |
25 April 1900 |
1026 |
Max F. Perutz |
19 May 1914 |
1028 |
Konrad Lorenz |
November 7, 1903 |
1029 |
Friedrich Hayek |
8 May 1899 |
1032 |
Elfriede Jelinek |
20 October 1946 |
1036 |
Sir Howard Florey |
24 September 1898 |
1037 |
Sir Frank Macfarlane Burnet |
3 September 1899 |
1038 |
John Carew Eccles |
27 January 1903 |
1040 |
Patrick White |
28 May 1912 |
1042 |
John Harsanyi |
May 29, 1920 |
1043 |
Peter C. Doherty & Professor Rolf Zinkernagel |
15 October 1940 |
1044 |
J. Robin Warren |
11 June 1937 |
1045 |
Barry Marshall |
30 September 1951 |
1047 |
Brian P. Schmidt |
February 24, 1967 |
1048 |
Carlos Saavedra Lamas |
November 1, 1878 |
1049 |
Bernardo Houssay |
1887-04-10 |
1050 |
Luis Federico Leloir |
1906-9-6 |
1051 |
Adolfo Pérez Esquivel |
November 26, 1931 |
857 rows × 2 columns
9.4.8 時刻と日付の処理
pd.to_datetime(df.date_of_birth, errors='raise')
0 1927-10-08
4 1829-07-26
5 1862-08-29
6 1854-04-22
7 1870-06-13
8 1892-03-28
9 1910-02-10
10 1899-08-24
11 1917-10-02
12 1917-01-25
13 1932-11-06
17 1857-06-02
18 1874-11-15
19 1885-10-07
20 1867-04-23
21 1895-02-21
22 1873-01-20
23 1926-07-09
24 1922-06-19
25 1911-12-23
26 1918-10-08
30 1890-12-20
31 1901-09-23
32 1948-02-20
33 1897-09-12
34 1900-03-19
35 1881-03-23
36 1869-11-22
37 1879-07-01
38 1875-01-14
...
1011 1940-06-28
1012 1908-01-22
1013 1862-06-07
1014 1843-06-09
1015 1864-11-11
1016 1876-04-22
1017 1869-09-03
1018 1865-04-01
1019 1857-03-07
1020 1868-06-14
1021 1887-08-12
1022 1883-06-24
1025 1900-04-25
1026 1914-05-19
1028 1903-11-07
1029 1899-05-08
1032 1946-10-20
1036 1898-09-24
1037 1899-09-03
1038 1903-01-27
1040 1912-05-28
1042 1920-05-29
1043 1940-10-15
1044 1937-06-11
1045 1951-09-30
1047 1967-02-24
1048 1878-11-01
1049 1887-04-10
1050 1906-09-06
1051 1931-11-26
Name: date_of_birth, Length: 857, dtype: datetime64[ns]
pd.to_datetime(df.date_of_death, errors='raise')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
443 try:
--> 444 values, tz = tslib.datetime_to_datetime64(arg)
445 return DatetimeIndex._simple_new(values, name=name, tz=tz)
pandas/_libs/tslib.pyx in pandas._libs.tslib.datetime_to_datetime64 (pandas/_libs/tslib.c:33275)()
TypeError: Unrecognized value type: <class 'str'>
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-56-2a87872c34e6> in <module>()
----> 1 pd.to_datetime(df.date_of_death, errors='raise')
/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in to_datetime(arg, errors, dayfirst, yearfirst, utc, box, format, exact, unit, infer_datetime_format, origin)
507 elif isinstance(arg, ABCSeries):
508 from pandas import Series
--> 509 values = _convert_listlike(arg._values, False, format)
510 result = Series(values, index=arg.index, name=arg.name)
511 elif isinstance(arg, (ABCDataFrame, MutableMapping)):
/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
445 return DatetimeIndex._simple_new(values, name=name, tz=tz)
446 except (ValueError, TypeError):
--> 447 raise e
448
449 if arg is None:
/home/beetle/anaconda3/lib/python3.6/site-packages/pandas/core/tools/datetimes.py in _convert_listlike(arg, box, format, name, tz)
433 dayfirst=dayfirst,
434 yearfirst=yearfirst,
--> 435 require_iso8601=require_iso8601
436 )
437
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46617)()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46233)()
pandas/_libs/tslib.pyx in pandas._libs.tslib.array_to_datetime (pandas/_libs/tslib.c:46122)()
pandas/_libs/tslib.pyx in pandas._libs.tslib.parse_datetime_string (pandas/_libs/tslib.c:35351)()
/home/beetle/anaconda3/lib/python3.6/site-packages/dateutil/parser.py in parse(timestr, parserinfo, **kwargs)
1166 return parser(parserinfo).parse(timestr, **kwargs)
1167 else:
-> 1168 return DEFAULTPARSER.parse(timestr, **kwargs)
1169
1170
/home/beetle/anaconda3/lib/python3.6/site-packages/dateutil/parser.py in parse(self, timestr, default, ignoretz, tzinfos, **kwargs)
579 repl['day'] = monthrange(cyear, cmonth)[1]
580
--> 581 ret = default.replace(**repl)
582
583 if res.weekday is not None and not res.day:
ValueError: month must be in 1..12
for i,row in df.iterrows():
try:
pd.to_datetime(row.date_of_death,errors='raise')
except:
print('%s(%s, %d)'%(row.date_of_death.ljust(30),row['name'],i))
1968-23-07 (Henry Hallett Dale, 150)
May 30, 2011 (aged 89) (Rosalyn Yalow, 349)
living (David Trimble, 581)
Diederik Korteweg (Johannes Diderik van der Waals, 746)
living (Shirin Ebadi, 809)
living (Rigoberta Menchú, 833)
1 February 1976, age 74 (Werner Karl Heisenberg, 858)
with_death_dates = df[df.date_of_death.notnull()]
bad_dates = pd.isnull(pd.to_datetime(\
with_death_dates.date_of_death, errors='coerce'))
with_death_dates[bad_dates][['category', 'date_of_death',\
'name']]
|
category |
date_of_death |
name |
150 |
Physiology or Medicine |
1968-23-07 |
Henry Hallett Dale |
349 |
Physiology or Medicine |
May 30, 2011 (aged 89) |
Rosalyn Yalow |
581 |
Peace |
living |
David Trimble |
746 |
Physics |
Diederik Korteweg |
Johannes Diderik van der Waals |
809 |
Peace |
living |
Shirin Ebadi |
833 |
Peace |
living |
Rigoberta Menchú |
858 |
Physics |
1 February 1976, age 74 |
Werner Karl Heisenberg |
df.date_of_death = pd.to_datetime(df.date_of_death,\
errors='coerce')
df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth)\
.year
df.sort_values('award_age').iloc[:10]\
[['name', 'award_age', 'category', 'year']]
|
name |
award_age |
category |
year |
725 |
Malala Yousafzai |
17 |
Peace |
2014 |
525 |
William Lawrence Bragg |
25 |
Physics |
1915 |
626 |
Georges J. F. Köhler |
30 |
Physiology or Medicine |
1976 |
858 |
Werner Karl Heisenberg |
31 |
Physics |
1932 |
975 |
Tsung-Dao Lee |
31 |
Physics |
1957 |
146 |
Paul Dirac |
31 |
Physics |
1933 |
247 |
Carl Anderson |
31 |
Physics |
1936 |
877 |
Rudolf Mössbauer |
32 |
Physics |
1961 |
226 |
Tawakkol Karman |
32 |
Peace |
2011 |
804 |
Mairéad Corrigan |
32 |
Peace |
1976 |
9.5 完成したclean_data関数
def clean_data(df):
"""The full clean data function, which returns both the cleaned Nobel data (df) and a DataFrame
containing those winners with a born_in field."""
df = df.replace('', np.nan)
df_born_in = df[df.born_in.notnull()]
df = df[df.born_in.isnull()]
df = df.drop('born_in', axis=1)
df.drop(df[df.year == 1809].index, inplace=True)
df = df[~(df.name == 'Marie Curie')]
df.loc[(df.name == u'Marie Sk\u0142odowska-Curie') &\
(df.year == 1911), 'country'] = 'France'
df = df[~((df.name == 'Sidney Altman') & (df.year == 1990))]
df = df.reindex(np.random.permutation(df.index))
df = df.drop_duplicates(['name', 'year'])
df = df.sort_index()
df.ix[df.name == 'Alexis Carrel', 'category'] =\
'Physiology or Medicine'
df.ix[df.name == 'Ragnar Granit', 'gender'] = 'male'
df = df[df.gender.notnull()]
df.ix[df.name == 'Hiroshi Amano', 'date_of_birth'] =\
'11 September 1960'
df.date_of_birth = pd.to_datetime(df.date_of_birth)
df.date_of_death = pd.to_datetime(df.date_of_death, errors='coerce')
df['award_age'] = df.year - pd.DatetimeIndex(df.date_of_birth).year
return df, df_born_in
9.6 クリーニングしたデータ・セットの保存
省略