「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

jupyter notebookから直接MarkDownファイルに変換してここに貼り付ける方式に変更した

8.4DataFrame

# Load the Scrapy-produced Nobel winners JSON into a pandas DataFrame.
import pandas as pd 
PATH = "/home/beetle/myproject/DataVisualization/nobel_winners/dataviz-with-python-and-js/Ch06_Heavyweight_Scraping_with_Scrapy/winner_list.json"
df = pd.read_json(PATH)
df.head(3)  # peek at the first three rows
born_in category country date_of_birth date_of_death gender link name place_of_birth place_of_death text year
0 Economics Austria 8 May 1899 23 March 1992 male http://en.wikipedia.org/wiki/Friedrich_Hayek Friedrich Hayek Vienna Freiburg im Breisgau Friedrich Hayek , Economics, 1974 1974
1 Physiology or Medicine Austria 7 November 1903 27 February 1989 male http://en.wikipedia.org/wiki/Konrad_Lorenz Konrad Lorenz Vienna Vienna Konrad Lorenz , Physiology or Medicine, 1973 1973
2 Austria Physiology or Medicine 20 November 1886 12 June 1982 male http://en.wikipedia.org/wiki/Karl_von_Frisch Karl von Frisch * Vienna Munich Karl von Frisch *, Physiology or Medicine, 1973 1973
8.4.1インデックス
print(df.columns)  # column labels
print(df.index)    # default integer index (0..1057 here)
df =df.set_index('name')  # switch to the winner's name as the row index
print(df.loc['Albert Einstein'])  # label lookup; duplicate labels return all matching rows
df = df.reset_index()  # restore the default integer index
Index(['born_in', 'category', 'country', 'date_of_birth', 'date_of_death',
       'gender', 'link', 'name', 'place_of_birth', 'place_of_death', 'text',
       'year'],
      dtype='object')
Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057],
           dtype='int64', length=1058)
                born_in category      country  date_of_birth  date_of_death  \
name                                                                          
Albert Einstein          Physics  Switzerland  14 March 1879  18 April 1955   
Albert Einstein          Physics      Germany  14 March 1879  18 April 1955   

                gender                                          link  \
name                                                                   
Albert Einstein   male  http://en.wikipedia.org/wiki/Albert_Einstein   
Albert Einstein   male  http://en.wikipedia.org/wiki/Albert_Einstein   

                place_of_birth place_of_death  \
name                                            
Albert Einstein            Ulm      Princeton   
Albert Einstein            Ulm      Princeton   

                                                              text  year  
name                                                                      
Albert Einstein  Albert Einstein ,  born in Germany , Physics, ...  1921  
Albert Einstein                    Albert Einstein , Physics, 1921  1921  
8.4.2 行と列
# .ix is deprecated (the DeprecationWarning is visible in the output below):
# use .iloc for positional indexing and .loc for label-based indexing.
print(df.iloc[2])
print(df.iloc[2])  # was df.ix[2]: positional on an integer index
df = df.set_index('name')
print(df.loc['Albert Einstein'])  # was df.ix[...]: label-based lookup
print(df.iloc[2])  # was df.ix[2]: positional fallback on a string index
name                                            Karl von Frisch *
born_in                                                   Austria
category                                   Physiology or Medicine
country                                                          
date_of_birth                                    20 November 1886
date_of_death                                        12 June 1982
gender                                                       male
link                 http://en.wikipedia.org/wiki/Karl_von_Frisch
place_of_birth                                             Vienna
place_of_death                                             Munich
text              Karl von Frisch *, Physiology or Medicine, 1973
year                                                         1973
Name: 2, dtype: object
name                                            Karl von Frisch *
born_in                                                   Austria
category                                   Physiology or Medicine
country                                                          
date_of_birth                                    20 November 1886
date_of_death                                        12 June 1982
gender                                                       male
link                 http://en.wikipedia.org/wiki/Karl_von_Frisch
place_of_birth                                             Vienna
place_of_death                                             Munich
text              Karl von Frisch *, Physiology or Medicine, 1973
year                                                         1973
Name: 2, dtype: object
                born_in category      country  date_of_birth  date_of_death  \
name                                                                          
Albert Einstein          Physics  Switzerland  14 March 1879  18 April 1955   
Albert Einstein          Physics      Germany  14 March 1879  18 April 1955   

                gender                                          link  \
name                                                                   
Albert Einstein   male  http://en.wikipedia.org/wiki/Albert_Einstein   
Albert Einstein   male  http://en.wikipedia.org/wiki/Albert_Einstein   

                place_of_birth place_of_death  \
name                                            
Albert Einstein            Ulm      Princeton   
Albert Einstein            Ulm      Princeton   

                                                              text  year  
name                                                                      
Albert Einstein  Albert Einstein ,  born in Germany , Physics, ...  1921  
Albert Einstein                    Albert Einstein , Physics, 1921  1921  
born_in                                                   Austria
category                                   Physiology or Medicine
country                                                          
date_of_birth                                    20 November 1886
date_of_death                                        12 June 1982
gender                                                       male
link                 http://en.wikipedia.org/wiki/Karl_von_Frisch
place_of_birth                                             Vienna
place_of_death                                             Munich
text              Karl von Frisch *, Physiology or Medicine, 1973
year                                                         1973
Name: Karl von Frisch *, dtype: object


/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix

/home/beetle/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:4: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  after removing the cwd from sys.path.
8.4.3グループ
# NOTE(review): this rebinds df to a GroupBy object and discards the
# DataFrame binding -- a separate name (e.g. by_category) would be safer
# for any later cells that still expect a DataFrame.
df = df.groupby('category')
print(df.groups.keys())
# pick out the Physics winners only
phy_group = df.get_group('Physics')
print(phy_group.head(3))
dict_keys(['', 'Chemistry', 'Economics', 'Literature', 'Peace', 'Physics', 'Physiology or Medicine'])
                  born_in category        country     date_of_birth  \
name                                                                  
Brian Schmidt              Physics      Australia  24 February 1967   
Percy W. Bridgman          Physics  United States     21 April 1882   
Isidor Isaac Rabi          Physics  United States      29 July 1898   

                     date_of_death gender  \
name                                        
Brian Schmidt                  NaN   male   
Percy W. Bridgman   20 August 1961   male   
Isidor Isaac Rabi  11 January 1988   male   

                                                             link  \
name                                                                
Brian Schmidt          http://en.wikipedia.org/wiki/Brian_Schmidt   
Percy W. Bridgman  http://en.wikipedia.org/wiki/Percy_W._Bridgman   
Isidor Isaac Rabi  http://en.wikipedia.org/wiki/Isidor_Isaac_Rabi   

                  place_of_birth place_of_death  \
name                                              
Brian Schmidt           Missoula            NaN   
Percy W. Bridgman      Cambridge       Randolph   
Isidor Isaac Rabi        Rymanów  New York City   

                                                                text  year  
name                                                                        
Brian Schmidt      Brian Schmidt ,  born in the United States , P...  2011  
Percy W. Bridgman                  Percy W. Bridgman , Physics, 1946  1946  
Isidor Isaac Rabi  Isidor Isaac Rabi ,  born in Austria , Physics...  1944  

「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

7.1.2配列のインデックス指定とスライス
import numpy as np  # np was used in this snippet without being imported

a = np.array([1,2,3,4,5,6])
print(a[2])       # single element (0-based): 3
print(a[3:5])     # slice: [4 5]
# Chained assignment: 0 is bound to b AND written into the slice a[:4:2],
# so print(b) shows 0 rather than the slice value [1 3].
b = a[:4:2] = 0
print(b)
a = np.array([1,2,3,4,5,6])
print(a[::-1])    # reversed
結果
3
[4 5]  
0  
[6 5 4 3 2 1]   

a[:4:2]の結果だけ本と違う?  
`b = a[:4:2] = 0` は連鎖代入なので、b にはスライスではなく 0 がそのまま束縛され、print(b) は 0 になる。  
ちなみに 0 の代入をやめて `b = a[:4:2]` とすると  
[1 3]となる  
図7-22次元Numpy配列に対する基本的な数学演算
# Basic elementwise arithmetic on a 2x3 NumPy array; each scalar
# operation broadcasts over every cell.
a = np.array([1, 2, 3, 4, 5, 6]).reshape(2, 3)
print(a)
print(a * 2)
print(a - 2)
print(a / 2.0)
結果
[[1 2 3]
[4 5 6]]

[[ 2  4  6]
 [ 8 10 12]] #ここが本と異なる

[[-1  0  1]
 [ 2  3  4]]

[[ 0.5  1.   1.5]


 [ 2.   2.5  3. ]]

a = np.array([45,65,76,32,99,22])
print(a < 50)  # elementwise comparison yields a boolean array
結果
[ True False False  True False  True]
図7-2Numpy配列数学関数
pi = np.pi
a = np.array([pi,pi/2,pi/4,pi/6])
print(np.degrees(a))  # radians -> degrees
結果
[ 180.   90.   45.   30.]

sin_a = np.sin(a)
print(sin_a)  # note: sin(pi) prints as ~1.2e-16, not exactly 0 (float rounding)
結果
[  1.22464680e-16   1.00000000e+00   7.07106781e-01   5.00000000e-01]

np.round(sin_a,7)  # round to 7 decimal places to tidy the float noise
結果
array([ 0.       ,  1.       ,  0.7071068,  0.5      ])

a = np.arange(8).reshape(2,4)
print(a)
print(np.cumsum(a,axis=0))  # accumulate down each column (across rows)
print(np.cumsum(a,axis=1))  # accumulate along each row
print(np.cumsum(a))         # cumulative sum over the flattened array
結果
[[0 1 2 3]
 [4 5 6 7]]
axis=0(行をまたいで縦方向、つまり各列ごとに累積)
[[ 0  1  2  3]
 [ 4  6  8 10]]
axis=1(列をまたいで横方向、つまり各行ごとに累積)
[[ 0  1  3  6]
 [ 4  9 15 22]]

[ 0  1  3  6 10 15 21 28]

「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

6.8パイプラインを使ったテキストと画像のスクレイピング

#nwinners_minbio_spider.py
import scrapy
import re

BASE_URL = 'http://en.wikipedia.org'


class NWinnerItemBio(scrapy.Item):
    # Item for the mini-bio spider: page link, bio snippet and portrait images.
    link = scrapy.Field()
    name = scrapy.Field() # NOTE(review): this name field appears unused -- confirm
    mini_bio = scrapy.Field() 
    image_urls = scrapy.Field()
    bio_image = scrapy.Field() # NOTE(review): appears unused here; set by the pipeline -- confirm
    images = scrapy.Field() # NOTE(review): appears unused -- confirm


class NWinnerSpiderBio(scrapy.Spider):
    """ Scrapes the Nobel prize biography pages for portrait images and a biographical snippet """

    name = 'nwinners_minibio'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        #"http://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
        # NOTE(review): the '?dfdfd' query string looks like a leftover
        # cache-buster -- confirm whether the clean URL above is intended.
        "http://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country?dfdfd"
    ]

    #For Scrapy v 1.0+, custom_settings can override the item pipelines in settings
    custom_settings = {
        'ITEM_PIPELINES': {'nobel_winners.pipelines.NobelImagesPipeline':1},
    }

    def parse(self, response):
        """Yield one Request per laureate <li> found under each country <h2>."""
        h2s = response.xpath('//h2')

        for h2 in h2s[2:]: # skip the first two headers (page front matter)
            country = h2.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h2.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    wdata = {}
                    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]

                    request = scrapy.Request(wdata['link'],
                                            callback=self.get_mini_bio,
                                            dont_filter=True)
                    request.meta['item'] = NWinnerItemBio(**wdata)
                    yield request


    def get_mini_bio(self, response):
        """Scrape the winner's portrait URL and introductory bio paragraphs."""
        # (removed unused local BASE_URL_ESCAPED -- it was never referenced)
        item = response.meta['item']
        # cache image
        item['image_urls'] = []

        # Get the URL of the winner's picture, contained in the infobox table
        img_src = response.xpath('//table[contains(@class,"infobox")]//img/@src')
        if img_src:
            item['image_urls'] = ['http:' + img_src[0].extract()]
        mini_bio = ''
        # Get the paragraphs in the biography's body-text.
        # NOTE: the book's original XPath misses the extra /div level;
        # the /div/p[...] form below is required to actually match.
        ps = response.xpath('//*[@id="mw-content-text"]/div/p[text() or  normalize-space(.)=""]').extract()
        # Add introductory biography paragraphs till the empty breakpoint
        for p in ps:
            if p == '<p></p>':
                break
            mini_bio += p

        # Rewrite relative wiki links and in-page anchors as absolute URLs
        mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
        mini_bio = mini_bio.replace('href="#', 'href="' + item['link'] + '#')
        item['mini_bio'] = mini_bio
        yield item

#pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
# For Scrapy v1.0+:
# from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class NobelImagesPipeline(ImagesPipeline):
    """Download laureate portraits and record the chosen image on the item."""

    def get_media_requests(self, item, info):
        # One download request per scraped portrait URL.
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # BUG FIX: image_paths was previously assigned only inside the
        # 'if item["image_urls"]' branch, so an item with no portrait
        # raised UnboundLocalError on the following check.
        image_paths = item['image_urls'] if item['image_urls'] else []
        # NOTE(review): the book derives stored paths from 'results'
        # ([x['path'] for ok, x in results if ok]); the source URLs are
        # kept here instead -- confirm which is wanted.
        if image_paths:
            item['bio_image'] = image_paths[0]

        return item


class DropNonPersons(object):
    """Drop items that represent institutions rather than people."""

    def process_item(self, item, spider):
        # Organisations have no gender entry; only persons pass through.
        gender = item['gender']
        if gender:
            return item
        raise DropItem("No gender for %s"%item['name'])

#settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for nobel_winners project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#
# NOTE(review): 'os' appears unused in this settings module -- confirm.
import os

BOT_NAME = 'nobel_winners'

SPIDER_MODULES = ['nobel_winners.spiders']
NEWSPIDER_MODULE = 'nobel_winners.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'nobel_winners (+http://www.yourdomain.com)'
# e.g., to 'impersonate' a browser':
#USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"

# Cache responses locally so repeated crawls do not re-fetch every page.
HTTPCACHE_ENABLED = True
# ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
# We can define the ITEM_PIPELINES here or in their respective spiders by using the
# custom_settings variable (Scrapy v 1.0+) (see the nwinners_minibio spider for an example)
# For earlier versions of Scrapy (<1.0), define the ITEM_PIPELINES variable here:
ITEM_PIPELINES = {'nobel_winners.pipelines.NobelImagesPipeline':1}
#ITEM_PIPELINES = {'nobel_winners.pipelines.DropNonPersons':1}
# We're storing the images in an 'images' subdirectory of the Scrapy project's root
IMAGES_STORE = 'images'

minibios.jsonの内訳 データ数1067
keyはlink,image_urls,mini_bioのみ
本にあるように受賞者の肖像がダウンロードされることはなかった。
もう少し先にかいてあるかもしれないのでここはスルー

「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

6.7 Scrapyパイプライン

P157の最初のコードに# nobel_winners/nobel_winners/setting.pyとあるがpipelines.pyの間違いかな?

#pipelines.py
import scrapy
from scrapy.exceptions import DropItem

class DropNonPersons(object):
    """Pipeline filter: keep only winners that have a gender recorded."""

    def process_item(self, item, spider):
        # An empty gender marks an institution; raise DropItem to discard it.
        if not item['gender']:
            raise DropItem("No gender for %s"%item['name'])

        return item


#settings.py
# NOTE(review): 'os' appears unused in this settings module -- confirm.
import os

BOT_NAME = 'nobel_winners'

SPIDER_MODULES = ['nobel_winners.spiders']
NEWSPIDER_MODULE = 'nobel_winners.spiders'


# Cache pages locally; register the gender-filter pipeline.
HTTPCACHE_ENABLED = True
ITEM_PIPELINES = {'nobel_winners.pipelines.DropNonPersons':1}

#shellから
$scrapy crawl nwinners_full -o winner_list.json

winner_list.jsonの内訳 受賞者の数1058人、最初の数から9人抜けている
male1001人、female57人、どちらでもない0人
国別の数67国

「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

いままでWindows上で行っていたがUnicodeErrが頻発するのでlubuntu上に変更する

#nwinners_full_spider.py
import scrapy
import re

BASE_URL = 'http://en.wikipedia.org'

class NWinnerItem(scrapy.Item):
    """Scraped fields for one Nobel laureate (list page + Wikidata)."""
    name = scrapy.Field()
    link = scrapy.Field()
    year = scrapy.Field()
    category = scrapy.Field()
    country = scrapy.Field()
    gender = scrapy.Field()
    born_in = scrapy.Field()
    date_of_birth = scrapy.Field()
    date_of_death = scrapy.Field()
    place_of_birth = scrapy.Field()
    place_of_death = scrapy.Field()
    text = scrapy.Field()

class NWinnerSpider(scrapy.Spider):
    """Crawl the laureates-by-country list, then each laureate's Wikipedia
    bio page and Wikidata page for personal details."""

    name = 'nwinners_full'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "http://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    def parse(self, response):
        """Yield one Request per laureate <li> under each country <h2>."""
        h2s = response.xpath('//h2')

        for h2 in h2s[2:]: # skip the first two headers (page front matter)
            country = h2.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h2.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    wdata = process_winner_li(w, country[0])
                    request = scrapy.Request(wdata['link'], callback=self.parse_bio, dont_filter=True)
                    request.meta['item'] = NWinnerItem(**wdata)
                    yield request

    def parse_bio(self, response):
        """Follow the 'Wikidata item' toolbox link on the laureate's bio page."""
        item = response.meta['item']
        href = response.xpath("//li[@id='t-wikibase']/a/@href").extract()
        if href:
            # Wikipedia have changed the wikibase URL to include the 'https:' leader
            # url = 'https:' + href[0]
            url = href[0]
            request = scrapy.Request(url,\
                        callback=self.parse_wikidata,\
                        dont_filter=True)
            request.meta['item'] = item
            yield request

    def parse_wikidata(self, response):
        """Extract birth/death dates and places plus gender from Wikidata."""
        item = response.meta['item']
        # Wikidata property codes: P569 birth date, P570 death date,
        # P19 birth place, P20 death place, P21 gender
        property_codes = [
            {'name':'date_of_birth', 'code':'P569'},
            {'name':'date_of_death', 'code':'P570'},
            {'name':'place_of_birth', 'code':'P19', 'link':True},
            {'name':'place_of_death', 'code':'P20', 'link':True},
            {'name':'gender', 'code':'P21', 'link':True}
        ]

        # NOTE(review): tied to Wikidata's HTML layout at the time of
        # writing -- likely to break when the markup changes.
        p_template = '//*[@id="{code}"]/div[2]/div/div/div[2]' \
                    '/div[1]/div/div[2]/div[2]{link_html}/text()'

        for prop in property_codes:
            link_html = ''
            if prop.get('link'):
                link_html = '/a'  # 'link':True values live inside an <a>
            sel = response.xpath(p_template.format(\
                code=prop['code'], link_html=link_html))
            if sel:
                item[prop['name']] = sel[0].extract()

        yield item


def process_winner_li(w, country=None, base_url='http://en.wikipedia.org'):
    """
    Process a winner's <li> tag, adding country of birth or nationality,
    as applicable.

    Args:
        w: selector for the <li> element (must support .xpath()).
        country: country header the <li> was listed under, if any.
        base_url: site root used to absolutize the winner's page link
            (defaults to the module-level BASE_URL value).

    Returns:
        dict with link, name, year, category, country, born_in and text keys.
    """
    wdata = {}
    # get the href link-adress from the <a> tag
    wdata['link'] = base_url + w.xpath('a/@href').extract()[0]
    text = ' '.join(w.xpath('descendant-or-self::text()').extract())
    # we use the comma-delimited text-elements, stripping whitespace from
    # the ends.
    # split the text at the commas and take the first (name) string
    wdata['name'] = text.split(',')[0].strip()

    # raw string avoids the invalid-escape DeprecationWarning for '\d'
    year = re.findall(r'\d{4}', text)
    if year:
        wdata['year'] = int(year[0])
    else:
        wdata['year'] = 0
        print('Oops, no year in ', text)

    category = re.findall(
            'Physics|Chemistry|Physiology or Medicine|Literature|Peace|Economics',
                text)
    if category:
        wdata['category'] = category[0]
    else:
        wdata['category'] = ''
        print('Oops, no category in ', text)

    if country:
        # a '*' marks winners listed under their country of birth
        if text.find('*') != -1:
            wdata['country'] = ''
            wdata['born_in'] = country
        else:
            wdata['country'] = country
            wdata['born_in'] = ''

    # store a copy of the link's text-string for any manual corrections
    wdata['text'] = text
    return wdata

$scrapy crawl nwinners_full -o winner_list.json

  出力されたwinner_list.jsonの数は1067レコードでした

winners_list.jsonの内訳

受賞者の数1067人
male1001人、female57人、どちらでもない9人
国別の数68国('country'に入力されていた人が58人、'born_in'に入れられていた人が10人)

「PythonとJavaScriptではじめるデータビジュアライゼーション」を読む

6.6.1 ページのキャッシュ

settings.pyのHTTPCACHE_ENABLED = True行のコメントアウトを解除(アンコメント)してページキャッシュを有効化する

6.6.2リクエストの作成

import scrapy
import re

BASE_URL = 'http://en.wikipedia.org'

class NWinnerItem(scrapy.Item):
    # One Nobel laureate: list-page fields plus Wikidata personal details.
    name = scrapy.Field()
    link = scrapy.Field()
    year = scrapy.Field()
    category = scrapy.Field()
    country = scrapy.Field()
    gender = scrapy.Field()
    born_in = scrapy.Field()
    date_of_birth = scrapy.Field()
    date_of_death = scrapy.Field()
    place_of_birth = scrapy.Field()
    place_of_death = scrapy.Field()
    text = scrapy.Field()


class NWinnerSpider(scrapy.Spider):
    """Crawl the laureates-by-country list; follow each laureate's bio page
    and Wikidata page to fill an NWinnerItem."""

    name = 'nwinners_full'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "http://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    def parse(self, response):
        """Yield a Request per laureate found under each country heading."""
        # (removed unused local 'filename' -- it was never referenced)
        h2s = response.xpath('//h2')

        for h2 in h2s[2:]: # the first two <h2>s are page front matter
            country = h2.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h2.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    # w selects one laureate <li> on the list page, e.g.
                    # <Selector xpath='li' data='<li><a href="/wiki/Bertha_von_Suttner" t'>.
                    # process_winner_li extracts:
                    #   wdata['link']     absolute URL of the person page
                    #   wdata['name']     the winner's name
                    #   wdata['year']     prize year
                    #   wdata['category'] prize category
                    #   wdata['country'] / wdata['born_in'] nationality info
                    wdata = process_winner_li(w, country[0])
                    # Crawl the person-page URL with self.parse_bio
                    request = scrapy.Request(wdata['link'], callback=self.parse_bio, dont_filter=True)
                    request.meta['item'] = NWinnerItem(**wdata)
                    yield request

    def parse_bio(self, response):
        """Follow the laureate's Wikidata link (response is an HtmlResponse)."""
        item = response.meta['item']
        # href is the URL of this person's Wikidata page
        href = response.xpath("//li[@id='t-wikibase']/a/@href").extract()
        if href:
            url = href[0]
            # Hand the Wikidata URL to self.parse_wikidata
            request = scrapy.Request(url,\
                        callback=self.parse_wikidata,\
                        dont_filter=True)
            request.meta['item'] = item
            yield request

    def parse_wikidata(self, response):
        """Scrape birth/death dates and places plus gender from Wikidata."""
        # (removed leftover debug print("parse_wikidata\n"))
        item = response.meta['item']
        # P569 birth date, P570 death date, P19 birth place,
        # P20 death place, P21 gender
        property_codes = [
            {'name':'date_of_birth', 'code':'P569'},
            {'name':'date_of_death', 'code':'P570'},
            {'name':'place_of_birth', 'code':'P19', 'link':True},
            {'name':'place_of_death', 'code':'P20', 'link':True},
            {'name':'gender', 'code':'P21', 'link':True}
        ]

        p_template = '//*[@id="{code}"]/div[2]/div/div/div[2]' \
                    '/div[1]/div/div[2]/div[2]{link_html}/text()'

        for prop in property_codes:
            link_html = ''
            if prop.get('link'):
                link_html = '/a'
            sel = response.xpath(p_template.format(\
                code=prop['code'], link_html=link_html))
            if sel:
                item[prop['name']] = sel[0].extract()
        yield item

def process_winner_li(w, country=None, base_url='http://en.wikipedia.org'):
    """
    Process a winner's <li> tag, adding country of birth or nationality,
    as applicable.

    Args:
        w: selector for the laureate's <li> element.
        country: country header the <li> appeared under, if any.
        base_url: site root prepended to the relative page link
            (default matches the module-level BASE_URL).

    Returns:
        dict with link, name, year, category, country, born_in and text keys.
    """
    wdata = {}
    wdata['link'] = base_url + w.xpath('a/@href').extract()[0]
    text = ' '.join(w.xpath('descendant-or-self::text()').extract())
    # text looks like 'C\xc3\xa9sar Milstein , Physiology or Medicine, 1984'
    wdata['name'] = text.split(',')[0].strip()

    # raw string avoids the invalid-escape DeprecationWarning for '\d'
    year = re.findall(r'\d{4}', text)
    if year:
        wdata['year'] = int(year[0])
    else:
        wdata['year'] = 0
        print('Oops, no year in ', text)

    category = re.findall(
            'Physics|Chemistry|Physiology or Medicine|Literature|Peace|Economics',
                text)
    if category:
        wdata['category'] = category[0]
    else:
        wdata['category'] = ''
        print('Oops, no category in ', text)

    # a '*' in the text marks a winner listed under the country of birth
    if country:
        if text.find('*') != -1:
            wdata['country'] = ''
            wdata['born_in'] = country
        else:
            wdata['country'] = country
            wdata['born_in'] = ''

    # store a copy of the link's text-string for any manual corrections
    wdata['text'] = text
    return wdata

f:id:bitop:20170918154002p:plain

Argentinaだけ採取 f:id:bitop:20170918154117p:plain