Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」 (Data Visualization with Python and JavaScript)

6.8 Scraping Text and Images Using Pipelines

#nwinners_minibio_spider.py
import scrapy
import re

BASE_URL = 'http://en.wikipedia.org'


class NWinnerItemBio(scrapy.Item):
    link = scrapy.Field()
    name = scrapy.Field() # this name field doesn't seem to be used in this spider?
    mini_bio = scrapy.Field() 
    image_urls = scrapy.Field()
    bio_image = scrapy.Field() # is this bio_image unused? (it gets set in NobelImagesPipeline.item_completed below)
    images = scrapy.Field() # unused here; the stock ImagesPipeline would store its download results in this field


class NWinnerSpiderBio(scrapy.Spider):
    """ Scrapes the Nobel prize biography pages for portrait images and a biographical snippet """

    name = 'nwinners_minibio'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        #"http://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
        "http://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country?dfdfd"
    ]

    #For Scrapy v 1.0+, custom_settings can override the item pipelines in settings
    custom_settings = {
        'ITEM_PIPELINES': {'nobel_winners.pipelines.NobelImagesPipeline':1},
    }

    def parse(self, response):

        #filename = response.url.split('/')[-1] # this filename doesn't seem to be needed
        h2s = response.xpath('//h2')

        for h2 in h2s[2:]: # skip the first two h2s (Contents and Summary)
            country = h2.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h2.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    wdata = {}
                    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]

                    #print(wdata)
                    request = scrapy.Request(wdata['link'],
                                            callback=self.get_mini_bio,
                                            dont_filter=True)
                    request.meta['item'] = NWinnerItemBio(**wdata)
                    yield request


    def get_mini_bio(self, response):
        BASE_URL_ESCAPED = 'http:\/\/en.wikipedia.org'
        item = response.meta['item']
        # cache image
        item['image_urls'] = []

        # Get the URL of the winner's picture, contained in the infobox table
        img_src = response.xpath('//table[contains(@class,"infobox")]//img/@src')
        if img_src:
            item['image_urls'] = ['http:' + img_src[0].extract()]
        mini_bio = ''
        # Get the paragraphs in the biography's body-text
        ps = response.xpath('//*[@id="mw-content-text"]/div/p[text() or normalize-space(.)=""]').extract()
        # with the XPath as printed in the book, mini_bio comes back empty; the extra /div before /p[text()... is needed
        # Add introductory biography paragraphs till the empty breakpoint
        for p in ps:
            if p == '<p></p>':
                break
            mini_bio += p

        # correct for wiki-links
        mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
        mini_bio = mini_bio.replace('href="#', 'href="' + item['link'] + '#')
        item['mini_bio'] = mini_bio
        yield item
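As a quick illustration of the wiki-link correction above, here is a minimal standalone sketch; the sample HTML and the link value are made up for the example:

#check_links.py (hypothetical snippet, not part of the book's code)
BASE_URL = 'http://en.wikipedia.org'
link = 'http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein'  # stands in for item['link']

mini_bio = ('<p>He shared <a href="/wiki/Nobel_Prize">the prize</a> '
            '(<a href="#cite_note-1">[1]</a>).</p>')
# the same two replacements as in get_mini_bio above
mini_bio = mini_bio.replace('href="/wiki', 'href="' + BASE_URL + '/wiki')
mini_bio = mini_bio.replace('href="#', 'href="' + link + '#')
print(mini_bio)
# relative /wiki links and #fragment links now point back to the Wikipedia page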

#pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
# For Scrapy v1.0+:
# from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class NobelImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):

        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # results is a list of (success, info) tuples from get_media_requests;
        # the book takes the downloaded file paths instead: [x['path'] for ok, x in results if ok]
        image_paths = item['image_urls']
        if image_paths:
            item['bio_image'] = image_paths[0]

        return item


class DropNonPersons(object):
    """ Remove non-person winners """

    def process_item(self, item, spider):

        if not item['gender']:
            raise DropItem("No gender for %s"%item['name'])
        return item

#settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for nobel_winners project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#
import os

BOT_NAME = 'nobel_winners'

SPIDER_MODULES = ['nobel_winners.spiders']
NEWSPIDER_MODULE = 'nobel_winners.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'nobel_winners (+http://www.yourdomain.com)'
# e.g., to 'impersonate' a browser:
#USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"

HTTPCACHE_ENABLED = True
# ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}
# We can define the ITEM_PIPELINES here or in their respective spiders by using the
# custom_settings variable (Scrapy v 1.0+) (see the nwinners_minibio spider for an example)
# For earlier versions of Scrapy (<1.0), define the ITEM_PIPELINES variable here:
ITEM_PIPELINES = {'nobel_winners.pipelines.NobelImagesPipeline':1}
#ITEM_PIPELINES = {'nobel_winners.pipelines.DropNonPersons':1}
# We're storing the images in an 'images' subdirectory of the Scrapy project's root
IMAGES_STORE = 'images'

Breakdown of minibios.json: 1,067 records.
The only keys present are link, image_urls, and mini_bio.
The laureates' portraits were not downloaded as described in the book.
That may be covered a little further on, so I'll let it pass for now.
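A quick way to check those numbers, assuming minibios.json sits in the Scrapy project root where the crawl wrote it:

#check_minibios.py (a rough sanity check, not from the book)
import json

with open('minibios.json', encoding='utf-8') as f:
    winners = json.load(f)

print(len(winners))               # 1067 records in this run
print(sorted(winners[0].keys()))  # ['image_urls', 'link', 'mini_bio']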

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

6.7 Scrapy Pipelines

The first code listing on p. 157 is headed # nobel_winners/nobel_winners/setting.py, but is that a mistake for pipelines.py?

#pipelines.py
import scrapy
from scrapy.exceptions import DropItem

class DropNonPersons(object):

    def process_item(self, item, spider):

        if not item['gender']:
            raise DropItem("No gender for %s"%item['name'])
        return item


#settings.py
import os

BOT_NAME = 'nobel_winners'

SPIDER_MODULES = ['nobel_winners.spiders']
NEWSPIDER_MODULE = 'nobel_winners.spiders'


HTTPCACHE_ENABLED = True
ITEM_PIPELINES = {'nobel_winners.pipelines.DropNonPersons':1}

#from the shell
$scrapy crawl nwinners_full -o winner_list.json

Breakdown of winner_list.json: 1,058 laureates, 9 fewer than the earlier count (presumably the 9 entries with no gender, dropped by DropNonPersons).
male: 1,001; female: 57; neither: 0
Number of countries: 67
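The tallies above can be reproduced with a few lines; this assumes winner_list.json is the file produced by the crawl command shown:

#check_winners.py (a rough tally script, not from the book)
import json
from collections import Counter

with open('winner_list.json', encoding='utf-8') as f:
    winners = json.load(f)

print(len(winners))                                    # number of laureates
print(Counter(w.get('gender') for w in winners))       # male / female / missing
print(len({w['country'] for w in winners if w.get('country')}))  # distinct countries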

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

I had been doing all of this on Windows, but UnicodeErrors kept cropping up, so I am switching to Lubuntu.

#nwinners_full_spider.py
import scrapy
import re

BASE_URL = 'http://en.wikipedia.org'

class NWinnerItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    year = scrapy.Field()
    category = scrapy.Field()
    country = scrapy.Field()
    gender = scrapy.Field()
    born_in = scrapy.Field()
    date_of_birth = scrapy.Field()
    date_of_death = scrapy.Field()
    place_of_birth = scrapy.Field()
    place_of_death = scrapy.Field()
    text = scrapy.Field()

class NWinnerSpider(scrapy.Spider):

    name = 'nwinners_full'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "http://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    def parse(self, response):
        h2s = response.xpath('//h2')

        for h2 in h2s[2:]: # skip the first two
            country = h2.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h2.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    wdata = process_winner_li(w, country[0])
                    request = scrapy.Request(wdata['link'], callback=self.parse_bio, dont_filter=True)
                    request.meta['item'] = NWinnerItem(**wdata)
                    yield request

    def parse_bio(self, response):
        item = response.meta['item']
        href = response.xpath("//li[@id='t-wikibase']/a/@href").extract()
        if href:
            # Wikipedia have changed the wikibase URL to include the 'https:' leader
            # url = 'https:' + href[0]
            url = href[0]
            request = scrapy.Request(url,\
                        callback=self.parse_wikidata,\
                        dont_filter=True)
            request.meta['item'] = item
            yield request

    def parse_wikidata(self, response):
        item = response.meta['item']
        property_codes = [
            {'name':'date_of_birth', 'code':'P569'},
            {'name':'date_of_death', 'code':'P570'},
            {'name':'place_of_birth', 'code':'P19', 'link':True},
            {'name':'place_of_death', 'code':'P20', 'link':True},
            {'name':'gender', 'code':'P21', 'link':True}
        ]

        p_template = '//*[@id="{code}"]/div[2]/div/div/div[2]' \
                    '/div[1]/div/div[2]/div[2]{link_html}/text()'

        for prop in property_codes:
            link_html = ''
            if prop.get('link'):
                link_html = '/a'
            sel = response.xpath(p_template.format(\
                code=prop['code'], link_html=link_html))
            if sel:
                item[prop['name']] = sel[0].extract()

        yield item


def process_winner_li(w, country=None):
    """
    Process a winner's <li> tag, adding country of birth or nationality,
    as applicable.
    """
    wdata = {}
    # get the href link-adress from the <a> tag
    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
    text = ' '.join(w.xpath('descendant-or-self::text()').extract())
    # we use the comma-delimited text-elements, stripping whitespace from
    # the ends.
    # split the text at the commas and take the first (name) string
    wdata['name'] = text.split(',')[0].strip()

    year = re.findall(r'\d{4}', text)
    if year:
        wdata['year'] = int(year[0])
    else:
        wdata['year'] = 0
        print('Oops, no year in ', text)

    category = re.findall(
            'Physics|Chemistry|Physiology or Medicine|Literature|Peace|Economics',
                text)
    if category:
        wdata['category'] = category[0]
    else:
        wdata['category'] = ''
        print('Oops, no category in ', text)

    if country:
        if text.find('*') != -1:
            wdata['country'] = ''
            wdata['born_in'] = country
        else:
            wdata['country'] = country
            wdata['born_in'] = ''

    # store a copy of the link's text-string for any manual corrections
    wdata['text'] = text
    return wdata

$scrapy crawl nwinners_full -o winner_list.json

  The output winner_list.json contained 1,067 records.

Breakdown of winner_list.json

Number of laureates: 1,067
male: 1,001; female: 57; neither: 9
Number of countries: 68 (58 recorded in 'country', 10 only in 'born_in')
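Counting how the nationality was recorded can be done the same way, again assuming this winner_list.json:

#check_countries.py (hypothetical helper, not from the book)
import json

with open('winner_list.json', encoding='utf-8') as f:
    winners = json.load(f)

in_country = {w['country'] for w in winners if w.get('country')}
in_born_in = {w['born_in'] for w in winners if w.get('born_in')}
print(len(in_country), len(in_born_in))  # countries seen in 'country' vs only in 'born_in'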

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

6.6.1 Caching Pages

Uncomment the HTTPCACHE_ENABLED = True line in settings.py.
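For reference, the cache block looks roughly like this in recent Scrapy project templates (only the first line needs uncommenting; the other settings and their defaults are shown as an assumption about the template in use):

# settings.py -- HTTP cache settings (commented out by default in the template)
HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0   # 0 means cached pages never expire
#HTTPCACHE_DIR = 'httpcache'     # stored under the project's .scrapy directory
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'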

6.6.2 Making Requests

import scrapy
import re

BASE_URL = 'http://en.wikipedia.org'

class NWinnerItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    year = scrapy.Field()
    category = scrapy.Field()
    country = scrapy.Field()
    gender = scrapy.Field()
    born_in = scrapy.Field()
    date_of_birth = scrapy.Field()
    date_of_death = scrapy.Field()
    place_of_birth = scrapy.Field()
    place_of_death = scrapy.Field()
    text = scrapy.Field()


class NWinnerSpider(scrapy.Spider):
    name = 'nwinners_full'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "http://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    def parse(self, response):
        filename = response.url.split('/')[-1]
        h2s = response.xpath('//h2')

        for h2 in h2s[2:]: # skip the first two here
            country = h2.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h2.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    # each w here is an <li> selector for one laureate listed under a
                    # country heading on https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country,
                    # e.g. <Selector xpath='li' data='<li><a href="/wiki/Bertha_von_Suttner" t'>
                    # it is passed to process_winner_li, which fills in:
                    #   wdata['link']      absolute URL of the biography page
                    #   wdata['name']      the laureate's name
                    #   wdata['year']      year of the award
                    #   wdata['category']  prize category
                    #   wdata['country'], wdata['born_in']  nationality / country of birth
                    wdata = process_winner_li(w, country[0])
                    # crawl each laureate's biography URL with a scrapy.Request handled by self.parse_bio
                    request = scrapy.Request(wdata['link'], callback=self.parse_bio, dont_filter=True)
                    request.meta['item'] = NWinnerItem(**wdata)
                    yield request

    def parse_bio(self, response):
        # the response argument is a scrapy.http.response.html.HtmlResponse instance
        item = response.meta['item']
        # href is the URL of this laureate's Wikidata page
        href = response.xpath("//li[@id='t-wikibase']/a/@href").extract()
        if href:
            url = href[0]
            # pass the Wikidata URL to scrapy.Request and have parse_wikidata crawl it
            request = scrapy.Request(url,\
                        callback=self.parse_wikidata,\
                        dont_filter=True)
            request.meta['item'] = item
            yield request

    def parse_wikidata(self, response):
        print("parse_wikidata\n")
        item = response.meta['item']
        property_codes = [
            {'name':'date_of_birth', 'code':'P569'},
            {'name':'date_of_death', 'code':'P570'},
            {'name':'place_of_birth', 'code':'P19', 'link':True},
            {'name':'place_of_death', 'code':'P20', 'link':True},
            {'name':'gender', 'code':'P21', 'link':True}
        ]

        p_template = '//*[@id="{code}"]/div[2]/div/div/div[2]' \
                    '/div[1]/div/div[2]/div[2]{link_html}/text()'
        # P569: date of birth, P570: date of death, P19: place of birth, P20: place of death, P21: gender
        for prop in property_codes:
            link_html = ''
            if prop.get('link'):
                link_html = '/a'
            sel = response.xpath(p_template.format(\
                code=prop['code'], link_html=link_html))
            if sel:
                item[prop['name']] = sel[0].extract()
        yield item

def process_winner_li(w, country=None):
    """
    Process a winner's <li> tag, adding country of birth or nationality,
    as applicable.
    """
    wdata = {}
    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
    text = ' '.join(w.xpath('descendant-or-self::text()').extract())
    # the text obtained here is a string like
    # 'C\xc3\xa9sar Milstein , Physiology or Medicine, 1984'
    wdata['name'] = text.split(',')[0].strip()

    year = re.findall(r'\d{4}', text)
    if year:
        wdata['year'] = int(year[0])
    else:
        wdata['year'] = 0
        print('Oops, no year in ', text)

    category = re.findall(
            'Physics|Chemistry|Physiology or Medicine|Literature|Peace|Economics',
                text)
    if category:
        wdata['category'] = category[0]
    else:
        wdata['category'] = ''
        print('Oops, no category in ', text)

    # an asterisk marks laureates listed under their country of birth rather than the country at the time of the award
    if country:
        if text.find('*') != -1:
            wdata['country'] = ''
            wdata['born_in'] = country
        else:
            wdata['country'] = country
            wdata['born_in'] = ''

    # store a copy of the link's text-string for any manual corrections
    wdata['text'] = text
    return wdata
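A minimal check of what process_winner_li returns, feeding it a hand-made <li> selector (the sample markup mirrors the César Milstein entry seen in the scrapy shell session further down; run it with process_winner_li importable from the spider module above):

#quick_test.py (a sketch, not part of the book's code)
from scrapy import Selector

sample = ('<li><a href="/wiki/C%C3%A9sar_Milstein" title="César Milstein">'
          'César Milstein</a>, Physiology or Medicine, 1984</li>')
li = Selector(text=sample).xpath('//li')[0]
print(process_winner_li(li, 'Argentina'))
# expected: link to the biography page, name 'César Milstein', year 1984,
# category 'Physiology or Medicine', country 'Argentina', born_in ''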

f:id:bitop:20170918154002p:plain

Output for Argentina only: f:id:bitop:20170918154117p:plain

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

6.5 Scraping the Biography Pages

I looked for the link to Wikidata on a biography page. f:id:bitop:20170917135200p:plain

Moving to that address:

www.wikidata.org

The page transitions to the Wikidata screen.
f:id:bitop:20170917135403p:plain

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

6.4 The First Scrapy Spider

#nwinners_list_spider.py

import scrapy
import re

class NWinnerItem(scrapy.Item):
    country = scrapy.Field()
    name = scrapy.Field()
    link_text = scrapy.Field()

class NWinnerSpider(scrapy.Spider):
    name = 'nwinners_list'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ["https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"]
    def parse(self,response):
        h2s = response.xpath('//h2')
        for h2 in h2s:
            country = h2.xpath('span[@class="mw-headline"]/text()').extract() 
            if country:
                winners = h2.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    text = w.xpath('descendant-or-self::text()').extract()
                    yield NWinnerItem(country=country[0],name=text[0],link_text=''.join(text))

Running this (e.g. with $scrapy crawl nwinners_list -o nobel_winners.json) writes nobel_winners.json into the nobel_winners folder.

f:id:bitop:20170917114224p:plain

On closer inspection, the countries are shifted by one row: the first person's country has come out as 'Summary'.

#nwinners_list_spider.py

import scrapy
import re

class NWinnerItem(scrapy.Item):
    country = scrapy.Field()
    name = scrapy.Field()
    link_text = scrapy.Field()

class NWinnerSpider(scrapy.Spider):
    name = 'nwinners_list'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ["https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"]
    def parse(self,response):
        h2s = response.xpath('//h2')
        for h2 in h2s[2:]: # changed the loop to start from index 2
            country = h2.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h2.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    wdata = process_winner_li(w,country[0])

f:id:bitop:20170917123213p:plain

That fixed it. Counting the records gave 1,068, which differs from the book. I don't think the number should have gone down, so there must have been changes to the page.

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

6.3 Specifying the Target HTML with XPath

XPath is a notation for specifying elements within HTML's tree structure.

f:id:bitop:20170916140506p:plain

Running the Copy XPath command on the h2 gives
//*[@id="mw-content-text"]/div/h2[2]

I also tried the other Copy commands.
Copy outerHTML:
<h2><span class="mw-headline" id="Argentina">Argentina</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=List_of_Nobel_laureates_by_country&amp;action=edit&amp;section=2" title="Edit section: Argentina">edit</a><span class="mw-editsection-bracket">]</span></span></h2>   
This copies the element enclosed in <h2>...</h2>.

Copy selector
#mw-content-text > div > h2:nth-child(11)
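The copied CSS selector can also be tried directly in the Scrapy shell via response.css; the nth-child index is whatever the page happens to have today:

# in the scrapy shell, the equivalent of the copied selector:
response.css('#mw-content-text > div > h2:nth-child(11)').extract_first()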
6.3.1 Testing XPath with the Scrapy Shell

Enter the nobel_winners folder created with >scrapy startproject nobel_winners and type scrapy shell at the command prompt; this gives an IPython-style shell.

Open settings.py, uncomment the DOWNLOAD_DELAY=3 line, and save.
>scrapy shell https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country

In [1]: h2s = response.xpath('//h2')
In [2]: print(type(h2s))
<class 'scrapy.selector.unified.SelectorList'>
In [3]: len(h2s)
Out[3]: 72
The result is slightly different from the book's.
h2 = h2s[0]
Typing h2. and pressing the Tab key shows:
 css()                re()                 select()
 extract()            re_first()           selectorlist_cls
 extract_unquoted()   register_namespace() type
 get()                remove_namespaces()  xpath()
 getall()             response             h2.text
 namespaces           root

 Compared with the book, re_first(), selectorlist_cls, extract_unquoted, get(), getall(), and root have been added.
In [7]: h2.extract()
Out[7]: '<h2>Contents</h2>'

It seems to have picked up this h2 tag:

f:id:bitop:20170916152115p:plain

In [8]: h2s[1].extract()
Out[8]: '<h2><span class="mw-headline" id="Summary">Summary</span><span class="m
w-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.ph
p?title=List_of_Nobel_laureates_by_country&amp;action=edit&amp;section=1" title=
"Edit section: Summary">edit</a><span class="mw-editsection-bracket">]</span></s
pan></h2>'
In [9]: h2_arg = h2s[1]
In [10]: country = h2_arg.xpath('span[@class="mw-headline"]/text()').extract()
In [11]: country
Out[11]: ['Summary']

It is picking up this h2:
f:id:bitop:20170916155544p:plain

This differs from the book; the page layout has probably changed.
In [12]: h2_arg = h2s[2]
In [13]: country = h2_arg.xpath('span[@class="mw-headline"]/text()').extract()
In [14]: country
Out[14]: ['Argentina']
The country name was extracted.
In [15]: ol_arg = h2_arg.xpath('following-sibling::ol[1]')
In [16]: ol_arg
Out[16]: [<Selector xpath='following-sibling::ol[1]' data='<ol>\n<li><a href="/w
iki/C%C3%A9sar_Milst'>]
In [17]: ol_arg = h2_arg.xpath('following-sibling::ol[1]')[0]
In [18]: ol_arg
Out[18]: <Selector xpath='following-sibling::ol[1]' data='<ol>\n<li><a href="/wi
ki/C%C3%A9sar_Milst'>
In [19]: lis_arg = ol_arg.xpath('li')
In [20]: lis_arg
Out[20]:
[<Selector xpath='li' data='<li><a href="/wiki/C%C3%A9sar_Milstein" '>,
<Selector xpath='li' data='<li><a href="/wiki/Adolfo_P%C3%A9rez_Esq'>,
<Selector xpath='li' data='<li><a href="/wiki/Luis_Federico_Leloir"'>,
<Selector xpath='li' data='<li><a href="/wiki/Bernardo_Houssay" tit'>,
<Selector xpath='li' data='<li><a href="/wiki/Carlos_Saavedra_Lamas'>]
In [21]: len(lis_arg)
Out[21]: 5
In [22]: li = lis_arg[0]
In [23]: li.ex2017-09-16 15:32:08 [py.warnings] WARNING: C:\Users\joshua\Anacond
a3\lib\site-packages\jedi\evaluate\compiled\__init__.py:328: ScrapyDeprecationWa
rning: Attribute `_root` is deprecated, use `root` instead
getattr(obj, name)
In [24]: li.extract()
Out[24]: '<li><a href="/wiki/C%C3%A9sar_Milstein" title="César Milstein">César M
ilstein</a>, Physiology or Medicine, 1984</li>'
In [25]: name=li.xpath('a//text()')[0].extract()
In [26]: name
Out[26]: 'César Milstein'
In [27]: list_text = li.xpath('descendant-or-self::text()').extract()
In [28]: list_text
Out[28]: ['César Milstein', ', Physiology or Medicine, 1984']
In [29]: ''.join(list_text)
Out[29]: 'César Milstein, Physiology or Medicine, 1984'
The name, category, and award year were all extracted.