Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

6.6.1 Caching the pages

Uncomment the HTTPCACHE_ENABLED = True line in settings.py to enable page caching.
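For reference, the relevant block of settings.py then looks like this (only the first line needs uncommenting; the others are the defaults quoted further down in this post):

HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'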

6.6.2 Creating the requests

import scrapy
import re

BASE_URL = 'http://en.wikipedia.org'

class NWinnerItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    year = scrapy.Field()
    category = scrapy.Field()
    country = scrapy.Field()
    gender = scrapy.Field()
    born_in = scrapy.Field()
    date_of_birth = scrapy.Field()
    date_of_death = scrapy.Field()
    place_of_birth = scrapy.Field()
    place_of_death = scrapy.Field()
    text = scrapy.Field()


class NWinnerSpider(scrapy.Spider):
    name = 'nwinners_full'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "http://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"
    ]

    def parse(self, response):
        filename = response.url.split('/')[-1]
        h2s = response.xpath('//h2')

        for h2 in h2s[2:]:  # skip the first two h2s (Contents and Summary)
            country = h2.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h2.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    # Each w is one <li> from the per-country list on
                    # https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country,
                    # e.g. <Selector xpath='li' data='<li><a href="/wiki/Bertha_von_Suttner" t'>
                    # process_winner_li() turns it into a dict:
                    #   wdata['link']      absolute URL of the laureate's biography page
                    #   wdata['name']      the laureate's name
                    #   wdata['year']      year of the award
                    #   wdata['category']  prize category
                    #   wdata['country'] / wdata['born_in']  nationality / country of birth
                    wdata = process_winner_li(w, country[0])
                    # Queue a Request for the biography URL; self.parse_bio handles the response
                    request = scrapy.Request(wdata['link'], callback=self.parse_bio, dont_filter=True)
                    request.meta['item'] = NWinnerItem(**wdata)
                    yield request

    def parse_bio(self, response):
        # response is a scrapy.http.response.html.HtmlResponse instance
        item = response.meta['item']
        # href is the URL of this laureate's Wikidata page
        href = response.xpath("//li[@id='t-wikibase']/a/@href").extract()
        if href:
            url = href[0]
            # Queue a Request for the Wikidata URL; self.parse_wikidata handles the response
            request = scrapy.Request(url,\
                        callback=self.parse_wikidata,\
                        dont_filter=True)
            request.meta['item'] = item
            yield request

    def parse_wikidata(self, response):
        print("parse_wikidata\n")
        item = response.meta['item']
        property_codes = [
            {'name':'date_of_birth', 'code':'P569'},
            {'name':'date_of_death', 'code':'P570'},
            {'name':'place_of_birth', 'code':'P19', 'link':True},
            {'name':'place_of_death', 'code':'P20', 'link':True},
            {'name':'gender', 'code':'P21', 'link':True}
        ]

        p_template = '//*[@id="{code}"]/div[2]/div/div/div[2]' \
                    '/div[1]/div/div[2]/div[2]{link_html}/text()'
        # P569 date of birth, P570 date of death, P19 place of birth, P20 place of death, P21 gender
        for prop in property_codes:
            link_html = ''
            if prop.get('link'):
                link_html = '/a'
            sel = response.xpath(p_template.format(\
                code=prop['code'], link_html=link_html))
            if sel:
                item[prop['name']] = sel[0].extract()
        yield item

def process_winner_li(w, country=None):
    """
    Process a winner's <li> tag, adding country of birth or nationality,
    as applicable.
    """
    wdata = {}
    wdata['link'] = BASE_URL + w.xpath('a/@href').extract()[0]
    text = ' '.join(w.xpath('descendant-or-self::text()').extract())
    # text comes out looking like
    # 'César Milstein , Physiology or Medicine, 1984'
    wdata['name'] = text.split(',')[0].strip()

    year = re.findall(r'\d{4}', text)
    if year:
        wdata['year'] = int(year[0])
    else:
        wdata['year'] = 0
        print('Oops, no year in ', text)

    category = re.findall(
            'Physics|Chemistry|Physiology or Medicine|Literature|Peace|Economics',
                text)
    if category:
        wdata['category'] = category[0]
    else:
        wdata['category'] = ''
        print('Oops, no category in ', text)

    # An asterisk in the entry means the laureate is listed under their country of birth, not their nationality at the time of the award
    if country:
        if text.find('*') != -1:
            wdata['country'] = ''
            wdata['born_in'] = country
        else:
            wdata['country'] = country
            wdata['born_in'] = ''

    # store a copy of the link's text-string for any manual corrections
    wdata['text'] = text
    return wdata
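For reference, here is my own worked example (not from the book) of what process_winner_li() should return for the first Argentina entry, the <li> seen in the Scrapy shell session in 6.3.1 below, assuming its text carries no '*' marker:

{'link': 'http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein',
 'name': 'César Milstein',
 'year': 1984,
 'category': 'Physiology or Medicine',
 'country': 'Argentina',
 'born_in': '',
 'text': 'César Milstein , Physiology or Medicine, 1984'}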

f:id:bitop:20170918154002p:plain

Scraping only Argentina: f:id:bitop:20170918154117p:plain

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

6.5 Scraping the individual biography pages

I looked for the Wikidata link on a laureate's biography page. f:id:bitop:20170917135200p:plain

Navigating to that address,

www.wikidata.org

brings up the Wikidata page for that person.
f:id:bitop:20170917135403p:plain

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

6.4 A first Scrapy spider

#nwinners_list_spider.py

import scrapy
import re

class NWinnerItem(scrapy.Item):
    country = scrapy.Field()
    name = scrapy.Field()
    link_text = scrapy.Field()

class NWinnerSpider(scrapy.Spider):
    name = 'nwinners_list'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ["https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"]
    def parse(self,response):
        h2s = response.xpath('//h2')
        for h2 in h2s:
            country = h2.xpath('span[@class="mw-headline"]/text()').extract() 
            if country:
                winners = h2.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    text = w.xpath('descendant-or-self::text()').extract()
                    yield NWinnerItem(country=country[0],name=text[0],link_text=''.join(text))

Running this writes nobel_winners.json into the nobel_winners folder.
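The JSON file comes from Scrapy's feed export; the command to run the spider from inside the project folder should be something like

>scrapy crawl nwinners_list -o nobel_winners.json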

f:id:bitop:20170917114224p:plain

Looking closely, the countries are shifted by one: the first winner's country comes out as "Summary".

#nwinners_list_spider.py

import scrapy
import re

class NWinnerItem(scrapy.Item):
    country = scrapy.Field()
    name = scrapy.Field()
    link_text = scrapy.Field()

class NWinnerSpider(scrapy.Spider):
    name = 'nwinners_list'
    allowed_domains = ['en.wikipedia.org']
    start_urls = ["https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country"]
    def parse(self,response):
        h2s = response.xpath('//h2')
        for h2 in h2s[2:]: # start the loop two h2s in (skipping Contents and Summary)
            country = h2.xpath('span[@class="mw-headline"]/text()').extract()
            if country:
                winners = h2.xpath('following-sibling::ol[1]')
                for w in winners.xpath('li'):
                    wdata = process_winner_li(w,country[0])

f:id:bitop:20170917123213p:plain

Now the countries line up correctly. Counting the items gave 1068, which differs from the book; the count shouldn't have gone down, so the page has presumably been updated.

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

6.3 Targeting HTML with xpaths

xpath is a notation for addressing elements in HTML's tree structure.
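As a quick standalone illustration (my own snippet, not from the book), the same kind of xpath can be tried against a small HTML fragment using Scrapy's Selector:

from scrapy.selector import Selector

html = '<h2><span class="mw-headline" id="Argentina">Argentina</span></h2>'
sel = Selector(text=html)
# an h2 anywhere in the document, then a span with class mw-headline, then its text node
print(sel.xpath('//h2/span[@class="mw-headline"]/text()').extract())  # ['Argentina']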

f:id:bitop:20170916140506p:plain

Running the Copy XPath command on the h2 element gives  
//*[@id="mw-content-text"]/div/h2[2]  

I also tried the other Copy commands.

Copy outerHTML:
<h2><span class="mw-headline" id="Argentina">Argentina</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=List_of_Nobel_laureates_by_country&amp;action=edit&amp;section=2" title="Edit section: Argentina">edit</a><span class="mw-editsection-bracket">]</span></span></h2>   
This copies the whole element enclosed in <h2>...</h2>.  

Copy selector
#mw-content-text > div > h2:nth-child(11)
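Scrapy can also work with CSS selectors via response.css(); assuming the same page structure, a roughly equivalent expression in the Scrapy shell (see 6.3.1 below) would be:

response.css('#mw-content-text h2 span.mw-headline::text').extract()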
6.3.1 Testing xpaths with the Scrapy shell

Go into the nobel_winners folder created by >scrapy startproject nobel_winners; typing scrapy shell at the command prompt brings up an IPython-style shell.

Open settings.py, uncomment DOWNLOAD_DELAY = 3, save, and then run:
>scrapy shell https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country

In [1]: h2s = response.xpath('//h2')
In [2]: print(type(h2s))
<class 'scrapy.selector.unified.SelectorList'>
In [3]: len(h2s)
Out[3]: 72
This is slightly different from the result in the book.
h2 = h2s[0]
Pressing the TAB key after typing h2. shows:
 css()                re()                 select()
 extract()            re_first()           selectorlist_cls
 extract_unquoted()   register_namespace() type
 get()                remove_namespaces()  xpath()
 getall()             response             h2.text
 namespaces           root

 Compared with the book, re_first(), selectorlist_cls, extract_unquoted(), get(), getall(), and root have been added.  
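As far as I can tell, get() and getall() are just newer aliases for the extract-style methods (my understanding, not something stated in the book; on a SelectorList they mirror extract_first() and extract()):

h2.get()      # same as h2.extract(): this selection as a single string
h2.getall()   # the same thing wrapped in a list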
In [7]: h2.extract()
Out[7]: '<h2>Contents</h2>'

It seems this picked up the h2 tag from this part of the page:

f:id:bitop:20170916152115p:plain

In [8]: h2s[1].extract()
Out[8]: '<h2><span class="mw-headline" id="Summary">Summary</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=List_of_Nobel_laureates_by_country&amp;action=edit&amp;section=1" title="Edit section: Summary">edit</a><span class="mw-editsection-bracket">]</span></span></h2>'
In [9]: h2_arg = h2s[1]
In [10]: country = h2_arg.xpath('span[@class="mw-headline"]/text()').extract()
In [11]: country
Out[11]: ['Summary']

This corresponds to this h2 on the page:
f:id:bitop:20170916155544p:plain

This differs from the book; the page structure has presumably changed.  
In [12]: h2_arg = h2s[2]
In [13]: country = h2_arg.xpath('span[@class="mw-headline"]/text()').extract()
In [14]: country
Out[14]: ['Argentina']
The country name was extracted.
In [15]: ol_arg = h2_arg.xpath('following-sibling::ol[1]')
In [16]: ol_arg
Out[16]: [<Selector xpath='following-sibling::ol[1]' data='<ol>\n<li><a href="/wiki/C%C3%A9sar_Milst'>]
In [17]: ol_arg = h2_arg.xpath('following-sibling::ol[1]')[0]
In [18]: ol_arg
Out[18]: <Selector xpath='following-sibling::ol[1]' data='<ol>\n<li><a href="/wiki/C%C3%A9sar_Milst'>
In [19]: lis_arg = ol_arg.xpath('li')
In [20]: lis_arg
Out[20]:
[<Selector xpath='li' data='<li><a href="/wiki/C%C3%A9sar_Milstein" '>,
<Selector xpath='li' data='<li><a href="/wiki/Adolfo_P%C3%A9rez_Esq'>,
<Selector xpath='li' data='<li><a href="/wiki/Luis_Federico_Leloir"'>,
<Selector xpath='li' data='<li><a href="/wiki/Bernardo_Houssay" tit'>,
<Selector xpath='li' data='<li><a href="/wiki/Carlos_Saavedra_Lamas'>]
In [21]: len(lis_arg)
Out[21]: 5
In [22]: li = lis_arg[0]
In [23]: li.ex   (tab completion here triggered a deprecation warning)
2017-09-16 15:32:08 [py.warnings] WARNING: C:\Users\joshua\Anaconda3\lib\site-packages\jedi\evaluate\compiled\__init__.py:328: ScrapyDeprecationWarning: Attribute `_root` is deprecated, use `root` instead
getattr(obj, name)
In [24]: li.extract()
Out[24]: '<li><a href="/wiki/C%C3%A9sar_Milstein" title="César Milstein">César Milstein</a>, Physiology or Medicine, 1984</li>'
In [25]: name=li.xpath('a//text()')[0].extract()
In [26]: name
Out[26]: 'César Milstein'
In [27]: list_text = li.xpath('descendant-or-self::text()').extract()
In [28]: list_text
Out[28]: ['César Milstein', ', Physiology or Medicine, 1984']
In [29]: ''.join(list_text)
Out[29]: 'César Milstein, Physiology or Medicine, 1984'
Name, category, and award year were all extracted.

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

6.2 Setting the target

http://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country (the laureates listed by country)

f:id:bitop:20170916120336p:plain

f:id:bitop:20170916120426p:plain

f:id:bitop:20170916120508p:plain

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

6.1 Setting up Scrapy

Scrapy was not installed, so I ran

>conda install -c https://conda.anaconda.org/anaconda scrapy
I expected only scrapy itself to be installed, but the command pulled in 18 new packages and updated 4 more.
attrs:            15.2.0-py35_0     anaconda  unknown
automat:          0.5.0-py35_0      anaconda  unknown
constantly:       15.1.0-py35_0     anaconda  unknown
cssselect:        1.0.1-py35_0      anaconda  unknown
hyperlink:        17.1.1-py35_0     anaconda  provides a pure-Python implementation of immutable URLs?
incremental:      16.10.1-py35_0    anaconda  unknown
parsel:           1.2.0-py35_0      anaconda  unknown
patch:            2.5.9-1           anaconda  unknown
pyasn1-modules:   0.0.8-py35_0      anaconda  unknown
pydispatcher:     2.0.5-py35_0      anaconda  unknown
queuelib:         1.4.2-py35_0      anaconda  unknown
scrapy:           1.3.3-py35_0      anaconda  scraping tool
service_identity: 17.0.0-py35_0     anaconda  service identity verification
twisted:          17.5.0-py35_0     anaconda  unknown
w3lib:            1.17.0-py35_0     anaconda  unknown
zope:             1.0-py35_0        anaconda  web application framework
zope.interface:   4.4.2-py35_0      anaconda
anaconda:         2.4.1-np110py35_0          --> custom-py35_0 anaconda
conda:            4.3.24-py35_0              --> 4.3.25-py35_0 anaconda  OS-agnostic, system-level binary package and environment manager
conda-env:        2.6.0-0                    --> 2.6.0-0       anaconda  unknown
spyder:           3.2.0-py35_0               --> 2.3.8-py35_1  anaconda  IDE

With the install done, running
>scrapy startproject nobel_winners
created the folder structure exactly as shown in the book (sketched below).
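Roughly, the generated layout looks like this (from memory of what startproject produces; each file is described below):

nobel_winners/
    scrapy.cfg
    nobel_winners/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py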

__init__.py contains nothing.
items.py contains:
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class NobelWinnersItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
This is the file where the items you want to scrape are defined.

middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class NobelWinnersSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
This seems to define middleware hooks that get connected to the spider (via signals)?

pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class NobelWinnersPipeline(object):
    def process_item(self, item, spider):
        return item
This is the file where item pipelines are defined.
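As an illustration (my own toy example, not from the book), a pipeline that discards incomplete items could look like the following; it would be enabled by adding it to the ITEM_PIPELINES dict in settings.py (e.g. {'nobel_winners.pipelines.DropNamelessPipeline': 300}):

from scrapy.exceptions import DropItem

class DropNamelessPipeline(object):
    """Toy example: discard any scraped item that has no name."""
    def process_item(self, item, spider):
        if not item.get('name'):
            raise DropItem('missing name in %r' % item)
        return item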

settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for nobel_winners project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'nobel_winners'

SPIDER_MODULES = ['nobel_winners.spiders']
NEWSPIDER_MODULE = 'nobel_winners.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'nobel_winners (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'nobel_winners.middlewares.NobelWinnersSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'nobel_winners.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'nobel_winners.pipelines.NobelWinnersPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

The project settings file.

scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = nobel_winners.settings

[deploy]
#url = http://localhost:6800/
project = nobel_winners
This is also a configuration file.

Reading 「PythonとJavaScriptではじめるデータビジュアライゼーション」

5.7.1 Creating the selection patterns
from bs4 import BeautifulSoup
import requests

def get_column_titles(table):
    cols = []
    for th in table.select_one('tr').select('th')[1:]:
        link = th.select_one('a')
        if link:
            cols.append({'name':link.text,'href':link.attrs['href']})
        else:
            cols.append({'name':th.text,'href':None})
    return cols

BASE_URL = "http://en.wikipedia.org"
HEADERS = {'User-Agent':'Mozilla/5.0'}
def get_Nobel_soup():
    response = requests.get(BASE_URL + '/wiki/List_of_Nobel_laureates',headers=HEADERS)
    return BeautifulSoup(response.content,"lxml")

soup = get_Nobel_soup()
table = soup.select_one('table.sortable.wikitable')
d = get_column_titles(table)

for item in d:
    print(item['name']," ",item['href'])

Result: this pulls out the table header row and returns each prize's name and the link to its explanatory page as a list of dicts.

f:id:bitop:20170911160737p:plain

from bs4 import BeautifulSoup
import requests

def get_nobel_winners(table):
    cols = get_column_titles(table)
    winners = []
    for row in table.select('tr')[1:-2]:
        year = int(row.select_one('td').text)
        for i,td in enumerate(row.select('td')[1:]):
            for winner in td.select('a'):
                href = winner.attrs['href']
                if not href.startswith('#endnote'):
                    winners.append({'year':year,
                                'category':cols[i]['name'],
                                'name':winner.text,
                                'link':winner['href']})
    return winners

def get_column_titles(table):
    cols = []
    for th in table.select_one('tr').select('th')[1:]:
        link = th.select_one('a')
        if link:
            cols.append({'name':link.text,'href':link.attrs['href']})
        else:
            cols.append({'name':th.text,'href':None})
    return cols

BASE_URL = "http://en.wikipedia.org"
HEADERS = {'User-Agent':'Mozilla/5.0'}
def get_Nobel_soup():
    response = requests.get(BASE_URL + '/wiki/List_of_Nobel_laureates',headers=HEADERS)
    return BeautifulSoup(response.content,"lxml")

soup = get_Nobel_soup()
table = soup.select_one('table.sortable.wikitable')
d = get_nobel_winners(table)
print(str(d).encode('UTF-8'))

Result: this returns each winner's data (award year, name, category, i.e. which prize, and link) as a list of dicts. f:id:bitop:20170911164903p:plain

5.7.2 Caching the web pages

requests_cache was not installed, so I installed it with
>pip install requests_cache
which gave version 0.4.13.
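A minimal way to use it (install_cache() is the standard requests_cache entry point; the cache name and expiry below are my own choices):

import requests_cache

# transparently cache every requests.get() call in a local sqlite file
requests_cache.install_cache('nobel_pages', backend='sqlite', expire_after=7200)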

5.7.3 Scraping the winners' nationalities

I couldn't work out the get_url() function used inside get_winner_nationality() in Example 5-3, so I skipped it.
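For what it's worth, here is a minimal sketch of the idea (my own guess, not Example 5-3: it uses plain requests instead of the book's get_url helper, and it assumes the biography page has an infobox table with a 'Nationality' row):

import requests
from bs4 import BeautifulSoup

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def get_winner_nationality(w):
    """Try to read 'Nationality' from the infobox of a winner's Wikipedia page."""
    response = requests.get('http://en.wikipedia.org' + w['link'], headers=HEADERS)
    soup = BeautifulSoup(response.content, 'lxml')
    person_data = {'name': w['name']}
    for tr in soup.select('table.infobox tr'):
        th, td = tr.select_one('th'), tr.select_one('td')
        if th and td and th.text.strip() == 'Nationality':
            person_data['nationality'] = td.text.strip()
    return person_data

# usage: for w in get_nobel_winners(table): print(get_winner_nationality(w))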