「ビックデータ分析・活用のためのSQLレシピ」を読む

3-1-3 文字列を配列に分解する

import re

def process(s):
    m=re.search('//[^/]+([^?#]+)',s)
    return m.group(0).split('/')[3]

conn = psycopg2.connect("dbname=BigData host=localhost user=testuser")
df = pd.read_sql("SELECT * FROM access_log", conn)
df['path1']=df['url'].map(process)

f:id:bitop:20180707153227p:plain

3-1-4 日付やタイムスタンプを扱う

import datetime 
import time

df = pd.DataFrame({'dt':[datetime.date.today()],'timestamp':[time.ctime()]})

f:id:bitop:20180707152303p:plain

指定した値の日付や時刻データを取得する

import datetime 
import time
from dateutil.parser import parse

df = pd.DataFrame({'dt':[parse('2015-10-12')],'timestamp':[parse('2015-10-12 12:00:00')]})

f:id:bitop:20180707152418p:plain

日付け/時刻から特定のフィールドを取り出す

import datetime 
import time
from dateutil.parser import parse

df = pd.DataFrame({'stamp':[parse('2015-10-12 12:00')]})
df['year'] = df['stamp'].map(lambda x:x.year)
df['month'] = df['stamp'].map(lambda x:x.month)
df['day'] = df['stamp'].map(lambda x:x.day)
df['hour'] = df['stamp'].map(lambda x:x.hour)

f:id:bitop:20180707152548p:plain

#別解
df = pd.DataFrame({'stamp':[parse('2015-11-30 23:00')]})
df['year'] = df['stamp'].dt.year
df['month'] = df['stamp'].dt.month
df['day'] = df['stamp'].dt.day
df['hour'] = df['stamp'].dt.hour

f:id:bitop:20180707152642p:plain

3-1-5 欠損値をデフォルト

conn = psycopg2.connect("dbname=BigData host=localhost user=testuser")
df = pd.read_sql("SELECT * FROM purchase_log_with_coupon", conn)

f:id:bitop:20180707152739p:plain

df = df.fillna(0)

f:id:bitop:20180707152816p:plain