「ビックデータ分析・活用のためのSQLレシピ」を読む

3-2 複数の値に対する操作

3-2-1 文字列を連結する

conn = psycopg2.connect("dbname=BigData host=localhost user=testuser")
df = pd.read_sql("SELECT * FROM mst_user_location", conn)
print(df)

f:id:bitop:20180708112536p:plain

df['pref_city'] = df['pref_name'] + df['city_name']
print(df[['user_id','pref_city']])

#別解
df = pd.read_sql("SELECT * FROM mst_user_location", conn)
#間にスペースを挟む
df['pref_city'] = df['pref_name'].str.cat(df['city_name'],sep = ' ')
print(df[['user_id','pref_city']])

f:id:bitop:20180708112637p:plain

3-2-2 複数の値を比較する

conn = psycopg2.connect("dbname=BigData host=localhost user=testuser")
df = pd.read_sql("SELECT * FROM quarterly_sales", conn)
print(df)

f:id:bitop:20180708112738p:plain

import numpy as np

df.loc[df['q1'] < df['q2'], 'judge_q1_q2'] = '+'
df.loc[df['q1'] > df['q2'], 'judge_q1_q2'] = '-'
df.loc[df['q1'] == df['q2'], 'judge_q1_q2'] = ' '
df['diff_q2_q1'] = df['q2'] - df['q1']
df['sign_q2_q1'] = np.sign(df['q2'] - df['q1'])

f:id:bitop:20180708112919p:plain

df['greatest_sales'] = df[['q1','q2','q3','q4']].max(axis=1)
df['least_sales'] =    df[['q1','q2','q3','q4']].min(axis=1)
df[['year','greatest_sales','least_sales']]

f:id:bitop:20180708113017p:plain

df['average'] = df[['q1','q2','q3','q4']].mean(axis=1)
print(df)

f:id:bitop:20180708113132p:plain

3-2-3 2つの値の比率を計算する

conn = psycopg2.connect("dbname=BigData host=localhost user=testuser")
df = pd.read_sql("SELECT * FROM advertising_stats", conn)
print(df)

f:id:bitop:20180708113246p:plain

df1 = df[df.dt=='2017-04-01']
df1['ctf'] = df1['clicks']/df1['impressions']
print(df1[['dt','ad_id','ctf']])

f:id:bitop:20180708113340p:plain

df['ctf'] = df['clicks']/df['impressions']
print(df[['dt','ad_id','ctf']])

f:id:bitop:20180708113614p:plain

3-2-4 2つの値の距離を計算する

conn = psycopg2.connect("dbname=BigData host=localhost user=testuser")
df = pd.read_sql("SELECT * FROM location_1d", conn)
print(df)

f:id:bitop:20180708113718p:plain

df['abs'] = np.abs(df.x1 - df.x2)
df['rms'] = np.sqrt(np.power(df['x1'] - df['x2'],2)) #df['x1']とdf.x1は等価
print(df)

f:id:bitop:20180708113828p:plain

xy平面で2点間のユーグリッド距離を計算する

conn = psycopg2.connect("dbname=BigData host=localhost user=testuser")
df = pd.read_sql("SELECT * FROM location_2d", conn)
print(df)

f:id:bitop:20180708113936p:plain

df['dist'] = np.sqrt(np.power(df.x1 - df.x2,2) + np.power(df.y1 - df.y2,2))
print(df[['dist']])

f:id:bitop:20180708114017p:plain

3-2-5 日付・時刻を計算する

conn = psycopg2.connect("dbname=BigData host=localhost user=testuser")
df = pd.read_sql("SELECT * FROM mst_users_with_birthday", conn)
print(df)

f:id:bitop:20180708114121p:plain

df['register_stamp'] = pd.to_datetime(df['register_stamp'])
df['birth_date'] = pd.to_datetime(df['birth_date'])
from pandas.tseries.offsets import Hour,Minute,Day
#before 1 month NG
#月の差分計算はうまくいかなかった
df['after_1_hour'] = df['register_stamp'] + Hour(1)
df['before_30_minutes'] = df['register_stamp'] - Minute(30)
df['after_1_day'] = df['register_stamp'] + Day(1)
print(df)

f:id:bitop:20180708114254p:plain

df[['user_id','register_stamp','after_1_hour','before_30_minutes','after_1_day']]

f:id:bitop:20180708114337p:plain

from datetime import datetime
df['today'] = datetime.today()
df['diff_days'] = df['today'] - df['register_stamp']
df[['user_id','today','register_stamp','diff_days']]

f:id:bitop:20180708114437p:plain

#年齢計算はうまくいかなかった
df['age'] = (df['today'] -  df['birth_date'])/365
df[['user_id','today','birth_date','age']]

f:id:bitop:20180708114525p:plain