ASHRAE

Trying out EDA for ASHRAE

square_feet

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
# kdeplot: kernel density estimate
sns.kdeplot(train['square_feet'], ax=axes[0][0], label='Train');
sns.kdeplot(test['square_feet'], ax=axes[0][0], label='Test');
sns.boxplot(x=train['square_feet'], ax=axes[1][0]);
sns.boxplot(x=test['square_feet'], ax=axes[1][1]);
pd.DataFrame({'train': [train['square_feet'].isnull().sum()], 'test': [test['square_feet'].isnull().sum()]}).plot(kind='bar', rot=0, ax=axes[0][1]);
axes[0][0].legend();
axes[0][0].set_title('Train/Test KDE distribution');
axes[0][1].set_title('Number of NaNs');
axes[1][0].set_title('Boxplot for train');
axes[1][1].set_title('Boxplot for test');
gc.collect();

f:id:bitop:20191231102548p:plain 

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
train['site_id'].value_counts(dropna=False, normalize=True).sort_index().plot(kind='bar', rot=0, ax=axes[0]).set_xlabel('site_id value');
train[train['building_id']!=1099]['site_id'].value_counts(dropna=False, normalize=True).sort_index().plot(kind='bar', rot=0, ax=axes[1]).set_xlabel('site_id value');
ax2 = axes[0].twinx()
ax3 = axes[1].twinx()
train.groupby('site_id')['meter_reading'].mean().sort_index().plot(ax=ax2, style='D-', grid=False, color='tab:orange');
train[train['building_id']!=1099].groupby('site_id')['meter_reading'].mean().sort_index().plot(ax=ax3, style='D-', grid=False, color='tab:orange');
ax2.set_ylabel('Mean meter reading', color='tab:orange', fontsize=14);
ax3.set_ylabel('Mean meter reading', color='tab:orange', fontsize=14);
ax2.tick_params(axis='y', labelcolor='tab:orange');
ax3.tick_params(axis='y', labelcolor='tab:orange');
plt.subplots_adjust(wspace=0.4)
axes[0].set_title('WITH building_id 1099');
axes[1].set_title('WITHOUT building_id 1099');

f:id:bitop:20191231104035p:plain

fig, axes = plt.subplots(1,1,figsize=(14, 6))
train[train['building_id'] != 1099].groupby('building_id')['meter_reading'].mean().plot();
axes.set_title('Mean meter reading by building_id', fontsize=14);
axes.set_ylabel('Mean meter reading', fontsize=14);

f:id:bitop:20191231112322p:plain

year_built

fig, axes = plt.subplots(1,1,figsize=(14, 6))
# sort_index() sorts by the index (row/column labels)
train['year_built'].value_counts(dropna=False).sort_index().plot(ax=axes).set_xlabel('year_built');
test['year_built'].value_counts(dropna=False).sort_index().plot(ax=axes).set_ylabel('Number of examples');
axes.legend(['Train', 'Test']);
axes.set_title('Number of examples per year_built', fontsize=16);

f:id:bitop:20191231112510p:plain

fig, axes = plt.subplots(1,1,figsize=(14, 6))
train.groupby('year_built')['meter_reading'].mean().plot().set_ylabel('Mean meter reading');
axes.set_title('Mean meter reading by year_built of the building', fontsize=16);

f:id:bitop:20191231112559p:plain

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
sns.kdeplot(train['year_built'], ax=axes[0][0], label='Train');
sns.kdeplot(test['year_built'], ax=axes[0][0], label='Test');
sns.boxplot(x=train['year_built'], ax=axes[1][0]);
sns.boxplot(x=test['year_built'], ax=axes[1][1]);
pd.DataFrame({'train': [train['year_built'].isnull().sum()], 'test': [test['year_built'].isnull().sum()]}).plot(kind='bar', rot=0, ax=axes[0][1]);
axes[0][0].legend();
axes[0][0].set_title('Train/Test KDE distribution');
axes[0][1].set_title('Number of NaNs');
axes[1][0].set_title('Boxplot for train');
axes[1][1].set_title('Boxplot for test');
gc.collect();

f:id:bitop:20191231112644p:plain

floor_count (number of floors)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
sns.kdeplot(train['floor_count'], label='Train', ax=axes[0]);
sns.kdeplot(test['floor_count'], label='Test', ax=axes[0]);
test.index += len(train)
print(train['floor_count'].dropna().head())
print(train['floor_count'].dropna().tail())
axes[1].plot(train['floor_count'], '.', label='Train');
axes[1].plot(test['floor_count'], '.', label='Test');
test.index -= len(train)
axes[0].set_title('Train/Test KDE distribution');
axes[1].set_title('Index versus value: Train/Test distribution');
gc.collect();

f:id:bitop:20191231112745p:plain

fig, axes = plt.subplots(1,1,figsize=(14, 6))
# mean meter reading per floor count
train.groupby('floor_count')['meter_reading'].mean().sort_index().plot(kind='bar', rot=0, ax=axes);
axes.set_xlabel('Floor count');
axes.set_ylabel('Mean meter reading');
axes.set_title('Mean meter reading by floor count');

f:id:bitop:20191231112850p:plain

air_temperature

fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
train[['timestamp', 'air_temperature']].set_index('timestamp').resample('H').mean()['air_temperature'].plot(ax=axes, alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean temperature', fontsize=14);
test[['timestamp', 'air_temperature']].set_index('timestamp').resample('H').mean()['air_temperature'].plot(ax=axes, alpha=0.8, color='tab:blue', label='');
train[['timestamp', 'air_temperature']].set_index('timestamp').resample('D').mean()['air_temperature'].plot(ax=axes, alpha=1, label='By day', color='tab:orange');
test[['timestamp', 'air_temperature']].set_index('timestamp').resample('D').mean()['air_temperature'].plot(ax=axes, alpha=1, color='tab:orange', label='');
axes.legend();
axes.text(train['timestamp'].iloc[9000000], -3, 'Train', fontsize=16);
axes.text(test['timestamp'].iloc[29400000], 30, 'Test', fontsize=16);
axes.axvspan(test['timestamp'].min(), test['timestamp'].max(), facecolor='green', alpha=0.2);

f:id:bitop:20191231113323p:plain

fig, axes = plt.subplots(8,2,figsize=(14, 30), dpi=100)
for i in range(train['site_id'].nunique()):
    train[train['site_id'] == i][['timestamp', 'air_temperature']].set_index('timestamp').resample('H').mean()['air_temperature'].plot(ax=axes[i%8][i//8], alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean temperature', fontsize=13);
    test[test['site_id'] == i][['timestamp', 'air_temperature']].set_index('timestamp').resample('H').mean()['air_temperature'].plot(ax=axes[i%8][i//8], alpha=0.8, color='tab:blue', label='').set_xlabel('')
    train[train['site_id'] == i][['timestamp', 'air_temperature']].set_index('timestamp').resample('D').mean()['air_temperature'].plot(ax=axes[i%8][i//8], alpha=1, label='By day', color='tab:orange')
    test[test['site_id'] == i][['timestamp', 'air_temperature']].set_index('timestamp').resample('D').mean()['air_temperature'].plot(ax=axes[i%8][i//8], alpha=1, color='tab:orange', label='').set_xlabel('')
    axes[i%8][i//8].legend();
    axes[i%8][i//8].set_title('site_id {}'.format(i), fontsize=13);
    axes[i%8][i//8].axvspan(test['timestamp'].min(), test['timestamp'].max(), facecolor='green', alpha=0.2);
    plt.subplots_adjust(hspace=0.45)

f:id:bitop:20191231114447p:plain

dew_temperature

fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
train[['timestamp', 'dew_temperature']].set_index('timestamp').resample('H').mean()['dew_temperature'].plot(ax=axes, alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean dew temperature', fontsize=14);
test[['timestamp', 'dew_temperature']].set_index('timestamp').resample('H').mean()['dew_temperature'].plot(ax=axes, alpha=0.8, color='tab:blue', label='');
train[['timestamp', 'dew_temperature']].set_index('timestamp').resample('D').mean()['dew_temperature'].plot(ax=axes, alpha=1, label='By day', color='tab:orange');
test[['timestamp', 'dew_temperature']].set_index('timestamp').resample('D').mean()['dew_temperature'].plot(ax=axes, alpha=1, color='tab:orange', label='');
axes.legend();
axes.text(train['timestamp'].iloc[9000000], -5, 'Train', fontsize=16);
axes.text(test['timestamp'].iloc[29400000], 16, 'Test', fontsize=16);
axes.axvspan(test['timestamp'].min(), test['timestamp'].max(), facecolor='green', alpha=0.2);

f:id:bitop:20191231115024p:plain

fig, axes = plt.subplots(8,2,figsize=(14, 30), dpi=100)
for i in range(train['site_id'].nunique()):
    train[train['site_id'] == i][['timestamp', 'dew_temperature']].set_index('timestamp').resample('H').mean()['dew_temperature'].plot(ax=axes[i%8][i//8], alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean dew temperature', fontsize=13);
    test[test['site_id'] == i][['timestamp', 'dew_temperature']].set_index('timestamp').resample('H').mean()['dew_temperature'].plot(ax=axes[i%8][i//8], alpha=0.8, color='tab:blue', label='').set_xlabel('')
    train[train['site_id'] == i][['timestamp', 'dew_temperature']].set_index('timestamp').resample('D').mean()['dew_temperature'].plot(ax=axes[i%8][i//8], alpha=1, label='By day', color='tab:orange')
    test[test['site_id'] == i][['timestamp', 'dew_temperature']].set_index('timestamp').resample('D').mean()['dew_temperature'].plot(ax=axes[i%8][i//8], alpha=1, color='tab:orange', label='').set_xlabel('')
    axes[i%8][i//8].legend();
    axes[i%8][i//8].set_title('site_id {}'.format(i), fontsize=13);
    axes[i%8][i//8].axvspan(test['timestamp'].min(), test['timestamp'].max(), facecolor='green', alpha=0.2);
    plt.subplots_adjust(hspace=0.45)

f:id:bitop:20191231115238p:plain

precip_depth_1_hr

fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
train[['timestamp', 'precip_depth_1_hr']].set_index('timestamp').resample('M').mean()['precip_depth_1_hr'].plot(ax=axes, alpha=0.8, label='By month', color='tab:blue').set_ylabel('Mean precip_depth_1_hr', fontsize=14);
test[['timestamp', 'precip_depth_1_hr']].set_index('timestamp').resample('M').mean()['precip_depth_1_hr'].plot(ax=axes, alpha=0.8, color='tab:blue', label='');
axes.legend();

f:id:bitop:20191231115425p:plain

sea_level_pressure (sea-level pressure)

fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
train[['timestamp', 'sea_level_pressure']].set_index('timestamp').resample('H').mean()['sea_level_pressure'].plot(ax=axes, alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean sea_level_pressure', fontsize=14);
test[['timestamp', 'sea_level_pressure']].set_index('timestamp').resample('H').mean()['sea_level_pressure'].plot(ax=axes, alpha=0.8, color='tab:blue', label='');
train[['timestamp', 'sea_level_pressure']].set_index('timestamp').resample('D').mean()['sea_level_pressure'].plot(ax=axes, alpha=1, label='By day', color='tab:orange');
test[['timestamp', 'sea_level_pressure']].set_index('timestamp').resample('D').mean()['sea_level_pressure'].plot(ax=axes, alpha=1, color='tab:orange', label='');
axes.legend();
axes.text(train['timestamp'].iloc[9000000], 1004, 'Train', fontsize=16);
axes.text(test['timestamp'].iloc[21000000], 1032, 'Test', fontsize=16);
axes.axvspan(test['timestamp'].min(), test['timestamp'].max(), facecolor='green', alpha=0.2);

f:id:bitop:20191231115655p:plain

fig, axes = plt.subplots(8,2,figsize=(14, 30), dpi=100)
for i in range(train['site_id'].nunique()):
    train[train['site_id'] == i][['timestamp', 'sea_level_pressure']].set_index('timestamp').resample('H').mean()['sea_level_pressure'].plot(ax=axes[i%8][i//8], alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean sea_level_pressure', fontsize=13);
    test[test['site_id'] == i][['timestamp', 'sea_level_pressure']].set_index('timestamp').resample('H').mean()['sea_level_pressure'].plot(ax=axes[i%8][i//8], alpha=0.8, color='tab:blue', label='').set_xlabel('')
    train[train['site_id'] == i][['timestamp', 'sea_level_pressure']].set_index('timestamp').resample('D').mean()['sea_level_pressure'].plot(ax=axes[i%8][i//8], alpha=1, label='By day', color='tab:orange')
    test[test['site_id'] == i][['timestamp', 'sea_level_pressure']].set_index('timestamp').resample('D').mean()['sea_level_pressure'].plot(ax=axes[i%8][i//8], alpha=1, color='tab:orange', label='').set_xlabel('')
    axes[i%8][i//8].legend();
    axes[i%8][i//8].set_title('site_id {}'.format(i), fontsize=13);
    axes[i%8][i//8].axvspan(test['timestamp'].min(), test['timestamp'].max(), facecolor='green', alpha=0.2);
    plt.subplots_adjust(hspace=0.45)

f:id:bitop:20191231115926p:plain

wind_direction & wind_speed (wind direction and wind speed)

def speed_labels(bins:list, units:str) -> list:   
    labels = list()
    for left, right in zip(bins[:-1], bins[1:]):
        if left == bins[0]:
            labels.append('calm')
        elif np.isinf(right):
            labels.append('>{} {}'.format(left, units))
        else:
            labels.append('{} - {} {}'.format(left, right, units))
    return labels

def _convert_dir(directions, N=None):
    if N is None:
        N = directions.shape[0]
    barDir = directions * np.pi/180. - np.pi/N
    barWidth = 2 * np.pi / N
    return barDir, barWidth

spd_bins = [-1, 0, 5, 10, 15, 20, 25, 30, np.inf]
spd_labels = speed_labels(spd_bins, units='m/s')

dir_bins = np.arange(-7.5, 370, 15)
dir_labels = (dir_bins[:-1] + dir_bins[1:]) / 2

calm_count = train[train['wind_speed'] == 0].shape[0]
total_count = len(train)
rose = (train.assign(WindSpd_bins=lambda df: pd.cut(df['wind_speed'], bins=spd_bins, labels=spd_labels, right=True))
             .assign(WindDir_bins=lambda df: pd.cut(df['wind_direction'], bins=dir_bins, labels=dir_labels, right=False))
             .replace({'WindDir_bins': {360: 0}})
             .groupby(by=['WindSpd_bins', 'WindDir_bins'])
             .size()
             .unstack(level='WindSpd_bins')
             .fillna(0)
             .assign(calm=lambda df: calm_count / df.shape[0])
             .sort_index(axis=1)
             .applymap(lambda x: x / total_count * 100))
rose.drop(rose.index[0], inplace=True)
directions = np.arange(0, 360, 15)

def wind_rose(rosedata, wind_dirs, palette=None):
    if palette is None:
        palette = sns.color_palette('inferno', n_colors=rosedata.shape[1])

    bar_dir, bar_width = _convert_dir(wind_dirs)

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))
    ax.set_theta_direction('clockwise')
    ax.set_theta_zero_location('N')

    for n, (c1, c2) in enumerate(zip(rosedata.columns[:-1], rosedata.columns[1:])):
        if n == 0:
            # first column only
            ax.bar(bar_dir, rosedata[c1].values, 
                   width=bar_width,
                   color=palette[0],
                   edgecolor='none',
                   label=c1,
                   linewidth=0)

        # all other columns
        ax.bar(bar_dir, rosedata[c2].values, 
               width=bar_width, 
               bottom=rosedata.cumsum(axis=1)[c1].values,
               color=palette[n+1],
               edgecolor='none',
               label=c2,
               linewidth=0)

    leg = ax.legend(loc=(0.75, 0.95), ncol=2)
    xtl = ax.set_xticklabels(['N', 'NE', 'E', 'SE', 'S', 'SW', 'W', 'NW'])

    return fig

fig = wind_rose(rose, directions)

f:id:bitop:20191231120700p:plain

ASHRAE

Trying out EDA for ASHRAE

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
warnings.simplefilter('ignore') # suppress warnings
matplotlib.rcParams['figure.dpi'] = 100
sns.set()
%matplotlib inline

building = pd.read_csv('input/building_metadata.csv')
weather_train = pd.read_csv('input/weather_train.csv')
weather_test = pd.read_csv('input/weather_test.csv')
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

Merging everything into two datasets: train and test

train.head()

f:id:bitop:20191230104727p:plain

building.head()

f:id:bitop:20191230104804p:plain

weather_train.head()

f:id:bitop:20191230104837p:plain

train = train.merge(building, on='building_id', how='left')
test = test.merge(building, on='building_id', how='left')

train = train.merge(weather_train, on=['site_id', 'timestamp'], how='left')
test = test.merge(weather_test, on=['site_id', 'timestamp'], how='left')
del weather_train, weather_test,building
gc.collect(); # call this to trigger garbage collection explicitly

# 6.17GB -> 4.55GB: saves 1.62GB
# Saving some memory
d_types = {'building_id': np.int16,
          'meter': np.int8,
          'site_id': np.int8,
          'primary_use': 'category',
          'square_feet': np.int32,
          'year_built': np.float16,
          'floor_count': np.float16,
          'air_temperature': np.float32,
          'cloud_coverage': np.float16,
          'dew_temperature': np.float32,
          'precip_depth_1_hr': np.float16,
          'sea_level_pressure': np.float32,
          'wind_direction': np.float16,
          'wind_speed': np.float32}

for feature in d_types:
    train[feature] = train[feature].astype(d_types[feature])
    test[feature] = test[feature].astype(d_types[feature])

train["timestamp"] = pd.to_datetime(train["timestamp"])
test["timestamp"] = pd.to_datetime(test["timestamp"])
gc.collect();

f:id:bitop:20191230105021p:plain

Amount of data and NaNs

print(train.shape)
print(test.shape)

f:id:bitop:20191230105130p:plain

train.count()

f:id:bitop:20191230105206p:plain

len(train)

f:id:bitop:20191230105236p:plain

train_data = (train.count() / len(train)).drop('meter_reading').sort_values().values # fraction of non-null values per column; 1.0 means no missing data
ind = np.arange(len(train_data))
width = 0.35

fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
tr = axes.bar(ind, train_data, width, color='royalblue')

test_data = (test.count() / len(test)).drop('row_id').sort_values().values
tt = axes.bar(ind+width, test_data, width, color='seagreen')

axes.set_ylabel('Amount of data available');
axes.set_xticks(ind + width / 2)
axes.set_xticklabels((train.count() / len(train)).drop('meter_reading').sort_values().index, rotation=40)
axes.legend([tr, tt], ['Train', 'Test']);

f:id:bitop:20191230105325p:plain

Mean meter reading by day

# hourly and daily lines are drawn on the same axes even though their spans differ; a monthly-resampled line is added as well
fig, axes = plt.subplots(1, 1, figsize=(14, 6), dpi=100)
train[['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes, label='By hour', alpha=0.8).set_ylabel('Meter reading', fontsize=14);
train[['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes, label='By day', alpha=1).set_ylabel('Meter reading', fontsize=14);
train[['timestamp', 'meter_reading']].set_index('timestamp').resample('M').mean()['meter_reading'].plot(ax=axes, label='By Month', alpha=1).set_ylabel('Meter reading', fontsize=14);
axes.set_title('Mean Meter reading by hour and day', fontsize=24);
axes.legend();

f:id:bitop:20191230105511p:plain  

# plot per site_id
fig, axes = plt.subplots(8,2,figsize=(14, 30), dpi=100)
for i in range(train['site_id'].nunique()):
    train[train['site_id'] == i][['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes[i%8][i//8], alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean meter reading', fontsize=13);
    train[train['site_id'] == i][['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes[i%8][i//8], alpha=1, label='By day', color='tab:orange').set_xlabel('');
    axes[i%8][i//8].legend();
    axes[i%8][i//8].set_title('site_id {}'.format(i), fontsize=13);
    plt.subplots_adjust(hspace=0.45)

f:id:bitop:20191230105628p:plain

Mean meter reading by primary_use for site_id==13

train['primary_use'].unique()
# 16 categories in total, including:
# Education
# Lodging/residential
# Office
# Entertainment/public assembly
# Healthcare
# Utility
# Technology/science
# Manufacturing/industrial
# Services

f:id:bitop:20191230105744p:plain

# primary_use: the building's primary use (e.g. Education)
fig, axes = plt.subplots(8,2,figsize=(14, 30), dpi=100)
for i, use in enumerate(train['primary_use'].value_counts().index.to_list()):
    try:
        train[(train['site_id'] == 13) & (train['primary_use'] == use)][['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes[i%8][i//8], alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean meter reading', fontsize=13);
        train[(train['site_id'] == 13) & (train['primary_use'] == use)][['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes[i%8][i//8], alpha=1, label='By day', color='tab:orange').set_xlabel('');
        axes[i%8][i//8].legend();
    except TypeError:
        pass
    axes[i%8][i//8].set_title(use, fontsize=13);
    plt.subplots_adjust(hspace=0.45)

f:id:bitop:20191230105857p:plain

# there are four meter types
# 0: 'electricity', 1: 'chilledwater', 2: 'steam', 3: 'hotwater'
fig, axes = plt.subplots(3,1,figsize=(14, 18), dpi=100)
for i in train[(train['site_id'] == 13) & (train['primary_use'] == 'Education')]['meter'].value_counts(dropna=False).index.to_list():
    print(i)
    train[(train['site_id'] == 13) & (train['primary_use'] == 'Education') & (train['meter'] == i)][['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes[i], alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean meter reading', fontsize=13);
    train[(train['site_id'] == 13) & (train['primary_use'] == 'Education') & (train['meter'] == i)][['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes[i], alpha=1, label='By day', color='tab:orange').set_xlabel('');
    axes[i].legend();
    axes[i].set_title('Meter: ' + str(i), fontsize=13);

f:id:bitop:20191230110009p:plain

# site_id == 13, meter == 2 (steam), primary_use == Education
fig, axes = plt.subplots(9,2,figsize=(14, 36), dpi=100)
for i, building in enumerate(train[(train['site_id'] == 13) & (train['primary_use'] == 'Education') & (train['meter'] == 2)]['building_id'].value_counts(dropna=False).index.to_list()):
    train[(train['site_id'] == 13) & (train['primary_use'] == 'Education') & (train['meter'] == 2) & (train['building_id'] == building)][['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes[i%9][i//9], alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean meter reading', fontsize=13);
    train[(train['site_id'] == 13) & (train['primary_use'] == 'Education') & (train['meter'] == 2) & (train['building_id'] == building)][['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes[i%9][i//9], alpha=1, label='By day', color='tab:orange').set_xlabel('');
    axes[i%9][i//9].legend();
    axes[i%9][i//9].set_title('building_id: ' + str(building), fontsize=13);
    plt.subplots_adjust(hspace=0.45)

f:id:bitop:20191230110118p:plain

# taking a closer look at the steam meter of building_id == 1099
fig, axes = plt.subplots(3,1,figsize=(14, 20), dpi=100)

train[(train['meter'] == 2) & (train['building_id'] == 1099)][['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes[0], alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean meter reading', fontsize=13);
train[(train['meter'] == 2) & (train['building_id'] == 1099)][['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes[0], alpha=1, label='By day', color='tab:orange').set_xlabel('');

train[['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes[1], alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean meter reading', fontsize=13);
train[['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes[1], alpha=1, label='By day', color='tab:orange').set_xlabel('');

train[~((train['meter'] == 2) & (train['building_id'] == 1099))][['timestamp', 'meter_reading']].set_index('timestamp').resample('H').mean()['meter_reading'].plot(ax=axes[2], alpha=0.8, label='By hour', color='tab:blue').set_ylabel('Mean meter reading', fontsize=13);
train[~((train['meter'] == 2) & (train['building_id'] == 1099))][['timestamp', 'meter_reading']].set_index('timestamp').resample('D').mean()['meter_reading'].plot(ax=axes[2], alpha=1, label='By day', color='tab:orange').set_xlabel('');

axes[0].set_title('building_id==1099 and meter==2', fontsize=13);
axes[1].set_title('Full dataset', fontsize=13);
axes[2].set_title('building_id 1099 excluded', fontsize=13);
plt.subplots_adjust(hspace=0.45)

f:id:bitop:20191230110237p:plain

Number of observations by day

# value_counts: returns the unique values and how often each occurs
# dt.floor truncates timestamps; with 'd', 11/15 01:15 becomes 11/15
fig, axes = plt.subplots(1, 1, figsize=(14, 6))
train['timestamp'].dt.floor('d').value_counts().sort_index().plot(ax=axes).set_xlabel('Date', fontsize=14);
test['timestamp'].dt.floor('d').value_counts().sort_index().plot(ax=axes).set_ylabel('Number of observations', fontsize=14);
axes.set_title('Number of observations by day', fontsize=16);
axes.legend(['Train', 'Test']);

f:id:bitop:20191230110357p:plain

Meter

# value_counts returns a pandas.Series whose index is the unique values and whose data is their counts
# with dropna=False, NaN values are counted instead of being dropped

m = train['meter'].value_counts()
print(type(m[0]))
print(m.shape)
m

f:id:bitop:20191230110500p:plain

train_data = train['meter'].value_counts(dropna=False, normalize=True).sort_index().values
ind = np.arange(len(train_data))
width = 0.35
print(ind)
fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
tr = axes.bar(ind, train_data, width, color='royalblue')

test_data = test['meter'].value_counts(dropna=False, normalize=True).sort_index().values
tt = axes.bar(ind+width, test_data, width, color='seagreen')

axes.set_ylabel('Normalized number of observations');
axes.set_xlabel('meter type');
axes.set_xticks(ind + width / 2)
axes.set_xticklabels(train['meter'].value_counts().sort_index().index, rotation=0)
# twinx is used for dual-axis charts (bar chart + line)
axes2 = axes.twinx()
mr = axes2.plot(ind, train[['meter', 'meter_reading']].groupby('meter')['meter_reading'].mean().sort_index().values, 'D-', color='tab:orange', label='Mean meter reading');
axes2.grid(False);
axes2.tick_params(axis='y', labelcolor='tab:orange');
axes2.set_ylabel('Mean meter reading by meter type', color='tab:orange');
axes.legend([tr, tt], ['Train', 'Test'], facecolor='white');
axes2.legend(loc=5, facecolor='white');

f:id:bitop:20191230110547p:plain

fig, axes = plt.subplots(1, 1, figsize=(14, 6))
sns.boxplot(x='meter', y='meter_reading', data=train, showfliers=False);

f:id:bitop:20191230110623p:plain

tmp_df = pd.concat([train[['meter', 'air_temperature']], test[['meter', 'air_temperature']]], ignore_index=True)
tmp_df['dataset'] = 'Train'
tmp_df.loc[tmp_df.index >= len(train), 'dataset'] = 'Test'

fig, axes = plt.subplots(1, 1, figsize=(14, 6))
sns.boxplot(x='meter', y='air_temperature', data=tmp_df, hue='dataset', palette="Set3");

del tmp_df
gc.collect();

f:id:bitop:20191230110708p:plain

train_data = train['site_id'].value_counts(dropna=False, normalize=True).sort_index().values
ind = np.arange(len(train_data))
width = 0.35

fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
tr = axes.bar(ind, train_data, width, color='royalblue')

test_data = test['site_id'].value_counts(dropna=False, normalize=True).sort_index().values
tt = axes.bar(ind+width, test_data, width, color='seagreen')

axes.set_ylabel('Normalized number of observations');
axes.set_xlabel('site_id');
axes.set_xticks(ind + width / 2)
axes.set_xticklabels(train['site_id'].value_counts().sort_index().index, rotation=0)
axes2 = axes.twinx()
mr = axes2.plot(ind, train[['site_id', 'meter_reading']].groupby('site_id')['meter_reading'].mean().sort_index().values, 'D-', color='tab:orange', label='Mean meter reading');
axes2.grid(False);
axes2.tick_params(axis='y', labelcolor='tab:orange');
axes2.set_ylabel('Mean meter reading by site_id', color='tab:orange');
axes.legend([tr, tt], ['Train', 'Test'], facecolor='white');
axes2.legend(loc=2, facecolor='white');

f:id:bitop:20191230110800p:plain

train_data = train['primary_use'].value_counts(dropna=False, normalize=True).sort_index().values
ind = np.arange(len(train_data))
width = 0.35

fig, axes = plt.subplots(1,1,figsize=(14, 6), dpi=100)
tr = axes.bar(ind, train_data, width, color='royalblue')

test_data = test['primary_use'].value_counts(dropna=False, normalize=True).sort_index().values
tt = axes.bar(ind+width, test_data, width, color='seagreen')

axes.set_ylabel('Normalized number of observations');
axes.set_xlabel('primary_use');
axes.set_xticks(ind + width / 2)
axes.set_xticklabels(train['primary_use'].value_counts().sort_index().index, rotation=90)
axes2 = axes.twinx()
mr = axes2.plot(ind, train[['primary_use', 'meter_reading']].groupby('primary_use')['meter_reading'].mean().sort_index().values, 'D-', color='tab:orange', label='Mean meter reading');
axes2.grid(False);
axes2.tick_params(axis='y', labelcolor='tab:orange');
axes2.set_ylabel('Mean meter reading by primary_use', color='tab:orange');
axes.legend([tr, tt], ['Train', 'Test'], facecolor='white');
axes2.legend(loc=5, facecolor='white');

f:id:bitop:20191230110851p:plain

ASHRAE

Trying out Kaggle's ASHRAE - Start Here: A GENTLE Introduction

plt.hist(weather_train_df['cloud_coverage'],bins=60,color='#f46d43')

f:id:bitop:20191229053614p:plain

plt.hist(weather_test_df['cloud_coverage'],bins=60,color='#66bd63')

f:id:bitop:20191229053653p:plain

plt.hist(weather_train_df['dew_temperature'],bins=60,color='#f46d43')

f:id:bitop:20191229053848p:plain

plt.hist(weather_test_df['dew_temperature'],bins=60,color='#66bd63')

f:id:bitop:20191229053920p:plain

plt.hist(weather_train_df['precip_depth_1_hr'],bins=60,color='#f46d43')

f:id:bitop:20191229054219p:plain

plt.hist(weather_test_df['precip_depth_1_hr'],bins=60,color='#66bd63')

f:id:bitop:20191229054251p:plain

plt.hist(weather_train_df['sea_level_pressure'],bins=60,color='#f46d43')

f:id:bitop:20191229054749p:plain

plt.hist(weather_test_df['sea_level_pressure'],bins=60,color='#66bd63')

f:id:bitop:20191229054830p:plain

plt.hist(weather_train_df['wind_direction'],bins=60,color='#f46d43')

f:id:bitop:20191229055029p:plain

plt.hist(weather_test_df['wind_direction'],bins=60,color='#66bd63')

f:id:bitop:20191229055101p:plain

plt.hist(weather_train_df['wind_speed'],bins=60,color='#f46d43')

f:id:bitop:20191229055240p:plain

plt.hist(weather_test_df['wind_speed'],bins=60,color='#66bd63')

f:id:bitop:20191229055320p:plain

from statsmodels.tsa.seasonal import seasonal_decompose # library for handling time-series data
ts=train_df.groupby(["timestamp"])["meter_reading"].sum()

print(ts.head())

ts.astype('float')
plt.figure(figsize=(16,8))
plt.title('meter_reading')
plt.xlabel('timestamp')
plt.ylabel('meter_reading')
plt.plot(ts);

f:id:bitop:20191229061411p:plain
f:id:bitop:20191229061437p:plain

import statsmodels.api as sm
# multiplicative
res = sm.tsa.seasonal_decompose(ts.values,freq=12,model="multiplicative")
fig = res.plot()

f:id:bitop:20191229063631p:plain

# Additive model
res = sm.tsa.seasonal_decompose(ts.values,freq=12,model="additive") # model changed from multiplicative to additive
fig = res.plot()

f:id:bitop:20191229063902p:plain

Outlier Distribution

y_mean_time = train_df.groupby('timestamp').meter_reading.mean()
y_mean_time.plot(figsize=(20, 8))

f:id:bitop:20191229064251p:plain

y_mean_time.rolling(window=10).std().plot(figsize=(20, 8))
ax = plt.axhline(y=0.009, color='red')

f:id:bitop:20191229064412p:plain

y_mean_time.rolling(window=10).std().plot(figsize=(20, 8))
plt.axhline(y=0.009, color='red')
plt.axvspan(0, 905, color='green', alpha=0.1)
plt.axvspan(906, 1505, color='red', alpha=0.1)

f:id:bitop:20191229064559p:plain

Group data in a daily basis

# https://qiita.com/TomokIshii/items/8acb138bd36e1b51b148
print(train_df.head())
train_df['meter'] = pd.Categorical(train_df['meter']).rename_categories({0: 'electricity', 1: 'chilledwater', 2: 'steam', 3: 'hotwater'})
print(train_df.head()) # rename_categories gives the meter types readable names
daily_train = train_df.copy()
daily_train['date'] = daily_train['timestamp'].dt.date # dt.date collapses the hourly data into daily dates
# https://qiita.com/Takemura-T/items/79b16313e45576bb6492
daily_train = daily_train.groupby(['date', 'building_id', 'meter']).sum()
daily_train

f:id:bitop:20191229074456p:plain
f:id:bitop:20191229074521p:plain

Aggregate the data for buildings

daily_train_agg = daily_train.groupby(['date', 'meter']).agg(['sum', 'mean', 'idxmax', 'max'])
daily_train_agg = daily_train_agg.reset_index()
level_0 = daily_train_agg.columns.droplevel(0)
level_1 = daily_train_agg.columns.droplevel(1)
level_0 = ['' if x == '' else '-' + x for x in level_0]
daily_train_agg.columns = level_1 + level_0
daily_train_agg.rename_axis(None, axis=1)
print(daily_train_agg.head())

fig_total = px.line(daily_train_agg, x='date', y='meter_reading-sum', color='meter', render_mode='svg')
fig_total.update_layout(title='Total kWh per energy aspect')
fig_total.show()

f:id:bitop:20191229075434p:plain
f:id:bitop:20191229075455p:plain

print(daily_train_agg.head())

fig_maximum = px.line(daily_train_agg, x='date', y='meter_reading-max', color='meter', render_mode='svg')
fig_maximum.update_layout(title='Maximum kWh value per energy aspect')
fig_maximum.show()

f:id:bitop:20191229082019p:plain

Identifying outliers

daily_train_agg['building_id_max'] = [x[1] for x in daily_train_agg['meter_reading-idxmax']]
daily_train_agg.head()

f:id:bitop:20191229082319p:plain

print('Number of days that a building has the maximum electricity consumption of all the buildings:\n')
print(daily_train_agg[daily_train_agg['meter'] == 'electricity']['building_id_max'].value_counts())

f:id:bitop:20191229082512p:plain

# electricity consumption
daily_train_electricity = daily_train_agg[daily_train_agg['meter']=='electricity'].copy()
daily_train_electricity['building_id_max'] = pd.Categorical(daily_train_electricity['building_id_max'])
fig_daily_electricity = px.scatter(daily_train_electricity,
                                   x='date',
                                   y='meter_reading-max',
                                   color='building_id_max',
                                   render_mode='svg')
fig_daily_electricity.update_layout(title='Maximum consumption values for the day and energy aspect')
fig_daily_electricity.show()

f:id:bitop:20191229082749p:plain

# chilled water
print('Number of days that a building has the maximum chilledwater consumption of all the buildings:\n')
print(daily_train_agg[daily_train_agg['meter'] == 'chilledwater']['building_id_max'].value_counts())

f:id:bitop:20191229083038p:plain

daily_train_chilledwater = daily_train_agg[daily_train_agg['meter']=='chilledwater'].copy()
daily_train_chilledwater['building_id_max'] = pd.Categorical(daily_train_chilledwater['building_id_max'])
fig_daily_chilledwater = px.scatter(daily_train_chilledwater,
                                    x='date',
                                    y='meter_reading-max',  
                                    color='building_id_max', 
                                    render_mode='svg')
fig_daily_chilledwater.update_layout(title='Maximum consumption values for the day and energy aspect')
fig_daily_chilledwater.show()

f:id:bitop:20191229083205p:plain

# steam
print('Number of days that a building has the maximum steam consumption of all the buildings:\n')
print(daily_train_agg[daily_train_agg['meter'] == 'steam']['building_id_max'].value_counts())

f:id:bitop:20191229083351p:plain

# hot water
print('Number of days that a building has the maximum hotwater consumption of all the buildings:\n')
print(daily_train_agg[daily_train_agg['meter'] == 'hotwater']['building_id_max'].value_counts())

f:id:bitop:20191229083619p:plain

daily_train_hotwater = daily_train_agg[daily_train_agg['meter']=='hotwater'].copy()
daily_train_hotwater['building_id_max'] = pd.Categorical(daily_train_hotwater['building_id_max'])
fig_daily_hotwater = px.scatter(daily_train_hotwater,
                                x='date',
                                y='meter_reading-max',
                                color='building_id_max',
                                render_mode='svg')
fig_daily_hotwater.update_layout(title='Maximum consumption values for the day and energy aspect')
fig_daily_hotwater.show()

f:id:bitop:20191229083753p:plain

ASHRAE

Trying out Kaggle's ASHRAE - Start Here: A GENTLE Introduction

Link

www.kaggle.com

The data consists of train.csv, test.csv, sample_submission.csv, building_metadata.csv, weather_test.csv, and weather_train.csv.

Imports

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
import matplotlib.patches as patches # matplotlib module for drawing shapes such as circles and rectangles https://note.nkmk.me/python-matplotlib-patches-circle-rectangle/

from plotly import tools, subplots # plotly can draw interactive charts
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
pd.set_option('display.max_columns', 150) # set pandas display options. https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.describe_option.html

py.init_notebook_mode(connected=True) # with connected=True, Plotly's JavaScript is fetched from the internet
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

import os, random, math, psutil, pickle  # psutil: hardware info such as CPU core count, free disk space, and memory usage

print(os.listdir("input"))

f:id:bitop:20191228135107p:plain

%%time
root = './input/'
train_df = pd.read_csv(root + 'train.csv')
train_df["timestamp"] = pd.to_datetime(train_df["timestamp"], format='%Y-%m-%d %H:%M:%S')

weather_train_df = pd.read_csv(root + 'weather_train.csv')
test_df = pd.read_csv(root + 'test.csv')
weather_test_df = pd.read_csv(root + 'weather_test.csv')
building_meta_df = pd.read_csv(root + 'building_metadata.csv')
sample_submission = pd.read_csv(root + 'sample_submission.csv')

f:id:bitop:20191228135313p:plain

Data sizes
f:id:bitop:20191228140846p:plain

Overview of train_df
building_id: ID assigned to each building
meter: meter type
timestamp: timestamp
meter_reading: meter reading

train_df.head()  

f:id:bitop:20191228141136p:plain

train_df.tail()

f:id:bitop:20191228155634p:plain

Overview of weather_train_df
site_id: site ID
timestamp: timestamp
air_temperature: air temperature
cloud_coverage: cloud coverage
dew_temperature: dew-point temperature
precip_depth_1_hr: precipitation depth over one hour
sea_level_pressure: sea-level pressure
wind_direction: wind direction
wind_speed: wind speed

weather_train_df.head()

f:id:bitop:20191228141355p:plain

Overview of weather_test_df

weather_test_df.head()

f:id:bitop:20191228141519p:plain

Overview of building_meta_df
f:id:bitop:20191228151946p:plain

Checking for missing values
Set train_df's timestamp as the index.
Create a 1x4 grid of subplots.
For each meter type, extract that meter's data and reset the index.
The to_timedelta method computes the time difference and total_seconds() turns it into seconds,
which is then divided by 3600 to get hourly values and cast to int.
The difference returned by to_timedelta here is measured from the UNIX epoch,
so the first timestamp, 2016-01-01, becomes 403224.0.
df.timestamp -= df.timestamp.min() subtracts the smallest value in the timestamp column
from every row, i.e. it subtracts that first value 403224, giving
0, 1, 2, 3, 4, 5, ..., 8783. It does not stop at 8760 because 2016 is a leap year.
After that, a 1449 x 8784 NumPy array is created and initialized with np.nan.

Each row of df.values is taken one at a time, and if the meter number in the second column
is not the target meter the loop continues. missmap[building ID, timestamp converted to an integer hour]
is set to 0 if meter_reading is zero and to 1 otherwise. This array is passed to sns.heatmap to draw.
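
The kernel's exact code for this is not reproduced above, so here is a minimal sketch reconstructed from the description (vectorized instead of the row-by-row loop; train_df and its columns are the ones loaded earlier):

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 4, figsize=(24, 30))
for meter_type in range(4):
    df = train_df[train_df['meter'] == meter_type]
    # hours elapsed since the first timestamp: 0 .. 8783 (2016 is a leap year)
    hours = ((df['timestamp'] - train_df['timestamp'].min()).dt.total_seconds() // 3600).astype(int)
    # one row per building, one column per hour, NaN where no record exists
    missmap = np.full((train_df['building_id'].nunique(), 8784), np.nan)
    missmap[df['building_id'].values, hours.values] = (df['meter_reading'].values != 0).astype(int)
    sns.heatmap(missmap, ax=axes[meter_type], cbar=False)
    axes[meter_type].set_title('meter {}'.format(meter_type))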

f:id:bitop:20191229051906p:plain

Missing-value summary for train_df

total = train_df.isnull().sum().sort_values(ascending = False)
percent = (train_df.isnull().sum()/train_df.isnull().count()*100).sort_values(ascending = False)
missing__train_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing__train_data.head(4)

f:id:bitop:20191228170125p:plain

Missing-value summary for weather_train_df

total = weather_train_df.isnull().sum().sort_values(ascending = False)
percent = (weather_train_df.isnull().sum()/weather_train_df.isnull().count()*100).sort_values(ascending = False)
missing_weather_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_weather_data.head(9)

f:id:bitop:20191228170315p:plain

Missing-value summary for weather_test_df

total = weather_test_df.isnull().sum().sort_values(ascending = False)
percent = (weather_test_df.isnull().sum()/weather_test_df.isnull().count()*100).sort_values(ascending = False)
missing_weather_test_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_weather_test_data.head(9)

f:id:bitop:20191228170537p:plain

Missing-value summary for building_meta_df

total = building_meta_df.isnull().sum().sort_values(ascending = False)
percent = (building_meta_df.isnull().sum()/building_meta_df.isnull().count()*100).sort_values(ascending = False)
missing_building_meta_df  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_building_meta_df.head(6)

f:id:bitop:20191228170700p:plain

Compute the correlation of each train_df column with meter_reading

correlations = train_df.corr()['meter_reading'].sort_values()

print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))

f:id:bitop:20191228171159p:plain

Plot a histogram of air_temperature in weather_train_df

plt.hist(weather_train_df['air_temperature'],bins=60,color='#f46d43')

f:id:bitop:20191228173055p:plain

Plot a histogram of air_temperature in weather_test_df

plt.hist(weather_test_df['air_temperature'],bins=60,color='#66bd63')

f:id:bitop:20191228173137p:plain

Trying out Plotly

Version 4 Migration Guide in Python

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd

# Make figure with subplots
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "bar"},
                                            {"type": "surface"}]])

# Add bar traces to subplot (1, 1)
fig.add_trace(go.Bar(y=[2, 1, 3]), row=1, col=1)
fig.add_trace(go.Bar(y=[3, 2, 1]), row=1, col=1)
fig.add_trace(go.Bar(y=[2.5, 2.5, 3.5]), row=1, col=1)

# Add surface trace to subplot (1, 2)
# Read data from a csv
z_data = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/api_docs/mt_bruno_elevation.csv")
fig.add_surface(z=z_data)

# Hide legend
fig.update_layout(
    showlegend=False,
    title_text="Default Theme",
    height=500,
    width=800,
)

fig.show()

f:id:bitop:20191222122524p:plain

Plotly Express-in-Python

"""
plotly.expressはplotly.graph_objectsのハイレベルなラッパー
"""
import plotly.express as px
print(px.data.iris.__doc__)
print(px.data.iris().head())

f:id:bitop:20191222125453p:plain

Scatter and Line plots

import plotly.express as px
iris = px.data.iris()
fig = px.scatter(iris, x="sepal_width", y="sepal_length")
fig.show()

f:id:bitop:20191222125835p:plain

Trying out Plotly

Displaying-Figures-in-Python

import plotly
import plotly.graph_objects as go

fig = go.Figure(
    data=[go.Bar(y=[2, 1, 3])],
    layout_title_text="A Figure Displayed with fig.show()"
)
fig.show()

f:id:bitop:20191215164158p:plain

import plotly
import plotly.graph_objects as go

data = [go.Bar(y = [20,10,30])]
plotly.offline.plot(data)

f:id:bitop:20191215164418p:plain

import plotly.io as pio
print(pio.renderers)

f:id:bitop:20191215164548p:plain

# could not run this in the VS Code environment; run it in a Jupyter notebook instead
import plotly
import plotly.io as pio
import plotly.graph_objects as go

png_renderer = pio.renderers["png"]
png_renderer.width = 500
png_renderer.height = 500

pio.renderers.default = "png"

fig = go.Figure(
    data = [go.Bar(y=[2,1,3])],
    layout_title_text="A Figure Displayed with the 'png' Renderer"
)
fig.show()

f:id:bitop:20191215165051p:plain

Creating and Updating Figures in Python

fig = {
    "data": [{"type": "bar",
              "x": [1, 2, 3],
              "y": [1, 3, 2]}],
    "layout": {"title": {"text": "A Bar Chart"}}
}

# To display the figure defined by this dict, use the low-level plotly.io.show function
import plotly.io as pio
pio.show(fig)    

f:id:bitop:20191215165646p:plain

import plotly.graph_objects as go
fig = go.Figure(
    data=[go.Bar(x=[1, 2, 3], y=[1, 3, 2])],
    layout=go.Layout(
        title=go.layout.Title(text="A Bar Chart")
    )
)
fig.show()

f:id:bitop:20191215170132p:plain

import plotly.express as px
iris = px.data.iris()
fig = px.scatter(iris, x="sepal_width", y="sepal_length", color="species")

# If you print fig, you'll see that it's just a regular figure with data and layout
# print(fig)

fig.show()

f:id:bitop:20191215170315p:plain

import numpy as np
import plotly.figure_factory as ff

x1,y1 = np.meshgrid(np.arange(-2, 2, .1), np.arange(-2, 2, .1))
u1 = np.cos(x1)*y1
v1 = np.sin(x1)*y1

fig = ff.create_quiver(x1, y1, u1, v1)
fig.show()

f:id:bitop:20191215170509p:plain

import plotly.graph_objects as go
from plotly.subplots import make_subplots
fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Scatter(y=[4, 2, 1], mode="lines"), row=1, col=1)
fig.add_trace(go.Bar(y=[2, 1, 3]), row=1, col=2)
fig.show()

f:id:bitop:20191215170637p:plain

import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Bar(x=[1, 2, 3], y=[1, 3, 2]))
fig.show()

f:id:bitop:20191215170747p:plain

import plotly.graph_objects as go
import plotly.express as px
iris = px.data.iris()
fig = px.scatter(iris, x="sepal_width", y="sepal_length", color="species")
fig.add_trace(
    go.Scatter(
    x=[2, 4],
    y=[4, 8],
    mode="lines",
    line=go.scatter.Line(color="gray"),
    showlegend=False)
)
fig.show()

f:id:bitop:20191215170856p:plain

Reading "Python実践データ分析100本ノック" (Python Practical Data Analysis: 100 Knocks)

Knock 92: Remove unnecessary characters

f:id:bitop:20191124055800p:plain
Removing parentheses and their contents
f:id:bitop:20191124055709p:plain
Removing full-width parentheses as well
f:id:bitop:20191124060404p:plain
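
A minimal sketch of this step, assuming a survey DataFrame with a 'comment' column (the column name and sample texts are illustrative assumptions, not the book's actual data):

import pandas as pd

survey = pd.DataFrame({'comment': ['駅前に若者が集まっている(朝)', '子育て支援が嬉しい（特に保育）']})
# remove half-width parentheses and their contents, then the full-width ones
survey['comment'] = survey['comment'].str.replace(r'\(.+?\)', '', regex=True)
survey['comment'] = survey['comment'].str.replace(r'（.+?）', '', regex=True)
print(survey)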

Knock 93: Count the number of characters

f:id:bitop:20191124060811p:plain
f:id:bitop:20191124061032p:plain
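
Continuing the assumed survey DataFrame above, the character count is a one-liner:

survey['length'] = survey['comment'].str.len()
print(survey)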

Knock 94: Split documents with morphological analysis

f:id:bitop:20191124071044p:plain
f:id:bitop:20191124072346p:plain
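
A sketch of splitting a document into words. The book uses MeCab; janome (a pure-Python tokenizer) is used here only to keep the example self-contained, and the sample text is illustrative:

from janome.tokenizer import Tokenizer

tokenizer = Tokenizer()
text = '駅前に若者が集まっている'
# surface form of each morpheme
words = [token.surface for token in tokenizer.tokenize(text)]
print(words)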

Knock 95: Extract verbs and nouns from documents with morphological analysis

f:id:bitop:20191124073046p:plain
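
Continuing the janome-based sketch above, only tokens whose first part-of-speech field is 名詞 (noun) or 動詞 (verb) are kept:

keywords = [token.surface for token in tokenizer.tokenize(text)
            if token.part_of_speech.split(',')[0] in ('名詞', '動詞')]
print(keywords)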

Knock 96: Check the frequent nouns extracted by morphological analysis

281 nouns in total
f:id:bitop:20191124083126p:plain
f:id:bitop:20191124084603p:plain
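
A sketch of counting the extracted nouns with collections.Counter; all_words stands in for the flat list of nouns gathered from every comment (the values are illustrative):

import collections

all_words = ['公園', '駅', '公園', '子育て', '駅', '公園']
counter = collections.Counter(all_words)
print(counter.most_common(10))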

Knock 97: Remove unrelated words

Removing 'の'
f:id:bitop:20191124084641p:plain
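
A sketch of dropping uninformative words such as 'の' before counting, using the same illustrative list:

stop_words = ['の']
all_words = [w for w in all_words if w not in stop_words]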

Knock 98: Look at the relationship between customer satisfaction and frequent words

f:id:bitop:20191124085755p:plain
Aggregation
f:id:bitop:20191124091734p:plain
Displaying the most frequent words
f:id:bitop:20191124092316p:plain
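
A hedged sketch of relating satisfaction to a frequent word; the numeric 'satisfaction' column and its values are assumptions based on the knock's description:

survey['satisfaction'] = [3, 5]  # illustrative scores
word = '駅'
mask = survey['comment'].str.contains(word)
print(survey.loc[mask, 'satisfaction'].mean())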

Knock 99: Represent the features of each survey response

f:id:bitop:20191124095230p:plain
f:id:bitop:20191124095447p:plain
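
A hedged sketch of turning each response into word-presence features (a simple bag-of-words table); the keyword lists are illustrative:

import pandas as pd

keywords_per_comment = [['駅', '若者'], ['子育て', '支援'], ['駅', '支援']]
vocab = sorted({w for kws in keywords_per_comment for w in kws})
features = pd.DataFrame([[1 if w in kws else 0 for w in vocab]
                         for kws in keywords_per_comment], columns=vocab)
print(features)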

Knock 100: Find similar survey responses

Omitted.