NFL-1st-and-Future-2019をやってみる

インポート

import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
# matplotlib.patchesは円や長方形などの図形を描く。
import matplotlib.patches as patches
sns.set_style("whitegrid")

# IPython shell escape: list the contents of the input directory.
!dir input

f:id:bitop:20200101130730p:plain

f:id:bitop:20200101130801p:plain

# Load the three tables (play list, injury records, player tracking)
# from the local input directory.
playlist, inj, trk = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input/PlayList.csv',
        'input/InjuryRecord.csv',
        'input/PlayerTrackData.csv',
    )
)

# Preview the first rows of the injury records.
inj.head()

f:id:bitop:20200101131027p:plain

# Print the table's dimensions, then for every column of interest a
# banner, the distinct values, and how many distinct values there are.
print(inj.shape)

for column in ('PlayerKey', 'GameID', 'PlayKey', 'BodyPart', 'Surface',
               'DM_M1', 'DM_M7', 'DM_M28', 'DM_M42'):
    distinct = inj[column].unique()
    print('-'*50, column, '-'*50)
    print(distinct)
    print(len(distinct))

f:id:bitop:20200101132434p:plain  

# Count the missing values in PlayKey.
# (Original author's note: 28 injuries have no PlayKey.)
inj['PlayKey'].isna().sum() 

# BodyPart has five categories (ankle, etc.) per the original author.
# Count the records for each body part and draw them as a bar chart.
# (Author's observation: knee is the most frequent, followed by ankle.)
(
    inj.groupby('BodyPart')
    .count()['PlayerKey']
    .sort_values()
    .plot(title='Count of injuries by Body Part', kind='bar', figsize=(15, 5))
)
plt.show()

f:id:bitop:20200101132600p:plain

# Surface takes the values Synthetic and Natural (original author's note).
# The long method chain was hard to follow, so it is broken into steps
# and each intermediate result is displayed in turn.
inj.groupby(['BodyPart','Surface']).count()

f:id:bitop:20200101132707p:plain

# Step 2: pivot BodyPart into columns and keep the PlayerKey counts.
(
    inj.groupby(['BodyPart', 'Surface'])
    .count()
    .unstack('BodyPart')['PlayerKey']
)

f:id:bitop:20200101133048p:plain

# Step 3: order the body-part columns by their 'Natural' count
# (transpose, sort the rows, transpose back).
(
    inj.groupby(['BodyPart', 'Surface'])
    .count()
    .unstack('BodyPart')['PlayerKey']
    .T.sort_values('Natural').T
)

f:id:bitop:20200101133133p:plain

# Step 4: additionally order the surface rows by their 'Ankle' count.
(
    inj.groupby(['BodyPart', 'Surface'])
    .count()
    .unstack('BodyPart')['PlayerKey']
    .T.sort_values('Natural').T
    .sort_values('Ankle')
)

f:id:bitop:20200101133207p:plain

# Final step: plot the sorted table as grouped bars.
# (Author's note: the meaning of Surface is unclear to me, so I cannot
# fully interpret this data.)
(
    inj.groupby(['BodyPart', 'Surface'])
    .count()
    .unstack('BodyPart')['PlayerKey']
    .T.sort_values('Natural').T
    .sort_values('Ankle')
    .plot(title='Injury Body Part by Turf Type', kind='bar', figsize=(15, 5))
)
plt.show()

f:id:bitop:20200101133257p:plain

Playlist Data

# Preview the first rows of the play list.
playlist.head()  

f:id:bitop:20200101133346p:plain

# Glossary of position names (the original comments paired each English
# name with its Japanese reading); presumably these are the values that
# appear in RosterPosition:
# Quarterback
# Wide Receiver
# Linebacker
# Running Back
# Defensive Lineman
# Tight End
# Safety
# Cornerback
# Offensive Lineman

print(playlist['RosterPosition'].unique())

f:id:bitop:20200101133433p:plain

# Distinct values of PlayerDay.
print(playlist['PlayerDay'].unique())

f:id:bitop:20200101133551p:plain

# Stadium type: distinct values.
print(playlist['StadiumType'].unique())

f:id:bitop:20200101133632p:plain

# Field type: presumably whether the field is natural or synthetic turf.
print(playlist['FieldType'].unique())

f:id:bitop:20200101133705p:plain

# Weather: distinct free-text values,
# e.g. "Clear and warm", "Mostly Cloudy", and so on.
print(playlist['Weather'].unique())

f:id:bitop:20200101133746p:plain

# Play type of each play: distinct values.
print(playlist['PlayType'].unique())

f:id:bitop:20200101133825p:plain

# Position looks very similar to RosterPosition; it also seems to contain
# entries that look like missing values.
print(playlist['Position'].unique())

f:id:bitop:20200101133916p:plain

# PositionGroup is very similar to both Position and RosterPosition.
print(playlist['PositionGroup'].unique())

f:id:bitop:20200101133956p:plain

# Number of unique plays in the playlist dataset
# (count of distinct PlayKey values).
playlist['PlayKey'].nunique()

# Horizontal bar chart of how many distinct plays exist for each play type.
(
    playlist[['PlayKey', 'PlayType']]
    .drop_duplicates()
    .groupby('PlayType')
    .count()['PlayKey']
    .sort_values()
    .plot(
        title='Number of plays provided by type',
        kind='barh',
        color='black',
        figsize=(15, 6),
    )
)
plt.show()

f:id:bitop:20200101134103p:plain

Match Player info with injury data(プレーヤー情報と傷害データを一致させる)

# Attach play metadata to each injury record. No join keys are given, so
# pandas joins on the columns the two frames have in common; the original
# author notes these are PlayerKey, PlayKey and GameID, and that the
# merged frame has 20 columns in total.
inj_detailed = inj.merge(playlist)

# Injury count per roster position.
# (Author's observation: Linebacker and Wide Receiver are injured most.)
(
    inj_detailed.groupby('RosterPosition')
    .count()['PlayerKey']
    .sort_values()
    .plot(title='Injured Players by Position', kind='barh', figsize=(15, 5))
)
plt.show()

f:id:bitop:20200101134223p:plain

# Injury count per play type.
# (Author's observation: Pass Rush has the most injuries.)
(
    inj_detailed.groupby('PlayType')
    .count()['PlayerKey']
    .sort_values()
    .plot(
        title='Injured Players by PlayType',
        kind='barh',
        color='green',
        figsize=(15, 5),
    )
)
plt.show()

f:id:bitop:20200101134309p:plain

Distribution of Injury Types(傷害タイプの分布)

# Share of injured body parts within each roster position, drawn as
# stacked horizontal bars (positions ordered by their Ankle share).
(
    inj_detailed.groupby(['RosterPosition', 'BodyPart'])
    .count()
    .unstack('BodyPart')['PlayerKey']
    .T.apply(lambda x: x / x.sum())  # normalise each position to sum to 1
    .sort_values('BodyPart')
    .T.sort_values('Ankle', ascending=False)
    .plot(
        title='Injury Body Part by Player Position',
        kind='barh',
        stacked=True,
        figsize=(15, 5),
    )
)
plt.show()

f:id:bitop:20200101134412p:plain

# Share of injured body parts within each play type.
# (Author's observation: a distinctive distribution can be seen.)
(
    inj_detailed.groupby(['PlayType', 'BodyPart'])
    .count()
    .unstack('BodyPart')['PlayerKey']
    .T.apply(lambda x: x / x.sum())  # normalise each play type to sum to 1
    .sort_values('BodyPart')
    .T.sort_values('Ankle', ascending=False)
    .plot(
        title='Injury Body Part by Play Type',
        kind='barh',
        stacked=True,
        figsize=(15, 5),
    )
)
plt.show()

f:id:bitop:20200101134500p:plain

# Share of field surface types among the injuries of each roster position,
# drawn as stacked horizontal bars (positions ordered by Natural share).
# The title names what is actually plotted (surface by position); it used
# to repeat the title of the earlier body-part-by-turf chart.
inj_detailed.groupby(['RosterPosition','Surface']) \
    .count() \
    .unstack('Surface')['PlayerKey'] \
    .T.apply(lambda x: x / x.sum()) \
    .sort_values('Surface').T.sort_values('Natural', ascending=False) \
    .plot(kind='barh',
          figsize=(15, 5),
          title='Injury Turf Type by Player Position',
          stacked=True)
plt.show()

f:id:bitop:20200101134549p:plain

Plotting Plays

# 
def create_football_field(linenumbers=True,
                          endzones=True,
                          highlight_line=False,
                          highlight_line_number=50,
                          highlighted_name='Line of Scrimmage',
                          fifty_is_los=False,
                          figsize=(12, 6.33)):
    """
    Draw a football field on a new figure for plotting plays on top of it.

    The field is drawn as a 120 x 53.3 rectangle with its lower-left corner
    at (0, 0). Axis decorations are turned off.

    Args:
        linenumbers: If true, print the yard numbers (10 ... 50 ... 10)
            along both sidelines.
        endzones: If true, shade the strips x in [0, 10] and [110, 120]
            and restrict the hash marks to x in [11, 109].
        highlight_line: If true, draw a yellow vertical line with a label.
        highlight_line_number: Position of the highlighted line; 10 is
            added to it before drawing.
        highlighted_name: Label text for the highlighted line.
        fifty_is_los: If true, draw a gold line at x = 60 labelled as the
            player yardline at snap.
        figsize: Figure size passed to ``plt.subplots``.

    Returns:
        The ``(fig, ax)`` pair created by ``plt.subplots``.
    """
    field = patches.Rectangle((0, 0), 120, 53.3, linewidth=0.1,
                              edgecolor='r', facecolor='darkgreen', zorder=0)

    fig, ax = plt.subplots(1, figsize=figsize)
    ax.add_patch(field)

    # One continuous white polyline that zig-zags through every 10-unit
    # line and closes the outer boundary.
    ax.plot([10, 10, 10, 20, 20, 30, 30, 40, 40, 50, 50, 60, 60, 70, 70, 80,
             80, 90, 90, 100, 100, 110, 110, 120, 0, 0, 120, 120],
            [0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3,
             53.3, 0, 0, 53.3, 53.3, 0, 0, 53.3, 53.3, 53.3, 0, 0, 53.3],
            color='white')
    if fifty_is_los:
        ax.plot([60, 60], [0, 53.3], color='gold')
        ax.text(62, 50, '<- Player Yardline at Snap', color='gold')

    # Endzones: each is 10 wide. Rectangle takes (xy, width, height), so
    # the right-hand one starts at x=110 with width 10 (not 120).
    if endzones:
        for left_edge in (0, 110):
            ax.add_patch(patches.Rectangle((left_edge, 0), 10, 53.3,
                                           linewidth=0.1,
                                           edgecolor='r',
                                           facecolor='blue',
                                           alpha=0.2,
                                           zorder=0))
    ax.set_xlim(0, 120)
    ax.set_ylim(-5, 58.3)
    ax.axis('off')

    if linenumbers:
        for x in range(20, 110, 10):
            # Labels count up to 50 at x=60 and back down on the far side.
            numb = 120 - x if x > 50 else x
            label = str(numb - 10)
            ax.text(x, 5, label,
                    horizontalalignment='center',
                    fontsize=20,
                    color='white')
            # Opposite sideline: same label, turned upside down.
            ax.text(x - 0.95, 53.3 - 5, label,
                    horizontalalignment='center',
                    fontsize=20,
                    color='white', rotation=180)

    hash_range = range(11, 110) if endzones else range(1, 120)

    # Four short tick marks per unit: both sidelines and the two inner rows.
    for x in hash_range:
        ax.plot([x, x], [0.4, 0.7], color='white')
        ax.plot([x, x], [53.0, 52.5], color='white')
        ax.plot([x, x], [22.91, 23.57], color='white')
        ax.plot([x, x], [29.73, 30.39], color='white')

    if highlight_line:
        hl = highlight_line_number + 10
        ax.plot([hl, hl], [0, 53.3], color='yellow')
        ax.text(hl + 2, 50, f'<- {highlighted_name}', color='yellow')
    return fig, ax

Plot path of injured player

# trk holds player position samples, 10 per second (original author's note).
trk.head()

f:id:bitop:20200101134743p:plain

# Last rows of the tracking table.
trk.tail()

f:id:bitop:20200101134812p:plain

# (rows, columns) of the tracking table.
trk.shape

f:id:bitop:20200101134845p:plain

# Draw an empty field using the default options.
fig, ax = create_football_field()

f:id:bitop:20200101134920p:plain

# Take the PlayKey of the first injury record and scatter-plot that
# play's tracked (x, y) positions on top of the field.
# Inside a query() string, a Python variable is referenced by prefixing
# its name with @. https://note.nkmk.me/python-pandas-query/
example_play_id = inj['PlayKey'].values[0]
print(example_play_id)
fig, ax = create_football_field()
example_track = trk.query('PlayKey == @example_play_id')
example_track.plot(x='x', y='y', kind='scatter', color='orange', ax=ax)
plt.show()

f:id:bitop:20200101135015p:plain

Plotting every route of injured players

# Overlay the tracked route of every play that has an injury record,
# one semi-transparent scatter layer per play.
inj_play_list = inj['PlayKey'].tolist()
fig, ax = create_football_field()
injured_tracks = trk.query('PlayKey in @inj_play_list')
for playkey, inj_play in injured_tracks.groupby('PlayKey'):
    inj_play.plot(x='x', y='y', kind='scatter', alpha=0.2, color='orange', ax=ax)
plt.show()

f:id:bitop:20200101135115p:plain

Plotting routes of some non-injured players(負傷していないプレイヤーのルートをプロットする)

import random

# Plays that have tracking data but no injury record. The injured keys are
# put in a set so each membership test is a hash lookup instead of a scan
# of the whole list for every unique PlayKey.
injured_play_keys = set(inj_play_list)
playids = trk['PlayKey'].unique()
non_inj_play = [x for x in playids if x not in injured_play_keys]
# Draw 100 of them at random (unseeded, so the picture differs per run).
sample_non_inj_plays = random.sample(non_inj_play, 100)

fig, ax = create_football_field()
for _, play in trk.query('PlayKey in @sample_non_inj_plays').groupby('PlayKey'):
    play.plot(kind='scatter', x='x', y='y', ax=ax, color='red', alpha=0.2)
plt.show()

f:id:bitop:20200101135221p:plain

Distribution of tracking info for players with injuries(負傷したプレーヤーの追跡情報の分布)

# Largest and smallest value in the speed column `s`.
print(trk['s'].max())
print(trk['s'].min())

f:id:bitop:20200101135315p:plain

# Compare the speed distribution of plays with an injury (left) against
# a random 10,000-row sample of all other plays (right).
fig, axes = plt.subplots(1, 2)

injured_speed = trk.query('PlayKey in @inj_play_list')['s']
injured_speed.plot(
    kind='hist',
    title='Distribution of player Speed injured',
    bins=30, figsize=(15, 5), ylim=[0,10000], ax=axes[0],
)
other_speed = trk.query('PlayKey not in @inj_play_list')['s'].sample(10000)
other_speed.plot(
    kind='hist',
    title='Distribution of player Speed not injured',
    bins=30, figsize=(15, 5), ylim=[0,10000], color='orange', ax=axes[1],
)
plt.show()

f:id:bitop:20200101135403p:plain

# Same comparison for the orientation column `o`.
fig, axes = plt.subplots(1, 2)

injured_orientation = trk.query('PlayKey in @inj_play_list')['o']
injured_orientation.plot(
    kind='hist',
    title='Distribution of player Orientation injured',
    bins=30, figsize=(15, 5), ylim=[0,1600], ax=axes[0],
)
other_orientation = trk.query('PlayKey not in @inj_play_list')['o'].sample(10000)
other_orientation.plot(
    kind='hist',
    title='Distribution of player Orientation not injured',
    bins=30, figsize=(15, 5), ylim=[0,1600], color='orange', ax=axes[1],
)
plt.show()

f:id:bitop:20200101135446p:plain

Differences in x, y could be attributed to the player positions which are more likely to have injury

(x, y の違いは、負傷の可能性が高いプレーヤーのポジションに起因する可能性があります。)

# Distribution of the x coordinate, injured vs. not injured.
# (Author's observation: widely spread, no particular feature.)
fig, axes = plt.subplots(1, 2)

injured_x = trk.query('PlayKey in @inj_play_list')['x']
injured_x.plot(
    kind='hist',
    title='Distribution of player X injured',
    bins=30, figsize=(15, 5), ylim=[0,2000], ax=axes[0],
)
other_x = trk.query('PlayKey not in @inj_play_list')['x'].sample(10000)
other_x.plot(
    kind='hist',
    title='Distribution of player X not injured',
    bins=30, figsize=(15, 5), ylim=[0,2000], color='orange', ax=axes[1],
)
plt.show()

f:id:bitop:20200101135544p:plain

# Distribution of the y coordinate, injured vs. not injured.
# (Author's observation: a clear feature shows up here: injuries tend
# to occur near the middle of the field.)
fig, axes = plt.subplots(1, 2)

injured_y = trk.query('PlayKey in @inj_play_list')['y']
injured_y.plot(
    kind='hist',
    title='Distribution of player Y injured',
    bins=30, figsize=(15, 5), ax=axes[0],
)
other_y = trk.query('PlayKey not in @inj_play_list')['y'].sample(10000)
other_y.plot(
    kind='hist',
    title='Distribution of player Y not injured',
    bins=30, figsize=(15, 5), color='orange', ax=axes[1],
)
plt.show()

f:id:bitop:20200101135631p:plain

Compass plots of direction/velocity(方向/速度のコンパスプロット)

def compass(angles, radii, arrowprops=None, ax=None):
    """
    Draw one arrow from the origin per (angle, radius) pair.

    * Modified for NFL data plotting: the vectors are given directly in
      polar form instead of as Cartesian components.

    Parameters
    ----------
    angles : sequence of float
        Angular coordinate of each arrow tip (the caller in this file
        passes radians).
    radii : sequence of float
        Radial coordinate of each arrow tip. Its maximum becomes the
        upper radial limit of the axes.
    arrowprops : dict, optional
        Entries merged over the default arrow style
        ``{'arrowstyle': '->', 'color': 'k'}``.
    ax : axes, optional
        Axes to draw on. When omitted, a new polar subplot is created.

    Returns
    -------
    The axes that was drawn on.

    Examples
    --------
    >>> import numpy as np
    >>> angles = [0.0, np.pi / 2, np.pi]
    >>> radii = [1.0, 0.5, 0.75]
    >>> ax = compass(angles, radii)
    """
    if ax is None:
        _, ax = plt.subplots(subplot_kw=dict(polar=True))

    kw = dict(arrowstyle="->", color='k')
    if arrowprops:
        kw.update(arrowprops)

    # An empty-text annotation is used purely for its arrow.
    for angle, radius in zip(angles, radii):
        ax.annotate("", xy=(angle, radius), xytext=(0, 0), arrowprops=kw)

    # With no data there is no maximum to scale to; keep the axes' limits.
    if np.size(radii):
        ax.set_ylim(0, np.max(radii))

    return ax

def plot_play_compass(playkey, **kwargs):
    """
    Draw a compass plot of direction and speed for one play.

    Selects the rows of the module-level ``trk`` table whose ``PlayKey``
    equals ``playkey`` and draws one arrow per row with ``compass``.

    Args:
        playkey: Value matched against ``trk['PlayKey']``.
        **kwargs: Forwarded to ``compass`` (e.g. ``ax=``).

    Returns:
        The axes returned by ``compass``.
    """
    d = trk.loc[trk['PlayKey'] == playkey].copy()
    # Degrees -> radians for the angular coordinate.
    d['dir_theta'] = d['dir'] * np.pi / 180
    # Distance in meters (1.0936 yards per meter; assumes `dis` is in
    # yards - TODO confirm against the data dictionary).
    d['dis_meters'] = d['dis'] / 1.0936
    # Velocity in meters per second: distance per sample divided by a
    # 0.1 s sample interval (presumably 10 samples per second).
    d['v_mps'] = d['dis_meters'] / 0.1

    return compass(d['dir_theta'], d['v_mps'],
                   arrowprops={'alpha': 0.3},
                   **kwargs)

# 2 x 3 grid of polar axes, flattened so it can be indexed 0..5; one
# compass plot for each of the first six merged injury plays.
fig, axes = plt.subplots(2, 3, subplot_kw=dict(polar=True), figsize=(15, 10))
axes = np.asarray(axes).ravel()

for i, p in enumerate(inj_detailed['PlayKey'].values[:6]):
    plot_play_compass(p, ax=axes[i])
    axes[i].set_title(f'PlayKey: {p}')
plt.show()

f:id:bitop:20200101135754p:plain

Player Position vs Compass Plot(プレイヤーの位置とコンパスのプロット)

# Play details: take the PlayKey of the seventh injury record and show
# the matching row(s) of the merged injury/playlist table.
example_play_id = inj['PlayKey'].values[6]
inj_detailed.query('PlayKey == @example_play_id')

f:id:bitop:20200101135845p:plain

# Field view: the example play's tracked positions on a fresh field.
# create_football_field takes no axes argument; it always makes its own.
fig, ax = create_football_field()
ax.set_title(f'PlayKey: {example_play_id}')
trk.query('PlayKey == @example_play_id').plot(kind='scatter', x='x', y='y', ax=ax, color='orange')
plt.show()

# Compass view of the same play, titled with the play actually plotted.
ax = plot_play_compass(example_play_id)
ax.set_title(f'PlayKey: {example_play_id}')
plt.show()

f:id:bitop:20200101135931p:plain

Plays by max speed

# Highest recorded speed within each play, computed once for both charts.
max_speed_by_play = trk.groupby('PlayKey')[['s']].max()

# 20 plays with the highest maximum speed.
max_speed_by_play \
    .sort_values('s', ascending=False) \
    .query('s != 0').head(20) \
    .plot(kind='barh', figsize=(15, 5), title='Top 20 Plays by Max Player Speed')
plt.show()

# 20 plays with the lowest non-zero maximum speed. The ranked quantity is
# still the per-play maximum, so the title says "Max", not "Min".
max_speed_by_play \
    .sort_values('s', ascending=True) \
    .query('s != 0').head(20) \
    .plot(kind='barh', figsize=(15, 5), title='Bottom 20 Plays by Max Player Speed')
plt.show()

f:id:bitop:20200101140034p:plain

Injury Length(怪我の治療にかかった長さ?)

# Mean of each DM_M* column across all injury records, as a bar chart.
(
    inj[['DM_M1', 'DM_M7', 'DM_M28', 'DM_M42']]
    .mean()
    .plot(
        title='Percent of injuries by injury length',
        kind='bar',
        figsize=(15, 5),
    )
)
plt.show()

f:id:bitop:20200101140124p:plain