Sabermetrics scripts for some old blog posts analyzing the 2014 Oakland A's.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

161 lines
4.8 KiB

# Progress messages bracket the pandas import.
# A print() call with one string argument prints the same text whether
# print is a statement (Python 2) or a function (Python 3).
print("Loading pandas...")
# The star import is what puts read_csv in scope for load_data() below.
from pandas import *
print("Done loading pandas.")
import matplotlib.pylab as plt
import numpy as np
from scipy import stats
def load_data():
df = read_csv('data/oak_team_batting.csv',index_col='Rk')
# Data Cleanup:
# -----------------------
# Let's exclude some years:
# - 2014 (incomplete season)
# - 1994 (the baseball strike)
df = df[ df['Year'] != 1994 ]
df = df[ df['Year'] != 2014 ]
df['ABpG'] = df['AB']/df['G']
return df
df = load_data()

# AB
# -----------------------
# Start with something simple: a Gaussian kernel density estimate (KDE)
# of the number of at bats per season, using Scott's bandwidth rule.
ab = df['AB']
kernel_scott = stats.gaussian_kde(ab, bw_method='scott')

# Evaluation grid (at bats per season); the later at-bats-per-season
# plots in this script reuse it.
xx = np.linspace(4000, 6000, 500)

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xx, kernel_scott(xx), 'b-')
ax.set_xlabel("At Bats Per Season")
ax.set_ylabel(r"$P(N_{AB})$")
ax.set_title("Athletics At-Bats, " + str(min(df['Year'])) + "-" + str(max(df['Year'])))

# One pre-formatted string keeps the output identical whether print is a
# statement (Python 2) or a function (Python 3).
print("Bandwidth = %s" % kernel_scott.covariance_factor())

fig.savefig('figs/kde_AB.png')
plt.show()
# Contrast the full history of the Athletics club with the seasons from
# 1968 onward (1968 being the year the Athletics moved to Oakland).
ab_post1968 = df.loc[df['Year'] >= 1968, 'AB']
kernel_scott_1968 = stats.gaussian_kde(ab_post1968, bw_method='scott')

fig, ax = plt.subplots()
# (estimator, line format, legend label) for each curve, in draw order.
for _kde, _fmt, _label in (
        (kernel_scott, 'b-', 'All Years'),
        (kernel_scott_1968, 'r-', 'After 68'),
):
    ax.plot(xx, _kde(xx), _fmt, label=_label)
ax.legend(loc='best')
ax.set_xlabel("At Bats Per Season")
ax.set_ylabel(r"$P(N_{AB})$")
ax.set_title(
    "Athletics At-Bats, %s-%s vs Oakland (post-1968)"
    % (min(df['Year']), max(df['Year']))
)
fig.savefig('figs/kde_AB_1968.png')
plt.show()
# Finally, overlay the seasons from 2003 onward on the two earlier
# curves to see where the last decade stacks up.
ab_decade = df.loc[df['Year'] >= 2003, 'AB']
kernel_scott_decade = stats.gaussian_kde(ab_decade, bw_method='scott')

fig, ax = plt.subplots()
# (estimator, line format, legend label) for each curve, in draw order.
for _kde, _fmt, _label in (
        (kernel_scott, 'b-', 'All Years'),
        (kernel_scott_1968, 'r-', 'After 68'),
        (kernel_scott_decade, 'g-', 'Last Decade'),
):
    ax.plot(xx, _kde(xx), _fmt, label=_label)
ax.legend(loc='best')
ax.set_xlabel("At Bats Per Season")
ax.set_ylabel(r"$P(N_{AB})$")
ax.set_title("Athletics At-Bats (All Years, Post-1968, and Past Decade)")
fig.savefig('figs/kde_AB_decade.png')
plt.show()
# To separate out the effect of the era, redo the KDEs over discrete,
# non-overlapping time periods.
_seasons = df['Year']
_at_bats = df['AB']

ab_pre1933 = _at_bats[_seasons < 1933]
ab_1933_1954 = _at_bats[(_seasons >= 1933) & (_seasons < 1955)]
ab_1955_1967 = _at_bats[(_seasons >= 1955) & (_seasons < 1968)]
ab_1968_2002 = _at_bats[(_seasons >= 1968) & (_seasons < 2003)]
ab_2003_2013 = _at_bats[_seasons >= 2003]

# One estimator per era, built in chronological order; no bandwidth
# method is passed, so gaussian_kde's default applies.
(kernel_pre1933,
 kernel_1933_1954,
 kernel_1955_1967,
 kernel_1968_2002,
 kernel_2003_2013) = [
    stats.gaussian_kde(_sample)
    for _sample in (ab_pre1933, ab_1933_1954, ab_1955_1967,
                    ab_1968_2002, ab_2003_2013)
]

fig, ax = plt.subplots()
# (estimator, line format, legend label) for each era, in draw order.
for _kde, _fmt, _label in (
        (kernel_pre1933, 'b-', 'Pre 1933'),
        (kernel_1933_1954, 'r-', '33-54'),
        (kernel_1955_1967, 'g-', '55-67'),
        (kernel_1968_2002, 'c-', '68-02'),
        (kernel_2003_2013, 'k-', '03-Present'),
):
    ax.plot(xx, _kde(xx), _fmt, label=_label)
ax.legend(loc='best')
ax.set_xlabel("At Bats Per Season")
ax.set_ylabel(r"$P(N_{AB})$")
ax.set_title("Athletics At-Bats, Split Over Eras")
fig.savefig('figs/kde_AB_split.png')
plt.show()
# Same era-by-era analysis as for at bats per season, this time on at
# bats per game (the 'ABpG' column).
_seasons = df['Year']
abpg = df['ABpG']

abpg_pre1933 = abpg[_seasons < 1933]
abpg_1933_1954 = abpg[(_seasons >= 1933) & (_seasons < 1955)]
abpg_1955_1967 = abpg[(_seasons >= 1955) & (_seasons < 1968)]
abpg_1968_2002 = abpg[(_seasons >= 1968) & (_seasons < 2003)]
abpg_2003_2013 = abpg[_seasons >= 2003]

# NOTE: the kernel_<era> names are rebound here; from this point on they
# refer to the at-bats-per-game estimators, not the per-season ones.
(kernel_all,
 kernel_pre1933,
 kernel_1933_1954,
 kernel_1955_1967,
 kernel_1968_2002,
 kernel_2003_2013) = [
    stats.gaussian_kde(_sample)
    for _sample in (abpg, abpg_pre1933, abpg_1933_1954, abpg_1955_1967,
                    abpg_1968_2002, abpg_2003_2013)
]

# Evaluation grid spanning the observed range of at bats per game.
xxpg = np.linspace(min(abpg), max(abpg), 500)

fig, ax = plt.subplots()
# (estimator, line format, legend label) for each curve, in draw order.
for _kde, _fmt, _label in (
        (kernel_all, 'k--', 'All Yrs'),
        (kernel_pre1933, 'b-', 'Pre 1933'),
        (kernel_1933_1954, 'r-', '33-54'),
        (kernel_1955_1967, 'g-', '55-67'),
        (kernel_1968_2002, 'c-', '68-02'),
        (kernel_2003_2013, 'k-', '03-Present'),
):
    ax.plot(xxpg, _kde(xxpg), _fmt, label=_label)
ax.legend(loc='best')
ax.set_xlabel("At Bats Per Game")
ax.set_ylabel(r"$P(N_{ABpG})$")
ax.set_title("Athletics At-Bats Per Game, Split Over Eras")
fig.savefig('figs/kde_ABpG_split.png')
plt.show()