"""Sabermetrics scripts for some old blog posts analyzing the 2014 Oakland A's."""

# Pandas is slow to import, so announce it on the console.
print("Loading pandas...")
from pandas import *
print("Done loading pandas.")
import re
import matplotlib.pylab as plt
import numpy as np
from scipy import stats
from scipy.optimize import curve_fit
import statsmodels.api as sm
# (stat column, human-readable label) pairs, kept in lockstep so the
# analysis loops below can zip() over them.
_STAT_LABEL_PAIRS = [
    ('H', 'Hits'),
    ('1B', 'Singles'),
    ('2B', 'Doubles'),
    ('3B', 'Triples'),
    ('BB', 'Walks'),
    ('HR', 'Home Runs'),
    ('AB', 'At Bats'),
    ('DP', 'Hit Into Doub Play'),
    ('SLG', 'Slugging Pct'),
    ('SO', 'Strikeouts'),
    ('BA', 'Batting Avg'),
    ('OPS', 'OBP+SP'),
    ('R', 'Runs'),
    ('RpG', 'Runs per Game'),
    ('RBI', 'RBIs'),
    ('Fld%', 'Fielding Pct'),
]
keys = [stat for stat, _ in _STAT_LABEL_PAIRS]
labels = [text for _, text in _STAT_LABEL_PAIRS]
# [start, end] year ranges to analyze (bounds are exclusive in load_data).
cutoff_yrs = [[1973, 2013]]
# Column name / plot label for the dependent variable in every analysis.
wins_key = 'W'
wins_label = 'Wins'
def curve_fit_function(x, b0, b1):
    """Linear model y = b0 + b1*x, used as the target for scipy curve_fit."""
    return b0 + b1 * x
def load_data(cutoff_yr, league):
    """Load per-team batting data and derive extra columns.

    Reads data/master_team_batting.csv, drops strike years, restricts to
    seasons strictly inside the [start, end] window and to the given league,
    then adds singles, win-percentage, and per-game columns.

    Parameters
    ----------
    cutoff_yr : sequence of two ints
        [start_year, end_year]; only seasons with start < Year < end are kept
        (note: both bounds are exclusive).
    league : str
        Substring matched against the 'Lg' column (e.g. 'AL' or 'NL').

    Returns
    -------
    DataFrame with added '1B', 'Wpct', 'RpG', and 'HpG' columns.
    """
    df = read_csv('data/master_team_batting.csv')
    # Ignore years of baseball strikes
    df = df[df['Year'] != 1981]
    df = df[df['Year'] != 1994]
    # Limit to cutoff years (exclusive on both ends)
    df = df[df['Year'] > cutoff_yr[0]]
    df = df[df['Year'] < cutoff_yr[1]]
    df = df[df['Lg'].str.contains(league)]
    # Add in data about singles (hits that are not extra-base hits)
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']
    # Win pct
    df['Wpct'] = df['W'] / df['G']
    # Add per game columns
    df['RpG'] = df['R'] / df['G']
    df['HpG'] = df['H'] / df['G']
    return df
def load_AL_data(cutoff_yr):
    """Load American League team batting data for the given [start, end] window."""
    return load_data(cutoff_yr, 'AL')
def load_NL_data(cutoff_yr):
    """Load National League team batting data for the given [start, end] window."""
    return load_data(cutoff_yr, 'NL')
def wins_multivariate_kdes():
    """
    Look at how different stats are distributed
    versus wins, from cutoff_yr to present,
    by plotting the multivariate KDE
    (this is an eyeball-norm test)

    For each cutoff window and each batting stat, saves one PNG to
    figs_wins/ with side-by-side AL/NL joint-PDF heatmaps.
    """
    print(wins_label + ": Multivariate KDEs")
    for cutoff_yr in cutoff_yrs:
        print("Now on year", cutoff_yr)
        for key, label in zip(keys, labels):
            print(" Stat:", label)
            fig = plt.figure(figsize=(12, 5))
            ax1 = fig.add_subplot(121)
            ax2 = fig.add_subplot(122)
            axes = {'AL': ax1,
                    'NL': ax2}
            load_data_function = {'AL': load_AL_data,
                                  'NL': load_NL_data}
            for league in ['AL', 'NL']:
                df = load_data_function[league](cutoff_yr)
                ax = axes[league]
                mx = df[key].values
                my = df[wins_key].values
                # ensure we have consistent axes across all cutoff years:
                # limits always come from the FIRST window, not the current one
                lim_df = load_data_function[league](cutoff_yrs[0])
                lim_x = lim_df[key].values
                lim_y = lim_df[wins_key].values
                xmin = lim_x.min()
                xmax = lim_x.max()
                ymin = lim_y.min()
                ymax = lim_y.max()
                # Evaluate the Gaussian KDE on a 200x200 grid over the limits
                X, Y = np.mgrid[xmin:xmax:200j, ymin:ymax:200j]
                positions = np.vstack([X.ravel(), Y.ravel()])
                values = np.vstack([mx, my])
                kernel = stats.gaussian_kde(values)
                Z = np.reshape(kernel(positions).T, X.shape)
                # rot90 orients the density so that y increases upward in imshow
                ax.imshow(np.rot90(Z), cmap=plt.cm.jet,
                          extent=[xmin, xmax, ymin, ymax],
                          aspect='auto')
                ax.set_xlabel(label)
                ax.set_ylabel(wins_label)
                ax.set_title(league + ": " + label + "-" + wins_label + " Joint PDF, " + str(cutoff_yr[0]) + "-" + str(cutoff_yr[1]))
                ax.set_xlim([xmin, xmax])
                ax.set_ylim([ymin, ymax])
            #plt.show()
            fig.savefig('figs_wins/ALNL_multivariate_kde_' + key + '_w_' + str(cutoff_yr[0]) + '-' + str(cutoff_yr[1]))
            plt.close('all')
def wins_regression_significance_test():
    """
    Do regression for wins vs. X,
    and determine if the regression
    coefficient is statistically significant

    For each cutoff window and each stat, fits wins = b0 + b1*stat per
    league, runs a t-test on the slope, prints the verdict, and saves a
    figure with QQ and residual plots for AL and NL.
    """
    print(wins_label + ": Univariate Linear Regression Significance Test")
    for cutoff_yr in cutoff_yrs:
        print("Now on year", cutoff_yr)
        for key, label in zip(keys, labels):
            print(" Stat:", label)
            load_data_function = {'AL': load_AL_data,
                                  'NL': load_NL_data}
            fig = plt.figure(figsize=(12, 10))
            ax1 = fig.add_subplot(221)
            ax2 = fig.add_subplot(222)
            ax3 = fig.add_subplot(223)
            ax4 = fig.add_subplot(224)
            # per-league [QQ axis, residual axis]
            axes = {'AL': [ax1, ax3],
                    'NL': [ax2, ax4]}
            for league in ['AL', 'NL']:
                print(" League:", league)
                df = load_data_function[league](cutoff_yr)
                mx = df[key].values
                my = df[wins_key].values
                # Use scipy curve fit
                coeff, covar_matrix = curve_fit(curve_fit_function, mx, my)
                # testing the null hypothesis:
                # H0: slope = 0
                # H1: slope != 0
                # Test statistic:
                # T0 = b1hat / SE(b1hat)
                # (SE = standard error)
                # Compute standard error,
                # which is the sqrt of the diagonal of the covariance matrix
                variance = np.diagonal(covar_matrix)
                SE = np.sqrt(variance)
                # test statistic
                b0 = coeff[0]
                SE_b0 = SE[0]
                b1 = coeff[1]
                SE_b1 = SE[1]
                T = b1 / SE_b1
                # For effect to be insignificant,
                # and null hypothesis to be accepted,
                # -t_{alpha/2,n-2} < T0 < +t_{alpha/2,n-2}
                # or,
                # | T0 | < t_{alpha/2,n-2}
                #
                # For effect to be significant,
                # and null hypothesis to be rejected,
                # | T0 | >= t_{alpha/2,n-2}
                alpha = [0.10, 0.05]
                Nobs = len(my)
                Nparams = len(coeff)
                # dof = n - 2 for simple linear regression (intercept + slope),
                # i.e. n - Nparams; the old "- 1" here made it n - 3 and
                # disagreed with the t_{alpha/2,n-2} criterion above.
                dof = Nobs - Nparams
                t10, t05 = stats.t.isf(alpha, dof)
                #print(" y = beta_0 + beta_1 x = %0.2g + %0.2g x" % (b0, b1))
                #print(" MSE(beta_0) = %0.2g" % (SE_b0))
                #print(" MSE(beta_1) = %0.2g" % (SE_b1))
                #print(" T = %0.2g" % (T))
                print(" Null Hypothesis Accepted? (10%%) = %0.2f < %0.2f =" % (abs(T), t10), abs(T) < t10)
                #print(" Null Hypothesis Accepted? ( 5%%) = %0.2f < %0.2f =" % (abs(T), t05), abs(T) < t05)
                # Compute residuals
                resid = my - (b0 + b1 * mx)
                # Quantile-quantile and residual plots
                axis1 = axes[league][0]
                axis2 = axes[league][1]
                # Quantile-quantile plot:
                axis1.set_title(league + ": " + label + "-Wins Lin Reg\nQQ/Resid Plots " + str(cutoff_yr[0]) + "-" + str(cutoff_yr[1]))
                sm.qqplot(resid, ax=axis1)
                # Quantile-quantile line:
                osm, _ = stats.probplot(resid)
                quants = osm[0]
                sm.qqline(ax=axis1, line='s', x=quants, y=resid)
                # Residual plot
                axis2.plot(mx, np.abs(resid), 'bo')
            fig.savefig('figs_wins/ALNL_qq_' + key + '_w_linreg_' + str(cutoff_yr[0]) + '-' + str(cutoff_yr[1]) + '.png')
            # Uncomment to display interactively; left commented (matching
            # wins_multivariate_kdes) so the batch loop doesn't block or
            # hold every figure in memory.
            #plt.show()
            plt.close('all')
if __name__ == "__main__":
    # Run both analyses: KDE heatmaps, then regression significance tests.
    wins_multivariate_kdes()
    wins_regression_significance_test()