Sabermetrics scripts for some old blog posts analyzing the 2014 Oakland A's.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

267 lines
8.2 KiB

print "Loading pandas..."
from pandas import *
print "Done loading pandas."
import re
import matplotlib.pylab as plt
import numpy as np
from scipy import stats
from scipy.optimize import curve_fit
import statsmodels.api as sm
start_yr = 1973
end_yr = 2013
keys = ['H','1B','2B','3B','BB','HR','AB','DP','SLG','SO','BA','OPS','R','RpG','RBI','Fld%']
labels = ['Hits','Singles','Doubles','Triples','Walks','Home Runs','At Bats','Hit Into Doub Play','Slugging Pct','Strikeouts','Batting Avg','OBP+SP','Runs','Runs per Game','RBIs','Fielding Pct']
#keys = ['BB']
#labels = ['Walks']
#keys = ['WOBA']
#labels = ['Weighted On Base Avg']
wins_key = 'Wpct'
wins_label = 'Win Pct'
def curve_fit_function(x,b0,b1):
return b0 + b1*x
def load_data(cutoff_yr):
df = read_csv('data/master_team_batting.csv')
# Ignore years of baseball strikes
df = df[ df['Year'] != 1981 ]
df = df[ df['Year'] != 1994 ]
# Limit to cutoff years
df = df[ df['Year'] > cutoff_yr[0] ]
df = df[ df['Year'] < cutoff_yr[1] ]
# Add in data about singles
df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']
# Win pct
df['Wpct'] = df['W']/df['G']
# Add runs per game column
df['RpG'] = df['R']/df['G']
# woba http://www.fangraphs.com/library/offense/woba/
df['WOBA'] = (0.7 * df['BB'] + 0.9 * df['1B'] + 1.2 * df['2B'] + 1.6 * df['3B'] + 2.1 * df['HR'])/( df['AB'] + df['BB'] )
return df
def wins_teams_multivariate_kdes():
"""
Look at how different stats are distributed
team-by-team versus wins, from cutoff_yr to present,
by plotting the multivariate KDE
(this is an eyeball-norm test)
"""
cutoff_yrs = [[start_yr,end_yr]]
print wins_label+": Multivariate KDEs"
for cutoff_yr in cutoff_yrs:
print "Now on year",cutoff_yr
all_df = load_data(cutoff_yr)
teams = unique(all_df['Team'])
for key, label in zip(keys,labels):
print " Stat:",label
lim_x = [min(all_df[key].values),max(all_df[key].values)]
lim_y = [min(all_df[wins_key].values),max(all_df[wins_key].values)]
for team in teams:
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111)
print " Team:",team
df = all_df[all_df['Team']==team]
mx = df[key].values
my = df[wins_key].values
xmin = lim_x[0]#mx.min()
xmax = lim_x[1]#mx.max()
ymin = lim_y[0]#my.min()
ymax = lim_y[1]#my.max()
X, Y = np.mgrid[xmin:xmax:200j, ymin:ymax:200j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([mx, my])
kernel = stats.gaussian_kde(values)
Z = np.reshape(kernel(positions).T, X.shape)
ax.imshow(np.rot90(Z), cmap=plt.cm.jet,
extent=[xmin, xmax, ymin, ymax],
aspect='auto')
ax.plot(mx, my, 'k.', markersize=6)
ax.set_xlabel(label)
ax.set_ylabel(wins_label)
ax.set_title(team.upper()+": "+label+"-"+wins_label+" Joint PDF, "+str(cutoff_yr[0])+"-"+str(cutoff_yr[1]))
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
plt.show()
fig.savefig('figs_teamwins/teamwins_multivariate_kde_'+team+'_'+key+'_w_'+str(cutoff_yr[0])+'-'+str(cutoff_yr[1]))
plt.close('all')
def wins_teams_regression_significance_test():
"""
Do team-by-team regression for wins vs. X,
and determine if the regression
coefficient is statistically significant
"""
cutoff_yrs = [[start_yr,end_yr]]
print wins_label+": Regression Significance Test"
do_plots = False#True
test_df = DataFrame()
for cutoff_yr in cutoff_yrs:
print "Now on year",cutoff_yr
all_df = load_data(cutoff_yr)
teams = unique(all_df['Team'])
for key, label in zip(keys,labels):
print " Stat:",label
lim_x = [min(all_df[key].values),max(all_df[key].values)]
lim_y = [min(all_df[wins_key].values),max(all_df[wins_key].values)]
for team in teams:
if do_plots:
fig = plt.figure(figsize=(6,10))
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)
print " Team:",team
df = all_df[all_df['Team']==team]
mx = df[key].values
my = df[wins_key].values
xmin = mx.min()
xmax = mx.max()
ymin = my.min()
ymax = my.max()
# Use scipy curve fit
coeff, covar_matrix = curve_fit(curve_fit_function,mx,my)
# testing the null hypothesis:
# H0: slope = 0
# H1: slope != 0
# Test statistic:
# T0 = b1hat / SE(b1hat)
# (SE = standard error)
# Compute standard error,
# which is the sqrt of the covariance matrix
variance = np.diagonal(covar_matrix)
SE = np.sqrt(variance)
# test statistic
b0 = coeff[0]
SE_b0 = SE[0]
b1 = coeff[1]
SE_b1 = SE[1]
T = b1/SE_b1
# For effect to be insigificant,
# and null hypothesis to be accepted,
# -t_{alpha/2,n-2} < T0 < +t_{alpha/2,n-2}
# or,
# | T0 | < t_{alpha/2,n-2}
#
#
# For effect to be significant,
# and null hypothesis to be rejected,
# | T0 | > t_{alpha/2,n-2}
alpha = [0.10, 0.05]
Nobs = len(my)
Nparams = len(coeff)
dof = Nobs - Nparams - 1
t10, t05 = stats.t.isf(alpha,dof)
#print " y = beta_0 + beta_1 x = %0.2g + %0.2g x"%(b0,b1)
#print " MSE(beta_0) = %0.2g"%(SE_b0)
#print " MSE(beta_1) = %0.2g"%(SE_b1)
#print " T = %0.2g"%(T)
print " Null Hypothesis Accepted? (10%%) = %0.2f < %0.2f ="%(abs(T),t10), abs(T) < t10
#print " Null Hypothesis Accepted? ( 5%%) = %0.2f < %0.2f ="%(abs(T),t05), abs(T) < t05
d = {'Team':team,
'Stat':key,
'WinPct Mean': df[wins_key].mean(),
'WinPct Med': df[wins_key].mean(),
'WinPct Std': df[wins_key].std(),
'Stat Label':label,
'T Value': T,
'T Threshold': t10,
'SE' : SE_b1,
'Accept H0' : abs(T) < t10
}
test_df_item = DataFrame([d])
test_df = concat([test_df,test_df_item],ignore_index=True)
# Compute residuals
resid = my - (b0 + b1*mx)
if do_plots:
# Quantile-quantile and residual plots
ax1.set_title(team.upper()+": "+label+"-Wins Lin Reg\nQQ/Resid Plots "+str(cutoff_yr[0])+"-"+str(cutoff_yr[1]))
sm.qqplot(resid,ax=ax1)
# Quantile-quantile line:
osm, _ = stats.probplot(resid)
quants = osm[0]
sm.qqline(ax=ax1,line='s',x=quants,y=resid)
# Residual plot
ax2.plot(mx,np.abs(resid),'bo')
ax2.set_xlabel('Win Pct')
ax2.set_ylabel('Residual')
if do_plots:
fig.savefig('figs_teamwins/teamwins_qq_'+team+'_'+key+'_w_linreg_'+str(cutoff_yr[0])+'-'+str(cutoff_yr[1])+'.png')
plt.show()
plt.close('all')
# save stat test df
test_df.to_csv('data/teamwins_hypothesis_test_df.csv',index=False)
if __name__=="__main__":
#wins_teams_multivariate_kdes()
wins_teams_regression_significance_test()