Sabermetrics scripts for some old blog posts analyzing the 2014 Oakland A's.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

208 lines
5.9 KiB

print "Loading pandas..."
from pandas import *
print "Done loading pandas."
import copy
import ols
import matplotlib.pylab as plt
import numpy as np
from scipy import stats
# from http://wiki.scipy.org/Cookbook/OLS
# Season range passed to load_data() by
# wins_teams_multiple_regression_significance_test().
# load_data() filters with strict '>' and '<' comparisons, so both endpoint
# years themselves are excluded from the analyzed data.
start_yr = 1973
end_yr = 2013
def load_data(cutoff_yr):
df = read_csv('data/master_team_batting.csv')
# Ignore years of baseball strikes
df = df[ df['Year'] != 1981 ]
df = df[ df['Year'] != 1994 ]
# Limit to cutoff years
df = df[ df['Year'] > cutoff_yr[0] ]
df = df[ df['Year'] < cutoff_yr[1] ]
# Add in data about singles
df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']
# Win pct
df['Wpct'] = df['W']/df['G']
# Add hits per game columns
df['1BpG'] = df['1B']/df['G']
df['2BpG'] = df['2B']/df['G']
df['3BpG'] = df['3B']/df['G']
df['HRpG'] = df['HR']/df['G']
# Add runs per game column
df['RpG'] = df['R']/df['G']
return df
def wins_teams_multiple_regression_significance_test():
    """Regress win percentage on per-game hit rates and t-test each slope.

    For the season range ``[start_yr, end_yr]`` this loads the team batting
    table with ``load_data``, fits ``ols.ols`` with ``Wpct`` as the response
    and singles, triples and home runs per game as predictors, and prints:

    * the model's ``R2adj`` value,
    * each predictor's coefficient, standard error and t statistic,
    * whether the null hypothesis (coefficient == 0) is accepted by a
      two-sided t-test at the 10% and 5% significance levels.

    It then scatter-plots win percentage against each predictor.

    Takes no arguments and returns ``None``; all results go to stdout and to
    a matplotlib figure.  Output uses single-argument ``print(...)`` calls,
    which emit the same text under Python 2 and Python 3.
    """
    cutoff_yrs = [[start_yr, end_yr]]

    wins_key = 'Wpct'
    wins_label = 'Win Pct'

    # Doubles ('2BpG') are currently excluded from the model; the
    # four-predictor variant is kept here for reference.
    #keys = ['1BpG','2BpG','3BpG','HRpG']
    #labels = ['Singles','Doubles','Triples','Home Runs']
    keys = ['1BpG', '3BpG', 'HRpG']
    labels = ['Singles', 'Triples', 'Home Runs']

    # Significance levels of the two-sided tests reported below.
    alpha = np.array([0.10, 0.05])

    for cutoff_yr in cutoff_yrs:

        all_df = load_data(cutoff_yr)

        ####################
        # 1. All data
        print(" All Teams:")

        mx = all_df[keys].values      # predictors, one column per key
        my = all_df[wins_key].values  # response

        ## weight base hits by number of bases
        #for i in [0,1,2,3]:
        #    mx[:,i] = (i+1)*mx[:,i]

        # Snapshots of the main-effect names.  Only the optional,
        # commented-out interaction and team-by-team code reads them.
        orig_keys = list(keys)
        orig_labels = list(labels)

        ## Add this block if you want interaction effects
        ## NOTE(review): np.append without an axis flattens both arrays, so
        ## the reshape below would not line the new columns up -- verify
        ## (e.g. use np.column_stack) before enabling.
        ## Block start
        #for i in range(len(orig_keys)):
        #    for j in range(i):
        #        if i <> j:
        #            mx = np.append( mx, all_df[orig_keys[i]].values * all_df[orig_keys[j]].values )
        #            keys.append( orig_keys[i] + ' x ' + orig_keys[j] )
        #            labels.append( orig_labels[i] + ' x ' + orig_labels[j] )
        #total_len = np.shape(my)[0]
        #total_dims = len(keys)
        #mx = np.reshape(mx,[total_len,total_dims])
        ## Block end

        all_model = ols.ols(my, mx, wins_label, list(labels))
        #all_model.summary()

        std_err = all_model.se
        coeff = all_model.b

        # test statistic
        t_stat = coeff / std_err

        # Two-sided t-test of H0: coefficient == 0, with T0 = coeff / SE.
        #
        # For the effect to be insignificant, and H0 to be accepted,
        #   -t_{alpha/2,dof} < T0 < +t_{alpha/2,dof}
        # or,
        #   | T0 | < t_{alpha/2,dof}
        #
        # For the effect to be significant, and H0 to be rejected,
        #   | T0 | > t_{alpha/2,dof}
        n_obs = all_model.nobs
        n_params = all_model.ncoef
        # NOTE(review): if ols.ols is the SciPy Cookbook class, ncoef already
        # counts the intercept, which would make this one lower than
        # nobs - ncoef -- confirm against ols.py.
        dof = n_obs - n_params - 1

        # alpha/2 in each tail: isf(alpha) alone would give the one-sided
        # critical value and make the 10% / 5% labels below wrong.
        t10, t05 = stats.t.isf(alpha / 2.0, dof)

        print(" Adj R2 = %s" % (all_model.R2adj,))

        # Index 0 of b / se is skipped -- presumably the intercept term that
        # ols.ols adds ahead of the supplied predictors.
        for lab, coeff_i, t_i, se_i in zip(labels, coeff[1:], t_stat[1:], std_err[1:]):
            abs_t = abs(t_i)
            print(" " + lab + ":")
            print(" Coeff = %s" % (coeff_i,))
            # se_i is the coefficient's standard error (previously printed
            # under the misleading label "MSE").
            print(" SE = %s" % (se_i,))
            print(" T0 = %s" % (t_i,))
            print(" Null Hypothesis Accepted? (10%%) = %0.2f < %0.2f = %s"
                  % (abs_t, t10, abs_t < t10))
            print(" Null Hypothesis Accepted? ( 5%%) = %0.2f < %2.4f = %s"
                  % (abs_t, t05, abs_t < t05))

        # NOTE(review): the source's indentation was lost; the separator is
        # assumed to be printed once, after the per-coefficient report.
        print("\n\n")
        for _ in range(3):
            print("=" * 40)
        print("\n\n")

        # One scatter panel per predictor; the grid width follows the number
        # of predictors instead of a hard-coded 3 panels on a 1x4 grid.
        fig = plt.figure()
        n_panels = len(labels)
        for i, lab in enumerate(labels):
            ax = fig.add_subplot(1, n_panels, i + 1)
            ax.plot(mx[:, i], my, 'bo')
            ax.set_title(lab)

        plt.draw()
        plt.show()
### ####################
### # 2. Team-by-team data
### teams = unique(all_df['Team'])
###
### for team in teams:
### print " Team:",team
### df = all_df[all_df['Team']==team]
### mx = df[[key for key in orig_keys]].values
### my = df[wins_key].values
### ## Add this if you want interaction effects
### #for i in range(len(orig_keys)):
### # for j in range(i):
### # if i <> j:
### # mx = np.append( mx, all_df[orig_keys[i]].values * all_df[orig_keys[j]].values )
### # keys.append( orig_keys[i] + ' x ' + orig_keys[j] )
### # labels.append( orig_labels[i] + ' x ' + orig_labels[j] )
### #total_len = np.shape(my)[0]
### #total_dims = len(keys)
### #mx = np.reshape(mx,[total_len,total_dims])
### team_model = ols.ols(my,mx,wins_label,[lab for lab in orig_labels])
### #team_model.summary()
### SE = team_model.se
### coeff = team_model.b
### # test statistic
### T = coeff/SE
### alpha = [0.10, 0.05]
### Nobs = team_model.nobs
### Nparams = team_model.ncoef
### dof = Nobs - Nparams - 1
### t10, t05 = stats.t.isf(alpha,dof)
### print " Adj R2 =",team_model.R2adj
### for key,lab,coeff_i,t_i,se_i in zip(keys,labels,coeff[1:],T[1:],SE[1:]):
### print " "+lab+":"
### print " Coeff =",coeff_i
### print " MSE =",se_i
### print " T0 =",t_i
### print " Null Hypothesis Accepted? (10%%) = %0.2f < %0.2f ="%(abs(t_i),t10), abs(t_i) < t10
### print " Null Hypothesis Accepted? ( 5%%) = %0.2f < %2.4f ="%(abs(t_i),t05), abs(t_i) < t05
### print "\n\n"
# Script entry point: run the league-wide regression report.
if __name__ == "__main__":
    wins_teams_multiple_regression_significance_test()