
updates to team-by-team win stats

master · Charles Reid · 7 years ago · commit adae1ba871

Changed files:
  1. TeamWins.py (31 lines changed)
  2. TeamWinsMultiple.py (208 lines added)
  3. ols.py (212 lines added)

TeamWins.py (31 lines changed)

@@ -11,9 +11,14 @@ import statsmodels.api as sm
start_yr = 1993
start_yr = 1973
end_yr = 2013
#keys = ['H','1B','2B','3B','BB','HR','AB','DP','SLG','SO','BA','OPS','R','RpG','RBI','Fld%']
#labels = ['Hits','Singles','Doubles','Triples','Walks','Home Runs','At Bats','Hit Into Doub Play','Slugging Pct','Strikeouts','Batting Avg','OBP+SP','Runs','Runs per Game','RBIs','Fielding Pct']
keys = ['BB']
labels = ['Walks']

def curve_fit_function(x,b0,b1):
    return b0 + b1*x
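
A minimal sketch (not part of the commit) of how the curve_fit_function added here could be fit with scipy.optimize.curve_fit; the x and y arrays are synthetic stand-ins for a walks column and win percentage, not the repo's dataset:

import numpy as np
from scipy.optimize import curve_fit

def curve_fit_function(x,b0,b1):
    return b0 + b1*x

# synthetic stand-in data (illustrative only)
x = np.linspace(400,700,50)                        # e.g. walks per season
y = 0.2 + 0.0005*x + 0.02*np.random.randn(50)      # e.g. win pct plus noise

popt, pcov = curve_fit(curve_fit_function, x, y)   # least-squares fit
perr = np.sqrt(np.diag(pcov))                      # standard errors of b0, b1
print "b0 = %0.4f +/- %0.4f" % (popt[0], perr[0])
print "b1 = %0.6f +/- %0.6f" % (popt[1], perr[1])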
@@ -49,17 +54,13 @@ def wins_teams_multivariate_kdes():
    by plotting the multivariate KDE
    (this is an eyeball-norm test)
    """
    #keys = ['H','1B','2B','3B','BB','HR','AB','DP','SLG','SO','BA','OPS','R','RpG','RBI','Fld%']
    #labels = ['Hits','Singles','Doubles','Triples','Walks','Home Runs','At Bats','Hit Into Doub Play','Slugging Pct','Strikeouts','Batting Avg','OBP+SP','Runs','Runs per Game','RBIs','Fielding Pct']
    keys = ['H']
    labels = ['Hits']
    cutoff_yrs = [[start_yr,end_yr]]
    wins_key = 'Wpct'
    wins_label = 'Win Pct'
    print wins_label+": Multivariate KDEs"
    for cutoff_yr in cutoff_yrs:
        print "Now on year",cutoff_yr
@@ -121,17 +122,12 @@ def wins_teams_regression_significance_test():
    and determine if the regression
    coefficient is statistically significant
    """
    #keys = ['H','1B','2B','3B','BB','HR','SLG','SO','BA','OPS','RBI','Fld%']
    #labels = ['Hits','Singles','Doubles','Triples','Walks','Home Runs','Slugging Pct','Batting Avg','OBP+SP','RBIs','Fielding Pct']
    keys = ['H']#,'1B','2B','3B','BB','HR','SLG','SO','BA','OPS','RBI','Fld%']
    labels = ['Hits']#,'Singles','Doubles','Triples','Walks','Home Runs','Slugging Pct','Batting Avg','OBP+SP','RBIs','Fielding Pct']
    cutoff_yrs = [[start_yr,end_yr]]
    wins_key = 'Wpct'
    wins_label = 'Win Pct'
    print wins_label+": Regression Significance Test"
    do_plots = True
@@ -146,8 +142,10 @@ def wins_teams_regression_significance_test():
        print " Stat:",label
        lim_x = [min(all_df[wins_key].values),max(all_df[wins_key].values)]
        lim_y = [min(all_df[key].values),max(all_df[key].values)]
        #lim_x = [min(all_df[wins_key].values),max(all_df[wins_key].values)]
        #lim_y = [min(all_df[key].values),max(all_df[key].values)]
        lim_x = [min(all_df[key].values),max(all_df[key].values)]
        lim_y = [min(all_df[wins_key].values),max(all_df[wins_key].values)]
        for team in teams:
for team in teams:
@@ -160,6 +158,8 @@ def wins_teams_regression_significance_test():
            df = all_df[all_df['Team']==team]
            #mx = df[key].values
            #my = df[wins_key].values
            mx = df[wins_key].values
            my = df[key].values
@@ -246,6 +246,5 @@ def wins_teams_regression_significance_test():

if __name__=="__main__":
    #wins_teams_multivariate_kdes()
    wins_teams_multivariate_kdes()
    wins_teams_regression_significance_test()
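
For reference, the slope-significance question this function asks can be answered in one call with scipy.stats.linregress, which returns a two-sided p-value for the slope; a sketch with synthetic per-team data (the repo's loop computes the test statistic manually instead):

import numpy as np
from scipy import stats

# synthetic stand-in for one team: 40 seasons of win pct and hits
wpct = 0.5 + 0.05*np.random.randn(40)
hits = 1400.0 + 2000.0*(wpct-0.5) + 30.0*np.random.randn(40)

slope, intercept, r, p, stderr = stats.linregress(wpct, hits)
print "slope = %0.2f, p-value = %0.4f" % (slope, p)
print "significant at 5%%? %s" % (p < 0.05)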

TeamWinsMultiple.py (208 lines added)

@@ -0,0 +1,208 @@
print "Loading pandas..."
from pandas import *
print "Done loading pandas."

import copy
import ols
import matplotlib.pylab as plt
import numpy as np
from scipy import stats

# from http://wiki.scipy.org/Cookbook/OLS

start_yr = 1973
end_yr = 2013

def load_data(cutoff_yr):
    df = read_csv('data/master_team_batting.csv')

    # Ignore years of baseball strikes
    df = df[ df['Year'] != 1981 ]
    df = df[ df['Year'] != 1994 ]

    # Limit to cutoff years
    df = df[ df['Year'] > cutoff_yr[0] ]
    df = df[ df['Year'] < cutoff_yr[1] ]

    # Add in data about singles
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']

    # Win pct
    df['Wpct'] = df['W']/df['G']

    # Add hits per game columns
    df['1BpG'] = df['1B']/df['G']
    df['2BpG'] = df['2B']/df['G']
    df['3BpG'] = df['3B']/df['G']
    df['HRpG'] = df['HR']/df['G']

    # Add runs per game column
    df['RpG'] = df['R']/df['G']

    return df
def wins_teams_multiple_regression_significance_test():
    cutoff_yrs = [[start_yr,end_yr]]
    wins_key = 'Wpct'
    wins_label = 'Win Pct'

    #keys = ['H','1B','2B','3B','BB','HR','AB','DP','SLG','SO','BA','OPS','R','RpG','RBI','Fld%']
    #labels = ['Hits','Singles','Doubles','Triples','Walks','Home Runs','At Bats','Hit Into Doub Play','Slugging Pct','Strikeouts','Batting Avg','OBP+SP','Runs','Runs per Game','RBIs','Fielding Pct']
    #keys = ['SLG','BA','OPS','RpG','RBI']
    #labels = ['Slugging Pct','Batting Avg','OBP+SP','Runs per Game','RBIs']
    keys = ['1BpG','2BpG','3BpG','HRpG']
    labels = ['Singles','Doubles','Triples','Home Runs']

    for cutoff_yr in cutoff_yrs:
        all_df = load_data(cutoff_yr)

        ####################
        # 1. All data
        print " All Teams:"

        mx = all_df[[key for key in keys]].values

        # weight base hits by number of bases
        for i in [0,1,2,3]:
            mx[:,i] = (i+1)*mx[:,i]

        my = all_df[wins_key].values

        orig_keys = copy.deepcopy(keys)
        orig_labels = copy.deepcopy(labels)

        ## Add this block if you want interaction effects
        ## Block start
        #for i in range(len(orig_keys)):
        #    for j in range(i):
        #        if i != j:
        #            mx = np.append( mx, all_df[orig_keys[i]].values * all_df[orig_keys[j]].values )
        #            keys.append( orig_keys[i] + ' x ' + orig_keys[j] )
        #            labels.append( orig_labels[i] + ' x ' + orig_labels[j] )
        #total_len = np.shape(my)[0]
        #total_dims = len(keys)
        #mx = np.reshape(mx,[total_len,total_dims])
        ## Block end

        all_model = ols.ols(my,mx,wins_label,[lab for lab in labels])
        #all_model.summary()

        SE = all_model.se
        coeff = all_model.b

        # test statistic
        T = coeff/SE

        # For the effect to be insignificant,
        # and the null hypothesis to be accepted,
        # -t_{alpha/2,n-2} < T0 < +t_{alpha/2,n-2}
        # or,
        # | T0 | < t_{alpha/2,n-2}
        #
        # For the effect to be significant,
        # and the null hypothesis to be rejected,
        # | T0 | > t_{alpha/2,n-2}

        alpha = [0.10, 0.05]
        Nobs = all_model.nobs
        Nparams = all_model.ncoef
        dof = Nobs - Nparams - 1
        t10, t05 = stats.t.isf(alpha,dof)

        print " Adj R2 =",all_model.R2adj
        for key,lab,coeff_i,t_i,se_i in zip(keys,labels,coeff[1:],T[1:],SE[1:]):
            print " "+lab+":"
            print " Coeff =",coeff_i
            print " SE =",se_i
            print " T0 =",t_i
            print " Null Hypothesis Accepted? (10%%) = %0.2f < %0.2f ="%(abs(t_i),t10), abs(t_i) < t10
            print " Null Hypothesis Accepted? ( 5%%) = %0.2f < %2.4f ="%(abs(t_i),t05), abs(t_i) < t05
            print "\n\n"

        print "="*40
        print "="*40
        print "="*40
        print "\n\n"
        fig = plt.figure()
        for i in range(4):
            ip1 = i + 1
            ax = fig.add_subplot(1,4,ip1)
            ax.plot(mx[:,i],my,'bo')
            ax.set_title(labels[i])
        #plt.draw()
        plt.show()
        ### ####################
        ### # 2. Team-by-team data
        ### teams = unique(all_df['Team'])
        ###
        ### for team in teams:
        ###     print " Team:",team
        ###     df = all_df[all_df['Team']==team]
        ###     mx = df[[key for key in orig_keys]].values
        ###     my = df[wins_key].values
        ###     ## Add this if you want interaction effects
        ###     #for i in range(len(orig_keys)):
        ###     #    for j in range(i):
        ###     #        if i != j:
        ###     #            mx = np.append( mx, all_df[orig_keys[i]].values * all_df[orig_keys[j]].values )
        ###     #            keys.append( orig_keys[i] + ' x ' + orig_keys[j] )
        ###     #            labels.append( orig_labels[i] + ' x ' + orig_labels[j] )
        ###     #total_len = np.shape(my)[0]
        ###     #total_dims = len(keys)
        ###     #mx = np.reshape(mx,[total_len,total_dims])
        ###     team_model = ols.ols(my,mx,wins_label,[lab for lab in orig_labels])
        ###     #team_model.summary()
        ###     SE = team_model.se
        ###     coeff = team_model.b
        ###     # test statistic
        ###     T = coeff/SE
        ###     alpha = [0.10, 0.05]
        ###     Nobs = team_model.nobs
        ###     Nparams = team_model.ncoef
        ###     dof = Nobs - Nparams - 1
        ###     t10, t05 = stats.t.isf(alpha,dof)
        ###     print " Adj R2 =",team_model.R2adj
        ###     for key,lab,coeff_i,t_i,se_i in zip(keys,labels,coeff[1:],T[1:],SE[1:]):
        ###         print " "+lab+":"
        ###         print " Coeff =",coeff_i
        ###         print " SE =",se_i
        ###         print " T0 =",t_i
        ###         print " Null Hypothesis Accepted? (10%%) = %0.2f < %0.2f ="%(abs(t_i),t10), abs(t_i) < t10
        ###         print " Null Hypothesis Accepted? ( 5%%) = %0.2f < %2.4f ="%(abs(t_i),t05), abs(t_i) < t05
        ###         print "\n\n"
if __name__=="__main__":
    wins_teams_multiple_regression_significance_test()
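
A standalone sketch of the decision rule used above: compare |T0| = |coeff/SE| against the two-sided critical value t_{alpha/2,dof}. All numbers here are hypothetical, and the critical value uses alpha/2, matching the t_{alpha/2,n-2} form in the file's comments:

from scipy import stats

coeff, se = 0.004, 0.0015                  # hypothetical coefficient and SE
dof = 75                                   # hypothetical degrees of freedom
T0 = coeff/se                              # test statistic

for alpha in [0.10, 0.05]:
    tcrit = stats.t.isf(alpha/2.0, dof)    # upper alpha/2 critical value
    print "alpha=%0.2f: |T0|=%0.2f vs t=%0.2f, reject null? %s" % (alpha, abs(T0), tcrit, abs(T0) > tcrit)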

ols.py (212 lines added)

@@ -0,0 +1,212 @@
from __future__ import division
from scipy import c_, ones, dot, stats, diff
from scipy.linalg import inv, solve, det
from numpy import log, pi, sqrt, square, diagonal
from numpy.random import randn, seed
import time
# from http://wiki.scipy.org/Cookbook/OLS
class ols:
    """
    Author: Vincent Nijs (+ ?)

    Email: v-nijs at kellogg.northwestern.edu

    Last Modified: Mon Jan 15 17:56:17 CST 2007

    Dependencies: See import statement at the top of this file

    Doc: Class for multi-variate regression using OLS

    For usage examples of other class methods see the class tests at the bottom
    of this file. To see the class in action simply run this file using
    'python ols.py'. This will generate some simulated data and run various
    analyses. If you have rpy installed the same model will also be estimated
    by R for confirmation.

    Input:
        y = dependent variable
        y_varnm = string with the variable label for y
        x = independent variables, note that a constant is added by default
        x_varnm = string or list of variable labels for the independent variables

    Output:
        There are no values returned by the class. Summary provides printed output.
        All other measures can be accessed as follows:

        Step 1: Create an OLS instance by passing data to the class

            m = ols(y,x,y_varnm = 'y',x_varnm = ['x1','x2','x3','x4'])

        Step 2: Get specific metrics

            To print the coefficients:
                >>> print m.b
            To print the coefficients p-values:
                >>> print m.p
    """
    def __init__(self,y,x,y_varnm = 'y',x_varnm = ''):
        """
        Initializing the ols class.
        """
        self.y = y
        self.x = c_[ones(x.shape[0]),x]
        self.y_varnm = y_varnm
        if not isinstance(x_varnm,list):
            self.x_varnm = ['const'] + list(x_varnm)
        else:
            self.x_varnm = ['const'] + x_varnm

        # Estimate model using OLS
        self.estimate()

    def estimate(self):
        # estimating coefficients, and basic stats
        self.inv_xx = inv(dot(self.x.T,self.x))
        xy = dot(self.x.T,self.y)
        self.b = dot(self.inv_xx,xy)                    # estimate coefficients

        self.nobs = self.y.shape[0]                     # number of observations
        self.ncoef = self.x.shape[1]                    # number of coef.
        self.df_e = self.nobs - self.ncoef              # degrees of freedom, error
        self.df_r = self.ncoef - 1                      # degrees of freedom, regression

        self.e = self.y - dot(self.x,self.b)            # residuals
        self.sse = dot(self.e,self.e)/self.df_e         # estimated error variance (SSE/df_e)
        self.se = sqrt(diagonal(self.sse*self.inv_xx))  # coef. standard errors
        self.t = self.b / self.se                       # coef. t-statistics
        self.p = (1-stats.t.cdf(abs(self.t), self.df_e)) * 2 # coef. p-values

        self.R2 = 1 - self.e.var()/self.y.var()         # model R-squared
        self.R2adj = 1-(1-self.R2)*((self.nobs-1)/(self.nobs-self.ncoef)) # adjusted R-square

        self.F = (self.R2/self.df_r) / ((1-self.R2)/self.df_e)  # model F-statistic
        self.Fpv = 1-stats.f.cdf(self.F, self.df_r, self.df_e)  # F-statistic p-value
    def dw(self):
        """
        Calculates the Durbin-Watson statistic
        """
        de = diff(self.e,1)
        dw = dot(de,de) / dot(self.e,self.e)
        return dw

    def omni(self):
        """
        Omnibus test for normality
        """
        return stats.normaltest(self.e)

    def JB(self):
        """
        Calculate residual skewness, kurtosis, and do the JB test for normality
        """
        # Calculate residual skewness and kurtosis
        skew = stats.skew(self.e)
        kurtosis = 3 + stats.kurtosis(self.e)

        # Calculate the Jarque-Bera test for normality
        JB = (self.nobs/6) * (square(skew) + (1/4)*square(kurtosis-3))
        JBpv = 1-stats.chi2.cdf(JB,2)

        return JB, JBpv, skew, kurtosis

    def ll(self):
        """
        Calculate model log-likelihood and two information criteria
        """
        # Model log-likelihood, AIC, and BIC criterion values
        ll = -(self.nobs*1/2)*(1+log(2*pi)) - (self.nobs/2)*log(dot(self.e,self.e)/self.nobs)
        aic = -2*ll/self.nobs + (2*self.ncoef/self.nobs)
        bic = -2*ll/self.nobs + (self.ncoef*log(self.nobs))/self.nobs
        return ll, aic, bic
    def summary(self):
        """
        Printing model output to screen
        """
        # local time & date
        t = time.localtime()

        # extra stats
        ll, aic, bic = self.ll()
        JB, JBpv, skew, kurtosis = self.JB()
        omni, omnipv = self.omni()

        # printing output to screen
        print '\n=============================================================================='
        print "Dependent Variable: " + self.y_varnm
        print "Method: Least Squares"
        print "Date: ", time.strftime("%a, %d %b %Y",t)
        print "Time: ", time.strftime("%H:%M:%S",t)
        print '# obs: %5.0f' % self.nobs
        print '# variables: %5.0f' % self.ncoef
        print '=============================================================================='
        print 'variable     coefficient     std. Error      t-statistic     prob.'
        print '=============================================================================='
        for i in range(len(self.x_varnm)):
            print '''% -5s % -5.6f % -5.6f % -5.6f % -5.6f''' % tuple([self.x_varnm[i],self.b[i],self.se[i],self.t[i],self.p[i]])
        print '=============================================================================='
        print 'Models stats                         Residual stats'
        print '=============================================================================='
        print 'R-squared            % -5.6f         Durbin-Watson stat  % -5.6f' % tuple([self.R2, self.dw()])
        print 'Adjusted R-squared   % -5.6f         Omnibus stat        % -5.6f' % tuple([self.R2adj, omni])
        print 'F-statistic          % -5.6f         Prob(Omnibus stat)  % -5.6f' % tuple([self.F, omnipv])
        print 'Prob (F-statistic)   % -5.6f         JB stat             % -5.6f' % tuple([self.Fpv, JB])
        print 'Log likelihood       % -5.6f         Prob(JB)            % -5.6f' % tuple([ll, JBpv])
        print 'AIC criterion        % -5.6f         Skew                % -5.6f' % tuple([aic, skew])
        print 'BIC criterion        % -5.6f         Kurtosis            % -5.6f' % tuple([bic, kurtosis])
        print '=============================================================================='
if __name__ == '__main__':

    ##########################
    ### testing the ols class
    ##########################

    # creating simulated data and variable labels
    seed(1)
    data = randn(100,5)            # the data array

    # intercept is added, by default
    m = ols(data[:,0],data[:,1:],y_varnm = 'y',x_varnm = ['x1','x2','x3','x4'])
    m.summary()

    # if you have rpy installed, use it to test the results
    have_rpy = False
    try:
        print "\n"
        print "="*30
        print "Validating OLS results in R"
        print "="*30
        import rpy
        have_rpy = True
    except ImportError:
        print "\n"
        print "="*30
        print "Validating OLS-class results in R"
        print "="*30
        print "rpy is not installed"
        print "="*30

    if have_rpy:
        y = data[:,0]
        x1 = data[:,1]
        x2 = data[:,2]
        x3 = data[:,3]
        x4 = data[:,4]
        rpy.set_default_mode(rpy.NO_CONVERSION)
        linear_model = rpy.r.lm(rpy.r("y ~ x1 + x2 + x3 + x4"), data = rpy.r.data_frame(x1=x1,x2=x2,x3=x3,x4=x4,y=y))
        rpy.set_default_mode(rpy.BASIC_CONVERSION)
        print linear_model.as_py()['coefficients']
        summary = rpy.r.summary(linear_model)
        print summary
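
Since rpy is dated, a possible alternative cross-check of this ols class is against statsmodels, which TeamWins.py already imports; this is a sketch, not part of the commit. sm.OLS with sm.add_constant mirrors the constant column added in __init__:

import numpy as np
import statsmodels.api as sm

np.random.seed(1)
data = np.random.randn(100,5)
y, x = data[:,0], data[:,1:]

m = ols(y, x, y_varnm='y', x_varnm=['x1','x2','x3','x4'])
fit = sm.OLS(y, sm.add_constant(x)).fit()      # same model via statsmodels

print "cookbook coefficients :", m.b
print "statsmodels params    :", fit.params    # should agree to machine precision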