Browse Source

Update the wins analysis script: add linear regression, Q-Q plots, residual plots, etc.

master
Charles Reid 7 years ago
parent
commit
e0b73486ae
  1. 4
      Regression.py
  2. 245
      Wins.py
  3. BIN
      figs/ALNL_multivariate_kde_1B_w_since1950.png
  4. BIN
      figs/ALNL_multivariate_kde_1B_w_since1970.png
  5. BIN
      figs/ALNL_multivariate_kde_1B_w_since2000.png
  6. BIN
      figs/ALNL_multivariate_kde_2B_w_since1950.png
  7. BIN
      figs/ALNL_multivariate_kde_2B_w_since1970.png
  8. BIN
      figs/ALNL_multivariate_kde_2B_w_since2000.png
  9. BIN
      figs/ALNL_multivariate_kde_3B_w_since1950.png
  10. BIN
      figs/ALNL_multivariate_kde_3B_w_since1970.png
  11. BIN
      figs/ALNL_multivariate_kde_3B_w_since2000.png
  12. BIN
      figs/ALNL_multivariate_kde_AB_w_since1950.png
  13. BIN
      figs/ALNL_multivariate_kde_AB_w_since1970.png
  14. BIN
      figs/ALNL_multivariate_kde_AB_w_since2000.png
  15. BIN
      figs/ALNL_multivariate_kde_BA_w_since1950.png
  16. BIN
      figs/ALNL_multivariate_kde_BA_w_since1970.png
  17. BIN
      figs/ALNL_multivariate_kde_BA_w_since2000.png
  18. BIN
      figs/ALNL_multivariate_kde_BB_w_since1950.png
  19. BIN
      figs/ALNL_multivariate_kde_BB_w_since1970.png
  20. BIN
      figs/ALNL_multivariate_kde_BB_w_since2000.png
  21. BIN
      figs/ALNL_multivariate_kde_DP_w_since1950.png
  22. BIN
      figs/ALNL_multivariate_kde_DP_w_since1970.png
  23. BIN
      figs/ALNL_multivariate_kde_DP_w_since2000.png
  24. BIN
      figs/ALNL_multivariate_kde_Fld%_w_since1950.png
  25. BIN
      figs/ALNL_multivariate_kde_Fld%_w_since1970.png
  26. BIN
      figs/ALNL_multivariate_kde_Fld%_w_since2000.png
  27. BIN
      figs/ALNL_multivariate_kde_HR_w_since1950.png
  28. BIN
      figs/ALNL_multivariate_kde_HR_w_since1970.png
  29. BIN
      figs/ALNL_multivariate_kde_HR_w_since2000.png
  30. BIN
      figs/ALNL_multivariate_kde_H_w_since1950.png
  31. BIN
      figs/ALNL_multivariate_kde_H_w_since1970.png
  32. BIN
      figs/ALNL_multivariate_kde_H_w_since2000.png
  33. BIN
      figs/ALNL_multivariate_kde_OPS_w_since1950.png
  34. BIN
      figs/ALNL_multivariate_kde_OPS_w_since1970.png
  35. BIN
      figs/ALNL_multivariate_kde_OPS_w_since2000.png
  36. BIN
      figs/ALNL_multivariate_kde_RBI_w_since1950.png
  37. BIN
      figs/ALNL_multivariate_kde_RBI_w_since1970.png
  38. BIN
      figs/ALNL_multivariate_kde_RBI_w_since2000.png
  39. BIN
      figs/ALNL_multivariate_kde_R_w_since1950.png
  40. BIN
      figs/ALNL_multivariate_kde_R_w_since1970.png
  41. BIN
      figs/ALNL_multivariate_kde_R_w_since2000.png
  42. BIN
      figs/ALNL_multivariate_kde_RpG_w_since1950.png
  43. BIN
      figs/ALNL_multivariate_kde_RpG_w_since1970.png
  44. BIN
      figs/ALNL_multivariate_kde_RpG_w_since2000.png
  45. BIN
      figs/ALNL_multivariate_kde_SLG_w_since1950.png
  46. BIN
      figs/ALNL_multivariate_kde_SLG_w_since1970.png
  47. BIN
      figs/ALNL_multivariate_kde_SLG_w_since2000.png
  48. BIN
      figs/ALNL_multivariate_kde_SO_w_since1950.png
  49. BIN
      figs/ALNL_multivariate_kde_SO_w_since1970.png
  50. BIN
      figs/ALNL_multivariate_kde_SO_w_since2000.png
  51. 14
      plot_bivariate_normal_dist.py

4
Regression.py

@ -57,11 +57,11 @@ class OneDLinearRegression(OneDRegression):
self.r2 = 1 - ( resid/(len(y)*y.var()) )
def slope(self):
def get_slope(self):
    """Return the slope stored on this regression object."""
    fitted_slope = self.slope
    return fitted_slope
def intercept(self):
def get_intercept(self):
    """Return the intercept stored on this regression object."""
    fitted_intercept = self.intercept
    return fitted_intercept

245
Wins.py

@ -6,104 +6,227 @@ import re
import matplotlib.pylab as plt
import numpy as np
from scipy import stats
from scipy.optimize import curve_fit
import statsmodels.api as sm
def curve_fit_function(x,b0,b1):
    """Straight-line model: intercept b0 plus slope b1 times x."""
    slope_term = b1*x
    return b0 + slope_term
def load_data(cutoff_yr,league):
    """
    Load the team batting table for one league over a span of seasons.

    cutoff_yr : either a [start, end] pair, keeping seasons with
                start < Year < end (both bounds exclusive), or a single
                year, keeping seasons with Year > cutoff_yr
    league    : pattern matched against the 'Lg' column (e.g. 'AL')

    Returns a DataFrame with three derived columns added:
    '1B' (singles), 'Wpct' (W/G) and 'RpG' (R/G).
    """
    # A bare year cannot be unpacked, so it is treated as a lower bound only.
    try:
        start_yr, end_yr = cutoff_yr
    except TypeError:
        start_yr, end_yr = cutoff_yr, None

    df = read_csv('data/master_team_batting.csv')
    year = df['Year']

    # Remove incomplete season
    keep = year < 2014

    # Ignore years of baseball strikes
    keep &= (year != 1981) & (year != 1994)

    # Limit to cutoff years
    keep &= year > start_yr
    if end_yr is not None:
        keep &= year < end_yr

    # Limit to the requested league; rows with a missing 'Lg' never match
    keep &= df['Lg'].str.contains(league, na=False)

    # Copy so the derived columns are written to an independent frame
    df = df[keep].copy()

    # Add in data about singles
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']

    # Win pct
    df['Wpct'] = df['W']/df['G']

    # Add runs per game column
    df['RpG'] = df['R']/df['G']

    return df
def load_AL_data(cutoff_yr):
    """Return load_data's result for the 'AL' league; cutoff_yr is passed through unchanged."""
    # Load exactly once: the table is read from disk on every load_data call.
    return load_data(cutoff_yr,'AL')
def load_NL_data(cutoff_yr):
    """Return load_data's result for the 'NL' league; cutoff_yr is passed through unchanged."""
    # Load exactly once: the table is read from disk on every load_data call.
    return load_data(cutoff_yr,'NL')
keys = ['H','1B','2B','3B','BB','HR','AB','DP','SLG','SO','BA','OPS','R','RpG','RBI','Fld%']
labels = ['Hits','Singles','Doubles','Triples','Walks','Home Runs','At Bats','Hit Into Double Plays','Slugging Pct','Strikeouts','Batting Avg','OBP+SP','Runs','Runs per Game','RBIs','Fielding Pct']
cutoff_yrs = [1950, 1970, 2000]
for cutoff_yr in [1950, 1970, 2000]:
print "Now on year",cutoff_yr
def wins_multivariate_kdes():
    """
    Look at how different stats are distributed
    versus win pct, over each [start, end] year range,
    by plotting the multivariate KDE
    (this is an eyeball-norm test)

    One figure is saved under figs/ per (year range, stat) pair, with
    the AL joint density in the left panel and the NL one in the right.
    """

    keys = ['H','1B','2B','3B','BB','HR','AB','DP','SLG','SO','BA','OPS','R','RpG','RBI','Fld%']
    labels = ['Hits','Singles','Doubles','Triples','Walks','Home Runs','At Bats','Hit Into Doub Play','Slugging Pct','Strikeouts','Batting Avg','OBP+SP','Runs','Runs per Game','RBIs','Fielding Pct']

    cutoff_yrs = [[1970,2000],[2000,2013]]

    wins_key = 'Wpct'
    wins_label = 'Win Pct'

    leagues = ['AL','NL']
    load_data_function = { 'AL' : load_AL_data,
                           'NL' : load_NL_data }

    print(wins_label+": Multivariate KDEs")

    # ensure we have consistent axes across all cutoff years:
    # the plot limits always come from the first year range
    lim_dfs = {}
    for league in leagues:
        lim_dfs[league] = load_data_function[league](cutoff_yrs[0])

    for cutoff_yr in cutoff_yrs:

        print("Now on year %s" % (cutoff_yr,))
        yr_range = str(cutoff_yr[0])+"-"+str(cutoff_yr[1])

        # Load each league once per year range rather than once per stat
        dfs = {}
        for league in leagues:
            dfs[league] = load_data_function[league](cutoff_yr)

        for key, label in zip(keys,labels):

            print(" Stat: %s" % label)

            fig = plt.figure(figsize=(12,5))

            ax1 = fig.add_subplot(121)
            ax2 = fig.add_subplot(122)

            axes = { 'AL' : ax1,
                     'NL' : ax2 }

            for league in leagues:

                df = dfs[league]
                ax = axes[league]

                mx = df[wins_key].values
                my = df[key].values

                lim_x = lim_dfs[league][wins_key].values
                lim_y = lim_dfs[league][key].values

                xmin = lim_x.min()
                xmax = lim_x.max()
                ymin = lim_y.min()
                ymax = lim_y.max()

                # Evaluate the 2D Gaussian KDE on a 200x200 grid
                X, Y = np.mgrid[xmin:xmax:200j, ymin:ymax:200j]
                positions = np.vstack([X.ravel(), Y.ravel()])
                values = np.vstack([mx, my])
                kernel = stats.gaussian_kde(values)
                Z = np.reshape(kernel(positions).T, X.shape)

                ax.imshow(np.rot90(Z), cmap=plt.cm.jet,
                          extent=[xmin, xmax, ymin, ymax],
                          aspect='auto')
                ax.set_ylabel(label)
                ax.set_xlabel(wins_label)
                ax.set_title(league + ": "+label+"-"+wins_label+" Joint PDF, "+yr_range)

                ax.set_xlim([xmin, xmax])
                ax.set_ylim([ymin, ymax])

            #plt.show()
            # One file per stat and year range; both league panels share the figure
            fig.savefig('figs/ALNL_multivariate_kde_'+key+'_w_'+yr_range+'.png')

            plt.close('all')
def wins_regression_significance_test():
    """
    Do regression for wins vs. X,
    and determine if the regression
    coefficient is statistically significant

    For each year range, stat and league this fits
        stat = beta_0 + beta_1 * (win pct)
    with scipy's curve_fit, runs a two-sided t-test of
    H0: beta_1 = 0 at the 10% and 5% levels, prints the results,
    and saves a QQ plot of the residuals (top panel) and a plot of
    |residual| vs. win pct (bottom panel) under figs/.
    """

    keys = ['H','1B','2B','3B','BB','HR','AB','DP','SLG','SO','BA','OPS','R','RpG','RBI','Fld%']
    labels = ['Hits','Singles','Doubles','Triples','Walks','Home Runs','At Bats','Hit Into Doub Play','Slugging Pct','Strikeouts','Batting Avg','OBP+SP','Runs','Runs per Game','RBIs','Fielding Pct']

    cutoff_yrs = [[1970,2000],[2000,2013]]

    wins_key = 'Wpct'
    wins_label = 'Win Pct'

    leagues = ['AL','NL']
    load_data_function = { 'AL' : load_AL_data,
                           'NL' : load_NL_data }

    # Significance levels of the two-sided test
    alpha = np.array([0.10, 0.05])

    print(wins_label+": Univariate Linear Regression Significance Test")

    for cutoff_yr in cutoff_yrs:

        plt.close('all')

        print("Now on year %s" % (cutoff_yr,))
        yr_range = str(cutoff_yr[0])+"-"+str(cutoff_yr[1])

        # Load each league once per year range rather than once per stat
        dfs = {}
        for league in leagues:
            dfs[league] = load_data_function[league](cutoff_yr)

        for key, label in zip(keys,labels):

            print(" Stat: %s" % label)

            for league in leagues:

                print(" League: %s" % league)

                df = dfs[league]

                mx = df[wins_key].values
                my = df[key].values

                # Use scipy curve fit
                coeff, covar_matrix = curve_fit(curve_fit_function,mx,my)

                # testing the null hypothesis:
                # H0: slope = 0
                # H1: slope != 0

                # Test statistic:
                # T0 = b1hat / SE(b1hat)
                # (SE = standard error)

                # Standard errors are the square roots of the
                # diagonal of the covariance matrix
                SE = np.sqrt(np.diagonal(covar_matrix))

                b0, b1 = coeff
                SE_b0, SE_b1 = SE

                # test statistic
                T = b1/SE_b1

                # The null hypothesis is rejected (the slope is
                # significant) when T0 falls outside
                # -t_{alpha/2,n-2} < T0 < +t_{alpha/2,n-2}
                # i.e. when
                # | T0 | > t_{alpha/2,n-2}

                # n - 2: observations minus the two fitted parameters
                dof = len(my) - len(coeff)

                # Two-sided test, so each tail gets alpha/2
                t10, t05 = stats.t.isf(alpha/2, dof)

                print(" y = beta_0 + beta_1 x = %0.4f + %0.4f x"%(b0,b1))
                print(" SE(beta_0) = %s" % SE_b0)
                print(" SE(beta_1) = %s" % SE_b1)
                print(" T = %s" % T)
                print(" Null Hypothesis Rejected? (10%%) = %0.4f > %0.4f = %s"%(abs(T),t10,abs(T) > t10))
                print(" Null Hypothesis Rejected? ( 5%%) = %0.4f > %0.4f = %s"%(abs(T),t05,abs(T) > t05))

                # Compute residuals
                resid = my - (b0 + b1*mx)

                # Quantile-quantile and residual plots
                fig = plt.figure(figsize=(8,10))
                ax1 = fig.add_subplot(211)
                ax2 = fig.add_subplot(212)

                ax1.set_title(league + ": "+label+"-"+wins_label+" Lin Reg QQ/Resid Plots "+yr_range)

                sm.qqplot(resid,ax=ax1)

                ax2.plot(mx,np.abs(resid),'bo')

                fig.savefig('figs/ALNL_qq_'+key+'_w_linreg_'+yr_range+'.png')

                plt.show()

                plt.close('all')
# Entry point: the analyses only run when this file is executed as a script.
if __name__=="__main__":
# KDE figure generation is currently switched off; only the regression test runs.
#wins_multivariate_kdes()
wins_regression_significance_test()

BIN
figs/ALNL_multivariate_kde_1B_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 146 KiB

BIN
figs/ALNL_multivariate_kde_1B_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 147 KiB

BIN
figs/ALNL_multivariate_kde_1B_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 128 KiB

BIN
figs/ALNL_multivariate_kde_2B_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 167 KiB

BIN
figs/ALNL_multivariate_kde_2B_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 157 KiB

BIN
figs/ALNL_multivariate_kde_2B_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 127 KiB

BIN
figs/ALNL_multivariate_kde_3B_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 156 KiB

BIN
figs/ALNL_multivariate_kde_3B_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 148 KiB

BIN
figs/ALNL_multivariate_kde_3B_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 138 KiB

BIN
figs/ALNL_multivariate_kde_AB_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 101 KiB

BIN
figs/ALNL_multivariate_kde_AB_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 100 KiB

BIN
figs/ALNL_multivariate_kde_AB_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 67 KiB

BIN
figs/ALNL_multivariate_kde_BA_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 167 KiB

BIN
figs/ALNL_multivariate_kde_BA_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 160 KiB

BIN
figs/ALNL_multivariate_kde_BA_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 155 KiB

BIN
figs/ALNL_multivariate_kde_BB_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 149 KiB

BIN
figs/ALNL_multivariate_kde_BB_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 147 KiB

BIN
figs/ALNL_multivariate_kde_BB_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 144 KiB

BIN
figs/ALNL_multivariate_kde_DP_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 158 KiB

BIN
figs/ALNL_multivariate_kde_DP_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 156 KiB

BIN
figs/ALNL_multivariate_kde_DP_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 149 KiB

BIN
figs/ALNL_multivariate_kde_Fld%_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 169 KiB

BIN
figs/ALNL_multivariate_kde_Fld%_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 168 KiB

BIN
figs/ALNL_multivariate_kde_Fld%_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 130 KiB

BIN
figs/ALNL_multivariate_kde_HR_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 166 KiB

BIN
figs/ALNL_multivariate_kde_HR_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 166 KiB

BIN
figs/ALNL_multivariate_kde_HR_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 142 KiB

BIN
figs/ALNL_multivariate_kde_H_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 135 KiB

BIN
figs/ALNL_multivariate_kde_H_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 133 KiB

BIN
figs/ALNL_multivariate_kde_H_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 113 KiB

BIN
figs/ALNL_multivariate_kde_OPS_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 162 KiB

BIN
figs/ALNL_multivariate_kde_OPS_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 161 KiB

BIN
figs/ALNL_multivariate_kde_OPS_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 135 KiB

BIN
figs/ALNL_multivariate_kde_RBI_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 143 KiB

BIN
figs/ALNL_multivariate_kde_RBI_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 144 KiB

BIN
figs/ALNL_multivariate_kde_RBI_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 127 KiB

BIN
figs/ALNL_multivariate_kde_R_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 142 KiB

BIN
figs/ALNL_multivariate_kde_R_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 141 KiB

BIN
figs/ALNL_multivariate_kde_R_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 127 KiB

BIN
figs/ALNL_multivariate_kde_RpG_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 158 KiB

BIN
figs/ALNL_multivariate_kde_RpG_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 157 KiB

BIN
figs/ALNL_multivariate_kde_RpG_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 141 KiB

BIN
figs/ALNL_multivariate_kde_SLG_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 163 KiB

BIN
figs/ALNL_multivariate_kde_SLG_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 162 KiB

BIN
figs/ALNL_multivariate_kde_SLG_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 135 KiB

BIN
figs/ALNL_multivariate_kde_SO_w_since1950.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 157 KiB

BIN
figs/ALNL_multivariate_kde_SO_w_since1970.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 154 KiB

BIN
figs/ALNL_multivariate_kde_SO_w_since2000.png

Binary file not shown.

Before

Width:  |  Height:  |  Size: 131 KiB

14
plot_bivariate_normal_dist.py

@ -0,0 +1,14 @@
from pylab import *
from mpl_toolkits.mplot3d import Axes3D
# Plot the standard bivariate normal density (zero means, unit variances,
# no correlation) as a 3D surface over [-5, 5] x [-5, 5] and save it to figs/.
x = linspace(-5, 5, 200)
y = x
X,Y = meshgrid(x, y)

# Closed-form density. This equals what mlab's bivariate_normal(X, Y)
# returned with its default arguments; that helper is no longer shipped
# by newer matplotlib releases, so the formula is written out here.
Z = exp(-0.5*(X**2 + Y**2)) / (2*pi)

fig = figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X, Y, Z, cmap='Accent_r')
ax.set_title('Bivariate Normal Distribution')
fig.savefig('figs/bivariate_normal_dist.png')
Loading…
Cancel
Save