Browse Source

Removing unused files. Updating readme.

master
Charles Reid 7 years ago
parent
commit
ae2e2f4fd2
  1. 67
      AthleticsCorr.py
  2. 191
      AthleticsTeamStats.py
  3. 159
      AthleticsWHHRKDE.py
  4. 27
      README.md
  5. 119
      RedSoxTeamStats.py

67
AthleticsCorr.py

@ -1,67 +0,0 @@
print "Loading pandas..."
from pandas import *
print "Done loading pandas."
import matplotlib.pylab as plt
import numpy as np
from scipy import stats
def load_data(path='data/oak_team_batting.csv', exclude_years=(1994, 2014)):
    """Load a team batting table and add singles, per-game and average columns.

    Parameters
    ----------
    path : str
        CSV file to read. It must have an ``Rk`` column (used as the index)
        plus ``Year``, ``G``, ``AB``, ``H``, ``2B``, ``3B``, ``HR`` and ``BB``.
    exclude_years : iterable of int
        Seasons to drop before deriving anything. The defaults are 2014
        (incomplete season) and 1994 (the baseball strike).

    Returns
    -------
    DataFrame
        The filtered table with these extra columns: ``1B`` (singles),
        ``ABpG``, ``BBpG``, ``1BpG``, ``2BpG``, ``3BpG``, ``HRpG``
        (season total divided by games played) and ``Avg`` (``H / AB``).
    """
    df = read_csv(path, index_col='Rk')

    # One mask drops every unwanted season; the explicit copy makes sure the
    # derived columns below are written to a frame this function owns.
    df = df[~df['Year'].isin(list(exclude_years))].copy()

    # Singles are not in the table: they are the hits left over once
    # doubles, triples and home runs are removed.
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']

    # Per-game rates, created in the same column order as before.
    for stat in ('AB', 'BB', '1B', '2B', '3B', 'HR'):
        df[stat + 'pG'] = df[stat] / df['G']

    df['Avg'] = df['H'] / df['AB']
    return df
df = load_data()

# Time series of data
# --------------------------------
# Two side-by-side panels sharing the season on the x axis:
#   left  - winning fraction (W/G) against team batting average
#   right - walks and each hit type, expressed per game
fig = plt.figure()
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

ax1.plot(df['Year'], df['W']/df['G'], '-k', label='WinAvg')
ax1.plot(df['Year'], df['Avg'], '-g', label='BatAvg')

for column, fmt in (('BBpG', '-b'),
                    ('1BpG', '-r'),
                    ('2BpG', '-g'),
                    ('3BpG', '-c'),
                    ('HRpG', '-k')):
    ax2.plot(df['Year'], df[column], fmt, label=column)

# Every curve carries a label, but without a legend the labels are never
# drawn and the lines cannot be told apart.
ax1.legend(loc='best')
ax2.legend(loc='best')

plt.show()

191
AthleticsTeamStats.py

@ -1,191 +0,0 @@
print "Loading pandas..."
from pandas import *
print "Done loading pandas."
import matplotlib.pylab as plt
import numpy as np
from scipy import stats
import Regression as lrg
df = read_csv('data/athletics_team_batting.csv', index_col='Rk')

# Data Cleanup:
# -----------------------
# Begin by excluding some years:
# - 2014 (presumably the incomplete season at time of writing)
# - 1994 (the baseball strike)
df = df[df['Year'] != 1994]
df = df[df['Year'] != 2014]

# AB
# -----------------------
# Kernel density estimates (KDEs) of the number of at bats per season,
# first over all seasons and then over progressively narrower windows.

# Evaluation grid shared by every at-bat density curve below.
xx = np.linspace(4000, 6000, 500)


def _new_ab_axes():
    """Open a fresh single-panel figure and return ``(fig, ax)``."""
    fig = plt.figure()
    return fig, fig.add_subplot(111)


def _plot_ab_curves(ax, curves):
    """Draw each ``(kernel, line format, label)`` entry over the grid ``xx``."""
    for kernel, fmt, label in curves:
        ax.plot(xx, kernel(xx), fmt, label=label)


def _finish_ab_figure(fig, ax, title, filename, legend=True):
    """Add the shared axis labels (and legend), set the title, save to disk."""
    if legend:
        ax.legend(loc='best')
    ax.set_xlabel("At Bats Per Season")
    ax.set_ylabel(r"$P(N_{AB})$")
    ax.set_title(title)
    fig.savefig(filename)


ab = df['AB']
year_span = str(min(df['Year'])) + "-" + str(max(df['Year']))

# All seasons, Scott's-rule bandwidth.
kernel_scott = stats.gaussian_kde(ab, 'scott')
fig, ax = _new_ab_axes()
ax.plot(xx, kernel_scott(xx), 'b-')
_finish_ab_figure(fig, ax, "Athletics At-Bats, " + year_span,
                  'kde_AB.png', legend=False)
# Single-argument print calls emit the same text on Python 2 and Python 3.
print("Scott's Factor bandwidth = %s" % kernel_scott.scotts_factor())
print("Silverman's Factor bandwidth = %s" % kernel_scott.silverman_factor())
plt.show()

# Compare the whole club history with the seasons from 1968 onward
# (the year the Athletics moved to Oakland).
ab_post1968 = ab[df['Year'] >= 1968]
kernel_scott_1968 = stats.gaussian_kde(ab_post1968, 'scott')
fig, ax = _new_ab_axes()
_plot_ab_curves(ax, [
    (kernel_scott, 'b-', 'All Years'),
    (kernel_scott_1968, 'r-', 'After 68'),
])
_finish_ab_figure(fig, ax,
                  "Athletics At-Bats, " + year_span + " vs Oakland (post-1968)",
                  'kde_AB_1968.png')
plt.show()

# And finally, see where the last decade (2003 onward) stacks up.
ab_decade = ab[df['Year'] >= 2003]
kernel_scott_decade = stats.gaussian_kde(ab_decade, 'scott')
fig, ax = _new_ab_axes()
_plot_ab_curves(ax, [
    (kernel_scott, 'b-', 'All Years'),
    (kernel_scott_1968, 'r-', 'After 68'),
    (kernel_scott_decade, 'g-', 'Last Decade'),
])
_finish_ab_figure(fig, ax,
                  "Athletics At-Bats (All Years, Post-1968, and Past Decade)",
                  'kde_AB_decade.png')
plt.show()

# To separate out the effect of era, redo the KDE over disjoint time
# periods. These kernels use gaussian_kde's default bandwidth.
year = df['Year']
era_samples = [
    ('Pre 1933',   'b-', ab[year < 1933]),
    ('33-54',      'r-', ab[(year >= 1933) & (year < 1955)]),
    ('55-67',      'g-', ab[(year >= 1955) & (year < 1968)]),
    ('68-02',      'c-', ab[(year >= 1968) & (year < 2003)]),
    ('03-Present', 'k-', ab[year >= 2003]),
]
era_curves = [(stats.gaussian_kde(sample), fmt, label)
              for label, fmt, sample in era_samples]
fig, ax = _new_ab_axes()
_plot_ab_curves(ax, era_curves)
_finish_ab_figure(fig, ax, "Athletics At-Bats, Split Over Eras",
                  'kde_AB_split.png')
plt.show()
######################
# Batting average
## What kind of batting averages did the Athletics have?
#print df['BA'].values
## Make a histogram of that
#plt.hist(df['BA'].values)

######################
# Eyeball correlation plots (kept for reference, not executed)
## What variables correlate well with number of at bats?
#plt.plot(df['BA'].values,df['AB'].values,'ko') # none
#plt.plot(df['R/G'].values,df['AB'].values,'ko') # weak
#plt.plot(df['H'].values,df['AB'].values,'ko') # strong
#plt.plot(df['BatAge'].values,df['AB'].values,'ko') # none
## Does number of at bats translate into more runs? more wins?
#plt.plot(df['AB'].values,df['R/G'].values,'ko') # weak
#plt.plot(df['AB'].values,df['W'].values,'ko') # weak

#######################
# More quantitative: linear regression of runs per game on at bats.
# The fit is built but its plot is left commented out.
at_bats = df['AB'].values
runs_per_game = df['R/G'].values
rg = lrg.OneDLinearRegression(at_bats, runs_per_game)
#plt.figure()
#plt.plot(at_bats,runs_per_game,'ko')
#plt.plot(at_bats,rg.f(at_bats))

# Bivariate KDE of batting average against runs per game.
# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html
bat_avg = df['BA'].values
runs_per_game = df['R/G'].values
xmin, xmax = bat_avg.min(), bat_avg.max()
ymin, ymax = runs_per_game.min(), runs_per_game.max()

# 100 x 100 evaluation grid spanning the observed range of both variables.
grid_x, grid_y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
grid_points = np.vstack([grid_x.ravel(), grid_y.ravel()])

samples = np.vstack([bat_avg, runs_per_game])
kernel = stats.gaussian_kde(samples)
density = kernel(grid_points).reshape(grid_x.shape)

# Density as a background image with the raw seasons dotted on top.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.imshow(
    np.rot90(density),
    cmap=plt.cm.Pastel2,
    extent=[xmin, xmax, ymin, ymax],
    aspect=0.02,
)
ax.plot(bat_avg, runs_per_game, 'k.', markersize=2)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])
plt.show()

159
AthleticsWHHRKDE.py

@ -1,159 +0,0 @@
print "Loading pandas..."
from pandas import *
print "Done loading pandas."
import matplotlib.pylab as plt
import numpy as np
from scipy import stats
def load_data(path='data/oak_team_batting.csv', exclude_years=(1994, 2014)):
    """Load a team batting table and add singles and per-game rate columns.

    Parameters
    ----------
    path : str
        CSV file to read. It must have an ``Rk`` column (used as the index)
        plus ``Year``, ``G``, ``AB``, ``H``, ``2B``, ``3B``, ``HR`` and ``BB``.
    exclude_years : iterable of int
        Seasons to drop before deriving anything. The defaults are 2014
        (incomplete season) and 1994 (the baseball strike).

    Returns
    -------
    DataFrame
        The filtered table with these extra columns: ``1B`` (singles) and
        ``ABpG``, ``BBpG``, ``1BpG``, ``2BpG``, ``3BpG``, ``HRpG``
        (season total divided by games played).
    """
    df = read_csv(path, index_col='Rk')

    # One mask drops every unwanted season; the explicit copy makes sure the
    # derived columns below are written to a frame this function owns.
    df = df[~df['Year'].isin(list(exclude_years))].copy()

    # Singles are not in the table: they are the hits left over once
    # doubles, triples and home runs are removed.
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']

    # Per-game rates, created in the same column order as before.
    for stat in ('AB', 'BB', '1B', '2B', '3B', 'HR'):
        df[stat + 'pG'] = df[stat] / df['G']

    return df
df = load_data()

# Era windows used to split every statistic:
# (legend label, line format, first season inclusive, end season exclusive).
_ERAS = [
    ('Pre 1933',   'b-', float('-inf'), 1933),
    ('33-54',      'r-', 1933, 1955),
    ('55-67',      'g-', 1955, 1968),
    ('68-02',      'c-', 1968, 2003),
    ('03-Present', 'k-', 2003, float('inf')),
]


def _plot_era_kdes(df, key, title):
    """Plot, save and show the KDE of column ``key`` overall and per era.

    ``key`` is the column of ``df`` to analyse and ``title`` is its
    human-readable name (used for the x label and the figure title). The
    figure is written to ``figs/kde_<key>_split.png``; the ``figs/``
    directory is not created here, so it has to exist already.
    """
    years = df['Year']
    dat = df[key]

    # Build every kernel before opening a figure, so an era with too little
    # data fails before anything is drawn.
    curves = [(stats.gaussian_kde(dat), 'k--', 'All Yrs')]
    for label, fmt, start, stop in _ERAS:
        era_dat = dat[(years >= start) & (years < stop)]
        curves.append((stats.gaussian_kde(era_dat), fmt, label))

    xx = np.linspace(min(dat), max(dat), 500)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for kernel, fmt, label in curves:
        ax.plot(xx, kernel(xx), fmt, label=label)
    ax.legend(loc='best')
    ax.set_xlabel(title)
    ax.set_ylabel(r"$P(N_{" + key + "})$")
    # NOTE(review): the title says "Per Game" but the plotted columns are the
    # raw totals (BB, H, ...), not the *pG columns added by load_data().
    # Confirm which was intended before changing the wording or the data.
    ax.set_title("Athletics " + title + " Per Game, Split Over Eras")
    fig.savefig('figs/kde_' + key + '_split.png')
    plt.show()


# Walks, Hits, Singles, Doubles, Triples, Home Runs
# --------------------------------------------
# Each statistic gets exactly one figure. Hits used to be plotted and saved
# to the same file twice: once on its own and once as the first loop key.
keys = ['BB', 'H', '1B', '2B', '3B', 'HR']
titles = ['Walks', 'Hits', 'Singles', 'Doubles', 'Triples', 'Home Runs']
for key, title in zip(keys, titles):
    _plot_era_kdes(df, key, title)

27
README.md

@ -1,10 +1,29 @@
# List of Scripts
This repository contains some scripts and data for sabermetrics
(the analysis of baseball statistics).
## Athletics AB KDE
# Octopress Blog Posts
Constructs kernel density estimates for the at bat statistic.
I am using these scripts to write a series of Octopress blog posts.
These are listed here, along with the scripts that correspond to
each post.
Does KDE for total at bats, then splits up into different eras.
[Kernel Density Functions and the Oakland Athletics](http://charlesreid1.github.io/blog/2014/08/31/kernel-density-functions-and-the-oakland-athletics/)
* The file ```AthleticsABKDE.py``` generates KDEs for at-bats, analyzed in this first post.
[Using Multivariate KDEs to Examine How Baseball Is Changing](http://charlesreid1.github.io/blog/2014/09/03/using-multivariate-kdes-to-examine-how-baseball-is-changing/)
* The file ```AllTeams.py``` generates univariate and multivariate KDEs for multiple batting stats, for all teams
* The file ```HRVariance.py``` contains variance and correlation plots for home runs and how they change, or are correlated with other variables
# Data
I've put together some data files, contained in the ```data/``` directory.
This consists of batting statistics for individual teams, plus
a master batting stats file that contains batting statistics for all teams.
* The data are all in CSV format and come from [Baseball-Reference.com](http://baseball-reference.com)
* The file ```CombineEverybody.py``` combines data for all teams into the master batting stats file.

119
RedSoxTeamStats.py

@ -1,119 +0,0 @@
print "Loading pandas..."
from pandas import *
print "Done loading pandas."
import matplotlib.pylab as plt
import numpy as np
import Regression as lrg
from scipy import stats
df = read_csv('data/redsox_team_batting.csv', index_col='Rk')

################
# Data processing
# The original notes list 1981, 1994 and 1918 as the seasons with the
# fewest at bats. Only 1994 is actually dropped; the other two filters are
# kept here, disabled.
#df = df[ df['Year'] != 1918 ]
#df = df[ df['Year'] != 1981 ]
df = df[df['Year'] != 1994]

# Singles are the hits left once doubles, triples and home runs are removed;
# each hit type is then expressed per game played.
df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']
for hit_type in ('1B', '2B', '3B', 'HR'):
    df[hit_type + '/G'] = df[hit_type] / df['G']

################
# Explore data (kept for reference, not executed)
## What data do we have?
#print df.columns

################
# At bats
## How many at bats did the Red Sox have?
#print df['AB'].values
## Make a histogram of that
#plt.hist(df['AB'].values)
## Same histogram, but everything after 1950
#plt.hist(df['AB'][df['Year']>1950].values)
## Same histogram, but everything after 1990
#plt.hist(df['AB'][df['Year']>1990].values)

######################
# Batting average
## What kind of batting averages did the Red Sox have?
#print df['BA'].values
## Make a histogram of that
#plt.hist(df['BA'].values)

######################
# Eyeball correlation plots
## What variables correlate well with number of at bats?
#plt.plot(df['BA'].values,df['AB'].values,'ko') # none
#plt.plot(df['R/G'].values,df['AB'].values,'ko') # weak
#plt.plot(df['H'].values,df['AB'].values,'ko') # strong
#plt.plot(df['BatAge'].values,df['AB'].values,'ko') # none
## Does number of at bats translate into more runs? more wins?
#plt.plot(df['AB'].values,df['R/G'].values,'ko') # weak
#plt.plot(df['AB'].values,df['W'].values,'ko') # weak

#######################
# More quantitative: linear regression of runs per game on at bats,
# drawn as a scatter of the seasons with the fitted curve on top.
at_bats = df['AB'].values
runs_per_game = df['R/G'].values
rg = lrg.OneDLinearRegression(at_bats, runs_per_game)
plt.figure()
plt.plot(at_bats, runs_per_game, 'ko')
plt.plot(at_bats, rg.f(at_bats))

# Bivariate KDE of batting average against runs per game.
# http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.gaussian_kde.html
bat_avg = df['BA'].values
runs_per_game = df['R/G'].values
xmin, xmax = bat_avg.min(), bat_avg.max()
ymin, ymax = runs_per_game.min(), runs_per_game.max()

# 100 x 100 evaluation grid spanning the observed range of both variables.
grid_x, grid_y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
grid_points = np.vstack([grid_x.ravel(), grid_y.ravel()])

samples = np.vstack([bat_avg, runs_per_game])
kernel = stats.gaussian_kde(samples)
density = kernel(grid_points).reshape(grid_x.shape)

# Density as a background image with the raw seasons dotted on top.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.imshow(
    np.rot90(density),
    cmap=plt.cm.Pastel2,
    extent=[xmin, xmax, ymin, ymax],
    aspect=0.02,
)
ax.plot(bat_avg, runs_per_game, 'k.', markersize=2)
ax.set_xlim([xmin, xmax])
ax.set_ylim([ymin, ymax])

# A single show() displays both the regression figure and the KDE figure.
plt.show()
Loading…
Cancel
Save