Sabermetrics scripts for some old blog posts analyzing the 2014 Oakland A's.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

137 lines
3.3 KiB

# Bracket the pandas import with progress messages.
# A parenthesized single string prints the same text whether `print` is the
# Python 2 statement or the Python 3 function.
print("Loading pandas...")
from pandas import *
print("Done loading pandas.")
import matplotlib.pylab as plt
import numpy as np
from scipy import stats
def load_data(path='data/master_team_batting.csv'):
    """Read the team batting CSV and append derived columns.

    Parameters
    ----------
    path : str, optional
        CSV file to read. The default is the path this script has always
        used, so calling ``load_data()`` with no arguments is unchanged.

    Returns
    -------
    DataFrame
        The CSV contents plus these added columns, in this order:
        ``1B`` (singles), ``ABpG``, ``BBpG``, ``1BpG``, ``2BpG``, ``3BpG``,
        ``HRpG`` (each stat divided by games ``G``), and ``Avg`` (``H/AB``).

    The input must provide the columns ``H``, ``2B``, ``3B``, ``HR``, ``AB``,
    ``BB`` and ``G``; a missing column raises ``KeyError``.
    """
    df = read_csv(path)

    # Singles: hits that are not doubles, triples, or home runs.
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']

    # Per-game rates. '1B' must already exist, hence computed above.
    for stat in ('AB', 'BB', '1B', '2B', '3B', 'HR'):
        df[stat + 'pG'] = df[stat] / df['G']

    # Batting average.
    df['Avg'] = df['H'] / df['AB']
    return df
# Load the batting table once; every plot below reads from this frame.
df = load_data()

# KDE of a couple of key quantities
# Each column name in `keys` is paired with its plot label in `labels`.
keys = ['1B','2B','3B','BB','HR','AB','DP','SLG','SO']
labels = ['Singles','Doubles','Triples','Walks','Home Runs','At Bats','Hit Into Double Plays','Slugging Pct','Strikeouts']

# Neither of these depends on the statistic being plotted, so compute them
# once instead of on every pass through the loop.
is_oak = df['Team'] == 'oak'
year_span = str(df['Year'].min()) + "-" + str(df['Year'].max())

for key, label in zip(keys, labels):
    print("Now on " + label)

    # All rows versus the rows whose Team is 'oak'.
    dat = df[key]
    dat_oak = dat[is_oak]

    # 1-D Gaussian KDEs with Scott's-rule bandwidth.
    kernel = stats.gaussian_kde(dat, 'scott')
    kernel_oak = stats.gaussian_kde(dat_oak, 'scott')

    # Evaluate both densities on the same 500-point grid spanning the
    # full data range so the two curves are directly comparable.
    xx = np.linspace(dat.min(), dat.max(), 500)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(xx, kernel(xx), 'k-')       # all rows: black
    ax.plot(xx, kernel_oak(xx), 'g-')   # 'oak' rows: green
    ax.set_xlabel(label + " Per Season")
    ax.set_ylabel(r"$P(N_{" + key + "})$")
    ax.set_title(label + ", " + year_span)

    # Build one string per message so the output is the same under the
    # Python 2 print statement and the Python 3 print function.
    print("Bandwidth = %s" % kernel.covariance_factor())
    print("OAK Bandwidth = %s" % kernel_oak.covariance_factor())

    fig.savefig('figs/kde_' + key + '_all_vs_oak.png')

# NOTE(review): show/close are placed after the loop so every figure is
# displayed together before plt.close('all') discards them; confirm this
# matches the intended flow.
plt.show()
plt.close('all')
# Multivariate KDEs
# ------------------------------
# Let's look at how different stats
# are distributed versus year
keys = ['H','1B','2B','3B','BB','HR','AB','DP','SLG','SO']
labels = ['Hits','Singles','Doubles','Triples','Walks','Home Runs','At Bats','Hit Into Double Plays','Slugging Pct','Strikeouts']

# The year axis is identical for every statistic, so extract it, its bounds,
# and the title suffix once rather than on every iteration.
mx = df['Year'].values
xmin = mx.min()
xmax = mx.max()
year_span = str(xmin) + "-" + str(xmax)

for key, label in zip(keys, labels):
    my = df[key].values
    ymin = my.min()
    ymax = my.max()

    # Fit a 2-D Gaussian KDE to (year, statistic) and evaluate it on a
    # 100x100 grid covering the data's bounding box.
    X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
    positions = np.vstack([X.ravel(), Y.ravel()])
    values = np.vstack([mx, my])
    kernel = stats.gaussian_kde(values)
    Z = np.reshape(kernel(positions).T, X.shape)

    # Two figures per statistic, saved in this order: the density alone
    # ('..._yr.png'), then the density with the raw data points drawn on
    # top ('..._yr_points.png').
    for suffix, show_points in (('', False), ('_points', True)):
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # Z is indexed [year, statistic]; rot90 turns it so year runs
        # left-to-right and the statistic bottom-to-top, matching `extent`.
        ax.imshow(np.rot90(Z), cmap=plt.cm.jet,
                  extent=[xmin, xmax, ymin, ymax],
                  aspect='auto')
        if show_points:
            ax.plot(mx, my, 'k.', markersize=3)
        ax.set_ylabel(label)
        ax.set_xlabel("Year")
        ax.set_title(label + ", " + year_span)
        ax.set_xlim([xmin, xmax])
        ax.set_ylim([ymin, ymax])
        fig.savefig('figs/multivariate_kde_' + key + '_yr' + suffix + '.png')

# NOTE(review): show/close are placed after the loop so every figure is
# displayed together before plt.close('all') discards them; confirm this
# matches the intended flow.
plt.show()
plt.close('all')