Sabermetrics scripts for some old blog posts analyzing the 2014 Oakland A's.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

178 lines
4.6 KiB

# Bracket the pandas import with progress messages.
# The parenthesised single-argument form of print produces the same output
# whether it is parsed as a Python 2 print statement or a Python 3 function
# call, so the script no longer fails to parse on Python 3.
print("Loading pandas...")
from pandas import *
print("Done loading pandas.")
import matplotlib.pylab as plt
import numpy as np
from scipy import stats
from scipy import interpolate
def load_data(path='data/master_team_batting.csv'):
    """Load the team batting table and add singles and per-game rate columns.

    Parameters
    ----------
    path : str or file-like, optional
        Anything pandas ``read_csv`` accepts.  Defaults to
        'data/master_team_batting.csv', the file this script reads.

    Returns
    -------
    DataFrame
        The input rows whose 'Year' is 1900 or later and is not one of the
        excluded seasons, with these columns appended (in this order):
        '1B', 'ABpG', 'BBpG', '1BpG', '2BpG', '3BpG', 'HRpG'.
    """
    df = read_csv(path)

    # Data Cleanup:
    # -----------------------
    # Seasons dropped from the analysis:
    #  - 1981 (NOTE(review): presumably the strike-shortened season; the
    #          original comment did not list it -- confirm)
    #  - 1994 (the baseball strike)
    #  - 2014 (incomplete season)
    # Everything before 1900 is dropped as well.
    excluded_years = [1981, 1994, 2014]
    keep = (df['Year'] >= 1900) & ~df['Year'].isin(excluded_years)

    # Take an explicit copy so the column assignments below write to an
    # independent frame instead of a slice of the raw table (this is what
    # triggers pandas' SettingWithCopyWarning).
    df = df[keep].copy()

    # Add in data about singles: hits that were not doubles, triples or HRs.
    df['1B'] = df['H'] - df['2B'] - df['3B'] - df['HR']

    # Per-game rates; each new column is named '<stat>pG'.
    for stat in ('AB', 'BB', '1B', '2B', '3B', 'HR'):
        df[stat + 'pG'] = df[stat] / df['G']

    return df
df = load_data()

# Time series: home run variance with time
# --------------------------------
# Seasons present in the table, in order of first appearance.
seasons = unique(df['Year'])

# One value per season: np.var of the HR column over that season's rows.
hr_var = []
for season in seasons:
    season_hr = df['HR'][df['Year'] == season]
    hr_var.append(np.var(season_hr))

fig = plt.figure(figsize=(10, 3))
ax1 = fig.add_subplot(111)
ax1.plot(seasons, hr_var, 'k-')

ax1.set_title('Variance, Home Runs Per Season')
ax1.set_xlabel('Year')
ax1.set_ylabel('HR Var')

# Red vertical marker at 1927.
ax1.axvline(x=1927, linewidth=2, color='r')

fig.savefig('figs/variance_HRsSOs_yrs.png')
plt.show()
# How do strikeouts and home runs correlate?
# (year-by-year basis)
# -----------------------------------------
years = unique(df['Year'])
so_hr_corr = np.zeros(len(years),)
so_h_corr = np.zeros(len(years),)

for idx, season in enumerate(years):
    # Rows for this season only; pull the three columns from the same
    # selection.
    season_rows = df[df['Year'] == season]
    so_counts = season_rows['SO'].values
    hr_counts = season_rows['HR'].values
    hit_counts = season_rows['H'].values

    # Off-diagonal entry of the 2x2 correlation matrix: correlation between
    # strikeouts and home runs, and between strikeouts and hits.
    so_hr_corr[idx] = np.corrcoef(so_counts, hr_counts)[0, 1]
    so_h_corr[idx] = np.corrcoef(so_counts, hit_counts)[0, 1]

fig = plt.figure(figsize=(11, 3))
ax1 = fig.add_subplot(111)

for series, fmt, tag in ((so_hr_corr, 'b-', 'SO-HR'),
                         (so_h_corr, 'r-', 'SO-H')):
    ax1.plot(years, series, fmt, label=tag)

ax1.set_ylabel('SO-HR Corr')
ax1.set_xlabel('Year')
ax1.legend(loc='best')
ax1.set_xlim([1900, 2020])

# Zero-correlation reference line.
ax1.axhline(y=0, linewidth=1, color='k')

fig.savefig('figs/corr_HRSOH_yrs.png')
plt.show()
# Same plot, but with smoothed data (excluding pre-1900 years, which are noisy):
# a degree-5 UnivariateSpline is fitted to each yearly correlation series.
smoothed_so_hr_corr = interpolate.UnivariateSpline(years, so_hr_corr, k=5)
smoothed_so_h_corr = interpolate.UnivariateSpline(years, so_h_corr, k=5)

fig = plt.figure(figsize=(11, 3))
ax1 = fig.add_subplot(111)

# Raw series first (solid, labelled) ...
for series, fmt, tag in ((so_hr_corr, 'b-', 'SO-HR'),
                         (so_h_corr, 'r-', 'SO-H')):
    ax1.plot(years, series, fmt, label=tag)

# ... then the fitted splines evaluated at the same years (dashed, unlabelled).
for spline, fmt in ((smoothed_so_hr_corr, 'b--'),
                    (smoothed_so_h_corr, 'r--')):
    ax1.plot(years, spline(years), fmt)

ax1.set_ylabel('SO-HR Corr')
ax1.set_xlabel('Year')
ax1.legend(loc='best')
# No explicit x-limits here; the set_xlim([1900,2020]) call is disabled for
# this figure.

# Zero-correlation reference line.
ax1.axhline(y=0, linewidth=1, color='k')

fig.savefig('figs/corr_smoothedHRSOH_yrs.png')
plt.show()
### # How do batting averages and home runs correlate?
### # (i.e., are hitters striking out more in an attempt to hit more home runs?)
### # A: not really... BA-HR and BB-HR and BB-H are all positively correlated.
### # -----------------------------------------
###
### years = unique(df['Year'])
### ba_hr_corr = np.zeros(len(years),)
### bb_hr_corr = np.zeros(len(years),)
### bb_h_corr = np.zeros(len(years),)
###
### for ii,yr in enumerate(years):
###
### batting_avg = df['BA'][df['Year']==yr].values
### walks = df['BB'][df['Year']==yr].values
### home_runs = df['HR'][df['Year']==yr].values
### hits = df['H'][df['Year']==yr].values
###
### # Compute correlation between batting average and home runs
### ba_hr_corr[ii] = np.corrcoef(batting_avg,home_runs)[0][1]
### bb_hr_corr[ii] = np.corrcoef(walks,home_runs)[0][1]
### bb_h_corr[ii] = np.corrcoef(walks,hits)[0][1]
###
### smoothed_ba_hr_corr = interpolate.UnivariateSpline(years,ba_hr_corr,k=5)
### smoothed_bb_hr_corr = interpolate.UnivariateSpline(years,bb_hr_corr,k=5)
### smoothed_bb_h_corr = interpolate.UnivariateSpline(years,bb_h_corr, k=5)
###
### fig = plt.figure(figsize=(11,3))
### ax1 = fig.add_subplot(111)
###
### ax1.plot(years,ba_hr_corr,'b-',label='BA-HR')
### ax1.plot(years,smoothed_ba_hr_corr(years),'b--')
###
### ax1.plot(years,bb_hr_corr,'r-',label='BB-HR')
### ax1.plot(years,smoothed_bb_hr_corr(years),'r--')
###
### ax1.plot(years,bb_h_corr,'g-',label='BB-H')
### ax1.plot(years,smoothed_bb_h_corr(years),'g--')
###
### ax1.set_ylabel('Correlation')
### ax1.set_xlabel('Year')
### ax1.legend(loc='best')
### ax1.set_xlim([1900,2020])
###
### ax1.axhline(y=0,linewidth=1,color='k')
###
### fig.savefig('figs/corr_HRBA_yrs.png')
###
### plt.show()