Browse Source

adding new table analysis script. adding bar chart script.

master
Charles Reid 10 years ago
parent
commit
a5ad9c4d2b
  1. 213
      analysis/OneCity_Bar.py
  2. 193
      analysis/OneCity_Scatter.py
  3. 276
      analysis/Table15002.py
  4. 55
      analysis/Table25070.py

213
analysis/OneCity_Bar.py

@ -0,0 +1,213 @@ @@ -0,0 +1,213 @@
from pymongo import MongoClient
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
import metro as mt
# NOTE(review): presumably toggles a univariate-KDE analysis; none of the
# functions visible in this file read it — confirm it is still needed.
do_univariate_kde = True
# Metro names to process; the plotting functions iterate over this list and
# pass each name to mt.CBSACode().
cities = ['Seattle']
def main():
    """Script entry point: run the B25070 bar-chart routine (``do_25070``)."""
    do_25070()
def do_25070():
    """Draw a bar chart of the B25070 rent-burden shares for each city.

    For every name in the module-level ``cities`` list this

    1. resolves the metro's CBSA code with ``mt.CBSACode``,
    2. checks the ``PropertiesMeta`` collection for table ``B25070``,
    3. loads the metro's records from the ``Properties`` collection,
    4. derives the ``Rent_<range>_Pct`` columns with ``Table25070``, and
    5. passes them to ``sns.barplot`` on a new figure titled with the city.

    A city whose metadata or records cannot be found is reported on stdout
    and skipped.  Returns ``None``.
    """
    from Table25070 import Table25070

    # Seaborn
    sns.set_palette("deep", desat=.6)
    sns.set_context(rc={"figure.figsize": (8, 4)})

    # Mongo
    client = MongoClient()
    db = client['metros']
    metaprops = db['PropertiesMeta']
    props = db['Properties']

    # Rent-burden bins from lowest to highest.  Kept as an ordered list (not
    # a dict) so the bars are always drawn in this order.
    labels = ["0-10", "10-15", "15-20", "20-25", "25-30",
              "30-35", "35-40", "40-50", "50+"]
    columns = ["Rent_" + collab + "_Pct" for collab in labels]

    for city in cities:

        #########################
        # Mongo lookup

        cbsa = mt.CBSACode(city)[0]

        # find_one() returns None when no metadata document matches, so
        # guard before indexing into it.
        pre_search = metaprops.find_one({'geoid': cbsa})
        if pre_search is None or 'B25070' not in pre_search['tables']:
            print("Error: could not find table B25070 for city %s in db." % city)
            continue

        # Materialise the cursor once; an empty list replaces the separate
        # count() round trip.
        srch = list(props.find({'$and': [
            {'metroid': cbsa},
            {'geoid': {'$nin': [cbsa]}}
        ]
        }))
        if not srch:
            print("Error: could not find city %s in db." % city)
            continue

        # Drop the first record whose geoid contains the CBSA code.
        # NOTE(review): the query above already excludes geoid == cbsa, so
        # this looks like a leftover safety net — confirm before removing.
        for i, r in enumerate(srch):
            if cbsa in r['geoid']:
                del srch[i]
                break

        df = Table25070(pd.DataFrame(srch))

        #######################
        # Bar plot:

        f, ax1 = plt.subplots(1, 1, sharex=True, figsize=(12, 6))

        # Title with the city being plotted, not always the first one.
        ax1.set_title(city)

        xx = np.array(columns)
        yy = np.array([df[lab].values for lab in columns])
        sns.barplot(xx, yy, palette="Set1", ax=ax1)

    plt.show()
    plt.draw()
def do_15002():
    """Draw a bar chart of the five B15002 education-category shares per city.

    For every name in the module-level ``cities`` list this resolves the
    CBSA code with ``mt.CBSACode``, checks ``PropertiesMeta`` for table
    ``B15002``, loads the metro's records from ``Properties``, derives the
    ``EdCat<k>_Pct`` columns with ``Table15002`` and passes them to
    ``sns.barplot`` on a new figure titled with the city.

    A city whose metadata or records cannot be found is reported on stdout
    and skipped.  Returns ``None``.
    """
    from Table15002 import Table15002

    # Seaborn
    sns.set_palette("deep", desat=.6)
    sns.set_context(rc={"figure.figsize": (8, 4)})

    # Mongo
    client = MongoClient()
    db = client['metros']
    metaprops = db['PropertiesMeta']
    props = db['Properties']

    # The palette does not depend on the city, so build it once.
    colors = ["windows blue", "amber", "greyish", "faded green", "dusty purple"]
    xcp = sns.xkcd_palette(colors)

    # Education categories 1..5 in order.  Kept as ordered lists (not a
    # dict) so the bars are always drawn in this order.
    categories = [str(i) for i in range(1, 5 + 1)]
    columns = ["EdCat%d_Pct" % (i) for i in range(1, 5 + 1)]

    for city in cities:

        #########################
        # Mongo lookup

        cbsa = mt.CBSACode(city)[0]

        # find_one() returns None when no metadata document matches, so
        # guard before indexing into it.
        pre_search = metaprops.find_one({'geoid': cbsa})
        if pre_search is None or 'B15002' not in pre_search['tables']:
            print("Error: could not find table B15002 for city %s in db." % city)
            continue

        # Materialise the cursor once; an empty list replaces the separate
        # count() round trip.
        srch = list(props.find({'$and': [
            {'metroid': cbsa},
            {'geoid': {'$nin': [cbsa]}}
        ]
        }))
        if not srch:
            print("Error: could not find city %s in db." % city)
            continue

        # Drop the first record whose geoid contains the CBSA code.
        # NOTE(review): the query above already excludes geoid == cbsa, so
        # this looks like a leftover safety net — confirm before removing.
        for i, r in enumerate(srch):
            if cbsa in r['geoid']:
                del srch[i]
                break

        df = Table15002(pd.DataFrame(srch))

        #######################
        # Bar plot:

        f, ax1 = plt.subplots(1, 1, sharex=True, figsize=(8, 8))

        # Title with the city being plotted, not always the first one.
        ax1.set_title(city)

        xx = np.array(categories)
        yy = np.array([df[col].values for col in columns])
        sns.barplot(xx, yy, palette=xcp, ax=ax1)

    plt.show()
    plt.draw()
# NOTE(review): indentation was lost in this copy of the file, so it is
# unclear whether the next two calls close do_15002() or run at module
# level — confirm against the repository.
plt.draw()
plt.show()
# Run the analysis only when this file is executed as a script.
if __name__=="__main__":
    main()

193
analysis/OneCity_Scatter.py

@ -11,8 +11,6 @@ from scipy import stats @@ -11,8 +11,6 @@ from scipy import stats
import metro as mt
from Table17001 import Table17001
do_univariate_kde = True
@ -20,6 +18,184 @@ do_univariate_kde = True @@ -20,6 +18,184 @@ do_univariate_kde = True
cities = ['Seattle']#,'Houston','Los Angeles','New York','Chicago']
def main():
    """Script entry point: run the B15002 scatter-plot routine (``do_15002``)."""
    do_15002()
def do_15002():
    """Draw scatter plots of the B15002 education statistics for each city.

    For every name in the module-level ``cities`` list this resolves the
    CBSA code with ``mt.CBSACode``, checks ``PropertiesMeta`` for table
    ``B15002``, loads the metro's records from ``Properties`` and derives
    the ``AvgEd*``, ``VarEd*`` and ``EdGenderImbalance`` columns with
    ``Table15002``.  Two figures, each with two stacked subplots, are then
    created per city:

    * Figure 1: ``AvgEd`` against ``VarEd_M``/``VarEd_F`` (top) and against
      ``EdGenderImbalance`` (bottom), with explicit axis limits and labels.
    * Figure 2: ``VarEd_M``/``VarEd_F``/``VarEd`` against the matching
      ``AvgEd*`` column (top) and against ``EdGenderImbalance`` (bottom).

    A city whose metadata or records cannot be found is reported on stdout
    and skipped.  Returns ``None``.
    """
    from Table15002 import Table15002

    # Seaborn
    sns.set_palette("deep", desat=.6)
    sns.set_context(rc={"figure.figsize": (8, 4)})
    c1, c2, c3 = sns.color_palette("Set1", 3)

    # Mongo
    client = MongoClient()
    db = client['metros']
    metaprops = db['PropertiesMeta']
    props = db['Properties']

    for city in cities:

        #########################
        # Mongo lookup

        cbsa = mt.CBSACode(city)[0]

        # find_one() returns None when no metadata document matches, so
        # guard before indexing into it.
        pre_search = metaprops.find_one({'geoid': cbsa})
        if pre_search is None or 'B15002' not in pre_search['tables']:
            print("Error: could not find table B15002 for city %s in db." % city)
            continue

        # Materialise the cursor once; an empty list replaces the separate
        # count() round trip.
        srch = list(props.find({'$and': [
            {'metroid': cbsa},
            {'geoid': {'$nin': [cbsa]}}
        ]
        }))
        if not srch:
            print("Error: could not find city %s in db." % city)
            continue

        # Drop the first record whose geoid contains the CBSA code.
        # NOTE(review): the query above already excludes geoid == cbsa, so
        # this looks like a leftover safety net — confirm before removing.
        for i, r in enumerate(srch):
            if cbsa in r['geoid']:
                del srch[i]
                break

        df = Table15002(pd.DataFrame(srch))

        #######################
        # Scatter plot (figure 1):

        f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8, 8))

        # ------------------------
        # Subplot 1: male (c2) and female (c1) variance against the mean

        # Title with the city being plotted, not always the first one.
        ax1.set_title(city)

        xcode = 'AvgEd'
        ycodes = ['VarEd_M', 'VarEd_F']
        for ycode, color in zip(ycodes, (c2, c1)):
            sns.regplot(df[xcode], df[ycode], color=color, ax=ax1, fit_reg=False)

        ax1.set_xlim((df[xcode].min(), df[xcode].max()))
        # Both series share these axes, so the limits must span both of
        # them; using only the last one clips the other.
        ax1.set_ylim((df[ycodes].min().min(), df[ycodes].max().max()))
        ax1.set_xlabel(xcode)
        # NOTE(review): the label names only the female series although the
        # male series is drawn too — kept as in the original; confirm.
        ax1.set_ylabel('VarEd_F')

        # ------------------------
        # Subplot 2: gender imbalance against the mean

        ycode = 'EdGenderImbalance'
        sns.regplot(df[xcode], df[ycode], color=c3, ax=ax2, fit_reg=False)

        ax2.set_xlim((df[xcode].min(), df[xcode].max()))
        ax2.set_ylim((df[ycode].min(), df[ycode].max()))
        ax2.set_xlabel(xcode)
        ax2.set_ylabel(ycode)

        #######################
        # Scatter plot (figure 2):

        f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(8, 8))

        ax1.set_title(city)

        # (variance column, matching mean column, colour)
        series = (('VarEd_M', 'AvgEd_M', c2),
                  ('VarEd_F', 'AvgEd_F', c1),
                  ('VarEd', 'AvgEd', c3))

        # ------------------------
        # Subplot 1: mean against variance

        for xcode, ycode, color in series:
            sns.regplot(df[xcode], df[ycode], color=color, ax=ax1, fit_reg=False)

        # ------------------------
        # Subplot 2: gender imbalance against variance

        for xcode, _, color in series:
            sns.regplot(df[xcode], df['EdGenderImbalance'], color=color, ax=ax2, fit_reg=False)

    plt.draw()
    plt.show()
def do_17001():
from Table17001 import Table17001
# Seaborn
sns.set_palette("deep", desat=.6)
@ -32,7 +208,6 @@ db = client['metros'] @@ -32,7 +208,6 @@ db = client['metros']
metaprops = db['PropertiesMeta']
props = db['Properties']
if do_univariate_kde:
for city in cities:
@ -58,7 +233,6 @@ if do_univariate_kde: @@ -58,7 +233,6 @@ if do_univariate_kde:
continue
df = pd.DataFrame([])
srch = list(mongo_search)
for i,r in enumerate(srch):
@ -69,8 +243,6 @@ if do_univariate_kde: @@ -69,8 +243,6 @@ if do_univariate_kde:
df = Table17001(df)
#######################
# Scatter plots:
@ -140,13 +312,18 @@ if do_univariate_kde: @@ -140,13 +312,18 @@ if do_univariate_kde:
ax2.set_xlabel(xlabel)
#ax2.set_ylabel('Pct of M/F Pre-Adol. Pop in Pov')
plt.draw()
plt.show()
plt.draw()
plt.show()
# Run the analysis only when this file is executed as a script.
if __name__=="__main__":
    main()

276
analysis/Table15002.py

@ -0,0 +1,276 @@ @@ -0,0 +1,276 @@
from pymongo import MongoClient
import numpy as np
import pandas as pd
def Table15002(df):
    """Add education-level statistics derived from the B15002 columns.

    Source fields:

    B15002001 : Total:
    B15002002 : Male:
    B15002003 : No schooling completed
    B15002004 : Nursery to 4th grade
    B15002005 : 5th and 6th grade
    B15002006 : 7th and 8th grade
    B15002007 : 9th grade
    B15002008 : 10th grade
    B15002009 : 11th grade
    B15002010 : 12th grade, no diploma
    B15002011 : High school graduate (includes equivalency)
    B15002012 : Some college, less than 1 year
    B15002013 : Some college, 1 or more years, no degree
    B15002014 : Associate's degree
    B15002015 : Bachelor's degree
    B15002016 : Master's degree
    B15002017 : Professional school degree
    B15002018 : Doctorate degree
    B15002019 : Female:
    B15002020 : No schooling completed
    B15002021 : Nursery to 4th grade
    B15002022 : 5th and 6th grade
    B15002023 : 7th and 8th grade
    B15002024 : 9th grade
    B15002025 : 10th grade
    B15002026 : 11th grade
    B15002027 : 12th grade, no diploma
    B15002028 : High school graduate (includes equivalency)
    B15002029 : Some college, less than 1 year
    B15002030 : Some college, 1 or more years, no degree
    B15002031 : Associate's degree
    B15002032 : Bachelor's degree
    B15002033 : Master's degree
    B15002034 : Professional school degree
    B15002035 : Doctorate degree

    The sixteen attainment levels are collapsed into five ordinal bins:

        1 = less than high school
        2 = high school, associates, some college
        3 = bachelors
        4 = masters
        5 = doctorate/professional

    Columns added, where ``<k>`` is the bin number 1..5:

    * ``M_EdCat<k>_Total``, ``F_EdCat<k>_Total``, ``EdCat<k>_Total`` and the
      three ``*EdCatAll_Total`` columns: head counts per bin / overall.
    * ``AvgEd_M``, ``AvgEd_F``, ``AvgEd``: population-weighted mean bin.
    * ``M_EdCat<k>_Pct``, ``F_EdCat<k>_Pct``, ``EdCat<k>_Pct``: bin count
      divided by the overall count (a fraction, not multiplied by 100).
    * ``VarEd_M``, ``VarEd_F``, ``VarEd``: population-weighted variance of
      the bin number around the corresponding mean.
    * ``EdGenderImbalance``: (female weighted sum - male weighted sum)
      divided by the total weighted sum, where a weighted sum is
      head count * mean bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain columns B15002003..B15002018 and B15002020..B15002035.
        The total and average columns are also written onto this object
        before the first ``fillna`` produces a new frame.

    Returns
    -------
    pandas.DataFrame
        Frame with the derived columns.  ``fillna(0)`` is applied to the
        whole frame, so NaN (e.g. from 0/0 on empty rows) becomes 0 in
        every column.
    """
    # Rows run from lowest to highest attainment.
    # column 1: male
    # column 2: female
    table_fields = np.array([['B15002003', 'B15002020'],  # No schooling completed
                             ['B15002004', 'B15002021'],  # Nursery to 4th grade
                             ['B15002005', 'B15002022'],  # 5th and 6th grade
                             ['B15002006', 'B15002023'],  # 7th and 8th grade
                             ['B15002007', 'B15002024'],  # 9th grade
                             ['B15002008', 'B15002025'],  # 10th grade
                             ['B15002009', 'B15002026'],  # 11th grade
                             ['B15002010', 'B15002027'],  # 12th grade, no diploma
                             ['B15002011', 'B15002028'],  # High school graduate (includes equivalency)
                             ['B15002012', 'B15002029'],  # Some college, less than 1 year
                             ['B15002013', 'B15002030'],  # Some college, 1 or more years, no degree
                             ['B15002014', 'B15002031'],  # Associate's degree
                             ['B15002015', 'B15002032'],  # Bachelor's degree
                             ['B15002016', 'B15002033'],  # Master's degree
                             ['B15002017', 'B15002034'],  # Professional school degree
                             ['B15002018', 'B15002035']   # Doctorate degree
                             ])

    # Bin number for each row of table_fields.
    edcat = [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 4, 5, 5]
    categories = list(range(1, 5 + 1))

    # (column prefix for totals/pcts, column suffix for AvgEd/VarEd)
    groups = (('M_', '_M'), ('F_', '_F'), ('', ''))

    # Group the raw field names by bin, separately for each sex.
    fields_m = dict((k, []) for k in categories)
    fields_f = dict((k, []) for k in categories)
    for (m_field, f_field), k in zip(table_fields, edcat):
        fields_m[k].append(m_field)
        fields_f[k].append(f_field)

    # ------------------
    # Head counts per bin and overall

    for k in categories:
        df['M_EdCat%d_Total' % (k)] = df[fields_m[k]].sum(axis=1)
        df['F_EdCat%d_Total' % (k)] = df[fields_f[k]].sum(axis=1)
        df['EdCat%d_Total' % (k)] = df[fields_m[k] + fields_f[k]].sum(axis=1)

    m_edcat_all = list(table_fields[:, 0])
    f_edcat_all = list(table_fields[:, 1])
    df['M_EdCatAll_Total'] = df[m_edcat_all].sum(axis=1)
    df['F_EdCatAll_Total'] = df[f_edcat_all].sum(axis=1)
    df['EdCatAll_Total'] = df[m_edcat_all + f_edcat_all].sum(axis=1)

    # ------------------
    # Population-weighted mean bin

    m_weighted = sum(k * df['M_EdCat%d_Total' % (k)] for k in categories)
    f_weighted = sum(k * df['F_EdCat%d_Total' % (k)] for k in categories)
    df['AvgEd_M'] = m_weighted / df['M_EdCatAll_Total']
    df['AvgEd_F'] = f_weighted / df['F_EdCatAll_Total']
    df['AvgEd'] = (m_weighted + f_weighted) / df['EdCatAll_Total']

    # Rows with no population divide 0 by 0; turn the resulting NaN into 0.
    df = df.fillna(0)

    # ------------------
    # Compute percent of pop
    # in each ed category

    for k in categories:
        for prefix, _ in groups:
            numerator = df['%sEdCat%d_Total' % (prefix, k)]
            denominator = df['%sEdCatAll_Total' % (prefix)]
            df['%sEdCat%d_Pct' % (prefix, k)] = numerator / denominator

    df = df.fillna(0)

    # ----------------------
    # Compute variance
    # in education levels

    for prefix, suffix in groups:
        mean = df['AvgEd' + suffix]
        weighted_sq = sum(df['%sEdCat%d_Total' % (prefix, k)] * (k - mean) ** 2
                          for k in categories)
        df['VarEd' + suffix] = weighted_sq / df['%sEdCatAll_Total' % (prefix)]

    df = df.fillna(0)

    # ------------------------------
    # Imbalance in education levels
    # relative to total population

    female_sum = df['F_EdCatAll_Total'] * df['AvgEd_F']
    male_sum = df['M_EdCatAll_Total'] * df['AvgEd_M']
    total_sum = df['EdCatAll_Total'] * df['AvgEd']

    # The whole product is the denominator.  Written without parentheses,
    # "/ total * avg" divides by the head count and then MULTIPLIES by the
    # mean, which is not a normalised difference.
    df['EdGenderImbalance'] = (female_sum - male_sum) / total_sum

    df = df.fillna(0)

    return df

55
analysis/Table25070.py

@ -0,0 +1,55 @@ @@ -0,0 +1,55 @@
from pymongo import MongoClient
import numpy as np
import pandas as pd
def Table25070(df):
    """Add rent-as-share-of-income fractions derived from the B25070 columns.

    B25070 : Gross Rent as a Percentage of Household Income in the Past 12 Months
    B25070001 : Total:
    B25070002 : Less than 10.0 percent
    B25070003 : 10.0 to 14.9 percent
    B25070004 : 15.0 to 19.9 percent
    B25070005 : 20.0 to 24.9 percent
    B25070006 : 25.0 to 29.9 percent
    B25070007 : 30.0 to 34.9 percent
    B25070008 : 35.0 to 39.9 percent
    B25070009 : 40.0 to 49.9 percent
    B25070010 : 50.0 percent or more
    B25070011 : Not computed
    B25070universe : Renter-occupied Housing Units
    B25070yr : 2013

    Columns added:

    * ``TotalRenters``: row sum of B25070002..B25070010.  The "Not computed"
      field B25070011 is not part of the sum.
    * ``Rent_<range>_Pct`` for each range label below: that bin's count
      divided by ``TotalRenters`` (a fraction, not multiplied by 100).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain columns B25070002..B25070010.  The new columns are
        also written onto this object before ``fillna`` produces a new frame.

    Returns
    -------
    pandas.DataFrame
        Frame with the derived columns.  ``fillna(0)`` is applied to the
        whole frame, so NaN (e.g. from 0/0 on rows without renters) becomes
        0 in every column.
    """
    # rent as pct of income
    table_fields = ['B25070002',  # <10
                    'B25070003',  # 10-15
                    'B25070004',  # 15-20
                    'B25070005',  # 20-25
                    'B25070006',  # 25-30
                    'B25070007',  # 30-35
                    'B25070008',  # 35-40
                    'B25070009',  # 40-50
                    'B25070010'   # 50+
                    ]

    # One label per entry of table_fields, in the same order.
    labels = ["0-10", "10-15", "15-20", "20-25", "25-30",
              "30-35", "35-40", "40-50", "50+"]

    df['TotalRenters'] = df[table_fields].sum(axis=1)

    # ----------------------
    # Compute percentages

    for colname, collab in zip(table_fields, labels):
        df["Rent_" + collab + "_Pct"] = df[colname] / df['TotalRenters']

    return df.fillna(0)
Loading…
Cancel
Save