4 changed files with 761 additions and 40 deletions
@ -0,0 +1,213 @@
@@ -0,0 +1,213 @@
|
||||
from pymongo import MongoClient |
||||
|
||||
import numpy as np |
||||
import pandas as pd |
||||
|
||||
import matplotlib.pyplot as plt |
||||
import seaborn as sns |
||||
|
||||
import statsmodels.api as sm |
||||
from scipy import stats |
||||
|
||||
import metro as mt |
||||
|
||||
|
||||
|
||||
# Plot toggle.
# NOTE(review): not referenced by any function visible in this file - confirm
# whether it is still needed.
do_univariate_kde = True

# Metro areas to process; each name is resolved to a CBSA code via
# mt.CBSACode() inside the do_* functions.
cities = ["Seattle"]
||||
|
||||
|
||||
def main():
    """Script entry point: draw the B25070 (rent vs. income) bar plots."""
    do_25070()
||||
|
||||
|
||||
def do_25070():
    """Plot rent-as-share-of-income bars (table B25070) for every city.

    For each name in the module-level ``cities`` list this:

    1. resolves the city to a CBSA code with ``mt.CBSACode``,
    2. checks the ``PropertiesMeta`` collection of the local ``metros``
       Mongo database for table ``B25070``,
    3. loads every ``Properties`` record of that metro whose ``geoid`` is
       not the CBSA code itself,
    4. derives the ``Rent_<bin>_Pct`` columns with ``Table25070``, and
    5. draws one seaborn bar plot with one bar per rent bin.

    Cities with no metadata, no B25070 table, or no records are reported
    on stdout and skipped.  Returns nothing.
    """
    from Table25070 import Table25070

    # Seaborn
    sns.set_palette("deep", desat=.6)
    sns.set_context(rc={"figure.figsize": (8, 4)})

    # Mongo
    client = MongoClient()
    db = client['metros']
    metaprops = db['PropertiesMeta']
    props = db['Properties']

    # Rent bins, kept in an explicit list so the bars always appear in
    # ascending order (a plain dict gives arbitrary order on Python 2).
    labels = ["0-10", "10-15", "15-20", "20-25", "25-30",
              "30-35", "35-40", "40-50", "50+"]
    columns = ["Rent_" + lab + "_Pct" for lab in labels]

    for city in cities:

        #########################
        # Mongo lookup

        cbsa = mt.CBSACode(city)[0]

        # find_one() returns None when the metro has no metadata record.
        pre_search = metaprops.find_one({'geoid': cbsa})
        if pre_search is None or 'B25070' not in pre_search.get('tables', ()):
            print("Error: could not find table B25070 for city %s in db." % city)
            continue

        # Materialise the cursor once; an empty list replaces the
        # deprecated Cursor.count() == 0 check.
        srch = list(props.find({'$and': [
            {'metroid': cbsa},
            {'geoid': {'$nin': [cbsa]}}
        ]}))
        if not srch:
            print("Error: could not find city %s in db." % city)
            continue

        # Drop the first record whose geoid contains the CBSA code
        # (presumably a metro-level summary row - TODO confirm).
        for i, r in enumerate(srch):
            if cbsa in r['geoid']:
                del srch[i]
                break

        df = Table25070(pd.DataFrame(srch))

        #######################
        # Bar plot:

        _fig, ax1 = plt.subplots(1, 1, sharex=True, figsize=(12, 6))

        # Title with the city being plotted, not always the first one.
        ax1.set_title(city)

        xx = np.array(columns)
        yy = np.array([df[col].values for col in columns])
        sns.barplot(xx, yy, palette="Set1", ax=ax1)

        plt.show()
        plt.draw()
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def do_15002():
    """Plot education-level shares (table B15002) for every city.

    For each name in the module-level ``cities`` list this:

    1. resolves the city to a CBSA code with ``mt.CBSACode``,
    2. checks the ``PropertiesMeta`` collection of the local ``metros``
       Mongo database for table ``B15002``,
    3. loads every ``Properties`` record of that metro whose ``geoid`` is
       not the CBSA code itself,
    4. derives the ``EdCat<n>_Pct`` columns with ``Table15002``, and
    5. draws one seaborn bar plot with one bar per education bin (1-5).

    Cities with no metadata, no B15002 table, or no records are reported
    on stdout and skipped.  Returns nothing.
    """
    from Table15002 import Table15002

    # Seaborn
    sns.set_palette("deep", desat=.6)
    sns.set_context(rc={"figure.figsize": (8, 4)})

    # Mongo
    client = MongoClient()
    db = client['metros']
    metaprops = db['PropertiesMeta']
    props = db['Properties']

    # One colour per education bin.
    colors = ["windows blue", "amber", "greyish", "faded green", "dusty purple"]
    xcp = sns.xkcd_palette(colors)

    # Education bins, kept in an explicit list so the bars always appear
    # in ascending order (a plain dict gives arbitrary order on Python 2).
    bins = range(1, 5 + 1)

    for city in cities:

        #########################
        # Mongo lookup

        cbsa = mt.CBSACode(city)[0]

        # find_one() returns None when the metro has no metadata record.
        pre_search = metaprops.find_one({'geoid': cbsa})
        if pre_search is None or 'B15002' not in pre_search.get('tables', ()):
            print("Error: could not find table B15002 for city %s in db." % city)
            continue

        # Materialise the cursor once; an empty list replaces the
        # deprecated Cursor.count() == 0 check.
        srch = list(props.find({'$and': [
            {'metroid': cbsa},
            {'geoid': {'$nin': [cbsa]}}
        ]}))
        if not srch:
            print("Error: could not find city %s in db." % city)
            continue

        # Drop the first record whose geoid contains the CBSA code
        # (presumably a metro-level summary row - TODO confirm).
        for i, r in enumerate(srch):
            if cbsa in r['geoid']:
                del srch[i]
                break

        df = Table15002(pd.DataFrame(srch))

        #######################
        # Bar plot:

        _fig, ax1 = plt.subplots(1, 1, sharex=True, figsize=(8, 8))

        # Title with the city being plotted, not always the first one.
        ax1.set_title(city)

        xx = np.array([str(i) for i in bins])
        yy = np.array([df["EdCat%d_Pct" % i].values for i in bins])
        sns.barplot(xx, yy, palette=xcp, ax=ax1)

        plt.show()
        plt.draw()

    # Final refresh, kept from the original flow.
    plt.draw()
    plt.show()
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Run the plots only when this file is executed as a script.
if __name__ == "__main__":
    main()
||||
|
||||
|
@ -0,0 +1,276 @@
@@ -0,0 +1,276 @@
|
||||
from pymongo import MongoClient |
||||
|
||||
import numpy as np |
||||
import pandas as pd |
||||
|
||||
def Table15002(df):
    """Add education-attainment summary columns derived from table B15002.

    ``df`` must hold the raw male fields ``B15002003``-``B15002018`` and
    female fields ``B15002020``-``B15002035`` (one row per geography).
    The sixteen attainment levels are collapsed into five bins:

        1 = less than high school
        2 = high school, associates, some college
        3 = bachelors
        4 = masters
        5 = doctorate/professional

    Columns added (``M_`` = male, ``F_`` = female, no prefix = both):

    * ``[M_|F_]EdCat<n>_Total``  - head count in bin ``n``
    * ``[M_|F_]EdCatAll_Total``  - head count over all bins
    * ``AvgEd_M``, ``AvgEd_F``, ``AvgEd`` - count-weighted mean bin number
    * ``[M_|F_]EdCat<n>_Pct``    - share of the head count in bin ``n``
    * ``VarEd_M``, ``VarEd_F``, ``VarEd`` - count-weighted variance of the
      bin number
    * ``EdGenderImbalance`` - (female weighted count - male weighted count)
      divided by the total weighted count, where "weighted count" is head
      count times mean bin number; for non-negative counts it lies in
      [-1, 1]

    Every NaN in the frame (e.g. 0/0 for empty geographies) is replaced
    by 0.

    Returns the augmented DataFrame.  Note that the ``*_Total`` and
    ``AvgEd*`` columns are also written into the frame passed in, because
    they are assigned before the first ``fillna`` creates a new frame.
    """
    # (male field, female field, education bin) for each attainment level
    levels = [
        ('B15002003', 'B15002020', 1),  # No schooling completed
        ('B15002004', 'B15002021', 1),  # Nursery to 4th grade
        ('B15002005', 'B15002022', 1),  # 5th and 6th grade
        ('B15002006', 'B15002023', 1),  # 7th and 8th grade
        ('B15002007', 'B15002024', 1),  # 9th grade
        ('B15002008', 'B15002025', 1),  # 10th grade
        ('B15002009', 'B15002026', 1),  # 11th grade
        ('B15002010', 'B15002027', 1),  # 12th grade, no diploma
        ('B15002011', 'B15002028', 2),  # High school graduate (includes equivalency)
        ('B15002012', 'B15002029', 2),  # Some college, less than 1 year
        ('B15002013', 'B15002030', 2),  # Some college, 1 or more years, no degree
        ('B15002014', 'B15002031', 2),  # Associate's degree
        ('B15002015', 'B15002032', 3),  # Bachelor's degree
        ('B15002016', 'B15002033', 4),  # Master's degree
        ('B15002017', 'B15002034', 5),  # Professional school degree
        ('B15002018', 'B15002035', 5),  # Doctorate degree
    ]
    bins = range(1, 5 + 1)

    # (column prefix, mean column, variance column) per population group
    groups = [
        ('M_', 'AvgEd_M', 'VarEd_M'),
        ('F_', 'AvgEd_F', 'VarEd_F'),
        ('', 'AvgEd', 'VarEd'),
    ]

    # -----------------------------
    # Head counts per bin and overall

    for b in bins:
        male = [m for m, _, cat in levels if cat == b]
        female = [f for _, f, cat in levels if cat == b]
        df['M_EdCat%d_Total' % b] = df[male].sum(axis=1)
        df['F_EdCat%d_Total' % b] = df[female].sum(axis=1)
        df['EdCat%d_Total' % b] = df[male + female].sum(axis=1)

    male_all = [m for m, _, _ in levels]
    female_all = [f for _, f, _ in levels]
    df['M_EdCatAll_Total'] = df[male_all].sum(axis=1)
    df['F_EdCatAll_Total'] = df[female_all].sum(axis=1)
    df['EdCatAll_Total'] = df[male_all + female_all].sum(axis=1)

    # -----------------------------
    # Count-weighted mean bin number

    for prefix, avg_col, _ in groups:
        weighted = sum(b * df['%sEdCat%d_Total' % (prefix, b)] for b in bins)
        df[avg_col] = weighted / df['%sEdCatAll_Total' % prefix]

    # Empty geographies give 0/0; zero them before the means are reused.
    df = df.fillna(0)

    # ------------------
    # Compute percent of pop
    # in each ed category

    for b in bins:
        for prefix, _, _ in groups:
            df['%sEdCat%d_Pct' % (prefix, b)] = (
                df['%sEdCat%d_Total' % (prefix, b)]
                / df['%sEdCatAll_Total' % prefix])

    df = df.fillna(0)

    # ----------------------
    # Compute variance
    # in education levels

    for prefix, avg_col, var_col in groups:
        mean = df[avg_col]
        spread = sum(df['%sEdCat%d_Total' % (prefix, b)] * (b - mean) ** 2
                     for b in bins)
        df[var_col] = spread / df['%sEdCatAll_Total' % prefix]

    df = df.fillna(0)

    # ------------------------------
    # Imbalance in education levels
    # relative to total population

    female_weight = df['F_EdCatAll_Total'] * df['AvgEd_F']
    male_weight = df['M_EdCatAll_Total'] * df['AvgEd_M']
    total_weight = df['EdCatAll_Total'] * df['AvgEd']

    # The whole product is the denominator; without the parentheses the
    # expression divided by the head count and then multiplied by AvgEd.
    df['EdGenderImbalance'] = (female_weight - male_weight) / total_weight

    return df.fillna(0)
||||
|
@ -0,0 +1,55 @@
@@ -0,0 +1,55 @@
|
||||
from pymongo import MongoClient |
||||
|
||||
import numpy as np |
||||
import pandas as pd |
||||
|
||||
|
||||
|
||||
def Table25070(df):
    """Add rent-burden share columns derived from table B25070.

    B25070 is "Gross Rent as a Percentage of Household Income in the Past
    12 Months" (universe: renter-occupied housing units).  ``df`` must
    hold the raw count fields ``B25070002``-``B25070010``.

    Columns written into ``df``:

    * ``TotalRenters``    - sum of the nine bin counts; ``B25070011``
      ("Not computed") is not part of this sum
    * ``Rent_<bin>_Pct``  - each bin count divided by ``TotalRenters``

    Returns a copy of the augmented frame in which every NaN (e.g. 0/0
    for geographies with no renters) is replaced by 0.
    """
    # (raw count field, label used in the output column name)
    rent_bins = [
        ('B25070002', "0-10"),   # Less than 10.0 percent
        ('B25070003', "10-15"),  # 10.0 to 14.9 percent
        ('B25070004', "15-20"),  # 15.0 to 19.9 percent
        ('B25070005', "20-25"),  # 20.0 to 24.9 percent
        ('B25070006', "25-30"),  # 25.0 to 29.9 percent
        ('B25070007', "30-35"),  # 30.0 to 34.9 percent
        ('B25070008', "35-40"),  # 35.0 to 39.9 percent
        ('B25070009', "40-50"),  # 40.0 to 49.9 percent
        ('B25070010', "50+"),    # 50.0 percent or more
    ]

    count_fields = [field for field, _ in rent_bins]
    df['TotalRenters'] = df[count_fields].sum(axis=1)

    # ----------------------
    # Compute percentages
    for field, tag in rent_bins:
        df["Rent_" + tag + "_Pct"] = df[field] / df['TotalRenters']

    return df.fillna(0)
||||
|
||||
|
Loading…
Reference in new issue