#setup # -*- coding: utf-8 -*- import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np %pylab inline pd.options.display.mpl_style = 'default' import time import urllib2 from urllib2 import urlopen import datetime import pandas as pd from HTMLParser import HTMLParser import re psqftZRI = 'https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/City_ZriPerSqft_AllHomes.csv' dpsf = pd.read_csv(psqftZRI) #re-organizing ZRI data df = dpsf.sort(['State', 'RegionName']).interpolate() df = df.drop('Metro',1).drop('CountyName',1) df = df.reset_index(level=1).drop('index',1) df.rename(columns={'RegionName':'Region'}, inplace=True) df['Location'] = df.Region.map(str) + ", " + df.State df = df.drop('Region',1,).drop('State',1) #moving Location to 0th position in the dataframe cols = df.columns.tolist() cols = cols[-1:] + cols[:-1] df = df[cols] #write resulting data to a csv file #df.to_csv('df.csv') nyc = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/nyc.csv') #cleaning NYC data nydf = nyc.sort(['State', 'RegionName']).interpolate() nydf = nydf.reset_index(level=1).drop('index',1) nydf.rename(columns={'RegionName':'Region'}, inplace=True) nydf['Location'] = nydf.Region.map(str) + ", " + nydf.State nydf = nydf.drop('Region',1,).drop('State',1) #moving Location to 0th position in the dataframe cols = nydf.columns.tolist() cols = cols[-1:] + cols[:-1] nydf = nydf[cols] #write resulting data to a csv file for further cleaning (transpose, remove spaces + commas) #nydf.to_csv('nydf.csv') #Appending New York City data nyc = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/nydf.csv') #open cleaned csv df = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/df.csv') nydf = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/nydf.csv') df = pd.merge(df,nydf,on='Month') #highestrent = df.max(axis=0) #lowestrent = df.min(axis=0) #averagerent = df.mean(axis=0) #exploring average rent data averageNewarkNJ = "Newark, " + str(df.NewarkNJ.mean(axis=0)) averageHobokenNJ = "Hoboken, " + str(df.HobokenNJ.mean(axis=0)) averageJerseyCityNJ = "Jersey City, " + str(df.JerseyCityNJ.mean(axis=0)) averageShortHillsNJ = "ShortHills, " + str(df.ShortHillsNJ.mean(axis=0)) averageNewYorkNY = "New York, " + str(df.NewYorkNY.mean(axis=0)) averages = [averageNewarkNJ, averageHobokenNJ,averageJerseyCityNJ,averageNewYorkNY] print averageNewarkNJ print averageHobokenNJ print averageJerseyCityNJ print averageNewYorkNY print averageShortHillsNJ #graph settings mpl.rc('lines', linewidth=5) mpl.rc('font', size=12) plt.figure(figsize=(15,5)) plt.ylabel('$ per Sq. ft', fontsize=20) plt.title('Median ZRI per sq. ft. ($)', fontsize=20) df.HobokenNJ.plot(label='Hoboken, NJ', alpha=.7, xlim=(0,46)) df.JerseyCityNJ.plot(label='Jersey City, NJ', alpha=.7, xlim=(0,46)) df.BridgewaterNJ.plot(label='Bridgewater, NJ', alpha=.7, xlim=(0,46)) df.NewarkNJ.plot(label='Newark, NJ', alpha=.7, xlim=(0,46)) df.BedminsterNJ.plot(label='Bedminster, NJ', alpha=.7, xlim=(0,46)) legend = plt.legend(loc=2, shadow=True, fontsize=10) legend.get_frame().set_facecolor('#eeeeee') #graph settings mpl.rc('lines', linewidth=5) mpl.rc('font', size=12) plt.figure(figsize=(15,5)) plt.ylabel('$ per Sq. ft', fontsize=20) plt.title('Median ZRI - Greater Somerville, NJ Area', fontsize=20) df.SomervilleNJ.plot(label='Somerville, NJ', alpha=.7, xlim=(0,46)) df.BridgewaterNJ.plot(label='Bridgewater, NJ', alpha=.7, xlim=(0,46)) df.RaritanNJ.plot(label='Raritan, NJ', alpha=.7, xlim=(0,46)) df.HillsboroughNJ.plot(label='Hillsborough, NJ', alpha=.7, xlim=(0,46)) legend = plt.legend(loc=2, shadow=True, fontsize=12) legend.get_frame().set_facecolor('#eeeeee') nj = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/njdf.csv') seats = nj[['Month','Belvidere','Bridgeton','Camden','CapeMayCourtHouse','Eastampton','Elizabeth','Flemington','FreeholdTownship','HamiltonTownship','HasbrouckHeights','JerseyCity','MaysLanding','Morristown','NewBrunswick','Newark','Newton','Paterson','Somerville','TomsRiver','Trenton','Woodbury']] #KeyError: "['FreeholdBorough' 'Hackensack' 'MountHolly' 'Salem'] not in index" #Freehold Township, Hasbrouck Heights, Eastampton used as substitutes seats = seats.set_index('Month') #seats.count(axis=1) #with 18 cities, we can slice 'seats' into 3 line graphs with 6 cities in each. seats1 = seats.iloc[:,[0,1,2,3,4,5]] seats2 = seats.iloc[:,[6,7,8,9,10,11]] seats3 = seats.iloc[:,[12,13,14,15,16,17]] # #Ideally the 3 lists should be ordered by average median ZRI #but for the sake of simplicity, they're left as it - alphabetically. # #graph settings mpl.rc('lines', linewidth=5) mpl.rc('font', size=12) seats1.plot(figsize=(15,5), xlim=(0,46)) seats2.plot(figsize=(15,5), xlim=(0,46)) seats3.plot(figsize=(15,5), xlim=(0,46)) means = seats.mean() print means mpl.rc('lines', linewidth=0) mpl.rc('font', size=12) plt.ylabel('Average $ per Sq. ft', fontsize=20) plt.title('Mean-Median ZRI(M-MZRI)', fontsize=20) plt.tick_params(which='both', width=0, length=0) means.plot(kind='bar', figsize=(10, 5), color='#333333', ylim=(0,2.3), grid=(False)) mpl.rc('lines', linewidth=0) maxs = (seats.max()) plt.ylabel('Max $ per Sq. ft', fontsize=20) plt.title('Maximum Median ZRI - New Jersey County Seats', fontsize=20) plt.colors() means.plot(kind='bar', figsize=(10,5), color='#333333', grid=False) nj.head() top = nj.max() sort = top.order(ascending=False) sort.head(11) mpl.rc('lines', linewidth=5) top = nj[['Month', 'Hoboken', 'JerseyCity', 'SeaIsleCity', 'PortReading', 'UnionCity', 'Edgewater', 'Guttenberg', 'ShortHills', 'ChathamTownship', 'SeaBright']] top.plot(figsize=(20,10), x='Month', xlim=(0,46)) plt.ylabel('$ per Sq. ft', fontsize=20) plt.title('Top 10 Median ZRI Prices in New Jersey', fontsize=20) plt.legend(loc=2) # from http://www.usa.com/rank/new-jersey-state--population-density--city-rank.htm, parse, munge, write to df url = 'http://www.usa.com/rank/new-jersey-state--population-density--city-rank.htm' sourceCode = urllib2.urlopen(url).read() #defining split parameters topSplit = 'City / Population' bottomSplit = '

Please' sourceCodeSplit = sourceCode.split(topSplit)[1].split(bottomSplit)[0] content = sourceCodeSplit.split('\n') content #ranks = [] cities = [] densities = [] pops = [] for x in content: #rank = re.findall(r'(.*?)', x) city = re.findall(r'">(.*?)', x) density = re.findall(r'(.*?)/sq', x) pop = re.findall(r' / (.*?)', x) #for i in rank: # ranks.append(i) for i in city: cities.append(i) for i in density: densities.append(i) for i in pop: pops.append(i) populations = [] for i in pops: i = i.replace(',', '') i = int(i) populations.append(i) popdensities = [] for i in densities: i = i.replace(',', '') i = re.findall(r'(.*?)\.', i) i = map(int, i) popdensities.append(i) p = pd.DataFrame(populations, cities) p = p.reset_index() p.columns = ['City', 'Population'] p.to_csv('populations.csv') pddf = pd.DataFrame(popdensities, cities) pddf = pddf.reset_index() pddf.columns = ['City','Pop.Density'] pddf.to_csv('pddfs.csv') pddfs = pd.read_csv('pddfs.csv') pddfs = pddfs.drop('Unnamed: 0', axis=1) City0 = [] for i in pddfs['City']: i = i.replace(' ', '')[:-3] City0.append(i) Population0 = [] for i in p['City']: i = i.replace(' ', '')[:-3] Population0.append(i) pddfs['City'] = City0 p['City'] = Population0 pddfs['Population'] = p['Population'] merging = [] for i in pddfs['City']: if i in nj: merging.append(i) nj[merging].head() meanmerge = nj[merging].mean() meanmerge.head() merger = pd.DataFrame(meanmerge) merger = merger.reset_index() merger.columns = ['City', 'M-MZRI'] merger.head() merged = pd.merge(merger, pddfs, on='City') merged.head() merged.plot(kind='scatter', x='Pop.Density', y='M-MZRI', figsize=(15,5), s=(merged['Population'])**1.5/(merged['Population'].mean()), c=merged['Pop.Density'], alpha=.75, xlim=(0,60000), ylim=(0.5,3)) merged['Population'].mean() hi_pop_den = {'UnionCity':51810.2, 'Guttenberg':46128.3, 'WestNewYork':37379.0, 'CliffsidePark':24508.7, 'Passaic':21512.2, 'Paterson':16796.2, 'Fairview':16400.6, 'EastOrange':16377.1 } hpd = pd.DataFrame.from_dict(hi_pop_den.items()) hpd.columns = ['City', 'Pop.Density'] hpd top_pop_den = nj[[ 'UnionCity', 'Guttenberg', 'WestNewYork', 'CliffsidePark', 'Passaic', 'Paterson', 'Fairview', 'EastOrange']] tpd_mean = top_pop_den.mean() tpd_mean = pd.DataFrame(tpd_mean, columns=['M-MZRI']) tpd_mean = tpd_mean.reset_index() tpd_mean.columns = ['City', 'M-MZRI'] tpd = pd.merge(tpd_mean,hpd,on='City') tpd # pop.den > 15,000/sq.mi low_pop_den = {'EstellManor':31.5, 'Folsom':223.4, 'Woodbine':308.2, 'Hope':317.0, 'LongValley':406.4, 'HolidayHeights':424.6, 'Ringwood':434.0, 'Peapack':441.1 } lpd = pd.DataFrame.from_dict(low_pop_den.items()) lpd.columns = ['City', 'Pop.Density'] lpd bot_pop_den = nj[[ 'EstellManor', 'Folsom', 'Woodbine', 'Hope', 'LongValley', 'HolidayHeights', 'Ringwood', 'Peapack' ]] bpd_mean = bot_pop_den.mean() bpd_mean = pd.DataFrame(bpd_mean, columns=['M-MZRI']) bpd_mean = bpd_mean.reset_index() bpd_mean.columns = ['City', 'M-MZRI'] bpd = pd.merge(bpd_mean,lpd,on='City') bpd #pop.den < 500/sq.mi mpl.rc('font', size=12) tpd.plot(kind='scatter', x='Pop.Density', y='M-MZRI', s=200, figsize=(15,5), title='Mean-Median ZRI in High-Population Density Areas') print tpd bpd.plot(kind='scatter', x='Pop.Density', y='M-MZRI', s=200, figsize=(15,5), title='Mean-Median ZRI in Low-Population Density Areas') print bpd pdd = tpd.append(bpd) pdd pdd.plot(kind='scatter', x='Pop.Density', y='M-MZRI', xlim=(0,60000), figsize=(15,5), s=100, title='Top and Bottom 7 Places in NJ by Population Density') print pdd bot_pop_den.plot(figsize=(25,10), linewidth=5, xlim=(0,46)) #nj.mean().plot(figsize=(10,5), kind='kde', xlim=(0,3.5), ylim=(0,2.1), color='#333333', linewidth=5) top_mean = top_pop_den.mean() bot_mean = bot_pop_den.mean() top_mean top_mean_df = pd.DataFrame(top_mean, columns=['MM-ZRI']) top_mean_df = top_mean_df.reset_index() top_mean_df.columns = ['City', 'MM-ZRI'] top_mean_df bot_mean_df = pd.DataFrame(bot_mean, columns=['MM-ZRI']) bot_mean_df = bot_mean_df.reset_index() bot_mean_df.columns = ['City', 'MM-ZRI'] bot_mean_df top_mean_df.plot(kind='barh', x='City', figsize=(10,5)) bot_mean_df.plot(kind='barh', x='City', figsize=(10,5)) njue = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/njue.csv') njue = njue.set_index('Month') njue.describe() njue.plot(kind='area', figsize=(15,5), ylim=(0,12), alpha=.5) plt.title('New Jersey Unemployment Rates 2011 - 2014',fontsize=20) plt.ylabel('Unemployment Rate', fontsize=15) plt.show() print njue[18:28] #graph settings njue.plot(figsize=(15,5), kind='area', linewidth=5, xlim=(0,46), alpha=.5) nj.Hoboken.plot(figsize=(15,5), kind='area', linewidth=5, alpha=.5, color='#111111') nj.JerseyCity.plot(figsize=(15,5), kind='area', linewidth=5, alpha=.5, color='#aaaaaa') legend = plt.legend(loc=1, shadow=True, fontsize=12) legend.get_frame().set_facecolor('#eeeeee') plt.title('New Jersey Unemployment Rate (%) vs. Median ZRI ($/sq.ft)', fontsize=20) plt.ylabel('$/sq.ft', fontsize=15) plt.xlim(0,46) plt.ylim(0,12) plt.show()