#setup # -*- coding: utf-8 -*- import pandas as pd import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np %pylab inline pd.options.display.mpl_style = 'default' import time import urllib2 from urllib2 import urlopen import datetime import pandas as pd from HTMLParser import HTMLParser import re psqftZRI = 'https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/City_ZriPerSqft_AllHomes.csv' dpsf = pd.read_csv(psqftZRI) #re-organizing ZRI data df = dpsf.sort(['State', 'RegionName']).interpolate() df = df.drop('Metro',1).drop('CountyName',1) df = df.reset_index(level=1).drop('index',1) df.rename(columns={'RegionName':'Region'}, inplace=True) df['Location'] = df.Region.map(str) + ", " + df.State df = df.drop('Region',1,).drop('State',1) #moving Location to 0th position in the dataframe cols = df.columns.tolist() cols = cols[-1:] + cols[:-1] df = df[cols] #write resulting data to a csv file #df.to_csv('df.csv') nyc = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/nyc.csv') #cleaning NYC data nydf = nyc.sort(['State', 'RegionName']).interpolate() nydf = nydf.reset_index(level=1).drop('index',1) nydf.rename(columns={'RegionName':'Region'}, inplace=True) nydf['Location'] = nydf.Region.map(str) + ", " + nydf.State nydf = nydf.drop('Region',1,).drop('State',1) #moving Location to 0th position in the dataframe cols = nydf.columns.tolist() cols = cols[-1:] + cols[:-1] nydf = nydf[cols] #write resulting data to a csv file for further cleaning (transpose, remove spaces + commas) #nydf.to_csv('nydf.csv') #Appending New York City data nyc = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/nydf.csv') #open cleaned csv df = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/df.csv') nydf = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/nydf.csv') df = pd.merge(df,nydf,on='Month') #highestrent = df.max(axis=0) #lowestrent = df.min(axis=0) #averagerent = df.mean(axis=0) #exploring average rent data averageNewarkNJ = "Newark, " + str(df.NewarkNJ.mean(axis=0)) averageHobokenNJ = "Hoboken, " + str(df.HobokenNJ.mean(axis=0)) averageJerseyCityNJ = "Jersey City, " + str(df.JerseyCityNJ.mean(axis=0)) averageShortHillsNJ = "ShortHills, " + str(df.ShortHillsNJ.mean(axis=0)) averageNewYorkNY = "New York, " + str(df.NewYorkNY.mean(axis=0)) averages = [averageNewarkNJ, averageHobokenNJ,averageJerseyCityNJ,averageNewYorkNY] print averageNewarkNJ print averageHobokenNJ print averageJerseyCityNJ print averageNewYorkNY print averageShortHillsNJ #graph settings mpl.rc('lines', linewidth=5) mpl.rc('font', size=12) plt.figure(figsize=(15,5)) plt.ylabel('$ per Sq. ft', fontsize=20) plt.title('Median ZRI per sq. ft. ($)', fontsize=20) df.HobokenNJ.plot(label='Hoboken, NJ', alpha=.7, xlim=(0,46)) df.JerseyCityNJ.plot(label='Jersey City, NJ', alpha=.7, xlim=(0,46)) df.BridgewaterNJ.plot(label='Bridgewater, NJ', alpha=.7, xlim=(0,46)) df.NewarkNJ.plot(label='Newark, NJ', alpha=.7, xlim=(0,46)) df.BedminsterNJ.plot(label='Bedminster, NJ', alpha=.7, xlim=(0,46)) legend = plt.legend(loc=2, shadow=True, fontsize=10) legend.get_frame().set_facecolor('#eeeeee') #graph settings mpl.rc('lines', linewidth=5) mpl.rc('font', size=12) plt.figure(figsize=(15,5)) plt.ylabel('$ per Sq. ft', fontsize=20) plt.title('Median ZRI - Greater Somerville, NJ Area', fontsize=20) df.SomervilleNJ.plot(label='Somerville, NJ', alpha=.7, xlim=(0,46)) df.BridgewaterNJ.plot(label='Bridgewater, NJ', alpha=.7, xlim=(0,46)) df.RaritanNJ.plot(label='Raritan, NJ', alpha=.7, xlim=(0,46)) df.HillsboroughNJ.plot(label='Hillsborough, NJ', alpha=.7, xlim=(0,46)) legend = plt.legend(loc=2, shadow=True, fontsize=12) legend.get_frame().set_facecolor('#eeeeee') nj = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/njdf.csv') seats = nj[['Month','Belvidere','Bridgeton','Camden','CapeMayCourtHouse','Eastampton','Elizabeth','Flemington','FreeholdTownship','HamiltonTownship','HasbrouckHeights','JerseyCity','MaysLanding','Morristown','NewBrunswick','Newark','Newton','Paterson','Somerville','TomsRiver','Trenton','Woodbury']] #KeyError: "['FreeholdBorough' 'Hackensack' 'MountHolly' 'Salem'] not in index" #Freehold Township, Hasbrouck Heights, Eastampton used as substitutes seats = seats.set_index('Month') #seats.count(axis=1) #with 18 cities, we can slice 'seats' into 3 line graphs with 6 cities in each. seats1 = seats.iloc[:,[0,1,2,3,4,5]] seats2 = seats.iloc[:,[6,7,8,9,10,11]] seats3 = seats.iloc[:,[12,13,14,15,16,17]] # #Ideally the 3 lists should be ordered by average median ZRI #but for the sake of simplicity, they're left as it - alphabetically. # #graph settings mpl.rc('lines', linewidth=5) mpl.rc('font', size=12) seats1.plot(figsize=(15,5), xlim=(0,46)) seats2.plot(figsize=(15,5), xlim=(0,46)) seats3.plot(figsize=(15,5), xlim=(0,46)) means = seats.mean() print means mpl.rc('lines', linewidth=0) mpl.rc('font', size=12) plt.ylabel('Average $ per Sq. ft', fontsize=20) plt.title('Mean-Median ZRI(M-MZRI)', fontsize=20) plt.tick_params(which='both', width=0, length=0) means.plot(kind='bar', figsize=(10, 5), color='#333333', ylim=(0,2.3), grid=(False)) mpl.rc('lines', linewidth=0) maxs = (seats.max()) plt.ylabel('Max $ per Sq. ft', fontsize=20) plt.title('Maximum Median ZRI - New Jersey County Seats', fontsize=20) plt.colors() means.plot(kind='bar', figsize=(10,5), color='#333333', grid=False) nj.head() top = nj.max() sort = top.order(ascending=False) sort.head(11) mpl.rc('lines', linewidth=5) top = nj[['Month', 'Hoboken', 'JerseyCity', 'SeaIsleCity', 'PortReading', 'UnionCity', 'Edgewater', 'Guttenberg', 'ShortHills', 'ChathamTownship', 'SeaBright']] top.plot(figsize=(20,10), x='Month', xlim=(0,46)) plt.ylabel('$ per Sq. ft', fontsize=20) plt.title('Top 10 Median ZRI Prices in New Jersey', fontsize=20) plt.legend(loc=2) # from http://www.usa.com/rank/new-jersey-state--population-density--city-rank.htm, parse, munge, write to df url = 'http://www.usa.com/rank/new-jersey-state--population-density--city-rank.htm' sourceCode = urllib2.urlopen(url).read() #defining split parameters topSplit = 'City / Population' bottomSplit = '
Please' sourceCodeSplit = sourceCode.split(topSplit)[1].split(bottomSplit)[0] content = sourceCodeSplit.split('\n') content #ranks = [] cities = [] densities = [] pops = [] for x in content: #rank = re.findall(r'