#setup
# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
%pylab inline

pd.options.display.mpl_style = 'default'

import time
import urllib2
from urllib2 import urlopen
import datetime
import pandas as pd
from HTMLParser import HTMLParser
import re

psqftZRI = 'https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/City_ZriPerSqft_AllHomes.csv'
dpsf = pd.read_csv(psqftZRI)

#re-organizing ZRI data
df = dpsf.sort(['State', 'RegionName']).interpolate()
df = df.drop('Metro',1).drop('CountyName',1)
df = df.reset_index(level=1).drop('index',1)
df.rename(columns={'RegionName':'Region'}, inplace=True)
df['Location'] = df.Region.map(str) + ", " + df.State
df = df.drop('Region',1,).drop('State',1)

#moving Location to 0th position in the dataframe
cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]

#write resulting data to a csv file
#df.to_csv('df.csv')

nyc = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/nyc.csv')

#cleaning NYC data
nydf = nyc.sort(['State', 'RegionName']).interpolate()
nydf = nydf.reset_index(level=1).drop('index',1)
nydf.rename(columns={'RegionName':'Region'}, inplace=True)
nydf['Location'] = nydf.Region.map(str) + ", " + nydf.State
nydf = nydf.drop('Region',1,).drop('State',1)

#moving Location to 0th position in the dataframe
cols = nydf.columns.tolist()
cols = cols[-1:] + cols[:-1]
nydf = nydf[cols]

#write resulting data to a csv file for further cleaning (transpose, remove spaces + commas)
#nydf.to_csv('nydf.csv')

#Appending New York City data
nyc = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/nydf.csv')

#open cleaned csv
df = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/df.csv')
nydf = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/nydf.csv')
df = pd.merge(df,nydf,on='Month')

#highestrent = df.max(axis=0)
#lowestrent = df.min(axis=0)
#averagerent = df.mean(axis=0)

#exploring average rent data

averageNewarkNJ = "Newark, " + str(df.NewarkNJ.mean(axis=0))
averageHobokenNJ = "Hoboken, " + str(df.HobokenNJ.mean(axis=0))
averageJerseyCityNJ = "Jersey City, " + str(df.JerseyCityNJ.mean(axis=0))
averageShortHillsNJ = "ShortHills, " + str(df.ShortHillsNJ.mean(axis=0))
averageNewYorkNY = "New York, " + str(df.NewYorkNY.mean(axis=0))

averages = [averageNewarkNJ, averageHobokenNJ,averageJerseyCityNJ,averageNewYorkNY]

print averageNewarkNJ
print averageHobokenNJ
print averageJerseyCityNJ
print averageNewYorkNY
print averageShortHillsNJ

#graph settings

mpl.rc('lines', linewidth=5)
mpl.rc('font', size=12)

plt.figure(figsize=(15,5))
plt.ylabel('$ per Sq. ft', fontsize=20)
plt.title('Median ZRI per sq. ft. ($)', fontsize=20)

df.HobokenNJ.plot(label='Hoboken, NJ', alpha=.7, xlim=(0,46))
df.JerseyCityNJ.plot(label='Jersey City, NJ', alpha=.7, xlim=(0,46))
df.BridgewaterNJ.plot(label='Bridgewater, NJ', alpha=.7, xlim=(0,46))
df.NewarkNJ.plot(label='Newark, NJ', alpha=.7, xlim=(0,46))
df.BedminsterNJ.plot(label='Bedminster, NJ', alpha=.7, xlim=(0,46))

legend = plt.legend(loc=2, shadow=True, fontsize=10)
legend.get_frame().set_facecolor('#eeeeee')


#graph settings

mpl.rc('lines', linewidth=5)
mpl.rc('font', size=12)

plt.figure(figsize=(15,5))

plt.ylabel('$ per Sq. ft', fontsize=20)
plt.title('Median ZRI - Greater Somerville, NJ Area', fontsize=20)

df.SomervilleNJ.plot(label='Somerville, NJ', alpha=.7, xlim=(0,46))
df.BridgewaterNJ.plot(label='Bridgewater, NJ', alpha=.7, xlim=(0,46))
df.RaritanNJ.plot(label='Raritan, NJ', alpha=.7, xlim=(0,46))
df.HillsboroughNJ.plot(label='Hillsborough, NJ', alpha=.7, xlim=(0,46))

legend = plt.legend(loc=2, shadow=True, fontsize=12)
legend.get_frame().set_facecolor('#eeeeee')

nj = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/njdf.csv')

seats = nj[['Month','Belvidere','Bridgeton','Camden','CapeMayCourtHouse','Eastampton','Elizabeth','Flemington','FreeholdTownship','HamiltonTownship','HasbrouckHeights','JerseyCity','MaysLanding','Morristown','NewBrunswick','Newark','Newton','Paterson','Somerville','TomsRiver','Trenton','Woodbury']]
#KeyError: "['FreeholdBorough' 'Hackensack' 'MountHolly' 'Salem'] not in index"
#Freehold Township, Hasbrouck Heights, Eastampton used as substitutes

seats = seats.set_index('Month')
#seats.count(axis=1)
#with 18 cities, we can slice 'seats' into 3 line graphs with 6 cities in each.
seats1 = seats.iloc[:,[0,1,2,3,4,5]]
seats2 = seats.iloc[:,[6,7,8,9,10,11]]
seats3 = seats.iloc[:,[12,13,14,15,16,17]]

#
#Ideally the 3 lists should be ordered by average median ZRI
#but for the sake of simplicity, they're left as it - alphabetically.
#

#graph settings

mpl.rc('lines', linewidth=5)
mpl.rc('font', size=12)

seats1.plot(figsize=(15,5), xlim=(0,46))
seats2.plot(figsize=(15,5), xlim=(0,46))
seats3.plot(figsize=(15,5), xlim=(0,46))

means = seats.mean()
print means

mpl.rc('lines', linewidth=0)
mpl.rc('font', size=12)

plt.ylabel('Average $ per Sq. ft', fontsize=20)
plt.title('Mean-Median ZRI(M-MZRI)', fontsize=20)
plt.tick_params(which='both', width=0, length=0)

means.plot(kind='bar', figsize=(10, 5), color='#333333', ylim=(0,2.3), grid=(False))

mpl.rc('lines', linewidth=0)
maxs = (seats.max())
plt.ylabel('Max $ per Sq. ft', fontsize=20)
plt.title('Maximum Median ZRI - New Jersey County Seats', fontsize=20)
plt.colors()
means.plot(kind='bar', figsize=(10,5), color='#333333', grid=False)


nj.head()

top = nj.max()
sort = top.order(ascending=False)
sort.head(11)

mpl.rc('lines', linewidth=5)

top = nj[['Month',
'Hoboken',
'JerseyCity',
'SeaIsleCity',
'PortReading',
'UnionCity',
'Edgewater',
'Guttenberg',
'ShortHills',
'ChathamTownship',
'SeaBright']]
top.plot(figsize=(20,10), x='Month', xlim=(0,46))

plt.ylabel('$ per Sq. ft', fontsize=20)
plt.title('Top 10 Median ZRI Prices in New Jersey', fontsize=20)
plt.legend(loc=2)

# from http://www.usa.com/rank/new-jersey-state--population-density--city-rank.htm, parse, munge, write to df


url = 'http://www.usa.com/rank/new-jersey-state--population-density--city-rank.htm'
sourceCode = urllib2.urlopen(url).read()

#defining split parameters
topSplit = '<b>City / Population</b></td></tr>'
bottomSplit = '</table><p>Please'

sourceCodeSplit = sourceCode.split(topSplit)[1].split(bottomSplit)[0]
content = sourceCodeSplit.split('\n')
content

#ranks = []
cities = []
densities = []
pops = []

for x in content:
    #rank = re.findall(r'<tr><td>(.*?)</td>', x)
    city = re.findall(r'">(.*?)</a>', x)
    density = re.findall(r'</td><td>(.*?)/sq', x)
    pop = re.findall(r'</a> / (.*?)</td>', x)
    
    #for i in rank:
    #    ranks.append(i)
        
    for i in city:
        cities.append(i)
        
    for i in density:
        densities.append(i)
        
    for i in pop:
        pops.append(i)

populations = []

for i in pops:
    i = i.replace(',', '')
    i = int(i)
    populations.append(i)
    

popdensities = []

for i in densities:
    i = i.replace(',', '')
    i = re.findall(r'(.*?)\.', i)
    i = map(int, i)
    popdensities.append(i)

p = pd.DataFrame(populations, cities)
p = p.reset_index()
p.columns = ['City', 'Population']

p.to_csv('populations.csv')

pddf = pd.DataFrame(popdensities, cities)
pddf = pddf.reset_index()
pddf.columns = ['City','Pop.Density']

pddf.to_csv('pddfs.csv')

pddfs = pd.read_csv('pddfs.csv')
pddfs = pddfs.drop('Unnamed: 0', axis=1)

City0 = []

for i in pddfs['City']:
    i = i.replace(' ', '')[:-3]
    City0.append(i)
    
Population0 = []

for i in p['City']:
    i = i.replace(' ', '')[:-3]
    Population0.append(i)

pddfs['City'] = City0
p['City'] = Population0

pddfs['Population'] = p['Population']

merging = []

for i in pddfs['City']:
    if i in nj:
        merging.append(i)
        

nj[merging].head()

meanmerge = nj[merging].mean()
meanmerge.head()        

merger = pd.DataFrame(meanmerge)
merger = merger.reset_index()
merger.columns = ['City', 'M-MZRI']

merger.head()

merged = pd.merge(merger, pddfs, on='City')
merged.head()

merged.plot(kind='scatter', x='Pop.Density', y='M-MZRI', figsize=(15,5), s=(merged['Population'])**1.5/(merged['Population'].mean()), c=merged['Pop.Density'], alpha=.75, xlim=(0,60000), ylim=(0.5,3))

merged['Population'].mean()

hi_pop_den = {'UnionCity':51810.2,
'Guttenberg':46128.3,
'WestNewYork':37379.0,
'CliffsidePark':24508.7,
'Passaic':21512.2,
'Paterson':16796.2,
'Fairview':16400.6,
'EastOrange':16377.1
}

hpd = pd.DataFrame.from_dict(hi_pop_den.items())
hpd.columns = ['City', 'Pop.Density']
hpd

top_pop_den = nj[[
'UnionCity',
'Guttenberg',
'WestNewYork',
'CliffsidePark',
'Passaic',
'Paterson',
'Fairview',
'EastOrange']]

tpd_mean = top_pop_den.mean()
tpd_mean = pd.DataFrame(tpd_mean, columns=['M-MZRI'])
tpd_mean = tpd_mean.reset_index()
tpd_mean.columns = ['City', 'M-MZRI']
tpd = pd.merge(tpd_mean,hpd,on='City')
tpd
# pop.den > 15,000/sq.mi

low_pop_den = {'EstellManor':31.5,
'Folsom':223.4,
'Woodbine':308.2,
'Hope':317.0,
'LongValley':406.4,
'HolidayHeights':424.6,
'Ringwood':434.0,
'Peapack':441.1
}

lpd = pd.DataFrame.from_dict(low_pop_den.items())
lpd.columns = ['City', 'Pop.Density']
lpd

bot_pop_den = nj[[
'EstellManor',
'Folsom',
'Woodbine',
'Hope',
'LongValley',
'HolidayHeights',
'Ringwood',
'Peapack'
]]

bpd_mean = bot_pop_den.mean()
bpd_mean = pd.DataFrame(bpd_mean, columns=['M-MZRI'])
bpd_mean = bpd_mean.reset_index()
bpd_mean.columns = ['City', 'M-MZRI']
bpd = pd.merge(bpd_mean,lpd,on='City')
bpd

#pop.den < 500/sq.mi

mpl.rc('font', size=12)
tpd.plot(kind='scatter', x='Pop.Density', y='M-MZRI', s=200, figsize=(15,5), title='Mean-Median ZRI in High-Population Density Areas')
print tpd

bpd.plot(kind='scatter', x='Pop.Density', y='M-MZRI', s=200, figsize=(15,5), title='Mean-Median ZRI in Low-Population Density Areas')
print bpd

pdd = tpd.append(bpd)
pdd

pdd.plot(kind='scatter', x='Pop.Density', y='M-MZRI', xlim=(0,60000), figsize=(15,5), s=100, title='Top and Bottom 7 Places in NJ by Population Density')
print pdd

bot_pop_den.plot(figsize=(25,10), linewidth=5, xlim=(0,46))


#nj.mean().plot(figsize=(10,5), kind='kde', xlim=(0,3.5), ylim=(0,2.1), color='#333333', linewidth=5)

top_mean = top_pop_den.mean()
bot_mean = bot_pop_den.mean()

top_mean

top_mean_df = pd.DataFrame(top_mean, columns=['MM-ZRI'])
top_mean_df = top_mean_df.reset_index()
top_mean_df.columns = ['City', 'MM-ZRI']
top_mean_df

bot_mean_df = pd.DataFrame(bot_mean, columns=['MM-ZRI'])
bot_mean_df = bot_mean_df.reset_index()
bot_mean_df.columns = ['City', 'MM-ZRI']
bot_mean_df

top_mean_df.plot(kind='barh', x='City', figsize=(10,5))
bot_mean_df.plot(kind='barh', x='City', figsize=(10,5))

njue = pd.read_csv('https://raw.githubusercontent.com/c-trl/median-rent-prices-exploration/master/njue.csv')
njue = njue.set_index('Month')
njue.describe()

njue.plot(kind='area', figsize=(15,5), ylim=(0,12), alpha=.5)
plt.title('New Jersey Unemployment Rates 2011 - 2014',fontsize=20)

plt.ylabel('Unemployment Rate', fontsize=15)

plt.show()
print njue[18:28]

#graph settings
njue.plot(figsize=(15,5), kind='area', linewidth=5, xlim=(0,46), alpha=.5)
nj.Hoboken.plot(figsize=(15,5), kind='area', linewidth=5, alpha=.5, color='#111111')
nj.JerseyCity.plot(figsize=(15,5), kind='area', linewidth=5, alpha=.5, color='#aaaaaa')

legend = plt.legend(loc=1, shadow=True, fontsize=12)
legend.get_frame().set_facecolor('#eeeeee')

plt.title('New Jersey Unemployment Rate (%) vs. Median ZRI ($/sq.ft)', fontsize=20)

plt.ylabel('$/sq.ft', fontsize=15)

plt.xlim(0,46)
plt.ylim(0,12)

plt.show()