!ls data/housingdata

!head data/housingdata/house-building-starts-tenure-2009-2010.csv

!head data/housingdata/house-building-starts-tenure-2010-2011.csv

!head data/housingdata/households-social-lettings-general-needs-rents-prp-number-bedrooms-2012-2013.csv

!head -n 5 data/housingdata/yorksAndHumberside.csv

import pandas as pd

#Read in some of the data
bldg_2009_10=pd.read_csv('data/housingdata/house-building-starts-tenure-2009-2010.csv',skiprows=5)
bldg_2010_11=pd.read_csv('data/housingdata/house-building-starts-tenure-2010-2011.csv',skiprows=5)

#Preview the data we have loaded
bldg_2009_10[:5]

#YOUR TURN
#Import the remaning house building files into separate dataframes
#As an additional exercise, you may want to try to develop a script to automate the loading of the data from the separate files

#Just use a sample of the data rows for now as we develop the code
sample1=bldg_2009_10[:3]
sample2=bldg_2010_11[:3]
sample2

import pBlocks_demo as pb

pb.pBlockGrid3(sample1)

#Try out the .concat() function - pass in a list of dataframes to be concatenated
pd.concat( [sample1, sample2] )

#The pBlockGrid is still all experimental - we should pass a list here for example...
df_concat,b=pb.pBlockGrid5(sample1, sample2)
b

bldg_2009_11=pd.concat([bldg_2009_10,bldg_2010_11])
#Check to see if the dataframes appear to have been concatenated together by inspecting row counts
print(len(bldg_2009_10),len(bldg_2010_11),len(bldg_2009_11))

#Create a sample dataframe containing the same columns as the original but in a different order
sample3=bldg_2009_10[['Reference area','All','Housing-Associations','http://opendatacommunities.org/def/ontology/geography/refArea','Local-Authority','Private-Enterprise']][:3]
sample3

#Let's look at the shape and datatype structure of sample1
pb.pBlockGrid3(sample1)

#And sample2? We see the columns are differently ordered in terms of datatype
pb.pBlockGrid3(sample3)

#Concatenate some sample dataframes with the same columns, but differently ordered
concat_difforder = pd.concat([sample1,sample3])
concat_difforder

#What is the datatype structure of the resulting concatenated datatable?
pb.pBlockGrid3(concat_difforder)

#Create a sample dataframe that contains only a subset of the columns from an original dataframe
sample4=bldg_2009_10[['Reference area','All','Housing-Associations']][:3]
sample4

#Concatenate two data frames with non-identical columns
concat_diffcolumns = pd.concat([sample1,sample4])
concat_diffcolumns 

#Here I am going to "colour" the missing cells white.
#The colour field accepts (red, green, blue) values in the range 0..255 each
pb.pBlockGrid4(concat_diffcolumns,color_NA=(255,255,255))

#Explicitly use an INNER join ("inner") on the concatenation; "outer" is the default value
concat_inner = pd.concat([sample1,sample4],join='inner')
concat_inner

#pBlockGrid TO DO
pb.pBlockGrid5(sample1,sample4,join='inner')[1]

#YOUR ATTEMPT HERE

#Now add a Period column to each annual building dataframe that contains an appropriate value

#Create a single dataframe containing all the housebuilding data distinguishable by period

bldg_2012_13 = pd.read_csv('data/housingdata/house-building-starts-tenure-2012-2013.csv',skiprows=5)
bldgSample=bldg_2012_13[:3]
bldgSample

lettings_2012_13=pd.read_csv('data/housingdata/households-social-lettings-general-needs-rents-prp-number-bedrooms-2012-2013.csv',skiprows=5)
lettingsSample=lettings_2012_13[:3]
lettingsSample

simplemerge=pd.merge(bldgSample,lettingsSample,on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'])
simplemerge

pb.pBlockGrid6(bldgSample,lettingsSample,on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'])[1]

pd.merge(bldgSample,lettingsSample,on=['http://opendatacommunities.org/def/ontology/geography/refArea'])

pb.pBlockGrid6(bldgSample,lettingsSample,on=['http://opendatacommunities.org/def/ontology/geography/refArea'])[1]

#Rename one of the merge columns in one table
lettingsSample.columns=['Ref Area Code']+lettingsSample.columns[1:].tolist()
lettingsSample

pd.merge( bldgSample, lettingsSample, 
         left_on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'],
         right_on=['Ref Area Code','Reference area'])

#pBlockGrid - TO DO - cope with left/right keys; colour them separately?

bldgSample_long=bldg_2012_13[:4] #4 rows compared to 3 in the lettings sample
bldgSample_long.columns=['Ref Area Code']+bldgSample_long.columns[1:].tolist()

pd.merge(bldgSample_long,lettingsSample,on=['Ref Area Code','Reference area'])

pb.pBlockGrid6(bldgSample_long,lettingsSample,on=['Ref Area Code','Reference area'])[1]

from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

#At the moment, we need to hack our way round dataframe column names that don't work as sqlite3 column names.
bldgSample_sqldf=bldgSample.copy()
bldgSample_sqldf.rename(columns=lambda x: x.replace(' ','_'), inplace=True)

lettingsSample_sqldf=lettingsSample.copy()
lettingsSample_sqldf.rename(columns=lambda x: x.replace(' ','_'), inplace=True)


q='''
    SELECT * FROM bldgSample_sqldf INNER JOIN lettingsSample_sqldf
        ON bldgSample_sqldf.Reference_Area = lettingsSample_sqldf.Reference_Area ;
  '''
pysqldf(q)

q='''
    SELECT * FROM bldgSample_sqldf INNER JOIN lettingsSample_sqldf
        USING(Reference_Area) ;
  '''
pysqldf(q)

lettingsSample_long=pd.concat( [ lettings_2012_13[:2], lettings_2012_13[-2:] ])
lettingsSample_long

#What happens if you try to inner join bldgSample and lettingsSample_long ?

pd.merge( bldgSample, lettingsSample_long,
         on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'],
         how='left')

pb.pBlockGrid6( bldgSample, lettingsSample_long,
         on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'],
         how='left')[1]

bldgSample_sqldf=bldgSample.copy()
bldgSample_sqldf.rename(columns=lambda x: x.replace(' ','_'), inplace=True)

lettingsSample_long_sqldf=lettingsSample_long.copy()
lettingsSample_long_sqldf.rename(columns=lambda x: x.replace(' ','_'), inplace=True)

q='''
    SELECT * FROM bldgSample_sqldf LEFT JOIN lettingsSample_long_sqldf
        USING(Reference_Area);
  '''
pysqldf(q)

pd.merge( bldgSample, lettingsSample_long,
         on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'],
         how='right')

pb.pBlockGrid6( bldgSample, lettingsSample_long,
         on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'],
         how='right')[1]

pd.merge( bldgSample, lettingsSample_long,
         on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'],
         how='outer')

pb.pBlockGrid6( bldgSample, lettingsSample_long,
         on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'],
         how='outer')[1]

bldg_sample_mixed = pd.concat([ bldg_2009_10[:2], bldg_2012_13[:2] ])
bldg_sample_mixed

#Your experiments here... inner join on bldg_sample_mixed and lettingsSample_long

##I'm not sure the pBlocks stuff handles this sensible yet? What would we want the blocks to display?

#Your experiments here... left join on bldg_sample_mixed and lettingsSample_long

#Your experiments here... right join on bldg_sample_mixed and lettingsSample_long

#Your experiments here... outer join on bldg_sample_mixed and lettingsSample_long

pd.read_csv('data/housingdata/yorksAndHumberside.csv')[15:20]