!ls data/housingdata !head data/housingdata/house-building-starts-tenure-2009-2010.csv !head data/housingdata/house-building-starts-tenure-2010-2011.csv !head data/housingdata/households-social-lettings-general-needs-rents-prp-number-bedrooms-2012-2013.csv !head -n 5 data/housingdata/yorksAndHumberside.csv import pandas as pd #Read in some of the data bldg_2009_10=pd.read_csv('data/housingdata/house-building-starts-tenure-2009-2010.csv',skiprows=5) bldg_2010_11=pd.read_csv('data/housingdata/house-building-starts-tenure-2010-2011.csv',skiprows=5) #Preview the data we have loaded bldg_2009_10[:5] #YOUR TURN #Import the remaning house building files into separate dataframes #As an additional exercise, you may want to try to develop a script to automate the loading of the data from the separate files #Just use a sample of the data rows for now as we develop the code sample1=bldg_2009_10[:3] sample2=bldg_2010_11[:3] sample2 import pBlocks_demo as pb pb.pBlockGrid3(sample1) #Try out the .concat() function - pass in a list of dataframes to be concatenated pd.concat( [sample1, sample2] ) #The pBlockGrid is still all experimental - we should pass a list here for example... df_concat,b=pb.pBlockGrid5(sample1, sample2) b bldg_2009_11=pd.concat([bldg_2009_10,bldg_2010_11]) #Check to see if the dataframes appear to have been concatenated together by inspecting row counts print(len(bldg_2009_10),len(bldg_2010_11),len(bldg_2009_11)) #Create a sample dataframe containing the same columns as the original but in a different order sample3=bldg_2009_10[['Reference area','All','Housing-Associations','http://opendatacommunities.org/def/ontology/geography/refArea','Local-Authority','Private-Enterprise']][:3] sample3 #Let's look at the shape and datatype structure of sample1 pb.pBlockGrid3(sample1) #And sample2? We see the columns are differently ordered in terms of datatype pb.pBlockGrid3(sample3) #Concatenate some sample dataframes with the same columns, but differently ordered concat_difforder = pd.concat([sample1,sample3]) concat_difforder #What is the datatype structure of the resulting concatenated datatable? pb.pBlockGrid3(concat_difforder) #Create a sample dataframe that contains only a subset of the columns from an original dataframe sample4=bldg_2009_10[['Reference area','All','Housing-Associations']][:3] sample4 #Concatenate two data frames with non-identical columns concat_diffcolumns = pd.concat([sample1,sample4]) concat_diffcolumns #Here I am going to "colour" the missing cells white. #The colour field accepts (red, green, blue) values in the range 0..255 each pb.pBlockGrid4(concat_diffcolumns,color_NA=(255,255,255)) #Explicitly use an INNER join ("inner") on the concatenation; "outer" is the default value concat_inner = pd.concat([sample1,sample4],join='inner') concat_inner #pBlockGrid TO DO pb.pBlockGrid5(sample1,sample4,join='inner')[1] #YOUR ATTEMPT HERE #Now add a Period column to each annual building dataframe that contains an appropriate value #Create a single dataframe containing all the housebuilding data distinguishable by period bldg_2012_13 = pd.read_csv('data/housingdata/house-building-starts-tenure-2012-2013.csv',skiprows=5) bldgSample=bldg_2012_13[:3] bldgSample lettings_2012_13=pd.read_csv('data/housingdata/households-social-lettings-general-needs-rents-prp-number-bedrooms-2012-2013.csv',skiprows=5) lettingsSample=lettings_2012_13[:3] lettingsSample simplemerge=pd.merge(bldgSample,lettingsSample,on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area']) simplemerge pb.pBlockGrid6(bldgSample,lettingsSample,on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'])[1] pd.merge(bldgSample,lettingsSample,on=['http://opendatacommunities.org/def/ontology/geography/refArea']) pb.pBlockGrid6(bldgSample,lettingsSample,on=['http://opendatacommunities.org/def/ontology/geography/refArea'])[1] #Rename one of the merge columns in one table lettingsSample.columns=['Ref Area Code']+lettingsSample.columns[1:].tolist() lettingsSample pd.merge( bldgSample, lettingsSample, left_on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'], right_on=['Ref Area Code','Reference area']) #pBlockGrid - TO DO - cope with left/right keys; colour them separately? bldgSample_long=bldg_2012_13[:4] #4 rows compared to 3 in the lettings sample bldgSample_long.columns=['Ref Area Code']+bldgSample_long.columns[1:].tolist() pd.merge(bldgSample_long,lettingsSample,on=['Ref Area Code','Reference area']) pb.pBlockGrid6(bldgSample_long,lettingsSample,on=['Ref Area Code','Reference area'])[1] from pandasql import sqldf pysqldf = lambda q: sqldf(q, globals()) #At the moment, we need to hack our way round dataframe column names that don't work as sqlite3 column names. bldgSample_sqldf=bldgSample.copy() bldgSample_sqldf.rename(columns=lambda x: x.replace(' ','_'), inplace=True) lettingsSample_sqldf=lettingsSample.copy() lettingsSample_sqldf.rename(columns=lambda x: x.replace(' ','_'), inplace=True) q=''' SELECT * FROM bldgSample_sqldf INNER JOIN lettingsSample_sqldf ON bldgSample_sqldf.Reference_Area = lettingsSample_sqldf.Reference_Area ; ''' pysqldf(q) q=''' SELECT * FROM bldgSample_sqldf INNER JOIN lettingsSample_sqldf USING(Reference_Area) ; ''' pysqldf(q) lettingsSample_long=pd.concat( [ lettings_2012_13[:2], lettings_2012_13[-2:] ]) lettingsSample_long #What happens if you try to inner join bldgSample and lettingsSample_long ? pd.merge( bldgSample, lettingsSample_long, on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'], how='left') pb.pBlockGrid6( bldgSample, lettingsSample_long, on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'], how='left')[1] bldgSample_sqldf=bldgSample.copy() bldgSample_sqldf.rename(columns=lambda x: x.replace(' ','_'), inplace=True) lettingsSample_long_sqldf=lettingsSample_long.copy() lettingsSample_long_sqldf.rename(columns=lambda x: x.replace(' ','_'), inplace=True) q=''' SELECT * FROM bldgSample_sqldf LEFT JOIN lettingsSample_long_sqldf USING(Reference_Area); ''' pysqldf(q) pd.merge( bldgSample, lettingsSample_long, on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'], how='right') pb.pBlockGrid6( bldgSample, lettingsSample_long, on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'], how='right')[1] pd.merge( bldgSample, lettingsSample_long, on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'], how='outer') pb.pBlockGrid6( bldgSample, lettingsSample_long, on=['http://opendatacommunities.org/def/ontology/geography/refArea','Reference area'], how='outer')[1] bldg_sample_mixed = pd.concat([ bldg_2009_10[:2], bldg_2012_13[:2] ]) bldg_sample_mixed #Your experiments here... inner join on bldg_sample_mixed and lettingsSample_long ##I'm not sure the pBlocks stuff handles this sensible yet? What would we want the blocks to display? #Your experiments here... left join on bldg_sample_mixed and lettingsSample_long #Your experiments here... right join on bldg_sample_mixed and lettingsSample_long #Your experiments here... outer join on bldg_sample_mixed and lettingsSample_long pd.read_csv('data/housingdata/yorksAndHumberside.csv')[15:20]