import pandas as pd !ls data ! unzip data/pay_and_display_ticket_machine_l.zip -P df=pd.read_excel("data/iw_parkingMeterData/4_5_Transaction Report RR Dec 2012 March 2013.xls") df[:10] df=pd.read_excel("data/iw_parkingMeterData/4_5_Transaction Report RR Dec 2012 March 2013.xls", \ skiprows=6) df.dropna(how='all',axis=1,inplace=True) df[:10] #Check to see how the columns are typed df.dtypes #Cast the date column as a date type, specifying how to parse the dates. #Set coerce = True to cast any strings that aren't recognised to a NaT value. df.Date=pd.to_datetime(df.Date, format="%Y-%m-%d %H:%M:%S",coerce=True) #The final row - which originally had a "Date" labelled Total was actually a total row df[-4:] #Let's see if any other dates weren;t recognised as such df[df["Date"].isnull()] #Let's also just check the total by summing the values (except the total) in the Cash column df[['Cash']][:-1].sum() #Let's just check the row count too df[['Cash']][:-1].count() #Drop the final total row df.dropna(subset=["Date"],inplace=True) #The ggplot library is currently under active development #Grab the most recent version from the github repository #!pip3 uninstall -y ggplot #!pip3 install git+https://github.com/yhat/ggplot.git from ggplot import * ggplot(df, aes(x="Date",y="Cash"))+geom_point() ggplot(df, aes(x="Date",y="Cash")) + geom_point() \ + ggtitle("Payments made over time") g = ggplot(df, aes(x="Date",y="Cash")) + geom_point() g = g + ggtitle("Payments made over time") g = g + labs("Transaction Date", "Transaction amount (£)") g df[["Tariff","Machine"]].groupby(['Tariff',"Machine"]).agg(len).sort_index() p = ggplot(aes(x='Tariff'), data=df) p + geom_bar() + ggtitle("Number of Tickets per Tariff") + labs("Tariff Code", "Count") p = ggplot(aes(x='Tariff',fill="Machine"), data=df) p + geom_bar() + ggtitle("Number of Tickets per Tariff") + labs("Tariff Code", "Count") p = ggplot(aes(x='Cash'), data=df) p + geom_histogram(binwidth=0.1) p = ggplot(aes(x='Cash'), data=df) p + geom_density() + ggtitle("Number of Tickets per Tariff") + labs("Payment (£)", "Proportion") df['Description.1'].unique() p = ggplot(aes(x='Cash'), data=df) p = p + geom_density() + ggtitle("Number of Tickets per Tariff") + labs("Payment (£)", "Proportion") p + geom_vline(xintercept=[10, 6.6, 4.5, 3.4, 3.0, 1.9, 1,0, 0.6 ],colour='blue') df.sort(['Date'],inplace=True) df['Cash_cumul'] = df.Cash.cumsum() df[:5] #As well as passing a dataframe to the ggplot function as the first argument, we can also pass it via the data= attribute g = ggplot(aes(x="Date",y="Cash_cumul"), data=df )+ geom_line() g #Add title #Add suitable axis labels, and then display the chart group=df[['Tariff','Cash']].groupby('Tariff') #For group of rows, apply the transformation to each row in the group #The number of rows in the response will be the same as the number of rows in the original data frame df['Cash_cumul2']=group.transform(cumsum)['Cash'] df[:10] ggplot(df,aes(x="Date",y="Cash_cumul2",colour="Tariff"))+geom_line() ggplot(df, aes(x="Date",y="Cash_cumul2")) + geom_line() \ + ggtitle("Payments made over time") \ + labs("Transaction Date", "Transaction amount (£)") \ + facet_wrap("Tariff") ggplot(df, aes(x="Date",y="Cash_cumul2")) + geom_line() \ + ggtitle("Payments made over time") \ + labs("Transaction Date", "Transaction amount (£)") \ + facet_wrap("Tariff",scales = "fixed") # Identifying the number of distinct machines and number of transactions recorded by each one # Accumulated total for each machine #Chart faceted by ticket machine p = ggplot(aes(x='Tariff',fill="Machine"), data=df) p = p + geom_bar() + ggtitle("Number of Tickets per Tariff") + labs("Tariff Code", "Count") p + theme_xkcd() p + theme_bw()