import gcp
import gcp.bigquery as bq

class Column(object):
    def __init__(self, name):
        self._name = name

    def _repr_sql_(self):
        return self._name

ga_view_id = [your View id here]
cloud_project_number = [your Cloud Project number here]

visitor_id = Column('fullVisitorId')
conversion = Column('totals.transactions')
conversion_revenue = Column('totals.transactionRevenue')
start_date = '2014-01-01'
end_date = '2014-06-01'
cohort_end_date = '2014-01-15'
end_of_cal_period = '2014-03-15'
sampling_rate = Column('1/50')


%%bigquery sql --name event_log_query
SELECT * FROM (
  SELECT date, $visitor_id, $conversion_revenue, $conversion
  FROM TABLE_DATE_RANGE([$cloud_project_number:$ga_view_id.ga_sessions_],
                        TIMESTAMP($start_date), TIMESTAMP($end_date))
  WHERE $conversion IS NOT NULL
) a
JOIN (
  SELECT $visitor_id
  FROM TABLE_DATE_RANGE([$cloud_project_number:$ga_view_id.ga_sessions_],
                        TIMESTAMP($start_date), TIMESTAMP($cohort_end_date))
  WHERE totals.newVisits IS NOT NULL
    AND RAND() < $sampling_rate
  GROUP BY 1
) b
ON a.$visitor_id = b.$visitor_id
ORDER BY date


print event_log_query.sql                    # View the query
event_log_data = event_log_query.results()   # Run the query
event_log_data.to_dataframe()                # Save the results to a dataframe and view them


%load_ext rpy2.ipython


%%R
library(BTYD)


import pandas
df = pandas.DataFrame(event_log_data)
%Rpush df


%%R -i start_date,end_date,cohort_end_date,end_of_cal_period
# build the event log: one row per customer transaction
elog <- data.frame(cust = df$a_fullVisitorId,
                   date = df$a_date,
                   sales = df$a_totals_transactionRevenue)
elog$sales <- elog$sales / 10^6   # revenue is exported in micros

# format dates
elog$date <- as.Date(elog$date, "%Y%m%d")

# set key dates
end.of.cal.period <- as.Date(end_of_cal_period)
T.end <- max(elog$date)
T.star <- as.numeric(T.end - end.of.cal.period)
T.tot <- as.numeric(max(elog$date) - min(elog$date))

dataset <- dc.ElogToCbsCbt(elog, per = "day", T.cal = end.of.cal.period)

# estimate model
cal.cbs <- dataset[["cal"]][["cbs"]]
params <- pnbd.EstimateParameters(cal.cbs)
print(params)


%%R
# mean absolute percentage error
mape <- function(actual, forecasted) {
  n <- length(actual)
  (1 / n) * sum(abs((actual - forecasted) / actual))
}


%%R
# calculate incremental and cumulative transaction vectors
T.cal <- cal.cbs[, 3]
actual.inc.tracking.data <- cbind(dataset[["cal"]][["cbt"]], dataset[["holdout"]][["cbt"]])
actual.inc.tracking.data <- apply(actual.inc.tracking.data, 2, sum)
actual.cum.tracking.data <- cumsum(actual.inc.tracking.data)

expected.cum.trans <- pnbd.ExpectedCumulativeTransactions(params, T.cal, T.tot,
                                                          length(actual.cum.tracking.data))

# find end of calibration date
end.of.cal.day <- as.numeric(end.of.cal.period - min(elog$date))

total.mape <- mape(actual.cum.tracking.data, expected.cum.trans)
in.sample.mape <- mape(actual.cum.tracking.data[1:end.of.cal.day],
                       expected.cum.trans[1:end.of.cal.day])
out.of.sample.mape <- mape(actual.cum.tracking.data[end.of.cal.day:T.tot],
                           expected.cum.trans[end.of.cal.day:T.tot])

mape.outputs <- data.frame("MAPE" = c(in.sample.mape, out.of.sample.mape, total.mape))
row.names(mape.outputs) <- c("In sample", "Out of sample", "Total")
print(mape.outputs)


%R print(round(pnbd.Expectation(params, t=365), digits=1))


%%R
# histogram of calibration-period transaction frequency
pnbd.PlotFrequencyInCalibration(params, cal.cbs, 7)


%%R
# cumulative tracking plot
cum.tracking <- pnbd.PlotTrackingCum(params, T.cal, T.tot, actual.cum.tracking.data,
                                     xlab = "Day")


%%R
# incremental tracking plot
inc.tracking <- pnbd.PlotTrackingInc(params, T.cal, T.tot, actual.inc.tracking.data,
                                     xlab = "Day", title = "Tracking Daily Transactions")


%%R
x.star <- dataset$holdout$cbs[, 1]
censor <- 7
cond.expectation <- pnbd.PlotFreqVsConditionalExpectedFrequency(params, T.star, cal.cbs,
                                                                x.star, censor)
cond.expectation
%%R
# P(alive) and conditional expected transactions by recency and frequency
heatmap.palive.data <- matrix(NA, nrow = 10, ncol = 200)
heatmap.cet.data <- matrix(NA, nrow = 10, ncol = 200)

for (i in 1:10) {
  heatmap.cet.data[i, ] <- pnbd.ConditionalExpectedTransactions(params, T.star = 365,
                                                                x = i, t.x = 1:200, T.cal = 200)
  heatmap.palive.data[i, ] <- pnbd.PAlive(params, x = i, t.x = 1:200, T.cal = 200)
}

image(heatmap.palive.data, axes = FALSE, xlab = "Number of Transactions",
      ylab = "Days since last transaction",
      main = "P(Alive) by Recency and Frequency")
axis(1, at = seq(0, 1, .1), labels = 0:10)
axis(2, at = seq(0, 1, .1), labels = seq(200, 0, -20))

image(heatmap.cet.data, axes = FALSE, xlab = "Number of Transactions",
      ylab = "Days since last transaction",
      main = "Conditional Expected Transactions by Recency and Frequency")
axis(1, at = seq(0, 1, .1), labels = 0:10)
axis(2, at = seq(0, 1, .1), labels = seq(200, 0, -20))


%%R
# build a spend (average transaction value) model
#cal.cbt <- dc.CreateSpendCBT(elog, is.avg.spend = FALSE)
cal.cbt <- dc.ElogToCbsCbt(elog, per = "day", T.cal = end.of.cal.period,
                           cohort.birth.per = as.Date(cohort_end_date),
                           statistic = "total.spend")
cal.cbs <- cal.cbt$cal$cbs
cal.cbt <- cal.cbt$cal$cbt

calculateAvgSpend <- function(cbt.row) {
  purchaseCols <- which(cbt.row != 0)
  sum(cbt.row[purchaseCols]) / length(purchaseCols)
}

m.x <- apply(cal.cbt, 1, calculateAvgSpend)
m.x[which(is.na(m.x))] <- 0

spendParams <- spend.EstimateParameters(m.x, cal.cbs[, 1])
print(spendParams)


%%R
expected.spend <- spend.plot.average.transaction.value(spendParams, m.x, cal.cbs[, 1],
    title = "Actual vs. Expected Avg Transaction Value Across Customers")


%%R
mape(m.x[which(m.x > 0)], expected.spend)


%%R
# re-estimate the model over the full date range
end.of.cal.period <- as.Date(end_date)
dataset <- dc.ElogToCbsCbt(elog, per = "day", T.cal = end.of.cal.period)

# estimate model
cal.cbs <- dataset[["cal"]][["cbs"]]
params <- pnbd.EstimateParameters(cal.cbs)
print(params)


%%R
x <- cal.cbs[, "x"]
t.x <- cal.cbs[, "t.x"]
T.cal <- cal.cbs[, "T.cal"]
d <- .15   # annual discount rate, divided by 365 because the model runs on daily data

discounted_expected_transactions <- pnbd.DERT(params, x, t.x, T.cal, d / 365)
p_Alive <- pnbd.PAlive(params, x, t.x, T.cal)

output <- data.frame(visitorId = dataset$cust.data$cust,
                     discounted_expected_transactions = discounted_expected_transactions,
                     p_Alive = p_Alive)
head(output)


%%R
expected_spend <- spend.expected.value(spendParams, m.x, cal.cbs[, 1])
output$expected_spend <- expected_spend
output$projected_future_revenue <- output$expected_spend * output$discounted_expected_transactions
output[1:20, ]


%Rpull output
output[:5]


output.fillna(value=0, inplace=True)
schema = bq.schema(output)
schema


bq.dataset('outputs').create()
table = bq.table('outputs.jan_june_2014').create(schema, truncate=True)
table.insertAll(output)
import time
time.sleep(2)
table.sample()


%%bigquery sql --name det_by_campaign
SELECT trafficSource.campaign, ROUND(AVG(discounted_expected_transactions), 2)
FROM (
  SELECT date, $visitor_id, trafficSource.campaign
  FROM TABLE_DATE_RANGE([$cloud_project_number:$ga_view_id.ga_sessions_],
                        TIMESTAMP($start_date), TIMESTAMP($cohort_end_date))
  WHERE totals.newVisits IS NOT NULL
) a
JOIN (
  SELECT visitorId, discounted_expected_transactions
  FROM [outputs.jan_june_2014]
) b
ON a.$visitor_id = b.visitorId
GROUP BY trafficSource.campaign
ORDER BY 2 DESC


print det_by_campaign.sql
det_by_campaign.results()


%%bigquery sql
SELECT *
FROM [outputs.jan_june_2014]
ORDER BY projected_future_revenue DESC
LIMIT 100
%%bigquery sql
SELECT top_products, COUNT(top_products) AS quantity
FROM (
  SELECT $visitor_id, hits.item.productName AS top_products
  FROM TABLE_DATE_RANGE([$cloud_project_number:$ga_view_id.ga_sessions_],
                        TIMESTAMP($start_date), TIMESTAMP($cohort_end_date))
) a
JOIN (
  SELECT visitorId
  FROM [outputs.jan_june_2014]
  ORDER BY projected_future_revenue DESC
  LIMIT 200
) b
ON a.$visitor_id = b.visitorId
GROUP BY top_products
ORDER BY quantity DESC;