# Uncomment and install as necessary if packages below don't load
# install.packages("data.table")
# install.packages("ggplot2")
# install.packages("gridExtra")
# install.packages("magrittr")
# install.packages("Matrix")
# install.packages("mgcv")
library(data.table)
library(ggplot2)
library(gridExtra)
library(magrittr)
library(Matrix)
library(mgcv)
# library(bigrquery) #only used for data acquisition
# library(dplyr) #only use for data acquisition

# Data directory
DATA = "C:/Users/w_rpg/Downloads/continuous_productivity" 
# Set this wherever you unzip the associated .rds file (that's the only
# thing we need this directory for)
# # Block group for testing -- go beavs
# chintimini="410030011011"
# 
# # Authenticate BQ account
# bq_auth(scopes = "https://www.googleapis.com/auth/bigquery")
# 
# # Connect to the BigQuery northwest 2021 trips table
# con = dbConnect(
#   bigquery(),
#   project = "replica-customer",
#   dataset = "northwest"
# )
# dbListTables(con)
# trip_con = tbl(con, "northwest_2021_Q2_thursday_trip")
# 
# # Grab the start and end time for all trips originating in our zone
# se = trip_con %>%
#   filter(origin_bgrp==chintimini) %>%
#   select(start_time, end_time) %>%
#   collect() %>%
#   data.table(se) %>%
#   .[, start_time := as.numeric(start_time)] %>%
#   .[, end_time := as.numeric(end_time)] %>%
#   .[order(start_time)]
# Data
se = file.path(DATA, "continuous_productivity_data.rds") %>%
  readRDS()

# Set up a matrix to track completed trips by minute over the course of the day.
# The rows will be minutes; the columns will be trips. The "completeness" of the
# trip is judged by how much of the trip has been completed by the beginning of
# that minute (e.g. a 2 minute trip that begins at 15.5 mins and ends at 17.5
# mins is 0% complete at 15, 25% complete at 16, 75% complete at 17, and 100%
# complete from 18 on). In this way, row sums across this matrix will give us
# the number of completed trips by minute.
ntrips = nrow(se)
ijx = lapply(1:ntrips, function(r){
  # print(r)
  s = se$start_time[r]
  e = se$end_time[r]
  if(e < s){
    e = 86400 + e
  }
  smin = ceiling(s/60)
  emin = min(floor(e/60), 1440)
  i_add = smin:1440
  j_add = rep(r, length(i_add))
  if(emin > smin){
    if(emin == 1440){
      x_add = c((smin:emin*60-s)/(e - s))
    } else{
      x_add = c(c((smin:emin*60-s)/(e - s)), rep(1, length((emin+1):1440)))
    }
  } else if(emin == smin){
    x_add = rep(1, length(emin:1440))
  } else{
    x_add = rep(1, length((emin+1):1440))
  }
  return(list(i=i_add, j=j_add, x=x_add))
})
i = lapply(ijx, function(l){l$i}) %>% unlist()
j = lapply(ijx, function(l){l$j}) %>% unlist()
x = lapply(ijx, function(l){l$x}) %>% unlist()
M = sparseMatrix(i=i, j=j, x=x, dims=c(1440,ntrips))

# Create a table of completed trips, and plot
ct = data.table(x = 1:1440,
                y = rowSums(M))
plot_completed_trips = ggplot(data=ct) +
  geom_line(aes(x=x, y=y), color="red") +
  geom_hline(aes(yintercept=0)) +
  geom_vline(aes(xintercept=0)) +
  scale_x_continuous(name="Time (hours)",
                     breaks=seq(0,1440,60),
                     labels=0:24,
                     expand=c(0.025,0.025)) +
  scale_y_continuous(name="Completed trips",
                     breaks=seq(0,2500,100),
                     expand=c(0.025,0.025),
                     limits=c(0,2500)) +
  labs(title="Raw completed trips leaving the origin zone on a Thursday")

# Create a table of productivity, and plot. Productivity here is the derivative
# of the completed trips curve; it gives us the instantaneous rate of increase
# in completed trips at any given minute. Completed trips is monotonically
# increasing with time, so our productivity will always be positive, but will
# be higher at some points in the day than others
dydx_ct = data.table(x = rowMeans(embed(ct$x,2)),
                     y = diff(ct$y)/diff(ct$x))
plot_productivity = ggplot(data=dydx_ct) +
  geom_line(aes(x=x, y=y), color="red") +
  geom_hline(aes(yintercept=0)) +
  geom_vline(aes(xintercept=0)) +
  scale_x_continuous(name="Time (hours)",
                     breaks=seq(0,1440,60),
                     labels=0:24,
                     expand=c(0.025,0.025)) +
  scale_y_continuous(name="Productivity (completed trips per minute)",
                     breaks=seq(0,8.5,0.5),
                     expand=c(0.025,0.025),
                     limits=c(0,8.5)) +
  labs(title="Raw productivity over the course of the day for trips leaving the origin zone")
# Show completed trips and productivity side by side
grid.arrange(plot_completed_trips,
             plot_productivity,
             nrow=1)

smooth = gam(y ~ s(x, bs = "cs"), data=ct)
smooth_ct = data.table(x=ct$x,
                       y=predict(smooth))
smooth_dydx_ct = data.table(x = rowMeans(embed(smooth_ct$x,2)),
                            y = diff(smooth_ct$y)/diff(smooth_ct$x))
plot_smooth_completed_trips = ggplot(data=smooth_ct) +
  geom_line(aes(x=x, y=y), color="red") +
  geom_hline(aes(yintercept=0)) +
  geom_vline(aes(xintercept=0)) +
  scale_x_continuous(name="Time (hours)",
                     breaks=seq(0,1440,60),
                     labels=0:24,
                     expand=c(0.025,0.025)) +
  scale_y_continuous(name="Completed trips",
                     breaks=seq(0,2500,100),
                     expand=c(0.025,0.025),
                     limits=c(0,2500)) +
  labs(title="Smoothed completed trips leaving the origin zone on a Thursday")
plot_smooth_productivity = ggplot(data=smooth_dydx_ct) +
  geom_line(aes(x=x, y=y), color="red") +
  geom_hline(aes(yintercept=0)) +
  geom_vline(aes(xintercept=0)) +
  scale_x_continuous(name="Time (hours)",
                     breaks=seq(0,1440,60),
                     labels=0:24,
                     expand=c(0.025,0.025)) +
  scale_y_continuous(name="Productivity (completed trips per minute)",
                     breaks=seq(0,8.5,0.5),
                     expand=c(0.025,0.025),
                     limits=c(0,8.5)) +
  labs(title="Smoothed productivity over the course of the day for trips leaving the origin zone")
grid.arrange(plot_smooth_completed_trips,
             plot_smooth_productivity,
             nrow=1)