library(httr)
library(jsonlite)
library(tidyverse)
library(lubridate)
library(rsdmx)
library(data.table)
library(geosphere)
library(reshape2)
library(gmapsdistance)
library(sqldf)
library(tictoc)
# (read_rds() comes from readr, which is loaded with the tidyverse above)
library(scales)
library(gdata)
library(AER)
library(caret)
library(MASS)
library(arm)
library(polycor)
#########################################################
############ DATA ACQUISITION AND CLEANING ##############
#########################################################
# GET the full stations database using the API (replace the apikey header below with your own key)
stations_api<- GET("https://api.transport.nsw.gov.au/v1/roads/spatial?format=geojson&q=select%20*%20from%20road_traffic_counts_station_reference%20",
verbose(),
encode="json",
add_headers(`Authorization` = "apikey fUa8N1LC42AYtVDKIt6jbAzQXFPcf9b31GYv"))
# Extract a clean stations dataframe from the raw API output (the second element of the parsed GeoJSON holds the features)
stations_raw <- rawToChar(stations_api$content)
stations_clean <- fromJSON(stations_raw)
stations_df <- as.data.frame(stations_clean[[2]])
stations <- as.data.frame(stations_df$properties)
saveRDS(stations,file= "Stations-initial.rds")
# Compare each traffic station location to the BOM weather stations and choose the closest (some matches are poor - see the sanity check below)
BOMrain <- read_rds("/Users/RohanDanisCox/STDS/BOMNSW.RDS") # need to change to your location
BOMrainStation <- data.frame(subset(BOMrain, select = c(1,2,4,5)))
colnames(BOMrainStation) <- c("StationNum","name", "lat", "lon")
UniqueBOMStations <- sqldf('SELECT DISTINCT * FROM BOMrainStation')
saveRDS(UniqueBOMStations,file= "UniqueBOMStations.rds")
# create distance matrix
DistMatrix <- distm(stations[,c('wgs84_longitude','wgs84_latitude')], UniqueBOMStations[,c('lon','lat')], fun=distVincentyEllipsoid)/1000
# max.col(-DistMatrix) gives, for each traffic station, the index of the nearest BOM station
NearestStation <- data.frame(StationNum = UniqueBOMStations$StationNum[max.col(-DistMatrix)])
# Only take the unique BOM Stations required
UniqueNearestStation <- unique(NearestStation) # (kept for reference; not used further below)
# Combine with traffic stations information
stations$nearest_weather_station <- NearestStation$StationNum
stations <- inner_join(stations, UniqueBOMStations,c("nearest_weather_station" = "StationNum"))
saveRDS(stations,file= "StationsAndWeather.rds")
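# Optional sanity check on the matching above (a sketch, assuming a nearest
# weather station more than ~50km away is suspect); DistMatrix is already in km
nearest.dist.km <- apply(DistMatrix, 1, min)
summary(nearest.dist.km)
sum(nearest.dist.km > 50)  # traffic stations with no BOM station within 50km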
# Split traffic stations into chunks of 200 to keep each Google Maps distance API call small
station.locations <- stations[,c("station_key","wgs84_latitude","wgs84_longitude")]
station.chunks <- split(station.locations,
                        ceiling(seq_len(nrow(station.locations)) / 200))
# Create function to calculate distance by road to Sydney Tower
get.distance <- function(x, y, z) {
  results <- gmapsdistance(
    origin = paste(x, y, sep = ","),
    destination = "-33.8704512,151.2058792",  # Sydney Tower
    mode = "driving",
    traffic_model = "best_guess",
    shape = "long",
    key = "AIzaSyCwoq2tqj0RX54SrVnnRYccCs6AV1kHj34")  # replace with your own Google Maps key
  results.df <- data.frame(results)
  results.df$station_key <- z
  results.df
}
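# Quick single-station test of the function before spending API quota (a sketch;
# uncomment to run against the first row of station.locations)
# get.distance(station.locations$wgs84_latitude[1],
#              station.locations$wgs84_longitude[1],
#              station.locations$station_key[1])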
# Calculate the distance to the CBD for each chunk of stations, then combine
distance.list <- lapply(station.chunks, function(chunk) {
  get.distance(chunk$wgs84_latitude, chunk$wgs84_longitude, chunk$station_key)
})
combine <- do.call(rbind, distance.list)
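# Sanity check (a sketch): every station should have come back with exactly one distance row
stopifnot(nrow(combine) == nrow(station.locations))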
# Establish dataframe of distances and merge to stations
distances.table <- combine[,c("station_key", "Distance.Distance")]
stations <- inner_join(stations,distances.table,by="station_key")
stations <- rename(stations,"Distance_CBD" ="Distance.Distance")
# Save file and access
saveRDS(stations,file= "StationsWeatherAndDistance.rds")
stations <- read_rds("/Users/RohanDanisCox/STDS/StationsWeatherAndDistance.rds") # need to change to your location
# Loop over the NSW Transport API month by month (the full dataset is too large
# for a single call) to retrieve hourly counts for the permanent stations
years <- 2011:2016
months <- 1:12
perm_count_wide <- data.frame()
for (i in years) {
  for (j in months) {
    url <- paste("https://api.transport.nsw.gov.au/v1/roads/spatial?format=geojson&q=select%20*%20from%20road_traffic_counts_hourly_permanent%20where%20year%20%3D%20",i,"%20and%20month%20%3D%20",j,"%20",sep="")
    perm_count_api <- GET(url,
                          verbose(),
                          encode = "json",
                          add_headers(`Authorization` = "apikey fUa8N1LC42AYtVDKIt6jbAzQXFPcf9b31GYv"))
    perm_count_raw <- rawToChar(perm_count_api$content)
    perm_count_clean <- fromJSON(perm_count_raw)
    perm_count_df <- as.data.frame(perm_count_clean[[2]])
    perm_count_wide <- rbind(perm_count_wide, perm_count_df$properties)
  }
}
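# NOTE: this loop and the sample-station loop below each make 72 sequential API
# calls; if the endpoint rate-limits, a short Sys.sleep(1) at the end of the
# inner loop is a simple (untested here) mitigation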
# Save the compiled file as an RDS
saveRDS(perm_count_wide,file="perm_count_wide.rds")
# Same monthly loop for the sample (short-term count) stations
samp_count_wide <- data.frame()
for (i in years) {
  for (j in months) {
    url <- paste("https://api.transport.nsw.gov.au/v1/roads/spatial?format=geojson&q=select%20*%20from%20road_traffic_counts_hourly_sample%20where%20year%20%3D%20'",i,"'%20and%20month%20%3D%20'",j,"'",sep="")
    samp_count_api <- GET(url,
                          verbose(),
                          encode = "json",
                          add_headers(`Authorization` = "apikey fUa8N1LC42AYtVDKIt6jbAzQXFPcf9b31GYv"))
    samp_count_raw <- rawToChar(samp_count_api$content)
    samp_count_clean <- fromJSON(samp_count_raw)
    samp_count_df <- as.data.frame(samp_count_clean[[2]])
    samp_count_wide <- rbind(samp_count_wide, samp_count_df$properties)
  }
}
# combine the permanent stations with the sample stations
count_wide <- rbind(perm_count_wide,samp_count_wide)
# add a monthly day count
count_wide <- count_wide%>%
mutate(day=mday(date))
# Fix station_key to be an integer ready for merging
count_wide$station_key <- as.integer(count_wide$station_key)
# Merge the two tables using the key identifier of Station key - inner join returns where a match is found
traffic_vol <- inner_join(count_wide,stations,by = "station_key")
# Save and Retrieve
saveRDS(traffic_vol,file= "traffic_vol.rds")
traffic_vol <- read_rds("/Users/RohanDanisCox/STDS/traffic_vol.rds") # need to change to your location
# Reclassify certain factor variables to labels from NSW Transport data documentation
traffic_vol$classification_seq <- factor(traffic_vol$classification_seq,levels=c(0,1,2,3,-9),labels = c("UNCLASSIFIED","ALL VEHICLES","LIGHT VEHICLES","HEAVY VEHICLES","MISSING"))
traffic_vol$traffic_direction_seq <- factor(traffic_vol$traffic_direction_seq,levels=c(0,1,2),labels = c("COUNTER","PRESCRIBED","BOTH"))
traffic_vol$cardinal_direction_seq <- factor(traffic_vol$cardinal_direction_seq,levels=c(1,3,5,7,9,10),labels = c("NORTH","EAST","SOUTH","WEST","NORTHBOUND AND SOUTHBOUND","EASTBOUND & WESTBOUND"))
traffic_vol$day_of_week <- factor(traffic_vol$day_of_week,levels=c(1,2,3,4,5,6,7),labels = c("MONDAY","TUESDAY","WEDNESDAY","THURSDAY","FRIDAY","SATURDAY","SUNDAY"))
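# Quick check that the recoding mapped every code (a sketch; NAs here would
# indicate codes outside the documented levels)
table(traffic_vol$classification_seq, useNA = "ifany")
table(traffic_vol$cardinal_direction_seq, useNA = "ifany")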
# Read in selected variables from the ABS table ABS_REGIONAL_LGA
LGAurl <- "http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetData/ABS_REGIONAL_LGA/ERP_18+ERP_21+ERP_6+200IND+MVC_14+MVC_15+MVC_16+MVC_17+MVC_18+MVC_19+MVC_20+MVC_21+MVC_22+MVC_38+CABEE_36+CABEE_35+INCOME_17+LAND.LGA2014..A/all?startTime=2011&endTime=2016&format=compact_v2"
LGA_raw <- readSDMX(LGAurl)
LGA <- as.data.frame(LGA_raw)
# Decode names via the Data Structure Definition (DSD)
LGAdsd_url <- "http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetDataStructure/ABS_REGIONAL_LGA"
LGAdsd <- readSDMX(LGAdsd_url)
# Get codelists from DSD
LGAcls <- slot(LGAdsd, "codelists")
LGAcodelists <- sapply(slot(LGAcls, "codelists"), function(x) slot(x, "id")) #get list of codelists
LGAcodelists
LGAcodelist <- as.data.frame(slot(LGAdsd, "codelists"), codelistId = "CL_ABS_REGIONAL_LGA_REGION") #get a codelist
# Get concepts from DSD
LGAconcepts <- as.data.frame(slot(LGAdsd, "concepts"))
# Join
LGA_full <- inner_join(LGA,LGAcodelist,c("REGION" = "id"))
# Transform LGA region names into correct format to match other data
LGA_full$label.en <- gsub(" \\(.*$","", LGA_full$label.en)
# Apply 2014 LGA land area to all years
land.area.2014 <- LGA_full[LGA_full$MEASURE=="LAND",c("REGION","OBS_VALUE")]
LGA_full <- merge(x = LGA_full, y = land.area.2014, by = "REGION", all.x = TRUE)
# Make a wide table
LGA.wide <- dcast(LGA_full, REGION + label.en + TIME + OBS_VALUE.y ~ MEASURE, value.var = c("OBS_VALUE.x"))
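# Sanity check on the reshape (a sketch): each REGION/TIME pair should now be a
# single row; duplicates here would signal that the land-area merge fanned out rows
stopifnot(!anyDuplicated(LGA.wide[, c("REGION", "TIME")]))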
# Change col names
setnames(LGA.wide,c("label.en","TIME", "OBS_VALUE.y","ERP_21","ERP_18", "ERP_6", "CABEE_36", "CABEE_35", "INCOME_17"),c("lga","year","lga.area.2014","pop.density","pop.work.age.percent","pop.school.age.percent","transport.businesses", "total.businesses", "median.income"))
# Change variable classes
LGA.wide$year <- as.integer(LGA.wide$year)
LGA.wide[,c("lga.area.2014","pop.density","pop.work.age.percent","pop.school.age.percent","MVC_14","MVC_15","MVC_16","MVC_17","MVC_18","MVC_19","MVC_20","MVC_21","MVC_22","transport.businesses", "total.businesses", "median.income")] <- as.numeric(unlist(LGA.wide[,c("lga.area.2014","pop.density","pop.work.age.percent","pop.school.age.percent","MVC_14","MVC_15","MVC_16","MVC_17","MVC_18","MVC_19","MVC_20","MVC_21","MVC_22","transport.businesses", "total.businesses", "median.income")]))
# Create light vehicle and heavy vehicle stats
# Row-wise sums across the vehicle-census columns (sum() would collapse to a single grand total)
LGA.wide$vehicles.light <- rowSums(LGA.wide[,c("MVC_14","MVC_15","MVC_16","MVC_17","MVC_22")], na.rm=TRUE)
LGA.wide$vehicles.heavy <- rowSums(LGA.wide[,c("MVC_18","MVC_19","MVC_20","MVC_21")], na.rm=TRUE)
# Create densities for vehicles, businesses
LGA.wide$density.vehicles.light <- LGA.wide$vehicles.light / LGA.wide$lga.area.2014
LGA.wide$density.vehicles.heavy <- LGA.wide$vehicles.heavy / LGA.wide$lga.area.2014
LGA.wide$density.transport.businesses <- LGA.wide$transport.businesses / LGA.wide$lga.area.2014
LGA.wide$density.total.businesses <- LGA.wide$total.businesses / LGA.wide$lga.area.2014
# Subset the final columns we need
LGA.wide <- LGA.wide[,c("lga","year","lga.area.2014","pop.density","pop.work.age.percent","pop.school.age.percent","density.vehicles.light","density.vehicles.heavy","density.transport.businesses","density.total.businesses","median.income")]
saveRDS(LGA.wide,file="LGA_wide.rds")
LGA.wide <- read_rds("/Users/RohanDanisCox/STDS/LGA_wide.rds") # need to change to your location
# Left join to add ABS data for LGA/Year to each row of traffic data
traffic.with.abs <- merge(x = traffic_vol, y = LGA.wide, by = c("lga", "year"), all.x = TRUE)
traffic_with_abs <- traffic.with.abs # standardise on the underscore name used from here on
# Save and Retrieve
saveRDS(traffic.with.abs,file= "traffic_with_abs.rds")
traffic_with_abs <- read_rds("/Users/RohanDanisCox/STDS/traffic_with_abs.rds") # need to change to your location
# Select the variables of interest in the dataframe (exclude unnecessary variables)
names(traffic_with_abs)
data <- traffic_with_abs %>%
select(station_key,
full_name,
road_functional_hierarchy,
mab_way_type,
road_classification_admin,
lane_count,
cardinal_direction_seq,
classification_seq,
rms_region,
lga,
suburb,
post_code,
device_type,
permanent_station,
wgs84_latitude,
wgs84_longitude,
year,
month,
day,
day_of_week,
public_holiday,
school_holiday,
Distance_CBD,
nearest_weather_station,
lat,
lon,
pop.density,
pop.work.age.percent,
pop.school.age.percent,
density.vehicles.light,
density.vehicles.heavy,
daily_total)
# Clean the variables
data$daily_total <- as.integer(data$daily_total)
data$Distance_CBD <- as.numeric(data$Distance_CBD)
data$school_holiday[data$school_holiday == 0] <- FALSE
data$school_holiday[data$school_holiday == 1] <- TRUE
data$school_holiday <- as.factor(data$school_holiday)
data$public_holiday[data$public_holiday == 0] <- FALSE
data$public_holiday[data$public_holiday == 1] <- TRUE
data$public_holiday <- as.factor(data$public_holiday)
data$year <- as.integer(data$year)
data$permanent_station <- as.factor(data$permanent_station)
data$device_type <- as.factor(data$device_type)
data$rms_region <- as.factor(data$rms_region)
data$lane_count <- as.factor(data$lane_count)
data$road_classification_admin <- as.factor(data$road_classification_admin)
data$mab_way_type <- as.factor(data$mab_way_type)
data$road_functional_hierarchy <- as.factor(data$road_functional_hierarchy)
data$road_functional_hierarchy <- trim(data$road_functional_hierarchy)
# Fixing up duplicates
data <- data %>%
distinct(station_key,year,month,day,cardinal_direction_seq,classification_seq,.keep_all = TRUE)
# Removing impact of classification
data <- data %>%
group_by(station_key,full_name,road_functional_hierarchy,mab_way_type,road_classification_admin,
lane_count,cardinal_direction_seq,rms_region,lga,suburb,post_code,device_type,permanent_station,
wgs84_latitude,wgs84_longitude,year,month,day,day_of_week,public_holiday,school_holiday,
Distance_CBD,nearest_weather_station,lat,lon,pop.density,pop.work.age.percent,
pop.school.age.percent,density.vehicles.light,density.vehicles.heavy) %>%
summarise(daily_total=sum(daily_total))
# Store and Retrieve Data
saveRDS(data,file= "FullData.rds")
FullData <- read_rds("/Users/RohanDanisCox/STDS/FullData.rds") # fix to your computer location
# Adding in weather data
BOMNSW <- read_rds("/Users/RohanDanisCox/STDS/BOMNSW.RDS")
str(BOMNSW)
BOMNSW$Date1<-dmy(BOMNSW$Day.Month.Year.in.DD.MM.YYYY.format)
BOMNSW$Year1<-year(BOMNSW$Date1)
BOMNSW$Month1<-month(BOMNSW$Date1)
BOMNSW$Day1<-day(BOMNSW$Date1)
BOMNSW$Key1<-paste(BOMNSW$Station.Number,"_",BOMNSW$Date1)
# Subset BOM file for relevant columns
RainByDay <- data.frame(subset(BOMNSW, select = c(1,2,3,4,5,8,17,19,20,18,21)))
colnames(RainByDay) <- c("StationNum","name", "Locality","lat", "lon", "rain","Year1","Month1", "Day1", "Date1","Key1")
RainCount <- RainByDay %>%
group_by(Key1) %>%
mutate(DailyRain=sum(rain, na.rm = TRUE)) %>%
distinct(StationNum,name,Locality,lat,lon,Year1, Month1,Day1, Date1, DailyRain)
# Fix data for merge
FullData$date <- as.Date(with(FullData, paste(year, month, day,sep="-")), "%Y-%m-%d")
FullData$Key1<-paste(FullData$nearest_weather_station,"_",FullData$date)
FullDataRain<-merge(FullData,RainCount, by="Key1")
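# merge() defaults to an inner join, so traffic days with no matching rain record
# are dropped silently; a quick count of what was lost (a sketch):
nrow(FullData) - nrow(FullDataRain)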
# Save
saveRDS(FullDataRain,file= "FullDataRain.rds")
# Drop helper columns and reorder the remainder (the hourly columns were already excluded in the earlier select)
FinalData <- FullDataRain %>%
  select(-c(Key1, StationNum:Date1)) %>%
  select(c(station_key:wgs84_longitude, Distance_CBD:lon.x, pop.density:density.vehicles.heavy,
           year:day, date, day_of_week:school_holiday, DailyRain, daily_total))
# ABS figures for 2016 were missing from the data, so carry the 2015 values forward as 2016
LGA2015 <- LGA.wide %>%
filter(year<2016)
LGA2016 <- LGA.wide %>%
filter(year==2015)
LGA2016 <- LGA2016 %>%
mutate(year2=(year+1)) %>%
select(lga,year2,pop.density,pop.work.age.percent,pop.school.age.percent,density.vehicles.light,density.vehicles.heavy)
LGA2015 <- LGA2015 %>%
select(lga,year,pop.density,pop.work.age.percent,pop.school.age.percent,density.vehicles.light,density.vehicles.heavy)
LGA2016 <- rename(LGA2016, year=year2)
LGAfix <- rbind(LGA2015,LGA2016)
Check <- FinalData %>%
select(-c(pop.density,pop.work.age.percent,pop.school.age.percent,density.vehicles.light,density.vehicles.heavy))
Check2 <- merge(x = Check, y = LGAfix, by = c("lga", "year"), all.x = TRUE)
FinalData<-Check2
# Drop rows with missing values (including NULL rms_region) and refresh the factor levels
FinalData <- FinalData %>%
drop_na()
FinalData$rms_region <- factor(FinalData$rms_region)
# Save and Retrieve Data
saveRDS(FinalData,file= "FinalData.rds")
FinalData <- read_rds("/Users/RohanDanisCox/STDS/FinalData.rds") # fix to your computer location
#########################################################
################ MODELLING ####################
#########################################################
# Subset to data following April 2015
DataApril2015 <- FinalData %>%
filter(date>"2015-03-31")
# Save and Retrieve
saveRDS(DataApril2015,file= "DataApril2015.rds")
DataApril2015 <- read_rds("/Users/RohanDanisCox/STDS/DataApril2015.rds") # fix to your computer location
# Split into Holdout and Working sets
set.seed(42)
train = createDataPartition(y = DataApril2015$daily_total, p = 0.80, list = F)
Working = DataApril2015[train,]
Holdout = DataApril2015[-train,]
# Split Working set into training and test set
set.seed(42)
train1 = createDataPartition(y = Working$daily_total, p = 0.70, list = F)
trainset = Working[train1,]
testset = Working[-train1,]
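# Confirm the three-way split: roughly 56% train, 24% test and 20% holdout of DataApril2015
nrow(trainset); nrow(testset); nrow(Holdout)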
# Full model
QP_Full <- glm(daily_total~ Distance_CBD + road_functional_hierarchy + day_of_week + public_holiday + DailyRain + school_holiday+
pop.density + density.vehicles.light + pop.work.age.percent + year + month + day + date + rms_region,
data=trainset,family=quasipoisson(link=log))
NB_Full <-glm.nb(daily_total~ Distance_CBD + road_functional_hierarchy + day_of_week + public_holiday + DailyRain + school_holiday+
pop.density + density.vehicles.light + pop.work.age.percent + year + month + day + date + rms_region,
data=trainset)
# Consider summary for each model
summary(QP_Full)
summary(NB_Full)  # plain summary() dispatches correctly for the negative binomial fit
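# The quasi-Poisson and negative binomial choices are motivated by overdispersion;
# AER::dispersiontest() makes that check formal but needs a plain Poisson fit
# (a sketch, reusing the full-model formula):
P_Full <- glm(daily_total ~ Distance_CBD + road_functional_hierarchy + day_of_week +
                public_holiday + DailyRain + school_holiday + pop.density +
                density.vehicles.light + pop.work.age.percent + year + month +
                day + date + rms_region,
              data = trainset, family = poisson(link = log))
dispersiontest(P_Full, trafo = 1)  # a significantly positive alpha indicates overdispersion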
# Model Selection -> via forward selection: at each step, fit a quasi-Poisson
# model adding each remaining candidate variable to the current formula, score
# it by mean absolute error on the test set, and keep the best performer
evaluate.candidates <- function(base.formula, candidates) {
  out <- data.frame()
  for (i in candidates) {
    model <- glm(as.formula(paste(base.formula, "+", i)),
                 data = trainset, family = quasipoisson(link = log))
    prediction <- exp(predict(model, newdata = testset))
    check <- cbind(testset, prediction)
    SummaryTest <- check %>%
      mutate(error = abs(daily_total - prediction)) %>%
      summarise(average = mean(error, na.rm = TRUE))
    out <- rbind(out, cbind(i, SummaryTest))
  }
  out
}
variables <- c("DailyRain","pop.density","day_of_week","road_functional_hierarchy",
               "density.vehicles.light","pop.work.age.percent","year","month","day",
               "Distance_CBD","public_holiday","school_holiday","date","rms_region")
Variable1 <- evaluate.candidates("daily_total ~ 1", variables)
# Variable 1 = Distance_CBD
selected <- "Distance_CBD"
Variable2 <- evaluate.candidates(paste("daily_total ~", paste(selected, collapse = " + ")),
                                 setdiff(variables, selected))
# Variable 2 = road_functional_hierarchy
selected <- c(selected, "road_functional_hierarchy")
Variable3 <- evaluate.candidates(paste("daily_total ~", paste(selected, collapse = " + ")),
                                 setdiff(variables, selected))
# Variable 3 = rms_region
selected <- c(selected, "rms_region")
Variable4 <- evaluate.candidates(paste("daily_total ~", paste(selected, collapse = " + ")),
                                 setdiff(variables, selected))
# Variable 4 = day_of_week
selected <- c(selected, "day_of_week")
Variable5 <- evaluate.candidates(paste("daily_total ~", paste(selected, collapse = " + ")),
                                 setdiff(variables, selected))
# Variable 5 = public_holiday
selected <- c(selected, "public_holiday")
Variable6 <- evaluate.candidates(paste("daily_total ~", paste(selected, collapse = " + ")),
                                 setdiff(variables, selected))
# Variable 6 = DailyRain
selected <- c(selected, "DailyRain")
Variable7 <- evaluate.candidates(paste("daily_total ~", paste(selected, collapse = " + ")),
                                 setdiff(variables, selected))
# Variable 7 = school_holiday. STOP: adding an eighth variable gave no further
# benefit. The same selection process was also run for a negative binomial model.
# Final Model
QP_Final <- glm(daily_total~ Distance_CBD + road_functional_hierarchy +rms_region + day_of_week + public_holiday + DailyRain + school_holiday,
data=trainset,family=quasipoisson(link=log))
NB_Final <- glm.nb(daily_total~ Distance_CBD + road_functional_hierarchy + rms_region + day_of_week + public_holiday + DailyRain + school_holiday,
data=trainset)
# Summary of Final Models
summary(QP_Final)
summary(NB_Final)
# Considering performance on the testset
QP_prediction <- exp(predict.glm(QP_Final,newdata = testset))
NB_prediction <- exp(predict.glm(NB_Final,newdata = testset))
testset <- cbind(testset,QP_prediction,NB_prediction)
SummaryTest <- testset %>%
mutate(QP_error=abs(daily_total-QP_prediction),NB_error=abs(daily_total-NB_prediction)) %>%
summarise(QP_average=mean(QP_error,na.rm=TRUE),NB_average=mean(NB_error,na.rm=TRUE))
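# Optional complement to the mean absolute error above: RMSE on the same
# predictions (a sketch; RMSE penalises large misses more heavily)
testset %>%
  summarise(QP_rmse = sqrt(mean((daily_total - QP_prediction)^2, na.rm = TRUE)),
            NB_rmse = sqrt(mean((daily_total - NB_prediction)^2, na.rm = TRUE)))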
# Considering performance on the holdout set
prediction <- exp(predict.glm(QP_Final,newdata = Holdout))
Holdout <- cbind(Holdout,prediction)
SummaryHoldout <- Holdout %>%
mutate(Error=abs(daily_total-prediction)) %>%
summarise(AverageError=mean(Error,na.rm=TRUE))
# Consider performance on holdout set by RMS region
RMS_Region_Summary <- Holdout %>%
mutate(Error=abs(daily_total-prediction)) %>%
group_by(rms_region) %>%
summarise(Average_Error = mean(Error))
write.csv(RMS_Region_Summary,"RMSerror.csv")
# Consider performance on holdout set by Road Functional Hierarchy
RoadHierarchy <- Holdout %>%
mutate(Error=abs(daily_total-prediction)) %>%
group_by(road_functional_hierarchy) %>%
summarise(Average_Error = mean(Error))
write.csv(RoadHierarchy,"RoadHierarchy.csv")
# Consider performance on holdout set by Day of week
DayOfWeek <- Holdout %>%
mutate(Error=abs(daily_total-prediction)) %>%
group_by(day_of_week) %>%
summarise(Average_Error = mean(Error))
write.csv(DayOfWeek,"DayOfWeek.csv")
# Consider performance on holdout set by distance-to-CBD bands
summary(Holdout$Distance_CBD)
Distance <- Holdout %>%
mutate(DtoCBD= cut(Holdout$Distance_CBD,c(0,5000,20000,40000,100000,250000, Inf),labels=c("Within 5km","5km to 20km","20km to 40km","40km to 100km","100km to 250km", "Greater than 250km"))) %>%
mutate(Error=abs(daily_total-prediction)) %>%
group_by(DtoCBD) %>%
summarise(Average_Error = mean(Error))
write.csv(Distance,"DtoCBD.csv")
# Consider performance on holdout set by rainfall groups
Rain <- Holdout %>%
mutate(rainclassification = cut(Holdout$DailyRain,c(-1,5,10,20,50, Inf),labels=c("No Rain","Sprinkle","Rain","Heavy Rain", "Torrential Rain"))) %>%
mutate(Error=abs(daily_total-prediction)) %>%
group_by(rainclassification) %>%
summarise(Average_Error = mean(Error))
write.csv(Rain,"Rain.csv")
# Try Scenarios
str(Holdout)
summary(Holdout)
Scenarios <- data.frame(Distance_CBD=c(10000,50000,25000,200000,100000),
road_functional_hierarchy=c("Primary Road","Motorway","Arterial Road","Local Road","Primary Road"),
rms_region=c("Sydney","Sydney","Sydney","Western","Hunter"),
day_of_week=c("MONDAY","FRIDAY","SUNDAY","TUESDAY","SATURDAY"),
public_holiday=c("FALSE","TRUE","FALSE","FALSE","TRUE"),
DailyRain=c(10,0,30,0,0),
school_holiday=c("FALSE","FALSE","TRUE","FALSE","TRUE"))
ScenarioPrediction <- exp(predict.glm(QP_Final,Scenarios))
ScenarioOutput <- cbind(Scenarios,ScenarioPrediction)
write.csv(ScenarioOutput,"scenarios.csv")
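# Optional: rough 95% intervals for the scenario predictions from standard errors
# on the link scale (a sketch, assuming the quasi-Poisson SEs are adequate here)
p <- predict.glm(QP_Final, Scenarios, se.fit = TRUE)
ScenarioIntervals <- cbind(Scenarios,
                           fit = exp(p$fit),
                           lower = exp(p$fit - 1.96 * p$se.fit),
                           upper = exp(p$fit + 1.96 * p$se.fit))
ScenarioIntervals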
#########################################################
################ VISUALISATIONS ###################
#########################################################
#FIGURE 3 - boxplot for final report showing changed nature of Primary Road data collected from 2015
primary_data <- FinalData[FinalData$road_functional_hierarchy=="Primary Road",] #only Primary Roads
primary_data$fakedate <- paste(as.character(primary_data$year), as.character(primary_data$month),"01", sep="-") #create a date value for each Month for charting
primary_data$fakedate <- as.Date(primary_data$fakedate)
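# NOTE: the Raleway font used in the figure themes below must be installed (and
# registered with R, e.g. via the extrafont package); drop the family argument
# from theme() if it is not available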
ggplot(primary_data) +
geom_boxplot(aes(y=daily_total,x=fakedate, group=fakedate)) +
theme(text = element_text(size = 16, family="Raleway")) +
scale_y_continuous(label=comma) +
labs(y="Daily traffic volume",
x="Time",
title = "Primary Road daily traffic distribution, by month",
subtitle = "There is a clear discontinuity in April 2015"
) +
scale_x_date(name = 'Time',
date_breaks = '1 year',
date_labels = '%Y'
)
#FIGURE 4 - boxplot to show the variability of daily counts in different regions and road types
sydney_hunter_data <- FinalData[FinalData$rms_region %in% c("Sydney","Hunter"),]
sydney_hunter_data$rms_region <- ordered(sydney_hunter_data$rms_region, levels = c("Sydney", "Hunter", "Northern", "Southern", "South West", "Western"))
sydney_hunter_data$road_functional_hierarchy <- ordered(sydney_hunter_data$road_functional_hierarchy, levels = c("Motorway", "Primary Road", "Arterial Road", "Sub-Arterial Road", "Distributor Road", "Local Road"))
ggplot(sydney_hunter_data) +
geom_boxplot(aes(y=daily_total,x=road_functional_hierarchy)) +
facet_wrap(~rms_region) +
theme(text = element_text(size = 16, family="Raleway")) +
labs(y="Daily traffic volume",
x="Road function",
title = "Different RMS Regions and Road Functions\nexhibit different levels of traffic variability",
subtitle = "In particular, Hunter motorways are more likely to have quiet periods"
) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Data for TABLE 3 - correlation coefficients (everything is coerced to numeric,
# so hetcor() effectively returns Pearson correlations here). rainclassification
# is omitted: it was derived on Holdout above and is not a column of FinalData.
pairs_data <- FinalData[,c("daily_total", "DailyRain", "rms_region", "road_functional_hierarchy", "month", "day_of_week", "Distance_CBD", "pop.density", "pop.work.age.percent", "pop.school.age.percent", "density.vehicles.light", "density.vehicles.heavy","public_holiday", "school_holiday")]
cor_data <- pairs_data
cor_data[,] <- as.numeric(unlist(pairs_data))
cor_results <- hetcor(cor_data)
cor_results
#FIGURE 5 - rain vs traffic
ggplot(FinalData) +
geom_point(aes(y=daily_total,x=DailyRain)) +
theme(text = element_text(size = 16, family="Raleway")) +
labs(y="Daily traffic volume",
x="Daily rainfall (mm)",
title = "Traffic vs Rainfall - potential correlation",
subtitle = "However, vertical bands invite a closer investigation"
) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
#FIGURE 6 - rain vs traffic v2 - colour for days of week
ggplot(FinalData) +
geom_point(aes(y=daily_total,x=DailyRain, colour=factor(day_of_week))) +
theme(text = element_text(size = 16, family="Raleway")) +
labs(y="Daily traffic volume",
x="Daily rainfall (mm)",
colour="",
title = "Traffic vs Rainfall, by Day of Week",
subtitle = "Makes clear that the plot is skewed by observations\nfrom just a few days of extreme rain"
) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
#FIGURE 7 - LGA scatterplots - WARNING - takes 30 mins to run pairs plots on this data set
pairs_data_LGA <- FinalData[,c("daily_total", "rms_region", "Distance_CBD", "pop.density", "pop.work.age.percent", "pop.school.age.percent", "density.vehicles.light", "density.vehicles.heavy")]
pairs(pairs_data_LGA)
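# A random subsample renders in seconds and preserves the broad shape (a sketch):
# pairs(pairs_data_LGA[sample(nrow(pairs_data_LGA), 10000), ])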
# Visualising predicted vs actuals
ggplot(Holdout,aes(x=daily_total,y=prediction,colour=road_functional_hierarchy))+
geom_point(size=0.5,alpha=0.4) +
geom_abline(intercept=0,slope=1,linetype="dotted") +
guides(colour = guide_legend(override.aes = list(alpha=1,size=5))) +
labs(y="QuasiPoisson Predicted Daily Count",x="Actual Daily Count") +
theme(axis.title = element_text(size=20)) +
theme(legend.title = element_text(size=20)) +
theme(legend.text = element_text(size=16))+
theme(text=element_text(size=16))
# Visualising the distribution by Day of Week
ggplot(Holdout) +
geom_jitter(aes(x=day_of_week,y=prediction),colour="red",size=0.2,alpha=0.1) +
geom_jitter(aes(x=day_of_week,y=daily_total),colour="blue",size=0.2,alpha=0.1) +
labs(y="Daily Count (Predicted = Red, Actuals = Blue)",x="Day of Week") +
theme(axis.title = element_text(size=20)) +
theme(legend.title = element_text(size=20)) +
theme(legend.text = element_text(size=16))+
theme(text=element_text(size=16))
# Visualising for rainfall
RainyDays <- Holdout %>%
  filter(DailyRain > 5)  # rainy days only; a fresh name avoids clobbering the Rain summary table above
ggplot(RainyDays, aes(x = daily_total, y = prediction, colour = DailyRain)) +
geom_point(size=3,alpha=0.3) +
scale_color_gradient2(low="white",mid="blue",high ="darkblue",midpoint=40)+
guides(colour = guide_legend(override.aes = list(alpha=1,size=5))) +
labs(y="QuasiPoisson Predicted Daily Count",x="Actual Daily Count") +
theme(axis.title = element_text(size=20)) +
theme(legend.title = element_text(size=20)) +
theme(legend.text = element_text(size=16))+
theme(text=element_text(size=16))