Traffic Volume Prediction

A list of traffic counter stations is obtained using the NSW Transport OpenData API.

stations_api<- GET("https://api.transport.nsw.gov.au/v1/roads/spatial?format=geojson&q=select%20*%20from%20road_traffic_counts_station_reference%20limit%2050",
                   verbose(), 
                   encode="json", 
                   add_headers(`Authorization` = "apikey fUa8N1LC42AYtVDKIt6jbAzQXFPcf9b31GYv"))

# Extract a clean stations dataframe from the raw API output
stations_raw <- rawToChar(stations_api$content)
stations_clean <- fromJSON(stations_raw)
stations_df <- as.data.frame(stations_clean[[2]])
stations <- as.data.frame(stations_df$properties)
datatable(
  head(stations), rownames=FALSE,
  options = list(
  dom = 't',
  scrollX = TRUE
))

***

Each traffic counter station is then matched to the closest weather station and this information is saved against each traffic counter station.

BOMrain <- read_rds("/Users/RohanDanisCox/STDS/BOM_NSW_rain.RDS") 
BOMrainStation <- data.frame(subset(BOMrain, select = c(1,2, 5,6)))
colnames(BOMrainStation) <- c("StationNum","name", "lat", "lon")
UniqueBOMStations <- sqldf('SELECT DISTINCT * FROM BOMrainStation') 

# create distance matrix
DistMatrix <- distm(stations[,c('wgs84_longitude','wgs84_latitude')], UniqueBOMStations[,c('lon','lat')], fun=distVincentyEllipsoid)/1000
NearestStation <- data.frame(UniqueBOMStations$StationNum[max.col(-DistMatrix)])

# Only take the unique BOM Stations required
UniqueNearestStation<-unique(NearestStation)

stations$nearest_weather_station <- NearestStation$UniqueBOMStations.StationNum.max.col..DistMatrix..
stationsW <- inner_join(stations, UniqueBOMStations,c("nearest_weather_station" = "StationNum"))
datatable(
  head(stationsW), rownames=FALSE,
  options = list(
  dom = 't',
  scrollX = TRUE
))

***

The NSW Transport OpenData API is then accessed again to obtain the number of vehicles which passed each station hourly.

count_api<- GET("https://api.transport.nsw.gov.au/v1/roads/spatial?format=geojson&q=select%20*%20from%20road_traffic_counts_hourly_permanent%20where%20year%3D2012%20limit%2050",
                verbose(), 
                encode="json", 
                add_headers(`Authorization` = "apikey fUa8N1LC42AYtVDKIt6jbAzQXFPcf9b31GYv"))
count_raw <- rawToChar(count_api$content)
count_clean <- fromJSON(count_raw)
count_df <- as.data.frame(count_clean[[2]])
count_wide <- as.data.frame(count_df$properties)

# add the a monthly day count
count_wide <- count_wide%>%
  mutate(day=mday(date))

# gather the hourly counts into a single column by hour
count <-gather(count_wide,key=hour,value=count,hour_00:hour_23)
count$hour <- gsub("hour_","", count$hour)
count$hour <- as.numeric(count$hour)
datatable(
  head(count), rownames=FALSE,
  options = list(
  dom = 't',
  scrollX = TRUE
))

***

These two tables are then merged and cleaned.

# Merge the two tables using the key identifier of Station key - inner join returns where a match is found
traffic_vol <- inner_join(stationsW,count,by = "station_key") 

# select the variables of interest in the dataframe (exclude unnecessary variables)
names(traffic_vol)

##  [1] "cartodb_id.x"                   "record_id.x"                   
##  [3] "station_key"                    "station_id"                    
##  [5] "name.x"                         "road_name"                     
##  [7] "full_name"                      "common_road_name"              
##  [9] "secondary_name"                 "road_name_base"                
## [11] "road_name_type"                 "intersection"                  
## [13] "distance_to_intersection"       "road_number"                   
## [15] "link_number"                    "mab_way_type"                  
## [17] "mab_way_number"                 "mab_identifier"                
## [19] "road_functional_hierarchy"      "road_on_type"                  
## [21] "lane_count"                     "road_classification_type"      
## [23] "road_classification_admin"      "rms_region"                    
## [25] "lga"                            "suburb"                        
## [27] "post_code"                      "device_type"                   
## [29] "heavy_vehicle_checking_station" "permanent_station"             
## [31] "vehicle_classifier"             "lambert_easting"               
## [33] "lambert_northing"               "wgs84_latitude"                
## [35] "wgs84_longitude"                "direction_seq"                 
## [37] "quality_rating"                 "publish"                       
## [39] "nearest_weather_station"        "name.y"                        
## [41] "lat"                            "lon"                           
## [43] "cartodb_id.y"                   "record_id.y"                   
## [45] "traffic_direction_seq"          "cardinal_direction_seq"        
## [47] "classification_seq"             "date"                          
## [49] "year"                           "month"                         
## [51] "day_of_week"                    "public_holiday"                
## [53] "school_holiday"                 "daily_total"                   
## [55] "day"                            "hour"                          
## [57] "count"

traffic_vol_small <- traffic_vol %>%
  select(station_key,
         full_name,
         road_functional_hierarchy,
         road_classification_type,
         lane_count,
         classification_seq,
         rms_region,
         lga,
         suburb,
         post_code,
         device_type,
         wgs84_latitude,
         wgs84_longitude,
         traffic_direction_seq,
         cardinal_direction_seq,
         year,
         month,
         day,
         day_of_week,
         public_holiday,
         school_holiday,
         nearest_weather_station,
         name.y,
         lon,
         lat,
         daily_total,
         hour,
         count)

## classification_seq = 0: UNCLASSIFIED 1: ALL VEHICLES 2: LIGHT VEHICLES 3: HEAVY VEHICLES -9: MISSING    
## traffic_direction_seq = 0: COUNTER 1: PRESCRIBED 2: BOTH -- not sure if this is needed
## cardinal_direction_seq = 1: NORTH 3: EAST 5: SOUTH 7: WEST 9: NORTHBOUND AND SOUTHBOUND 10: EASTBOUND AND WESTBOUND

## update factors based on above characteristics
traffic_vol_small$classification_seq <- factor(traffic_vol_small$classification_seq,levels=c(0,1,2,3,-9),labels = c("UNCLASSIFIED","ALL VEHICLES","LIGHT VEHICLES","HEAVY VEHICLES","MISSING"))
traffic_vol_small$traffic_direction_seq <- factor(traffic_vol_small$traffic_direction_seq,levels=c(0,1,2),labels = c("COUNTER","PRESCRIBED","BOTH"))
traffic_vol_small$cardinal_direction_seq <- factor(traffic_vol_small$cardinal_direction_seq,levels=c(1,3,5,7,9,10),labels = c("NORTH","EAST","SOUTH","WEST","NORTHBOUND AND SOUTHBOUND","EASTBOUND & WESTBOUND"))
traffic_vol_small$day_of_week <- factor(traffic_vol_small$day_of_week,levels=c(1,2,3,4,5,6,7),labels = c("MONDAY","TUESDAY","WEDNESDAY","THURSDAY","FRIDAY","SATURDAY","SUNDAY"))
datatable(
  head(traffic_vol_small), rownames=FALSE,
  options = list(
  dom = 't',
  scrollX = TRUE
))

***

Next, demographic information for each Local Government Area (LGA) is obtained using an ABS API.

#read in selected variables from the ABS table ABS_REGIONAL_LGA
LGAurl <- "http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetData/ABS_REGIONAL_LGA/ERP_18+ERP_21+ERP_6+200IND+MVC_14+MVC_15+MVC_16+MVC_17+MVC_18+MVC_19+MVC_20+MVC_21+MVC_22+MVC_38+CABEE_36+CABEE_35+INCOME_17+LAND.LGA2014..A/all?startTime=2011&endTime=2016&format=compact_v2"

LGA_raw <- readSDMX(LGAurl)
LGA <- as.data.frame(LGA_raw)
datatable(
  head(LGA), rownames=FALSE,
  options = list(
  dom = 't'
  ))

***

This does not contain the LGA name so this must also be sourced from the metadata.

# Names
LGAdsd_url <- "http://stat.data.abs.gov.au/restsdmx/sdmx.ashx/GetDataStructure/ABS_REGIONAL_LGA"
LGAdsd <- readSDMX(LGAdsd_url)
# Get codelists from DSD
LGAcls <- slot(LGAdsd, "codelists")
LGAcodelists <- sapply(slot(LGAcls, "codelists"), function(x) slot(x, "id")) #get list of codelists
LGAcodelist <- as.data.frame(slot(LGAdsd, "codelists"), codelistId = "CL_ABS_REGIONAL_LGA_REGION") #get a codelist
# Get concepts from DSD
LGAconcepts <- as.data.frame(slot(LGAdsd, "concepts"))
datatable(
  head(LGAcodelist), rownames=FALSE,
  options = list(
  dom = 't'
  ))

***

The two tables are then merged and cleaned.

# Join
LGA_full <- inner_join(LGA,LGAcodelist,c("REGION" = "id")) 

# Transform LGA region names into correct format to match other data
LGA_full$label.en <- gsub(" \\(.*$","", LGA_full$label.en)

# Apply 2014 LGA land area to all years
land.area.2014 <- LGA_full[LGA_full$MEASURE=="LAND",c("REGION","OBS_VALUE")]
LGA_full <- merge(x = LGA_full, y = land.area.2014, by = "REGION", all.x = TRUE)

# Make a wide table
LGA.wide <- dcast(LGA_full, REGION + label.en + TIME + OBS_VALUE.y ~ MEASURE, value.var = c("OBS_VALUE.x"))

#change col names
setnames(LGA.wide,c("label.en","TIME", "OBS_VALUE.y","ERP_21","ERP_18", "ERP_6", "CABEE_36", "CABEE_35", "INCOME_17"),c("lga","year","lga.area.2014","pop.density","pop.work.age.percent","pop.school.age.percent","transport.businesses", "total.businesses", "median.income"))

# Change variable classes 
LGA.wide$year <- as.integer(LGA.wide$year)
LGA.wide[,c("lga.area.2014","pop.density","pop.work.age.percent","pop.school.age.percent","MVC_14","MVC_15","MVC_16","MVC_17","MVC_18","MVC_19","MVC_20","MVC_21","MVC_22","transport.businesses", "total.businesses", "median.income")] <- as.numeric(unlist(LGA.wide[,c("lga.area.2014","pop.density","pop.work.age.percent","pop.school.age.percent","MVC_14","MVC_15","MVC_16","MVC_17","MVC_18","MVC_19","MVC_20","MVC_21","MVC_22","transport.businesses", "total.businesses", "median.income")]))

# Create light vehicle and heavy vehicle stats
LGA.wide$vehicles.light <- sum(LGA.wide$MVC_14, LGA.wide$MVC_15, LGA.wide$MVC_16, LGA.wide$MVC_17, LGA.wide$MVC_22, na.rm=TRUE)
LGA.wide$vehicles.heavy <- sum(LGA.wide$MVC_18, LGA.wide$MVC_19, LGA.wide$MVC_20, LGA.wide$MVC_21, na.rm=TRUE)

# Create densities for vehicles, businesses
LGA.wide$density.vehicles.light <- LGA.wide$vehicles.light / LGA.wide$lga.area.2014
LGA.wide$density.vehicles.heavy <- LGA.wide$vehicles.heavy / LGA.wide$lga.area.2014
LGA.wide$density.transport.businesses <- LGA.wide$transport.businesses / LGA.wide$lga.area.2014
LGA.wide$density.total.businesses <- LGA.wide$total.businesses / LGA.wide$lga.area.2014

# Subset for final columns
LGA.wide <- LGA.wide[,c("lga","year","lga.area.2014","pop.density","pop.work.age.percent","pop.school.age.percent","density.vehicles.light","density.vehicles.heavy","density.transport.businesses","density.total.businesses","median.income")]

datatable(
  head(LGA.wide), rownames=FALSE,
  options = list(
  dom = 't',
  scrollX = TRUE
  ))

***

Once cleaned, the ABS demographic data is added to the traffic volume data. Here, the distance from each traffic counter station to the CBD by road is computed using a Google Maps API. We will reorder this computation to ensure that all data points are considered.

# just a TEST for now, not the full traffic dataset
traffic_vol_test <- traffic_vol_small[seq(1, 90000, by = 6500),]
traffic.with.abs <- merge(x = traffic_vol_test, y = LGA.wide, by = c("lga", "year"), all.x = TRUE)

# Calculate distance from Sydney CBD for each station

station.locations <- unique(traffic_vol_test[,c("station_key","wgs84_latitude","wgs84_longitude")])

# Function to calculate distance by road to Sydney Tower
get.distance <- function(x,y,z) {
  results = gmapsdistance(
    origin = paste(x,y,sep=","), 
    destination = "-33.8704512,151.2058792", 
    mode = "driving", 
    traffic_model = "best_guess",
    shape = "long",
    key = "AIzaSyDDihK34ya701nYseOdUXLDwH7XWfYMuC0")
  results.df<-data.frame(results)
  results.df$station_key <- z
  results.df
}

# calculate distance for each station
distance.df <- get.distance(station.locations$wgs84_latitude,station.locations$wgs84_longitude, station.locations$station_key)
distances.table <- distance.df[,c("station_key", "Distance.Distance")]

# add to the traffic table
traffic.with.abs <- inner_join(traffic.with.abs,distances.table, by = "station_key")

The following table shows a subsection of the merged data. It includes traffic volume counts at various counter stations which are linked to the closest weather station and demographic data for the relevant LGA. (Only one example is shown due to RMarkdown functionality.)

datatable(
  head(traffic.with.abs,n=1), rownames=FALSE,
  options = list(
  dom = 't',
  scrollX = TRUE
  ))

***

Traffic Volume Prediction

Gridlock Analytics

17 September 2017