knitr::opts_chunk$set(echo = TRUE)

Task: Explore the Dublin Bus GTFS data set, which contains scheduled service information.

My approach was to explore the geographical information contained within this GTFS data set to create some interesting insights and visually appealing maps.

1. Set-up

Load some data-wrangling, visualisation and geospatial/GIS libraries into R.

# Load some libraries for reading & wrangling data
library(knitr)
library(tidyverse)
library(pryr)
library(dplyr)
library(ggplot2)
library(plotly)
# Spatial packages
library(GISTools)
library(sp)
library(rgdal)
library(rgeos)
library(leaflet)
library(raster)

Create a custom function, used throughout the script, to view the first few records of a data frame.

# To save writing two function calls everywhere, create a combination of the head()
#  and the kable() functions
kead <- function(x){
  kable(head(x))
}

2. Load data

The next step was to load in the GTFS data from the zip file that was provided with the technical exercise.

# Importing the GTFS data
# Get absolute path to zip file
zip <- "C:\\Users\\billy.archbold\\Documents\\Research\\Python\\GTFS using jupyter notebook\\data\\raw\\gtfs.zip"

# Create directory to unzip and store gtfs txt files
outDir <- substring(zip, 1, nchar(zip)-4)
if (file.exists(outDir)){
    setwd(outDir)
} else {
    dir.create(outDir)
    setwd(outDir)
}

# Read gtfs txt files into R session memory
unzip(zip, exdir = outDir)
agency <- read_csv("agency.txt")
calendar <- read_csv("calendar.txt")
calendar_dates <- read_csv("calendar_dates.txt")
routes <- read_csv("routes.txt")
shapes <- read_csv("shapes.txt")
stop_times <- read_csv("stop_times.txt")
stops <- read_csv("stops.txt")
transfers <- read_csv("transfers.txt")
trips <- read_csv("trips.txt")

3. Initial exploration

Get an idea of the quantity of data in each txt file and also glimpse the first few records of each table.

# First, get an idea of the dimensions of each table
dim(agency)
## [1] 1 5
dim(calendar)
## [1]  5 10
dim(calendar_dates)
## [1] 17  3
dim(routes)
## [1] 272   5
dim(shapes)
## [1] 285725      5
dim(stop_times)
## [1] 1549757       9
dim(stops)
## [1] 4297    4
dim(transfers)
## [1] 5233    4
dim(trips)
## [1] 28338     6
# Look at the head of each table

kead(calendar)
service_id monday tuesday wednesday thursday friday saturday sunday start_date end_date
y103m 0 0 0 0 0 1 0 20190203 20190531
y103l 1 1 1 1 1 0 0 20190203 20190531
y103j 1 0 0 0 0 0 1 20190203 20190531
y103l#1 0 1 1 1 1 0 0 20190129 20190202
y103k 0 0 0 0 0 1 0 20190129 20190202
kead(calendar_dates)
service_id date exception_type
y103j 20190204 2
y103j 20190211 2
y103j 20190218 2
y103j 20190225 2
y103j 20190304 2
y103j 20190311 2
kead(routes)
route_id agency_id route_short_name route_long_name route_type
60-1-b12-1 978 1 NA 3
60-102-b12-1 978 102 NA 3
60-104-b12-1 978 104 NA 3
60-11-b12-1 978 11 NA 3
60-111-b12-1 978 111 NA 3
60-114-b12-1 978 114 NA 3
kead(shapes)
shape_id shape_pt_lat shape_pt_lon shape_pt_sequence shape_dist_traveled
60-116-b12-1.58.O 53.32963 -6.249012 1 0.0000
60-116-b12-1.58.O 53.32932 -6.248409 2 53.2635
60-116-b12-1.58.O 53.32906 -6.247820 3 102.0897
60-116-b12-1.58.O 53.32852 -6.246715 4 196.9001
60-116-b12-1.58.O 53.32841 -6.246390 5 221.9600
60-116-b12-1.58.O 53.32841 -6.246390 6 221.9600
kead(stop_times)
trip_id arrival_time departure_time stop_id stop_sequence stop_headsign pickup_type drop_off_type shape_dist_traveled
10304.y103m.60-1-d12-1.1.O 14:40:00 14:40:00 8240DB000226 1 Sandymount 0 0 0.0000
10304.y103m.60-1-d12-1.1.O 14:40:58 14:40:58 8240DB000228 2 Sandymount 0 0 258.1390
10304.y103m.60-1-d12-1.1.O 14:41:48 14:41:48 8240DB000229 3 Sandymount 0 0 484.9086
10304.y103m.60-1-d12-1.1.O 14:43:06 14:43:06 8240DB000227 4 Sandymount 0 0 836.9790
10304.y103m.60-1-d12-1.1.O 14:43:56 14:43:56 8240DB000230 5 Sandymount 0 0 1066.4451
10304.y103m.60-1-d12-1.1.O 14:45:00 14:45:00 8240DB000231 6 Sandymount 0 0 1359.0993
kead(stops)
stop_id stop_name stop_lat stop_lon
8220B007612 Davenport Hotel Merrion Street 53.34135 -6.250529
8220DB000002 Rotunda, Parnell Square West 53.35224 -6.263723
8220DB000003 Rotunda, Granby Place 53.35231 -6.263811
8220DB000004 Rotunda, Rotunda Hospital 53.35257 -6.264176
8220DB000006 Rotunda, Saint Martin’s Chapel 53.35275 -6.264454
8220DB000007 Rotunda, Rotunda Hospital 53.35284 -6.264570
kead(transfers)
from_stop_id to_stop_id transfer_type min_transfer_time
8250DB003073 8250DB003040 2 60
8250DB003073 8250DB003039 2 240
8230DB003372 8230DB002223 2 300
8230DB003372 8230DB002224 2 180
8230DB003367 8230DB002223 2 300
8230DB003367 8230DB002224 2 180
kead(trips)
route_id service_id trip_id shape_id trip_headsign direction_id
60-1-d12-1 y103j 15367.y103j.60-1-d12-1.1.O 60-1-d12-1.1.O Shanard Road (Shanard Avenue) - Saint John’s Road East 0
60-1-d12-1 y103j 15397.y103j.60-1-d12-1.1.O 60-1-d12-1.1.O Shanard Road (Shanard Avenue) - Saint John’s Road East 0
60-1-d12-1 y103j 15417.y103j.60-1-d12-1.1.O 60-1-d12-1.1.O Shanard Road (Shanard Avenue) - Saint John’s Road East 0
60-1-d12-1 y103j 15369.y103j.60-1-d12-1.1.O 60-1-d12-1.1.O Shanard Road (Shanard Avenue) - Saint John’s Road East 0
60-1-d12-1 y103j 15385.y103j.60-1-d12-1.1.O 60-1-d12-1.1.O Shanard Road (Shanard Avenue) - Saint John’s Road East 0
60-1-d12-1 y103j 15419.y103j.60-1-d12-1.1.O 60-1-d12-1.1.O Shanard Road (Shanard Avenue) - Saint John’s Road East 0
# We can see there is data in every table - which is not always the case with GTFS data

4. Spatial data QA check

Run some quick QA checks for corrupted spatial data in the shapes & stops tables. Plot all of the coordinates to make sure they are located in or around Dublin. With Irish coordinates, if lat and lon are accidentally swapped, points will plot off the coast of Africa… not good for distance calculations etc!
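Before building any spatial objects, a quick numeric range check on the raw coordinates can also catch swapped columns early. Below is a minimal sketch of such a check; the bounding-box limits and the coord_check() helper are illustrative assumptions, not part of the supplied data.

# Rough (assumed) bounding box for the greater Dublin area
dublin_bbox <- list(lon = c(-6.8, -5.9), lat = c(53.1, 53.7))

# Count coordinate pairs that fall outside the box (or are missing)
coord_check <- function(lon, lat, bbox = dublin_bbox) {
  bad <- is.na(lon) | is.na(lat) |
    lon < bbox$lon[1] | lon > bbox$lon[2] |
    lat < bbox$lat[1] | lat > bbox$lat[2]
  sum(bad)
}

coord_check(shapes$shape_pt_lon, shapes$shape_pt_lat)
coord_check(stops$stop_lon, stops$stop_lat)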

# Create a SpatialPointsDataFrame object to plot the data
shapes_spdf <- SpatialPointsDataFrame(shapes[c("shape_pt_lon", "shape_pt_lat")], shapes)

# Also do same for stops coordinates
stops_spdf <- SpatialPointsDataFrame(stops[c("stop_lon", "stop_lat")], stops)

# Create a projection for data to World Geodetic System 1984, used in GPS - EPSG:4326
#  i.e. what Google Maps uses
# https://epsg.io/4326
newProj <- CRS("+init=epsg:4326")
proj4string(shapes_spdf) <- newProj
proj4string(stops_spdf) <- newProj

# Update the working directory having read in the zip file 
setwd("C:/Users/billy.archbold/Documents/Research/Python/GTFS using jupyter notebook/")

# Read county shapefile into R to help with sense-checking
# Data downloaded from data.gov.ie
# https://data.gov.ie/dataset/counties-osi-national-statutory-boundaries-generalised-20m1/resource/366a5e7f-7bc5-470f-a820-34a2a5469d73
county <- readOGR(dsn = "./data/raw/county", layer = "county")
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Users\billy.archbold\Documents\Research\Python\GTFS using jupyter notebook\data\raw\county", layer: "county"
## with 26 features
## It has 14 fields
county_dublin <- county[county$COUNTY %in% "DUBLIN", ]
# Re-project this shapefile to the same projection WGS84
county_dublin <- spTransform(county_dublin, newProj)

# Quick plot to verify data 
plot(county_dublin)
plot(shapes_spdf, pch = 1, cex = 0.2, add = TRUE, col = "red")

# Spatial data QA looks good at first glance, no misplaced coordinates

5. Understand the data

To understand the data, its structure and how it all ties together, I decided to zero in on a route I know well from when I lived in Rathgar in Dublin a couple of years ago - the 15B.

I begin by searching route_id for “15B”, as I notice the route names are embedded in this ID.

I then take a specific route and finally narrow my selection down to a single trip (i.e. one journey in a particular direction, at a specific time of day, for a specific service).

# Investigate Routes table
dim(routes)
## [1] 272   5
kead(routes)
route_id agency_id route_short_name route_long_name route_type
60-1-b12-1 978 1 NA 3
60-102-b12-1 978 102 NA 3
60-104-b12-1 978 104 NA 3
60-11-b12-1 978 11 NA 3
60-111-b12-1 978 111 NA 3
60-114-b12-1 978 114 NA 3
length(routes$route_id)
## [1] 272
length(unique(routes$route_id))
## [1] 272

Search for a specific route I know & used to use when I lived in Rathgar (15b).

# Search for a specific route I know & used to use when I lived in Rathgar (15b)
routes[grep("15b", routes$route_id, ignore.case=T), ] %>% kable
route_id agency_id route_short_name route_long_name route_type
60-15B-b12-1 978 15b NA 3
60-15B-d12-1 978 15b NA 3
# 60-15B-b12-1
# 60-15B-d12-1

# To plot the route, first join routes to trips, then trips to shapes
route_spdf <- routes %>% 
  left_join(trips) %>% 
  left_join(shapes)

# Select a specific route
route_spdf <- route_spdf[route_spdf$route_id %in% "60-15B-d12-1", ]

# Create a SPDF object
route_spdf <- SpatialPointsDataFrame(
  coords = route_spdf[c("shape_pt_lon", "shape_pt_lat")], 
  data = route_spdf
)

# Rename columns
colnames(route_spdf@data)[11] <- "lat"
colnames(route_spdf@data)[12] <- "lon"

# Create individual route objects to explore
route_spdf1 <- route_spdf[route_spdf$shape_id %in% "60-15B-d12-1.410.O", ]
route_spdf2 <- route_spdf[route_spdf$shape_id %in% "60-15B-d12-1.411.I", ]

# Plot both routes on a simple map of Dublin
plot(county_dublin)
plot(route_spdf1, pch = 1, cex = 0.2, add = TRUE, col = "red")
plot(route_spdf2, pch = 1, cex = 0.2, add = TRUE, col = "blue")

# One route (the 15b)
# How many trips does it have?
length(unique(route_spdf$trip_id))
## [1] 332
# Get information for one trip (i.e. get a trip ID to investigate)
kead(route_spdf2)
route_id agency_id route_short_name route_long_name route_type service_id trip_id shape_id trip_headsign direction_id lat lon shape_pt_sequence shape_dist_traveled
60-15B-d12-1 978 15b NA 3 y103j 15042.y103j.60-15B-d12-1.411.I 60-15B-d12-1.411.I Dalriada Estate - Barrow Street 1 53.27137 -6.325088 1 0.0000
60-15B-d12-1 978 15b NA 3 y103j 15042.y103j.60-15B-d12-1.411.I 60-15B-d12-1.411.I Dalriada Estate - Barrow Street 1 53.27150 -6.324513 2 40.8534
60-15B-d12-1 978 15b NA 3 y103j 15042.y103j.60-15B-d12-1.411.I 60-15B-d12-1.411.I Dalriada Estate - Barrow Street 1 53.27164 -6.323653 3 100.3345
60-15B-d12-1 978 15b NA 3 y103j 15042.y103j.60-15B-d12-1.411.I 60-15B-d12-1.411.I Dalriada Estate - Barrow Street 1 53.27170 -6.323651 4 106.4172
60-15B-d12-1 978 15b NA 3 y103j 15042.y103j.60-15B-d12-1.411.I 60-15B-d12-1.411.I Dalriada Estate - Barrow Street 1 53.27175 -6.323574 5 114.2275
60-15B-d12-1 978 15b NA 3 y103j 15042.y103j.60-15B-d12-1.411.I 60-15B-d12-1.411.I Dalriada Estate - Barrow Street 1 53.27176 -6.323453 6 122.2898

Plot the 15b on an interactive leaflet map.

These types of maps are very useful for exploring geospatial data.

Colour the markers by the “stop_sequence” to check which direction the trip is heading.

# Use the stop lat/lon instead of the shape points, as it gives an approximate enough
#  shape without duplicating fields we don't want duplicated, like distance & time
trip_15b <- stop_times %>% 
  left_join(trips) %>% 
  left_join(routes) %>% 
  left_join(stops) %>%
  filter(trip_id %in% "4765.y103l.60-15B-d12-1.411.I")

# Create a SpatialPointsDataFrame object 
trip_15b <- SpatialPointsDataFrame(
  coords = trip_15b[c("stop_lon", "stop_lat")], 
  data = trip_15b
)

# Load additional library for colours 
library(RColorBrewer)
#  display.brewer.all()

# Create a colour palette for map
pal <- colorNumeric(
  palette = "YlOrRd",
  domain = trip_15b$stop_sequence)

# Display all stops for 15b trip on a map coloured by stop sequence 
#  thus indicating route direction on map
leaflet() %>%
  addProviderTiles("CartoDB.DarkMatter") %>%
  addCircleMarkers(
    data = trip_15b, 
    lat = trip_15b$stop_lat, 
    lng = trip_15b$stop_lon, 
    group = "red", 
    color = pal(trip_15b$stop_sequence),
    label = trip_15b$stop_sequence,
    radius = 2
    ) %>%
  addLegend("bottomright", pal = pal, values = trip_15b$stop_sequence,
    title = "Stop Sequence",
    opacity = 1
  )

6. Create info for trip_15b

Next, I think it would be interesting to use the time fields in the data set.
I want to plot each stop in the 15b route by the time it took to get there from the previous stop.
To achieve this, I need to copy the time field (it doesn’t seem to matter which one, as arrival and departure are the same) into a new column and shift the values down by one.

To do this I use the handy dplyr::lag() function.
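As a quick illustration of what lag() combined with coalesce() does here, consider a toy vector (the values below are made up purely for illustration):

# Toy example of lag() + coalesce(): shift values down by one,
#  then fill the leading NA with the original first value
x <- c(10, 12, 15, 21)
dplyr::lag(x, 1)                     # NA 10 12 15
dplyr::coalesce(dplyr::lag(x, 1), x) # 10 10 12 15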

# Colour stops of trip by time?

# Look at the time values
kead(trip_15b[c("arrival_time", "departure_time")])
arrival_time departure_time
08:00:00 08:00:00
08:00:36 08:00:36
08:01:53 08:01:53
08:02:38 08:02:38
08:03:29 08:03:29
08:04:12 08:04:12
# Arrival & departure times are the same

# Calculate the difference in time it took to get from last stop to 
#  current stop to yield some interesting info associated with each 
#  stop lat/lon

# First need to shift values in columns, then subtract
# Create lagged arrival time column
# Coalesce NA with original time value so that difference in 
#  time from first stop will be zero
trip_15b$arrival_time_shift <- lag(trip_15b$arrival_time, 1) %>% 
  coalesce(trip_15b$arrival_time)

# View results to ensure time is shifted correctly as we want
kead(trip_15b[c("arrival_time", "arrival_time_shift")])
arrival_time arrival_time_shift
08:00:00 08:00:00
08:00:36 08:00:00
08:01:53 08:00:36
08:02:38 08:01:53
08:03:29 08:02:38
08:04:12 08:03:29
# Looks good

# Get time difference
#trip_15b$time_diff <- trip_15b$arrival_time - trip_15b$arrival_time_shift
trip_15b$time_diff <- as.numeric(
    as.POSIXct(strptime(trip_15b$arrival_time, "%H:%M:%S")) -
    as.POSIXct(strptime(trip_15b$arrival_time_shift, "%H:%M:%S"))
    )

# Once again, view results to ensure the time difference is computed correctly
kead(trip_15b[c("arrival_time", "arrival_time_shift", "time_diff")])
arrival_time arrival_time_shift time_diff
08:00:00 08:00:00 0
08:00:36 08:00:00 36
08:01:53 08:00:36 77
08:02:38 08:01:53 45
08:03:29 08:02:38 51
08:04:12 08:03:29 43
# Looks good


# Plot the stops on a map, coloured by time_diff
pal <- colorNumeric(
  palette = "YlOrRd",
  domain = trip_15b$time_diff
  )
leaflet() %>%
  addProviderTiles("CartoDB.DarkMatter") %>%
  addCircleMarkers(
    data = trip_15b, 
    lat = trip_15b$stop_lat, 
    lng = trip_15b$stop_lon, 
    group = "red", 
    color = pal(trip_15b$time_diff),
    radius = 2,
    label = ~htmltools::htmlEscape(time_diff)
    ) %>%
  addLegend("bottomright", pal = pal, values = trip_15b$time_diff,
    title = "Time Difference (s)",
    opacity = 1
  )

Interestingly, and as you would expect, the stops in and around the city centre/quays have the largest time differences between them. This is probably due to traffic and traffic lights in the city centre.

It seems like an interesting angle to look at for the rest of the task.

7. Zone in on a particular dataset

As mentioned above, it looks like the time between stops is greater in the city centre (as you would expect) for the single trip I looked at for the 15b route.

For the rest of the task, I’m going to explore this geographical pattern.

To do this, I’ll need to wrangle a subset of the main data. I don’t need to look at every trip; rather, I will identify a single service to look at (e.g. the Monday - Friday service, which is probably the busiest) and zone in on a particular time of day (e.g. the morning traffic rush at 08:00 - 09:00).

I will then explore the relationship between congestion and location further.

# First of all, identify the busiest service 
kable(calendar)
service_id monday tuesday wednesday thursday friday saturday sunday start_date end_date
y103m 0 0 0 0 0 1 0 20190203 20190531
y103l 1 1 1 1 1 0 0 20190203 20190531
y103j 1 0 0 0 0 0 1 20190203 20190531
y103l#1 0 1 1 1 1 0 0 20190129 20190202
y103k 0 0 0 0 0 1 0 20190129 20190202
# Service "y103l" looks like the typical Mon - Fri working week service

# Select the largest service
largest_service <- trips %>% 
  group_by(service_id) %>% 
  count(service_id) %>%
  arrange(desc(n)) %>%
  data.frame() %>% 
  head(1)

This is the busiest service (Mon - Fri).

# "y103l" is the largest service with the most no. of trips at 6,839
kable(largest_service)
service_id n
y103l 6839
# Now, create the main/base data set to work with 
# Everything we need should be in here, allowing us to just filter out
#  what we don't need
stop_times <- stop_times %>% 
  left_join(trips) %>% 
  left_join(routes) %>% 
  left_join(stops)

Confirm there are only two directions.

# Confirm there are only two directions in the stop_times data
table(stop_times$direction_id)
## 
##      0      1 
## 768237 781520

Get a feel for how many stops there are per route/trip.

# Get a feel for how many stops there are per route/trip
# Some exploratory stats on some fields
class(stop_times$stop_sequence)
## [1] "numeric"
mean(stop_times$stop_sequence) # mean
## [1] 31.41657
median(stop_times$stop_sequence) # median
## [1] 29
sd(stop_times$stop_sequence) # sd
## [1] 20.23516
library(e1071) # library for skew
skewness(stop_times$stop_sequence)
## [1] 0.5108326
hist(
  stop_times$stop_sequence,
  xlab='No. Stops',
  main='Distribution of the no. stops in the data set'
  )

boxplot(stop_times$stop_sequence, main = "No. stops per trip")

So, I just want to pick one trip for each route around 08:30 am (i.e. a busy time of the morning with traffic etc). I want to create a snapshot of Dublin Bus public transport during the week.

# So, I just want to pick one trip for each route around 08:30 am (i.e. 
#  a busy time of the morning with traffic etc)
# I want to create a snapshot of Dublin Bus public transport during the 
#  week
# How do I do this?
# Query below currently has all trips for a route in a particular 
#  direction for a particular service
my_trips <- stop_times %>% 
  left_join(trips) %>% 
  left_join(routes) %>% 
  left_join(stops) %>%
  filter(
    # Limit to only one direction
    direction_id == 1 &
    # Limit to the weekday service (i.e. service that has most trips scheduled)
    service_id == largest_service$service_id
    ) %>%
    group_by(route_id, trip_id) %>%
    arrange(stop_sequence, departure_time) %>%
    mutate(
      arrival_time_shift = coalesce(lag(arrival_time, 1), arrival_time),
      time_diff = as.numeric(
        as.POSIXct(strptime(arrival_time, "%H:%M:%S")) -
        as.POSIXct(strptime(arrival_time_shift, "%H:%M:%S"))
      )
    ) %>%
    arrange(route_id, trip_id, stop_sequence, departure_time) %>% 
    data.frame()


# Next, get a subset of trips from ALL routes that occur between a certain time
# Here we're looking at trips between 08:10 and 08:50 
my_trips_timeslice <- my_trips %>%
  filter(
    stop_sequence == 1 & 
    as.POSIXct(strptime(departure_time, "%H:%M:%S")) > as.POSIXct(strptime("08:10:00", "%H:%M:%S")) &
    as.POSIXct(strptime(departure_time, "%H:%M:%S")) < as.POSIXct(strptime("08:50:00", "%H:%M:%S"))
  ) %>% 
  dplyr::select(route_id, trip_id)

# Some routes obviously have trips that leave every five mins or so, whereas some 
#  originating in places like Maynooth may only leave once or twice in the hour. 

dim(my_trips_timeslice)
## [1] 148   2
length(unique(my_trips_timeslice$route_id))
## [1] 62
# There are some duplicated trips in this subset now - however we only want one trip 
#  per route, so let's remove the duplicates. 
# We just want to get single trips all around the same time so that they have 
#  similar traffic. 
# No use comparing a 6am trip with an 8.45am trip as traffic will be vastly different,
#  so the times between stops will be different. 
# When removing dupes here - it doesn't matter which records we retain per route as we only 
#  want to display one trip per route to get a general idea of the shape etc...

# Remove duplicated trips per route
my_trips_timeslice <- my_trips_timeslice[!duplicated(my_trips_timeslice$route_id), ]

# "my_trips_timeslice" now contains a single trip for all routes that occur within a 
#  specified time-frame 

# Subset the main df with everything to only desired trips for the "time slice" I am 
#  interested in
my_trips_timeslice <- my_trips %>%
  filter(
    trip_id %in% my_trips_timeslice$trip_id
  ) 

# Create SPDF for plotting
my_trips_timeslice <- SpatialPointsDataFrame(
  coords = my_trips_timeslice[c("stop_lon", "stop_lat")], 
  data = my_trips_timeslice
)

dim(my_trips_timeslice)
## [1] 3125   23
plot(county_dublin)
plot(my_trips_timeslice, pch = 1, cex = 0.3, col = "red", add = TRUE)

# We can see this is a pretty good representation of a weekday morning in 
#  Dublin. There are routes from all over coming into the city 

Create a boxplot of the difference in time between stops for my “snapshot” data set of around 08:30.

# Let's explore this data set a little bit more
boxplot(my_trips_timeslice$time_diff, main = "Time diff between stops")

# Time diff between stops is mostly around 100 seconds, with a couple of 
#  outliers

First, plot all the stop data on a map and verify the direction of travel is all heading one way (i.e. in toward the city).

For the most part, this looks to be correct.

# Let's first verify that all traffic is heading in towards the city
# Verify that all traffic is heading one-way
pal <- colorNumeric(
  palette = "YlOrRd",
  domain = my_trips_timeslice$stop_sequence
  )
leaflet() %>%
  addProviderTiles("CartoDB.DarkMatter") %>%
  addCircleMarkers(
    data = my_trips_timeslice, 
    lat = my_trips_timeslice$stop_lat, 
    lng = my_trips_timeslice$stop_lon, 
    color = pal(my_trips_timeslice$stop_sequence),
    radius = 1,
    weight = 2,
    label = ~htmltools::htmlEscape(
      paste0( 
        stop_sequence, " - ",
        time_diff
      )
    )
  ) %>%
  addLegend("bottomright", pal = pal, values = my_trips_timeslice$stop_sequence,
    title = "Stop Sequence",
    opacity = 1
  )
# For the most part it looks like trips are heading in towards the city centre.
#  Dots are coloured by stop sequence with a colour scale of yellow = low, 
#  red = high as per map legend

Now we will plot all of the stops for our snapshot data set of around 08:30, with each marker coloured by the time_diff variable.

# Next, let's map the time_diff variable at each stop to explore congestion 
pal <- colorNumeric(
  palette = "YlOrRd",
  domain = my_trips_timeslice$time_diff
  )
  
qpal <- colorQuantile(
  "YlOrRd", 
  my_trips_timeslice$time_diff, 
  n = 7
  )
  
leaflet() %>%
  addProviderTiles("CartoDB.DarkMatter") %>%
  addCircleMarkers(
    data = my_trips_timeslice, 
    lat = my_trips_timeslice$stop_lat, 
    lng = my_trips_timeslice$stop_lon, 
    color = qpal(my_trips_timeslice$time_diff),
    radius = 1,
    weight = 2,
    label = ~htmltools::htmlEscape(
      paste0( 
        stop_sequence, " - ",
        time_diff
      )
    )
  ) %>%
  addLegend("bottomright", pal = pal, values = my_trips_timeslice$time_diff,
    title = "Time Difference (s)",
    opacity = 1
  )

We can see some geographical pattern in time_diff at the stops: in the city centre there looks to be a cluster of high “time_diff” stops. This also seems to occur in some towns/bottlenecks en route to the city.

8. Generate more useful info

We have some useful information at each stop for all of these trips; however, let's build this data set up a little bit more.

Things to add are:

  1. Distance traveled between each stop
  2. Speed of travel between stops (speed = distance / time)

Using speed between stops might give us a better representation of congestion at peak traffic times, as in congested areas the time difference will be larger and the stops (I’m assuming) will be closer together.
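For reference, shape_dist_traveled appears to be in metres and time_diff is in seconds (this is the assumption the script makes), so multiplying m/s by 3.6 converts to km/h. A quick worked example with made-up numbers, before applying it to the full data set:

# e.g. 350 m covered in 70 s (illustrative numbers only)
dist_m <- 350
time_s <- 70
(dist_m / time_s) * 3.6 # 5 m/s, i.e. 18 km/h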

# Distance
#------------
# Create the shifted distance column (like the time before it)
my_trips_timeslice$shape_dist_traveled_shift <- coalesce(
  lag(my_trips_timeslice$shape_dist_traveled, 1), 
  my_trips_timeslice$shape_dist_traveled
  )

# As we can see, the lag() function has shifted the distance down into the next group;
#  this needs to be replaced with zero for the dist_diff calc
kable(head(my_trips_timeslice[c("shape_dist_traveled", "shape_dist_traveled_shift")], 10))
shape_dist_traveled shape_dist_traveled_shift
0.0000 0.0000
346.5103 0.0000
477.3297 346.5103
779.5546 477.3297
1141.6930 779.5546
1298.5584 1141.6930
1660.4862 1298.5584
1949.7433 1660.4862
2065.9719 1949.7433
2346.1597 2065.9719
# Identify the first stop sequence in each group, where the shifted distance should be replaced with zero
# Display the info to verify it
my_trips_timeslice@data %>% 
  group_by(trip_id) %>% 
  filter(stop_sequence == min(stop_sequence)) %>% 
  dplyr::select(trip_id, arrival_time, departure_time, stop_sequence, arrival_time_shift, time_diff, shape_dist_traveled_shift) %>%
  kead()
trip_id arrival_time departure_time stop_sequence arrival_time_shift time_diff shape_dist_traveled_shift
4387.y103l.60-1-d12-1.3.I 08:20:00 08:20:00 1 08:20:00 0 0.00
6556.y103l.60-11-d12-1.16.I 08:35:00 08:35:00 1 08:35:00 0 12267.31
5174.y103l.60-120-d12-1.62.I 08:15:00 08:15:00 1 08:15:00 0 18834.73
3591.y103l.60-122-d12-1.71.I 08:20:00 08:20:00 1 08:20:00 0 10909.22
3438.y103l.60-123-d12-1.75.I 08:40:00 08:40:00 1 08:40:00 0 15432.26
267.y103l.60-13-d12-1.29.I 08:12:00 08:12:00 1 08:12:00 0 13350.55
# Replace each incorrect field with zero
my_trips_timeslice@data[my_trips_timeslice$stop_sequence == 1, ]$shape_dist_traveled_shift <- 0

# Display info to verify it has been replaced correctly 
kable(head(my_trips_timeslice[c("shape_dist_traveled", "shape_dist_traveled_shift")], 10))
shape_dist_traveled shape_dist_traveled_shift
0.0000 0.0000
346.5103 0.0000
477.3297 346.5103
779.5546 477.3297
1141.6930 779.5546
1298.5584 1141.6930
1660.4862 1298.5584
1949.7433 1660.4862
2065.9719 1949.7433
2346.1597 2065.9719
# Now create the dist_diff field by subtracting the distance travelled between stops
my_trips_timeslice$dist_diff <- my_trips_timeslice$shape_dist_traveled - my_trips_timeslice$shape_dist_traveled_shift

# View the info to verify it has been correctly computed
kable(head(my_trips_timeslice[c("shape_dist_traveled", "shape_dist_traveled_shift", "dist_diff")], 10))
shape_dist_traveled shape_dist_traveled_shift dist_diff
0.0000 0.0000 0.0000
346.5103 0.0000 346.5103
477.3297 346.5103 130.8194
779.5546 477.3297 302.2249
1141.6930 779.5546 362.1384
1298.5584 1141.6930 156.8654
1660.4862 1298.5584 361.9278
1949.7433 1660.4862 289.2572
2065.9719 1949.7433 116.2286
2346.1597 2065.9719 280.1878
# Speed
#------------
# Create the speed field
my_trips_timeslice$speed <- (my_trips_timeslice$dist_diff / my_trips_timeslice$time_diff) * 3.6 # to get km/h

# View to verify it's correct
kable(head(my_trips_timeslice[c("stop_sequence", "speed")], 10))
stop_sequence speed
1 NaN
2 17.56954
3 17.44259
4 17.27000
5 18.10692
6 17.11259
7 17.37253
8 17.64959
9 17.43428
10 15.51809
# Replace any NaN with 0, caused by zero divided by zero at the first stop of each trip
my_trips_timeslice@data[is.na(my_trips_timeslice$speed), ]$speed <- 0

# Boxplots of the three new variables that are useful for us to explore congestion
# Create panel of three plot areas
par(mfrow=c(1,3))
boxplot(my_trips_timeslice$time_diff, main = "Time Diff")
boxplot(my_trips_timeslice$dist_diff, main = "Distance Diff")
boxplot(my_trips_timeslice$speed, main = "Speed")

# Reset from three-panel display back to single
par(mfrow=c(1,1)) 

Now, let's plot the speed between stops on the map for this snapshot data set.

# Plot the spatial distribution of speed variable at each stop
qpal <- colorQuantile(
  "YlOrRd", 
  my_trips_timeslice$speed, 
  n = 9,
  reverse = TRUE
  )
  
pal <- colorNumeric(
  palette = "YlOrRd",
  domain = my_trips_timeslice$speed,
  reverse = TRUE
  )
  
leaflet() %>%
  addProviderTiles("CartoDB.DarkMatter") %>%
  addCircleMarkers(
    data = my_trips_timeslice, 
    lat = my_trips_timeslice$stop_lat, 
    lng = my_trips_timeslice$stop_lon, 
    color = qpal(my_trips_timeslice$speed),
    radius = 1,
    weight = 2,
    opacity = my_trips_timeslice$speed/25, 
    label = ~htmltools::htmlEscape(
      paste0( 
        stop_sequence, " - ",
        round(speed, 1)
      )
    )
  ) %>%
  addLegend("bottomright", pal = pal, values = my_trips_timeslice$speed,
    title = "Speed Between Stops",
    opacity = 1
  )

Ignore the really high values of around 300 km/h on the legend of the leaflet map above; these are likely just outliers created by errors when calculating the distance variable. I'm not too worried about them, as the visualisation holds up.

boxplot(my_trips_timeslice$speed)
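If those outliers ever became a problem, one option (not applied here) would be to drop or cap implausible speeds before mapping. A minimal sketch, using an arbitrary, assumed ceiling of 80 km/h for an urban bus:

# Hypothetical clean-up: keep only stops with a plausible speed
#  (the 80 km/h threshold is an arbitrary assumption)
plausible <- my_trips_timeslice[my_trips_timeslice$speed <= 80, ]
summary(plausible$speed)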

Now we’re starting to see the congestion pattern a bit more clearly: towns and, of course, the city centre are “slower” between stops, while the straighter rural runs outside the towns and before the city centre are faster.

9. Basic summary stats

Perhaps we can display this pattern better if we bring in Geographically Weighted Statistics. However, first let's look at some basic summary statistics of the data set before we investigate how these vary geographically.

# First, let's look at basic summary stats

# Speed
mean(my_trips_timeslice$speed)
## [1] 18.36454
sd(my_trips_timeslice$speed)
## [1] 12.5538
library(e1071)
skewness(my_trips_timeslice$speed)
## [1] 9.170483
hist(
  my_trips_timeslice$speed,
  xlab='Speed',
  main='Distribution of speed between stops'
  )

# Distance
mean(my_trips_timeslice$dist_diff)
## [1] 370.7838
sd(my_trips_timeslice$dist_diff)
## [1] 400.1265
library(e1071)
skewness(my_trips_timeslice$dist_diff)
## [1] 16.55855
hist(
  my_trips_timeslice$dist_diff/1000,
  xlab='Distance (km)',
  main='Distribution of distance between stops'
  )

# Let's look at the correlation between these variables
cor(my_trips_timeslice$speed, my_trips_timeslice$dist_diff)
## [1] 0.2871195
plot(
  my_trips_timeslice$speed,
  my_trips_timeslice$dist_diff,
  xlab='Speed',
  ylab='Distance',
  pch=16,
  main = "Relationship of distance & speed"
  )

cor(my_trips_timeslice$time_diff, my_trips_timeslice$dist_diff)
## [1] 0.6470263
plot(
  my_trips_timeslice$time_diff,
  my_trips_timeslice$dist_diff,
  xlab='Time',
  ylab='Distance',
  pch=16,
  main = "Relationship of time & distance"
  )

cor(my_trips_timeslice$time_diff, my_trips_timeslice$speed)
## [1] -0.2292451
plot(
  my_trips_timeslice$time_diff, 
  my_trips_timeslice$speed,
  xlab='Time',
  ylab='Speed',
  pch=16,
  main = "Relationship of time & speed"
  )

10. GW summary stats

In order to look at Geographically Weighted summary stats, we will utilise the GWmodel package.

This package computes summary statistics within a moving window, so that the statistics can be mapped as you move through different geographical regions of a data set. This lets us see, for example, the spatial distribution of the mean speed at stops, which makes it a useful tool for further exploring congestion in Dublin using the Dublin Bus GTFS data.
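The key tuning parameter is the bandwidth of the moving window. The analysis below uses a small fixed bandwidth in map units; as an aside, if I recall the GWmodel API correctly, gwss() also supports an adaptive bandwidth (a fixed number of nearest neighbours rather than a fixed distance). A hedged sketch of that alternative, not the call actually used below:

# Sketch of an adaptive-bandwidth alternative (illustrative only);
#  assumes gwss() accepts 'adaptive' and 'kernel' arguments
library(GWmodel)
localstats_adaptive <- gwss(
  my_trips_timeslice,
  vars = c("speed", "dist_diff", "time_diff"),
  kernel = "bisquare",
  adaptive = TRUE, # bw is then interpreted as a number of nearest stops
  bw = 100         # illustrative neighbour count
)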

# Now, GW summary stats
library(GWmodel)

# Run the geographically weighted statistics with a very small bandwidth. 
# The bandwidth and distance metrics etc can all be tuned and chosen to best fit the problem. 
# Supply the SPDF object with the fields you want to run GW stats for
localstats1 <- gwss(my_trips_timeslice, vars=c("speed", "dist_diff", "time_diff"), bw = 0.02) 

# View the output of the GWSS (Geographically Weighted Summary Stats)
kead(data.frame(localstats1$SDF))
speed_LM dist_diff_LM time_diff_LM speed_LSD dist_diff_LSD time_diff_LSD speed_LVar dist_diff_LVar time_diff_LVar speed_LSKe dist_diff_LSKe time_diff_LSKe speed_LCV dist_diff_LCV time_diff_LCV Cov_speed.dist_diff Cov_speed.time_diff Cov_dist_diff.time_diff Corr_speed.dist_diff Corr_speed.time_diff Corr_dist_diff.time_diff Spearman_rho_speed.dist_diff Spearman_rho_speed.time_diff Spearman_rho_dist_diff.time_diff stop_lon stop_lat optional
16.31361 264.2227 57.85347 5.810505 111.5373 28.31494 33.76197 12440.56 801.7357 -0.2541301 -0.0439535 0.6080990 0.3561754 0.4221335 0.4894251 272.4219 7.574207 2845.948 0.4108448 0.0449963 0.8807670 0.2257314 -0.1938933 0.8432225 -6.212304 53.32421 TRUE
16.09739 255.9956 55.97294 5.718708 108.3206 27.04076 32.70362 11733.36 731.2025 -0.8878311 -0.4018681 0.2235289 0.3552569 0.4231349 0.4831041 303.2687 21.119996 2661.875 0.4767295 0.1329936 0.8849347 0.2416384 -0.1493769 0.8492666 -6.207643 53.32510 TRUE
16.00897 254.9370 55.63352 5.739082 110.2739 27.32418 32.93706 12160.32 746.6107 -0.9862921 -0.4209372 0.2197405 0.3584917 0.4325533 0.4911460 320.3995 27.888362 2775.448 0.4926481 0.1730587 0.8963413 0.2501043 -0.1122691 0.8629429 -6.207683 53.32593 TRUE
15.65033 251.9230 55.14702 5.771032 116.1046 28.42342 33.30482 13480.27 807.8905 -1.2035337 -0.3906860 0.2419225 0.3687482 0.4608731 0.5154116 367.0849 45.156220 3120.532 0.5320496 0.2673475 0.9183148 0.2809744 -0.0166861 0.8877120 -6.208925 53.32854 TRUE
15.11734 248.1796 55.44638 5.799734 121.6384 30.10894 33.63692 14795.90 906.5481 -1.4006973 -0.3277394 0.2797972 0.3836479 0.4901226 0.5430280 413.5066 57.152179 3461.550 0.5665810 0.3163644 0.9136142 0.3202199 0.0379037 0.8775845 -6.210162 53.33163 TRUE
14.95200 248.8810 56.51866 5.717727 122.0164 30.96639 32.69240 14888.00 958.9172 -1.4430954 -0.3028515 0.3739840 0.3824055 0.4902599 0.5478967 404.4570 52.677162 3513.728 0.5602319 0.2875050 0.8986620 0.3188191 0.0137395 0.8616336 -6.211228 53.33235 TRUE
localstats1$SDF
## class       : SpatialPointsDataFrame 
## features    : 3125 
## extent      : -6.590004, -6.054799, 53.19282, 53.46941  (xmin, xmax, ymin, ymax)
## crs         : NA 
## variables   : 24
## names       :         speed_LM,     dist_diff_LM,     time_diff_LM,        speed_LSD,    dist_diff_LSD,    time_diff_LSD,       speed_LVar,   dist_diff_LVar,   time_diff_LVar,        speed_LSKe,    dist_diff_LSKe,    time_diff_LSKe,          speed_LCV,     dist_diff_LCV,     time_diff_LCV, ... 
## min values  : 4.63632424950545, 180.858759721382, 24.6208555282233, 1.79797037843775, 64.8835687434621, 11.5814202138041, 3.23269748173957, 4209.87749288757,  134.12929416871, -5.43531766593676, -1.99660232751572, -2.13968952529695, 0.0727272498295415, 0.200669317790105, 0.154236753752155, ... 
## max values  : 54.8671213280394,  2063.8020264648, 178.167614637855, 88.3659890183201, 3046.64335712379, 246.051496175096, 7808.54801518587,  9282035.7455065, 60541.3387700035,  21.8085467981949,   46.024322984747,     18.1735660169,    2.0696581999384,  3.14684543591258,  1.69490279088316, ...

Once again for reference, the avg. speed in our snapshot data set is approximately 18.4 km/h.

# So the avg. speed in the data set is approximately 18.4 km/h
mean(my_trips_timeslice$speed)
## [1] 18.36454

However, we assume this varies geographically across Co. Dublin.

The next step is to plot the geographically weighted avg. speed on a map to study its spatial pattern.

# localstats1$SDF$speed_LM is the geographically weighted mean
# We can plot this on a leaflet map
qpal <- colorQuantile(
  "BuPu", 
  localstats1$SDF$speed_LM, 
  n = 9,
  reverse = FALSE
  )
  
pal <- colorNumeric(
  palette = "BuPu",
  domain = localstats1$SDF$speed_LM,
  reverse = FALSE
  )
  
leaflet() %>%
  addProviderTiles("CartoDB.DarkMatter") %>%
  addCircleMarkers(
    data = localstats1$SDF, 
    color = qpal(localstats1$SDF$speed_LM),
    radius = 1,
    weight = 1.5,
    opacity = localstats1$SDF$speed_LM/25, 
    label = ~htmltools::htmlEscape(
      paste0( 
        round(speed_LM, 1)
      )
    )
  ) %>%
  addLegend("bottomright", pal = pal, values = localstats1$SDF$speed_LM,
    title = "GW Mean Speed",
    opacity = 1
  )

We can see from plotting the geographically weighted mean speed on the leaflet map that there is a clear pattern of congestion in the city centre and in the towns.

This is what we assumed; using the geographically weighted mean, we can show the pattern clearly.

There is a clear pinpoint of white in the city centre that spreads out gradually to blue and then purple.

Note:

  1. Purple = High Avg. Speed
  2. White = Low Avg. Speed

We can also see clusters of white stops quite clearly on the map in and around towns like Celbridge, Maynooth and Swords.

11. Frequency of trips per route

Finally, I wanted to create a map displaying the frequency of trips per route as I thought this would make for an interesting visual.

To do this, I went back to my main “base” data set (i.e. not limited to my “time slice”) and calculated the no. of trips per route.

I then created a SpatialLines object from the stop latitudes and longitudes and styled the colour and weight of the lines by the no. of trips.

#=========================================
# Go back to the base data set "my_trips" (before the time slice)
froutes <- my_trips
length(unique(froutes$route_id))
## [1] 110
# Create a list of all routes with the count of trips 
froutes_data <- froutes %>%
  group_by(route_id) %>%
  summarise(
    n_trips = n_distinct(trip_id),
    n_stops=(n_distinct(stop_id)/2)
  ) %>% 
  arrange(-n_trips)


# Get the DISTINCT route_id, trip_id
froutes <- froutes %>% distinct(route_id, trip_id) 
# Just get any one of the trips for each route - only want the approx 
#  geometry of each route
froutes <- froutes[!duplicated(froutes$route_id), ]

# Subset the main data set with the unique trip_ids from froutes
froutes <- my_trips %>%
  filter(trip_id %in% froutes$trip_id)

# Create a SPDF object
froutes <- SpatialPointsDataFrame(
  coords = froutes[c("stop_lon", "stop_lat")], 
  data = froutes
)

# Convert to a SpatialLines object
data <- split(froutes@data, froutes@data$route_id)
# Build one SpatialLines object per route from its ordered stop coordinates
data <- lapply(data, function(i) raster::spLines(
  as.matrix(i[, c('stop_lon', 'stop_lat')]),
  attr = data.frame(route_id = i$route_id[1])
))
names(data) <- NULL
data <- do.call(bind, data)
# Join back on the froutes_data info (no. trips etc)
data@data <- dplyr::left_join(data@data, froutes_data, by = c("route_id"))



# Plot the route frequency on a leaflet map
qpal <- colorQuantile(
  "YlOrRd", 
  data$n_trips, 
  n = 9,
  reverse = TRUE
  )
  
pal <- colorNumeric(
  palette = "BuPu",
  domain = data$n_trips,
  reverse = FALSE
  )

leaflet() %>%
  addProviderTiles("CartoDB.DarkMatter") %>%
  addPolylines(
    data = data,
    color = pal(data$n_trips),
    weight = ~n_trips / 25,
    label = ~n_trips
  ) %>%
  addLegend("bottomright", pal = pal, values = data$n_trips,
    title = "No. Trips",
    opacity = 1
  )

Finally, here are the top 5 most frequent routes for the service y103l.

# Plot top 5 routes by frequency
data_top5 <- data[data$n_trips > 92, ] # routes with more than 92 trips (the top 5)

pal <- colorNumeric(
  palette = "YlOrRd",
  domain = data_top5$n_trips,
  reverse = TRUE
  )

leaflet() %>%
  addProviderTiles("CartoDB.DarkMatter") %>%
  addPolylines(
    data = data_top5,
    color = pal(data_top5$n_trips),
    weight = ~n_trips / 25,
    label = ~n_trips
  ) %>%
  addLegend("bottomright", pal = pal, values = data$n_trips,
    title = "No. Trips",
    opacity = 1
  )
#========================================================================
# END

12. Conclusion/Summary

My intention was to explore the spatial data contained within this data set and attempt to highlight some sort of an interesting spatial pattern.

Unsurprisingly, the pattern I found is that the towns and the city centre are more congested than the areas outside them!

This isn’t a big revelation by any means; however, it was an interesting challenge to pull this insight out of the GTFS data. It was also a nice application of Geographically Weighted analysis, which I haven’t looked at a whole lot since my Masters degree.

So, in summary;

  1. I began by exploring the data
  2. I then zoned in on a route I was familiar with (this helped me to better understand the data)
  3. Next, I began to look at the relationship between “time since last stop” and geographical location for a trip in route 15b
  4. I then chose to investigate this relationship further by looking at a snapshot of all routes for the weekday service
  5. I applied geographically weighted summary statistics to highlight the congestion pattern by looking at the spatially weighted avg speed between stops
  6. Finally, I made a nice map of frequency of trips per route (before I ran out of time!)