knitr::opts_chunk$set(echo = TRUE)
Task: Explore the Dublin Bus GTFS data set, which contains scheduled service information.
My Approach was to explore the geographical information contained within this GTFS data set to create some interesting insights and visually appealing maps.
Load some data-wrangling, visualisation and geospatial/GIS libraries into R.
# Load some libraries for reading & wrangling data
library(knitr)
library(tidyverse)
library(pryr)
library(dplyr)
library(ggplot2)
library(plotly)
# Spatial packages
library(GISTools)
library(sp)
library(rgdal)
library(rgeos)
library(leaflet)
library(raster)
Create a custom function used throughout the script to view the first couple of records in a data frame.
# Save writing two functions, create a combination of the head()
# and the kable() functions
kead <- function(x){
kable(head(x))
}
The next step was to load in the GTFS data from the zip file that was provided with the technical exercise.
# Importing the GTFS data
# Get absolute path to zip file
zip <- "C:\\Users\\billy.archbold\\Documents\\Research\\Python\\GTFS using jupyter notebook\\data\\raw\\gtfs.zip"
# Create directory to unzip and store gtfs txt files
outDir <- substring(zip, 1, nchar(zip)-4)
if (file.exists(outDir)){
setwd(outDir)
} else {
dir.create(outDir)
setwd(outDir)
}
# Read gtfs txt files into R session memory
unzip(zip, exdir = outDir)
agency <- read_csv("agency.txt")
calendar <- read_csv("calendar.txt")
calendar_dates <- read_csv("calendar_dates.txt")
routes <- read_csv("routes.txt")
shapes <- read_csv("shapes.txt")
stop_times <- read_csv("stop_times.txt")
stops <- read_csv("stops.txt")
transfers <- read_csv("transfers.txt")
trips <- read_csv("trips.txt")
Get an idea of the quantity of data in each txt file and also glimpse the first few records of each table.
# First, get an idea of the dimensions of each table
dim(agency)
## [1] 1 5
dim(calendar)
## [1] 5 10
dim(calendar_dates)
## [1] 17 3
dim(routes)
## [1] 272 5
dim(shapes)
## [1] 285725 5
dim(stop_times)
## [1] 1549757 9
dim(stops)
## [1] 4297 4
dim(transfers)
## [1] 5233 4
dim(trips)
## [1] 28338 6
# Look at the head of each table
kead(calendar)
| service_id | monday | tuesday | wednesday | thursday | friday | saturday | sunday | start_date | end_date |
|---|---|---|---|---|---|---|---|---|---|
| y103m | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 20190203 | 20190531 |
| y103l | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 20190203 | 20190531 |
| y103j | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 20190203 | 20190531 |
| y103l#1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 20190129 | 20190202 |
| y103k | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 20190129 | 20190202 |
kead(calendar_dates)
| service_id | date | exception_type |
|---|---|---|
| y103j | 20190204 | 2 |
| y103j | 20190211 | 2 |
| y103j | 20190218 | 2 |
| y103j | 20190225 | 2 |
| y103j | 20190304 | 2 |
| y103j | 20190311 | 2 |
kead(routes)
| route_id | agency_id | route_short_name | route_long_name | route_type |
|---|---|---|---|---|
| 60-1-b12-1 | 978 | 1 | NA | 3 |
| 60-102-b12-1 | 978 | 102 | NA | 3 |
| 60-104-b12-1 | 978 | 104 | NA | 3 |
| 60-11-b12-1 | 978 | 11 | NA | 3 |
| 60-111-b12-1 | 978 | 111 | NA | 3 |
| 60-114-b12-1 | 978 | 114 | NA | 3 |
kead(shapes)
| shape_id | shape_pt_lat | shape_pt_lon | shape_pt_sequence | shape_dist_traveled |
|---|---|---|---|---|
| 60-116-b12-1.58.O | 53.32963 | -6.249012 | 1 | 0.0000 |
| 60-116-b12-1.58.O | 53.32932 | -6.248409 | 2 | 53.2635 |
| 60-116-b12-1.58.O | 53.32906 | -6.247820 | 3 | 102.0897 |
| 60-116-b12-1.58.O | 53.32852 | -6.246715 | 4 | 196.9001 |
| 60-116-b12-1.58.O | 53.32841 | -6.246390 | 5 | 221.9600 |
| 60-116-b12-1.58.O | 53.32841 | -6.246390 | 6 | 221.9600 |
kead(stop_times)
| trip_id | arrival_time | departure_time | stop_id | stop_sequence | stop_headsign | pickup_type | drop_off_type | shape_dist_traveled |
|---|---|---|---|---|---|---|---|---|
| 10304.y103m.60-1-d12-1.1.O | 14:40:00 | 14:40:00 | 8240DB000226 | 1 | Sandymount | 0 | 0 | 0.0000 |
| 10304.y103m.60-1-d12-1.1.O | 14:40:58 | 14:40:58 | 8240DB000228 | 2 | Sandymount | 0 | 0 | 258.1390 |
| 10304.y103m.60-1-d12-1.1.O | 14:41:48 | 14:41:48 | 8240DB000229 | 3 | Sandymount | 0 | 0 | 484.9086 |
| 10304.y103m.60-1-d12-1.1.O | 14:43:06 | 14:43:06 | 8240DB000227 | 4 | Sandymount | 0 | 0 | 836.9790 |
| 10304.y103m.60-1-d12-1.1.O | 14:43:56 | 14:43:56 | 8240DB000230 | 5 | Sandymount | 0 | 0 | 1066.4451 |
| 10304.y103m.60-1-d12-1.1.O | 14:45:00 | 14:45:00 | 8240DB000231 | 6 | Sandymount | 0 | 0 | 1359.0993 |
kead(stops)
| stop_id | stop_name | stop_lat | stop_lon |
|---|---|---|---|
| 8220B007612 | Davenport Hotel Merrion Street | 53.34135 | -6.250529 |
| 8220DB000002 | Rotunda, Parnell Square West | 53.35224 | -6.263723 |
| 8220DB000003 | Rotunda, Granby Place | 53.35231 | -6.263811 |
| 8220DB000004 | Rotunda, Rotunda Hospital | 53.35257 | -6.264176 |
| 8220DB000006 | Rotunda, Saint Martin’s Chapel | 53.35275 | -6.264454 |
| 8220DB000007 | Rotunda, Rotunda Hospital | 53.35284 | -6.264570 |
kead(transfers)
| from_stop_id | to_stop_id | transfer_type | min_transfer_time |
|---|---|---|---|
| 8250DB003073 | 8250DB003040 | 2 | 60 |
| 8250DB003073 | 8250DB003039 | 2 | 240 |
| 8230DB003372 | 8230DB002223 | 2 | 300 |
| 8230DB003372 | 8230DB002224 | 2 | 180 |
| 8230DB003367 | 8230DB002223 | 2 | 300 |
| 8230DB003367 | 8230DB002224 | 2 | 180 |
kead(trips)
| route_id | service_id | trip_id | shape_id | trip_headsign | direction_id |
|---|---|---|---|---|---|
| 60-1-d12-1 | y103j | 15367.y103j.60-1-d12-1.1.O | 60-1-d12-1.1.O | Shanard Road (Shanard Avenue) - Saint John’s Road East | 0 |
| 60-1-d12-1 | y103j | 15397.y103j.60-1-d12-1.1.O | 60-1-d12-1.1.O | Shanard Road (Shanard Avenue) - Saint John’s Road East | 0 |
| 60-1-d12-1 | y103j | 15417.y103j.60-1-d12-1.1.O | 60-1-d12-1.1.O | Shanard Road (Shanard Avenue) - Saint John’s Road East | 0 |
| 60-1-d12-1 | y103j | 15369.y103j.60-1-d12-1.1.O | 60-1-d12-1.1.O | Shanard Road (Shanard Avenue) - Saint John’s Road East | 0 |
| 60-1-d12-1 | y103j | 15385.y103j.60-1-d12-1.1.O | 60-1-d12-1.1.O | Shanard Road (Shanard Avenue) - Saint John’s Road East | 0 |
| 60-1-d12-1 | y103j | 15419.y103j.60-1-d12-1.1.O | 60-1-d12-1.1.O | Shanard Road (Shanard Avenue) - Saint John’s Road East | 0 |
# We can see there is data in every table - which is not always the case with GTFS data
Some quick QA checks for any corrupted spatial data in the shapes & stops tables. Plot all of the coordinates to make sure all are located within/around Dublin. Sometimes with Irish coordinates, if lat and lon are swapped by accident, coordinates will plot off the coast of Africa… not good for distance calculations etc!
# Create a SpatialPointsDataFrame object to plot the data
shapes_spdf <- SpatialPointsDataFrame(shapes[c("shape_pt_lon", "shape_pt_lat")], shapes)
# Also do same for stops coordinates
stops_spdf <- SpatialPointsDataFrame(stops[c("stop_lon", "stop_lat")], stops)
# Create a projection for data to World Geodetic System 1984, used in GPS - EPSG:4326
# i.e. what Google Maps uses
# https://epsg.io/4326
newProj <- CRS("+init=epsg:4326")
proj4string(shapes_spdf) <- newProj
proj4string(stops_spdf) <- newProj
# Update the working directoy having read in the zip file
setwd("C:/Users/billy.archbold/Documents/Research/Python/GTFS using jupyter notebook/")
# Read county shapefile into R to help with sense-checking
# Data downloaed from data.gov.ie
# https://data.gov.ie/dataset/counties-osi-national-statutory-boundaries-generalised-20m1/resource/366a5e7f-7bc5-470f-a820-34a2a5469d73
county <- readOGR(dsn = "./data/raw/county", layer = "county")
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\billy.archbold\Documents\Research\Python\GTFS using jupyter notebook\data\raw\county", layer: "county"
## with 26 features
## It has 14 fields
county_dublin <- county[county$COUNTY %in% "DUBLIN", ]
# Re-project this shapefile to the same projection WGS84
county_dublin <- spTransform(county_dublin, newProj)
# Quick plot to verify data
plot(county_dublin)
plot(shapes_spdf, pch = 1, cex = 0.2, add = TRUE, col = "red")
# Spatial data QA looks good at first glance, no misplaced coordinates
To understand the data, its structure and how it all ties together, I decided to narrow in on a route I know well from when I lived in Rathgar in Dublin a couple of years ago - the 15B.
I begin by searching the route_id for “15B” as I notice the names of routes are in this id.
I then take a specific route and narrow down my selection finally to a single trip (i.e. one journey in a particular direction at a specific time of the day - for a specific service).
# Investigate Routes table
dim(routes)
## [1] 272 5
kead(routes)
| route_id | agency_id | route_short_name | route_long_name | route_type |
|---|---|---|---|---|
| 60-1-b12-1 | 978 | 1 | NA | 3 |
| 60-102-b12-1 | 978 | 102 | NA | 3 |
| 60-104-b12-1 | 978 | 104 | NA | 3 |
| 60-11-b12-1 | 978 | 11 | NA | 3 |
| 60-111-b12-1 | 978 | 111 | NA | 3 |
| 60-114-b12-1 | 978 | 114 | NA | 3 |
length(routes$route_id)
## [1] 272
length(unique(routes$route_id))
## [1] 272
Search for a specific route I know & used to use when I lived in Rathgar (15b).
# Search for a specific route I know & used to use when I lived in Rathgar (15b)
routes[grep("15b", routes$route_id, ignore.case=T), ] %>% kable
| route_id | agency_id | route_short_name | route_long_name | route_type |
|---|---|---|---|---|
| 60-15B-b12-1 | 978 | 15b | NA | 3 |
| 60-15B-d12-1 | 978 | 15b | NA | 3 |
# 60-15B-b12-1
# 60-15B-d12-1
# To plot the route, first join routes to trips, then trips to shapes
route_spdf <- routes %>%
left_join(trips) %>%
left_join(shapes)
# Select a specific route
route_spdf <- route_spdf[route_spdf$route_id %in% "60-15B-d12-1", ]
# Create a SPDF object
route_spdf <- SpatialPointsDataFrame(
coords = route_spdf[c("shape_pt_lon", "shape_pt_lat")],
data = route_spdf
)
# Rename columns
colnames(route_spdf@data)[11] <- "lat"
colnames(route_spdf@data)[12] <- "lon"
# Create individual route objects to explore
route_spdf1 <- route_spdf[route_spdf$shape_id %in% "60-15B-d12-1.410.O", ]
route_spdf2 <- route_spdf[route_spdf$shape_id %in% "60-15B-d12-1.411.I", ]
# Plot both routes on a simple map of Dublin
plot(county_dublin)
plot(route_spdf1, pch = 1, cex = 0.2, add = TRUE, col = "red")
plot(route_spdf2, pch = 1, cex = 0.2, add = TRUE, col = "blue")
# 1 Route: 15b
# Has how many trip?
length(unique(route_spdf$trip_id))
## [1] 332
# Get information for one trip (i.e. get a trip ID to investigate)
kead(route_spdf2)
| route_id | agency_id | route_short_name | route_long_name | route_type | service_id | trip_id | shape_id | trip_headsign | direction_id | lat | lon | shape_pt_sequence | shape_dist_traveled |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 60-15B-d12-1 | 978 | 15b | NA | 3 | y103j | 15042.y103j.60-15B-d12-1.411.I | 60-15B-d12-1.411.I | Dalriada Estate - Barrow Street | 1 | 53.27137 | -6.325088 | 1 | 0.0000 |
| 60-15B-d12-1 | 978 | 15b | NA | 3 | y103j | 15042.y103j.60-15B-d12-1.411.I | 60-15B-d12-1.411.I | Dalriada Estate - Barrow Street | 1 | 53.27150 | -6.324513 | 2 | 40.8534 |
| 60-15B-d12-1 | 978 | 15b | NA | 3 | y103j | 15042.y103j.60-15B-d12-1.411.I | 60-15B-d12-1.411.I | Dalriada Estate - Barrow Street | 1 | 53.27164 | -6.323653 | 3 | 100.3345 |
| 60-15B-d12-1 | 978 | 15b | NA | 3 | y103j | 15042.y103j.60-15B-d12-1.411.I | 60-15B-d12-1.411.I | Dalriada Estate - Barrow Street | 1 | 53.27170 | -6.323651 | 4 | 106.4172 |
| 60-15B-d12-1 | 978 | 15b | NA | 3 | y103j | 15042.y103j.60-15B-d12-1.411.I | 60-15B-d12-1.411.I | Dalriada Estate - Barrow Street | 1 | 53.27175 | -6.323574 | 5 | 114.2275 |
| 60-15B-d12-1 | 978 | 15b | NA | 3 | y103j | 15042.y103j.60-15B-d12-1.411.I | 60-15B-d12-1.411.I | Dalriada Estate - Barrow Street | 1 | 53.27176 | -6.323453 | 6 | 122.2898 |
Plot the 15b on an interactive leaflet map.
These type of maps are very useful for exploring geospatial data.
Colour the markers by the “stop_sequence” to check which direction the trip is heading.
# Use the stop lat/lon instead of shape, as gives approximate enough shape
# without duplicating data we don't want duplicated like distance & time
trip_15b <- stop_times %>%
left_join(trips) %>%
left_join(routes) %>%
left_join(stops) %>%
filter(trip_id %in% "4765.y103l.60-15B-d12-1.411.I")
# Create a SpatialPointsDataFrame object
trip_15b <- SpatialPointsDataFrame(
coords = trip_15b[c("stop_lon", "stop_lat")],
data = trip_15b
)
# Load additional library for colours
library(RColorBrewer)
# display.brewer.all()
# Create a colour palette for map
pal <- colorNumeric(
palette = "YlOrRd",
domain = trip_15b$stop_sequence)
# Display all stops for 15b trip on a map coloured by stop sequence
# thus indicating route direction on map
leaflet() %>%
addProviderTiles("CartoDB.DarkMatter") %>%
addCircleMarkers(
data = trip_15b,
lat = trip_15b$stop_lat,
lng = trip_15b$stop_lon,
group = "red",
color = pal(trip_15b$stop_sequence),
label = trip_15b$stop_sequence,
radius = 2
) %>%
addLegend("bottomright", pal = pal, values = trip_15b$stop_sequence,
title = "Stop Sequence",
opacity = 1
)
Next, I think it would be interesting to use the time fields in the data set.
I want to plot each stop in the 15b route by the time it took to get there from the previous stop.
To achieve this, I need to copy the time field (doesn’t seem to matter which one as arrival and departure seem to be the same) into a new column and shift the values down by one.
To do this I use the handy dplyr::lag() function.
# Colour stops of trip by time?
# Look at the time values
kead(trip_15b[c("arrival_time", "departure_time")])
| arrival_time | departure_time |
|---|---|
| 08:00:00 | 08:00:00 |
| 08:00:36 | 08:00:36 |
| 08:01:53 | 08:01:53 |
| 08:02:38 | 08:02:38 |
| 08:03:29 | 08:03:29 |
| 08:04:12 | 08:04:12 |
# Arrival & departure times are the same
# Calculate the difference in time it took to get from last stop to
# current stop to yeild some interesting info associated with each
# stop lat/lon
# First need to shift values in columns, then subtract
# Create lagged arrival time column
# Coalesce NA with original time value so that difference in
# time from first stop will be zero
trip_15b$arrival_time_shift <- lag(trip_15b$arrival_time, 1) %>%
coalesce(trip_15b$arrival_time)
# View results to ensure time is shifted correctly as we want
kead(trip_15b[c("arrival_time", "arrival_time_shift")])
| arrival_time | arrival_time_shift |
|---|---|
| 08:00:00 | 08:00:00 |
| 08:00:36 | 08:00:00 |
| 08:01:53 | 08:00:36 |
| 08:02:38 | 08:01:53 |
| 08:03:29 | 08:02:38 |
| 08:04:12 | 08:03:29 |
# Looks good
# Get time difference
#trip_15b$time_diff <- trip_15b$arrival_time - trip_15b$arrival_time_shift
trip_15b$time_diff <- as.numeric(
as.POSIXct(strptime(trip_15b$arrival_time, "%H:%M:%S")) -
as.POSIXct(strptime(trip_15b$arrival_time_shift, "%H:%M:%S"))
)
# Once again, view results to ensure time is shifted correctly as we want
kead(trip_15b[c("arrival_time", "arrival_time_shift", "time_diff")])
| arrival_time | arrival_time_shift | time_diff |
|---|---|---|
| 08:00:00 | 08:00:00 | 0 |
| 08:00:36 | 08:00:00 | 36 |
| 08:01:53 | 08:00:36 | 77 |
| 08:02:38 | 08:01:53 | 45 |
| 08:03:29 | 08:02:38 | 51 |
| 08:04:12 | 08:03:29 | 43 |
# Looks good
# Plot the stops on a map, coloured by time_diff
pal <- colorNumeric(
palette = "YlOrRd",
domain = trip_15b$time_diff
)
leaflet() %>%
addProviderTiles("CartoDB.DarkMatter") %>%
addCircleMarkers(
data = trip_15b,
lat = trip_15b$stop_lat,
lng = trip_15b$stop_lon,
group = "red",
color = pal(trip_15b$time_diff),
radius = 2,
label = ~htmltools::htmlEscape(time_diff)
) %>%
addLegend("bottomright", pal = pal, values = trip_15b$time_diff,
title = "Time Difference (s)",
opacity = 1
)
Interestingly, and as you would expect, the stops in and around the city centre/quays have largest time difference between them. This is probably to do with traffic/lights in the city centre.
It seems like an interesting angle to look at for the rest of the task.
As mentioned above, it looks like the time between stops is greater in the city centre (as you would expect) for the single trip I looked at for the 15b route.
For the rest of the task, I’m going to explore this geographical pattern.
To do this, I’ll need to wrangle a subset of the main data. I don’t need to look at every trip, rather I will identify a single service to look at (e.g. Mon - Friday which is probably the busiest service) and I will zone in on a particular time of the day (e.g. the morning traffic rush at 08:00 - 09:00).
I will then explore the relationship between congestion and location further.
# First of all, identify the busiest service
kable(calendar)
| service_id | monday | tuesday | wednesday | thursday | friday | saturday | sunday | start_date | end_date |
|---|---|---|---|---|---|---|---|---|---|
| y103m | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 20190203 | 20190531 |
| y103l | 1 | 1 | 1 | 1 | 1 | 0 | 0 | 20190203 | 20190531 |
| y103j | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 20190203 | 20190531 |
| y103l#1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 20190129 | 20190202 |
| y103k | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 20190129 | 20190202 |
# Service "y103l" looks like the typical Mon - Fri working week service
# Select the largest service
largest_service <- trips %>%
group_by(service_id) %>%
count(service_id) %>%
arrange(desc(n)) %>%
data.frame() %>%
head(1)
This is the busiest service (Mon - Fri).
# "y103l" is the largest service with the most no. of trips at 6,839
kable(largest_service)
| service_id | n |
|---|---|
| y103l | 6839 |
# Now, create the main/base data set to work with
# Everything we need should be in here, allowing us to just filter out
# what we don't need
stop_times <- stop_times %>%
left_join(trips) %>%
left_join(routes) %>%
left_join(stops)
Confirm there are only two directions.
# Confirm there are only two directions in the stop_times data
table(stop_times$direction_id)
##
## 0 1
## 768237 781520
Get a feel for how many stops there are per route/trip.
# Get a feel for how many stops there are per route/trip
# Some exploratory stats on some fields
class(stop_times$stop_sequence)
## [1] "numeric"
mean(stop_times$stop_sequence) # mean
## [1] 31.41657
median(stop_times$stop_sequence) # median
## [1] 29
sd(stop_times$stop_sequence) # sd
## [1] 20.23516
library(e1071) # library for skew
skewness(stop_times$stop_sequence)
## [1] 0.5108326
hist(
stop_times$stop_sequence,
xlab='No. Stops',
main='Distribution of the no. stops in the data set'
)
boxplot(stop_times$stop_sequence, main = "No. stops per trip")
So, I just want to pick one trip for each route around 08:30 am (i.e. a busy time of the morning with traffic etc). I want to create a snapshot of Dublin Bus public transport during the week.
# So, I just want to pick one trip for each route around 08:30 am (i.e.
# a busy time of the morning with traffic etc)
# I want to create a snapshot of Dublin Bus public transort during the
# week
# How do I do this?
# Query below currently has all trips for a route in a particular
# direction for a particular service
my_trips <- stop_times %>%
left_join(trips) %>%
left_join(routes) %>%
left_join(stops) %>%
filter(
# Limit to only one direction
direction_id == 1 &
# Limit to the weekday service (i.e. service that has most trips scheduled)
service_id == largest_service$service_id
) %>%
group_by(route_id, trip_id) %>%
arrange(stop_sequence, departure_time) %>%
mutate(
arrival_time_shift = coalesce(lag(arrival_time, 1), arrival_time),
time_diff = as.numeric(
as.POSIXct(strptime(arrival_time, "%H:%M:%S")) -
as.POSIXct(strptime(arrival_time_shift, "%H:%M:%S"))
)
) %>%
arrange(route_id, trip_id, stop_sequence, departure_time) %>%
data.frame()
# Next, get a subset of trips from ALL routes that occur between a certain time
# Here we're looking at trips between 08:10 and 08:50
my_trips_timeslice <- my_trips %>%
filter(
stop_sequence == 1 &
as.POSIXct(strptime(departure_time, "%H:%M:%S")) > as.POSIXct(strptime("08:10:00", "%H:%M:%S")) &
as.POSIXct(strptime(departure_time, "%H:%M:%S")) < as.POSIXct(strptime("08:50:00", "%H:%M:%S"))
) %>%
dplyr::select(route_id, trip_id)
# Some routes obviously have trips that leave every five mins or so, where as some
# originating in places like Maynooth may only leave once or twice in the hour.
dim(my_trips_timeslice)
## [1] 148 2
length(unique(my_trips_timeslice$route_id))
## [1] 62
# There are some duplicated trips in this subset now - however we only want one trip
# per route, so lets remove dupes.
# We just want to get single trips all around the same time so that they have
# similar traffic.
# No use comparing a 6am trip with an 8.45am trip as traffic will be vastly different,
# so the times between stops will be different.
# When removing dupes here - it doesn't matter which records we retain per route as we only
# want to display one trip per route to get a general idea of the shape etc...
# Remove duplicated trips per route
my_trips_timeslice <- my_trips_timeslice[!duplicated(my_trips_timeslice$route_id), ]
# "my_trips_timeslice" now contains a single trip for all routes that occur within a
# specified time-frame
# Subset the main df with everything to only desired trips for the "time slice" I am
# interested in
my_trips_timeslice <- my_trips %>%
filter(
trip_id %in% my_trips_timeslice$trip_id
)
# Create SPDF for plotting
my_trips_timeslice <- SpatialPointsDataFrame(
coords = my_trips_timeslice[c("stop_lon", "stop_lat")],
data = my_trips_timeslice
)
dim(my_trips_timeslice)
## [1] 3125 23
plot(county_dublin)
plot(my_trips_timeslice, pch = 1, cex = 0.3, col = "red", add = TRUE)
# We can see this is a pretty good representation of a weekday morning in
# Dublin. There are routes from all over coming into the city
Create a boxplot of the difference in time between stops for my “snapshot” data set of around 08:30.
# Let's explore this data set a little bit more
boxplot(my_trips_timeslice$time_diff, main = "Time diff between stops")
# Time diff between stops is mostly around 100 seconds, with a couple of
# outliers
First, plot all the stop data on a map and verify the direction of travel is all heading one way (i.e. in toward the city).
For the most part, this looks to be correct.
# Let's first verify that all traffic is heading in towards the city
# Verify that all traffic is heading one-way
pal <- colorNumeric(
palette = "YlOrRd",
domain = my_trips_timeslice$stop_sequence
)
leaflet() %>%
addProviderTiles("CartoDB.DarkMatter") %>%
addCircleMarkers(
data = my_trips_timeslice,
lat = my_trips_timeslice$stop_lat,
lng = my_trips_timeslice$stop_lon,
color = pal(my_trips_timeslice$stop_sequence),
radius = 1,
weight = 2,
label = ~htmltools::htmlEscape(
paste0(
stop_sequence, " - ",
time_diff
)
)
) %>%
addLegend("bottomright", pal = pal, values = my_trips_timeslice$stop_sequence,
title = "Stop Sequence",
opacity = 1
)
# For the most part it looks like trips are heading in towards the city centre.
# Dots are coloured by stop sequence with a colour scale of yellow = low,
# red = high as per map legend
Now we will plot all of the stops for our snapshot data set of around 08:30, with each marker coloured by the time_diff variable.
# Next, lets map the time_diff variable at each stop to explore congestion
pal <- colorNumeric(
palette = "YlOrRd",
domain = my_trips_timeslice$time_diff
)
qpal <- colorQuantile(
"YlOrRd",
my_trips_timeslice$time_diff,
n = 7
)
leaflet() %>%
addProviderTiles("CartoDB.DarkMatter") %>%
addCircleMarkers(
data = my_trips_timeslice,
lat = my_trips_timeslice$stop_lat,
lng = my_trips_timeslice$stop_lon,
color = qpal(my_trips_timeslice$time_diff),
radius = 1,
weight = 2,
label = ~htmltools::htmlEscape(
paste0(
stop_sequence, " - ",
time_diff
)
)
) %>%
addLegend("bottomright", pal = pal, values = my_trips_timeslice$time_diff,
title = "Time Difference (s)",
opacity = 1
)
We can see some geographical pattern to the time_diff at stops such as in the city centre it looks like a cluster of high “time_diff” stops. This also seems to occur in some towns/bottlenecks en-route to the city.
We have some useful information at each stop for all these trips in question, however lets build up this data set a little bit more.
Things to add are;
Using speed between stops might give us a bit of a better representation of congestion at peak traffic times as the time difference will be larger and the stops (I’m assuming) will be closer together.
# Distance
#------------
# Create the shifted distance column (like the time before it)
my_trips_timeslice$shape_dist_traveled_shift <- coalesce(
lag(my_trips_timeslice$shape_dist_traveled, 1),
my_trips_timeslice$shape_dist_traveled
)
# As we can see, the lag() function has shifted the distance down into the next group
# this needs to be replace with zero for the dist_diff calc
kable(head(my_trips_timeslice[c("shape_dist_traveled", "shape_dist_traveled_shift")], 10))
| shape_dist_traveled | shape_dist_traveled_shift |
|---|---|
| 0.0000 | 0.0000 |
| 346.5103 | 0.0000 |
| 477.3297 | 346.5103 |
| 779.5546 | 477.3297 |
| 1141.6930 | 779.5546 |
| 1298.5584 | 1141.6930 |
| 1660.4862 | 1298.5584 |
| 1949.7433 | 1660.4862 |
| 2065.9719 | 1949.7433 |
| 2346.1597 | 2065.9719 |
# Identify the first stop sequence in each group and replace the dist_diff with zero
# Display the info to verify it
my_trips_timeslice@data %>%
group_by(trip_id) %>%
filter(stop_sequence == min(stop_sequence)) %>%
dplyr::select(trip_id, arrival_time, departure_time, stop_sequence, arrival_time_shift, time_diff, shape_dist_traveled_shift) %>%
kead()
| trip_id | arrival_time | departure_time | stop_sequence | arrival_time_shift | time_diff | shape_dist_traveled_shift |
|---|---|---|---|---|---|---|
| 4387.y103l.60-1-d12-1.3.I | 08:20:00 | 08:20:00 | 1 | 08:20:00 | 0 | 0.00 |
| 6556.y103l.60-11-d12-1.16.I | 08:35:00 | 08:35:00 | 1 | 08:35:00 | 0 | 12267.31 |
| 5174.y103l.60-120-d12-1.62.I | 08:15:00 | 08:15:00 | 1 | 08:15:00 | 0 | 18834.73 |
| 3591.y103l.60-122-d12-1.71.I | 08:20:00 | 08:20:00 | 1 | 08:20:00 | 0 | 10909.22 |
| 3438.y103l.60-123-d12-1.75.I | 08:40:00 | 08:40:00 | 1 | 08:40:00 | 0 | 15432.26 |
| 267.y103l.60-13-d12-1.29.I | 08:12:00 | 08:12:00 | 1 | 08:12:00 | 0 | 13350.55 |
# Replace each incorrect field with zero
my_trips_timeslice@data[my_trips_timeslice$stop_sequence == 1, ]$shape_dist_traveled_shift <- 0
# Display info to verify it has been replaced correctlty
kable(head(my_trips_timeslice[c("shape_dist_traveled", "shape_dist_traveled_shift")], 10))
| shape_dist_traveled | shape_dist_traveled_shift |
|---|---|
| 0.0000 | 0.0000 |
| 346.5103 | 0.0000 |
| 477.3297 | 346.5103 |
| 779.5546 | 477.3297 |
| 1141.6930 | 779.5546 |
| 1298.5584 | 1141.6930 |
| 1660.4862 | 1298.5584 |
| 1949.7433 | 1660.4862 |
| 2065.9719 | 1949.7433 |
| 2346.1597 | 2065.9719 |
# Now create the dist_diff field by subtracting the distance travelled between stops
my_trips_timeslice$dist_diff <- my_trips_timeslice$shape_dist_traveled - my_trips_timeslice$shape_dist_traveled_shift
# View the info to verify it has been correctly computed
kable(head(my_trips_timeslice[c("shape_dist_traveled", "shape_dist_traveled_shift", "dist_diff")], 10))
| shape_dist_traveled | shape_dist_traveled_shift | dist_diff |
|---|---|---|
| 0.0000 | 0.0000 | 0.0000 |
| 346.5103 | 0.0000 | 346.5103 |
| 477.3297 | 346.5103 | 130.8194 |
| 779.5546 | 477.3297 | 302.2249 |
| 1141.6930 | 779.5546 | 362.1384 |
| 1298.5584 | 1141.6930 | 156.8654 |
| 1660.4862 | 1298.5584 | 361.9278 |
| 1949.7433 | 1660.4862 | 289.2572 |
| 2065.9719 | 1949.7433 | 116.2286 |
| 2346.1597 | 2065.9719 | 280.1878 |
# Speed
#------------
# Create the speed field
my_trips_timeslice$speed <- (my_trips_timeslice$dist_diff / my_trips_timeslice$time_diff) * 3.6 # to get km/h
# View to verify it's correct
kable(head(my_trips_timeslice[c("stop_sequence", "speed")], 10))
| stop_sequence | speed |
|---|---|
| 1 | NaN |
| 2 | 17.56954 |
| 3 | 17.44259 |
| 4 | 17.27000 |
| 5 | 18.10692 |
| 6 | 17.11259 |
| 7 | 17.37253 |
| 8 | 17.64959 |
| 9 | 17.43428 |
| 10 | 15.51809 |
# Replace any NaN with 0 casued by zero divided by itself
my_trips_timeslice@data[is.na(my_trips_timeslice$speed), ]$speed <- 0
# Boxplots of the three new variables that are useful for us to explore congestion
# Create panel of three plot areas
par(mfrow=c(1,3))
boxplot(my_trips_timeslice$time_diff, main = "Time Diff")
boxplot(my_trips_timeslice$dist_diff, main = "Distance Diff")
boxplot(my_trips_timeslice$speed, main = "Speed")
# Reset from two-panel display back to single
par(mfrow=c(1,1))
Now, lets plot the speed between stops on the map for this snapshot data set.
# Plot the spatial distribution of speed variable at each stop
qpal <- colorQuantile(
"YlOrRd",
my_trips_timeslice$speed,
n = 9,
reverse = TRUE
)
pal <- colorNumeric(
palette = "YlOrRd",
domain = my_trips_timeslice$speed,
reverse = TRUE
)
leaflet() %>%
addProviderTiles("CartoDB.DarkMatter") %>%
addCircleMarkers(
data = my_trips_timeslice,
lat = my_trips_timeslice$stop_lat,
lng = my_trips_timeslice$stop_lon,
color = qpal(my_trips_timeslice$speed),
radius = 1,
weight = 2,
opacity = my_trips_timeslice$speed/25,
label = ~htmltools::htmlEscape(
paste0(
stop_sequence, " - ",
round(speed, 1)
)
)
) %>%
addLegend("bottomright", pal = pal, values = my_trips_timeslice$speed,
title = "Speed Between Stops",
opacity = 1
)
Ignore the really high values of 300 km/h on the legend of the above leaflet map, these are likely just outliers in the data created by errors when calculating the distance variable - not too worried about them as the visualisation holds up.
boxplot(my_trips_timeslice$speed)
Now we’re starting to see this congestion pattern a bit more clearly where towns and of course the city centre are “slower” between stops and more rural straight runs outside towns and before the city centre are faster.
Perhaps we can display this pattern better if we bring in Geographically Weighted Statistics. However, first lets look at some basic summary statistics of the data set before we investigate how these vary geographically.
# First, lets look at basic summary stats
# Speed
mean(my_trips_timeslice$speed)
## [1] 18.36454
sd(my_trips_timeslice$speed)
## [1] 12.5538
library(e1071)
skewness(my_trips_timeslice$speed)
## [1] 9.170483
hist(
my_trips_timeslice$speed,
xlab='Speed',
main='Distribution of speed between stops'
)
# Distance
mean(my_trips_timeslice$dist_diff)
## [1] 370.7838
sd(my_trips_timeslice$dist_diff)
## [1] 400.1265
library(e1071)
skewness(my_trips_timeslice$dist_diff)
## [1] 16.55855
hist(
my_trips_timeslice$dist_diff/1000,
xlab='Disatnce (km)',
main='Distribution of distance between stops'
)
# Lets look at the correlation between thse variables
cor(my_trips_timeslice$speed, my_trips_timeslice$dist_diff)
## [1] 0.2871195
plot(
my_trips_timeslice$speed,
my_trips_timeslice$dist_diff,
xlab='Speed',
ylab='Distance',
pch=16,
main = "Relationship of distance & speed"
)
cor(my_trips_timeslice$time_diff, my_trips_timeslice$dist_diff)
## [1] 0.6470263
plot(
my_trips_timeslice$time_diff,
my_trips_timeslice$dist_diff,
xlab='Time',
ylab='Distance',
pch=16,
main = "Relationship of time & distance"
)
cor(my_trips_timeslice$time_diff, my_trips_timeslice$speed)
## [1] -0.2292451
plot(
my_trips_timeslice$time_diff,
my_trips_timeslice$speed,
xlab='Time',
ylab='Speed',
pch=16,
main = "Relationship of time & speed"
)
In order to look at Geographically Weighted summary stats, we will utilise the GWmodel package.
This package allows us to apply summary statistics using a moving window, so that summary statistics can be mapped as you move through different geographical regions in a data set. This allows us to see the spatial distribution of the mean speed at stops, for example. This is quite a useful tool to further explore congestion in Dublin using the GTFS Dublin Bus data.
# Now, GW summary stats
library(GWmodel)
# Run the geographically weighted statistics with a very small bandwidth.
# The bandwidth and distance metrics etc can all be tuned and chosen to best fit the problem.
# Supply the SPDF object with the fields you want to run GW stats for
localstats1 <- gwss(my_trips_timeslice, vars=c("speed", "dist_diff", "time_diff"), bw = 0.02)
# View the output of the GWSS (Geographically Weighted Summary Stats)
kead(data.frame(localstats1$SDF))
| speed_LM | dist_diff_LM | time_diff_LM | speed_LSD | dist_diff_LSD | time_diff_LSD | speed_LVar | dist_diff_LVar | time_diff_LVar | speed_LSKe | dist_diff_LSKe | time_diff_LSKe | speed_LCV | dist_diff_LCV | time_diff_LCV | Cov_speed.dist_diff | Cov_speed.time_diff | Cov_dist_diff.time_diff | Corr_speed.dist_diff | Corr_speed.time_diff | Corr_dist_diff.time_diff | Spearman_rho_speed.dist_diff | Spearman_rho_speed.time_diff | Spearman_rho_dist_diff.time_diff | stop_lon | stop_lat | optional |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 16.31361 | 264.2227 | 57.85347 | 5.810505 | 111.5373 | 28.31494 | 33.76197 | 12440.56 | 801.7357 | -0.2541301 | -0.0439535 | 0.6080990 | 0.3561754 | 0.4221335 | 0.4894251 | 272.4219 | 7.574207 | 2845.948 | 0.4108448 | 0.0449963 | 0.8807670 | 0.2257314 | -0.1938933 | 0.8432225 | -6.212304 | 53.32421 | TRUE |
| 16.09739 | 255.9956 | 55.97294 | 5.718708 | 108.3206 | 27.04076 | 32.70362 | 11733.36 | 731.2025 | -0.8878311 | -0.4018681 | 0.2235289 | 0.3552569 | 0.4231349 | 0.4831041 | 303.2687 | 21.119996 | 2661.875 | 0.4767295 | 0.1329936 | 0.8849347 | 0.2416384 | -0.1493769 | 0.8492666 | -6.207643 | 53.32510 | TRUE |
| 16.00897 | 254.9370 | 55.63352 | 5.739082 | 110.2739 | 27.32418 | 32.93706 | 12160.32 | 746.6107 | -0.9862921 | -0.4209372 | 0.2197405 | 0.3584917 | 0.4325533 | 0.4911460 | 320.3995 | 27.888362 | 2775.448 | 0.4926481 | 0.1730587 | 0.8963413 | 0.2501043 | -0.1122691 | 0.8629429 | -6.207683 | 53.32593 | TRUE |
| 15.65033 | 251.9230 | 55.14702 | 5.771032 | 116.1046 | 28.42342 | 33.30482 | 13480.27 | 807.8905 | -1.2035337 | -0.3906860 | 0.2419225 | 0.3687482 | 0.4608731 | 0.5154116 | 367.0849 | 45.156220 | 3120.532 | 0.5320496 | 0.2673475 | 0.9183148 | 0.2809744 | -0.0166861 | 0.8877120 | -6.208925 | 53.32854 | TRUE |
| 15.11734 | 248.1796 | 55.44638 | 5.799734 | 121.6384 | 30.10894 | 33.63692 | 14795.90 | 906.5481 | -1.4006973 | -0.3277394 | 0.2797972 | 0.3836479 | 0.4901226 | 0.5430280 | 413.5066 | 57.152179 | 3461.550 | 0.5665810 | 0.3163644 | 0.9136142 | 0.3202199 | 0.0379037 | 0.8775845 | -6.210162 | 53.33163 | TRUE |
| 14.95200 | 248.8810 | 56.51866 | 5.717727 | 122.0164 | 30.96639 | 32.69240 | 14888.00 | 958.9172 | -1.4430954 | -0.3028515 | 0.3739840 | 0.3824055 | 0.4902599 | 0.5478967 | 404.4570 | 52.677162 | 3513.728 | 0.5602319 | 0.2875050 | 0.8986620 | 0.3188191 | 0.0137395 | 0.8616336 | -6.211228 | 53.33235 | TRUE |
localstats1$SDF
## class : SpatialPointsDataFrame
## features : 3125
## extent : -6.590004, -6.054799, 53.19282, 53.46941 (xmin, xmax, ymin, ymax)
## crs : NA
## variables : 24
## names : speed_LM, dist_diff_LM, time_diff_LM, speed_LSD, dist_diff_LSD, time_diff_LSD, speed_LVar, dist_diff_LVar, time_diff_LVar, speed_LSKe, dist_diff_LSKe, time_diff_LSKe, speed_LCV, dist_diff_LCV, time_diff_LCV, ...
## min values : 4.63632424950545, 180.858759721382, 24.6208555282233, 1.79797037843775, 64.8835687434621, 11.5814202138041, 3.23269748173957, 4209.87749288757, 134.12929416871, -5.43531766593676, -1.99660232751572, -2.13968952529695, 0.0727272498295415, 0.200669317790105, 0.154236753752155, ...
## max values : 54.8671213280394, 2063.8020264648, 178.167614637855, 88.3659890183201, 3046.64335712379, 246.051496175096, 7808.54801518587, 9282035.7455065, 60541.3387700035, 21.8085467981949, 46.024322984747, 18.1735660169, 2.0696581999384, 3.14684543591258, 1.69490279088316, ...
Once again for reference, the avg. speed in our snapshot data set is 19.3 km/hr.
# So the avg. speed in the data set is 18.3 km/hr
mean(my_trips_timeslice$speed)
## [1] 18.36454
However, we assume this varies geographically across Co. Dublin.
The next step is to plot the geographically weighted avg. speed on a map to study its spatial pattern.
# localstats1$SDF$speed_LM is the geographically weighted mean
# We can plot this on a leaflet map
qpal <- colorQuantile(
"BuPu",
localstats1$SDF$speed_LM,
n = 9,
reverse = FALSE
)
pal <- colorNumeric(
palette = "BuPu",
domain = localstats1$SDF$speed_LM,
reverse = FALSE
)
leaflet() %>%
addProviderTiles("CartoDB.DarkMatter") %>%
addCircleMarkers(
data = localstats1$SDF,
color = qpal(localstats1$SDF$speed_LM),
radius = 1,
weight = 1.5,
opacity = localstats1$SDF$speed_LM/25,
label = ~htmltools::htmlEscape(
paste0(
round(speed_LM, 1)
)
)
) %>%
addLegend("bottomright", pal = pal, values = localstats1$SDF$speed_LM,
title = "GW Mean Speed",
opacity = 1
)
We can see from plotting the geographically weighted mean speed on the leaflet map that there is a clear pattern of congestion in the city centre and in towns.
This is what we assumed. Using the spatially weighted mean, we can clearly show this pattern.
There is a clear pin point of white in the city centre that spreads out gradually to a blue and then a purple.
Note:
We can also see clusters of white stops quite clearly on the map in and around towns like Celbridge, Maynooth and Swords.
Finally, I wanted to create a map displaying the frequency of trips per route as I thought this would make for an interesting visual.
To do this, I went back to my main “base” data set (i.e. not limited to my “time slice”) and calculated the no. of trips per each route.
I then create a SpatiaLines object from the stop latitudes and longitudes and style the colour and weight of the lines by the no. of trips.
#=========================================
# Go back to the base data set "my_trips" (before the time slice)
froutes <- my_trips
length(unique(froutes$route_id))
## [1] 110
# Create a list of all routes with the count of trips
froutes_data <- froutes %>%
group_by(route_id) %>%
summarise(
n_trips = n_distinct(trip_id),
n_stops=(n_distinct(stop_id)/2)
) %>%
arrange(-n_trips)
# Get the DISTINCT route_id, trip_id
froutes <- froutes %>% distinct(route_id, trip_id)
# Just get any one of the trips for each route - only want the approx
# geometry of each route
froutes <- froutes[!duplicated(froutes$route_id), ]
# Subset the main data set with the unique trip_ids from froutes
froutes <- my_trips %>%
filter(trip_id %in% froutes$trip_id)
# Create a SPDF object
froutes <- SpatialPointsDataFrame(
coords = froutes[c("stop_lon", "stop_lat")],
data = froutes
)
# Convert to a SpatialLines object
data <- split(froutes@data, froutes@data$route_id)
data <- lapply(data, function(i) raster::spLines(as.matrix(i[, c('stop_lon', 'stop_lat')]), attr=data.frame(route_id=i$route_id[1])))
names(data) <- NULL
data <- do.call(bind, data)
# Join back on the froutes_data info (no. trips etc)
data@data <- dplyr::left_join(data@data, froutes_data, by = c("route_id"))
# Plot the route frequency on a leaflet map
qpal <- colorQuantile(
"YlOrRd",
data$n_trips,
n = 9,
reverse = TRUE
)
pal <- colorNumeric(
palette = "BuPu",
domain = data$n_trips,
reverse = FALSE
)
leaflet() %>%
addProviderTiles("CartoDB.DarkMatter") %>%
addPolylines(
data = data,
color = pal(data$n_trips),
weight = ~n_trips / 25,
label = ~n_trips
) %>%
addLegend("bottomright", pal = pal, values = data$n_trips,
title = "No. Trips",
opacity = 1
)
Finally, here are the top 5 most frequent routes for the service y103l.
# Plot top 5 routes by frequency
data_top5 <- data[data$n_trips> 92, ]
pal <- colorNumeric(
palette = "YlOrRd",
domain = data_top5$n_trips,
reverse = TRUE
)
leaflet() %>%
addProviderTiles("CartoDB.DarkMatter") %>%
addPolylines(
data = data_top5,
color = pal(data_top5$n_trips),
weight = ~n_trips / 25,
label = ~n_trips
) %>%
addLegend("bottomright", pal = pal, values = data$n_trips,
title = "No. Trips",
opacity = 1
)
#========================================================================
# END
My intention was to explore the spatial data contained within this data set and attempt to highlight some sort of an interesting spatial pattern.
Unsurprisingly - the pattern I found is that towns and city centre are more congested than areas outside towns!
This isn’t a big revelation by any means, however it was an interesting challenge to pull this insight out of the GTFS data. It was also a nice application of Geographically Weighted analysis which I haven’t looked at a whole lot since my Masters degree.
So, in summary;