Overview

This document presents the code and output from some basic exploratory data analysis (EDA) of STATS19 road casualty data. The data visualisation package ggvis developed by Hadley Wickham and Winston Chang is used throughout. ggvis will eventually replace the popular ggplot2 graphics package and includes the ability to create interactive plots. For more information visit: http://ggvis.rstudio.com

The data used are STATS19 road casualties for Greater London during 2013 available from: https://www.tfl.gov.uk/corporate/publications-and-reports/road-safety

Setting up your session

# Load the relevant packages
library(ggvis)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read the data

# Load the casualty data
casualties <- read.csv("http://tfl.gov.uk/cdn/static/cms/documents/2013-gla-data-extract-casualty.csv", header=T)
head(casualties, 2)
##        AREFNO        Borough Boro Easting Northing CREFNO Casualty.Class
## 1 0113CP00001 CITY OF LONDON    0  531630   180790      1 1 Driver/Rider
## 2 0113CP00002 CITY OF LONDON    0  531910   181110      1   3 Pedestrian
##   Casualty.Sex Casualty.Age..Banded. Casualty.Age No..of.Casualties
## 1       1 Male                 25-59           52                 1
## 2       1 Male                 25-59           36                 1
##   Casualty.Severity                Ped..Location         Ped..Movement
## 1          3 Slight                   -2 Unknown                -2 N/A
## 2          3 Slight 01 Crossing Road On Ped Xing 3 From Drivers O/Side
##        Mode.of.Travel  X
## 1 3 Powered 2 Wheeler NA
## 2        1 Pedestrian NA
casualties$X <- NULL # remove the redundant 'X' variable
# Load the attendant data
attendant <- read.csv("http://tfl.gov.uk/cdn/static/cms/documents/2013-gla-data-extract-attendant.csv", header=T)

Merge the two datasets

# Merge the attendant data with the casualty data using the AREFNO
casualties <- merge(casualties, attendant, by="AREFNO")
names(casualties)
##  [1] "AREFNO"                    "Borough.x"                
##  [3] "Boro.x"                    "Easting.x"                
##  [5] "Northing.x"                "CREFNO"                   
##  [7] "Casualty.Class"            "Casualty.Sex"             
##  [9] "Casualty.Age..Banded."     "Casualty.Age"             
## [11] "No..of.Casualties"         "Casualty.Severity"        
## [13] "Ped..Location"             "Ped..Movement"            
## [15] "Mode.of.Travel"            "Borough.y"                
## [17] "Boro.y"                    "Easting.y"                
## [19] "Northing.y"                "Accident.Severity"        
## [21] "No..of.Casualties.in.Acc." "No..of.Vehicles.in.Acc."  
## [23] "Accident.Date"             "Day"                      
## [25] "Time"                      "Highway"                  
## [27] "Road.Class.1"              "Road.No..1"               
## [29] "Road.Type"                 "Speed.Limit"              
## [31] "Junction.Detail"           "Junction.Control"         
## [33] "Road.Class.2"              "Road.No..2"               
## [35] "Ped..Crossing.Decoded"     "Light.Conditions..Banded."
## [37] "Weather"                   "Road.Surface"             
## [39] "Special.Conditions"        "C.W.Hazard"
rm(attendant) # remove the attendant dataset to save memory

Cleaning the data

Part 1: create date and time variables

# Convert 'Accident.Date' from a factor to a date
casualties$Accident.Date <- as.Date(casualties$Accident.Date, "%d-%b-%y")
class(casualties$Accident.Date)
## [1] "Date"
# Extract the month
casualties$month <- format(casualties$Accident.Date, format="%B")
casualties$month <- as.factor(casualties$month)
casualties$month <- factor(casualties$month,levels=month.name)
summary(casualties$month)
##   January  February     March     April       May      June      July 
##      1809      1763      1997      2033      2324      2367      2640 
##    August September   October  November  December 
##      2339      2404      2798      2530      2195
# Extract the day of the week
casualties$day <- format(casualties$Accident.Date, format="%A")
casualties$day <- factor(casualties$day, levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"),
                     labels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
summary(casualties$day)
##    Monday   Tuesday Wednesday  Thursday    Friday  Saturday    Sunday 
##      3865      4190      4071      4047      4579      3438      3009
# Add an hour band variable
casualties$Time <- gsub("[ [:punct:]]", "" , casualties$Time) # remove punctuation from 'Time' variable
casualties$Time <- gsub("(\\d\\d)(\\d\\d)", "\\1:\\2", casualties$Time) # add a colon after 2nd value
casualties$hour<- as.POSIXlt(casualties$Time, format="%H:%M")$hour # add a new column called 'hour' with hours band
casualties[1:3,c(1, 23, 25, 43)] # check the results
##        AREFNO Accident.Date  Time hour
## 1 0113BS70003    2013-01-02 09:08    9
## 2 0113BS70005    2013-01-04 08:16    8
## 3 0113BS70005    2013-01-04 08:16    8

Cleaning the data

Part 2: relabel some of the categories

# Relabel the 'Casualty.Severity' categories
casualties$Casualty.Severity <- factor(casualties$Casualty.Severity,
                                       levels= c("1 Fatal", "2 Serious", "3 Slight"),
                                       labels= c("Fatal", "Serious", "Slight"))
levels(casualties$Casualty.Severity)
## [1] "Fatal"   "Serious" "Slight"
# Relabel the 'Mode.of.Travel' categories
casualties$Mode.of.Travel <- factor(casualties$Mode.of.Travel,
                                    levels= c("1 Pedestrian", "2 Pedal Cycle", "3 Powered 2 Wheeler", "4 Car",
                                              "5 Taxi", "6 Bus Or Coach", "7 Goods Vehicle", "8 Other Vehicle"),
                                    labels= c("Pedestrian", "Pedal Cycle", "Powered 2 Wheeler", "Car",
                                              "Taxi", "Bus or Coach", "Goods Vehicle", "Other Vehicle"))
levels(casualties$Mode.of.Travel)
## [1] "Pedestrian"        "Pedal Cycle"       "Powered 2 Wheeler"
## [4] "Car"               "Taxi"              "Bus or Coach"     
## [7] "Goods Vehicle"     "Other Vehicle"
# Relabel the 'Casualty.Sex' categories
levels(casualties$Casualty.Sex)
## [1] "1 Male"   "2 Female"
casualties$Casualty.Sex <- factor(casualties$Casualty.Sex,
                                    levels= c("1 Male", "2 Female"),
                                    labels= c("Male", "Female"))
levels(casualties$Casualty.Sex)
## [1] "Male"   "Female"
# Relabel the 'Light.Conditions..Banded.' categories
levels(casualties$Light.Conditions..Banded.)
## [1] "1 Daylight" "2 Dark"
casualties$Light.Conditions..Banded. <- factor(casualties$Light.Conditions..Banded.,
                                    levels= c("1 Daylight", "2 Dark"),
                                    labels= c("Daylight", "Dark"))
levels(casualties$Light.Conditions..Banded.)
## [1] "Daylight" "Dark"

Cleaning the data

Part 3: create casualty age bands

# Create age bands
bands <- c("0-4","5-9","10-14","15-19","20-24","25-29","30-34","35-39",
          "40-44","45-49","50-54","55-59","60-64","65-69","70-74","75-79","80-84")
casualties$ageband <- cut(casualties$Casualty.Age, 
                         breaks=c(0,4,9,14,19,24,29,34,39,44,49,54,59,64,69,74,79,84), 
                         labels = bands)
casualties[1:3,c(10, 44)]
##   Casualty.Age ageband
## 1           43   40-44
## 2           26   25-29
## 3           17   15-19

Calculate some summary statistics

# Number of casualties by mode of travel
table(casualties$Mode.of.Travel)
## 
##        Pedestrian       Pedal Cycle Powered 2 Wheeler               Car 
##              5181              4623              4502             10185 
##              Taxi      Bus or Coach     Goods Vehicle     Other Vehicle 
##               544              1471               571               122
# Number of casualties by mode and casualty severity
with(casualties, table(Mode.of.Travel, Casualty.Severity))
##                    Casualty.Severity
## Mode.of.Travel      Fatal Serious Slight
##   Pedestrian           65     773   4343
##   Pedal Cycle          14     475   4134
##   Powered 2 Wheeler    22     488   3992
##   Car                  25     310   9850
##   Taxi                  1      21    522
##   Bus or Coach          1      89   1381
##   Goods Vehicle         2      26    543
##   Other Vehicle         2      10    110
# Calculate the mean age of KSI casualties by mode of travel
casualties[casualties$Casualty.Severity == "Fatal" | casualties$Casualty.Severity == "Serious", ] %>%
  group_by(Mode.of.Travel) %>% 
  summarise(mean = round(mean(Casualty.Age), 0))
## Source: local data frame [8 x 2]
## 
##      Mode.of.Travel mean
## 1        Pedestrian   35
## 2       Pedal Cycle   33
## 3 Powered 2 Wheeler   32
## 4               Car   35
## 5              Taxi   48
## 6      Bus or Coach   53
## 7     Goods Vehicle   33
## 8     Other Vehicle   49

Create some bar charts

# Casualties by mode of travel
casualties %>% 
  ggvis(x = ~Mode.of.Travel, fill = ~factor(Mode.of.Travel) ) %>% 
  layer_bars() %>%
  add_axis("x", title = "Mode of Travel") %>%
  add_axis("y", title = "Number of casualties") %>%
  add_legend("fill", title="Mode of Travel")
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character

# Casualties by month
casualties %>% 
  ggvis(~month, fill :="#e5f5f9") %>% 
  layer_bars() %>%
  add_axis("x", title = "Month") %>%
  add_axis("y", title = "Number of casualties")

# Casualties by month and mode of travel
casualties %>%
  ggvis(x = ~month, fill = ~as.factor(Mode.of.Travel)) %>%
  layer_bars() %>%
  add_axis("x", title = "Month") %>%
  add_axis("y", title = "Number of casualties") %>%
  add_legend("fill", title="Mode of Travel")

# Casualties by mode and severity
casualties %>%
  group_by(Casualty.Severity) %>%
  ggvis(x = ~Mode.of.Travel, fill = ~Casualty.Severity, fillOpacity := 0.7) %>%
  layer_bars() %>%
  scale_nominal("fill", range = c("#1b9e77", "#d95f02", "#7570b3")) %>%
  add_axis("x", title = "Mode of travel") %>%
  add_axis("y", title = "Number of casualties") %>%
  add_legend("fill", title="Casualty severity")

# KSIs by mode of travel
casualties[casualties$Casualty.Severity == "Fatal" | casualties$Casualty.Severity == "Serious",] %>%
  group_by(Mode.of.Travel) %>%
  ggvis(~Mode.of.Travel, fill = ~Casualty.Severity) %>%
  add_axis("x", title = "Mode of travel") %>%
  add_axis("y", title = "Number of casualties") %>%
  add_legend("fill", title="Casualty severity")
## Guessing layer_bars()

# Casualty severity by month amongst pedal cyclists
casualties[casualties$Mode.of.Travel == "Pedal Cycle", ] %>%
  group_by(Casualty.Severity) %>%
  ggvis(~month, fill = ~Casualty.Severity) %>%
  add_axis("x", title = "Month") %>%
  add_axis("y", title = "Number of casualties") %>%
  add_legend("fill", title="Casualty severity")
## Guessing layer_bars()

# KSIs by month amongst pedal cyclists
casualties[casualties$Mode.of.Travel == "Pedal Cycle" & 
             casualties$Casualty.Severity == "Fatal" | casualties$Casualty.Severity == "Serious", ] %>%
  group_by(Casualty.Severity) %>%
  ggvis(~month, fill = ~Casualty.Severity) %>%
  layer_bars() %>%
  add_axis("x", title = "Month") %>%
  add_axis("y", title = "Number of casualties") %>%
  add_legend("fill", title="Casualty severity")

# Powered 2 Wheeler casualties by age band and gender
casualties[casualties$Mode.of.Travel == "Powered 2 Wheeler", ] %>% 
  group_by(Casualty.Sex) %>%
  ggvis(x = ~ageband, fill = ~Casualty.Sex, fillOpacity := 0.8) %>%
  layer_bars() %>%
  add_axis("x", title = "Casualty age band") %>%
  add_axis("y", title = "Number of casualties") %>%
  add_legend("fill", title="Casualty sex")

# Powered 2 Wheeler casualties by month and light conditions
casualties[casualties$Mode.of.Travel == "Powered 2 Wheeler", ] %>% 
  group_by(Light.Conditions..Banded.) %>%
  ggvis(x = ~month, fill = ~Light.Conditions..Banded.) %>%
  layer_bars() %>%
  scale_nominal("fill", range = c("Yellow", "Black")) %>%
  add_axis("x", title = "Month") %>%
  add_axis("y", title = "Number of casualties") %>%
  add_legend("fill", title="Light conditions")

Create a box plot

# Mean age of casualties by mode of travel
casualties %>% 
  ggvis(~factor(Mode.of.Travel), ~Casualty.Age) %>% 
  layer_boxplots(width = 0.5)
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character

Create a line chart

# Casualties by day
casualties %>%
  group_by(Accident.Date) %>%
  summarize(count = n()) %>%
  ggvis(~Accident.Date, ~count) %>%
  layer_lines()

Create some interactive charts

# Number of casualties by mode of travel
casualty_freq <- casualties %>%
  group_by(Mode.of.Travel, Casualty.Severity) %>%
  summarize(count = n()) 
casualty_freq <- as.data.frame(casualty_freq)
head(casualty_freq)
##   Mode.of.Travel Casualty.Severity count
## 1     Pedestrian             Fatal    65
## 2     Pedestrian           Serious   773
## 3     Pedestrian            Slight  4343
## 4    Pedal Cycle             Fatal    14
## 5    Pedal Cycle           Serious   475
## 6    Pedal Cycle            Slight  4134
freq <- function(x) {
  if(is.null(x)) return(NULL)
  paste0(x[4], collapse = "<br />")
}

casualty_freq %>% ggvis(~Mode.of.Travel, ~count, fill = ~factor(Casualty.Severity)) %>% 
  layer_bars() %>%
  scale_nominal("fill", range = c("#e41a1c", "#377eb8", "#4daf4a")) %>%
  add_axis("x", title = "Mode of travel", properties = axis_props(labels=list(angle=90, align="left")) ) %>% 
  add_axis("y", title = "") %>%
  add_legend("fill", title="Casualty severity") %>%
  add_tooltip(freq, "hover")
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character

# Filtering casualties by age band and mode of travel
casualty_ages <- casualties %>%
  group_by(Mode.of.Travel, ageband) %>%
  summarize(count = n()) 

casualty_ages %>% 
  ggvis(~ageband, ~count) %>% 
  filter(Mode.of.Travel %in% eval(input_select(c("Pedestrian", "Pedal Cycle", "Powered 2 Wheeler", 
                                                 "Car", "Taxi", "Bus or Coach", "Goods Vehicle", 
                                                 "Other Vehicle"), selected = "Pedestrian"))) %>% 
  add_axis("x", title = "Casualty age band") %>%
  add_axis("y", title = "Number of casualties") %>%
  layer_bars(fill = ~Mode.of.Travel) %>%
  hide_legend("fill")
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.