This document presents the code and output from some basic exploratory data analysis (EDA) of STATS19 road casualty data. The data visualisation package ggvis developed by Hadley Wickham and Winston Chang is used throughout. ggvis will eventually replace the popular ggplot2 graphics package and includes the ability to create interactive plots. For more information visit: http://ggvis.rstudio.com
The data used are STATS19 road casualties for Greater London during 2013 available from: https://www.tfl.gov.uk/corporate/publications-and-reports/road-safety
# Load the relevant packages
library(ggvis)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the casualty data
casualties <- read.csv("http://tfl.gov.uk/cdn/static/cms/documents/2013-gla-data-extract-casualty.csv", header=T)
head(casualties, 2)
## AREFNO Borough Boro Easting Northing CREFNO Casualty.Class
## 1 0113CP00001 CITY OF LONDON 0 531630 180790 1 1 Driver/Rider
## 2 0113CP00002 CITY OF LONDON 0 531910 181110 1 3 Pedestrian
## Casualty.Sex Casualty.Age..Banded. Casualty.Age No..of.Casualties
## 1 1 Male 25-59 52 1
## 2 1 Male 25-59 36 1
## Casualty.Severity Ped..Location Ped..Movement
## 1 3 Slight -2 Unknown -2 N/A
## 2 3 Slight 01 Crossing Road On Ped Xing 3 From Drivers O/Side
## Mode.of.Travel X
## 1 3 Powered 2 Wheeler NA
## 2 1 Pedestrian NA
casualties$X <- NULL # remove the redundant 'X' variable
# Load the attendant data
attendant <- read.csv("http://tfl.gov.uk/cdn/static/cms/documents/2013-gla-data-extract-attendant.csv", header=T)
# Merge the attendant data with the casualty data using the AREFNO
casualties <- merge(casualties, attendant, by="AREFNO")
names(casualties)
## [1] "AREFNO" "Borough.x"
## [3] "Boro.x" "Easting.x"
## [5] "Northing.x" "CREFNO"
## [7] "Casualty.Class" "Casualty.Sex"
## [9] "Casualty.Age..Banded." "Casualty.Age"
## [11] "No..of.Casualties" "Casualty.Severity"
## [13] "Ped..Location" "Ped..Movement"
## [15] "Mode.of.Travel" "Borough.y"
## [17] "Boro.y" "Easting.y"
## [19] "Northing.y" "Accident.Severity"
## [21] "No..of.Casualties.in.Acc." "No..of.Vehicles.in.Acc."
## [23] "Accident.Date" "Day"
## [25] "Time" "Highway"
## [27] "Road.Class.1" "Road.No..1"
## [29] "Road.Type" "Speed.Limit"
## [31] "Junction.Detail" "Junction.Control"
## [33] "Road.Class.2" "Road.No..2"
## [35] "Ped..Crossing.Decoded" "Light.Conditions..Banded."
## [37] "Weather" "Road.Surface"
## [39] "Special.Conditions" "C.W.Hazard"
rm(attendant) # remove the attendant dataset to save memory
# Convert 'Accident.Date' from a factor to a date
casualties$Accident.Date <- as.Date(casualties$Accident.Date, "%d-%b-%y")
class(casualties$Accident.Date)
## [1] "Date"
# Extract the month
casualties$month <- format(casualties$Accident.Date, format="%B")
casualties$month <- as.factor(casualties$month)
casualties$month <- factor(casualties$month,levels=month.name)
summary(casualties$month)
## January February March April May June July
## 1809 1763 1997 2033 2324 2367 2640
## August September October November December
## 2339 2404 2798 2530 2195
# Extract the day of the week
casualties$day <- format(casualties$Accident.Date, format="%A")
casualties$day <- factor(casualties$day, levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"),
labels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
summary(casualties$day)
## Monday Tuesday Wednesday Thursday Friday Saturday Sunday
## 3865 4190 4071 4047 4579 3438 3009
# Add an hour band variable
casualties$Time <- gsub("[ [:punct:]]", "" , casualties$Time) # remove punctuation from 'Time' variable
casualties$Time <- gsub("(\\d\\d)(\\d\\d)", "\\1:\\2", casualties$Time) # add a colon after 2nd value
casualties$hour<- as.POSIXlt(casualties$Time, format="%H:%M")$hour # add a new column called 'hour' with hours band
casualties[1:3,c(1, 23, 25, 43)] # check the results
## AREFNO Accident.Date Time hour
## 1 0113BS70003 2013-01-02 09:08 9
## 2 0113BS70005 2013-01-04 08:16 8
## 3 0113BS70005 2013-01-04 08:16 8
# Relabel the 'Casualty.Severity' categories
casualties$Casualty.Severity <- factor(casualties$Casualty.Severity,
levels= c("1 Fatal", "2 Serious", "3 Slight"),
labels= c("Fatal", "Serious", "Slight"))
levels(casualties$Casualty.Severity)
## [1] "Fatal" "Serious" "Slight"
# Relabel the 'Mode.of.Travel' categories
casualties$Mode.of.Travel <- factor(casualties$Mode.of.Travel,
levels= c("1 Pedestrian", "2 Pedal Cycle", "3 Powered 2 Wheeler", "4 Car",
"5 Taxi", "6 Bus Or Coach", "7 Goods Vehicle", "8 Other Vehicle"),
labels= c("Pedestrian", "Pedal Cycle", "Powered 2 Wheeler", "Car",
"Taxi", "Bus or Coach", "Goods Vehicle", "Other Vehicle"))
levels(casualties$Mode.of.Travel)
## [1] "Pedestrian" "Pedal Cycle" "Powered 2 Wheeler"
## [4] "Car" "Taxi" "Bus or Coach"
## [7] "Goods Vehicle" "Other Vehicle"
# Relabel the 'Casualty.Sex' categories
levels(casualties$Casualty.Sex)
## [1] "1 Male" "2 Female"
casualties$Casualty.Sex <- factor(casualties$Casualty.Sex,
levels= c("1 Male", "2 Female"),
labels= c("Male", "Female"))
levels(casualties$Casualty.Sex)
## [1] "Male" "Female"
# Relabel the 'Light.Conditions..Banded.' categories
levels(casualties$Light.Conditions..Banded.)
## [1] "1 Daylight" "2 Dark"
casualties$Light.Conditions..Banded. <- factor(casualties$Light.Conditions..Banded.,
levels= c("1 Daylight", "2 Dark"),
labels= c("Daylight", "Dark"))
levels(casualties$Light.Conditions..Banded.)
## [1] "Daylight" "Dark"
# Create age bands
bands <- c("0-4","5-9","10-14","15-19","20-24","25-29","30-34","35-39",
"40-44","45-49","50-54","55-59","60-64","65-69","70-74","75-79","80-84")
casualties$ageband <- cut(casualties$Casualty.Age,
breaks=c(0,4,9,14,19,24,29,34,39,44,49,54,59,64,69,74,79,84),
labels = bands)
casualties[1:3,c(10, 44)]
## Casualty.Age ageband
## 1 43 40-44
## 2 26 25-29
## 3 17 15-19
# Number of casualties by mode of travel
table(casualties$Mode.of.Travel)
##
## Pedestrian Pedal Cycle Powered 2 Wheeler Car
## 5181 4623 4502 10185
## Taxi Bus or Coach Goods Vehicle Other Vehicle
## 544 1471 571 122
# Number of casualties by mode and casualty severity
with(casualties, table(Mode.of.Travel, Casualty.Severity))
## Casualty.Severity
## Mode.of.Travel Fatal Serious Slight
## Pedestrian 65 773 4343
## Pedal Cycle 14 475 4134
## Powered 2 Wheeler 22 488 3992
## Car 25 310 9850
## Taxi 1 21 522
## Bus or Coach 1 89 1381
## Goods Vehicle 2 26 543
## Other Vehicle 2 10 110
# Calculate the mean age of KSI casualties by mode of travel
casualties[casualties$Casualty.Severity == "Fatal" | casualties$Casualty.Severity == "Serious", ] %>%
group_by(Mode.of.Travel) %>%
summarise(mean = round(mean(Casualty.Age), 0))
## Source: local data frame [8 x 2]
##
## Mode.of.Travel mean
## 1 Pedestrian 35
## 2 Pedal Cycle 33
## 3 Powered 2 Wheeler 32
## 4 Car 35
## 5 Taxi 48
## 6 Bus or Coach 53
## 7 Goods Vehicle 33
## 8 Other Vehicle 49
# Casualties by mode of travel
casualties %>%
ggvis(x = ~Mode.of.Travel, fill = ~factor(Mode.of.Travel) ) %>%
layer_bars() %>%
add_axis("x", title = "Mode of Travel") %>%
add_axis("y", title = "Number of casualties") %>%
add_legend("fill", title="Mode of Travel")
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character
# Casualties by month
casualties %>%
ggvis(~month, fill :="#e5f5f9") %>%
layer_bars() %>%
add_axis("x", title = "Month") %>%
add_axis("y", title = "Number of casualties")
# Casualties by month and mode of travel
casualties %>%
ggvis(x = ~month, fill = ~as.factor(Mode.of.Travel)) %>%
layer_bars() %>%
add_axis("x", title = "Month") %>%
add_axis("y", title = "Number of casualties") %>%
add_legend("fill", title="Mode of Travel")
# Casualties by mode and severity
casualties %>%
group_by(Casualty.Severity) %>%
ggvis(x = ~Mode.of.Travel, fill = ~Casualty.Severity, fillOpacity := 0.7) %>%
layer_bars() %>%
scale_nominal("fill", range = c("#1b9e77", "#d95f02", "#7570b3")) %>%
add_axis("x", title = "Mode of travel") %>%
add_axis("y", title = "Number of casualties") %>%
add_legend("fill", title="Casualty severity")
# KSIs by mode of travel
casualties[casualties$Casualty.Severity == "Fatal" | casualties$Casualty.Severity == "Serious",] %>%
group_by(Mode.of.Travel) %>%
ggvis(~Mode.of.Travel, fill = ~Casualty.Severity) %>%
add_axis("x", title = "Mode of travel") %>%
add_axis("y", title = "Number of casualties") %>%
add_legend("fill", title="Casualty severity")
## Guessing layer_bars()
# Casualty severity by month amongst pedal cyclists
casualties[casualties$Mode.of.Travel == "Pedal Cycle", ] %>%
group_by(Casualty.Severity) %>%
ggvis(~month, fill = ~Casualty.Severity) %>%
add_axis("x", title = "Month") %>%
add_axis("y", title = "Number of casualties") %>%
add_legend("fill", title="Casualty severity")
## Guessing layer_bars()
# KSIs by month amongst pedal cyclists
casualties[casualties$Mode.of.Travel == "Pedal Cycle" &
casualties$Casualty.Severity == "Fatal" | casualties$Casualty.Severity == "Serious", ] %>%
group_by(Casualty.Severity) %>%
ggvis(~month, fill = ~Casualty.Severity) %>%
layer_bars() %>%
add_axis("x", title = "Month") %>%
add_axis("y", title = "Number of casualties") %>%
add_legend("fill", title="Casualty severity")
# Powered 2 Wheeler casualties by age band and gender
casualties[casualties$Mode.of.Travel == "Powered 2 Wheeler", ] %>%
group_by(Casualty.Sex) %>%
ggvis(x = ~ageband, fill = ~Casualty.Sex, fillOpacity := 0.8) %>%
layer_bars() %>%
add_axis("x", title = "Casualty age band") %>%
add_axis("y", title = "Number of casualties") %>%
add_legend("fill", title="Casualty sex")
# Powered 2 Wheeler casualties by month and light conditions
casualties[casualties$Mode.of.Travel == "Powered 2 Wheeler", ] %>%
group_by(Light.Conditions..Banded.) %>%
ggvis(x = ~month, fill = ~Light.Conditions..Banded.) %>%
layer_bars() %>%
scale_nominal("fill", range = c("Yellow", "Black")) %>%
add_axis("x", title = "Month") %>%
add_axis("y", title = "Number of casualties") %>%
add_legend("fill", title="Light conditions")
# Mean age of casualties by mode of travel
casualties %>%
ggvis(~factor(Mode.of.Travel), ~Casualty.Age) %>%
layer_boxplots(width = 0.5)
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character
# Casualties by day
casualties %>%
group_by(Accident.Date) %>%
summarize(count = n()) %>%
ggvis(~Accident.Date, ~count) %>%
layer_lines()
# Number of casualties by mode of travel
casualty_freq <- casualties %>%
group_by(Mode.of.Travel, Casualty.Severity) %>%
summarize(count = n())
casualty_freq <- as.data.frame(casualty_freq)
head(casualty_freq)
## Mode.of.Travel Casualty.Severity count
## 1 Pedestrian Fatal 65
## 2 Pedestrian Serious 773
## 3 Pedestrian Slight 4343
## 4 Pedal Cycle Fatal 14
## 5 Pedal Cycle Serious 475
## 6 Pedal Cycle Slight 4134
freq <- function(x) {
if(is.null(x)) return(NULL)
paste0(x[4], collapse = "<br />")
}
casualty_freq %>% ggvis(~Mode.of.Travel, ~count, fill = ~factor(Casualty.Severity)) %>%
layer_bars() %>%
scale_nominal("fill", range = c("#e41a1c", "#377eb8", "#4daf4a")) %>%
add_axis("x", title = "Mode of travel", properties = axis_props(labels=list(angle=90, align="left")) ) %>%
add_axis("y", title = "") %>%
add_legend("fill", title="Casualty severity") %>%
add_tooltip(freq, "hover")
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character
# Filtering casualties by age band and mode of travel
casualty_ages <- casualties %>%
group_by(Mode.of.Travel, ageband) %>%
summarize(count = n())
casualty_ages %>%
ggvis(~ageband, ~count) %>%
filter(Mode.of.Travel %in% eval(input_select(c("Pedestrian", "Pedal Cycle", "Powered 2 Wheeler",
"Car", "Taxi", "Bus or Coach", "Goods Vehicle",
"Other Vehicle"), selected = "Pedestrian"))) %>%
add_axis("x", title = "Casualty age band") %>%
add_axis("y", title = "Number of casualties") %>%
layer_bars(fill = ~Mode.of.Travel) %>%
hide_legend("fill")
## Warning: Can't output dynamic/interactive ggvis plots in a knitr document.
## Generating a static (non-dynamic, non-interactive) version of the plot.