All data was obtained for 2015 (the most recent year available at the time of writing) from https://data.gov.uk/dataset/road-accidents-safety-data. All shapefiles were sourced from the Office for National Statistics.
library("dplyr") # For data reshaping / summarising
library("tidyr") # Data reshaping
library("lubridate") # For handling dates
library("ggplot2") # For data visualisation
library("viridis") # Colour palettes
library("knitr") # Contains the kable function
library("leaflet") # Interactive maps
library("leaflet.extras") # For heatmaps
library("htmltools") # For the htmlEscape function used by leaflet for labels
library("zoo") # Has the rollmeans function
library("sp") # Spatial stuff
library("rgdal") # Spatial stuff
library("rgeos") # Spatial stuff
library("maptools") # Spatial stuff
#library("shiny") # Interactivity
theme_set(theme_bw()) # Set ggplot default theme
We exclude Scotland straight away as the shapefile we’re using later on doesn’t include it.
accidents <- read.csv("RoadSafetyData_2015\\Accidents_2015.csv", stringsAsFactors = FALSE)
# Exclude Scotland
accidents <- accidents %>% filter(substring(Local_Authority_.Highway.,1,1)!="S")
Here we assign lookup values to some of the coded variables. Note that I haven’t done this for all coded variables, just the ones I’m looking at in this document.
# Accident severity ----------
accidents$Accident_Severity <- ifelse(accidents$Accident_Severity==1,"Fatal",
ifelse(accidents$Accident_Severity==2,"Serious",
ifelse(accidents$Accident_Severity==3,"Slight",NA
)))
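# A more compact alternative sketch using dplyr::recode (not run; shown for reference):
# accidents$Accident_Severity <- recode(accidents$Accident_Severity,
#                                       `1`="Fatal", `2`="Serious", `3`="Slight")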
# Road Type ----------
roadtype <- read.csv("RoadSafetyData_2015\\lookups\\roadtype.csv", stringsAsFactors=F)
accidents <- accidents %>%
left_join(roadtype, by=c("Road_Type"="code"))
rm(roadtype)
# Day of week Name ----------
# Create a function first
name_the_days <- function(weekday_number){
factor(
ifelse(weekday_number==1,"Sunday",
ifelse(weekday_number==2,"Monday",
ifelse(weekday_number==3,"Tuesday",
ifelse(weekday_number==4,"Wednesday",
ifelse(weekday_number==5,"Thursday",
ifelse(weekday_number==6,"Friday",
ifelse(weekday_number==7,"Saturday",NA
))))))),
levels=c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"))}
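# An equivalent, more compact definition (a sketch; behaviour should be identical):
# name_the_days <- function(weekday_number){
#   factor(weekday_number, levels=1:7,
#          labels=c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"))}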
# Run the name_the_days function
accidents$Day_of_Week_Name <- name_the_days(accidents$Day_of_Week)
# Change Date field into an actual date ----------
accidents$Date <- dmy(accidents$Date) # Already character thanks to stringsAsFactors=FALSE
# Create a month number field from the Date field ----------
accidents$Month_Number <- factor(month(accidents$Date))
We need to do a bit of work here as the geocoding of the data is a little off. First we'll load a shapefile from the ONS and try to match all the codes in the accidents data to it. Any that don't match will be assigned to a local authority based on a spatial join. It's a little fiddly but works quite well. I'm not certain whether there are any special rules around how accidents are allocated to local authorities; hopefully a lat/lon location falling within an authority's borders will match that definition, but these things can sometimes be more complex than expected.
# Load County & UA shapefile
# We will use this to check the codes in the dataset
# and then to do a spatial join if we have any failures
CTYUA2015_BGC <- readOGR(dsn="shp\\Counties_and_Unitary_Authorities_December_2015_GCB", layer="Counties_and_Unitary_Authorities_December_2015_Generalised_Clipped_Boundaries_in_England_and_Wales", stringsAsFactors=FALSE)
## OGR data source with driver: ESRI Shapefile
## Source: "shp\Counties_and_Unitary_Authorities_December_2015_GCB", layer: "Counties_and_Unitary_Authorities_December_2015_Generalised_Clipped_Boundaries_in_England_and_Wales"
## with 174 features
## It has 6 fields
## Integer64 fields read as strings: objectid
# Ensure the layer uses British National Grid (EPSG:27700)
CTYUA2015_BGC <- spTransform(CTYUA2015_BGC, CRS("+init=epsg:27700"))
# We should now be able to check whether all the Local Authority Highway codes will match this dataset
CTYUA2015_Codes <- unique(CTYUA2015_BGC@data$ctyua15cd)
accidents %>%
filter(!Local_Authority_.Highway. %in% CTYUA2015_Codes) %>%
group_by(Local_Authority_.Highway.) %>%
summarise("n"=n())
## # A tibble: 3 x 2
## Local_Authority_.Highway. n
## <chr> <int>
## 1 E06000048 716
## 2 E08000020 517
## 3 EHEATHROW 39
# We find that we have three local authorities which will need fixing
# Two of these are due to boundary changes (Gateshead and Northumberland)
# The remaining one is some odd quirk of this dataset and relates to Heathrow
# We'll solve these by assigning them using a spatial join and a shapefile
# Pull out the unique missing points so we can do a spatial join
# (group_by plus an empty summarise() returns one row per coordinate pair)
missing_la_points <- accidents %>%
filter(!Local_Authority_.Highway. %in% CTYUA2015_Codes) %>%
group_by(Location_Easting_OSGR,Location_Northing_OSGR) %>%
summarise() %>%
ungroup()
# Promote the data frame to spatial points using easting/northing as the coordinates
coordinates(missing_la_points) <- ~Location_Easting_OSGR+Location_Northing_OSGR
# Add a CRS to the points
proj4string(missing_la_points) <- CRS("+init=epsg:27700")
The following plot shows where the erroneous area codes fall. As we’d expect, we have a cluster in the north east over Northumberland and Gateshead and another in the south on top of Heathrow airport.
plot(CTYUA2015_BGC,border="grey")
plot(missing_la_points,col=rgb(1, 0, 0, 0.25), add=TRUE)
Now that we have the problem data in the correct format, we can perform a spatial join to identify the correct Local Authority code for each point.
# Spatial join! over() returns, for each point, the attributes of the polygon it falls within
assigned_missing_la_points <- over(missing_la_points,CTYUA2015_BGC)
# Attach the spatial join results to the original missing points data frame
# (over() preserves point order, so cbind is safe)
assigned_missing_la_points <- cbind(as.data.frame(missing_la_points),assigned_missing_la_points)
# Get rid of everything except the fields we need
assigned_missing_la_points <- assigned_missing_la_points %>%
select(ctyua15cd,Location_Easting_OSGR,Location_Northing_OSGR)
# Join the results back on to the accidents dataset
accidents <- accidents %>% left_join(assigned_missing_la_points)
## Joining, by = c("Location_Easting_OSGR", "Location_Northing_OSGR")
# Use ifelse to fill in the missing la codes in the new ctyua15cd field
accidents$ctyua15cd <- ifelse(is.na(accidents$ctyua15cd),accidents$Local_Authority_.Highway.,accidents$ctyua15cd)
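# Equivalent one-liner using dplyr::coalesce (a sketch):
# accidents$ctyua15cd <- coalesce(accidents$ctyua15cd, accidents$Local_Authority_.Highway.)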
# Add the names
CTYUA2015_LKP <- unique(data.frame("ctyua15cd"=CTYUA2015_BGC@data$ctyua15cd,"ctyua15nm"=CTYUA2015_BGC@data$ctyua15nm,stringsAsFactors=FALSE))
accidents <- accidents %>%
left_join(CTYUA2015_LKP)
## Joining, by = "ctyua15cd"
There is an error in the dataset meaning that the day of the week needs correcting for two dates: the 13th May and the 11th November. We can see below that each of these dates has records tagged as both Tuesday and Wednesday.
accidents %>%
filter(Date %in% as.Date(c("2015-05-13","2015-11-11"))) %>%
group_by(Date,Day_of_Week,Day_of_Week_Name) %>%
summarise("n"=n())
## # A tibble: 4 x 4
## # Groups: Date, Day_of_Week [?]
## Date Day_of_Week Day_of_Week_Name n
## <date> <int> <fctr> <int>
## 1 2015-05-13 3 Tuesday 1
## 2 2015-05-13 4 Wednesday 447
## 3 2015-11-11 3 Tuesday 1
## 4 2015-11-11 4 Wednesday 390
We fix this by using lubridate to derive these values from the date field and then re-running our name_the_days function used previously.
accidents$Day_of_Week <- wday(accidents$Date)
accidents$Day_of_Week_Name <- name_the_days(accidents$Day_of_Week)
# Check that we fixed them
accidents %>%
filter(Date %in% as.Date(c("2015-05-13","2015-11-11"))) %>%
group_by(Date,Day_of_Week,Day_of_Week_Name) %>%
summarise("n"=n())
## # A tibble: 2 x 4
## # Groups: Date, Day_of_Week [?]
## Date Day_of_Week Day_of_Week_Name n
## <date> <dbl> <fctr> <int>
## 1 2015-05-13 4 Wednesday 448
## 2 2015-11-11 4 Wednesday 391
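As an aside, lubridate can produce these labels directly; a sketch (the labels are locale-dependent, which is one reason to prefer the explicit name_the_days function above):
# accidents$Day_of_Week_Name <- wday(accidents$Date, label=TRUE, abbr=FALSE)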
Now things have been cleaned up a touch, let’s have a quick look at the dataset. We’ll start by just looking at the volume of accidents cut by a few different variables.
accidents %>%
group_by(Date,Month_Number) %>%
summarise("n"=n()) %>%
ggplot(data=., aes(Month_Number,n, fill=Month_Number)) +
geom_boxplot() +
scale_fill_viridis(option="viridis", discrete=T) +
geom_point(alpha=0.5) +
labs(title="Number of Accidents by Month",subtitle="Each point relates to one day of the year",
x="Month Number",y="Number of Accidents",fill="Month Number")
Interesting that the vast majority take place on roads with a 30mph speed limit (though this doesn't mean that the vehicles involved were actually travelling within that limit, of course!).
accidents %>%
group_by(road_type_name,Speed_limit) %>%
summarise("n"=n()) %>%
ggplot(data=.,aes(road_type_name,n,fill=factor(Speed_limit))) +
geom_col() +
scale_fill_viridis(option="viridis", discrete=T) +
coord_flip() +
labs(title="Number of Accidents by Road Type and Speed Limit",
x="Road Type",y="Number of Accidents",fill="Speed Limit")
accidents %>%
group_by(Date,Day_of_Week_Name, Month_Number) %>%
summarise("n"=n()) %>%
ggplot(data=., aes(Day_of_Week_Name,n, fill=Day_of_Week_Name)) +
geom_boxplot() +
scale_fill_viridis(option="viridis", discrete=T) +
geom_point(alpha=0.5) +
labs(title="Number of Accidents by Day of the Week",subtitle="Each point relates to one day of the year",
x="Weekday",y="Number of Accidents",fill="Weekday")
accidents %>%
group_by(Date,Day_of_Week_Name, Month_Number) %>%
summarise("n"=n()) %>%
group_by(Day_of_Week_Name, Month_Number) %>%
summarise("n"=mean(n)) %>%
ggplot(data=., aes(Month_Number,Day_of_Week_Name, fill=n)) +
geom_tile() +
scale_fill_viridis(option="plasma") +
coord_fixed() +
labs(title="Mean Number of Daily Accidents by Day of the Week and Month",
x="Month Number",y="Weekday",fill="Mean Daily Accidents")
We can easily check for temporal patterns, though I won't be going into any depth here. An ARIMA analysis would be appropriate, but for now some rolling means will do.
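If you did want to take the modelling a step further, here's a minimal ARIMA sketch using the forecast package (an assumption on my part; it isn't among the libraries loaded above):
library("forecast") # Assumed installed; not loaded earlier
daily_counts <- accidents %>%
group_by(Date) %>%
summarise("n"=n())
daily_ts <- ts(daily_counts$n, frequency=7) # Treat the week as the seasonal period
fit <- auto.arima(daily_ts)
plot(forecast(fit, h=28)) # Project four weeks ahead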
We want to see where Bank holidays fell in 2015 in case there's a link with the volume of accidents. You can get the list from https://www.gov.uk/bank-holidays.
bank_holidays <- data.frame("Date"=as.Date(c("2015-01-01",
"2015-04-03",
"2015-04-06",
"2015-05-04",
"2015-05-25",
"2015-08-31",
"2015-12-25",
"2015-12-28")))
Let's also generate an overall average and Poisson confidence interval to get a feel for the variance here. We'll approximate the Poisson standard error as the square root of the mean daily accident count, giving limits of mean ± 1.96 × √mean.
(time_series_limits <- accidents %>%
group_by(Date) %>%
summarise("n"=n()) %>%
group_by() %>%
summarise(
"mean_accidents"=mean(n)) %>%
mutate(
"se"=sqrt(mean_accidents),
"UCL"=mean_accidents+(1.96*se),
"LCL"=mean_accidents-(1.96*se)
))
## # A tibble: 1 x 4
## mean_accidents se UCL LCL
## <dbl> <dbl> <dbl> <dbl>
## 1 360.526 18.98752 397.7416 323.3105
Now let's look at what happens over the year; the rolling average should iron out the weekly pattern we saw earlier. However, there are still some unusual patterns in the series.
accidents %>%
group_by(Date) %>%
summarise("n"=n()) %>%
ungroup() %>%
mutate("Rollmean_Accidents" = rollmean(n,
k = 7,
fill = NA,
align = "right"
)) %>%
ggplot(data=., aes(Date,n)) +
geom_point(size=0.75,alpha=0.6) +
geom_vline(data=bank_holidays,aes(xintercept=as.numeric(Date)), linetype=1,size=2,col="lightblue",alpha=0.5) +
geom_hline(data=time_series_limits,aes(yintercept=mean_accidents), linetype=2,size=0.5,col="red") +
geom_hline(data=time_series_limits,aes(yintercept=UCL), linetype=2,size=0.5,col="grey") +
geom_hline(data=time_series_limits,aes(yintercept=LCL), linetype=2,size=0.5,col="grey") +
geom_line(alpha=0.6) +
geom_line(aes(Date,Rollmean_Accidents),col="red",size=1) +
scale_x_date(date_breaks="1 month", date_labels="%b-%y") +
theme(panel.grid.minor = element_blank(),panel.grid.major.y = element_blank()) +
labs(title="Time Series of Number of Accidents by Date",
x="Date",y="Number of Accidents")
What happens to accident timings over the course of a week?
weekday_hour_summary <- accidents %>%
mutate("Hour"=as.numeric(substr(as.character(Time),1,2))) %>%
group_by(Date,Day_of_Week_Name,Hour) %>%
summarise(
"n"=n()
) %>%
group_by(Day_of_Week_Name,Hour) %>%
summarise("mean_number_of_accidents"=mean(n))
ggplot(weekday_hour_summary, aes(Hour,mean_number_of_accidents)) +
geom_col() +
facet_wrap(~Day_of_Week_Name) +
labs(title="Time Series of Mean Number of Accidents by Hour of the Day",
x="Hour of the Day",y="Mean Number of Accidents")
In this section we will calculate a rate based on a numerator of the number of individuals killed or seriously injured (KSI) in accidents and a denominator of the total length of road in a given local authority.
We obtain these figures from the casualties dataset which accompanies the accidents dataset.
casualties <- read.csv("RoadSafetyData_2015\\Casualties_2015.csv", stringsAsFactors=FALSE)
# Assign severity description
casualties$Casualty_Severity <- ifelse(casualties$Casualty_Severity==1,"Fatal",
ifelse(casualties$Casualty_Severity==2,"Serious",
ifelse(casualties$Casualty_Severity==3,"Slight",NA
)))
# There are 1759 casualty records which don't match a corresponding accident
# (presumably because we excluded Scotland from the accidents data earlier)
casualties %>%
filter(Casualty_Severity!="Slight") %>%
left_join(accidents) %>%
filter(is.na(ctyua15cd)) %>%
group_by() %>%
summarise("ksi"=n())
## Joining, by = "Accident_Index"
## # A tibble: 1 x 1
## ksi
## <int>
## 1 1759
# Create a ctyua15 level count of casualties, excluding the 1759 which don't match
(ksi_casualties_ctyua <- casualties %>%
filter(Casualty_Severity!="Slight") %>%
left_join(accidents) %>%
group_by(ctyua15cd) %>%
summarise("ksi"=n()) %>%
filter(!is.na(ctyua15cd)))
## Joining, by = "Accident_Index"
## # A tibble: 173 x 2
## ctyua15cd ksi
## <chr> <int>
## 1 E06000001 38
## 2 E06000002 53
## 3 E06000003 50
## 4 E06000004 66
## 5 E06000005 32
## 6 E06000006 32
## 7 E06000007 89
## 8 E06000008 78
## 9 E06000009 59
## 10 E06000010 105
## # ... with 163 more rows
We're going to try a very basic denominator, but one which still feels a lot more realistic than using an area's population: road length by local authority. This could be refined by matching the road types between the STATS19 data and the road length report (perhaps some form of standardisation would even be possible), but for now we'll just use overall road length as a starter for ten.
All road length data taken from https://www.gov.uk/government/statistics/road-lengths-in-great-britain-2015
# Compare this to what is recorded in the National Statistics Road lengths in Great Britain: 2015 report
# I've cleaned this up in Excel to create a simple csv file (for shame!)
# Based on table RDL0202a of the report, all distances in km
road_lengths <- read.csv("F:\\Statistics\\STATS19\\road-lengths-in-great-britain-2015\\rdl0202a_2015.csv",stringsAsFactors=F)
road_lengths <- road_lengths %>%
filter(substr(la_code,1,1)!="S") # Remove Scotland
a <- road_lengths[,c(1,2,3)]
a$type <- "motorways"
names(a)[3] <- "length_km"
b <- road_lengths[,c(1,2,4)]
b$type <- "a_roads"
names(b)[3] <- "length_km"
c <- road_lengths[,c(1,2,5)]
c$type <- "minor_roads"
names(c)[3] <- "length_km"
road_lengths <- rbind(a,b,c)
rm(list=c("a","b","c"))
road_lengths <- arrange(road_lengths,la_name)
head(road_lengths,9)
## la_code la_name length_km type
## 1 E09000002 Barking and Dagenham 0.0 motorways
## 2 E09000002 Barking and Dagenham 37.1 a_roads
## 3 E09000002 Barking and Dagenham 300.1 minor_roads
## 4 E09000003 Barnet 12.0 motorways
## 5 E09000003 Barnet 96.4 a_roads
## 6 E09000003 Barnet 648.8 minor_roads
## 7 E08000016 Barnsley 17.3 motorways
## 8 E08000016 Barnsley 150.6 a_roads
## 9 E08000016 Barnsley 1046.8 minor_roads
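As an aside, the manual rbind reshape above could be written more compactly with tidyr (loaded at the start). A sketch, assuming columns 3 to 5 of the wide csv hold the motorway, A road and minor road lengths (note the type values would keep the csv's original column names):
# road_lengths <- read.csv("F:\\Statistics\\STATS19\\road-lengths-in-great-britain-2015\\rdl0202a_2015.csv",stringsAsFactors=F) %>%
#   filter(substr(la_code,1,1)!="S") %>%
#   gather(key="type", value="length_km", 3:5) %>%
#   arrange(la_name)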
Note that we have to be careful with the Isles of Scilly: they recorded no KSI casualties, so the join below leaves an NA which we replace with a zero rate.
ksi_casualties_ctyua <- road_lengths %>%
group_by(la_code) %>%
summarise("length_km"=sum(length_km)) %>%
left_join(ksi_casualties_ctyua,by=c("la_code"="ctyua15cd")) %>%
mutate("ksi_per_1000km"=(ksi/length_km)*1000)
# Because the Isles of Scilly had no KSI casualties, we have to enter a zero rather than an NA
ksi_casualties_ctyua[which(is.na(ksi_casualties_ctyua$ksi_per_1000km)),]$ksi <- 0
ksi_casualties_ctyua[which(is.na(ksi_casualties_ctyua$ksi_per_1000km)),]$ksi_per_1000km <- 0
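# Equivalent using tidyr::replace_na (a sketch):
# ksi_casualties_ctyua <- ksi_casualties_ctyua %>%
#   mutate("ksi"=replace_na(ksi, 0),
#          "ksi_per_1000km"=replace_na(ksi_per_1000km, 0))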
# Check the histogram
ggplot(ksi_casualties_ctyua,aes(ksi_per_1000km)) +
geom_histogram(bins=100) +
labs(title="Histogram of KSI per 1000km",x="KSI per 1000km",y="Count")
Here we use the leaflet package to generate an interactive map. This shows the following:

* Boundaries for counties and unitary authorities, shaded based on deciles of the number killed or seriously injured per 1000km of road
* Clustered points showing the locations of all accidents (not just those resulting in a casualty); note that you have to zoom in quite closely before the points will split out to show individual accidents

I suspect really we're just mapping busy-ness here but it's a start.
# Now we need to reproject our ctyua shapefile so that leaflet can display using latitude and longitude rather than Eastings and Northings.
CTYUA2015_BGC <- spTransform(CTYUA2015_BGC, CRS("+init=epsg:4326"))
#plot(CTYUA2015_BGC)
# Join the data to the shapefile
CTYUA2015_BGC <- merge(CTYUA2015_BGC, ksi_casualties_ctyua,by.x="ctyua15cd",by.y="la_code")
# Create a rounding function so figures can be displayed to one decimal place
trunc.round <- function(x){trunc((x*10)+0.5)/10}
# Create colour bins based on deciles
pal <- colorQuantile("YlOrRd", domain = CTYUA2015_BGC@data$ksi_per_1000km,n=10)
m <- leaflet(data=accidents) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(data=CTYUA2015_BGC,
fillColor = ~pal(ksi_per_1000km),
fillOpacity = 0.6,
weight=0.5,
color="black",
group="Local Authorities",
label=~htmlEscape(paste0(ctyua15nm,": ",trunc.round(ksi_per_1000km)," KSI per 1000km"))) %>%
addMarkers(lng=~Longitude,
lat=~Latitude,
label = ~htmlEscape(paste0("No. of vehicles: ",Number_of_Vehicles,", No. of Casualties: ",Number_of_Casualties)),
clusterOptions=markerClusterOptions(),
group="Individual Accidents") %>%
# Unfortunately there are too many points for the heatmap function to work
# (a sampled alternative is sketched after the map below).
# addHeatmap(lng=~Longitude,
# lat=~Latitude,
# blur = 20, max = 0.05, radius = 15,
# group="Heatmap"
# ) %>%
addLegend(pal = pal, values = CTYUA2015_BGC@data$ksi_per_1000km, opacity = 0.7, title = "Deciles",
position = "bottomright") %>%
addLayersControl(
overlayGroups = c("Individual Accidents", "Local Authorities"),
options = layersControlOptions(collapsed = FALSE)
) %>%
addMiniMap(toggleDisplay = TRUE, tiles=providers$CartoDB.Positron)
# %>% hideGroup(c("Individual Accidents"))
m
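As noted in the comments above, the full set of points overwhelms addHeatmap. One workaround (a sketch, untested at this scale) is to build the heatmap layer from a random sample of the accidents:
set.seed(1234) # For a reproducible sample
m %>%
addHeatmap(data=sample_n(accidents, 20000), # 20000 is an arbitrary sample size
lng=~Longitude, lat=~Latitude,
blur = 20, max = 0.05, radius = 15,
group="Heatmap")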