Introduction

National Collision Database (NCDB) – a database containing all police-reported motor vehicle collisions on public roads in Canada. Selected variables (data elements) relating to fatal and injury collisions for the collisions from 1999 to the most recent available data. The dataset and data description can be obtained from this link.

The aim of this project is to practice my data manipulation and data visualization skills.

library(data.table) #used fread function to read large dataset
library(dplyr) #data manipulation
## Warning: package 'dplyr' was built under R version 3.5.1
library(ggplot2) #visualizations
library(gridExtra) #viewing multiple plots together
library(knitr) #Create nicely formatted output tables
library(kableExtra) #Create nicely formatted output tables
library(formattable) #For the color_tile function
library(DT) #for nicely formatted output tables
library(ggthemes) #for ggplot theme
library(viridis) # to get scale_fill_viridis

collision <- fread("NCDB_1999_to_2016.csv", stringsAsFactors = FALSE)

glimpse(collision)
## Observations: 6,486,831
## Variables: 23
## $ C_YEAR <int> 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1...
## $ C_MNTH <chr> "01", "01", "01", "01", "01", "01", "01", "01", "01", "...
## $ C_WDAY <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", ...
## $ C_HOUR <chr> "20", "20", "20", "08", "08", "17", "17", "17", "17", "...
## $ C_SEV  <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ C_VEHS <chr> "02", "02", "02", "01", "01", "03", "03", "03", "03", "...
## $ C_CONF <chr> "34", "34", "34", "01", "01", "QQ", "QQ", "QQ", "QQ", "...
## $ C_RCFG <chr> "UU", "UU", "UU", "UU", "UU", "QQ", "QQ", "QQ", "QQ", "...
## $ C_WTHR <chr> "1", "1", "1", "5", "5", "1", "1", "1", "1", "1", "1", ...
## $ C_RSUR <chr> "5", "5", "5", "3", "3", "2", "2", "2", "2", "5", "5", ...
## $ C_RALN <chr> "3", "3", "3", "6", "6", "1", "1", "1", "1", "U", "U", ...
## $ C_TRAF <chr> "03", "03", "03", "18", "18", "01", "01", "01", "01", "...
## $ V_ID   <chr> "01", "02", "02", "01", "99", "01", "02", "02", "03", "...
## $ V_TYPE <chr> "06", "01", "01", "01", "NN", "01", "01", "01", "01", "...
## $ V_YEAR <chr> "1990", "1987", "1987", "1986", "NNNN", "1984", "1991",...
## $ P_ID   <chr> "01", "01", "02", "01", "01", "01", "01", "02", "01", "...
## $ P_SEX  <chr> "M", "M", "F", "M", "M", "M", "M", "F", "M", "M", "F", ...
## $ P_AGE  <chr> "41", "19", "20", "46", "05", "28", "21", "UU", "UU", "...
## $ P_PSN  <chr> "11", "11", "13", "11", "99", "11", "11", "13", "11", "...
## $ P_ISEV <chr> "1", "1", "2", "1", "2", "1", "1", "2", "2", "1", "2", ...
## $ P_SAFE <chr> "UU", "UU", "02", "UU", "UU", "UU", "UU", "UU", "UU", "...
## $ P_USER <chr> "1", "1", "2", "1", "3", "1", "1", "2", "1", "1", "2", ...
## $ C_CASE <int> 752, 752, 752, 753, 753, 820, 820, 820, 820, 932, 932, ...

Based on the quick observation from the dataset, it seems like there is a lot of variables recode. Prior the analysis, we are going to remove irrelevant variables such as ID and case ID. For the purpose of the study we going to analyze data with recent five years.

#select data from 2011 to 2016
collision5 <- collision[collision$C_YEAR >= 2011]

#remove id

collision5 <- subset(collision5, select = -c(V_ID, P_ID, C_CASE))

glimpse(collision5)
## Observations: 1,888,670
## Variables: 20
## $ C_YEAR <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2...
## $ C_MNTH <chr> "01", "01", "01", "01", "01", "01", "01", "01", "01", "...
## $ C_WDAY <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", ...
## $ C_HOUR <chr> "10", "12", "00", "17", "17", "22", "16", "16", "13", "...
## $ C_SEV  <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2...
## $ C_VEHS <chr> "01", "01", "01", "02", "02", "01", "02", "02", "02", "...
## $ C_CONF <chr> "02", "04", "03", "21", "21", "04", "35", "35", "35", "...
## $ C_RCFG <chr> "03", "UU", "UU", "UU", "UU", "05", "02", "02", "02", "...
## $ C_WTHR <chr> "1", "1", "7", "1", "1", "1", "4", "4", "1", "1", "1", ...
## $ C_RSUR <chr> "3", "5", "3", "1", "1", "3", "3", "3", "1", "1", "2", ...
## $ C_RALN <chr> "2", "1", "1", "1", "1", "1", "4", "4", "1", "1", "2", ...
## $ C_TRAF <chr> "18", "UU", "UU", "18", "18", "18", "16", "16", "18", "...
## $ V_TYPE <chr> "01", "01", "01", "01", "01", "01", "01", "01", "01", "...
## $ V_YEAR <chr> "UUUU", "UUUU", "UUUU", "UUUU", "UUUU", "UUUU", "UUUU",...
## $ P_SEX  <chr> "M", "F", "F", "F", "M", "F", "M", "F", "F", "M", "F", ...
## $ P_AGE  <chr> "75", "21", "34", "50", "63", "26", "34", "20", "80", "...
## $ P_PSN  <chr> "11", "11", "11", "11", "11", "11", "11", "11", "11", "...
## $ P_ISEV <chr> "2", "2", "2", "2", "1", "2", "2", "2", "2", "1", "2", ...
## $ P_SAFE <chr> "NN", "02", "02", "02", "NN", "02", "02", "02", "02", "...
## $ P_USER <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "U", "1", ...

Data Cleaning

This section I am going to recode the variable into relevant structures and rename the variables into meaningful factor.

collision5$C_MNTH <- recode(collision5$C_MNTH, "01" = "January", "02" = "February", "03" = "March", "04" = "April", "05" = "May", "06" = "June", "07" = "July", "08" = "August", "09" = "September", "10" = "October", "11" = "November", "12" = "December", "UU" = "Unknown", "XX" = "No data")

collision5$C_WDAY <- recode(collision5$C_WDAY, '1' = "Monday", "2" = "Tuesday", "3" = "Wednesday", "4" = "Thursday", "5" = "Friday", "6" = "Saturday", "7" = "Sunday", "U" = "Unknown", "X" = "No data")

collision5$C_SEV <- recode(collision5$C_SEV, "1" = "Collision producing at least one fatality", "2" = "Collision producing non-fatal injury", "U" = "Unknown", "X" = "No data")
## Warning in recode.numeric(collision5$C_SEV, `1` = "Collision producing at
## least one fatality", : NAs introduced by coercion
collision5$C_CONF <- recode(collision5$C_CONF, "01" = "Hit a moving object", "02" = "Hit a stationary object", "03" = "Ran off left shoulder", "04" = "Ran off right shoulder", "05" = "Rollover on roadway", "06" = "Any other single vehicle collision", "21" = "Rear-end collision", "22" = "Side swipe", "23" = "Left turn conflict", "24" = "Right turn conflict", "25" = "Any other two vehicle conflict","31" = "Head-on collision", "32" = "Approaching side-swipe", "33" = "Left turn across opposing traffic", "34" = "Righ turn, including turning conflicts", "35" = "Right angle collision", "36" = "Any other two vehicle conflict, different direction", "41" = "Hit a parked motor vehicle", "QQ" = "Choice is other than the preceding values", "UU" = "Unknown", "XX" = "No data")

collision5$C_RCFG <- recode(collision5$C_RCFG, "01" = "No intersection", "02" = "At intercetion", "03" = "Intersection exit", "04" = "Railroad level crossing", "05" = "Bridge, overpass, viaduct", "06" = "Tunner or underpass", "07" = "Passing or climbing lane", "08" = "Ramp", "09" = "Traffic circle", "10" = "Express lane", "11" = "Collector lane", "12" = "Transfer lane", "QQ" = "Choice is other than the preceding values", "UU" = "Unknown", "XX" = "No data")

collision5$C_WTHR <- recode(collision5$C_WTHR, "1" = "Clear and sunny", "2" = "Cloudy", "3" = "Raining", "4" = "Snowing", "5" = "Freezing rain, sleet, hail", "6" = "Visibility limitation", "7" = "Strong wind", "Q" = "Choice is other than the preceding values", "U" = "Unknown", "X" = "No data")

collision5$C_RSUR <- recode(collision5$C_RSUR, "1" = "Dry, normal", "2" = "Wet", "3" = "Snow", "4" = "Slush, wet snow", "5" = "Icy", "6" = "Sand/gravel/dirt", "7" = "Muddy", "8" = "Oil", "9" = "Flooded", "Q" = "Choice is other than the preceding values", "U" = "Unknown", "X" = "No data")

collision5$C_RALN <- recode(collision5$C_RALN, "1" = "Straight and level", "2" = "Straight with gradient", "3" = "Curved and level", "4" = "Curved with gradient", "5" = "Top of hill or gradient", "6" = "Bottom of hill or gradient", "Q" = "Choice is other than the preceding values", "U" = "Unknown", "X" = "No data")

collision5$C_TRAF <- recode(collision$C_TRAF, "01" = "Traffic signals fully operational", "02" = "Traffic signals in flashing mode", "03" = "Stop sign", "04" = "Yield sign" ,"05" = "Warning sign", "06" = "Pedestrian crosswalk", "07" = "Police officer", "08" = "School guard, flagman", "09" = "School crossing", "10" = "Reduced speed zone", "11" = "No passing zone sign", "12" = "Markings on the road", "12" = "Markings on the road", "13" = "School bus stopped with school bus signal lights flashing", "14" = "School bus stopped with school bus signal lights not flashing", "15" = "Railway crossing with signals, or signals and gates", "16" = "Railway crossing with signs only", "17" = "Control device not specified", "18" = "No control present", "QQ" = "Choice is other than the preceding values", "UU" = "Unknown", "XX" = "No data")
## Warning in `[<-.data.table`(x, j = name, value = value): Supplied 6486831
## items to be assigned to 1888670 items of column 'C_TRAF' (4598161 unused)
collision5$V_TYPE <- recode(collision5$V_TYPE, "01" = "Light Duty Vehicle", "05" = "Panel/cargo van", "06" = "Other trucks and vans", "07" = "Unit trucks > 4536 KG", "08" = "Road tractor", "09" = "School bus", "10" = "Smaller school bus", "11" = "Urban and intercity bus", "14" = "Motorcycle and moped", "16" = "Off road vehicles", "17" = "Bicycle", "18" = "Purpose-build motohome", "19" = "Farm equipment", "20" = "Construction equipment", "21" = "Fire engine", "22" = "Snowmobile", "23" = "Street car", "NN" = "Not applicable", "QQ" = "Choice is other than the preceding values", "UU" = "Unknown", "XX" = "No data")

collision5$P_SEX <- recode(collision5$P_SEX, "F" = "Female", "M" = "Male", "N" = "Not applicable", "U" = "Unknown", "X" = "No data")

collision5$P_PSN <- recode(collision5$P_PSN, "11" = "Driver", "12" = "Front row, center", "13" = "Front row, right outboard, including motorcycle passenger in sidecar", "21" = "Second row, left outboard, including motorcycle passenger", "22" = "Second row, center", "23" = "Second row, right outboard", "31" = "Third row, left outboard", "32" = "Third row, center", "33" = "Third row, right outboard", "96" = "Position unknown, but the person was definitely an occupant", "97" = "Sitting on someone’s lap", "98" = "Outside passenger compartment", "99" = "Pedestrian", "NN" = "Not applicable", "QQ" = "Choice is other than the preceding value", "UU" = "Unknown", "XX" = "No data")

collision5$P_ISEV <- recode(collision5$P_ISEV, "1" = "No Injury", "2" = "Injury", "3" = "Fatality", "N" = "Not applicable", "U" = "Unknow", "X" = "No data")

collision5$P_SAFE <- recode(collision5$P_SAFE, "01" = "No safety device used or No child restraint used", "02" = "Safety device used or child restraint used", "09" = "Helment worn", "10" = "Reflective clothing worn", "11" = "Both helmet and reflective clothing used", "12" = "Other safety device", "13" = "No safety device", "NN" = "Not applicable", "QQ" = "Choice is other than the preceding values", "UU" = "Unknown", "XX" = "No data")

collision5$P_USER <- recode(collision5$P_USER, "1" = "Motor Vehicle Driver", "2" = "Motor Vehicle Passenger", "3" = "Pedestrian", "4" = "Bicyclist", "5" = "Motorcyclist", "U" = "Unknown/Not stated")

colnames(collision5) <- c("Year", "Month", "Weekday", "Hour", "Serverity", "Number_of_vechicle_involved", "Collision_configuration", "Roadway_configuration", "Weather_condition", "Road_surface", "Road_alignment", "Traffic_control", "Vehicle_type", "Vechicle_model_year", "Sex", "Age", "Position", "Medical_treatment_required", "Safety_device", "Road_user_class")
#Remove the values does not represent age
collision5 <-  collision5 %>%
  filter(Age != 'NN' & Age != "UU" )

collision5$Age <- as.numeric(collision5$Age)

character_vars <- lapply(collision5, class) == "character"
collision5[, character_vars] <- lapply(collision5[, character_vars], as.factor)

#remove unknown
clean_data <- collision5 %>%
  filter(Number_of_vechicle_involved != "UU")

#convert into numeric
clean_data$Number_of_vechicle_involved <- as.numeric(clean_data$Number_of_vechicle_involved)

glimpse(clean_data)
## Observations: 1,744,906
## Variables: 20
## $ Year                        <int> 2011, 2011, 2011, 2011, 2011, 2011...
## $ Month                       <fct> January, January, January, January...
## $ Weekday                     <fct> Monday, Monday, Monday, Monday, Mo...
## $ Hour                        <fct> 10, 12, 00, 17, 17, 22, 16, 16, 13...
## $ Serverity                   <fct> Collision producing non-fatal inju...
## $ Number_of_vechicle_involved <dbl> 1, 1, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2...
## $ Collision_configuration     <fct> Hit a stationary object, Ran off r...
## $ Roadway_configuration       <fct> Intersection exit, Unknown, Unknow...
## $ Weather_condition           <fct> Clear and sunny, Clear and sunny, ...
## $ Road_surface                <fct> Snow, Icy, Snow, Dry, normal, Dry,...
## $ Road_alignment              <fct> Straight with gradient, Straight a...
## $ Traffic_control             <fct> Stop sign, Stop sign, Stop sign, N...
## $ Vehicle_type                <fct> Light Duty Vehicle, Light Duty Veh...
## $ Vechicle_model_year         <fct> UUUU, UUUU, UUUU, UUUU, UUUU, UUUU...
## $ Sex                         <fct> Male, Female, Female, Female, Male...
## $ Age                         <dbl> 75, 21, 34, 50, 63, 26, 34, 20, 80...
## $ Position                    <fct> Driver, Driver, Driver, Driver, Dr...
## $ Medical_treatment_required  <fct> Injury, Injury, Injury, Injury, No...
## $ Safety_device               <fct> Not applicable, Safety device used...
## $ Road_user_class             <fct> Motor Vehicle Driver, Motor Vehicl...

After recode, renaming the variable and return to the correct data type, there are 1,744,978 observation and 20 variables in the cleaned dataset.

Exploratory Data Analysis

Before the analysis, let’s discover what is the most common collision that occur.

collision_group <- clean_data %>%
  group_by(Collision_configuration) %>%
  summarise(total = n()) %>%
  filter(Collision_configuration != "Unknown")

collision_group %>%
  ggplot(aes(x = reorder(Collision_configuration, total), y = total)) +
  geom_bar(stat = 'identity', fill = 'red') +
  geom_text(aes(label = total), stat = 'identity', vjust="inward",hjust="inward", size = 3) +
  coord_flip() +
  xlab('Collision Type') +
  ylab('Frequency') +
  scale_y_continuous(labels = comma) +
  ggtitle('Most common collision') +
  theme(plot.title = element_text(size = 12),
        axis.title = element_text(size = 10, face = "bold"))

As you can see rear-end collision is the most common collision occurs. Let’s check what is the most common collision across different years.

collision_group <- clean_data %>%
  group_by(Collision_configuration, Year) %>%
  summarise(total = n()) %>%
  filter(Collision_configuration != "Unknown")

collision_group %>%
  ggplot(aes(x = reorder(Collision_configuration, total), y = total)) +
  geom_bar(stat = 'identity', fill = 'red') +
  geom_text(aes(label = total), stat = 'identity', vjust="inward",hjust="inward", size = 3) +
  coord_flip() +
  xlab('Collistion Type') +
  ylab('Frequency') +
  ggtitle('Most common collision across the years') +
  theme(plot.title = element_text(size = 12),
        axis.title = element_text(size = 10, face = "bold")) +
  facet_wrap(.~Year)

Data for 2016

Let’s take te data from 2016 as example to examine which month, weekdays and hours where the collision most likely to occur.

data2016 <- clean_data[clean_data$Year >= 2016,]
month_count <- data2016 %>% 
  group_by(Month, Collision_configuration) %>% 
  summarise(Total = n())

month_count$Month <- ordered(month_count$Month, levels = c('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'))

month_count <- na.omit(month_count)

month_count %>%
  ggplot(aes(x = reorder(Collision_configuration, Total), y = Total)) +
  geom_bar(stat = 'identity', fill = 'red') +
  geom_text(aes(label = Total), stat = 'identity', hjust = "inward", size = 3) +
  coord_flip() +
  xlab('Collision Type') +
  ylab('Frequency') +
  ggtitle('Most common collision across the months') +
  theme(plot.title = element_text(size = 12),
        axis.title = element_text(size = 10, face = "bold")) +
  facet_wrap(.~Month)

Base on the observation, most of the collision happened around mid or end of the year in 2016. Now, lets examine the weekdays where most of the collision happend.

day_count <- data2016 %>% 
  group_by(Weekday, Collision_configuration) %>% 
  summarise(Total = n())

day_count$Weekday <- ordered(day_count$Weekday, levels = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

day_count <- na.omit(day_count)

day_count %>%
  ggplot(aes(x = reorder(Collision_configuration, Total), y = Total)) +
  geom_bar(stat = 'identity', fill = 'red') +
  geom_text(aes(label = Total), stat = 'identity', hjust = "inward", size = 3) +
  coord_flip() +
  xlab('Collision Type') +
  ylab('Frequency') +
  ggtitle('Most common collision across the day') +
  theme(plot.title = element_text(size = 12),
        axis.title = element_text(size = 10, face = "bold")) +
  facet_wrap(.~Weekday)

As you can see, Thursday, Friday and Saturday have the highest collision count compared to other days. Is this because it is almost weekend and everyone is recklessly driving? Maybe we can gain more insight by looking at the hour where the most collision occurs.

hour_count <- data2016 %>%
  group_by(Hour, Collision_configuration)%>%
  summarise(n=n())

ggplot(aes(x=Hour, y=n), data = hour_count) + geom_line(size = 2.5, alpha = 0.7, color = "black", group=1) + 
  geom_point(size = 0.5) + 
  ggtitle('Total collision by Hour of Day in 2016') +
  ylab('Number of Occurrences') +
  xlab('Hour(24-hour clock)') +
  theme_bw() +
  theme(plot.title = element_text(size = 16),
        axis.title = element_text(size = 12, face = "bold"))

Seems like the road collision is most likely occurs after the work hours. Is this maybe everyone are trying to rush home.

Lastly, we are going to examine the relationship between weather and road condition to the collision occurence.

weather_count <- data2016 %>% 
  group_by(Weather_condition, Collision_configuration) %>% 
  summarise(Total = n())

weather_count <- na.omit(weather_count)

weather_count %>%
  ggplot(aes(x = reorder(Collision_configuration, Total), y = Total)) +
  geom_bar(stat = 'identity', fill = 'red') +
  geom_text(aes(label = Total), stat = 'identity', hjust = "inward", size = 3) +
  coord_flip() +
  xlab('Collision Type') +
  ylab('Frequency') +
  ggtitle('Most common collision across different weather conditions') +
  theme(plot.title = element_text(size = 12),
        axis.title = element_text(size = 10, face = "bold")) +
  facet_wrap(.~Weather_condition)

Suprisingly, most of the collision happen during sunny day. Isn’t it interesting? I would expect collision happened more during raining day or snowing day.

Let’s check the road surface.

road_count <- data2016 %>% 
  group_by(Road_surface, Collision_configuration) %>% 
  summarise(Total = n())

road_count <- na.omit(road_count)


road_count %>%
  ggplot(aes(x = reorder(Collision_configuration, Total), y = Total)) +
  geom_bar(stat = 'identity', fill = 'red') +
  geom_text(aes(label = Total), stat = 'identity', hjust = "inward", size = 3) +
  coord_flip() +
  xlab('Collision Type') +
  ylab('Frequency') +
  ggtitle('Most common collision across different raod condition') +
  theme(plot.title = element_text(size = 12),
        axis.title = element_text(size = 10, face = "bold")) +
  facet_wrap(.~Road_surface)

Intresting enough that most of the collision happen during dry and normal raod. Could this because the reckless driver on the road. More indepth analysis need to be investigate in this area.