{r} setup, include=FALSE} chooseCRANmirror(graphics=FALSE, ind=1) knitr::opts_chunk$set(echo = TRUE)

Install required Packages

pckgs <- c("scales","readxl","hrbrthemes")
install.packages(pckgs, repos = "http://cran.us.r-project.org")  
## Installing packages into 'C:/Users/tharu/OneDrive/Documents/R/win-library/4.1'
## (as 'lib' is unspecified)
## package 'scales' successfully unpacked and MD5 sums checked
## package 'readxl' successfully unpacked and MD5 sums checked
## package 'hrbrthemes' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\tharu\AppData\Local\Temp\RtmpK25LFV\downloaded_packages
library("readxl")
library(scales)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
library(ggplot2)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Load Accidents Data Guide

Data_Guide <- read_excel("~/R_Files/Data_Guide.xls")

Load Accidents Data from Accidents.csv

Accidents <- read.csv("~/R_Files/Accidents.csv")

Data Cleaning

acc<- drop_na(Accidents)
Accidents<- subset(acc,Road_Type!="-1" & Junction_Detail!="-1" & Junction_Control!="-1" & Pedestrian_Crossing_Human_Control!="-1" & Pedestrian_Crossing_Physical_Facilities!="-1" & Light_Conditions!="-1" & Weather_Conditions!="-1" & Road_Surface_Conditions!="-1" & Special_Conditions_at_Site!="-1" & Carriageway_Hazards!="-1")

Data Check

head(Accidents)

Number of Accidents based on Road Type

  Acc_RoadType <- Accidents %>% dplyr::count(Road_Type)
  print(Acc_RoadType)
##   Road_Type     n
## 1         1  7853
## 2         2  2072
## 3         3  9020
## 4         6 52056
## 5         7   985
## 6         9   424
  Road_percent  <- round(Acc_RoadType['n'] / NROW(Accidents) * 100)
  print(Road_percent)
##    n
## 1 11
## 2  3
## 3 12
## 4 72
## 5  1
## 6  1
  Road_Type_Guide <- read_excel("~/R_Files/Data_Guide.xls", sheet = 9)
  Road_types <- Road_Type_Guide['label'][1:7,1]
  print(Road_types)
## # A tibble: 7 x 1
##   label                   
##   <chr>                   
## 1 Roundabout              
## 2 One way street          
## 3 Dual carriageway        
## 4 Single carriageway      
## 5 Slip road               
## 6 Unknown                 
## 7 One way street/Slip road
  library(RColorBrewer)
  myPalette <- brewer.pal(9, "Pastel1") 
  pie(Road_percent$n, labels = Acc_RoadType$n , border = "white" , col=myPalette, main = "Number of Accidents based on Road Types" , radius = 1.02, edges =6)
  legend("topleft",fill = myPalette,  legend =Road_types$label ,xpd=TRUE, inset=c(0, -0.0), cex=.65 )

From the above plot, We can observe that the maximum number of accidents happened on the road type “Single Carriageway” and least number of accidents happened on the road type “One Way Street/Slip road”.

Speed Limit VS Number of Accidents

Speed_Limit_Count <- Accidents %>% dplyr::count(Speed_limit)
barplot(height=Speed_Limit_Count$n,names=Speed_Limit_Count$Speed_limit, 
        col=myPalette,
        xlab="Speed Limit", 
        ylab="Number of Accidents", 
        main="Speed Limit VS Number of Accidents", 
        ylim=c(0,90000)
        )

It can be observed that maximum number of accidents on the roads having a speed limit of 30 while the roads having speed limit as 70 has least number of accidents recorder.

Junction Type VS Number of Accidents

Junction_Detail_Count <- Accidents %>% dplyr::count(Junction_Detail)
Junction_Detail_Guide <- read_excel("~/R_Files/Data_Guide.xls", sheet = 10)
Junction_Detail_Count = merge(x = Junction_Detail_Count, y = Junction_Detail_Guide, by.x = "Junction_Detail", by.y = "code", all.x = TRUE, all.y = FALSE)
print(Junction_Detail_Count)
##   Junction_Detail     n                               label
## 1               0    19 Not at junction or within 20 metres
## 2               1 10462                          Roundabout
## 3               2  1474                     Mini-roundabout
## 4               3 37366             T or staggered junction
## 5               5  2002                           Slip road
## 6               6 11916                          Crossroads
## 7               7  1458   More than 4 arms (not roundabout)
## 8               8  3145           Private drive or entrance
## 9               9  4568                      Other junction
barplot(height=Junction_Detail_Count$n,names=Junction_Detail_Count$label, 
        col=myPalette,
        xlab="Junction Type", 
        ylab="Number of Accidents", 
        main="Junction Type VS Number of Accidents", 
        ylim = c(0,40000) ,xaxt = 'n'
        )
legend("topright", legend=Junction_Detail_Count$label, fill = myPalette,  xpd=TRUE, inset=c(0, -0.0), cex=.65)

T or Staggered Junction

Junction Control VS Number of Accidents

Junction_Control_Count <- Accidents %>% dplyr::count(Junction_Control)
Junction_Control_Guide <- read_excel("~/R_Files/Data_Guide.xls", sheet = 11)
Junction_Control_Count = merge(x = Junction_Control_Count, y = Junction_Control_Guide, by.x = "Junction_Control", by.y = "code", all.x = TRUE, all.y = FALSE)
print(Junction_Detail_Count)
##   Junction_Detail     n                               label
## 1               0    19 Not at junction or within 20 metres
## 2               1 10462                          Roundabout
## 3               2  1474                     Mini-roundabout
## 4               3 37366             T or staggered junction
## 5               5  2002                           Slip road
## 6               6 11916                          Crossroads
## 7               7  1458   More than 4 arms (not roundabout)
## 8               8  3145           Private drive or entrance
## 9               9  4568                      Other junction
barplot(height=Junction_Control_Count$n,names=Junction_Control_Count$label, 
        col=myPalette,
        xlab="Junction Control", 
        ylab="Number of Accidents", 
        main="Junction Control VS Number of Accidents", 
        ylim = c(0,60000) ,xaxt = 'n'
        )
legend("topleft", legend=Junction_Control_Count$label, fill = myPalette,  xpd=TRUE, inset=c(0, -0.0), cex=.65)

Juntion control

Number of Accidents based on Light Conditions

Acc_LightConditions <- Accidents %>% dplyr::count(Light_Conditions)
  print(Acc_LightConditions)
##   Light_Conditions     n
## 1                1 51906
## 2                4 17059
## 3                5   482
## 4                6  1537
## 5                7  1426
  Light_Percent  <- round(Acc_LightConditions['n'] / NROW(Accidents) * 100)
  print(Light_Percent)
##    n
## 1 72
## 2 24
## 3  1
## 4  2
## 5  2
  Acc_LightConditions$Percent <- Light_Percent$n
  print(Acc_LightConditions)
##   Light_Conditions     n Percent
## 1                1 51906      72
## 2                4 17059      24
## 3                5   482       1
## 4                6  1537       2
## 5                7  1426       2
  Light_Conditions_Guide <- read_excel("~/R_Files/Data_Guide.xls", sheet = 15)
  Light_Conditions <- merge(x = Acc_LightConditions, y = Light_Conditions_Guide, by.x = "Light_Conditions", by.y = "code", all.x = TRUE, all.y = TRUE)
  print(Light_Conditions)
##   Light_Conditions     n Percent                        label
## 1               -1    NA      NA Data missing or out of range
## 2                1 51906      72                     Daylight
## 3                4 17059      24        Darkness - lights lit
## 4                5   482       1      Darkness - lights unlit
## 5                6  1537       2       Darkness - no lighting
## 6                7  1426       2  Darkness - lighting unknown
  pie(Acc_LightConditions$Percent, labels = Acc_LightConditions$Percent , border = "white" , col=myPalette, main = "% Number of Accidents based on Light Conditions", radius = 1.0)
  legend("topright",fill = myPalette,  legend = Light_Conditions$label,xpd=TRUE, inset=c(0, -0.0), cex=.65 )

> Light Conditions

Number of Accidents based on Weather Conditions

Acc_WeatherConditions <- Accidents %>% dplyr::count(Weather_Conditions)
  print(Acc_WeatherConditions)
##   Weather_Conditions     n
## 1                  1 59646
## 2                  2  8076
## 3                  3   264
## 4                  4   663
## 5                  5   507
## 6                  6    42
## 7                  7   280
## 8                  8  1412
## 9                  9  1520
  Weather_Percent  <- round(Acc_WeatherConditions['n'] / NROW(Accidents) * 100, digits = 2)
  print(Weather_Percent)
##       n
## 1 82.37
## 2 11.15
## 3  0.36
## 4  0.92
## 5  0.70
## 6  0.06
## 7  0.39
## 8  1.95
## 9  2.10
  Acc_WeatherConditions$Percent <- Weather_Percent$n
  print(Acc_WeatherConditions)
##   Weather_Conditions     n Percent
## 1                  1 59646   82.37
## 2                  2  8076   11.15
## 3                  3   264    0.36
## 4                  4   663    0.92
## 5                  5   507    0.70
## 6                  6    42    0.06
## 7                  7   280    0.39
## 8                  8  1412    1.95
## 9                  9  1520    2.10
  Weather_Conditions_Guide <- read_excel("~/R_Files/Data_Guide.xls", sheet = 16)
  Weather_Conditions <- merge(x = Acc_WeatherConditions, y = Weather_Conditions_Guide, by.x = "Weather_Conditions", by.y = "code", all.x = TRUE, all.y = TRUE)
  print(Weather_Conditions)
##    Weather_Conditions     n Percent                        label
## 1                  -1    NA      NA Data missing or out of range
## 2                   1 59646   82.37           Fine no high winds
## 3                   2  8076   11.15        Raining no high winds
## 4                   3   264    0.36        Snowing no high winds
## 5                   4   663    0.92            Fine + high winds
## 6                   5   507    0.70         Raining + high winds
## 7                   6    42    0.06         Snowing + high winds
## 8                   7   280    0.39                  Fog or mist
## 9                   8  1412    1.95                        Other
## 10                  9  1520    2.10                      Unknown
  pie(Acc_WeatherConditions$Percent, labels = "" , border = "white" , col=myPalette, main = "Number of Accidents based on Weather Conditions", radius = 1 )
  legend("topright",fill = myPalette,  legend = Weather_Conditions$label,xpd=TRUE, inset=c(0, -0.0), cex=.65 )

Weather conditons

Road Surface Conditions VS Number of Accidents

Road_Surface_Count <- Accidents %>% dplyr::count(Road_Surface_Conditions)
Road_Surface_Guide <- read_excel("~/R_Files/Data_Guide.xls", sheet = 17)
Road_Surface_Count = merge(x = Road_Surface_Count, y = Road_Surface_Guide, by.x = "Road_Surface_Conditions", by.y = "code", all.x = TRUE, all.y = FALSE)
print(Road_Surface_Count)
##   Road_Surface_Conditions     n                label
## 1                       1 53348                  Dry
## 2                       2 17971          Wet or damp
## 3                       3   160                 Snow
## 4                       4   915         Frost or ice
## 5                       5    16 Flood over 3cm. deep
barplot(height=Road_Surface_Count$n,names=Road_Surface_Count$label, 
        
        col=myPalette,
        xlab="", 
        ylab="", 
        main="Road Surface Conditions VS Number of Accidents"
        )
legend("topright", legend=Road_Surface_Count$label, fill = myPalette,  xpd=TRUE, inset=c(0, -0.0), cex=.65)

Road Surface Conditions VS Number of Accidents

Number Accidents based on day of the Week

Day_Count <- Accidents %>% dplyr::count(Day_of_Week)
Day_Guide <- read_excel("~/R_Files/Data_Guide.xls", sheet = 5)
Day_Count = merge(x = Day_Count, y = Day_Guide, by.x = "Day_of_Week", by.y = "code", all.x = TRUE, all.y = FALSE)
print(Day_Count)
##   Day_of_Week     n     label
## 1           1  7902    Sunday
## 2           2 10036    Monday
## 3           3 11034   Tuesday
## 4           4 10964 Wednesday
## 5           5 11084  Thursday
## 6           6 11918    Friday
## 7           7  9472  Saturday
  ggplot( Day_Count,aes(x=label, y=n, group = 1,)) +
    geom_line( color="grey",size=2, alpha=0.9, linetype=1) +
    geom_point(shape=21, color="black", fill="#69b3a2", size=3) +
    theme_classic()+
    ggtitle("Number Accidents based on day of the Week")+
    xlab("Day of Week")+
    ylab("Number of Accidents")+
    xlim("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday")

Number Accidents based on day of the Week

Heat Map of Number of Accidents VS Time of Day VS Day of Week

grouped_data <- aggregate(Accidents, by=list(Accidents$Day_of_Week, Accidents$Time), FUN=length);
grouped_data$Group.2<- hm(grouped_data$Group.2)
## Warning in .parse_hms(..., order = "HM", quiet = quiet): Some strings failed to
## parse, or all strings are NAs
grouped_data
ggplot(grouped_data, aes(x=Group.2, y=Group.1)) + geom_tile(aes(fill=Latitude))+
  scale_x_continuous(expand = c(0, 0))+
  scale_y_continuous(expand = c(0, 0))
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in min(diff(sort(x))): no non-missing arguments to min; returning Inf
## Warning: Removed 8514 rows containing missing values (geom_tile).

Number of Accidents VS Time of Day

Time_Count <- Accidents %>% dplyr::count(Time)
Time_Count$Time<- hm(Time_Count$Time)
## Warning in .parse_hms(..., order = "HM", quiet = quiet): Some strings failed to
## parse, or all strings are NAs
Time_Count
ggplot( Time_Count,aes(x=Time, y=n, group = 1,)) +
    geom_line( color="grey",size=2, alpha=0.9, linetype=1)
## Warning: Removed 1433 row(s) containing missing values (geom_path).

Top 10 Local_Authority_Districts having more Number of Accidents

District_Count<- Accidents %>% dplyr::count(Local_Authority_District)
District_Guide <- read_excel("~/R_Files/Data_Guide.xls", sheet = 6)
District_Count = merge(x = District_Count, y = District_Guide, by.x = "Local_Authority_District", by.y = "code", all.x = TRUE, all.y = FALSE)
names(District_Count)[names(District_Count) == 'n'] <- 'NumberOfAccidents'
names(District_Count)[names(District_Count) == 'label'] <- 'Name'
District_Count_Sort <- District_Count[order(District_Count$NumberOfAccidents),]
tail(District_Count_Sort, 10)

Local_Authority_Districts having most Number of Accidents Analysis

Top 10 Local_Authority_Districts having least Number of Accidents

District_Count<- Accidents %>% dplyr::count(Local_Authority_District)
District_Guide <- read_excel("~/R_Files/Data_Guide.xls", sheet = 6)
District_Count = merge(x = District_Count, y = District_Guide, by.x = "Local_Authority_District", by.y = "code", all.x = TRUE, all.y = FALSE)
names(District_Count)[names(District_Count) == 'n'] <- 'NumberOfAccidents'
names(District_Count)[names(District_Count) == 'label'] <- 'Name'
District_Count_Sort <- District_Count[order(District_Count$NumberOfAccidents),]
head(District_Count_Sort, 10)

Local_Authority_Districts having least Number of Accidents Analysis