I have downloaded the UK accident dataset from data.gov.uk here; I've downloaded 10 years of historical data, from 2005 to 2014, for this notebook. There are three datasets plus a lookup file:
- Accidents: CSV file, the main dataset, 1.6M records. Contains information about date, time, day of the week, accident severity, location, weather, road type…
- Vehicles: CSV file, 3M records. Contains information about vehicle type, driver age, driver sex, whether the vehicle is rented…
- Casualties: CSV file, 2.2M records. Contains information about the casualties (pedestrian location and movement, bus passenger status…)
- Lookup: XLS file. All the data variables are coded rather than stored as text strings; this lookup file contains each code and its associated text string.
There is a unique identifier, "Accident_Index", to merge the three datasets; an accident can have more than one vehicle or more than one casualty related to it.
Throughout this notebook, for performance reasons, I only used the Accidents and Vehicles datasets, but I've also created the code to clean and merge the Casualties dataset if you wish to explore it, so feel free to fork my notebook, improve it and share your findings.
Overall the files are pretty good apart from a few missing values, so the main tasks here will be:
- Merge the datasets together
- Find a way to match each variable code with the lookup file, which contains the variables' text strings
- Add some new features to better explore the accident dataset
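Before diving in, here are the packages this notebook relies on; this is a sketch inferred from the functions called below, so adjust to taste.
#Packages assumed throughout this notebook (inferred from the functions used)
library(data.table)   #fread, setnames
library(XLConnect)    #loadWorkbook, getSheets, readWorksheet
library(stringr)      #str_replace_all
library(dplyr)        #group_by, summarize, filter, mutate
library(ggplot2)      #plots and map_data
library(scales)       #percent
library(lubridate)    #year, month, week, yday, ymd, decimal_date
library(forecast)     #msts, auto.arima, stlf, tbats, stlm, nnetar, naive
library(maps)         #world.cities
library(SearchTrees)  #createTree, knnLookup
library(RColorBrewer) #brewer.pal
library(viridis)      #scale_color_viridis
library(plotly)       #ggplotly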
getwd()
## [1] "H:/R Projetcs/UK_Accident"
setwd("H:\\R Projetcs\\UK_Accident")
Accidents<-fread("Accidents0514.csv",header = TRUE,sep = ",")
##
## Read 1640597 rows and 32 (of 32) columns from 0.210 GB file in 00:00:09
#Casualties<-fread("Casualties0514.csv",header = TRUE,sep = ",")
Vehicles<-fread("Vehicles0514.csv",header = TRUE,sep = ",")
##
## Read 3004425 rows and 22 (of 22) columns from 0.177 GB file in 00:00:04
#Open the Excel file
wb <- loadWorkbook('Road-Accident-Safety-Data-Guide.xls')
#get worksheet names
wsht <- getSheets(wb)
#remove first two worksheets
wsht<-wsht[-c(1,2)]
#create my mapping df
mapping <- data.frame( Code=character(), Label=character(),Variable=character(), stringsAsFactors=FALSE)
#loop through each worksheet and insert sheetname, code and label into the mapping df
for (ws in wsht) {
dat <- readWorksheet(wb, ws)
#replace spaces by "_"
dat$Variable<-str_replace_all(ws, fixed(" "), "_")
colnames(dat) <- c("Code","Label","Variable")
mapping<-rbind(mapping,dat)
}
mapping<-mapping[c("Variable", "Code", "Label")]
In order to match the dataset variables with the lookup df, we need the same column names on both sides, so here I have to rename some columns manually. In addition, as R is case-sensitive, we need to make sure all our column names are lowercase.
#Casualties
rename.caslts.cols<-function(){
#lower case all column names (setnames modifies the data.table by reference)
setnames(Casualties, tolower(names(Casualties)))
#bus_passenger
setnames(Casualties, "bus_or_coach_passenger", "bus_passenger")
#ped_location
setnames(Casualties,"pedestrian_location", "ped_location")
#ped_movement
setnames(Casualties,"pedestrian_movement", "ped_movement")
#ped_road_maintenance_worker
setnames(Casualties,"pedestrian_road_maintenance_worker", "ped_road_maintenance_worker")
}
#Vehicles
#rename.veh.cols<-function(){
#lower case all column names (by reference)
setnames(Vehicles, tolower(names(Vehicles)))
#journey_purpose
setnames(Vehicles, "journey_purpose_of_driver", "journey_purpose")
#propulsion_code
setnames(Vehicles,"propulsion_code", "vehicle_propulsion_code")
#veh_leaving_carriageway
setnames(Vehicles,"vehicle_leaving_carriageway", "veh_leaving_carriageway")
#vehicle_location
setnames(Vehicles,"vehicle_location-restricted_lane", "vehicle_location")
#}
#Accident
#rename.acc.cols<-function(){
#lower case all column names (by reference)
setnames(Accidents, tolower(names(Accidents)))
#police_officer_attend
setnames(Accidents,"did_police_officer_attend_scene_of_accident", "police_officer_attend")
#ped_cross_-_human
setnames(Accidents, "pedestrian_crossing-human_control", "ped_cross_human")
#ped_cross_-_physical
setnames(Accidents,"pedestrian_crossing-physical_facilities", "ped_cross_facilities")
#road_surface
setnames(Accidents,"road_surface_conditions", "road_surface")
#urban_rural
setnames(Accidents,"urban_or_rural_area", "urban_rural")
#weather
setnames(Accidents,"weather_conditions", "weather")
#}
#mapping
#rename.map.cols<-function(){
#lower case all column names
mapping$Variable<-tolower(mapping$Variable)
#age_band
mapping$Variable<-ifelse(mapping$Variable=="age_band","age_band_of_driver",mapping$Variable)
#Remove age_of_casualty
mapping<-mapping[mapping$Variable!="age_of_casualty",]
#home_area_type
mapping$Variable<-ifelse(mapping$Variable=="home_area_type","driver_home_area_type",mapping$Variable)
#imd_decile
mapping$Variable<-ifelse(mapping$Variable=="imd_decile","driver_imd_decile",mapping$Variable)
#ped_cross_-_human
mapping$Variable<-ifelse(mapping$Variable=="ped_cross_-_human","ped_cross_human",mapping$Variable)
#ped_cross_-_physical
mapping$Variable<-ifelse(mapping$Variable=="ped_cross_-_physical","ped_cross_facilities",mapping$Variable)
#was_vehicle_left_hand_drive
mapping$Variable<-ifelse(mapping$Variable=="was_vehicle_left_hand_drive","left_hand_drive",mapping$Variable)
#}
#Create a function to merge my df
merge.all<- function(x, y) {
merge(x, y, all=TRUE, by=listCols)
}
#List of columns to merge on
listCols<-c(colnames(Accidents)[1])#only the first col in this case
#call the merge function
acc.uk<- Reduce(merge.all, list(Accidents,Vehicles))
#free memory
rm(Vehicles)
#rm(Casualties)
rm(Accidents)
#show the final dataset
str(acc.uk)
## Classes 'data.table' and 'data.frame': 3004425 obs. of 53 variables:
## $ accident_index : chr "200501BS00001" "200501BS00002" "200501BS00003" "200501BS00003" ...
## $ location_easting_osgr : int 525680 524170 524520 524520 526900 528060 524770 524770 524220 524220 ...
## $ location_northing_osgr : int 178240 181650 182240 182240 177530 179040 181160 181160 180830 180830 ...
## $ longitude : num -0.191 -0.212 -0.206 -0.206 -0.174 ...
## $ latitude : num 51.5 51.5 51.5 51.5 51.5 ...
## $ police_force : int 1 1 1 1 1 1 1 1 1 1 ...
## $ accident_severity : int 2 3 3 3 3 3 3 3 3 3 ...
## $ number_of_vehicles : int 1 1 2 2 1 1 2 2 2 2 ...
## $ number_of_casualties : int 1 1 1 1 1 1 1 1 1 1 ...
## $ date : chr "04/01/2005" "05/01/2005" "06/01/2005" "06/01/2005" ...
## $ day_of_week : int 3 4 5 5 6 2 3 3 5 5 ...
## $ time : chr "17:42" "17:36" "00:15" "00:15" ...
## $ local_authority_(district) : int 12 12 12 12 12 12 12 12 12 12 ...
## $ local_authority_(highway) : chr "E09000020" "E09000020" "E09000020" "E09000020" ...
## $ 1st_road_class : int 3 4 5 5 3 6 6 6 5 5 ...
## $ 1st_road_number : int 3218 450 0 0 3220 0 0 0 0 0 ...
## $ road_type : int 6 3 6 6 6 6 6 6 6 6 ...
## $ speed_limit : int 30 30 30 30 30 30 30 30 30 30 ...
## $ junction_detail : int 0 6 0 0 0 0 0 0 3 3 ...
## $ junction_control : int -1 2 -1 -1 -1 -1 -1 -1 4 4 ...
## $ 2nd_road_class : int -1 5 -1 -1 -1 -1 -1 -1 6 6 ...
## $ 2nd_road_number : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ped_cross_human : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ped_cross_facilities : int 1 5 0 0 0 0 0 0 0 0 ...
## $ light_conditions : int 1 4 4 4 1 7 1 1 4 4 ...
## $ weather : int 2 1 1 1 1 1 2 2 1 1 ...
## $ road_surface : int 2 1 1 1 1 2 2 2 1 1 ...
## $ special_conditions_at_site : int 0 0 0 0 0 0 6 6 0 0 ...
## $ carriageway_hazards : int 0 0 0 0 0 0 0 0 0 0 ...
## $ urban_rural : int 1 1 1 1 1 1 1 1 1 1 ...
## $ police_officer_attend : int 1 1 1 1 1 1 1 1 1 1 ...
## $ lsoa_of_accident_location : chr "E01002849" "E01002909" "E01002857" "E01002857" ...
## $ vehicle_reference : int 1 1 1 2 1 1 1 2 1 2 ...
## $ vehicle_type : int 9 11 11 9 9 3 9 3 3 9 ...
## $ towing_and_articulation : int 0 0 0 0 0 0 0 0 0 0 ...
## $ vehicle_manoeuvre : int 18 4 17 2 18 18 5 18 18 2 ...
## $ vehicle_location : int 0 0 0 0 0 0 0 0 0 0 ...
## $ junction_location : int 0 3 0 0 0 0 0 0 1 1 ...
## $ skidding_and_overturning : int 0 0 0 0 0 1 0 0 0 0 ...
## $ hit_object_in_carriageway : int 0 0 4 0 0 10 0 0 4 0 ...
## $ veh_leaving_carriageway : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hit_object_off_carriageway : int 0 0 0 0 0 0 0 0 0 0 ...
## $ 1st_point_of_impact : int 1 4 4 3 1 1 0 0 1 2 ...
## $ was_vehicle_left_hand_drive?: int 1 1 1 1 1 1 1 1 1 1 ...
## $ journey_purpose : int 15 1 1 15 15 15 15 15 15 15 ...
## $ sex_of_driver : int 2 1 1 1 2 1 1 2 1 1 ...
## $ age_of_driver : int 74 42 35 62 49 49 51 30 31 41 ...
## $ age_band_of_driver : int 10 7 6 9 8 8 8 6 6 7 ...
## $ engine_capacity_(cc) : int -1 8268 8300 1762 1769 85 2976 124 -1 4266 ...
## $ vehicle_propulsion_code : int -1 2 2 1 1 1 1 1 -1 1 ...
## $ age_of_vehicle : int -1 3 5 6 4 10 1 2 -1 4 ...
## $ driver_imd_decile : int 7 -1 2 1 2 -1 4 1 -1 6 ...
## $ driver_home_area_type : int 1 -1 1 1 1 -1 1 1 -1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "accident_index"
In order to make sure all my column names match the variable names in the lookup df, I've created a mapping function that returns either the matching columns (TRUE) or the columns that are not matching (FALSE).
#### Function to return mapped or unmapped columns
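The chunk defining check.mapping is not echoed in this notebook; a minimal sketch consistent with how it is called below (third argument TRUE returns the mapped columns, FALSE the unmapped ones) could be:
#Return the dataset columns that do (matched=TRUE) or do not (matched=FALSE)
#appear among the lookup variables
check.mapping<-function(variables, columns, matched=TRUE){
  if (matched) intersect(columns, variables)
  else setdiff(columns, variables)
}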
The loop below goes through only the variables associated with a column of our dataset (the lookup also contains variables for the Casualties dataset; using the mapping function above we can exclude those).
#Get only the columns that can be mapped with the variables in the lookup dataframe
mapp.cols<-check.mapping(unique(mapping$Variable),colnames(acc.uk),TRUE)
#this takes a while
for (col in mapp.cols) {
temp<-mapping[mapping$Variable==col,]
acc.uk[[col]]<-temp$Label[ match(acc.uk[[col]],temp$Code)]
}
str(acc.uk)
## Classes 'data.table' and 'data.frame': 3004425 obs. of 53 variables:
## $ accident_index : chr "200501BS00001" "200501BS00002" "200501BS00003" "200501BS00003" ...
## $ location_easting_osgr : int 525680 524170 524520 524520 526900 528060 524770 524770 524220 524220 ...
## $ location_northing_osgr : int 178240 181650 182240 182240 177530 179040 181160 181160 180830 180830 ...
## $ longitude : num -0.191 -0.212 -0.206 -0.206 -0.174 ...
## $ latitude : num 51.5 51.5 51.5 51.5 51.5 ...
## $ police_force : chr "Metropolitan Police" "Metropolitan Police" "Metropolitan Police" "Metropolitan Police" ...
## $ accident_severity : chr "Serious" "Slight" "Slight" "Slight" ...
## $ number_of_vehicles : int 1 1 2 2 1 1 2 2 2 2 ...
## $ number_of_casualties : int 1 1 1 1 1 1 1 1 1 1 ...
## $ date : chr "04/01/2005" "05/01/2005" "06/01/2005" "06/01/2005" ...
## $ day_of_week : chr "Tuesday" "Wednesday" "Thursday" "Thursday" ...
## $ time : chr "17:42" "17:36" "00:15" "00:15" ...
## $ local_authority_(district) : chr "Kensington and Chelsea" "Kensington and Chelsea" "Kensington and Chelsea" "Kensington and Chelsea" ...
## $ local_authority_(highway) : chr "Kensington and Chelsea" "Kensington and Chelsea" "Kensington and Chelsea" "Kensington and Chelsea" ...
## $ 1st_road_class : chr "A" "B" "C" "C" ...
## $ 1st_road_number : int 3218 450 0 0 3220 0 0 0 0 0 ...
## $ road_type : chr "Single carriageway" "Dual carriageway" "Single carriageway" "Single carriageway" ...
## $ speed_limit : int 30 30 30 30 30 30 30 30 30 30 ...
## $ junction_detail : chr "Not at junction or within 20 metres" "Crossroads" "Not at junction or within 20 metres" "Not at junction or within 20 metres" ...
## $ junction_control : chr "Data missing or out of range" "Auto traffic signal" "Data missing or out of range" "Data missing or out of range" ...
## $ 2nd_road_class : chr NA "C" NA NA ...
## $ 2nd_road_number : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ped_cross_human : chr "None within 50 metres " "None within 50 metres " "None within 50 metres " "None within 50 metres " ...
## $ ped_cross_facilities : chr "Zebra" "Pedestrian phase at traffic signal junction" "No physical crossing facilities within 50 metres" "No physical crossing facilities within 50 metres" ...
## $ light_conditions : chr "Daylight" "Darkness - lights lit" "Darkness - lights lit" "Darkness - lights lit" ...
## $ weather : chr "Raining no high winds" "Fine no high winds" "Fine no high winds" "Fine no high winds" ...
## $ road_surface : chr "Wet or damp" "Dry" "Dry" "Dry" ...
## $ special_conditions_at_site : chr "None" "None" "None" "None" ...
## $ carriageway_hazards : chr "None" "None" "None" "None" ...
## $ urban_rural : chr "Urban" "Urban" "Urban" "Urban" ...
## $ police_officer_attend : chr "Yes" "Yes" "Yes" "Yes" ...
## $ lsoa_of_accident_location : chr "E01002849" "E01002909" "E01002857" "E01002857" ...
## $ vehicle_reference : int 1 1 1 2 1 1 1 2 1 2 ...
## $ vehicle_type : chr "Car" "Bus or coach (17 or more pass seats)" "Bus or coach (17 or more pass seats)" "Car" ...
## $ towing_and_articulation : chr "No tow/articulation" "No tow/articulation" "No tow/articulation" "No tow/articulation" ...
## $ vehicle_manoeuvre : chr "Going ahead other" "Slowing or stopping" "Going ahead right-hand bend" "Parked" ...
## $ vehicle_location : chr "On main c'way - not in restricted lane" "On main c'way - not in restricted lane" "On main c'way - not in restricted lane" "On main c'way - not in restricted lane" ...
## $ junction_location : chr "Not at or within 20 metres of junction" "Leaving roundabout" "Not at or within 20 metres of junction" "Not at or within 20 metres of junction" ...
## $ skidding_and_overturning : chr "None" "None" "None" "None" ...
## $ hit_object_in_carriageway : chr "None" "None" "Parked vehicle" "None" ...
## $ veh_leaving_carriageway : chr "Did not leave carriageway" "Did not leave carriageway" "Did not leave carriageway" "Did not leave carriageway" ...
## $ hit_object_off_carriageway : chr "None" "None" "None" "None" ...
## $ 1st_point_of_impact : chr "Front" "Nearside" "Nearside" "Offside" ...
## $ was_vehicle_left_hand_drive?: int 1 1 1 1 1 1 1 1 1 1 ...
## $ journey_purpose : chr "Other/Not known (2005-10)" "Journey as part of work" "Journey as part of work" "Other/Not known (2005-10)" ...
## $ sex_of_driver : chr "Female" "Male" "Male" "Male" ...
## $ age_of_driver : int 74 42 35 62 49 49 51 30 31 41 ...
## $ age_band_of_driver : chr "66 - 75" "36 - 45" "26 - 35" "56 - 65" ...
## $ engine_capacity_(cc) : int -1 8268 8300 1762 1769 85 2976 124 -1 4266 ...
## $ vehicle_propulsion_code : chr NA "Heavy oil" "Heavy oil" "Petrol" ...
## $ age_of_vehicle : int -1 3 5 6 4 10 1 2 -1 4 ...
## $ driver_imd_decile : chr "Less deprived 30-40%" "Data missing or out of range" "More deprived 10-20%" "Most deprived 10%" ...
## $ driver_home_area_type : chr "Urban area" "Data missing or out of range" "Urban area" "Urban area" ...
## - attr(*, ".internal.selfref")=<externalptr>
## - attr(*, "sorted")= chr "accident_index"
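As a sanity check, the same helper can list the columns that were left unmapped; they should only be IDs, dates, counts and coordinates. A quick sketch:
#Columns with no counterpart in the lookup file
check.mapping(unique(mapping$Variable),colnames(acc.uk),FALSE)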
The code below returns the number of unique values for each variable, making it easier to identify which variables can be treated as factors.
fact.cols<-sapply(acc.uk[,mapp.cols, with=FALSE], function(x) length(unique(x)))
sort(fact.cols,decreasing = TRUE)
## local_authority_(district) local_authority_(highway)
## 416 207
## police_force vehicle_type
## 51 21
## vehicle_manoeuvre hit_object_in_carriageway
## 19 13
## hit_object_off_carriageway vehicle_propulsion_code
## 13 13
## age_band_of_driver driver_imd_decile
## 12 11
## vehicle_location junction_detail
## 11 10
## junction_location veh_leaving_carriageway
## 10 10
## weather special_conditions_at_site
## 10 9
## journey_purpose 2nd_road_class
## 8 7
## carriageway_hazards day_of_week
## 7 7
## ped_cross_facilities skidding_and_overturning
## 7 7
## towing_and_articulation 1st_point_of_impact
## 7 6
## 1st_road_class junction_control
## 6 6
## road_surface road_type
## 6 6
## light_conditions driver_home_area_type
## 5 4
## ped_cross_human police_officer_attend
## 4 4
## sex_of_driver accident_severity
## 4 3
## urban_rural
## 3
Most algorithms will perform better if you explicitly declare categorical variables as factors, especially ordinal factors and numeric variables that are actually codes rather than quantities. The chunk below converts every variable with fewer than 25 unique values to a factor; however, it is too slow to run on my notebook, so uncomment it only if you have good computational resources.
#mapp.fact.cols<-mapp.cols[fact.cols<25]
#acc.uk[, (mapp.fact.cols):=lapply(.SD, factor), .SDcols=mapp.fact.cols]
I've added a bunch of new features such as year, month, week and time_slot. Those features will be crucial for deeper exploratory analysis.
#date conversion
acc.uk$newDate<- as.Date(acc.uk$date, "%d/%m/%Y")
#extract year
acc.uk$year<-year(acc.uk$newDate)
#extract month
acc.uk$month<-as.factor(month(acc.uk$newDate))
#extract week of year
acc.uk$week<-as.factor(week(acc.uk$newDate))
#extract day of year
acc.uk$day<-yday(acc.uk$newDate)
# time slot (hour of the day)
acc.uk$time_slot <-as.numeric(substr(acc.uk$time,1,2))
#day_of_week as factor (the week-end night feature itself is created later)
acc.uk$day_of_week<-factor(acc.uk$day_of_week)
I know there are a few columns I won't need for my analysis, so removing them will make my dataframe smaller and faster to explore.
#### Check NAs first
Check whether any variable has a large number of NAs.
sort(sapply(acc.uk, function(x) sum(is.na(x))),decreasing = TRUE)
## 2nd_road_class vehicle_propulsion_code
## 1194664 770004
## police_officer_attend time_slot
## 498 222
## location_easting_osgr location_northing_osgr
## 198 198
## longitude latitude
## 198 198
## accident_index police_force
## 0 0
## accident_severity number_of_vehicles
## 0 0
## number_of_casualties date
## 0 0
## day_of_week time
## 0 0
## local_authority_(district) local_authority_(highway)
## 0 0
## 1st_road_class 1st_road_number
## 0 0
## road_type speed_limit
## 0 0
## junction_detail junction_control
## 0 0
## 2nd_road_number ped_cross_human
## 0 0
## ped_cross_facilities light_conditions
## 0 0
## weather road_surface
## 0 0
## special_conditions_at_site carriageway_hazards
## 0 0
## urban_rural lsoa_of_accident_location
## 0 0
## vehicle_reference vehicle_type
## 0 0
## towing_and_articulation vehicle_manoeuvre
## 0 0
## vehicle_location junction_location
## 0 0
## skidding_and_overturning hit_object_in_carriageway
## 0 0
## veh_leaving_carriageway hit_object_off_carriageway
## 0 0
## 1st_point_of_impact was_vehicle_left_hand_drive?
## 0 0
## journey_purpose sex_of_driver
## 0 0
## age_of_driver age_band_of_driver
## 0 0
## engine_capacity_(cc) age_of_vehicle
## 0 0
## driver_imd_decile driver_home_area_type
## 0 0
## newDate year
## 0 0
## month week
## 0 0
## day
## 0
acc.uk$`2nd_road_class`<-NULL
acc.uk$date<-NULL
acc.uk$police_officer_attend<-NULL
acc.uk$vehicle_propulsion_code<-NULL
Throughout this analysis I'll analyze the impact of different variables on the number of accidents and on accident severity. There are a lot of variables to explore, but I will mainly focus my analysis on the following:
- Day
- Time
- Weather
- Junction Type
- Area Type
- Age of Driver
acc.uk %>%
group_by(day_of_week) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
ggplot(aes(x=day_of_week, y=total_accidents)) +
geom_bar(stat="identity", fill="steelblue")+
geom_text(aes(label=total_accidents), vjust=1.6, color="white", size=3.5)+
theme_minimal()
### Accidents by hour
acc.uk %>%
group_by(time_slot) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
ggplot(aes(x=time_slot, y=total_accidents)) +
geom_bar(stat="identity", fill="steelblue")+
geom_text(aes(label=total_accidents), vjust=1.6, color="black", size=3)+
scale_x_continuous(breaks = round(seq(0, 24, by = 2),0)) +
ggtitle("Total Accidents by Hours from 2005 to 2014") +
xlab("Hours") + ylab("Total Accidents")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank())
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
We observe that accidents tend to occur during business hours, when people commute to and from work. Now let's dive into it: is accident severity distributed similarly across the day?
acc.uk %>%
filter(accident_severity=="Slight")%>%
group_by(time_slot) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
ggplot(aes(x=time_slot, y=total_accidents)) +
geom_bar(stat="identity", fill="steelblue")+
geom_text(aes(label=total_accidents), vjust=1.6, color="black", size=3)+
scale_x_continuous(breaks = round(seq(0, 24, by = 2),0)) +
ggtitle("Total Slight Accidents by Hours from 2005 to 2014") +
xlab("Hours") + ylab("Total Accidents")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank())
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
acc.uk %>%
filter(accident_severity=="Serious")%>%
group_by(time_slot) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
ggplot(aes(x=time_slot, y=total_accidents)) +
geom_bar(stat="identity", fill="steelblue")+
geom_text(aes(label=total_accidents), vjust=1.6, color="black", size=3)+
scale_x_continuous(breaks = round(seq(0, 24, by = 2),0)) +
ggtitle("Total Serious Accidents by Hours from 2005 to 2014") +
xlab("Hours") + ylab("Total Accidents")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank())
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
acc.uk %>%
filter(accident_severity=="Fatal")%>%
group_by(time_slot) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
ggplot(aes(x=time_slot, y=total_accidents)) +
geom_bar(stat="identity", fill="steelblue")+
geom_text(aes(label=total_accidents), vjust=1.6, color="black", size=3)+
scale_x_continuous(breaks = round(seq(0, 24, by = 2),0)) +
ggtitle("Total Fatal Accidents by Hours from 2005 to 2014") +
xlab("Hours") + ylab("Total Accidents")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank())
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
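As an aside, the three chunks above could be collapsed into a single faceted chart; a minimal sketch (with free y scales, since slight accidents outnumber fatal ones by an order of magnitude):
#Sketch: one faceted chart instead of three near-identical chunks
acc.uk %>%
  group_by(accident_severity, time_slot) %>%
  summarize(total_accidents=n_distinct(accident_index)) %>%
  ggplot(aes(x=time_slot, y=total_accidents)) +
  geom_bar(stat="identity", fill="steelblue") +
  facet_wrap(~accident_severity, scales="free_y") +
  scale_x_continuous(breaks = round(seq(0, 24, by = 2),0)) +
  ggtitle("Total Accidents by Hour and Severity from 2005 to 2014") +
  xlab("Hours") + ylab("Total Accidents") +
  theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank())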
In statistics, a contingency table is a type of table in a matrix format that displays the frequency distribution of the variables. They are heavily used in survey research, business intelligence, engineering and scientific research. They provide a basic picture of the interrelation between two variables and can help find interactions between them. More about Contingency Table here.
In the example below, I generate a contingency table and then compute row percentages, meaning the value of each cell is divided by the sum of its row.
acc.time.severity<-table(acc.uk$time_slot,acc.uk$accident_severity)
prop.table(acc.time.severity,1)
##
## Fatal Serious Slight
## 0 0.028792601 0.173062184 0.798145215
## 1 0.033289044 0.178977987 0.787732969
## 2 0.034244684 0.190094817 0.775660499
## 3 0.037829136 0.183799011 0.778371852
## 4 0.038106236 0.186273095 0.775620670
## 5 0.033171042 0.175092142 0.791736816
## 6 0.021820629 0.152013842 0.826165529
## 7 0.012133499 0.125000953 0.862865548
## 8 0.006891400 0.098795884 0.894312717
## 9 0.009077532 0.101782152 0.889140316
## 10 0.012777209 0.113418612 0.873804180
## 11 0.011148538 0.112855383 0.875996079
## 12 0.010170808 0.112015669 0.877813522
## 13 0.010854113 0.114072404 0.875073483
## 14 0.011477531 0.121228064 0.867294406
## 15 0.009913095 0.121969777 0.868117128
## 16 0.010499912 0.122714701 0.866785387
## 17 0.009676572 0.120360012 0.869963416
## 18 0.010399565 0.125939770 0.863660666
## 19 0.013852577 0.131612492 0.854534931
## 20 0.015205271 0.141870610 0.842924118
## 21 0.017876927 0.149247530 0.832875542
## 22 0.021979807 0.154454159 0.823566034
## 23 0.024631237 0.157791658 0.817577105
Looking at the proportion table, it seems that the hour at which an accident occurs has an impact on its severity. We can observe that during the night the proportion of fatal accidents is higher than during the day, while we observe the opposite for slight accidents. We will verify later whether we can support this conclusion using a chi-squared test.
Week_end_night takes the value Yes for Friday or Saturday nights (from 10pm till 6am).
acc.uk$Week_end_night<-ifelse((acc.uk$day_of_week=="Friday" & acc.uk$time_slot %in% c(22:23)) |
                              (acc.uk$day_of_week=="Saturday" & acc.uk$time_slot %in% c(22:23,0:6)) |
                              (acc.uk$day_of_week=="Sunday" & acc.uk$time_slot %in% c(0:6)),"Yes","No")
acc.uk %>%
group_by(Week_end_night,accident_severity) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
mutate(freq = percent(total_accidents / sum(total_accidents))) %>%
ggplot(aes(x=accident_severity, y=freq,fill=Week_end_night)) +
geom_bar(stat="identity", position="dodge")+
geom_text(aes(label=freq), vjust=1.6, color="black", size=3)+
ggtitle("Accident Severity Proportion WE-Night vs non WE-Night") +
xlab("Accident Severity") + ylab("Accident Proportion")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank())
We can clearly see a massive difference between the proportion of fatal accidents during week-end night hours and non week-end night hours. Let's verify it with a chi-squared test as well.
So far we have discovered that there are more accidents during rush hour (4pm-6pm) across all severity levels. However, we found that the probability of an accident being fatal is higher during week-end nights.
Let's now investigate other variables:
- Weather
- Area Type
- Junction Type
- Age of Driver
acc.uk %>%
group_by(weather,accident_severity) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
ggplot(aes(x=accident_severity, y=total_accidents,fill=weather)) +
geom_bar(stat="identity", position="dodge")+
ggtitle("Accident Severity by Weather Condition") +
xlab("Accident Severity") + ylab("Total Accidents")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank(),axis.text.x = element_text(angle = 45, hjust = 1))
acc.uk %>%
group_by(weather,accident_severity) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
mutate(freq = percent(total_accidents / sum(total_accidents))) %>%
ggplot(aes(x=accident_severity, y=freq,fill=weather)) +
geom_bar(stat="identity", position="dodge")+
ggtitle("Accident Severity Proportion by Weather") +
xlab("Accident Severity") + ylab("Accident Proportion")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank(),
axis.text.y=element_blank(),axis.ticks.y=element_blank())
I have removed the Y-axis labels as there are too many categories for them to be readable, but the conclusion we can draw from this chart is that the probability of an accident being fatal is higher when it's foggy or misty.
acc.uk %>%
filter(urban_rural!="Unallocated")%>%
group_by(urban_rural,accident_severity) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
ggplot(aes(x=accident_severity, y=total_accidents,fill=urban_rural)) +
geom_bar(stat="identity", position="dodge")+
ggtitle("Accident Severity by Area Type") +
xlab("Accident Severity") + ylab("Total Accidents")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank(),axis.text.x = element_text(angle = 45, hjust = 1))
acc.uk %>%
group_by(urban_rural,accident_severity) %>%
filter(urban_rural!="Unallocated")%>%
summarize(total_accidents=n_distinct(accident_index)) %>%
mutate(freq = percent(total_accidents / sum(total_accidents))) %>%
ggplot(aes(x=accident_severity, y=freq,fill=urban_rural)) +
geom_bar(stat="identity", position="dodge")+
ggtitle("Accident Severity Proportion by Area Type") +
xlab("Accident Severity") + ylab("Accident Severity Proportion")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank(),
axis.ticks.y=element_blank())
acc.uk %>%
group_by(junction_detail,accident_severity) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
ggplot(aes(x=accident_severity, y=total_accidents,fill=junction_detail)) +
geom_bar(stat="identity", position="dodge")+
ggtitle("Accident Severity by Junction Type") +
xlab("Accident Severity") + ylab("Total Accidents")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank(),axis.text.x = element_text(angle = 45, hjust = 1))
acc.uk %>%
group_by(junction_detail,accident_severity) %>%
filter(junction_detail!="Data missing or out of range")%>%
summarize(total_accidents=n_distinct(accident_index)) %>%
mutate(freq = percent(total_accidents / sum(total_accidents))) %>%
ggplot(aes(x=accident_severity, y=freq,fill=junction_detail)) +
geom_bar(stat="identity", position="dodge")+
ggtitle("Accident Severity Proportion by Junction Type") +
xlab("Accident Severity") + ylab("Accident Severity Proportion")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank(),
axis.ticks.y=element_blank())
We can see that the probability of an accident being fatal is higher on roads that are not at a junction or within 20 metres of one. On the contrary, an accident happening at a roundabout is much more likely to be a slight accident and very unlikely to be fatal.
Why did I remove the rows labelled "Data missing or out of range"? There are only a few dozen such rows out of millions, so it is safe to remove them. Also, as we can see in the frequency table below, the proportion of fatal accidents for "Data missing or out of range" (around 6%) would be misleading in our plot, while the second highest proportion is only about 2%.
tt<-table(acc.uk$junction_detail,acc.uk$accident_severity)
prop.table(tt,1)
##
## Fatal Serious Slight
## Crossroads 0.006929720 0.111143069 0.881927211
## Data missing or out of range 0.060606061 0.030303030 0.909090909
## Mini-roundabout 0.003327787 0.084889382 0.911782831
## More than 4 arms (not roundabout) 0.005321096 0.101491372 0.893187532
## Not at junction or within 20 metres 0.021090226 0.145645619 0.833264155
## Other junction 0.009575401 0.107341141 0.883083459
## Private drive or entrance 0.010173909 0.131024574 0.858801517
## Roundabout 0.002474816 0.074140586 0.923384598
## Slip road 0.013513791 0.095151055 0.891335154
## T or staggered junction 0.008228188 0.120884047 0.870887765
acc.uk %>%
group_by(age_band_of_driver,accident_severity) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
ggplot(aes(x=accident_severity, y=total_accidents,fill=age_band_of_driver)) +
geom_bar(stat="identity", position="dodge")+
ggtitle("Accident by Age of Drivers") +
xlab("Accident Severity") + ylab("Total Accidents")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank(),axis.text.x = element_text(angle = 45, hjust = 1))
So it looks like the most dangerous drivers, or at least those involved in the highest number of accidents, are young drivers. Now we want to find out the proportion of fatal accidents within each age band.
acc.uk %>%
group_by(age_band_of_driver,accident_severity) %>%
summarize(total_accidents=n_distinct(accident_index)) %>%
mutate(freq = percent(total_accidents / sum(total_accidents))) %>%
ggplot(aes(x=accident_severity, y=freq,fill=age_band_of_driver)) +
geom_bar(stat="identity", position="dodge")+
ggtitle("Accident Severity Proportion by Age of Driver") +
xlab("Accident Severity") + ylab("Accident Severity Proportion")+
theme(plot.title = element_text(hjust = 0.5), panel.background = element_blank(),
axis.ticks.y=element_blank())
It turns out that the death rate of drivers aged over 75 is much higher, probably because they are more vulnerable to injuries:
- Over 75: 2.6%
- 66 - 75: 1.7%
- 56 - 65: 1.6%
- 46 - 55: 1.5%
- 36 - 45: 1.3%
- 26 - 35: 1.2%
- 21 - 25: 1.3%
A deeper analysis could explore what type of car they drive. Do elderly drivers drive older, hence less safe, cars? Where do they have their accidents: junctions, intersections?
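As a starting point for the vehicle-age question, a quick sketch could compare the average vehicle age across driver age bands (age_of_vehicle uses -1 for missing values):
#Sketch: average vehicle age per driver age band (-1 encodes missing)
acc.uk %>%
  filter(age_of_vehicle > 0) %>%
  group_by(age_band_of_driver) %>%
  summarize(avg_vehicle_age = mean(age_of_vehicle)) %>%
  arrange(desc(avg_vehicle_age))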
A heatmap is a nice way to illustrate the relation between accident severity across days and hours. Accident severity from left to right: Slight, Serious, Fatal. Again we clearly observe that the number of fatal accidents increases at night, especially during the week-end.
acc.hour.day.slight<-as.matrix(table(acc.uk$time_slot[acc.uk$accident_severity=="Slight"],acc.uk$day_of_week[acc.uk$accident_severity=="Slight"]))
acc.hour.day.serious<-as.matrix(table(acc.uk$time_slot[acc.uk$accident_severity=="Serious"],acc.uk$day_of_week[acc.uk$accident_severity=="Serious"]))
acc.hour.day.fatal<-as.matrix(table(acc.uk$time_slot[acc.uk$accident_severity=="Fatal"],acc.uk$day_of_week[acc.uk$accident_severity=="Fatal"]))
coul = rev(colorRampPalette(brewer.pal(8, "PiYG"))(25))
par(mfrow=c(1,3)) #note: base heatmap() sets its own layout(), so mfrow may be ignored
{
nba_heatmap <- heatmap(acc.hour.day.slight, Rowv=NA, Colv=NA, col = coul, scale="column", margins=c(5,10))
nba_heatmap <- heatmap(acc.hour.day.serious, Rowv=NA, Colv=NA, col = coul, scale="column", margins=c(5,10))
nba_heatmap <- heatmap(acc.hour.day.fatal, Rowv=NA, Colv=NA, col = coul, scale="column", margins=c(5,10))
}
UK <- map_data("world") %>% filter(region=="UK")
data=world.cities %>% filter(country.etc=="UK")
tree<- createTree(data,columns=c(4,5)) #columns gives the indices of the latitude and longitude columns
acc.map.uk<-acc.uk %>%
filter(accident_severity=="Fatal")%>%
group_by(longitude,latitude) %>%
summarize(total_fatal_accidents=n_distinct(accident_index))
#K-nearest neighbour lookup
#All the accidents will be mapped to the nearest city based on the long/lat values
acc.map.uk$city.idx<-knnLookup(tree,acc.map.uk$latitude,acc.map.uk$longitude,k=1)
acc.map.uk<- acc.map.uk %>%
group_by(city.idx) %>%
summarize(total_fatal_accidents=sum(total_fatal_accidents))
acc.map.uk$city<-data$name[acc.map.uk$city.idx]
acc.map.uk$pop<-data$pop[acc.map.uk$city.idx]
acc.map.uk$lat<-data$lat[acc.map.uk$city.idx]
acc.map.uk$long<-data$long[acc.map.uk$city.idx]
acc.map.uk$ratio.acc<-round((acc.map.uk$total_fatal_accidents/acc.map.uk$pop)/10,6)
p=acc.map.uk %>%
arrange(ratio.acc) %>%
mutate( city=factor(city, unique(city))) %>%
mutate( mytext=paste("City: ", city, "\n", "Fatal Accidents: ", total_fatal_accidents,
"\n", "Population: ", pop,
"\n", "Ratio Fatal Accidents/Pop: ", ratio.acc,sep="")) %>%
ggplot() +
geom_polygon(data = UK, aes(x=long, y = lat, group = group), fill="grey", alpha=0.3) +
geom_point(aes(x=long, y=lat, size=ratio.acc, color=ratio.acc, text=mytext, alpha=ratio.acc) ) +
scale_size_continuous(range=c(1,7)) +
scale_color_viridis(option="inferno" ) +
scale_alpha_continuous() +
theme_void() +
ylim(50,59) +
coord_map() +
theme()
## Warning: Ignoring unknown aesthetics: text
p=ggplotly(p, tooltip="text")
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
p
k-nearest neighbours works well for areas with many cities; however, for large rural areas such as Scotland it is not that good. In fact, there is a very wide rural area around Fort William, which means that all the accidents in the north-west of Scotland will be counted towards Fort William, a somewhat biased result. A better approach could be to get the county boundaries, map them to our current dataset, and then apportion the total accidents by each county's population. I would still expect Fort William to have a high ratio, as it is a major tourist centre with a small population.
There is still much more exploratory analysis to be done; you can for example add the Casualties dataset or explore variables that I haven't explored so far. Overall we learnt that:
- The weather with the highest death rate is fog or mist
- Friday and Saturday nights have the highest death rate
- The death rate in rural areas is higher than in urban areas
- Roundabouts have the lowest death rate, while accidents on roads not at a junction (or within 20 metres of one) have the highest death rate
- Drivers aged over 75 have the highest death rate, possibly because they are more vulnerable to injuries; further analysis is needed to confirm this assumption
- Most accidents occur during rush hour (especially in the evening)
- Young drivers have more car accidents
For 2-way tables we can use the chi-squared test to test the independence of the row and column variables. More about the chi-squared test here.
#### Test of Independence: Accident Severity vs Hours
The null hypothesis states that accident severity is independent of the hour variable.
chisq.test(acc.time.severity)
##
## Pearson's Chi-squared test
##
## data: acc.time.severity
## X-squared = 18306, df = 46, p-value < 2.2e-16
As the p-value is significantly less than 0.05, we reject the null hypothesis that accident severity is independent of the hour.
#### Test of Independence: Accident Severity vs Weekend night
acc.we.night.severity<-table(acc.uk$Week_end_night,acc.uk$accident_severity)
chisq.test(acc.we.night.severity)
##
## Pearson's Chi-squared test
##
## data: acc.we.night.severity
## X-squared = 6594.2, df = 2, p-value < 2.2e-16
Again we reject the null hypothesis that accident severity is independent of week-end night hours.
#### Test of Independence: Accident Severity vs Weather, Area Type and Junction Type
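The chunk producing the three tests below is not echoed in this notebook; presumably it mirrors the earlier ones, along the lines of:
acc.weather.severity<-table(acc.uk$weather,acc.uk$accident_severity)
acc.area.severity<-table(acc.uk$urban_rural,acc.uk$accident_severity)
acc.junction.severity<-table(acc.uk$junction_detail,acc.uk$accident_severity)
chisq.test(acc.weather.severity)
chisq.test(acc.area.severity)
chisq.test(acc.junction.severity)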
## Warning in chisq.test(acc.weather.severity): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: acc.weather.severity
## X-squared = 3364.7, df = 18, p-value < 2.2e-16
## Warning in chisq.test(acc.area.severity): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: acc.area.severity
## X-squared = 30194, df = 4, p-value < 2.2e-16
## Warning in chisq.test(acc.junction.severity): Chi-squared approximation may
## be incorrect
##
## Pearson's Chi-squared test
##
## data: acc.junction.severity
## X-squared = 25631, df = 18, p-value < 2.2e-16
All our previous findings are statistically significant at the 95% confidence level, as every test returned a p-value < 0.05.
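Note the "Chi-squared approximation may be incorrect" warnings above: they are triggered by cells with very small expected counts. A safer check, at the cost of some computation time, is a simulated p-value, for example:
#Monte Carlo p-value avoids relying on the chi-squared approximation
chisq.test(acc.weather.severity, simulate.p.value=TRUE, B=2000)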
options(scipen=999)
#Let's create two new features: is the accident fatal, and is the driver over 75
acc.uk$driver.over.75<-ifelse(acc.uk$age_band_of_driver=="Over 75","Yes","No")
acc.uk$is.fatal<-ifelse(acc.uk$accident_severity=="Fatal",1,0)
acc.fatal.less.75<- replicate(200,mean(sample(acc.uk$is.fatal[acc.uk$driver.over.75=="No"],10000)))
acc.fatal.over.75<-replicate(200,mean(sample(acc.uk$is.fatal[acc.uk$driver.over.75=="Yes"],10000)))
par( mfrow = c( 1, 2 ) )
{
hist(acc.fatal.less.75 ,main="Accident Fatal Ratio Driver Age under 75",
xlab="Accident Fatals Ratio" )
hist(acc.fatal.over.75,main="Accident Fatal Ratio Driver Age over 75",
xlab="Accident Fatals Ratio" )
}
The conditions for the t-test are:
- The data were collected in a random way
- Each observation must be independent of the others
- The sampling distribution must be normal or approximately normal
- The variances of the samples are either equal (pooled variance) or not (unpooled variance)
The unpooled-variance test is also called the Welch t-test.
Looking at the distribution of our two sample means, we can carry on with our t-test.
options(scipen=999)
var.over.75<-var(acc.fatal.over.75)
var.under.75<-var(acc.fatal.less.75)
(var.over.75 - var.under.75)/var.over.75* 100
## [1] 40.10554
#The variances are not equal at all, so we keep t.test's default var.equal=FALSE (the Welch t-test)
t.test(acc.fatal.less.75,acc.fatal.over.75,paired = FALSE)
##
## Welch Two Sample t-test
##
## data: acc.fatal.less.75 and acc.fatal.over.75
## t = -101.25, df = 374.44, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.01409298 -0.01355602
## sample estimates:
## mean of x mean of y
## 0.0123820 0.0262065
The p-value is nearly equal to zero, so this is strong evidence that the age of the driver has an impact on the accident death rate.
Using different statistical tests, we have shown that the assumptions made during the exploratory analysis hold.
acc.uk$first.day.month <- ymd(format(acc.uk$newDate, "%Y-%m-01"))
acc.aggr<-acc.uk %>%
filter(accident_severity=="Fatal")%>%
group_by(newDate,month,year) %>%
summarize(total_accidents=n_distinct(accident_index))
startDate<-min(acc.aggr$newDate)
endDate<-max(acc.aggr$newDate)
acc.ts<- msts(acc.aggr$total_accidents,seasonal.periods = 365.25,start = decimal_date(as.Date(startDate)))
plot(acc.ts, main="Daily Fatal Accidents", xlab="Year", ylab="Accidents")
### Accident Distribution
hist(acc.aggr$total_accidents,main="Frequency of Daily Accidents")
The daily number of accidents is right-skewed.
### Year Variance
{
boxplot(acc.aggr$total_accidents~acc.aggr$year)
means <- tapply(acc.aggr$total_accidents,acc.aggr$year,mean)
points(means,col="red",pch=18)
abline(h=mean(acc.aggr$total_accidents),col="red")
abline(h=(mean(acc.aggr$total_accidents)+c(1,-1)*sd(acc.aggr$total_accidents)),col="blue",lty=2)
legend(9,0.95*max(acc.aggr$total_accidents), legend=c("Mean", "SD"),col=c("red", "blue"), lty=1:2, cex=0.8)
}
Overall the number of accidents tends to decrease from 2005 to 2013 but then increases slightly again in 2014. There are a few outliers, and overall every year's distribution seems right-skewed.
### Month Variance
{
boxplot(acc.aggr$total_accidents~acc.aggr$month)
means <- tapply(acc.aggr$total_accidents,acc.aggr$month,mean)
points(means,col="red",pch=18)
abline(h=mean(acc.aggr$total_accidents),col="red")
abline(h=(mean(acc.aggr$total_accidents)+c(1,-1)*sd(acc.aggr$total_accidents)),col="blue",lty=2)
legend(1,0.95*max(acc.aggr$total_accidents), legend=c("Mean", "SD"),col=c("red", "blue"), lty=1:2, cex=0.8)
}
The pattern across months seems to remain approximately constant; however, most monthly distributions are right-skewed.
{
qqnorm(acc.ts)
qqline(acc.ts)
}
As already observed, our data are right-skewed.
acc.ts.components <- decompose(acc.ts)
plot(acc.ts.components)
From the seasonal decomposition we observe that the trend decreases rapidly from 2007 to 2009, then declines slightly from 2009 to 2013, and finally stagnates from 2013 to 2014. There is obvious seasonality across the years, with some spikes. However, our time series still seems to contain a lot of randomness.
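One way to gauge how much signal is left after removing trend and seasonality is to look at the autocorrelation of the remainder; if it behaves like white noise, there is little left to model. A quick sketch:
#Autocorrelation of the random component from the decomposition
acf(na.omit(acc.ts.components$random), main="ACF of the random component")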
my.dates<-acc.aggr$newDate
nRows<-length(acc.aggr$newDate)
tr = window(acc.ts, start=c(decimal_date(as.Date(my.dates[1]))), end=c(decimal_date(as.Date(my.dates[nRows-365]))))
tst = window(acc.ts, start=decimal_date(as.Date(my.dates[nRows-365])), end=decimal_date(as.Date(my.dates[nRows])))
## Warning in window.default(x, ...): 'end' value not changed
models <- list(
"arima.fit"=auto.arima(tr, stepwise=TRUE, approximation=FALSE),
"stlf.fit"=stlf(tr),
"HW.fit"=HoltWinters(tr),
"tbats.fit"=tbats(tr),
"stl.fit"=stlm(tr, ic='aicc', robust=TRUE, method='ets'),
"neural.fit"=nnetar(tr)
)
forecasts <- lapply(models, forecast, 365)
forecasts$naive <- naive(tr, 365)
for(f in forecasts){
plot(f)
lines(tst, col='red')
}
acc <- lapply(forecasts, function(f){
accuracy(f, tst)[2,,drop=FALSE]
})
acc <- Reduce(rbind, acc)
row.names(acc) <- names(forecasts)
acc <- acc[order(acc[,'MASE']),]
round(acc, 2)
## ME RMSE MAE MPE MAPE MASE ACF1 Theil's U
## tbats.fit 0.46 2.37 1.87 -24.92 56.92 0.64 0.06 0.65
## arima.fit 0.11 2.34 1.89 -36.01 62.64 0.65 0.07 0.66
## HW.fit 0.31 2.51 1.98 -28.85 61.78 0.68 0.04 0.65
## stlf.fit 0.14 2.50 2.00 -33.93 64.40 0.68 0.03 0.65
## stl.fit 0.16 2.53 2.01 -34.18 65.33 0.69 0.03 0.66
## neural.fit 1.17 2.68 2.06 -4.43 52.28 0.70 0.12 0.71
## naive 1.66 2.87 2.17 10.33 48.57 0.74 0.07 0.76
I want to see where my linear model sits among the previous forecasting methods.
acc.aggr2<-acc.uk %>%
filter(accident_severity=="Fatal")%>%
group_by( newDate ,day,day_of_week, week,month,year) %>%
summarize(total_accidents=n_distinct(accident_index))
tr2<-acc.aggr2[acc.aggr2$newDate<"2014-01-01",]
tst2<-acc.aggr2[acc.aggr2$newDate>="2014-01-01",]
lm.fit<-lm(total_accidents~.-newDate,data=tr2)
lm.fr<-predict(lm.fit,tst2)
df.tst2 <- as.data.frame(cbind(tst2$newDate ,tst2$total_accidents, lm.fr))
mse <- mean((tst2$total_accidents - lm.fr)^2)
rmse<- sqrt(mse)
{
plot(df.tst2$V2,type="o")
lines(df.tst2$lm.fr, type = "o", col = "blue")
mtext(paste("RMSE of the linear model:",rmse), side=3)
}
tbats.fit=tbats(tr, ic='aicc')
tbats.fr<-forecast(tbats.fit, h=365)
X2 <- cbind(tbats.fr$mean)
df.tst <- cbind(tst, forecasts$tbats.fit$mean)
colnames(df.tst) <- c("Testing Data","tbats Forecast")
autoplot(df.tst)+ theme_bw() +xlab("Year") + ylab(expression("Accidents"))+ ggtitle("Testing Data vs Forecasted Data")+ theme(plot.title = element_text(hjust = 0.5))
Overall we're not forecasting very well: the best RMSE we got is around 2.3, which is still quite high given that the daily number of accidents ranges from 0 to 15. But if you remember, the decomposition of our time series contained a lot of randomness, and it is obviously difficult to predict anything random.
I haven't developed the forecasting part much, so I may add some content soon. We could also predict whether an accident was fatal or not using a classification model, as in the famous Titanic competition; since we've got a lot more rows and features, it would be interesting to add even more features and then come up with classification models such as XGBoost trees, random forests, or neural nets.
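As a teaser for that classification idea, here is a minimal sketch (assuming the randomForest package, a small random sample and a handful of untuned features):
#Sketch only: predict whether an accident row is fatal
library(randomForest)
set.seed(42)
rf.dat <- acc.uk[sample(.N, 100000),                    #sample rows for speed
                 .(is.fatal=factor(is.fatal), time_slot,
                   day_of_week, urban_rural=factor(urban_rural),
                   age_of_driver)]
rf.dat <- na.omit(rf.dat[age_of_driver > 0])            #-1 encodes missing age
rf.fit <- randomForest(is.fatal ~ ., data=rf.dat, ntree=100)
rf.fit                                                  #OOB error and confusion matrix
Keep in mind that with roughly 1% fatal rows the classes are heavily imbalanced, so accuracy alone will look deceptively good; down-sampling or class weights would be the natural next step.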