In Victoria there are in excess of 5,000,000 cars registered to VicRoads. Crash statistics are closely monitored by several government organisations. A reduction in the road toll is paramount in the saving lives of Victorian citizens.Crash data is information recorded by VicRoads and made publicly available at data.vic.gov.au. This dataset contains all crash data from 1st January 2006 up until 3rd August 2016. The Victorian Government in 2011 released a report into the road toll and the effect that road safety cameras (red light and speed cameras) has had on reducing the number of deaths on the road. It was found that many variables contributed to deaths on the road including: speed, fatigue, road design etc. The report also found that there were several factors that lend themselves to higher road toll numbers. Accident/crash dataset is merged with ACCIDENT_NO, identified missing values, excluded the outliers, applied data transformation for following reasons: i) to change the scale for better understanding of the variable, ii) to convert a non-linear relation into linear one, or iii) to decrease the skewness and convert the distribution into a normal distribution
library(readr)
library(dplyr)
library(plyr)
library(tidyr)
library(knitr)
library(MVN)
library(lubridate)
library(stringr)
library(outliers)
library(forecast)
setwd("F:/RMIT/sem2/Data Preprocessing/assignment 3/")
accident <- read.csv("ACCIDENT.csv",stringsAsFactors = FALSE)
acc_chinage <- read.csv("ACCIDENT_CHAINAGE.csv")
acc_event <- read.csv("ACCIDENT_EVENT.csv")
acc_location <- read.csv("ACCIDENT_LOCATION.csv")
acc_atmosCond <- read.csv("ATMOSPHERIC_COND.csv")
acc_node <- read.csv("NODE.csv")
acc_nodeId <- read.csv("NODE_ID_COMPLEX_INT_ID.csv",stringsAsFactors = FALSE)
acc_person <- read.csv("PERSON.csv")
acc_rdSurfaceCond <- read.csv("ROAD_SURFACE_COND.csv")
acc_subdca <- read.csv("SUBDCA.csv")
acc_vehicle <- read.csv("VEHICLE.csv" )
#merging all csv file by ACCIDENT_NO as ID
accident_df <- join_all(list(accident,acc_event,acc_location,acc_atmosCond,acc_node,acc_nodeId,acc_person,acc_rdSurfaceCond,acc_subdca,acc_vehicle), by='ACCIDENT_NO', type='left')
sapply(accident_df,class)
ACCIDENT_NO ACCIDENTDATE ACCIDENTTIME ACCIDENT_TYPE Accident.Type.Desc DAY_OF_WEEK
"character" "character" "character" "integer" "character" "integer"
Day.Week.Description DCA_CODE DCA.Description DIRECTORY EDITION PAGE
"character" "integer" "character" "character" "character" "character"
GRID_REFERENCE_X GRID_REFERENCE_Y LIGHT_CONDITION Light.Condition.Desc NODE_ID NO_OF_VEHICLES
"character" "integer" "integer" "character" "integer" "integer"
NO_PERSONS NO_PERSONS_INJ_2 NO_PERSONS_INJ_3 NO_PERSONS_KILLED NO_PERSONS_NOT_INJ POLICE_ATTEND
"integer" "integer" "integer" "integer" "integer" "integer"
ROAD_GEOMETRY Road.Geometry.Desc SEVERITY SPEED_ZONE EVENT_SEQ_NO EVENT_TYPE
"integer" "character" "integer" "integer" "integer" "factor"
Event.Type.Desc VEHICLE_1_ID VEHICLE_1_COLL_PT Vehicle.1.Coll.Pt.Desc VEHICLE_2_ID VEHICLE_2_COLL_PT
"factor" "factor" "factor" "factor" "factor" "factor"
Vehicle.2.Coll.Pt.Desc PERSON_ID OBJECT_TYPE Object.Type.Desc NODE_ID ROAD_ROUTE_1
"factor" "factor" "integer" "factor" "integer" "integer"
ROAD_NAME ROAD_TYPE ROAD_NAME_INT ROAD_TYPE_INT DISTANCE_LOCATION DIRECTION_LOCATION
"factor" "factor" "factor" "factor" "integer" "factor"
NEAREST_KM_POST OFF_ROAD_LOCATION ATMOSPH_COND ATMOSPH_COND_SEQ Atmosph.Cond.Desc NODE_ID
"integer" "factor" "integer" "integer" "factor" "integer"
NODE_TYPE AMG_X AMG_Y LGA_NAME Lga.Name.All Region.Name
"factor" "numeric" "numeric" "factor" "factor" "factor"
Deg.Urban.Name Lat Long Postcode.No NODE_ID COMPLEX_INT_NO
"factor" "numeric" "numeric" "integer" "integer" "integer"
PERSON_ID VEHICLE_ID SEX AGE Age.Group INJ_LEVEL
"factor" "factor" "factor" "integer" "factor" "integer"
Inj.Level.Desc SEATING_POSITION HELMET_BELT_WORN ROAD_USER_TYPE Road.User.Type.Desc LICENCE_STATE
"factor" "factor" "integer" "numeric" "factor" "factor"
PEDEST_MOVEMENT POSTCODE TAKEN_HOSPITAL EJECTED_CODE SURFACE_COND Surface.Cond.Desc
"integer" "integer" "factor" "integer" "integer" "factor"
SURFACE_COND_SEQ SUB_DCA_CODE SUB_DCA_SEQ Sub.Dca.Code.Desc VEHICLE_ID VEHICLE_YEAR_MANUF
"integer" "factor" "integer" "factor" "factor" "integer"
VEHICLE_DCA_CODE INITIAL_DIRECTION ROAD_SURFACE_TYPE Road.Surface.Type.Desc REG_STATE VEHICLE_BODY_STYLE
"integer" "factor" "integer" "factor" "factor" "factor"
VEHICLE_MAKE VEHICLE_MODEL VEHICLE_POWER VEHICLE_TYPE Vehicle.Type.Desc VEHICLE_WEIGHT
"factor" "factor" "integer" "integer" "factor" "integer"
CONSTRUCTION_TYPE FUEL_TYPE NO_OF_WHEELS NO_OF_CYLINDERS SEATING_CAPACITY TARE_WEIGHT
"factor" "factor" "integer" "integer" "integer" "integer"
TOTAL_NO_OCCUPANTS CARRY_CAPACITY CUBIC_CAPACITY FINAL_DIRECTION DRIVER_INTENT VEHICLE_MOVEMENT
"integer" "integer" "factor" "factor" "integer" "integer"
TRAILER_TYPE VEHICLE_COLOUR_1 VEHICLE_COLOUR_2 CAUGHT_FIRE INITIAL_IMPACT LAMPS
"factor" "factor" "factor" "integer" "factor" "integer"
LEVEL_OF_DAMAGE OWNER_POSTCODE TOWED_AWAY_FLAG TRAFFIC_CONTROL Traffic.Control.Desc
"integer" "integer" "integer" "integer" "factor"
#COncentrating on number of people killed in accidents
acc_severity <- accident_df[(accident_df$NO_PERSONS_KILLED > 0),]
#change to datetime
acc_severity$ACCIDENTDATE <- as.POSIXct(strptime(acc_severity$ACCIDENTDATE, "%d/%m/%Y"))
#changing character to hms
acc_severity$ACCIDENTTIME <- hms(acc_severity$ACCIDENTTIME)
sapply(acc_severity, function(x) sum(is.na(x)))
ACCIDENT_NO ACCIDENTDATE ACCIDENTTIME ACCIDENT_TYPE Accident.Type.Desc DAY_OF_WEEK
0 0 0 0 0 0
Day.Week.Description DCA_CODE DCA.Description DIRECTORY EDITION PAGE
0 0 0 0 0 0
GRID_REFERENCE_X GRID_REFERENCE_Y LIGHT_CONDITION Light.Condition.Desc NODE_ID NO_OF_VEHICLES
0 4982 0 0 0 0
NO_PERSONS NO_PERSONS_INJ_2 NO_PERSONS_INJ_3 NO_PERSONS_KILLED NO_PERSONS_NOT_INJ POLICE_ATTEND
0 0 0 0 0 0
ROAD_GEOMETRY Road.Geometry.Desc SEVERITY SPEED_ZONE EVENT_SEQ_NO EVENT_TYPE
0 0 0 0 0 0
Event.Type.Desc VEHICLE_1_ID VEHICLE_1_COLL_PT Vehicle.1.Coll.Pt.Desc VEHICLE_2_ID VEHICLE_2_COLL_PT
0 0 0 0 0 0
Vehicle.2.Coll.Pt.Desc PERSON_ID OBJECT_TYPE Object.Type.Desc NODE_ID ROAD_ROUTE_1
0 0 0 0 0 0
ROAD_NAME ROAD_TYPE ROAD_NAME_INT ROAD_TYPE_INT DISTANCE_LOCATION DIRECTION_LOCATION
0 0 0 0 0 0
NEAREST_KM_POST OFF_ROAD_LOCATION ATMOSPH_COND ATMOSPH_COND_SEQ Atmosph.Cond.Desc NODE_ID
49283 0 0 0 0 0
NODE_TYPE AMG_X AMG_Y LGA_NAME Lga.Name.All Region.Name
0 0 0 0 0 0
Deg.Urban.Name Lat Long Postcode.No NODE_ID COMPLEX_INT_NO
0 0 0 0 51167 51167
PERSON_ID VEHICLE_ID SEX AGE Age.Group INJ_LEVEL
0 0 0 1069 0 0
Inj.Level.Desc SEATING_POSITION HELMET_BELT_WORN ROAD_USER_TYPE Road.User.Type.Desc LICENCE_STATE
0 2591 0 0 0 0
PEDEST_MOVEMENT POSTCODE TAKEN_HOSPITAL EJECTED_CODE SURFACE_COND Surface.Cond.Desc
0 7532 0 56 0 0
SURFACE_COND_SEQ SUB_DCA_CODE SUB_DCA_SEQ Sub.Dca.Code.Desc VEHICLE_ID VEHICLE_YEAR_MANUF
0 502 502 502 0 2396
VEHICLE_DCA_CODE INITIAL_DIRECTION ROAD_SURFACE_TYPE Road.Surface.Type.Desc REG_STATE VEHICLE_BODY_STYLE
44 0 0 0 0 0
VEHICLE_MAKE VEHICLE_MODEL VEHICLE_POWER VEHICLE_TYPE Vehicle.Type.Desc VEHICLE_WEIGHT
0 0 54265 0 0 45115
CONSTRUCTION_TYPE FUEL_TYPE NO_OF_WHEELS NO_OF_CYLINDERS SEATING_CAPACITY TARE_WEIGHT
0 0 10251 7833 11643 5429
TOTAL_NO_OCCUPANTS CARRY_CAPACITY CUBIC_CAPACITY FINAL_DIRECTION DRIVER_INTENT VEHICLE_MOVEMENT
0 40147 0 0 0 0
TRAILER_TYPE VEHICLE_COLOUR_1 VEHICLE_COLOUR_2 CAUGHT_FIRE INITIAL_IMPACT LAMPS
0 0 0 0 0 0
LEVEL_OF_DAMAGE OWNER_POSTCODE TOWED_AWAY_FLAG TRAFFIC_CONTROL Traffic.Control.Desc
0 2030 0 0 0
Ignorning the variables consisting null values because further analysis dosen’t require those variables with null values.
list1 <- c("ACCIDENT_NO","ACCIDENTDATE","ACCIDENTTIME","Accident.Type.Desc","DAY_OF_WEEK","Day.Week.Description","DCA.Description","DIRECTORY","Light.Condition.Desc","NO_OF_VEHICLES","NO_PERSONS","NO_PERSONS_KILLED","SPEED_ZONE","Event.Type.Desc","Lat","Long","Vehicle.Type.Desc","SEX","Age.Group")
accident <- acc_severity[list1]
sapply(accident, function(x) sum(is.na(x)))
ACCIDENT_NO ACCIDENTDATE ACCIDENTTIME Accident.Type.Desc DAY_OF_WEEK Day.Week.Description DCA.Description
0 0 0 0 0 0 0
DIRECTORY Light.Condition.Desc NO_OF_VEHICLES NO_PERSONS NO_PERSONS_KILLED SPEED_ZONE Event.Type.Desc
0 0 0 0 0 0 0
Lat Long Vehicle.Type.Desc SEX Age.Group
0 0 0 0 0
The data in this study was sourced from VicRoads and stored on data.vic.gov.au, which has a compilation of data from a wide range of government sources. The data set contains a collection of 145,156 accidents for the past decade and records basic accident details such as: day of the week, light conditions, speed limit, type of accidents. . ACCIDENT_NO: The identification number given to each accident where the 2nd to 5th chararachters make up the year that the accident occured. . ACCIDENTDATE: The date that each accident occured - recorded as DD/MM/YYY. . ACCIDENTTIME: The time that the accident occured recorded in 24 Hour time . ACCIDENTTYPE & Accident Type Desc: Variables describing the nature of the accident. ACCIDENTTYPE is a categorical variable with values 1-9 describing the type of accident respective to the Accident Type Description (1=Collision with vehicle 2=Struck pedestrian 3=Struck animal 4=Collision with a fixed object 5=Collision with some other object 6=Vehicle overturned (no collision) 7=Fall from or in moving vehicle 8=No collision and no object struck 9=Other accident). . DAY OF THE WEEK: The day of each week that the accident took place (1=Sunday,2= Monday, 3=Tuesday,4=Wednesday,5= Thursday,6=Friday and 7=Saturday) 1 . LIGHT CONDITION: The level of light available at the time of the accident (1=Day,2=Dusk/dawn,3=Dark street lights on,4=Dark street lights off,5=Dark no street lights,6=Dark street lights unknown, 9=Unknown) . SPEED ZONE: The speed limit that is enforced at the site of the accident (040=40 km/hr, 050=50 km/hr, 060=60 km/hr, 075=75 km/hr, 080=80 km/hr, 090=90 km/hr, 100=100 km/hr, 110=110 km/hr, 777=Other speed limit, 888=Camping grounds/off road, 999=Not known). 2
accident$time_category <- ifelse(hour(accident$ACCIDENTTIME) >= 05 & hour(accident$ACCIDENTTIME) <= 11, "Morning",
ifelse(hour(accident$ACCIDENTTIME) > 11 & hour(accident$ACCIDENTTIME) <= 16, "Afternoon",
ifelse(hour(accident$ACCIDENTTIME) > 16 & hour(accident$ACCIDENTTIME) <= 19, "Evening", "Night")))
accident$SEASON <- ifelse(month(accident$ACCIDENTDATE) >= 12 & month(accident$ACCIDENTDATE) <= 2, "Summer",
ifelse(month(accident$ACCIDENTDATE) >= 3 & month(accident$ACCIDENTDATE) <= 5, "Autumn",
ifelse(month(accident$ACCIDENTDATE) >= 6 & month(accident$ACCIDENTDATE) <= 8, "Winter",
ifelse(month(accident$ACCIDENTDATE) >= 9 & month(accident$ACCIDENTDATE) <= 11, "Spring","Summer"))))
accident$hourly <- hour(accident$ACCIDENTTIME)
accident_freq <- accident %>%
dplyr::group_by(hourly,Vehicle.Type.Desc,SPEED_ZONE) %>%
dplyr::summarise(freq = sum(NO_PERSONS_KILLED))
boxplot(accident_freq$freq ~ accident_freq$SPEED_ZONE, main="NO PERSONS KILLED vs SPEED ZONE", ylab = "NO. PERSONS KILLED freq", xlab = "Speed Zone")
The above figure illustrates the bivariate box plot, we are interested in Speed_zone variable and a “No.of people killed” variable in the accident data set. According to the plot, there are few outliers with respect to the speed zones. The mvn() function is used to discover possible multivariate outliers
# Multivariate outlier detection using Mahalanobis distance with QQ plots
accident_freq_sub <- accident_freq %>% dplyr::select(hourly,SPEED_ZONE,freq)
Adding missing grouping variables: `Vehicle.Type.Desc`
accident_freq_sub$Vehicle.Type.Desc <- NULL
results <- mvn(data = accident_freq_sub, multivariateOutlierMethod = "quan", showOutliers = TRUE)
As we can see on the QQ plot, the Mahalonobis distance suggests existence of few outliers for this subset of the accident data. The list of possible multivariate outliers can be displayed by using result$multivariateOutliers to select only the results related to outliers as follows:
results$multivariateOutliers
This output is providing the locations of outliers in the data set. In this project there are 365 observations that are suggested outliers for this subset of accident data. Argument showNewData = TRUE is used to exclude the outliers. One can simply detect and remove outliers using the following argument:
accident_clean <- mvn(data = accident_freq_sub, multivariateOutlierMethod = "quan", showOutliers = TRUE, showNewData = TRUE)
# Prints the data without outliers
dim(accident_clean$newData)
[1] 741 3
head(accident_clean$newData)
To achive the data normality, we are applying data transformations through mathematical operations like logarithmic (i.e., ln and log), square root, power transformations etc. and finally we will select the best transformation.
hist(accident_clean$newData$freq)
From the histogram, we observe that frequency count have a right-skewed distribution. By applying a logarithmic transformation, the salary distribution would be more symmetrical.
ln_fliter_acc_filt <- log10(accident_clean$newData$freq)
hist(ln_fliter_acc_filt)
ln_accident <- log(accident_clean$newData$freq)
hist(ln_accident)
As seen from the histograms, the log10 transformation worked slightly better than the ln transformation for this dataset.
Another transformation is the square root transformation. It is also used for reducing right skewness, and also has the advantage that it can be applied to zero values.
sqrt_accident <- sqrt(accident_clean$newData$freq)
hist(sqrt_accident)
The square root transformation has reduced the skewness in the accident frequency count distribution. But still we can see for more transformations to confirm the better one.
recip_fliter_acc_filt <- 1/(accident_clean$newData$freq)
hist(recip_fliter_acc_filt)
boxcox_acc_filt<- BoxCox(accident_clean$newData$freq,lambda = "auto")
hist(boxcox_acc_filt)
After trying out many transformations, square root transformation has reduced the skewness in the accident frequency count distribution
center_x <-scale(sqrt_accident, center = TRUE, scale = TRUE)
hist(center_x)
The resulting transformed data values would have a zero mean and one standard deviation.