# Turn off warnings and messages in the rendered output of every knitr chunk.
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
# This is the R chunk for the required packages
library(dplyr)
library(tidyr)
library(ggplot2)
library(forecast)
library(outliers)
library(MVN)
In this data processing task, three data sets are used. Firstly, the three data sets are read and then joined. Secondly, the variables of the joined data set are converted into their proper types. Then, the data set is tidied and a new variable is created from existing variables. Furthermore, missing values are handled and a transformation is applied to normalise a variable. Finally, the outliers of a numeric variable are handled.
The data sets used are retrieved from the Australia & New Zealand Road Crash Dataset. The data sets and brief descriptions of their variables are as follows:
Crash:

- crash_id: unique character variable to identify each crash.
- lat_long: character variable holding the latitude and longitude as one compound value.
- description_id: foreign key for the description of the crash.
- casualties_id: foreign key for the casualties of the crash.

Description:

- description_id: unique variable to identify each description.
- severity: categorical variable.
- speed_limit: categorical variable.
- midblock: nominal variable of true and false.
- intersection: nominal variable of true and false.
- weather: categorical variable.
- crash_type: categorical variable.
- lighting: categorical variable.
- traffic_controls: categorical variable.
- drugs_alcohol: logical variable.

Casualties:

- casualties_id: unique variable to identify each casualty condition.
- fatalities: discrete quantitative variable.
- serious_injuries: discrete quantitative variable.
- minor_injuries: discrete quantitative variable.

The latter two data sets are joined into crash using left_join() on their respective foreign keys.
# This is the R chunk for the Data Section
# Read each source table, keeping only the columns used in this report.
crash.raw <- select(
  read.csv("./data/Crash.csv"),
  crash_id, lat_long, casualties_id, description_id
)
description.raw <- select(
  read.csv("./data/Description.csv", stringsAsFactors = TRUE),
  description_id, severity, speed_limit, midblock, intersection,
  weather, crash_type, lighting, traffic_controls, drugs_alcohol
)
casualties.raw <- select(
  read.csv("./data/Casualties.csv"),
  casualties_id, fatalities, serious_injuries, minor_injuries
)

# Attach the description and casualty details to every crash record through
# the foreign keys; the left joins keep crashes that have no matching row.
crash <- left_join(crash.raw, description.raw, by = "description_id")
crash <- left_join(crash, casualties.raw, by = "casualties_id")
head(crash)
Summarise all the variables in the data frame.
# Print the type and first values of every column in the joined data frame.
crash %>% str()
'data.frame': 1519455 obs. of 16 variables:
$ crash_id : chr "SA2012-1-21/08/2019" "SA2012-2-21/08/2019" "SA2012-3-21/08/2019" "SA2012-4-21/08/2019" ...
$ lat_long : chr "(-34.914968707994774, 138.62326191400015)" "(-34.945411892314496, 138.61069073873753)" "(-35.348782706688546, 138.4547384995269)" "(-34.91089013663556, 138.56464045685533)" ...
$ casualties_id : chr "0c" "0c" "0c" "0c" ...
$ description_id : int 0 1 2 3 4 5 6 7 8 9 ...
$ severity : Factor w/ 4 levels "fatality","minor_injury",..: 3 3 3 3 3 3 3 3 3 3 ...
$ speed_limit : Factor w/ 37 levels "","0 - 50 km/h",..: 23 17 5 23 23 23 23 23 23 21 ...
$ midblock : Factor w/ 2 levels "False","True": 2 2 2 1 1 1 2 1 1 1 ...
$ intersection : Factor w/ 2 levels "False","True": 1 1 1 2 2 2 1 2 2 2 ...
$ weather : Factor w/ 11 levels "","fine","fog",..: 2 2 2 2 2 2 2 2 2 2 ...
$ crash_type : Factor w/ 61 levels "","Collision with a fixed object",..: 42 21 34 36 36 36 53 36 36 36 ...
$ lighting : Factor w/ 7 levels "","darkness_lit",..: 5 5 5 5 5 5 5 5 5 5 ...
$ traffic_controls: Factor w/ 10 levels "","giveway_sign",..: 4 4 4 9 10 10 4 10 10 10 ...
$ drugs_alcohol : Factor w/ 2 levels "","Y": 1 1 1 1 1 1 1 1 1 1 ...
$ fatalities : num 0 0 0 0 0 0 0 0 0 0 ...
$ serious_injuries: num 0 0 0 0 0 0 0 0 0 0 ...
$ minor_injuries : num 0 0 0 0 0 0 0 0 0 0 ...
midblock, intersection and drugs_alcohol should be converted into logical variables. drugs_alcohol contains only "Y" and "", thus "Y" is treated as TRUE value and the other as FALSE.
# midblock and intersection are factors whose labels are "True"/"False";
# as.logical() converts a factor through its labels.
flag.cols <- c("midblock", "intersection")
crash[flag.cols] <- lapply(crash[flag.cols], as.logical)

# drugs_alcohol only records "Y" or an empty string: map them to the text
# forms of TRUE and FALSE first, then convert the whole vector to logical.
drugs_alcohol <- crash$drugs_alcohol %>%
  as.character() %>%
  replace(. == "Y", "TRUE") %>%
  replace(. == "", "FALSE") %>%
  as.logical()
crash$drugs_alcohol <- drugs_alcohol

# Confirm the new column types.
crash %>% str()
'data.frame': 1519455 obs. of 16 variables:
$ crash_id : chr "SA2012-1-21/08/2019" "SA2012-2-21/08/2019" "SA2012-3-21/08/2019" "SA2012-4-21/08/2019" ...
$ lat_long : chr "(-34.914968707994774, 138.62326191400015)" "(-34.945411892314496, 138.61069073873753)" "(-35.348782706688546, 138.4547384995269)" "(-34.91089013663556, 138.56464045685533)" ...
$ casualties_id : chr "0c" "0c" "0c" "0c" ...
$ description_id : int 0 1 2 3 4 5 6 7 8 9 ...
$ severity : Factor w/ 4 levels "fatality","minor_injury",..: 3 3 3 3 3 3 3 3 3 3 ...
$ speed_limit : Factor w/ 37 levels "","0 - 50 km/h",..: 23 17 5 23 23 23 23 23 23 21 ...
$ midblock : logi TRUE TRUE TRUE FALSE FALSE FALSE ...
$ intersection : logi FALSE FALSE FALSE TRUE TRUE TRUE ...
$ weather : Factor w/ 11 levels "","fine","fog",..: 2 2 2 2 2 2 2 2 2 2 ...
$ crash_type : Factor w/ 61 levels "","Collision with a fixed object",..: 42 21 34 36 36 36 53 36 36 36 ...
$ lighting : Factor w/ 7 levels "","darkness_lit",..: 5 5 5 5 5 5 5 5 5 5 ...
$ traffic_controls: Factor w/ 10 levels "","giveway_sign",..: 4 4 4 9 10 10 4 10 10 10 ...
$ drugs_alcohol : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ fatalities : num 0 0 0 0 0 0 0 0 0 0 ...
$ serious_injuries: num 0 0 0 0 0 0 0 0 0 0 ...
$ minor_injuries : num 0 0 0 0 0 0 0 0 0 0 ...
The lat_long column contains two variables, which makes the data set untidy. This column is separated into lat and lng using extract().
# Split the compound "(lat, lng)" string into separate lat and lng columns.
# Both coordinates may carry a leading minus sign: the previous pattern only
# allowed one on the latitude, so any negative longitude failed to match and
# extract() returned NA in both new columns for that row. Any amount of
# whitespace is accepted after the comma for the same reason.
# The new columns stay character because extract() is not asked to convert.
coord.pattern <- paste0(
  "\\((-?[[:digit:]]+\\.[[:digit:]]+),\\s*",
  "(-?[[:digit:]]+\\.[[:digit:]]+)\\)"
)
crash.ex <- crash %>%
  extract(lat_long, into = c("lat", "lng"), regex = coord.pattern)
head(crash.ex)
A new variable casualties is created using mutate() as a sum of fatalities, serious_injuries and minor_injuries to reflect the total casualties of crashes.
# Total casualties per crash: fatalities plus serious and minor injuries.
# A missing value in any of the three counts makes the total missing as well.
crash.nv <- crash.ex
crash.nv$casualties <- with(
  crash.ex,
  fatalities + serious_injuries + minor_injuries
)
crash.nv$casualties %>% str()
num [1:1519455] 0 0 0 0 0 0 0 0 0 0 ...
Scan the data frame for missing values.
# Count the missing values in each column.
colSums(is.na(crash.nv))
crash_id lat lng casualties_id description_id
0 66175 66175 0 0
severity speed_limit midblock intersection weather
0 0 0 0 0
crash_type lighting traffic_controls drugs_alcohol fatalities
0 0 0 0 212390
serious_injuries minor_injuries casualties
212390 212390 212390
The two variables lat and lng together define the exact place where a crash happened. So as not to affect any potential analysis of the existing geographical data, missing values in these two columns are replaced with "N/A". Missing values in the four numeric variables fatalities, serious_injuries, minor_injuries and casualties are replaced with 0.0.
# Replacement value for each column that contains missing values: a text
# marker for the two coordinate columns and zero for the four count columns.
na.fill <- list(
  lat = "N/A",
  lng = "N/A",
  fatalities = 0.0,
  serious_injuries = 0.0,
  minor_injuries = 0.0,
  casualties = 0.0
)
crash.na <- crash.nv %>% replace_na(na.fill)

# Re-count the missing values to confirm that none remain.
colSums(is.na(crash.na))
crash_id lat lng casualties_id description_id
0 0 0 0 0
severity speed_limit midblock intersection weather
0 0 0 0 0
crash_type lighting traffic_controls drugs_alcohol fatalities
0 0 0 0 0
serious_injuries minor_injuries casualties
0 0 0
To understand the current distribution of the numeric variable casualties, a histogram is drawn.
# Histogram of total casualties in 10 bins, with the counts on a log10 axis.
ggplot(crash.na, aes(x = casualties)) +
  geom_histogram(bins = 10) +
  scale_y_log10()
Given that the distribution is non-normal, a Box-Cox transformation is used. The transformed values are appended to the data set as casualties_boxcox, and another histogram is used to show the new distribution.
Using a box plot, it can be seen that there are existing outliers after the transformation.
# Box plot of the transformed casualties column.
# NOTE(review): crash.tr is not assigned anywhere in the visible source; the
# chunk that creates it (and its casualties_boxcox column) appears to be
# missing -- confirm it exists before this point.
ggplot(crash.tr, aes(y = casualties_boxcox)) +
  geom_boxplot()
To find the number of outliers, z-score is calculated.
# Standardise the transformed values and count those lying more than three
# standard deviations from the mean; missing scores are not counted.
z.scores <- scores(crash.tr$casualties_boxcox, type = "z")
sum(abs(z.scores) > 3, na.rm = TRUE)
[1] 262
As there is only a very slight chance that these outliers are input errors in this data set, capping is used to handle them. A box plot of the variable after capping is used to illustrate the result.
# Capping: values outside the box-plot fences (quartile -/+ 1.5 * IQR) are
# replaced with the 5th percentile (low side) or 95th percentile (high side).
quantiles <- quantile(
  crash.tr$casualties_boxcox,
  probs = c(0.05, 0.25, 0.75, 0.95)
)
ca.iqr <- IQR(crash.tr$casualties_boxcox)
lower.fence <- quantiles[2] - 1.5 * ca.iqr
upper.fence <- quantiles[3] + 1.5 * ca.iqr

# Cap the low side first, then the high side, on a copy of the column.
capped <- crash.tr$casualties_boxcox
capped <- replace(capped, capped < lower.fence, quantiles[1])
capped <- replace(capped, capped > upper.fence, quantiles[4])

crash.cp <- crash.tr
crash.cp$casualties_boxcox <- capped

# Box plot of the capped variable.
ggplot(crash.cp, aes(y = casualties_boxcox)) +
  geom_boxplot()