# This is the R chunk for the required packages
library("tidyr")
library("dplyr")
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library("stringr")
library("lubridate")
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
library("forecast")
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
VicRoads road accident statistics, 2000 - 2005 accident data: https://discover.data.vic.gov.au/dataset/crash-stats-data-extract
ACCIDENT.csv:
# This is the R chunk for the Data Section
# Fetch the zipped crash-stats extract into a throwaway temporary file.
file_url <- "https://vicroadsopendatastorehouse.vicroads.vic.gov.au/opendata/Road_Safety/ACCIDENT.zip"
temp <- tempfile()
download.file(url = file_url, destfile = temp)
trying URL 'https://vicroadsopendatastorehouse.vicroads.vic.gov.au/opendata/Road_Safety/ACCIDENT.zip'
Content type 'application/zip' length 45471291 bytes (43.4 MB)
==================================================
downloaded 43.4 MB
# Read the two CSV members straight out of the downloaded archive, then
# delete the temporary zip now that both tables are in memory.
data_accident <- read.csv(unz(temp, "ACCIDENT.csv"), strip.white = TRUE)
data_atmosphere <- read.csv(unz(temp, "ATMOSPHERIC_COND.csv"), strip.white = TRUE)
unlink(temp)

# Preview the first rows of each table.
head(data_accident)
head(data_atmosphere)
We are only interested in the atmospheric conditions present for each accident; the condition sequence and condition id are irrelevant. Because one accident can have several conditions present, pivot_wider was used to separate the conditions out into one logical (dummy) variable per condition. The dummy variables were then joined back to the accidents, adding the atmospheric conditions to the accident dataframe. The ACCIDENT_NO id strings were stripped of white space first to make sure they matched between the two files.
# Strip surrounding white space from the join key in both tables so the
# accident numbers compare equal.
data_atmosphere$ACCIDENT_NO <- str_trim(data_atmosphere$ACCIDENT_NO, side = "both")
data_accident$ACCIDENT_NO <- str_trim(data_accident$ACCIDENT_NO, side = "both")

# One row per accident: every distinct condition description becomes its own
# logical column (TRUE when recorded for that accident, FALSE otherwise).
data_atmosphere_COND <- data_atmosphere %>%
  mutate(i = TRUE) %>%
  select(-ATMOSPH_COND, -ATMOSPH_COND_SEQ) %>%
  pivot_wider(
    names_from = Atmosph.Cond.Desc,
    values_from = i,
    values_fill = FALSE,
    names_prefix = "ATMOSPHERE_COND_"
  )

# Attach the condition flags to the accident records; the inner join keeps
# only accidents present in both tables.
data <- inner_join(data_accident, data_atmosphere_COND, by = "ACCIDENT_NO")
head(data)
Display summary of dataframe
# Per-column summary statistics of the joined data.
summary(data)
ACCIDENT_NO ACCIDENTDATE ACCIDENTTIME ACCIDENT_TYPE
Length:191649 Length:191649 Length:191649 Min. :1.000
Class :character Class :character Class :character 1st Qu.:1.000
Mode :character Mode :character Mode :character Median :1.000
Mean :2.277
3rd Qu.:4.000
Max. :9.000
Accident.Type.Desc DAY_OF_WEEK Day.Week.Description DCA_CODE
Length:191649 Min. :0.000 Length:191649 Min. :100.0
Class :character 1st Qu.:2.000 Class :character 1st Qu.:120.0
Mode :character Median :4.000 Mode :character Median :130.0
Mean :4.007 Mean :139.3
3rd Qu.:6.000 3rd Qu.:171.0
Max. :7.000 Max. :199.0
DCA.Description DIRECTORY EDITION PAGE
Length:191649 Length:191649 Length:191649 Length:191649
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
GRID_REFERENCE_X GRID_REFERENCE_Y LIGHT_CONDITION Light.Condition.Desc
Length:191649 Min. : 1.000 Min. :1.000 Length:191649
Class :character 1st Qu.: 4.000 1st Qu.:1.000 Class :character
Mode :character Median : 6.000 Median :1.000 Mode :character
Mean : 6.442 Mean :1.829
3rd Qu.: 9.000 3rd Qu.:3.000
Max. :14.000 Max. :9.000
NA's :6046
NODE_ID NO_OF_VEHICLES NO_PERSONS NO_PERSONS_INJ_2 NO_PERSONS_INJ_3
Min. : -10 Min. : 1.000 Min. : 1.000 Min. : 0.0000 Min. : 0.0000
1st Qu.: 43385 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 0.0000 1st Qu.: 0.0000
Median :215525 Median : 2.000 Median : 2.000 Median : 0.0000 Median : 1.0000
Mean :177631 Mean : 1.787 Mean : 2.413 Mean : 0.4182 Mean : 0.8479
3rd Qu.:274530 3rd Qu.: 2.000 3rd Qu.: 3.000 3rd Qu.: 1.0000 3rd Qu.: 1.0000
Max. :651582 Max. :19.000 Max. :97.000 Max. :17.0000 Max. :43.0000
NO_PERSONS_KILLED NO_PERSONS_NOT_INJ POLICE_ATTEND ROAD_GEOMETRY Road.Geometry.Desc
Min. : 0.00000 Min. : 0.000 Min. :1.000 Min. :1.000 Length:191649
1st Qu.: 0.00000 1st Qu.: 0.000 1st Qu.:1.000 1st Qu.:2.000 Class :character
Median : 0.00000 Median : 1.000 Median :1.000 Median :5.000 Mode :character
Mean : 0.02049 Mean : 1.126 Mean :1.274 Mean :3.349
3rd Qu.: 0.00000 3rd Qu.: 1.000 3rd Qu.:2.000 3rd Qu.:5.000
Max. :11.00000 Max. :87.000 Max. :9.000 Max. :9.000
SEVERITY SPEED_ZONE ATMOSPHERE_COND_Clear ATMOSPHERE_COND_Not known
Min. :1.000 Min. : 30.0 Mode :logical Mode :logical
1st Qu.:2.000 1st Qu.: 60.0 FALSE:36653 FALSE:178301
Median :3.000 Median : 60.0 TRUE :154996 TRUE :13348
Mean :2.611 Mean :120.1
3rd Qu.:3.000 3rd Qu.: 80.0
Max. :4.000 Max. :999.0
ATMOSPHERE_COND_Smoke ATMOSPHERE_COND_Strong winds ATMOSPHERE_COND_Raining
Mode :logical Mode :logical Mode :logical
FALSE:191433 FALSE:188739 FALSE:170806
TRUE :216 TRUE :2910 TRUE :20843
ATMOSPHERE_COND_Dust ATMOSPHERE_COND_Fog ATMOSPHERE_COND_Snowing
Mode :logical Mode :logical Mode :logical
FALSE:191131 FALSE:189827 FALSE:191566
TRUE :518 TRUE :1822 TRUE :83
Display the structure of the dataframe; many variables are not of the correct type.
# Column types of the joined data before any coercion.
str(data)
'data.frame': 191649 obs. of 36 variables:
$ ACCIDENT_NO : chr "T20060000010" "T20060000018" "T20060000022" "T20060000023" ...
$ ACCIDENTDATE : chr "13/01/2006" "13/01/2006" "14/01/2006" "14/01/2006" ...
$ ACCIDENTTIME : chr "12:42:00 " "19:10:00 " "12:10:00 " "11:49:00 " ...
$ ACCIDENT_TYPE : int 1 1 7 1 1 1 4 4 1 2 ...
$ Accident.Type.Desc : chr "Collision with vehicle" "Collision with vehicle" "Fall from or in moving vehicle" "Collision with vehicle" ...
$ DAY_OF_WEEK : int 6 6 7 7 7 7 1 1 2 2 ...
$ Day.Week.Description : chr "Friday" "Friday" "Saturday" "Saturday" ...
$ DCA_CODE : int 113 113 190 130 121 116 171 171 140 109 ...
$ DCA.Description : chr "RIGHT NEAR (INTERSECTIONS ONLY) " "RIGHT NEAR (INTERSECTIONS ONLY) " "FELL IN/FROM VEHICLE " "REAR END(VEHICLES IN SAME LANE) " ...
$ DIRECTORY : chr "MEL" "MEL" "MEL" "MEL" ...
$ EDITION : chr "40" "40" "40" "40" ...
$ PAGE : chr "91A" "91" "169" "88" ...
$ GRID_REFERENCE_X : chr "G" "H" "C" "J" ...
$ GRID_REFERENCE_Y : int 7 8 11 8 5 2 4 5 11 3 ...
$ LIGHT_CONDITION : int 1 1 1 1 1 1 1 3 1 1 ...
$ Light.Condition.Desc : chr "Day" "Day" "Day" "Day" ...
$ NODE_ID : int 43078 29720 203074 55462 202988 277431 203045 203047 35621 205206 ...
$ NO_OF_VEHICLES : int 3 2 1 2 2 2 1 1 3 1 ...
$ NO_PERSONS : int 6 4 2 2 3 2 1 1 5 2 ...
$ NO_PERSONS_INJ_2 : int 0 0 1 1 0 1 1 1 2 0 ...
$ NO_PERSONS_INJ_3 : int 1 1 0 0 3 0 0 0 2 1 ...
$ NO_PERSONS_KILLED : int 0 0 0 0 0 0 0 0 0 0 ...
$ NO_PERSONS_NOT_INJ : int 5 3 1 1 0 1 0 0 1 1 ...
$ POLICE_ATTEND : int 1 1 1 1 1 1 1 1 1 1 ...
$ ROAD_GEOMETRY : int 1 2 5 2 5 1 5 5 2 5 ...
$ Road.Geometry.Desc : chr "Cross intersection" "T intersection" "Not at intersection" "T intersection" ...
$ SEVERITY : int 3 3 2 2 3 2 2 2 2 3 ...
$ SPEED_ZONE : int 60 70 100 80 50 100 100 70 80 60 ...
$ ATMOSPHERE_COND_Clear : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ ATMOSPHERE_COND_Not known : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Smoke : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Strong winds: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Raining : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Dust : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Fog : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Snowing : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
Coerce variables to the types they should be.
# Coerce each column to its intended type in a single pass:
#   - the date string (day/month/year) becomes a Date,
#   - descriptive and categorical columns become factors,
#   - identifier-like columns become character.
data <- data %>%
  mutate(
    ACCIDENTDATE = as.Date(ACCIDENTDATE, "%d/%m/%Y"),
    Accident.Type.Desc = as.factor(Accident.Type.Desc),
    Day.Week.Description = as.factor(Day.Week.Description),
    DCA.Description = as.factor(DCA.Description),
    EDITION = as.character(EDITION),
    GRID_REFERENCE_Y = as.character(GRID_REFERENCE_Y),
    Light.Condition.Desc = as.factor(Light.Condition.Desc),
    NODE_ID = as.character(NODE_ID),
    # Codes 1 / 2 / 9 are relabelled; any other code becomes NA.
    POLICE_ATTEND = factor(
      POLICE_ATTEND,
      levels = c(1, 2, 9),
      labels = c("Yes", "No", "Unknown")
    ),
    Road.Geometry.Desc = as.factor(Road.Geometry.Desc),
    SEVERITY = as.factor(SEVERITY),
    SPEED_ZONE = as.factor(SPEED_ZONE)
  )
str(data)
'data.frame': 191649 obs. of 36 variables:
$ ACCIDENT_NO : chr "T20060000010" "T20060000018" "T20060000022" "T20060000023" ...
$ ACCIDENTDATE : Date, format: "2006-01-13" "2006-01-13" ...
$ ACCIDENTTIME : chr "12:42:00 " "19:10:00 " "12:10:00 " "11:49:00 " ...
$ ACCIDENT_TYPE : int 1 1 7 1 1 1 4 4 1 2 ...
$ Accident.Type.Desc : Factor w/ 9 levels "Collision with a fixed object",..: 3 3 4 3 3 3 1 1 3 8 ...
$ DAY_OF_WEEK : int 6 6 7 7 7 7 1 1 2 2 ...
$ Day.Week.Description : Factor w/ 7 levels "Friday","Monday",..: 1 1 3 3 3 3 4 4 2 2 ...
$ DCA_CODE : int 113 113 190 130 121 116 171 171 140 109 ...
$ DCA.Description : Factor w/ 81 levels "ACCIDENT OR BROKEN DOWN ",..: 59 59 8 55 62 16 17 17 74 2 ...
$ DIRECTORY : chr "MEL" "MEL" "MEL" "MEL" ...
$ EDITION : chr "40" "40" "40" "40" ...
$ PAGE : chr "91A" "91" "169" "88" ...
$ GRID_REFERENCE_X : chr "G" "H" "C" "J" ...
$ GRID_REFERENCE_Y : chr "7" "8" "11" "8" ...
$ LIGHT_CONDITION : int 1 1 1 1 1 1 1 3 1 1 ...
$ Light.Condition.Desc : Factor w/ 7 levels "Dark No street lights",..: 5 5 5 5 5 5 5 3 5 5 ...
$ NODE_ID : chr "43078" "29720" "203074" "55462" ...
$ NO_OF_VEHICLES : int 3 2 1 2 2 2 1 1 3 1 ...
$ NO_PERSONS : int 6 4 2 2 3 2 1 1 5 2 ...
$ NO_PERSONS_INJ_2 : int 0 0 1 1 0 1 1 1 2 0 ...
$ NO_PERSONS_INJ_3 : int 1 1 0 0 3 0 0 0 2 1 ...
$ NO_PERSONS_KILLED : int 0 0 0 0 0 0 0 0 0 0 ...
$ NO_PERSONS_NOT_INJ : int 5 3 1 1 0 1 0 0 1 1 ...
$ POLICE_ATTEND : Factor w/ 3 levels "Yes","No","Unknown": 1 1 1 1 1 1 1 1 1 1 ...
$ ROAD_GEOMETRY : int 1 2 5 2 5 1 5 5 2 5 ...
$ Road.Geometry.Desc : Factor w/ 9 levels "Cross intersection",..: 1 7 4 7 4 1 4 4 7 4 ...
$ SEVERITY : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 3 2 2 2 2 3 ...
$ SPEED_ZONE : Factor w/ 13 levels "30","40","50",..: 4 5 9 7 3 9 9 5 7 4 ...
$ ATMOSPHERE_COND_Clear : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ ATMOSPHERE_COND_Not known : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Smoke : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Strong winds: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Raining : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Dust : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Fog : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Snowing : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
# data_atmosphere did not contain one accident observation per row; this was
# handled earlier by pivoting it wider (one logical column per condition)
# before it was joined to data_accident.
# select
# Drop the numeric code columns; each has a descriptive counterpart
# (Accident.Type.Desc, Day.Week.Description, DCA.Description,
# Light.Condition.Desc, Road.Geometry.Desc) that is kept.
data_clean <- data %>%
  select(
    -c(
      "ACCIDENT_TYPE",
      "DAY_OF_WEEK",
      "DCA_CODE",
      "LIGHT_CONDITION",
      "ROAD_GEOMETRY"
    )
  )
data_clean %>% str()
'data.frame': 191649 obs. of 31 variables:
$ ACCIDENT_NO : chr "T20060000010" "T20060000018" "T20060000022" "T20060000023" ...
$ ACCIDENTDATE : Date, format: "2006-01-13" "2006-01-13" ...
$ ACCIDENTTIME : chr "12:42:00 " "19:10:00 " "12:10:00 " "11:49:00 " ...
$ Accident.Type.Desc : Factor w/ 9 levels "Collision with a fixed object",..: 3 3 4 3 3 3 1 1 3 8 ...
$ Day.Week.Description : Factor w/ 7 levels "Friday","Monday",..: 1 1 3 3 3 3 4 4 2 2 ...
$ DCA.Description : Factor w/ 81 levels "ACCIDENT OR BROKEN DOWN ",..: 59 59 8 55 62 16 17 17 74 2 ...
$ DIRECTORY : chr "MEL" "MEL" "MEL" "MEL" ...
$ EDITION : chr "40" "40" "40" "40" ...
$ PAGE : chr "91A" "91" "169" "88" ...
$ GRID_REFERENCE_X : chr "G" "H" "C" "J" ...
$ GRID_REFERENCE_Y : chr "7" "8" "11" "8" ...
$ Light.Condition.Desc : Factor w/ 7 levels "Dark No street lights",..: 5 5 5 5 5 5 5 3 5 5 ...
$ NODE_ID : chr "43078" "29720" "203074" "55462" ...
$ NO_OF_VEHICLES : int 3 2 1 2 2 2 1 1 3 1 ...
$ NO_PERSONS : int 6 4 2 2 3 2 1 1 5 2 ...
$ NO_PERSONS_INJ_2 : int 0 0 1 1 0 1 1 1 2 0 ...
$ NO_PERSONS_INJ_3 : int 1 1 0 0 3 0 0 0 2 1 ...
$ NO_PERSONS_KILLED : int 0 0 0 0 0 0 0 0 0 0 ...
$ NO_PERSONS_NOT_INJ : int 5 3 1 1 0 1 0 0 1 1 ...
$ POLICE_ATTEND : Factor w/ 3 levels "Yes","No","Unknown": 1 1 1 1 1 1 1 1 1 1 ...
$ Road.Geometry.Desc : Factor w/ 9 levels "Cross intersection",..: 1 7 4 7 4 1 4 4 7 4 ...
$ SEVERITY : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 3 2 2 2 2 3 ...
$ SPEED_ZONE : Factor w/ 13 levels "30","40","50",..: 4 5 9 7 3 9 9 5 7 4 ...
$ ATMOSPHERE_COND_Clear : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ ATMOSPHERE_COND_Not known : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Smoke : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Strong winds: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Raining : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Dust : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Fog : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Snowing : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
ACCIDENTTIME was binned into time_of_day using mutate, then removed.
# This is the R chunk for the Tidy & Manipulate Data II
# Bin ACCIDENTTIME into a time_of_day factor, then drop the raw time column.
#
# The raw values are colon-separated strings with a trailing blank (for
# example "12:42:00 "). Comparing them as strings against dot-separated
# cut-offs such as "12.00.00" makes the result depend on how the current
# locale orders ":" and ".", so times inside a boundary hour can land in the
# wrong bin. The time is therefore parsed to seconds since midnight and
# compared numerically. Upper bounds are inclusive, as before:
#   <= 05:00 late night        <= 08:30 early morning
#   <= 09:30 morning peak hour <= 12:00 morning
#   <= 17:00 afternoon         <= 18:00 afternoon peak hour
#   <= 20:00 evening           <= 24:00 night
# Anything unparsable or out of range becomes NA.
data_tidy <- data_clean %>%
  mutate(
    time_secs = period_to_seconds(hms(str_trim(ACCIDENTTIME))),
    time_of_day = case_when(
      time_secs <= 5 * 3600 ~ "late night",
      time_secs <= 8.5 * 3600 ~ "early morning",
      time_secs <= 9.5 * 3600 ~ "morning peak hour",
      time_secs <= 12 * 3600 ~ "morning",
      time_secs <= 17 * 3600 ~ "afternoon",
      time_secs <= 18 * 3600 ~ "afternoon peak hour",
      time_secs <= 20 * 3600 ~ "evening",
      time_secs <= 24 * 3600 ~ "night",
      TRUE ~ NA_character_
    ),
    # as.factor keeps the same (alphabetical) level order as before.
    time_of_day = as.factor(time_of_day)
  ) %>%
  # time_secs is only a helper for the binning above.
  select(-c("ACCIDENTTIME", "time_secs"))
data_tidy %>% str()
'data.frame': 191649 obs. of 31 variables:
$ ACCIDENT_NO : chr "T20060000010" "T20060000018" "T20060000022" "T20060000023" ...
$ ACCIDENTDATE : Date, format: "2006-01-13" "2006-01-13" ...
$ Accident.Type.Desc : Factor w/ 9 levels "Collision with a fixed object",..: 3 3 4 3 3 3 1 1 3 8 ...
$ Day.Week.Description : Factor w/ 7 levels "Friday","Monday",..: 1 1 3 3 3 3 4 4 2 2 ...
$ DCA.Description : Factor w/ 81 levels "ACCIDENT OR BROKEN DOWN ",..: 59 59 8 55 62 16 17 17 74 2 ...
$ DIRECTORY : chr "MEL" "MEL" "MEL" "MEL" ...
$ EDITION : chr "40" "40" "40" "40" ...
$ PAGE : chr "91A" "91" "169" "88" ...
$ GRID_REFERENCE_X : chr "G" "H" "C" "J" ...
$ GRID_REFERENCE_Y : chr "7" "8" "11" "8" ...
$ Light.Condition.Desc : Factor w/ 7 levels "Dark No street lights",..: 5 5 5 5 5 5 5 3 5 5 ...
$ NODE_ID : chr "43078" "29720" "203074" "55462" ...
$ NO_OF_VEHICLES : int 3 2 1 2 2 2 1 1 3 1 ...
$ NO_PERSONS : int 6 4 2 2 3 2 1 1 5 2 ...
$ NO_PERSONS_INJ_2 : int 0 0 1 1 0 1 1 1 2 0 ...
$ NO_PERSONS_INJ_3 : int 1 1 0 0 3 0 0 0 2 1 ...
$ NO_PERSONS_KILLED : int 0 0 0 0 0 0 0 0 0 0 ...
$ NO_PERSONS_NOT_INJ : int 5 3 1 1 0 1 0 0 1 1 ...
$ POLICE_ATTEND : Factor w/ 3 levels "Yes","No","Unknown": 1 1 1 1 1 1 1 1 1 1 ...
$ Road.Geometry.Desc : Factor w/ 9 levels "Cross intersection",..: 1 7 4 7 4 1 4 4 7 4 ...
$ SEVERITY : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 3 2 2 2 2 3 ...
$ SPEED_ZONE : Factor w/ 13 levels "30","40","50",..: 4 5 9 7 3 9 9 5 7 4 ...
$ ATMOSPHERE_COND_Clear : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ ATMOSPHERE_COND_Not known : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Smoke : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Strong winds: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Raining : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Dust : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Fog : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Snowing : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ time_of_day : Factor w/ 8 levels "afternoon","afternoon peak hour",..: 6 4 6 6 6 1 6 8 3 6 ...
NA values were found in the components that make up DIRECTORY_REFERENCE; they are filled with "UNKNOWN" to indicate the value was missing.
# This is the R chunk for the Scan I
# Names of the columns that contain at least one missing value.
names(data_tidy)[vapply(data_tidy, anyNA, logical(1))]
[1] "GRID_REFERENCE_Y"
Looks better
# Fill missing values in the directory-reference components with "UNKNOWN".
# The replacement is restricted to these character columns: assigning the
# string across the whole data frame would also hit factor, Date, integer or
# logical columns if they ever contained NA, coercing them or producing
# invalid factor levels.
data_tidy <- data_tidy %>%
  replace_na(list(
    DIRECTORY = "UNKNOWN",
    EDITION = "UNKNOWN",
    PAGE = "UNKNOWN",
    GRID_REFERENCE_X = "UNKNOWN",
    GRID_REFERENCE_Y = "UNKNOWN"
  ))
# Number of missing values left anywhere in the data frame.
sum(is.na(data_tidy))
[1] 0
# unite
# Combine the five directory components into a single DIRECTORY_REFERENCE
# column, joined with ".", e.g. "MEL.40.91A.G.7". The component columns are
# removed by unite(). The result is assigned back to data_tidy so that the
# structure printed below reflects the united column.
data_tidy <- data_tidy %>%
  unite(
    DIRECTORY_REFERENCE,
    DIRECTORY,
    EDITION,
    PAGE,
    GRID_REFERENCE_X,
    GRID_REFERENCE_Y,
    sep = "."
  )
str(data_tidy)
'data.frame': 191649 obs. of 27 variables:
$ ACCIDENT_NO : chr "T20060000010" "T20060000018" "T20060000022" "T20060000023" ...
$ ACCIDENTDATE : Date, format: "2006-01-13" "2006-01-13" ...
$ Accident.Type.Desc : Factor w/ 9 levels "Collision with a fixed object",..: 3 3 4 3 3 3 1 1 3 8 ...
$ Day.Week.Description : Factor w/ 7 levels "Friday","Monday",..: 1 1 3 3 3 3 4 4 2 2 ...
$ DCA.Description : Factor w/ 81 levels "ACCIDENT OR BROKEN DOWN ",..: 59 59 8 55 62 16 17 17 74 2 ...
$ DIRECTORY_REFERENCE : chr "MEL.40.91A.G.7" "MEL.40.91.H.8" "MEL.40.169.C.11" "MEL.40.88.J.8" ...
$ Light.Condition.Desc : Factor w/ 7 levels "Dark No street lights",..: 5 5 5 5 5 5 5 3 5 5 ...
$ NODE_ID : chr "43078" "29720" "203074" "55462" ...
$ NO_OF_VEHICLES : int 3 2 1 2 2 2 1 1 3 1 ...
$ NO_PERSONS : int 6 4 2 2 3 2 1 1 5 2 ...
$ NO_PERSONS_INJ_2 : int 0 0 1 1 0 1 1 1 2 0 ...
$ NO_PERSONS_INJ_3 : int 1 1 0 0 3 0 0 0 2 1 ...
$ NO_PERSONS_KILLED : int 0 0 0 0 0 0 0 0 0 0 ...
$ NO_PERSONS_NOT_INJ : int 5 3 1 1 0 1 0 0 1 1 ...
$ POLICE_ATTEND : Factor w/ 3 levels "Yes","No","Unknown": 1 1 1 1 1 1 1 1 1 1 ...
$ Road.Geometry.Desc : Factor w/ 9 levels "Cross intersection",..: 1 7 4 7 4 1 4 4 7 4 ...
$ SEVERITY : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 3 2 2 2 2 3 ...
$ SPEED_ZONE : Factor w/ 13 levels "30","40","50",..: 4 5 9 7 3 9 9 5 7 4 ...
$ ATMOSPHERE_COND_Clear : logi TRUE TRUE TRUE TRUE TRUE TRUE ...
$ ATMOSPHERE_COND_Not known : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Smoke : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Strong winds: logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Raining : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Dust : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Fog : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ ATMOSPHERE_COND_Snowing : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
$ time_of_day : Factor w/ 8 levels "afternoon","afternoon peak hour",..: 6 4 6 6 6 1 6 8 3 6 ...
Display head of tidy dataframe
# Preview the first rows of the tidied data.
head(data_tidy)
Outliers are found in almost all cases. This is a good sign: it means most accidents involve very few people.
# This is the R chunk for the Scan II
# Boxplots of the raw count columns, used to look for outliers.
# The NO_PERSONS_INJ_2 title says "serious injuries" so that it agrees with
# the title used for the same column in the Transform section.
boxplot(data_tidy$NO_OF_VEHICLES, main = "No. of vehicles involved")
boxplot(data_tidy$NO_PERSONS, main = "No. of people involved")
boxplot(data_tidy$NO_PERSONS_KILLED, main = "No. of people with fatal injuries")
boxplot(data_tidy$NO_PERSONS_INJ_2, main = "No. of people with serious injuries")
boxplot(data_tidy$NO_PERSONS_INJ_3, main = "No. of people with other injuries")
boxplot(data_tidy$NO_PERSONS_NOT_INJ, main = "No. of people with non injuries")
We will try BoxCox transformations to clean it up.
The rest seem to have outliers, but the figures are plausible, so we try to reduce the outliers/skewness with BoxCox.
Multi-car collisions still stand out after BoxCox.
# This is the R chunk for the Transform Section
# Box-Cox transform each count column in place (lambda chosen automatically
# by forecast::BoxCox) and draw a boxplot of the transformed values.
# The raw counts in data_tidy are overwritten by the transformed values.
# NOTE(review): several of these columns have a minimum of 0 in the summary
# above - confirm Box-Cox is appropriate for them.
boxcox_titles <- c(
  NO_OF_VEHICLES = "No. of vehicles BoxCox",
  NO_PERSONS = "No. of people BoxCox",
  NO_PERSONS_KILLED = "No. of people with fatal injuries BoxCox",
  NO_PERSONS_INJ_2 = "No. of people with serious injuries BoxCox",
  NO_PERSONS_INJ_3 = "No. of people with other injuries BoxCox",
  NO_PERSONS_NOT_INJ = "No. of people with non injuries BoxCox"
)
for (count_col in names(boxcox_titles)) {
  data_tidy[[count_col]] <- BoxCox(data_tidy[[count_col]], lambda = "auto")
  boxplot(data_tidy[[count_col]], main = boxcox_titles[[count_col]])
}