Required packages

# This is the R chunk for the required packages
library("tidyr")
library("dplyr")

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
library("stringr")
library("lubridate")

Attaching package: ‘lubridate’

The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union
library("forecast")
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Executive Summary

Data

Vicroads road accident statistics 2000 - 2005 accident data https://discover.data.vic.gov.au/dataset/crash-stats-data-extract

ACCIDENT.csv:

# This is the R chunk for the Data Section
# Download the crash-stats archive into a temporary file for this session.
file_url <- "https://vicroadsopendatastorehouse.vicroads.vic.gov.au/opendata/Road_Safety/ACCIDENT.zip"
temp <- tempfile()
download.file(url = file_url, destfile = temp)
trying URL 'https://vicroadsopendatastorehouse.vicroads.vic.gov.au/opendata/Road_Safety/ACCIDENT.zip'
Content type 'application/zip' length 45471291 bytes (43.4 MB)
==================================================
downloaded 43.4 MB
# Read the two CSV members we need straight out of the downloaded archive,
# then remove the temporary archive file.
data_accident <- read.csv(unz(temp, "ACCIDENT.csv"), strip.white = TRUE)
data_atmosphere <- read.csv(
  unz(temp, "ATMOSPHERIC_COND.csv"),
  strip.white = TRUE
)
unlink(temp)

# Preview the first rows of each raw table.
head(data_accident)
head(data_atmosphere)

Only the atmospheric conditions present at each accident are of interest; the sequence and code columns are irrelevant. Because one accident can have several conditions recorded, `pivot_wider()` is used to spread the conditions into one logical dummy column per condition. These dummy columns are then joined back onto the accident data frame, adding the atmospheric conditions to each accident. The accident ID strings are first stripped of surrounding white space to make sure they match between the two files.

# Strip surrounding white space from the join key in both tables so that
# padded accident numbers still match.
data_atmosphere <- data_atmosphere %>%
  mutate(ACCIDENT_NO = str_trim(ACCIDENT_NO))
data_accident <- data_accident %>%
  mutate(ACCIDENT_NO = str_trim(ACCIDENT_NO))

# Build one row per accident with a logical ATMOSPHERE_COND_<description>
# column per condition; conditions not recorded for an accident become FALSE.
data_atmosphere_COND <- data_atmosphere %>%
  mutate(i = TRUE) %>%
  select(-ATMOSPH_COND, -ATMOSPH_COND_SEQ) %>%
  pivot_wider(
    names_from = Atmosph.Cond.Desc,
    values_from = i,
    values_fill = FALSE,
    names_prefix = "ATMOSPHERE_COND_"
  )

# Keep only accidents that have a row in the atmospheric-condition table.
data <- inner_join(data_accident, data_atmosphere_COND, by = "ACCIDENT_NO")

head(data)

Understand

Display summary of dataframe

summary(data)
 ACCIDENT_NO        ACCIDENTDATE       ACCIDENTTIME       ACCIDENT_TYPE  
 Length:191649      Length:191649      Length:191649      Min.   :1.000  
 Class :character   Class :character   Class :character   1st Qu.:1.000  
 Mode  :character   Mode  :character   Mode  :character   Median :1.000  
                                                          Mean   :2.277  
                                                          3rd Qu.:4.000  
                                                          Max.   :9.000  
                                                                         
 Accident.Type.Desc  DAY_OF_WEEK    Day.Week.Description    DCA_CODE    
 Length:191649      Min.   :0.000   Length:191649        Min.   :100.0  
 Class :character   1st Qu.:2.000   Class :character     1st Qu.:120.0  
 Mode  :character   Median :4.000   Mode  :character     Median :130.0  
                    Mean   :4.007                        Mean   :139.3  
                    3rd Qu.:6.000                        3rd Qu.:171.0  
                    Max.   :7.000                        Max.   :199.0  
                                                                        
 DCA.Description     DIRECTORY           EDITION              PAGE          
 Length:191649      Length:191649      Length:191649      Length:191649     
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
                                                                            
 GRID_REFERENCE_X   GRID_REFERENCE_Y LIGHT_CONDITION Light.Condition.Desc
 Length:191649      Min.   : 1.000   Min.   :1.000   Length:191649       
 Class :character   1st Qu.: 4.000   1st Qu.:1.000   Class :character    
 Mode  :character   Median : 6.000   Median :1.000   Mode  :character    
                    Mean   : 6.442   Mean   :1.829                       
                    3rd Qu.: 9.000   3rd Qu.:3.000                       
                    Max.   :14.000   Max.   :9.000                       
                    NA's   :6046                                         
    NODE_ID       NO_OF_VEHICLES     NO_PERSONS     NO_PERSONS_INJ_2  NO_PERSONS_INJ_3 
 Min.   :   -10   Min.   : 1.000   Min.   : 1.000   Min.   : 0.0000   Min.   : 0.0000  
 1st Qu.: 43385   1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 0.0000   1st Qu.: 0.0000  
 Median :215525   Median : 2.000   Median : 2.000   Median : 0.0000   Median : 1.0000  
 Mean   :177631   Mean   : 1.787   Mean   : 2.413   Mean   : 0.4182   Mean   : 0.8479  
 3rd Qu.:274530   3rd Qu.: 2.000   3rd Qu.: 3.000   3rd Qu.: 1.0000   3rd Qu.: 1.0000  
 Max.   :651582   Max.   :19.000   Max.   :97.000   Max.   :17.0000   Max.   :43.0000  
                                                                                       
 NO_PERSONS_KILLED  NO_PERSONS_NOT_INJ POLICE_ATTEND   ROAD_GEOMETRY   Road.Geometry.Desc
 Min.   : 0.00000   Min.   : 0.000     Min.   :1.000   Min.   :1.000   Length:191649     
 1st Qu.: 0.00000   1st Qu.: 0.000     1st Qu.:1.000   1st Qu.:2.000   Class :character  
 Median : 0.00000   Median : 1.000     Median :1.000   Median :5.000   Mode  :character  
 Mean   : 0.02049   Mean   : 1.126     Mean   :1.274   Mean   :3.349                     
 3rd Qu.: 0.00000   3rd Qu.: 1.000     3rd Qu.:2.000   3rd Qu.:5.000                     
 Max.   :11.00000   Max.   :87.000     Max.   :9.000   Max.   :9.000                     
                                                                                         
    SEVERITY       SPEED_ZONE    ATMOSPHERE_COND_Clear ATMOSPHERE_COND_Not known
 Min.   :1.000   Min.   : 30.0   Mode :logical         Mode :logical            
 1st Qu.:2.000   1st Qu.: 60.0   FALSE:36653           FALSE:178301             
 Median :3.000   Median : 60.0   TRUE :154996          TRUE :13348              
 Mean   :2.611   Mean   :120.1                                                  
 3rd Qu.:3.000   3rd Qu.: 80.0                                                  
 Max.   :4.000   Max.   :999.0                                                  
                                                                                
 ATMOSPHERE_COND_Smoke ATMOSPHERE_COND_Strong winds ATMOSPHERE_COND_Raining
 Mode :logical         Mode :logical                Mode :logical          
 FALSE:191433          FALSE:188739                 FALSE:170806           
 TRUE :216             TRUE :2910                   TRUE :20843            
                                                                           
                                                                           
                                                                           
                                                                           
 ATMOSPHERE_COND_Dust ATMOSPHERE_COND_Fog ATMOSPHERE_COND_Snowing
 Mode :logical        Mode :logical       Mode :logical          
 FALSE:191131         FALSE:189827        FALSE:191566           
 TRUE :518            TRUE :1822          TRUE :83               
                                                                 
                                                                 
                                                                 
                                                                 

Display the structure of the data frame. Many variables do not have the correct type.

str(data)
'data.frame':   191649 obs. of  36 variables:
 $ ACCIDENT_NO                 : chr  "T20060000010" "T20060000018" "T20060000022" "T20060000023" ...
 $ ACCIDENTDATE                : chr  "13/01/2006" "13/01/2006" "14/01/2006" "14/01/2006" ...
 $ ACCIDENTTIME                : chr  "12:42:00                      " "19:10:00                      " "12:10:00                      " "11:49:00                      " ...
 $ ACCIDENT_TYPE               : int  1 1 7 1 1 1 4 4 1 2 ...
 $ Accident.Type.Desc          : chr  "Collision with vehicle" "Collision with vehicle" "Fall from or in moving vehicle" "Collision with vehicle" ...
 $ DAY_OF_WEEK                 : int  6 6 7 7 7 7 1 1 2 2 ...
 $ Day.Week.Description        : chr  "Friday" "Friday" "Saturday" "Saturday" ...
 $ DCA_CODE                    : int  113 113 190 130 121 116 171 171 140 109 ...
 $ DCA.Description             : chr  "RIGHT NEAR (INTERSECTIONS ONLY)               " "RIGHT NEAR (INTERSECTIONS ONLY)               " "FELL IN/FROM VEHICLE                    " "REAR END(VEHICLES IN SAME LANE)               " ...
 $ DIRECTORY                   : chr  "MEL" "MEL" "MEL" "MEL" ...
 $ EDITION                     : chr  "40" "40" "40" "40" ...
 $ PAGE                        : chr  "91A" "91" "169" "88" ...
 $ GRID_REFERENCE_X            : chr  "G" "H" "C" "J" ...
 $ GRID_REFERENCE_Y            : int  7 8 11 8 5 2 4 5 11 3 ...
 $ LIGHT_CONDITION             : int  1 1 1 1 1 1 1 3 1 1 ...
 $ Light.Condition.Desc        : chr  "Day" "Day" "Day" "Day" ...
 $ NODE_ID                     : int  43078 29720 203074 55462 202988 277431 203045 203047 35621 205206 ...
 $ NO_OF_VEHICLES              : int  3 2 1 2 2 2 1 1 3 1 ...
 $ NO_PERSONS                  : int  6 4 2 2 3 2 1 1 5 2 ...
 $ NO_PERSONS_INJ_2            : int  0 0 1 1 0 1 1 1 2 0 ...
 $ NO_PERSONS_INJ_3            : int  1 1 0 0 3 0 0 0 2 1 ...
 $ NO_PERSONS_KILLED           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NO_PERSONS_NOT_INJ          : int  5 3 1 1 0 1 0 0 1 1 ...
 $ POLICE_ATTEND               : int  1 1 1 1 1 1 1 1 1 1 ...
 $ ROAD_GEOMETRY               : int  1 2 5 2 5 1 5 5 2 5 ...
 $ Road.Geometry.Desc          : chr  "Cross intersection" "T intersection" "Not at intersection" "T intersection" ...
 $ SEVERITY                    : int  3 3 2 2 3 2 2 2 2 3 ...
 $ SPEED_ZONE                  : int  60 70 100 80 50 100 100 70 80 60 ...
 $ ATMOSPHERE_COND_Clear       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ ATMOSPHERE_COND_Not known   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Smoke       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Strong winds: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Raining     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Dust        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Fog         : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Snowing     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

Coerce the variables to their correct types.

# Coerce each column to the type it should have for analysis.

# Dates arrive as day/month/year text.
data$ACCIDENTDATE <- as.Date(data$ACCIDENTDATE, format = "%d/%m/%Y")

# Descriptive text columns become categorical factors.
data$Accident.Type.Desc <- as.factor(data$Accident.Type.Desc)
data$Day.Week.Description <- as.factor(data$Day.Week.Description)
# The DCA descriptions are padded with trailing spaces in the raw file; trim
# them first so the factor levels are clean and labels that differ only in
# padding collapse into a single level.
data$DCA.Description <- as.factor(str_trim(data$DCA.Description))
data$Light.Condition.Desc <- as.factor(data$Light.Condition.Desc)
data$Road.Geometry.Desc <- as.factor(data$Road.Geometry.Desc)

# Identifier-like columns are kept as text rather than numbers.
data$EDITION <- as.character(data$EDITION)
data$GRID_REFERENCE_Y <- as.character(data$GRID_REFERENCE_Y)
data$NODE_ID <- as.character(data$NODE_ID)

# Coded columns become factors; police attendance gets readable labels.
data$POLICE_ATTEND <- factor(
  data$POLICE_ATTEND,
  levels = c(1, 2, 9),
  labels = c("Yes", "No", "Unknown")
)
data$SEVERITY <- as.factor(data$SEVERITY)
data$SPEED_ZONE <- as.factor(data$SPEED_ZONE)

str(data)
'data.frame':   191649 obs. of  36 variables:
 $ ACCIDENT_NO                 : chr  "T20060000010" "T20060000018" "T20060000022" "T20060000023" ...
 $ ACCIDENTDATE                : Date, format: "2006-01-13" "2006-01-13" ...
 $ ACCIDENTTIME                : chr  "12:42:00                      " "19:10:00                      " "12:10:00                      " "11:49:00                      " ...
 $ ACCIDENT_TYPE               : int  1 1 7 1 1 1 4 4 1 2 ...
 $ Accident.Type.Desc          : Factor w/ 9 levels "Collision with a fixed object",..: 3 3 4 3 3 3 1 1 3 8 ...
 $ DAY_OF_WEEK                 : int  6 6 7 7 7 7 1 1 2 2 ...
 $ Day.Week.Description        : Factor w/ 7 levels "Friday","Monday",..: 1 1 3 3 3 3 4 4 2 2 ...
 $ DCA_CODE                    : int  113 113 190 130 121 116 171 171 140 109 ...
 $ DCA.Description             : Factor w/ 81 levels "ACCIDENT OR BROKEN DOWN                   ",..: 59 59 8 55 62 16 17 17 74 2 ...
 $ DIRECTORY                   : chr  "MEL" "MEL" "MEL" "MEL" ...
 $ EDITION                     : chr  "40" "40" "40" "40" ...
 $ PAGE                        : chr  "91A" "91" "169" "88" ...
 $ GRID_REFERENCE_X            : chr  "G" "H" "C" "J" ...
 $ GRID_REFERENCE_Y            : chr  "7" "8" "11" "8" ...
 $ LIGHT_CONDITION             : int  1 1 1 1 1 1 1 3 1 1 ...
 $ Light.Condition.Desc        : Factor w/ 7 levels "Dark No street lights",..: 5 5 5 5 5 5 5 3 5 5 ...
 $ NODE_ID                     : chr  "43078" "29720" "203074" "55462" ...
 $ NO_OF_VEHICLES              : int  3 2 1 2 2 2 1 1 3 1 ...
 $ NO_PERSONS                  : int  6 4 2 2 3 2 1 1 5 2 ...
 $ NO_PERSONS_INJ_2            : int  0 0 1 1 0 1 1 1 2 0 ...
 $ NO_PERSONS_INJ_3            : int  1 1 0 0 3 0 0 0 2 1 ...
 $ NO_PERSONS_KILLED           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NO_PERSONS_NOT_INJ          : int  5 3 1 1 0 1 0 0 1 1 ...
 $ POLICE_ATTEND               : Factor w/ 3 levels "Yes","No","Unknown": 1 1 1 1 1 1 1 1 1 1 ...
 $ ROAD_GEOMETRY               : int  1 2 5 2 5 1 5 5 2 5 ...
 $ Road.Geometry.Desc          : Factor w/ 9 levels "Cross intersection",..: 1 7 4 7 4 1 4 4 7 4 ...
 $ SEVERITY                    : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 3 2 2 2 2 3 ...
 $ SPEED_ZONE                  : Factor w/ 13 levels "30","40","50",..: 4 5 9 7 3 9 9 5 7 4 ...
 $ ATMOSPHERE_COND_Clear       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ ATMOSPHERE_COND_Not known   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Smoke       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Strong winds: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Raining     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Dust        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Fog         : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Snowing     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

Tidy & Manipulate Data I

# Drop the numeric code columns; each one has a descriptive text counterpart
# (Accident.Type.Desc, Day.Week.Description, DCA.Description,
# Light.Condition.Desc, Road.Geometry.Desc) that is kept instead.
# all_of() makes the string-based selection explicit and fails loudly if a
# column is missing. The stray trailing comma that left an empty argument in
# c() has been removed.
data_clean <- data %>%
  select(
    -all_of(
      c(
        "ACCIDENT_TYPE",
        "DAY_OF_WEEK",
        "DCA_CODE",
        "LIGHT_CONDITION",
        "ROAD_GEOMETRY"
      )
    )
  )
str(data_clean)
'data.frame':   191649 obs. of  31 variables:
 $ ACCIDENT_NO                 : chr  "T20060000010" "T20060000018" "T20060000022" "T20060000023" ...
 $ ACCIDENTDATE                : Date, format: "2006-01-13" "2006-01-13" ...
 $ ACCIDENTTIME                : chr  "12:42:00                      " "19:10:00                      " "12:10:00                      " "11:49:00                      " ...
 $ Accident.Type.Desc          : Factor w/ 9 levels "Collision with a fixed object",..: 3 3 4 3 3 3 1 1 3 8 ...
 $ Day.Week.Description        : Factor w/ 7 levels "Friday","Monday",..: 1 1 3 3 3 3 4 4 2 2 ...
 $ DCA.Description             : Factor w/ 81 levels "ACCIDENT OR BROKEN DOWN                   ",..: 59 59 8 55 62 16 17 17 74 2 ...
 $ DIRECTORY                   : chr  "MEL" "MEL" "MEL" "MEL" ...
 $ EDITION                     : chr  "40" "40" "40" "40" ...
 $ PAGE                        : chr  "91A" "91" "169" "88" ...
 $ GRID_REFERENCE_X            : chr  "G" "H" "C" "J" ...
 $ GRID_REFERENCE_Y            : chr  "7" "8" "11" "8" ...
 $ Light.Condition.Desc        : Factor w/ 7 levels "Dark No street lights",..: 5 5 5 5 5 5 5 3 5 5 ...
 $ NODE_ID                     : chr  "43078" "29720" "203074" "55462" ...
 $ NO_OF_VEHICLES              : int  3 2 1 2 2 2 1 1 3 1 ...
 $ NO_PERSONS                  : int  6 4 2 2 3 2 1 1 5 2 ...
 $ NO_PERSONS_INJ_2            : int  0 0 1 1 0 1 1 1 2 0 ...
 $ NO_PERSONS_INJ_3            : int  1 1 0 0 3 0 0 0 2 1 ...
 $ NO_PERSONS_KILLED           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NO_PERSONS_NOT_INJ          : int  5 3 1 1 0 1 0 0 1 1 ...
 $ POLICE_ATTEND               : Factor w/ 3 levels "Yes","No","Unknown": 1 1 1 1 1 1 1 1 1 1 ...
 $ Road.Geometry.Desc          : Factor w/ 9 levels "Cross intersection",..: 1 7 4 7 4 1 4 4 7 4 ...
 $ SEVERITY                    : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 3 2 2 2 2 3 ...
 $ SPEED_ZONE                  : Factor w/ 13 levels "30","40","50",..: 4 5 9 7 3 9 9 5 7 4 ...
 $ ATMOSPHERE_COND_Clear       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ ATMOSPHERE_COND_Not known   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Smoke       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Strong winds: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Raining     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Dust        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Fog         : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Snowing     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

Tidy & Manipulate Data II

# This is the R chunk for the Tidy & Manipulate Data II 

# Bucket each accident into a named time-of-day band.
#
# ACCIDENTTIME is text of the form "HH:MM:SS" padded with trailing spaces.
# The thresholds must use the same separator as the data: comparing
# "12:42:00" against a dotted "12.00.00" is decided by how the locale
# collates ":" versus "." whenever the hours match, which put e.g. 12:42 in
# the "morning" band. The time is therefore trimmed, any "." separators are
# normalised to ":", and the comparison is made against colon-separated
# thresholds, so zero-padded strings order the same way as clock times.
data_tidy <- data_clean %>%
  mutate(
    clock_time = str_replace_all(str_trim(ACCIDENTTIME), fixed("."), ":"),
    # Bands are checked in order; the first matching upper bound wins.
    # A missing time matches no band and stays NA.
    time_of_day = case_when(
      clock_time <= "05:00:00" ~ "late night",
      clock_time <= "08:30:00" ~ "early morning",
      clock_time <= "09:30:00" ~ "morning peak hour",
      clock_time <= "12:00:00" ~ "morning",
      clock_time <= "17:00:00" ~ "afternoon",
      clock_time <= "18:00:00" ~ "afternoon peak hour",
      clock_time <= "20:00:00" ~ "evening",
      clock_time <= "24:00:00" ~ "night",
      TRUE ~ NA_character_
    ),
    time_of_day = as.factor(time_of_day)
  ) %>%
  # The raw time and the helper column are no longer needed.
  select(-ACCIDENTTIME, -clock_time)
str(data_tidy)
'data.frame':   191649 obs. of  31 variables:
 $ ACCIDENT_NO                 : chr  "T20060000010" "T20060000018" "T20060000022" "T20060000023" ...
 $ ACCIDENTDATE                : Date, format: "2006-01-13" "2006-01-13" ...
 $ Accident.Type.Desc          : Factor w/ 9 levels "Collision with a fixed object",..: 3 3 4 3 3 3 1 1 3 8 ...
 $ Day.Week.Description        : Factor w/ 7 levels "Friday","Monday",..: 1 1 3 3 3 3 4 4 2 2 ...
 $ DCA.Description             : Factor w/ 81 levels "ACCIDENT OR BROKEN DOWN                   ",..: 59 59 8 55 62 16 17 17 74 2 ...
 $ DIRECTORY                   : chr  "MEL" "MEL" "MEL" "MEL" ...
 $ EDITION                     : chr  "40" "40" "40" "40" ...
 $ PAGE                        : chr  "91A" "91" "169" "88" ...
 $ GRID_REFERENCE_X            : chr  "G" "H" "C" "J" ...
 $ GRID_REFERENCE_Y            : chr  "7" "8" "11" "8" ...
 $ Light.Condition.Desc        : Factor w/ 7 levels "Dark No street lights",..: 5 5 5 5 5 5 5 3 5 5 ...
 $ NODE_ID                     : chr  "43078" "29720" "203074" "55462" ...
 $ NO_OF_VEHICLES              : int  3 2 1 2 2 2 1 1 3 1 ...
 $ NO_PERSONS                  : int  6 4 2 2 3 2 1 1 5 2 ...
 $ NO_PERSONS_INJ_2            : int  0 0 1 1 0 1 1 1 2 0 ...
 $ NO_PERSONS_INJ_3            : int  1 1 0 0 3 0 0 0 2 1 ...
 $ NO_PERSONS_KILLED           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NO_PERSONS_NOT_INJ          : int  5 3 1 1 0 1 0 0 1 1 ...
 $ POLICE_ATTEND               : Factor w/ 3 levels "Yes","No","Unknown": 1 1 1 1 1 1 1 1 1 1 ...
 $ Road.Geometry.Desc          : Factor w/ 9 levels "Cross intersection",..: 1 7 4 7 4 1 4 4 7 4 ...
 $ SEVERITY                    : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 3 2 2 2 2 3 ...
 $ SPEED_ZONE                  : Factor w/ 13 levels "30","40","50",..: 4 5 9 7 3 9 9 5 7 4 ...
 $ ATMOSPHERE_COND_Clear       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ ATMOSPHERE_COND_Not known   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Smoke       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Strong winds: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Raining     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Dust        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Fog         : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Snowing     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ time_of_day                 : Factor w/ 8 levels "afternoon","afternoon peak hour",..: 6 4 6 6 6 1 6 8 3 6 ...

Scan I

# This is the R chunk for the Scan I
# Names of the columns that contain at least one missing value.
names(data_tidy)[vapply(data_tidy, anyNA, logical(1))]
[1] "GRID_REFERENCE_Y"

Looks better

# Replace missing values in character columns with an explicit "UNKNOWN".
# This is limited to character columns so the placeholder string can never be
# pushed into a numeric, logical, date or factor column.
data_tidy <- data_tidy %>%
  mutate(
    across(where(is.character), function(x) replace_na(x, "UNKNOWN"))
  )
# Number of missing values left anywhere in the data frame (expected: 0).
sum(is.na(data_tidy))
[1] 0
# Combine the directory, edition, page and grid reference columns into a
# single dot-separated DIRECTORY_REFERENCE column (the source columns are
# removed by unite()).
data_tidy <- unite(
  data_tidy,
  col = DIRECTORY_REFERENCE,
  DIRECTORY, EDITION, PAGE, GRID_REFERENCE_X, GRID_REFERENCE_Y,
  sep = "."
)

data_tidy %>% str()
'data.frame':   191649 obs. of  27 variables:
 $ ACCIDENT_NO                 : chr  "T20060000010" "T20060000018" "T20060000022" "T20060000023" ...
 $ ACCIDENTDATE                : Date, format: "2006-01-13" "2006-01-13" ...
 $ Accident.Type.Desc          : Factor w/ 9 levels "Collision with a fixed object",..: 3 3 4 3 3 3 1 1 3 8 ...
 $ Day.Week.Description        : Factor w/ 7 levels "Friday","Monday",..: 1 1 3 3 3 3 4 4 2 2 ...
 $ DCA.Description             : Factor w/ 81 levels "ACCIDENT OR BROKEN DOWN                   ",..: 59 59 8 55 62 16 17 17 74 2 ...
 $ DIRECTORY_REFERENCE         : chr  "MEL.40.91A.G.7" "MEL.40.91.H.8" "MEL.40.169.C.11" "MEL.40.88.J.8" ...
 $ Light.Condition.Desc        : Factor w/ 7 levels "Dark No street lights",..: 5 5 5 5 5 5 5 3 5 5 ...
 $ NODE_ID                     : chr  "43078" "29720" "203074" "55462" ...
 $ NO_OF_VEHICLES              : int  3 2 1 2 2 2 1 1 3 1 ...
 $ NO_PERSONS                  : int  6 4 2 2 3 2 1 1 5 2 ...
 $ NO_PERSONS_INJ_2            : int  0 0 1 1 0 1 1 1 2 0 ...
 $ NO_PERSONS_INJ_3            : int  1 1 0 0 3 0 0 0 2 1 ...
 $ NO_PERSONS_KILLED           : int  0 0 0 0 0 0 0 0 0 0 ...
 $ NO_PERSONS_NOT_INJ          : int  5 3 1 1 0 1 0 0 1 1 ...
 $ POLICE_ATTEND               : Factor w/ 3 levels "Yes","No","Unknown": 1 1 1 1 1 1 1 1 1 1 ...
 $ Road.Geometry.Desc          : Factor w/ 9 levels "Cross intersection",..: 1 7 4 7 4 1 4 4 7 4 ...
 $ SEVERITY                    : Factor w/ 4 levels "1","2","3","4": 3 3 2 2 3 2 2 2 2 3 ...
 $ SPEED_ZONE                  : Factor w/ 13 levels "30","40","50",..: 4 5 9 7 3 9 9 5 7 4 ...
 $ ATMOSPHERE_COND_Clear       : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ ATMOSPHERE_COND_Not known   : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Smoke       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Strong winds: logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Raining     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Dust        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Fog         : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ ATMOSPHERE_COND_Snowing     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
 $ time_of_day                 : Factor w/ 8 levels "afternoon","afternoon peak hour",..: 6 4 6 6 6 1 6 8 3 6 ...

Display head of tidy dataframe

head(data_tidy)

Scan II

Outliers are found in almost all of the count variables. This is a good sign: it means that most accidents involve very few people, so the rarer large accidents stand out as outliers.

# This is the R chunk for the Scan II
# Draw one boxplot per count column, in the order listed; each entry maps a
# column name to its plot title.
scan_titles <- c(
  NO_OF_VEHICLES = "No. of vehicles involved",
  NO_PERSONS = "No. of people involved",
  NO_PERSONS_KILLED = "No. of people with fatal injuries",
  NO_PERSONS_INJ_2 = "No. of people with critical injuries",
  NO_PERSONS_INJ_3 = "No. of people with other injuries",
  NO_PERSONS_NOT_INJ = "No. of people with non injuries"
)

for (count_column in names(scan_titles)) {
  boxplot(data_tidy[[count_column]], main = scan_titles[[count_column]])
}

Will try BoxCox transformations to clean it up.

Transform

The remaining variables have outliers, but the figures are plausible, so a Box-Cox transformation is applied to try to reduce the outliers and skewness.

Multi-car collisions still stand out after the Box-Cox transformation.

# This is the R chunk for the Transform Section
# Box-Cox transform each count column in place (lambda chosen automatically
# by forecast::BoxCox) and plot the transformed values. Each entry maps a
# column name to the title of its plot.
boxcox_titles <- c(
  NO_OF_VEHICLES = "No. of vehicles BoxCox",
  NO_PERSONS = "No. of people BoxCox",
  NO_PERSONS_KILLED = "No. of people with fatal injuries BoxCox",
  NO_PERSONS_INJ_2 = "No. of people with serious injuries BoxCox",
  NO_PERSONS_INJ_3 = "No. of people with other injuries BoxCox",
  NO_PERSONS_NOT_INJ = "No. of people with non injuries BoxCox"
)

for (count_column in names(boxcox_titles)) {
  # Overwrite the raw counts with their transformed values, then plot them.
  data_tidy[[count_column]] <- BoxCox(
    data_tidy[[count_column]],
    lambda = "auto"
  )
  boxplot(data_tidy[[count_column]], main = boxcox_titles[[count_column]])
}



