Required packages

This chunk contains all the neccessary packages required to obtain the desired results.

library(utils)
library(outliers)
library(editrules)
library(lubridate)
library(dplyr)

Executive Summary

The datasets used contain the 10 year history of Accidents in UK involving various variables. For the analysis, we have used only 2 files from this data which holds the information about accident and the corresponding vehicle involved. After which we have merged these two files based on the accident index. There are multiple data types in this dataset such as chr, int and num data types as well as Dates which were in chr (character) format and hence converted it to date format. Then, we extracted Year from the date variable and also the population of UK is merged with the data for performing neccessary transformation on the population statistics.

We then factorise the Variables like sex of the driver and speed limit. Out of the multiple available variables one variable is made factor, ordered and labelled i.e. Driver’s home area type - Rural, Urban etc.

The data is already in tidy format with each variable forming a column, each observation forming a row and each type of observational unit forming a table, therefore, no specific operations are performed to tidy the data.

A new variable is created which is used to calculate the average casualities per accident using two variables namely number of casualities and number of vehicles involved in an accident.

While scanning the data we have also defined some rules to test inconsistencies, it is concluded that there are no inconsistencies and also no missing, infinte or NAN values in the data. Population variable is then scanned for outliers and Normalisation techniques are applied on it.

Data

The datasets provide details about the circumstances of an accident from 2005 to 2014. The statistics relate only to personal injury accidents on public roads that are reported to the police, and subsequently recorded.

We have 3 data files and a lookup file-

  1. Accident file: Contains information about accident severity, weather, location, date, hour, day of week, road type…
  2. Vehicle file: Contains information about vehicle type, vehicle model, engine size, driver sex, driver age, car age…
  3. Casualty file: Contains information about casualty severity, age, sex social class, casualty type, pedestrian or car passenger…
  4. Lookup file: contains the text description of all variable code in the three files

Source : https://www.kaggle.com/benoit72/uk-accidents-10-years-history-with-many-variables OGL:Open Government License http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/

Also we have a differnt dataset that contain Population and Urban Population statistics of the United Kingdom from year 1960 to 2016.

Source: https://data.worldbank.org/country/united-kingdom

setwd("F:\\RMIT\\Data Preproceessing\\Assignment 3\\uk-accidents-10-years-history-with-many-variables")

# Loading datasets here, we will only be working with Accident and Vehicles file. 
# Also, to add a numeric element for transformations we are adding UK yearly population to the same.

Accidents <- read.csv("Accidents_clean.csv",header = T,stringsAsFactors = F)
Vehicles <- read.csv("Vehicles_clean.csv",header = T,stringsAsFactors = F)    
Population <- read.csv("UK_Population.csv",header = T)  

names(Population)[2] <- "Year" #Adding a new column Year to the data

# Displaying heads of the 2 used datasets
head(Accidents) 
head(Vehicles)
##################################################################
# 1. Merging datasets
##################################################################

# Create a function to merge our df
merge.all<- function(x, y) {
  merge(x, y, all=TRUE, by=listCols)
}

# Lists of columns to merge on
listCols<-c(colnames(Accidents)[1])#only the first col in this case

# Call the merge function
acc.uk<- Reduce(merge.all, list(Accidents,Vehicles))

# Free memeory
rm(Vehicles)
rm(Accidents)

Understand

##################################################################
# 2. Unique data types
##################################################################

str(acc.uk)
## 'data.frame':    3004425 obs. of  49 variables:
##  $ ï..Accident_Index                          : chr  "200501BS00001" "200501BS00002" "200501BS00003" "200501BS00003" ...
##  $ Police_Force                               : chr  "Metropolitan Police" "Metropolitan Police" "Metropolitan Police" "Metropolitan Police" ...
##  $ Accident_Severity                          : chr  "Serious" "Slight" "Slight" "Slight" ...
##  $ Number_of_Vehicles                         : int  1 1 2 2 1 1 2 2 2 2 ...
##  $ Number_of_Casualties                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date                                       : chr  "04/01/2005" "05/01/2005" "06/01/2005" "06/01/2005" ...
##  $ Day_of_Week                                : chr  "Tuesday" "Wednesday" "Thursday" "Thursday" ...
##  $ Time                                       : chr  "17:42" "17:36" "00:15" "00:15" ...
##  $ Local_Authority_.District.                 : int  12 12 12 12 12 12 12 12 12 12 ...
##  $ Local_Authority_.Highway.                  : chr  "E09000020" "E09000020" "E09000020" "E09000020" ...
##  $ X1st_Road_Class                            : int  3 4 5 5 3 6 6 6 5 5 ...
##  $ X1st_Road_Number                           : int  3218 450 0 0 3220 0 0 0 0 0 ...
##  $ Road_Type                                  : chr  "Single carriageway" "Dual carriageway" "Single carriageway" "Single carriageway" ...
##  $ Speed_limit                                : int  30 30 30 30 30 30 30 30 30 30 ...
##  $ Junction_Detail                            : chr  "Not at junction or within 20 metres" "Crossroads" "Not at junction or within 20 metres" "Not at junction or within 20 metres" ...
##  $ Junction_Control                           : chr  "Data missing or out of range" "Auto traffic signal" "Data missing or out of range" "Data missing or out of range" ...
##  $ X2nd_Road_Class                            : int  -1 5 -1 -1 -1 -1 -1 -1 6 6 ...
##  $ X2nd_Road_Number                           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Pedestrian_Crossing.Human_Control          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Pedestrian_Crossing.Physical_Facilities    : int  1 5 0 0 0 0 0 0 0 0 ...
##  $ Light_Conditions                           : chr  "Daylight" "Darkness - lights lit" "Darkness - lights lit" "Darkness - lights lit" ...
##  $ Weather_Conditions                         : int  2 1 1 1 1 1 2 2 1 1 ...
##  $ Road_Surface_Conditions                    : int  2 1 1 1 1 2 2 2 1 1 ...
##  $ Special_Conditions_at_Site                 : chr  "None" "None" "None" "None" ...
##  $ Carriageway_Hazards                        : chr  "None" "None" "None" "None" ...
##  $ Urban_or_Rural_Area                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Did_Police_Officer_Attend_Scene_of_Accident: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ LSOA_of_Accident_Location                  : chr  "E01002849" "E01002909" "E01002857" "E01002857" ...
##  $ Vehicle_Reference                          : int  1 1 1 2 1 1 1 2 1 2 ...
##  $ Vehicle_Type                               : chr  "Car" "Bus or coach (17 or more pass seats)" "Bus or coach (17 or more pass seats)" "Car" ...
##  $ Towing_and_Articulation                    : chr  "No tow/articulation" "No tow/articulation" "No tow/articulation" "No tow/articulation" ...
##  $ Vehicle_Manoeuvre                          : chr  "Going ahead other" "Slowing or stopping" "Going ahead right-hand bend" "Parked" ...
##  $ Vehicle_Location.Restricted_Lane           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Junction_Location                          : chr  "Not at or within 20 metres of junction" "Leaving roundabout" "Not at or within 20 metres of junction" "Not at or within 20 metres of junction" ...
##  $ Skidding_and_Overturning                   : chr  "None" "None" "None" "None" ...
##  $ Hit_Object_in_Carriageway                  : chr  "None" "None" "Parked vehicle" "None" ...
##  $ Vehicle_Leaving_Carriageway                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Hit_Object_off_Carriageway                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ X1st_Point_of_Impact                       : int  1 4 4 3 1 1 0 0 1 2 ...
##  $ Was_Vehicle_Left_Hand_Drive.               : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Journey_Purpose_of_Driver                  : int  15 1 1 15 15 15 15 15 15 15 ...
##  $ Sex_of_Driver                              : chr  "Female" "Male" "Male" "Male" ...
##  $ Age_of_Driver                              : int  74 42 35 62 49 49 51 30 31 41 ...
##  $ Age_Band_of_Driver                         : int  10 7 6 9 8 8 8 6 6 7 ...
##  $ Engine_Capacity_.CC.                       : int  -1 8268 8300 1762 1769 85 2976 124 -1 4266 ...
##  $ Propulsion_Code                            : int  -1 2 2 1 1 1 1 1 -1 1 ...
##  $ Age_of_Vehicle                             : int  -1 3 5 6 4 10 1 2 -1 4 ...
##  $ Driver_IMD_Decile                          : int  7 -1 2 1 2 -1 4 1 -1 6 ...
##  $ Driver_Home_Area_Type                      : int  1 -1 1 1 1 -1 1 1 -1 1 ...
# There are chr, int and num data types as well as Date which is in chr(character) format and can be converted to date format

##################################################################
# 3.  Data type conversions
##################################################################

# Creating year variables
# Character to date

acc.uk$Date <- as.Date(acc.uk$Date, "%d/%m/%Y")
str(acc.uk$Date)
##  Date[1:3004425], format: "2005-01-04" "2005-01-05" "2005-01-06" "2005-01-06" "2005-01-07" ...
acc.uk$Year <- year(acc.uk$Date)

# Here, we merge the data from UK's poulation dataset 

acc.uk = merge(acc.uk, 
                  Population[, c("Year", "Population","Urban.population")])


# Character to factor

acc.uk$Sex_of_Driver <- factor(acc.uk$Sex_of_Driver)
str(acc.uk$Sex_of_Driver) # Factor w/4 levels : 1. Male, 2. Female, 3. Not known, -1. data missing or out of range
##  Factor w/ 4 levels "Data missing or out of range",..: 2 3 3 3 2 3 3 2 3 3 ...
levels(acc.uk$Sex_of_Driver)
## [1] "Data missing or out of range" "Female"                      
## [3] "Male"                         "Not known"
# Numeric to factor
acc.uk$Speed_limit <- factor(acc.uk$Speed_limit)
str(acc.uk$Speed_limit) # Factor w/8 levels
##  Factor w/ 8 levels "10","15","20",..: 4 4 4 4 4 4 4 4 4 4 ...
levels(acc.uk$Speed_limit)
## [1] "10" "15" "20" "30" "40" "50" "60" "70"
##################################################################
# 4.  Ordering and Labeling a factor variable
##################################################################

# Converting Driver_Home_Area_Type to factor

acc.uk$Driver_Home_Area_Type <- factor(acc.uk$Driver_Home_Area_Type)
levels(acc.uk$Driver_Home_Area_Type)
## [1] "-1" "1"  "2"  "3"
# Levels on driver home area type are -1, 1, 2, 3
# Attaching value labels -1 = Data missing or out of range 1=Urban area, 2=Small town, 3=Rural

acc.uk$Driver_Home_Area_Type <- ordered(acc.uk$Driver_Home_Area_Type,
                     levels = c(-1,1,2,3),
                     labels = c("Data missing or out of range", "Urban area", "Small town","Rural"))

Tidy & Manipulate Data

##################################################################
# 5.  Tidy format of the data
##################################################################

# The data is in tidy format as per the Codd's third normal form
# Each variable forms a column.
# Each observation forms a row.
# Each type of observational unit forms a table.
# If the data would have been messy/untidy, we would have used "tidyr" package that helps in reshaping the layout of data sets which further contains various functions like gather(), spread(), seperate(), unite() which are used to tidy up the data.

Creating new Variable

##################################################################
# 6.  Creating a new variable
##################################################################

# Here, we use mutate function() from dplyr package which can be used to compute or add the new variable to a data frame. In this chunk, we have created a new variable avg_casualties_per_car using mutate() to calculate the average casualities per accident based on two variables - Number of casualities and number of vehicles involved in an accident.

acc.uk <- acc.uk %>%
  mutate(avg_casualties_per_car = Number_of_Casualties/Number_of_Vehicles)

Scan

###################################################################
# 7.  Scanning all variables for missing values and inconsistencies
###################################################################

# Finding missing values 
sapply(acc.uk, function(x) sum(is.na(x)))  # No missing values in data
##                                        Year 
##                                           0 
##                           ï..Accident_Index 
##                                           0 
##                                Police_Force 
##                                           0 
##                           Accident_Severity 
##                                           0 
##                          Number_of_Vehicles 
##                                           0 
##                        Number_of_Casualties 
##                                           0 
##                                        Date 
##                                           0 
##                                 Day_of_Week 
##                                           0 
##                                        Time 
##                                           0 
##                  Local_Authority_.District. 
##                                           0 
##                   Local_Authority_.Highway. 
##                                           0 
##                             X1st_Road_Class 
##                                           0 
##                            X1st_Road_Number 
##                                           0 
##                                   Road_Type 
##                                           0 
##                                 Speed_limit 
##                                           0 
##                             Junction_Detail 
##                                           0 
##                            Junction_Control 
##                                           0 
##                             X2nd_Road_Class 
##                                           0 
##                            X2nd_Road_Number 
##                                           0 
##           Pedestrian_Crossing.Human_Control 
##                                           0 
##     Pedestrian_Crossing.Physical_Facilities 
##                                           0 
##                            Light_Conditions 
##                                           0 
##                          Weather_Conditions 
##                                           0 
##                     Road_Surface_Conditions 
##                                           0 
##                  Special_Conditions_at_Site 
##                                           0 
##                         Carriageway_Hazards 
##                                           0 
##                         Urban_or_Rural_Area 
##                                           0 
## Did_Police_Officer_Attend_Scene_of_Accident 
##                                           0 
##                   LSOA_of_Accident_Location 
##                                           0 
##                           Vehicle_Reference 
##                                           0 
##                                Vehicle_Type 
##                                           0 
##                     Towing_and_Articulation 
##                                           0 
##                           Vehicle_Manoeuvre 
##                                           0 
##            Vehicle_Location.Restricted_Lane 
##                                           0 
##                           Junction_Location 
##                                           0 
##                    Skidding_and_Overturning 
##                                           0 
##                   Hit_Object_in_Carriageway 
##                                           0 
##                 Vehicle_Leaving_Carriageway 
##                                           0 
##                  Hit_Object_off_Carriageway 
##                                           0 
##                        X1st_Point_of_Impact 
##                                           0 
##                Was_Vehicle_Left_Hand_Drive. 
##                                           0 
##                   Journey_Purpose_of_Driver 
##                                           0 
##                               Sex_of_Driver 
##                                           0 
##                               Age_of_Driver 
##                                           0 
##                          Age_Band_of_Driver 
##                                           0 
##                        Engine_Capacity_.CC. 
##                                           0 
##                             Propulsion_Code 
##                                           0 
##                              Age_of_Vehicle 
##                                           0 
##                           Driver_IMD_Decile 
##                                           0 
##                       Driver_Home_Area_Type 
##                                           0 
##                                  Population 
##                                           0 
##                            Urban.population 
##                                           0 
##                      avg_casualties_per_car 
##                                           0
# Finding Infinite values 
sapply(acc.uk, function(x) sum(is.infinite(x)))  # No infinite values in data
##                                        Year 
##                                           0 
##                           ï..Accident_Index 
##                                           0 
##                                Police_Force 
##                                           0 
##                           Accident_Severity 
##                                           0 
##                          Number_of_Vehicles 
##                                           0 
##                        Number_of_Casualties 
##                                           0 
##                                        Date 
##                                           0 
##                                 Day_of_Week 
##                                           0 
##                                        Time 
##                                           0 
##                  Local_Authority_.District. 
##                                           0 
##                   Local_Authority_.Highway. 
##                                           0 
##                             X1st_Road_Class 
##                                           0 
##                            X1st_Road_Number 
##                                           0 
##                                   Road_Type 
##                                           0 
##                                 Speed_limit 
##                                           0 
##                             Junction_Detail 
##                                           0 
##                            Junction_Control 
##                                           0 
##                             X2nd_Road_Class 
##                                           0 
##                            X2nd_Road_Number 
##                                           0 
##           Pedestrian_Crossing.Human_Control 
##                                           0 
##     Pedestrian_Crossing.Physical_Facilities 
##                                           0 
##                            Light_Conditions 
##                                           0 
##                          Weather_Conditions 
##                                           0 
##                     Road_Surface_Conditions 
##                                           0 
##                  Special_Conditions_at_Site 
##                                           0 
##                         Carriageway_Hazards 
##                                           0 
##                         Urban_or_Rural_Area 
##                                           0 
## Did_Police_Officer_Attend_Scene_of_Accident 
##                                           0 
##                   LSOA_of_Accident_Location 
##                                           0 
##                           Vehicle_Reference 
##                                           0 
##                                Vehicle_Type 
##                                           0 
##                     Towing_and_Articulation 
##                                           0 
##                           Vehicle_Manoeuvre 
##                                           0 
##            Vehicle_Location.Restricted_Lane 
##                                           0 
##                           Junction_Location 
##                                           0 
##                    Skidding_and_Overturning 
##                                           0 
##                   Hit_Object_in_Carriageway 
##                                           0 
##                 Vehicle_Leaving_Carriageway 
##                                           0 
##                  Hit_Object_off_Carriageway 
##                                           0 
##                        X1st_Point_of_Impact 
##                                           0 
##                Was_Vehicle_Left_Hand_Drive. 
##                                           0 
##                   Journey_Purpose_of_Driver 
##                                           0 
##                               Sex_of_Driver 
##                                           0 
##                               Age_of_Driver 
##                                           0 
##                          Age_Band_of_Driver 
##                                           0 
##                        Engine_Capacity_.CC. 
##                                           0 
##                             Propulsion_Code 
##                                           0 
##                              Age_of_Vehicle 
##                                           0 
##                           Driver_IMD_Decile 
##                                           0 
##                       Driver_Home_Area_Type 
##                                           0 
##                                  Population 
##                                           0 
##                            Urban.population 
##                                           0 
##                      avg_casualties_per_car 
##                                           0
# Finding NaNs values 
sapply(acc.uk, function(x) sum(is.nan(x)))  # No NaN values in data
##                                        Year 
##                                           0 
##                           ï..Accident_Index 
##                                           0 
##                                Police_Force 
##                                           0 
##                           Accident_Severity 
##                                           0 
##                          Number_of_Vehicles 
##                                           0 
##                        Number_of_Casualties 
##                                           0 
##                                        Date 
##                                           0 
##                                 Day_of_Week 
##                                           0 
##                                        Time 
##                                           0 
##                  Local_Authority_.District. 
##                                           0 
##                   Local_Authority_.Highway. 
##                                           0 
##                             X1st_Road_Class 
##                                           0 
##                            X1st_Road_Number 
##                                           0 
##                                   Road_Type 
##                                           0 
##                                 Speed_limit 
##                                           0 
##                             Junction_Detail 
##                                           0 
##                            Junction_Control 
##                                           0 
##                             X2nd_Road_Class 
##                                           0 
##                            X2nd_Road_Number 
##                                           0 
##           Pedestrian_Crossing.Human_Control 
##                                           0 
##     Pedestrian_Crossing.Physical_Facilities 
##                                           0 
##                            Light_Conditions 
##                                           0 
##                          Weather_Conditions 
##                                           0 
##                     Road_Surface_Conditions 
##                                           0 
##                  Special_Conditions_at_Site 
##                                           0 
##                         Carriageway_Hazards 
##                                           0 
##                         Urban_or_Rural_Area 
##                                           0 
## Did_Police_Officer_Attend_Scene_of_Accident 
##                                           0 
##                   LSOA_of_Accident_Location 
##                                           0 
##                           Vehicle_Reference 
##                                           0 
##                                Vehicle_Type 
##                                           0 
##                     Towing_and_Articulation 
##                                           0 
##                           Vehicle_Manoeuvre 
##                                           0 
##            Vehicle_Location.Restricted_Lane 
##                                           0 
##                           Junction_Location 
##                                           0 
##                    Skidding_and_Overturning 
##                                           0 
##                   Hit_Object_in_Carriageway 
##                                           0 
##                 Vehicle_Leaving_Carriageway 
##                                           0 
##                  Hit_Object_off_Carriageway 
##                                           0 
##                        X1st_Point_of_Impact 
##                                           0 
##                Was_Vehicle_Left_Hand_Drive. 
##                                           0 
##                   Journey_Purpose_of_Driver 
##                                           0 
##                               Sex_of_Driver 
##                                           0 
##                               Age_of_Driver 
##                                           0 
##                          Age_Band_of_Driver 
##                                           0 
##                        Engine_Capacity_.CC. 
##                                           0 
##                             Propulsion_Code 
##                                           0 
##                              Age_of_Vehicle 
##                                           0 
##                           Driver_IMD_Decile 
##                                           0 
##                       Driver_Home_Area_Type 
##                                           0 
##                                  Population 
##                                           0 
##                            Urban.population 
##                                           0 
##                      avg_casualties_per_car 
##                                           0
(Rule1 <- editset(c("Age_of_Driver >= -1", "Age_of_Driver <= 150"))) #-1 for values which are unknown
## 
## Edit set:
## num1 : -1 <= Age_of_Driver
## num2 : Age_of_Driver <= 150
(Rule2 <- editset("Number_of_Vehicles > 0"))
## 
## Edit set:
## num1 : 0 < Number_of_Vehicles
(Rule3 <- editset("Number_of_Casualties >= 0"))
## 
## Edit set:
## num1 : 0 <= Number_of_Casualties
(Rule4 <- editset(c("Age_of_Vehicle >= -1", "Age_of_Vehicle <= 150"))) #-1 for values which are unknown
## 
## Edit set:
## num1 : -1 <= Age_of_Vehicle
## num2 : Age_of_Vehicle <= 150
# Checking for rule 1
unique(violatedEdits(Rule1, acc.uk))
## [1] FALSE
# No violations

# Checking for rule 2
unique(violatedEdits(Rule2, acc.uk))
## [1] FALSE
# No violations

# Checking for rule 3
unique(violatedEdits(Rule3, acc.uk))
## [1] FALSE
# No violations

# Checking for rule 4
unique(violatedEdits(Rule4, acc.uk))
## [1] FALSE
# No violations


##################################################################
# 8.  Scanning numeric variable for outliers
##################################################################

# There is only one continous numeric variable. All others are categorical variables
# stored as numeric variables as of now

# Now, we use the boxplot function to check if the dataset contains any outlier values.

acc.uk$Population %>%  boxplot(main="Box Plot of Population", ylab="Population", col = "grey")

# No outliers as per the box plot test

# Here, we use the z-score method in which is a distance based method used to detect the outliers in a dataset. An oservation is considered an outlier if the abasolute value of its z-score is greater than 3. We use the scores() fucntion in outlier package to calculate the z-scores.

z.scores <- acc.uk$Population %>%  scores(type = "z")
z.scores %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.4354 -0.7583 -0.0572  0.0000  0.9895  1.6604
# According to Z score test, the variable has no outliers

which( abs(z.scores) >3 )
## integer(0)
# integer(0)

length (which( abs(z.scores) >3 ))
## [1] 0
# 0 data points have z score greater than 3

# Correcting for outliers
# Since there are no outliers the below code has been commented just to show the application


# # Removing outliers
# acc.uk_clean <- acc.uk$Population[-which( abs(z.scores) >3 )]
# 
# # Imputing outliers
# acc.uk$Population[ which( abs(z.scores) >3 )] <- mean(acc.uk$Population, na.rm = TRUE)
# 
# # Capping
# cap <- function(x){
#   quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ) )
#   x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
#   x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
#   x
# }
# 
# acc.uk_capped <- acc.uk$Population %>% cap()

Transform

##################################################################
# 9.  Data transformation
##################################################################

# Centering and Scaling

pop <- subset(acc.uk[ ,c("Year","Population")])
subset_population <- unique(pop)

center_population <- scale(subset_population$Population, center = FALSE, scale = TRUE)

# Z score standardization
z_population <- scale(subset_population$Population, center = TRUE, scale = TRUE)

# Min-Max normalization
minmaxnormalise <- function(x){(x- min(x)) /(max(x)-min(x))}

norm_population <- minmaxnormalise(subset_population$Population)