This report is a follow-up to the original Crashes project posted on RPubs a few weeks ago. The purpose was to address the errors and problems that popped in the original data. The main problem was that the target response, crashes that involved fatalities, had such a low prevalence with only ~1.77% observations having the target value. There were also some strange results in the predictive analyses that occured as a result of this very low number.
To remedy this, we changed the target variable from just fatalities to crashes that involved ‘serious’ injury which were 22313 of the total 69089, roughly 32% of the total observations. Some additional analyses were also run such as lasso regressions to show which variables were the most ‘valuable’. The KNN method was not used in this report.
Crashes <- Crashes[Crashes$LIGHT_CONDITION!="Unk.",] # n=74746
Crashes <- Crashes[Crashes$ROAD_GEOMETRY!="Unknown",] # n=74635
Crashes <- Crashes[!Crashes$SPEED_ZONE %in% c("Camping grounds or off road",
"Not known","Other speed limit"),] # n=70204
Crashes <- Crashes[!Crashes$DAY_OF_WEEK %in% "",] # n=69089
Crashes$SPEED_ZONE <- droplevels(Crashes$SPEED_ZONE)
levels(Crashes$SPEED_ZONE)[levels(Crashes$SPEED_ZONE) %in% "30km/hr"] <- "30 km/hr"
Crashes$SPEED_ZONE <- substr(Crashes$SPEED_ZONE,1,3)
Crashes$SPEED_ZONE <- as.integer(Crashes$SPEED_ZONE)
Crashes.sub <- subset(Crashes,select=c(17,18,28,36,39,41,46,47,48,50))
names(Crashes.sub)[names(Crashes.sub)=='SEVERITY'] <- 'SERIOUS'
levels(Crashes.sub$SERIOUS)[levels(Crashes.sub$SERIOUS) %in%
c("Fatal accident","Serious injury accident")] <- "Yes"
levels(Crashes.sub$SERIOUS)[levels(Crashes.sub$SERIOUS) %in%
c("Non injury accident","Other injury accident")] <- "No"
Crashes.sub <- Crashes.sub[,c(1,9,2,3,4,5,6,7,8,10)]
write.csv(Crashes.sub,"Crashes pt.2.csv")}
str(Crashes)
## 'data.frame': 69089 obs. of 10 variables:
## $ SERIOUS : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 2 2 1 1 1 ...
## $ ALCOHOL_RELATED: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 2 ...
## $ SPEED_ZONE : int 60 60 40 50 80 70 70 70 60 60 ...
## $ TOTAL_PERSONS : int 6 2 2 5 2 2 2 3 5 2 ...
## $ BICYCLIST : int 0 1 0 0 0 0 0 0 0 0 ...
## $ PEDESTRIAN : int 0 0 0 1 0 0 1 0 0 0 ...
## $ MOTORIST : int 0 0 1 0 0 0 0 0 0 0 ...
## $ OLD_DRIVER : int 0 0 0 0 0 0 0 0 0 0 ...
## $ YOUNG_DRIVER : int 0 0 0 1 0 0 0 0 1 0 ...
## $ NO_OF_VEHICLES : int 3 2 2 2 2 2 1 3 2 2 ...
table(Crashes$SERIOUS)
##
## No Yes
## 46776 22313
The same variables are used in this instance but the Y variable has been changed. While information regarding a ‘serious’ injury was not known, we assume that it means hospitilisation and it also includes creashes that concluded with a fatality.
set.seed(123)
smp_size <- floor(0.75*nrow(Crashes))
split_value <- seq(1,smp_size)
train <- Crashes[split_value,]
test <- Crashes[-split_value,]
x_train <- train[,-1]
y_train <- train[,1]
x_test <- test[,-1]
y_test <- test[,1]
Crashes.results <- data.frame(
Technique=character(),
Error=numeric(),
Information=character()
)
library(glmnet)
C.alcohol <- model.matrix(Crashes$SERIOUS~Crashes$ALCOHOL_RELATED)[,-1]
x <- as.matrix(data.frame(Crashes$SPEED_ZONE,Crashes$TOTAL_PERSONS,Crashes$TOTAL_PERSONS,
Crashes$PEDESTRIAN,Crashes$MOTORIST,Crashes$OLD_DRIVER,
Crashes$YOUNG_DRIVER,Crashes$NO_OF_VEHICLES,C.alcohol))
y <- Crashes$SERIOUS
lasso.crash <- glmnet(x[split_value,],y[split_value],alpha = 0,family="binomial")
lasso.cv <- cv.glmnet(x[split_value,],y[split_value],alpha = 0,family="binomial")
lasso.pred <- predict(lasso.crash,s=lasso.cv$lambda.min,newx=x[-split_value,])
lasso.pred[lasso.pred>0] <- "Yes"
lasso.pred[lasso.pred<=0] <- "No"
mean(y_test!=lasso.pred)
coef(lasso.cv,s=lasso.cv$lambda.min)
htmlTable(Lasso.results)
| Variable | Value | |
|---|---|---|
| 1 | (Intercept) | -1.603 |
| 2 | Alcohol | 1.437 |
| 3 | MOTORIST | 0.666 |
| 4 | PEDESTRIAN | 0.598 |
| 5 | OLD_DRIVER | 0.579 |
| 6 | NO_OF_VEHICLES | -0.26 |
| 7 | TOTAL_PERSONS | 0.028 |
| 8 | YOUNG_DRIVER | -0.084 |
| 9 | SPEED_ZONE | 0.015 |
print("mean(y_test!=lasso.pred) == 0.285")
## [1] "mean(y_test!=lasso.pred) == 0.285"
The most ‘important’ variables were the presence of alcohol, number of motorcycles, pedestrians and old drivers. The misclassification error rate of 0.285 is much more realistic than the single-digit error from the original data analyses.
Crashes.sub <- subset(Crashes,select=c(1,2,6,7,8))
train2 <- Crashes.sub[split_value,]
test2 <- Crashes.sub[-split_value,]
x_train2 <- train2[,-1]
y_train2 <- train2[,1]
x_test2 <- test2[,-1]
y_test2 <- test2[,1]
cor(Crashes.sub[,3:5])
## PEDESTRIAN MOTORIST OLD_DRIVER
## PEDESTRIAN 1.00000000 -0.10604667 -0.02386654
## MOTORIST -0.10604667 1.00000000 -0.07535271
## OLD_DRIVER -0.02386654 -0.07535271 1.00000000
The correlation values are small with none over 0.1. The rest of the analyses involving a logistic regression, naive-bayes classifier, SVM and classification tree were used on both sets of data to see any changes or improvements over the original findings.
glm.model <- glm(SERIOUS~.,Crashes,subset=split_value,family="binomial")
glm.pred <- predict(glm.model,x_test,type="response")
glm.pred <- ifelse(glm.pred<=0.5,"No","Yes")
mean(glm.pred!=y_test)
Crashes.results <- rbind(Crashes.results,data.frame(
Technique="Logistic Regression",
Error=mean(glm.pred!=y_test),
Information="Misclassification"
))
glm.model2 <- glm(SERIOUS~.,Crashes.sub,subset=split_value,family="binomial")
glm.pred2 <- predict(glm.model2,x_test2,type="response")
glm.pred2 <- ifelse(glm.pred2<=0.5,"No","Yes")
mean(glm.pred2!=y_test2)
Crashes.results <- rbind(Crashes.results,data.frame(
Technique="Logistic Regression",
Error=mean(glm.pred2!=y_test2),
Information="Misclassification w/ subset"
))
nb.model <- naiveBayes(SERIOUS~.,Crashes,subset=split_value)
nb.pred <- predict(nb.model,x_test)
mean(nb.pred!=y_test)
Crashes.results <- rbind(Crashes.results,data.frame(
Technique="Naive-Bayes",
Error=mean(nb.pred!=y_test),
Information="Misclassification"
))
nb.model2 <- naiveBayes(SERIOUS~.,Crashes.sub,subset=split_value)
nb.pred2 <- predict(nb.model2,x_test2)
mean(nb.pred2!=y_test2)
Crashes.results <- rbind(Crashes.results,data.frame(
Technique="Naive-Bayes",
Error=mean(nb.pred2!=y_test2),
Information="Misclassification w/ subset"
))
svm.class <- svm(SERIOUS~.,data=Crashes,subset=split_value)
summary(svm.class)
svm.pred.class <- predict(svm.class,x_test)
mean(svm.pred.class!=y_test) # 0.2825219
Crashes.results <- rbind(Crashes.results,data.frame(
Technique="SVM",
Error=0.2825219,
Information="Misclassification"
))
svm.class2 <- svm(SERIOUS~.,data=Crashes.sub,subset=split_value)
summary(svm.class2)
svm.pred.class2 <- predict(svm.class2,x_test2)
mean(svm.pred.class2!=y_test2) # 0.2863428
Crashes.results <- rbind(Crashes.results,data.frame(
Technique="SVM",
Error=0.2863428,
Information="Misclassification w/ subset"
))
class.tree <- tree(SERIOUS~.,data=Crashes,subset=split_value)
summary(class.tree) # 0.3351
ctree.prune <- cv.tree(class.tree)
which.min(ctree.prune$dev)
plot(class.tree)
text(class.tree)
pruned.ctree <- prune.misclass(class.tree,best=1)
pruned.results <- summary(pruned.ctree)
summary(pruned.ctree)
Crashes.results <- rbind(Crashes.results,data.frame(
Technique="Classification Tree",
Error=0.3351,
Information="Misclassification"
))
class.tree2 <- tree(SERIOUS~.,data=Crashes.sub,subset=split_value)
summary(class.tree2) # 0.3227
ctree.prune2 <- cv.tree(class.tree2)
which.min(ctree.prune2$dev)
plot(class.tree2)
text(class.tree2)
Crashes.results <- rbind(Crashes.results,data.frame(
Technique="Classification Tree",
Error=0.3227,
Information="Misclassification w/ subset"
))
Crashes.results <- read.csv("Crashes(results) pt.2.csv")
Crashes.results
## Technique Error Information
## 1 SVM 0.2825219 Misclassification
## 2 Lasso Regression 0.2850113 Alcohol, Ped, Mot and Old
## 3 Logistic Regression 0.2850692 Misclassification
## 4 SVM 0.2863428 Misclassification w/ subset
## 5 Logistic Regression 0.2872113 Misclassification w/ subset
## 6 Classification Tree 0.3227000 Misclassification w/ subset
## 7 Classification Tree 0.3351000 Misclassification
## 8 Naive-Bayes 0.3435998 Misclassification w/ subset
## 9 Naive-Bayes 0.3442367 Misclassification
These findings are much more realistic than our original findings which were all single-digit values. We can summise that there were obtained due to the low percentage of observations with the target outcome. Since changing the aim, increasing this number and investigating elements of subset selection, overall results have improved.
The SVM method showed the lowest misclassification rate of 0.283 which may be further improved via tuning of the model. This part of the process was not included in this Rmarkdown document as with 69089 rows, it would have consumed far too much time.