Not Preface

Data wrangling with Traffic Safety Data.

Packages

Packages used in this post are: > ggplot2, gridExtra

Vertical Axis Label in ggplot2

setwd("/Users/subasishdas1/Desktop/TRB2016/Data")
mm <- read.csv("fat23.csv")
head(mm)
##                   Cond Freq Fat Severe PDO
## 1 Light-A Wea-A Road-B 1222  56    236 930
## 2 Light-B Wea-A Road-B 1036  34    222 780
## 3 Light-A Wea-A Road-C  495  21    198 276
## 4 Light-B Wea-A Road-C  326  17    176 133
## 5 Light-A Wea-B Road-B  295  13     99 183
## 6 Light-C Wea-A Road-B  267  10     88 169
library(ggplot2)
library(gridExtra)
## Loading required package: grid
plot1 <- ggplot(mm, aes(x=reorder(Cond, Freq), y= Freq))+theme_bw()+
  geom_bar(stat="identity", colour="red")+
  xlab("Total Crashes") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot2 <- ggplot(mm, aes(x=reorder(Cond, Fat), y= Fat))+theme_bw()+
  geom_bar(stat="identity", colour="blue")+
  xlab("Fatal Crashes") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot3 <- ggplot(mm, aes(x=reorder(Cond, Severe), y= Severe))+theme_bw()+
  geom_bar(stat="identity", colour="yellow")+
  xlab("Severe Crashes") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot4 <- ggplot(mm, aes(x=reorder(Cond, PDO), y= PDO))+theme_bw()+
  geom_bar(stat="identity", colour="green")+
  xlab("PDO Crashes") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

grid.arrange(plot1, plot2, plot3, plot4, ncol=2)

Feature Selection by Boruta

library(Boruta)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
## Loading required package: rFerns
library(mlbench)
data("Ozone")
Ozone <- na.omit(Ozone)
Boruta.Ozone <- Boruta(V4 ~ ., data = Ozone, doTrace = 2, ntree = 500)
##  1. run of importance source...
##  2. run of importance source...
##  3. run of importance source...
##  4. run of importance source...
##  5. run of importance source...
##  6. run of importance source...
##  7. run of importance source...
##  8. run of importance source...
##  9. run of importance source...
##  10. run of importance source...
##  11. run of importance source...
## Confirmed 9 attributes: V1, V10, V11, V12, V13 and 4 more.
## Rejected 2 attributes: V2, V3.
##  12. run of importance source...
##  13. run of importance source...
##  14. run of importance source...
##  15. run of importance source...
##  16. run of importance source...
##  17. run of importance source...
##  18. run of importance source...
##  19. run of importance source...
##  20. run of importance source...
##  21. run of importance source...
## Rejected 1 attributes: V6.
plot(Boruta.Ozone)

Variable Importance by randomForest

library(randomForest)
data(imports85)
imp85 <- imports85[,-2]  # Too many NAs in normalizedLosses.
imp85 <- imp85[complete.cases(imp85), ]
imp85[] <- lapply(imp85, function(x) if (is.factor(x)) x[, drop=TRUE] else x)
stopifnot(require(randomForest))
price.rf <- randomForest(price ~ ., imp85, do.trace=10, ntree=100)
##      |      Out-of-bag   |
## Tree |      MSE  %Var(y) |
##   10 | 4.474e+06     6.87 |
##   20 | 4.109e+06     6.31 |
##   30 | 3.856e+06     5.92 |
##   40 | 3.751e+06     5.76 |
##   50 | 3.666e+06     5.63 |
##   60 | 3.809e+06     5.85 |
##   70 | 3.815e+06     5.86 |
##   80 | 3.751e+06     5.76 |
##   90 | 3.759e+06     5.77 |
##  100 | 3.786e+06     5.82 |
print(price.rf)
## 
## Call:
##  randomForest(formula = price ~ ., data = imp85, do.trace = 10,      ntree = 100) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 8
## 
##           Mean of squared residuals: 3785773
##                     % Var explained: 94.18
varImpPlot(price.rf, sort = TRUE,  pch = 19, col = 8, cex = 1.5,
           main = "Imp. Var. Plot")

Traffic Crash Data

setwd("/Users/subasishdas1/Desktop/TRB2016/Data")
all_fatal <- read.csv("ALL_FATAL.csv")
dim(all_fatal)
## [1] 5661   55
fat4 <- all_fatal[c(4:12)]
names(fat4)
## [1] "DRUGS"              "ALCOHOL"            "DAY_OF_WK"         
## [4] "NUM_VEH"            "ACCESS_CNTL_CD"     "ALIGNMENT_CD"      
## [7] "PRI_CONTRIB_FAC_CD" "LIGHTING_CD"        "MAN_COLL_CD"
dim(fat4)
## [1] 5661    9
fat5 <- fat4[complete.cases(fat4),]
fat5$LIGHTING_CD <- as.factor(as.numeric(fat5$LIGHTING_CD))
fat.rf <- randomForest(LIGHTING_CD ~ ., fat5, do.trace=5, ntree=20)
## ntree      OOB      1      2      3      4      5      6      7      8      9     10     11
##     5:  51.20%100.00%100.00% 29.57% 52.48% 87.56% 99.25%100.00%100.00%100.00%100.00%100.00%
##    10:  49.58%100.00%100.00% 27.12% 50.05% 89.35% 99.65%100.00%100.00%100.00%100.00%100.00%
##    15:  48.97%100.00%100.00% 25.98% 49.48% 90.42% 99.29%100.00%100.00%100.00%100.00%100.00%
##    20:  48.41%100.00%100.00% 24.98% 48.74% 91.41% 99.65%100.00%100.00%100.00%100.00%100.00%
print(fat.rf)
## 
## Call:
##  randomForest(formula = LIGHTING_CD ~ ., data = fat5, do.trace = 5,      ntree = 20) 
##                Type of random forest: classification
##                      Number of trees: 20
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 48.41%
## Confusion matrix:
##    1 2    3   4  5 6 7 8 9 10 11 class.error
## 1  0 0    3   2  1 0 0 0 0  0  0   1.0000000
## 2  0 0    2   1  1 0 0 0 0  0  0   1.0000000
## 3  0 0 1865 565 51 3 2 0 0  0  0   0.2497989
## 4  0 0  890 993 49 5 0 0 0  0  0   0.4873516
## 5  0 0  346 297 61 6 0 0 0  0  0   0.9140845
## 6  0 0  138 121 22 1 1 0 0  0  0   0.9964664
## 7  0 0   58  29  7 0 0 0 0  0  0   1.0000000
## 8  0 0   59  36  6 0 0 0 0  0  0   1.0000000
## 9  0 0    3   3  0 0 0 0 0  0  0   1.0000000
## 10 0 0    9  11  2 0 0 0 0  0  0   1.0000000
## 11 0 0    3   8  0 0 0 0 0  0  0   1.0000000
varImpPlot(fat.rf, sort = TRUE,  pch = 19, col =2, cex=1.3,
           main = "Imp. Var. Plot")