Data wrangling with Traffic Safety Data.
Packages used in this post are: > ggplot2, gridExtra
setwd("/Users/subasishdas1/Desktop/TRB2016/Data")
mm <- read.csv("fat23.csv")
head(mm)
## Cond Freq Fat Severe PDO
## 1 Light-A Wea-A Road-B 1222 56 236 930
## 2 Light-B Wea-A Road-B 1036 34 222 780
## 3 Light-A Wea-A Road-C 495 21 198 276
## 4 Light-B Wea-A Road-C 326 17 176 133
## 5 Light-A Wea-B Road-B 295 13 99 183
## 6 Light-C Wea-A Road-B 267 10 88 169
library(ggplot2)
library(gridExtra)
## Loading required package: grid
plot1 <- ggplot(mm, aes(x=reorder(Cond, Freq), y= Freq))+theme_bw()+
geom_bar(stat="identity", colour="red")+
xlab("Total Crashes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot2 <- ggplot(mm, aes(x=reorder(Cond, Fat), y= Fat))+theme_bw()+
geom_bar(stat="identity", colour="blue")+
xlab("Fatal Crashes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot3 <- ggplot(mm, aes(x=reorder(Cond, Severe), y= Severe))+theme_bw()+
geom_bar(stat="identity", colour="yellow")+
xlab("Severe Crashes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot4 <- ggplot(mm, aes(x=reorder(Cond, PDO), y= PDO))+theme_bw()+
geom_bar(stat="identity", colour="green")+
xlab("PDO Crashes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
grid.arrange(plot1, plot2, plot3, plot4, ncol=2)
library(Boruta)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
## Loading required package: rFerns
library(mlbench)
data("Ozone")
Ozone <- na.omit(Ozone)
Boruta.Ozone <- Boruta(V4 ~ ., data = Ozone, doTrace = 2, ntree = 500)
## 1. run of importance source...
## 2. run of importance source...
## 3. run of importance source...
## 4. run of importance source...
## 5. run of importance source...
## 6. run of importance source...
## 7. run of importance source...
## 8. run of importance source...
## 9. run of importance source...
## 10. run of importance source...
## 11. run of importance source...
## Confirmed 9 attributes: V1, V10, V11, V12, V13 and 4 more.
## Rejected 2 attributes: V2, V3.
## 12. run of importance source...
## 13. run of importance source...
## 14. run of importance source...
## 15. run of importance source...
## 16. run of importance source...
## 17. run of importance source...
## 18. run of importance source...
## 19. run of importance source...
## 20. run of importance source...
## 21. run of importance source...
## Rejected 1 attributes: V6.
plot(Boruta.Ozone)
library(randomForest)
data(imports85)
imp85 <- imports85[,-2] # Too many NAs in normalizedLosses.
imp85 <- imp85[complete.cases(imp85), ]
imp85[] <- lapply(imp85, function(x) if (is.factor(x)) x[, drop=TRUE] else x)
stopifnot(require(randomForest))
price.rf <- randomForest(price ~ ., imp85, do.trace=10, ntree=100)
## | Out-of-bag |
## Tree | MSE %Var(y) |
## 10 | 4.474e+06 6.87 |
## 20 | 4.109e+06 6.31 |
## 30 | 3.856e+06 5.92 |
## 40 | 3.751e+06 5.76 |
## 50 | 3.666e+06 5.63 |
## 60 | 3.809e+06 5.85 |
## 70 | 3.815e+06 5.86 |
## 80 | 3.751e+06 5.76 |
## 90 | 3.759e+06 5.77 |
## 100 | 3.786e+06 5.82 |
print(price.rf)
##
## Call:
## randomForest(formula = price ~ ., data = imp85, do.trace = 10, ntree = 100)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 8
##
## Mean of squared residuals: 3785773
## % Var explained: 94.18
varImpPlot(price.rf, sort = TRUE, pch = 19, col = 8, cex = 1.5,
main = "Imp. Var. Plot")
setwd("/Users/subasishdas1/Desktop/TRB2016/Data")
all_fatal <- read.csv("ALL_FATAL.csv")
dim(all_fatal)
## [1] 5661 55
fat4 <- all_fatal[c(4:12)]
names(fat4)
## [1] "DRUGS" "ALCOHOL" "DAY_OF_WK"
## [4] "NUM_VEH" "ACCESS_CNTL_CD" "ALIGNMENT_CD"
## [7] "PRI_CONTRIB_FAC_CD" "LIGHTING_CD" "MAN_COLL_CD"
dim(fat4)
## [1] 5661 9
fat5 <- fat4[complete.cases(fat4),]
fat5$LIGHTING_CD <- as.factor(as.numeric(fat5$LIGHTING_CD))
fat.rf <- randomForest(LIGHTING_CD ~ ., fat5, do.trace=5, ntree=20)
## ntree OOB 1 2 3 4 5 6 7 8 9 10 11
## 5: 51.20%100.00%100.00% 29.57% 52.48% 87.56% 99.25%100.00%100.00%100.00%100.00%100.00%
## 10: 49.58%100.00%100.00% 27.12% 50.05% 89.35% 99.65%100.00%100.00%100.00%100.00%100.00%
## 15: 48.97%100.00%100.00% 25.98% 49.48% 90.42% 99.29%100.00%100.00%100.00%100.00%100.00%
## 20: 48.41%100.00%100.00% 24.98% 48.74% 91.41% 99.65%100.00%100.00%100.00%100.00%100.00%
print(fat.rf)
##
## Call:
## randomForest(formula = LIGHTING_CD ~ ., data = fat5, do.trace = 5, ntree = 20)
## Type of random forest: classification
## Number of trees: 20
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 48.41%
## Confusion matrix:
## 1 2 3 4 5 6 7 8 9 10 11 class.error
## 1 0 0 3 2 1 0 0 0 0 0 0 1.0000000
## 2 0 0 2 1 1 0 0 0 0 0 0 1.0000000
## 3 0 0 1865 565 51 3 2 0 0 0 0 0.2497989
## 4 0 0 890 993 49 5 0 0 0 0 0 0.4873516
## 5 0 0 346 297 61 6 0 0 0 0 0 0.9140845
## 6 0 0 138 121 22 1 1 0 0 0 0 0.9964664
## 7 0 0 58 29 7 0 0 0 0 0 0 1.0000000
## 8 0 0 59 36 6 0 0 0 0 0 0 1.0000000
## 9 0 0 3 3 0 0 0 0 0 0 0 1.0000000
## 10 0 0 9 11 2 0 0 0 0 0 0 1.0000000
## 11 0 0 3 8 0 0 0 0 0 0 0 1.0000000
varImpPlot(fat.rf, sort = TRUE, pch = 19, col =2, cex=1.3,
main = "Imp. Var. Plot")