Data and summary

## [1] 8877   16
##  [1] "CONTROL_SECTION"  "LOGMILE_FROM"     "LOGMILE_TO"      
##  [4] "LENGTH"           "AVG_ADT"          "FUNCTIONAL_CLASS"
##  [7] "HIGHWAY_CLASS"    "CLASS_DESC"       "MED_TYPE"        
## [10] "MED_TYPE_DESC"    "MEDIAN_WIDTH"     "NUM_LANES"       
## [13] "PAVEMENT_WIDTH"   "INTERSECTION"     "TOT_CRASH"       
## [16] "AVG_CRASH"
## [1] "LENGTH"         "AVG_ADT"        "CLASS_DESC"     "MED_TYPE_DESC" 
## [5] "MEDIAN_WIDTH"   "NUM_LANES"      "PAVEMENT_WIDTH" "AVG_CRASH"
## 
##         Rural 2 Lane         Rural 4 Lane Rural 4 Lane Divided 
##                 3693                  121                  324 
##     Rural Interstate         Urban 2 Lane         Urban 4 Lane 
##                  234                 2208                  637 
## Urban 4 Lane Divided     Urban Interstate 
##                 1135                  525
rural2 <- subset(all1, CLASS_DESC=="Rural 2 Lane")
dim(rural2)
## [1] 3693    8
rural2_1 <- rural2[-c(3,4, 6)]

ruralmul <- subset(all1, CLASS_DESC=="Rural 4 Lane"| CLASS_DESC=="Rural 4 Lane Divided")
dim(ruralmul)
## [1] 445   8
ruralmul_1 <- ruralmul[-c(3,4)]

ruralint <- subset(all1, CLASS_DESC=="Rural Interstate")
dim(ruralint)
## [1] 234   8
ruralint_1 <- ruralint[-c(3,4)]

urban2 <- subset(all1, CLASS_DESC=="Urban 2 Lane")
dim(urban2)
## [1] 2208    8
urban2_1 <- urban2[-c(3,4, 6)]

urbanmul <- subset(all1, CLASS_DESC=="Urban 4 Lane"| CLASS_DESC=="Urban 4 Lane Divided")
dim(ruralmul)
## [1] 445   8
urbanmul_1 <- urbanmul[-c(3,4)]

urbanint <- subset(all1, CLASS_DESC=="Urban Interstate")
dim(urbanint)
## [1] 525   8
urbanint_1 <- urbanint[-c(3,4)]

## summany statistics
summary(rural2_1)
##      LENGTH          AVG_ADT       MEDIAN_WIDTH      PAVEMENT_WIDTH
##  Min.   : 0.010   Min.   :   41   Min.   : 0.00000   Min.   :14.0  
##  1st Qu.: 0.880   1st Qu.:  863   1st Qu.: 0.00000   1st Qu.:20.0  
##  Median : 2.230   Median : 1883   Median : 0.00000   Median :22.0  
##  Mean   : 2.826   Mean   : 2741   Mean   : 0.08665   Mean   :22.3  
##  3rd Qu.: 4.200   3rd Qu.: 3700   3rd Qu.: 0.00000   3rd Qu.:24.0  
##  Max.   :17.400   Max.   :18744   Max.   :30.00000   Max.   :35.0  
##    AVG_CRASH     
##  Min.   : 0.330  
##  1st Qu.: 0.670  
##  Median : 1.330  
##  Mean   : 2.433  
##  3rd Qu.: 3.000  
##  Max.   :55.330
summary(ruralmul_1)
##      LENGTH          AVG_ADT       MEDIAN_WIDTH    NUM_LANES    
##  Min.   : 0.030   Min.   :  130   Min.   : 0.0   Min.   :3.000  
##  1st Qu.: 0.350   1st Qu.: 5025   1st Qu.: 0.0   1st Qu.:4.000  
##  Median : 0.800   Median : 7850   Median :20.0   Median :4.000  
##  Mean   : 1.569   Mean   : 9740   Mean   :22.4   Mean   :4.022  
##  3rd Qu.: 2.230   3rd Qu.:12124   3rd Qu.:32.0   3rd Qu.:4.000  
##  Max.   :13.140   Max.   :38182   Max.   :64.0   Max.   :6.000  
##  PAVEMENT_WIDTH    AVG_CRASH     
##  Min.   :36.00   Min.   : 0.330  
##  1st Qu.:48.00   1st Qu.: 0.670  
##  Median :48.00   Median : 1.330  
##  Mean   :47.85   Mean   : 3.287  
##  3rd Qu.:48.00   3rd Qu.: 4.000  
##  Max.   :72.00   Max.   :43.330
summary(ruralint_1)
##      LENGTH           AVG_ADT       MEDIAN_WIDTH      NUM_LANES    
##  Min.   : 0.0200   Min.   :12300   Min.   :  2.00   Min.   :4.000  
##  1st Qu.: 0.6925   1st Qu.:17709   1st Qu.: 64.00   1st Qu.:4.000  
##  Median : 2.4600   Median :30700   Median : 64.00   Median :4.000  
##  Mean   : 2.7056   Mean   :29864   Mean   : 70.86   Mean   :4.094  
##  3rd Qu.: 4.0450   3rd Qu.:39432   3rd Qu.: 99.00   3rd Qu.:4.000  
##  Max.   :11.0800   Max.   :61060   Max.   :130.00   Max.   :6.000  
##  PAVEMENT_WIDTH    AVG_CRASH      
##  Min.   :48.00   Min.   :  0.330  
##  1st Qu.:48.00   1st Qu.:  1.415  
##  Median :48.00   Median :  5.670  
##  Mean   :50.59   Mean   : 11.667  
##  3rd Qu.:48.00   3rd Qu.: 16.247  
##  Max.   :80.00   Max.   :167.000
summary(urban2_1)
##      LENGTH          AVG_ADT       MEDIAN_WIDTH     PAVEMENT_WIDTH 
##  Min.   :0.0100   Min.   :  220   Min.   : 0.0000   Min.   :12.00  
##  1st Qu.:0.2400   1st Qu.: 3810   1st Qu.: 0.0000   1st Qu.:22.00  
##  Median :0.5500   Median : 6826   Median : 0.0000   Median :24.00  
##  Mean   :0.9072   Mean   : 8499   Mean   : 0.2056   Mean   :23.46  
##  3rd Qu.:1.1900   3rd Qu.:11462   3rd Qu.: 0.0000   3rd Qu.:24.00  
##  Max.   :9.6000   Max.   :57325   Max.   :30.0000   Max.   :72.00  
##    AVG_CRASH      
##  Min.   :  0.330  
##  1st Qu.:  1.000  
##  Median :  2.330  
##  Mean   :  6.094  
##  3rd Qu.:  6.670  
##  Max.   :143.670
summary(urbanmul_1)
##      LENGTH          AVG_ADT        MEDIAN_WIDTH      NUM_LANES   
##  Min.   :0.0100   Min.   :  1020   Min.   :  0.00   Min.   :2.00  
##  1st Qu.:0.1700   1st Qu.: 11625   1st Qu.:  0.00   1st Qu.:4.00  
##  Median :0.3800   Median : 18960   Median :  8.00   Median :4.00  
##  Mean   :0.6151   Mean   : 21095   Mean   : 16.43   Mean   :4.23  
##  3rd Qu.:0.7600   3rd Qu.: 27044   3rd Qu.: 30.00   3rd Qu.:4.00  
##  Max.   :7.5400   Max.   :151349   Max.   :299.00   Max.   :8.00  
##  PAVEMENT_WIDTH    AVG_CRASH     
##  Min.   :22.00   Min.   :  0.33  
##  1st Qu.:48.00   1st Qu.:  2.00  
##  Median :48.00   Median :  5.33  
##  Mean   :50.58   Mean   : 14.53  
##  3rd Qu.:49.00   3rd Qu.: 15.00  
##  Max.   :99.00   Max.   :587.33
summary(urbanint_1)
##      LENGTH          AVG_ADT        MEDIAN_WIDTH      NUM_LANES    
##  Min.   :0.0200   Min.   : 12500   Min.   :  0.00   Min.   :4.000  
##  1st Qu.:0.3100   1st Qu.: 39150   1st Qu.: 28.00   1st Qu.:4.000  
##  Median :0.6400   Median : 56862   Median : 64.00   Median :4.000  
##  Mean   :0.9505   Mean   : 61152   Mean   : 50.75   Mean   :4.592  
##  3rd Qu.:1.2800   3rd Qu.: 76335   3rd Qu.: 64.00   3rd Qu.:6.000  
##  Max.   :4.7100   Max.   :169590   Max.   :130.00   Max.   :8.000  
##  PAVEMENT_WIDTH    AVG_CRASH     
##  Min.   :48.00   Min.   :  0.33  
##  1st Qu.:48.00   1st Qu.:  2.00  
##  Median :48.00   Median :  8.00  
##  Mean   :55.82   Mean   : 18.63  
##  3rd Qu.:72.00   3rd Qu.: 21.67  
##  Max.   :96.00   Max.   :288.00
## To check the number of lanes
table(ruralmul_1$NUM_LANES) ## Error in defining
## 
##   3   4   5   6 
##   3 433   5   4
table(ruralint_1$NUM_LANES)
## 
##   4   6 
## 223  11
table(urbanmul_1$NUM_LANES) ## Error in defining
## 
##    2    3    4    5    6    8 
##    1    3 1562    8  194    4
table(urbanint_1$NUM_LANES)
## 
##   4   6   7   8 
## 381 132   1  11
options(warn=-1)

Exploratory Data Analysis

## Important Variable Selection
## By Randomforest
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
impvar <- randomForest(AVG_CRASH~., data = rural2_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Rural 2-lane")

impvar <- randomForest(AVG_CRASH~., data = ruralmul_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Rural Muti-lane")

impvar <- randomForest(AVG_CRASH~., data = ruralint_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Rural Interstates")

library(randomForest)
impvar <- randomForest(AVG_CRASH~., data = urban2_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Urban 2-lane")

impvar <- randomForest(AVG_CRASH~., data = urbanmul_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Urban Muti-lane")

impvar <- randomForest(AVG_CRASH~., data = urbanint_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Urban Interstates")

## Correlation plot
library(corrplot)
corrp <- cor(rural2_1)
corrplot(corrp, method = "circle")

corrp <- cor(ruralmul_1)
corrplot(corrp, method = "circle")

corrp <- cor(ruralint_1)
corrplot(corrp, method = "circle")

library(corrplot)
corrp <- cor(urban2_1)
corrplot(corrp, method = "circle")

corrp <- cor(urbanmul_1)
corrplot(corrp, method = "circle")

corrp <- cor(urbanint_1)
corrplot(corrp, method = "circle")

Algorithmic Modeling

Trainset

library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Modeling for trainset [all data]
### Rural 2-lane
svm.model <- svm(AVG_CRASH~., data = rural2_1, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, rural2_1[1:4])
sd(svm.pred)
## [1] 2.656685
pl <- cbind(obs= rural2_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= rural2_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-rural2_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- rural2_1$AVG_CRASH- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 2.294312
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=2.65, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-2.65, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Rural 2-lane [trainset]")+labs(x = "Observed", y="Predicted")
p1a

### Rural Multilane
svm.model <- svm(AVG_CRASH~., data = ruralmul_1, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, ruralmul_1[1:5])
sd(svm.pred)
## [1] 4.615578
pl <- cbind(obs= ruralmul_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= ruralmul_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-ruralmul_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- ruralmul_1$AVG_CRASH- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 1.399078
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=4.6, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-4.6, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Rural Multilane [trainset]")+labs(x = "Observed", y="Predicted")
p1a

### Rural Interstates
svm.model <- svm(AVG_CRASH~., data = ruralint_1, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, ruralint_1[1:5])
sd(svm.pred)
## [1] 16.30482
pl <- cbind(obs= ruralint_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= ruralint_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-ruralint_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- ruralint_1$AVG_CRASH- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 6.539465
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=16.3, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-16.3, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Rural Interstates [trainset]")+labs(x = "Observed", y="Predicted")
p1a

### Urban 2-lane
svm.model <- svm(AVG_CRASH~., data = urban2_1, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, urban2_1[1:4])
sd(svm.pred)
## [1] 8.342459
pl <- cbind(obs= rural2_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= rural2_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-rural2_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- rural2_1$FREQ- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] NaN
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=8.3, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-8.3, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Urban 2-lane [trainset]")+labs(x = "Observed", y="Predicted")
p1a

### Urban Multi-lane
svm.model <- svm(AVG_CRASH~., data = urbanmul_1, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, urbanmul_1[1:5])
sd(svm.pred)
## [1] 27.33387
pl <- cbind(obs= urbanmul_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= urbanmul_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-urbanmul_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- urbanmul_1$TOT_CRASH- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] NaN
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=27.3, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-27.3, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Urban Multilane [trainset]")+labs(x = "Observed", y="Predicted")
p1a

### Urban Interstates
svm.model <- svm(AVG_CRASH~., data = urbanint_1, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, urbanint_1[1:5])
sd(svm.pred)
## [1] 23.81082
pl <- cbind(obs= urbanint_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= urbanint_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-urbanint_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- urbanint_1$AVG_CRASH- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 18.19861
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=23.8, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-23.8, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Urban Interstates [trainset]")+labs(x = "Observed", y="Predicted")
p1a

library(randomForestCI)
library(dplyr) # For data manipulation
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(randomForest) # For random forest ensemble models

# Function to divide data into training, and test sets
index <- function(data=data,pctTrain=0.7)
{
  # fcn to create indices to divide data into random 
  # training, validation and testing data sets
  N <- nrow(data) 
  train <- sample(N, pctTrain*N) 
  test <- setdiff(seq_len(N),train) 
  Ind <- list(train=train,test=test)
  return(Ind)
} 
#
set.seed(123)
ind <- index(rural2_1,0.8)
length(ind$train)
## [1] 2954
length(ind$test)
## [1] 739
rf_fit <- randomForest(AVG_CRASH~., data=na.omit(rural2_1[ind$train,]),
                       keep.inbag=TRUE) # Build the model


# Plot the error as the number of trees increases
plot(rf_fit)

# Plot the important variables
varImpPlot(rf_fit,col="blue",pch= 2)

# Calculate the Variance
X <- na.omit(rural2_1[ind$test,-5])
head(X)
##     LENGTH AVG_ADT MEDIAN_WIDTH PAVEMENT_WIDTH
## 1     0.53    1920            0             24
## 47    0.86    5567            0             24
## 66    0.36    1550            0             24
## 90    2.55    4357            0             24
## 135   1.40    2333            0             24
## 153   0.66     890            0             24
var_hat <- randomForestInfJack(rf_fit, X, calibrate = TRUE)

#Have a look at the variance
head(var_hat)
##        y.hat   var.hat
## 1   1.400789 0.2448401
## 47  1.883930 0.2450745
## 66  1.327687 0.2452486
## 90  3.369700 0.3777212
## 135 1.784369 0.2450206
## 153 1.253134 0.2453819
dim(var_hat)
## [1] 739   2
plot(var_hat)

# Plot the fit
df <- data.frame(y = rural2_1[ind$test,]$AVG_CRASH, var_hat)
df <- mutate(df, se = sqrt(var.hat))
head(df)
##      y    y.hat   var.hat        se
## 1 0.33 1.400789 0.2448401 0.4948132
## 2 1.00 1.883930 0.2450745 0.4950500
## 3 0.33 1.327687 0.2452486 0.4952258
## 4 2.33 3.369700 0.3777212 0.6145903
## 5 1.00 1.784369 0.2450206 0.4949956
## 6 1.00 1.253134 0.2453819 0.4953603
p1 <- ggplot(df, aes(x = y, y = y.hat))
p1 + geom_errorbar(aes(ymin=y.hat-se, ymax=y.hat+se), width=.1) +
  geom_point(colour = "#9A8822", size = 1.3) + scale_x_continuous(limits = c(0, 22))+ theme_bw()+
  geom_abline(intercept=0, slope=1, linetype=2) +
  xlab("Observed") +
  ylab("Predicted") +
  ggtitle("Error Bars for Random Forests")

Conducted by: Subasish Das