## [1] 8877 16
## [1] "CONTROL_SECTION" "LOGMILE_FROM" "LOGMILE_TO"
## [4] "LENGTH" "AVG_ADT" "FUNCTIONAL_CLASS"
## [7] "HIGHWAY_CLASS" "CLASS_DESC" "MED_TYPE"
## [10] "MED_TYPE_DESC" "MEDIAN_WIDTH" "NUM_LANES"
## [13] "PAVEMENT_WIDTH" "INTERSECTION" "TOT_CRASH"
## [16] "AVG_CRASH"
## [1] "LENGTH" "AVG_ADT" "CLASS_DESC" "MED_TYPE_DESC"
## [5] "MEDIAN_WIDTH" "NUM_LANES" "PAVEMENT_WIDTH" "AVG_CRASH"
##
## Rural 2 Lane Rural 4 Lane Rural 4 Lane Divided
## 3693 121 324
## Rural Interstate Urban 2 Lane Urban 4 Lane
## 234 2208 637
## Urban 4 Lane Divided Urban Interstate
## 1135 525
rural2 <- subset(all1, CLASS_DESC=="Rural 2 Lane")
dim(rural2)
## [1] 3693 8
rural2_1 <- rural2[-c(3,4, 6)]
ruralmul <- subset(all1, CLASS_DESC=="Rural 4 Lane"| CLASS_DESC=="Rural 4 Lane Divided")
dim(ruralmul)
## [1] 445 8
ruralmul_1 <- ruralmul[-c(3,4)]
ruralint <- subset(all1, CLASS_DESC=="Rural Interstate")
dim(ruralint)
## [1] 234 8
ruralint_1 <- ruralint[-c(3,4)]
urban2 <- subset(all1, CLASS_DESC=="Urban 2 Lane")
dim(urban2)
## [1] 2208 8
urban2_1 <- urban2[-c(3,4, 6)]
urbanmul <- subset(all1, CLASS_DESC=="Urban 4 Lane"| CLASS_DESC=="Urban 4 Lane Divided")
dim(ruralmul)
## [1] 445 8
urbanmul_1 <- urbanmul[-c(3,4)]
urbanint <- subset(all1, CLASS_DESC=="Urban Interstate")
dim(urbanint)
## [1] 525 8
urbanint_1 <- urbanint[-c(3,4)]
## summany statistics
summary(rural2_1)
## LENGTH AVG_ADT MEDIAN_WIDTH PAVEMENT_WIDTH
## Min. : 0.010 Min. : 41 Min. : 0.00000 Min. :14.0
## 1st Qu.: 0.880 1st Qu.: 863 1st Qu.: 0.00000 1st Qu.:20.0
## Median : 2.230 Median : 1883 Median : 0.00000 Median :22.0
## Mean : 2.826 Mean : 2741 Mean : 0.08665 Mean :22.3
## 3rd Qu.: 4.200 3rd Qu.: 3700 3rd Qu.: 0.00000 3rd Qu.:24.0
## Max. :17.400 Max. :18744 Max. :30.00000 Max. :35.0
## AVG_CRASH
## Min. : 0.330
## 1st Qu.: 0.670
## Median : 1.330
## Mean : 2.433
## 3rd Qu.: 3.000
## Max. :55.330
summary(ruralmul_1)
## LENGTH AVG_ADT MEDIAN_WIDTH NUM_LANES
## Min. : 0.030 Min. : 130 Min. : 0.0 Min. :3.000
## 1st Qu.: 0.350 1st Qu.: 5025 1st Qu.: 0.0 1st Qu.:4.000
## Median : 0.800 Median : 7850 Median :20.0 Median :4.000
## Mean : 1.569 Mean : 9740 Mean :22.4 Mean :4.022
## 3rd Qu.: 2.230 3rd Qu.:12124 3rd Qu.:32.0 3rd Qu.:4.000
## Max. :13.140 Max. :38182 Max. :64.0 Max. :6.000
## PAVEMENT_WIDTH AVG_CRASH
## Min. :36.00 Min. : 0.330
## 1st Qu.:48.00 1st Qu.: 0.670
## Median :48.00 Median : 1.330
## Mean :47.85 Mean : 3.287
## 3rd Qu.:48.00 3rd Qu.: 4.000
## Max. :72.00 Max. :43.330
summary(ruralint_1)
## LENGTH AVG_ADT MEDIAN_WIDTH NUM_LANES
## Min. : 0.0200 Min. :12300 Min. : 2.00 Min. :4.000
## 1st Qu.: 0.6925 1st Qu.:17709 1st Qu.: 64.00 1st Qu.:4.000
## Median : 2.4600 Median :30700 Median : 64.00 Median :4.000
## Mean : 2.7056 Mean :29864 Mean : 70.86 Mean :4.094
## 3rd Qu.: 4.0450 3rd Qu.:39432 3rd Qu.: 99.00 3rd Qu.:4.000
## Max. :11.0800 Max. :61060 Max. :130.00 Max. :6.000
## PAVEMENT_WIDTH AVG_CRASH
## Min. :48.00 Min. : 0.330
## 1st Qu.:48.00 1st Qu.: 1.415
## Median :48.00 Median : 5.670
## Mean :50.59 Mean : 11.667
## 3rd Qu.:48.00 3rd Qu.: 16.247
## Max. :80.00 Max. :167.000
summary(urban2_1)
## LENGTH AVG_ADT MEDIAN_WIDTH PAVEMENT_WIDTH
## Min. :0.0100 Min. : 220 Min. : 0.0000 Min. :12.00
## 1st Qu.:0.2400 1st Qu.: 3810 1st Qu.: 0.0000 1st Qu.:22.00
## Median :0.5500 Median : 6826 Median : 0.0000 Median :24.00
## Mean :0.9072 Mean : 8499 Mean : 0.2056 Mean :23.46
## 3rd Qu.:1.1900 3rd Qu.:11462 3rd Qu.: 0.0000 3rd Qu.:24.00
## Max. :9.6000 Max. :57325 Max. :30.0000 Max. :72.00
## AVG_CRASH
## Min. : 0.330
## 1st Qu.: 1.000
## Median : 2.330
## Mean : 6.094
## 3rd Qu.: 6.670
## Max. :143.670
summary(urbanmul_1)
## LENGTH AVG_ADT MEDIAN_WIDTH NUM_LANES
## Min. :0.0100 Min. : 1020 Min. : 0.00 Min. :2.00
## 1st Qu.:0.1700 1st Qu.: 11625 1st Qu.: 0.00 1st Qu.:4.00
## Median :0.3800 Median : 18960 Median : 8.00 Median :4.00
## Mean :0.6151 Mean : 21095 Mean : 16.43 Mean :4.23
## 3rd Qu.:0.7600 3rd Qu.: 27044 3rd Qu.: 30.00 3rd Qu.:4.00
## Max. :7.5400 Max. :151349 Max. :299.00 Max. :8.00
## PAVEMENT_WIDTH AVG_CRASH
## Min. :22.00 Min. : 0.33
## 1st Qu.:48.00 1st Qu.: 2.00
## Median :48.00 Median : 5.33
## Mean :50.58 Mean : 14.53
## 3rd Qu.:49.00 3rd Qu.: 15.00
## Max. :99.00 Max. :587.33
summary(urbanint_1)
## LENGTH AVG_ADT MEDIAN_WIDTH NUM_LANES
## Min. :0.0200 Min. : 12500 Min. : 0.00 Min. :4.000
## 1st Qu.:0.3100 1st Qu.: 39150 1st Qu.: 28.00 1st Qu.:4.000
## Median :0.6400 Median : 56862 Median : 64.00 Median :4.000
## Mean :0.9505 Mean : 61152 Mean : 50.75 Mean :4.592
## 3rd Qu.:1.2800 3rd Qu.: 76335 3rd Qu.: 64.00 3rd Qu.:6.000
## Max. :4.7100 Max. :169590 Max. :130.00 Max. :8.000
## PAVEMENT_WIDTH AVG_CRASH
## Min. :48.00 Min. : 0.33
## 1st Qu.:48.00 1st Qu.: 2.00
## Median :48.00 Median : 8.00
## Mean :55.82 Mean : 18.63
## 3rd Qu.:72.00 3rd Qu.: 21.67
## Max. :96.00 Max. :288.00
## To check the number of lanes
table(ruralmul_1$NUM_LANES) ## Error in defining
##
## 3 4 5 6
## 3 433 5 4
table(ruralint_1$NUM_LANES)
##
## 4 6
## 223 11
table(urbanmul_1$NUM_LANES) ## Error in defining
##
## 2 3 4 5 6 8
## 1 3 1562 8 194 4
table(urbanint_1$NUM_LANES)
##
## 4 6 7 8
## 381 132 1 11
options(warn=-1)
## Important Variable Selection
## By Randomforest
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
impvar <- randomForest(AVG_CRASH~., data = rural2_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Rural 2-lane")
impvar <- randomForest(AVG_CRASH~., data = ruralmul_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Rural Muti-lane")
impvar <- randomForest(AVG_CRASH~., data = ruralint_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Rural Interstates")
library(randomForest)
impvar <- randomForest(AVG_CRASH~., data = urban2_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Urban 2-lane")
impvar <- randomForest(AVG_CRASH~., data = urbanmul_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Urban Muti-lane")
impvar <- randomForest(AVG_CRASH~., data = urbanint_1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar, main="Urban Interstates")
## Correlation plot
library(corrplot)
corrp <- cor(rural2_1)
corrplot(corrp, method = "circle")
corrp <- cor(ruralmul_1)
corrplot(corrp, method = "circle")
corrp <- cor(ruralint_1)
corrplot(corrp, method = "circle")
library(corrplot)
corrp <- cor(urban2_1)
corrplot(corrp, method = "circle")
corrp <- cor(urbanmul_1)
corrplot(corrp, method = "circle")
corrp <- cor(urbanint_1)
corrplot(corrp, method = "circle")
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Modeling for trainset [all data]
### Rural 2-lane
svm.model <- svm(AVG_CRASH~., data = rural2_1, cost = 100, gamma =1)
svm.pred <- predict(svm.model, rural2_1[1:4])
sd(svm.pred)
## [1] 2.656685
pl <- cbind(obs= rural2_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= rural2_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-rural2_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- rural2_1$AVG_CRASH- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 2.294312
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=2.65, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-2.65, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Rural 2-lane [trainset]")+labs(x = "Observed", y="Predicted")
p1a
### Rural Multilane
svm.model <- svm(AVG_CRASH~., data = ruralmul_1, cost = 100, gamma =1)
svm.pred <- predict(svm.model, ruralmul_1[1:5])
sd(svm.pred)
## [1] 4.615578
pl <- cbind(obs= ruralmul_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= ruralmul_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-ruralmul_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- ruralmul_1$AVG_CRASH- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 1.399078
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=4.6, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-4.6, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Rural Multilane [trainset]")+labs(x = "Observed", y="Predicted")
p1a
### Rural Interstates
svm.model <- svm(AVG_CRASH~., data = ruralint_1, cost = 100, gamma =1)
svm.pred <- predict(svm.model, ruralint_1[1:5])
sd(svm.pred)
## [1] 16.30482
pl <- cbind(obs= ruralint_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= ruralint_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-ruralint_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- ruralint_1$AVG_CRASH- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 6.539465
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=16.3, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-16.3, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Rural Interstates [trainset]")+labs(x = "Observed", y="Predicted")
p1a
### Urban 2-lane
svm.model <- svm(AVG_CRASH~., data = urban2_1, cost = 100, gamma =1)
svm.pred <- predict(svm.model, urban2_1[1:4])
sd(svm.pred)
## [1] 8.342459
pl <- cbind(obs= rural2_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= rural2_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-rural2_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- rural2_1$FREQ- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] NaN
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=8.3, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-8.3, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Urban 2-lane [trainset]")+labs(x = "Observed", y="Predicted")
p1a
### Urban Multi-lane
svm.model <- svm(AVG_CRASH~., data = urbanmul_1, cost = 100, gamma =1)
svm.pred <- predict(svm.model, urbanmul_1[1:5])
sd(svm.pred)
## [1] 27.33387
pl <- cbind(obs= urbanmul_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= urbanmul_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-urbanmul_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- urbanmul_1$TOT_CRASH- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] NaN
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=27.3, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-27.3, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Urban Multilane [trainset]")+labs(x = "Observed", y="Predicted")
p1a
### Urban Interstates
svm.model <- svm(AVG_CRASH~., data = urbanint_1, cost = 100, gamma =1)
svm.pred <- predict(svm.model, urbanint_1[1:5])
sd(svm.pred)
## [1] 23.81082
pl <- cbind(obs= urbanint_1$AVG_CRASH, pred=svm.pred)
pl <- cbind(obs= urbanint_1$AVG_CRASH, pred=abs(svm.pred), diff=abs(svm.pred)-urbanint_1$AVG_CRASH)
pl <- data.frame(pl)
svm_err_all <- urbanint_1$AVG_CRASH- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 18.19861
library(ggplot2)
p1 <- ggplot(pl, aes(obs, pred))
p1a <- p1 + geom_point(colour = "#9A8822", size = 1.3)+ theme_bw()+ geom_abline(slope=1,size = 0.6)+ geom_abline(intercept=23.8, slope=1, color="#F98400",size = 0.6,linetype=4)+geom_abline(intercept=-23.8, slope=1, color="#F98400",size = 0.6,linetype=4)+
labs(title = "Urban Interstates [trainset]")+labs(x = "Observed", y="Predicted")
p1a
library(randomForestCI)
library(dplyr) # For data manipulation
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest) # For random forest ensemble models
# Function to divide data into training, and test sets
index <- function(data=data,pctTrain=0.7)
{
# fcn to create indices to divide data into random
# training, validation and testing data sets
N <- nrow(data)
train <- sample(N, pctTrain*N)
test <- setdiff(seq_len(N),train)
Ind <- list(train=train,test=test)
return(Ind)
}
#
set.seed(123)
ind <- index(rural2_1,0.8)
length(ind$train)
## [1] 2954
length(ind$test)
## [1] 739
rf_fit <- randomForest(AVG_CRASH~., data=na.omit(rural2_1[ind$train,]),
keep.inbag=TRUE) # Build the model
# Plot the error as the number of trees increases
plot(rf_fit)
# Plot the important variables
varImpPlot(rf_fit,col="blue",pch= 2)
# Calculate the Variance
X <- na.omit(rural2_1[ind$test,-5])
head(X)
## LENGTH AVG_ADT MEDIAN_WIDTH PAVEMENT_WIDTH
## 1 0.53 1920 0 24
## 47 0.86 5567 0 24
## 66 0.36 1550 0 24
## 90 2.55 4357 0 24
## 135 1.40 2333 0 24
## 153 0.66 890 0 24
var_hat <- randomForestInfJack(rf_fit, X, calibrate = TRUE)
#Have a look at the variance
head(var_hat)
## y.hat var.hat
## 1 1.400789 0.2448401
## 47 1.883930 0.2450745
## 66 1.327687 0.2452486
## 90 3.369700 0.3777212
## 135 1.784369 0.2450206
## 153 1.253134 0.2453819
dim(var_hat)
## [1] 739 2
plot(var_hat)
# Plot the fit
df <- data.frame(y = rural2_1[ind$test,]$AVG_CRASH, var_hat)
df <- mutate(df, se = sqrt(var.hat))
head(df)
## y y.hat var.hat se
## 1 0.33 1.400789 0.2448401 0.4948132
## 2 1.00 1.883930 0.2450745 0.4950500
## 3 0.33 1.327687 0.2452486 0.4952258
## 4 2.33 3.369700 0.3777212 0.6145903
## 5 1.00 1.784369 0.2450206 0.4949956
## 6 1.00 1.253134 0.2453819 0.4953603
p1 <- ggplot(df, aes(x = y, y = y.hat))
p1 + geom_errorbar(aes(ymin=y.hat-se, ymax=y.hat+se), width=.1) +
geom_point(colour = "#9A8822", size = 1.3) + scale_x_continuous(limits = c(0, 22))+ theme_bw()+
geom_abline(intercept=0, slope=1, linetype=2) +
xlab("Observed") +
ylab("Predicted") +
ggtitle("Error Bars for Random Forests")
Conducted by: Subasish Das