setwd("/Users/subasishdas1/Desktop/TRB2016/nick_thesis")
all_b <- read.csv("All_Roadways.csv") ## All-Crashes
head(all_b) ## Read first few rows
## Type AVG_ADT LENGTH DOTD_FC MEDIAN_WIDTH NUM_LANES PAVEMENT_WIDTH
## 1 Interstate 59000.00 0.82 11 64 4 48
## 2 Interstate 34800.00 1.31 11 64 4 48
## 3 Interstate 34514.29 1.34 11 64 4 48
## 4 Interstate 34712.50 0.58 11 64 4 48
## 5 Interstate 34400.00 0.08 11 64 4 48
## 6 Interstate 34300.00 0.38 11 64 4 48
## AVG_LANE_WIDTH SHOULDER_WIDTH CLASS CLASS_DESC CBD TOT_CRASHES
## 1 12 10 2 DIVIDED 0 1
## 2 12 10 2 DIVIDED 0 121
## 3 12 10 2 DIVIDED 0 107
## 4 12 10 2 DIVIDED 0 31
## 5 12 10 2 DIVIDED 0 12
## 6 12 10 2 DIVIDED 0 25
## AVG_CRASHES FITTED
## 1 0.125 25.330851
## 2 15.125 18.765175
## 3 13.375 18.921692
## 4 3.875 9.375180
## 5 1.500 1.727995
## 6 3.125 6.448532
table(all_b$Type)
##
## 2-lane 4-lane Interstate
## 3211 2285 563
twolane <- subset(all_b, Type=="2-lane") ## subsetting
fourlane <- subset(all_b, Type=="4-lane")
int <- subset(all_b, Type=="Interstate")
options(warn=-1)
dim(twolane)
## [1] 3211 15
str(twolane)
## 'data.frame': 3211 obs. of 15 variables:
## $ Type : Factor w/ 3 levels "2-lane","4-lane",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ AVG_ADT : num 4967 3275 4267 4267 6475 ...
## $ LENGTH : num 0.06 0.06 0.63 1.03 0.15 0.41 0.71 1.96 1.99 1.71 ...
## $ DOTD_FC : int 17 17 17 17 17 16 16 16 16 16 ...
## $ MEDIAN_WIDTH : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NUM_LANES : int 2 2 2 2 2 2 2 2 2 2 ...
## $ PAVEMENT_WIDTH: int 24 24 24 24 24 24 24 24 24 24 ...
## $ AVG_LANE_WIDTH: num 12 12 12 12 12 12 12 12 12 12 ...
## $ SHOULDER_WIDTH: int 0 0 6 7 8 8 8 8 8 7 ...
## $ CLASS : int 1 1 1 1 1 1 1 1 1 1 ...
## $ CLASS_DESC : Factor w/ 3 levels "CTL","DIVIDED",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ CBD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ TOT_CRASHES : int 6 9 10 20 9 5 3 29 4 143 ...
## $ AVG_CRASHES : num 0.75 1.12 1.25 2.5 1.12 ...
## $ FITTED : num 1.107 0.782 3.736 4.947 2.332 ...
twolane1 <- twolane[-c(1, 4, 6, 7, 13, 15)]
names(twolane1)
## [1] "AVG_ADT" "LENGTH" "MEDIAN_WIDTH" "AVG_LANE_WIDTH"
## [5] "SHOULDER_WIDTH" "CLASS" "CLASS_DESC" "CBD"
## [9] "AVG_CRASHES"
## Important Variable Selection
library(leaps)
leaps=regsubsets(AVG_CRASHES~., data = twolane1, nbest=5)
## Reordering variables and trying again:
plot(leaps, scale="bic")
## By Randomforest
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
impvar <- randomForest(AVG_CRASHES~., data = twolane1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar)
## Reducing variables
twolane2 <- twolane1[-c(6:8)]
names(twolane2)
## [1] "AVG_ADT" "LENGTH" "MEDIAN_WIDTH" "AVG_LANE_WIDTH"
## [5] "SHOULDER_WIDTH" "AVG_CRASHES"
## Correlation plot
library(corrplot)
corrp <- cor(twolane2)
corrplot(corrp, method = "circle")
plot(twolane2, pch=8, cex=0.1)
### Considering 8 Variables
full=lm(AVG_CRASHES~., data = twolane1)
full
##
## Call:
## lm(formula = AVG_CRASHES ~ ., data = twolane1)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH
## -1.338e+01 6.309e-04 4.485e+00
## MEDIAN_WIDTH AVG_LANE_WIDTH SHOULDER_WIDTH
## -1.492e-02 6.674e-01 1.045e-01
## CLASS CLASS_DESCUNDIVIDED CBD
## 1.908e+00 NA -5.671e+00
step(full, data=twolane1, direction="both")
## Start: AIC=14901.91
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH + CLASS + CLASS_DESC + CBD
##
##
## Step: AIC=14901.91
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH + CLASS + CBD
##
## Df Sum of Sq RSS AIC
## - MEDIAN_WIDTH 1 6 331131 14900
## <none> 331124 14902
## - CBD 1 255 331380 14902
## - CLASS 1 284 331408 14903
## - SHOULDER_WIDTH 1 373 331497 14904
## - AVG_LANE_WIDTH 1 3274 334399 14932
## - AVG_ADT 1 50987 382111 15360
## - LENGTH 1 66750 397874 15490
##
## Step: AIC=14899.98
## AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH + SHOULDER_WIDTH +
## CLASS + CBD
##
## Df Sum of Sq RSS AIC
## <none> 331131 14900
## - CBD 1 257 331388 14900
## - CLASS 1 302 331433 14901
## - SHOULDER_WIDTH 1 369 331500 14902
## + MEDIAN_WIDTH 1 6 331124 14902
## - AVG_LANE_WIDTH 1 3268 334399 14930
## - AVG_ADT 1 51547 382678 15362
## - LENGTH 1 67592 398723 15494
##
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH + CLASS + CBD, data = twolane1)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH AVG_LANE_WIDTH
## -1.325e+01 6.301e-04 4.489e+00 6.658e-01
## SHOULDER_WIDTH CLASS CBD
## 1.039e-01 1.788e+00 -5.689e+00
### Considering 5 Variables
full=lm(AVG_CRASHES~., data = twolane2)
full
##
## Call:
## lm(formula = AVG_CRASHES ~ ., data = twolane2)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH MEDIAN_WIDTH
## -1.157e+01 6.317e-04 4.495e+00 2.593e-02
## AVG_LANE_WIDTH SHOULDER_WIDTH
## 6.757e-01 1.023e-01
step(full, data=twolane2, direction="both")
## Start: AIC=14903.28
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH
##
## Df Sum of Sq RSS AIC
## - MEDIAN_WIDTH 1 24 331702 14902
## <none> 331679 14903
## - SHOULDER_WIDTH 1 357 332036 14905
## - AVG_LANE_WIDTH 1 3368 335046 14934
## - AVG_ADT 1 51523 383202 15365
## - LENGTH 1 67083 398761 15493
##
## Step: AIC=14901.51
## AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH + SHOULDER_WIDTH
##
## Df Sum of Sq RSS AIC
## <none> 331702 14902
## - SHOULDER_WIDTH 1 364 332066 14903
## + MEDIAN_WIDTH 1 24 331679 14903
## - AVG_LANE_WIDTH 1 3439 335141 14933
## - AVG_ADT 1 53261 384963 15378
## - LENGTH 1 67591 399293 15495
##
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH, data = twolane2)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH AVG_LANE_WIDTH
## -1.162e+01 6.339e-04 4.486e+00 6.804e-01
## SHOULDER_WIDTH
## 1.031e-01
### Generalized linear model (Poisson)[all data]
lmdel <- glm(AVG_CRASHES~., data = twolane2, family="poisson")
m <- predict(lmdel, type="response")
error <- twolane2$AVG_CRASHES- m
(rmse <- sqrt(mean(error^2)))
## [1] 11.3625
lmdel
##
## Call: glm(formula = AVG_CRASHES ~ ., family = "poisson", data = twolane2)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH MEDIAN_WIDTH
## 1.433e-01 6.238e-05 3.883e-01 -3.763e-03
## AVG_LANE_WIDTH SHOULDER_WIDTH
## 3.800e-02 2.590e-02
##
## Degrees of Freedom: 3210 Total (i.e. Null); 3205 Residual
## Null Deviance: 36530
## Residual Deviance: 22660 AIC: Inf
### Generalized linear model (Negative-binomial)[all data]
library(MASS)
nbmodel<- glm.nb(AVG_CRASHES~., data = twolane2)
m1 <- predict(nbmodel, type="response")
nbmodel
##
## Call: glm.nb(formula = AVG_CRASHES ~ ., data = twolane2, init.theta = 1.104723388,
## link = log)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH MEDIAN_WIDTH
## -4.608e-01 9.824e-05 6.517e-01 -2.405e-03
## AVG_LANE_WIDTH SHOULDER_WIDTH
## 3.648e-02 9.751e-03
##
## Degrees of Freedom: 3210 Total (i.e. Null); 3205 Residual
## Null Deviance: 6077
## Residual Deviance: 3722 AIC: 16350
error1 <- twolane2$AVG_CRASHES- m1
(rmse <- sqrt(mean(error1^2)))
## [1] 78.30459
Comment: Very high RMSE. Not recommended.
index <- 1:nrow(twolane2)
testindex <- sample(index, trunc(length(index)/30))
testset <- twolane2[testindex,]
trainset <- twolane2[-testindex,]
dim(testset)
## [1] 107 6
dim(trainset)
## [1] 3104 6
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Modeling for testset
svm.model <- svm(AVG_CRASHES~., data = trainset, cost = 100, gamma =1)
svm.pred <- predict(svm.model, testset[1:5])
pl <- cbind(obs= testset$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= testset$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-testset$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)
svm_err <- testset$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err^2)))
## [1] 9.310153
## Modeling for trainset [all data]
svm.model <- svm(AVG_CRASHES~., data = twolane2, cost = 100, gamma =1)
svm.pred <- predict(svm.model, twolane2[1:5])
pl <- cbind(obs= twolane2$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= twolane2$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-twolane2$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)
svm_err_all <- twolane2$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 5.949426
Comment: Lower RMSE than Data Modeling RMSE.
dim(fourlane)
## [1] 2285 15
str(fourlane)
## 'data.frame': 2285 obs. of 15 variables:
## $ Type : Factor w/ 3 levels "2-lane","4-lane",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ AVG_ADT : num 11800 14867 13300 17757 13533 ...
## $ LENGTH : num 0.04 0.1 0.6 1.41 0.92 0.92 0.54 0.54 0.28 0.05 ...
## $ DOTD_FC : int 14 14 14 14 14 14 14 14 14 14 ...
## $ MEDIAN_WIDTH : int 0 0 0 0 14 16 14 16 20 20 ...
## $ NUM_LANES : int 6 6 4 4 4 4 4 4 4 4 ...
## $ PAVEMENT_WIDTH: int 78 78 40 49 48 48 48 44 48 48 ...
## $ AVG_LANE_WIDTH: num 13 13 10 12 12 12 12 11 12 12 ...
## $ SHOULDER_WIDTH: int 0 0 0 0 0 0 0 0 10 10 ...
## $ CLASS : int 1 1 1 1 2 2 2 2 2 2 ...
## $ CLASS_DESC : Factor w/ 3 levels "CTL","DIVIDED",..: 3 3 3 3 2 2 2 2 2 2 ...
## $ CBD : int 1 1 1 1 0 0 0 0 0 0 ...
## $ TOT_CRASHES : int 18 8 44 713 181 125 66 302 138 4 ...
## $ AVG_CRASHES : num 2.25 1 5.5 89.12 22.62 ...
## $ FITTED : num 1.78 3.6 10.67 21.89 17.97 ...
fourlane1 <- fourlane[-c(1, 4, 6, 7, 13, 15)]
names(fourlane1)
## [1] "AVG_ADT" "LENGTH" "MEDIAN_WIDTH" "AVG_LANE_WIDTH"
## [5] "SHOULDER_WIDTH" "CLASS" "CLASS_DESC" "CBD"
## [9] "AVG_CRASHES"
## Important Variable Selection
library(leaps)
leaps=regsubsets(AVG_CRASHES~., data = fourlane1, nbest=5)
## Reordering variables and trying again:
plot(leaps, scale="bic")
## By Randomforest
library(randomForest)
impvar <- randomForest(AVG_CRASHES~., data = fourlane1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar)
## Reducing variables
fourlane2 <- fourlane1[-c(6:8)]
names(fourlane2)
## [1] "AVG_ADT" "LENGTH" "MEDIAN_WIDTH" "AVG_LANE_WIDTH"
## [5] "SHOULDER_WIDTH" "AVG_CRASHES"
## Correlation plot
library(corrplot)
corrp <- cor(fourlane2)
corrplot(corrp, method = "circle")
plot(fourlane2, pch=8, cex=0.1)
### Considering 8 Variables
full=lm(AVG_CRASHES~., data = fourlane1)
full
##
## Call:
## lm(formula = AVG_CRASHES ~ ., data = fourlane1)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH
## -5.2254280 0.0008037 14.2217707
## MEDIAN_WIDTH AVG_LANE_WIDTH SHOULDER_WIDTH
## -0.1855600 0.0502469 -0.1781190
## CLASS CLASS_DESCDIVIDED CLASS_DESCUNDIVIDED
## -0.8228764 0.8403549 NA
## CBD
## -4.0366008
step(full, data=fourlane1, direction="both")
## Start: AIC=14859.78
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH + CLASS + CLASS_DESC + CBD
##
##
## Step: AIC=14859.78
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH + CLASS_DESC + CBD
##
## Df Sum of Sq RSS AIC
## - CLASS_DESC 2 25 1512762 14856
## - AVG_LANE_WIDTH 1 8 1512746 14858
## - SHOULDER_WIDTH 1 1193 1513931 14860
## <none> 1512738 14860
## - MEDIAN_WIDTH 1 1839 1514576 14861
## - CBD 1 1966 1514704 14861
## - AVG_ADT 1 186705 1699443 15124
## - LENGTH 1 215251 1727989 15162
##
## Step: AIC=14855.82
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH + CBD
##
## Df Sum of Sq RSS AIC
## - AVG_LANE_WIDTH 1 11 1512773 14854
## - SHOULDER_WIDTH 1 1207 1513969 14856
## <none> 1512762 14856
## - CBD 1 1965 1514727 14857
## + CLASS 1 1 1512761 14858
## + CLASS_DESC 2 25 1512738 14860
## - MEDIAN_WIDTH 1 4363 1517125 14860
## - AVG_ADT 1 191651 1704413 15126
## - LENGTH 1 215386 1728148 15158
##
## Step: AIC=14853.84
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + SHOULDER_WIDTH +
## CBD
##
## Df Sum of Sq RSS AIC
## - SHOULDER_WIDTH 1 1207 1513980 14854
## <none> 1512773 14854
## - CBD 1 1955 1514728 14855
## + AVG_LANE_WIDTH 1 11 1512762 14856
## + CLASS 1 1 1512772 14856
## + CLASS_DESC 2 27 1512746 14858
## - MEDIAN_WIDTH 1 4360 1517133 14858
## - AVG_ADT 1 194780 1707554 15129
## - LENGTH 1 215374 1728148 15156
##
## Step: AIC=14853.66
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + CBD
##
## Df Sum of Sq RSS AIC
## <none> 1513980 14854
## + SHOULDER_WIDTH 1 1207 1512773 14854
## - CBD 1 1776 1515757 14854
## + CLASS 1 18 1513962 14856
## + AVG_LANE_WIDTH 1 11 1513969 14856
## + CLASS_DESC 2 41 1513940 14858
## - MEDIAN_WIDTH 1 7121 1521101 14862
## - AVG_ADT 1 196030 1710010 15130
## - LENGTH 1 214351 1728332 15154
##
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH +
## CBD, data = fourlane1)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH MEDIAN_WIDTH CBD
## -5.7819455 0.0008064 14.1128725 -0.2193557 -3.8209792
### Considering 4 Variables
full=lm(AVG_CRASHES~., data = fourlane2)
full
##
## Call:
## lm(formula = AVG_CRASHES ~ ., data = fourlane2)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH MEDIAN_WIDTH
## -5.819484 0.000795 14.228189 -0.179394
## AVG_LANE_WIDTH SHOULDER_WIDTH
## 0.017060 -0.164322
step(full, data=fourlane2, direction="both")
## Start: AIC=14856.79
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH
##
## Df Sum of Sq RSS AIC
## - AVG_LANE_WIDTH 1 1 1514728 14855
## - SHOULDER_WIDTH 1 1028 1515755 14856
## <none> 1514727 14857
## - MEDIAN_WIDTH 1 4108 1518835 14861
## - AVG_ADT 1 189688 1704415 15124
## - LENGTH 1 215507 1730234 15159
##
## Step: AIC=14854.79
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + SHOULDER_WIDTH
##
## Df Sum of Sq RSS AIC
## - SHOULDER_WIDTH 1 1028 1515757 14854
## <none> 1514728 14855
## + AVG_LANE_WIDTH 1 1 1514727 14857
## - MEDIAN_WIDTH 1 4131 1518859 14859
## - AVG_ADT 1 192852 1707580 15127
## - LENGTH 1 215512 1730240 15157
##
## Step: AIC=14854.34
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH
##
## Df Sum of Sq RSS AIC
## <none> 1515757 14854
## + SHOULDER_WIDTH 1 1028 1514728 14855
## + AVG_LANE_WIDTH 1 1 1515755 14856
## - MEDIAN_WIDTH 1 6675 1522432 14862
## - AVG_ADT 1 194325 1710081 15128
## - LENGTH 1 214754 1730511 15155
##
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH, data = fourlane2)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH MEDIAN_WIDTH
## -5.9035688 0.0007976 14.1254508 -0.2118813
### Generalized linear model (Poisson)[all data]
lmdel <- glm(AVG_CRASHES~., data = fourlane2, family="poisson")
m <- predict(lmdel, type="response")
lmdel
##
## Call: glm(formula = AVG_CRASHES ~ ., family = "poisson", data = fourlane2)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH MEDIAN_WIDTH
## 1.472e+00 3.667e-05 4.215e-01 -6.323e-03
## AVG_LANE_WIDTH SHOULDER_WIDTH
## 1.940e-02 -4.692e-03
##
## Degrees of Freedom: 2284 Total (i.e. Null); 2279 Residual
## Null Deviance: 66910
## Residual Deviance: 47680 AIC: Inf
error <- fourlane2$AVG_CRASHES- m
(rmse <- sqrt(mean(error^2)))
## [1] 29.43547
index <- 1:nrow(fourlane2)
testindex <- sample(index, trunc(length(index)/30))
testset <- fourlane2[testindex,]
trainset <- fourlane2[-testindex,]
dim(testset)
## [1] 76 6
dim(trainset)
## [1] 2209 6
library(e1071)
library(caret)
## Modeling for testset
svm.model <- svm(AVG_CRASHES~., data = trainset, cost = 100, gamma =1)
svm.pred <- predict(svm.model, testset[1:5])
pl <- cbind(obs= testset$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= testset$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-testset$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)
svm_err <- testset$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err^2)))
## [1] 18.27462
## Modeling for trainset [all data]
svm.model <- svm(AVG_CRASHES~., data = fourlane2, cost = 100, gamma =1)
svm.pred <- predict(svm.model, fourlane2[1:5])
pl <- cbind(obs= fourlane2$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= fourlane2$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-fourlane2$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)
svm_err_all <- fourlane2$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 13.86325
Comment: Lower RMSE than Data Modeling RMSE.
dim(int)
## [1] 563 15
str(int)
## 'data.frame': 563 obs. of 15 variables:
## $ Type : Factor w/ 3 levels "2-lane","4-lane",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ AVG_ADT : num 59000 34800 34514 34712 34400 ...
## $ LENGTH : num 0.82 1.31 1.34 0.58 0.08 0.38 0.4 0.13 0.36 0.53 ...
## $ DOTD_FC : int 11 11 11 11 11 11 11 11 11 11 ...
## $ MEDIAN_WIDTH : int 64 64 64 64 64 64 64 64 64 64 ...
## $ NUM_LANES : int 4 4 4 4 4 4 4 4 4 4 ...
## $ PAVEMENT_WIDTH: int 48 48 48 48 48 48 48 48 48 48 ...
## $ AVG_LANE_WIDTH: num 12 12 12 12 12 12 12 12 12 12 ...
## $ SHOULDER_WIDTH: int 10 10 10 10 10 10 10 10 10 10 ...
## $ CLASS : int 2 2 2 2 2 2 2 2 2 2 ...
## $ CLASS_DESC : Factor w/ 3 levels "CTL","DIVIDED",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ CBD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ TOT_CRASHES : int 1 121 107 31 12 25 4 10 69 32 ...
## $ AVG_CRASHES : num 0.125 15.125 13.375 3.875 1.5 ...
## $ FITTED : num 25.33 18.77 18.92 9.38 1.73 ...
int1 <- int[-c(1, 4, 6, 7, 13, 15)]
names(int1)
## [1] "AVG_ADT" "LENGTH" "MEDIAN_WIDTH" "AVG_LANE_WIDTH"
## [5] "SHOULDER_WIDTH" "CLASS" "CLASS_DESC" "CBD"
## [9] "AVG_CRASHES"
## Important Variable Selection
library(leaps)
leaps=regsubsets(AVG_CRASHES~., data = int1, nbest=5)
## Reordering variables and trying again:
plot(leaps, scale="bic")
## By Randomforest
library(randomForest)
impvar <- randomForest(AVG_CRASHES~., data = int1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar)
## Reducing variables
int2 <- int1[-c(5:8)]
names(int2)
## [1] "AVG_ADT" "LENGTH" "MEDIAN_WIDTH" "AVG_LANE_WIDTH"
## [5] "AVG_CRASHES"
## Correlation plot
library(corrplot)
corrp <- cor(int2)
corrplot(corrp, method = "circle")
plot(int2, pch=8, cex=0.1)
### Considering 8 Variables
full=lm(AVG_CRASHES~., data = int1)
full
##
## Call:
## lm(formula = AVG_CRASHES ~ ., data = int1)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH
## 2.329e+01 6.576e-04 1.900e+01
## MEDIAN_WIDTH AVG_LANE_WIDTH SHOULDER_WIDTH
## 1.483e-03 4.797e+00 8.313e-01
## CLASS CLASS_DESCUNDIVIDED CBD
## -5.928e+01 NA 6.175e+00
step(full, data=int1, direction="both")
## Start: AIC=4179.75
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH + CLASS + CLASS_DESC + CBD
##
##
## Step: AIC=4179.75
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH + CLASS + CBD
##
## Df Sum of Sq RSS AIC
## - MEDIAN_WIDTH 1 1 917068 4177.8
## - CBD 1 914 917982 4178.3
## <none> 917068 4179.8
## - SHOULDER_WIDTH 1 3526 920593 4179.9
## - AVG_LANE_WIDTH 1 14269 931336 4186.4
## - CLASS 1 31572 948640 4196.8
## - LENGTH 1 148399 1065467 4262.2
## - AVG_ADT 1 191004 1108071 4284.3
##
## Step: AIC=4177.76
## AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH + SHOULDER_WIDTH +
## CLASS + CBD
##
## Df Sum of Sq RSS AIC
## - CBD 1 913 917982 4176.3
## <none> 917068 4177.8
## - SHOULDER_WIDTH 1 3998 921067 4178.2
## + MEDIAN_WIDTH 1 1 917068 4179.8
## - AVG_LANE_WIDTH 1 14311 931379 4184.5
## - CLASS 1 32006 949074 4195.1
## - LENGTH 1 150274 1067343 4261.2
## - AVG_ADT 1 195289 1112357 4284.4
##
## Step: AIC=4176.32
## AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH + SHOULDER_WIDTH +
## CLASS
##
## Df Sum of Sq RSS AIC
## <none> 917982 4176.3
## - SHOULDER_WIDTH 1 3604 921585 4176.5
## + CBD 1 913 917068 4177.8
## + MEDIAN_WIDTH 1 0 917982 4178.3
## - AVG_LANE_WIDTH 1 14156 932138 4182.9
## - CLASS 1 33939 951921 4194.8
## - LENGTH 1 153593 1071575 4261.4
## - AVG_ADT 1 202129 1120111 4286.4
##
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH +
## SHOULDER_WIDTH + CLASS, data = int1)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH AVG_LANE_WIDTH
## 2.634e+01 6.632e-04 1.913e+01 4.771e+00
## SHOULDER_WIDTH CLASS
## 7.871e-01 -6.052e+01
### Considering 4 Variables
full=lm(AVG_CRASHES~., data = int2)
full
##
## Call:
## lm(formula = AVG_CRASHES ~ ., data = int2)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH MEDIAN_WIDTH
## -7.823e+01 6.478e-04 1.896e+01 -2.357e-02
## AVG_LANE_WIDTH
## 4.227e+00
step(full, data=int2, direction="both")
## Start: AIC=4194.88
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH
##
## Df Sum of Sq RSS AIC
## - MEDIAN_WIDTH 1 205 952339 4193.0
## <none> 952134 4194.9
## - AVG_LANE_WIDTH 1 12895 965029 4200.5
## - LENGTH 1 149143 1101277 4274.8
## - AVG_ADT 1 192791 1144925 4296.7
##
## Step: AIC=4193
## AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH
##
## Df Sum of Sq RSS AIC
## <none> 952339 4193.0
## + MEDIAN_WIDTH 1 205 952134 4194.9
## - AVG_LANE_WIDTH 1 13348 965688 4198.8
## - LENGTH 1 150096 1102435 4273.4
## - AVG_ADT 1 206109 1158448 4301.3
##
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH,
## data = int2)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH AVG_LANE_WIDTH
## -8.012e+01 6.525e-04 1.887e+01 4.280e+00
### Generalized linear model (Poisson)[all data]
lmdel <- glm(AVG_CRASHES~., data = int2, family="poisson")
m <- predict(lmdel, type="response")
lmdel
##
## Call: glm(formula = AVG_CRASHES ~ ., family = "poisson", data = int2)
##
## Coefficients:
## (Intercept) AVG_ADT LENGTH MEDIAN_WIDTH
## -4.653e-03 1.905e-05 5.774e-01 -9.157e-04
## AVG_LANE_WIDTH
## 1.178e-01
##
## Degrees of Freedom: 562 Total (i.e. Null); 558 Residual
## Null Deviance: 24850
## Residual Deviance: 15210 AIC: Inf
error <- int2$AVG_CRASHES- m
(rmse <- sqrt(mean(error^2)))
## [1] 40.70793
index <- 1:nrow(int2)
testindex <- sample(index, trunc(length(index)/10))
testset <- int2[testindex,]
trainset <- int2[-testindex,]
dim(testset)
## [1] 56 5
dim(trainset)
## [1] 507 5
library(e1071)
library(caret)
## Modeling for testset
svm.model <- svm(AVG_CRASHES~., data = trainset, cost = 100, gamma =1)
svm.pred <- predict(svm.model, testset[1:4])
pl <- cbind(obs= testset$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= testset$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-testset$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)
svm_err <- testset$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err^2)))
## [1] 47.0918
## Modeling for trainset [all data]
svm.model <- svm(AVG_CRASHES~., data = int2, cost = 100, gamma =1)
svm.pred <- predict(svm.model, int2[1:4])
pl <- cbind(obs= int2$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= int2$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-int2$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)
svm_err_all <- int2$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 21.8101
Comment: Lower RMSE than Data Modeling RMSE.
Conducted by: Subasish Das