Data calling

setwd("/Users/subasishdas1/Desktop/TRB2016/nick_thesis")
all_b <- read.csv("All_Roadways.csv")  ## All-Crashes
head(all_b)  ## Read first few rows
##         Type  AVG_ADT LENGTH DOTD_FC MEDIAN_WIDTH NUM_LANES PAVEMENT_WIDTH
## 1 Interstate 59000.00   0.82      11           64         4             48
## 2 Interstate 34800.00   1.31      11           64         4             48
## 3 Interstate 34514.29   1.34      11           64         4             48
## 4 Interstate 34712.50   0.58      11           64         4             48
## 5 Interstate 34400.00   0.08      11           64         4             48
## 6 Interstate 34300.00   0.38      11           64         4             48
##   AVG_LANE_WIDTH SHOULDER_WIDTH CLASS CLASS_DESC CBD TOT_CRASHES
## 1             12             10     2    DIVIDED   0           1
## 2             12             10     2    DIVIDED   0         121
## 3             12             10     2    DIVIDED   0         107
## 4             12             10     2    DIVIDED   0          31
## 5             12             10     2    DIVIDED   0          12
## 6             12             10     2    DIVIDED   0          25
##   AVG_CRASHES    FITTED
## 1       0.125 25.330851
## 2      15.125 18.765175
## 3      13.375 18.921692
## 4       3.875  9.375180
## 5       1.500  1.727995
## 6       3.125  6.448532
table(all_b$Type)
## 
##     2-lane     4-lane Interstate 
##       3211       2285        563
twolane <- subset(all_b, Type=="2-lane")   ## subsetting
fourlane <- subset(all_b, Type=="4-lane")
int <- subset(all_b, Type=="Interstate")
options(warn=-1)

Two-lane

Exploratory Data Analysis

dim(twolane)
## [1] 3211   15
str(twolane)
## 'data.frame':    3211 obs. of  15 variables:
##  $ Type          : Factor w/ 3 levels "2-lane","4-lane",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ AVG_ADT       : num  4967 3275 4267 4267 6475 ...
##  $ LENGTH        : num  0.06 0.06 0.63 1.03 0.15 0.41 0.71 1.96 1.99 1.71 ...
##  $ DOTD_FC       : int  17 17 17 17 17 16 16 16 16 16 ...
##  $ MEDIAN_WIDTH  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NUM_LANES     : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ PAVEMENT_WIDTH: int  24 24 24 24 24 24 24 24 24 24 ...
##  $ AVG_LANE_WIDTH: num  12 12 12 12 12 12 12 12 12 12 ...
##  $ SHOULDER_WIDTH: int  0 0 6 7 8 8 8 8 8 7 ...
##  $ CLASS         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ CLASS_DESC    : Factor w/ 3 levels "CTL","DIVIDED",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ CBD           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ TOT_CRASHES   : int  6 9 10 20 9 5 3 29 4 143 ...
##  $ AVG_CRASHES   : num  0.75 1.12 1.25 2.5 1.12 ...
##  $ FITTED        : num  1.107 0.782 3.736 4.947 2.332 ...
twolane1 <- twolane[-c(1, 4, 6, 7, 13, 15)]
names(twolane1)
## [1] "AVG_ADT"        "LENGTH"         "MEDIAN_WIDTH"   "AVG_LANE_WIDTH"
## [5] "SHOULDER_WIDTH" "CLASS"          "CLASS_DESC"     "CBD"           
## [9] "AVG_CRASHES"
## Important Variable Selection
library(leaps)
leaps=regsubsets(AVG_CRASHES~., data = twolane1,  nbest=5)
## Reordering variables and trying again:
plot(leaps, scale="bic")

## By Randomforest
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
impvar <- randomForest(AVG_CRASHES~., data = twolane1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar)

## Reducing variables
twolane2 <- twolane1[-c(6:8)]
names(twolane2)
## [1] "AVG_ADT"        "LENGTH"         "MEDIAN_WIDTH"   "AVG_LANE_WIDTH"
## [5] "SHOULDER_WIDTH" "AVG_CRASHES"
## Correlation plot
library(corrplot)
corrp <- cor(twolane2)
corrplot(corrp, method = "circle")

plot(twolane2, pch=8, cex=0.1)

Data Modeling

### Considering 8 Variables
full=lm(AVG_CRASHES~., data = twolane1)
full
## 
## Call:
## lm(formula = AVG_CRASHES ~ ., data = twolane1)
## 
## Coefficients:
##         (Intercept)              AVG_ADT               LENGTH  
##          -1.338e+01            6.309e-04            4.485e+00  
##        MEDIAN_WIDTH       AVG_LANE_WIDTH       SHOULDER_WIDTH  
##          -1.492e-02            6.674e-01            1.045e-01  
##               CLASS  CLASS_DESCUNDIVIDED                  CBD  
##           1.908e+00                   NA           -5.671e+00
step(full, data=twolane1, direction="both")
## Start:  AIC=14901.91
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH + CLASS + CLASS_DESC + CBD
## 
## 
## Step:  AIC=14901.91
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH + CLASS + CBD
## 
##                  Df Sum of Sq    RSS   AIC
## - MEDIAN_WIDTH    1         6 331131 14900
## <none>                        331124 14902
## - CBD             1       255 331380 14902
## - CLASS           1       284 331408 14903
## - SHOULDER_WIDTH  1       373 331497 14904
## - AVG_LANE_WIDTH  1      3274 334399 14932
## - AVG_ADT         1     50987 382111 15360
## - LENGTH          1     66750 397874 15490
## 
## Step:  AIC=14899.98
## AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH + SHOULDER_WIDTH + 
##     CLASS + CBD
## 
##                  Df Sum of Sq    RSS   AIC
## <none>                        331131 14900
## - CBD             1       257 331388 14900
## - CLASS           1       302 331433 14901
## - SHOULDER_WIDTH  1       369 331500 14902
## + MEDIAN_WIDTH    1         6 331124 14902
## - AVG_LANE_WIDTH  1      3268 334399 14930
## - AVG_ADT         1     51547 382678 15362
## - LENGTH          1     67592 398723 15494
## 
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH + CLASS + CBD, data = twolane1)
## 
## Coefficients:
##    (Intercept)         AVG_ADT          LENGTH  AVG_LANE_WIDTH  
##     -1.325e+01       6.301e-04       4.489e+00       6.658e-01  
## SHOULDER_WIDTH           CLASS             CBD  
##      1.039e-01       1.788e+00      -5.689e+00
### Considering 5 Variables
full=lm(AVG_CRASHES~., data = twolane2)
full
## 
## Call:
## lm(formula = AVG_CRASHES ~ ., data = twolane2)
## 
## Coefficients:
##    (Intercept)         AVG_ADT          LENGTH    MEDIAN_WIDTH  
##     -1.157e+01       6.317e-04       4.495e+00       2.593e-02  
## AVG_LANE_WIDTH  SHOULDER_WIDTH  
##      6.757e-01       1.023e-01
step(full, data=twolane2, direction="both")
## Start:  AIC=14903.28
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH
## 
##                  Df Sum of Sq    RSS   AIC
## - MEDIAN_WIDTH    1        24 331702 14902
## <none>                        331679 14903
## - SHOULDER_WIDTH  1       357 332036 14905
## - AVG_LANE_WIDTH  1      3368 335046 14934
## - AVG_ADT         1     51523 383202 15365
## - LENGTH          1     67083 398761 15493
## 
## Step:  AIC=14901.51
## AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH + SHOULDER_WIDTH
## 
##                  Df Sum of Sq    RSS   AIC
## <none>                        331702 14902
## - SHOULDER_WIDTH  1       364 332066 14903
## + MEDIAN_WIDTH    1        24 331679 14903
## - AVG_LANE_WIDTH  1      3439 335141 14933
## - AVG_ADT         1     53261 384963 15378
## - LENGTH          1     67591 399293 15495
## 
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH, data = twolane2)
## 
## Coefficients:
##    (Intercept)         AVG_ADT          LENGTH  AVG_LANE_WIDTH  
##     -1.162e+01       6.339e-04       4.486e+00       6.804e-01  
## SHOULDER_WIDTH  
##      1.031e-01
### Generalized linear model (Poisson)[all data]
lmdel <- glm(AVG_CRASHES~., data = twolane2, family="poisson")
m <- predict(lmdel, type="response")
error <- twolane2$AVG_CRASHES- m
(rmse <- sqrt(mean(error^2)))
## [1] 11.3625
lmdel
## 
## Call:  glm(formula = AVG_CRASHES ~ ., family = "poisson", data = twolane2)
## 
## Coefficients:
##    (Intercept)         AVG_ADT          LENGTH    MEDIAN_WIDTH  
##      1.433e-01       6.238e-05       3.883e-01      -3.763e-03  
## AVG_LANE_WIDTH  SHOULDER_WIDTH  
##      3.800e-02       2.590e-02  
## 
## Degrees of Freedom: 3210 Total (i.e. Null);  3205 Residual
## Null Deviance:       36530 
## Residual Deviance: 22660     AIC: Inf
### Generalized linear model (Negative-binomial)[all data]
library(MASS)
nbmodel<- glm.nb(AVG_CRASHES~., data = twolane2)
m1 <- predict(nbmodel, type="response")
nbmodel
## 
## Call:  glm.nb(formula = AVG_CRASHES ~ ., data = twolane2, init.theta = 1.104723388, 
##     link = log)
## 
## Coefficients:
##    (Intercept)         AVG_ADT          LENGTH    MEDIAN_WIDTH  
##     -4.608e-01       9.824e-05       6.517e-01      -2.405e-03  
## AVG_LANE_WIDTH  SHOULDER_WIDTH  
##      3.648e-02       9.751e-03  
## 
## Degrees of Freedom: 3210 Total (i.e. Null);  3205 Residual
## Null Deviance:       6077 
## Residual Deviance: 3722  AIC: 16350
error1 <- twolane2$AVG_CRASHES- m1
(rmse <- sqrt(mean(error1^2)))
## [1] 78.30459

Comment: Very high RMSE. Not recommended.

Algorithmic Modeling

index     <- 1:nrow(twolane2)
testindex <- sample(index, trunc(length(index)/30))
testset   <- twolane2[testindex,]
trainset  <- twolane2[-testindex,]
dim(testset)
## [1] 107   6
dim(trainset)
## [1] 3104    6
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Modeling for testset
svm.model <- svm(AVG_CRASHES~., data = trainset, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, testset[1:5])

pl <- cbind(obs= testset$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= testset$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-testset$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)

svm_err <- testset$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err^2)))
## [1] 9.310153
## Modeling for trainset [all data]
svm.model <- svm(AVG_CRASHES~., data = twolane2, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, twolane2[1:5])

pl <- cbind(obs= twolane2$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= twolane2$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-twolane2$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)

svm_err_all <- twolane2$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 5.949426

Comment: Lower RMSE than Data Modeling RMSE.

Four-lane

Exploratory Data Analysis

dim(fourlane)
## [1] 2285   15
str(fourlane)
## 'data.frame':    2285 obs. of  15 variables:
##  $ Type          : Factor w/ 3 levels "2-lane","4-lane",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ AVG_ADT       : num  11800 14867 13300 17757 13533 ...
##  $ LENGTH        : num  0.04 0.1 0.6 1.41 0.92 0.92 0.54 0.54 0.28 0.05 ...
##  $ DOTD_FC       : int  14 14 14 14 14 14 14 14 14 14 ...
##  $ MEDIAN_WIDTH  : int  0 0 0 0 14 16 14 16 20 20 ...
##  $ NUM_LANES     : int  6 6 4 4 4 4 4 4 4 4 ...
##  $ PAVEMENT_WIDTH: int  78 78 40 49 48 48 48 44 48 48 ...
##  $ AVG_LANE_WIDTH: num  13 13 10 12 12 12 12 11 12 12 ...
##  $ SHOULDER_WIDTH: int  0 0 0 0 0 0 0 0 10 10 ...
##  $ CLASS         : int  1 1 1 1 2 2 2 2 2 2 ...
##  $ CLASS_DESC    : Factor w/ 3 levels "CTL","DIVIDED",..: 3 3 3 3 2 2 2 2 2 2 ...
##  $ CBD           : int  1 1 1 1 0 0 0 0 0 0 ...
##  $ TOT_CRASHES   : int  18 8 44 713 181 125 66 302 138 4 ...
##  $ AVG_CRASHES   : num  2.25 1 5.5 89.12 22.62 ...
##  $ FITTED        : num  1.78 3.6 10.67 21.89 17.97 ...
fourlane1 <- fourlane[-c(1, 4, 6, 7, 13, 15)]
names(fourlane1)
## [1] "AVG_ADT"        "LENGTH"         "MEDIAN_WIDTH"   "AVG_LANE_WIDTH"
## [5] "SHOULDER_WIDTH" "CLASS"          "CLASS_DESC"     "CBD"           
## [9] "AVG_CRASHES"
## Important Variable Selection
library(leaps)
leaps=regsubsets(AVG_CRASHES~., data = fourlane1,  nbest=5)
## Reordering variables and trying again:
plot(leaps, scale="bic")

## By Randomforest
library(randomForest)
impvar <- randomForest(AVG_CRASHES~., data = fourlane1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar)

## Reducing variables
fourlane2 <- fourlane1[-c(6:8)]
names(fourlane2)
## [1] "AVG_ADT"        "LENGTH"         "MEDIAN_WIDTH"   "AVG_LANE_WIDTH"
## [5] "SHOULDER_WIDTH" "AVG_CRASHES"
## Correlation plot
library(corrplot)
corrp <- cor(fourlane2)
corrplot(corrp, method = "circle")

plot(fourlane2, pch=8, cex=0.1)

Data Modeling

### Considering 8 Variables
full=lm(AVG_CRASHES~., data = fourlane1)
full
## 
## Call:
## lm(formula = AVG_CRASHES ~ ., data = fourlane1)
## 
## Coefficients:
##         (Intercept)              AVG_ADT               LENGTH  
##          -5.2254280            0.0008037           14.2217707  
##        MEDIAN_WIDTH       AVG_LANE_WIDTH       SHOULDER_WIDTH  
##          -0.1855600            0.0502469           -0.1781190  
##               CLASS    CLASS_DESCDIVIDED  CLASS_DESCUNDIVIDED  
##          -0.8228764            0.8403549                   NA  
##                 CBD  
##          -4.0366008
step(full, data=fourlane1, direction="both")
## Start:  AIC=14859.78
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH + CLASS + CLASS_DESC + CBD
## 
## 
## Step:  AIC=14859.78
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH + CLASS_DESC + CBD
## 
##                  Df Sum of Sq     RSS   AIC
## - CLASS_DESC      2        25 1512762 14856
## - AVG_LANE_WIDTH  1         8 1512746 14858
## - SHOULDER_WIDTH  1      1193 1513931 14860
## <none>                        1512738 14860
## - MEDIAN_WIDTH    1      1839 1514576 14861
## - CBD             1      1966 1514704 14861
## - AVG_ADT         1    186705 1699443 15124
## - LENGTH          1    215251 1727989 15162
## 
## Step:  AIC=14855.82
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH + CBD
## 
##                  Df Sum of Sq     RSS   AIC
## - AVG_LANE_WIDTH  1        11 1512773 14854
## - SHOULDER_WIDTH  1      1207 1513969 14856
## <none>                        1512762 14856
## - CBD             1      1965 1514727 14857
## + CLASS           1         1 1512761 14858
## + CLASS_DESC      2        25 1512738 14860
## - MEDIAN_WIDTH    1      4363 1517125 14860
## - AVG_ADT         1    191651 1704413 15126
## - LENGTH          1    215386 1728148 15158
## 
## Step:  AIC=14853.84
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + SHOULDER_WIDTH + 
##     CBD
## 
##                  Df Sum of Sq     RSS   AIC
## - SHOULDER_WIDTH  1      1207 1513980 14854
## <none>                        1512773 14854
## - CBD             1      1955 1514728 14855
## + AVG_LANE_WIDTH  1        11 1512762 14856
## + CLASS           1         1 1512772 14856
## + CLASS_DESC      2        27 1512746 14858
## - MEDIAN_WIDTH    1      4360 1517133 14858
## - AVG_ADT         1    194780 1707554 15129
## - LENGTH          1    215374 1728148 15156
## 
## Step:  AIC=14853.66
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + CBD
## 
##                  Df Sum of Sq     RSS   AIC
## <none>                        1513980 14854
## + SHOULDER_WIDTH  1      1207 1512773 14854
## - CBD             1      1776 1515757 14854
## + CLASS           1        18 1513962 14856
## + AVG_LANE_WIDTH  1        11 1513969 14856
## + CLASS_DESC      2        41 1513940 14858
## - MEDIAN_WIDTH    1      7121 1521101 14862
## - AVG_ADT         1    196030 1710010 15130
## - LENGTH          1    214351 1728332 15154
## 
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + 
##     CBD, data = fourlane1)
## 
## Coefficients:
##  (Intercept)       AVG_ADT        LENGTH  MEDIAN_WIDTH           CBD  
##   -5.7819455     0.0008064    14.1128725    -0.2193557    -3.8209792
### Considering 4 Variables
full=lm(AVG_CRASHES~., data = fourlane2)
full
## 
## Call:
## lm(formula = AVG_CRASHES ~ ., data = fourlane2)
## 
## Coefficients:
##    (Intercept)         AVG_ADT          LENGTH    MEDIAN_WIDTH  
##      -5.819484        0.000795       14.228189       -0.179394  
## AVG_LANE_WIDTH  SHOULDER_WIDTH  
##       0.017060       -0.164322
step(full, data=fourlane2, direction="both")
## Start:  AIC=14856.79
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH
## 
##                  Df Sum of Sq     RSS   AIC
## - AVG_LANE_WIDTH  1         1 1514728 14855
## - SHOULDER_WIDTH  1      1028 1515755 14856
## <none>                        1514727 14857
## - MEDIAN_WIDTH    1      4108 1518835 14861
## - AVG_ADT         1    189688 1704415 15124
## - LENGTH          1    215507 1730234 15159
## 
## Step:  AIC=14854.79
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + SHOULDER_WIDTH
## 
##                  Df Sum of Sq     RSS   AIC
## - SHOULDER_WIDTH  1      1028 1515757 14854
## <none>                        1514728 14855
## + AVG_LANE_WIDTH  1         1 1514727 14857
## - MEDIAN_WIDTH    1      4131 1518859 14859
## - AVG_ADT         1    192852 1707580 15127
## - LENGTH          1    215512 1730240 15157
## 
## Step:  AIC=14854.34
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH
## 
##                  Df Sum of Sq     RSS   AIC
## <none>                        1515757 14854
## + SHOULDER_WIDTH  1      1028 1514728 14855
## + AVG_LANE_WIDTH  1         1 1515755 14856
## - MEDIAN_WIDTH    1      6675 1522432 14862
## - AVG_ADT         1    194325 1710081 15128
## - LENGTH          1    214754 1730511 15155
## 
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH, data = fourlane2)
## 
## Coefficients:
##  (Intercept)       AVG_ADT        LENGTH  MEDIAN_WIDTH  
##   -5.9035688     0.0007976    14.1254508    -0.2118813
### Generalized linear model (Poisson)[all data]
lmdel <- glm(AVG_CRASHES~., data = fourlane2, family="poisson")
m <- predict(lmdel, type="response")
lmdel
## 
## Call:  glm(formula = AVG_CRASHES ~ ., family = "poisson", data = fourlane2)
## 
## Coefficients:
##    (Intercept)         AVG_ADT          LENGTH    MEDIAN_WIDTH  
##      1.472e+00       3.667e-05       4.215e-01      -6.323e-03  
## AVG_LANE_WIDTH  SHOULDER_WIDTH  
##      1.940e-02      -4.692e-03  
## 
## Degrees of Freedom: 2284 Total (i.e. Null);  2279 Residual
## Null Deviance:       66910 
## Residual Deviance: 47680     AIC: Inf
error <- fourlane2$AVG_CRASHES- m
(rmse <- sqrt(mean(error^2)))
## [1] 29.43547

Algorithmic Modeling

index     <- 1:nrow(fourlane2)
testindex <- sample(index, trunc(length(index)/30))
testset   <- fourlane2[testindex,]
trainset  <- fourlane2[-testindex,]
dim(testset)
## [1] 76  6
dim(trainset)
## [1] 2209    6
library(e1071)
library(caret)


## Modeling for testset
svm.model <- svm(AVG_CRASHES~., data = trainset, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, testset[1:5])

pl <- cbind(obs= testset$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= testset$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-testset$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)

svm_err <- testset$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err^2)))
## [1] 18.27462
## Modeling for trainset [all data]
svm.model <- svm(AVG_CRASHES~., data = fourlane2, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, fourlane2[1:5])
pl <- cbind(obs= fourlane2$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= fourlane2$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-fourlane2$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)

svm_err_all <- fourlane2$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 13.86325

Comment: Lower RMSE than Data Modeling RMSE.

Intersections

Exploratory Data Analysis

dim(int)
## [1] 563  15
str(int)
## 'data.frame':    563 obs. of  15 variables:
##  $ Type          : Factor w/ 3 levels "2-lane","4-lane",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ AVG_ADT       : num  59000 34800 34514 34712 34400 ...
##  $ LENGTH        : num  0.82 1.31 1.34 0.58 0.08 0.38 0.4 0.13 0.36 0.53 ...
##  $ DOTD_FC       : int  11 11 11 11 11 11 11 11 11 11 ...
##  $ MEDIAN_WIDTH  : int  64 64 64 64 64 64 64 64 64 64 ...
##  $ NUM_LANES     : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ PAVEMENT_WIDTH: int  48 48 48 48 48 48 48 48 48 48 ...
##  $ AVG_LANE_WIDTH: num  12 12 12 12 12 12 12 12 12 12 ...
##  $ SHOULDER_WIDTH: int  10 10 10 10 10 10 10 10 10 10 ...
##  $ CLASS         : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ CLASS_DESC    : Factor w/ 3 levels "CTL","DIVIDED",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ CBD           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ TOT_CRASHES   : int  1 121 107 31 12 25 4 10 69 32 ...
##  $ AVG_CRASHES   : num  0.125 15.125 13.375 3.875 1.5 ...
##  $ FITTED        : num  25.33 18.77 18.92 9.38 1.73 ...
int1 <- int[-c(1, 4, 6, 7, 13, 15)]
names(int1)
## [1] "AVG_ADT"        "LENGTH"         "MEDIAN_WIDTH"   "AVG_LANE_WIDTH"
## [5] "SHOULDER_WIDTH" "CLASS"          "CLASS_DESC"     "CBD"           
## [9] "AVG_CRASHES"
## Important Variable Selection
library(leaps)
leaps=regsubsets(AVG_CRASHES~., data = int1,  nbest=5)
## Reordering variables and trying again:
plot(leaps, scale="bic")

## By Randomforest
library(randomForest)
impvar <- randomForest(AVG_CRASHES~., data = int1, ntree=100, keep.forest=FALSE, importance=TRUE)
varImpPlot(impvar)

## Reducing variables
int2 <- int1[-c(5:8)]
names(int2)
## [1] "AVG_ADT"        "LENGTH"         "MEDIAN_WIDTH"   "AVG_LANE_WIDTH"
## [5] "AVG_CRASHES"
## Correlation plot
library(corrplot)
corrp <- cor(int2)
corrplot(corrp, method = "circle")

plot(int2, pch=8, cex=0.1)

Data Modeling

### Considering 8 Variables
full=lm(AVG_CRASHES~., data = int1)
full
## 
## Call:
## lm(formula = AVG_CRASHES ~ ., data = int1)
## 
## Coefficients:
##         (Intercept)              AVG_ADT               LENGTH  
##           2.329e+01            6.576e-04            1.900e+01  
##        MEDIAN_WIDTH       AVG_LANE_WIDTH       SHOULDER_WIDTH  
##           1.483e-03            4.797e+00            8.313e-01  
##               CLASS  CLASS_DESCUNDIVIDED                  CBD  
##          -5.928e+01                   NA            6.175e+00
step(full, data=int1, direction="both")
## Start:  AIC=4179.75
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH + CLASS + CLASS_DESC + CBD
## 
## 
## Step:  AIC=4179.75
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH + CLASS + CBD
## 
##                  Df Sum of Sq     RSS    AIC
## - MEDIAN_WIDTH    1         1  917068 4177.8
## - CBD             1       914  917982 4178.3
## <none>                         917068 4179.8
## - SHOULDER_WIDTH  1      3526  920593 4179.9
## - AVG_LANE_WIDTH  1     14269  931336 4186.4
## - CLASS           1     31572  948640 4196.8
## - LENGTH          1    148399 1065467 4262.2
## - AVG_ADT         1    191004 1108071 4284.3
## 
## Step:  AIC=4177.76
## AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH + SHOULDER_WIDTH + 
##     CLASS + CBD
## 
##                  Df Sum of Sq     RSS    AIC
## - CBD             1       913  917982 4176.3
## <none>                         917068 4177.8
## - SHOULDER_WIDTH  1      3998  921067 4178.2
## + MEDIAN_WIDTH    1         1  917068 4179.8
## - AVG_LANE_WIDTH  1     14311  931379 4184.5
## - CLASS           1     32006  949074 4195.1
## - LENGTH          1    150274 1067343 4261.2
## - AVG_ADT         1    195289 1112357 4284.4
## 
## Step:  AIC=4176.32
## AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH + SHOULDER_WIDTH + 
##     CLASS
## 
##                  Df Sum of Sq     RSS    AIC
## <none>                         917982 4176.3
## - SHOULDER_WIDTH  1      3604  921585 4176.5
## + CBD             1       913  917068 4177.8
## + MEDIAN_WIDTH    1         0  917982 4178.3
## - AVG_LANE_WIDTH  1     14156  932138 4182.9
## - CLASS           1     33939  951921 4194.8
## - LENGTH          1    153593 1071575 4261.4
## - AVG_ADT         1    202129 1120111 4286.4
## 
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH + 
##     SHOULDER_WIDTH + CLASS, data = int1)
## 
## Coefficients:
##    (Intercept)         AVG_ADT          LENGTH  AVG_LANE_WIDTH  
##      2.634e+01       6.632e-04       1.913e+01       4.771e+00  
## SHOULDER_WIDTH           CLASS  
##      7.871e-01      -6.052e+01
### Considering 4 Variables
full=lm(AVG_CRASHES~., data = int2)
full
## 
## Call:
## lm(formula = AVG_CRASHES ~ ., data = int2)
## 
## Coefficients:
##    (Intercept)         AVG_ADT          LENGTH    MEDIAN_WIDTH  
##     -7.823e+01       6.478e-04       1.896e+01      -2.357e-02  
## AVG_LANE_WIDTH  
##      4.227e+00
step(full, data=int2, direction="both")
## Start:  AIC=4194.88
## AVG_CRASHES ~ AVG_ADT + LENGTH + MEDIAN_WIDTH + AVG_LANE_WIDTH
## 
##                  Df Sum of Sq     RSS    AIC
## - MEDIAN_WIDTH    1       205  952339 4193.0
## <none>                         952134 4194.9
## - AVG_LANE_WIDTH  1     12895  965029 4200.5
## - LENGTH          1    149143 1101277 4274.8
## - AVG_ADT         1    192791 1144925 4296.7
## 
## Step:  AIC=4193
## AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH
## 
##                  Df Sum of Sq     RSS    AIC
## <none>                         952339 4193.0
## + MEDIAN_WIDTH    1       205  952134 4194.9
## - AVG_LANE_WIDTH  1     13348  965688 4198.8
## - LENGTH          1    150096 1102435 4273.4
## - AVG_ADT         1    206109 1158448 4301.3
## 
## Call:
## lm(formula = AVG_CRASHES ~ AVG_ADT + LENGTH + AVG_LANE_WIDTH, 
##     data = int2)
## 
## Coefficients:
##    (Intercept)         AVG_ADT          LENGTH  AVG_LANE_WIDTH  
##     -8.012e+01       6.525e-04       1.887e+01       4.280e+00
### Generalized linear model (Poisson)[all data]
lmdel <- glm(AVG_CRASHES~., data = int2, family="poisson")
m <- predict(lmdel, type="response")
lmdel
## 
## Call:  glm(formula = AVG_CRASHES ~ ., family = "poisson", data = int2)
## 
## Coefficients:
##    (Intercept)         AVG_ADT          LENGTH    MEDIAN_WIDTH  
##     -4.653e-03       1.905e-05       5.774e-01      -9.157e-04  
## AVG_LANE_WIDTH  
##      1.178e-01  
## 
## Degrees of Freedom: 562 Total (i.e. Null);  558 Residual
## Null Deviance:       24850 
## Residual Deviance: 15210     AIC: Inf
error <- int2$AVG_CRASHES- m
(rmse <- sqrt(mean(error^2)))
## [1] 40.70793

Algorithmic Modeling

index     <- 1:nrow(int2)
testindex <- sample(index, trunc(length(index)/10))
testset   <- int2[testindex,]
trainset  <- int2[-testindex,]
dim(testset)
## [1] 56  5
dim(trainset)
## [1] 507   5
library(e1071)
library(caret)


## Modeling for testset
svm.model <- svm(AVG_CRASHES~., data = trainset, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, testset[1:4])

pl <- cbind(obs= testset$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= testset$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-testset$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)

svm_err <- testset$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err^2)))
## [1] 47.0918
## Modeling for trainset [all data]
svm.model <- svm(AVG_CRASHES~., data = int2, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, int2[1:4])

pl <- cbind(obs= int2$AVG_CRASHES, pred=svm.pred)
pl <- cbind(obs= int2$AVG_CRASHES, pred=abs(svm.pred), diff=abs(svm.pred)-int2$AVG_CRASHES)
pl <- data.frame(pl)
plot(pl$obs, pl$pred, pch=8, cex=0.2)

svm_err_all <- int2$AVG_CRASHES- svm.pred
(svm_rmse1 <- sqrt(mean(svm_err_all^2)))
## [1] 21.8101

Comment: Lower RMSE than Data Modeling RMSE.

Conducted by: Subasish Das