Data calling

setwd("/Users/subasishdas1/Desktop/TRB2016/Data")
x4 <- read.csv("A_Curve_Freq_a.csv")
names(x4)
##  [1] "NUM_VEH"        "LIGHTING_CD"    "MAN_COLL_CD"    "ROAD_COND_CD"  
##  [5] "ROAD_TYPE_CD"   "INTERSECTION"   "SEVERITY_CD"    "SURF_COND_CD"  
##  [9] "WEATHER_CD"     "DR_SEX"         "DR_AGE"         "DR_COND_CD"    
## [13] "DR_DISTRACT_CD" "POSTED_SPEED"   "NUM_OCC"        "VEH_COND_CD"   
## [17] "FREQ"
dim(x4)
## [1] 50410    17
summary(x4)
##      NUM_VEH                                        LIGHTING_CD   
##  Multiple:25554   Dark - Continuous Street Light          : 6626  
##  Single  :24856   Dark - No Street Lights                 : 9712  
##                   Dark - Street Light At Intersection Only: 2020  
##                   Daylight                                :29765  
##                   Other                                   : 2287  
##                                                                   
##                                                                   
##          MAN_COLL_CD              ROAD_COND_CD  
##  Single Vehicle:19881   Abnormality     : 3122  
##  Rear End      :10512   No Abnormalities:46035  
##  Other         : 7813   Water On Roadway: 1253  
##  Sideswipe     : 5214                           
##  Right Angle   : 2960                           
##  Left Turn     : 1838                           
##  (Other)       : 2192                           
##                                    ROAD_TYPE_CD   INTERSECTION   
##  One-Way Road                            : 7254   Mode :logical  
##  Other                                   :  838   FALSE:36297    
##  Two-Way Road With A Physical Separation :10795   TRUE :14113    
##  Two-Way Road With No Physical Separation:31523   NA's :0        
##                                                                  
##                                                                  
##                                                                  
##     SEVERITY_CD    SURF_COND_CD   WEATHER_CD        DR_SEX     
##  Complaint:13708   Dry  :38514   Clear :30674   Female :19242  
##  Fatal    :  928   Other:  557   Cloudy:11005   Male   :31014  
##  Moderate : 5645   Wet  :11339   Other : 1476   Unknown:  154  
##  No Injury:29198                 Rain  : 7255                  
##  Severe   :  931                                               
##                                                                
##                                                                
##     DR_AGE                        DR_COND_CD   
##  15-24 :15748   Inattentive            :21755  
##  25-34 :11632   Normal                 :14437  
##  35-44 : 8382   Impaired               : 4397  
##  45-54 : 6880   Unknown                : 3794  
##  55-64 : 4367   Distracted             : 3535  
##  65-74 : 2049   Fatigued/Asleep/Illness: 1817  
##  75-Inf: 1352   (Other)                :  675  
##              DR_DISTRACT_CD   POSTED_SPEED            NUM_OCC     
##  Cell Phone         : 1176   51-60  :12453   Single       :36380  
##  Electronic Device  :  261   31-40  :12134   With Occupant:14030  
##  Inside The Vehicle : 2819   41-50  :12134                        
##  Not Distracted     :28888   21-30  : 6455                        
##  Other              :13871   Unknown: 3163                        
##  Outside The Vehicle: 3227   61-70  : 2572                        
##  Unknown            :  168   (Other): 1499                        
##               VEH_COND_CD              FREQ      
##  No Defects Observed:45690   Highly Prone:  976  
##  Unknown            : 3071   Prone       : 6801  
##  Tire Failure       :  920   Single      :42633  
##  Defective Brakes   :  341                       
##  Defective Lights   :  133                       
##  Defective Steering :  112                       
##  (Other)            :  143

SVM [Small Sample]

x5 <- x4[sample(nrow(x4), 2000),]
dim(x5)
## [1] 2000   17
index     <- 1:nrow(x5)
testindex <- sample(index, trunc(length(index)/3))
testset   <- x5[testindex,]
trainset  <- x5[-testindex,]
dim(testset)
## [1] 666  17
dim(trainset)
## [1] 1334   17
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
svm.model <- svm(FREQ ~ ., data = trainset, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, testset[,-17])

confusionMatrix(svm.pred, testset$FREQ)
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Highly Prone Prone Single
##   Highly Prone            0     0      0
##   Prone                   0     1      0
##   Single                 11    76    578
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8694         
##                  95% CI : (0.8414, 0.894)
##     No Information Rate : 0.8679         
##     P-Value [Acc > NIR] : 0.4828         
##                                          
##                   Kappa : 0.0197         
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Highly Prone Class: Prone Class: Single
## Sensitivity                      0.00000     0.012987       1.00000
## Specificity                      1.00000     1.000000       0.01136
## Pos Pred Value                       NaN     1.000000       0.86917
## Neg Pred Value                   0.98348     0.885714       1.00000
## Prevalence                       0.01652     0.115616       0.86787
## Detection Rate                   0.00000     0.001502       0.86787
## Detection Prevalence             0.00000     0.001502       0.99850
## Balanced Accuracy                0.50000     0.506494       0.50568

SVM [Large Sample]

x5 <- x4[sample(nrow(x4), 20000),]
dim(x5)
## [1] 20000    17
index     <- 1:nrow(x5)
testindex <- sample(index, trunc(length(index)/3))
testset   <- x5[testindex,]
trainset  <- x5[-testindex,]
dim(testset)
## [1] 6666   17
dim(trainset)
## [1] 13334    17
library(e1071)
library(caret)
svm.model <- svm(FREQ ~ ., data = trainset, cost = 100, gamma =1)
svm.pred  <- predict(svm.model, testset[,-17])

confusionMatrix(svm.pred, testset$FREQ)
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Highly Prone Prone Single
##   Highly Prone           32     7      4
##   Prone                  54   208    167
##   Single                 33   670   5491
## 
## Overall Statistics
##                                          
##                Accuracy : 0.8597         
##                  95% CI : (0.8512, 0.868)
##     No Information Rate : 0.8494         
##     P-Value [Acc > NIR] : 0.009033       
##                                          
##                   Kappa : 0.306          
##  Mcnemar's Test P-Value : < 2.2e-16      
## 
## Statistics by Class:
## 
##                      Class: Highly Prone Class: Prone Class: Single
## Sensitivity                     0.268908      0.23503        0.9698
## Specificity                     0.998320      0.96177        0.2998
## Pos Pred Value                  0.744186      0.48485        0.8865
## Neg Pred Value                  0.986864      0.89145        0.6377
## Prevalence                      0.017852      0.13276        0.8494
## Detection Rate                  0.004800      0.03120        0.8237
## Detection Prevalence            0.006451      0.06436        0.9292
## Balanced Accuracy               0.633614      0.59840        0.6348

Caret

library(caret)
modelFit <- train(FREQ ~., data=trainset, method="rpart")
## Loading required package: rpart
predictions <- predict(modelFit, newdata=testset)
confusionMatrix(predictions, testset$FREQ)
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Highly Prone Prone Single
##   Highly Prone            0     0      0
##   Prone                   0     0      0
##   Single                119   885   5662
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8494          
##                  95% CI : (0.8406, 0.8579)
##     No Information Rate : 0.8494          
##     P-Value [Acc > NIR] : 0.5084          
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Highly Prone Class: Prone Class: Single
## Sensitivity                      0.00000       0.0000        1.0000
## Specificity                      1.00000       1.0000        0.0000
## Pos Pred Value                       NaN          NaN        0.8494
## Neg Pred Value                   0.98215       0.8672           NaN
## Prevalence                       0.01785       0.1328        0.8494
## Detection Rate                   0.00000       0.0000        0.8494
## Detection Prevalence             0.00000       0.0000        1.0000
## Balanced Accuracy                0.50000       0.5000        0.5000
print(modelFit$finalModel) 
## n= 13334 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 13334 2048 Single (0.01822409 0.13536823 0.84640768) *