Data calling
setwd("/Users/subasishdas1/Desktop/TRB2016/Data")
x4 <- read.csv("A_Curve_Freq_a.csv")
names(x4)
## [1] "NUM_VEH" "LIGHTING_CD" "MAN_COLL_CD" "ROAD_COND_CD"
## [5] "ROAD_TYPE_CD" "INTERSECTION" "SEVERITY_CD" "SURF_COND_CD"
## [9] "WEATHER_CD" "DR_SEX" "DR_AGE" "DR_COND_CD"
## [13] "DR_DISTRACT_CD" "POSTED_SPEED" "NUM_OCC" "VEH_COND_CD"
## [17] "FREQ"
dim(x4)
## [1] 50410 17
summary(x4)
## NUM_VEH LIGHTING_CD
## Multiple:25554 Dark - Continuous Street Light : 6626
## Single :24856 Dark - No Street Lights : 9712
## Dark - Street Light At Intersection Only: 2020
## Daylight :29765
## Other : 2287
##
##
## MAN_COLL_CD ROAD_COND_CD
## Single Vehicle:19881 Abnormality : 3122
## Rear End :10512 No Abnormalities:46035
## Other : 7813 Water On Roadway: 1253
## Sideswipe : 5214
## Right Angle : 2960
## Left Turn : 1838
## (Other) : 2192
## ROAD_TYPE_CD INTERSECTION
## One-Way Road : 7254 Mode :logical
## Other : 838 FALSE:36297
## Two-Way Road With A Physical Separation :10795 TRUE :14113
## Two-Way Road With No Physical Separation:31523 NA's :0
##
##
##
## SEVERITY_CD SURF_COND_CD WEATHER_CD DR_SEX
## Complaint:13708 Dry :38514 Clear :30674 Female :19242
## Fatal : 928 Other: 557 Cloudy:11005 Male :31014
## Moderate : 5645 Wet :11339 Other : 1476 Unknown: 154
## No Injury:29198 Rain : 7255
## Severe : 931
##
##
## DR_AGE DR_COND_CD
## 15-24 :15748 Inattentive :21755
## 25-34 :11632 Normal :14437
## 35-44 : 8382 Impaired : 4397
## 45-54 : 6880 Unknown : 3794
## 55-64 : 4367 Distracted : 3535
## 65-74 : 2049 Fatigued/Asleep/Illness: 1817
## 75-Inf: 1352 (Other) : 675
## DR_DISTRACT_CD POSTED_SPEED NUM_OCC
## Cell Phone : 1176 51-60 :12453 Single :36380
## Electronic Device : 261 31-40 :12134 With Occupant:14030
## Inside The Vehicle : 2819 41-50 :12134
## Not Distracted :28888 21-30 : 6455
## Other :13871 Unknown: 3163
## Outside The Vehicle: 3227 61-70 : 2572
## Unknown : 168 (Other): 1499
## VEH_COND_CD FREQ
## No Defects Observed:45690 Highly Prone: 976
## Unknown : 3071 Prone : 6801
## Tire Failure : 920 Single :42633
## Defective Brakes : 341
## Defective Lights : 133
## Defective Steering : 112
## (Other) : 143
SVM [Small Sample]
x5 <- x4[sample(nrow(x4), 2000),]
dim(x5)
## [1] 2000 17
index <- 1:nrow(x5)
testindex <- sample(index, trunc(length(index)/3))
testset <- x5[testindex,]
trainset <- x5[-testindex,]
dim(testset)
## [1] 666 17
dim(trainset)
## [1] 1334 17
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
svm.model <- svm(FREQ ~ ., data = trainset, cost = 100, gamma =1)
svm.pred <- predict(svm.model, testset[,-17])
confusionMatrix(svm.pred, testset$FREQ)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Highly Prone Prone Single
## Highly Prone 0 0 0
## Prone 0 1 0
## Single 11 76 578
##
## Overall Statistics
##
## Accuracy : 0.8694
## 95% CI : (0.8414, 0.894)
## No Information Rate : 0.8679
## P-Value [Acc > NIR] : 0.4828
##
## Kappa : 0.0197
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Highly Prone Class: Prone Class: Single
## Sensitivity 0.00000 0.012987 1.00000
## Specificity 1.00000 1.000000 0.01136
## Pos Pred Value NaN 1.000000 0.86917
## Neg Pred Value 0.98348 0.885714 1.00000
## Prevalence 0.01652 0.115616 0.86787
## Detection Rate 0.00000 0.001502 0.86787
## Detection Prevalence 0.00000 0.001502 0.99850
## Balanced Accuracy 0.50000 0.506494 0.50568
SVM [Large Sample]
x5 <- x4[sample(nrow(x4), 20000),]
dim(x5)
## [1] 20000 17
index <- 1:nrow(x5)
testindex <- sample(index, trunc(length(index)/3))
testset <- x5[testindex,]
trainset <- x5[-testindex,]
dim(testset)
## [1] 6666 17
dim(trainset)
## [1] 13334 17
library(e1071)
library(caret)
svm.model <- svm(FREQ ~ ., data = trainset, cost = 100, gamma =1)
svm.pred <- predict(svm.model, testset[,-17])
confusionMatrix(svm.pred, testset$FREQ)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Highly Prone Prone Single
## Highly Prone 32 7 4
## Prone 54 208 167
## Single 33 670 5491
##
## Overall Statistics
##
## Accuracy : 0.8597
## 95% CI : (0.8512, 0.868)
## No Information Rate : 0.8494
## P-Value [Acc > NIR] : 0.009033
##
## Kappa : 0.306
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: Highly Prone Class: Prone Class: Single
## Sensitivity 0.268908 0.23503 0.9698
## Specificity 0.998320 0.96177 0.2998
## Pos Pred Value 0.744186 0.48485 0.8865
## Neg Pred Value 0.986864 0.89145 0.6377
## Prevalence 0.017852 0.13276 0.8494
## Detection Rate 0.004800 0.03120 0.8237
## Detection Prevalence 0.006451 0.06436 0.9292
## Balanced Accuracy 0.633614 0.59840 0.6348
Caret
library(caret)
modelFit <- train(FREQ ~., data=trainset, method="rpart")
## Loading required package: rpart
predictions <- predict(modelFit, newdata=testset)
confusionMatrix(predictions, testset$FREQ)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Highly Prone Prone Single
## Highly Prone 0 0 0
## Prone 0 0 0
## Single 119 885 5662
##
## Overall Statistics
##
## Accuracy : 0.8494
## 95% CI : (0.8406, 0.8579)
## No Information Rate : 0.8494
## P-Value [Acc > NIR] : 0.5084
##
## Kappa : 0
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Highly Prone Class: Prone Class: Single
## Sensitivity 0.00000 0.0000 1.0000
## Specificity 1.00000 1.0000 0.0000
## Pos Pred Value NaN NaN 0.8494
## Neg Pred Value 0.98215 0.8672 NaN
## Prevalence 0.01785 0.1328 0.8494
## Detection Rate 0.00000 0.0000 0.8494
## Detection Prevalence 0.00000 0.0000 1.0000
## Balanced Accuracy 0.50000 0.5000 0.5000
print(modelFit$finalModel)
## n= 13334
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 13334 2048 Single (0.01822409 0.13536823 0.84640768) *