library(rpart)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ROSE)
## Loaded ROSE 0.0-3
# NOTE(review): setwd() ties the script to one machine; kept so that any
# relative paths used elsewhere keep resolving exactly as before.
setwd("D:/Users/gkokate/Desktop/Markdown")
# Read the modelling data and the hold-out scoring data.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 defaults to FALSE,
# which would leave ifOperStatus as character and break the binomial glm()
# fits further down (they need a factor response).
# NOTE(review): the file name "Build .csv" contains a space before the
# extension -- confirm this matches the file on disk.
build <- read.csv(
  file = "Build .csv", sep = ",", header = TRUE, stringsAsFactors = TRUE
)
test1 <- read.csv(
  file = "test.csv", sep = ",", header = TRUE, stringsAsFactors = TRUE
)
Dataset observation
# Preview the first six rows of the modelling data.
head(build, n = 6L)
## gponOntAniOpInfoOpticalSignalLevel gponOntAniOpInfoTxOpticalSignalLevel
## 1 -7711 1142
## 2 -7703 1288
## 3 -7703 1081
## 4 -7703 1207
## 5 -7688 1276
## 6 -7688 1282
## gponOntOltsideOpInfoRxOpticalSignalLevel X15MinDnFwdByteCounter
## 1 -171 8.888281
## 2 -170 9.544178
## 3 -170 8.915710
## 4 -171 7.555582
## 5 -170 7.475159
## 6 -169 7.236687
## X15MinUpFwdByteCounter bponOntOpInfoDistance ifOperStatus
## 1 7.245204 38 up
## 2 7.764763 38 up
## 3 7.648214 38 up
## 4 6.976296 38 up
## 5 6.646812 38 up
## 6 6.449259 38 up
# Split data into training (70%) and validation (30%).
# The seed makes the partition reproducible (the ROSE call below already
# fixes its own seed); outputs recorded from an unseeded run will differ.
set.seed(123)
split <- sample.int(nrow(build), size = floor(nrow(build) * 0.7))
train <- build[split, ]
# setdiff() instead of build[-split, ]: a negative index built from an empty
# sample would select no rows at all rather than every row.
val <- build[setdiff(seq_len(nrow(build)), split), ]
##Loading and preprocessing the data
# Pairwise correlations among the predictors. The response column is dropped
# by name rather than by position so a change in CSV column order cannot feed
# the factor column into cor().
cor(build[, setdiff(names(build), "ifOperStatus")])
## gponOntAniOpInfoOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel 1.0000000
## gponOntAniOpInfoTxOpticalSignalLevel 0.2200446
## gponOntOltsideOpInfoRxOpticalSignalLevel 0.8286779
## X15MinDnFwdByteCounter -0.1531402
## X15MinUpFwdByteCounter -0.1431537
## bponOntOpInfoDistance 0.4725262
## gponOntAniOpInfoTxOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel 0.22004458
## gponOntAniOpInfoTxOpticalSignalLevel 1.00000000
## gponOntOltsideOpInfoRxOpticalSignalLevel 0.22851482
## X15MinDnFwdByteCounter -0.09589888
## X15MinUpFwdByteCounter -0.12970713
## bponOntOpInfoDistance 0.10857485
## gponOntOltsideOpInfoRxOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel 0.8286779
## gponOntAniOpInfoTxOpticalSignalLevel 0.2285148
## gponOntOltsideOpInfoRxOpticalSignalLevel 1.0000000
## X15MinDnFwdByteCounter -0.1355498
## X15MinUpFwdByteCounter -0.1414092
## bponOntOpInfoDistance 0.3258893
## X15MinDnFwdByteCounter
## gponOntAniOpInfoOpticalSignalLevel -0.153140207
## gponOntAniOpInfoTxOpticalSignalLevel -0.095898884
## gponOntOltsideOpInfoRxOpticalSignalLevel -0.135549796
## X15MinDnFwdByteCounter 1.000000000
## X15MinUpFwdByteCounter 0.890232544
## bponOntOpInfoDistance 0.004619694
## X15MinUpFwdByteCounter
## gponOntAniOpInfoOpticalSignalLevel -0.14315366
## gponOntAniOpInfoTxOpticalSignalLevel -0.12970713
## gponOntOltsideOpInfoRxOpticalSignalLevel -0.14140922
## X15MinDnFwdByteCounter 0.89023254
## X15MinUpFwdByteCounter 1.00000000
## bponOntOpInfoDistance 0.03787122
## bponOntOpInfoDistance
## gponOntAniOpInfoOpticalSignalLevel 0.472526225
## gponOntAniOpInfoTxOpticalSignalLevel 0.108574852
## gponOntOltsideOpInfoRxOpticalSignalLevel 0.325889320
## X15MinDnFwdByteCounter 0.004619694
## X15MinUpFwdByteCounter 0.037871220
## bponOntOpInfoDistance 1.000000000
# Scatter-plot matrix of the predictors; the response is excluded by name
# instead of by the hard-coded column position 7.
pairs(build[, setdiff(names(build), "ifOperStatus")])
##Logistic Regression Model: using the ROSE algorithm to rebalance the imbalanced (biased) training data.
# Baseline: logistic regression fitted on the original (imbalanced) training set.
log.reg.imb <- glm(ifOperStatus ~ ., data = train, family = "binomial")
# Score the validation rows with the baseline model (response scale).
pred.log.reg.imb <- predict(log.reg.imb, newdata = val, type = "response")
# Generate a synthetic, more balanced training set with ROSE. With p = 0.30
# about 30% of the generated rows belong to the rare class (see the class
# table printed further down); seed = 123 fixes the random draw.
dataset.rose <- ROSE(ifOperStatus ~ ., data = train, p = 0.30, seed = 123)$data
# Predictor correlations in the synthetic data; the response is dropped by
# name rather than by the hard-coded column position 7.
cor(dataset.rose[, setdiff(names(dataset.rose), "ifOperStatus")])
## gponOntAniOpInfoOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel 1.00000000
## gponOntAniOpInfoTxOpticalSignalLevel 0.25932807
## gponOntOltsideOpInfoRxOpticalSignalLevel 0.77637704
## X15MinDnFwdByteCounter -0.08535516
## X15MinUpFwdByteCounter -0.05791129
## bponOntOpInfoDistance 0.46672417
## gponOntAniOpInfoTxOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel 0.25932807
## gponOntAniOpInfoTxOpticalSignalLevel 1.00000000
## gponOntOltsideOpInfoRxOpticalSignalLevel 0.30578829
## X15MinDnFwdByteCounter -0.08906029
## X15MinUpFwdByteCounter -0.09658079
## bponOntOpInfoDistance 0.17149022
## gponOntOltsideOpInfoRxOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel 0.77637704
## gponOntAniOpInfoTxOpticalSignalLevel 0.30578829
## gponOntOltsideOpInfoRxOpticalSignalLevel 1.00000000
## X15MinDnFwdByteCounter -0.04454176
## X15MinUpFwdByteCounter -0.02031054
## bponOntOpInfoDistance 0.44337068
## X15MinDnFwdByteCounter
## gponOntAniOpInfoOpticalSignalLevel -0.08535516
## gponOntAniOpInfoTxOpticalSignalLevel -0.08906029
## gponOntOltsideOpInfoRxOpticalSignalLevel -0.04454176
## X15MinDnFwdByteCounter 1.00000000
## X15MinUpFwdByteCounter 0.76454657
## bponOntOpInfoDistance -0.01167402
## X15MinUpFwdByteCounter
## gponOntAniOpInfoOpticalSignalLevel -0.05791129
## gponOntAniOpInfoTxOpticalSignalLevel -0.09658079
## gponOntOltsideOpInfoRxOpticalSignalLevel -0.02031054
## X15MinDnFwdByteCounter 0.76454657
## X15MinUpFwdByteCounter 1.00000000
## bponOntOpInfoDistance 0.03308249
## bponOntOpInfoDistance
## gponOntAniOpInfoOpticalSignalLevel 0.46672417
## gponOntAniOpInfoTxOpticalSignalLevel 0.17149022
## gponOntOltsideOpInfoRxOpticalSignalLevel 0.44337068
## X15MinDnFwdByteCounter -0.01167402
## X15MinUpFwdByteCounter 0.03308249
## bponOntOpInfoDistance 1.00000000
# Scatter-plot matrix of the ROSE-generated predictors; the response is
# excluded by name instead of by the hard-coded column position 7.
pairs(dataset.rose[, setdiff(names(dataset.rose), "ifOperStatus")])
# Check the class (im)balance of the generated data.
table(dataset.rose$ifOperStatus)
##
## up down
## 7506 3142
# Fit the same logistic regression on the ROSE-generated (rebalanced) data
# and print its coefficient table.
log.reg.bal <- glm(
  formula = ifOperStatus ~ .,
  family = "binomial",
  data = dataset.rose
)
summary(log.reg.bal)
##
## Call:
## glm(formula = ifOperStatus ~ ., family = "binomial", data = dataset.rose)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6995 -0.6294 -0.3893 0.5403 3.0429
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -2.838e+00 6.249e-01 -4.541
## gponOntAniOpInfoOpticalSignalLevel 3.283e-04 3.783e-05 8.677
## gponOntAniOpInfoTxOpticalSignalLevel -4.027e-03 3.750e-04 -10.739
## gponOntOltsideOpInfoRxOpticalSignalLevel -5.184e-02 1.893e-03 -27.390
## X15MinDnFwdByteCounter 3.788e-01 2.960e-02 12.800
## X15MinUpFwdByteCounter -2.239e-01 3.609e-02 -6.204
## bponOntOpInfoDistance -6.145e-02 2.512e-03 -24.463
## Pr(>|z|)
## (Intercept) 5.61e-06 ***
## gponOntAniOpInfoOpticalSignalLevel < 2e-16 ***
## gponOntAniOpInfoTxOpticalSignalLevel < 2e-16 ***
## gponOntOltsideOpInfoRxOpticalSignalLevel < 2e-16 ***
## X15MinDnFwdByteCounter < 2e-16 ***
## X15MinUpFwdByteCounter 5.49e-10 ***
## bponOntOpInfoDistance < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12918.9 on 10647 degrees of freedom
## Residual deviance: 8948.9 on 10641 degrees of freedom
## AIC: 8962.9
##
## Number of Fisher Scoring iterations: 5
# Score the validation set with the ROSE-trained model (response scale).
pred.log.reg.bal <- predict(log.reg.bal, newdata = val, type = "response")
# Compare the two learners by AUC; this call draws the ROC curve of the
# baseline (imbalanced-data) model.
roc.curve(response = val$ifOperStatus, predicted = pred.log.reg.imb)
## Area under the curve (AUC): 0.869
# Overlay the ROC curve of the ROSE-trained model on the existing plot,
# drawn in palette colour 2.
roc.curve(
  response = val$ifOperStatus,
  predicted = pred.log.reg.bal,
  add.roc = TRUE,
  col = 2
)
## Area under the curve (AUC): 0.870
# Odds ratios: exponentiated coefficients of the ROSE-trained model.
exp(coefficients(log.reg.bal))
## (Intercept)
## 0.0585644
## gponOntAniOpInfoOpticalSignalLevel
## 1.0003283
## gponOntAniOpInfoTxOpticalSignalLevel
## 0.9959809
## gponOntOltsideOpInfoRxOpticalSignalLevel
## 0.9494831
## X15MinDnFwdByteCounter
## 1.4605934
## X15MinUpFwdByteCounter
## 0.7993979
## bponOntOpInfoDistance
## 0.9403987
Classification-matrix accuracy: 95% on the imbalanced data and 92% on the balanced data.
# Classification table for the ROSE-trained ("balanced") model on the
# validation set, using a 0.5 probability cut-off.
test.probs <- predict(log.reg.bal, newdata = val, type = "response")
# For a binomial glm with a factor response the fitted probability refers to
# the non-first level of that factor, so the labels are read from the model
# frame instead of being hard-coded.
bal.levels <- levels(factor(log.reg.bal$model$ifOperStatus))
pred.logit <- ifelse(test.probs >= 0.5, bal.levels[2], bal.levels[1])
# Give the predictions the same level set/order as the observed classes so
# the table stays square even if one class is never predicted.
pred.logit <- factor(pred.logit, levels = levels(factor(val$ifOperStatus)))
# Rows: observed class; columns: predicted class.
# NOTE(review): this object shares its name with caret's confusionMatrix()
# function; the name is kept because later statements read it.
confusionMatrix <- ftable(val$ifOperStatus, pred.logit)
confusionMatrix
## pred.logit down up
##
## down 195 86
## up 282 4001
# Share of validation rows whose predicted label equals the observed class.
# Computed from the label vectors directly: diag() of the contingency table
# is only correct when the table is square with identically ordered classes,
# which fails as soon as one class is never predicted.
accuracy <- mean(as.character(pred.logit) == as.character(val$ifOperStatus))
accuracy
## [1] 0.919369
# Classification table for the baseline (imbalanced-data) model on the
# validation set, using a 0.5 probability cut-off.
test.probs <- predict(log.reg.imb, newdata = val, type = "response")
# For a binomial glm with a factor response the fitted probability refers to
# the non-first level of that factor, so the labels are read from the model
# frame instead of being hard-coded.
imb.levels <- levels(factor(log.reg.imb$model$ifOperStatus))
pred.logit <- ifelse(test.probs >= 0.5, imb.levels[2], imb.levels[1])
# Give the predictions the same level set/order as the observed classes so
# the table stays square even if one class is never predicted.
pred.logit <- factor(pred.logit, levels = levels(factor(val$ifOperStatus)))
# Rows: observed class; columns: predicted class.
# NOTE(review): this object shares its name with caret's confusionMatrix()
# function; the name is kept because later statements read it.
confusionMatrix <- ftable(val$ifOperStatus, pred.logit)
confusionMatrix
## pred.logit down up
##
## down 146 135
## up 72 4211
# Share of validation rows whose predicted label equals the observed class.
# Computed from the label vectors directly: diag() of the contingency table
# is only correct when the table is square with identically ordered classes,
# which fails as soon as one class is never predicted.
accuracy <- mean(as.character(pred.logit) == as.character(val$ifOperStatus))
accuracy
## [1] 0.954645
# Accuracy, precision and recall of the ROSE-trained model.
# pred.log.reg.bal is the probability of "down" (the 0.5 cut-off above labels
# high scores as "down"), whereas accuracy.meas() scores against the second
# level of `response` -- "up" for val$ifOperStatus -- which inverted the
# metric (recorded precision 0.928 is below the 0.938 share of "up" rows).
# Releveling makes the rare class "down" the positive class so response and
# score agree.
# NOTE(review): the 0.06 cut-off is kept as written; confirm it is intended.
accuracy.meas(
  relevel(factor(val$ifOperStatus), ref = "up"),
  pred.log.reg.bal,
  threshold = 0.06
)
##
## Call:
## accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.bal,
## threshold = 0.06)
##
## Examples are labelled as positive when predicted is greater than 0.06
##
## precision: 0.928
## recall: 0.828
## F: 0.437
# Precision/recall of the baseline model with the rare class "down" as the
# positive class. pred.log.reg.imb is the probability of "up", so a 0.06
# cut-off on it labels every row positive (recorded recall 1.000, precision
# equal to the share of "up" rows) and says nothing about detecting "down".
# Releveling puts "down" second (the level accuracy.meas() scores against)
# and 1 - P(up) supplies the matching P(down) score.
accuracy.meas(
  relevel(factor(val$ifOperStatus), ref = "up"),
  1 - pred.log.reg.imb,
  threshold = 0.06
)
##
## Call:
## accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.imb,
## threshold = 0.06)
##
## Examples are labelled as positive when predicted is greater than 0.06
##
## precision: 0.938
## recall: 1.000
## F: 0.484
# Method 1: cross-validated logistic regression via caret, evaluated on the
# validation set.
library(caret)
# 10-fold CV; no `repeats` argument is supplied, so caret's default applies.
ctrl <- trainControl(method = "repeatedcv", number = 10, savePredictions = TRUE)
mod_fit <- train(
  ifOperStatus ~ .,
  data = dataset.rose,
  method = "glm",
  family = "binomial",
  trControl = ctrl,
  tuneLength = 5
)
pred <- predict(mod_fit, newdata = val)
# The model was trained on dataset.rose, whose class levels are ordered
# differently from val's; align them explicitly instead of relying on
# confusionMatrix() to refactor (and warn).
obs <- factor(val$ifOperStatus)
pred <- factor(pred, levels = levels(obs))
# "down" is stated as the positive class rather than left to level order.
confusionMatrix(data = pred, reference = obs, positive = "down")
## Warning in confusionMatrix.default(data = pred, val$ifOperStatus): Levels
## are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction down up
## down 195 282
## up 86 4001
##
## Accuracy : 0.9194
## 95% CI : (0.9111, 0.9271)
## No Information Rate : 0.9384
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4737
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.69395
## Specificity : 0.93416
## Pos Pred Value : 0.40881
## Neg Pred Value : 0.97896
## Prevalence : 0.06157
## Detection Rate : 0.04273
## Detection Prevalence : 0.10451
## Balanced Accuracy : 0.81405
##
## 'Positive' Class : down
##
#Method2 :
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
# K-fold cross-validation of the ROSE-trained model with boot::cv.glm().
k <- 3
kfCV <- cv.glm(data = dataset.rose, glmfit = log.reg.bal, K = k)
# The two cross-validation error estimates returned by cv.glm().
kfCV[["delta"]]
## [1] 0.1276741 0.1275812
#Method3 :
# Misclassification-style cost: the share of elements whose absolute
# difference between `r` and `pi` exceeds 0.5.
#   r  : numeric vector (e.g. observed 0/1 responses)
#   pi : numeric vector or scalar to compare against (defaults to 0)
cost <- function(r, pi = 0) {
  abs_gap <- abs(r - pi)
  mean(abs_gap > 0.5)
}
# Logistic regression on the ROSE data, kept as a separate object for the
# (currently disabled) cost-based cross-validation runs below.
dataset.glm <- glm(ifOperStatus ~ ., family = binomial, data = dataset.rose)
#cv.err <- cv.glm(dataset.rose, dataset.glm, cost, K=nrow(dataset.rose))$delta
#cv.err
#cv.11.err <- cv.glm(dataset.rose, dataset.glm, cost, K=11)$delta
#cv.11.err
# Score the separate test file with the ROSE-trained model (response scale)
# and print the fitted probabilities.
ptest <- predict(log.reg.bal, newdata = test1, type = "response")
ptest
## 1 2 3 4 5 6
## 0.11817458 0.29438036 0.10589856 0.18519232 0.10458923 0.18145652
## 7 8 9 10 11 12
## 0.11911180 0.29071323 0.12484587 0.24735561 0.07330546 0.33404365
## 13 14 15 16 17 18
## 0.11376417 0.24047648 0.14223806 0.28204095 0.12739349 0.25539954
## 19 20 21 22 23 24
## 0.12208274 0.42252769 0.18600399 0.22999797 0.12441100 0.32041558
## 25 26 27 28 29 30
## 0.12640383 0.44785342 0.11743320 0.27727689 0.17515772 0.41107889
## 31 32 33 34 35 36
## 0.11761024 0.27634986 0.11201385 0.20415841 0.10902146 0.26890022
## 37 38 39 40 41 42
## 0.13243195 0.26828219 0.13205067 0.28359337 0.10908494 0.19969837
## 43 44 45 46 47 48
## 0.10043151 0.31676404 0.08139056 0.17653915 0.05787972 0.27288390
## 49 50 51 52 53 54
## 0.06616371 0.29148979 0.06219948 0.33388785 0.04770890 0.23415273
## 55 56 57 58 59 60
## 0.13665045 0.23197272 0.10403734 0.28623984 0.10712472 0.32636255
## 61 62 63 64 65 66
## 0.06113692 0.32135316 0.04469131 0.40556408 0.09740904 0.16702320
## 67 68 69 70 71 72
## 0.07778008 0.35219638 0.04879127 0.30101713 0.05770360 0.21933671
## 73 74 75 76 77 78
## 0.06945036 0.27594055 0.05077200 0.20866833 0.05435202 0.25496601
## 79 80 81 82 83 84
## 0.10662668 0.31200816 0.08431192 0.23208695 0.09635947 0.30781310
## 85 86 87 88 89 90
## 0.10275236 0.31682872 0.13141442 0.36994736 0.10195910 0.24575424
## 91 92 93 94 95 96
## 0.05831667 0.35431310 0.08294631 0.40711987 0.07001922 0.22782286
## 97 98 99 100 101 102
## 0.09188520 0.39670598 0.16477431 0.31772477 0.07724315 0.25012016
## 103 104 105 106 107 108
## 0.10650389 0.27124080 0.15012599 0.32138808 0.36460139 0.15217822
## 109 110 111 112 113 114
## 0.44642243 0.10933958 0.33280931 0.09951259 0.37249021 0.10133779
## 115 116 117 118 119 120
## 0.33160266 0.08733544 0.37598914 0.04432880 0.21266961 0.07462982