Loading and preprocessing the data

library(rpart)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ROSE)
## Loaded ROSE 0.0-3
# Directory that holds the input CSV files. Full paths are built with
# file.path() so the script no longer changes the session's working directory.
data.dir <- "D:/Users/gkokate/Desktop/Markdown"
# read.csv() stopped converting strings to factors by default in R 4.0.0.
# The modelling code further down works with ifOperStatus as a two-level
# factor, so the conversion is requested explicitly.
# NOTE: the training file name really does contain a space before ".csv".
build <- read.csv(file = file.path(data.dir, "Build .csv"), sep = ",",
                  header = TRUE, stringsAsFactors = TRUE)
test1 <- read.csv(file = file.path(data.dir, "test.csv"), sep = ",",
                  header = TRUE, stringsAsFactors = TRUE)

Dataset observation

# Preview the first six rows of the training data
head(build, n = 6L)
##   gponOntAniOpInfoOpticalSignalLevel gponOntAniOpInfoTxOpticalSignalLevel
## 1                              -7711                                 1142
## 2                              -7703                                 1288
## 3                              -7703                                 1081
## 4                              -7703                                 1207
## 5                              -7688                                 1276
## 6                              -7688                                 1282
##   gponOntOltsideOpInfoRxOpticalSignalLevel X15MinDnFwdByteCounter
## 1                                     -171               8.888281
## 2                                     -170               9.544178
## 3                                     -170               8.915710
## 4                                     -171               7.555582
## 5                                     -170               7.475159
## 6                                     -169               7.236687
##   X15MinUpFwdByteCounter bponOntOpInfoDistance ifOperStatus
## 1               7.245204                    38           up
## 2               7.764763                    38           up
## 3               7.648214                    38           up
## 4               6.976296                    38           up
## 5               6.646812                    38           up
## 6               6.449259                    38           up
# Split data into training (70%) and validation (30%).
# The RNG is seeded first so the same rows land in each partition on every
# run; without it every model fit and score below changes between runs.
set.seed(123)
# Row indices drawn (without replacement) for the training partition
split <- sample(seq_len(nrow(build)), size = floor(nrow(build) * 0.7))
train <- build[split, ]
# Everything not drawn for training is held out for validation
val   <- build[-split, ]
##Loading and preprocessing the data

Correlation matrix with imbalanced data

# Pairwise correlations of the numeric predictors. The status label is
# excluded by name rather than by column position, so the call keeps working
# if the column order of the input file changes.
cor(build[, setdiff(names(build), "ifOperStatus")])
##                                          gponOntAniOpInfoOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel                                1.0000000
## gponOntAniOpInfoTxOpticalSignalLevel                              0.2200446
## gponOntOltsideOpInfoRxOpticalSignalLevel                          0.8286779
## X15MinDnFwdByteCounter                                           -0.1531402
## X15MinUpFwdByteCounter                                           -0.1431537
## bponOntOpInfoDistance                                             0.4725262
##                                          gponOntAniOpInfoTxOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel                                 0.22004458
## gponOntAniOpInfoTxOpticalSignalLevel                               1.00000000
## gponOntOltsideOpInfoRxOpticalSignalLevel                           0.22851482
## X15MinDnFwdByteCounter                                            -0.09589888
## X15MinUpFwdByteCounter                                            -0.12970713
## bponOntOpInfoDistance                                              0.10857485
##                                          gponOntOltsideOpInfoRxOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel                                      0.8286779
## gponOntAniOpInfoTxOpticalSignalLevel                                    0.2285148
## gponOntOltsideOpInfoRxOpticalSignalLevel                                1.0000000
## X15MinDnFwdByteCounter                                                 -0.1355498
## X15MinUpFwdByteCounter                                                 -0.1414092
## bponOntOpInfoDistance                                                   0.3258893
##                                          X15MinDnFwdByteCounter
## gponOntAniOpInfoOpticalSignalLevel                 -0.153140207
## gponOntAniOpInfoTxOpticalSignalLevel               -0.095898884
## gponOntOltsideOpInfoRxOpticalSignalLevel           -0.135549796
## X15MinDnFwdByteCounter                              1.000000000
## X15MinUpFwdByteCounter                              0.890232544
## bponOntOpInfoDistance                               0.004619694
##                                          X15MinUpFwdByteCounter
## gponOntAniOpInfoOpticalSignalLevel                  -0.14315366
## gponOntAniOpInfoTxOpticalSignalLevel                -0.12970713
## gponOntOltsideOpInfoRxOpticalSignalLevel            -0.14140922
## X15MinDnFwdByteCounter                               0.89023254
## X15MinUpFwdByteCounter                               1.00000000
## bponOntOpInfoDistance                                0.03787122
##                                          bponOntOpInfoDistance
## gponOntAniOpInfoOpticalSignalLevel                 0.472526225
## gponOntAniOpInfoTxOpticalSignalLevel               0.108574852
## gponOntOltsideOpInfoRxOpticalSignalLevel           0.325889320
## X15MinDnFwdByteCounter                             0.004619694
## X15MinUpFwdByteCounter                             0.037871220
## bponOntOpInfoDistance                              1.000000000
# Scatterplot matrix of the predictors; the status label is excluded by name
# rather than by column position.
pairs(build[, setdiff(names(build), "ifOperStatus")])

## Logistic regression model, using the ROSE algorithm to rebalance the imbalanced training data

# Baseline: logistic regression fitted to the original (imbalanced) training split
log.reg.imb <- glm(formula = ifOperStatus ~ ., family = "binomial", data = train)
# Score the validation split with the baseline model; type = "response"
# returns fitted probabilities rather than values on the link scale
pred.log.reg.imb <- predict(object = log.reg.imb, newdata = val, type = "response")
# Build a synthetic, rebalanced training set with ROSE (fixed seed, p = 0.30).
# NOTE(review): the class table printed for dataset.rose lists "up" before
# "down", so its factor level order may differ from train's - confirm before
# comparing probabilities from the two models directly.
dataset.rose <- ROSE(formula = ifOperStatus ~ ., data = train, p = 0.30, seed = 123)[["data"]]

Correlation matrix with balanced data

# Pairwise correlations of the predictors in the ROSE-resampled data. The
# status label is excluded by name rather than by column position.
cor(dataset.rose[, setdiff(names(dataset.rose), "ifOperStatus")])
##                                          gponOntAniOpInfoOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel                               1.00000000
## gponOntAniOpInfoTxOpticalSignalLevel                             0.25932807
## gponOntOltsideOpInfoRxOpticalSignalLevel                         0.77637704
## X15MinDnFwdByteCounter                                          -0.08535516
## X15MinUpFwdByteCounter                                          -0.05791129
## bponOntOpInfoDistance                                            0.46672417
##                                          gponOntAniOpInfoTxOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel                                 0.25932807
## gponOntAniOpInfoTxOpticalSignalLevel                               1.00000000
## gponOntOltsideOpInfoRxOpticalSignalLevel                           0.30578829
## X15MinDnFwdByteCounter                                            -0.08906029
## X15MinUpFwdByteCounter                                            -0.09658079
## bponOntOpInfoDistance                                              0.17149022
##                                          gponOntOltsideOpInfoRxOpticalSignalLevel
## gponOntAniOpInfoOpticalSignalLevel                                     0.77637704
## gponOntAniOpInfoTxOpticalSignalLevel                                   0.30578829
## gponOntOltsideOpInfoRxOpticalSignalLevel                               1.00000000
## X15MinDnFwdByteCounter                                                -0.04454176
## X15MinUpFwdByteCounter                                                -0.02031054
## bponOntOpInfoDistance                                                  0.44337068
##                                          X15MinDnFwdByteCounter
## gponOntAniOpInfoOpticalSignalLevel                  -0.08535516
## gponOntAniOpInfoTxOpticalSignalLevel                -0.08906029
## gponOntOltsideOpInfoRxOpticalSignalLevel            -0.04454176
## X15MinDnFwdByteCounter                               1.00000000
## X15MinUpFwdByteCounter                               0.76454657
## bponOntOpInfoDistance                               -0.01167402
##                                          X15MinUpFwdByteCounter
## gponOntAniOpInfoOpticalSignalLevel                  -0.05791129
## gponOntAniOpInfoTxOpticalSignalLevel                -0.09658079
## gponOntOltsideOpInfoRxOpticalSignalLevel            -0.02031054
## X15MinDnFwdByteCounter                               0.76454657
## X15MinUpFwdByteCounter                               1.00000000
## bponOntOpInfoDistance                                0.03308249
##                                          bponOntOpInfoDistance
## gponOntAniOpInfoOpticalSignalLevel                  0.46672417
## gponOntAniOpInfoTxOpticalSignalLevel                0.17149022
## gponOntOltsideOpInfoRxOpticalSignalLevel            0.44337068
## X15MinDnFwdByteCounter                             -0.01167402
## X15MinUpFwdByteCounter                              0.03308249
## bponOntOpInfoDistance                               1.00000000
# Scatterplot matrix of the predictors in the ROSE-resampled data; the status
# label is excluded by name rather than by column position.
pairs(dataset.rose[, setdiff(names(dataset.rose), "ifOperStatus")])

# Class counts in the resampled data, to check how (im)balanced it is now
table(dataset.rose[["ifOperStatus"]])
## 
##   up down 
## 7506 3142
# Fit the same logistic regression specification on the ROSE-resampled data
log.reg.bal <- glm(formula = ifOperStatus ~ ., family = "binomial", data = dataset.rose)
# Coefficient table, deviances and AIC of the fitted model
summary(object = log.reg.bal)
## 
## Call:
## glm(formula = ifOperStatus ~ ., family = "binomial", data = dataset.rose)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6995  -0.6294  -0.3893   0.5403   3.0429  
## 
## Coefficients:
##                                            Estimate Std. Error z value
## (Intercept)                              -2.838e+00  6.249e-01  -4.541
## gponOntAniOpInfoOpticalSignalLevel        3.283e-04  3.783e-05   8.677
## gponOntAniOpInfoTxOpticalSignalLevel     -4.027e-03  3.750e-04 -10.739
## gponOntOltsideOpInfoRxOpticalSignalLevel -5.184e-02  1.893e-03 -27.390
## X15MinDnFwdByteCounter                    3.788e-01  2.960e-02  12.800
## X15MinUpFwdByteCounter                   -2.239e-01  3.609e-02  -6.204
## bponOntOpInfoDistance                    -6.145e-02  2.512e-03 -24.463
##                                          Pr(>|z|)    
## (Intercept)                              5.61e-06 ***
## gponOntAniOpInfoOpticalSignalLevel        < 2e-16 ***
## gponOntAniOpInfoTxOpticalSignalLevel      < 2e-16 ***
## gponOntOltsideOpInfoRxOpticalSignalLevel  < 2e-16 ***
## X15MinDnFwdByteCounter                    < 2e-16 ***
## X15MinUpFwdByteCounter                   5.49e-10 ***
## bponOntOpInfoDistance                     < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12918.9  on 10647  degrees of freedom
## Residual deviance:  8948.9  on 10641  degrees of freedom
## AIC: 8962.9
## 
## Number of Fisher Scoring iterations: 5
# Fitted probabilities on the validation split from the model trained on the
# resampled data
pred.log.reg.bal <- predict(object = log.reg.bal, newdata = val, type = "response")

How accurate is the logistic regression model?

# Compare the two learners by area under the ROC curve on the validation
# split, starting with the model trained on the imbalanced data
roc.curve(response = val$ifOperStatus, predicted = pred.log.reg.imb)
## Area under the curve (AUC): 0.869
# Overlay the curve for the model trained on the resampled data on the same
# plot, drawn in palette colour 2
roc.curve(response = val$ifOperStatus, predicted = pred.log.reg.bal, col = 2, add.roc = TRUE)

## Area under the curve (AUC): 0.870
# Odds ratios: exponentiated coefficients of the model fitted on the
# resampled data
exp(coef(object = log.reg.bal))
##                              (Intercept) 
##                                0.0585644 
##       gponOntAniOpInfoOpticalSignalLevel 
##                                1.0003283 
##     gponOntAniOpInfoTxOpticalSignalLevel 
##                                0.9959809 
## gponOntOltsideOpInfoRxOpticalSignalLevel 
##                                0.9494831 
##                   X15MinDnFwdByteCounter 
##                                1.4605934 
##                   X15MinUpFwdByteCounter 
##                                0.7993979 
##                    bponOntOpInfoDistance 
##                                0.9403987

Accuracy of Model

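Classification-matrix accuracy is about 95% for the model trained on the imbalanced data and about 92% for the model trained on the balanced data. Note that always predicting "up" would already score about 94% on this validation set, so accuracy alone is a weak measure here.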

# Classification table for the model trained on the resampled (balanced) data.
# Probabilities of at least 0.5 are labelled "down", everything else "up".
# NOTE(review): this mapping is the reverse of the one used for log.reg.imb;
# it matches the level order printed for dataset.rose (up, down) - confirm.
test.probs <- predict(log.reg.bal, val, type = "response")
pred.logit <- rep("up", length(test.probs))
pred.logit[test.probs >= 0.5] <- "down"
# Actual status in rows, predicted status in columns. The table is stored
# under its own name so it no longer masks caret::confusionMatrix(); the
# unused table previously assigned to `class` (masking base::class) is gone.
conf.tab.bal <- ftable(val$ifOperStatus, pred.logit)
conf.tab.bal
##      pred.logit down   up
##                          
## down             195   86
## up               282 4001
# Share of validation rows whose predicted label equals the actual label.
# Comparing labels directly stays correct even if one class is never
# predicted, where the diagonal of a non-square table would not.
accuracy <- mean(pred.logit == as.character(val$ifOperStatus), na.rm = TRUE)
accuracy
## [1] 0.919369
# Classification table for the model trained on the original (imbalanced) data.
# Probabilities of at least 0.5 are labelled "up", everything else "down".
# NOTE(review): this mapping is the reverse of the one used for log.reg.bal -
# confirm it matches the factor level order of train$ifOperStatus.
test.probs <- predict(log.reg.imb, val, type = "response")
pred.logit <- rep("down", length(test.probs))
pred.logit[test.probs >= 0.5] <- "up"
# Actual status in rows, predicted status in columns. The table is stored
# under its own name so it no longer masks caret::confusionMatrix(); the
# unused table previously assigned to `class` (masking base::class) is gone.
conf.tab.imb <- ftable(val$ifOperStatus, pred.logit)
conf.tab.imb
##      pred.logit down   up
##                          
## down             146  135
## up                72 4211
# Share of validation rows whose predicted label equals the actual label.
# Comparing labels directly stays correct even if one class is never
# predicted, where the diagonal of a non-square table would not.
accuracy <- mean(pred.logit == as.character(val$ifOperStatus), na.rm = TRUE)
accuracy
## [1] 0.954645
# Accuracy, precision and recall for the model trained on the resampled data.
# NOTE(review): threshold = 0.06 is far below the 0.5 cut-off used for the
# classification tables, and pred.log.reg.bal appears to be the probability of
# "down" - confirm both the threshold and which class is treated as positive.
accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.bal, threshold = 0.06)
## 
## Call: 
## accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.bal, 
##     threshold = 0.06)
## 
## Examples are labelled as positive when predicted is greater than 0.06 
## 
## precision: 0.928
## recall: 0.828
## F: 0.437
# Same measures for the model trained on the imbalanced data.
# NOTE(review): with threshold = 0.06 the reported recall is 1.000, which
# suggests nearly every row is labelled positive - confirm the threshold.
accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.imb, threshold = 0.06)
## 
## Call: 
## accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.imb, 
##     threshold = 0.06)
## 
## Examples are labelled as positive when predicted is greater than 0.06 
## 
## precision: 0.938
## recall: 1.000
## F: 0.484

Cross Validation Test

# Method 1: caret resampling of the logistic regression
library(caret)
# 10-fold cross-validation, keeping the hold-out predictions.
# NOTE(review): method = "repeatedcv" is used without `repeats`; confirm
# whether several repetitions were intended.
ctrl <- trainControl(method = "repeatedcv", number = 10, savePredictions = TRUE)
# tuneLength is not passed: the "glm" method has no tuning parameters, so it
# had no effect on the fit.
mod_fit <- train(ifOperStatus ~ ., data = dataset.rose, method = "glm",
                 family = "binomial", trControl = ctrl)
pred <- predict(mod_fit, newdata = val)
# Put the predictions on the reference's level order before tabulating, so
# confusionMatrix() does not have to refactor them (and warn about it).
# Arguments are named to make clear which vector is the ground truth.
confusionMatrix(data = factor(pred, levels = levels(val$ifOperStatus)),
                reference = val$ifOperStatus)
## Warning in confusionMatrix.default(data = pred, val$ifOperStatus): Levels
## are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction down   up
##       down  195  282
##       up     86 4001
##                                           
##                Accuracy : 0.9194          
##                  95% CI : (0.9111, 0.9271)
##     No Information Rate : 0.9384          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4737          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.69395         
##             Specificity : 0.93416         
##          Pos Pred Value : 0.40881         
##          Neg Pred Value : 0.97896         
##              Prevalence : 0.06157         
##          Detection Rate : 0.04273         
##    Detection Prevalence : 0.10451         
##       Balanced Accuracy : 0.81405         
##                                           
##        'Positive' Class : down            
## 
#Method2 :
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
# Number of folds for the K-fold cross-validation
k <- 3
# Cross-validate the model fitted on the resampled data; no cost function is
# supplied, so cv.glm() falls back to its default
kfCV <- cv.glm(dataset.rose, log.reg.bal, K = k)
# Cross-validation estimates of prediction error (two values are printed)
kfCV[["delta"]]
## [1] 0.1276741 0.1275812
#Method3 :
# Misclassification-rate cost function.
#   r   first vector of values
#   pi  second vector of values, compared element-wise with r (default 0)
# Returns the proportion of elements where r and pi differ by more than 0.5.
# Presumably meant as the `cost` argument of cv.glm(), with r the observed 0/1
# response and pi the predicted probability - confirm against the caller.
cost <- function(r, pi = 0) {
  missed <- abs(r - pi) > 0.5
  mean(missed)
}
# Logistic regression on the resampled data, fitted separately for the
# cross-validation calls below
dataset.glm <- glm(formula = ifOperStatus ~ ., family = binomial, data = dataset.rose)
# The two cross-validation runs below are disabled. The first uses one fold
# per row (leave-one-out), the second uses 11 folds; both use `cost`.
#cv.err <- cv.glm(dataset.rose, dataset.glm, cost, K=nrow(dataset.rose))$delta 
#cv.err           
#cv.11.err <- cv.glm(dataset.rose, dataset.glm, cost, K=11)$delta 
#cv.11.err

Predicted probabilities for new data

# Fitted probabilities for the new observations in test1, from the model
# trained on the resampled data.
# NOTE(review): given the level order printed for dataset.rose (up, down),
# these are presumably probabilities of "down" - confirm.
ptest <- predict(object = log.reg.bal, newdata = test1, type = "response")
ptest
##          1          2          3          4          5          6 
## 0.11817458 0.29438036 0.10589856 0.18519232 0.10458923 0.18145652 
##          7          8          9         10         11         12 
## 0.11911180 0.29071323 0.12484587 0.24735561 0.07330546 0.33404365 
##         13         14         15         16         17         18 
## 0.11376417 0.24047648 0.14223806 0.28204095 0.12739349 0.25539954 
##         19         20         21         22         23         24 
## 0.12208274 0.42252769 0.18600399 0.22999797 0.12441100 0.32041558 
##         25         26         27         28         29         30 
## 0.12640383 0.44785342 0.11743320 0.27727689 0.17515772 0.41107889 
##         31         32         33         34         35         36 
## 0.11761024 0.27634986 0.11201385 0.20415841 0.10902146 0.26890022 
##         37         38         39         40         41         42 
## 0.13243195 0.26828219 0.13205067 0.28359337 0.10908494 0.19969837 
##         43         44         45         46         47         48 
## 0.10043151 0.31676404 0.08139056 0.17653915 0.05787972 0.27288390 
##         49         50         51         52         53         54 
## 0.06616371 0.29148979 0.06219948 0.33388785 0.04770890 0.23415273 
##         55         56         57         58         59         60 
## 0.13665045 0.23197272 0.10403734 0.28623984 0.10712472 0.32636255 
##         61         62         63         64         65         66 
## 0.06113692 0.32135316 0.04469131 0.40556408 0.09740904 0.16702320 
##         67         68         69         70         71         72 
## 0.07778008 0.35219638 0.04879127 0.30101713 0.05770360 0.21933671 
##         73         74         75         76         77         78 
## 0.06945036 0.27594055 0.05077200 0.20866833 0.05435202 0.25496601 
##         79         80         81         82         83         84 
## 0.10662668 0.31200816 0.08431192 0.23208695 0.09635947 0.30781310 
##         85         86         87         88         89         90 
## 0.10275236 0.31682872 0.13141442 0.36994736 0.10195910 0.24575424 
##         91         92         93         94         95         96 
## 0.05831667 0.35431310 0.08294631 0.40711987 0.07001922 0.22782286 
##         97         98         99        100        101        102 
## 0.09188520 0.39670598 0.16477431 0.31772477 0.07724315 0.25012016 
##        103        104        105        106        107        108 
## 0.10650389 0.27124080 0.15012599 0.32138808 0.36460139 0.15217822 
##        109        110        111        112        113        114 
## 0.44642243 0.10933958 0.33280931 0.09951259 0.37249021 0.10133779 
##        115        116        117        118        119        120 
## 0.33160266 0.08733544 0.37598914 0.04432880 0.21266961 0.07462982