library(rpart)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ROSE)
## Loaded ROSE 0.0-3
##Loading and preprocessing the data
# Read the model-building data and the new data to be scored.
# Paths are built from a single data directory with file.path() instead of
# calling setwd(), so the session's working directory is left untouched.
data_dir <- "D:/Users/gkokate/Desktop/Markdown"
# stringsAsFactors = TRUE keeps ifOperStatus a factor on R >= 4.0.0, where the
# read.csv() default changed to FALSE; the factor-based steps further down
# (level look-ups, caret's confusionMatrix) need a factor response.
# NOTE(review): "Build .csv" has a space before the extension - confirm this
# matches the file name on disk.
build <- read.csv(file = file.path(data_dir, "Build .csv"), sep = ",",
                  header = TRUE, stringsAsFactors = TRUE)
test1 <- read.csv(file = file.path(data_dir, "test.csv"), sep = ",",
                  header = TRUE, stringsAsFactors = TRUE)
# Preview the first rows of the model-building data
head(build)
##   OpticalSignalLevel TxOpticalSignalLevel RxOpticalSignalLevel
## 1              -7711                 1142                 -171
## 2              -7703                 1288                 -170
## 3              -7703                 1081                 -170
## 4              -7703                 1207                 -171
## 5              -7688                 1276                 -170
## 6              -7688                 1282                 -169
##   X15MinDnFwdByteCounter X15MinUpFwdByteCounter Distance ifOperStatus
## 1               8.888281               7.245204       38           up
## 2               9.544178               7.764763       38           up
## 3               8.915710               7.648214       38           up
## 4               7.555582               6.976296       38           up
## 5               7.475159               6.646812       38           up
## 6               7.236687               6.449259       38           up
# Split data into training (70%) and validation (30%)
# Fixed seed so the random split, and every result computed from it, can be
# reproduced from run to run.
set.seed(123)
n.rows <- nrow(build)
split <- sample.int(n.rows, floor(n.rows * 0.7))  # row indices of the training set
train <- build[split, ]
# Pick the validation rows by set difference: build[-split, ] would return
# zero rows (instead of all rows) whenever `split` is empty.
val   <- build[setdiff(seq_len(n.rows), split), ]
## Exploratory analysis

# Correlation matrix with imbalanced data

library(corrplot)
# Correlations between the predictors only. The response column is dropped by
# name rather than by position, so a change in column order cannot silently
# remove the wrong column.
predictors.imb <- build[, setdiff(names(build), "ifOperStatus"), drop = FALSE]
C1 <- cor(predictors.imb)
corrplot(C1, method = "number")

# Scatterplot matrix of the predictors. The fill colour is looked up from each
# row's ifOperStatus value; a bare colour vector would just be recycled over
# the rows in order and carry no meaning.
status.cols <- c(up = "green3", down = "red")
pairs(predictors.imb,
      main = "Pairwise Independent Variable Analysis_ImbalData ",
      pch = 21,
      bg = unname(status.cols[as.character(build$ifOperStatus)]))

## Logistic Regression Model

# Baseline model: logistic regression of ifOperStatus on all other columns,
# fitted to the original (imbalanced) training rows
log.reg.imb <- glm(formula = ifOperStatus ~ ., family = "binomial", data = train)
# Score the validation rows with the baseline model (type = "response" gives
# fitted probabilities rather than log-odds)
pred.log.reg.imb <- predict(object = log.reg.imb, newdata = val, type = "response")
# Build a resampled training set with ROSE (p = 0.30, fixed seed) and keep
# only the generated data frame from the returned object
dataset.rose <- ROSE(formula = ifOperStatus ~ ., data = train, p = 0.30, seed = 123)[["data"]]

# Correlation matrix with balanced data

# Correlations between the predictors of the ROSE-generated data. The response
# column is dropped by name rather than by position, so a change in column
# order cannot silently remove the wrong column.
predictors.bal <- dataset.rose[, setdiff(names(dataset.rose), "ifOperStatus"), drop = FALSE]
C2 <- cor(predictors.bal)
corrplot(C2, method = "number")

# Scatterplot matrix of the predictors. The fill colour is looked up from each
# row's ifOperStatus value; a bare colour vector would just be recycled over
# the rows in order and carry no meaning.
status.cols <- c(up = "green3", down = "red")
pairs(predictors.bal,
      main = "Pairwise Independent Variable Analysis_BalData ",
      pch = 21,
      bg = unname(status.cols[as.character(dataset.rose$ifOperStatus)]))

# Class counts of the response in the ROSE-generated data
table(dataset.rose[["ifOperStatus"]])
## 
##   up down 
## 7506 3142
# Same logistic regression specification as the baseline, now fitted to the
# ROSE-generated data; summary() prints the coefficient table
log.reg.bal <- glm(formula = ifOperStatus ~ ., family = "binomial", data = dataset.rose)
summary(log.reg.bal)
## 
## Call:
## glm(formula = ifOperStatus ~ ., family = "binomial", data = dataset.rose)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7545  -0.5840  -0.3519   0.5111   3.1337  
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -2.540e+00  6.336e-01  -4.008 6.12e-05 ***
## OpticalSignalLevel      3.795e-04  3.898e-05   9.736  < 2e-16 ***
## TxOpticalSignalLevel   -4.209e-03  3.894e-04 -10.810  < 2e-16 ***
## RxOpticalSignalLevel   -5.515e-02  1.937e-03 -28.476  < 2e-16 ***
## X15MinDnFwdByteCounter  3.926e-01  3.008e-02  13.054  < 2e-16 ***
## X15MinUpFwdByteCounter -2.407e-01  3.672e-02  -6.554 5.59e-11 ***
## Distance               -7.376e-02  2.651e-03 -27.823  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12919  on 10647  degrees of freedom
## Residual deviance:  8477  on 10641  degrees of freedom
## AIC: 8491
## 
## Number of Fisher Scoring iterations: 5
# Score the validation rows with the balanced-data model (fitted probabilities)
pred.log.reg.bal <- predict(object = log.reg.bal, newdata = val, type = "response")

# Scoring

# Compare the two models on the validation set through their ROC curves / AUC
roc.curve(response = val$ifOperStatus, predicted = pred.log.reg.imb)
## Area under the curve (AUC): 0.857
# Overlay the balanced-data model's curve on the existing plot, drawn in
# colour 2 of the current palette
roc.curve(response = val$ifOperStatus, predicted = pred.log.reg.bal, add.roc = TRUE, col = 2)

## Area under the curve (AUC): 0.858
# Odds ratios: exponentiated coefficients of the balanced-data model
exp(stats::coef(log.reg.bal))
##            (Intercept)     OpticalSignalLevel   TxOpticalSignalLevel 
##             0.07889755             1.00037957             0.99580002 
##   RxOpticalSignalLevel X15MinDnFwdByteCounter X15MinUpFwdByteCounter 
##             0.94634614             1.48089031             0.78608999 
##               Distance 
##             0.92889248

# Accuracy of Model

# Classification table for the model trained on the ROSE-generated data
test.probs <- predict(log.reg.bal, val, type = "response")
# With a factor response, a binomial glm models the probability of "not the
# first level". The labels are therefore read from the fitted model instead of
# being hard-coded, so the mapping stays right whatever the level order of the
# training data was.
resp.levels <- levels(model.response(model.frame(log.reg.bal)))
pred.logit <- rep(resp.levels[1], length(test.probs))
pred.logit[test.probs >= 0.5] <- resp.levels[2]
# Give the predictions the same level set as the observed labels: the table
# then stays square even if only one class is ever predicted, which the
# diag()-based accuracy below relies on.
pred.logit <- factor(pred.logit, levels = levels(factor(val$ifOperStatus)))
# NOTE(review): `class` is not used again in the visible script and shares its
# name with base::class(); kept in case later code reads it.
class <- table(pred.logit, val$ifOperStatus)
# Rows = observed status, columns = predicted status.
# This object shares its name with caret::confusionMatrix(); R still finds the
# function when it is called, because non-function objects are skipped there.
confusionMatrix <- ftable(val$ifOperStatus, pred.logit)
confusionMatrix
##      pred.logit down   up
##                          
## down             201   96
## up               264 4003
# Share of validation rows on the diagonal (observed == predicted)
accuracy <- sum(diag(confusionMatrix)) / sum(confusionMatrix)
accuracy
## [1] 0.9211218
# Classification table for the model trained on the original (imbalanced) data
test.probs <- predict(log.reg.imb, val, type = "response")
# With a factor response, a binomial glm models the probability of "not the
# first level". The labels are therefore read from the fitted model instead of
# being hard-coded, so the mapping stays right whatever the level order of the
# training data was.
resp.levels <- levels(model.response(model.frame(log.reg.imb)))
pred.logit <- rep(resp.levels[1], length(test.probs))
pred.logit[test.probs >= 0.5] <- resp.levels[2]
# Give the predictions the same level set as the observed labels: the table
# then stays square even if only one class is ever predicted, which the
# diag()-based accuracy below relies on.
pred.logit <- factor(pred.logit, levels = levels(factor(val$ifOperStatus)))
# NOTE(review): `class` is not used again in the visible script and shares its
# name with base::class(); kept in case later code reads it.
class <- table(pred.logit, val$ifOperStatus)
# Rows = observed status, columns = predicted status.
# This object shares its name with caret::confusionMatrix(); R still finds the
# function when it is called, because non-function objects are skipped there.
confusionMatrix <- ftable(val$ifOperStatus, pred.logit)
confusionMatrix
##      pred.logit down   up
##                          
## down             135  162
## up                54 4213
# Share of validation rows on the diagonal (observed == predicted)
accuracy <- sum(diag(confusionMatrix)) / sum(confusionMatrix)
accuracy
## [1] 0.9526731
# Accuracy, precision and recall
# Balanced-data model, with predictions above 0.06 labelled as positive.
# NOTE(review): the classification tables earlier in this script map a
# probability >= 0.5 to "down" for this model but to "up" for the
# imbalanced-data model, so the two prediction vectors appear to score
# opposite classes - confirm which class accuracy.meas() treats as positive
# before comparing the two sets of figures.
accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.bal, threshold = 0.06)
## 
## Call: 
## accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.bal, 
##     threshold = 0.06)
## 
## Examples are labelled as positive when predicted is greater than 0.06 
## 
## precision: 0.917
## recall: 0.729
## F: 0.406
# Same measures for the imbalanced-data model at the same 0.06 cut-off
accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.imb, threshold = 0.06)
## 
## Call: 
## accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.imb, 
##     threshold = 0.06)
## 
## Examples are labelled as positive when predicted is greater than 0.06 
## 
## precision: 0.935
## recall: 1.000
## F: 0.483

# Cross Validation Test

# Method 1: cross-validated logistic regression with caret, then evaluation on
# the validation set
library(caret)
# Fixed seed so the random fold assignment is reproducible
set.seed(123)
# NOTE(review): "repeatedcv" is requested without a `repeats` argument -
# confirm whether more than one repeat of the 10-fold split was intended.
ctrl <- trainControl(method = "repeatedcv", number = 10, savePredictions = TRUE)
# tuneLength is omitted: caret's "glm" method has no tuning parameters to vary.
mod_fit <- train(ifOperStatus ~ ., data = dataset.rose, method = "glm",
                 family = "binomial", trControl = ctrl)
pred <- predict(mod_fit, newdata = val)
# Put predictions and reference on the same level order up front; this avoids
# the "Levels are not in the same order" warning and the implicit refactoring.
val.status <- factor(val$ifOperStatus)
pred <- factor(pred, levels = levels(val.status))
# Namespace-qualified because a variable named `confusionMatrix` is also
# assigned earlier in this script.
caret::confusionMatrix(data = pred, reference = val.status)
## Warning in confusionMatrix.default(data = pred, val$ifOperStatus): Levels
## are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction down   up
##       down  201  264
##       up     96 4003
##                                           
##                Accuracy : 0.9211          
##                  95% CI : (0.9129, 0.9288)
##     No Information Rate : 0.9349          
##     P-Value [Acc > NIR] : 0.9999          
##                                           
##                   Kappa : 0.4868          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.67677         
##             Specificity : 0.93813         
##          Pos Pred Value : 0.43226         
##          Neg Pred Value : 0.97658         
##              Prevalence : 0.06507         
##          Detection Rate : 0.04404         
##    Detection Prevalence : 0.10188         
##       Balanced Accuracy : 0.80745         
##                                           
##        'Positive' Class : down            
## 
#Method2 :
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
# K-fold cross-validation of the balanced-data model with boot::cv.glm.
# Fixed seed so the random fold assignment, and hence delta, is reproducible.
set.seed(123)
k    <- 3
kfCV <- cv.glm(data = dataset.rose, glmfit = log.reg.bal, K = k)
# delta holds two numbers: the raw cross-validation estimate of prediction
# error and an adjusted version of it (see the cv.glm documentation).
kfCV$delta
## [1] 0.1203667 0.1202992

# Prediction on new dataset

# Score the rows read from test.csv with the balanced-data model; the result
# is a vector of fitted probabilities (type = "response")
ptest <- predict(object = log.reg.bal, newdata = test1, type = "response")