library(rpart)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ROSE)
## Loaded ROSE 0.0-3
## Loading and preprocessing the data
setwd("D:/Users/gkokate/Desktop/Markdown")
build <- read.csv(file = "Build .csv", sep = ",", header = TRUE)
test1 <- read.csv(file = "test.csv", sep = ",", header = TRUE)
head(build)
## OpticalSignalLevel TxOpticalSignalLevel RxOpticalSignalLevel
## 1 -7711 1142 -171
## 2 -7703 1288 -170
## 3 -7703 1081 -170
## 4 -7703 1207 -171
## 5 -7688 1276 -170
## 6 -7688 1282 -169
## X15MinDnFwdByteCounter X15MinUpFwdByteCounter Distance ifOperStatus
## 1 8.888281 7.245204 38 up
## 2 9.544178 7.764763 38 up
## 3 8.915710 7.648214 38 up
## 4 7.555582 6.976296 38 up
## 5 7.475159 6.646812 38 up
## 6 7.236687 6.449259 38 up
# Split the data into training (70%) and validation (30%) sets
split <- sample(nrow(build), floor(nrow(build) * 0.7))  # random 70% of the row indices (no seed set here)
train <- build[split,]
val <- build[-split,]
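# Sketch (not part of the original run): a seeded, stratified split via
# caret::createDataPartition would make the 70/30 split reproducible and keep
# the class ratio in both pieces; the seed value and the object names
# train.strat/val.strat are illustrative only.
set.seed(123)
idx <- createDataPartition(build$ifOperStatus, p = 0.7, list = FALSE)
train.strat <- build[idx, ]
val.strat <- build[-idx, ]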
## Exploratory analysis: correlations and pairwise plots
library(corrplot)
C1 <- cor(build[,-7])
corrplot(C1, method = "number")
pairs(build[,-7], main = "Pairwise Independent Variable Analysis_ImbalData", pch = 21, bg = c("red", "green3", "blue"))
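# Sketch (assumption, not in the original analysis): caret::findCorrelation
# flags predictors whose pairwise correlation exceeds a cutoff, complementing
# the corrplot/pairs inspection above; the 0.9 cutoff is an arbitrary example.
high.cor <- findCorrelation(C1, cutoff = 0.9)
colnames(build[,-7])[high.cor]  # candidate predictors to drop, if any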
## Logistic Regression Model
# train logistic regression on imbalanced data
log.reg.imb <- glm(ifOperStatus ~ ., data = train, family = "binomial")
# use the trained model to predict val data
pred.log.reg.imb <- predict(log.reg.imb, newdata = val, type = "response")
# generate a new, more balanced dataset with ROSE (p = 0.30 is the expected share of the minority class)
dataset.rose <- ROSE(ifOperStatus ~ ., data = train, p = 0.30, seed = 123)$data
C2 <- cor(dataset.rose[,-7])
corrplot(C2, method = "number")
pairs(dataset.rose[,-7], main = "Pairwise Independent Variable Analysis_BalData", pch = 21, bg = c("red", "green3", "blue"))
# check (im)balance of new data
table(dataset.rose$ifOperStatus)
##
## up down
## 7506 3142
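# Sketch: comparing class proportions before and after ROSE makes the effect of
# p = 0.30 explicit (the minority class rises to roughly 30% of the data).
prop.table(table(train$ifOperStatus))         # original, imbalanced training data
prop.table(table(dataset.rose$ifOperStatus))  # ROSE-balanced data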
# train logistic regression on balanced data
log.reg.bal <- glm(ifOperStatus ~ ., data = dataset.rose, family = "binomial")
summary(log.reg.bal)
##
## Call:
## glm(formula = ifOperStatus ~ ., family = "binomial", data = dataset.rose)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7545 -0.5840 -0.3519 0.5111 3.1337
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.540e+00 6.336e-01 -4.008 6.12e-05 ***
## OpticalSignalLevel 3.795e-04 3.898e-05 9.736 < 2e-16 ***
## TxOpticalSignalLevel -4.209e-03 3.894e-04 -10.810 < 2e-16 ***
## RxOpticalSignalLevel -5.515e-02 1.937e-03 -28.476 < 2e-16 ***
## X15MinDnFwdByteCounter 3.926e-01 3.008e-02 13.054 < 2e-16 ***
## X15MinUpFwdByteCounter -2.407e-01 3.672e-02 -6.554 5.59e-11 ***
## Distance -7.376e-02 2.651e-03 -27.823 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12919 on 10647 degrees of freedom
## Residual deviance: 8477 on 10641 degrees of freedom
## AIC: 8491
##
## Number of Fisher Scoring iterations: 5
# use the trained model to predict val data
pred.log.reg.bal <- predict(log.reg.bal, newdata = val, type = "response")
# check accuracy of the two learners by measuring auc
roc.curve(val$ifOperStatus, pred.log.reg.imb)
## Area under the curve (AUC): 0.857
roc.curve(val$ifOperStatus, pred.log.reg.bal, add.roc = TRUE, col = 2)  # overlay the balanced-model ROC in a second colour
## Area under the curve (AUC): 0.858
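# Sketch (assumes the pROC package is installed; it is not loaded above): an
# independent AUC computation to cross-check ROSE::roc.curve. pROC::roc()
# chooses the level ordering and direction automatically and reports this in a message.
library(pROC)
auc(roc(val$ifOperStatus, pred.log.reg.imb))
auc(roc(val$ifOperStatus, pred.log.reg.bal))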
# Odds ratios for the balanced model
exp(coef(log.reg.bal))
## (Intercept) OpticalSignalLevel TxOpticalSignalLevel
## 0.07889755 1.00037957 0.99580002
## RxOpticalSignalLevel X15MinDnFwdByteCounter X15MinUpFwdByteCounter
## 0.94634614 1.48089031 0.78608999
## Distance
## 0.92889248
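# Sketch: odds ratios are easier to judge together with confidence intervals.
# confint() on a glm uses profile likelihood (via the MASS package) and can
# take a few seconds on a data set of this size.
exp(cbind(OR = coef(log.reg.bal), confint(log.reg.bal)))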
# Classification table for the balanced model (0.5 cutoff on the predicted probability of "down")
test.probs <- predict(log.reg.bal, val, type = "response")
pred.logit <- rep("up", length(test.probs))
pred.logit[test.probs >= 0.5] <- "down"
class <- table(pred.logit, val$ifOperStatus)
confusionMatrix <- ftable(val$ifOperStatus, pred.logit)  # rows = actual, columns = predicted
confusionMatrix
## pred.logit down up
##
## down 201 96
## up 264 4003
accuracy <- sum(diag(confusionMatrix))/sum(confusionMatrix)
accuracy
## [1] 0.9211218
# Classification table for the imbalanced model (0.5 cutoff on the predicted probability of "up")
test.probs <- predict(log.reg.imb, val, type = "response")
pred.logit <- rep("down", length(test.probs))
pred.logit[test.probs >= 0.5] <- "up"
class <- table(pred.logit, val$ifOperStatus)
confusionMatrix <- ftable(val$ifOperStatus, pred.logit)  # rows = actual, columns = predicted
confusionMatrix
## pred.logit down up
##
## down 135 162
## up 54 4213
accuracy <- sum(diag(confusionMatrix))/sum(confusionMatrix)
accuracy
## [1] 0.9526731
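# Sketch (hypothetical helper, not in the original run): the two blocks above
# apply the 0.5 cutoff to opposite labels because predict(type = "response")
# returns the probability of the second factor level, and the ROSE-generated
# data has its levels in a different order than the raw data (consistent with
# the caret warning further below). A small helper makes that choice explicit:
# 'positive' names the level whose probability the model returns.
classify <- function(probs, actual, positive, negative, cutoff = 0.5) {
  lev <- c(negative, positive)
  pred <- factor(ifelse(probs >= cutoff, positive, negative), levels = lev)
  act <- factor(as.character(actual), levels = lev)
  cm <- table(actual = act, predicted = pred)
  list(confusion = cm, accuracy = sum(diag(cm)) / sum(cm))
}
classify(pred.log.reg.bal, val$ifOperStatus, positive = "down", negative = "up")
classify(pred.log.reg.imb, val$ifOperStatus, positive = "up", negative = "down")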
# Precision, recall and F-measure via ROSE::accuracy.meas
accuracy.meas(val$ifOperStatus, pred.log.reg.bal, threshold = 0.06)
##
## Call:
## accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.bal,
## threshold = 0.06)
##
## Examples are labelled as positive when predicted is greater than 0.06
##
## precision: 0.917
## recall: 0.729
## F: 0.406
accuracy.meas(val$ifOperStatus, pred.log.reg.imb, threshold = 0.06)
##
## Call:
## accuracy.meas(response = val$ifOperStatus, predicted = pred.log.reg.imb,
## threshold = 0.06)
##
## Examples are labelled as positive when predicted is greater than 0.06
##
## precision: 0.935
## recall: 1.000
## F: 0.483
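# Sketch: precision, recall and the conventional F1 for the "down" class,
# computed directly from the balanced model's predictions at the 0.5 cutoff
# used in the classification tables above. (The F printed by accuracy.meas()
# appears to be precision*recall/(precision+recall), i.e. half of the usual
# F1; both F values printed above are consistent with that.)
pred.cls <- ifelse(pred.log.reg.bal >= 0.5, "down", "up")
tp <- sum(pred.cls == "down" & val$ifOperStatus == "down")
fp <- sum(pred.cls == "down" & val$ifOperStatus == "up")
fn <- sum(pred.cls == "up" & val$ifOperStatus == "down")
precision <- tp / (tp + fp)
recall <- tp / (tp + fn)
c(precision = precision, recall = recall, F1 = 2 * precision * recall / (precision + recall))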
# Method 1: repeated 10-fold cross-validation with caret
library(caret)  # already attached above
ctrl <- trainControl(method = "repeatedcv", number = 10, savePredictions = TRUE)
mod_fit <- train(ifOperStatus ~ ., data = dataset.rose, method = "glm", family = "binomial", trControl = ctrl, tuneLength = 5)  # note: glm has no tuning parameters, so tuneLength has no effect
pred <- predict(mod_fit, newdata = val)
confusionMatrix(data = pred, reference = val$ifOperStatus)
## Warning in confusionMatrix.default(data = pred, val$ifOperStatus): Levels
## are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction down up
## down 201 264
## up 96 4003
##
## Accuracy : 0.9211
## 95% CI : (0.9129, 0.9288)
## No Information Rate : 0.9349
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.4868
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.67677
## Specificity : 0.93813
## Pos Pred Value : 0.43226
## Neg Pred Value : 0.97658
## Prevalence : 0.06507
## Detection Rate : 0.04404
## Detection Prevalence : 0.10188
## Balanced Accuracy : 0.80745
##
## 'Positive' Class : down
##
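# Sketch (alternative configuration, not part of the original run): optimising
# on ROC instead of accuracy needs class probabilities and twoClassSummary;
# repeats = 3 and the object names are illustrative only.
ctrl.roc <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                         classProbs = TRUE, summaryFunction = twoClassSummary,
                         savePredictions = TRUE)
mod_fit_roc <- train(ifOperStatus ~ ., data = dataset.rose, method = "glm",
                     family = "binomial", trControl = ctrl.roc, metric = "ROC")
mod_fit_roc$results  # cross-validated ROC, sensitivity and specificity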
# Method 2: k-fold cross-validation with boot::cv.glm
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
k <- 3  # number of cross-validation folds
kfCV <- cv.glm(data = dataset.rose, glmfit = log.reg.bal, K = k)
kfCV$delta  # cross-validated prediction error (raw and bias-adjusted)
## [1] 0.1203667 0.1202992
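# Sketch: cv.glm's default delta is the average squared error of the predicted
# probabilities; supplying a cost function gives a cross-validated
# misclassification rate instead (0/1 response coding and 0.5 cutoff, as in the
# example on the ?cv.glm help page).
cost.01 <- function(y, prob) mean(abs(y - prob) > 0.5)
cv.glm(data = dataset.rose, glmfit = log.reg.bal, cost = cost.01, K = k)$delta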
ptest <- predict(log.reg.bal, test1, type = "response")  # predicted probabilities for the held-out test set
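# Sketch: turning the test-set probabilities into class labels, assuming the
# same convention as for the validation data above (predict() on the
# ROSE-trained model returns the probability of the "down" level).
ptest.class <- ifelse(ptest >= 0.5, "down", "up")
table(ptest.class)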