Loading and preprocessing the data

library(rpart)                     # recursive partitioning: rpart() fits the decision tree
library(caret)                     # model evaluation/tuning helpers: confusionMatrix(), train()

## Loading required package: lattice

## Loading required package: ggplot2

# NOTE(review): the working directory is a machine-specific absolute path;
# every relative path below (inputs and written answer files) resolves
# against it. Adjust before running elsewhere.
setwd("D:/Users/gkokate/Desktop/Markdown")

# Model-building data and the separate scoring data, both comma-separated
# with a header row.
# NOTE(review): "Build .csv" contains a space before the extension --
# presumably the real file name; verify on disk.
build <- read.csv("Build .csv", header = TRUE, sep = ",")
test1 <- read.csv("test.csv", header = TRUE, sep = ",")

# Preview the first rows of the model-building data
head(build)

##   gponOntAniOpInfoOpticalSignalLevel gponOntAniOpInfoTxOpticalSignalLevel
## 1                              -7711                                 1142
## 2                              -7703                                 1288
## 3                              -7703                                 1081
## 4                              -7703                                 1207
## 5                              -7688                                 1276
## 6                              -7688                                 1282
##   gponOntOltsideOpInfoRxOpticalSignalLevel X15MinDnFwdByteCounter
## 1                                     -171               8.888281
## 2                                     -170               9.544178
## 3                                     -170               8.915710
## 4                                     -171               7.555582
## 5                                     -170               7.475159
## 6                                     -169               7.236687
##   X15MinUpFwdByteCounter bponOntOpInfoDistance ifOperStatus
## 1               7.245204                    38           up
## 2               7.764763                    38           up
## 3               7.648214                    38           up
## 4               6.976296                    38           up
## 5               6.646812                    38           up
## 6               6.449259                    38           up

# The response must be a factor for a classification tree
build[["ifOperStatus"]] <- as.factor(build[["ifOperStatus"]])

# Random 70/30 partition of the rows: 70% for training, the rest for
# validation. The seed is fixed so the same rows are drawn on every run.
set.seed(100)
split <- sample.int(nrow(build), size = floor(0.7 * nrow(build)))
train <- build[split, ]
val <- build[-split, ]

Decision Tree Model

# Fit a classification tree for ifOperStatus on every other column of `train`.
# Splits use the Gini index and the class priors are fixed at (0.3, 0.7)
# -- presumably in factor-level order (down, up); confirm against levels().
# Equal priors, c(0.5, 0.5), were an alternative left here for reference.
# The seed is reset first, presumably so that rpart's cross-validated error
# estimates are repeatable.
set.seed(100)
mtree <- rpart(
  formula = ifOperStatus ~ .,
  data = train,
  method = "class",
  parms = list(prior = c(0.3, 0.7), split = "gini")
)

Confusion matrix for Accuracy of model on Validation data

# Predicted class for each validation row, then cross-tabulated against the
# observed status together with the usual accuracy statistics.
rpartpred <- predict(mtree, newdata = val, type = "class")
confusionMatrix(data = rpartpred, reference = val$ifOperStatus)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction down   up
##       down  235  358
##       up     35 3936
##                                           
##                Accuracy : 0.9139          
##                  95% CI : (0.9054, 0.9219)
##     No Information Rate : 0.9408          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.5043          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.87037         
##             Specificity : 0.91663         
##          Pos Pred Value : 0.39629         
##          Neg Pred Value : 0.99119         
##              Prevalence : 0.05916         
##          Detection Rate : 0.05149         
##    Detection Prevalence : 0.12993         
##       Balanced Accuracy : 0.89350         
##                                           
##        'Positive' Class : down            
##

Accuracy:

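Overall accuracy of the CART model on the validation dataset is about 91%. For the “Down” class, sensitivity (recall) is about 87%, but precision (positive predictive value) is only about 40%.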

library(rattle)

## Rattle: A free graphical interface for data mining with R.
## Version 4.1.0 Copyright (c) 2006-2015 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(rpart.plot)
library(RColorBrewer)
# Draw the fitted tree with the default node labels.
# NOTE(review): per the rpart.plot docs, faclen = 0 should print factor
# levels in full and extra = 1 should add per-class observation counts to
# each node -- confirm against the installed version.
prp(
  mtree,
  faclen = 0,
  cex = 0.8,
  extra = 1
)

# Node-label callback: appends the number of observations in each node to
# the label text that would otherwise be shown.
#
# Args:
#   x:      fitted tree object; only x$frame$n (per-node counts) is read.
#   labs:   character vector of the default node labels.
#   digits: accepted but not used here.
#   varlen: accepted but not used here.
#
# Returns: character vector, one "<label> \n\nn = <count>" string per node.
tot_count <- function(x, labs, digits, varlen) {
  node_sizes <- x$frame$n
  paste(labs, "\n\nn =", node_sizes)
}

Decision tree for network outage prediction:

# Same tree, but each node label now carries its total observation count
# (labels are produced by the tot_count callback).
prp(
  mtree,
  node.fun = tot_count,
  faclen = 0,
  cex = 0.8
)

# Alternative rendering of the same tree via rattle's fancyRpartPlot()
fancyRpartPlot(mtree)

# Decision Tree Rules:

1. If RxOpticalSignalLevel is less than -210, the network is predicted to go “Down”.

2. If RxOpticalSignalLevel is greater than -172 (and therefore also greater than -210), the network is predicted to be “Up”.

3. If Distance is less than 38 and RxOpticalSignalLevel is between -210 and -172, the network is predicted to be “Up”.

# Print the complexity-parameter (CP) table of the fitted tree: one row per
# candidate tree size with its relative and cross-validated error.
printcp(mtree)

## 
## Classification tree:
## rpart(formula = ifOperStatus ~ ., data = train, method = "class", 
##     parms = list(prior = c(0.3, 0.7), split = "gini"))
## 
## Variables actually used in tree construction:
## [1] bponOntOpInfoDistance                   
## [2] gponOntOltsideOpInfoRxOpticalSignalLevel
## [3] X15MinUpFwdByteCounter                  
## 
## Root node error: 3194.4/10648 = 0.3
## 
## n= 10648 
## 
##        CP nsplit rel error  xerror     xstd
## 1 0.60477      0   1.00000 1.00000 0.037669
## 2 0.01131      1   0.39523 0.39523 0.022264
## 3 0.01000      4   0.36130 0.40131 0.020475

# Pick the complexity parameter whose row has the lowest cross-validated
# error (xerror) in the fitted tree's CP table.
cp_table <- mtree$cptable
best_row <- which.min(cp_table[, "xerror"])
bestcp <- cp_table[best_row, "CP"]

# Prune the tree back to that complexity and classify the validation rows
# with the pruned tree.
pruned <- prune(mtree, cp = bestcp)
predictions <- predict(pruned, newdata = val, type = "class")

# Optional diagnostics, left disabled:
#   prp(pruned, faclen = 0, cex = 0.8, extra = 1)
#   confusionMatrix(predictions, val$ifOperStatus)

AUC score of the CART model (area under the ROC curve)

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

# Class probabilities from the pruned tree for every validation row
val1 <- predict(pruned, newdata = val, type = "prob")

# ROCR prediction object built from the second probability column and the
# observed labels, then the area under the ROC curve.
pred_val <- prediction(predictions = val1[, 2], labels = val$ifOperStatus)
perf_val <- performance(pred_val, measure = "auc")
perf_val

## An object of class "performance"
## Slot "x.name":
## [1] "None"
## 
## Slot "y.name":
## [1] "Area under the ROC curve"
## 
## Slot "alpha.name":
## [1] "none"
## 
## Slot "x.values":
## list()
## 
## Slot "y.values":
## [[1]]
## [1] 0.8626516
## 
## 
## Slot "alpha.values":
## list()

Lift chart for identifying the cutoff point

# Lift against rate of positive predictions, coloured along the curve
plot(
  performance(pred_val, measure = "lift", x.measure = "rpp"),
  colorize = TRUE
)

# ROC curve: true positive rate against false positive rate.
# Note: this overwrites the earlier AUC object stored in perf_val.
perf_val <- performance(pred_val, measure = "tpr", x.measure = "fpr")
plot(perf_val, col = "green", lwd = 1.5)

# KS statistic: the largest gap between TPR (y values) and FPR (x values)
# along the ROC curve.
ks1.tree <- max(perf_val@y.values[[1]] - perf_val@x.values[[1]])
ks1.tree

## [1] 0.7253032

Cross Validation Test

#Method1
library(ROSE)

## Loaded ROSE 0.0-3

# Assess an rpart learner on the training data with ROSE.eval using the
# "BOOT" assessment method. extr.pred keeps the second column of the
# learner's predictions; the seed is fixed at 1.
ROSE.BOOT <- ROSE.eval(
  formula = ifOperStatus ~ .,
  data = train,
  learner = rpart,
  method.assess = "BOOT",
  extr.pred = function(obj) obj[, 2],
  seed = 1
)
# Method 2: 10-fold cross-validation of an rpart model through caret
library(caret)

# Resampling scheme and the single candidate complexity parameter (0.2)
tc <- trainControl(method = "cv", number = 10)
rpart.grid <- expand.grid(.cp = 0.2)

# NOTE(review): no set.seed() precedes this call, so the fold assignment --
# and therefore the reported accuracy -- may differ between runs.
train.rpart <- train(
  ifOperStatus ~ .,
  data = train,
  method = "rpart",
  trControl = tc,
  tuneGrid = rpart.grid
)
train.rpart

## CART 
## 
## 10648 samples
##     6 predictor
##     2 classes: 'down', 'up' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 9583, 9583, 9583, 9583, 9583, 9584, ... 
## Resampling results
## 
##   Accuracy   Kappa      Accuracy SD  Kappa SD  
##   0.9521978  0.6105373  0.005355055  0.04073637
## 
## Tuning parameter 'cp' was held constant at a value of 0.2
##

Model performance on test data

# Class-probability matrix for the new data: one row per observation in
# test1, one column per class.
ptest <- predict(mtree, test1)

# Predicted class label for each observation in test1.
# Fix: the answers used to be as.vector(ptest), which flattens the
# two-column probability matrix column by column into 2 * nrow(test1)
# numbers -- twice as many "answers" as test rows, and probabilities rather
# than predictions. Asking predict() for type = "class" yields exactly one
# label per row.
answers <- as.character(predict(mtree, test1, type = "class"))
# Write every element of `x` to its own text file in the working directory.
#
# Element i goes to "problem_id_<i>.txt" as a single line, without quotes,
# row names or column names.
#
# Args:
#   x: vector of answers; one file is written per element. An empty vector
#      writes nothing.
#
# Returns: NULL, invisibly. Called for its side effect of writing files.
pml_write_files <- function(x) {
  # seq_along() instead of 1:length(x): for empty input 1:0 would iterate
  # over c(1, 0) and write two bogus files.
  for (i in seq_along(x)) {
    filename <- paste0("problem_id_", i, ".txt")
    write.table(
      x[i],
      file = filename,
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE
    )
  }
  invisible(NULL)
}
# Write one answer file per element of `answers` into the working directory
pml_write_files(answers)

## Predicted class probabilities for the new (test) data
ptest

##           down        up
## 1   0.07318718 0.9268128
## 2   0.55563334 0.4443667
## 3   0.07318718 0.9268128
## 4   0.06539755 0.9346025
## 5   0.07318718 0.9268128
## 6   0.55563334 0.4443667
## 7   0.07318718 0.9268128
## 8   0.06539755 0.9346025
## 9   0.07318718 0.9268128
## 10  0.06539755 0.9346025
## 11  0.07318718 0.9268128
## 12  0.55563334 0.4443667
## 13  0.07318718 0.9268128
## 14  0.55563334 0.4443667
## 15  0.07318718 0.9268128
## 16  0.55563334 0.4443667
## 17  0.07318718 0.9268128
## 18  0.55563334 0.4443667
## 19  0.07318718 0.9268128
## 20  0.55563334 0.4443667
## 21  0.07318718 0.9268128
## 22  0.55563334 0.4443667
## 23  0.07318718 0.9268128
## 24  0.55563334 0.4443667
## 25  0.07318718 0.9268128
## 26  0.55563334 0.4443667
## 27  0.07318718 0.9268128
## 28  0.55563334 0.4443667
## 29  0.07318718 0.9268128
## 30  0.55563334 0.4443667
## 31  0.07318718 0.9268128
## 32  0.55563334 0.4443667
## 33  0.07318718 0.9268128
## 34  0.55563334 0.4443667
## 35  0.07318718 0.9268128
## 36  0.55563334 0.4443667
## 37  0.07318718 0.9268128
## 38  0.55563334 0.4443667
## 39  0.07318718 0.9268128
## 40  0.55563334 0.4443667
## 41  0.07318718 0.9268128
## 42  0.55563334 0.4443667
## 43  0.07318718 0.9268128
## 44  0.55563334 0.4443667
## 45  0.07318718 0.9268128
## 46  0.55563334 0.4443667
## 47  0.07318718 0.9268128
## 48  0.07318718 0.9268128
## 49  0.07318718 0.9268128
## 50  0.55563334 0.4443667
## 51  0.07318718 0.9268128
## 52  0.55563334 0.4443667
## 53  0.07318718 0.9268128
## 54  0.55563334 0.4443667
## 55  0.07318718 0.9268128
## 56  0.55563334 0.4443667
## 57  0.07318718 0.9268128
## 58  0.55563334 0.4443667
## 59  0.07318718 0.9268128
## 60  0.55563334 0.4443667
## 61  0.07318718 0.9268128
## 62  0.55563334 0.4443667
## 63  0.07318718 0.9268128
## 64  0.55563334 0.4443667
## 65  0.07318718 0.9268128
## 66  0.06539755 0.9346025
## 67  0.07318718 0.9268128
## 68  0.55563334 0.4443667
## 69  0.07318718 0.9268128
## 70  0.55563334 0.4443667
## 71  0.07318718 0.9268128
## 72  0.55563334 0.4443667
## 73  0.07318718 0.9268128
## 74  0.55563334 0.4443667
## 75  0.07318718 0.9268128
## 76  0.06539755 0.9346025
## 77  0.07318718 0.9268128
## 78  0.55563334 0.4443667
## 79  0.07318718 0.9268128
## 80  0.55563334 0.4443667
## 81  0.07318718 0.9268128
## 82  0.06539755 0.9346025
## 83  0.07318718 0.9268128
## 84  0.55563334 0.4443667
## 85  0.07318718 0.9268128
## 86  0.55563334 0.4443667
## 87  0.07318718 0.9268128
## 88  0.55563334 0.4443667
## 89  0.07318718 0.9268128
## 90  0.55563334 0.4443667
## 91  0.07318718 0.9268128
## 92  0.55563334 0.4443667
## 93  0.07318718 0.9268128
## 94  0.55563334 0.4443667
## 95  0.07318718 0.9268128
## 96  0.55563334 0.4443667
## 97  0.07318718 0.9268128
## 98  0.55563334 0.4443667
## 99  0.07318718 0.9268128
## 100 0.55563334 0.4443667
## 101 0.07318718 0.9268128
## 102 0.55563334 0.4443667
## 103 0.07318718 0.9268128
## 104 0.55563334 0.4443667
## 105 0.07318718 0.9268128
## 106 0.55563334 0.4443667
## 107 0.55563334 0.4443667
## 108 0.07318718 0.9268128
## 109 0.55563334 0.4443667
## 110 0.07318718 0.9268128
## 111 0.55563334 0.4443667
## 112 0.07318718 0.9268128
## 113 0.55563334 0.4443667
## 114 0.07318718 0.9268128
## 115 0.55563334 0.4443667
## 116 0.07318718 0.9268128
## 117 0.55563334 0.4443667
## 118 0.07318718 0.9268128
## 119 0.06539755 0.9346025
## 120 0.07318718 0.9268128

Classification And Decision Tree

COX

April 21, 2016