#Algorithms Implemented: Decision Trees(rpart), SVM,KNN, NB and RF 1.Implemented individually using 80-20 training-test ratio and compared the results 2.Then trained all these Models using same TrainControl and compared the results 3.A significant change in accuracy can be seen is comparison plots for both these techniques

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(purrr)
library(printr)
## Warning: package 'printr' was built under R version 4.3.3
## Registered S3 method overwritten by 'printr':
##   method                from     
##   knit_print.data.frame rmarkdown
library(pROC) 
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ROCR) 
## Warning: package 'ROCR' was built under R version 4.3.3
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
library(class)
## Warning: package 'class' was built under R version 4.3.3
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(RColorBrewer)
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(e1071)

data = read.csv("D:/MS Sem 2/Data Minning/Projects/Project 4/churn.csv", stringsAsFactors=TRUE)
# show dimension, datatype, content of the data set
str(data)
## 'data.frame':    3333 obs. of  21 variables:
##  $ State         : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ Account.Length: int  128 107 137 84 75 118 121 147 117 141 ...
##  $ Area.Code     : int  415 415 415 408 415 510 510 415 408 415 ...
##  $ Phone         : Factor w/ 3333 levels "327-1058","327-1319",..: 1927 1576 1118 1708 111 2254 1048 81 292 118 ...
##  $ Int.l.Plan    : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ VMail.Plan    : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ VMail.Message : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ Day.Mins      : num  265 162 243 299 167 ...
##  $ Day.Calls     : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ Day.Charge    : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ Eve.Mins      : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ Eve.Calls     : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ Eve.Charge    : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ Night.Mins    : num  245 254 163 197 187 ...
##  $ Night.Calls   : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ Night.Charge  : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ Intl.Mins     : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ Intl.Calls    : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ Intl.Charge   : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ CustServ.Calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ Churn.        : Factor w/ 2 levels "False.","True.": 1 1 1 1 1 1 1 1 1 1 ...
head(data)
State Account.Length Area.Code Phone Int.l.Plan VMail.Plan VMail.Message Day.Mins Day.Calls Day.Charge Eve.Mins Eve.Calls Eve.Charge Night.Mins Night.Calls Night.Charge Intl.Mins Intl.Calls Intl.Charge CustServ.Calls Churn.
KS 128 415 382-4657 no yes 25 265.1 110 45.07 197.4 99 16.78 244.7 91 11.01 10.0 3 2.70 1 False.
OH 107 415 371-7191 no yes 26 161.6 123 27.47 195.5 103 16.62 254.4 103 11.45 13.7 3 3.70 1 False.
NJ 137 415 358-1921 no no 0 243.4 114 41.38 121.2 110 10.30 162.6 104 7.32 12.2 5 3.29 0 False.
OH 84 408 375-9999 yes no 0 299.4 71 50.90 61.9 88 5.26 196.9 89 8.86 6.6 7 1.78 2 False.
OK 75 415 330-6626 yes no 0 166.7 113 28.34 148.3 122 12.61 186.9 121 8.41 10.1 3 2.73 3 False.
AL 118 510 391-8027 yes no 0 223.4 98 37.98 220.6 101 18.75 203.9 118 9.18 6.3 6 1.70 0 False.
# detect missing value
knitr::kable(sapply(data, function(x) sum(is.na(x))), col.names = c("Missing Value Count"))
Missing Value Count
State 0
Account.Length 0
Area.Code 0
Phone 0
Int.l.Plan 0
VMail.Plan 0
VMail.Message 0
Day.Mins 0
Day.Calls 0
Day.Charge 0
Eve.Mins 0
Eve.Calls 0
Eve.Charge 0
Night.Mins 0
Night.Calls 0
Night.Charge 0
Intl.Mins 0
Intl.Calls 0
Intl.Charge 0
CustServ.Calls 0
Churn. 0
# show summary statistics of the variables 
summary(data)
State Account.Length Area.Code Phone Int.l.Plan VMail.Plan VMail.Message Day.Mins Day.Calls Day.Charge Eve.Mins Eve.Calls Eve.Charge Night.Mins Night.Calls Night.Charge Intl.Mins Intl.Calls Intl.Charge CustServ.Calls Churn.
WV : 106 Min. : 1.0 Min. :408.0 327-1058: 1 no :3010 no :2411 Min. : 0.000 Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 23.2 Min. : 33.0 Min. : 1.040 Min. : 0.00 Min. : 0.000 Min. :0.000 Min. :0.000 False.:2850
MN : 84 1st Qu.: 74.0 1st Qu.:408.0 327-1319: 1 yes: 323 yes: 922 1st Qu.: 0.000 1st Qu.:143.7 1st Qu.: 87.0 1st Qu.:24.43 1st Qu.:166.6 1st Qu.: 87.0 1st Qu.:14.16 1st Qu.:167.0 1st Qu.: 87.0 1st Qu.: 7.520 1st Qu.: 8.50 1st Qu.: 3.000 1st Qu.:2.300 1st Qu.:1.000 True. : 483
NY : 83 Median :101.0 Median :415.0 327-3053: 1 NA NA Median : 0.000 Median :179.4 Median :101.0 Median :30.50 Median :201.4 Median :100.0 Median :17.12 Median :201.2 Median :100.0 Median : 9.050 Median :10.30 Median : 4.000 Median :2.780 Median :1.000 NA
AL : 80 Mean :101.1 Mean :437.2 327-3587: 1 NA NA Mean : 8.099 Mean :179.8 Mean :100.4 Mean :30.56 Mean :201.0 Mean :100.1 Mean :17.08 Mean :200.9 Mean :100.1 Mean : 9.039 Mean :10.24 Mean : 4.479 Mean :2.765 Mean :1.563 NA
OH : 78 3rd Qu.:127.0 3rd Qu.:510.0 327-3850: 1 NA NA 3rd Qu.:20.000 3rd Qu.:216.4 3rd Qu.:114.0 3rd Qu.:36.79 3rd Qu.:235.3 3rd Qu.:114.0 3rd Qu.:20.00 3rd Qu.:235.3 3rd Qu.:113.0 3rd Qu.:10.590 3rd Qu.:12.10 3rd Qu.: 6.000 3rd Qu.:3.270 3rd Qu.:2.000 NA
OR : 78 Max. :243.0 Max. :510.0 327-3954: 1 NA NA Max. :51.000 Max. :350.8 Max. :165.0 Max. :59.64 Max. :363.7 Max. :170.0 Max. :30.91 Max. :395.0 Max. :175.0 Max. :17.770 Max. :20.00 Max. :20.000 Max. :5.400 Max. :9.000 NA
(Other):2824 NA NA (Other) :3327 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
#Data Processing

#Dropping non-useful variables
data = data[, !names(data) %in% c('State', 'Phone', 'Area.Code')]

#2. Data Scaling
# Select variables for scaling
numeric_data <- as.data.frame(lapply(data, as.numeric))

# Scale the numeric variables
scaled_data <- as.data.frame(scale(numeric_data))

# Combine scaled numeric variables with non-numeric variables
#scaled_data <- cbind(Int.l.Plan = data$Int.l.Plan,VMail.Plan = data$VMail.Plan, scaled_data, Churn. = data$Churn.)

# Convert Churn. back to factor
scaled_data$Churn. <- as.factor(scaled_data$Churn.)

# Check the structure of the scaled dataset
str(scaled_data)
## 'data.frame':    3333 obs. of  18 variables:
##  $ Account.Length: num  0.676 0.149 0.902 -0.429 -0.655 ...
##  $ Int.l.Plan    : num  -0.328 -0.328 -0.328 3.052 3.052 ...
##  $ VMail.Plan    : num  1.617 1.617 -0.618 -0.618 -0.618 ...
##  $ VMail.Message : num  1.235 1.308 -0.592 -0.592 -0.592 ...
##  $ Day.Mins      : num  1.567 -0.334 1.168 2.196 -0.24 ...
##  $ Day.Calls     : num  0.477 1.124 0.676 -1.467 0.626 ...
##  $ Day.Charge    : num  1.567 -0.334 1.168 2.196 -0.24 ...
##  $ Eve.Mins      : num  -0.0706 -0.1081 -1.5731 -2.7425 -1.0388 ...
##  $ Eve.Calls     : num  -0.0559 0.1448 0.4962 -0.6081 1.0985 ...
##  $ Eve.Charge    : num  -0.0704 -0.1075 -1.5737 -2.7429 -1.0378 ...
##  $ Night.Mins    : num  0.8666 1.0584 -0.7568 -0.0785 -0.2763 ...
##  $ Night.Calls   : num  -0.465 0.148 0.199 -0.568 1.068 ...
##  $ Night.Charge  : num  0.8659 1.0592 -0.7555 -0.0788 -0.2765 ...
##  $ Intl.Mins     : num  -0.085 1.2403 0.703 -1.3028 -0.0492 ...
##  $ Intl.Calls    : num  -0.601 -0.601 0.212 1.024 -0.601 ...
##  $ Intl.Charge   : num  -0.0857 1.241 0.6971 -1.3062 -0.0459 ...
##  $ CustServ.Calls: num  -0.428 -0.428 -1.188 0.332 1.092 ...
##  $ Churn.        : Factor w/ 2 levels "-0.411610054566873",..: 1 1 1 1 1 1 1 1 1 1 ...
#Check the summary of the scaled dataset
summary(scaled_data)
Account.Length Int.l.Plan VMail.Plan VMail.Message Day.Mins Day.Calls Day.Charge Eve.Mins Eve.Calls Eve.Charge Night.Mins Night.Calls Night.Charge Intl.Mins Intl.Calls Intl.Charge CustServ.Calls Churn.
Min. :-2.512795 Min. :-0.3275 Min. :-0.6183 Min. :-0.5917 Min. :-3.300601 Min. :-5.00450 Min. :-3.300667 Min. :-3.963027 Min. :-5.025157 Min. :-3.963085 Min. :-3.513121 Min. :-3.429355 Min. :-3.514838 Min. :-3.66686 Min. :-1.8200 Min. :-3.66766 Min. :-1.1880 -0.411610054566873:2850
1st Qu.:-0.679643 1st Qu.:-0.3275 1st Qu.:-0.6183 1st Qu.:-0.5917 1st Qu.:-0.662325 1st Qu.:-0.66947 1st Qu.:-0.662277 1st Qu.:-0.677928 1st Qu.:-0.658262 1st Qu.:-0.678211 1st Qu.:-0.669754 1st Qu.:-0.669834 1st Qu.:-0.667579 1st Qu.:-0.62228 1st Qu.:-0.6011 1st Qu.:-0.61634 1st Qu.:-0.4279 2.42875498036354 : 483
Median :-0.001627 Median :-0.3275 Median :-0.6183 Median :-0.5917 Median :-0.006887 Median : 0.02812 Median :-0.006729 Median : 0.008275 Median :-0.005738 Median : 0.008458 Median : 0.006485 Median :-0.005504 Median : 0.004691 Median : 0.02246 Median :-0.1948 Median : 0.02046 Median :-0.4279 NA
Mean : 0.000000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.000000 Mean : 0.00000 Mean : 0.000000 Mean : 0.000000 Mean : 0.000000 Mean : 0.000000 Mean : 0.000000 Mean : 0.000000 Mean : 0.000000 Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 NA
3rd Qu.: 0.651276 3rd Qu.:-0.3275 3rd Qu.: 1.6168 3rd Qu.: 0.8694 3rd Qu.: 0.672419 3rd Qu.: 0.67588 3rd Qu.: 0.672578 3rd Qu.: 0.676731 3rd Qu.: 0.696981 3rd Qu.: 0.676568 3rd Qu.: 0.680746 3rd Qu.: 0.658825 3rd Qu.: 0.681354 3rd Qu.: 0.66720 3rd Qu.: 0.6178 3rd Qu.: 0.67052 3rd Qu.: 0.3323 NA
Max. : 3.564231 Max. : 3.0522 Max. : 1.6168 Max. : 3.1341 Max. : 3.139950 Max. : 3.21711 Max. : 3.140331 Max. : 3.208584 Max. : 3.507855 Max. : 3.207498 Max. : 3.838505 Max. : 3.827165 Max. : 3.836188 Max. : 3.49687 Max. : 6.3061 Max. : 3.49630 Max. : 5.6535 NA
# data encoding to numeric values
#data_n <- as.data.frame(lapply(scaled_data, as.numeric))
#str(data_n)
#summary(data_n$Churn.)



#Machine Learning - Classification

#To train a classification model, there is mainly three steps:
# 1. Splitting Data into Training and Testing Set
#2. Model Training/ Tuning
#3. Model Testing

set.seed(1234)
trainIndex <- createDataPartition(scaled_data$Churn., p = 0.8, list = FALSE, times = 1)
training_data <- data[ trainIndex,]
testing_data  <- data[-trainIndex,]

# Check if the splitting process is correct
prop.table(table(training_data$Churn.))
False. True.
0.8548931 0.1451069
prop.table(table(testing_data$Churn.))
False. True.
0.8558559 0.1441441
# 1: Decision Tree

Dtree = rpart(Churn.~ Account.Length + Int.l.Plan + VMail.Plan + VMail.Message +  Day.Mins+Day.Calls +Day.Charge + Eve.Mins +Eve.Calls+Eve.Charge +Night.Mins+Night.Calls +Night.Charge+Intl.Mins+Intl.Calls+Intl.Charge +CustServ.Calls+Churn., data = training_data, method = "class")
## Warning in model.matrix.default(attr(frame, "terms"), frame): the response
## appeared on the right-hand side and was dropped
## Warning in model.matrix.default(attr(frame, "terms"), frame): problem with term
## 18 in model.matrix: no columns are assigned
## Warning in cats * !isord: longer object length is not a multiple of shorter
## object length
printcp(Dtree)
## 
## Classification tree:
## rpart(formula = Churn. ~ Account.Length + Int.l.Plan + VMail.Plan + 
##     VMail.Message + Day.Mins + Day.Calls + Day.Charge + Eve.Mins + 
##     Eve.Calls + Eve.Charge + Night.Mins + Night.Calls + Night.Charge + 
##     Intl.Mins + Intl.Calls + Intl.Charge + CustServ.Calls + Churn., 
##     data = training_data, method = "class")
## 
## Variables actually used in tree construction:
## [1] CustServ.Calls Day.Mins       Eve.Mins       Int.l.Plan     Intl.Calls    
## [6] Intl.Mins      Night.Mins     VMail.Plan    
## 
## Root node error: 387/2667 = 0.14511
## 
## n= 2667 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.078811      0   1.00000 1.00000 0.047000
## 2 0.076227      2   0.84238 0.88889 0.044728
## 3 0.052972      4   0.68992 0.70543 0.040450
## 4 0.021964      7   0.49354 0.50904 0.034903
## 5 0.020672      9   0.44961 0.50129 0.034657
## 6 0.019811     10   0.42894 0.50129 0.034657
## 7 0.015504     13   0.36951 0.45478 0.033130
## 8 0.012920     14   0.35401 0.42894 0.032240
## 9 0.010000     15   0.34109 0.43152 0.032330
fancyRpartPlot(Dtree)

# Plot Full Tree
prp(Dtree, type = 1, extra = 1, under = TRUE, split.font = 2, varlen = 0) 

#Find the best pruned Decision Tree by selecting the tree that is having least cross validation error
set.seed(12345)
cv.ct <- rpart(Churn. ~ Account.Length + Int.l.Plan + VMail.Plan + VMail.Message +  Day.Mins+Day.Calls +Day.Charge + Eve.Mins +Eve.Calls+Eve.Charge +Night.Mins+Night.Calls +Night.Charge+Intl.Mins+Intl.Calls+Intl.Charge +CustServ.Calls+Churn., data = training_data, method = "class",  cp = 0.00001, minsplit = 5, xval = 5)
## Warning in model.matrix.default(attr(frame, "terms"), frame): the response
## appeared on the right-hand side and was dropped
## Warning in model.matrix.default(attr(frame, "terms"), frame): problem with term
## 18 in model.matrix: no columns are assigned
## Warning in cats * !isord: longer object length is not a multiple of shorter
## object length
printcp(cv.ct)
## 
## Classification tree:
## rpart(formula = Churn. ~ Account.Length + Int.l.Plan + VMail.Plan + 
##     VMail.Message + Day.Mins + Day.Calls + Day.Charge + Eve.Mins + 
##     Eve.Calls + Eve.Charge + Night.Mins + Night.Calls + Night.Charge + 
##     Intl.Mins + Intl.Calls + Intl.Charge + CustServ.Calls + Churn., 
##     data = training_data, method = "class", cp = 1e-05, minsplit = 5, 
##     xval = 5)
## 
## Variables actually used in tree construction:
##  [1] Account.Length CustServ.Calls Day.Calls      Day.Mins       Eve.Calls     
##  [6] Eve.Mins       Int.l.Plan     Intl.Calls     Intl.Mins      Night.Calls   
## [11] Night.Mins     VMail.Message  VMail.Plan    
## 
## Root node error: 387/2667 = 0.14511
## 
## n= 2667 
## 
##            CP nsplit rel error  xerror     xstd
## 1  0.07881137      0   1.00000 1.00000 0.047000
## 2  0.07622739      2   0.84238 0.92765 0.045545
## 3  0.05297158      4   0.68992 0.77778 0.042225
## 4  0.02196382      7   0.49354 0.52713 0.035467
## 5  0.02067183      9   0.44961 0.48837 0.034242
## 6  0.01981051     10   0.42894 0.47804 0.033905
## 7  0.01550388     13   0.36951 0.44961 0.032954
## 8  0.01291990     14   0.35401 0.42377 0.032058
## 9  0.01033592     15   0.34109 0.42894 0.032240
## 10 0.00904393     17   0.32041 0.38760 0.030744
## 11 0.00516796     21   0.28424 0.38760 0.030744
## 12 0.00387597     26   0.25840 0.38501 0.030648
## 13 0.00258398     32   0.23514 0.40827 0.031503
## 14 0.00206718     42   0.20930 0.41085 0.031597
## 15 0.00172265     47   0.19897 0.41602 0.031782
## 16 0.00129199     50   0.19380 0.41860 0.031874
## 17 0.00119261     59   0.18088 0.44961 0.032954
## 18 0.00086133     82   0.14987 0.47028 0.033649
## 19 0.00001000     85   0.14729 0.49096 0.034326
# Prune by lowest cp
prune_dt <- prune(cv.ct,cp=cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
predict_dt <- predict(prune_dt, testing_data,type="class") 
length(prune_dt$frame$var[prune_dt$frame$var == "<leaf>"])
## [1] 27
prp(prune_dt, type = 1, extra = 1, split.font = 1, varlen = -10)

# Get row indices of testing_data$Churn. 
indices <- which(!is.na(testing_data$Churn.))

# Subset predict_dt using the row indices
predict_dt_subset <- predict_dt[indices]

# Calculate confusion matrix
cm_dt <- confusionMatrix(as.factor(testing_data$Churn.[indices]), as.factor(predict_dt_subset), positive='True.')
cm_dt
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False. True.
##     False.    559    11
##     True.      20    76
##                                           
##                Accuracy : 0.9535          
##                  95% CI : (0.9346, 0.9682)
##     No Information Rate : 0.8694          
##     P-Value [Acc > NIR] : 2.91e-13        
##                                           
##                   Kappa : 0.8037          
##                                           
##  Mcnemar's Test P-Value : 0.1508          
##                                           
##             Sensitivity : 0.8736          
##             Specificity : 0.9655          
##          Pos Pred Value : 0.7917          
##          Neg Pred Value : 0.9807          
##              Prevalence : 0.1306          
##          Detection Rate : 0.1141          
##    Detection Prevalence : 0.1441          
##       Balanced Accuracy : 0.9195          
##                                           
##        'Positive' Class : True.           
## 
# Decision Tree Result
pred_dt <- predict(prune_dt, newdata= testing_data,type = "prob")[, 2]
Pred_val = prediction(pred_dt, testing_data$Churn.) 
plot(performance(Pred_val, "tpr", "fpr"),colorize=TRUE)
abline(0, 1, lty = 2)
auc_train <- round(as.numeric(performance(Pred_val, "auc")@y.values),2)
legend(.8, .2, auc_train, title = "AUC", cex=1)

#2: Support Vector Machine

library(e1071) 
library(ISLR) 
## Warning: package 'ISLR' was built under R version 4.3.3
learn_svm <- svm(factor(Churn.)~.,data=training_data) 
predict_svm <- predict(learn_svm, testing_data,type ="response") 

#SVM Results
# Get row indices of testing_data$Churn. 
indices <- which(!is.na(testing_data$Churn.))

# Subset predict_dt using the row indices
predict_svm_subset <- predict_svm[indices]

# Calculate confusion matrix
cm_svm <- confusionMatrix(as.factor(testing_data$Churn.[indices]), as.factor(predict_svm_subset), positive='True.')
cm_svm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False. True.
##     False.    567     3
##     True.      42    54
##                                           
##                Accuracy : 0.9324          
##                  95% CI : (0.9106, 0.9503)
##     No Information Rate : 0.9144          
##     P-Value [Acc > NIR] : 0.05196         
##                                           
##                   Kappa : 0.6705          
##                                           
##  Mcnemar's Test P-Value : 1.473e-08       
##                                           
##             Sensitivity : 0.94737         
##             Specificity : 0.93103         
##          Pos Pred Value : 0.56250         
##          Neg Pred Value : 0.99474         
##              Prevalence : 0.08559         
##          Detection Rate : 0.08108         
##    Detection Prevalence : 0.14414         
##       Balanced Accuracy : 0.93920         
##                                           
##        'Positive' Class : True.           
## 
pred_ROCR  <- prediction(as.numeric(predict_svm), as.numeric(testing_data$Churn.))
roc_ROCR <- performance(pred_ROCR, measure = "tpr", x.measure = "fpr")
auc_train <- round(as.numeric(performance(pred_ROCR, "auc")@y.values),2)
plot(roc_ROCR, main = "ROC curve", colorize = T)
abline(a = 0, b = 1)
legend(.8, .2, auc_train, title = "AUC", cex=1)

# 3: Naive Bayes

# Train Naive Bayes model
learn_nb <- naiveBayes(factor(Churn.) ~ ., data = training_data)

# Predict using Naive Bayes model
predict_nb <- predict(learn_nb, testing_data, type = "class")

# Convert predicted values to factor with levels matching the actual data
predict_nb_factor <- factor(predict_nb, levels = c("True.", "False."))

# Convert actual data to factor with levels matching the predicted values
actual_data_factor <- factor(testing_data$Churn., levels = c("True.", "False."))

# Create confusion matrix for Naive Bayes
cm_nb <- confusionMatrix(predict_nb_factor, actual_data_factor, positive = 'True.')

# Display Naive Bayes confusion matrix
cm_nb
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction True. False.
##     True.     40     21
##     False.    56    549
##                                           
##                Accuracy : 0.8844          
##                  95% CI : (0.8576, 0.9077)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 0.0184448       
##                                           
##                   Kappa : 0.4477          
##                                           
##  Mcnemar's Test P-Value : 0.0001068       
##                                           
##             Sensitivity : 0.41667         
##             Specificity : 0.96316         
##          Pos Pred Value : 0.65574         
##          Neg Pred Value : 0.90744         
##              Prevalence : 0.14414         
##          Detection Rate : 0.06006         
##    Detection Prevalence : 0.09159         
##       Balanced Accuracy : 0.68991         
##                                           
##        'Positive' Class : True.           
## 
# Calculate ROC curve for Naive Bayes

pred_nb <- predict(learn_nb, newdata = testing_data, type = "raw")
pred_nb_positive <- as.numeric(pred_nb[, 1])
Pred_val_nb <- prediction(pred_nb_positive, as.numeric(testing_data$Churn.))
perf_nb <- performance(Pred_val_nb, "tpr", "fpr")
plot(perf_nb, colorize = TRUE, main = "ROC curve for Naive Bayes")
abline(0, 1, lty = 2)
auc_nb <- round(as.numeric(performance(Pred_val_nb, "auc")@y.values), 2)
legend(.8, .2, auc_nb, title = "AUC", cex = 1)

# 4: K-Nearest Neighbors (KNN)
# Train KNN model
training_data_n<- as.data.frame(lapply(training_data, as.numeric))
str(training_data_n)
## 'data.frame':    2667 obs. of  18 variables:
##  $ Account.Length: num  128 107 84 75 118 121 147 117 141 65 ...
##  $ Int.l.Plan    : num  1 1 2 2 2 1 2 1 2 1 ...
##  $ VMail.Plan    : num  2 2 1 1 1 2 1 1 2 1 ...
##  $ VMail.Message : num  25 26 0 0 0 24 0 0 37 0 ...
##  $ Day.Mins      : num  265 162 299 167 223 ...
##  $ Day.Calls     : num  110 123 71 113 98 88 79 97 84 137 ...
##  $ Day.Charge    : num  45.1 27.5 50.9 28.3 38 ...
##  $ Eve.Mins      : num  197.4 195.5 61.9 148.3 220.6 ...
##  $ Eve.Calls     : num  99 103 88 122 101 108 94 80 111 83 ...
##  $ Eve.Charge    : num  16.78 16.62 5.26 12.61 18.75 ...
##  $ Night.Mins    : num  245 254 197 187 204 ...
##  $ Night.Calls   : num  91 103 89 121 118 118 96 90 97 111 ...
##  $ Night.Charge  : num  11.01 11.45 8.86 8.41 9.18 ...
##  $ Intl.Mins     : num  10 13.7 6.6 10.1 6.3 7.5 7.1 8.7 11.2 12.7 ...
##  $ Intl.Calls    : num  3 3 7 3 6 7 6 4 5 6 ...
##  $ Intl.Charge   : num  2.7 3.7 1.78 2.73 1.7 2.03 1.92 2.35 3.02 3.43 ...
##  $ CustServ.Calls: num  1 1 2 3 0 3 0 1 0 4 ...
##  $ Churn.        : num  1 1 1 1 1 1 1 1 1 2 ...
testing_data_n<- as.data.frame(lapply(testing_data, as.numeric))
str(testing_data_n)
## 'data.frame':    666 obs. of  18 variables:
##  $ Account.Length: num  137 62 73 20 12 119 97 81 125 174 ...
##  $ Int.l.Plan    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ VMail.Plan    : num  1 1 1 1 1 1 2 1 1 1 ...
##  $ VMail.Message : num  0 0 0 0 0 0 24 0 0 0 ...
##  $ Day.Mins      : num  243 121 224 190 250 ...
##  $ Day.Calls     : num  114 70 90 109 118 114 135 67 103 97 ...
##  $ Day.Charge    : num  41.4 20.5 38.1 32.3 42.4 ...
##  $ Eve.Mins      : num  121 307 160 258 252 ...
##  $ Eve.Calls     : num  110 76 88 84 119 117 58 85 126 94 ...
##  $ Eve.Charge    : num  10.3 26.1 13.6 21.9 21.4 ...
##  $ Night.Mins    : num  163 203 193 182 280 ...
##  $ Night.Calls   : num  104 99 74 102 90 91 79 98 95 54 ...
##  $ Night.Charge  : num  7.32 9.14 8.68 8.17 12.61 ...
##  $ Intl.Mins     : num  12.2 13.1 13 6.3 11.8 8.8 11 10.2 12 11.4 ...
##  $ Intl.Calls    : num  5 6 2 6 3 3 3 3 8 4 ...
##  $ Intl.Charge   : num  3.29 3.54 3.51 1.7 3.19 2.38 2.97 2.75 3.24 3.08 ...
##  $ CustServ.Calls: num  0 4 1 0 1 5 1 1 1 1 ...
##  $ Churn.        : num  1 1 1 1 2 2 1 1 1 1 ...
# Ensure Churn. is a factor in both training and testing datasets
training_data_n$Churn. <- as.factor(training_data_n$Churn.)
testing_data_n$Churn. <- as.factor(testing_data_n$Churn.)
dim(training_data_n)
## [1] 2667   18
dim(testing_data_n)
## [1] 666  18
# Train the KNN model with k = 5
learn_knn <- knn(train = training_data_n[,-18], 
                 test = testing_data_n[,-18], 
                 cl = training_data_n$Churn., 
                 k = 5)

# Check levels of 'Churn.' variable in testing_data_n
levels(testing_data_n$Churn.)
## [1] "1" "2"
# Check levels of 'Churn.' variable in learn_knn
levels(learn_knn)
## [1] "1" "2"
# Calculate confusion matrix for KNN
cm_knn <- confusionMatrix(learn_knn, testing_data_n$Churn., positive = '2')

# Display KNN confusion matrix
cm_knn
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2
##          1 555  65
##          2  15  31
##                                           
##                Accuracy : 0.8799          
##                  95% CI : (0.8527, 0.9036)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 0.04114         
##                                           
##                   Kappa : 0.3786          
##                                           
##  Mcnemar's Test P-Value : 4.293e-08       
##                                           
##             Sensitivity : 0.32292         
##             Specificity : 0.97368         
##          Pos Pred Value : 0.67391         
##          Neg Pred Value : 0.89516         
##              Prevalence : 0.14414         
##          Detection Rate : 0.04655         
##    Detection Prevalence : 0.06907         
##       Balanced Accuracy : 0.64830         
##                                           
##        'Positive' Class : 2               
## 
# Calculate ROC curve for KNN
pred_knn <- knn(train = training_data_n[, -ncol(training_data_n)], 
                test = testing_data_n[, -ncol(testing_data_n)], 
                cl = training_data_n$Churn., 
                k = 5, prob = TRUE)

# Extract predicted probabilities for the positive class ("True.")
pred_probs <- as.numeric(pred_knn)

# Create the prediction object
Pred_val_knn <- prediction(pred_probs, testing_data_n$Churn.)

plot(performance(Pred_val_knn, "tpr", "fpr"), colorize = TRUE)
abline(0, 1, lty = 2)
auc_knn <- round(as.numeric(performance(Pred_val_knn, "auc")@y.values), 2)
legend(.8, .2, auc_knn, title = "AUC", cex = 1)

# Results
str(cm_svm)
## List of 6
##  $ positive: chr "True."
##  $ table   : 'table' int [1:2, 1:2] 567 42 3 54
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ Prediction: chr [1:2] "False." "True."
##   .. ..$ Reference : chr [1:2] "False." "True."
##  $ overall : Named num [1:7] 0.932 0.67 0.911 0.95 0.914 ...
##   ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
##  $ byClass : Named num [1:11] 0.947 0.931 0.562 0.995 0.562 ...
##   ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
##  $ mode    : chr "sens_spec"
##  $ dots    : list()
##  - attr(*, "class")= chr "confusionMatrix"
str(cm_dt)
## List of 6
##  $ positive: chr "True."
##  $ table   : 'table' int [1:2, 1:2] 559 20 11 76
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ Prediction: chr [1:2] "False." "True."
##   .. ..$ Reference : chr [1:2] "False." "True."
##  $ overall : Named num [1:7] 0.953 0.804 0.935 0.968 0.869 ...
##   ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
##  $ byClass : Named num [1:11] 0.874 0.965 0.792 0.981 0.792 ...
##   ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
##  $ mode    : chr "sens_spec"
##  $ dots    : list()
##  - attr(*, "class")= chr "confusionMatrix"
str(cm_nb)
## List of 6
##  $ positive: chr "True."
##  $ table   : 'table' int [1:2, 1:2] 40 56 21 549
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ Prediction: chr [1:2] "True." "False."
##   .. ..$ Reference : chr [1:2] "True." "False."
##  $ overall : Named num [1:7] 0.884 0.448 0.858 0.908 0.856 ...
##   ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
##  $ byClass : Named num [1:11] 0.417 0.963 0.656 0.907 0.656 ...
##   ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
##  $ mode    : chr "sens_spec"
##  $ dots    : list()
##  - attr(*, "class")= chr "confusionMatrix"
str(cm_knn)
## List of 6
##  $ positive: chr "2"
##  $ table   : 'table' int [1:2, 1:2] 555 15 65 31
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ Prediction: chr [1:2] "1" "2"
##   .. ..$ Reference : chr [1:2] "1" "2"
##  $ overall : Named num [1:7] 0.88 0.379 0.853 0.904 0.856 ...
##   ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
##  $ byClass : Named num [1:11] 0.323 0.974 0.674 0.895 0.674 ...
##   ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
##  $ mode    : chr "sens_spec"
##  $ dots    : list()
##  - attr(*, "class")= chr "confusionMatrix"
# Accuracy
svm_accuracy <- cm_svm$overall['Accuracy']
dt_accuracy <- cm_dt$overall['Accuracy']
nv_accuracy <- cm_nb$overall['Accuracy']
knn_accuracy <- cm_knn$overall['Accuracy']

# Precision
svm_precision <- cm_svm$byClass['Pos Pred Value']
dt_precision <- cm_dt$byClass['Pos Pred Value']
nv_precision <- cm_nb$byClass['Pos Pred Value']
knn_precision <- cm_knn$byClass['Pos Pred Value']

# Recall
svm_recall <- cm_svm$byClass['Sensitivity']
dt_recall <- cm_dt$byClass['Sensitivity']
nv_recall <- cm_nb$byClass['Sensitivity']
knn_recall <- cm_knn$byClass['Sensitivity']

# F1-score
svm_f1 <- cm_svm$byClass['F1']
dt_f1 <- cm_dt$byClass['F1']
nv_f1 <- cm_nb$byClass['F1']
knn_f1 <- cm_knn$byClass['F1']

# Print evaluation metrics
print("SVM Evaluation Metrics:")
## [1] "SVM Evaluation Metrics:"
print(paste("Accuracy:", svm_accuracy))
## [1] "Accuracy: 0.932432432432432"
print(paste("Precision:", svm_precision))
## [1] "Precision: 0.5625"
print(paste("Recall:", svm_recall))
## [1] "Recall: 0.947368421052632"
print(paste("F1-score:", svm_f1))
## [1] "F1-score: 0.705882352941177"
print("Decision Tree Evaluation Metrics:")
## [1] "Decision Tree Evaluation Metrics:"
print(paste("Accuracy:", dt_accuracy))
## [1] "Accuracy: 0.953453453453453"
print(paste("Precision:", dt_precision))
## [1] "Precision: 0.791666666666666"
print(paste("Recall:", dt_recall))
## [1] "Recall: 0.873563218390805"
print(paste("F1-score:", dt_f1))
## [1] "F1-score: 0.830601092896175"
print("Naive Bayes Evaluation Metrics:")
## [1] "Naive Bayes Evaluation Metrics:"
print(paste("Accuracy:", nv_accuracy))
## [1] "Accuracy: 0.884384384384384"
print(paste("Precision:", nv_precision))
## [1] "Precision: 0.655737704918033"
print(paste("Recall:", nv_recall))
## [1] "Recall: 0.416666666666667"
print(paste("F1-score:", nv_f1))
## [1] "F1-score: 0.509554140127389"
print("KNN Evaluation Metrics:")
## [1] "KNN Evaluation Metrics:"
print(paste("Accuracy:", knn_accuracy))
## [1] "Accuracy: 0.87987987987988"
print(paste("Precision:", knn_precision))
## [1] "Precision: 0.673913043478261"
print(paste("Recall:", knn_recall))
## [1] "Recall: 0.322916666666667"
# Define model names
model_names <- c("Decision Tree", "Naive Bayes", "KNN", "SVM")

# Assuming you have calculated metrics for multiple models and stored them in variables like dt_accuracy, dt_precision, dt_recall, dt_f1 for Decision Tree,
# and similarly for other models

# Define evaluation metrics for each model
accuracy <- c(dt_accuracy, nv_accuracy, knn_accuracy, svm_accuracy)
precision <- c(dt_precision, nv_precision, knn_precision, svm_precision)
recall <- c(dt_recall, nv_recall, knn_recall, svm_recall)
f1_score <- c(dt_f1, nv_f1, knn_f1, svm_f1)

# Create a data frame for plotting
comparison_df <- data.frame(Model = rep(model_names, each = 4),
                            Metric = rep(c("Accuracy", "Precision", "Recall", "F1-score"), times = 4),
                            Value = c(accuracy, precision, recall, f1_score))


# Load ggplot2 library
library(ggplot2)

# Create grouped bar plot
comparison_plot <- ggplot(comparison_df, aes(x = Model, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(title = "Comparison of Evaluation Metrics Across Models",
       x = "Model", y = "Value") +
  theme_minimal() +
  theme(legend.position = "top") +
  scale_fill_manual(values = c("Accuracy" = "blue", "Precision" = "green", "Recall" = "red", "F1-score" = "purple"))

# Print the plot
print(comparison_plot)

print(paste("F1-score:", knn_f1))
## [1] "F1-score: 0.436619718309859"
#Machine Learning Model Training

# Set seed for reproducibility
set.seed(123)

# Create the training set
train_data <- training_data

# Create the test set
test_data <- testing_data

# Set train control
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# CART model
set.seed(7)
cart_model <- train(Churn.~., data = train_data, method = "rpart", trControl = ctrl)

# SVM model
set.seed(7)
svm_model <- train(Churn.~., data = train_data, method = "svmRadial", trControl = ctrl)

# KNN model
set.seed(7)
knn_model <- train(Churn.~., data =train_data, method = "knn", trControl = ctrl)

# Random Forest model
set.seed(7)
rf_model <- train(Churn.~., data = train_data, method = "rf", trControl = ctrl)

# Train Naive Bayes model
#set.seed(7)
#nb_model <- train(Churn.~., data = train_data, method = "nb", trControl = ctrl)



print(cart_model)
## CART 
## 
## 2667 samples
##   17 predictor
##    2 classes: 'False.', 'True.' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.05297158  0.9125074  0.5786001
##   0.07622739  0.8702665  0.2430818
##   0.07881137  0.8622666  0.1627132
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05297158.
print(svm_model)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 2667 samples
##   17 predictor
##    2 classes: 'False.', 'True.' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ... 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa    
##   0.25  0.8752678  0.2180105
##   0.50  0.9017619  0.4632688
##   1.00  0.9130115  0.5555887
## 
## Tuning parameter 'sigma' was held constant at a value of 0.03960057
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03960057 and C = 1.
print(knn_model)
## k-Nearest Neighbors 
## 
## 2667 samples
##   17 predictor
##    2 classes: 'False.', 'True.' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.8771385  0.3321184
##   7  0.8793890  0.3226013
##   9  0.8776421  0.2992136
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
print(rf_model)
## Random Forest 
## 
## 2667 samples
##   17 predictor
##    2 classes: 'False.', 'True.' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9342570  0.6733872
##    9    0.9537514  0.7957548
##   17    0.9501272  0.7794779
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 9.
#print(nb_model)




results<-resamples(list(CART=cart_model,SVM=svm_model,KNN=knn_model,RF=rf_model))



print(results)
## 
## Call:
## resamples.default(x = list(CART = cart_model, SVM = svm_model, KNN =
##  knn_model, RF = rf_model))
## 
## Models: CART, SVM, KNN, RF 
## Number of resamples: 30 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit
#summarize the results
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: CART, SVM, KNN, RF 
## Number of resamples: 30 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## CART 0.8838951 0.8932584 0.9136887 0.9125074 0.9278322 0.9550562    0
## SVM  0.8801498 0.9098589 0.9136958 0.9130115 0.9176030 0.9288390    0
## KNN  0.8426966 0.8760560 0.8801498 0.8793890 0.8876404 0.8988764    0
## RF   0.9248120 0.9447073 0.9549717 0.9537514 0.9662921 0.9775281    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## CART 0.3661842 0.4752375 0.6038332 0.5786001 0.6755768 0.7935035    0
## SVM  0.3729635 0.5195870 0.5547499 0.5555887 0.5915728 0.6606462    0
## KNN  0.1286713 0.2839555 0.3293131 0.3226013 0.3776224 0.4479669    0
## RF   0.6373057 0.7530063 0.8051922 0.7957548 0.8564763 0.9059197    0
scales<-list(x=list(relation="free"), y=list(relation="free"))
bwplot(results,scales=scales)

densityplot(results, scales=scales)

dotplot(results, scales=scales)

cart.pred <- predict(cart_model, newdata = test_data)
cart.accuracy <- 1-mean(cart.pred != test_data$Churn.)
table(actual=test_data$Churn., predicted=cart.pred)
actual/predicted False. True.
False. 565 5
True. 51 45
rf.pred1 <- predict(rf_model, newdata = test_data)
rf.accuracy <- 1-mean(rf.pred1 != test_data$Churn.)
table(actual=test_data$Churn., predicted=rf.pred1)
actual/predicted False. True.
False. 564 6
True. 20 76
svm.pred <-predict(svm_model, newdata = test_data)
svm.accuracy <- 1-mean(svm.pred != test_data$Churn.)
table(actual=test_data$Churn., predicted=svm.pred)
actual/predicted False. True.
False. 566 4
True. 44 52
knn.pred <-predict(knn_model, newdata = test_data)
knn.accuracy <- 1-mean(knn.pred != test_data$Churn.)
table(actual=test_data$Churn., predicted=knn.pred)
actual/predicted False. True.
False. 561 9
True. 67 29
#nb.pred <-predict(nb_model, newdata = test_data)
#nb.accuracy <- 1-mean(nb.pred != test_data$Churn.)
#table(actual=test_data$Churn., predicted=nb.pred)

splom(results)

#difference in model prediction
diffs<-diff(results)

summary(diffs)
## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##      CART      SVM        KNN        RF        
## CART           -0.0005041  0.0331184 -0.0412440
## SVM  1                     0.0336224 -0.0407400
## KNN  9.878e-11 3.883e-12             -0.0743624
## RF   3.684e-10 < 2.2e-16  < 2.2e-16            
## 
## Kappa 
##      CART      SVM       KNN       RF      
## CART            0.02301   0.25600  -0.21715
## SVM  1                    0.23299  -0.24017
## KNN  3.457e-13 3.917e-13           -0.47315
## RF   2.031e-09 < 2.2e-16 < 2.2e-16

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.