#Algorithms Implemented: Decision Trees(rpart), SVM,KNN, NB and RF
1.Implemented individually using 80-20 training-test ratio and compared
the results 2.Then trained all these Models using same TrainControl and
compared the results 3.A significant change in accuracy can be seen is
comparison plots for both these techniques
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(purrr)
library(printr)
## Warning: package 'printr' was built under R version 4.3.3
## Registered S3 method overwritten by 'printr':
## method from
## knit_print.data.frame rmarkdown
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.3.3
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
library(class)
## Warning: package 'class' was built under R version 4.3.3
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(RColorBrewer)
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(e1071)
data = read.csv("D:/MS Sem 2/Data Minning/Projects/Project 4/churn.csv", stringsAsFactors=TRUE)
# show dimension, datatype, content of the data set
str(data)
## 'data.frame': 3333 obs. of 21 variables:
## $ State : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ Account.Length: int 128 107 137 84 75 118 121 147 117 141 ...
## $ Area.Code : int 415 415 415 408 415 510 510 415 408 415 ...
## $ Phone : Factor w/ 3333 levels "327-1058","327-1319",..: 1927 1576 1118 1708 111 2254 1048 81 292 118 ...
## $ Int.l.Plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ VMail.Plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ VMail.Message : int 25 26 0 0 0 0 24 0 0 37 ...
## $ Day.Mins : num 265 162 243 299 167 ...
## $ Day.Calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ Day.Charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ Eve.Mins : num 197.4 195.5 121.2 61.9 148.3 ...
## $ Eve.Calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ Eve.Charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ Night.Mins : num 245 254 163 197 187 ...
## $ Night.Calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ Night.Charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ Intl.Mins : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ Intl.Calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ Intl.Charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ CustServ.Calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ Churn. : Factor w/ 2 levels "False.","True.": 1 1 1 1 1 1 1 1 1 1 ...
head(data)
| KS |
128 |
415 |
382-4657 |
no |
yes |
25 |
265.1 |
110 |
45.07 |
197.4 |
99 |
16.78 |
244.7 |
91 |
11.01 |
10.0 |
3 |
2.70 |
1 |
False. |
| OH |
107 |
415 |
371-7191 |
no |
yes |
26 |
161.6 |
123 |
27.47 |
195.5 |
103 |
16.62 |
254.4 |
103 |
11.45 |
13.7 |
3 |
3.70 |
1 |
False. |
| NJ |
137 |
415 |
358-1921 |
no |
no |
0 |
243.4 |
114 |
41.38 |
121.2 |
110 |
10.30 |
162.6 |
104 |
7.32 |
12.2 |
5 |
3.29 |
0 |
False. |
| OH |
84 |
408 |
375-9999 |
yes |
no |
0 |
299.4 |
71 |
50.90 |
61.9 |
88 |
5.26 |
196.9 |
89 |
8.86 |
6.6 |
7 |
1.78 |
2 |
False. |
| OK |
75 |
415 |
330-6626 |
yes |
no |
0 |
166.7 |
113 |
28.34 |
148.3 |
122 |
12.61 |
186.9 |
121 |
8.41 |
10.1 |
3 |
2.73 |
3 |
False. |
| AL |
118 |
510 |
391-8027 |
yes |
no |
0 |
223.4 |
98 |
37.98 |
220.6 |
101 |
18.75 |
203.9 |
118 |
9.18 |
6.3 |
6 |
1.70 |
0 |
False. |
# detect missing value
knitr::kable(sapply(data, function(x) sum(is.na(x))), col.names = c("Missing Value Count"))
| State |
0 |
| Account.Length |
0 |
| Area.Code |
0 |
| Phone |
0 |
| Int.l.Plan |
0 |
| VMail.Plan |
0 |
| VMail.Message |
0 |
| Day.Mins |
0 |
| Day.Calls |
0 |
| Day.Charge |
0 |
| Eve.Mins |
0 |
| Eve.Calls |
0 |
| Eve.Charge |
0 |
| Night.Mins |
0 |
| Night.Calls |
0 |
| Night.Charge |
0 |
| Intl.Mins |
0 |
| Intl.Calls |
0 |
| Intl.Charge |
0 |
| CustServ.Calls |
0 |
| Churn. |
0 |
# show summary statistics of the variables
summary(data)
|
WV : 106 |
Min. : 1.0 |
Min. :408.0 |
327-1058: 1 |
no :3010 |
no :2411 |
Min. : 0.000 |
Min. : 0.0 |
Min. : 0.0 |
Min. : 0.00 |
Min. : 0.0 |
Min. : 0.0 |
Min. : 0.00 |
Min. : 23.2 |
Min. : 33.0 |
Min. : 1.040 |
Min. : 0.00 |
Min. : 0.000 |
Min. :0.000 |
Min. :0.000 |
False.:2850 |
|
MN : 84 |
1st Qu.: 74.0 |
1st Qu.:408.0 |
327-1319: 1 |
yes: 323 |
yes: 922 |
1st Qu.: 0.000 |
1st Qu.:143.7 |
1st Qu.: 87.0 |
1st Qu.:24.43 |
1st Qu.:166.6 |
1st Qu.: 87.0 |
1st Qu.:14.16 |
1st Qu.:167.0 |
1st Qu.: 87.0 |
1st Qu.: 7.520 |
1st Qu.: 8.50 |
1st Qu.: 3.000 |
1st Qu.:2.300 |
1st Qu.:1.000 |
True. : 483 |
|
NY : 83 |
Median :101.0 |
Median :415.0 |
327-3053: 1 |
NA |
NA |
Median : 0.000 |
Median :179.4 |
Median :101.0 |
Median :30.50 |
Median :201.4 |
Median :100.0 |
Median :17.12 |
Median :201.2 |
Median :100.0 |
Median : 9.050 |
Median :10.30 |
Median : 4.000 |
Median :2.780 |
Median :1.000 |
NA |
|
AL : 80 |
Mean :101.1 |
Mean :437.2 |
327-3587: 1 |
NA |
NA |
Mean : 8.099 |
Mean :179.8 |
Mean :100.4 |
Mean :30.56 |
Mean :201.0 |
Mean :100.1 |
Mean :17.08 |
Mean :200.9 |
Mean :100.1 |
Mean : 9.039 |
Mean :10.24 |
Mean : 4.479 |
Mean :2.765 |
Mean :1.563 |
NA |
|
OH : 78 |
3rd Qu.:127.0 |
3rd Qu.:510.0 |
327-3850: 1 |
NA |
NA |
3rd Qu.:20.000 |
3rd Qu.:216.4 |
3rd Qu.:114.0 |
3rd Qu.:36.79 |
3rd Qu.:235.3 |
3rd Qu.:114.0 |
3rd Qu.:20.00 |
3rd Qu.:235.3 |
3rd Qu.:113.0 |
3rd Qu.:10.590 |
3rd Qu.:12.10 |
3rd Qu.: 6.000 |
3rd Qu.:3.270 |
3rd Qu.:2.000 |
NA |
|
OR : 78 |
Max. :243.0 |
Max. :510.0 |
327-3954: 1 |
NA |
NA |
Max. :51.000 |
Max. :350.8 |
Max. :165.0 |
Max. :59.64 |
Max. :363.7 |
Max. :170.0 |
Max. :30.91 |
Max. :395.0 |
Max. :175.0 |
Max. :17.770 |
Max. :20.00 |
Max. :20.000 |
Max. :5.400 |
Max. :9.000 |
NA |
|
(Other):2824 |
NA |
NA |
(Other) :3327 |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
NA |
#Data Processing
#Dropping non-useful variables
data = data[, !names(data) %in% c('State', 'Phone', 'Area.Code')]
#2. Data Scaling
# Select variables for scaling
numeric_data <- as.data.frame(lapply(data, as.numeric))
# Scale the numeric variables
scaled_data <- as.data.frame(scale(numeric_data))
# Combine scaled numeric variables with non-numeric variables
#scaled_data <- cbind(Int.l.Plan = data$Int.l.Plan,VMail.Plan = data$VMail.Plan, scaled_data, Churn. = data$Churn.)
# Convert Churn. back to factor
scaled_data$Churn. <- as.factor(scaled_data$Churn.)
# Check the structure of the scaled dataset
str(scaled_data)
## 'data.frame': 3333 obs. of 18 variables:
## $ Account.Length: num 0.676 0.149 0.902 -0.429 -0.655 ...
## $ Int.l.Plan : num -0.328 -0.328 -0.328 3.052 3.052 ...
## $ VMail.Plan : num 1.617 1.617 -0.618 -0.618 -0.618 ...
## $ VMail.Message : num 1.235 1.308 -0.592 -0.592 -0.592 ...
## $ Day.Mins : num 1.567 -0.334 1.168 2.196 -0.24 ...
## $ Day.Calls : num 0.477 1.124 0.676 -1.467 0.626 ...
## $ Day.Charge : num 1.567 -0.334 1.168 2.196 -0.24 ...
## $ Eve.Mins : num -0.0706 -0.1081 -1.5731 -2.7425 -1.0388 ...
## $ Eve.Calls : num -0.0559 0.1448 0.4962 -0.6081 1.0985 ...
## $ Eve.Charge : num -0.0704 -0.1075 -1.5737 -2.7429 -1.0378 ...
## $ Night.Mins : num 0.8666 1.0584 -0.7568 -0.0785 -0.2763 ...
## $ Night.Calls : num -0.465 0.148 0.199 -0.568 1.068 ...
## $ Night.Charge : num 0.8659 1.0592 -0.7555 -0.0788 -0.2765 ...
## $ Intl.Mins : num -0.085 1.2403 0.703 -1.3028 -0.0492 ...
## $ Intl.Calls : num -0.601 -0.601 0.212 1.024 -0.601 ...
## $ Intl.Charge : num -0.0857 1.241 0.6971 -1.3062 -0.0459 ...
## $ CustServ.Calls: num -0.428 -0.428 -1.188 0.332 1.092 ...
## $ Churn. : Factor w/ 2 levels "-0.411610054566873",..: 1 1 1 1 1 1 1 1 1 1 ...
#Check the summary of the scaled dataset
summary(scaled_data)
|
Min. :-2.512795 |
Min. :-0.3275 |
Min. :-0.6183 |
Min. :-0.5917 |
Min. :-3.300601 |
Min. :-5.00450 |
Min. :-3.300667 |
Min. :-3.963027 |
Min. :-5.025157 |
Min. :-3.963085 |
Min. :-3.513121 |
Min. :-3.429355 |
Min. :-3.514838 |
Min. :-3.66686 |
Min. :-1.8200 |
Min. :-3.66766 |
Min. :-1.1880 |
-0.411610054566873:2850 |
|
1st Qu.:-0.679643 |
1st Qu.:-0.3275 |
1st Qu.:-0.6183 |
1st Qu.:-0.5917 |
1st Qu.:-0.662325 |
1st Qu.:-0.66947 |
1st Qu.:-0.662277 |
1st Qu.:-0.677928 |
1st Qu.:-0.658262 |
1st Qu.:-0.678211 |
1st Qu.:-0.669754 |
1st Qu.:-0.669834 |
1st Qu.:-0.667579 |
1st Qu.:-0.62228 |
1st Qu.:-0.6011 |
1st Qu.:-0.61634 |
1st Qu.:-0.4279 |
2.42875498036354 : 483 |
|
Median :-0.001627 |
Median :-0.3275 |
Median :-0.6183 |
Median :-0.5917 |
Median :-0.006887 |
Median : 0.02812 |
Median :-0.006729 |
Median : 0.008275 |
Median :-0.005738 |
Median : 0.008458 |
Median : 0.006485 |
Median :-0.005504 |
Median : 0.004691 |
Median : 0.02246 |
Median :-0.1948 |
Median : 0.02046 |
Median :-0.4279 |
NA |
|
Mean : 0.000000 |
Mean : 0.0000 |
Mean : 0.0000 |
Mean : 0.0000 |
Mean : 0.000000 |
Mean : 0.00000 |
Mean : 0.000000 |
Mean : 0.000000 |
Mean : 0.000000 |
Mean : 0.000000 |
Mean : 0.000000 |
Mean : 0.000000 |
Mean : 0.000000 |
Mean : 0.00000 |
Mean : 0.0000 |
Mean : 0.00000 |
Mean : 0.0000 |
NA |
|
3rd Qu.: 0.651276 |
3rd Qu.:-0.3275 |
3rd Qu.: 1.6168 |
3rd Qu.: 0.8694 |
3rd Qu.: 0.672419 |
3rd Qu.: 0.67588 |
3rd Qu.: 0.672578 |
3rd Qu.: 0.676731 |
3rd Qu.: 0.696981 |
3rd Qu.: 0.676568 |
3rd Qu.: 0.680746 |
3rd Qu.: 0.658825 |
3rd Qu.: 0.681354 |
3rd Qu.: 0.66720 |
3rd Qu.: 0.6178 |
3rd Qu.: 0.67052 |
3rd Qu.: 0.3323 |
NA |
|
Max. : 3.564231 |
Max. : 3.0522 |
Max. : 1.6168 |
Max. : 3.1341 |
Max. : 3.139950 |
Max. : 3.21711 |
Max. : 3.140331 |
Max. : 3.208584 |
Max. : 3.507855 |
Max. : 3.207498 |
Max. : 3.838505 |
Max. : 3.827165 |
Max. : 3.836188 |
Max. : 3.49687 |
Max. : 6.3061 |
Max. : 3.49630 |
Max. : 5.6535 |
NA |
# data encoding to numeric values
#data_n <- as.data.frame(lapply(scaled_data, as.numeric))
#str(data_n)
#summary(data_n$Churn.)
#Machine Learning - Classification
#To train a classification model, there is mainly three steps:
# 1. Splitting Data into Training and Testing Set
#2. Model Training/ Tuning
#3. Model Testing
set.seed(1234)
trainIndex <- createDataPartition(scaled_data$Churn., p = 0.8, list = FALSE, times = 1)
training_data <- data[ trainIndex,]
testing_data <- data[-trainIndex,]
# Check if the splitting process is correct
prop.table(table(training_data$Churn.))
prop.table(table(testing_data$Churn.))
# 1: Decision Tree
Dtree = rpart(Churn.~ Account.Length + Int.l.Plan + VMail.Plan + VMail.Message + Day.Mins+Day.Calls +Day.Charge + Eve.Mins +Eve.Calls+Eve.Charge +Night.Mins+Night.Calls +Night.Charge+Intl.Mins+Intl.Calls+Intl.Charge +CustServ.Calls+Churn., data = training_data, method = "class")
## Warning in model.matrix.default(attr(frame, "terms"), frame): the response
## appeared on the right-hand side and was dropped
## Warning in model.matrix.default(attr(frame, "terms"), frame): problem with term
## 18 in model.matrix: no columns are assigned
## Warning in cats * !isord: longer object length is not a multiple of shorter
## object length
printcp(Dtree)
##
## Classification tree:
## rpart(formula = Churn. ~ Account.Length + Int.l.Plan + VMail.Plan +
## VMail.Message + Day.Mins + Day.Calls + Day.Charge + Eve.Mins +
## Eve.Calls + Eve.Charge + Night.Mins + Night.Calls + Night.Charge +
## Intl.Mins + Intl.Calls + Intl.Charge + CustServ.Calls + Churn.,
## data = training_data, method = "class")
##
## Variables actually used in tree construction:
## [1] CustServ.Calls Day.Mins Eve.Mins Int.l.Plan Intl.Calls
## [6] Intl.Mins Night.Mins VMail.Plan
##
## Root node error: 387/2667 = 0.14511
##
## n= 2667
##
## CP nsplit rel error xerror xstd
## 1 0.078811 0 1.00000 1.00000 0.047000
## 2 0.076227 2 0.84238 0.88889 0.044728
## 3 0.052972 4 0.68992 0.70543 0.040450
## 4 0.021964 7 0.49354 0.50904 0.034903
## 5 0.020672 9 0.44961 0.50129 0.034657
## 6 0.019811 10 0.42894 0.50129 0.034657
## 7 0.015504 13 0.36951 0.45478 0.033130
## 8 0.012920 14 0.35401 0.42894 0.032240
## 9 0.010000 15 0.34109 0.43152 0.032330
fancyRpartPlot(Dtree)

# Plot Full Tree
prp(Dtree, type = 1, extra = 1, under = TRUE, split.font = 2, varlen = 0)

#Find the best pruned Decision Tree by selecting the tree that is having least cross validation error
set.seed(12345)
cv.ct <- rpart(Churn. ~ Account.Length + Int.l.Plan + VMail.Plan + VMail.Message + Day.Mins+Day.Calls +Day.Charge + Eve.Mins +Eve.Calls+Eve.Charge +Night.Mins+Night.Calls +Night.Charge+Intl.Mins+Intl.Calls+Intl.Charge +CustServ.Calls+Churn., data = training_data, method = "class", cp = 0.00001, minsplit = 5, xval = 5)
## Warning in model.matrix.default(attr(frame, "terms"), frame): the response
## appeared on the right-hand side and was dropped
## Warning in model.matrix.default(attr(frame, "terms"), frame): problem with term
## 18 in model.matrix: no columns are assigned
## Warning in cats * !isord: longer object length is not a multiple of shorter
## object length
printcp(cv.ct)
##
## Classification tree:
## rpart(formula = Churn. ~ Account.Length + Int.l.Plan + VMail.Plan +
## VMail.Message + Day.Mins + Day.Calls + Day.Charge + Eve.Mins +
## Eve.Calls + Eve.Charge + Night.Mins + Night.Calls + Night.Charge +
## Intl.Mins + Intl.Calls + Intl.Charge + CustServ.Calls + Churn.,
## data = training_data, method = "class", cp = 1e-05, minsplit = 5,
## xval = 5)
##
## Variables actually used in tree construction:
## [1] Account.Length CustServ.Calls Day.Calls Day.Mins Eve.Calls
## [6] Eve.Mins Int.l.Plan Intl.Calls Intl.Mins Night.Calls
## [11] Night.Mins VMail.Message VMail.Plan
##
## Root node error: 387/2667 = 0.14511
##
## n= 2667
##
## CP nsplit rel error xerror xstd
## 1 0.07881137 0 1.00000 1.00000 0.047000
## 2 0.07622739 2 0.84238 0.92765 0.045545
## 3 0.05297158 4 0.68992 0.77778 0.042225
## 4 0.02196382 7 0.49354 0.52713 0.035467
## 5 0.02067183 9 0.44961 0.48837 0.034242
## 6 0.01981051 10 0.42894 0.47804 0.033905
## 7 0.01550388 13 0.36951 0.44961 0.032954
## 8 0.01291990 14 0.35401 0.42377 0.032058
## 9 0.01033592 15 0.34109 0.42894 0.032240
## 10 0.00904393 17 0.32041 0.38760 0.030744
## 11 0.00516796 21 0.28424 0.38760 0.030744
## 12 0.00387597 26 0.25840 0.38501 0.030648
## 13 0.00258398 32 0.23514 0.40827 0.031503
## 14 0.00206718 42 0.20930 0.41085 0.031597
## 15 0.00172265 47 0.19897 0.41602 0.031782
## 16 0.00129199 50 0.19380 0.41860 0.031874
## 17 0.00119261 59 0.18088 0.44961 0.032954
## 18 0.00086133 82 0.14987 0.47028 0.033649
## 19 0.00001000 85 0.14729 0.49096 0.034326
# Prune by lowest cp
prune_dt <- prune(cv.ct,cp=cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
predict_dt <- predict(prune_dt, testing_data,type="class")
length(prune_dt$frame$var[prune_dt$frame$var == "<leaf>"])
## [1] 27
prp(prune_dt, type = 1, extra = 1, split.font = 1, varlen = -10)

# Get row indices of testing_data$Churn.
indices <- which(!is.na(testing_data$Churn.))
# Subset predict_dt using the row indices
predict_dt_subset <- predict_dt[indices]
# Calculate confusion matrix
cm_dt <- confusionMatrix(as.factor(testing_data$Churn.[indices]), as.factor(predict_dt_subset), positive='True.')
cm_dt
## Confusion Matrix and Statistics
##
## Reference
## Prediction False. True.
## False. 559 11
## True. 20 76
##
## Accuracy : 0.9535
## 95% CI : (0.9346, 0.9682)
## No Information Rate : 0.8694
## P-Value [Acc > NIR] : 2.91e-13
##
## Kappa : 0.8037
##
## Mcnemar's Test P-Value : 0.1508
##
## Sensitivity : 0.8736
## Specificity : 0.9655
## Pos Pred Value : 0.7917
## Neg Pred Value : 0.9807
## Prevalence : 0.1306
## Detection Rate : 0.1141
## Detection Prevalence : 0.1441
## Balanced Accuracy : 0.9195
##
## 'Positive' Class : True.
##
# Decision Tree Result
pred_dt <- predict(prune_dt, newdata= testing_data,type = "prob")[, 2]
Pred_val = prediction(pred_dt, testing_data$Churn.)
plot(performance(Pred_val, "tpr", "fpr"),colorize=TRUE)
abline(0, 1, lty = 2)
auc_train <- round(as.numeric(performance(Pred_val, "auc")@y.values),2)
legend(.8, .2, auc_train, title = "AUC", cex=1)

#2: Support Vector Machine
library(e1071)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.3
learn_svm <- svm(factor(Churn.)~.,data=training_data)
predict_svm <- predict(learn_svm, testing_data,type ="response")
#SVM Results
# Get row indices of testing_data$Churn.
indices <- which(!is.na(testing_data$Churn.))
# Subset predict_dt using the row indices
predict_svm_subset <- predict_svm[indices]
# Calculate confusion matrix
cm_svm <- confusionMatrix(as.factor(testing_data$Churn.[indices]), as.factor(predict_svm_subset), positive='True.')
cm_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction False. True.
## False. 567 3
## True. 42 54
##
## Accuracy : 0.9324
## 95% CI : (0.9106, 0.9503)
## No Information Rate : 0.9144
## P-Value [Acc > NIR] : 0.05196
##
## Kappa : 0.6705
##
## Mcnemar's Test P-Value : 1.473e-08
##
## Sensitivity : 0.94737
## Specificity : 0.93103
## Pos Pred Value : 0.56250
## Neg Pred Value : 0.99474
## Prevalence : 0.08559
## Detection Rate : 0.08108
## Detection Prevalence : 0.14414
## Balanced Accuracy : 0.93920
##
## 'Positive' Class : True.
##
pred_ROCR <- prediction(as.numeric(predict_svm), as.numeric(testing_data$Churn.))
roc_ROCR <- performance(pred_ROCR, measure = "tpr", x.measure = "fpr")
auc_train <- round(as.numeric(performance(pred_ROCR, "auc")@y.values),2)
plot(roc_ROCR, main = "ROC curve", colorize = T)
abline(a = 0, b = 1)
legend(.8, .2, auc_train, title = "AUC", cex=1)

# 3: Naive Bayes
# Train Naive Bayes model
learn_nb <- naiveBayes(factor(Churn.) ~ ., data = training_data)
# Predict using Naive Bayes model
predict_nb <- predict(learn_nb, testing_data, type = "class")
# Convert predicted values to factor with levels matching the actual data
predict_nb_factor <- factor(predict_nb, levels = c("True.", "False."))
# Convert actual data to factor with levels matching the predicted values
actual_data_factor <- factor(testing_data$Churn., levels = c("True.", "False."))
# Create confusion matrix for Naive Bayes
cm_nb <- confusionMatrix(predict_nb_factor, actual_data_factor, positive = 'True.')
# Display Naive Bayes confusion matrix
cm_nb
## Confusion Matrix and Statistics
##
## Reference
## Prediction True. False.
## True. 40 21
## False. 56 549
##
## Accuracy : 0.8844
## 95% CI : (0.8576, 0.9077)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 0.0184448
##
## Kappa : 0.4477
##
## Mcnemar's Test P-Value : 0.0001068
##
## Sensitivity : 0.41667
## Specificity : 0.96316
## Pos Pred Value : 0.65574
## Neg Pred Value : 0.90744
## Prevalence : 0.14414
## Detection Rate : 0.06006
## Detection Prevalence : 0.09159
## Balanced Accuracy : 0.68991
##
## 'Positive' Class : True.
##
# Calculate ROC curve for Naive Bayes
pred_nb <- predict(learn_nb, newdata = testing_data, type = "raw")
pred_nb_positive <- as.numeric(pred_nb[, 1])
Pred_val_nb <- prediction(pred_nb_positive, as.numeric(testing_data$Churn.))
perf_nb <- performance(Pred_val_nb, "tpr", "fpr")
plot(perf_nb, colorize = TRUE, main = "ROC curve for Naive Bayes")
abline(0, 1, lty = 2)
auc_nb <- round(as.numeric(performance(Pred_val_nb, "auc")@y.values), 2)
legend(.8, .2, auc_nb, title = "AUC", cex = 1)

# 4: K-Nearest Neighbors (KNN)
# Train KNN model
training_data_n<- as.data.frame(lapply(training_data, as.numeric))
str(training_data_n)
## 'data.frame': 2667 obs. of 18 variables:
## $ Account.Length: num 128 107 84 75 118 121 147 117 141 65 ...
## $ Int.l.Plan : num 1 1 2 2 2 1 2 1 2 1 ...
## $ VMail.Plan : num 2 2 1 1 1 2 1 1 2 1 ...
## $ VMail.Message : num 25 26 0 0 0 24 0 0 37 0 ...
## $ Day.Mins : num 265 162 299 167 223 ...
## $ Day.Calls : num 110 123 71 113 98 88 79 97 84 137 ...
## $ Day.Charge : num 45.1 27.5 50.9 28.3 38 ...
## $ Eve.Mins : num 197.4 195.5 61.9 148.3 220.6 ...
## $ Eve.Calls : num 99 103 88 122 101 108 94 80 111 83 ...
## $ Eve.Charge : num 16.78 16.62 5.26 12.61 18.75 ...
## $ Night.Mins : num 245 254 197 187 204 ...
## $ Night.Calls : num 91 103 89 121 118 118 96 90 97 111 ...
## $ Night.Charge : num 11.01 11.45 8.86 8.41 9.18 ...
## $ Intl.Mins : num 10 13.7 6.6 10.1 6.3 7.5 7.1 8.7 11.2 12.7 ...
## $ Intl.Calls : num 3 3 7 3 6 7 6 4 5 6 ...
## $ Intl.Charge : num 2.7 3.7 1.78 2.73 1.7 2.03 1.92 2.35 3.02 3.43 ...
## $ CustServ.Calls: num 1 1 2 3 0 3 0 1 0 4 ...
## $ Churn. : num 1 1 1 1 1 1 1 1 1 2 ...
testing_data_n<- as.data.frame(lapply(testing_data, as.numeric))
str(testing_data_n)
## 'data.frame': 666 obs. of 18 variables:
## $ Account.Length: num 137 62 73 20 12 119 97 81 125 174 ...
## $ Int.l.Plan : num 1 1 1 1 1 1 1 1 1 1 ...
## $ VMail.Plan : num 1 1 1 1 1 1 2 1 1 1 ...
## $ VMail.Message : num 0 0 0 0 0 0 24 0 0 0 ...
## $ Day.Mins : num 243 121 224 190 250 ...
## $ Day.Calls : num 114 70 90 109 118 114 135 67 103 97 ...
## $ Day.Charge : num 41.4 20.5 38.1 32.3 42.4 ...
## $ Eve.Mins : num 121 307 160 258 252 ...
## $ Eve.Calls : num 110 76 88 84 119 117 58 85 126 94 ...
## $ Eve.Charge : num 10.3 26.1 13.6 21.9 21.4 ...
## $ Night.Mins : num 163 203 193 182 280 ...
## $ Night.Calls : num 104 99 74 102 90 91 79 98 95 54 ...
## $ Night.Charge : num 7.32 9.14 8.68 8.17 12.61 ...
## $ Intl.Mins : num 12.2 13.1 13 6.3 11.8 8.8 11 10.2 12 11.4 ...
## $ Intl.Calls : num 5 6 2 6 3 3 3 3 8 4 ...
## $ Intl.Charge : num 3.29 3.54 3.51 1.7 3.19 2.38 2.97 2.75 3.24 3.08 ...
## $ CustServ.Calls: num 0 4 1 0 1 5 1 1 1 1 ...
## $ Churn. : num 1 1 1 1 2 2 1 1 1 1 ...
# Ensure Churn. is a factor in both training and testing datasets
training_data_n$Churn. <- as.factor(training_data_n$Churn.)
testing_data_n$Churn. <- as.factor(testing_data_n$Churn.)
dim(training_data_n)
## [1] 2667 18
dim(testing_data_n)
## [1] 666 18
# Train the KNN model with k = 5
learn_knn <- knn(train = training_data_n[,-18],
test = testing_data_n[,-18],
cl = training_data_n$Churn.,
k = 5)
# Check levels of 'Churn.' variable in testing_data_n
levels(testing_data_n$Churn.)
## [1] "1" "2"
# Check levels of 'Churn.' variable in learn_knn
levels(learn_knn)
## [1] "1" "2"
# Calculate confusion matrix for KNN
cm_knn <- confusionMatrix(learn_knn, testing_data_n$Churn., positive = '2')
# Display KNN confusion matrix
cm_knn
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 555 65
## 2 15 31
##
## Accuracy : 0.8799
## 95% CI : (0.8527, 0.9036)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 0.04114
##
## Kappa : 0.3786
##
## Mcnemar's Test P-Value : 4.293e-08
##
## Sensitivity : 0.32292
## Specificity : 0.97368
## Pos Pred Value : 0.67391
## Neg Pred Value : 0.89516
## Prevalence : 0.14414
## Detection Rate : 0.04655
## Detection Prevalence : 0.06907
## Balanced Accuracy : 0.64830
##
## 'Positive' Class : 2
##
# Calculate ROC curve for KNN
pred_knn <- knn(train = training_data_n[, -ncol(training_data_n)],
test = testing_data_n[, -ncol(testing_data_n)],
cl = training_data_n$Churn.,
k = 5, prob = TRUE)
# Extract predicted probabilities for the positive class ("True.")
pred_probs <- as.numeric(pred_knn)
# Create the prediction object
Pred_val_knn <- prediction(pred_probs, testing_data_n$Churn.)
plot(performance(Pred_val_knn, "tpr", "fpr"), colorize = TRUE)
abline(0, 1, lty = 2)
auc_knn <- round(as.numeric(performance(Pred_val_knn, "auc")@y.values), 2)
legend(.8, .2, auc_knn, title = "AUC", cex = 1)

# Results
str(cm_svm)
## List of 6
## $ positive: chr "True."
## $ table : 'table' int [1:2, 1:2] 567 42 3 54
## ..- attr(*, "dimnames")=List of 2
## .. ..$ Prediction: chr [1:2] "False." "True."
## .. ..$ Reference : chr [1:2] "False." "True."
## $ overall : Named num [1:7] 0.932 0.67 0.911 0.95 0.914 ...
## ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
## $ byClass : Named num [1:11] 0.947 0.931 0.562 0.995 0.562 ...
## ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
## $ mode : chr "sens_spec"
## $ dots : list()
## - attr(*, "class")= chr "confusionMatrix"
str(cm_dt)
## List of 6
## $ positive: chr "True."
## $ table : 'table' int [1:2, 1:2] 559 20 11 76
## ..- attr(*, "dimnames")=List of 2
## .. ..$ Prediction: chr [1:2] "False." "True."
## .. ..$ Reference : chr [1:2] "False." "True."
## $ overall : Named num [1:7] 0.953 0.804 0.935 0.968 0.869 ...
## ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
## $ byClass : Named num [1:11] 0.874 0.965 0.792 0.981 0.792 ...
## ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
## $ mode : chr "sens_spec"
## $ dots : list()
## - attr(*, "class")= chr "confusionMatrix"
str(cm_nb)
## List of 6
## $ positive: chr "True."
## $ table : 'table' int [1:2, 1:2] 40 56 21 549
## ..- attr(*, "dimnames")=List of 2
## .. ..$ Prediction: chr [1:2] "True." "False."
## .. ..$ Reference : chr [1:2] "True." "False."
## $ overall : Named num [1:7] 0.884 0.448 0.858 0.908 0.856 ...
## ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
## $ byClass : Named num [1:11] 0.417 0.963 0.656 0.907 0.656 ...
## ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
## $ mode : chr "sens_spec"
## $ dots : list()
## - attr(*, "class")= chr "confusionMatrix"
str(cm_knn)
## List of 6
## $ positive: chr "2"
## $ table : 'table' int [1:2, 1:2] 555 15 65 31
## ..- attr(*, "dimnames")=List of 2
## .. ..$ Prediction: chr [1:2] "1" "2"
## .. ..$ Reference : chr [1:2] "1" "2"
## $ overall : Named num [1:7] 0.88 0.379 0.853 0.904 0.856 ...
## ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
## $ byClass : Named num [1:11] 0.323 0.974 0.674 0.895 0.674 ...
## ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
## $ mode : chr "sens_spec"
## $ dots : list()
## - attr(*, "class")= chr "confusionMatrix"
# Accuracy
svm_accuracy <- cm_svm$overall['Accuracy']
dt_accuracy <- cm_dt$overall['Accuracy']
nv_accuracy <- cm_nb$overall['Accuracy']
knn_accuracy <- cm_knn$overall['Accuracy']
# Precision
svm_precision <- cm_svm$byClass['Pos Pred Value']
dt_precision <- cm_dt$byClass['Pos Pred Value']
nv_precision <- cm_nb$byClass['Pos Pred Value']
knn_precision <- cm_knn$byClass['Pos Pred Value']
# Recall
svm_recall <- cm_svm$byClass['Sensitivity']
dt_recall <- cm_dt$byClass['Sensitivity']
nv_recall <- cm_nb$byClass['Sensitivity']
knn_recall <- cm_knn$byClass['Sensitivity']
# F1-score
svm_f1 <- cm_svm$byClass['F1']
dt_f1 <- cm_dt$byClass['F1']
nv_f1 <- cm_nb$byClass['F1']
knn_f1 <- cm_knn$byClass['F1']
# Print evaluation metrics
print("SVM Evaluation Metrics:")
## [1] "SVM Evaluation Metrics:"
print(paste("Accuracy:", svm_accuracy))
## [1] "Accuracy: 0.932432432432432"
print(paste("Precision:", svm_precision))
## [1] "Precision: 0.5625"
print(paste("Recall:", svm_recall))
## [1] "Recall: 0.947368421052632"
print(paste("F1-score:", svm_f1))
## [1] "F1-score: 0.705882352941177"
print("Decision Tree Evaluation Metrics:")
## [1] "Decision Tree Evaluation Metrics:"
print(paste("Accuracy:", dt_accuracy))
## [1] "Accuracy: 0.953453453453453"
print(paste("Precision:", dt_precision))
## [1] "Precision: 0.791666666666666"
print(paste("Recall:", dt_recall))
## [1] "Recall: 0.873563218390805"
print(paste("F1-score:", dt_f1))
## [1] "F1-score: 0.830601092896175"
print("Naive Bayes Evaluation Metrics:")
## [1] "Naive Bayes Evaluation Metrics:"
print(paste("Accuracy:", nv_accuracy))
## [1] "Accuracy: 0.884384384384384"
print(paste("Precision:", nv_precision))
## [1] "Precision: 0.655737704918033"
print(paste("Recall:", nv_recall))
## [1] "Recall: 0.416666666666667"
print(paste("F1-score:", nv_f1))
## [1] "F1-score: 0.509554140127389"
print("KNN Evaluation Metrics:")
## [1] "KNN Evaluation Metrics:"
print(paste("Accuracy:", knn_accuracy))
## [1] "Accuracy: 0.87987987987988"
print(paste("Precision:", knn_precision))
## [1] "Precision: 0.673913043478261"
print(paste("Recall:", knn_recall))
## [1] "Recall: 0.322916666666667"
# Define model names
model_names <- c("Decision Tree", "Naive Bayes", "KNN", "SVM")
# Assuming you have calculated metrics for multiple models and stored them in variables like dt_accuracy, dt_precision, dt_recall, dt_f1 for Decision Tree,
# and similarly for other models
# Define evaluation metrics for each model
accuracy <- c(dt_accuracy, nv_accuracy, knn_accuracy, svm_accuracy)
precision <- c(dt_precision, nv_precision, knn_precision, svm_precision)
recall <- c(dt_recall, nv_recall, knn_recall, svm_recall)
f1_score <- c(dt_f1, nv_f1, knn_f1, svm_f1)
# Create a data frame for plotting
comparison_df <- data.frame(Model = rep(model_names, each = 4),
Metric = rep(c("Accuracy", "Precision", "Recall", "F1-score"), times = 4),
Value = c(accuracy, precision, recall, f1_score))
# Load ggplot2 library
library(ggplot2)
# Create grouped bar plot
comparison_plot <- ggplot(comparison_df, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = position_dodge()) +
labs(title = "Comparison of Evaluation Metrics Across Models",
x = "Model", y = "Value") +
theme_minimal() +
theme(legend.position = "top") +
scale_fill_manual(values = c("Accuracy" = "blue", "Precision" = "green", "Recall" = "red", "F1-score" = "purple"))
# Print the plot
print(comparison_plot)

print(paste("F1-score:", knn_f1))
## [1] "F1-score: 0.436619718309859"
#Machine Learning Model Training
# Set seed for reproducibility
set.seed(123)
# Create the training set
train_data <- training_data
# Create the test set
test_data <- testing_data
# Set train control
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# CART model
set.seed(7)
cart_model <- train(Churn.~., data = train_data, method = "rpart", trControl = ctrl)
# SVM model
set.seed(7)
svm_model <- train(Churn.~., data = train_data, method = "svmRadial", trControl = ctrl)
# KNN model
set.seed(7)
knn_model <- train(Churn.~., data =train_data, method = "knn", trControl = ctrl)
# Random Forest model
set.seed(7)
rf_model <- train(Churn.~., data = train_data, method = "rf", trControl = ctrl)
# Train Naive Bayes model
#set.seed(7)
#nb_model <- train(Churn.~., data = train_data, method = "nb", trControl = ctrl)
print(cart_model)
## CART
##
## 2667 samples
## 17 predictor
## 2 classes: 'False.', 'True.'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.05297158 0.9125074 0.5786001
## 0.07622739 0.8702665 0.2430818
## 0.07881137 0.8622666 0.1627132
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05297158.
print(svm_model)
## Support Vector Machines with Radial Basis Function Kernel
##
## 2667 samples
## 17 predictor
## 2 classes: 'False.', 'True.'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.8752678 0.2180105
## 0.50 0.9017619 0.4632688
## 1.00 0.9130115 0.5555887
##
## Tuning parameter 'sigma' was held constant at a value of 0.03960057
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03960057 and C = 1.
print(knn_model)
## k-Nearest Neighbors
##
## 2667 samples
## 17 predictor
## 2 classes: 'False.', 'True.'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8771385 0.3321184
## 7 0.8793890 0.3226013
## 9 0.8776421 0.2992136
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
print(rf_model)
## Random Forest
##
## 2667 samples
## 17 predictor
## 2 classes: 'False.', 'True.'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9342570 0.6733872
## 9 0.9537514 0.7957548
## 17 0.9501272 0.7794779
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 9.
#print(nb_model)
results<-resamples(list(CART=cart_model,SVM=svm_model,KNN=knn_model,RF=rf_model))
print(results)
##
## Call:
## resamples.default(x = list(CART = cart_model, SVM = svm_model, KNN =
## knn_model, RF = rf_model))
##
## Models: CART, SVM, KNN, RF
## Number of resamples: 30
## Performance metrics: Accuracy, Kappa
## Time estimates for: everything, final model fit
#summarize the results
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: CART, SVM, KNN, RF
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.8838951 0.8932584 0.9136887 0.9125074 0.9278322 0.9550562 0
## SVM 0.8801498 0.9098589 0.9136958 0.9130115 0.9176030 0.9288390 0
## KNN 0.8426966 0.8760560 0.8801498 0.8793890 0.8876404 0.8988764 0
## RF 0.9248120 0.9447073 0.9549717 0.9537514 0.9662921 0.9775281 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## CART 0.3661842 0.4752375 0.6038332 0.5786001 0.6755768 0.7935035 0
## SVM 0.3729635 0.5195870 0.5547499 0.5555887 0.5915728 0.6606462 0
## KNN 0.1286713 0.2839555 0.3293131 0.3226013 0.3776224 0.4479669 0
## RF 0.6373057 0.7530063 0.8051922 0.7957548 0.8564763 0.9059197 0
scales<-list(x=list(relation="free"), y=list(relation="free"))
bwplot(results,scales=scales)

densityplot(results, scales=scales)

dotplot(results, scales=scales)

cart.pred <- predict(cart_model, newdata = test_data)
cart.accuracy <- 1-mean(cart.pred != test_data$Churn.)
table(actual=test_data$Churn., predicted=cart.pred)
rf.pred1 <- predict(rf_model, newdata = test_data)
rf.accuracy <- 1-mean(rf.pred1 != test_data$Churn.)
table(actual=test_data$Churn., predicted=rf.pred1)
svm.pred <-predict(svm_model, newdata = test_data)
svm.accuracy <- 1-mean(svm.pred != test_data$Churn.)
table(actual=test_data$Churn., predicted=svm.pred)
knn.pred <-predict(knn_model, newdata = test_data)
knn.accuracy <- 1-mean(knn.pred != test_data$Churn.)
table(actual=test_data$Churn., predicted=knn.pred)
#nb.pred <-predict(nb_model, newdata = test_data)
#nb.accuracy <- 1-mean(nb.pred != test_data$Churn.)
#table(actual=test_data$Churn., predicted=nb.pred)
splom(results)

#difference in model prediction
diffs<-diff(results)
summary(diffs)
##
## Call:
## summary.diff.resamples(object = diffs)
##
## p-value adjustment: bonferroni
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
##
## Accuracy
## CART SVM KNN RF
## CART -0.0005041 0.0331184 -0.0412440
## SVM 1 0.0336224 -0.0407400
## KNN 9.878e-11 3.883e-12 -0.0743624
## RF 3.684e-10 < 2.2e-16 < 2.2e-16
##
## Kappa
## CART SVM KNN RF
## CART 0.02301 0.25600 -0.21715
## SVM 1 0.23299 -0.24017
## KNN 3.457e-13 3.917e-13 -0.47315
## RF 2.031e-09 < 2.2e-16 < 2.2e-16
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.