Project 4 V0

#Algorithms Implemented: Decision Trees(rpart), SVM,KNN, NB and RF 1.Implemented individually using 80-20 training-test ratio and compared the results 2.Then trained all these Models using same TrainControl and compared the results 3.A significant change in accuracy can be seen is comparison plots for both these techniques

library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(purrr)
library(printr)

## Warning: package 'printr' was built under R version 4.3.3

## Registered S3 method overwritten by 'printr':
##   method                from     
##   knit_print.data.frame rmarkdown

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(ROCR)

## Warning: package 'ROCR' was built under R version 4.3.3

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(car)

## Warning: package 'car' was built under R version 4.3.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.3.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:purrr':
## 
##     some

## The following object is masked from 'package:dplyr':
## 
##     recode

library(class)

## Warning: package 'class' was built under R version 4.3.3

library(rpart)

## Warning: package 'rpart' was built under R version 4.3.3

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.3.3

library(RColorBrewer)
library(rattle)

## Warning: package 'rattle' was built under R version 4.3.3

## Loading required package: tibble

## Loading required package: bitops

## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(e1071)

data = read.csv("D:/MS Sem 2/Data Minning/Projects/Project 4/churn.csv", stringsAsFactors=TRUE)
# show dimension, datatype, content of the data set
str(data)

## 'data.frame':    3333 obs. of  21 variables:
##  $ State         : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ Account.Length: int  128 107 137 84 75 118 121 147 117 141 ...
##  $ Area.Code     : int  415 415 415 408 415 510 510 415 408 415 ...
##  $ Phone         : Factor w/ 3333 levels "327-1058","327-1319",..: 1927 1576 1118 1708 111 2254 1048 81 292 118 ...
##  $ Int.l.Plan    : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ VMail.Plan    : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ VMail.Message : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ Day.Mins      : num  265 162 243 299 167 ...
##  $ Day.Calls     : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ Day.Charge    : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ Eve.Mins      : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ Eve.Calls     : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ Eve.Charge    : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ Night.Mins    : num  245 254 163 197 187 ...
##  $ Night.Calls   : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ Night.Charge  : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ Intl.Mins     : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ Intl.Calls    : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ Intl.Charge   : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ CustServ.Calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ Churn.        : Factor w/ 2 levels "False.","True.": 1 1 1 1 1 1 1 1 1 1 ...

head(data)

State	Account.Length	Area.Code	Phone	Int.l.Plan	VMail.Plan	VMail.Message	Day.Mins	Day.Calls	Day.Charge	Eve.Mins	Eve.Calls	Eve.Charge	Night.Mins	Night.Calls	Night.Charge	Intl.Mins	Intl.Calls	Intl.Charge	CustServ.Calls	Churn.
KS	128	415	382-4657	no	yes	25	265.1	110	45.07	197.4	99	16.78	244.7	91	11.01	10.0	3	2.70	1	False.
OH	107	415	371-7191	no	yes	26	161.6	123	27.47	195.5	103	16.62	254.4	103	11.45	13.7	3	3.70	1	False.
NJ	137	415	358-1921	no	no	0	243.4	114	41.38	121.2	110	10.30	162.6	104	7.32	12.2	5	3.29	0	False.
OH	84	408	375-9999	yes	no	0	299.4	71	50.90	61.9	88	5.26	196.9	89	8.86	6.6	7	1.78	2	False.
OK	75	415	330-6626	yes	no	0	166.7	113	28.34	148.3	122	12.61	186.9	121	8.41	10.1	3	2.73	3	False.
AL	118	510	391-8027	yes	no	0	223.4	98	37.98	220.6	101	18.75	203.9	118	9.18	6.3	6	1.70	0	False.

# detect missing value
knitr::kable(sapply(data, function(x) sum(is.na(x))), col.names = c("Missing Value Count"))

	Missing Value Count
State	0
Account.Length	0
Area.Code	0
Phone	0
Int.l.Plan	0
VMail.Plan	0
VMail.Message	0
Day.Mins	0
Day.Calls	0
Day.Charge	0
Eve.Mins	0
Eve.Calls	0
Eve.Charge	0
Night.Mins	0
Night.Calls	0
Night.Charge	0
Intl.Mins	0
Intl.Calls	0
Intl.Charge	0
CustServ.Calls	0
Churn.	0

# show summary statistics of the variables 
summary(data)

State	Account.Length	Area.Code	Phone	Int.l.Plan	VMail.Plan	VMail.Message	Day.Mins	Day.Calls	Day.Charge	Eve.Mins	Eve.Calls	Eve.Charge	Night.Mins	Night.Calls	Night.Charge	Intl.Mins	Intl.Calls	Intl.Charge	CustServ.Calls	Churn.
WV : 106	Min. : 1.0	Min. :408.0	327-1058: 1	no :3010	no :2411	Min. : 0.000	Min. : 0.0	Min. : 0.0	Min. : 0.00	Min. : 0.0	Min. : 0.0	Min. : 0.00	Min. : 23.2	Min. : 33.0	Min. : 1.040	Min. : 0.00	Min. : 0.000	Min. :0.000	Min. :0.000	False.:2850
MN : 84	1st Qu.: 74.0	1st Qu.:408.0	327-1319: 1	yes: 323	yes: 922	1st Qu.: 0.000	1st Qu.:143.7	1st Qu.: 87.0	1st Qu.:24.43	1st Qu.:166.6	1st Qu.: 87.0	1st Qu.:14.16	1st Qu.:167.0	1st Qu.: 87.0	1st Qu.: 7.520	1st Qu.: 8.50	1st Qu.: 3.000	1st Qu.:2.300	1st Qu.:1.000	True. : 483
NY : 83	Median :101.0	Median :415.0	327-3053: 1	NA	NA	Median : 0.000	Median :179.4	Median :101.0	Median :30.50	Median :201.4	Median :100.0	Median :17.12	Median :201.2	Median :100.0	Median : 9.050	Median :10.30	Median : 4.000	Median :2.780	Median :1.000	NA
AL : 80	Mean :101.1	Mean :437.2	327-3587: 1	NA	NA	Mean : 8.099	Mean :179.8	Mean :100.4	Mean :30.56	Mean :201.0	Mean :100.1	Mean :17.08	Mean :200.9	Mean :100.1	Mean : 9.039	Mean :10.24	Mean : 4.479	Mean :2.765	Mean :1.563	NA
OH : 78	3rd Qu.:127.0	3rd Qu.:510.0	327-3850: 1	NA	NA	3rd Qu.:20.000	3rd Qu.:216.4	3rd Qu.:114.0	3rd Qu.:36.79	3rd Qu.:235.3	3rd Qu.:114.0	3rd Qu.:20.00	3rd Qu.:235.3	3rd Qu.:113.0	3rd Qu.:10.590	3rd Qu.:12.10	3rd Qu.: 6.000	3rd Qu.:3.270	3rd Qu.:2.000	NA
OR : 78	Max. :243.0	Max. :510.0	327-3954: 1	NA	NA	Max. :51.000	Max. :350.8	Max. :165.0	Max. :59.64	Max. :363.7	Max. :170.0	Max. :30.91	Max. :395.0	Max. :175.0	Max. :17.770	Max. :20.00	Max. :20.000	Max. :5.400	Max. :9.000	NA
(Other):2824	NA	NA	(Other) :3327	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA

#Data Processing

#Dropping non-useful variables
data = data[, !names(data) %in% c('State', 'Phone', 'Area.Code')]

#2. Data Scaling
# Select variables for scaling
numeric_data <- as.data.frame(lapply(data, as.numeric))

# Scale the numeric variables
scaled_data <- as.data.frame(scale(numeric_data))

# Combine scaled numeric variables with non-numeric variables
#scaled_data <- cbind(Int.l.Plan = data$Int.l.Plan,VMail.Plan = data$VMail.Plan, scaled_data, Churn. = data$Churn.)

# Convert Churn. back to factor
scaled_data$Churn. <- as.factor(scaled_data$Churn.)

# Check the structure of the scaled dataset
str(scaled_data)

## 'data.frame':    3333 obs. of  18 variables:
##  $ Account.Length: num  0.676 0.149 0.902 -0.429 -0.655 ...
##  $ Int.l.Plan    : num  -0.328 -0.328 -0.328 3.052 3.052 ...
##  $ VMail.Plan    : num  1.617 1.617 -0.618 -0.618 -0.618 ...
##  $ VMail.Message : num  1.235 1.308 -0.592 -0.592 -0.592 ...
##  $ Day.Mins      : num  1.567 -0.334 1.168 2.196 -0.24 ...
##  $ Day.Calls     : num  0.477 1.124 0.676 -1.467 0.626 ...
##  $ Day.Charge    : num  1.567 -0.334 1.168 2.196 -0.24 ...
##  $ Eve.Mins      : num  -0.0706 -0.1081 -1.5731 -2.7425 -1.0388 ...
##  $ Eve.Calls     : num  -0.0559 0.1448 0.4962 -0.6081 1.0985 ...
##  $ Eve.Charge    : num  -0.0704 -0.1075 -1.5737 -2.7429 -1.0378 ...
##  $ Night.Mins    : num  0.8666 1.0584 -0.7568 -0.0785 -0.2763 ...
##  $ Night.Calls   : num  -0.465 0.148 0.199 -0.568 1.068 ...
##  $ Night.Charge  : num  0.8659 1.0592 -0.7555 -0.0788 -0.2765 ...
##  $ Intl.Mins     : num  -0.085 1.2403 0.703 -1.3028 -0.0492 ...
##  $ Intl.Calls    : num  -0.601 -0.601 0.212 1.024 -0.601 ...
##  $ Intl.Charge   : num  -0.0857 1.241 0.6971 -1.3062 -0.0459 ...
##  $ CustServ.Calls: num  -0.428 -0.428 -1.188 0.332 1.092 ...
##  $ Churn.        : Factor w/ 2 levels "-0.411610054566873",..: 1 1 1 1 1 1 1 1 1 1 ...

#Check the summary of the scaled dataset
summary(scaled_data)

Account.Length	Int.l.Plan	VMail.Plan	VMail.Message	Day.Mins	Day.Calls	Day.Charge	Eve.Mins	Eve.Calls	Eve.Charge	Night.Mins	Night.Calls	Night.Charge	Intl.Mins	Intl.Calls	Intl.Charge	CustServ.Calls	Churn.
Min. :-2.512795	Min. :-0.3275	Min. :-0.6183	Min. :-0.5917	Min. :-3.300601	Min. :-5.00450	Min. :-3.300667	Min. :-3.963027	Min. :-5.025157	Min. :-3.963085	Min. :-3.513121	Min. :-3.429355	Min. :-3.514838	Min. :-3.66686	Min. :-1.8200	Min. :-3.66766	Min. :-1.1880	-0.411610054566873:2850
1st Qu.:-0.679643	1st Qu.:-0.3275	1st Qu.:-0.6183	1st Qu.:-0.5917	1st Qu.:-0.662325	1st Qu.:-0.66947	1st Qu.:-0.662277	1st Qu.:-0.677928	1st Qu.:-0.658262	1st Qu.:-0.678211	1st Qu.:-0.669754	1st Qu.:-0.669834	1st Qu.:-0.667579	1st Qu.:-0.62228	1st Qu.:-0.6011	1st Qu.:-0.61634	1st Qu.:-0.4279	2.42875498036354 : 483
Median :-0.001627	Median :-0.3275	Median :-0.6183	Median :-0.5917	Median :-0.006887	Median : 0.02812	Median :-0.006729	Median : 0.008275	Median :-0.005738	Median : 0.008458	Median : 0.006485	Median :-0.005504	Median : 0.004691	Median : 0.02246	Median :-0.1948	Median : 0.02046	Median :-0.4279	NA
Mean : 0.000000	Mean : 0.0000	Mean : 0.0000	Mean : 0.0000	Mean : 0.000000	Mean : 0.00000	Mean : 0.000000	Mean : 0.000000	Mean : 0.000000	Mean : 0.000000	Mean : 0.000000	Mean : 0.000000	Mean : 0.000000	Mean : 0.00000	Mean : 0.0000	Mean : 0.00000	Mean : 0.0000	NA
3rd Qu.: 0.651276	3rd Qu.:-0.3275	3rd Qu.: 1.6168	3rd Qu.: 0.8694	3rd Qu.: 0.672419	3rd Qu.: 0.67588	3rd Qu.: 0.672578	3rd Qu.: 0.676731	3rd Qu.: 0.696981	3rd Qu.: 0.676568	3rd Qu.: 0.680746	3rd Qu.: 0.658825	3rd Qu.: 0.681354	3rd Qu.: 0.66720	3rd Qu.: 0.6178	3rd Qu.: 0.67052	3rd Qu.: 0.3323	NA
Max. : 3.564231	Max. : 3.0522	Max. : 1.6168	Max. : 3.1341	Max. : 3.139950	Max. : 3.21711	Max. : 3.140331	Max. : 3.208584	Max. : 3.507855	Max. : 3.207498	Max. : 3.838505	Max. : 3.827165	Max. : 3.836188	Max. : 3.49687	Max. : 6.3061	Max. : 3.49630	Max. : 5.6535	NA

# data encoding to numeric values
#data_n <- as.data.frame(lapply(scaled_data, as.numeric))
#str(data_n)
#summary(data_n$Churn.)



#Machine Learning - Classification

#To train a classification model, there is mainly three steps:
# 1. Splitting Data into Training and Testing Set
#2. Model Training/ Tuning
#3. Model Testing

set.seed(1234)
trainIndex <- createDataPartition(scaled_data$Churn., p = 0.8, list = FALSE, times = 1)
training_data <- data[ trainIndex,]
testing_data  <- data[-trainIndex,]

# Check if the splitting process is correct
prop.table(table(training_data$Churn.))

False.	True.
0.8548931	0.1451069

prop.table(table(testing_data$Churn.))

False.	True.
0.8558559	0.1441441

# 1: Decision Tree

Dtree = rpart(Churn.~ Account.Length + Int.l.Plan + VMail.Plan + VMail.Message +  Day.Mins+Day.Calls +Day.Charge + Eve.Mins +Eve.Calls+Eve.Charge +Night.Mins+Night.Calls +Night.Charge+Intl.Mins+Intl.Calls+Intl.Charge +CustServ.Calls+Churn., data = training_data, method = "class")

## Warning in model.matrix.default(attr(frame, "terms"), frame): the response
## appeared on the right-hand side and was dropped

## Warning in model.matrix.default(attr(frame, "terms"), frame): problem with term
## 18 in model.matrix: no columns are assigned

## Warning in cats * !isord: longer object length is not a multiple of shorter
## object length

printcp(Dtree)

## 
## Classification tree:
## rpart(formula = Churn. ~ Account.Length + Int.l.Plan + VMail.Plan + 
##     VMail.Message + Day.Mins + Day.Calls + Day.Charge + Eve.Mins + 
##     Eve.Calls + Eve.Charge + Night.Mins + Night.Calls + Night.Charge + 
##     Intl.Mins + Intl.Calls + Intl.Charge + CustServ.Calls + Churn., 
##     data = training_data, method = "class")
## 
## Variables actually used in tree construction:
## [1] CustServ.Calls Day.Mins       Eve.Mins       Int.l.Plan     Intl.Calls    
## [6] Intl.Mins      Night.Mins     VMail.Plan    
## 
## Root node error: 387/2667 = 0.14511
## 
## n= 2667 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.078811      0   1.00000 1.00000 0.047000
## 2 0.076227      2   0.84238 0.88889 0.044728
## 3 0.052972      4   0.68992 0.70543 0.040450
## 4 0.021964      7   0.49354 0.50904 0.034903
## 5 0.020672      9   0.44961 0.50129 0.034657
## 6 0.019811     10   0.42894 0.50129 0.034657
## 7 0.015504     13   0.36951 0.45478 0.033130
## 8 0.012920     14   0.35401 0.42894 0.032240
## 9 0.010000     15   0.34109 0.43152 0.032330

fancyRpartPlot(Dtree)

# Plot Full Tree
prp(Dtree, type = 1, extra = 1, under = TRUE, split.font = 2, varlen = 0)

#Find the best pruned Decision Tree by selecting the tree that is having least cross validation error
set.seed(12345)
cv.ct <- rpart(Churn. ~ Account.Length + Int.l.Plan + VMail.Plan + VMail.Message +  Day.Mins+Day.Calls +Day.Charge + Eve.Mins +Eve.Calls+Eve.Charge +Night.Mins+Night.Calls +Night.Charge+Intl.Mins+Intl.Calls+Intl.Charge +CustServ.Calls+Churn., data = training_data, method = "class",  cp = 0.00001, minsplit = 5, xval = 5)

## Warning in model.matrix.default(attr(frame, "terms"), frame): the response
## appeared on the right-hand side and was dropped

## Warning in model.matrix.default(attr(frame, "terms"), frame): problem with term
## 18 in model.matrix: no columns are assigned

## Warning in cats * !isord: longer object length is not a multiple of shorter
## object length

printcp(cv.ct)

## 
## Classification tree:
## rpart(formula = Churn. ~ Account.Length + Int.l.Plan + VMail.Plan + 
##     VMail.Message + Day.Mins + Day.Calls + Day.Charge + Eve.Mins + 
##     Eve.Calls + Eve.Charge + Night.Mins + Night.Calls + Night.Charge + 
##     Intl.Mins + Intl.Calls + Intl.Charge + CustServ.Calls + Churn., 
##     data = training_data, method = "class", cp = 1e-05, minsplit = 5, 
##     xval = 5)
## 
## Variables actually used in tree construction:
##  [1] Account.Length CustServ.Calls Day.Calls      Day.Mins       Eve.Calls     
##  [6] Eve.Mins       Int.l.Plan     Intl.Calls     Intl.Mins      Night.Calls   
## [11] Night.Mins     VMail.Message  VMail.Plan    
## 
## Root node error: 387/2667 = 0.14511
## 
## n= 2667 
## 
##            CP nsplit rel error  xerror     xstd
## 1  0.07881137      0   1.00000 1.00000 0.047000
## 2  0.07622739      2   0.84238 0.92765 0.045545
## 3  0.05297158      4   0.68992 0.77778 0.042225
## 4  0.02196382      7   0.49354 0.52713 0.035467
## 5  0.02067183      9   0.44961 0.48837 0.034242
## 6  0.01981051     10   0.42894 0.47804 0.033905
## 7  0.01550388     13   0.36951 0.44961 0.032954
## 8  0.01291990     14   0.35401 0.42377 0.032058
## 9  0.01033592     15   0.34109 0.42894 0.032240
## 10 0.00904393     17   0.32041 0.38760 0.030744
## 11 0.00516796     21   0.28424 0.38760 0.030744
## 12 0.00387597     26   0.25840 0.38501 0.030648
## 13 0.00258398     32   0.23514 0.40827 0.031503
## 14 0.00206718     42   0.20930 0.41085 0.031597
## 15 0.00172265     47   0.19897 0.41602 0.031782
## 16 0.00129199     50   0.19380 0.41860 0.031874
## 17 0.00119261     59   0.18088 0.44961 0.032954
## 18 0.00086133     82   0.14987 0.47028 0.033649
## 19 0.00001000     85   0.14729 0.49096 0.034326

# Prune by lowest cp
prune_dt <- prune(cv.ct,cp=cv.ct$cptable[which.min(cv.ct$cptable[,"xerror"]),"CP"])
predict_dt <- predict(prune_dt, testing_data,type="class") 
length(prune_dt$frame$var[prune_dt$frame$var == "<leaf>"])

## [1] 27

prp(prune_dt, type = 1, extra = 1, split.font = 1, varlen = -10)

# Get row indices of testing_data$Churn. 
indices <- which(!is.na(testing_data$Churn.))

# Subset predict_dt using the row indices
predict_dt_subset <- predict_dt[indices]

# Calculate confusion matrix
cm_dt <- confusionMatrix(as.factor(testing_data$Churn.[indices]), as.factor(predict_dt_subset), positive='True.')
cm_dt

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False. True.
##     False.    559    11
##     True.      20    76
##                                           
##                Accuracy : 0.9535          
##                  95% CI : (0.9346, 0.9682)
##     No Information Rate : 0.8694          
##     P-Value [Acc > NIR] : 2.91e-13        
##                                           
##                   Kappa : 0.8037          
##                                           
##  Mcnemar's Test P-Value : 0.1508          
##                                           
##             Sensitivity : 0.8736          
##             Specificity : 0.9655          
##          Pos Pred Value : 0.7917          
##          Neg Pred Value : 0.9807          
##              Prevalence : 0.1306          
##          Detection Rate : 0.1141          
##    Detection Prevalence : 0.1441          
##       Balanced Accuracy : 0.9195          
##                                           
##        'Positive' Class : True.           
##

# Decision Tree Result
pred_dt <- predict(prune_dt, newdata= testing_data,type = "prob")[, 2]
Pred_val = prediction(pred_dt, testing_data$Churn.) 
plot(performance(Pred_val, "tpr", "fpr"),colorize=TRUE)
abline(0, 1, lty = 2)
auc_train <- round(as.numeric(performance(Pred_val, "auc")@y.values),2)
legend(.8, .2, auc_train, title = "AUC", cex=1)

#2: Support Vector Machine

library(e1071) 
library(ISLR)

## Warning: package 'ISLR' was built under R version 4.3.3

learn_svm <- svm(factor(Churn.)~.,data=training_data) 
predict_svm <- predict(learn_svm, testing_data,type ="response") 

#SVM Results
# Get row indices of testing_data$Churn. 
indices <- which(!is.na(testing_data$Churn.))

# Subset predict_dt using the row indices
predict_svm_subset <- predict_svm[indices]

# Calculate confusion matrix
cm_svm <- confusionMatrix(as.factor(testing_data$Churn.[indices]), as.factor(predict_svm_subset), positive='True.')
cm_svm

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction False. True.
##     False.    567     3
##     True.      42    54
##                                           
##                Accuracy : 0.9324          
##                  95% CI : (0.9106, 0.9503)
##     No Information Rate : 0.9144          
##     P-Value [Acc > NIR] : 0.05196         
##                                           
##                   Kappa : 0.6705          
##                                           
##  Mcnemar's Test P-Value : 1.473e-08       
##                                           
##             Sensitivity : 0.94737         
##             Specificity : 0.93103         
##          Pos Pred Value : 0.56250         
##          Neg Pred Value : 0.99474         
##              Prevalence : 0.08559         
##          Detection Rate : 0.08108         
##    Detection Prevalence : 0.14414         
##       Balanced Accuracy : 0.93920         
##                                           
##        'Positive' Class : True.           
##

pred_ROCR  <- prediction(as.numeric(predict_svm), as.numeric(testing_data$Churn.))
roc_ROCR <- performance(pred_ROCR, measure = "tpr", x.measure = "fpr")
auc_train <- round(as.numeric(performance(pred_ROCR, "auc")@y.values),2)
plot(roc_ROCR, main = "ROC curve", colorize = T)
abline(a = 0, b = 1)
legend(.8, .2, auc_train, title = "AUC", cex=1)

# 3: Naive Bayes

# Train Naive Bayes model
learn_nb <- naiveBayes(factor(Churn.) ~ ., data = training_data)

# Predict using Naive Bayes model
predict_nb <- predict(learn_nb, testing_data, type = "class")

# Convert predicted values to factor with levels matching the actual data
predict_nb_factor <- factor(predict_nb, levels = c("True.", "False."))

# Convert actual data to factor with levels matching the predicted values
actual_data_factor <- factor(testing_data$Churn., levels = c("True.", "False."))

# Create confusion matrix for Naive Bayes
cm_nb <- confusionMatrix(predict_nb_factor, actual_data_factor, positive = 'True.')

# Display Naive Bayes confusion matrix
cm_nb

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction True. False.
##     True.     40     21
##     False.    56    549
##                                           
##                Accuracy : 0.8844          
##                  95% CI : (0.8576, 0.9077)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 0.0184448       
##                                           
##                   Kappa : 0.4477          
##                                           
##  Mcnemar's Test P-Value : 0.0001068       
##                                           
##             Sensitivity : 0.41667         
##             Specificity : 0.96316         
##          Pos Pred Value : 0.65574         
##          Neg Pred Value : 0.90744         
##              Prevalence : 0.14414         
##          Detection Rate : 0.06006         
##    Detection Prevalence : 0.09159         
##       Balanced Accuracy : 0.68991         
##                                           
##        'Positive' Class : True.           
##

# Calculate ROC curve for Naive Bayes

pred_nb <- predict(learn_nb, newdata = testing_data, type = "raw")
pred_nb_positive <- as.numeric(pred_nb[, 1])
Pred_val_nb <- prediction(pred_nb_positive, as.numeric(testing_data$Churn.))
perf_nb <- performance(Pred_val_nb, "tpr", "fpr")
plot(perf_nb, colorize = TRUE, main = "ROC curve for Naive Bayes")
abline(0, 1, lty = 2)
auc_nb <- round(as.numeric(performance(Pred_val_nb, "auc")@y.values), 2)
legend(.8, .2, auc_nb, title = "AUC", cex = 1)

# 4: K-Nearest Neighbors (KNN)
# Train KNN model
training_data_n<- as.data.frame(lapply(training_data, as.numeric))
str(training_data_n)

## 'data.frame':    2667 obs. of  18 variables:
##  $ Account.Length: num  128 107 84 75 118 121 147 117 141 65 ...
##  $ Int.l.Plan    : num  1 1 2 2 2 1 2 1 2 1 ...
##  $ VMail.Plan    : num  2 2 1 1 1 2 1 1 2 1 ...
##  $ VMail.Message : num  25 26 0 0 0 24 0 0 37 0 ...
##  $ Day.Mins      : num  265 162 299 167 223 ...
##  $ Day.Calls     : num  110 123 71 113 98 88 79 97 84 137 ...
##  $ Day.Charge    : num  45.1 27.5 50.9 28.3 38 ...
##  $ Eve.Mins      : num  197.4 195.5 61.9 148.3 220.6 ...
##  $ Eve.Calls     : num  99 103 88 122 101 108 94 80 111 83 ...
##  $ Eve.Charge    : num  16.78 16.62 5.26 12.61 18.75 ...
##  $ Night.Mins    : num  245 254 197 187 204 ...
##  $ Night.Calls   : num  91 103 89 121 118 118 96 90 97 111 ...
##  $ Night.Charge  : num  11.01 11.45 8.86 8.41 9.18 ...
##  $ Intl.Mins     : num  10 13.7 6.6 10.1 6.3 7.5 7.1 8.7 11.2 12.7 ...
##  $ Intl.Calls    : num  3 3 7 3 6 7 6 4 5 6 ...
##  $ Intl.Charge   : num  2.7 3.7 1.78 2.73 1.7 2.03 1.92 2.35 3.02 3.43 ...
##  $ CustServ.Calls: num  1 1 2 3 0 3 0 1 0 4 ...
##  $ Churn.        : num  1 1 1 1 1 1 1 1 1 2 ...

testing_data_n<- as.data.frame(lapply(testing_data, as.numeric))
str(testing_data_n)

## 'data.frame':    666 obs. of  18 variables:
##  $ Account.Length: num  137 62 73 20 12 119 97 81 125 174 ...
##  $ Int.l.Plan    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ VMail.Plan    : num  1 1 1 1 1 1 2 1 1 1 ...
##  $ VMail.Message : num  0 0 0 0 0 0 24 0 0 0 ...
##  $ Day.Mins      : num  243 121 224 190 250 ...
##  $ Day.Calls     : num  114 70 90 109 118 114 135 67 103 97 ...
##  $ Day.Charge    : num  41.4 20.5 38.1 32.3 42.4 ...
##  $ Eve.Mins      : num  121 307 160 258 252 ...
##  $ Eve.Calls     : num  110 76 88 84 119 117 58 85 126 94 ...
##  $ Eve.Charge    : num  10.3 26.1 13.6 21.9 21.4 ...
##  $ Night.Mins    : num  163 203 193 182 280 ...
##  $ Night.Calls   : num  104 99 74 102 90 91 79 98 95 54 ...
##  $ Night.Charge  : num  7.32 9.14 8.68 8.17 12.61 ...
##  $ Intl.Mins     : num  12.2 13.1 13 6.3 11.8 8.8 11 10.2 12 11.4 ...
##  $ Intl.Calls    : num  5 6 2 6 3 3 3 3 8 4 ...
##  $ Intl.Charge   : num  3.29 3.54 3.51 1.7 3.19 2.38 2.97 2.75 3.24 3.08 ...
##  $ CustServ.Calls: num  0 4 1 0 1 5 1 1 1 1 ...
##  $ Churn.        : num  1 1 1 1 2 2 1 1 1 1 ...

# Ensure Churn. is a factor in both training and testing datasets
training_data_n$Churn. <- as.factor(training_data_n$Churn.)
testing_data_n$Churn. <- as.factor(testing_data_n$Churn.)
dim(training_data_n)

## [1] 2667   18

dim(testing_data_n)

## [1] 666  18

# Train the KNN model with k = 5
learn_knn <- knn(train = training_data_n[,-18], 
                 test = testing_data_n[,-18], 
                 cl = training_data_n$Churn., 
                 k = 5)

# Check levels of 'Churn.' variable in testing_data_n
levels(testing_data_n$Churn.)

## [1] "1" "2"

# Check levels of 'Churn.' variable in learn_knn
levels(learn_knn)

## [1] "1" "2"

# Calculate confusion matrix for KNN
cm_knn <- confusionMatrix(learn_knn, testing_data_n$Churn., positive = '2')

# Display KNN confusion matrix
cm_knn

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2
##          1 555  65
##          2  15  31
##                                           
##                Accuracy : 0.8799          
##                  95% CI : (0.8527, 0.9036)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 0.04114         
##                                           
##                   Kappa : 0.3786          
##                                           
##  Mcnemar's Test P-Value : 4.293e-08       
##                                           
##             Sensitivity : 0.32292         
##             Specificity : 0.97368         
##          Pos Pred Value : 0.67391         
##          Neg Pred Value : 0.89516         
##              Prevalence : 0.14414         
##          Detection Rate : 0.04655         
##    Detection Prevalence : 0.06907         
##       Balanced Accuracy : 0.64830         
##                                           
##        'Positive' Class : 2               
##

# Calculate ROC curve for KNN
pred_knn <- knn(train = training_data_n[, -ncol(training_data_n)], 
                test = testing_data_n[, -ncol(testing_data_n)], 
                cl = training_data_n$Churn., 
                k = 5, prob = TRUE)

# Extract predicted probabilities for the positive class ("True.")
pred_probs <- as.numeric(pred_knn)

# Create the prediction object
Pred_val_knn <- prediction(pred_probs, testing_data_n$Churn.)

plot(performance(Pred_val_knn, "tpr", "fpr"), colorize = TRUE)
abline(0, 1, lty = 2)
auc_knn <- round(as.numeric(performance(Pred_val_knn, "auc")@y.values), 2)
legend(.8, .2, auc_knn, title = "AUC", cex = 1)

# Results
str(cm_svm)

## List of 6
##  $ positive: chr "True."
##  $ table   : 'table' int [1:2, 1:2] 567 42 3 54
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ Prediction: chr [1:2] "False." "True."
##   .. ..$ Reference : chr [1:2] "False." "True."
##  $ overall : Named num [1:7] 0.932 0.67 0.911 0.95 0.914 ...
##   ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
##  $ byClass : Named num [1:11] 0.947 0.931 0.562 0.995 0.562 ...
##   ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
##  $ mode    : chr "sens_spec"
##  $ dots    : list()
##  - attr(*, "class")= chr "confusionMatrix"

str(cm_dt)

## List of 6
##  $ positive: chr "True."
##  $ table   : 'table' int [1:2, 1:2] 559 20 11 76
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ Prediction: chr [1:2] "False." "True."
##   .. ..$ Reference : chr [1:2] "False." "True."
##  $ overall : Named num [1:7] 0.953 0.804 0.935 0.968 0.869 ...
##   ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
##  $ byClass : Named num [1:11] 0.874 0.965 0.792 0.981 0.792 ...
##   ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
##  $ mode    : chr "sens_spec"
##  $ dots    : list()
##  - attr(*, "class")= chr "confusionMatrix"

str(cm_nb)

## List of 6
##  $ positive: chr "True."
##  $ table   : 'table' int [1:2, 1:2] 40 56 21 549
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ Prediction: chr [1:2] "True." "False."
##   .. ..$ Reference : chr [1:2] "True." "False."
##  $ overall : Named num [1:7] 0.884 0.448 0.858 0.908 0.856 ...
##   ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
##  $ byClass : Named num [1:11] 0.417 0.963 0.656 0.907 0.656 ...
##   ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
##  $ mode    : chr "sens_spec"
##  $ dots    : list()
##  - attr(*, "class")= chr "confusionMatrix"

str(cm_knn)

## List of 6
##  $ positive: chr "2"
##  $ table   : 'table' int [1:2, 1:2] 555 15 65 31
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ Prediction: chr [1:2] "1" "2"
##   .. ..$ Reference : chr [1:2] "1" "2"
##  $ overall : Named num [1:7] 0.88 0.379 0.853 0.904 0.856 ...
##   ..- attr(*, "names")= chr [1:7] "Accuracy" "Kappa" "AccuracyLower" "AccuracyUpper" ...
##  $ byClass : Named num [1:11] 0.323 0.974 0.674 0.895 0.674 ...
##   ..- attr(*, "names")= chr [1:11] "Sensitivity" "Specificity" "Pos Pred Value" "Neg Pred Value" ...
##  $ mode    : chr "sens_spec"
##  $ dots    : list()
##  - attr(*, "class")= chr "confusionMatrix"

# Accuracy
svm_accuracy <- cm_svm$overall['Accuracy']
dt_accuracy <- cm_dt$overall['Accuracy']
nv_accuracy <- cm_nb$overall['Accuracy']
knn_accuracy <- cm_knn$overall['Accuracy']

# Precision
svm_precision <- cm_svm$byClass['Pos Pred Value']
dt_precision <- cm_dt$byClass['Pos Pred Value']
nv_precision <- cm_nb$byClass['Pos Pred Value']
knn_precision <- cm_knn$byClass['Pos Pred Value']

# Recall
svm_recall <- cm_svm$byClass['Sensitivity']
dt_recall <- cm_dt$byClass['Sensitivity']
nv_recall <- cm_nb$byClass['Sensitivity']
knn_recall <- cm_knn$byClass['Sensitivity']

# F1-score
svm_f1 <- cm_svm$byClass['F1']
dt_f1 <- cm_dt$byClass['F1']
nv_f1 <- cm_nb$byClass['F1']
knn_f1 <- cm_knn$byClass['F1']

# Print evaluation metrics
print("SVM Evaluation Metrics:")

## [1] "SVM Evaluation Metrics:"

print(paste("Accuracy:", svm_accuracy))

## [1] "Accuracy: 0.932432432432432"

print(paste("Precision:", svm_precision))

## [1] "Precision: 0.5625"

print(paste("Recall:", svm_recall))

## [1] "Recall: 0.947368421052632"

print(paste("F1-score:", svm_f1))

## [1] "F1-score: 0.705882352941177"

print("Decision Tree Evaluation Metrics:")

## [1] "Decision Tree Evaluation Metrics:"

print(paste("Accuracy:", dt_accuracy))

## [1] "Accuracy: 0.953453453453453"

print(paste("Precision:", dt_precision))

## [1] "Precision: 0.791666666666666"

print(paste("Recall:", dt_recall))

## [1] "Recall: 0.873563218390805"

print(paste("F1-score:", dt_f1))

## [1] "F1-score: 0.830601092896175"

print("Naive Bayes Evaluation Metrics:")

## [1] "Naive Bayes Evaluation Metrics:"

print(paste("Accuracy:", nv_accuracy))

## [1] "Accuracy: 0.884384384384384"

print(paste("Precision:", nv_precision))

## [1] "Precision: 0.655737704918033"

print(paste("Recall:", nv_recall))

## [1] "Recall: 0.416666666666667"

print(paste("F1-score:", nv_f1))

## [1] "F1-score: 0.509554140127389"

print("KNN Evaluation Metrics:")

## [1] "KNN Evaluation Metrics:"

print(paste("Accuracy:", knn_accuracy))

## [1] "Accuracy: 0.87987987987988"

print(paste("Precision:", knn_precision))

## [1] "Precision: 0.673913043478261"

print(paste("Recall:", knn_recall))

## [1] "Recall: 0.322916666666667"

# Define model names
model_names <- c("Decision Tree", "Naive Bayes", "KNN", "SVM")

# Assuming you have calculated metrics for multiple models and stored them in variables like dt_accuracy, dt_precision, dt_recall, dt_f1 for Decision Tree,
# and similarly for other models

# Define evaluation metrics for each model
accuracy <- c(dt_accuracy, nv_accuracy, knn_accuracy, svm_accuracy)
precision <- c(dt_precision, nv_precision, knn_precision, svm_precision)
recall <- c(dt_recall, nv_recall, knn_recall, svm_recall)
f1_score <- c(dt_f1, nv_f1, knn_f1, svm_f1)

# Create a data frame for plotting
comparison_df <- data.frame(Model = rep(model_names, each = 4),
                            Metric = rep(c("Accuracy", "Precision", "Recall", "F1-score"), times = 4),
                            Value = c(accuracy, precision, recall, f1_score))


# Load ggplot2 library
library(ggplot2)

# Create grouped bar plot
comparison_plot <- ggplot(comparison_df, aes(x = Model, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(title = "Comparison of Evaluation Metrics Across Models",
       x = "Model", y = "Value") +
  theme_minimal() +
  theme(legend.position = "top") +
  scale_fill_manual(values = c("Accuracy" = "blue", "Precision" = "green", "Recall" = "red", "F1-score" = "purple"))

# Print the plot
print(comparison_plot)

print(paste("F1-score:", knn_f1))

## [1] "F1-score: 0.436619718309859"

#Machine Learning Model Training

# Set seed for reproducibility
set.seed(123)

# Create the training set
train_data <- training_data

# Create the test set
test_data <- testing_data

# Set train control
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

# CART model
set.seed(7)
cart_model <- train(Churn.~., data = train_data, method = "rpart", trControl = ctrl)

# SVM model
set.seed(7)
svm_model <- train(Churn.~., data = train_data, method = "svmRadial", trControl = ctrl)

# KNN model
set.seed(7)
knn_model <- train(Churn.~., data =train_data, method = "knn", trControl = ctrl)

# Random Forest model
set.seed(7)
rf_model <- train(Churn.~., data = train_data, method = "rf", trControl = ctrl)

# Train Naive Bayes model
#set.seed(7)
#nb_model <- train(Churn.~., data = train_data, method = "nb", trControl = ctrl)



print(cart_model)

## CART 
## 
## 2667 samples
##   17 predictor
##    2 classes: 'False.', 'True.' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.05297158  0.9125074  0.5786001
##   0.07622739  0.8702665  0.2430818
##   0.07881137  0.8622666  0.1627132
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05297158.

print(svm_model)

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 2667 samples
##   17 predictor
##    2 classes: 'False.', 'True.' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ... 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa    
##   0.25  0.8752678  0.2180105
##   0.50  0.9017619  0.4632688
##   1.00  0.9130115  0.5555887
## 
## Tuning parameter 'sigma' was held constant at a value of 0.03960057
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.03960057 and C = 1.

print(knn_model)

## k-Nearest Neighbors 
## 
## 2667 samples
##   17 predictor
##    2 classes: 'False.', 'True.' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ... 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.8771385  0.3321184
##   7  0.8793890  0.3226013
##   9  0.8776421  0.2992136
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.

print(rf_model)

## Random Forest 
## 
## 2667 samples
##   17 predictor
##    2 classes: 'False.', 'True.' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2400, 2401, 2401, 2400, 2401, 2400, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9342570  0.6733872
##    9    0.9537514  0.7957548
##   17    0.9501272  0.7794779
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 9.

#print(nb_model)




results<-resamples(list(CART=cart_model,SVM=svm_model,KNN=knn_model,RF=rf_model))



print(results)

## 
## Call:
## resamples.default(x = list(CART = cart_model, SVM = svm_model, KNN =
##  knn_model, RF = rf_model))
## 
## Models: CART, SVM, KNN, RF 
## Number of resamples: 30 
## Performance metrics: Accuracy, Kappa 
## Time estimates for: everything, final model fit

#summarize the results
summary(results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: CART, SVM, KNN, RF 
## Number of resamples: 30 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## CART 0.8838951 0.8932584 0.9136887 0.9125074 0.9278322 0.9550562    0
## SVM  0.8801498 0.9098589 0.9136958 0.9130115 0.9176030 0.9288390    0
## KNN  0.8426966 0.8760560 0.8801498 0.8793890 0.8876404 0.8988764    0
## RF   0.9248120 0.9447073 0.9549717 0.9537514 0.9662921 0.9775281    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## CART 0.3661842 0.4752375 0.6038332 0.5786001 0.6755768 0.7935035    0
## SVM  0.3729635 0.5195870 0.5547499 0.5555887 0.5915728 0.6606462    0
## KNN  0.1286713 0.2839555 0.3293131 0.3226013 0.3776224 0.4479669    0
## RF   0.6373057 0.7530063 0.8051922 0.7957548 0.8564763 0.9059197    0

scales<-list(x=list(relation="free"), y=list(relation="free"))
bwplot(results,scales=scales)

densityplot(results, scales=scales)

dotplot(results, scales=scales)

cart.pred <- predict(cart_model, newdata = test_data)
cart.accuracy <- 1-mean(cart.pred != test_data$Churn.)
table(actual=test_data$Churn., predicted=cart.pred)

actual/predicted	False.	True.
False.	565	5
True.	51	45

rf.pred1 <- predict(rf_model, newdata = test_data)
rf.accuracy <- 1-mean(rf.pred1 != test_data$Churn.)
table(actual=test_data$Churn., predicted=rf.pred1)

actual/predicted	False.	True.
False.	564	6
True.	20	76

svm.pred <-predict(svm_model, newdata = test_data)
svm.accuracy <- 1-mean(svm.pred != test_data$Churn.)
table(actual=test_data$Churn., predicted=svm.pred)

actual/predicted	False.	True.
False.	566	4
True.	44	52

knn.pred <-predict(knn_model, newdata = test_data)
knn.accuracy <- 1-mean(knn.pred != test_data$Churn.)
table(actual=test_data$Churn., predicted=knn.pred)

actual/predicted	False.	True.
False.	561	9
True.	67	29

#nb.pred <-predict(nb_model, newdata = test_data)
#nb.accuracy <- 1-mean(nb.pred != test_data$Churn.)
#table(actual=test_data$Churn., predicted=nb.pred)

splom(results)

#difference in model prediction
diffs<-diff(results)

summary(diffs)

## 
## Call:
## summary.diff.resamples(object = diffs)
## 
## p-value adjustment: bonferroni 
## Upper diagonal: estimates of the difference
## Lower diagonal: p-value for H0: difference = 0
## 
## Accuracy 
##      CART      SVM        KNN        RF        
## CART           -0.0005041  0.0331184 -0.0412440
## SVM  1                     0.0336224 -0.0407400
## KNN  9.878e-11 3.883e-12             -0.0743624
## RF   3.684e-10 < 2.2e-16  < 2.2e-16            
## 
## Kappa 
##      CART      SVM       KNN       RF      
## CART            0.02301   0.25600  -0.21715
## SVM  1                    0.23299  -0.24017
## KNN  3.457e-13 3.917e-13           -0.47315
## RF   2.031e-09 < 2.2e-16 < 2.2e-16

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Project 4 V0

Tooba Maryam

2024-05-07