library(plyr)
## Warning: package 'plyr' was built under R version 4.0.5
library(corrplot)
## corrplot 0.90 loaded
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.5
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.5
library(caret)
## Warning: package 'caret' was built under R version 4.0.5
## Loading required package: lattice
library(MASS)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(party)
## Warning: package 'party' was built under R version 4.0.5
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.0.5
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:plyr':
##
## empty
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.0.5
churn <- read.csv('churn.csv')
str(churn)
## 'data.frame': 3333 obs. of 22 variables:
## $ Sl.No. : int 1 2 3 4 5 6 7 8 9 10 ...
## $ State : chr "KS" "OH" "NJ" "OH" ...
## $ Account.Length: int 128 107 137 84 75 118 121 147 117 141 ...
## $ Area.Code : int 415 415 415 408 415 510 510 415 408 415 ...
## $ Phone : chr "382-4657" "371-7191" "358-1921" "375-9999" ...
## $ Int.l.Plan : chr "no" "no" "no" "yes" ...
## $ VMail.Plan : chr "yes" "yes" "no" "no" ...
## $ VMail.Message : int 25 26 0 0 0 0 24 0 0 37 ...
## $ Day.Mins : num 265 162 243 299 167 ...
## $ Day.Calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ Day.Charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ Eve.Mins : num 197.4 195.5 121.2 61.9 148.3 ...
## $ Eve.Calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ Eve.Charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ Night.Mins : num 245 254 163 197 187 ...
## $ Night.Calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ Night.Charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ Intl.Mins : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ Intl.Calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ Intl.Charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ CustServ.Calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ Churn : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
sapply(churn, function(x) sum(is.na(x)))
## Sl.No. State Account.Length Area.Code Phone
## 0 0 0 0 0
## Int.l.Plan VMail.Plan VMail.Message Day.Mins Day.Calls
## 0 0 0 0 0
## Day.Charge Eve.Mins Eve.Calls Eve.Charge Night.Mins
## 0 0 0 0 0
## Night.Calls Night.Charge Intl.Mins Intl.Calls Intl.Charge
## 0 0 0 0 0
## CustServ.Calls Churn
## 0 0
range(churn$Account.Length)
## [1] 1 243
range(churn$CustServ.Calls)
## [1] 0 9
range(churn$Intl.Calls)
## [1] 0 20
group_Account.Length <- function(Account.Length){
if (Account.Length >= 0 & Account.Length <= 120){
return('0-120 group')
}else if (Account.Length > 120){
return('120 and above')
}
}
churn$Account.Length.group <- sapply(churn$Account.Length, group_Account.Length)
churn$Account.Length.group <- as.factor(churn$Account.Length.group)
str(churn)
## 'data.frame': 3333 obs. of 23 variables:
## $ Sl.No. : int 1 2 3 4 5 6 7 8 9 10 ...
## $ State : chr "KS" "OH" "NJ" "OH" ...
## $ Account.Length : int 128 107 137 84 75 118 121 147 117 141 ...
## $ Area.Code : int 415 415 415 408 415 510 510 415 408 415 ...
## $ Phone : chr "382-4657" "371-7191" "358-1921" "375-9999" ...
## $ Int.l.Plan : chr "no" "no" "no" "yes" ...
## $ VMail.Plan : chr "yes" "yes" "no" "no" ...
## $ VMail.Message : int 25 26 0 0 0 0 24 0 0 37 ...
## $ Day.Mins : num 265 162 243 299 167 ...
## $ Day.Calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ Day.Charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ Eve.Mins : num 197.4 195.5 121.2 61.9 148.3 ...
## $ Eve.Calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ Eve.Charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ Night.Mins : num 245 254 163 197 187 ...
## $ Night.Calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ Night.Charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ Intl.Mins : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ Intl.Calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ Intl.Charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ CustServ.Calls : int 1 1 0 2 3 0 3 0 1 0 ...
## $ Churn : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Account.Length.group: Factor w/ 2 levels "0-120 group",..: 2 1 2 1 1 1 2 2 1 2 ...
churn$Int.l.Plan <- as.factor(churn$Int.l.Plan)
churn$VMail.Plan <- as.factor(churn$VMail.Plan)
churn$Churn <- as.factor(churn$Churn)
churn$Sl.No. <- NULL
churn$State <- NULL
churn$Area.Code <- NULL
churn$Phone <- NULL
churn$Account.Length <- NULL
str(churn)
## 'data.frame': 3333 obs. of 18 variables:
## $ Int.l.Plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ VMail.Plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ VMail.Message : int 25 26 0 0 0 0 24 0 0 37 ...
## $ Day.Mins : num 265 162 243 299 167 ...
## $ Day.Calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ Day.Charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ Eve.Mins : num 197.4 195.5 121.2 61.9 148.3 ...
## $ Eve.Calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ Eve.Charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ Night.Mins : num 245 254 163 197 187 ...
## $ Night.Calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ Night.Charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ Intl.Mins : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ Intl.Calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ Intl.Charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ CustServ.Calls : int 1 1 0 2 3 0 3 0 1 0 ...
## $ Churn : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Length.group: Factor w/ 2 levels "0-120 group",..: 2 1 2 1 1 1 2 2 1 2 ...
numeric.var <- sapply(churn, is.numeric)
corr.matrix <- cor(churn[,numeric.var])
corrplot(corr.matrix, main="\n\nCorrelation Plot for Numerical Variables", method="number")

churn$Day.Mins <- NULL
churn$Day.Charge <- NULL
churn$Eve.Mins <- NULL
churn$Eve.Charge <- NULL
churn$Night.Mins <- NULL
churn$Night.Charge <- NULL
churn$Intl.Mins <- NULL
churn$Intl.Charge <- NULL
numeric.var <- sapply(churn, is.numeric)
corr.matrix <- cor(churn[,numeric.var])
corrplot(corr.matrix, main="\n\nCorrelation Plot for Numerical Variables", method="number")

p1 <- ggplot(churn, aes(x=churn$Int.l.Plan)) + ggtitle("Int.I.Plan") + xlab("Int.I.Plan") +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
p2 <- ggplot(churn, aes(x=churn$VMail.Plan)) + ggtitle("VMail.Plan") + xlab("VMail.Plan") +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
p3 <- ggplot(churn, aes(x=churn$Account.Length.group)) + ggtitle("Account.Length.group") + xlab("Account.Length.group") +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
grid.arrange(p1, p2, p3, ncol=2)
## Warning: Use of `churn$Int.l.Plan` is discouraged. Use `Int.l.Plan` instead.
## Warning: Use of `churn$VMail.Plan` is discouraged. Use `VMail.Plan` instead.
## Warning: Use of `churn$Account.Length.group` is discouraged. Use
## `Account.Length.group` instead.

intrain <- createDataPartition(churn$Churn, p=0.8,list=FALSE)
set.seed(2021)
training<- churn[intrain,]
testing<- churn[-intrain,]
dim(training); dim(testing)
## [1] 2667 10
## [1] 666 10
LogModel <- glm(Churn ~ .,family=binomial(link="logit"),data=training)
print(summary(LogModel))
##
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7545 -0.5428 -0.4011 -0.2768 2.8970
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.7121946 0.5496958 -4.934 8.06e-07 ***
## Int.l.Planyes 2.0646118 0.1555187 13.276 < 2e-16 ***
## VMail.Planyes -1.4345782 0.6201156 -2.313 0.020700 *
## VMail.Message 0.0159856 0.0198061 0.807 0.419607
## Day.Calls 0.0018448 0.0029562 0.624 0.532596
## Eve.Calls 0.0003704 0.0030253 0.122 0.902543
## Night.Calls 0.0015930 0.0030439 0.523 0.600740
## Intl.Calls -0.0980823 0.0271046 -3.619 0.000296 ***
## CustServ.Calls 0.4760230 0.0418160 11.384 < 2e-16 ***
## Account.Length.group120 and above -0.0174273 0.1297663 -0.134 0.893168
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2209.0 on 2666 degrees of freedom
## Residual deviance: 1871.3 on 2657 degrees of freedom
## AIC: 1891.3
##
## Number of Fisher Scoring iterations: 5
anova(LogModel, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Churn
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 2666 2208.9
## Int.l.Plan 1 147.687 2665 2061.3 < 2.2e-16 ***
## VMail.Plan 1 39.645 2664 2021.6 3.045e-10 ***
## VMail.Message 1 1.234 2663 2020.4 0.2667098
## Day.Calls 1 0.090 2662 2020.3 0.7639656
## Eve.Calls 1 0.047 2661 2020.2 0.8276522
## Night.Calls 1 0.062 2660 2020.2 0.8035225
## Intl.Calls 1 14.929 2659 2005.3 0.0001116 ***
## CustServ.Calls 1 133.960 2658 1871.3 < 2.2e-16 ***
## Account.Length.group 1 0.018 2657 1871.3 0.8931007
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
testing$Churn <- as.character(testing$Churn)
testing$Churn[testing$Churn=="No"]
## character(0)
testing$Churn[testing$Churn=="Yes"]
## character(0)
fitted.results <- predict(LogModel,newdata=testing,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != testing$Churn)
print(paste('Logistic Regression Accuracy',1-misClasificError))
## [1] "Logistic Regression Accuracy 0"
print("Confusion Matrix for Logistic Regression"); table(testing$Churn, fitted.results > 0.5)
## [1] "Confusion Matrix for Logistic Regression"
##
## FALSE TRUE
## FALSE 558 12
## TRUE 82 14
library(MASS)
exp(cbind(OR=coef(LogModel), confint(LogModel)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.06639095 0.02243988 0.1937691
## Int.l.Planyes 7.88223714 5.81341606 10.7011725
## VMail.Planyes 0.23821581 0.06810138 0.7763525
## VMail.Message 1.01611402 0.97760473 1.0566258
## Day.Calls 1.00184652 0.99606778 1.0076840
## Eve.Calls 1.00037051 0.99445917 1.0063284
## Night.Calls 1.00159424 0.99563394 1.0075905
## Intl.Calls 0.90657429 0.85885734 0.9551539
## CustServ.Calls 1.60966008 1.48385355 1.7483754
## Account.Length.group120 and above 0.98272372 0.76006606 1.2645707
tree <- ctree(Churn~Int.l.Plan+VMail.Plan+CustServ.Calls+Intl.Calls+VMail.Message, training)
plot(tree)

pred_tree <- predict(tree, testing)
print("Confusion Matrix for Decision Tree"); table(Predicted = pred_tree, Actual = testing$Churn)
## [1] "Confusion Matrix for Decision Tree"
## Actual
## Predicted FALSE TRUE
## FALSE 538 63
## TRUE 32 33
p1 <- predict(tree, training)
tab1 <- table(Predicted = p1, Actual = training$Churn)
tab2 <- table(Predicted = pred_tree, Actual = testing$Churn)
print(paste('Decision Tree Accuracy',sum(diag(tab2))/sum(tab2)))
## [1] "Decision Tree Accuracy 0.857357357357357"
rfModel <- randomForest(Churn ~., data = training)
print(rfModel)
##
## Call:
## randomForest(formula = Churn ~ ., data = training)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 12.3%
## Confusion matrix:
## FALSE TRUE class.error
## FALSE 2224 56 0.0245614
## TRUE 272 115 0.7028424
pred_rf <- predict(rfModel, testing)
library(caret)
confusionMatrix(pred_rf, as.factor(testing$Churn))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 552 68
## TRUE 18 28
##
## Accuracy : 0.8709
## 95% CI : (0.843, 0.8954)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 0.1469
##
## Kappa : 0.332
##
## Mcnemar's Test P-Value : 1.265e-07
##
## Sensitivity : 0.9684
## Specificity : 0.2917
## Pos Pred Value : 0.8903
## Neg Pred Value : 0.6087
## Prevalence : 0.8559
## Detection Rate : 0.8288
## Detection Prevalence : 0.9309
## Balanced Accuracy : 0.6300
##
## 'Positive' Class : FALSE
##
plot(rfModel)

t <- tuneRF(training[, -9], training[, 9], stepFactor = .5, plot = TRUE, ntreeTry = 100, trace = TRUE, improve = 0.05)
## mtry = 3 OOB error = 12.26%
## Searching left ...
## mtry = 6 OOB error = 13.01%
## -0.06116208 0.05
## Searching right ...
## mtry = 1 OOB error = 14.36%
## -0.1712538 0.05

rfModel_new <- randomForest(Churn ~., data = training, ntree = 100, mtry = 3, importance = TRUE, proximity = TRUE)
print(rfModel_new)
##
## Call:
## randomForest(formula = Churn ~ ., data = training, ntree = 100, mtry = 3, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 12.26%
## Confusion matrix:
## FALSE TRUE class.error
## FALSE 2227 53 0.02324561
## TRUE 274 113 0.70801034
pred_rf_new <- predict(rfModel_new, testing)
confusionMatrix(pred_rf_new, as.factor(testing$Churn))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 553 67
## TRUE 17 29
##
## Accuracy : 0.8739
## 95% CI : (0.8462, 0.8981)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 0.1007
##
## Kappa : 0.3475
##
## Mcnemar's Test P-Value : 8.975e-08
##
## Sensitivity : 0.9702
## Specificity : 0.3021
## Pos Pred Value : 0.8919
## Neg Pred Value : 0.6304
## Prevalence : 0.8559
## Detection Rate : 0.8303
## Detection Prevalence : 0.9309
## Balanced Accuracy : 0.6361
##
## 'Positive' Class : FALSE
##
varImpPlot(rfModel_new, sort=T, n.var = 3, main = 'Feature Importance')
