Loading Libraries

library(tidyverse)
library(caret)
library(randomForest)
library(skimr)
library(ggplot2)
library(gridExtra )
library(caTools)
library(corrplot)
library(ggcorrplot)
library(kableExtra)
library(caret)
library(tree)

Reading the Data

# Load the customer churn data set from the working directory.
# stringsAsFactors = TRUE turns every text column into a factor.
churn <- read.csv(
  file = "./customer_churn.csv",
  stringsAsFactors = TRUE
)

Data Manipulation

a. Extract the 5th column & store it in ‘customer_5’

# Keep only the 5th column of `churn`; select() returns it as a one-column
# data frame. The result is stored under `customer_5`, the name the task
# asks for (it was previously misspelled as `cutomer_5`).
customer_5 <- churn %>%
  select(5)

b. Extract the 15th column & store it in ‘customer_15’

# Keep only the 15th column of `churn`; select() returns it as a one-column
# data frame. The result is stored under `customer_15`, the name the task
# asks for (it was previously misspelled as `cutomer_15`).
customer_15 <- churn %>%
  select(15)

c. Extract all the male senior citizens whose Payment Method is Electronic check & store the result in ‘senior_male_electronic’

# Male senior citizens who pay by electronic check.
# The senior-citizen condition was missing, so the filter returned every
# male customer paying by electronic check regardless of seniority.
# NOTE(review): assumes the column is named `SeniorCitizen` and coded
# 1 = senior, 0 = not senior -- confirm against the CSV.
senior_male_electronic <- churn %>%
  filter(
    gender == "Male",
    SeniorCitizen == 1,
    PaymentMethod == "Electronic check"
  )

d. Extract all those customers whose tenure is greater than 70 months or whose monthly charges are more than $100 & store the result in ‘customer_total_tenure’

# Customers with more than 70 months of tenure OR monthly charges above 100.
customer_total_tenure <- filter(
  churn,
  tenure > 70 | MonthlyCharges > 100
)

e. Extract all the customers whose Contract is of two years, payment method is Mailed check & the value of Churn is ‘Yes’ & store the result in ‘two_mail_yes’

# Churned customers on a two-year contract who pay by mailed check.
# Comma-separated conditions in filter() are combined with AND.
two_mail_yes <- filter(
  churn,
  Contract == "Two year",
  PaymentMethod == "Mailed check",
  Churn == "Yes"
)

f. Extract 333 random records from the customer_churn dataframe & store the result in ‘customer_333’

# Fix the RNG seed so the same 333 rows are drawn each time this runs.
set.seed(123)
customer_333 <- churn %>%
  sample_n(size = 333)

g. Get the count of different levels from the ‘Churn’ column

# Frequency of each level of the Churn column, rendered as a table.
kable(x = table(churn[["Churn"]]))
Var1 Freq
No 5174
Yes 1869

B) Data Visualization:

a. Build a bar-plot for the ’InternetService’ column:

i. Set x-axis label to ‘Categories of Internet Service’

ii. Set y-axis label to ‘Count of Categories’

iii. Set the title of plot to be ‘Distribution of Internet Service’

iv. Set the color of the bars to be ‘orange’

# Bar chart: number of customers in each InternetService category.
ggplot(data = churn, mapping = aes(x = InternetService)) +
  geom_bar(fill = "orange") +
  labs(
    x = "Categories of Internet Service",
    y = "Count of Categories",
    title = "Distribution of Internet Service"
  )

b. Build a histogram for the ‘tenure’ column:

i. Set the number of bins to be 30

ii. Set the color of the bins to be ‘green’

iii. Assign the title ‘Distribution of tenure’

# Histogram of customer tenure using 30 bins.
ggplot(data = churn, mapping = aes(x = tenure)) +
  geom_histogram(bins = 30, fill = "green") +
  labs(title = "Distribution of tenure")

c. Build a scatter-plot between ‘MonthlyCharges’ & ‘tenure’. Map ‘MonthlyCharges’ to the y-axis & ‘tenure’ to the ‘x-axis’:

i. Assign the points a color of ‘brown’

ii. Set the x-axis label to ‘Tenure of customer’

iii. Set the y-axis label to ‘Monthly Charges of customer’

iv. Set the title to ‘Tenure vs Monthly Charges’

# Scatter plot: tenure on the x-axis against MonthlyCharges on the y-axis.
ggplot(data = churn, mapping = aes(x = tenure, y = MonthlyCharges)) +
  geom_point(color = "brown") +
  labs(
    x = "Tenure of customer",
    y = "Monthly Charges of customer",
    title = "Tenure vs Monthly Charges"
  )

d. Build a box-plot between ‘tenure’ & ‘Contract’. Map ‘tenure’ on the y-axis & ‘Contract’ on the x-axis.

# Box plot: distribution of tenure within each Contract type.
ggplot(data = churn, mapping = aes(x = Contract, y = tenure)) +
  geom_boxplot() +
  labs(title = "Tenure vs Contract")

C) Linear Regression:

a. Build a simple linear model where dependent variable is ‘MonthlyCharges’ and independent variable is ‘tenure’

i. Divide the dataset into train and test sets in 70:30 ratio.

# sample.split() returns one logical tag per row, with SplitRatio = 0.70
# requesting a 70:30 split; rows tagged TRUE go to `train`, the rest to `test`.
split_tag <- sample.split(churn$Churn, SplitRatio = 0.70)
train <- subset(churn, split_tag)
test <- subset(churn, !split_tag)

ii. Build the model on train set and predict the values on test set

# Fit MonthlyCharges as a linear function of tenure on the training rows,
# then predict MonthlyCharges for the held-out test rows.
lmmodel <- lm(formula = MonthlyCharges ~ tenure, data = train)
prediction <- predict(object = lmmodel, newdata = test)

iii. Find out the error in prediction & store the result in ‘error’

# Put actual and predicted charges side by side, compute the prediction
# error (actual minus predicted) and attach it as a third column.
final <- as.data.frame(
  cbind(Actual = test$MonthlyCharges, Predicted = prediction)
)
error <- with(final, Actual - Predicted)
final <- cbind(final, error)

iv. Find the root mean square error

# Root mean square error: square the errors, average them, take the root.
rmse <- sqrt(mean(final$error^2))
rmse
## [1] 29.3283

D) Logistic Regression:

a. Build a simple logistic regression model where dependent variable is ‘Churn’ &

independent variable is ‘MonthlyCharges’

i. Divide the dataset in 65:35 ratio

# 65:35 split: rows tagged TRUE by sample.split() form the training set,
# the remaining rows form the test set.
s_t <- sample.split(churn$Churn, SplitRatio = 0.65)
train1 <- subset(churn, s_t)
test1 <- subset(churn, !s_t)

ii. Build the model on train set and predict the values on test set

# Logistic regression of Churn on MonthlyCharges, fitted on the training rows.
logmod <- glm(
  formula = Churn ~ MonthlyCharges,
  family = "binomial",
  data = train1
)
summary(logmod)
## 
## Call:
## glm(formula = Churn ~ MonthlyCharges, family = "binomial", data = train1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0810  -0.8501  -0.6663   1.3706   1.9684  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -2.074251   0.092824  -22.35   <2e-16 ***
## MonthlyCharges  0.015522   0.001212   12.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5297.9  on 4577  degrees of freedom
## Residual deviance: 5122.2  on 4576  degrees of freedom
## AIC: 5126.2
## 
## Number of Fisher Scoring iterations: 4
# type = "response" returns predicted probabilities rather than log-odds.
logresult <- predict(logmod, newdata = test1, type = "response")

iii. Build the confusion matrix and get the accuracy score

# Let us consider a threshold of 30%
conf1 <- table(test1$Churn, logresult > 0.3)
conf1
##      
##       FALSE TRUE
##   No   1163  648
##   Yes   308  346
# Accuracy = share of test rows where the thresholded prediction agrees with
# the actual label (predicted TRUE <-> Churn == "Yes"). It is computed from
# the data instead of from counts copied by hand out of the printed matrix,
# so it stays correct whenever the random split or the data changes.
acc1 <- mean((logresult > 0.3) == (test1$Churn == "Yes"), na.rm = TRUE)
acc1
## [1] 0.6121704
# Let us consider a threshold of 35%
conf2 <- table(test1$Churn, logresult > 0.35)
conf2
##      
##       FALSE TRUE
##   No   1452  359
##   Yes   477  177
# Accuracy = share of test rows where the thresholded prediction agrees with
# the actual label (predicted TRUE <-> Churn == "Yes"). It is computed from
# the data instead of from counts copied by hand out of the printed matrix,
# so it stays correct whenever the random split or the data changes.
acc2 <- mean((logresult > 0.35) == (test1$Churn == "Yes"), na.rm = TRUE)
acc2
## [1] 0.6608519
# We are getting an accuracy of approximately 61% and 66% respectively.

b. Build a multiple logistic regression model where dependent variable is ‘Churn’ &

independent variables are ‘tenure’ & ‘MonthlyCharges’

i. Divide the dataset in 80:20 ratio

# 80:20 split: rows tagged TRUE by sample.split() form the training set,
# the remaining rows form the test set.
s_tmul <- sample.split(churn$Churn, SplitRatio = 0.8)
trainm <- subset(churn, s_tmul)
testm <- subset(churn, !s_tmul)

ii. Build the model on train set and predict the values on test set

# Logistic regression of Churn on tenure and MonthlyCharges (training rows),
# followed by predicted probabilities for the test rows.
modmul <- glm(
  formula = Churn ~ tenure + MonthlyCharges,
  family = "binomial",
  data = trainm
)
mulres <- predict(modmul, newdata = testm, type = "response")
summary(modmul)
## 
## Call:
## glm(formula = Churn ~ tenure + MonthlyCharges, family = "binomial", 
##     data = trainm)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8846  -0.7151  -0.4120   0.7790   2.9714  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.806515   0.096870  -18.65   <2e-16 ***
## tenure         -0.054816   0.001887  -29.04   <2e-16 ***
## MonthlyCharges  0.032991   0.001450   22.76   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6519.5  on 5633  degrees of freedom
## Residual deviance: 5112.7  on 5631  degrees of freedom
## AIC: 5118.7
## 
## Number of Fisher Scoring iterations: 5

iii. Build the confusion matrix and get the accuracy score

# Let us consider a threshold of 70% first
table(testm$Churn, mulres > 0.7)
##      
##       FALSE TRUE
##   No   1024   11
##   Yes   341   33
# Accuracy = share of test rows where the thresholded prediction agrees with
# the actual label (predicted TRUE <-> Churn == "Yes"). It is computed from
# the data instead of from counts copied by hand out of the printed matrix,
# so it stays correct whenever the random split or the data changes.
accmul <- mean((mulres > 0.7) == (testm$Churn == "Yes"), na.rm = TRUE)
accmul
## [1] 0.7501774
# We are getting an accuracy of approximately 75%
# Now, let us consider a threshold of 50%
table(testm$Churn, mulres > 0.5)
##      
##       FALSE TRUE
##   No    934  101
##   Yes   208  166
# Accuracy = share of test rows where the thresholded prediction agrees with
# the actual label (predicted TRUE <-> Churn == "Yes"). It is computed from
# the data instead of from counts copied by hand out of the printed matrix,
# so it stays correct whenever the random split or the data changes.
accmul1 <- mean((mulres > 0.5) == (testm$Churn == "Yes"), na.rm = TRUE)
accmul1
## [1] 0.7806955
# We are getting an accuracy of approximately 78%

E) Decision Tree:

a. Build a decision tree model where dependent variable is ‘Churn’ & independent

variable is ‘tenure’

i. Divide the dataset in 80:20 ratio

# 80:20 split for the decision tree: TRUE-tagged rows train the model,
# the remaining rows are held out for testing.
s_tree <- sample.split(churn$Churn, SplitRatio = 0.8)
traintree <- subset(churn, s_tree)
testtree <- subset(churn, !s_tree)

ii. Build the model on train set and predict the values on test set

# Fit a classification tree for Churn using tenure only, then draw it:
# plot() draws the branches and text() adds the split and leaf labels.
dtmod <- tree(formula = Churn ~ tenure, data = traintree)
plot(dtmod)
text(dtmod)

# Predict a class label (not class probabilities) for every test row.
dtresult <- predict(dtmod, newdata = testtree, type = "class")

iii. Build the confusion matrix and calculate the accuracy

# Confusion matrix: predicted class in rows, actual class in columns.
table(dtresult, testtree$Churn)
##         
## dtresult  No Yes
##      No  904 233
##      Yes 131 141
# Accuracy = share of test rows whose predicted class equals the actual class.
# It is computed from the data instead of from counts copied by hand out of
# the printed matrix, so it stays correct when the random split changes.
# Labels are compared as character so differing factor level sets cannot
# raise an error.
dtacc <- mean(as.character(dtresult) == as.character(testtree$Churn))
dtacc
## [1] 0.7416608
# We are getting an accuracy of approximately 74%

F) Random Forest:

a. Build a Random Forest model where dependent variable is ‘Churn’ & independent

variables are ‘tenure’ and ‘MonthlyCharges’

i. Divide the dataset in 70:30 ratio

# 70:30 split for the random forest: TRUE-tagged rows train the model,
# the remaining rows are held out for testing.
s_trf <- sample.split(churn$Churn, SplitRatio = 0.7)
trainrf <- subset(churn, s_trf)
testrf <- subset(churn, !s_trf)

ii. Build the model on train set and predict the values on test set

# Fit the random forest on the training rows. The seed is fixed first so the
# randomised fit gives the same forest on every run.
set.seed(123)
rf_mod <- randomForest(
  formula = Churn ~ tenure + MonthlyCharges,
  data = trainrf
)
rf_mod
## 
## Call:
##  randomForest(formula = Churn ~ tenure + MonthlyCharges, data = trainrf) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 22.74%
## Confusion matrix:
##       No Yes class.error
## No  3138 484   0.1336278
## Yes  637 671   0.4870031
# Predict the Churn class for every row of the test set.
predict_rf <- predict(rf_mod, testrf)

iii. Build the confusion matrix and calculate the accuracy

# First, let caret's confusionMatrix() report the accuracy together with the
# related statistics; predictions are `data`, true labels are `reference`.
confusionMatrix(data = predict_rf, reference = testrf$Churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1313  263
##        Yes  239  298
##                                           
##                Accuracy : 0.7624          
##                  95% CI : (0.7437, 0.7804)
##     No Information Rate : 0.7345          
##     P-Value [Acc > NIR] : 0.001797        
##                                           
##                   Kappa : 0.3824          
##                                           
##  Mcnemar's Test P-Value : 0.304637        
##                                           
##             Sensitivity : 0.8460          
##             Specificity : 0.5312          
##          Pos Pred Value : 0.8331          
##          Neg Pred Value : 0.5549          
##              Prevalence : 0.7345          
##          Detection Rate : 0.6214          
##    Detection Prevalence : 0.7459          
##       Balanced Accuracy : 0.6886          
##                                           
##        'Positive' Class : No              
## 
# We got an accuracy of 76%
# Now let us use the manual method of calculation using a table function
table(predict_rf, testrf$Churn)
##           
## predict_rf   No  Yes
##        No  1313  263
##        Yes  239  298
# Accuracy = correctly classified rows / all rows, which for the matrix above
# is (1313 + 298) / (1313 + 263 + 239 + 298). The earlier hand-typed counts
# (1328, 283, 224, 283) did not match this matrix and gave 0.7606, so the
# value is now computed directly from the predictions and the true labels.
# Labels are compared as character so differing factor level sets cannot
# raise an error.
accrf <- mean(as.character(predict_rf) == as.character(testrf$Churn))
accrf
## [1] 0.7624231
# We are getting an accuracy of 76% here too