Team Members:

Loading the required libraries

library(tidyverse) 
library(MASS)
library(ggcorrplot)
library(olsrr)
library(caret)
library(car)
library(randomForest)
df <- read.table(file = "C:/Users/17che/OneDrive - Georgia Institute of Technology/Desktop/telco.csv",
                 sep = ",", header = TRUE, stringsAsFactors = TRUE)

df <- subset(df, select = -customerID)
head(df)

Getting the names of the columns that contain NA values

colnames(df)[colSums(is.na(df)) > 0]
## [1] "TotalCharges"
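
As a quick check (a sketch; it assumes TotalCharges is the only column with missing values), the number of incomplete rows can be confirmed before dropping them:

# Count the rows with at least one NA; 11 is expected here
sum(!complete.cases(df))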

Omitting the 11 rows that have NA values in them

df <- na.omit(df)

Visualizing how churn varies with the continuous variables

library(ggpubr)

a <- ggplot(df, aes(y = tenure, x = "", fill = Churn)) + 
  geom_boxplot() + theme_bw() + ylab("Tenure") + xlab("")

b <- ggplot(df, aes(y = MonthlyCharges, x = "", fill = Churn)) + 
  geom_boxplot() + theme_bw() + ylab("Monthly Charges") + xlab("")

c <- ggplot(df, aes(y = TotalCharges, x = "", fill = Churn)) + 
  geom_boxplot() + theme_bw() + ylab("Total Charges") + xlab("")

ggarrange(a,b,c, ncol = 3, nrow = 1)

Visualizing churn with respect to the categorical variables (Part 1)

color <- scale_fill_manual(values=c("chartreuse3","red"))
d <- ggplot(df,aes(x=gender,fill=Churn))+geom_bar(position = 'fill')+xlab("Gender")+ 
  ylab("") +color+theme_bw()

e <- ggplot(df,aes(x=Dependents,fill=Churn))+geom_bar(position = 'fill') +xlab("Dependents")+ ylab("")+color+theme_bw()

f <- ggplot(df,aes(x=Partner,fill=Churn))+geom_bar(position = 'fill') + xlab("Partner")+ ylab("")+color+theme_bw()

g <- ggplot(df,aes(x=InternetService,fill=Churn))+geom_bar(position = 'fill')+ xlab("Internet Service")+ ylab("")+color+theme_bw()

ggarrange(d,e,f,g, ncol = 2, nrow = 2)

(Part 2)

h <- ggplot(df,aes(x=OnlineSecurity,fill=Churn))+geom_bar(position = 'fill')+ xlab("Online Security")+ ylab("")+ scale_x_discrete(labels=function(x) str_wrap(x,width=5))+color+theme_bw()
                       
i <- ggplot(df,aes(x=PaymentMethod,fill=Churn))+geom_bar(position = 'fill')+ xlab("Payment Method")+ ylab("") + scale_x_discrete(labels=function(x) str_wrap(x,width=10))+color+theme_bw()

j <- ggplot(df,aes(x=Contract,fill=Churn))+geom_bar(position = 'fill')+ xlab("Contract")+ ylab("")+ scale_x_discrete(labels=function(x) str_wrap(x,width=5))+color+theme_bw()

k <- ggplot(df,aes(x= DeviceProtection,fill=Churn))+ geom_bar(position = 'fill')+ xlab("Device Protection")+ ylab("") + scale_x_discrete(labels=function(x) str_wrap(x,width=5))+color+theme_bw()
ggarrange(h,i,j,k, ncol = 2, nrow = 2)

(Part 3)

l <- ggplot(df,aes(x=TechSupport,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Tech Support")+ ylab("")+color

m <- ggplot(df,aes(x=StreamingTV,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Streaming TV")+ ylab("")+color

n <- ggplot(df,aes(x=PaperlessBilling,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Paperless Billing")+ ylab("")+color

o <- ggplot(df,aes(x=StreamingMovies,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Streaming Movies")+ ylab("")+color

ggarrange(l,m,n,o, ncol = 2, nrow = 2)

Visualizing the correlations between the continuous variables

corr <- cor(df[,c('tenure', 'MonthlyCharges', 'TotalCharges')])
ggcorrplot(corr,hc.order = TRUE, outline.col = "white")

Cleaning the data by treating ‘No phone service’ and ‘No internet service’ as ‘No’. Also encoding ‘Male’ as 1 and ‘Female’ as 0

df <- data.frame(lapply(df, function(x) {
                  gsub("No internet service", "No", x)}))
df <- data.frame(lapply(df, function(x) {
                  gsub("No phone service", "No", x)}))
df$gender<-ifelse(df$gender=="Male",1,0)

Pre-processing the data by recoding every ‘No’ as 0 and every ‘Yes’ as 1

df <- data.frame(lapply(df, function(x) {
                  gsub("No", 0, x)}))
df <- data.frame(lapply(df, function(x) {
                  gsub("Yes", 1, x)}))

Creating dummy variables for the categorical variables that have multiple levels

library(fastDummies)
categorical<- dummy_cols(df, select_columns = c('InternetService', 'Contract', 'PaymentMethod'), remove_first_dummy = FALSE)
# Drop tenure, MonthlyCharges, TotalCharges and the original multi-level
# factors (by position); the continuous variables are re-attached below
cat <- categorical[,-c(5,8,15,17,18,19)]

Combining the categorical variables, dummy variables and continuous variables in one dataframe

cont <- df[,c(5,18,19)]  # tenure, MonthlyCharges, TotalCharges
df_final <- cbind(cont,cat)
df_final <- sapply(df_final, as.numeric)
df_final <- as.data.frame(df_final)
head(df_final)

Renaming the variables for ease of understanding. Splitting the data into training and testing sets

library(caTools)
colnames(df_final)[20:27] <- c("FiberOptic", "Contract_month_to_month",
                               "Contract_one_year", "Contract_two_year",
                               "Payment_bank_transfer", "Payment_credit_card",
                               "Payment_electronic_check", "Payment_mailed_check")

set.seed(123)
x <- sample.split(df_final$Churn, SplitRatio = 0.7)  # split stratified on the outcome
Train <- df_final[x, ]
Test <- df_final[!x, ]
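
As a sanity check on the split (a sketch using base R only), the partition sizes and the churn rate in each partition can be inspected:

# Partition sizes and the proportion of churners in each partition
nrow(Train); nrow(Test)
prop.table(table(Train$Churn))
prop.table(table(Test$Churn))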

Fitting all the variables on the training data to get the full model

model <- glm(Churn ~ ., family = binomial(link = "logit"), data = Train)
summary(model)
## 
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = Train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8969  -0.6780  -0.3014   0.7238   3.3806  
## 
## Coefficients: (3 not defined because of singularities)
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               3.059e+00  1.942e+00   1.575 0.115150    
## tenure                   -5.623e-02  7.535e-03  -7.463 8.43e-14 ***
## MonthlyCharges           -7.101e-02  3.837e-02  -1.850 0.064260 .  
## TotalCharges              2.558e-04  8.512e-05   3.006 0.002649 ** 
## gender                   -4.326e-02  7.932e-02  -0.545 0.585512    
## SeniorCitizen             1.630e-01  1.041e-01   1.566 0.117422    
## Partner                   1.383e-01  9.563e-02   1.446 0.148129    
## Dependents               -2.338e-01  1.100e-01  -2.125 0.033547 *  
## PhoneService              9.277e-01  7.870e-01   1.179 0.238490    
## MultipleLines             6.221e-01  2.146e-01   2.899 0.003740 ** 
## OnlineSecurity           -8.279e-02  2.159e-01  -0.383 0.701367    
## OnlineBackup              2.309e-01  2.120e-01   1.089 0.276151    
## DeviceProtection          3.475e-01  2.145e-01   1.620 0.105216    
## TechSupport               4.375e-02  2.168e-01   0.202 0.840046    
## StreamingTV               8.913e-01  3.936e-01   2.265 0.023538 *  
## StreamingMovies           9.470e-01  3.959e-01   2.392 0.016753 *  
## PaperlessBilling          3.137e-01  9.160e-02   3.425 0.000615 ***
## InternetService_0        -5.244e+00  1.927e+00  -2.721 0.006501 ** 
## InternetService_DSL      -2.615e+00  9.644e-01  -2.711 0.006700 ** 
## FiberOptic                       NA         NA      NA       NA    
## Contract_month_to_month   1.167e+00  2.094e-01   5.575 2.47e-08 ***
## Contract_one_year         5.844e-01  2.066e-01   2.828 0.004682 ** 
## Contract_two_year                NA         NA      NA       NA    
## Payment_bank_transfer     6.516e-02  1.429e-01   0.456 0.648497    
## Payment_credit_card       6.787e-02  1.425e-01   0.476 0.633881    
## Payment_electronic_check  3.879e-01  1.202e-01   3.228 0.001247 ** 
## Payment_mailed_check             NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5428.3  on 4686  degrees of freedom
## Residual deviance: 3909.5  on 4663  degrees of freedom
## AIC: 3957.5
## 
## Number of Fisher Scoring iterations: 6

Predicting the churn using the fitted model

Test$model_prob <- predict(model, Test, type = "response")
Test <- Test %>% mutate(model_pred = as.numeric(model_prob > 0.5),
                        visit_binary = as.numeric(Churn == 1))

Checking the accuracy of the full model

Test <- Test %>% mutate(accurate = 1*(model_pred == visit_binary))
sum(Test$accurate, na.rm = T)/nrow(Test)
## [1] 0.7991471

Hence the accuracy of the full logistic regression model is 79.9%

Calculating the accuracy for the reduced model determined using AIC
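
The code that produces the reduced model is not shown above; given that MASS is loaded, it was presumably obtained with stepwise selection via stepAIC(). A minimal sketch of that step (the direction argument is an assumption):

# Stepwise selection on the full model, minimizing AIC (MASS::stepAIC)
model_AIC <- stepAIC(model, direction = "both", trace = FALSE)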

Test$model_prob <- predict(model_AIC, Test, type = "response")
Test <- Test %>% mutate(model_pred = as.numeric(model_prob > 0.5),
                        visit_binary = as.numeric(Churn == 1))
Test <- Test %>% mutate(accurate = 1*(model_pred == visit_binary))
sum(Test$accurate, na.rm = T)/nrow(Test)
## [1] 0.7987207

The accuracy is now 79.87% when retaining only the variables selected by AIC
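
The two models can also be compared on AIC directly (a sketch using base R's AIC()):

# Lower AIC indicates a better fit-complexity trade-off
AIC(model, model_AIC)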

Checking multicollinearity among the variables selected by AIC using the variance inflation factor (VIF)

vif(model_AIC)
##                   tenure           MonthlyCharges             TotalCharges 
##                15.490829                82.076936                20.278638 
##            SeniorCitizen                  Partner               Dependents 
##                 1.138209                 1.401656                 1.300379 
##             PhoneService            MultipleLines             OnlineBackup 
##                 5.431540                 2.120849                 1.893234 
##         DeviceProtection              StreamingTV          StreamingMovies 
##                 1.986773                 4.251463                 4.426197 
##         PaperlessBilling        InternetService_0      InternetService_DSL 
##                 1.118060                26.599814                14.879846 
##  Contract_month_to_month        Contract_one_year Payment_electronic_check 
##                 4.475069                 3.451895                 1.140161

Removing the variables with high VIF values and refitting:

model_vif <- glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen +
    InternetService_0 + OnlineBackup + DeviceProtection + StreamingTV +
    OnlineSecurity + TechSupport + MultipleLines +
    Contract_month_to_month + PaperlessBilling +
    Payment_electronic_check, family = "binomial", data = Train)
summary(model_vif)
## 
## Call:
## glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen + 
##     InternetService_0 + OnlineBackup + DeviceProtection + StreamingTV + 
##     OnlineSecurity + OnlineBackup + TechSupport + MultipleLines + 
##     +Contract_month_to_month + PaperlessBilling + Payment_electronic_check, 
##     family = "binomial", data = Train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9639  -0.6508  -0.3329   0.6998   3.0207  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -2.318793   0.214849 -10.793  < 2e-16 ***
## tenure                   -0.033959   0.002724 -12.465  < 2e-16 ***
## MonthlyCharges            0.023140   0.002862   8.086 6.15e-16 ***
## SeniorCitizen             0.264976   0.101608   2.608 0.009112 ** 
## InternetService_0        -0.677124   0.187495  -3.611 0.000305 ***
## OnlineBackup             -0.197324   0.092709  -2.128 0.033302 *  
## DeviceProtection         -0.060840   0.096472  -0.631 0.528269    
## StreamingTV               0.071674   0.101641   0.705 0.480705    
## OnlineSecurity           -0.576618   0.100596  -5.732 9.92e-09 ***
## TechSupport              -0.445492   0.101351  -4.396 1.11e-05 ***
## MultipleLines             0.144893   0.098591   1.470 0.141659    
## Contract_month_to_month   0.757402   0.122975   6.159 7.32e-10 ***
## PaperlessBilling          0.354271   0.090412   3.918 8.91e-05 ***
## Payment_electronic_check  0.403909   0.083875   4.816 1.47e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5428.3  on 4686  degrees of freedom
## Residual deviance: 3971.5  on 4673  degrees of freedom
## AIC: 3999.5
## 
## Number of Fisher Scoring iterations: 5
vif(model_vif)
##                   tenure           MonthlyCharges            SeniorCitizen 
##                 2.098769                 3.570934                 1.082891 
##        InternetService_0             OnlineBackup         DeviceProtection 
##                 1.879425                 1.220498                 1.325454 
##              StreamingTV           OnlineSecurity              TechSupport 
##                 1.621973                 1.110586                 1.169765 
##            MultipleLines  Contract_month_to_month         PaperlessBilling 
##                 1.545203                 1.596986                 1.107892 
## Payment_electronic_check 
##                 1.128538

Fitting another model determined using the variance inflation factor

model_vif2 <- glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen +
    InternetService_0 + OnlineBackup + OnlineSecurity + TechSupport +
    Contract_month_to_month + PaperlessBilling +
    Payment_electronic_check, family = "binomial", data = Train)
final <- model_vif2
summary(model_vif2)
## 
## Call:
## glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen + 
##     InternetService_0 + OnlineBackup + OnlineSecurity + OnlineBackup + 
##     TechSupport + +Contract_month_to_month + PaperlessBilling + 
##     Payment_electronic_check, family = "binomial", data = Train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9193  -0.6541  -0.3314   0.7139   3.0074  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -2.421909   0.204018 -11.871  < 2e-16 ***
## tenure                   -0.033237   0.002615 -12.710  < 2e-16 ***
## MonthlyCharges            0.025167   0.002268  11.098  < 2e-16 ***
## SeniorCitizen             0.268023   0.101434   2.642  0.00823 ** 
## InternetService_0        -0.615371   0.183543  -3.353  0.00080 ***
## OnlineBackup             -0.205191   0.092483  -2.219  0.02651 *  
## OnlineSecurity           -0.583323   0.100285  -5.817 6.00e-09 ***
## TechSupport              -0.459340   0.100753  -4.559 5.14e-06 ***
## Contract_month_to_month   0.769366   0.121407   6.337 2.34e-10 ***
## PaperlessBilling          0.361056   0.090321   3.997 6.40e-05 ***
## Payment_electronic_check  0.405412   0.083700   4.844 1.27e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5428.3  on 4686  degrees of freedom
## Residual deviance: 3974.4  on 4676  degrees of freedom
## AIC: 3996.4
## 
## Number of Fisher Scoring iterations: 5
vif(model_vif2)
##                   tenure           MonthlyCharges            SeniorCitizen 
##                 1.934739                 2.245331                 1.081630 
##        InternetService_0             OnlineBackup           OnlineSecurity 
##                 1.800568                 1.215665                 1.104441 
##              TechSupport  Contract_month_to_month         PaperlessBilling 
##                 1.156389                 1.556317                 1.105920 
## Payment_electronic_check 
##                 1.124904

Hence, the variables retained above constitute our final model. Predicting churn on the test data:

predicted <- predict(final, type = "response", newdata = Test[,-17])  # column 17 is Churn
Test$prob <- predicted
predicted_churn <- factor(ifelse(predicted >= 0.50, "Yes", "No"))
actual_churn <- factor(ifelse(Test$Churn == 1, "Yes", "No"))

Getting the values of accuracy, sensitivity and specificity for cutoff = 0.5

cutoff_churn <- factor(ifelse(predicted >=0.50, "Yes", "No"))
confusionMatrix_final <- confusionMatrix(cutoff_churn, actual_churn, positive = "Yes")
accuracy <- confusionMatrix_final$overall[1]
sensitivity <- confusionMatrix_final$byClass[1]
specificity <- confusionMatrix_final$byClass[2]
accuracy
##  Accuracy 
## 0.7970149
sensitivity
## Sensitivity 
##   0.5280899
specificity
## Specificity 
##   0.8943089

Defining a function that computes sensitivity, specificity and accuracy for a given cutoff:

# Computes sensitivity, specificity and accuracy for a given cutoff, using
# the predicted probabilities and actual labels from the enclosing environment
perform_fn <- function(cutoff) 
{
  predicted_churn <- factor(ifelse(predicted >= cutoff, "Yes", "No"))
  conf <- confusionMatrix(predicted_churn, actual_churn, positive = "Yes")
  accuracy <- conf$overall[1]
  sensitivity <- conf$byClass[1]
  specificity <- conf$byClass[2]
  out <- t(as.matrix(c(sensitivity, specificity, accuracy))) 
  colnames(out) <- c("sensitivity", "specificity", "accuracy")
  return(out)
}

Plotting the value of these three metrics for different cutoff values

s <- seq(0.01, 0.90, length = 100)
OUT <- matrix(0, 100, 3)

for(i in 1:100)
{
  OUT[i,] = perform_fn(s[i])
} 

plot(s, OUT[,1], xlab = "Cutoff", ylab = "Value", axes = FALSE, col = 2, type = "l")
axis(1, seq(0, 1, length = 5), seq(0, 1, length = 5), cex.lab = 1.5)
axis(2, seq(0, 1, length = 5), seq(0, 1, length = 5), cex.lab = 1.5)
lines(s, OUT[,2], col = 'black', lwd = 2)
lines(s, OUT[,3], col = 4, lwd = 2)
box()
legend("bottom", legend = c("Sensitivity", "Specificity", "Accuracy"),
       col = c(2, "black", 4), lwd = c(2, 2, 2))

Hence, the cutoff value is taken to be 0.3, as that is approximately where accuracy, specificity and sensitivity intersect, giving a reasonably high value for all three metrics
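
Rather than reading the crossing point off the plot, the cutoff can also be located programmatically (a sketch reusing the OUT matrix computed above):

# Grid cutoff where sensitivity (column 1) and specificity (column 2) are closest
s[which.min(abs(OUT[,1] - OUT[,2]))]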

cutoff_churn <- factor(ifelse(predicted >=0.3, "Yes", "No"))
conf_final <- confusionMatrix(cutoff_churn, actual_churn, positive = "Yes")
accuracy <- conf_final$overall[1]
sensitivity <- conf_final$byClass[1]
specificity <- conf_final$byClass[2]
accuracy
##  Accuracy 
## 0.7688699
sensitivity
## Sensitivity 
##   0.7495987
specificity
## Specificity 
##    0.775842

Hence, at cutoff = 0.3 the accuracy drops slightly to 76.9%, but sensitivity rises from 52.8% to 75.0% while specificity remains at 77.6%

Getting the area under the ROC curve (AUC)

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
model_final.roc <- roc(response = Test$Churn, predictor = as.numeric(predicted))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(model_final.roc, print.auc = TRUE,legacy.axes = TRUE, col='blue')

Training a random forest model

Train$Churn <- as.factor(as.character(Train$Churn))  # randomForest needs a factor response for classification
model.rf <- randomForest(Churn ~ ., data=Train, proximity=FALSE,importance = FALSE,
                        ntree=500,mtry=4, do.trace=FALSE)
model.rf
## 
## Call:
##  randomForest(formula = Churn ~ ., data = Train, proximity = FALSE,      importance = FALSE, ntree = 500, mtry = 4, do.trace = FALSE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 20.03%
## Confusion matrix:
##      0   1 class.error
## 0 3075 366   0.1063644
## 1  573 673   0.4598716

The ten most important variables according to the Random Forest

varImpPlot(model.rf, main="Random Forest Variable importance", n.var = 10)
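
The underlying importance scores can also be extracted directly (a sketch; since the forest was trained with importance = FALSE, only the mean decrease in Gini is available):

# Top ten variables by mean decrease in Gini impurity
head(sort(importance(model.rf)[, "MeanDecreaseGini"], decreasing = TRUE), 10)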

Plotting the ROC curve for the random forest

predicted <- predict(model.rf, type = "response", newdata = Test[,-17])
model.rf.roc <- roc(response = Test$Churn, predictor = as.numeric(predicted))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(model.rf.roc, print.auc = TRUE,legacy.axes = TRUE, col='red')

The area under the ROC curve is lower for the random forest model than for the logistic regression model
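
The two AUC values can also be compared numerically (a sketch using pROC's auc()):

# AUC for the logistic regression and the random forest ROC curves
auc(model_final.roc)
auc(model.rf.roc)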

Hence, the logistic regression model with cutoff = 0.3 is the best model for this data