This analysis was initially published on Datascienceplus.com. I have used the concepts from there and remodeled the whole analysis

Loading Libraries

library(dplyr)
library(ggplot2)
library(MASS)
library(randomForest)
library(corrplot)
library(gridExtra)
library(rpart)
library(rpart.plot)
library(party)
library(caret)

Data Exploration

telco <- read.csv("Telco_customer_churn.csv")
colSums(is.na(telco))
telco <- na.omit(telco)

str(telco)
summary(telco)

telco$MultipleLines <- ifelse(telco$MultipleLines == "No phone service","No", "Yes")

telco$OnlineSecurity <- ifelse(telco$OnlineSecurity == "No internet service","No", "Yes")

telco$OnlineBackup <- ifelse(telco$OnlineBackup == "No internet service","No", "Yes")

telco$DeviceProtection <- ifelse(telco$DeviceProtection == "No internet service","No", "Yes")

telco$TechSupport <- ifelse(telco$TechSupport == "No internet service","No", "Yes")

telco$StreamingTV <- ifelse(telco$StreamingTV == "No internet service","No", "Yes")

telco$StreamingMovies <- ifelse(telco$StreamingMovies == "No internet service","No", "Yes")

In total we have 7032 observation and 21 variable

Exploratory Data Analysis

Plots

table(telco$Churn)
## 
##   No  Yes 
## 5163 1869
p1 <- ggplot(data = telco, aes(x = gender, fill = Churn)) +
  geom_bar(position = "fill")

p2 <- ggplot(data = telco, aes(x = SeniorCitizen, fill = Churn)) +
  geom_bar(position = "fill")

p3 <- ggplot(data = telco, aes(x = Partner, fill = Churn)) +
  geom_bar(position = "fill")

p4 <- ggplot(data = telco, aes(x = Dependents, fill = Churn)) +
  geom_bar(position = "fill")

grid.arrange(p1, p2, p3, p4, ncol=2)

ggplot(data = telco, aes(x = tenure, fill = Churn)) +
  geom_histogram(position = "fill")

p5 <- ggplot(data = telco, aes(x = PhoneService, fill = Churn)) +
  geom_bar(position = "fill")

p6 <- ggplot(data = telco, aes(x = MultipleLines, fill = Churn)) +
  geom_bar(position = "fill")

p7 <- ggplot(data = telco, aes(x = InternetService, fill = Churn)) +
  geom_bar(position = "fill")

p8 <- ggplot(data = telco, aes(x = OnlineSecurity, fill = Churn)) +
  geom_bar(position = "fill")

grid.arrange(p5, p6, p7, p8, ncol=2)

p9 <- ggplot(data = telco, aes(x = OnlineBackup, fill = Churn)) +
  geom_bar(position = "fill")

p10 <- ggplot(data = telco, aes(x = DeviceProtection, fill = Churn)) +
  geom_bar(position = "fill")

p11 <- ggplot(data = telco, aes(x = TechSupport, fill = Churn)) +
  geom_bar(position = "fill")

p12 <- ggplot(data = telco, aes(x = StreamingTV, fill = Churn)) +
  geom_bar(position = "fill")

grid.arrange(p9, p10, p11, p12, ncol=2)

p13 <- ggplot(data = telco, aes(x = StreamingMovies, fill = Churn)) +
  geom_bar(position = "fill")

p14 <- ggplot(data = telco, aes(x = Contract, fill = Churn)) +
  geom_bar(position = "fill")

p15 <- ggplot(data = telco, aes(x = PaperlessBilling, fill = Churn)) +
  geom_bar(position = "fill")

p16 <- ggplot(data = telco, aes(x = PaymentMethod, fill = Churn)) +
  geom_bar(position = "fill")

grid.arrange(p13, p14, p15, p16, ncol=2)

ggplot(data = telco, aes(x = MonthlyCharges, fill = Churn)) +
  geom_histogram(position = "fill")

ggplot(data = telco, aes(x = TotalCharges, fill = Churn)) +
  geom_histogram(position = "fill")

Correlation between Variables

table(telco$Partner, telco$Dependents)
##      
##         No  Yes
##   No  3280  359
##   Yes 1653 1740
table(telco$StreamingTV, telco$StreamingMovies)
##      
##         No  Yes
##   No  1520    0
##   Yes    0 5512
table(telco$InternetService, telco$OnlineSecurity)
##              
##                 No  Yes
##   DSL            0 2416
##   Fiber optic    0 3096
##   No          1520    0
table(telco$InternetService, telco$DeviceProtection)
##              
##                 No  Yes
##   DSL            0 2416
##   Fiber optic    0 3096
##   No          1520    0
numeric.var <- sapply(telco, is.numeric)
corr.matrix <- cor(telco[,numeric.var])
corrplot(corr.matrix, main="\n\nCorrelation Plot for Numerical Variables", method="number")

Observations

  1. So observe that Churn does vary according to gender, PhoneService, MultipleLines but SeniorCitizen, Partner, Dependents, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies, PaperlessBilling, PaymentMethod, TotalCharges Contract variable does impact Churn.
  2. We also observe that as the tenure increases churn decreses
  3. Apart from these we will also have to consider the changes that many categorical varialbes can be coorelated

Splitting Data in Test and Train

set.seed(12345)
index <- sample(1:nrow(telco), 0.7*nrow(telco))
train <- telco[index,2:21]
test <- telco[-index,2:21]

Models

Fitting the Logistic Regression Model

telco_log <- glm(Churn ~ ., data = train, family = "binomial")
summary(telco_log)
## 
## Call:
## glm(formula = Churn ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6773  -0.6888  -0.3120   0.7801   3.3103  
## 
## Coefficients: (7 not defined because of singularities)
##                                        Estimate Std. Error z value
## (Intercept)                          -0.1998005  0.2669884  -0.748
## genderMale                           -0.0421137  0.0764787  -0.551
## SeniorCitizen                         0.1819262  0.0994083   1.830
## PartnerYes                           -0.0001958  0.0919891  -0.002
## DependentsYes                        -0.1986362  0.1056846  -1.880
## tenure                               -0.0537164  0.0071279  -7.536
## PhoneServiceYes                      -0.6542136  0.1751012  -3.736
## MultipleLinesYes                             NA         NA      NA
## InternetServiceFiber optic            0.8627519  0.1577810   5.468
## InternetServiceNo                    -0.4630047  0.2236187  -2.071
## OnlineSecurityYes                            NA         NA      NA
## OnlineBackupYes                              NA         NA      NA
## DeviceProtectionYes                          NA         NA      NA
## TechSupportYes                               NA         NA      NA
## StreamingTVYes                               NA         NA      NA
## StreamingMoviesYes                           NA         NA      NA
## ContractOne year                     -0.7576200  0.1261411  -6.006
## ContractTwo year                     -1.4140798  0.1986139  -7.120
## PaperlessBillingYes                   0.4092732  0.0881269   4.644
## PaymentMethodCredit card (automatic) -0.1191094  0.1335949  -0.892
## PaymentMethodElectronic check         0.3192109  0.1124708   2.838
## PaymentMethodMailed check            -0.0834132  0.1369695  -0.609
## MonthlyCharges                        0.0033448  0.0047726   0.701
## TotalCharges                          0.0002640  0.0000814   3.244
##                                      Pr(>|z|)    
## (Intercept)                          0.454250    
## genderMale                           0.581868    
## SeniorCitizen                        0.067236 .  
## PartnerYes                           0.998302    
## DependentsYes                        0.060174 .  
## tenure                               4.84e-14 ***
## PhoneServiceYes                      0.000187 ***
## MultipleLinesYes                           NA    
## InternetServiceFiber optic           4.55e-08 ***
## InternetServiceNo                    0.038405 *  
## OnlineSecurityYes                          NA    
## OnlineBackupYes                            NA    
## DeviceProtectionYes                        NA    
## TechSupportYes                             NA    
## StreamingTVYes                             NA    
## StreamingMoviesYes                         NA    
## ContractOne year                     1.90e-09 ***
## ContractTwo year                     1.08e-12 ***
## PaperlessBillingYes                  3.42e-06 ***
## PaymentMethodCredit card (automatic) 0.372623    
## PaymentMethodElectronic check        0.004537 ** 
## PaymentMethodMailed check            0.542530    
## MonthlyCharges                       0.483404    
## TotalCharges                         0.001181 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5652.2  on 4921  degrees of freedom
## Residual deviance: 4165.2  on 4905  degrees of freedom
## AIC: 4199.2
## 
## Number of Fisher Scoring iterations: 6
predict_1_prob <- predict(telco_log, newdata = test, type = "response")
predict_1_class <- ifelse(predict_1_prob > 0.5, "Yes", "No")
mean(predict_1_class == test$Churn)
## [1] 0.8047393

Now lets make a model only with important variables i.e. variables with less p-value

telco_log <- glm(Churn ~ SeniorCitizen + tenure + PhoneService + InternetService +
                   Contract + PaperlessBilling + PaymentMethod + TotalCharges, 
                 data = train, family = "binomial")

summary(telco_log)
## 
## Call:
## glm(formula = Churn ~ SeniorCitizen + tenure + PhoneService + 
##     InternetService + Contract + PaperlessBilling + PaymentMethod + 
##     TotalCharges, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6816  -0.6960  -0.3129   0.7830   3.2881  
## 
## Coefficients:
##                                        Estimate Std. Error z value
## (Intercept)                          -1.359e-01  1.854e-01  -0.733
## SeniorCitizen                         2.175e-01  9.727e-02   2.236
## tenure                               -5.612e-02  6.773e-03  -8.286
## PhoneServiceYes                      -6.037e-01  1.548e-01  -3.900
## InternetServiceFiber optic            9.472e-01  1.133e-01   8.361
## InternetServiceNo                    -5.761e-01  1.572e-01  -3.665
## ContractOne year                     -7.600e-01  1.248e-01  -6.087
## ContractTwo year                     -1.424e+00  1.975e-01  -7.207
## PaperlessBillingYes                   4.173e-01  8.792e-02   4.747
## PaymentMethodCredit card (automatic) -1.119e-01  1.334e-01  -0.838
## PaymentMethodElectronic check         3.276e-01  1.123e-01   2.917
## PaymentMethodMailed check            -8.593e-02  1.366e-01  -0.629
## TotalCharges                          2.967e-04  7.162e-05   4.142
##                                      Pr(>|z|)    
## (Intercept)                          0.463603    
## SeniorCitizen                        0.025372 *  
## tenure                                < 2e-16 ***
## PhoneServiceYes                      9.62e-05 ***
## InternetServiceFiber optic            < 2e-16 ***
## InternetServiceNo                    0.000247 ***
## ContractOne year                     1.15e-09 ***
## ContractTwo year                     5.70e-13 ***
## PaperlessBillingYes                  2.07e-06 ***
## PaymentMethodCredit card (automatic) 0.401900    
## PaymentMethodElectronic check        0.003535 ** 
## PaymentMethodMailed check            0.529271    
## TotalCharges                         3.44e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5652.2  on 4921  degrees of freedom
## Residual deviance: 4170.3  on 4909  degrees of freedom
## AIC: 4196.3
## 
## Number of Fisher Scoring iterations: 6
predict_1_prob <- predict(telco_log, newdata = test, type = "response")
predict_1_class <- ifelse(predict_1_prob > 0.5, "Yes", "No")
mean(predict_1_class == test$Churn)
## [1] 0.8056872

We get almost the same accuracy as we got in our previous Logistic Model

Fitting the Tree Model

telco_tree <- rpart(Churn ~ SeniorCitizen + tenure + PhoneService + InternetService +
                   Contract + PaperlessBilling + PaymentMethod + TotalCharges, 
                 data = train)
rpart.plot(telco_tree)

telco_tree_predict <- predict(telco_tree, newdata = test, type = "class")
mean(test$Churn == telco_tree_predict)
## [1] 0.7862559

Fitting Random Forest

telco_rf <- randomForest(Churn ~ SeniorCitizen + tenure + PhoneService + 
                           InternetService + Contract + PaperlessBilling + PaymentMethod
                         + TotalCharges, data = train)

plot(telco_rf)

varImpPlot(telco_rf)

telco_rf
## 
## Call:
##  randomForest(formula = Churn ~ SeniorCitizen + tenure + PhoneService +      InternetService + Contract + PaperlessBilling + PaymentMethod +      TotalCharges, data = train) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 20.78%
## Confusion matrix:
##       No Yes class.error
## No  3263 374   0.1028320
## Yes  649 636   0.5050584
telco_rf_predict <- predict(telco_rf, newdata = test)
confusionMatrix(telco_rf_predict, test$Churn)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1388  285
##        Yes  138  299
##                                           
##                Accuracy : 0.7995          
##                  95% CI : (0.7818, 0.8164)
##     No Information Rate : 0.7232          
##     P-Value [Acc > NIR] : 3.612e-16       
##                                           
##                   Kappa : 0.4571          
##  Mcnemar's Test P-Value : 1.259e-12       
##                                           
##             Sensitivity : 0.9096          
##             Specificity : 0.5120          
##          Pos Pred Value : 0.8296          
##          Neg Pred Value : 0.6842          
##              Prevalence : 0.7232          
##          Detection Rate : 0.6578          
##    Detection Prevalence : 0.7929          
##       Balanced Accuracy : 0.7108          
##                                           
##        'Positive' Class : No              
##