Churn Prediction - Logistic Regression

library(readxl)

#get current directory path
getwd()

## [1] "D:/APGCBAA/III/RProg/workspace/DataAnalysis"

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.2     v purrr   0.3.4
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(repr)
library(caTools)
library(rpart)
library(rpart.plot)
library(ggpubr)


#************************************************************************
# Real Teleco Customer Dataset
#************************************************************************
tcc <- read_excel("Telco-Customer-Churn.xlsx",sheet = 1)

glimpse(tcc)

## Rows: 7,043
## Columns: 21
## $ customerID       <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOCW~
## $ gender           <chr> "Female", "Male", "Male", "Male", "Female", "Female",~
## $ SeniorCitizen    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Partner          <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes~
## $ Dependents       <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No"~
## $ tenure           <dbl> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, 2~
## $ PhoneService     <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", ~
## $ MultipleLines    <chr> "No phone service", "No", "No", "No phone service", "~
## $ InternetService  <chr> "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber opt~
## $ OnlineSecurity   <chr> "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes", "~
## $ OnlineBackup     <chr> "Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "N~
## $ DeviceProtection <chr> "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "Y~
## $ TechSupport      <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Yes~
## $ StreamingTV      <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "No", "Ye~
## $ StreamingMovies  <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "Yes~
## $ Contract         <chr> "Month-to-month", "One year", "Month-to-month", "One ~
## $ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No", ~
## $ PaymentMethod    <chr> "Electronic check", "Mailed check", "Mailed check", "~
## $ MonthlyCharges   <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.7~
## $ TotalCharges     <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1949~
## $ Churn            <chr> "No", "No", "Yes", "No", "Yes", "Yes", "No", "No", "Y~

#Check for missing/NULL/NA Data
sumdata <- colSums(is.na(tcc))
sumdata

##       customerID           gender    SeniorCitizen          Partner 
##                0                0                0                0 
##       Dependents           tenure     PhoneService    MultipleLines 
##                0                0                0                0 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##                0                0                0                0 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##                0                0                0                0 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##                0                0                0               11 
##            Churn 
##                0

colSums(is.na(tcc))/nrow(tcc)

##       customerID           gender    SeniorCitizen          Partner 
##      0.000000000      0.000000000      0.000000000      0.000000000 
##       Dependents           tenure     PhoneService    MultipleLines 
##      0.000000000      0.000000000      0.000000000      0.000000000 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##      0.000000000      0.000000000      0.000000000      0.000000000 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##      0.000000000      0.000000000      0.000000000      0.000000000 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##      0.000000000      0.000000000      0.000000000      0.001561834 
##            Churn 
##      0.000000000

barplot(colSums(is.na(tcc)),
        las = 2,
        col = rainbow(7),cex.names=0.7,
        ylab = 'count',
        horiz = FALSE,
)

#Terms like “No internet service” and “No phone service” should be changed to “No” for convenience.
data <- data.frame(lapply(tcc, function(x) {
  gsub("No internet service", "No", x)}))
data <- data.frame(lapply(tcc, function(x) {
  gsub("No phone service", "No", x)}))



# 2. SeniorCitizen responds in 1 and 0, which is changed to “Yes” or “No”.
data$SeniorCitizen <- as.factor(ifelse(data$SeniorCitizen==1, 'YES', 'NO'))

# 3. Convert double type variables into numeric type and store them in a variable as a data frame.
num_columns <- c("tenure", "MonthlyCharges", "TotalCharges")
data[num_columns] <- sapply(data[num_columns], as.numeric)
data_int <- data[,c("tenure", "MonthlyCharges", "TotalCharges")]
data_int <- data.frame(scale(data_int))


#4 imputing Total Charges with mean
colSums(is.na(data))

##       customerID           gender    SeniorCitizen          Partner 
##                0                0                0                0 
##       Dependents           tenure     PhoneService    MultipleLines 
##                0                0                0                0 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##                0                0                0                0 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##                0                0                0                0 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##                0                0                0               11 
##            Churn 
##                0

data$TotalCharges[is.na(data$TotalCharges)] <- mean(data$TotalCharges,na.rm=TRUE)

#Check for missing/NULL/NA Data
colSums(is.na(data))

##       customerID           gender    SeniorCitizen          Partner 
##                0                0                0                0 
##       Dependents           tenure     PhoneService    MultipleLines 
##                0                0                0                0 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##                0                0                0                0 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##                0                0                0                0 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##                0                0                0                0 
##            Churn 
##                0

#5. Prepare the final data for analysis excluding selected attributes which we don’t need throughout the analysis process.
data_req <- data[,-c(1,6,19,20)]

x <- data.frame(sapply(data_req,function(x) data.frame(model.matrix(~x-1,data =data_req))[,-1]))
colSums(is.na(x))

##                                 gender                          SeniorCitizen 
##                                      0                                      0 
##                                Partner                             Dependents 
##                                      0                                      0 
##                           PhoneService                          MultipleLines 
##                                      0                                      0 
##           InternetService.xFiber.optic                    InternetService.xNo 
##                                      0                                      0 
##    OnlineSecurity.xNo.internet.service                    OnlineSecurity.xYes 
##                                      0                                      0 
##      OnlineBackup.xNo.internet.service                      OnlineBackup.xYes 
##                                      0                                      0 
##  DeviceProtection.xNo.internet.service                  DeviceProtection.xYes 
##                                      0                                      0 
##       TechSupport.xNo.internet.service                       TechSupport.xYes 
##                                      0                                      0 
##       StreamingTV.xNo.internet.service                       StreamingTV.xYes 
##                                      0                                      0 
##   StreamingMovies.xNo.internet.service                   StreamingMovies.xYes 
##                                      0                                      0 
##                     Contract.xOne.year                     Contract.xTwo.year 
##                                      0                                      0 
##                       PaperlessBilling PaymentMethod.xCredit.card..automatic. 
##                                      0                                      0 
##        PaymentMethod.xElectronic.check            PaymentMethod.xMailed.check 
##                                      0                                      0 
##                                  Churn 
##                                      0

colSums(is.na(data_int))

##         tenure MonthlyCharges   TotalCharges 
##              0              0             11

data_int$TotalCharges[is.na(data_int$TotalCharges)] <- mean(data_int$TotalCharges,na.rm=TRUE)
colSums(is.na(data_int))

##         tenure MonthlyCharges   TotalCharges 
##              0              0              0

############# Cleaning Further in any Further NA's ###################################
x <- na.omit(x) # omit the NA's
data_int <- na.omit(data_int) # omit the NA's
colSums(is.na(data_int))

##         tenure MonthlyCharges   TotalCharges 
##              0              0              0

Clean_Data <- cbind(x, data_int)

#####################  Exploratory Analysis  ##############################
#Correlation plot between the numeric variables:
#Correlation plot between the numeric variables:
library(corrplot)

## corrplot 0.90 loaded

corr = cor(data_int)
corr

##                   tenure MonthlyCharges TotalCharges
## tenure         1.0000000      0.2478999    0.8247573
## MonthlyCharges 0.2478999      1.0000000    0.6504680
## TotalCharges   0.8247573      0.6504680    1.0000000

library("PerformanceAnalytics")

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

chart.Correlation(data_int, histogram=TRUE, pch=19)

# 2. Churn percentage tells around 26.58% of the customer left during last month.
churn <- data %>%
  group_by(Churn) %>%
  summarise(Count = n())%>%
  mutate(percentage = prop.table(Count)*100)
ggplot(churn, aes(reorder(Churn, -percentage), percentage), fill = Churn)+
  geom_col(fill = c("blue", "orange"))+
  geom_text(aes(label = sprintf("%.2f%%", percentage)))+
  xlab("Churn") +
  ylab("Percent")+
  ggtitle("Churn Percentage")

####################    Logistic Regression  #######################

#Split the data in train and test sets
set.seed(123)
split <- sample.split(Clean_Data$Churn, SplitRatio = 0.70)
train <- Clean_Data[split,]
test <- Clean_Data[!(split),]

# 2. Calculate the baseline accuracy
prop.table(table(train$Churn))

## 
##         0         1 
## 0.7346856 0.2653144

# 3. Fitting the model using glm() function
model <- glm(Churn ~., data = train, family = "binomial")
summary(model)

## 
## Call:
## glm(formula = Churn ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9703  -0.6667  -0.2740   0.6976   3.4427  
## 
## Coefficients: (6 not defined because of singularities)
##                                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                            -3.8993427  1.5190886  -2.567 0.010261
## gender                                  0.0007079  0.0782945   0.009 0.992786
## SeniorCitizen                           0.2280666  0.1021463   2.233 0.025566
## Partner                                 0.0380986  0.0938620   0.406 0.684816
## Dependents                             -0.1881251  0.1085452  -1.733 0.083069
## PhoneService                            0.8674040  0.7720755   1.123 0.261238
## MultipleLines                           0.6201966  0.2109465   2.940 0.003281
## InternetService.xFiber.optic            2.3386261  0.9486119   2.465 0.013689
## InternetService.xNo                    -2.4804426  0.9588639  -2.587 0.009686
## OnlineSecurity.xNo.internet.service            NA         NA      NA       NA
## OnlineSecurity.xYes                    -0.0572797  0.2146295  -0.267 0.789564
## OnlineBackup.xNo.internet.service              NA         NA      NA       NA
## OnlineBackup.xYes                       0.1488093  0.2095328   0.710 0.477583
## DeviceProtection.xNo.internet.service          NA         NA      NA       NA
## DeviceProtection.xYes                   0.2282995  0.2122399   1.076 0.282076
## TechSupport.xNo.internet.service               NA         NA      NA       NA
## TechSupport.xYes                       -0.1066490  0.2171769  -0.491 0.623377
## StreamingTV.xNo.internet.service               NA         NA      NA       NA
## StreamingTV.xYes                        0.8858400  0.3899902   2.271 0.023120
## StreamingMovies.xNo.internet.service           NA         NA      NA       NA
## StreamingMovies.xYes                    0.8601929  0.3898006   2.207 0.027331
## Contract.xOne.year                     -0.7130903  0.1326759  -5.375 7.67e-08
## Contract.xTwo.year                     -1.3852640  0.2120211  -6.534 6.42e-11
## PaperlessBilling                        0.3479089  0.0897581   3.876 0.000106
## PaymentMethod.xCredit.card..automatic.  0.0036981  0.1382275   0.027 0.978656
## PaymentMethod.xElectronic.check         0.3835965  0.1151436   3.331 0.000864
## PaymentMethod.xMailed.check            -0.0185990  0.1397057  -0.133 0.894091
## tenure                                 -1.4985032  0.1835281  -8.165 3.21e-16
## MonthlyCharges                         -1.9939708  1.1354690  -1.756 0.079075
## TotalCharges                            0.7512229  0.1925731   3.901 9.58e-05
##                                           
## (Intercept)                            *  
## gender                                    
## SeniorCitizen                          *  
## Partner                                   
## Dependents                             .  
## PhoneService                              
## MultipleLines                          ** 
## InternetService.xFiber.optic           *  
## InternetService.xNo                    ** 
## OnlineSecurity.xNo.internet.service       
## OnlineSecurity.xYes                       
## OnlineBackup.xNo.internet.service         
## OnlineBackup.xYes                         
## DeviceProtection.xNo.internet.service     
## DeviceProtection.xYes                     
## TechSupport.xNo.internet.service          
## TechSupport.xYes                          
## StreamingTV.xNo.internet.service          
## StreamingTV.xYes                       *  
## StreamingMovies.xNo.internet.service      
## StreamingMovies.xYes                   *  
## Contract.xOne.year                     ***
## Contract.xTwo.year                     ***
## PaperlessBilling                       ***
## PaymentMethod.xCredit.card..automatic.    
## PaymentMethod.xElectronic.check        ***
## PaymentMethod.xMailed.check               
## tenure                                 ***
## MonthlyCharges                         .  
## TotalCharges                           ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5704.4  on 4929  degrees of freedom
## Residual deviance: 4009.0  on 4906  degrees of freedom
## AIC: 4057
## 
## Number of Fisher Scoring iterations: 6

#################    bservations on the Train  set   ##########################
pred_train <- predict(model, data = train, type = "response")
summary(pred_train)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001097 0.039613 0.188353 0.265314 0.466860 0.868097

# confusion matrix on training set
cmatrix_train <- table(train$Churn, pred_train >= 0.5)
cmatrix_train

##    
##     FALSE TRUE
##   0  3254  368
##   1   559  749

#Accuracy
mod_train_acc <- ((cmatrix_train[1,1]+cmatrix_train[2,2]))/nrow(train)
# Misclassification rate or Error Rate
mod_train_mr <- 1-mod_train_acc
# True positive rate / Sensitivity , tpr = tp/(tp+fn)
mod_train_tpr <- cmatrix_train[2,2] / (cmatrix_train[2,2] + cmatrix_train[1,2])
#False positive rate fpr = fp/(fp+tn)
mod_train_fpr <- cmatrix_train[2,1] / (cmatrix_train[2,1] + cmatrix_train[1,1])
# Specificity Spec tnr = tn/(tn+fp)
mod_train_tnr <- cmatrix_train[1,1] / (cmatrix_train[1,1] + cmatrix_train[2,1])
#False Negative rate fnr = fn/(fn+tp)
mod_train_fnr <- cmatrix_train[1,2] / (cmatrix_train[1,2] + cmatrix_train[2,2])
#Negative predictive value (NPV) = tn/(tn+fn)
mod_train_npv <- cmatrix_train[1,1] / (cmatrix_train[1,1] + cmatrix_train[1,2])
#Positive predictive value / Precision  ppv =  tp/(tp+fp)
mod_train_ppv <- cmatrix_train[2,2] / (cmatrix_train[2,2] + cmatrix_train[2,1])
#Positive DLR(Diagnostic likelihood ratio) = tpr/fpr
mod_train_pdlr <- ( mod_train_tpr / mod_train_fpr )
#Negative DLR(Diagnostic likelihood ratio) = tnr/fnr
mod_train_ndlr <- ( mod_train_tnr / mod_train_fnr )
#F-Score = 2 x (ppv x tpr)/(ppv + tpr)
mod_train_fscore <- 2 * ((mod_train_ppv * mod_train_tpr) / (mod_train_ppv + mod_train_tpr ))

cat("*************  Observations on the Train  set  *************  \n",
"Accuracy                                       : ", mod_train_acc,"\n" ,
"Misclassification rate or Error Rate           : ", mod_train_mr,"\n" ,
"Sensitivity (TPR)                              : ", mod_train_tpr,"\n" ,
"Specificity (TNR)                              : ", mod_train_tnr,"\n" ,
"False Negative Rate (FNR)                      : ", mod_train_fnr,"\n" ,
"False positive rate (FPR)                      : ", mod_train_fpr,"\n" ,
"Negative predictive value (NPV)                : ", mod_train_npv,"\n" ,
"Positive predictive value / Precision (PPV)    : ", mod_train_ppv,"\n" ,
"Positive DLR(Diagnostic likelihood ratio)      : ", mod_train_pdlr,"\n" ,
"Negative DLR(Diagnostic likelihood ratio)      : ", mod_train_ndlr,"\n" ,
"F-Score                                        : ", mod_train_fscore,"\n" )

## *************  Observations on the Train  set  *************  
##  Accuracy                                       :  0.8119675 
##  Misclassification rate or Error Rate           :  0.1880325 
##  Sensitivity (TPR)                              :  0.6705461 
##  Specificity (TNR)                              :  0.8533963 
##  False Negative Rate (FNR)                      :  0.3294539 
##  False positive rate (FPR)                      :  0.1466037 
##  Negative predictive value (NPV)                :  0.8983987 
##  Positive predictive value / Precision (PPV)    :  0.57263 
##  Positive DLR(Diagnostic likelihood ratio)      :  4.573868 
##  Negative DLR(Diagnostic likelihood ratio)      :  2.590336 
##  F-Score                                        :  0.617732

#################    Observations on the test set   ##########################

pred_test <- predict(model, newdata = test, type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

# confusion matrix on training set
cmatrix_test <- table(test$Churn, pred_test >= 0.5)
cmatrix_test

##    
##     FALSE TRUE
##   0  1387  165
##   1   257  304

#Accuracy = (TP+TN)/(TN+FN+TP+FP)
mod_test_acc <- ((cmatrix_test[1,1]+cmatrix_test[2,2]))/nrow(test)
# Misclassification rate or Error Rate
mod_test_mr <- 1-mod_test_acc
# True positive rate / Sensitivity , tpr = tp/(tp+fn)
mod_test_tpr <- cmatrix_test[2,2] / (cmatrix_test[2,2] + cmatrix_test[1,2])
#False positive rate fpr = fp/(fp+tn)
mod_test_fpr <- cmatrix_test[2,1] / (cmatrix_test[2,1] + cmatrix_test[1,1])
# Specificity Spec tnr = tn/(tn+fp)
mod_test_tnr <- cmatrix_test[1,1] / (cmatrix_test[1,1] + cmatrix_test[2,1])
#False Negative rate fnr = fn/(fn+tp)
mod_test_fnr <- cmatrix_test[1,2] / (cmatrix_test[1,2] + cmatrix_test[2,2])
#Negative predictive value (NPV) = tn/(tn+fn)
mod_test_npv <- cmatrix_test[1,1] / (cmatrix_test[1,1] + cmatrix_test[1,2])
#Positive predictive value / Precision  ppv =  tp/(tp+fp)
mod_test_ppv <- cmatrix_test[2,2] / (cmatrix_test[2,2] + cmatrix_test[2,1])
#Positive DLR(Diagnostic likelihood ratio) = tpr/fpr
mod_test_pdlr <- ( mod_test_tpr / mod_test_fpr )
#Negative DLR(Diagnostic likelihood ratio) = tnr/fnr
mod_test_ndlr <- ( mod_test_tnr / mod_test_fnr )
#F-Score = 2 x (ppv x tpr)/(ppv + tpr)
mod_test_fscore <- 2 * ((mod_test_ppv * mod_test_tpr) / (mod_test_ppv + mod_test_tpr ))

cat("*************  Observations on the Test  set  *************  \n",
"Accuracy                                       : ", mod_test_acc,"\n" ,
"Misclassification rate or Error Rate           : ", mod_test_mr,"\n" ,
"Sensitivity (TPR)                              : ", mod_test_tpr,"\n" ,
"Specificity (TNR)                              : ", mod_test_tnr,"\n" ,
"False Negative Rate (FNR)                      : ", mod_test_fnr,"\n" ,
"False positive rate (FPR)                      : ", mod_test_fpr,"\n" ,
"Negative predictive value (NPV)                : ", mod_test_npv,"\n" ,
"Positive predictive value / Precision (PPV)    : ", mod_test_ppv,"\n" ,
"Positive DLR(Diagnostic likelihood ratio)      : ", mod_test_pdlr,"\n" ,
"Negative DLR(Diagnostic likelihood ratio)      : ", mod_test_ndlr,"\n" ,
"F-Score                                        : ", mod_test_fscore,"\n" )

## *************  Observations on the Test  set  *************  
##  Accuracy                                       :  0.800284 
##  Misclassification rate or Error Rate           :  0.199716 
##  Sensitivity (TPR)                              :  0.6481876 
##  Specificity (TNR)                              :  0.843674 
##  False Negative Rate (FNR)                      :  0.3518124 
##  False positive rate (FPR)                      :  0.156326 
##  Negative predictive value (NPV)                :  0.8936856 
##  Positive predictive value / Precision (PPV)    :  0.5418895 
##  Positive DLR(Diagnostic likelihood ratio)      :  4.146383 
##  Negative DLR(Diagnostic likelihood ratio)      :  2.398079 
##  F-Score                                        :  0.5902913

Churn Prediction - Logistic Regression

Prasanna Reddy

02/07/2021