# 1. Data. 
# 2. Data Cleaning.
# 3. EDA.
# 4. Preparing  To Model The Data.
# 5. Building Logistic Regression.
# 6. Model Evaluation.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(ggplot2)
library(ggthemes)
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
library(stringr)
library(ggcorrplot)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(rlang)
library(caret)
## Loading required package: lattice
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(dplyr)
library(ggplot2)
library(cowplot)
library(glue)
library(scales)


library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
data_raw <- read_csv("~/Downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv")
## Rows: 7043 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): customerID, gender, Partner, Dependents, PhoneService, MultipleLin...
## dbl  (4): SeniorCitizen, tenure, MonthlyCharges, TotalCharges
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_raw <- data_raw[,-1]
dim(data_raw)
## [1] 7043   20
#remove CustomerID
data_raw$id <-c(1:nrow(data_raw)) 
data_raw <- data_raw[,c(21,1:20)]

Project Objective

This project has two main objective and they are as follows:

  1. Learn more about our potential customer. I will use EDA.
  2. To build a classification model that would predict the customer churn:i will use Logistic Regression.

# Data

The data has 7043 observations and 20 variables. The target variable is “Churn” with a binary response(“Yes”,“No”) and the rest are indpendent variables.

Data Cleaning/Data Preparation

1. Data Types

str(data_raw)
## tibble [7,043 × 21] (S3: tbl_df/tbl/data.frame)
##  $ id              : int [1:7043] 1 2 3 4 5 6 7 8 9 10 ...
##  $ gender          : chr [1:7043] "Female" "Male" "Male" "Male" ...
##  $ SeniorCitizen   : num [1:7043] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Partner         : chr [1:7043] "Yes" "No" "No" "No" ...
##  $ Dependents      : chr [1:7043] "No" "No" "No" "No" ...
##  $ tenure          : num [1:7043] 1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : chr [1:7043] "No" "Yes" "Yes" "No" ...
##  $ MultipleLines   : chr [1:7043] "No phone service" "No" "No" "No phone service" ...
##  $ InternetService : chr [1:7043] "DSL" "DSL" "DSL" "DSL" ...
##  $ OnlineSecurity  : chr [1:7043] "No" "Yes" "Yes" "Yes" ...
##  $ OnlineBackup    : chr [1:7043] "Yes" "No" "Yes" "No" ...
##  $ DeviceProtection: chr [1:7043] "No" "Yes" "No" "Yes" ...
##  $ TechSupport     : chr [1:7043] "No" "No" "No" "Yes" ...
##  $ StreamingTV     : chr [1:7043] "No" "No" "No" "No" ...
##  $ StreamingMovies : chr [1:7043] "No" "No" "No" "No" ...
##  $ Contract        : chr [1:7043] "Month-to-month" "One year" "Month-to-month" "One year" ...
##  $ PaperlessBilling: chr [1:7043] "Yes" "No" "Yes" "No" ...
##  $ PaymentMethod   : chr [1:7043] "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
##  $ MonthlyCharges  : num [1:7043] 29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num [1:7043] 29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : chr [1:7043] "No" "No" "Yes" "No" ...
data_raw<- clean_names(data_raw)
head(data_raw,6)
## # A tibble: 6 × 21
##      id gender senior_citizen partner dependents tenure phone_service
##   <int> <chr>           <dbl> <chr>   <chr>       <dbl> <chr>        
## 1     1 Female              0 Yes     No              1 No           
## 2     2 Male                0 No      No             34 Yes          
## 3     3 Male                0 No      No              2 Yes          
## 4     4 Male                0 No      No             45 No           
## 5     5 Female              0 No      No              2 Yes          
## 6     6 Female              0 No      No              8 Yes          
## # ℹ 14 more variables: multiple_lines <chr>, internet_service <chr>,
## #   online_security <chr>, online_backup <chr>, device_protection <chr>,
## #   tech_support <chr>, streaming_tv <chr>, streaming_movies <chr>,
## #   contract <chr>, paperless_billing <chr>, payment_method <chr>,
## #   monthly_charges <dbl>, total_charges <dbl>, churn <chr>
summary(data_raw)
##        id          gender          senior_citizen     partner         
##  Min.   :   1   Length:7043        Min.   :0.0000   Length:7043       
##  1st Qu.:1762   Class :character   1st Qu.:0.0000   Class :character  
##  Median :3522   Mode  :character   Median :0.0000   Mode  :character  
##  Mean   :3522                      Mean   :0.1621                     
##  3rd Qu.:5282                      3rd Qu.:0.0000                     
##  Max.   :7043                      Max.   :1.0000                     
##                                                                       
##   dependents            tenure      phone_service      multiple_lines    
##  Length:7043        Min.   : 0.00   Length:7043        Length:7043       
##  Class :character   1st Qu.: 9.00   Class :character   Class :character  
##  Mode  :character   Median :29.00   Mode  :character   Mode  :character  
##                     Mean   :32.37                                        
##                     3rd Qu.:55.00                                        
##                     Max.   :72.00                                        
##                                                                          
##  internet_service   online_security    online_backup      device_protection 
##  Length:7043        Length:7043        Length:7043        Length:7043       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  tech_support       streaming_tv       streaming_movies     contract        
##  Length:7043        Length:7043        Length:7043        Length:7043       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  paperless_billing  payment_method     monthly_charges  total_charges   
##  Length:7043        Length:7043        Min.   : 18.25   Min.   :  18.8  
##  Class :character   Class :character   1st Qu.: 35.50   1st Qu.: 401.4  
##  Mode  :character   Mode  :character   Median : 70.35   Median :1397.5  
##                                        Mean   : 64.76   Mean   :2283.3  
##                                        3rd Qu.: 89.85   3rd Qu.:3794.7  
##                                        Max.   :118.75   Max.   :8684.8  
##                                                         NA's   :11      
##     churn          
##  Length:7043       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

convert senior_citizen to a factor data type

data_raw$senior_citizen <- factor(data_raw$senior_citizen)

> We have 17 variables of factor data type and 3 variables of numeric data type.

2. Missing Values

The only column has missing values is “total_charges” with 11 NA’s. Its very small proprtion to the total rows and therfor i will drop NA’s records.

summary(data_raw$total_charges)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    18.8   401.4  1397.5  2283.3  3794.7  8684.8      11
# drop NA's rows
data_raw <- data_raw[!is.na(data_raw$total_charges),]

data <- data_raw
dim(data)
## [1] 7032   21

3. Outliers

In ordr to check for outliers i would use the histogram plot for the three numeric variables(tenure,total_charges and monthly_charges).As plot below there are no outliers.

options(repr.plot.width=10, repr.plot.height=7)
hist(data$tenure,main="Tenure Histogram",col = "red",xlab="Tenure",ylab="Count") 

hist(data$monthly_charges,main="Monthly Chargers Histogram",col = "green",xlab="Monthly_charges",ylab="Count")

hist(data$total_charges,main="Total Chargers Histogram",col = "purple",xlab="Total_charges",ylab="Count")

# Explatoray Data Analysis

1. Churn

options(warn=-1)
sam <- theme(plot.background = element_rect(fill="#F5FFFA",color = "darkblue"),
             plot.title = element_text(size=25, hjust=.5),
             axis.title.x = element_text(size=25, color = "black"),
              axis.title.y = element_text(size=25, color = "black"),
             axis.text.x = element_text(size=20),
             axis.text.y = element_text(size=20),
            legend.position = "top",
            legend.text = element_text(size=20),
            legend.title = element_text(size=20))

churn <- data %>%
group_by(churn) %>%
summarise(n = n())%>%
mutate(prop = n / sum(n)) %>%
ungroup()%>%
mutate(label_text = str_glue("n:{n} \n prop:{scales::percent(prop)}"))

churn
## # A tibble: 2 × 4
##   churn     n  prop label_text     
##   <chr> <int> <dbl> <glue>         
## 1 No     5163 0.734 n:5163 
## prop:73%
## 2 Yes    1869 0.266 n:1869 
## prop:27%
options(repr.plot.width=15, repr.plot.height=10)
churn1 <- churn %>% ggplot(aes(x = churn,y = prop,fill = churn)) + 
geom_col(alpha=0.7,color="black") +
geom_label(aes(label=label_text),fill="white",size =8,position=position_fill(vjust=0.3),color = "black",size = 10)+
xlab("Churn(Yes,NO)") +
ylab("Prop") +
ggtitle("Churn Bar Graph Distribution")+
scale_y_continuous(labels = scales::percent_format())+
theme_minimal()+
sam
churn1

2. Tenure VS Churn

options(repr.plot.width=20, repr.plot.height=15)
data %>% ggplot(mapping = aes(x = tenure)) +
geom_bar(aes(fill = churn),color="black",alpha=0.7) +
theme_minimal()+
xlab("Tenure")+
ylab("Count") +
ggtitle("Tenure Bar Graph with Churn Overlay\n(Not Normalized)")+
theme(legend.position = "none") +
sam

data %>% ggplot(mapping = aes(x = tenure)) +
geom_bar(aes(fill = churn),position = 'fill',color="black",alpha=0.7) +
scale_y_continuous(labels = scales::percent_format())+
theme_minimal()+
xlab("Tenure")+
ylab("Prop") +
ggtitle("Tenure Bar Graph with Churn Overlay \n(Normalized)") +
sam

data %>% ggplot(aes(x = tenure,fill = churn)) + 
geom_density(alpha=0.7,color="black") +
xlab("Tenure") +
ylab("Prop") +
ggtitle("Tenure Density Graph with Churn") +
theme_minimal()+
sam

3. Monthly Charges VS Churn

options(repr.plot.width=20, repr.plot.height=15)

data %>% ggplot(mapping = aes(x = monthly_charges)) +
geom_bar(aes(fill = churn),alpha=0.7) +
theme_minimal()+
xlab("Monthly Charges ($)")+
ylab("Count") +
ggtitle("Monthly Charges Bar Graph with Churn Overlay\n(Not Normalized)")+
theme(legend.position = "none") +
sam

data %>% ggplot(mapping = aes(x = monthly_charges)) +
geom_bar(aes(fill = churn),position = 'fill',alpha=0.7) +
scale_y_continuous(labels = scales::percent_format())+
theme_minimal()+
xlab("Monthly Charges ($)")+
ylab("Prop") +
ggtitle("Monthly Charges Bar Graph with Churn Overlay \n(Normalized)") +
sam

data %>% ggplot(aes(x = monthly_charges,fill = churn)) + 
geom_density(alpha=0.7,color="black") +
xlab("Monthly Charges") +
ylab("Prop") +
ggtitle("Monthly Charges Density Graph with Churn") +
theme_minimal()+
sam

The majority of our customers has low monthly charges. Those with the highest proportion of positive churn(left our platform) are the ones with high monthly charges(between ‘70’ to ‘112’$ / month)

4. Total Charges VS Churn

options(repr.plot.width=20, repr.plot.height=15)

options(warn=-1)
data %>% ggplot(mapping = aes(x = total_charges)) +
geom_bar(aes(fill = churn),alpha = 0.7) +
theme_minimal()+
xlab("Total Charges ($)")+
ylab("Count") +
ggtitle("Total Charges Bar Graph with Churn Overlay\n(Not Normalized)")+
theme(legend.position = "none") +
sam

data %>% ggplot(mapping = aes(x = total_charges)) +
geom_bar(aes(fill = churn),position = 'fill',alpha=0.7) +
scale_y_continuous(labels = scales::percent_format())+
theme_minimal()+
xlab("Total Charges ($)")+
ylab("Prop") +
ggtitle("Total Charges Bar Graph with Churn Overlay \n(Normalized)") +
sam

data %>% ggplot(aes(x =total_charges,fill = churn)) + 
geom_density(alpha=0.7,color="black") +
xlab("Total Charges") +
ylab("Prop") +
ggtitle("Total Charges Density Graph with Churn") +
theme_minimal()+
sam

* Correlation Matrix Between Numeric Variables

ggcorrplot(round(cor(data[,c(6,19,20)]),2), title = "Correlation Matrix",hc.order = TRUE,lab=TRUE,type = "lower",lab_size = 5) + theme(plot.title=element_text(hjust = 0.5,size = 25),axis.text.y = element_text(size = 25),axis.text.x = element_text(size = 15),legend.text = element_text(size = 20))

options(repr.plot.width=15, repr.plot.height=22)

ggpairs(data,columns = c(6,19,20),ggplot2::aes(color = churn)) +
theme_economist()+
sam 

I found:

5. Gender Vs Churn

library(glue)
plot_facet_bar <- function(data, target, list_of_variables) {
    target <- sym(target) # Converting the string to a column reference
    plt_matrix <- list()
    
    for (i in seq_along(list_of_variables)) {
        column <- list_of_variables[i]
        col <- sym(column) 
        
        temp <- data %>% 
            group_by(!!col, !!target) %>% 
            summarize(count = n(), .groups = 'drop') %>% 
            mutate(prop = round(count / sum(count), 2)) %>%
            ungroup() %>%
            mutate(label_text = glue("n: {count}\nprop: {percent(prop)}"))
        
        plt_matrix[[i]] <- ggplot(data = temp, aes(x = !!col, y = prop, fill = !!target)) + 
            geom_bar(stat = "identity", alpha = 0.7, color = "black") +
            geom_label(aes(label = label_text), size = 3, hjust = 0.5, fill = "white", color = "black") +
            scale_y_continuous(labels = percent_format()) +
            labs(x = column, y = "Proportion", title = paste("Distribution of", target, "across", column)) +
            theme_minimal() +
            theme(
                axis.text.x = element_text(angle = 45, hjust = 1),
                plot.title = element_text(size = 6, face = "bold"),
                axis.title = element_text(size = 14),
                legend.title = element_text(size = 14),
                legend.text = element_text(size = 12)
            )
    }
    
    options(repr.plot.width = 30, repr.plot.height = 20) 
    plot_grid(plotlist = plt_matrix, ncol = 2)
}

plot_facet_bar(data, 'churn', c('gender', 'senior_citizen', 'partner', 'dependents'))

plot_categorical_vs_target <- function(data, target, list_of_variables) {
    target <- sym(target) # Converting the string to a column reference
    plt_matrix <- list()
    
    for (i in seq_along(list_of_variables)) {
        column <- list_of_variables[i]
        col <- sym(column) 
        
        temp <- data %>% 
            group_by(!!col, !!target) %>% 
            summarize(count = n(), .groups = 'drop') %>% 
            mutate(prop = round(count / sum(count), 2)) %>%
            ungroup() %>%
            mutate(label_text = glue("n: {count}\nprop: {percent(prop)}"))
        
        plt_matrix[[i]] <- ggplot(data = temp, aes(x = !!col, y = prop, fill = !!target)) + 
            geom_bar(stat = "identity", alpha = 0.7, color = "black", position = position_dodge(width = 0.9)) +
            geom_text(aes(label = label_text), size = 3, position = position_stack(vjust = 0.5), color = "black") +
            scale_y_continuous(labels = percent_format()) +
            labs(x = column, y = "Proportion", title = paste("Distribution of", target, "across", column)) +
            theme_minimal() +
            theme(
                axis.text.x = element_text(angle = 45, hjust = 1),
                plot.title = element_text(size = 16, face = "bold"),
                axis.title = element_text(size = 14),
                legend.title = element_text(size = 4),
                legend.text = element_text(size = 12)
            )
    }
    
    options(repr.plot.width = 30, repr.plot.height = 20) 
    plot_grid(plotlist = plt_matrix, ncol = 2)
}

plot_categorical_vs_target(data, 'churn', c('phone_service', 'multiple_lines', 'internet_service', 'online_security'))

* Overall across the three levels of multiple lines,on average 73% of our customers didnt left our platform compared to 257% who did.

* 42% Customers of fiber optic services left our platform the last month.93% of customers with no internet services did not leave. 81% of our customers with DSL services didnt leave.

* 42% of customers with no online security left our platform the last month.93% of customers with no internet services did not leave. 85% of our customers with online security didnt leave.

plot_dodged_bar <- function(data, target, list_of_variables) {
    target <- sym(target) # Converting the string to a column reference
    plt_matrix <- list()
    
    for (i in seq_along(list_of_variables)) {
        column <- list_of_variables[i]
        col <- sym(column) 
        
        temp <- data %>% 
            group_by(!!col, !!target) %>% 
            summarize(count = n(), .groups = 'drop') %>% 
            mutate(prop = round(count / sum(count), 2)) %>%
            ungroup() %>%
            mutate(label_text = glue("n: {count}\nprop: {percent(prop)}"))
        
        plt_matrix[[i]] <- ggplot(data = temp, aes(x = !!col, y = prop, fill = !!target)) + 
            geom_bar(stat = "identity", alpha = 0.7, color = "black", position = position_dodge(width = 0.9)) +
            geom_text(aes(label = label_text), size = 3, position = position_dodge(width = 0.9), vjust = -0.5, color = "black") +
            scale_y_continuous(labels = percent_format()) +
            labs(x = column, y = "Proportion", title = paste("Distribution of", target, "across", column)) +
            theme_minimal() +
            theme(
                axis.text.x = element_text(angle = 45, hjust = 1),
                plot.title = element_text(size = 6, face = "bold"),
                axis.title = element_text(size = 4),
                legend.title = element_text(size = 4),
                legend.text = element_text(size = 6)
            )
    }
    
    options(repr.plot.width = 30, repr.plot.height = 20) 
    plot_grid(plotlist = plt_matrix, ncol = 2)
}

# Example usage
plot_dodged_bar(data, 'churn', c('online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies'))

* 93% of our customers with no internet services did leave our platform.

* 40% of customers with no online backup left our platform.75% of Those who have online backup didnt leave.

* 39% of customers with no device protection left our platform.77% of Those who have device protection didnt leave.

* 42% of customers with no tech support left our platform.85% of Those who have tech support didnt leave.

* 34% of customers with no streaming TV left our platform.70% of Those who have streaming TV didnt leave.

* 34% of customers with no streaming movies left our platform.70% of Those who have streaming movies didnt leave.

plot_dodged_bar(data, 'churn', c('contract', 'paperless_billing', 'payment_method'))

* 43% of our customers who have a month-to-month contract left our platform.89% of those have a one contract did not leave.97% of those has two year contract did not leave.

* 34% of our customers who have paperless billing left our platform. 84% of those who did not have paperless billing did not leave.

* 45% of our customers with electronic check as payment method left our platform.

* Bining Based on Churn Value

Its important to bin our numeric variables into levels based on its behavior with the target or predictive variable:

  1. Monthly charges:
    i will devide it into two categories.

False when values are between 25 and 56 OR between 66 and 110.

data$m5bins <- cut(x = data$monthly_charges,breaks = c(0, 26,56,66,110,120),
      right = FALSE, labels = c("very-Low","low","medium","high","very-high"),ordered = TRUE)

data %>%
group_by(m5bins,churn)%>%
summarise(count=n())%>%
mutate(prop=round(count/sum(count),2))%>%
ungroup()%>%
mutate(label_text = str_glue("n : {count}\nprop:{scales::percent(prop)}"))%>%
ggplot(aes(x = m5bins,y = prop,fill = churn)) + 
geom_col(alpha=0.7,color="black") +
geom_label(aes(label=label_text),fill="white",size =8,color = "black",size = 10)+
xlab("Monthly Charges") +
ylab("Prop") +
ggtitle("Monthly Charges Bar Graph with Churn") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal()+
sam
## `summarise()` has grouped output by 'm5bins'. You can override using the
## `.groups` argument.

data %>% ggplot(aes(x = monthly_charges,fill = churn)) + 
geom_density(alpha=0.7,color="black") +
xlab("Monthly Charges") +
ylab("Count") +
ggtitle("Monthly Charges Density Graph with Churn") +
theme_minimal()+
sam

* 28% of Low (monthly charges is between (26 - 55 USD)) monthly charges customers left our platform.

* 36% of high (monthly charges is between (66 - 110 USD)) monthly charges customers left our platform.

Now that we have a full picture on how customer churn behave relative to the indpendent variables in our data i would sum up the characteristics of our potential customer below:

  1. Not a senior citizen

  2. Has a partner.

  3. Has dependents.

  4. Has no internet services.

  5. Has online security.

  6. Has online online backup.

  7. Has online device protection.

  8. Has tech support services.

  9. Has a two year contract.

  10. Has paper billing.

  11. Has credit card(automated) as payment method.

  12. Has a very low monthly charges (below 25 USD).

# Preparing To Model The Data * Randomly Reorder The Data

* Clean Categorical Variables.

* Create Dummy Variables.

* Standardize Continuous Variables.

* Final DataSet.

* Split Data into train data (65%) and test data (35%).

* Randomly Reorder The Data

set.seed(1000)
rows <- sample(nrow(data))
data_1 <- data
data_shuffled <- data_1[rows,]

* Clean Categorical Variables.

In EDA , we know some categorical variables such as online services and phone services have “No”,“Yes” and “No internet services” as categories, or “No phone services” as a category . I will condiser them as “No” and have only “No”,“Yes”.

data_shuffled <- data.frame(lapply(data_shuffled, function(x) {
                  gsub("No internet service", "No", x)}))

data_shuffled <- data.frame(lapply(data_shuffled, function(x) {
                  gsub("No phone service", "No", x)}))

data_shuffled[,c(1,6,19,20)] <- lapply(data_shuffled[,c(1,6,19,20)],as.numeric)


head(data_shuffled)
##     id gender senior_citizen partner dependents tenure phone_service
## 1 5112   Male              1      No         No     47           Yes
## 2  581 Female              0     Yes        Yes     12           Yes
## 3 4317   Male              0      No         No     69           Yes
## 4 1078 Female              0      No         No     41           Yes
## 5 4448   Male              0     Yes         No     68           Yes
## 6 5484 Female              0     Yes         No     55           Yes
##   multiple_lines internet_service online_security online_backup
## 1             No      Fiber optic             Yes            No
## 2             No               No              No            No
## 3            Yes              DSL             Yes           Yes
## 4            Yes      Fiber optic             Yes           Yes
## 5            Yes      Fiber optic             Yes           Yes
## 6            Yes      Fiber optic              No            No
##   device_protection tech_support streaming_tv streaming_movies       contract
## 1                No           No           No              Yes Month-to-month
## 2                No           No           No               No Month-to-month
## 3               Yes          Yes          Yes              Yes       Two year
## 4               Yes          Yes          Yes              Yes Month-to-month
## 5                No          Yes          Yes              Yes Month-to-month
## 6               Yes           No           No               No Month-to-month
##   paperless_billing            payment_method monthly_charges total_charges
## 1                No              Mailed check           85.50       4042.30
## 2                No              Mailed check           19.00        233.55
## 3                No              Mailed check           90.65       6322.10
## 4               Yes Bank transfer (automatic)          114.50       4527.45
## 5               Yes          Electronic check          107.70       7320.90
## 6               Yes          Electronic check           77.75       4458.15
##   churn    m5bins
## 1   Yes      high
## 2   Yes  very-Low
## 3    No      high
## 4   Yes very-high
## 5    No      high
## 6   Yes      high

* Create Dummay Variables.

This can be easily done using ‘dummayVars’ from ‘caret package’. Categorical variables with more than two categoreis such as m5bin ,4 extra column will be created with 0 and 1. total of 25 dummay variables. The full dataset is joining all dummay variables with

d_int <-data_shuffled[,c(1,6,19,20)]
d <- data_shuffled[,-c(1,6,19,20)]

dummy <- data.frame(sapply(data_shuffled[,-c(6,19,20)],function(x) data.frame(model.matrix(~x-1,data =data_shuffled[,-c(6,19,20)]))[,-1]))
head(dummy)
##   gender senior_citizen partner dependents phone_service multiple_lines
## 1      1              1       0          0             1              0
## 2      0              0       1          1             1              0
## 3      1              0       0          0             1              1
## 4      0              0       0          0             1              1
## 5      1              0       1          0             1              1
## 6      0              0       1          0             1              1
##   internet_service.xFiber.optic internet_service.xNo online_security
## 1                             1                    0               1
## 2                             0                    1               0
## 3                             0                    0               1
## 4                             1                    0               1
## 5                             1                    0               1
## 6                             1                    0               0
##   online_backup device_protection tech_support streaming_tv streaming_movies
## 1             0                 0            0            0                1
## 2             0                 0            0            0                0
## 3             1                 1            1            1                1
## 4             1                 1            1            1                1
## 5             1                 0            1            1                1
## 6             0                 1            0            0                0
##   contract.xOne.year contract.xTwo.year paperless_billing
## 1                  0                  0                 0
## 2                  0                  0                 0
## 3                  0                  1                 0
## 4                  0                  0                 1
## 5                  0                  0                 1
## 6                  0                  0                 1
##   payment_method.xCredit.card..automatic. payment_method.xElectronic.check
## 1                                       0                                0
## 2                                       0                                0
## 3                                       0                                0
## 4                                       0                                0
## 5                                       0                                1
## 6                                       0                                1
##   payment_method.xMailed.check churn m5bins.xlow m5bins.xmedium
## 1                            1     1           0              0
## 2                            1     1           0              0
## 3                            1     0           0              0
## 4                            0     1           0              0
## 5                            0     0           0              0
## 6                            0     1           0              0
##   m5bins.xvery.high m5bins.xvery.Low
## 1                 0                0
## 2                 0                1
## 3                 0                0
## 4                 1                0
## 5                 0                0
## 6                 0                0

* Standardize Continuous Variables.

I will standardize tenure and total charges variables only as the only numeric variables to be i will use in the model.

d_int[,c(2,3,4)] <- scale(d_int[,c(2,3,4)]) 
str_glue("Mean of Tenure : {round(mean(d_int$tenure),2)} | Standard deviation of tenure : {round(sd(d_int$tenure),2)}")
## Mean of Tenure : 0 | Standard deviation of tenure : 1
str_glue("Mean of Total Charges : {round(mean(d_int$total_charges),2)} | Standard deviation of total_charges : {round(sd(d_int$total_charges),2)}")
## Mean of Total Charges : 0 | Standard deviation of total_charges : 1
str_glue("Mean of Monthly Charges : {round(mean(d_int$monthly_charges),2)} | Standard deviation of monthly charges : {round(sd(d_int$monthly_charges),2)}")
## Mean of Monthly Charges : 0 | Standard deviation of monthly charges : 1

* Final DataSet.

# full dataset

final_data <- cbind(d_int , dummy) # remove monthly charges continous variable
head(final_data)
##     id     tenure monthly_charges total_charges gender senior_citizen partner
## 1 5112  0.5939319       0.6880878     0.7759934      1              1       0
## 2  581 -0.8320053      -1.5222445    -0.9042599      0              0       1
## 3 4317  1.4902354       0.8592639     1.7817410      1              0       0
## 4 1078  0.3494856       1.6519921     0.9900203      0              0       0
## 5 4448  1.4494943       1.4259732     2.2223677      1              0       1
## 6 5484  0.9198605       0.4304927     0.9594481      0              0       1
##   dependents phone_service multiple_lines internet_service.xFiber.optic
## 1          0             1              0                             1
## 2          1             1              0                             0
## 3          0             1              1                             0
## 4          0             1              1                             1
## 5          0             1              1                             1
## 6          0             1              1                             1
##   internet_service.xNo online_security online_backup device_protection
## 1                    0               1             0                 0
## 2                    1               0             0                 0
## 3                    0               1             1                 1
## 4                    0               1             1                 1
## 5                    0               1             1                 0
## 6                    0               0             0                 1
##   tech_support streaming_tv streaming_movies contract.xOne.year
## 1            0            0                1                  0
## 2            0            0                0                  0
## 3            1            1                1                  0
## 4            1            1                1                  0
## 5            1            1                1                  0
## 6            0            0                0                  0
##   contract.xTwo.year paperless_billing payment_method.xCredit.card..automatic.
## 1                  0                 0                                       0
## 2                  0                 0                                       0
## 3                  1                 0                                       0
## 4                  0                 1                                       0
## 5                  0                 1                                       0
## 6                  0                 1                                       0
##   payment_method.xElectronic.check payment_method.xMailed.check churn
## 1                                0                            1     1
## 2                                0                            1     1
## 3                                0                            1     0
## 4                                0                            0     1
## 5                                1                            0     0
## 6                                1                            0     1
##   m5bins.xlow m5bins.xmedium m5bins.xvery.high m5bins.xvery.Low
## 1           0              0                 0                0
## 2           0              0                 0                1
## 3           0              0                 0                0
## 4           0              0                 1                0
## 5           0              0                 0                0
## 6           0              0                 0                0

* Split Data into train data (65%) and test data (35%).

set.seed(1234)
split <- createDataPartition(y = final_data$churn,p=0.65,list = FALSE)

training_set<-final_data[split,]
testing_set<-final_data[-split,]

check data balance

table(training_set$churn)
## 
##    0    1 
## 3353 1218
table(testing_set$churn)
## 
##    0    1 
## 1810  651

Both train and test data are perfectly balanced.

# Building Logistic Regression Model

model_1 <- glm(churn~.,data = training_set,family = binomial(link = "logit"))
summary(model_1)
## 
## Call:
## glm(formula = churn ~ ., family = binomial(link = "logit"), data = training_set)
## 
## Coefficients:
##                                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                             -4.172e+00  1.623e+00  -2.570 0.010171
## id                                       1.923e-05  1.992e-05   0.965 0.334343
## tenure                                  -1.427e+00  1.973e-01  -7.232 4.74e-13
## monthly_charges                         -1.597e+00  1.203e+00  -1.327 0.184496
## total_charges                            5.873e-01  2.121e-01   2.769 0.005623
## gender                                  -7.355e-02  8.073e-02  -0.911 0.362278
## senior_citizen                           2.053e-01  1.051e-01   1.954 0.050755
## partner                                  7.499e-02  9.642e-02   0.778 0.436717
## dependents                              -1.767e-01  1.110e-01  -1.592 0.111430
## phone_service                            6.540e-01  8.211e-01   0.796 0.425750
## multiple_lines                           6.360e-01  2.222e-01   2.863 0.004197
## internet_service.xFiber.optic            2.479e+00  1.018e+00   2.434 0.014931
## internet_service.xNo                    -1.715e+00  1.065e+00  -1.610 0.107412
## online_security                         -1.571e-01  2.242e-01  -0.701 0.483361
## online_backup                            1.254e-01  2.188e-01   0.573 0.566372
## device_protection                        2.965e-01  2.216e-01   1.338 0.180958
## tech_support                            -4.173e-02  2.241e-01  -0.186 0.852263
## streaming_tv                             7.794e-01  4.099e-01   1.901 0.057263
## streaming_movies                         8.647e-01  4.109e-01   2.105 0.035334
## contract.xOne.year                      -5.799e-01  1.335e-01  -4.344 1.40e-05
## contract.xTwo.year                      -1.317e+00  2.251e-01  -5.848 4.97e-09
## paperless_billing                        3.441e-01  9.272e-02   3.712 0.000206
## payment_method.xCredit.card..automatic. -5.694e-02  1.424e-01  -0.400 0.689343
## payment_method.xElectronic.check         3.786e-01  1.180e-01   3.209 0.001330
## payment_method.xMailed.check            -1.167e-01  1.438e-01  -0.812 0.416995
## m5bins.xlow                              8.364e-01  2.696e-01   3.103 0.001918
## m5bins.xmedium                           5.322e-01  2.850e-01   1.867 0.061832
## m5bins.xvery.high                       -3.639e-01  3.432e-01  -1.060 0.289015
## m5bins.xvery.Low                         2.579e-01  4.750e-01   0.543 0.587205
##                                            
## (Intercept)                             *  
## id                                         
## tenure                                  ***
## monthly_charges                            
## total_charges                           ** 
## gender                                     
## senior_citizen                          .  
## partner                                    
## dependents                                 
## phone_service                              
## multiple_lines                          ** 
## internet_service.xFiber.optic           *  
## internet_service.xNo                       
## online_security                            
## online_backup                              
## device_protection                          
## tech_support                               
## streaming_tv                            .  
## streaming_movies                        *  
## contract.xOne.year                      ***
## contract.xTwo.year                      ***
## paperless_billing                       ***
## payment_method.xCredit.card..automatic.    
## payment_method.xElectronic.check        ** 
## payment_method.xMailed.check               
## m5bins.xlow                             ** 
## m5bins.xmedium                          .  
## m5bins.xvery.high                          
## m5bins.xvery.Low                           
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5299.7  on 4570  degrees of freedom
## Residual deviance: 3769.0  on 4542  degrees of freedom
## AIC: 3827
## 
## Number of Fisher Scoring iterations: 6

* Above is the initial model with all variables included. I will use stewise feature selection methods with function called ‘stepAIC’, it will iterate untill the lowest AIC model among all models is discovered.

model_2 <- stepAIC(model_1,direction = "both")
## Start:  AIC=3827.01
## churn ~ id + tenure + monthly_charges + total_charges + gender + 
##     senior_citizen + partner + dependents + phone_service + multiple_lines + 
##     internet_service.xFiber.optic + internet_service.xNo + online_security + 
##     online_backup + device_protection + tech_support + streaming_tv + 
##     streaming_movies + contract.xOne.year + contract.xTwo.year + 
##     paperless_billing + payment_method.xCredit.card..automatic. + 
##     payment_method.xElectronic.check + payment_method.xMailed.check + 
##     m5bins.xlow + m5bins.xmedium + m5bins.xvery.high + m5bins.xvery.Low
## 
##                                           Df Deviance    AIC
## - tech_support                             1   3769.0 3825.0
## - payment_method.xCredit.card..automatic.  1   3769.2 3825.2
## - m5bins.xvery.Low                         1   3769.3 3825.3
## - online_backup                            1   3769.3 3825.3
## - online_security                          1   3769.5 3825.5
## - partner                                  1   3769.6 3825.6
## - phone_service                            1   3769.6 3825.6
## - payment_method.xMailed.check             1   3769.7 3825.7
## - gender                                   1   3769.8 3825.8
## - id                                       1   3769.9 3825.9
## - m5bins.xvery.high                        1   3770.2 3826.2
## - monthly_charges                          1   3770.8 3826.8
## - device_protection                        1   3770.8 3826.8
## <none>                                         3769.0 3827.0
## - dependents                               1   3771.6 3827.6
## - internet_service.xNo                     1   3771.6 3827.6
## - m5bins.xmedium                           1   3772.5 3828.5
## - streaming_tv                             1   3772.6 3828.6
## - senior_citizen                           1   3772.8 3828.8
## - streaming_movies                         1   3773.4 3829.4
## - internet_service.xFiber.optic            1   3775.0 3831.0
## - total_charges                            1   3777.0 3833.0
## - multiple_lines                           1   3777.2 3833.2
## - m5bins.xlow                              1   3779.1 3835.1
## - payment_method.xElectronic.check         1   3779.4 3835.4
## - paperless_billing                        1   3782.9 3838.9
## - contract.xOne.year                       1   3788.6 3844.6
## - contract.xTwo.year                       1   3810.1 3866.1
## - tenure                                   1   3831.4 3887.4
## 
## Step:  AIC=3825.04
## churn ~ id + tenure + monthly_charges + total_charges + gender + 
##     senior_citizen + partner + dependents + phone_service + multiple_lines + 
##     internet_service.xFiber.optic + internet_service.xNo + online_security + 
##     online_backup + device_protection + streaming_tv + streaming_movies + 
##     contract.xOne.year + contract.xTwo.year + paperless_billing + 
##     payment_method.xCredit.card..automatic. + payment_method.xElectronic.check + 
##     payment_method.xMailed.check + m5bins.xlow + m5bins.xmedium + 
##     m5bins.xvery.high + m5bins.xvery.Low
## 
##                                           Df Deviance    AIC
## - payment_method.xCredit.card..automatic.  1   3769.2 3823.2
## - m5bins.xvery.Low                         1   3769.3 3823.3
## - partner                                  1   3769.6 3823.6
## - payment_method.xMailed.check             1   3769.7 3823.7
## - online_security                          1   3769.8 3823.8
## - gender                                   1   3769.9 3823.9
## - id                                       1   3770.0 3824.0
## - m5bins.xvery.high                        1   3770.2 3824.2
## - online_backup                            1   3770.4 3824.4
## <none>                                         3769.0 3825.0
## - dependents                               1   3771.6 3825.6
## - phone_service                            1   3772.5 3826.5
## - m5bins.xmedium                           1   3772.6 3826.6
## - senior_citizen                           1   3772.9 3826.9
## + tech_support                             1   3769.0 3827.0
## - device_protection                        1   3774.5 3828.5
## - total_charges                            1   3777.0 3831.0
## - monthly_charges                          1   3778.2 3832.2
## - internet_service.xNo                     1   3778.3 3832.3
## - m5bins.xlow                              1   3779.3 3833.3
## - payment_method.xElectronic.check         1   3779.5 3833.5
## - paperless_billing                        1   3782.9 3836.9
## - streaming_tv                             1   3784.0 3838.0
## - streaming_movies                         1   3787.1 3841.1
## - contract.xOne.year                       1   3788.8 3842.8
## - multiple_lines                           1   3793.9 3847.9
## - internet_service.xFiber.optic            1   3797.5 3851.5
## - contract.xTwo.year                       1   3810.6 3864.6
## - tenure                                   1   3831.4 3885.4
## 
## Step:  AIC=3823.2
## churn ~ id + tenure + monthly_charges + total_charges + gender + 
##     senior_citizen + partner + dependents + phone_service + multiple_lines + 
##     internet_service.xFiber.optic + internet_service.xNo + online_security + 
##     online_backup + device_protection + streaming_tv + streaming_movies + 
##     contract.xOne.year + contract.xTwo.year + paperless_billing + 
##     payment_method.xElectronic.check + payment_method.xMailed.check + 
##     m5bins.xlow + m5bins.xmedium + m5bins.xvery.high + m5bins.xvery.Low
## 
##                                           Df Deviance    AIC
## - m5bins.xvery.Low                         1   3769.5 3821.5
## - payment_method.xMailed.check             1   3769.7 3821.7
## - partner                                  1   3769.8 3821.8
## - online_security                          1   3770.0 3822.0
## - gender                                   1   3770.0 3822.0
## - id                                       1   3770.1 3822.1
## - m5bins.xvery.high                        1   3770.4 3822.4
## - online_backup                            1   3770.6 3822.6
## <none>                                         3769.2 3823.2
## - dependents                               1   3771.8 3823.8
## - phone_service                            1   3772.6 3824.6
## - m5bins.xmedium                           1   3772.8 3824.8
## - senior_citizen                           1   3773.0 3825.0
## + payment_method.xCredit.card..automatic.  1   3769.0 3825.0
## + tech_support                             1   3769.2 3825.2
## - device_protection                        1   3774.7 3826.7
## - total_charges                            1   3777.2 3829.2
## - monthly_charges                          1   3778.4 3830.4
## - internet_service.xNo                     1   3778.5 3830.5
## - m5bins.xlow                              1   3779.5 3831.5
## - paperless_billing                        1   3783.0 3835.0
## - streaming_tv                             1   3784.2 3836.2
## - payment_method.xElectronic.check         1   3787.2 3839.2
## - streaming_movies                         1   3787.3 3839.3
## - contract.xOne.year                       1   3789.0 3841.0
## - multiple_lines                           1   3794.0 3846.0
## - internet_service.xFiber.optic            1   3797.7 3849.7
## - contract.xTwo.year                       1   3810.8 3862.8
## - tenure                                   1   3831.6 3883.6
## 
## Step:  AIC=3821.52
## churn ~ id + tenure + monthly_charges + total_charges + gender + 
##     senior_citizen + partner + dependents + phone_service + multiple_lines + 
##     internet_service.xFiber.optic + internet_service.xNo + online_security + 
##     online_backup + device_protection + streaming_tv + streaming_movies + 
##     contract.xOne.year + contract.xTwo.year + paperless_billing + 
##     payment_method.xElectronic.check + payment_method.xMailed.check + 
##     m5bins.xlow + m5bins.xmedium + m5bins.xvery.high
## 
##                                           Df Deviance    AIC
## - payment_method.xMailed.check             1   3770.0 3820.0
## - partner                                  1   3770.1 3820.1
## - online_security                          1   3770.3 3820.3
## - gender                                   1   3770.3 3820.3
## - id                                       1   3770.5 3820.5
## - m5bins.xvery.high                        1   3770.6 3820.6
## - online_backup                            1   3770.9 3820.9
## <none>                                         3769.5 3821.5
## - dependents                               1   3772.1 3822.1
## - phone_service                            1   3772.8 3822.8
## - m5bins.xmedium                           1   3772.9 3822.9
## + m5bins.xvery.Low                         1   3769.2 3823.2
## + payment_method.xCredit.card..automatic.  1   3769.3 3823.3
## - senior_citizen                           1   3773.4 3823.4
## + tech_support                             1   3769.5 3823.5
## - device_protection                        1   3775.1 3825.1
## - internet_service.xNo                     1   3778.8 3828.8
## - total_charges                            1   3779.3 3829.3
## - monthly_charges                          1   3779.7 3829.7
## - paperless_billing                        1   3783.2 3833.2
## - m5bins.xlow                              1   3783.6 3833.6
## - streaming_tv                             1   3784.8 3834.8
## - payment_method.xElectronic.check         1   3787.6 3837.6
## - streaming_movies                         1   3787.9 3837.9
## - contract.xOne.year                       1   3789.3 3839.3
## - multiple_lines                           1   3794.8 3844.8
## - internet_service.xFiber.optic            1   3797.7 3847.7
## - contract.xTwo.year                       1   3811.6 3861.6
## - tenure                                   1   3837.7 3887.7
## 
## Step:  AIC=3820.01
## churn ~ id + tenure + monthly_charges + total_charges + gender + 
##     senior_citizen + partner + dependents + phone_service + multiple_lines + 
##     internet_service.xFiber.optic + internet_service.xNo + online_security + 
##     online_backup + device_protection + streaming_tv + streaming_movies + 
##     contract.xOne.year + contract.xTwo.year + paperless_billing + 
##     payment_method.xElectronic.check + m5bins.xlow + m5bins.xmedium + 
##     m5bins.xvery.high
## 
##                                           Df Deviance    AIC
## - partner                                  1   3770.6 3818.6
## - online_security                          1   3770.8 3818.8
## - gender                                   1   3770.8 3818.8
## - id                                       1   3770.9 3818.9
## - m5bins.xvery.high                        1   3771.1 3819.1
## - online_backup                            1   3771.4 3819.4
## <none>                                         3770.0 3820.0
## - dependents                               1   3772.6 3820.6
## - m5bins.xmedium                           1   3773.3 3821.3
## - phone_service                            1   3773.3 3821.3
## + payment_method.xMailed.check             1   3769.5 3821.5
## + m5bins.xvery.Low                         1   3769.7 3821.7
## - senior_citizen                           1   3773.9 3821.9
## + tech_support                             1   3770.0 3822.0
## + payment_method.xCredit.card..automatic.  1   3770.0 3822.0
## - device_protection                        1   3775.6 3823.6
## - total_charges                            1   3779.5 3827.5
## - internet_service.xNo                     1   3779.7 3827.7
## - monthly_charges                          1   3780.2 3828.2
## - m5bins.xlow                              1   3783.8 3831.8
## - paperless_billing                        1   3783.9 3831.9
## - streaming_tv                             1   3785.5 3833.5
## - streaming_movies                         1   3788.6 3836.6
## - contract.xOne.year                       1   3789.9 3837.9
## - payment_method.xElectronic.check         1   3795.4 3843.4
## - multiple_lines                           1   3795.5 3843.5
## - internet_service.xFiber.optic            1   3798.4 3846.4
## - contract.xTwo.year                       1   3812.1 3860.1
## - tenure                                   1   3838.2 3886.2
## 
## Step:  AIC=3818.65
## churn ~ id + tenure + monthly_charges + total_charges + gender + 
##     senior_citizen + dependents + phone_service + multiple_lines + 
##     internet_service.xFiber.optic + internet_service.xNo + online_security + 
##     online_backup + device_protection + streaming_tv + streaming_movies + 
##     contract.xOne.year + contract.xTwo.year + paperless_billing + 
##     payment_method.xElectronic.check + m5bins.xlow + m5bins.xmedium + 
##     m5bins.xvery.high
## 
##                                           Df Deviance    AIC
## - online_security                          1   3771.4 3817.4
## - gender                                   1   3771.5 3817.5
## - id                                       1   3771.6 3817.6
## - m5bins.xvery.high                        1   3771.7 3817.7
## - online_backup                            1   3772.1 3818.1
## - dependents                               1   3772.6 3818.6
## <none>                                         3770.6 3818.6
## - m5bins.xmedium                           1   3773.9 3819.9
## + partner                                  1   3770.0 3820.0
## - phone_service                            1   3774.0 3820.0
## + payment_method.xMailed.check             1   3770.1 3820.1
## + m5bins.xvery.Low                         1   3770.4 3820.4
## + tech_support                             1   3770.6 3820.6
## + payment_method.xCredit.card..automatic.  1   3770.6 3820.6
## - senior_citizen                           1   3774.9 3820.9
## - device_protection                        1   3776.4 3822.4
## - total_charges                            1   3780.1 3826.1
## - internet_service.xNo                     1   3780.4 3826.4
## - monthly_charges                          1   3781.0 3827.0
## - m5bins.xlow                              1   3784.4 3830.4
## - paperless_billing                        1   3784.6 3830.6
## - streaming_tv                             1   3786.2 3832.2
## - streaming_movies                         1   3789.4 3835.4
## - contract.xOne.year                       1   3790.4 3836.4
## - payment_method.xElectronic.check         1   3796.1 3842.1
## - multiple_lines                           1   3796.4 3842.4
## - internet_service.xFiber.optic            1   3799.2 3845.2
## - contract.xTwo.year                       1   3812.6 3858.6
## - tenure                                   1   3838.2 3884.2
## 
## Step:  AIC=3817.36
## churn ~ id + tenure + monthly_charges + total_charges + gender + 
##     senior_citizen + dependents + phone_service + multiple_lines + 
##     internet_service.xFiber.optic + internet_service.xNo + online_backup + 
##     device_protection + streaming_tv + streaming_movies + contract.xOne.year + 
##     contract.xTwo.year + paperless_billing + payment_method.xElectronic.check + 
##     m5bins.xlow + m5bins.xmedium + m5bins.xvery.high
## 
##                                           Df Deviance    AIC
## - gender                                   1   3772.1 3816.1
## - id                                       1   3772.3 3816.3
## - m5bins.xvery.high                        1   3772.4 3816.4
## - dependents                               1   3773.3 3817.3
## <none>                                         3771.4 3817.4
## - m5bins.xmedium                           1   3774.5 3818.5
## + online_security                          1   3770.6 3818.6
## - online_backup                            1   3774.7 3818.7
## + partner                                  1   3770.8 3818.8
## + payment_method.xMailed.check             1   3770.9 3818.9
## + m5bins.xvery.Low                         1   3771.1 3819.1
## + tech_support                             1   3771.1 3819.1
## + payment_method.xCredit.card..automatic.  1   3771.4 3819.4
## - senior_citizen                           1   3775.6 3819.6
## - phone_service                            1   3780.4 3824.4
## - total_charges                            1   3780.7 3824.7
## - device_protection                        1   3781.6 3825.6
## - m5bins.xlow                              1   3785.0 3829.0
## - paperless_billing                        1   3785.5 3829.5
## - internet_service.xNo                     1   3790.3 3834.3
## - contract.xOne.year                       1   3791.0 3835.0
## - payment_method.xElectronic.check         1   3796.9 3840.9
## - monthly_charges                          1   3797.4 3841.4
## - streaming_tv                             1   3803.6 3847.6
## - streaming_movies                         1   3808.6 3852.6
## - multiple_lines                           1   3810.6 3854.6
## - contract.xTwo.year                       1   3813.1 3857.1
## - internet_service.xFiber.optic            1   3832.8 3876.8
## - tenure                                   1   3839.2 3883.2
## 
## Step:  AIC=3816.13
## churn ~ id + tenure + monthly_charges + total_charges + senior_citizen + 
##     dependents + phone_service + multiple_lines + internet_service.xFiber.optic + 
##     internet_service.xNo + online_backup + device_protection + 
##     streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year + 
##     paperless_billing + payment_method.xElectronic.check + m5bins.xlow + 
##     m5bins.xmedium + m5bins.xvery.high
## 
##                                           Df Deviance    AIC
## - id                                       1   3773.0 3815.0
## - m5bins.xvery.high                        1   3773.1 3815.1
## - dependents                               1   3774.1 3816.1
## <none>                                         3772.1 3816.1
## - m5bins.xmedium                           1   3775.3 3817.3
## + gender                                   1   3771.4 3817.4
## - online_backup                            1   3775.4 3817.4
## + online_security                          1   3771.5 3817.5
## + partner                                  1   3771.5 3817.5
## + payment_method.xMailed.check             1   3771.6 3817.6
## + tech_support                             1   3771.9 3817.9
## + m5bins.xvery.Low                         1   3771.9 3817.9
## + payment_method.xCredit.card..automatic.  1   3772.1 3818.1
## - senior_citizen                           1   3776.3 3818.3
## - phone_service                            1   3781.1 3823.1
## - total_charges                            1   3781.4 3823.4
## - device_protection                        1   3782.3 3824.3
## - m5bins.xlow                              1   3785.7 3827.7
## - paperless_billing                        1   3786.5 3828.5
## - internet_service.xNo                     1   3791.0 3833.0
## - contract.xOne.year                       1   3791.6 3833.6
## - payment_method.xElectronic.check         1   3797.7 3839.7
## - monthly_charges                          1   3798.1 3840.1
## - streaming_tv                             1   3804.3 3846.3
## - streaming_movies                         1   3809.4 3851.4
## - multiple_lines                           1   3811.1 3853.1
## - contract.xTwo.year                       1   3813.7 3855.7
## - internet_service.xFiber.optic            1   3833.3 3875.3
## - tenure                                   1   3839.8 3881.8
## 
## Step:  AIC=3815.05
## churn ~ tenure + monthly_charges + total_charges + senior_citizen + 
##     dependents + phone_service + multiple_lines + internet_service.xFiber.optic + 
##     internet_service.xNo + online_backup + device_protection + 
##     streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year + 
##     paperless_billing + payment_method.xElectronic.check + m5bins.xlow + 
##     m5bins.xmedium + m5bins.xvery.high
## 
##                                           Df Deviance    AIC
## - m5bins.xvery.high                        1   3774.1 3814.1
## - dependents                               1   3775.0 3815.0
## <none>                                         3773.0 3815.0
## + id                                       1   3772.1 3816.1
## - m5bins.xmedium                           1   3776.2 3816.2
## + gender                                   1   3772.3 3816.3
## - online_backup                            1   3776.3 3816.3
## + online_security                          1   3772.4 3816.4
## + partner                                  1   3772.5 3816.5
## + payment_method.xMailed.check             1   3772.6 3816.6
## + m5bins.xvery.Low                         1   3772.8 3816.8
## + tech_support                             1   3772.8 3816.8
## + payment_method.xCredit.card..automatic.  1   3773.0 3817.0
## - senior_citizen                           1   3777.3 3817.3
## - phone_service                            1   3781.8 3821.8
## - total_charges                            1   3782.4 3822.4
## - device_protection                        1   3783.2 3823.2
## - m5bins.xlow                              1   3786.5 3826.5
## - paperless_billing                        1   3787.6 3827.6
## - internet_service.xNo                     1   3791.9 3831.9
## - contract.xOne.year                       1   3792.6 3832.6
## - payment_method.xElectronic.check         1   3798.8 3838.8
## - monthly_charges                          1   3798.8 3838.8
## - streaming_tv                             1   3805.0 3845.0
## - streaming_movies                         1   3810.0 3850.0
## - multiple_lines                           1   3811.9 3851.9
## - contract.xTwo.year                       1   3814.6 3854.6
## - internet_service.xFiber.optic            1   3833.9 3873.9
## - tenure                                   1   3840.8 3880.8
## 
## Step:  AIC=3814.11
## churn ~ tenure + monthly_charges + total_charges + senior_citizen + 
##     dependents + phone_service + multiple_lines + internet_service.xFiber.optic + 
##     internet_service.xNo + online_backup + device_protection + 
##     streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year + 
##     paperless_billing + payment_method.xElectronic.check + m5bins.xlow + 
##     m5bins.xmedium
## 
##                                           Df Deviance    AIC
## - dependents                               1   3776.1 3814.1
## <none>                                         3774.1 3814.1
## + m5bins.xvery.high                        1   3773.0 3815.0
## - m5bins.xmedium                           1   3777.1 3815.1
## + id                                       1   3773.1 3815.1
## + gender                                   1   3773.3 3815.3
## + partner                                  1   3773.5 3815.5
## + online_security                          1   3773.5 3815.5
## + payment_method.xMailed.check             1   3773.6 3815.6
## - online_backup                            1   3777.7 3815.7
## + tech_support                             1   3773.8 3815.8
## + m5bins.xvery.Low                         1   3773.9 3815.9
## + payment_method.xCredit.card..automatic.  1   3774.1 3816.1
## - senior_citizen                           1   3778.2 3816.2
## - total_charges                            1   3782.5 3820.5
## - phone_service                            1   3784.6 3822.6
## - device_protection                        1   3784.9 3822.9
## - m5bins.xlow                              1   3787.1 3825.1
## - paperless_billing                        1   3788.4 3826.4
## - contract.xOne.year                       1   3793.9 3831.9
## - internet_service.xNo                     1   3796.0 3834.0
## - payment_method.xElectronic.check         1   3799.7 3837.7
## - monthly_charges                          1   3803.4 3841.4
## - streaming_tv                             1   3809.1 3847.1
## - streaming_movies                         1   3814.4 3852.4
## - multiple_lines                           1   3814.6 3852.6
## - contract.xTwo.year                       1   3818.5 3856.5
## - internet_service.xFiber.optic            1   3839.8 3877.8
## - tenure                                   1   3841.3 3879.3
## 
## Step:  AIC=3814.06
## churn ~ tenure + monthly_charges + total_charges + senior_citizen + 
##     phone_service + multiple_lines + internet_service.xFiber.optic + 
##     internet_service.xNo + online_backup + device_protection + 
##     streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year + 
##     paperless_billing + payment_method.xElectronic.check + m5bins.xlow + 
##     m5bins.xmedium
## 
##                                           Df Deviance    AIC
## <none>                                         3776.1 3814.1
## + dependents                               1   3774.1 3814.1
## - m5bins.xmedium                           1   3779.0 3815.0
## + m5bins.xvery.high                        1   3775.0 3815.0
## + id                                       1   3775.1 3815.1
## + gender                                   1   3775.3 3815.3
## + online_security                          1   3775.4 3815.4
## - online_backup                            1   3779.6 3815.6
## + payment_method.xMailed.check             1   3775.6 3815.6
## + tech_support                             1   3775.8 3815.8
## + m5bins.xvery.Low                         1   3775.8 3815.8
## + partner                                  1   3776.0 3816.0
## + payment_method.xCredit.card..automatic.  1   3776.0 3816.0
## - senior_citizen                           1   3781.3 3817.3
## - total_charges                            1   3784.7 3820.7
## - phone_service                            1   3786.7 3822.7
## - device_protection                        1   3787.0 3823.0
## - m5bins.xlow                              1   3788.9 3824.9
## - paperless_billing                        1   3790.7 3826.7
## - contract.xOne.year                       1   3796.6 3832.6
## - internet_service.xNo                     1   3798.4 3834.4
## - payment_method.xElectronic.check         1   3801.9 3837.9
## - monthly_charges                          1   3805.7 3841.7
## - streaming_tv                             1   3811.3 3847.3
## - streaming_movies                         1   3816.9 3852.9
## - multiple_lines                           1   3817.0 3853.0
## - contract.xTwo.year                       1   3821.9 3857.9
## - internet_service.xFiber.optic            1   3842.5 3878.5
## - tenure                                   1   3844.6 3880.6
summary(model_2)
## 
## Call:
## glm(formula = churn ~ tenure + monthly_charges + total_charges + 
##     senior_citizen + phone_service + multiple_lines + internet_service.xFiber.optic + 
##     internet_service.xNo + online_backup + device_protection + 
##     streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year + 
##     paperless_billing + payment_method.xElectronic.check + m5bins.xlow + 
##     m5bins.xmedium, family = binomial(link = "logit"), data = training_set)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -4.97163    0.56082  -8.865  < 2e-16 ***
## tenure                           -1.39499    0.18293  -7.626 2.43e-14 ***
## monthly_charges                  -2.29205    0.42467  -5.397 6.76e-08 ***
## total_charges                     0.56500    0.19592   2.884 0.003929 ** 
## senior_citizen                    0.23593    0.10304   2.290 0.022031 *  
## phone_service                     1.06235    0.32705   3.248 0.001161 ** 
## multiple_lines                    0.74534    0.11761   6.337 2.34e-10 ***
## internet_service.xFiber.optic     2.97957    0.37797   7.883 3.20e-15 ***
## internet_service.xNo             -2.17834    0.46094  -4.726 2.29e-06 ***
## online_backup                     0.22394    0.11876   1.886 0.059341 .  
## device_protection                 0.40402    0.12269   3.293 0.000991 ***
## streaming_tv                      1.00256    0.17065   5.875 4.23e-09 ***
## streaming_movies                  1.08905    0.17257   6.311 2.78e-10 ***
## contract.xOne.year               -0.59028    0.13295  -4.440 9.00e-06 ***
## contract.xTwo.year               -1.36547    0.22238  -6.140 8.24e-10 ***
## paperless_billing                 0.35203    0.09233   3.813 0.000137 ***
## payment_method.xElectronic.check  0.43897    0.08607   5.100 3.39e-07 ***
## m5bins.xlow                       0.70711    0.20128   3.513 0.000443 ***
## m5bins.xmedium                    0.43639    0.25480   1.713 0.086771 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5299.7  on 4570  degrees of freedom
## Residual deviance: 3776.1  on 4552  degrees of freedom
## AIC: 3814.1
## 
## Number of Fisher Scoring iterations: 6
formula(model_2)
## churn ~ tenure + monthly_charges + total_charges + senior_citizen + 
##     phone_service + multiple_lines + internet_service.xFiber.optic + 
##     internet_service.xNo + online_backup + device_protection + 
##     streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year + 
##     paperless_billing + payment_method.xElectronic.check + m5bins.xlow + 
##     m5bins.xmedium

Improve Model Performance

  1. Check for multicollinearity:
    In order to check for multicollinearity i will use ‘VIF function’ to get variance inflation factor for each variable in ‘model_stepwise’.I will remove a variable with VIF > 5.

  2. Check P-Values. p-vlaue > 0.05 —> remove variable.

  3. Check for multicollinearity:

(vif_vars <- as.data.frame(vif(model_2)))
##                                  vif(model_2)
## tenure                              14.276772
## monthly_charges                     89.210594
## total_charges                       19.141273
## senior_citizen                       1.091549
## phone_service                        5.633263
## multiple_lines                       2.088719
## internet_service.xFiber.optic       21.249413
## internet_service.xNo                12.017351
## online_backup                        1.896315
## device_protection                    2.005867
## streaming_tv                         4.355047
## streaming_movies                     4.452581
## contract.xOne.year                   1.319457
## contract.xTwo.year                   1.284584
## paperless_billing                    1.121314
## payment_method.xElectronic.check     1.144557
## m5bins.xlow                          4.003061
## m5bins.xmedium                       1.852579

* internet_service.xFiber.optic variable has VIF haigeher than 5 ,therefor i will remove it and recheck the model again.

2. Check P-Values

All indpendent variables are statistically significant.

#* Final Model

model_3 <- glm(churn ~ tenure + total_charges + dependents +
    internet_service.xNo + online_security + online_backup + 
    tech_support + streaming_tv + streaming_movies + contract.xOne.year + 
    contract.xTwo.year + paperless_billing + payment_method.xElectronic.check 
    + m5bins.xmedium,data = training_set,family = "binomial") 
summary(model_3)
## 
## Call:
## glm(formula = churn ~ tenure + total_charges + dependents + internet_service.xNo + 
##     online_security + online_backup + tech_support + streaming_tv + 
##     streaming_movies + contract.xOne.year + contract.xTwo.year + 
##     paperless_billing + payment_method.xElectronic.check + m5bins.xmedium, 
##     family = "binomial", data = training_set)
## 
## Coefficients:
##                                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                      -1.19210    0.12469  -9.560  < 2e-16 ***
## tenure                           -1.56309    0.16791  -9.309  < 2e-16 ***
## total_charges                     0.93375    0.16713   5.587 2.31e-08 ***
## dependents                       -0.19248    0.09852  -1.954   0.0507 .  
## internet_service.xNo             -1.19432    0.15417  -7.747 9.43e-15 ***
## online_security                  -0.53935    0.10402  -5.185 2.16e-07 ***
## online_backup                    -0.15348    0.09423  -1.629   0.1034    
## tech_support                     -0.46826    0.10470  -4.473 7.73e-06 ***
## streaming_tv                      0.22101    0.09923   2.227   0.0259 *  
## streaming_movies                  0.31505    0.09985   3.155   0.0016 ** 
## contract.xOne.year               -0.70990    0.13145  -5.400 6.65e-08 ***
## contract.xTwo.year               -1.53203    0.22310  -6.867 6.56e-12 ***
## paperless_billing                 0.42751    0.09098   4.699 2.62e-06 ***
## payment_method.xElectronic.check  0.51313    0.08429   6.088 1.15e-09 ***
## m5bins.xmedium                   -0.37694    0.19698  -1.914   0.0557 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5299.7  on 4570  degrees of freedom
## Residual deviance: 3831.7  on 4556  degrees of freedom
## AIC: 3861.7
## 
## Number of Fisher Scoring iterations: 6

* AIC is 3954 for model_1.

* AIC is 3941 for model_2.

* AIC is 3977 for model_3.

model 2 is selected since it hasthe lowest AIC.

# Model Evaluation

pred_train <- predict(model_2,newdata = training_set[,which(colnames(training_set)!= "churn")],type = "response")
pred <- predict(model_2,newdata = testing_set[,which(colnames(testing_set)!= "churn")],type = "response")
summary(pred)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001453 0.040578 0.189528 0.263258 0.455258 0.872511

Check model train data

pred_trainchurn <- factor(ifelse(pred_train >= 0.50, "Yes", "No"))
actual_trainchurn <- factor(ifelse(training_set$churn==1,"Yes","No"))

table(actual_trainchurn,pred_trainchurn)
##                  pred_trainchurn
## actual_trainchurn   No  Yes
##               No  3002  351
##               Yes  545  673

check model test

pred_churn <- factor(ifelse(pred >= 0.50, "Yes", "No"))
actual_churn <- factor(ifelse(testing_set$churn==1,"Yes","No"))

table(actual_churn,pred_churn)
##             pred_churn
## actual_churn   No  Yes
##          No  1632  178
##          Yes  300  351
caret::confusionMatrix(pred_trainchurn,actual_trainchurn,positive = "No")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  3002  545
##        Yes  351  673
##                                           
##                Accuracy : 0.804           
##                  95% CI : (0.7922, 0.8154)
##     No Information Rate : 0.7335          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4718          
##                                           
##  Mcnemar's Test P-Value : 1.136e-10       
##                                           
##             Sensitivity : 0.8953          
##             Specificity : 0.5525          
##          Pos Pred Value : 0.8463          
##          Neg Pred Value : 0.6572          
##              Prevalence : 0.7335          
##          Detection Rate : 0.6567          
##    Detection Prevalence : 0.7760          
##       Balanced Accuracy : 0.7239          
##                                           
##        'Positive' Class : No              
## 
caret::confusionMatrix(pred_churn,actual_churn,positive = "No")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1632  300
##        Yes  178  351
##                                           
##                Accuracy : 0.8058          
##                  95% CI : (0.7896, 0.8212)
##     No Information Rate : 0.7355          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.469           
##                                           
##  Mcnemar's Test P-Value : 3.123e-08       
##                                           
##             Sensitivity : 0.9017          
##             Specificity : 0.5392          
##          Pos Pred Value : 0.8447          
##          Neg Pred Value : 0.6635          
##              Prevalence : 0.7355          
##          Detection Rate : 0.6631          
##    Detection Prevalence : 0.7850          
##       Balanced Accuracy : 0.7204          
##                                           
##        'Positive' Class : No              
## 
list('Train Performance(CutOff = 50%)',c(Accuracy = 78,Sensitivity = 88,Specificity = 50),'Test Performance(Cutoff = 50)', c(Accuracy = 78 ,Sensitivity = 88, Specificity = 50))
## [[1]]
## [1] "Train Performance(CutOff = 50%)"
## 
## [[2]]
##    Accuracy Sensitivity Specificity 
##          78          88          50 
## 
## [[3]]
## [1] "Test Performance(Cutoff = 50)"
## 
## [[4]]
##    Accuracy Sensitivity Specificity 
##          78          88          50

The default cutoff prediction probability score is 0.5 or the ratio of 1’s and 0’s in the training data. But sometimes, tuning the probability cutoff can improve the accuracy in both the development and validation samples.

## Model Evaluation with Optimal Cutoff


# Install and load the cutpointr package
if (!require(cutpointr)) {
  install.packages("cutpointr")
}
## Loading required package: cutpointr
## 
## Attaching package: 'cutpointr'
## The following objects are masked from 'package:caret':
## 
##     precision, recall, sensitivity, specificity
library(cutpointr)

# Find the optimal cutoff using cutpointr
opt_cut <- cutpointr(testing_set, pred, churn, method = maximize_metric, metric = youden)
## Assuming the positive class is 1
## Assuming the positive class has higher x values
# Display the optimal cutoff value
opt_cut$optimal_cutpoint
## [1] 0.2836847
# Create a factor vector for predictions based on the optimal cutoff
pred_churn_51 <- factor(ifelse(pred >= opt_cut$optimal_cutpoint, 'Yes', 'No'))

# Calculate and display the confusion matrix
caret::confusionMatrix(actual_churn, pred_churn_51, positive = 'No')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1346  464
##        Yes  137  514
##                                           
##                Accuracy : 0.7558          
##                  95% CI : (0.7383, 0.7727)
##     No Information Rate : 0.6026          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4593          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9076          
##             Specificity : 0.5256          
##          Pos Pred Value : 0.7436          
##          Neg Pred Value : 0.7896          
##              Prevalence : 0.6026          
##          Detection Rate : 0.5469          
##    Detection Prevalence : 0.7355          
##       Balanced Accuracy : 0.7166          
##                                           
##        'Positive' Class : No              
## 

Finally , Logistic Regression with a cutoff probability value of 28 % gives us better values of accuracy and specificity but lower in sensitivity.