Predict Customer Churn in tele company

1. Getting Data

library(readr)
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v dplyr   1.0.2
## v tibble  3.0.4     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.0
## v purrr   0.3.4

## Warning: package 'ggplot2' was built under R version 4.0.3

## Warning: package 'tibble' was built under R version 4.0.3

## Warning: package 'stringr' was built under R version 4.0.3

## Warning: package 'forcats' was built under R version 4.0.3

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

churnData = read_csv("./WA_Fn-UseC_-Telco-Customer-Churn (1).csv")

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   SeniorCitizen = col_double(),
##   tenure = col_double(),
##   MonthlyCharges = col_double(),
##   TotalCharges = col_double()
## )

## See spec(...) for full column specifications.

# Compute total missing values in each column
colSums(is.na(churnData))

##       customerID           gender    SeniorCitizen          Partner 
##                0                0                0                0 
##       Dependents           tenure     PhoneService    MultipleLines 
##                0                0                0                0 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##                0                0                0                0 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##                0                0                0                0 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##                0                0                0               11 
##            Churn 
##                0

2. Exploratory Data Analysis / Descriptive Analysis

2.1 Detect Outliers if any

Use Percentiles method for outlier detection

# tenure
lower_bound_te <- quantile(churnData$tenure , 0.025)

upper_bound_te <- quantile(churnData$tenure, 0.975)

outlier_ind_te <- which(churnData$tenure < lower_bound_te | churnData$tenure > upper_bound_te)

# Print outliers
head(churnData[outlier_ind_te, ])

## # A tibble: 6 x 21
##   customerID gender SeniorCitizen Partner Dependents tenure PhoneService
##   <chr>      <chr>          <dbl> <chr>   <chr>       <dbl> <chr>       
## 1 4472-LVYGI Female             0 Yes     Yes             0 No          
## 2 3115-CZMZD Male               0 No      Yes             0 Yes         
## 3 5709-LVOEQ Female             0 Yes     Yes             0 Yes         
## 4 4367-NUYAO Male               0 Yes     Yes             0 Yes         
## 5 1371-DWPAZ Female             0 Yes     Yes             0 No          
## 6 7644-OMVMY Male               0 Yes     Yes             0 Yes         
## # ... with 14 more variables: MultipleLines <chr>, InternetService <chr>,
## #   OnlineSecurity <chr>, OnlineBackup <chr>, DeviceProtection <chr>,
## #   TechSupport <chr>, StreamingTV <chr>, StreamingMovies <chr>,
## #   Contract <chr>, PaperlessBilling <chr>, PaymentMethod <chr>,
## #   MonthlyCharges <dbl>, TotalCharges <dbl>, Churn <chr>

require(ggplot2)

# Histogram
hist(churnData$tenure, col = "green")
rug(churnData$tenure)
abline(v = median(churnData$tenure), col = "magenta", lwd = 4)

# MonthlyCharges
lower_bound_mo <- quantile(churnData$MonthlyCharges , 0.025)

upper_bound_mo <- quantile(churnData$MonthlyCharges, 0.975)

outlier_ind_mo <- which(churnData$MonthlyCharges < lower_bound_mo | churnData$MonthlyCharges > upper_bound_mo)

# Print outliers
head(churnData[outlier_ind_mo, ])

## # A tibble: 6 x 21
##   customerID gender SeniorCitizen Partner Dependents tenure PhoneService
##   <chr>      <chr>          <dbl> <chr>   <chr>       <dbl> <chr>       
## 1 7469-LKBCI Male               0 No      No             16 Yes         
## 2 3655-SNQYZ Female             0 Yes     Yes            69 Yes         
## 3 1891-QRQSA Male               1 Yes     Yes            64 Yes         
## 4 6067-NGCEU Female             0 No      No             65 Yes         
## 5 2146-EGVDT Male               0 Yes     Yes            59 Yes         
## 6 6168-YBYNP Male               0 No      No             59 Yes         
## # ... with 14 more variables: MultipleLines <chr>, InternetService <chr>,
## #   OnlineSecurity <chr>, OnlineBackup <chr>, DeviceProtection <chr>,
## #   TechSupport <chr>, StreamingTV <chr>, StreamingMovies <chr>,
## #   Contract <chr>, PaperlessBilling <chr>, PaymentMethod <chr>,
## #   MonthlyCharges <dbl>, TotalCharges <dbl>, Churn <chr>

# Histogram
hist(churnData$MonthlyCharges, col = "gold")
rug(churnData$MonthlyCharges)
abline(v = median(churnData$MonthlyCharges), col = "magenta", lwd = 4)

# Histogram
hist(churnData$TotalCharges, col = "maroon")
rug(churnData$TotalCharges)
abline(v = median(churnData$TotalCharges), col = "magenta", lwd = 4)

2.2. Descriptive Analysis

# Count on Gender
count_gender <- matrix(table(churnData$gender))
df_gen <- data.frame(gender = c("Female", "Male"), count = count_gender[,1])
ggplot(data=df_gen, aes(x = gender, y = count, fill = gender)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_gen$count, label= df_gen$count), vjust=1, color="black", size=3) +
labs(title="Count on Gender")

## Warning: Use of `df_gen$count` is discouraged. Use `count` instead.

## Warning: Use of `df_gen$count` is discouraged. Use `count` instead.

# Count on OnlineBackup
count_onb <- matrix(table(churnData$OnlineBackup))
df_onb <- data.frame(OnlineBackup = c("No", "No Internet Service", "Yes"), count = count_onb[,1])
ggplot(data=df_onb, aes(x = OnlineBackup, y = count, fill = OnlineBackup)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_onb$count, label= df_onb$count), vjust=1, color="black", size=3) +
labs(title="Count on OnlineBackup")

## Warning: Use of `df_onb$count` is discouraged. Use `count` instead.

## Warning: Use of `df_onb$count` is discouraged. Use `count` instead.

# Count on TechSupport
count_tec <- matrix(table(churnData$TechSupport))
df_tec <- data.frame(TechSupport = c("No", "No Internet Service", "Yes"), count = count_tec[,1])
ggplot(data=df_tec, aes(x = TechSupport, y = count, fill = TechSupport)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_tec$count, label= df_tec$count), vjust=1, color="black", size=3) +
labs(title="Count on TechSupport")

## Warning: Use of `df_tec$count` is discouraged. Use `count` instead.

## Warning: Use of `df_tec$count` is discouraged. Use `count` instead.

# Count on Contract
count_con <- matrix(table(churnData$Contract))
df_con <- data.frame(Contract = c("Month-to-month","One year", "Two year"), count = count_con[,1])
ggplot(data=df_con, aes(x = Contract, y = count, fill = Contract)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_con$count, label= df_con$count), vjust=1, color="black", size=3) +
labs(title="Count on Contract")

## Warning: Use of `df_con$count` is discouraged. Use `count` instead.

## Warning: Use of `df_con$count` is discouraged. Use `count` instead.

# Count on PaperlessBilling
count_pap <- matrix(table(churnData$PaperlessBilling))
df_pap <- data.frame(PaperlessBilling = c("No", "Yes"), count = count_pap[,1])
ggplot(data=df_pap, aes(x = PaperlessBilling, y = count, fill = PaperlessBilling)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_pap$count, label= df_pap$count), vjust=1, color="black", size=3) +
labs(title="Count on PaperlessBilling")

## Warning: Use of `df_pap$count` is discouraged. Use `count` instead.

## Warning: Use of `df_pap$count` is discouraged. Use `count` instead.

# Count on PaymentMethod
count_pay <- matrix(table(churnData$PaymentMethod))
df_pay <- data.frame(PaymentMethod = c("Bank transfer (automatic)", "Credit card (automatic)", "Electronic check", "Mailed check"), count = count_pay[,1])
ggplot(data=df_pay, aes(x = PaymentMethod, y = count, fill = PaymentMethod)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_pay$count, label= df_pay$count), vjust=1, color="black", size=3) +
labs(title="Count on PaymentMethod") +
  theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))

## Warning: Use of `df_pay$count` is discouraged. Use `count` instead.

## Warning: Use of `df_pay$count` is discouraged. Use `count` instead.

# Count on Churn
count_chu <- matrix(table(churnData$Churn))
df_chu <- data.frame(Churn = c("No", "Yes"), count = count_chu[,1])
ggplot(data=df_chu, aes(x = Churn, y = count, fill = Churn)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_chu$count, label= df_chu$count), vjust=1, color="black", size=3) +
labs(title="Count on Churn")

## Warning: Use of `df_chu$count` is discouraged. Use `count` instead.

## Warning: Use of `df_chu$count` is discouraged. Use `count` instead.

2.2 Testing Variables Correlation

require(GGally)

## Loading required package: GGally

## Warning: package 'GGally' was built under R version 4.0.3

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

require(ggplot2)

churn <- subset(churnData, select = c("tenure", "MonthlyCharges", "TotalCharges"))

# Function to return points and geom_smooth
# allow for the method to be changed
my_fn <- function(data, mapping, method="loess", ...){
      p <- ggplot(data = data, mapping = mapping) + 
      geom_point() + 
      geom_smooth(method=method, ...)
      p
    }

# Default "loess" curve    
ggpairs(churn, lower = list(continuous = my_fn))

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 11 rows containing missing values

## `geom_smooth()` using formula 'y ~ x'

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 11 rows containing missing values

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 11 rows containing non-finite values (stat_smooth).

## Warning: Removed 11 rows containing missing values (geom_point).

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 11 rows containing non-finite values (stat_smooth).

## Warning: Removed 11 rows containing missing values (geom_point).

## Warning: Removed 11 rows containing non-finite values (stat_density).

Observation: tenure and TotalCharge indicated a very week correlation; while tenure and MonthlyCharges reflected its strongest correlation among the other pairs. The loess curve of pairs (tenure - TotalCharges, tenure - MonthlyCharges, and MonthlyCharges - TotalCharges) were upward trend. In the other words, more time using services, more expenses had been paid per month and in total.

# Drop unused columns
churnData$customerID <- NULL

2.3. Plot correlation matrix

churndt <- cor(churnData[,c("tenure", "MonthlyCharges", "TotalCharges")])
head(round(churndt,2))

##                tenure MonthlyCharges TotalCharges
## tenure           1.00           0.25           NA
## MonthlyCharges   0.25           1.00           NA
## TotalCharges       NA             NA            1

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.0.3

## corrplot 0.84 loaded

library(Hmisc)

## Warning: package 'Hmisc' was built under R version 4.0.3

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Warning: package 'Formula' was built under R version 4.0.3

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 4.0.3

cormat <- churndt
ggcorrplot::ggcorrplot(cormat, title = "Correlation of Numeric Variables")

3. Data Analysis

3.1. Data Transformation

library(dplyr)

churnData <- churnData %>%
  transmute(
    gender,
    SeniorCitizen = ifelse(SeniorCitizen == 0, "No", "Yes"),
    Partner,
    Dependents,
    tenure,
    PhoneService,
    MultipleLines = ifelse(MultipleLines == "Yes", "Yes", "No"),
    InternetService = ifelse(InternetService == "No", "No", "Yes"),
    OnlineSecurity = ifelse(OnlineSecurity == "Yes", "Yes", "No"),
    OnlineBackup = ifelse(OnlineBackup == "Yes", "Yes", "No"),
    DeviceProtection = ifelse(DeviceProtection == "Yes", "Yes", "No"),
    TechSupport = ifelse(TechSupport == "Yes", "Yes", "No"),
    StreamingTV = ifelse(StreamingTV == "Yes", "Yes", "No"),
    StreamingMovies = ifelse(StreamingMovies == "Yes", "Yes", "No"),
    Contract,
    PaperlessBilling,
    PaymentMethod,
    MonthlyCharges,
    TotalCharges,
    Churn
  ) 

# Transform "tenure" from months into years 
 churnData$tenure[churnData$tenure >= 0 & churnData$tenure <= 12] <- "0-1 year"
 churnData$tenure[churnData$tenure > 12 & churnData$tenure <= 24] <- "1-2 years" 
 churnData$tenure[churnData$tenure > 24 & churnData$tenure <= 36] <- "2-3 years" 
 churnData$tenure[churnData$tenure > 36 & churnData$tenure <= 48] <- "3-4 years" 
 churnData$tenure[churnData$tenure > 48 & churnData$tenure <= 60] <- "4-5 years" 
 churnData$tenure[churnData$tenure > 60 & churnData$tenure <= 72] <- "5-6 years"

churnData$Contract[churnData$Contract == "Month-to-month"] <- 0
churnData$Contract[churnData$Contract == "One year"] <- 1
churnData$Contract[churnData$Contract == "Two year"] <- 2

churnData$PaymentMethod[churnData$PaymentMethod == "Bank transfer (automatic)"] <- 1
churnData$PaymentMethod[churnData$PaymentMethod == "Credit card (automatic)"] <- 2
churnData$PaymentMethod[churnData$PaymentMethod == "Electronic check"] <- 3
churnData$PaymentMethod[churnData$PaymentMethod == "Mailed check"] <- 4

churnData <- churnData %>%
  mutate(num_tenure = substr(tenure, 3, 3))

churnData$num_tenure <- as.numeric(as.character(churnData$num_tenure))

# Impute missing values
churnData$TotalCharges[is.na(churnData$TotalCharges)] <- round(churnData$num_tenure * churnData$MonthlyCharges, 2)

## Warning in churnData$TotalCharges[is.na(churnData$TotalCharges)] <-
## round(churnData$num_tenure * : number of items to replace is not a multiple of
## replacement length

churnData <- churnData %>%
  transmute(
    gender = ifelse(gender == "Female", 1, 0),
    SeniorCitizen = ifelse(SeniorCitizen == "Yes", 1, 0) ,
    Partner = ifelse(Partner == "Yes", 1, 0) ,
    Dependents = ifelse(Dependents == "Yes", 1, 0) ,
    tenure,
    PhoneService = ifelse(PhoneService == "Yes", 1, 0) ,
    MultipleLines = ifelse(MultipleLines == "Yes", 1, 0) ,
    InternetService = ifelse(InternetService == "Yes", 1, 0) ,
    OnlineSecurity = ifelse(OnlineSecurity == "Yes", 1, 0) ,
    OnlineBackup = ifelse(OnlineBackup == "Yes", 1, 0) ,
    DeviceProtection = ifelse(DeviceProtection == "Yes", 1, 0) ,
    TechSupport = ifelse(TechSupport == "Yes", 1, 0) ,
    StreamingTV = ifelse(StreamingTV == "Yes", 1, 0) ,
    StreamingMovies = ifelse(StreamingMovies == "Yes", 1, 0) ,
    Contract,
    PaperlessBilling = ifelse(PaperlessBilling == "Yes", 1, 0) ,
    PaymentMethod,
    MonthlyCharges,
    TotalCharges,
    Churn = ifelse(Churn == "Yes", 1, 0)
  )

3.2. Data Modeling

library(tidymodels)

## Warning: package 'tidymodels' was built under R version 4.0.3

## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --

## v broom     0.7.3      v recipes   0.1.15
## v dials     0.0.9      v rsample   0.0.8 
## v infer     0.5.3      v tune      0.1.2 
## v modeldata 0.1.0      v workflows 0.2.1 
## v parsnip   0.1.4      v yardstick 0.0.7

## Warning: package 'broom' was built under R version 4.0.3

## Warning: package 'dials' was built under R version 4.0.3

## Warning: package 'infer' was built under R version 4.0.3

## Warning: package 'modeldata' was built under R version 4.0.3

## Warning: package 'parsnip' was built under R version 4.0.3

## Warning: package 'recipes' was built under R version 4.0.3

## Warning: package 'rsample' was built under R version 4.0.3

## Warning: package 'tune' was built under R version 4.0.3

## Warning: package 'workflows' was built under R version 4.0.3

## Warning: package 'yardstick' was built under R version 4.0.3

## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard()    masks purrr::discard()
## x dplyr::filter()      masks stats::filter()
## x recipes::fixed()     masks stringr::fixed()
## x dplyr::lag()         masks stats::lag()
## x yardstick::spec()    masks readr::spec()
## x Hmisc::src()         masks dplyr::src()
## x recipes::step()      masks stats::step()
## x Hmisc::summarize()   masks dplyr::summarize()
## x parsnip::translate() masks Hmisc::translate()

# Split the data into training and test sets
set.seed(2021)
churn_split <- initial_split(churnData, strata = Churn, prop = 3/4)
churn_train <- training(churn_split) # training set
churn_test <- testing(churn_split) # test set

3.2.1. Logistic Regression

# Logistic Regression
mod_glm <- glm(Churn ~ ., family = "binomial", data = churn_train) # model 01
summary(mod_glm)

## 
## Call:
## glm(formula = Churn ~ ., family = "binomial", data = churn_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9194  -0.6816  -0.2978   0.6649   3.0829  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.001e+00  2.305e-01  -4.343 1.41e-05 ***
## gender            2.617e-03  7.468e-02   0.035 0.972049    
## SeniorCitizen     2.150e-01  9.721e-02   2.211 0.027007 *  
## Partner          -7.055e-02  8.997e-02  -0.784 0.432966    
## Dependents       -4.931e-02  1.034e-01  -0.477 0.633547    
## tenure1-2 years  -9.065e-01  1.202e-01  -7.542 4.63e-14 ***
## tenure2-3 years  -1.124e+00  1.690e-01  -6.649 2.95e-11 ***
## tenure3-4 years  -8.999e-01  2.231e-01  -4.033 5.51e-05 ***
## tenure4-5 years  -9.361e-01  2.887e-01  -3.242 0.001187 ** 
## tenure5-6 years  -1.071e+00  3.798e-01  -2.819 0.004824 ** 
## PhoneService     -1.334e+00  2.009e-01  -6.641 3.11e-11 ***
## MultipleLines     1.928e-01  9.566e-02   2.016 0.043832 *  
## InternetService  -3.130e-01  2.214e-01  -1.414 0.157508    
## OnlineSecurity   -4.501e-01  9.756e-02  -4.614 3.96e-06 ***
## OnlineBackup     -2.892e-01  8.974e-02  -3.222 0.001271 ** 
## DeviceProtection -1.937e-01  9.363e-02  -2.069 0.038557 *  
## TechSupport      -5.157e-01  9.935e-02  -5.191 2.10e-07 ***
## StreamingTV      -1.361e-01  1.030e-01  -1.322 0.186270    
## StreamingMovies  -8.621e-02  1.039e-01  -0.830 0.406643    
## Contract1        -8.050e-01  1.236e-01  -6.513 7.35e-11 ***
## Contract2        -1.733e+00  2.031e-01  -8.533  < 2e-16 ***
## PaperlessBilling  2.631e-01  8.547e-02   3.078 0.002085 ** 
## PaymentMethod2   -4.097e-02  1.300e-01  -0.315 0.752633    
## PaymentMethod3    3.549e-01  1.076e-01   3.297 0.000976 ***
## PaymentMethod4    4.040e-02  1.319e-01   0.306 0.759406    
## MonthlyCharges    4.043e-02  4.511e-03   8.964  < 2e-16 ***
## TotalCharges     -1.241e-04  6.837e-05  -1.815 0.069523 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6113.6  on 5282  degrees of freedom
## Residual deviance: 4436.9  on 5256  degrees of freedom
## AIC: 4490.9
## 
## Number of Fisher Scoring iterations: 6

library(car)

## Warning: package 'car' was built under R version 4.0.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.0.3

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

vif(mod_glm)

##                       GVIF Df GVIF^(1/(2*Df))
## gender            1.005763  1        1.002877
## SeniorCitizen     1.136703  1        1.066163
## Partner           1.404981  1        1.185319
## Dependents        1.323744  1        1.150541
## tenure           11.984596  5        1.281924
## PhoneService      2.695193  1        1.641704
## MultipleLines     1.621080  1        1.273216
## InternetService   3.291622  1        1.814283
## OnlineSecurity    1.153353  1        1.073943
## OnlineBackup      1.271964  1        1.127814
## DeviceProtection  1.373446  1        1.171941
## TechSupport       1.192909  1        1.092204
## StreamingTV       1.856190  1        1.362421
## StreamingMovies   1.889904  1        1.374738
## Contract          1.769217  2        1.153308
## PaperlessBilling  1.128832  1        1.062465
## PaymentMethod     1.385136  3        1.055801
## MonthlyCharges   10.566353  1        3.250593
## TotalCharges     14.638476  1        3.826026

Observation: tenure, PhoneService, InternetService, MonthlyCharges, and TotalCharges strongly demonstrated a correlation to Churn.

Feature Analysis

# Use ANOVA to assess model 01
anova(mod_glm, test = "Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Churn
## 
## Terms added sequentially (first to last)
## 
## 
##                  Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                              5282     6113.6              
## gender            1     0.25      5281     6113.3   0.61395    
## SeniorCitizen     1   108.32      5280     6005.0 < 2.2e-16 ***
## Partner           1   128.19      5279     5876.8 < 2.2e-16 ***
## Dependents        1    27.36      5278     5849.5 1.689e-07 ***
## tenure            5   573.51      5273     5275.9 < 2.2e-16 ***
## PhoneService      1     0.91      5272     5275.0   0.34130    
## MultipleLines     1   133.87      5271     5141.2 < 2.2e-16 ***
## InternetService   1   247.39      5270     4893.8 < 2.2e-16 ***
## OnlineSecurity    1    62.54      5269     4831.2 2.617e-15 ***
## OnlineBackup      1     3.59      5268     4827.7   0.05815 .  
## DeviceProtection  1     1.24      5267     4826.4   0.26523    
## TechSupport       1    54.96      5266     4771.4 1.231e-13 ***
## StreamingTV       1    30.06      5265     4741.4 4.180e-08 ***
## StreamingMovies   1    20.35      5264     4721.0 6.440e-06 ***
## Contract          2   145.02      5262     4576.0 < 2.2e-16 ***
## PaperlessBilling  1    19.90      5261     4556.1 8.176e-06 ***
## PaymentMethod     3    33.27      5258     4522.8 2.829e-07 ***
## MonthlyCharges    1    82.76      5257     4440.1 < 2.2e-16 ***
## TotalCharges      1     3.23      5256     4436.9   0.07211 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Observation: Important features are extracted, including: SeniorCitizen, Partner, Dependents, tenure, MultipleLines, InternetService, OnlineSecurity, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges.

Closely Look At Hypothesis Test

Null hypothesis is that the coefficient of regressors is zero
Alternative hypothesis is that the coefficient of regressors is different to zero

The expected outcome is that we had sufficient evidences to reject the null hypothesis so that we might obtain significant features for our model in following steps.

We used Chi-square distribution to test on one variance.

# model 02 of logistic regression, omitted top VIFs (variance inflation factor)
mod3_glm <- glm(Churn ~ ., family = "binomial", 
                data = subset(churn_train, select = -c(tenure, PhoneService,
                                                       InternetService,
                                                       MonthlyCharges,
                                                       TotalCharges ))
)

# Use ANOVA to assess the significance of model 02
anova(mod3_glm, test = "Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Churn
## 
## Terms added sequentially (first to last)
## 
## 
##                  Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                              5282     6113.6              
## gender            1     0.25      5281     6113.3 0.6139501    
## SeniorCitizen     1   108.32      5280     6005.0 < 2.2e-16 ***
## Partner           1   128.19      5279     5876.8 < 2.2e-16 ***
## Dependents        1    27.36      5278     5849.5 1.689e-07 ***
## MultipleLines     1    11.70      5277     5837.8 0.0006249 ***
## OnlineSecurity    1   115.87      5276     5721.9 < 2.2e-16 ***
## OnlineBackup      1     8.89      5275     5713.0 0.0028733 ** 
## DeviceProtection  1     4.35      5274     5708.7 0.0369095 *  
## TechSupport       1    60.72      5273     5647.9 6.578e-15 ***
## StreamingTV       1    67.38      5272     5580.5 2.235e-16 ***
## StreamingMovies   1    30.75      5271     5549.8 2.942e-08 ***
## Contract          2   683.20      5269     4866.6 < 2.2e-16 ***
## PaperlessBilling  1    36.71      5268     4829.9 1.370e-09 ***
## PaymentMethod     3    69.52      5265     4760.4 5.405e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Chi-square distribution to test on one variance

Compare deviance of 14 regressors in model 02 to the 95th percentile of chi-square distribution with relevant degree of freedom (‘df’).

qchisq(0.95, 5282-5281) # gender, df = 5281

## [1] 3.841459

qchisq(0.95, 5282-5280) # SeniorCitizen , df = 5280

## [1] 5.991465

qchisq(0.95, 5282-5279) # Partner , df = 5279

## [1] 7.814728

qchisq(0.95, 5282-5278) # Dependents, df = 5278

## [1] 9.487729

qchisq(0.95, 5282-5277) # MultipleLines, df = 5277

## [1] 11.0705

qchisq(0.95, 5282-5276) # OnlineSecurity, df = 5276

## [1] 12.59159

qchisq(0.95, 5282-5275) # OnlineBackup, df = 5275

## [1] 14.06714

qchisq(0.95, 5282-5274) # DeviceProtection, df = 5274

## [1] 15.50731

qchisq(0.95, 5282-5273) # TechSupport, df = 5273

## [1] 16.91898

qchisq(0.95, 5282-5272) # StreamingTV, df = 5272

## [1] 18.30704

qchisq(0.95, 5282-5271) # StreamingMovies, df = 5271

## [1] 19.67514

qchisq(0.95, 5282-5269) # Contract, df = 5269

## [1] 22.36203

qchisq(0.95, 5282-5268) # PaperlessBilling, df = 5268

## [1] 23.68479

qchisq(0.95, 5282-5265) # PaymentMethod, df = 5265

## [1] 27.58711

Observation: gender, OnlineBackup, DeviceProtection having respective deviance to be smaller than the 95th percentile of chi-square distribution, hence would be regarded as consistent with the null hypothesis at the conventional 5% level. In other words, gender, OnlineBackup, and DeviceProtection add very little to the model.

mod_glm_selected <- glm(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
                          MultipleLines + InternetService + OnlineSecurity +
                          TechSupport + StreamingTV + StreamingMovies +
                          Contract + PaperlessBilling + PaymentMethod + MonthlyCharges
                        , family = "binomial", data = churn_train)
summary(mod_glm_selected)

## 
## Call:
## glm(formula = Churn ~ SeniorCitizen + Partner + Dependents + 
##     tenure + MultipleLines + InternetService + OnlineSecurity + 
##     TechSupport + StreamingTV + StreamingMovies + Contract + 
##     PaperlessBilling + PaymentMethod + MonthlyCharges, family = "binomial", 
##     data = churn_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8621  -0.6903  -0.3004   0.7049   3.0551  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.767444   0.171006 -10.336  < 2e-16 ***
## SeniorCitizen     0.265787   0.095897   2.772 0.005578 ** 
## Partner          -0.070357   0.089112  -0.790 0.429797    
## Dependents       -0.075775   0.102569  -0.739 0.460049    
## tenure1-2 years  -0.968387   0.110615  -8.755  < 2e-16 ***
## tenure2-3 years  -1.299971   0.132655  -9.800  < 2e-16 ***
## tenure3-4 years  -1.192831   0.145308  -8.209 2.23e-16 ***
## tenure4-5 years  -1.346732   0.158141  -8.516  < 2e-16 ***
## tenure5-6 years  -1.683063   0.187408  -8.981  < 2e-16 ***
## MultipleLines     0.231883   0.092820   2.498 0.012482 *  
## InternetService   0.583354   0.171464   3.402 0.000668 ***
## OnlineSecurity   -0.412234   0.095988  -4.295 1.75e-05 ***
## TechSupport      -0.485519   0.097489  -4.980 6.35e-07 ***
## StreamingTV       0.092654   0.095381   0.971 0.331345    
## StreamingMovies   0.156456   0.095475   1.639 0.101274    
## Contract1        -0.868004   0.122566  -7.082 1.42e-12 ***
## Contract2        -1.781605   0.202378  -8.803  < 2e-16 ***
## PaperlessBilling  0.291618   0.084470   3.452 0.000556 ***
## PaymentMethod2   -0.037012   0.129243  -0.286 0.774590    
## PaymentMethod3    0.397903   0.106547   3.735 0.000188 ***
## PaymentMethod4   -0.010390   0.130511  -0.080 0.936546    
## MonthlyCharges    0.015913   0.002754   5.778 7.58e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6113.6  on 5282  degrees of freedom
## Residual deviance: 4491.0  on 5261  degrees of freedom
## AIC: 4535
## 
## Number of Fisher Scoring iterations: 6

Assessing the predictive ability of the Logistic Regression model

churn_test$Churn <- as.character(churn_test$Churn)

fitted.results <- predict(mod_glm_selected ,newdata = churn_test, type = 'response')
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)
misClasificError <- mean(fitted.results != churn_test$Churn)
print(paste('Logistic Regression Accuracy', 1-misClasificError))

## [1] "Logistic Regression Accuracy 0.800568181818182"

# Find the exact lower and upper bounds to the 95% confidence intervals for the coefficients.
exp(cbind(Odds_ratio = coef(mod_glm_selected), confint(mod_glm_selected)) )

## Waiting for profiling to be done...

##                  Odds_ratio     2.5 %    97.5 %
## (Intercept)       0.1707688 0.1216413 0.2378745
## SeniorCitizen     1.3044571 1.0807400 1.5740477
## Partner           0.9320610 0.7826974 1.1100265
## Dependents        0.9270250 0.7577434 1.1329072
## tenure1-2 years   0.3796950 0.3052494 0.4710040
## tenure2-3 years   0.2725396 0.2095856 0.3526161
## tenure3-4 years   0.3033614 0.2275554 0.4023388
## tenure4-5 years   0.2600890 0.1901987 0.3536584
## tenure5-6 years   0.1858040 0.1282379 0.2674635
## MultipleLines     1.2609718 1.0514169 1.5129646
## InternetService   1.7920383 1.2826264 2.5129111
## OnlineSecurity    0.6621692 0.5481641 0.7987014
## TechSupport       0.6153779 0.5078957 0.7443953
## StreamingTV       1.0970824 0.9100220 1.3227268
## StreamingMovies   1.1693593 0.9698805 1.4102525
## Contract1         0.4197888 0.3292509 0.5324837
## Contract2         0.1683676 0.1119095 0.2477587
## PaperlessBilling  1.3385912 1.1346704 1.5801867
## PaymentMethod2    0.9636644 0.7477281 1.2412929
## PaymentMethod3    1.4887002 1.2090656 1.8361527
## PaymentMethod4    0.9896636 0.7664536 1.2786404
## MonthlyCharges    1.0160400 1.0105908 1.0215640

Observation: The odds of Churn increase by 36% (exp(1.2826264)10), 31% (exp(1.1346704)10), and 34% (exp(1.2090656)10) for each point scored in top 03 features, including InternetService, PaperlessBilling, and PaymentMethod respectively.*

Logistic Regression Confusion Matrix

# Suppose that we used the cut-off is 0.5 in regression model
print("Confusion Matrix for Logistic Regression at threshold of 0.5")

## [1] "Confusion Matrix for Logistic Regression at threshold of 0.5"

table(churn_test$Churn, fitted.results > 0.5)

##    
##     FALSE TRUE
##   0  1175  118
##   1   233  234

With 0.5 of threshold, there are 233 customers predicted as non-churn but they turned to be churn in fact. Our target, therefore, is to reduce this number as much as possible. In the other words, we aimed at increasing the proportion of True Negative rate or Specificity as evaluating model probability in prediction to non-churn customers who are truly non-churn.

Test our assumption of using threshold of 0.5

How to make decision on the cut-off/threshold for making “yes” (positive) prediction. Using ROC instead.

library(ROCR)

## Warning: package 'ROCR' was built under R version 4.0.3

churn.probs <- predict(mod_glm_selected, churn_test, type="response")
# need to create prediction object from ROCR
pr <- prediction(churn.probs, churn_test$Churn)

# plotting ROC curve
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

# AUC value
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc

## [1] 0.8553527

K-fold Cross Validation

After we run the model on each fold, we average the evaluation metric from each one. So if we ran the model 10 times using ROC, we would average each of the 10 ROC values together. This is a great way to try and prevent overfitting a model.

library(caret)

## Warning: package 'caret' was built under R version 4.0.3

## 
## Attaching package: 'caret'

## The following objects are masked from 'package:yardstick':
## 
##     precision, recall, sensitivity, specificity

## The following object is masked from 'package:survival':
## 
##     cluster

## The following object is masked from 'package:purrr':
## 
##     lift

# setting a seed for reproducibility
set.seed(10)

# changing the positive class to "Yes"
churnData$Churn <- as.character(churnData$Churn)
churnData$Churn[churnData$Churn == "0"] <- "nonchurn"
churnData$Churn[churnData$Churn == "1"] <- "churn"


# train control
fitControl <- trainControl(## 10-fold CV
  method = "repeatedcv",
  number = 10,
  ## repeated 3 times
  repeats = 3,
  classProbs = TRUE,
  summaryFunction = twoClassSummary)

# logistic regression model
logreg <- train(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
                          MultipleLines + InternetService + OnlineSecurity +
                          TechSupport + StreamingTV + StreamingMovies +
                          Contract + PaperlessBilling + PaymentMethod + MonthlyCharges,
                data = churnData,
                method = "glm",
                family = "binomial",
                trControl = fitControl,
                metric = "ROC")
logreg

## Generalized Linear Model 
## 
## 7043 samples
##   14 predictor
##    2 classes: 'churn', 'nonchurn' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 6338, 6339, 6339, 6339, 6338, 6338, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8399487  0.4970454  0.8999506

ROC/AUC is 0.84. The true positive rate (sensitivity) is 0.497 and the true negative rate(specificity) is 0.899 .

Calculate cost-saving per Customer

If we assume that customer acquisition cost is approximately $300, which is the cost to replace a churned customer who is predicted as “nonchurn’ (False Negative). Otherwise, the cost will be five time less expensive to retain an existing customer, or $60 per customer, who is predicted as”churn" (True Positive)

FN (predict that a customer won’t churn, but they actually do): $300
TP (predict that a customer would churn, when they actually would): $60
FP (predict that a customer would churn, when they actually wouldn’t): $60
TN (predict that a customer won’t churn, when they actually wouldn’t): $0

Cost = FN($300) + TP($60) + FP($60) + TN($0)

Apply this cost evaluation to our model.

We start by fitting the model, and making predictions in the form of probabilities.

# fitting the logistic regression model
fit <- glm(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
                          MultipleLines + InternetService + OnlineSecurity +
                          TechSupport + StreamingTV + StreamingMovies +
                          Contract + PaperlessBilling + PaymentMethod + MonthlyCharges, 
           data = churn_train, family=binomial)

# making predictions
churn.probs <- predict(fit, churn_test, type="response")

# Calculate a test misclassification rate
table(churn_test$Churn) / nrow(churn_test)

## 
##         0         1 
## 0.7346591 0.2653409

classifying everything as non-churn for a test misclassification rate of 0.265.

Next, let’s create a threshold vector and a cost vector.

# threshold vector
thresh <- seq(0.1,1.0, length = 10)

#cost vector
cost = rep(0,length(thresh))

dim(churn_test)

## [1] 1760   20

# cost as a function of threshold
for (i in 1:length(thresh)){
  
  glm.pred = rep("No", length(churn.probs))
  glm.pred <- ifelse(churn.probs > thresh[[i]], 1, 0)
  glm.pred <- as.factor(glm.pred)
  churn_test$Churn <- as.factor(churn_test$Churn)
  
  x <- confusionMatrix(glm.pred, churn_test$Churn)
  TN <- x$table[1]/1760
  FP <- x$table[2]/1760
  FN <- x$table[3]/1760
  TP <- x$table[4]/1760
  
  cost[i] = FN*300 + TP*60 + FP*60 + TN*0
}

## Warning in confusionMatrix.default(glm.pred, churn_test$Churn): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Warning in confusionMatrix.default(glm.pred, churn_test$Churn): Levels are not
## in the same order for reference and data. Refactoring data to match.

Let’s assume that the company is using “simple model” which just defaults to a threshold of 0.5. We need fit that model, make predictions, and calculate the cost.

# simple model - assume threshold is 0.5
glm.pred = rep("No", length(churn.probs))
glm.pred <- ifelse(churn.probs > 0.5, 1, 0)
glm.pred <- as.factor(glm.pred)
churn_test$Churn <- as.factor(churn_test$Churn)

x <- confusionMatrix(glm.pred, churn_test$Churn)
TN <- x$table[1]/1760
FP <- x$table[2]/1760
FN <- x$table[3]/1760
TP <- x$table[4]/1760

cost_simple = FN*300 + TP*60 + FP*60 + TN*0

Finally, we can put all of the results in a dataframe and plot them.

# putting results in a dataframe for plotting
dat <- data.frame(
  model = c(rep("optimized",10),"simple"),
  cost_per_customer = c(cost, cost_simple),
  threshold = c(thresh, 0.5)
)

# plotting
ggplot(dat, aes(x = threshold, y = cost_per_customer, group = model, colour = model)) +
  geom_line() +
  geom_point()

Looking at the results, we can see that the minimum cost per customer is about $38.00 at a threshold of 0.2. The “simple” model that our company is currently implementing costs about $52.00 per customer, at a threshold of 0.50.

As assumed that we have 1,000,000 customer-base to be switched from existing model to new model, the cost-saving will be:

# cost savings of optimized model (threshold = 0.2) compared to baseline model (threshold = 0.5)
savings_per_customer = cost_simple - min(cost)

total_savings = 1000000*savings_per_customer

total_savings

## [1] 13261364

3.2.2. Random Forest

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.0.3

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(caret)

churn_train$Churn <- factor(churn_train$Churn)
churn_test$Churn <- factor(churn_test$Churn)

# Important variables
mod_rf <- randomForest(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
                          MultipleLines + InternetService + OnlineSecurity +
                          TechSupport + StreamingTV + StreamingMovies +
                          Contract + PaperlessBilling + PaymentMethod + MonthlyCharges,
                       data = churn_train)
order(varImp(mod_rf), decreasing=TRUE)

##  [1] 14 11  4 13  8  7 12  6  5  2  1  3  9 10

# Calculate the number of principle components needed to capture 90% of the variance
preProc_sub <- preProcess(churn_train, method="pca", thresh=0.9)
preProc_sub

## Created from 5283 samples and 20 variables
## 
## Pre-processing:
##   - centered (16)
##   - ignored (4)
##   - principal component signal extraction (16)
##   - scaled (16)
## 
## PCA needed 12 components to capture 90 percent of the variance

plot(mod_rf)

At number of trees is 200, the error is no longer improved so that we chose ntrees is 200 to tune our model.

Option 01: Random Forest

control_rf <- trainControl(method = "cv", 5)
model_rf <- train(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
                          MultipleLines + InternetService + OnlineSecurity +
                          TechSupport + StreamingTV + StreamingMovies +
                          Contract + PaperlessBilling + PaymentMethod + MonthlyCharges,
                  data = churn_train, method="rf", 
                  trControl=control_rf, ntree=200 
                  )
model_rf

## Random Forest 
## 
## 5283 samples
##   14 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 4227, 4225, 4227, 4227, 4226 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.7825128  0.3564390
##   11    0.7722918  0.3759235
##   21    0.7607438  0.3524874
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.

predict_rf <- predict(model_rf, churn_test )

confusionMatrix(churn_test$Churn, predict_rf)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1213   80
##          1  291  176
##                                          
##                Accuracy : 0.7892         
##                  95% CI : (0.7694, 0.808)
##     No Information Rate : 0.8545         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.3681         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.8065         
##             Specificity : 0.6875         
##          Pos Pred Value : 0.9381         
##          Neg Pred Value : 0.3769         
##              Prevalence : 0.8545         
##          Detection Rate : 0.6892         
##    Detection Prevalence : 0.7347         
##       Balanced Accuracy : 0.7470         
##                                          
##        'Positive' Class : 0              
##

# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(churn_test$Churn, predict_rf)$overall[1])
OOSE

## [1] 0.2107955

Accuracy is 0.7869 Out-of-sample-error in test set is 0.21 Sensitivity or True Positive rate is 0.8036 Specificity or True Negative Rate is 0.6855

plot(model_rf)

Option 02:

Tune Random Forest Model

dim(churn_train)

## [1] 5283   20

churn_train <- as.data.frame(churn_train)

t <- tuneRF(churn_train[, -20], churn_train[, 20], stepFactor = 0.5, plot = TRUE,
            ntreeTry = 200, trace = TRUE, improve = 0.05)

## mtry = 4  OOB error = 20.67% 
## Searching left ...
## mtry = 8     OOB error = 21.16% 
## -0.02380952 0.05 
## Searching right ...
## mtry = 2     OOB error = 20.42% 
## 0.01190476 0.05

At mtry=2, Out-of-Sample-error is lowest.

Fit the Random Forest Model After Tuning

rfModel_new <- randomForest(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
                          MultipleLines + InternetService + OnlineSecurity +
                          TechSupport + StreamingTV + StreamingMovies +
                          Contract + PaperlessBilling + PaymentMethod + MonthlyCharges,
                          data = churn_train, ntree = 200, mtry = 2, importance = TRUE,
                          proximity = TRUE )
print(rfModel_new)

## 
## Call:
##  randomForest(formula = Churn ~ SeniorCitizen + Partner + Dependents +      tenure + MultipleLines + InternetService + OnlineSecurity +      TechSupport + StreamingTV + StreamingMovies + Contract +      PaperlessBilling + PaymentMethod + MonthlyCharges, data = churn_train,      ntree = 200, mtry = 2, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 200
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 20.99%
## Confusion matrix:
##      0   1 class.error
## 0 3589 292  0.07523834
## 1  817 585  0.58273894

Out-of-sample-error in test set is 20.99%

Random Forest Predictions and Confusion Matrix After Tuning

pred_rf_new <- predict(rfModel_new, churn_test)
confusionMatrix(pred_rf_new, churn_test$Churn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1207  264
##          1   86  203
##                                           
##                Accuracy : 0.8011          
##                  95% CI : (0.7817, 0.8196)
##     No Information Rate : 0.7347          
##     P-Value [Acc > NIR] : 4.636e-11       
##                                           
##                   Kappa : 0.4192          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9335          
##             Specificity : 0.4347          
##          Pos Pred Value : 0.8205          
##          Neg Pred Value : 0.7024          
##              Prevalence : 0.7347          
##          Detection Rate : 0.6858          
##    Detection Prevalence : 0.8358          
##       Balanced Accuracy : 0.6841          
##                                           
##        'Positive' Class : 0               
##

Accuracy : 0.8011 Sensitivity or True Positive Rate : 0.9335 Specificity or True Negative Rate : 0.4347

Random Forest Feature Importance

varImpPlot(rfModel_new, sort=T, n.var = 10, main = 'Top 10 Feature Importance')

plot(pred_rf_new, churn_test$Churn,
     xlab = "Predicted", ylab = "Actual",
     main = "Predicted vs Actual: Random Forest, Test Data",
     col = "dodgerblue", pch = 20)
grid()
abline(0, 1, col = "darkorange", lwd = 2)

3.2.3. Tree Model

Option 01: A Tree model set up with a grid of tuning parameters (by default).

library(caret)

churn_train$Churn <- factor(churn_train$Churn)
churn_test$Churn <- factor(churn_test$Churn)

# build model with Classification Tree 
modFit.rpart <- train(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
                          MultipleLines + InternetService + OnlineSecurity +
                          TechSupport + StreamingTV + StreamingMovies +
                          Contract + PaperlessBilling + PaymentMethod +
                          MonthlyCharges, 
                      method = "rpart", data = churn_train )  

pred.rpart <- predict(modFit.rpart, churn_test)
confusionMatrix(pred.rpart, churn_test$Churn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1195  292
##          1   98  175
##                                           
##                Accuracy : 0.7784          
##                  95% CI : (0.7583, 0.7976)
##     No Information Rate : 0.7347          
##     P-Value [Acc > NIR] : 1.307e-05       
##                                           
##                   Kappa : 0.3447          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9242          
##             Specificity : 0.3747          
##          Pos Pred Value : 0.8036          
##          Neg Pred Value : 0.6410          
##              Prevalence : 0.7347          
##          Detection Rate : 0.6790          
##    Detection Prevalence : 0.8449          
##       Balanced Accuracy : 0.6495          
##                                           
##        'Positive' Class : 0               
##

# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(churn_test$Churn, pred.rpart)$overall[1])
OOSE

## [1] 0.2215909

Accuracy is 0.7784 Out-of-sample-error in test set is 0.22 Sensitivity or True Positive rate is 0.9242 Specificity or True Negative Rate is 0.3747

Option 02: Regression Tree Model

library(rlang)

## Warning: package 'rlang' was built under R version 4.0.3

## 
## Attaching package: 'rlang'

## The following objects are masked from 'package:purrr':
## 
##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
##     splice

library(rpart)

## Warning: package 'rpart' was built under R version 4.0.3

## 
## Attaching package: 'rpart'

## The following object is masked from 'package:dials':
## 
##     prune

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.0.3

churn_tree = rpart(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
                          MultipleLines + InternetService + OnlineSecurity +
                          TechSupport + StreamingTV + StreamingMovies +
                          Contract + PaperlessBilling + PaymentMethod + MonthlyCharges, 
              data = churn_train )

rpart.plot(churn_tree)

churn_tree_tst_pred = predict(churn_tree, churn_test, type = "class")
table(predicted = churn_tree_tst_pred, actual = churn_test$Churn)

##          actual
## predicted    0    1
##         0 1228  302
##         1   65  165

# funct to predict accuracy of Classification tree model
calc_acc = function(actual, predicted) {
  mean(actual == predicted)
}

# Accuracy
(tree_tst_acc = calc_acc(predicted = churn_tree_tst_pred, actual = churn_test$Churn))

## [1] 0.7914773

Accuracy: 0.79

confusionMatrix(churn_tree_tst_pred, churn_test$Churn)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1228  302
##          1   65  165
##                                           
##                Accuracy : 0.7915          
##                  95% CI : (0.7717, 0.8102)
##     No Information Rate : 0.7347          
##     P-Value [Acc > NIR] : 1.844e-08       
##                                           
##                   Kappa : 0.3617          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9497          
##             Specificity : 0.3533          
##          Pos Pred Value : 0.8026          
##          Neg Pred Value : 0.7174          
##              Prevalence : 0.7347          
##          Detection Rate : 0.6977          
##    Detection Prevalence : 0.8693          
##       Balanced Accuracy : 0.6515          
##                                           
##        'Positive' Class : 0               
##

# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(churn_test$Churn, churn_tree_tst_pred)$overall[1])
OOSE

## [1] 0.2085227

Accuracy is 0.7915 Out-of-sample-error in test set is 0.208 Sensitivity or True Positive rate is 0.9497 Specificity or True Negative Rate is 0.3533