library(readr)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v dplyr 1.0.2
## v tibble 3.0.4 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.0
## v purrr 0.3.4
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## Warning: package 'forcats' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
churnData = read_csv("./WA_Fn-UseC_-Telco-Customer-Churn (1).csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## SeniorCitizen = col_double(),
## tenure = col_double(),
## MonthlyCharges = col_double(),
## TotalCharges = col_double()
## )
## See spec(...) for full column specifications.
# Compute total missing values in each column
colSums(is.na(churnData))
## customerID gender SeniorCitizen Partner
## 0 0 0 0
## Dependents tenure PhoneService MultipleLines
## 0 0 0 0
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0 0 0 0
## TechSupport StreamingTV StreamingMovies Contract
## 0 0 0 0
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0 0 0 11
## Churn
## 0
Use Percentiles method for outlier detection
# tenure
lower_bound_te <- quantile(churnData$tenure , 0.025)
upper_bound_te <- quantile(churnData$tenure, 0.975)
outlier_ind_te <- which(churnData$tenure < lower_bound_te | churnData$tenure > upper_bound_te)
# Print outliers
head(churnData[outlier_ind_te, ])
## # A tibble: 6 x 21
## customerID gender SeniorCitizen Partner Dependents tenure PhoneService
## <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 4472-LVYGI Female 0 Yes Yes 0 No
## 2 3115-CZMZD Male 0 No Yes 0 Yes
## 3 5709-LVOEQ Female 0 Yes Yes 0 Yes
## 4 4367-NUYAO Male 0 Yes Yes 0 Yes
## 5 1371-DWPAZ Female 0 Yes Yes 0 No
## 6 7644-OMVMY Male 0 Yes Yes 0 Yes
## # ... with 14 more variables: MultipleLines <chr>, InternetService <chr>,
## # OnlineSecurity <chr>, OnlineBackup <chr>, DeviceProtection <chr>,
## # TechSupport <chr>, StreamingTV <chr>, StreamingMovies <chr>,
## # Contract <chr>, PaperlessBilling <chr>, PaymentMethod <chr>,
## # MonthlyCharges <dbl>, TotalCharges <dbl>, Churn <chr>
require(ggplot2)
# Histogram
hist(churnData$tenure, col = "green")
rug(churnData$tenure)
abline(v = median(churnData$tenure), col = "magenta", lwd = 4)
# MonthlyCharges
lower_bound_mo <- quantile(churnData$MonthlyCharges , 0.025)
upper_bound_mo <- quantile(churnData$MonthlyCharges, 0.975)
outlier_ind_mo <- which(churnData$MonthlyCharges < lower_bound_mo | churnData$MonthlyCharges > upper_bound_mo)
# Print outliers
head(churnData[outlier_ind_mo, ])
## # A tibble: 6 x 21
## customerID gender SeniorCitizen Partner Dependents tenure PhoneService
## <chr> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 7469-LKBCI Male 0 No No 16 Yes
## 2 3655-SNQYZ Female 0 Yes Yes 69 Yes
## 3 1891-QRQSA Male 1 Yes Yes 64 Yes
## 4 6067-NGCEU Female 0 No No 65 Yes
## 5 2146-EGVDT Male 0 Yes Yes 59 Yes
## 6 6168-YBYNP Male 0 No No 59 Yes
## # ... with 14 more variables: MultipleLines <chr>, InternetService <chr>,
## # OnlineSecurity <chr>, OnlineBackup <chr>, DeviceProtection <chr>,
## # TechSupport <chr>, StreamingTV <chr>, StreamingMovies <chr>,
## # Contract <chr>, PaperlessBilling <chr>, PaymentMethod <chr>,
## # MonthlyCharges <dbl>, TotalCharges <dbl>, Churn <chr>
# Histogram
hist(churnData$MonthlyCharges, col = "gold")
rug(churnData$MonthlyCharges)
abline(v = median(churnData$MonthlyCharges), col = "magenta", lwd = 4)
# Histogram
hist(churnData$TotalCharges, col = "maroon")
rug(churnData$TotalCharges)
abline(v = median(churnData$TotalCharges), col = "magenta", lwd = 4)
# Count on Gender
count_gender <- matrix(table(churnData$gender))
df_gen <- data.frame(gender = c("Female", "Male"), count = count_gender[,1])
ggplot(data=df_gen, aes(x = gender, y = count, fill = gender)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_gen$count, label= df_gen$count), vjust=1, color="black", size=3) +
labs(title="Count on Gender")
## Warning: Use of `df_gen$count` is discouraged. Use `count` instead.
## Warning: Use of `df_gen$count` is discouraged. Use `count` instead.
# Count on OnlineBackup
count_onb <- matrix(table(churnData$OnlineBackup))
df_onb <- data.frame(OnlineBackup = c("No", "No Internet Service", "Yes"), count = count_onb[,1])
ggplot(data=df_onb, aes(x = OnlineBackup, y = count, fill = OnlineBackup)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_onb$count, label= df_onb$count), vjust=1, color="black", size=3) +
labs(title="Count on OnlineBackup")
## Warning: Use of `df_onb$count` is discouraged. Use `count` instead.
## Warning: Use of `df_onb$count` is discouraged. Use `count` instead.
# Count on TechSupport
count_tec <- matrix(table(churnData$TechSupport))
df_tec <- data.frame(TechSupport = c("No", "No Internet Service", "Yes"), count = count_tec[,1])
ggplot(data=df_tec, aes(x = TechSupport, y = count, fill = TechSupport)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_tec$count, label= df_tec$count), vjust=1, color="black", size=3) +
labs(title="Count on TechSupport")
## Warning: Use of `df_tec$count` is discouraged. Use `count` instead.
## Warning: Use of `df_tec$count` is discouraged. Use `count` instead.
# Count on Contract
count_con <- matrix(table(churnData$Contract))
df_con <- data.frame(Contract = c("Month-to-month","One year", "Two year"), count = count_con[,1])
ggplot(data=df_con, aes(x = Contract, y = count, fill = Contract)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_con$count, label= df_con$count), vjust=1, color="black", size=3) +
labs(title="Count on Contract")
## Warning: Use of `df_con$count` is discouraged. Use `count` instead.
## Warning: Use of `df_con$count` is discouraged. Use `count` instead.
# Count on PaperlessBilling
count_pap <- matrix(table(churnData$PaperlessBilling))
df_pap <- data.frame(PaperlessBilling = c("No", "Yes"), count = count_pap[,1])
ggplot(data=df_pap, aes(x = PaperlessBilling, y = count, fill = PaperlessBilling)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_pap$count, label= df_pap$count), vjust=1, color="black", size=3) +
labs(title="Count on PaperlessBilling")
## Warning: Use of `df_pap$count` is discouraged. Use `count` instead.
## Warning: Use of `df_pap$count` is discouraged. Use `count` instead.
# Count on PaymentMethod
count_pay <- matrix(table(churnData$PaymentMethod))
df_pay <- data.frame(PaymentMethod = c("Bank transfer (automatic)", "Credit card (automatic)", "Electronic check", "Mailed check"), count = count_pay[,1])
ggplot(data=df_pay, aes(x = PaymentMethod, y = count, fill = PaymentMethod)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_pay$count, label= df_pay$count), vjust=1, color="black", size=3) +
labs(title="Count on PaymentMethod") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1))
## Warning: Use of `df_pay$count` is discouraged. Use `count` instead.
## Warning: Use of `df_pay$count` is discouraged. Use `count` instead.
# Count on Churn
count_chu <- matrix(table(churnData$Churn))
df_chu <- data.frame(Churn = c("No", "Yes"), count = count_chu[,1])
ggplot(data=df_chu, aes(x = Churn, y = count, fill = Churn)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_chu$count, label= df_chu$count), vjust=1, color="black", size=3) +
labs(title="Count on Churn")
## Warning: Use of `df_chu$count` is discouraged. Use `count` instead.
## Warning: Use of `df_chu$count` is discouraged. Use `count` instead.
require(GGally)
## Loading required package: GGally
## Warning: package 'GGally' was built under R version 4.0.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
require(ggplot2)
churn <- subset(churnData, select = c("tenure", "MonthlyCharges", "TotalCharges"))
# Function to return points and geom_smooth
# allow for the method to be changed
my_fn <- function(data, mapping, method="loess", ...){
p <- ggplot(data = data, mapping = mapping) +
geom_point() +
geom_smooth(method=method, ...)
p
}
# Default "loess" curve
ggpairs(churn, lower = list(continuous = my_fn))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 11 rows containing missing values
## `geom_smooth()` using formula 'y ~ x'
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 11 rows containing missing values
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
## Warning: Removed 11 rows containing non-finite values (stat_density).
Observation: tenure and TotalCharge indicated a very week correlation; while tenure and MonthlyCharges reflected its strongest correlation among the other pairs. The loess curve of pairs (tenure - TotalCharges, tenure - MonthlyCharges, and MonthlyCharges - TotalCharges) were upward trend. In the other words, more time using services, more expenses had been paid per month and in total.
# Drop unused columns
churnData$customerID <- NULL
churndt <- cor(churnData[,c("tenure", "MonthlyCharges", "TotalCharges")])
head(round(churndt,2))
## tenure MonthlyCharges TotalCharges
## tenure 1.00 0.25 NA
## MonthlyCharges 0.25 1.00 NA
## TotalCharges NA NA 1
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.0.3
## corrplot 0.84 loaded
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 4.0.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.0.3
cormat <- churndt
ggcorrplot::ggcorrplot(cormat, title = "Correlation of Numeric Variables")
library(dplyr)
churnData <- churnData %>%
transmute(
gender,
SeniorCitizen = ifelse(SeniorCitizen == 0, "No", "Yes"),
Partner,
Dependents,
tenure,
PhoneService,
MultipleLines = ifelse(MultipleLines == "Yes", "Yes", "No"),
InternetService = ifelse(InternetService == "No", "No", "Yes"),
OnlineSecurity = ifelse(OnlineSecurity == "Yes", "Yes", "No"),
OnlineBackup = ifelse(OnlineBackup == "Yes", "Yes", "No"),
DeviceProtection = ifelse(DeviceProtection == "Yes", "Yes", "No"),
TechSupport = ifelse(TechSupport == "Yes", "Yes", "No"),
StreamingTV = ifelse(StreamingTV == "Yes", "Yes", "No"),
StreamingMovies = ifelse(StreamingMovies == "Yes", "Yes", "No"),
Contract,
PaperlessBilling,
PaymentMethod,
MonthlyCharges,
TotalCharges,
Churn
)
# Transform "tenure" from months into years
churnData$tenure[churnData$tenure >= 0 & churnData$tenure <= 12] <- "0-1 year"
churnData$tenure[churnData$tenure > 12 & churnData$tenure <= 24] <- "1-2 years"
churnData$tenure[churnData$tenure > 24 & churnData$tenure <= 36] <- "2-3 years"
churnData$tenure[churnData$tenure > 36 & churnData$tenure <= 48] <- "3-4 years"
churnData$tenure[churnData$tenure > 48 & churnData$tenure <= 60] <- "4-5 years"
churnData$tenure[churnData$tenure > 60 & churnData$tenure <= 72] <- "5-6 years"
churnData$Contract[churnData$Contract == "Month-to-month"] <- 0
churnData$Contract[churnData$Contract == "One year"] <- 1
churnData$Contract[churnData$Contract == "Two year"] <- 2
churnData$PaymentMethod[churnData$PaymentMethod == "Bank transfer (automatic)"] <- 1
churnData$PaymentMethod[churnData$PaymentMethod == "Credit card (automatic)"] <- 2
churnData$PaymentMethod[churnData$PaymentMethod == "Electronic check"] <- 3
churnData$PaymentMethod[churnData$PaymentMethod == "Mailed check"] <- 4
churnData <- churnData %>%
mutate(num_tenure = substr(tenure, 3, 3))
churnData$num_tenure <- as.numeric(as.character(churnData$num_tenure))
# Impute missing values
churnData$TotalCharges[is.na(churnData$TotalCharges)] <- round(churnData$num_tenure * churnData$MonthlyCharges, 2)
## Warning in churnData$TotalCharges[is.na(churnData$TotalCharges)] <-
## round(churnData$num_tenure * : number of items to replace is not a multiple of
## replacement length
churnData <- churnData %>%
transmute(
gender = ifelse(gender == "Female", 1, 0),
SeniorCitizen = ifelse(SeniorCitizen == "Yes", 1, 0) ,
Partner = ifelse(Partner == "Yes", 1, 0) ,
Dependents = ifelse(Dependents == "Yes", 1, 0) ,
tenure,
PhoneService = ifelse(PhoneService == "Yes", 1, 0) ,
MultipleLines = ifelse(MultipleLines == "Yes", 1, 0) ,
InternetService = ifelse(InternetService == "Yes", 1, 0) ,
OnlineSecurity = ifelse(OnlineSecurity == "Yes", 1, 0) ,
OnlineBackup = ifelse(OnlineBackup == "Yes", 1, 0) ,
DeviceProtection = ifelse(DeviceProtection == "Yes", 1, 0) ,
TechSupport = ifelse(TechSupport == "Yes", 1, 0) ,
StreamingTV = ifelse(StreamingTV == "Yes", 1, 0) ,
StreamingMovies = ifelse(StreamingMovies == "Yes", 1, 0) ,
Contract,
PaperlessBilling = ifelse(PaperlessBilling == "Yes", 1, 0) ,
PaymentMethod,
MonthlyCharges,
TotalCharges,
Churn = ifelse(Churn == "Yes", 1, 0)
)
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.0.3
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom 0.7.3 v recipes 0.1.15
## v dials 0.0.9 v rsample 0.0.8
## v infer 0.5.3 v tune 0.1.2
## v modeldata 0.1.0 v workflows 0.2.1
## v parsnip 0.1.4 v yardstick 0.0.7
## Warning: package 'broom' was built under R version 4.0.3
## Warning: package 'dials' was built under R version 4.0.3
## Warning: package 'infer' was built under R version 4.0.3
## Warning: package 'modeldata' was built under R version 4.0.3
## Warning: package 'parsnip' was built under R version 4.0.3
## Warning: package 'recipes' was built under R version 4.0.3
## Warning: package 'rsample' was built under R version 4.0.3
## Warning: package 'tune' was built under R version 4.0.3
## Warning: package 'workflows' was built under R version 4.0.3
## Warning: package 'yardstick' was built under R version 4.0.3
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x Hmisc::src() masks dplyr::src()
## x recipes::step() masks stats::step()
## x Hmisc::summarize() masks dplyr::summarize()
## x parsnip::translate() masks Hmisc::translate()
# Split the data into training and test sets
set.seed(2021)
churn_split <- initial_split(churnData, strata = Churn, prop = 3/4)
churn_train <- training(churn_split) # training set
churn_test <- testing(churn_split) # test set
# Logistic Regression
mod_glm <- glm(Churn ~ ., family = "binomial", data = churn_train) # model 01
summary(mod_glm)
##
## Call:
## glm(formula = Churn ~ ., family = "binomial", data = churn_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9194 -0.6816 -0.2978 0.6649 3.0829
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.001e+00 2.305e-01 -4.343 1.41e-05 ***
## gender 2.617e-03 7.468e-02 0.035 0.972049
## SeniorCitizen 2.150e-01 9.721e-02 2.211 0.027007 *
## Partner -7.055e-02 8.997e-02 -0.784 0.432966
## Dependents -4.931e-02 1.034e-01 -0.477 0.633547
## tenure1-2 years -9.065e-01 1.202e-01 -7.542 4.63e-14 ***
## tenure2-3 years -1.124e+00 1.690e-01 -6.649 2.95e-11 ***
## tenure3-4 years -8.999e-01 2.231e-01 -4.033 5.51e-05 ***
## tenure4-5 years -9.361e-01 2.887e-01 -3.242 0.001187 **
## tenure5-6 years -1.071e+00 3.798e-01 -2.819 0.004824 **
## PhoneService -1.334e+00 2.009e-01 -6.641 3.11e-11 ***
## MultipleLines 1.928e-01 9.566e-02 2.016 0.043832 *
## InternetService -3.130e-01 2.214e-01 -1.414 0.157508
## OnlineSecurity -4.501e-01 9.756e-02 -4.614 3.96e-06 ***
## OnlineBackup -2.892e-01 8.974e-02 -3.222 0.001271 **
## DeviceProtection -1.937e-01 9.363e-02 -2.069 0.038557 *
## TechSupport -5.157e-01 9.935e-02 -5.191 2.10e-07 ***
## StreamingTV -1.361e-01 1.030e-01 -1.322 0.186270
## StreamingMovies -8.621e-02 1.039e-01 -0.830 0.406643
## Contract1 -8.050e-01 1.236e-01 -6.513 7.35e-11 ***
## Contract2 -1.733e+00 2.031e-01 -8.533 < 2e-16 ***
## PaperlessBilling 2.631e-01 8.547e-02 3.078 0.002085 **
## PaymentMethod2 -4.097e-02 1.300e-01 -0.315 0.752633
## PaymentMethod3 3.549e-01 1.076e-01 3.297 0.000976 ***
## PaymentMethod4 4.040e-02 1.319e-01 0.306 0.759406
## MonthlyCharges 4.043e-02 4.511e-03 8.964 < 2e-16 ***
## TotalCharges -1.241e-04 6.837e-05 -1.815 0.069523 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6113.6 on 5282 degrees of freedom
## Residual deviance: 4436.9 on 5256 degrees of freedom
## AIC: 4490.9
##
## Number of Fisher Scoring iterations: 6
library(car)
## Warning: package 'car' was built under R version 4.0.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.0.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(mod_glm)
## GVIF Df GVIF^(1/(2*Df))
## gender 1.005763 1 1.002877
## SeniorCitizen 1.136703 1 1.066163
## Partner 1.404981 1 1.185319
## Dependents 1.323744 1 1.150541
## tenure 11.984596 5 1.281924
## PhoneService 2.695193 1 1.641704
## MultipleLines 1.621080 1 1.273216
## InternetService 3.291622 1 1.814283
## OnlineSecurity 1.153353 1 1.073943
## OnlineBackup 1.271964 1 1.127814
## DeviceProtection 1.373446 1 1.171941
## TechSupport 1.192909 1 1.092204
## StreamingTV 1.856190 1 1.362421
## StreamingMovies 1.889904 1 1.374738
## Contract 1.769217 2 1.153308
## PaperlessBilling 1.128832 1 1.062465
## PaymentMethod 1.385136 3 1.055801
## MonthlyCharges 10.566353 1 3.250593
## TotalCharges 14.638476 1 3.826026
Observation: tenure, PhoneService, InternetService, MonthlyCharges, and TotalCharges strongly demonstrated a correlation to Churn.
# Use ANOVA to assess model 01
anova(mod_glm, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Churn
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 5282 6113.6
## gender 1 0.25 5281 6113.3 0.61395
## SeniorCitizen 1 108.32 5280 6005.0 < 2.2e-16 ***
## Partner 1 128.19 5279 5876.8 < 2.2e-16 ***
## Dependents 1 27.36 5278 5849.5 1.689e-07 ***
## tenure 5 573.51 5273 5275.9 < 2.2e-16 ***
## PhoneService 1 0.91 5272 5275.0 0.34130
## MultipleLines 1 133.87 5271 5141.2 < 2.2e-16 ***
## InternetService 1 247.39 5270 4893.8 < 2.2e-16 ***
## OnlineSecurity 1 62.54 5269 4831.2 2.617e-15 ***
## OnlineBackup 1 3.59 5268 4827.7 0.05815 .
## DeviceProtection 1 1.24 5267 4826.4 0.26523
## TechSupport 1 54.96 5266 4771.4 1.231e-13 ***
## StreamingTV 1 30.06 5265 4741.4 4.180e-08 ***
## StreamingMovies 1 20.35 5264 4721.0 6.440e-06 ***
## Contract 2 145.02 5262 4576.0 < 2.2e-16 ***
## PaperlessBilling 1 19.90 5261 4556.1 8.176e-06 ***
## PaymentMethod 3 33.27 5258 4522.8 2.829e-07 ***
## MonthlyCharges 1 82.76 5257 4440.1 < 2.2e-16 ***
## TotalCharges 1 3.23 5256 4436.9 0.07211 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Observation: Important features are extracted, including: SeniorCitizen, Partner, Dependents, tenure, MultipleLines, InternetService, OnlineSecurity, TechSupport, StreamingTV, StreamingMovies, Contract, PaperlessBilling, PaymentMethod, MonthlyCharges.
The expected outcome is that we had sufficient evidences to reject the null hypothesis so that we might obtain significant features for our model in following steps.
We used Chi-square distribution to test on one variance.
# model 02 of logistic regression, omitted top VIFs (variance inflation factor)
mod3_glm <- glm(Churn ~ ., family = "binomial",
data = subset(churn_train, select = -c(tenure, PhoneService,
InternetService,
MonthlyCharges,
TotalCharges ))
)
# Use ANOVA to assess the significance of model 02
anova(mod3_glm, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Churn
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 5282 6113.6
## gender 1 0.25 5281 6113.3 0.6139501
## SeniorCitizen 1 108.32 5280 6005.0 < 2.2e-16 ***
## Partner 1 128.19 5279 5876.8 < 2.2e-16 ***
## Dependents 1 27.36 5278 5849.5 1.689e-07 ***
## MultipleLines 1 11.70 5277 5837.8 0.0006249 ***
## OnlineSecurity 1 115.87 5276 5721.9 < 2.2e-16 ***
## OnlineBackup 1 8.89 5275 5713.0 0.0028733 **
## DeviceProtection 1 4.35 5274 5708.7 0.0369095 *
## TechSupport 1 60.72 5273 5647.9 6.578e-15 ***
## StreamingTV 1 67.38 5272 5580.5 2.235e-16 ***
## StreamingMovies 1 30.75 5271 5549.8 2.942e-08 ***
## Contract 2 683.20 5269 4866.6 < 2.2e-16 ***
## PaperlessBilling 1 36.71 5268 4829.9 1.370e-09 ***
## PaymentMethod 3 69.52 5265 4760.4 5.405e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Compare deviance of 14 regressors in model 02 to the 95th percentile of chi-square distribution with relevant degree of freedom (‘df’).
qchisq(0.95, 5282-5281) # gender, df = 5281
## [1] 3.841459
qchisq(0.95, 5282-5280) # SeniorCitizen , df = 5280
## [1] 5.991465
qchisq(0.95, 5282-5279) # Partner , df = 5279
## [1] 7.814728
qchisq(0.95, 5282-5278) # Dependents, df = 5278
## [1] 9.487729
qchisq(0.95, 5282-5277) # MultipleLines, df = 5277
## [1] 11.0705
qchisq(0.95, 5282-5276) # OnlineSecurity, df = 5276
## [1] 12.59159
qchisq(0.95, 5282-5275) # OnlineBackup, df = 5275
## [1] 14.06714
qchisq(0.95, 5282-5274) # DeviceProtection, df = 5274
## [1] 15.50731
qchisq(0.95, 5282-5273) # TechSupport, df = 5273
## [1] 16.91898
qchisq(0.95, 5282-5272) # StreamingTV, df = 5272
## [1] 18.30704
qchisq(0.95, 5282-5271) # StreamingMovies, df = 5271
## [1] 19.67514
qchisq(0.95, 5282-5269) # Contract, df = 5269
## [1] 22.36203
qchisq(0.95, 5282-5268) # PaperlessBilling, df = 5268
## [1] 23.68479
qchisq(0.95, 5282-5265) # PaymentMethod, df = 5265
## [1] 27.58711
Observation: gender, OnlineBackup, DeviceProtection having respective deviance to be smaller than the 95th percentile of chi-square distribution, hence would be regarded as consistent with the null hypothesis at the conventional 5% level. In other words, gender, OnlineBackup, and DeviceProtection add very little to the model.
mod_glm_selected <- glm(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
MultipleLines + InternetService + OnlineSecurity +
TechSupport + StreamingTV + StreamingMovies +
Contract + PaperlessBilling + PaymentMethod + MonthlyCharges
, family = "binomial", data = churn_train)
summary(mod_glm_selected)
##
## Call:
## glm(formula = Churn ~ SeniorCitizen + Partner + Dependents +
## tenure + MultipleLines + InternetService + OnlineSecurity +
## TechSupport + StreamingTV + StreamingMovies + Contract +
## PaperlessBilling + PaymentMethod + MonthlyCharges, family = "binomial",
## data = churn_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8621 -0.6903 -0.3004 0.7049 3.0551
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.767444 0.171006 -10.336 < 2e-16 ***
## SeniorCitizen 0.265787 0.095897 2.772 0.005578 **
## Partner -0.070357 0.089112 -0.790 0.429797
## Dependents -0.075775 0.102569 -0.739 0.460049
## tenure1-2 years -0.968387 0.110615 -8.755 < 2e-16 ***
## tenure2-3 years -1.299971 0.132655 -9.800 < 2e-16 ***
## tenure3-4 years -1.192831 0.145308 -8.209 2.23e-16 ***
## tenure4-5 years -1.346732 0.158141 -8.516 < 2e-16 ***
## tenure5-6 years -1.683063 0.187408 -8.981 < 2e-16 ***
## MultipleLines 0.231883 0.092820 2.498 0.012482 *
## InternetService 0.583354 0.171464 3.402 0.000668 ***
## OnlineSecurity -0.412234 0.095988 -4.295 1.75e-05 ***
## TechSupport -0.485519 0.097489 -4.980 6.35e-07 ***
## StreamingTV 0.092654 0.095381 0.971 0.331345
## StreamingMovies 0.156456 0.095475 1.639 0.101274
## Contract1 -0.868004 0.122566 -7.082 1.42e-12 ***
## Contract2 -1.781605 0.202378 -8.803 < 2e-16 ***
## PaperlessBilling 0.291618 0.084470 3.452 0.000556 ***
## PaymentMethod2 -0.037012 0.129243 -0.286 0.774590
## PaymentMethod3 0.397903 0.106547 3.735 0.000188 ***
## PaymentMethod4 -0.010390 0.130511 -0.080 0.936546
## MonthlyCharges 0.015913 0.002754 5.778 7.58e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6113.6 on 5282 degrees of freedom
## Residual deviance: 4491.0 on 5261 degrees of freedom
## AIC: 4535
##
## Number of Fisher Scoring iterations: 6
churn_test$Churn <- as.character(churn_test$Churn)
fitted.results <- predict(mod_glm_selected ,newdata = churn_test, type = 'response')
fitted.results <- ifelse(fitted.results > 0.5, 1, 0)
misClasificError <- mean(fitted.results != churn_test$Churn)
print(paste('Logistic Regression Accuracy', 1-misClasificError))
## [1] "Logistic Regression Accuracy 0.800568181818182"
# Find the exact lower and upper bounds to the 95% confidence intervals for the coefficients.
exp(cbind(Odds_ratio = coef(mod_glm_selected), confint(mod_glm_selected)) )
## Waiting for profiling to be done...
## Odds_ratio 2.5 % 97.5 %
## (Intercept) 0.1707688 0.1216413 0.2378745
## SeniorCitizen 1.3044571 1.0807400 1.5740477
## Partner 0.9320610 0.7826974 1.1100265
## Dependents 0.9270250 0.7577434 1.1329072
## tenure1-2 years 0.3796950 0.3052494 0.4710040
## tenure2-3 years 0.2725396 0.2095856 0.3526161
## tenure3-4 years 0.3033614 0.2275554 0.4023388
## tenure4-5 years 0.2600890 0.1901987 0.3536584
## tenure5-6 years 0.1858040 0.1282379 0.2674635
## MultipleLines 1.2609718 1.0514169 1.5129646
## InternetService 1.7920383 1.2826264 2.5129111
## OnlineSecurity 0.6621692 0.5481641 0.7987014
## TechSupport 0.6153779 0.5078957 0.7443953
## StreamingTV 1.0970824 0.9100220 1.3227268
## StreamingMovies 1.1693593 0.9698805 1.4102525
## Contract1 0.4197888 0.3292509 0.5324837
## Contract2 0.1683676 0.1119095 0.2477587
## PaperlessBilling 1.3385912 1.1346704 1.5801867
## PaymentMethod2 0.9636644 0.7477281 1.2412929
## PaymentMethod3 1.4887002 1.2090656 1.8361527
## PaymentMethod4 0.9896636 0.7664536 1.2786404
## MonthlyCharges 1.0160400 1.0105908 1.0215640
Observation: The odds of Churn increase by 36% (exp(1.2826264)10), 31% (exp(1.1346704)10), and 34% (exp(1.2090656)10) for each point scored in top 03 features, including InternetService, PaperlessBilling, and PaymentMethod respectively.*
# Suppose that we used the cut-off is 0.5 in regression model
print("Confusion Matrix for Logistic Regression at threshold of 0.5")
## [1] "Confusion Matrix for Logistic Regression at threshold of 0.5"
table(churn_test$Churn, fitted.results > 0.5)
##
## FALSE TRUE
## 0 1175 118
## 1 233 234
With 0.5 of threshold, there are 233 customers predicted as non-churn but they turned to be churn in fact. Our target, therefore, is to reduce this number as much as possible. In the other words, we aimed at increasing the proportion of True Negative rate or Specificity as evaluating model probability in prediction to non-churn customers who are truly non-churn.
How to make decision on the cut-off/threshold for making “yes” (positive) prediction. Using ROC instead.
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.3
churn.probs <- predict(mod_glm_selected, churn_test, type="response")
# need to create prediction object from ROCR
pr <- prediction(churn.probs, churn_test$Churn)
# plotting ROC curve
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
# AUC value
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8553527
After we run the model on each fold, we average the evaluation metric from each one. So if we ran the model 10 times using ROC, we would average each of the 10 ROC values together. This is a great way to try and prevent overfitting a model.
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
##
## Attaching package: 'caret'
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
# setting a seed for reproducibility
set.seed(10)
# changing the positive class to "Yes"
churnData$Churn <- as.character(churnData$Churn)
churnData$Churn[churnData$Churn == "0"] <- "nonchurn"
churnData$Churn[churnData$Churn == "1"] <- "churn"
# train control
fitControl <- trainControl(## 10-fold CV
method = "repeatedcv",
number = 10,
## repeated 3 times
repeats = 3,
classProbs = TRUE,
summaryFunction = twoClassSummary)
# logistic regression model
logreg <- train(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
MultipleLines + InternetService + OnlineSecurity +
TechSupport + StreamingTV + StreamingMovies +
Contract + PaperlessBilling + PaymentMethod + MonthlyCharges,
data = churnData,
method = "glm",
family = "binomial",
trControl = fitControl,
metric = "ROC")
logreg
## Generalized Linear Model
##
## 7043 samples
## 14 predictor
## 2 classes: 'churn', 'nonchurn'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 6338, 6339, 6339, 6339, 6338, 6338, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8399487 0.4970454 0.8999506
ROC/AUC is 0.84. The true positive rate (sensitivity) is 0.497 and the true negative rate(specificity) is 0.899 .
If we assume that customer acquisition cost is approximately $300, which is the cost to replace a churned customer who is predicted as “nonchurn’ (False Negative). Otherwise, the cost will be five time less expensive to retain an existing customer, or $60 per customer, who is predicted as”churn" (True Positive)
Cost = FN($300) + TP($60) + FP($60) + TN($0)
We start by fitting the model, and making predictions in the form of probabilities.
# fitting the logistic regression model
fit <- glm(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
MultipleLines + InternetService + OnlineSecurity +
TechSupport + StreamingTV + StreamingMovies +
Contract + PaperlessBilling + PaymentMethod + MonthlyCharges,
data = churn_train, family=binomial)
# making predictions
churn.probs <- predict(fit, churn_test, type="response")
# Calculate a test misclassification rate
table(churn_test$Churn) / nrow(churn_test)
##
## 0 1
## 0.7346591 0.2653409
classifying everything as non-churn for a test misclassification rate of 0.265.
Next, let’s create a threshold vector and a cost vector.
# threshold vector
thresh <- seq(0.1,1.0, length = 10)
#cost vector
cost = rep(0,length(thresh))
dim(churn_test)
## [1] 1760 20
# cost as a function of threshold
for (i in 1:length(thresh)){
glm.pred = rep("No", length(churn.probs))
glm.pred <- ifelse(churn.probs > thresh[[i]], 1, 0)
glm.pred <- as.factor(glm.pred)
churn_test$Churn <- as.factor(churn_test$Churn)
x <- confusionMatrix(glm.pred, churn_test$Churn)
TN <- x$table[1]/1760
FP <- x$table[2]/1760
FN <- x$table[3]/1760
TP <- x$table[4]/1760
cost[i] = FN*300 + TP*60 + FP*60 + TN*0
}
## Warning in confusionMatrix.default(glm.pred, churn_test$Churn): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Warning in confusionMatrix.default(glm.pred, churn_test$Churn): Levels are not
## in the same order for reference and data. Refactoring data to match.
Let’s assume that the company is using “simple model” which just defaults to a threshold of 0.5. We need fit that model, make predictions, and calculate the cost.
# simple model - assume threshold is 0.5
glm.pred = rep("No", length(churn.probs))
glm.pred <- ifelse(churn.probs > 0.5, 1, 0)
glm.pred <- as.factor(glm.pred)
churn_test$Churn <- as.factor(churn_test$Churn)
x <- confusionMatrix(glm.pred, churn_test$Churn)
TN <- x$table[1]/1760
FP <- x$table[2]/1760
FN <- x$table[3]/1760
TP <- x$table[4]/1760
cost_simple = FN*300 + TP*60 + FP*60 + TN*0
Finally, we can put all of the results in a dataframe and plot them.
# putting results in a dataframe for plotting
dat <- data.frame(
model = c(rep("optimized",10),"simple"),
cost_per_customer = c(cost, cost_simple),
threshold = c(thresh, 0.5)
)
# plotting
ggplot(dat, aes(x = threshold, y = cost_per_customer, group = model, colour = model)) +
geom_line() +
geom_point()
Looking at the results, we can see that the minimum cost per customer is about $38.00 at a threshold of 0.2. The “simple” model that our company is currently implementing costs about $52.00 per customer, at a threshold of 0.50.
As assumed that we have 1,000,000 customer-base to be switched from existing model to new model, the cost-saving will be:
# cost savings of optimized model (threshold = 0.2) compared to baseline model (threshold = 0.5)
savings_per_customer = cost_simple - min(cost)
total_savings = 1000000*savings_per_customer
total_savings
## [1] 13261364
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
churn_train$Churn <- factor(churn_train$Churn)
churn_test$Churn <- factor(churn_test$Churn)
# Important variables
mod_rf <- randomForest(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
MultipleLines + InternetService + OnlineSecurity +
TechSupport + StreamingTV + StreamingMovies +
Contract + PaperlessBilling + PaymentMethod + MonthlyCharges,
data = churn_train)
order(varImp(mod_rf), decreasing=TRUE)
## [1] 14 11 4 13 8 7 12 6 5 2 1 3 9 10
# Calculate the number of principle components needed to capture 90% of the variance
preProc_sub <- preProcess(churn_train, method="pca", thresh=0.9)
preProc_sub
## Created from 5283 samples and 20 variables
##
## Pre-processing:
## - centered (16)
## - ignored (4)
## - principal component signal extraction (16)
## - scaled (16)
##
## PCA needed 12 components to capture 90 percent of the variance
plot(mod_rf)
At number of trees is 200, the error is no longer improved so that we chose ntrees is 200 to tune our model.
control_rf <- trainControl(method = "cv", 5)
model_rf <- train(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
MultipleLines + InternetService + OnlineSecurity +
TechSupport + StreamingTV + StreamingMovies +
Contract + PaperlessBilling + PaymentMethod + MonthlyCharges,
data = churn_train, method="rf",
trControl=control_rf, ntree=200
)
model_rf
## Random Forest
##
## 5283 samples
## 14 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4227, 4225, 4227, 4227, 4226
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7825128 0.3564390
## 11 0.7722918 0.3759235
## 21 0.7607438 0.3524874
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
predict_rf <- predict(model_rf, churn_test )
confusionMatrix(churn_test$Churn, predict_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1213 80
## 1 291 176
##
## Accuracy : 0.7892
## 95% CI : (0.7694, 0.808)
## No Information Rate : 0.8545
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3681
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8065
## Specificity : 0.6875
## Pos Pred Value : 0.9381
## Neg Pred Value : 0.3769
## Prevalence : 0.8545
## Detection Rate : 0.6892
## Detection Prevalence : 0.7347
## Balanced Accuracy : 0.7470
##
## 'Positive' Class : 0
##
# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(churn_test$Churn, predict_rf)$overall[1])
OOSE
## [1] 0.2107955
Accuracy is 0.7869 Out-of-sample-error in test set is 0.21 Sensitivity or True Positive rate is 0.8036 Specificity or True Negative Rate is 0.6855
plot(model_rf)
dim(churn_train)
## [1] 5283 20
churn_train <- as.data.frame(churn_train)
t <- tuneRF(churn_train[, -20], churn_train[, 20], stepFactor = 0.5, plot = TRUE,
ntreeTry = 200, trace = TRUE, improve = 0.05)
## mtry = 4 OOB error = 20.67%
## Searching left ...
## mtry = 8 OOB error = 21.16%
## -0.02380952 0.05
## Searching right ...
## mtry = 2 OOB error = 20.42%
## 0.01190476 0.05
At mtry=2, Out-of-Sample-error is lowest.
rfModel_new <- randomForest(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
MultipleLines + InternetService + OnlineSecurity +
TechSupport + StreamingTV + StreamingMovies +
Contract + PaperlessBilling + PaymentMethod + MonthlyCharges,
data = churn_train, ntree = 200, mtry = 2, importance = TRUE,
proximity = TRUE )
print(rfModel_new)
##
## Call:
## randomForest(formula = Churn ~ SeniorCitizen + Partner + Dependents + tenure + MultipleLines + InternetService + OnlineSecurity + TechSupport + StreamingTV + StreamingMovies + Contract + PaperlessBilling + PaymentMethod + MonthlyCharges, data = churn_train, ntree = 200, mtry = 2, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 20.99%
## Confusion matrix:
## 0 1 class.error
## 0 3589 292 0.07523834
## 1 817 585 0.58273894
Out-of-sample-error in test set is 20.99%
pred_rf_new <- predict(rfModel_new, churn_test)
confusionMatrix(pred_rf_new, churn_test$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1207 264
## 1 86 203
##
## Accuracy : 0.8011
## 95% CI : (0.7817, 0.8196)
## No Information Rate : 0.7347
## P-Value [Acc > NIR] : 4.636e-11
##
## Kappa : 0.4192
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9335
## Specificity : 0.4347
## Pos Pred Value : 0.8205
## Neg Pred Value : 0.7024
## Prevalence : 0.7347
## Detection Rate : 0.6858
## Detection Prevalence : 0.8358
## Balanced Accuracy : 0.6841
##
## 'Positive' Class : 0
##
Accuracy : 0.8011 Sensitivity or True Positive Rate : 0.9335 Specificity or True Negative Rate : 0.4347
varImpPlot(rfModel_new, sort=T, n.var = 10, main = 'Top 10 Feature Importance')
plot(pred_rf_new, churn_test$Churn,
xlab = "Predicted", ylab = "Actual",
main = "Predicted vs Actual: Random Forest, Test Data",
col = "dodgerblue", pch = 20)
grid()
abline(0, 1, col = "darkorange", lwd = 2)
library(caret)
churn_train$Churn <- factor(churn_train$Churn)
churn_test$Churn <- factor(churn_test$Churn)
# build model with Classification Tree
modFit.rpart <- train(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
MultipleLines + InternetService + OnlineSecurity +
TechSupport + StreamingTV + StreamingMovies +
Contract + PaperlessBilling + PaymentMethod +
MonthlyCharges,
method = "rpart", data = churn_train )
pred.rpart <- predict(modFit.rpart, churn_test)
confusionMatrix(pred.rpart, churn_test$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1195 292
## 1 98 175
##
## Accuracy : 0.7784
## 95% CI : (0.7583, 0.7976)
## No Information Rate : 0.7347
## P-Value [Acc > NIR] : 1.307e-05
##
## Kappa : 0.3447
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9242
## Specificity : 0.3747
## Pos Pred Value : 0.8036
## Neg Pred Value : 0.6410
## Prevalence : 0.7347
## Detection Rate : 0.6790
## Detection Prevalence : 0.8449
## Balanced Accuracy : 0.6495
##
## 'Positive' Class : 0
##
# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(churn_test$Churn, pred.rpart)$overall[1])
OOSE
## [1] 0.2215909
Accuracy is 0.7784 Out-of-sample-error in test set is 0.22 Sensitivity or True Positive rate is 0.9242 Specificity or True Negative Rate is 0.3747
library(rlang)
## Warning: package 'rlang' was built under R version 4.0.3
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
## splice
library(rpart)
## Warning: package 'rpart' was built under R version 4.0.3
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.3
churn_tree = rpart(Churn ~ SeniorCitizen + Partner + Dependents + tenure +
MultipleLines + InternetService + OnlineSecurity +
TechSupport + StreamingTV + StreamingMovies +
Contract + PaperlessBilling + PaymentMethod + MonthlyCharges,
data = churn_train )
rpart.plot(churn_tree)
churn_tree_tst_pred = predict(churn_tree, churn_test, type = "class")
table(predicted = churn_tree_tst_pred, actual = churn_test$Churn)
## actual
## predicted 0 1
## 0 1228 302
## 1 65 165
# funct to predict accuracy of Classification tree model
calc_acc = function(actual, predicted) {
mean(actual == predicted)
}
# Accuracy
(tree_tst_acc = calc_acc(predicted = churn_tree_tst_pred, actual = churn_test$Churn))
## [1] 0.7914773
Accuracy: 0.79
confusionMatrix(churn_tree_tst_pred, churn_test$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1228 302
## 1 65 165
##
## Accuracy : 0.7915
## 95% CI : (0.7717, 0.8102)
## No Information Rate : 0.7347
## P-Value [Acc > NIR] : 1.844e-08
##
## Kappa : 0.3617
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9497
## Specificity : 0.3533
## Pos Pred Value : 0.8026
## Neg Pred Value : 0.7174
## Prevalence : 0.7347
## Detection Rate : 0.6977
## Detection Prevalence : 0.8693
## Balanced Accuracy : 0.6515
##
## 'Positive' Class : 0
##
# Out-of-sample-error in test set
OOSE <- 1 - as.numeric(confusionMatrix(churn_test$Churn, churn_tree_tst_pred)$overall[1])
OOSE
## [1] 0.2085227
Accuracy is 0.7915 Out-of-sample-error in test set is 0.208 Sensitivity or True Positive rate is 0.9497 Specificity or True Negative Rate is 0.3533