# 1. Data.
# 2. Data Cleaning.
# 3. EDA.
# 4. Preparing To Model The Data.
# 5. Building Logistic Regression.
# 6. Model Evaluation.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ggplot2)
library(ggthemes)
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
##
## theme_map
library(stringr)
library(ggcorrplot)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(rlang)
library(caret)
## Loading required package: lattice
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(dplyr)
library(ggplot2)
library(cowplot)
library(glue)
library(scales)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
data_raw <- read_csv("~/Downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv")
## Rows: 7043 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): customerID, gender, Partner, Dependents, PhoneService, MultipleLin...
## dbl (4): SeniorCitizen, tenure, MonthlyCharges, TotalCharges
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_raw <- data_raw[,-1]
dim(data_raw)
## [1] 7043 20
#remove CustomerID
data_raw$id <-c(1:nrow(data_raw))
data_raw <- data_raw[,c(21,1:20)]
This project has two main objective and they are as follows:
The data has 7043 observations and 20 variables. The target variable is “Churn” with a binary response(“Yes”,“No”) and the rest are indpendent variables.
str(data_raw)
## tibble [7,043 × 21] (S3: tbl_df/tbl/data.frame)
## $ id : int [1:7043] 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : chr [1:7043] "Female" "Male" "Male" "Male" ...
## $ SeniorCitizen : num [1:7043] 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : chr [1:7043] "Yes" "No" "No" "No" ...
## $ Dependents : chr [1:7043] "No" "No" "No" "No" ...
## $ tenure : num [1:7043] 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : chr [1:7043] "No" "Yes" "Yes" "No" ...
## $ MultipleLines : chr [1:7043] "No phone service" "No" "No" "No phone service" ...
## $ InternetService : chr [1:7043] "DSL" "DSL" "DSL" "DSL" ...
## $ OnlineSecurity : chr [1:7043] "No" "Yes" "Yes" "Yes" ...
## $ OnlineBackup : chr [1:7043] "Yes" "No" "Yes" "No" ...
## $ DeviceProtection: chr [1:7043] "No" "Yes" "No" "Yes" ...
## $ TechSupport : chr [1:7043] "No" "No" "No" "Yes" ...
## $ StreamingTV : chr [1:7043] "No" "No" "No" "No" ...
## $ StreamingMovies : chr [1:7043] "No" "No" "No" "No" ...
## $ Contract : chr [1:7043] "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ PaperlessBilling: chr [1:7043] "Yes" "No" "Yes" "No" ...
## $ PaymentMethod : chr [1:7043] "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ MonthlyCharges : num [1:7043] 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num [1:7043] 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : chr [1:7043] "No" "No" "Yes" "No" ...
data_raw<- clean_names(data_raw)
head(data_raw,6)
## # A tibble: 6 × 21
## id gender senior_citizen partner dependents tenure phone_service
## <int> <chr> <dbl> <chr> <chr> <dbl> <chr>
## 1 1 Female 0 Yes No 1 No
## 2 2 Male 0 No No 34 Yes
## 3 3 Male 0 No No 2 Yes
## 4 4 Male 0 No No 45 No
## 5 5 Female 0 No No 2 Yes
## 6 6 Female 0 No No 8 Yes
## # ℹ 14 more variables: multiple_lines <chr>, internet_service <chr>,
## # online_security <chr>, online_backup <chr>, device_protection <chr>,
## # tech_support <chr>, streaming_tv <chr>, streaming_movies <chr>,
## # contract <chr>, paperless_billing <chr>, payment_method <chr>,
## # monthly_charges <dbl>, total_charges <dbl>, churn <chr>
summary(data_raw)
## id gender senior_citizen partner
## Min. : 1 Length:7043 Min. :0.0000 Length:7043
## 1st Qu.:1762 Class :character 1st Qu.:0.0000 Class :character
## Median :3522 Mode :character Median :0.0000 Mode :character
## Mean :3522 Mean :0.1621
## 3rd Qu.:5282 3rd Qu.:0.0000
## Max. :7043 Max. :1.0000
##
## dependents tenure phone_service multiple_lines
## Length:7043 Min. : 0.00 Length:7043 Length:7043
## Class :character 1st Qu.: 9.00 Class :character Class :character
## Mode :character Median :29.00 Mode :character Mode :character
## Mean :32.37
## 3rd Qu.:55.00
## Max. :72.00
##
## internet_service online_security online_backup device_protection
## Length:7043 Length:7043 Length:7043 Length:7043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## tech_support streaming_tv streaming_movies contract
## Length:7043 Length:7043 Length:7043 Length:7043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## paperless_billing payment_method monthly_charges total_charges
## Length:7043 Length:7043 Min. : 18.25 Min. : 18.8
## Class :character Class :character 1st Qu.: 35.50 1st Qu.: 401.4
## Mode :character Mode :character Median : 70.35 Median :1397.5
## Mean : 64.76 Mean :2283.3
## 3rd Qu.: 89.85 3rd Qu.:3794.7
## Max. :118.75 Max. :8684.8
## NA's :11
## churn
## Length:7043
## Class :character
## Mode :character
##
##
##
##
convert senior_citizen to a factor data type
data_raw$senior_citizen <- factor(data_raw$senior_citizen)
> We have 17 variables of factor data type and 3 variables of numeric data type.
The only column has missing values is “total_charges” with 11 NA’s. Its very small proprtion to the total rows and therfor i will drop NA’s records.
summary(data_raw$total_charges)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.8 401.4 1397.5 2283.3 3794.7 8684.8 11
# drop NA's rows
data_raw <- data_raw[!is.na(data_raw$total_charges),]
data <- data_raw
dim(data)
## [1] 7032 21
In ordr to check for outliers i would use the histogram plot for the three numeric variables(tenure,total_charges and monthly_charges).As plot below there are no outliers.
options(repr.plot.width=10, repr.plot.height=7)
hist(data$tenure,main="Tenure Histogram",col = "red",xlab="Tenure",ylab="Count")
hist(data$monthly_charges,main="Monthly Chargers Histogram",col = "green",xlab="Monthly_charges",ylab="Count")
hist(data$total_charges,main="Total Chargers Histogram",col = "purple",xlab="Total_charges",ylab="Count")
options(warn=-1)
sam <- theme(plot.background = element_rect(fill="#F5FFFA",color = "darkblue"),
plot.title = element_text(size=25, hjust=.5),
axis.title.x = element_text(size=25, color = "black"),
axis.title.y = element_text(size=25, color = "black"),
axis.text.x = element_text(size=20),
axis.text.y = element_text(size=20),
legend.position = "top",
legend.text = element_text(size=20),
legend.title = element_text(size=20))
churn <- data %>%
group_by(churn) %>%
summarise(n = n())%>%
mutate(prop = n / sum(n)) %>%
ungroup()%>%
mutate(label_text = str_glue("n:{n} \n prop:{scales::percent(prop)}"))
churn
## # A tibble: 2 × 4
## churn n prop label_text
## <chr> <int> <dbl> <glue>
## 1 No 5163 0.734 n:5163
## prop:73%
## 2 Yes 1869 0.266 n:1869
## prop:27%
options(repr.plot.width=15, repr.plot.height=10)
churn1 <- churn %>% ggplot(aes(x = churn,y = prop,fill = churn)) +
geom_col(alpha=0.7,color="black") +
geom_label(aes(label=label_text),fill="white",size =8,position=position_fill(vjust=0.3),color = "black",size = 10)+
xlab("Churn(Yes,NO)") +
ylab("Prop") +
ggtitle("Churn Bar Graph Distribution")+
scale_y_continuous(labels = scales::percent_format())+
theme_minimal()+
sam
churn1
options(repr.plot.width=20, repr.plot.height=15)
data %>% ggplot(mapping = aes(x = tenure)) +
geom_bar(aes(fill = churn),color="black",alpha=0.7) +
theme_minimal()+
xlab("Tenure")+
ylab("Count") +
ggtitle("Tenure Bar Graph with Churn Overlay\n(Not Normalized)")+
theme(legend.position = "none") +
sam
data %>% ggplot(mapping = aes(x = tenure)) +
geom_bar(aes(fill = churn),position = 'fill',color="black",alpha=0.7) +
scale_y_continuous(labels = scales::percent_format())+
theme_minimal()+
xlab("Tenure")+
ylab("Prop") +
ggtitle("Tenure Bar Graph with Churn Overlay \n(Normalized)") +
sam
data %>% ggplot(aes(x = tenure,fill = churn)) +
geom_density(alpha=0.7,color="black") +
xlab("Tenure") +
ylab("Prop") +
ggtitle("Tenure Density Graph with Churn") +
theme_minimal()+
sam
options(repr.plot.width=20, repr.plot.height=15)
data %>% ggplot(mapping = aes(x = monthly_charges)) +
geom_bar(aes(fill = churn),alpha=0.7) +
theme_minimal()+
xlab("Monthly Charges ($)")+
ylab("Count") +
ggtitle("Monthly Charges Bar Graph with Churn Overlay\n(Not Normalized)")+
theme(legend.position = "none") +
sam
data %>% ggplot(mapping = aes(x = monthly_charges)) +
geom_bar(aes(fill = churn),position = 'fill',alpha=0.7) +
scale_y_continuous(labels = scales::percent_format())+
theme_minimal()+
xlab("Monthly Charges ($)")+
ylab("Prop") +
ggtitle("Monthly Charges Bar Graph with Churn Overlay \n(Normalized)") +
sam
data %>% ggplot(aes(x = monthly_charges,fill = churn)) +
geom_density(alpha=0.7,color="black") +
xlab("Monthly Charges") +
ylab("Prop") +
ggtitle("Monthly Charges Density Graph with Churn") +
theme_minimal()+
sam
The majority of our customers has low monthly charges. Those with the highest proportion of positive churn(left our platform) are the ones with high monthly charges(between ‘70’ to ‘112’$ / month)
options(repr.plot.width=20, repr.plot.height=15)
options(warn=-1)
data %>% ggplot(mapping = aes(x = total_charges)) +
geom_bar(aes(fill = churn),alpha = 0.7) +
theme_minimal()+
xlab("Total Charges ($)")+
ylab("Count") +
ggtitle("Total Charges Bar Graph with Churn Overlay\n(Not Normalized)")+
theme(legend.position = "none") +
sam
data %>% ggplot(mapping = aes(x = total_charges)) +
geom_bar(aes(fill = churn),position = 'fill',alpha=0.7) +
scale_y_continuous(labels = scales::percent_format())+
theme_minimal()+
xlab("Total Charges ($)")+
ylab("Prop") +
ggtitle("Total Charges Bar Graph with Churn Overlay \n(Normalized)") +
sam
data %>% ggplot(aes(x =total_charges,fill = churn)) +
geom_density(alpha=0.7,color="black") +
xlab("Total Charges") +
ylab("Prop") +
ggtitle("Total Charges Density Graph with Churn") +
theme_minimal()+
sam
ggcorrplot(round(cor(data[,c(6,19,20)]),2), title = "Correlation Matrix",hc.order = TRUE,lab=TRUE,type = "lower",lab_size = 5) + theme(plot.title=element_text(hjust = 0.5,size = 25),axis.text.y = element_text(size = 25),axis.text.x = element_text(size = 15),legend.text = element_text(size = 20))
options(repr.plot.width=15, repr.plot.height=22)
ggpairs(data,columns = c(6,19,20),ggplot2::aes(color = churn)) +
theme_economist()+
sam
Strong positive correlation between tenure and total charges.
weak positive correlation between tenure and monthly charges.
medium to strong positive correlation between monthly charges and total charges.
library(glue)
plot_facet_bar <- function(data, target, list_of_variables) {
target <- sym(target) # Converting the string to a column reference
plt_matrix <- list()
for (i in seq_along(list_of_variables)) {
column <- list_of_variables[i]
col <- sym(column)
temp <- data %>%
group_by(!!col, !!target) %>%
summarize(count = n(), .groups = 'drop') %>%
mutate(prop = round(count / sum(count), 2)) %>%
ungroup() %>%
mutate(label_text = glue("n: {count}\nprop: {percent(prop)}"))
plt_matrix[[i]] <- ggplot(data = temp, aes(x = !!col, y = prop, fill = !!target)) +
geom_bar(stat = "identity", alpha = 0.7, color = "black") +
geom_label(aes(label = label_text), size = 3, hjust = 0.5, fill = "white", color = "black") +
scale_y_continuous(labels = percent_format()) +
labs(x = column, y = "Proportion", title = paste("Distribution of", target, "across", column)) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 6, face = "bold"),
axis.title = element_text(size = 14),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)
)
}
options(repr.plot.width = 30, repr.plot.height = 20)
plot_grid(plotlist = plt_matrix, ncol = 2)
}
plot_facet_bar(data, 'churn', c('gender', 'senior_citizen', 'partner', 'dependents'))
There is no insight we could get from gender as both female and male have almost the same proprtion with regard of the churn variable.
42% of senior citizen have left our platform the last month. 76% of not senior citizen did not leave.
33% of our customers with no partner left our platform.80% of customers with partner didnt leave.
31% of our customers who left our platform are not dependents. 84% of our customers who didnt leave are dependents.
plot_categorical_vs_target <- function(data, target, list_of_variables) {
target <- sym(target) # Converting the string to a column reference
plt_matrix <- list()
for (i in seq_along(list_of_variables)) {
column <- list_of_variables[i]
col <- sym(column)
temp <- data %>%
group_by(!!col, !!target) %>%
summarize(count = n(), .groups = 'drop') %>%
mutate(prop = round(count / sum(count), 2)) %>%
ungroup() %>%
mutate(label_text = glue("n: {count}\nprop: {percent(prop)}"))
plt_matrix[[i]] <- ggplot(data = temp, aes(x = !!col, y = prop, fill = !!target)) +
geom_bar(stat = "identity", alpha = 0.7, color = "black", position = position_dodge(width = 0.9)) +
geom_text(aes(label = label_text), size = 3, position = position_stack(vjust = 0.5), color = "black") +
scale_y_continuous(labels = percent_format()) +
labs(x = column, y = "Proportion", title = paste("Distribution of", target, "across", column)) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 16, face = "bold"),
axis.title = element_text(size = 14),
legend.title = element_text(size = 4),
legend.text = element_text(size = 12)
)
}
options(repr.plot.width = 30, repr.plot.height = 20)
plot_grid(plotlist = plt_matrix, ncol = 2)
}
plot_categorical_vs_target(data, 'churn', c('phone_service', 'multiple_lines', 'internet_service', 'online_security'))
* Overall across the three levels of multiple lines,on average 73% of our customers didnt left our platform compared to 257% who did.
* 42% Customers of fiber optic services left our platform the last month.93% of customers with no internet services did not leave. 81% of our customers with DSL services didnt leave.
* 42% of customers with no online security left our platform the last month.93% of customers with no internet services did not leave. 85% of our customers with online security didnt leave.
plot_dodged_bar <- function(data, target, list_of_variables) {
target <- sym(target) # Converting the string to a column reference
plt_matrix <- list()
for (i in seq_along(list_of_variables)) {
column <- list_of_variables[i]
col <- sym(column)
temp <- data %>%
group_by(!!col, !!target) %>%
summarize(count = n(), .groups = 'drop') %>%
mutate(prop = round(count / sum(count), 2)) %>%
ungroup() %>%
mutate(label_text = glue("n: {count}\nprop: {percent(prop)}"))
plt_matrix[[i]] <- ggplot(data = temp, aes(x = !!col, y = prop, fill = !!target)) +
geom_bar(stat = "identity", alpha = 0.7, color = "black", position = position_dodge(width = 0.9)) +
geom_text(aes(label = label_text), size = 3, position = position_dodge(width = 0.9), vjust = -0.5, color = "black") +
scale_y_continuous(labels = percent_format()) +
labs(x = column, y = "Proportion", title = paste("Distribution of", target, "across", column)) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size = 6, face = "bold"),
axis.title = element_text(size = 4),
legend.title = element_text(size = 4),
legend.text = element_text(size = 6)
)
}
options(repr.plot.width = 30, repr.plot.height = 20)
plot_grid(plotlist = plt_matrix, ncol = 2)
}
# Example usage
plot_dodged_bar(data, 'churn', c('online_backup', 'device_protection', 'tech_support', 'streaming_tv', 'streaming_movies'))
* 93% of our customers with no internet services did leave our platform.
* 40% of customers with no online backup left our platform.75% of Those who have online backup didnt leave.
* 39% of customers with no device protection left our platform.77% of Those who have device protection didnt leave.
* 42% of customers with no tech support left our platform.85% of Those who have tech support didnt leave.
* 34% of customers with no streaming TV left our platform.70% of Those who have streaming TV didnt leave.
* 34% of customers with no streaming movies left our platform.70% of Those who have streaming movies didnt leave.
plot_dodged_bar(data, 'churn', c('contract', 'paperless_billing', 'payment_method'))
* 43% of our customers who have a month-to-month contract left our platform.89% of those have a one contract did not leave.97% of those has two year contract did not leave.
* 34% of our customers who have paperless billing left our platform. 84% of those who did not have paperless billing did not leave.
* 45% of our customers with electronic check as payment method left our platform.
Its important to bin our numeric variables into levels based on its behavior with the target or predictive variable:
False when values are between 25 and 56 OR between 66 and 110.
data$m5bins <- cut(x = data$monthly_charges,breaks = c(0, 26,56,66,110,120),
right = FALSE, labels = c("very-Low","low","medium","high","very-high"),ordered = TRUE)
data %>%
group_by(m5bins,churn)%>%
summarise(count=n())%>%
mutate(prop=round(count/sum(count),2))%>%
ungroup()%>%
mutate(label_text = str_glue("n : {count}\nprop:{scales::percent(prop)}"))%>%
ggplot(aes(x = m5bins,y = prop,fill = churn)) +
geom_col(alpha=0.7,color="black") +
geom_label(aes(label=label_text),fill="white",size =8,color = "black",size = 10)+
xlab("Monthly Charges") +
ylab("Prop") +
ggtitle("Monthly Charges Bar Graph with Churn") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal()+
sam
## `summarise()` has grouped output by 'm5bins'. You can override using the
## `.groups` argument.
data %>% ggplot(aes(x = monthly_charges,fill = churn)) +
geom_density(alpha=0.7,color="black") +
xlab("Monthly Charges") +
ylab("Count") +
ggtitle("Monthly Charges Density Graph with Churn") +
theme_minimal()+
sam
* 28% of Low (monthly charges is between (26 - 55 USD)) monthly charges customers left our platform.
* 36% of high (monthly charges is between (66 - 110 USD)) monthly charges customers left our platform.
Now that we have a full picture on how customer churn behave relative to the indpendent variables in our data i would sum up the characteristics of our potential customer below:
Not a senior citizen
Has a partner.
Has dependents.
Has no internet services.
Has online security.
Has online online backup.
Has online device protection.
Has tech support services.
Has a two year contract.
Has paper billing.
Has credit card(automated) as payment method.
Has a very low monthly charges (below 25 USD).
# Preparing To Model The Data * Randomly Reorder The Data
* Clean Categorical Variables.
* Create Dummy Variables.
* Standardize Continuous Variables.
* Final DataSet.
* Split Data into train data (65%) and test data (35%).
* Randomly Reorder The Data
set.seed(1000)
rows <- sample(nrow(data))
data_1 <- data
data_shuffled <- data_1[rows,]
In EDA , we know some categorical variables such as online services and phone services have “No”,“Yes” and “No internet services” as categories, or “No phone services” as a category . I will condiser them as “No” and have only “No”,“Yes”.
data_shuffled <- data.frame(lapply(data_shuffled, function(x) {
gsub("No internet service", "No", x)}))
data_shuffled <- data.frame(lapply(data_shuffled, function(x) {
gsub("No phone service", "No", x)}))
data_shuffled[,c(1,6,19,20)] <- lapply(data_shuffled[,c(1,6,19,20)],as.numeric)
head(data_shuffled)
## id gender senior_citizen partner dependents tenure phone_service
## 1 5112 Male 1 No No 47 Yes
## 2 581 Female 0 Yes Yes 12 Yes
## 3 4317 Male 0 No No 69 Yes
## 4 1078 Female 0 No No 41 Yes
## 5 4448 Male 0 Yes No 68 Yes
## 6 5484 Female 0 Yes No 55 Yes
## multiple_lines internet_service online_security online_backup
## 1 No Fiber optic Yes No
## 2 No No No No
## 3 Yes DSL Yes Yes
## 4 Yes Fiber optic Yes Yes
## 5 Yes Fiber optic Yes Yes
## 6 Yes Fiber optic No No
## device_protection tech_support streaming_tv streaming_movies contract
## 1 No No No Yes Month-to-month
## 2 No No No No Month-to-month
## 3 Yes Yes Yes Yes Two year
## 4 Yes Yes Yes Yes Month-to-month
## 5 No Yes Yes Yes Month-to-month
## 6 Yes No No No Month-to-month
## paperless_billing payment_method monthly_charges total_charges
## 1 No Mailed check 85.50 4042.30
## 2 No Mailed check 19.00 233.55
## 3 No Mailed check 90.65 6322.10
## 4 Yes Bank transfer (automatic) 114.50 4527.45
## 5 Yes Electronic check 107.70 7320.90
## 6 Yes Electronic check 77.75 4458.15
## churn m5bins
## 1 Yes high
## 2 Yes very-Low
## 3 No high
## 4 Yes very-high
## 5 No high
## 6 Yes high
This can be easily done using ‘dummayVars’ from ‘caret package’. Categorical variables with more than two categoreis such as m5bin ,4 extra column will be created with 0 and 1. total of 25 dummay variables. The full dataset is joining all dummay variables with
d_int <-data_shuffled[,c(1,6,19,20)]
d <- data_shuffled[,-c(1,6,19,20)]
dummy <- data.frame(sapply(data_shuffled[,-c(6,19,20)],function(x) data.frame(model.matrix(~x-1,data =data_shuffled[,-c(6,19,20)]))[,-1]))
head(dummy)
## gender senior_citizen partner dependents phone_service multiple_lines
## 1 1 1 0 0 1 0
## 2 0 0 1 1 1 0
## 3 1 0 0 0 1 1
## 4 0 0 0 0 1 1
## 5 1 0 1 0 1 1
## 6 0 0 1 0 1 1
## internet_service.xFiber.optic internet_service.xNo online_security
## 1 1 0 1
## 2 0 1 0
## 3 0 0 1
## 4 1 0 1
## 5 1 0 1
## 6 1 0 0
## online_backup device_protection tech_support streaming_tv streaming_movies
## 1 0 0 0 0 1
## 2 0 0 0 0 0
## 3 1 1 1 1 1
## 4 1 1 1 1 1
## 5 1 0 1 1 1
## 6 0 1 0 0 0
## contract.xOne.year contract.xTwo.year paperless_billing
## 1 0 0 0
## 2 0 0 0
## 3 0 1 0
## 4 0 0 1
## 5 0 0 1
## 6 0 0 1
## payment_method.xCredit.card..automatic. payment_method.xElectronic.check
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 1
## payment_method.xMailed.check churn m5bins.xlow m5bins.xmedium
## 1 1 1 0 0
## 2 1 1 0 0
## 3 1 0 0 0
## 4 0 1 0 0
## 5 0 0 0 0
## 6 0 1 0 0
## m5bins.xvery.high m5bins.xvery.Low
## 1 0 0
## 2 0 1
## 3 0 0
## 4 1 0
## 5 0 0
## 6 0 0
I will standardize tenure and total charges variables only as the only numeric variables to be i will use in the model.
d_int[,c(2,3,4)] <- scale(d_int[,c(2,3,4)])
str_glue("Mean of Tenure : {round(mean(d_int$tenure),2)} | Standard deviation of tenure : {round(sd(d_int$tenure),2)}")
## Mean of Tenure : 0 | Standard deviation of tenure : 1
str_glue("Mean of Total Charges : {round(mean(d_int$total_charges),2)} | Standard deviation of total_charges : {round(sd(d_int$total_charges),2)}")
## Mean of Total Charges : 0 | Standard deviation of total_charges : 1
str_glue("Mean of Monthly Charges : {round(mean(d_int$monthly_charges),2)} | Standard deviation of monthly charges : {round(sd(d_int$monthly_charges),2)}")
## Mean of Monthly Charges : 0 | Standard deviation of monthly charges : 1
# full dataset
final_data <- cbind(d_int , dummy) # remove monthly charges continous variable
head(final_data)
## id tenure monthly_charges total_charges gender senior_citizen partner
## 1 5112 0.5939319 0.6880878 0.7759934 1 1 0
## 2 581 -0.8320053 -1.5222445 -0.9042599 0 0 1
## 3 4317 1.4902354 0.8592639 1.7817410 1 0 0
## 4 1078 0.3494856 1.6519921 0.9900203 0 0 0
## 5 4448 1.4494943 1.4259732 2.2223677 1 0 1
## 6 5484 0.9198605 0.4304927 0.9594481 0 0 1
## dependents phone_service multiple_lines internet_service.xFiber.optic
## 1 0 1 0 1
## 2 1 1 0 0
## 3 0 1 1 0
## 4 0 1 1 1
## 5 0 1 1 1
## 6 0 1 1 1
## internet_service.xNo online_security online_backup device_protection
## 1 0 1 0 0
## 2 1 0 0 0
## 3 0 1 1 1
## 4 0 1 1 1
## 5 0 1 1 0
## 6 0 0 0 1
## tech_support streaming_tv streaming_movies contract.xOne.year
## 1 0 0 1 0
## 2 0 0 0 0
## 3 1 1 1 0
## 4 1 1 1 0
## 5 1 1 1 0
## 6 0 0 0 0
## contract.xTwo.year paperless_billing payment_method.xCredit.card..automatic.
## 1 0 0 0
## 2 0 0 0
## 3 1 0 0
## 4 0 1 0
## 5 0 1 0
## 6 0 1 0
## payment_method.xElectronic.check payment_method.xMailed.check churn
## 1 0 1 1
## 2 0 1 1
## 3 0 1 0
## 4 0 0 1
## 5 1 0 0
## 6 1 0 1
## m5bins.xlow m5bins.xmedium m5bins.xvery.high m5bins.xvery.Low
## 1 0 0 0 0
## 2 0 0 0 1
## 3 0 0 0 0
## 4 0 0 1 0
## 5 0 0 0 0
## 6 0 0 0 0
* Split Data into train data (65%) and test data (35%).
set.seed(1234)
split <- createDataPartition(y = final_data$churn,p=0.65,list = FALSE)
training_set<-final_data[split,]
testing_set<-final_data[-split,]
table(training_set$churn)
##
## 0 1
## 3353 1218
table(testing_set$churn)
##
## 0 1
## 1810 651
Both train and test data are perfectly balanced.
# Building Logistic Regression Model
model_1 <- glm(churn~.,data = training_set,family = binomial(link = "logit"))
summary(model_1)
##
## Call:
## glm(formula = churn ~ ., family = binomial(link = "logit"), data = training_set)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.172e+00 1.623e+00 -2.570 0.010171
## id 1.923e-05 1.992e-05 0.965 0.334343
## tenure -1.427e+00 1.973e-01 -7.232 4.74e-13
## monthly_charges -1.597e+00 1.203e+00 -1.327 0.184496
## total_charges 5.873e-01 2.121e-01 2.769 0.005623
## gender -7.355e-02 8.073e-02 -0.911 0.362278
## senior_citizen 2.053e-01 1.051e-01 1.954 0.050755
## partner 7.499e-02 9.642e-02 0.778 0.436717
## dependents -1.767e-01 1.110e-01 -1.592 0.111430
## phone_service 6.540e-01 8.211e-01 0.796 0.425750
## multiple_lines 6.360e-01 2.222e-01 2.863 0.004197
## internet_service.xFiber.optic 2.479e+00 1.018e+00 2.434 0.014931
## internet_service.xNo -1.715e+00 1.065e+00 -1.610 0.107412
## online_security -1.571e-01 2.242e-01 -0.701 0.483361
## online_backup 1.254e-01 2.188e-01 0.573 0.566372
## device_protection 2.965e-01 2.216e-01 1.338 0.180958
## tech_support -4.173e-02 2.241e-01 -0.186 0.852263
## streaming_tv 7.794e-01 4.099e-01 1.901 0.057263
## streaming_movies 8.647e-01 4.109e-01 2.105 0.035334
## contract.xOne.year -5.799e-01 1.335e-01 -4.344 1.40e-05
## contract.xTwo.year -1.317e+00 2.251e-01 -5.848 4.97e-09
## paperless_billing 3.441e-01 9.272e-02 3.712 0.000206
## payment_method.xCredit.card..automatic. -5.694e-02 1.424e-01 -0.400 0.689343
## payment_method.xElectronic.check 3.786e-01 1.180e-01 3.209 0.001330
## payment_method.xMailed.check -1.167e-01 1.438e-01 -0.812 0.416995
## m5bins.xlow 8.364e-01 2.696e-01 3.103 0.001918
## m5bins.xmedium 5.322e-01 2.850e-01 1.867 0.061832
## m5bins.xvery.high -3.639e-01 3.432e-01 -1.060 0.289015
## m5bins.xvery.Low 2.579e-01 4.750e-01 0.543 0.587205
##
## (Intercept) *
## id
## tenure ***
## monthly_charges
## total_charges **
## gender
## senior_citizen .
## partner
## dependents
## phone_service
## multiple_lines **
## internet_service.xFiber.optic *
## internet_service.xNo
## online_security
## online_backup
## device_protection
## tech_support
## streaming_tv .
## streaming_movies *
## contract.xOne.year ***
## contract.xTwo.year ***
## paperless_billing ***
## payment_method.xCredit.card..automatic.
## payment_method.xElectronic.check **
## payment_method.xMailed.check
## m5bins.xlow **
## m5bins.xmedium .
## m5bins.xvery.high
## m5bins.xvery.Low
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5299.7 on 4570 degrees of freedom
## Residual deviance: 3769.0 on 4542 degrees of freedom
## AIC: 3827
##
## Number of Fisher Scoring iterations: 6
* Above is the initial model with all variables included. I will use stewise feature selection methods with function called ‘stepAIC’, it will iterate untill the lowest AIC model among all models is discovered.
model_2 <- stepAIC(model_1,direction = "both")
## Start: AIC=3827.01
## churn ~ id + tenure + monthly_charges + total_charges + gender +
## senior_citizen + partner + dependents + phone_service + multiple_lines +
## internet_service.xFiber.optic + internet_service.xNo + online_security +
## online_backup + device_protection + tech_support + streaming_tv +
## streaming_movies + contract.xOne.year + contract.xTwo.year +
## paperless_billing + payment_method.xCredit.card..automatic. +
## payment_method.xElectronic.check + payment_method.xMailed.check +
## m5bins.xlow + m5bins.xmedium + m5bins.xvery.high + m5bins.xvery.Low
##
## Df Deviance AIC
## - tech_support 1 3769.0 3825.0
## - payment_method.xCredit.card..automatic. 1 3769.2 3825.2
## - m5bins.xvery.Low 1 3769.3 3825.3
## - online_backup 1 3769.3 3825.3
## - online_security 1 3769.5 3825.5
## - partner 1 3769.6 3825.6
## - phone_service 1 3769.6 3825.6
## - payment_method.xMailed.check 1 3769.7 3825.7
## - gender 1 3769.8 3825.8
## - id 1 3769.9 3825.9
## - m5bins.xvery.high 1 3770.2 3826.2
## - monthly_charges 1 3770.8 3826.8
## - device_protection 1 3770.8 3826.8
## <none> 3769.0 3827.0
## - dependents 1 3771.6 3827.6
## - internet_service.xNo 1 3771.6 3827.6
## - m5bins.xmedium 1 3772.5 3828.5
## - streaming_tv 1 3772.6 3828.6
## - senior_citizen 1 3772.8 3828.8
## - streaming_movies 1 3773.4 3829.4
## - internet_service.xFiber.optic 1 3775.0 3831.0
## - total_charges 1 3777.0 3833.0
## - multiple_lines 1 3777.2 3833.2
## - m5bins.xlow 1 3779.1 3835.1
## - payment_method.xElectronic.check 1 3779.4 3835.4
## - paperless_billing 1 3782.9 3838.9
## - contract.xOne.year 1 3788.6 3844.6
## - contract.xTwo.year 1 3810.1 3866.1
## - tenure 1 3831.4 3887.4
##
## Step: AIC=3825.04
## churn ~ id + tenure + monthly_charges + total_charges + gender +
## senior_citizen + partner + dependents + phone_service + multiple_lines +
## internet_service.xFiber.optic + internet_service.xNo + online_security +
## online_backup + device_protection + streaming_tv + streaming_movies +
## contract.xOne.year + contract.xTwo.year + paperless_billing +
## payment_method.xCredit.card..automatic. + payment_method.xElectronic.check +
## payment_method.xMailed.check + m5bins.xlow + m5bins.xmedium +
## m5bins.xvery.high + m5bins.xvery.Low
##
## Df Deviance AIC
## - payment_method.xCredit.card..automatic. 1 3769.2 3823.2
## - m5bins.xvery.Low 1 3769.3 3823.3
## - partner 1 3769.6 3823.6
## - payment_method.xMailed.check 1 3769.7 3823.7
## - online_security 1 3769.8 3823.8
## - gender 1 3769.9 3823.9
## - id 1 3770.0 3824.0
## - m5bins.xvery.high 1 3770.2 3824.2
## - online_backup 1 3770.4 3824.4
## <none> 3769.0 3825.0
## - dependents 1 3771.6 3825.6
## - phone_service 1 3772.5 3826.5
## - m5bins.xmedium 1 3772.6 3826.6
## - senior_citizen 1 3772.9 3826.9
## + tech_support 1 3769.0 3827.0
## - device_protection 1 3774.5 3828.5
## - total_charges 1 3777.0 3831.0
## - monthly_charges 1 3778.2 3832.2
## - internet_service.xNo 1 3778.3 3832.3
## - m5bins.xlow 1 3779.3 3833.3
## - payment_method.xElectronic.check 1 3779.5 3833.5
## - paperless_billing 1 3782.9 3836.9
## - streaming_tv 1 3784.0 3838.0
## - streaming_movies 1 3787.1 3841.1
## - contract.xOne.year 1 3788.8 3842.8
## - multiple_lines 1 3793.9 3847.9
## - internet_service.xFiber.optic 1 3797.5 3851.5
## - contract.xTwo.year 1 3810.6 3864.6
## - tenure 1 3831.4 3885.4
##
## Step: AIC=3823.2
## churn ~ id + tenure + monthly_charges + total_charges + gender +
## senior_citizen + partner + dependents + phone_service + multiple_lines +
## internet_service.xFiber.optic + internet_service.xNo + online_security +
## online_backup + device_protection + streaming_tv + streaming_movies +
## contract.xOne.year + contract.xTwo.year + paperless_billing +
## payment_method.xElectronic.check + payment_method.xMailed.check +
## m5bins.xlow + m5bins.xmedium + m5bins.xvery.high + m5bins.xvery.Low
##
## Df Deviance AIC
## - m5bins.xvery.Low 1 3769.5 3821.5
## - payment_method.xMailed.check 1 3769.7 3821.7
## - partner 1 3769.8 3821.8
## - online_security 1 3770.0 3822.0
## - gender 1 3770.0 3822.0
## - id 1 3770.1 3822.1
## - m5bins.xvery.high 1 3770.4 3822.4
## - online_backup 1 3770.6 3822.6
## <none> 3769.2 3823.2
## - dependents 1 3771.8 3823.8
## - phone_service 1 3772.6 3824.6
## - m5bins.xmedium 1 3772.8 3824.8
## - senior_citizen 1 3773.0 3825.0
## + payment_method.xCredit.card..automatic. 1 3769.0 3825.0
## + tech_support 1 3769.2 3825.2
## - device_protection 1 3774.7 3826.7
## - total_charges 1 3777.2 3829.2
## - monthly_charges 1 3778.4 3830.4
## - internet_service.xNo 1 3778.5 3830.5
## - m5bins.xlow 1 3779.5 3831.5
## - paperless_billing 1 3783.0 3835.0
## - streaming_tv 1 3784.2 3836.2
## - payment_method.xElectronic.check 1 3787.2 3839.2
## - streaming_movies 1 3787.3 3839.3
## - contract.xOne.year 1 3789.0 3841.0
## - multiple_lines 1 3794.0 3846.0
## - internet_service.xFiber.optic 1 3797.7 3849.7
## - contract.xTwo.year 1 3810.8 3862.8
## - tenure 1 3831.6 3883.6
##
## Step: AIC=3821.52
## churn ~ id + tenure + monthly_charges + total_charges + gender +
## senior_citizen + partner + dependents + phone_service + multiple_lines +
## internet_service.xFiber.optic + internet_service.xNo + online_security +
## online_backup + device_protection + streaming_tv + streaming_movies +
## contract.xOne.year + contract.xTwo.year + paperless_billing +
## payment_method.xElectronic.check + payment_method.xMailed.check +
## m5bins.xlow + m5bins.xmedium + m5bins.xvery.high
##
## Df Deviance AIC
## - payment_method.xMailed.check 1 3770.0 3820.0
## - partner 1 3770.1 3820.1
## - online_security 1 3770.3 3820.3
## - gender 1 3770.3 3820.3
## - id 1 3770.5 3820.5
## - m5bins.xvery.high 1 3770.6 3820.6
## - online_backup 1 3770.9 3820.9
## <none> 3769.5 3821.5
## - dependents 1 3772.1 3822.1
## - phone_service 1 3772.8 3822.8
## - m5bins.xmedium 1 3772.9 3822.9
## + m5bins.xvery.Low 1 3769.2 3823.2
## + payment_method.xCredit.card..automatic. 1 3769.3 3823.3
## - senior_citizen 1 3773.4 3823.4
## + tech_support 1 3769.5 3823.5
## - device_protection 1 3775.1 3825.1
## - internet_service.xNo 1 3778.8 3828.8
## - total_charges 1 3779.3 3829.3
## - monthly_charges 1 3779.7 3829.7
## - paperless_billing 1 3783.2 3833.2
## - m5bins.xlow 1 3783.6 3833.6
## - streaming_tv 1 3784.8 3834.8
## - payment_method.xElectronic.check 1 3787.6 3837.6
## - streaming_movies 1 3787.9 3837.9
## - contract.xOne.year 1 3789.3 3839.3
## - multiple_lines 1 3794.8 3844.8
## - internet_service.xFiber.optic 1 3797.7 3847.7
## - contract.xTwo.year 1 3811.6 3861.6
## - tenure 1 3837.7 3887.7
##
## Step: AIC=3820.01
## churn ~ id + tenure + monthly_charges + total_charges + gender +
## senior_citizen + partner + dependents + phone_service + multiple_lines +
## internet_service.xFiber.optic + internet_service.xNo + online_security +
## online_backup + device_protection + streaming_tv + streaming_movies +
## contract.xOne.year + contract.xTwo.year + paperless_billing +
## payment_method.xElectronic.check + m5bins.xlow + m5bins.xmedium +
## m5bins.xvery.high
##
## Df Deviance AIC
## - partner 1 3770.6 3818.6
## - online_security 1 3770.8 3818.8
## - gender 1 3770.8 3818.8
## - id 1 3770.9 3818.9
## - m5bins.xvery.high 1 3771.1 3819.1
## - online_backup 1 3771.4 3819.4
## <none> 3770.0 3820.0
## - dependents 1 3772.6 3820.6
## - m5bins.xmedium 1 3773.3 3821.3
## - phone_service 1 3773.3 3821.3
## + payment_method.xMailed.check 1 3769.5 3821.5
## + m5bins.xvery.Low 1 3769.7 3821.7
## - senior_citizen 1 3773.9 3821.9
## + tech_support 1 3770.0 3822.0
## + payment_method.xCredit.card..automatic. 1 3770.0 3822.0
## - device_protection 1 3775.6 3823.6
## - total_charges 1 3779.5 3827.5
## - internet_service.xNo 1 3779.7 3827.7
## - monthly_charges 1 3780.2 3828.2
## - m5bins.xlow 1 3783.8 3831.8
## - paperless_billing 1 3783.9 3831.9
## - streaming_tv 1 3785.5 3833.5
## - streaming_movies 1 3788.6 3836.6
## - contract.xOne.year 1 3789.9 3837.9
## - payment_method.xElectronic.check 1 3795.4 3843.4
## - multiple_lines 1 3795.5 3843.5
## - internet_service.xFiber.optic 1 3798.4 3846.4
## - contract.xTwo.year 1 3812.1 3860.1
## - tenure 1 3838.2 3886.2
##
## Step: AIC=3818.65
## churn ~ id + tenure + monthly_charges + total_charges + gender +
## senior_citizen + dependents + phone_service + multiple_lines +
## internet_service.xFiber.optic + internet_service.xNo + online_security +
## online_backup + device_protection + streaming_tv + streaming_movies +
## contract.xOne.year + contract.xTwo.year + paperless_billing +
## payment_method.xElectronic.check + m5bins.xlow + m5bins.xmedium +
## m5bins.xvery.high
##
## Df Deviance AIC
## - online_security 1 3771.4 3817.4
## - gender 1 3771.5 3817.5
## - id 1 3771.6 3817.6
## - m5bins.xvery.high 1 3771.7 3817.7
## - online_backup 1 3772.1 3818.1
## - dependents 1 3772.6 3818.6
## <none> 3770.6 3818.6
## - m5bins.xmedium 1 3773.9 3819.9
## + partner 1 3770.0 3820.0
## - phone_service 1 3774.0 3820.0
## + payment_method.xMailed.check 1 3770.1 3820.1
## + m5bins.xvery.Low 1 3770.4 3820.4
## + tech_support 1 3770.6 3820.6
## + payment_method.xCredit.card..automatic. 1 3770.6 3820.6
## - senior_citizen 1 3774.9 3820.9
## - device_protection 1 3776.4 3822.4
## - total_charges 1 3780.1 3826.1
## - internet_service.xNo 1 3780.4 3826.4
## - monthly_charges 1 3781.0 3827.0
## - m5bins.xlow 1 3784.4 3830.4
## - paperless_billing 1 3784.6 3830.6
## - streaming_tv 1 3786.2 3832.2
## - streaming_movies 1 3789.4 3835.4
## - contract.xOne.year 1 3790.4 3836.4
## - payment_method.xElectronic.check 1 3796.1 3842.1
## - multiple_lines 1 3796.4 3842.4
## - internet_service.xFiber.optic 1 3799.2 3845.2
## - contract.xTwo.year 1 3812.6 3858.6
## - tenure 1 3838.2 3884.2
##
## Step: AIC=3817.36
## churn ~ id + tenure + monthly_charges + total_charges + gender +
## senior_citizen + dependents + phone_service + multiple_lines +
## internet_service.xFiber.optic + internet_service.xNo + online_backup +
## device_protection + streaming_tv + streaming_movies + contract.xOne.year +
## contract.xTwo.year + paperless_billing + payment_method.xElectronic.check +
## m5bins.xlow + m5bins.xmedium + m5bins.xvery.high
##
## Df Deviance AIC
## - gender 1 3772.1 3816.1
## - id 1 3772.3 3816.3
## - m5bins.xvery.high 1 3772.4 3816.4
## - dependents 1 3773.3 3817.3
## <none> 3771.4 3817.4
## - m5bins.xmedium 1 3774.5 3818.5
## + online_security 1 3770.6 3818.6
## - online_backup 1 3774.7 3818.7
## + partner 1 3770.8 3818.8
## + payment_method.xMailed.check 1 3770.9 3818.9
## + m5bins.xvery.Low 1 3771.1 3819.1
## + tech_support 1 3771.1 3819.1
## + payment_method.xCredit.card..automatic. 1 3771.4 3819.4
## - senior_citizen 1 3775.6 3819.6
## - phone_service 1 3780.4 3824.4
## - total_charges 1 3780.7 3824.7
## - device_protection 1 3781.6 3825.6
## - m5bins.xlow 1 3785.0 3829.0
## - paperless_billing 1 3785.5 3829.5
## - internet_service.xNo 1 3790.3 3834.3
## - contract.xOne.year 1 3791.0 3835.0
## - payment_method.xElectronic.check 1 3796.9 3840.9
## - monthly_charges 1 3797.4 3841.4
## - streaming_tv 1 3803.6 3847.6
## - streaming_movies 1 3808.6 3852.6
## - multiple_lines 1 3810.6 3854.6
## - contract.xTwo.year 1 3813.1 3857.1
## - internet_service.xFiber.optic 1 3832.8 3876.8
## - tenure 1 3839.2 3883.2
##
## Step: AIC=3816.13
## churn ~ id + tenure + monthly_charges + total_charges + senior_citizen +
## dependents + phone_service + multiple_lines + internet_service.xFiber.optic +
## internet_service.xNo + online_backup + device_protection +
## streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year +
## paperless_billing + payment_method.xElectronic.check + m5bins.xlow +
## m5bins.xmedium + m5bins.xvery.high
##
## Df Deviance AIC
## - id 1 3773.0 3815.0
## - m5bins.xvery.high 1 3773.1 3815.1
## - dependents 1 3774.1 3816.1
## <none> 3772.1 3816.1
## - m5bins.xmedium 1 3775.3 3817.3
## + gender 1 3771.4 3817.4
## - online_backup 1 3775.4 3817.4
## + online_security 1 3771.5 3817.5
## + partner 1 3771.5 3817.5
## + payment_method.xMailed.check 1 3771.6 3817.6
## + tech_support 1 3771.9 3817.9
## + m5bins.xvery.Low 1 3771.9 3817.9
## + payment_method.xCredit.card..automatic. 1 3772.1 3818.1
## - senior_citizen 1 3776.3 3818.3
## - phone_service 1 3781.1 3823.1
## - total_charges 1 3781.4 3823.4
## - device_protection 1 3782.3 3824.3
## - m5bins.xlow 1 3785.7 3827.7
## - paperless_billing 1 3786.5 3828.5
## - internet_service.xNo 1 3791.0 3833.0
## - contract.xOne.year 1 3791.6 3833.6
## - payment_method.xElectronic.check 1 3797.7 3839.7
## - monthly_charges 1 3798.1 3840.1
## - streaming_tv 1 3804.3 3846.3
## - streaming_movies 1 3809.4 3851.4
## - multiple_lines 1 3811.1 3853.1
## - contract.xTwo.year 1 3813.7 3855.7
## - internet_service.xFiber.optic 1 3833.3 3875.3
## - tenure 1 3839.8 3881.8
##
## Step: AIC=3815.05
## churn ~ tenure + monthly_charges + total_charges + senior_citizen +
## dependents + phone_service + multiple_lines + internet_service.xFiber.optic +
## internet_service.xNo + online_backup + device_protection +
## streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year +
## paperless_billing + payment_method.xElectronic.check + m5bins.xlow +
## m5bins.xmedium + m5bins.xvery.high
##
## Df Deviance AIC
## - m5bins.xvery.high 1 3774.1 3814.1
## - dependents 1 3775.0 3815.0
## <none> 3773.0 3815.0
## + id 1 3772.1 3816.1
## - m5bins.xmedium 1 3776.2 3816.2
## + gender 1 3772.3 3816.3
## - online_backup 1 3776.3 3816.3
## + online_security 1 3772.4 3816.4
## + partner 1 3772.5 3816.5
## + payment_method.xMailed.check 1 3772.6 3816.6
## + m5bins.xvery.Low 1 3772.8 3816.8
## + tech_support 1 3772.8 3816.8
## + payment_method.xCredit.card..automatic. 1 3773.0 3817.0
## - senior_citizen 1 3777.3 3817.3
## - phone_service 1 3781.8 3821.8
## - total_charges 1 3782.4 3822.4
## - device_protection 1 3783.2 3823.2
## - m5bins.xlow 1 3786.5 3826.5
## - paperless_billing 1 3787.6 3827.6
## - internet_service.xNo 1 3791.9 3831.9
## - contract.xOne.year 1 3792.6 3832.6
## - payment_method.xElectronic.check 1 3798.8 3838.8
## - monthly_charges 1 3798.8 3838.8
## - streaming_tv 1 3805.0 3845.0
## - streaming_movies 1 3810.0 3850.0
## - multiple_lines 1 3811.9 3851.9
## - contract.xTwo.year 1 3814.6 3854.6
## - internet_service.xFiber.optic 1 3833.9 3873.9
## - tenure 1 3840.8 3880.8
##
## Step: AIC=3814.11
## churn ~ tenure + monthly_charges + total_charges + senior_citizen +
## dependents + phone_service + multiple_lines + internet_service.xFiber.optic +
## internet_service.xNo + online_backup + device_protection +
## streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year +
## paperless_billing + payment_method.xElectronic.check + m5bins.xlow +
## m5bins.xmedium
##
## Df Deviance AIC
## - dependents 1 3776.1 3814.1
## <none> 3774.1 3814.1
## + m5bins.xvery.high 1 3773.0 3815.0
## - m5bins.xmedium 1 3777.1 3815.1
## + id 1 3773.1 3815.1
## + gender 1 3773.3 3815.3
## + partner 1 3773.5 3815.5
## + online_security 1 3773.5 3815.5
## + payment_method.xMailed.check 1 3773.6 3815.6
## - online_backup 1 3777.7 3815.7
## + tech_support 1 3773.8 3815.8
## + m5bins.xvery.Low 1 3773.9 3815.9
## + payment_method.xCredit.card..automatic. 1 3774.1 3816.1
## - senior_citizen 1 3778.2 3816.2
## - total_charges 1 3782.5 3820.5
## - phone_service 1 3784.6 3822.6
## - device_protection 1 3784.9 3822.9
## - m5bins.xlow 1 3787.1 3825.1
## - paperless_billing 1 3788.4 3826.4
## - contract.xOne.year 1 3793.9 3831.9
## - internet_service.xNo 1 3796.0 3834.0
## - payment_method.xElectronic.check 1 3799.7 3837.7
## - monthly_charges 1 3803.4 3841.4
## - streaming_tv 1 3809.1 3847.1
## - streaming_movies 1 3814.4 3852.4
## - multiple_lines 1 3814.6 3852.6
## - contract.xTwo.year 1 3818.5 3856.5
## - internet_service.xFiber.optic 1 3839.8 3877.8
## - tenure 1 3841.3 3879.3
##
## Step: AIC=3814.06
## churn ~ tenure + monthly_charges + total_charges + senior_citizen +
## phone_service + multiple_lines + internet_service.xFiber.optic +
## internet_service.xNo + online_backup + device_protection +
## streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year +
## paperless_billing + payment_method.xElectronic.check + m5bins.xlow +
## m5bins.xmedium
##
## Df Deviance AIC
## <none> 3776.1 3814.1
## + dependents 1 3774.1 3814.1
## - m5bins.xmedium 1 3779.0 3815.0
## + m5bins.xvery.high 1 3775.0 3815.0
## + id 1 3775.1 3815.1
## + gender 1 3775.3 3815.3
## + online_security 1 3775.4 3815.4
## - online_backup 1 3779.6 3815.6
## + payment_method.xMailed.check 1 3775.6 3815.6
## + tech_support 1 3775.8 3815.8
## + m5bins.xvery.Low 1 3775.8 3815.8
## + partner 1 3776.0 3816.0
## + payment_method.xCredit.card..automatic. 1 3776.0 3816.0
## - senior_citizen 1 3781.3 3817.3
## - total_charges 1 3784.7 3820.7
## - phone_service 1 3786.7 3822.7
## - device_protection 1 3787.0 3823.0
## - m5bins.xlow 1 3788.9 3824.9
## - paperless_billing 1 3790.7 3826.7
## - contract.xOne.year 1 3796.6 3832.6
## - internet_service.xNo 1 3798.4 3834.4
## - payment_method.xElectronic.check 1 3801.9 3837.9
## - monthly_charges 1 3805.7 3841.7
## - streaming_tv 1 3811.3 3847.3
## - streaming_movies 1 3816.9 3852.9
## - multiple_lines 1 3817.0 3853.0
## - contract.xTwo.year 1 3821.9 3857.9
## - internet_service.xFiber.optic 1 3842.5 3878.5
## - tenure 1 3844.6 3880.6
summary(model_2)
##
## Call:
## glm(formula = churn ~ tenure + monthly_charges + total_charges +
## senior_citizen + phone_service + multiple_lines + internet_service.xFiber.optic +
## internet_service.xNo + online_backup + device_protection +
## streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year +
## paperless_billing + payment_method.xElectronic.check + m5bins.xlow +
## m5bins.xmedium, family = binomial(link = "logit"), data = training_set)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.97163 0.56082 -8.865 < 2e-16 ***
## tenure -1.39499 0.18293 -7.626 2.43e-14 ***
## monthly_charges -2.29205 0.42467 -5.397 6.76e-08 ***
## total_charges 0.56500 0.19592 2.884 0.003929 **
## senior_citizen 0.23593 0.10304 2.290 0.022031 *
## phone_service 1.06235 0.32705 3.248 0.001161 **
## multiple_lines 0.74534 0.11761 6.337 2.34e-10 ***
## internet_service.xFiber.optic 2.97957 0.37797 7.883 3.20e-15 ***
## internet_service.xNo -2.17834 0.46094 -4.726 2.29e-06 ***
## online_backup 0.22394 0.11876 1.886 0.059341 .
## device_protection 0.40402 0.12269 3.293 0.000991 ***
## streaming_tv 1.00256 0.17065 5.875 4.23e-09 ***
## streaming_movies 1.08905 0.17257 6.311 2.78e-10 ***
## contract.xOne.year -0.59028 0.13295 -4.440 9.00e-06 ***
## contract.xTwo.year -1.36547 0.22238 -6.140 8.24e-10 ***
## paperless_billing 0.35203 0.09233 3.813 0.000137 ***
## payment_method.xElectronic.check 0.43897 0.08607 5.100 3.39e-07 ***
## m5bins.xlow 0.70711 0.20128 3.513 0.000443 ***
## m5bins.xmedium 0.43639 0.25480 1.713 0.086771 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5299.7 on 4570 degrees of freedom
## Residual deviance: 3776.1 on 4552 degrees of freedom
## AIC: 3814.1
##
## Number of Fisher Scoring iterations: 6
formula(model_2)
## churn ~ tenure + monthly_charges + total_charges + senior_citizen +
## phone_service + multiple_lines + internet_service.xFiber.optic +
## internet_service.xNo + online_backup + device_protection +
## streaming_tv + streaming_movies + contract.xOne.year + contract.xTwo.year +
## paperless_billing + payment_method.xElectronic.check + m5bins.xlow +
## m5bins.xmedium
Check for multicollinearity:
In order to check for
multicollinearity i will use ‘VIF function’ to get variance inflation
factor for each variable in ‘model_stepwise’.I will remove a variable
with VIF > 5.
Check P-Values. p-vlaue > 0.05 —> remove variable.
Check for multicollinearity:
(vif_vars <- as.data.frame(vif(model_2)))
## vif(model_2)
## tenure 14.276772
## monthly_charges 89.210594
## total_charges 19.141273
## senior_citizen 1.091549
## phone_service 5.633263
## multiple_lines 2.088719
## internet_service.xFiber.optic 21.249413
## internet_service.xNo 12.017351
## online_backup 1.896315
## device_protection 2.005867
## streaming_tv 4.355047
## streaming_movies 4.452581
## contract.xOne.year 1.319457
## contract.xTwo.year 1.284584
## paperless_billing 1.121314
## payment_method.xElectronic.check 1.144557
## m5bins.xlow 4.003061
## m5bins.xmedium 1.852579
* internet_service.xFiber.optic variable has VIF haigeher than 5 ,therefor i will remove it and recheck the model again.
All indpendent variables are statistically significant.
#* Final Model
model_3 <- glm(churn ~ tenure + total_charges + dependents +
internet_service.xNo + online_security + online_backup +
tech_support + streaming_tv + streaming_movies + contract.xOne.year +
contract.xTwo.year + paperless_billing + payment_method.xElectronic.check
+ m5bins.xmedium,data = training_set,family = "binomial")
summary(model_3)
##
## Call:
## glm(formula = churn ~ tenure + total_charges + dependents + internet_service.xNo +
## online_security + online_backup + tech_support + streaming_tv +
## streaming_movies + contract.xOne.year + contract.xTwo.year +
## paperless_billing + payment_method.xElectronic.check + m5bins.xmedium,
## family = "binomial", data = training_set)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.19210 0.12469 -9.560 < 2e-16 ***
## tenure -1.56309 0.16791 -9.309 < 2e-16 ***
## total_charges 0.93375 0.16713 5.587 2.31e-08 ***
## dependents -0.19248 0.09852 -1.954 0.0507 .
## internet_service.xNo -1.19432 0.15417 -7.747 9.43e-15 ***
## online_security -0.53935 0.10402 -5.185 2.16e-07 ***
## online_backup -0.15348 0.09423 -1.629 0.1034
## tech_support -0.46826 0.10470 -4.473 7.73e-06 ***
## streaming_tv 0.22101 0.09923 2.227 0.0259 *
## streaming_movies 0.31505 0.09985 3.155 0.0016 **
## contract.xOne.year -0.70990 0.13145 -5.400 6.65e-08 ***
## contract.xTwo.year -1.53203 0.22310 -6.867 6.56e-12 ***
## paperless_billing 0.42751 0.09098 4.699 2.62e-06 ***
## payment_method.xElectronic.check 0.51313 0.08429 6.088 1.15e-09 ***
## m5bins.xmedium -0.37694 0.19698 -1.914 0.0557 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5299.7 on 4570 degrees of freedom
## Residual deviance: 3831.7 on 4556 degrees of freedom
## AIC: 3861.7
##
## Number of Fisher Scoring iterations: 6
* AIC is 3954 for model_1.
* AIC is 3941 for model_2.
* AIC is 3977 for model_3.
model 2 is selected since it hasthe lowest AIC.
# Model Evaluation
pred_train <- predict(model_2,newdata = training_set[,which(colnames(training_set)!= "churn")],type = "response")
pred <- predict(model_2,newdata = testing_set[,which(colnames(testing_set)!= "churn")],type = "response")
summary(pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001453 0.040578 0.189528 0.263258 0.455258 0.872511
pred_trainchurn <- factor(ifelse(pred_train >= 0.50, "Yes", "No"))
actual_trainchurn <- factor(ifelse(training_set$churn==1,"Yes","No"))
table(actual_trainchurn,pred_trainchurn)
## pred_trainchurn
## actual_trainchurn No Yes
## No 3002 351
## Yes 545 673
pred_churn <- factor(ifelse(pred >= 0.50, "Yes", "No"))
actual_churn <- factor(ifelse(testing_set$churn==1,"Yes","No"))
table(actual_churn,pred_churn)
## pred_churn
## actual_churn No Yes
## No 1632 178
## Yes 300 351
caret::confusionMatrix(pred_trainchurn,actual_trainchurn,positive = "No")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 3002 545
## Yes 351 673
##
## Accuracy : 0.804
## 95% CI : (0.7922, 0.8154)
## No Information Rate : 0.7335
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4718
##
## Mcnemar's Test P-Value : 1.136e-10
##
## Sensitivity : 0.8953
## Specificity : 0.5525
## Pos Pred Value : 0.8463
## Neg Pred Value : 0.6572
## Prevalence : 0.7335
## Detection Rate : 0.6567
## Detection Prevalence : 0.7760
## Balanced Accuracy : 0.7239
##
## 'Positive' Class : No
##
caret::confusionMatrix(pred_churn,actual_churn,positive = "No")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1632 300
## Yes 178 351
##
## Accuracy : 0.8058
## 95% CI : (0.7896, 0.8212)
## No Information Rate : 0.7355
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.469
##
## Mcnemar's Test P-Value : 3.123e-08
##
## Sensitivity : 0.9017
## Specificity : 0.5392
## Pos Pred Value : 0.8447
## Neg Pred Value : 0.6635
## Prevalence : 0.7355
## Detection Rate : 0.6631
## Detection Prevalence : 0.7850
## Balanced Accuracy : 0.7204
##
## 'Positive' Class : No
##
list('Train Performance(CutOff = 50%)',c(Accuracy = 78,Sensitivity = 88,Specificity = 50),'Test Performance(Cutoff = 50)', c(Accuracy = 78 ,Sensitivity = 88, Specificity = 50))
## [[1]]
## [1] "Train Performance(CutOff = 50%)"
##
## [[2]]
## Accuracy Sensitivity Specificity
## 78 88 50
##
## [[3]]
## [1] "Test Performance(Cutoff = 50)"
##
## [[4]]
## Accuracy Sensitivity Specificity
## 78 88 50
The default cutoff prediction probability score is 0.5 or the ratio of 1’s and 0’s in the training data. But sometimes, tuning the probability cutoff can improve the accuracy in both the development and validation samples.
## Model Evaluation with Optimal Cutoff
# Install and load the cutpointr package
if (!require(cutpointr)) {
install.packages("cutpointr")
}
## Loading required package: cutpointr
##
## Attaching package: 'cutpointr'
## The following objects are masked from 'package:caret':
##
## precision, recall, sensitivity, specificity
library(cutpointr)
# Find the optimal cutoff using cutpointr
opt_cut <- cutpointr(testing_set, pred, churn, method = maximize_metric, metric = youden)
## Assuming the positive class is 1
## Assuming the positive class has higher x values
# Display the optimal cutoff value
opt_cut$optimal_cutpoint
## [1] 0.2836847
# Create a factor vector for predictions based on the optimal cutoff
pred_churn_51 <- factor(ifelse(pred >= opt_cut$optimal_cutpoint, 'Yes', 'No'))
# Calculate and display the confusion matrix
caret::confusionMatrix(actual_churn, pred_churn_51, positive = 'No')
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1346 464
## Yes 137 514
##
## Accuracy : 0.7558
## 95% CI : (0.7383, 0.7727)
## No Information Rate : 0.6026
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4593
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9076
## Specificity : 0.5256
## Pos Pred Value : 0.7436
## Neg Pred Value : 0.7896
## Prevalence : 0.6026
## Detection Rate : 0.5469
## Detection Prevalence : 0.7355
## Balanced Accuracy : 0.7166
##
## 'Positive' Class : No
##
Finally , Logistic Regression with a cutoff probability value of 28 % gives us better values of accuracy and specificity but lower in sensitivity.