The purpose of this project is to analyze the data and understand why customers are leaving the company, so that the company can design customer retention strategies. I will use ggplot2 to explore the data and then do some preprocessing: binning a few variables, creating dummy variables for the categorical features, and scaling some numeric variables. For prediction, I will first split the data into training and test sets so that model performance can be checked on held-out data and we do not end up overfitting. I will fit glmnet, gbm, and random forest models with simple parameter tuning.
library(caret)
library(caTools)
library(gbm)
library(glmnet)
library(randomForest)
library(tidyverse)
library(plotly)
churn = read.csv("Churn.csv", stringsAsFactors = F)
str(churn)
## 'data.frame': 7043 obs. of 21 variables:
## $ customerID : chr "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : chr "Female" "Male" "Male" "Male" ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : chr "Yes" "No" "No" "No" ...
## $ Dependents : chr "No" "No" "No" "No" ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : chr "No" "Yes" "Yes" "No" ...
## $ MultipleLines : chr "No phone service" "No" "No" "No phone service" ...
## $ InternetService : chr "DSL" "DSL" "DSL" "DSL" ...
## $ OnlineSecurity : chr "No" "Yes" "Yes" "Yes" ...
## $ OnlineBackup : chr "Yes" "No" "Yes" "No" ...
## $ DeviceProtection: chr "No" "Yes" "No" "Yes" ...
## $ TechSupport : chr "No" "No" "No" "Yes" ...
## $ StreamingTV : chr "No" "No" "No" "No" ...
## $ StreamingMovies : chr "No" "No" "No" "No" ...
## $ Contract : chr "Month-to-month" "One year" "Month-to-month" "One year" ...
## $ PaperlessBilling: chr "Yes" "No" "Yes" "No" ...
## $ PaymentMethod : chr "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : chr "No" "No" "Yes" "No" ...
We have 7,043 rows and 21 columns.
colSums(is.na(churn))
## customerID gender SeniorCitizen Partner
## 0 0 0 0
## Dependents tenure PhoneService MultipleLines
## 0 0 0 0
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0 0 0 0
## TechSupport StreamingTV StreamingMovies Contract
## 0 0 0 0
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0 0 0 11
## Churn
## 0
The TotalCharges variable has 11 missing values. Since that is only 11 of 7,043 rows, we can safely remove them.
churn = churn[complete.cases(churn),]
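If we preferred to keep every row instead, a minimal alternative sketch (not applied in this analysis) would impute the column median:
# Alternative, not used here: fill the missing TotalCharges with the median instead of dropping rows
# churn$TotalCharges[is.na(churn$TotalCharges)] = median(churn$TotalCharges, na.rm = TRUE)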
plot = churn %>%
  group_by(Churn) %>%
  summarise(Count = n()) %>%
  mutate(percentage = round(prop.table(Count) * 100)) %>%
  ggplot(aes(x = "", y = percentage, fill = Churn)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  theme_void() +
  geom_text(aes(y = percentage, label = paste(percentage, "%")),
            color = "white", size = 6,
            position = position_stack(vjust = 0.5)) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Customer Churn Rate")
plot
Around 27% of the customers have churned.
plot1 = churn %>%
  group_by(gender, Churn) %>%
  summarise(Count = n()) %>%
  mutate(percentage = round(prop.table(Count) * 100)) %>%
  ggplot(aes(x = gender, y = percentage, fill = Churn)) +
  geom_col(position = "fill", col = "black") +
  labs(y = "Percent") +
  theme_classic()
ggplotly(plot1)
There is no meaningful difference in churn between male and female customers.
plot2 = churn %>%
  group_by(Partner, Churn) %>%
  summarise(Count = n()) %>%
  mutate(percentage = round(prop.table(Count) * 100)) %>%
  ggplot(aes(x = Partner, y = percentage, fill = Churn)) +
  geom_col(position = "fill", col = "black") +
  labs(y = "Percent") +
  theme_classic()
ggplotly(plot2)
The churn rate is higher for customers who do not have a partner.
## Binning the tenure variable into 6 groups
breaks = c(0, 12, 24, 36, 48, 60, 72)
labels = c("0-1", "1-2", "2-3", "3-4", "4-5", "5-6")
churn$tenure = cut(churn$tenure,
                   breaks = breaks,
                   labels = labels)
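As a quick sanity check (a sketch; its output is not shown here), we can confirm that the breaks cover every tenure value and that the binning produced no NAs:
# Distribution of the new tenure bins; useNA flags any value missed by the breaks
table(churn$tenure, useNA = "ifany")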
plot3 = churn %>%
  group_by(tenure, Churn) %>%
  summarise(Count = n()) %>%
  mutate(percentage = round(prop.table(Count) * 100)) %>%
  ggplot(aes(x = tenure, y = percentage, fill = Churn)) +
  geom_col(position = "fill", col = "black") +
  labs(y = "Percent") +
  theme_classic()
ggplotly(plot3)
Almost 50% of customers with 0-1 year of tenure have left the company, and the churn rate decreases as tenure increases.
plot4 = churn %>%
  group_by(MultipleLines, Churn) %>%
  summarise(Count = n()) %>%
  mutate(percentage = round(prop.table(Count) * 100)) %>%
  ggplot(aes(x = MultipleLines, y = percentage, fill = Churn)) +
  geom_col(position = "fill", col = "black") +
  labs(y = "Percent") +
  theme_classic()
ggplotly(plot4)
No meaningful difference in churn across the MultipleLines categories.
plot5 = churn %>%
  group_by(InternetService, Churn) %>%
  summarise(Count = n()) %>%
  mutate(percentage = round(prop.table(Count) * 100)) %>%
  ggplot(aes(x = InternetService, y = percentage, fill = Churn)) +
  geom_col(position = "fill", col = "black") +
  labs(y = "Percent") +
  theme_classic()
ggplotly(plot5)
The churn rate is highest for customers with fiber optic internet service, followed by DSL.
plot6 = churn %>%
  group_by(Contract, Churn) %>%
  summarise(Count = n()) %>%
  mutate(percentage = round(prop.table(Count) * 100)) %>%
  ggplot(aes(x = Contract, y = percentage, fill = Churn)) +
  geom_col(position = "fill", col = "black") +
  labs(y = "Percent") +
  theme_classic()
ggplotly(plot6)
The churn rate is about 43% for customers on month-to-month contracts, which is extremely high; customers on two-year contracts have the lowest churn rate.
## Convert the columns with Yes/No values to 1 and 0 respectively
churn$Partner = ifelse(churn$Partner =="Yes", 1, 0)
churn$Dependents = ifelse(churn$Dependents =="Yes", 1, 0)
churn$PhoneService = ifelse(churn$PhoneService =="Yes", 1, 0)
churn$PaperlessBilling = ifelse(churn$PaperlessBilling =="Yes", 1, 0)
churn$InternetService = as.factor(churn$InternetService)
churn$gender = as.factor(churn$gender)
churn$Contract = as.factor(churn$Contract)
churn$PaymentMethod = as.factor(churn$PaymentMethod)
churn$Churn = as.factor(churn$Churn)
churn$MultipleLines = as.factor(churn$MultipleLines)
churn$StreamingMovies = as.factor(churn$StreamingMovies)
churn$TechSupport = as.factor(churn$TechSupport)
churn$StreamingTV = as.factor(churn$StreamingTV)
churn$OnlineSecurity = as.factor(churn$OnlineSecurity)
churn$OnlineBackup = as.factor(churn$OnlineBackup)
churn$DeviceProtection = as.factor(churn$DeviceProtection)
churn$customerID = NULL
y = churn$Churn
numericDF = churn %>%
  select_if(is.numeric)
factorDF = churn %>%
  select_if(is.factor)
## Get dummies for all the factors except Churn, which is column 13 of factorDF
dummies <- as.data.frame(model.matrix(~.-1, factorDF[, -13]))
## Center and scale MonthlyCharges and TotalCharges (columns 6 and 7 of numericDF)
preprocess = preProcess(numericDF[, 6:7], method = c("center", "scale"))
numericDF = predict(preprocess, numericDF)
data = cbind(dummies, numericDF)
data$Churn = y
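An alternative sketch for the dummy-coding step uses caret's dummyVars(); with fullRank = TRUE it drops one reference level per factor, so the columns differ slightly from the model.matrix() call above (which keeps both gender columns), but the idea is the same:
# Alternative one-hot encoding with caret (sketch, not used below); predict() materialises the dummy columns
dv = dummyVars(~ ., data = factorDF[, -13], fullRank = TRUE)
dummies_alt = as.data.frame(predict(dv, newdata = factorDF[, -13]))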
## Structure of the final data set used for modelling
str(data)
## 'data.frame': 7032 obs. of 36 variables:
## $ genderFemale : num 1 0 0 0 1 1 0 1 1 0 ...
## $ genderMale : num 0 1 1 1 0 0 1 0 0 1 ...
## $ tenure1-2 : num 0 0 0 0 0 0 1 0 0 0 ...
## $ tenure2-3 : num 0 1 0 0 0 0 0 0 1 0 ...
## $ tenure3-4 : num 0 0 0 1 0 0 0 0 0 0 ...
## $ tenure4-5 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ tenure5-6 : num 0 0 0 0 0 0 0 0 0 1 ...
## $ MultipleLinesNo phone service : num 1 0 0 1 0 0 0 1 0 0 ...
## $ MultipleLinesYes : num 0 0 0 0 0 1 1 0 1 0 ...
## $ InternetServiceFiber optic : num 0 0 0 0 1 1 1 0 1 0 ...
## $ InternetServiceNo : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OnlineSecurityNo internet service : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OnlineSecurityYes : num 0 1 1 1 0 0 0 1 0 1 ...
## $ OnlineBackupNo internet service : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OnlineBackupYes : num 1 0 1 0 0 0 1 0 0 1 ...
## $ DeviceProtectionNo internet service : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DeviceProtectionYes : num 0 1 0 1 0 1 0 0 1 0 ...
## $ TechSupportNo internet service : num 0 0 0 0 0 0 0 0 0 0 ...
## $ TechSupportYes : num 0 0 0 1 0 0 0 0 1 0 ...
## $ StreamingTVNo internet service : num 0 0 0 0 0 0 0 0 0 0 ...
## $ StreamingTVYes : num 0 0 0 0 0 1 1 0 1 0 ...
## $ StreamingMoviesNo internet service : num 0 0 0 0 0 0 0 0 0 0 ...
## $ StreamingMoviesYes : num 0 0 0 0 0 1 0 0 1 0 ...
## $ ContractOne year : num 0 1 0 1 0 0 0 0 0 1 ...
## $ ContractTwo year : num 0 0 0 0 0 0 0 0 0 0 ...
## $ PaymentMethodCredit card (automatic): num 0 0 0 0 0 0 1 0 0 0 ...
## $ PaymentMethodElectronic check : num 1 0 0 0 1 1 0 0 1 0 ...
## $ PaymentMethodMailed check : num 0 1 1 0 0 0 0 1 0 0 ...
## $ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Partner : num 1 0 0 0 0 0 0 0 1 0 ...
## $ Dependents : num 0 0 0 0 0 0 1 0 0 1 ...
## $ PhoneService : num 0 1 1 0 1 1 1 0 1 1 ...
## $ PaperlessBilling : num 1 0 1 0 1 1 1 0 1 0 ...
## $ MonthlyCharges : num -1.162 -0.261 -0.364 -0.748 0.196 ...
## $ TotalCharges : num -0.994 -0.174 -0.96 -0.195 -0.94 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
set.seed(123)
split = sample.split(data$Churn, SplitRatio = 0.7)
train = subset(data, split == TRUE)
test = subset(data, split == FALSE)
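sample.split() from caTools stratifies the split on the outcome; an equivalent hedged sketch using caret's createDataPartition() would be:
# Stratified 70/30 split with caret instead of caTools (sketch, not used below)
idx = createDataPartition(data$Churn, p = 0.7, list = FALSE)
train_alt = data[idx, ]
test_alt = data[-idx, ]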
set.seed(123)
mycontrol = trainControl(method = "repeatedcv",
                         number = 5,
                         repeats = 2,
                         classProbs = TRUE)
glmnetGrid <- expand.grid(alpha = 0,
                          lambda = seq(0.0001, 0.1, by = 0.0005))
model_glmnet = train(x = train[, -36],
                     y = train$Churn,
                     method = 'glmnet',
                     trControl = mycontrol,
                     tuneGrid = glmnetGrid)
pred_glmnet = predict(model_glmnet, test[, -36], type = "prob")
pred_glmnet$Churn = as.factor(ifelse(pred_glmnet$No > 0.5, "No", "Yes"))
confusionMatrix(test$Churn, pred_glmnet$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1406 143
## Yes 285 276
##
## Accuracy : 0.7972
## 95% CI : (0.7794, 0.8141)
## No Information Rate : 0.8014
## P-Value [Acc > NIR] : 0.6993
##
## Kappa : 0.4348
##
## Mcnemar's Test P-Value : 9.394e-12
##
## Sensitivity : 0.8315
## Specificity : 0.6587
## Pos Pred Value : 0.9077
## Neg Pred Value : 0.4920
## Prevalence : 0.8014
## Detection Rate : 0.6664
## Detection Prevalence : 0.7341
## Balanced Accuracy : 0.7451
##
## 'Positive' Class : No
##
Accuracy of about 80% with a kappa of 0.43.
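To see which penalty the grid search selected and which predictors carry the most weight in the ridge model, a quick sketch (output not shown):
# Best alpha/lambda chosen by the grid search and the most influential predictors
model_glmnet$bestTune
varImp(model_glmnet)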
set.seed(123)
mycontrol = trainControl(method = "repeatedcv",
                         number = 5,
                         repeats = 2,
                         classProbs = TRUE)
model_gbm_cv = train(x = train[, -36],
                     y = train$Churn,
                     method = 'gbm',
                     trControl = mycontrol)
## (per-iteration gbm training deviance log from the cross-validation resamples omitted for brevity; passing verbose = FALSE to train() suppresses it)
pred_gbm = predict(model_gbm_cv, test[, -36], type = "prob")
pred_gbm$Churn = as.factor(ifelse(pred_gbm$No > 0.5, "No", "Yes"))
confusionMatrix(test$Churn, pred_gbm$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1386 163
## Yes 269 292
##
## Accuracy : 0.7953
## 95% CI : (0.7774, 0.8123)
## No Information Rate : 0.7844
## P-Value [Acc > NIR] : 0.1164
##
## Kappa : 0.4419
##
## Mcnemar's Test P-Value : 4.376e-07
##
## Sensitivity : 0.8375
## Specificity : 0.6418
## Pos Pred Value : 0.8948
## Neg Pred Value : 0.5205
## Prevalence : 0.7844
## Detection Rate : 0.6569
## Detection Prevalence : 0.7341
## Balanced Accuracy : 0.7396
##
## 'Positive' Class : No
##
Again an accuracy of about 80%, this time with a kappa of 0.44.
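The gbm model above uses caret's default tuning grid. If we wanted to tune it more deliberately, a hedged sketch of a custom grid (the values are illustrative, not the ones used above) would look like this:
# Illustrative gbm tuning grid: tree depth, number of trees, learning rate, minimum node size
gbmGrid = expand.grid(interaction.depth = c(1, 2, 3),
                      n.trees = c(100, 150, 200),
                      shrinkage = 0.1,
                      n.minobsinnode = 10)
model_gbm_tuned = train(x = train[, -36],
                        y = train$Churn,
                        method = 'gbm',
                        trControl = mycontrol,
                        tuneGrid = gbmGrid,
                        verbose = FALSE)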
set.seed(123)
mycontrol = trainControl(method = "repeatedcv",
                         number = 5,
                         repeats = 2,
                         classProbs = TRUE)
mtry <- sqrt(ncol(train[, -36]))
rfgrid <- expand.grid(.mtry = mtry)
model_rf_cv = train(x = train[, -36],
                    y = train$Churn,
                    method = 'rf',
                    trControl = mycontrol,
                    tuneGrid = rfgrid)
pred_rf = predict(model_rf_cv, test[, -36], type = "prob")
pred_rf$Churn = as.factor(ifelse(pred_rf$No > 0.5, "No", "Yes"))
confusionMatrix(test$Churn, pred_rf$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1377 172
## Yes 274 287
##
## Accuracy : 0.7886
## 95% CI : (0.7706, 0.8059)
## No Information Rate : 0.7825
## P-Value [Acc > NIR] : 0.2557
##
## Kappa : 0.4252
##
## Mcnemar's Test P-Value : 1.731e-06
##
## Sensitivity : 0.8340
## Specificity : 0.6253
## Pos Pred Value : 0.8890
## Neg Pred Value : 0.5116
## Prevalence : 0.7825
## Detection Rate : 0.6526
## Detection Prevalence : 0.7341
## Balanced Accuracy : 0.7297
##
## 'Positive' Class : No
##
Accuracy of about 79% with a kappa of 0.43.
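The random forest was fit with a single mtry of sqrt(p), about 6 here. A hedged sketch of a slightly wider search (values are illustrative), plus the variable-importance ranking of the fitted model (output not shown):
# Try a few mtry values around sqrt(p) instead of a single one
rfgrid_wide = expand.grid(.mtry = c(4, 6, 8, 10))
model_rf_wide = train(x = train[, -36],
                      y = train$Churn,
                      method = 'rf',
                      trControl = mycontrol,
                      tuneGrid = rfgrid_wide)
# Which predictors the original random forest relied on most
varImp(model_rf_cv)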
Customers with fiber optic internet service have a high churn rate.
The churn rate is high for customers who do not have a partner.
The churn rate is high for customers on month-to-month contracts.
All the models have roughly the same accuracy, around 79-80%, with kappa between 0.43 and 0.44.
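Since all three models were trained with the same repeated cross-validation setup, their resampled accuracy and kappa can also be compared head to head; a short sketch (output not shown):
# Side-by-side comparison of the cross-validated accuracy and kappa of the three models
results = resamples(list(glmnet = model_glmnet, gbm = model_gbm_cv, rf = model_rf_cv))
summary(results)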