#Load data sets
#Main data set
churn_df <- data.frame(read_csv('Data Files/Telco_customer_churn.csv', col_types=cols()))
#additional data sets
churn_status_df <- data.frame(read_csv('Data Files/Telco_customer_churn_status.csv', col_types = cols()))
churn_services_df <- data.frame(read_csv('Data Files/Telco_customer_churn_services.csv', col_types = cols()))
##################################################
#Join and filter
#Filter columns to join from status_df
status_join <- churn_status_df %>%
select(c(Customer.ID, Satisfaction.Score, Churn.Category))
#status_join
#Filer columns to join from services)df
service_join <- churn_services_df %>%
select(c(Customer.ID, Avg.Monthly.Long.Distance.Charges, Avg.Monthly.GB.Download, Unlimited.Data, Total.Extra.Data.Charges))
#service_join
#Join status with churn
joined_churn_df <- merge(churn_df, status_join, by.x="CustomerID", by.y="Customer.ID")
#head(joined_churn_df)
#Join service with churn
joined_churn_df <- merge(joined_churn_df, service_join, by.x="CustomerID", by.y="Customer.ID")
#head(joined_churn_df)
##################################################
#drop unwanted columns from final df
filtered_joined_churn_df <- joined_churn_df %>%
select(-c(1:10, 16, 20, 22:23, 28, 30:32))
##################################################
#Convert categorical to factor
cols <- c(1:3, 5:12, 14:17, 20)
filtered_joined_churn_df[cols] <- lapply(filtered_joined_churn_df[cols], as.factor)
##################################################
#Separate into churn and non-churn groupings for later comparison
only_churn <- joined_churn_df %>%
filter(Churn.Value==1)
no_churn <- joined_churn_df %>%
filter(Churn.Value==0)
##################################################
#Functions
#Create updated xkable_summary function to delete unwanted text
xkablesum_updated <- function (df, title = "Table: Statistics summary.", digits = 4,
pos = "left", bso = "striped")
{
s = summary(df)
# Including RE NA's strip
# Needed to add a dim check because including NA strip when no NA row made a new 7th row with text Min.:
if (dim(s)[1] == 6) {
strip_vector = c("Min.\\s*:\\s*", "1st Qu.\\s*:\\s*", "Median\\s*:\\s*", "Mean\\s*:\\s*",
"3rd Qu.\\s*:\\s*", "Max.\\s*:\\s*")
}
# NA's strip RE added here
else if (dim(s)[1] == 7) {
strip_vector = c("Min.\\s*:\\s*", "1st Qu.\\s*:\\s*", "Median\\s*:\\s*", "Mean\\s*:\\s*",
"3rd Qu.\\s*:\\s*", "Max.\\s*:\\s*", "NA's\\s*:\\s*")
}
# Made s = apply() -- without, didn't apply changes to table
s <- apply(s, 2, function(x) stringr::str_remove_all(x, strip_vector))
# Made s = apply()
s <- apply(s, 2, function(x) stringr::str_trim(x, "right"))
colnames(s) <- stringr::str_trim(colnames(s))
if (dim(s)[1] == 6) {
rownames(s) <- c("Min", "Q1", "Median",
"Mean", "Q3", "Max")
}
else if (dim(s)[1] == 7) {
rownames(s) <- c("Min", "Q1", "Median",
"Mean", "Q3", "Max", "NA")
}
xkabledply(s, title = title, digits = digits, pos = pos,
bso = bso)
}
#Better looking version of 2-sample t-test results than what the object itself displays
ttest2sample_info <- function(test) {
if (test[["p.value"]] <= 0.05) {
result = 'Reject the Null Hypothesis'
} else {
result = 'Do not reject the Null Hypothesis'
}
cat(c('\t', test$method, '\n\n',
'Data: ', '| ', test$data.name, '\n',
'Null Hypothesis: ', '| true difference in means = ', test$null.value[1], '\n',
'Alternative Hypothesis: ', '| true difference in means != ', test$null.value[1], '\n',
'Confidence Level: ', '| ', attributes(test$conf.int)$conf.level, '\n',
'Confidence Interval: ', '| [', round(test$conf.int[1], 2), ', ', round(test$conf.int[2], 2)), ']\n',
'Sample Estimates of Mean: ', '| [X = ', test$estimate[1], ', Y = ', test$estimate[2], ']\n',
'Test Values: ', '| [t = ', test$statistic,
', df = ', test$parameter[1],
', p-value = ', test[["p.value"]], ']\n',
'Result: ', '| ', result, sep='')
}
Ready to continue…
This data set is a fictional data set provided by IBM. It includes the demographic information on the customers of a telecommunications company, data on the services each customer used, charges paid (etc.), as well as whether the customer “churned” or not – meaning, whether the customer left the company in the past month. It also includes data on the reason(s) each customer who did leave gave for leaving.
str(filtered_joined_churn_df)
## 'data.frame': 7043 obs. of 21 variables:
## $ Senior.Citizen : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 2 1 2 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 1 2 2 1 2 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 2 ...
## $ Tenure.Months : num 9 9 4 13 3 9 71 63 7 65 ...
## $ Phone.Service : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Internet.Service : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 2 2 2 1 2 2 1 1 ...
## $ Online.Security : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 1 3 3 3 3 ...
## $ Online.Backup : Factor w/ 3 levels "No","No internet service",..: 3 1 1 3 1 1 3 1 1 3 ...
## $ Tech.Support : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 3 3 3 3 1 3 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 2 1 1 1 1 1 3 3 1 3 ...
## $ Paperless.Billing : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ Payment.Method : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 4 3 3 4 2 1 2 3 4 ...
## $ Monthly.Charges : num 65.6 59.9 73.9 98 83.9 ...
## $ Churn.Label : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 1 1 1 1 1 ...
## $ Churn.Reason : Factor w/ 20 levels "Attitude of service provider",..: NA NA 18 19 15 NA NA NA NA NA ...
## $ Satisfaction.Score : Factor w/ 5 levels "1","2","3","4",..: 3 5 1 1 1 3 3 4 3 3 ...
## $ Churn.Category : Factor w/ 5 levels "Attitude","Competitor",..: NA NA 2 3 3 NA NA NA NA NA ...
## $ Avg.Monthly.Long.Distance.Charges: num 42.39 10.69 33.65 27.82 7.38 ...
## $ Avg.Monthly.GB.Download : num 16 10 30 4 11 73 14 7 21 14 ...
## $ Unlimited.Data : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 1 2 2 ...
## $ Total.Extra.Data.Charges : num 0 10 0 0 0 0 0 20 0 0 ...
From the graph below, we see that around 27% of customer left the platform within the last month.
options(repr.plot.width = 6, repr.plot.height = 4)
filtered_joined_churn_df %>%
group_by(Churn.Label) %>%
summarise(Count = n())%>%
mutate(percent = prop.table(Count)*100)%>%
ggplot(aes(reorder(Churn.Label, -percent), percent), fill = Churn)+
geom_col(fill = c("#FC4E07", "#E7B800"))+
geom_text(aes(label = sprintf("%.2f%%", percent)), hjust = 0.01,vjust = -0.5, size =3)+
theme_bw()+
xlab("Churn") +
ylab("Percent")+
ggtitle("Churn Percent")
The graph below details the reasons customers gave for leaving. Although some customers may have selected the same reason for leaving, they may have categorized how they associated that reason differently. The churn category is used as a fill to provide further detail in understanding the customer perspective. We will use these reasons to help us formulate our questions, select appropriate variables to investigate, and contextualize the results:
joined_churn_df %>%
filter(!is.na(Churn.Category)) %>%
ggplot() +
geom_bar(mapping=aes(y=Churn.Reason, fill=Churn.Category), show.legend=TRUE) +
labs(title='Reasons for Leaving by Answer', y=('Reason'), x=('Count')) +
theme(plot.title = element_text(hjust = 0.5))
Answering SMART Question 1
To answer this question, we divided the data into churn and non-churn groups, then visually explored how the data set’s variables differed when divided into these groups. We then tested the differences using 2-sample t-tests, goodness of fit chi squared tests, and chi squared tests of independence. From this we determined that the following 15 variables showed a statistical difference between the churn and non-churn groups:
TenureAvg. Monthly ChargesAvg. Monthly GB DownloadSenior CitizenPartnerDependentsInternet ServiceOnline SecurityOnline BackupTech SupportContract TypePaperless BillingPayment MethodSatisfaction ScoreUnlimited DataSmart Questions to answer with modeling:
First, let’s explore how Satisfcation Score is best understood and used. Is it as a factor or conceptual framework?
sat_logit_model <- glm(Churn.Label ~ Satisfaction.Score, data=filtered_joined_churn_df, family="binomial")
#summary(sat_logit_model)
xkabledply(sat_logit_model)
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| (Intercept) | 20.6 | 584 | 0.0352 | 0.972 |
| Satisfaction.Score2 | 0.0 | 974 | 0.0000 | 1.000 |
| Satisfaction.Score3 | -22.2 | 584 | -0.0380 | 0.970 |
| Satisfaction.Score4 | -41.1 | 719 | -0.0572 | 0.954 |
| Satisfaction.Score5 | -41.1 | 784 | -0.0525 | 0.958 |
Exponentiate both sides of equation to get growth and decay factors…
#exponentiate both sides of the equation
expcoeff <- exp(coef(sat_logit_model))
xkabledply(as.table(expcoeff), title='Growth and Decay Factors After Exponentiation')
| x | |
|---|---|
| (Intercept) | 8.55e+08 |
| Satisfaction.Score2 | 1.00e+00 |
| Satisfaction.Score3 | 0.00e+00 |
| Satisfaction.Score4 | 0.00e+00 |
| Satisfaction.Score5 | 0.00e+00 |
Exponentiated Equation Satisfaction
\(\frac{p}{q} = 854535380 * (1)^{sat2} * (2.2452e-10)^{sat3} * (1.36943e-18)^{sat4} * (1.36943e-18)^{sat5}\)
Predictions: Satisfaction Model
#Create new DF of 5 rows, one for each level of satisfaction in order to make predictions
pred_df <- data.frame(Satisfaction.Score=as.factor(c(1,2,3,4,5)))
#Determine likelihood of Churn using Satisfaction Model
pred_response <- predict(sat_logit_model, newdata = pred_df, type = "response")
#Create prediction response df with results of model calculations
pred_response_df <- data.frame(pred_response)
#Clean up formatting of DF
pred_response_df <- pred_response_df %>%
format(scientific=FALSE) %>%
cbind(newColName = rownames(pred_response_df), pred_response_df) %>%
select(c(2,3)) %>%
mutate(
Prediction=ifelse(pred_response > 0.5, "Churn", "Non-Churn")
)
colnames(pred_response_df) <- c("Satisfaction Score", "Churn Likelihood", "Prediction")
#display DF as table
xkabledply(pred_response_df, title="Churn Predictions Using Only Satisfaction Score and a Cut Off of 0.5")
| Satisfaction Score | Churn Likelihood | Prediction |
|---|---|---|
| 1 | 1.000 | Churn |
| 2 | 1.000 | Churn |
| 3 | 0.161 | Non-Churn |
| 4 | 0.000 | Non-Churn |
| 5 | 0.000 | Non-Churn |
Coefficient P-values: Satisfaction Model
All p-values are high
Confusion Matrix: Satisfaction Model
#Create and display the confusion matrix
confusion_matrix_sat <- xkabledply(confusion_matrix(sat_logit_model), title = "Confusion Matrix for Satisfcation Logistic Model" )
confusion_matrix_sat
| Predicted No | Predicted Yes | Total | |
|---|---|---|---|
| Actual No | 5174 | 0 | 5174 |
| Actual Yes | 429 | 1440 | 1869 |
| Total | 5603 | 1440 | 7043 |
#Turn from kable into data frame (was having trouble accessing elements as a kable)
confusion_matrix_sat_df <- data.frame(confusion_matrix(sat_logit_model))
#confusion_matrix_sat_df
accuracy <- (confusion_matrix_sat_df[1,1] + confusion_matrix_sat_df[2,2])/confusion_matrix_sat_df[3,3]
precision <- confusion_matrix_sat_df[2,2]/(confusion_matrix_sat_df[2,2] + confusion_matrix_sat_df[1,2])
recall_rate <- confusion_matrix_sat_df[2,2]/(confusion_matrix_sat_df[2,2] + confusion_matrix_sat_df[2,1])
specificity <- confusion_matrix_sat_df[1,1]/(confusion_matrix_sat_df[1,1] + confusion_matrix_sat_df[1,2])
f1_score <- 2*(precision)*(recall_rate)/(precision + recall_rate)
# 1. **Accuracy:** *(TP + TN)/Total*
# * `r (confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3]`
# * The model correctly predicts survived or died `r round((confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3], digits=3)*100`% of the time.
# 2. **Precision:** *TP/(TP + FP)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2])`
# * Of the values that are predicted true by the model, `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them actually are true.
# 3. **Recall Rate:** *TP/(TP + FN)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1])`
# * Of the values that actually are true, the model correctly predicts `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1]), digits=3)*100`% of them as true.
# 4. **Specificity:** *TN/(TN + FP)*
# * `r confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2])`
# * Of the values that are actually false, the model correctly predicts `r round(confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them as false.
# 5. **F1 Score:** *2(Precision)(Recall)/(Precision + Recall)*
# * `r 2*(0.753)*(0.714)/(0.753 + 0.714)`
# * The harmonic mean of Precision and Recall, a more balanced view of how good the model is.
ROC-AUC: Satisfaction Model
Area under the curve, testing the true positive rate (or recall) against the false positive rate (or specificity).
#Get the probability of survival for each passenger from the model
prob_test=predict(sat_logit_model, type = "response" )
#Create new column in df of predicted probability of survival
filtered_joined_churn_df$prob=prob_test
#Determine the ratio of true-positive-rate/false-positive rate as a curve between 0.5 and 1
roc_data <- roc(Churn.Label~prob, data=filtered_joined_churn_df)
#Plot as graph
plot(roc_data)
#Value of area under the curve, prefer 0.8 or higher.
#auc(roc_data2) # area-under-curve
# unloadPkg("pROC")
Comments
McFadden: Satisfaction Model
#calculate McFadden statistics, the pseudo r^2
sat_pr2 = pR2(sat_logit_model)
fitting null model for pseudo-r2
#sat_pr2[4]
The pseudo \(r^2\) value for the Satisfaction model, using the McFadden method of calculation, is 0.711. This means that 71.1% of the variance seen in the data is accounted for by this model.
AIC/BIC and Deviance: Satisfaction Model
Next, let’s look at a baseline model that uses all potential explanatory factors in the data set, with the exception of Average Monthly Long Distance Charges and Total Extra Data charges, which were shown in the EDA to not have statistical significance between their different levels.
#Everything model - those factors dropped in EDA
#Senior.Citizen + Partner + Dependents + Tenure.Months + Phone.Service + Internet.Service + Online.Security + Online.Backup + Tech.Support + Contract + Paperless.Billing + Payment.Method + Monthly.Charges + Avg.Monthly.GB.Download + Unlimited.Data
#NOT USING:
#Average Monthly Long Distance
#Total Extra Data Charges
#Satisfaction Score
all_logit_model <- glm(Churn.Label ~ Senior.Citizen + Partner + Dependents + Tenure.Months + Phone.Service + Internet.Service + Online.Security + Online.Backup + Tech.Support + Contract + Paperless.Billing + Payment.Method + Monthly.Charges + Avg.Monthly.GB.Download + Unlimited.Data, data=filtered_joined_churn_df, family="binomial")
#summary(all_logit_model)
xkabledply(all_logit_model, title="Churn.Label ~ All Factors")
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| (Intercept) | -0.7084 | 0.2117 | -3.346 | 0.0008 |
| Senior.CitizenYes | 0.0903 | 0.0866 | 1.043 | 0.2970 |
| PartnerYes | 0.2674 | 0.0758 | 3.529 | 0.0004 |
| DependentsYes | -1.5157 | 0.1195 | -12.684 | 0.0000 |
| Tenure.Months | -0.0342 | 0.0024 | -14.511 | 0.0000 |
| Phone.ServiceYes | -0.9433 | 0.1504 | -6.272 | 0.0000 |
| Internet.ServiceFiber optic | 0.2196 | 0.1500 | 1.464 | 0.1432 |
| Internet.ServiceNo | -0.1758 | 0.2106 | -0.835 | 0.4038 |
| Online.SecurityYes | -0.4633 | 0.0879 | -5.273 | 0.0000 |
| Online.BackupYes | -0.2219 | 0.0808 | -2.748 | 0.0060 |
| Tech.SupportYes | -0.4741 | 0.0915 | -5.183 | 0.0000 |
| ContractOne year | -0.7284 | 0.1076 | -6.769 | 0.0000 |
| ContractTwo year | -1.3590 | 0.1760 | -7.721 | 0.0000 |
| Paperless.BillingYes | 0.3410 | 0.0755 | 4.515 | 0.0000 |
| Payment.MethodCredit card (automatic) | -0.0808 | 0.1158 | -0.698 | 0.4855 |
| Payment.MethodElectronic check | 0.3126 | 0.0963 | 3.247 | 0.0012 |
| Payment.MethodMailed check | 0.0004 | 0.1157 | 0.003 | 0.9976 |
| Monthly.Charges | 0.0259 | 0.0041 | 6.358 | 0.0000 |
| Avg.Monthly.GB.Download | -0.0014 | 0.0019 | -0.724 | 0.4688 |
| Unlimited.DataYes | -0.0458 | 0.1001 | -0.458 | 0.6469 |
Exponentiate both sides of equation to get growth and decay factors…
#exponentiate both sides of the equation
expcoeff <- exp(coef(all_logit_model))
xkabledply(as.table(expcoeff), title='Growth and Decay Factors After Exponentiation')
| x | |
|---|---|
| (Intercept) | 0.492 |
| Senior.CitizenYes | 1.095 |
| PartnerYes | 1.307 |
| DependentsYes | 0.220 |
| Tenure.Months | 0.966 |
| Phone.ServiceYes | 0.389 |
| Internet.ServiceFiber optic | 1.246 |
| Internet.ServiceNo | 0.839 |
| Online.SecurityNo internet service | NA |
| Online.SecurityYes | 0.629 |
| Online.BackupNo internet service | NA |
| Online.BackupYes | 0.801 |
| Tech.SupportNo internet service | NA |
| Tech.SupportYes | 0.623 |
| ContractOne year | 0.483 |
| ContractTwo year | 0.257 |
| Paperless.BillingYes | 1.406 |
| Payment.MethodCredit card (automatic) | 0.922 |
| Payment.MethodElectronic check | 1.367 |
| Payment.MethodMailed check | 1.000 |
| Monthly.Charges | 1.026 |
| Avg.Monthly.GB.Download | 0.999 |
| Unlimited.DataYes | 0.955 |
Exponentiated Equation
\(\frac{p}{q} = 0.492441 * (1.09454)^{senior} * (1.30655)^{partner} * (0.219654)^{dependents} * (0.96634)^{tenure} * (0.389351)^{phone}\)
\(* (1.24563)^{fiberOptic} * (0.838754)^{noInt} * (0.629209)^{oSecurity} * (0.800973)^{oBackup} * (0.622464)^{techSup} * (0.482686)^{cont1yr}\)
\(* (0.25691)^{cont2yr} * (1.40631)^{paperless} * (0.922403)^{pmAutoCC} * (1.36695)^{pmEcheck} * (1.00035)^{mailCheck}\)
\(* (1.02627)^{monthlyCharge} * (0.998611)^{monthlyGB} * (0.955201)^{unlimited}\)
Coefficient P-values: Full Model
High p-values for the following factors:
Confusion Matrix: Full Model
#Create and display the confusion matrix
confusion_matrix_all <- xkabledply(confusion_matrix(all_logit_model), title = "Confusion Matrix for Full Logistic Model, Cut Off of 0.5" )
confusion_matrix_all
| Predicted No | Predicted Yes | Total | |
|---|---|---|---|
| Actual No | 4633 | 541 | 5174 |
| Actual Yes | 795 | 1074 | 1869 |
| Total | 5428 | 1615 | 7043 |
#Turn from kable into data frame (was having trouble accessing elements as a kable)
confusion_matrix_all_df <- data.frame(confusion_matrix(all_logit_model))
#confusion_matrix_all_df
accuracy <- (confusion_matrix_all_df[1,1] + confusion_matrix_all_df[2,2])/confusion_matrix_all_df[3,3]
precision <- confusion_matrix_all_df[2,2]/(confusion_matrix_all_df[2,2] + confusion_matrix_all_df[1,2])
recall_rate <- confusion_matrix_all_df[2,2]/(confusion_matrix_all_df[2,2] + confusion_matrix_all_df[2,1])
specificity <- confusion_matrix_all_df[1,1]/(confusion_matrix_all_df[1,1] + confusion_matrix_all_df[1,2])
f1_score <- 2*(precision)*(recall_rate)/(precision + recall_rate)
# 1. **Accuracy:** *(TP + TN)/Total*
# * `r (confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3]`
# * The model correctly predicts survived or died `r round((confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3], digits=3)*100`% of the time.
# 2. **Precision:** *TP/(TP + FP)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2])`
# * Of the values that are predicted true by the model, `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them actually are true.
# 3. **Recall Rate:** *TP/(TP + FN)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1])`
# * Of the values that actually are true, the model correctly predicts `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1]), digits=3)*100`% of them as true.
# 4. **Specificity:** *TN/(TN + FP)*
# * `r confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2])`
# * Of the values that are actually false, the model correctly predicts `r round(confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them as false.
# 5. **F1 Score:** *2(Precision)(Recall)/(Precision + Recall)*
# * `r 2*(0.753)*(0.714)/(0.753 + 0.714)`
# * The harmonic mean of Precision and Recall, a more balanced view of how good the model is.
ROC-AUC: Full Model
Area under the curve, testing the true positive rate (or recall) against the false positive rate (or specificity).
#Get the probability of survival for each passenger from the model
prob_test=predict(all_logit_model, type = "response" )
#Create new column in df of predicted probability of survival
filtered_joined_churn_df$prob=prob_test
#Determine the ratio of true-positive-rate/false-positive rate as a curve between 0.5 and 1
roc_data <- roc(Churn.Label~prob, data=filtered_joined_churn_df)
#Plot as graph
plot(roc_data)
#Value of area under the curve, prefer 0.8 or higher.
#auc(roc_data2) # area-under-curve
# unloadPkg("pROC")
Comments
McFadden: Full Model
#calculate McFadden statistics, the pseudo r^2
all_pr2 = pR2(all_logit_model)
fitting null model for pseudo-r2
#all_pr2[4]
The pseudo \(r^2\) value for the full model, using the McFadden method of calculation, is 0.305. This means that 30.5% of the variance seen in the data is accounted for by this model.
AIC/BIC and Deviance: Full Model
To build a better model for the final version, we will drop those variables that had high p-values in the full model. After some experiementation based on dropping high p-value factors, the final model includes the following factors:
DependentsPhone ServiceContractMonthly ChargesMonthly GB# #Attempt Lasso regression for numeric variables -- encountered code errors that looked too involved to figure out
# full_mod_sel <- filtered_joined_churn_df %>%
# select(c(4,13,14,19)) #18?
#
# #x <- as.matrix(full_mod_sel[,-3]) # all X vars
# x <- full_mod_sel[, -3]
# y <- as.double(as.matrix(ifelse(full_mod_sel[, 4]=='Yes', 1, 0))) # Only Churn.Label
#
# # Fit the LASSO model (Lasso: Alpha = 1)
# set.seed(100)
# cv.lasso <- cv.glmnet(x, y, family='binomial', alpha=1, parallel=TRUE, standardize=TRUE, type.measure='auc')
#
# # Results
# plot(cv.lasso)
#Factors
#Senior.Citizen + Partner + Dependents + Tenure.Months + Phone.Service + Internet.Service + Online.Security + Online.Backup + Tech.Support + Contract + Paperless.Billing + Payment.Method + Monthly.Charges + Avg.Monthly.GB.Download + Unlimited.Data
#Drop senior, type of internet, type of payment, Monthly GB, unlimited (online security, online backup, tech support, unlimited data)
#+ Tenure.Months, partner
#Build model
final_logit_model <- glm(Churn.Label ~ Dependents + Phone.Service + Contract + Monthly.Charges + Avg.Monthly.GB.Download, data=filtered_joined_churn_df, family="binomial")
#summary(final_logit_model)
xkabledply(final_logit_model, title="Churn.Label ~ All Factors")
| Estimate | Std. Error | z value | Pr(>|z|) | |
|---|---|---|---|---|
| (Intercept) | -0.8853 | 0.1151 | -7.688 | 0.000 |
| DependentsYes | -1.6083 | 0.1114 | -14.431 | 0.000 |
| Phone.ServiceYes | -0.5660 | 0.1154 | -4.906 | 0.000 |
| ContractOne year | -1.8305 | 0.0922 | -19.850 | 0.000 |
| ContractTwo year | -3.2073 | 0.1530 | -20.969 | 0.000 |
| Monthly.Charges | 0.0198 | 0.0014 | 14.585 | 0.000 |
| Avg.Monthly.GB.Download | -0.0012 | 0.0017 | -0.714 | 0.475 |
Exponentiate both sides of equation to get growth and decay factors…
#exponentiate both sides of the equation
expcoeff <- exp(coef(final_logit_model))
xkabledply(as.table(expcoeff), title='Growth and Decay Factors After Exponentiation')
| x | |
|---|---|
| (Intercept) | 0.4126 |
| DependentsYes | 0.2002 |
| Phone.ServiceYes | 0.5678 |
| ContractOne year | 0.1603 |
| ContractTwo year | 0.0405 |
| Monthly.Charges | 1.0200 |
| Avg.Monthly.GB.Download | 0.9988 |
Exponentiated Equation
\(\frac{p}{q} = 0.412593 * (0.200223)^{dependents} * (0.567806)^{phone} * (0.160328)^{cont1yr}\)
\(* (0.0404644)^{cont2yr} * (1.01996)^{monthlyCharge} * (0.998789)^{monthlyGB}\)
Coefficient P-values: Final Model
All p-values low.
Confusion Matrix: Final Model, 0.5 Cut Off
#Create and display the confusion matrix
confusion_matrix_final <- xkabledply(confusion_matrix(final_logit_model), title = "Confusion Matrix for Final Logistic Model With a Cut Off Value of 0.5" )
confusion_matrix_final
| Predicted No | Predicted Yes | Total | |
|---|---|---|---|
| Actual No | 4466 | 708 | 5174 |
| Actual Yes | 925 | 944 | 1869 |
| Total | 5391 | 1652 | 7043 |
#Turn from kable into data frame (was having trouble accessing elements as a kable)
confusion_matrix_final_df <- data.frame(confusion_matrix(final_logit_model))
#confusion_matrix_final_df
accuracy <- (confusion_matrix_final_df[1,1] + confusion_matrix_final_df[2,2])/confusion_matrix_final_df[3,3]
precision <- confusion_matrix_final_df[2,2]/(confusion_matrix_final_df[2,2] + confusion_matrix_final_df[1,2])
recall_rate <- confusion_matrix_final_df[2,2]/(confusion_matrix_final_df[2,2] + confusion_matrix_final_df[2,1])
specificity <- confusion_matrix_final_df[1,1]/(confusion_matrix_final_df[1,1] + confusion_matrix_final_df[1,2])
f1_score <- 2*(precision)*(recall_rate)/(precision + recall_rate)
# 1. **Accuracy:** *(TP + TN)/Total*
# * `r (confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3]`
# * The model correctly predicts survived or died `r round((confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3], digits=3)*100`% of the time.
# 2. **Precision:** *TP/(TP + FP)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2])`
# * Of the values that are predicted true by the model, `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them actually are true.
# 3. **Recall Rate:** *TP/(TP + FN)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1])`
# * Of the values that actually are true, the model correctly predicts `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1]), digits=3)*100`% of them as true.
# 4. **Specificity:** *TN/(TN + FP)*
# * `r confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2])`
# * Of the values that are actually false, the model correctly predicts `r round(confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them as false.
# 5. **F1 Score:** *2(Precision)(Recall)/(Precision + Recall)*
# * `r 2*(0.753)*(0.714)/(0.753 + 0.714)`
# * The harmonic mean of Precision and Recall, a more balanced view of how good the model is.
ROC-AUC: Final Model
Area under the curve, testing the true positive rate (or recall) against the false positive rate (or specificity)
#Get the probability of survival for each passenger from the model
prob_test=predict(final_logit_model, type = "response" )
#Create new column in df of predicted probability of survival
filtered_joined_churn_df$prob=prob_test
#Determine the ratio of true-positive-rate/false-positive rate as a curve between 0.5 and 1
roc_data <- roc(Churn.Label~prob, data=filtered_joined_churn_df)
#Plot as graph
plot(roc_data)
#Value of area under the curve, prefer 0.8 or higher.
#auc(roc_data2) # area-under-curve
# unloadPkg("pROC")
Comments
McFadden: Final Model
#calculate McFadden statistics, the pseudo r^2
final_pr2 = pR2(final_logit_model)
fitting null model for pseudo-r2
#final_pr2[4]
The pseudo \(r^2\) value for the final model, using the McFadden method of calculation, is 0.243. This means that 24.3% of the variance seen in the data is accounted for by this model.
AIC/BIC and Deviance: Final Model
Confusion Matrix: Final Model, 0.26 Cut Off
############################################################
#HELP WITH MAKING CUTOFF
# The code for the logistic regression model and the predictions is given below
#log_model_full <- glm(loan_status ~ ., family = "binomial", data = training_set)
#predictions_all_full <- predict(log_model_full, newdata = test_set, type = "response")
# Make a binary predictions-vector using a cut-off of 15%
#pred_cutoff_15 <- ifelse(predictions_all_full > 0.15, 1, 0)
# Construct a confusion matrix
#table(test_set$loan_status, pred_cutoff_15)
############################################################
#0.361
#0.263
#0.265
#0.28 -- 80% recall
#Get actual values of churn/non-churn
predictions_final_model <- final_logit_model$fitted.values
#Get predictions based off of selected cut off value
pred_cutoff <- ifelse(predictions_final_model > 0.26, 1, 0)
############################################################
#Turn from kable into data frame (was having trouble accessing elements as a kable)
confusion_matrix_cutoff_df <- data.frame(table(filtered_joined_churn_df$Churn.Label, pred_cutoff))
#Convert format of df to matrix style
confusion_matrix_cutoff_df <- reshape(confusion_matrix_cutoff_df, idvar = "Var1", timevar = "pred_cutoff", direction = "wide")
#Drop first column and add totals column
confusion_matrix_cutoff_df <- select(confusion_matrix_cutoff_df, c(2,3)) %>%
mutate(
Total=Freq.0+Freq.1
)
#Add total row to capture prediction totals
confusion_matrix_cutoff_df[nrow(confusion_matrix_cutoff_df)+ 1 , ] <- c(sum(confusion_matrix_cutoff_df$Freq.0), sum(confusion_matrix_cutoff_df$Freq.1), sum(confusion_matrix_cutoff_df$Total))
#Add Column at the start of DF to identify the actual values
Index=c("Actual No", "Actual Yes", "Total")
confusion_matrix_cutoff_df <- cbind(Index, confusion_matrix_cutoff_df)
#Rename columns to identify predicted values
colnames(confusion_matrix_cutoff_df) <- c("", "Predicted No", "Predicted Yes", "Total")
#Display Cut Off Confusion Matrix
xkabledply(confusion_matrix_cutoff_df, title="Confusion Matrix for Final Model With a Cut Off Value of 0.26")
| Predicted No | Predicted Yes | Total | |
|---|---|---|---|
| Actual No | 3693 | 1481 | 5174 |
| Actual Yes | 382 | 1487 | 1869 |
| Total | 4075 | 2968 | 7043 |
########################
#Second value is column
#calculate matric scores with custom cut off
accuracy <- (confusion_matrix_cutoff_df[1,2] + confusion_matrix_cutoff_df[2,3])/confusion_matrix_cutoff_df[3,4]
precision <- confusion_matrix_cutoff_df[2,3]/(confusion_matrix_cutoff_df[2,3] + confusion_matrix_cutoff_df[1,3])
recall_rate <- confusion_matrix_cutoff_df[2,3]/(confusion_matrix_cutoff_df[2,3] + confusion_matrix_cutoff_df[2,2])
specificity <- confusion_matrix_cutoff_df[1,2]/(confusion_matrix_cutoff_df[1,2] + confusion_matrix_cutoff_df[1,3])
f1_score <- 2*(precision)*(recall_rate)/(precision + recall_rate)
# cat('\naccuracy: ', accuracy, '\n')
# cat('precision: ', precision, '\n')
# cat('recal: ', recall_rate, '\n')
# cat('specificity: ', specificity, '\n')
# cat('f1: ', f1_score, '\n')
#accuracy
#precision
#recall_rate
#specificity
#f1_score
Log Loss, Regular and Brier Score
#Grab only Actual Churn and Predicted Churn
logloss_df <- filtered_joined_churn_df %>%
select(c(14, 22)) %>%
mutate(
corrected_prob=ifelse(Churn.Label=="Yes", prob, 1-prob) #Correct pred based on actual churn
)
xkabledply(head(logloss_df), title="Example of Model Predictions Corrected for Log Loss Calculation")
| Churn.Label | prob | corrected_prob |
|---|---|---|
| No | 0.119 | 0.881 |
| No | 0.431 | 0.569 |
| Yes | 0.493 | 0.493 |
| Yes | 0.618 | 0.618 |
| Yes | 0.548 | 0.548 |
| No | 0.145 | 0.855 |
logloss_value <- LogLoss(y_pred= final_logit_model$fitted.values, y_true=churn_df$Churn.Value)
#logloss_value
#Get the probability of survival for each passenger from the model
prob_test=predict(sat_logit_model, type = "response" )
#Create new column in df of predicted probability of survival
churn_df$prob=prob_test
#brier_test <- brierscore(data=churn_df, Churn.Value ~ prob)
brier_test <- BrierScore(churn_df$Churn.Value, final_logit_model$fitted.values)
#brier_test
When the regular Log Loss function is used, the result is:
When the Brier Test Method is used for an unbalanced Data Set, the result is:
Nearly 89% of customers who churned were on month-to-month contracts. It is suggested that the company offer an additional $10 off monthly charges for customers on month-to-month contracts who would be willing to sign a 2-year contract.
For a customer who pays the avg Monthly Charge of ~$65 and who has no other mitigating factors, this would reduce the risk of churning from about ~60% to about ~5%.
Assuming the ideal scenario were all would-be churners who have month-to-month contracts take the deal and sign a 2-year contract, this theoretically reduces the number of customers who churn in this group by about ~91.7%.
This would result in the following monetary benefits to the company:
From the EDA, features we’ve found out will be useful and will be used in random forest model:
Categorical: Partner Tech.Support Online.Security Online.Backup Paperless.Billing Payment.Method Senior Citizen Contract Type Satisfaction Score Dependents
Continuous: Tenure Avg. Monthly Charges Total.Months
clean categorical feature
From the previous EDA, we know that there are some categorical features that have ‘No’ and ‘No Internet Service’ or ‘No Phone Service’ as a category, we can make them as ‘No’ and clean these features.
tg <- data.frame(lapply(filtered_joined_churn_df, function(x){ gsub("No internet service", "No", x)}))
tg <- data.frame(lapply(filtered_joined_churn_df, function(x){ gsub("No phone service", "No", x)}))
glimpse(tg)
## Rows: 7,043
## Columns: 22
## $ Senior.Citizen <chr> "No", "No", "No", "Yes", "Yes", "No"~
## $ Partner <chr> "Yes", "No", "No", "Yes", "Yes", "No~
## $ Dependents <chr> "No", "No", "No", "No", "No", "Yes",~
## $ Tenure.Months <chr> "9", "9", "4", "13", "3", "9", "71",~
## $ Phone.Service <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "~
## $ Internet.Service <chr> "DSL", "DSL", "Fiber optic", "Fiber ~
## $ Online.Security <chr> "No", "No", "No", "No", "No", "No", ~
## $ Online.Backup <chr> "Yes", "No", "No", "Yes", "No", "No"~
## $ Tech.Support <chr> "Yes", "No", "No", "No", "Yes", "Yes~
## $ Contract <chr> "One year", "Month-to-month", "Month~
## $ Paperless.Billing <chr> "Yes", "No", "Yes", "Yes", "Yes", "Y~
## $ Payment.Method <chr> "Mailed check", "Mailed check", "Ele~
## $ Monthly.Charges <chr> "65.6", "59.9", "73.9", "98", "83.9"~
## $ Churn.Label <chr> "No", "No", "Yes", "Yes", "Yes", "No~
## $ Churn.Reason <chr> NA, NA, "Price too high", "Product d~
## $ Satisfaction.Score <chr> "3", "5", "1", "1", "1", "3", "3", "~
## $ Churn.Category <chr> NA, NA, "Competitor", "Dissatisfacti~
## $ Avg.Monthly.Long.Distance.Charges <chr> "42.39", "10.69", "33.65", "27.82", ~
## $ Avg.Monthly.GB.Download <chr> "16", "10", "30", "4", "11", "73", "~
## $ Unlimited.Data <chr> "Yes", "No", "Yes", "Yes", "Yes", "Y~
## $ Total.Extra.Data.Charges <chr> "0", "10", "0", "0", "0", "0", "0", ~
## $ prob <chr> "0.118715221855491", "0.430575377275~
Standardizing Continuous features
Continuous festures: Tenure Avg. Monthly Charges Total.Months
num_columns <- c("Tenure.Months", "Monthly.Charges", "Total.Extra.Data.Charges")
tg[num_columns] <- sapply(tg[num_columns], as.numeric)
tg_int <- tg[,c("Tenure.Months", "Monthly.Charges", "Total.Extra.Data.Charges")]
tg_int <- data.frame(scale(tg_int))
head(tg_int,5)
## Tenure.Months Monthly.Charges Total.Extra.Data.Charges
## 1 -0.952 0.0279 -0.273
## 2 -0.952 -0.1616 0.125
## 3 -1.155 0.3037 -0.273
## 4 -0.789 1.1046 -0.273
## 5 -1.196 0.6360 -0.273
Create discrete options of features Tenure is highy correlate with target variable, therefore I am trying to create a discrete feature from tenure, where I have made different bins of tenure(which is in months) such as ‘0-1 year’, ‘2-3 years’, ‘3-4 years’ etc. to show how will trnure.months actually affect whether customer choose to churn or not churn.
tg <- mutate(tg, tenure_bin = Tenure.Months)
tg$tenure_bin[tg$tenure_bin >=0 & tg$tenure_bin <= 12] <- '0-1 year'
tg$tenure_bin[tg$tenure_bin > 12 & tg$tenure_bin <= 24] <- '1-2 years'
tg$tenure_bin[tg$tenure_bin > 24 & tg$tenure_bin <= 36] <- '2-3 years'
tg$tenure_bin[tg$tenure_bin > 36 & tg$tenure_bin <= 48] <- '3-4 years'
tg$tenure_bin[tg$tenure_bin > 48 & tg$tenure_bin <= 60] <- '4-5 years'
tg$tenure_bin[tg$tenure_bin > 60 & tg$tenure_bin <= 72] <- '5-6 years'
tg$tenure_bin <- as.factor(tg$tenure_bin)
After checking the distribution of data in each tenure bin, we found that maximum number of customers have a tenure of either 0-1 years and followed by 5-6 years.
options(repr.plot.width =6, repr.plot.height = 3)
ggplot(tg, aes(tenure_bin, fill = tenure_bin)) + geom_bar()
Create dummy
tg_c1 <- tg[,c("Paperless.Billing", "Payment.Method","tenure_bin", "Tech.Support","Online.Backup","Contract","Internet.Service","Partner","Senior.Citizen","Dependents","Churn.Label")]
dummy<- data.frame(sapply(tg_c1,function(x) data.frame(model.matrix(~x-1,data =tg_c1))[,-1]))
tail(dummy, n=5)
## Paperless.Billing Payment.Method.xCredit.card..automatic.
## 7039 0 0
## 7040 1 0
## 7041 1 0
## 7042 0 0
## 7043 0 0
## Payment.Method.xElectronic.check Payment.Method.xMailed.check
## 7039 0 1
## 7040 1 0
## 7041 0 1
## 7042 0 1
## 7043 1 0
## tenure_bin.x1.2.years tenure_bin.x2.3.years tenure_bin.x3.4.years
## 7039 1 0 0
## 7040 1 0 0
## 7041 0 0 0
## 7042 0 0 0
## 7043 0 0 0
## tenure_bin.x4.5.years tenure_bin.x5.6.years
## 7039 0 0
## 7040 0 0
## 7041 0 0
## 7042 0 1
## 7043 0 1
## Tech.Support.xNo.internet.service Tech.Support.xYes
## 7039 0 1
## 7040 0 0
## 7041 0 0
## 7042 0 1
## 7043 0 0
## Online.Backup.xNo.internet.service Online.Backup.xYes Contract.xOne.year
## 7039 0 0 1
## 7040 0 0 0
## 7041 0 1 0
## 7042 0 0 0
## 7043 0 1 0
## Contract.xTwo.year Internet.Service.xFiber.optic Internet.Service.xNo
## 7039 0 0 0
## 7040 0 1 0
## 7041 0 0 0
## 7042 1 0 0
## 7043 1 0 0
## Partner Senior.Citizen Dependents Churn.Label
## 7039 0 0 0 0
## 7040 1 0 0 1
## 7041 0 0 0 0
## 7042 1 0 0 0
## 7043 1 0 0 0
Create final dataset
Creating the final dataset by combining the numeric and dummy data frames.
#Combining the data
tg_final <- cbind(tg_int,dummy)
head(tg_final)
## Tenure.Months Monthly.Charges Total.Extra.Data.Charges Paperless.Billing
## 1 -0.952 0.0279 -0.273 1
## 2 -0.952 -0.1616 0.125 0
## 3 -1.155 0.3037 -0.273 1
## 4 -0.789 1.1046 -0.273 1
## 5 -1.196 0.6360 -0.273 1
## 6 -0.952 0.1541 -0.273 1
## Payment.Method.xCredit.card..automatic. Payment.Method.xElectronic.check
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 0
## 6 1 0
## Payment.Method.xMailed.check tenure_bin.x1.2.years tenure_bin.x2.3.years
## 1 1 0 0
## 2 1 0 0
## 3 0 0 0
## 4 0 1 0
## 5 1 0 0
## 6 0 0 0
## tenure_bin.x3.4.years tenure_bin.x4.5.years tenure_bin.x5.6.years
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Tech.Support.xNo.internet.service Tech.Support.xYes
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 1
## Online.Backup.xNo.internet.service Online.Backup.xYes Contract.xOne.year
## 1 0 1 1
## 2 0 0 0
## 3 0 0 0
## 4 0 1 0
## 5 0 0 0
## 6 0 0 0
## Contract.xTwo.year Internet.Service.xFiber.optic Internet.Service.xNo Partner
## 1 0 0 0 1
## 2 0 0 0 0
## 3 0 1 0 0
## 4 0 1 0 1
## 5 0 1 0 1
## 6 0 0 0 0
## Senior.Citizen Dependents Churn.Label
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 1 0 1
## 5 1 0 1
## 6 0 1 0
tg_final$Churn.Label<- as.factor(tg_final$Churn.Label)
head(tg_final,5)
## Tenure.Months Monthly.Charges Total.Extra.Data.Charges Paperless.Billing
## 1 -0.952 0.0279 -0.273 1
## 2 -0.952 -0.1616 0.125 0
## 3 -1.155 0.3037 -0.273 1
## 4 -0.789 1.1046 -0.273 1
## 5 -1.196 0.6360 -0.273 1
## Payment.Method.xCredit.card..automatic. Payment.Method.xElectronic.check
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 1
## 5 0 0
## Payment.Method.xMailed.check tenure_bin.x1.2.years tenure_bin.x2.3.years
## 1 1 0 0
## 2 1 0 0
## 3 0 0 0
## 4 0 1 0
## 5 1 0 0
## tenure_bin.x3.4.years tenure_bin.x4.5.years tenure_bin.x5.6.years
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## Tech.Support.xNo.internet.service Tech.Support.xYes
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## Online.Backup.xNo.internet.service Online.Backup.xYes Contract.xOne.year
## 1 0 1 1
## 2 0 0 0
## 3 0 0 0
## 4 0 1 0
## 5 0 0 0
## Contract.xTwo.year Internet.Service.xFiber.optic Internet.Service.xNo Partner
## 1 0 0 0 1
## 2 0 0 0 0
## 3 0 1 0 0
## 4 0 1 0 1
## 5 0 1 0 1
## Senior.Citizen Dependents Churn.Label
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 1 0 1
## 5 1 0 1
Split data
Splitting data into train and validation
#Splitting the data
library(randomForest)
set.seed(123)
indices = sample.split(tg_final$Churn.Label, SplitRatio = 0.7)
train = tg_final[indices,]
validation = tg_final[!(indices),]
xkablesummary(train)
| Tenure.Months | Monthly.Charges | Total.Extra.Data.Charges | Paperless.Billing | Payment.Method.xCredit.card..automatic. | Payment.Method.xElectronic.check | Payment.Method.xMailed.check | tenure_bin.x1.2.years | tenure_bin.x2.3.years | tenure_bin.x3.4.years | tenure_bin.x4.5.years | tenure_bin.x5.6.years | Tech.Support.xNo.internet.service | Tech.Support.xYes | Online.Backup.xNo.internet.service | Online.Backup.xYes | Contract.xOne.year | Contract.xTwo.year | Internet.Service.xFiber.optic | Internet.Service.xNo | Partner | Senior.Citizen | Dependents | Churn.Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Min. :-1.318 | Min. :-1.546 | Min. :-0.27 | Min. :0.000 | Min. :0.00 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.00 | 0:3622 |
| Q1 | 1st Qu.:-0.952 | 1st Qu.:-0.965 | 1st Qu.:-0.27 | 1st Qu.:0.000 | 1st Qu.:0.00 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.00 | 1:1308 |
| Median | Median :-0.178 | Median : 0.186 | Median :-0.27 | Median :1.000 | Median :0.00 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.00 | NA |
| Mean | Mean :-0.010 | Mean : 0.001 | Mean : 0.00 | Mean :0.589 | Mean :0.22 | Mean :0.328 | Mean :0.236 | Mean :0.151 | Mean :0.123 | Mean :0.103 | Mean :0.114 | Mean :0.199 | Mean :0.217 | Mean :0.291 | Mean :0.217 | Mean :0.348 | Mean :0.207 | Mean :0.238 | Mean :0.442 | Mean :0.217 | Mean :0.489 | Mean :0.165 | Mean :0.23 | NA |
| Q3 | 3rd Qu.: 0.921 | 3rd Qu.: 0.834 | 3rd Qu.:-0.27 | 3rd Qu.:1.000 | 3rd Qu.:0.00 | 3rd Qu.:1.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:1.000 | 3rd Qu.:0.000 | 3rd Qu.:1.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:1.000 | 3rd Qu.:0.000 | 3rd Qu.:1.000 | 3rd Qu.:0.000 | 3rd Qu.:0.00 | NA |
| Max | Max. : 1.614 | Max. : 1.794 | Max. : 5.70 | Max. :1.000 | Max. :1.00 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.00 | NA |
xkablesummary(validation)
| Tenure.Months | Monthly.Charges | Total.Extra.Data.Charges | Paperless.Billing | Payment.Method.xCredit.card..automatic. | Payment.Method.xElectronic.check | Payment.Method.xMailed.check | tenure_bin.x1.2.years | tenure_bin.x2.3.years | tenure_bin.x3.4.years | tenure_bin.x4.5.years | tenure_bin.x5.6.years | Tech.Support.xNo.internet.service | Tech.Support.xYes | Online.Backup.xNo.internet.service | Online.Backup.xYes | Contract.xOne.year | Contract.xTwo.year | Internet.Service.xFiber.optic | Internet.Service.xNo | Partner | Senior.Citizen | Dependents | Churn.Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Min. :-1.318 | Min. :-1.541 | Min. :-0.27 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | Min. :0.000 | 0:1552 |
| Q1 | 1st Qu.:-0.992 | 1st Qu.:-0.984 | 1st Qu.:-0.27 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1st Qu.:0.000 | 1: 561 |
| Median | Median :-0.056 | Median : 0.184 | Median :-0.27 | Median :1.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | Median :0.000 | NA |
| Mean | Mean : 0.024 | Mean :-0.002 | Mean : 0.01 | Mean :0.601 | Mean :0.206 | Mean :0.354 | Mean :0.212 | Mean :0.132 | Mean :0.108 | Mean :0.121 | Mean :0.128 | Mean :0.201 | Mean :0.216 | Mean :0.288 | Mean :0.216 | Mean :0.338 | Mean :0.213 | Mean :0.247 | Mean :0.435 | Mean :0.216 | Mean :0.469 | Mean :0.156 | Mean :0.233 | NA |
| Q3 | 3rd Qu.: 0.962 | 3rd Qu.: 0.835 | 3rd Qu.:-0.27 | 3rd Qu.:1.000 | 3rd Qu.:0.000 | 3rd Qu.:1.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:1.000 | 3rd Qu.:0.000 | 3rd Qu.:1.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | 3rd Qu.:1.000 | 3rd Qu.:0.000 | 3rd Qu.:1.000 | 3rd Qu.:0.000 | 3rd Qu.:0.000 | NA |
| Max | Max. : 1.614 | Max. : 1.776 | Max. : 5.70 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | Max. :1.000 | NA |
Train randomforest model
#Training the RandomForest Model
model.rf <- randomForest(Churn.Label ~ ., data=train, proximity=F,importance = F,ntree=500,mtry=4, do.trace=F, na.action=na.roughfix)
model.rf
##
## Call:
## randomForest(formula = Churn.Label ~ ., data = train, proximity = F, importance = F, ntree = 500, mtry = 4, do.trace = F, na.action = na.roughfix)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 20.1%
## Confusion matrix:
## 0 1 class.error
## 0 3207 415 0.115
## 1 577 731 0.441
The OOB error estimate comes to around 20%, so the model has around 80% out of sample accuracy for the training set. Let’s check the prediction and accuracy on validation data.
testPred <- predict(model.rf, newdata=validation[,-24])
#table(testPred2, validation2$Churn.Label)
confusionMatrix(validation$Churn.Label, testPred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1371 181
## 1 236 325
##
## Accuracy : 0.803
## 95% CI : (0.785, 0.819)
## No Information Rate : 0.761
## P-Value [Acc > NIR] : 2.06e-06
##
## Kappa : 0.478
##
## Mcnemar's Test P-Value : 0.00818
##
## Sensitivity : 0.853
## Specificity : 0.642
## Pos Pred Value : 0.883
## Neg Pred Value : 0.579
## Prevalence : 0.761
## Detection Rate : 0.649
## Detection Prevalence : 0.735
## Balanced Accuracy : 0.748
##
## 'Positive' Class : 0
##
The basic RandomForest model gives an accuracy of 80.3%( almost close enough to the OOB estimate), Sensitivity 85.3% and Specificity 64%.,
Variable Importance Plot
Below is the variable importance plot, that shows the most significant attribute in decreasing order by mean decrease in Gini. The Mean decrease Gini measures how pure the nodes are at the end of the tree. Higher the Gini Index, better is the homogeneity.
#Checking the variable Importance Plot
varImpPlot(model.rf)
Checking the AUC
model.rf.roc <- roc(response = validation$Churn.Label, predictor = as.numeric(testPred))
#print()
#plot(rf.roc, col = "red" , add = TRUE, print.auc.y = 0.85, print.auc = TRUE)
plot(model.rf.roc,col = "red",print.auc.y = 0.85, print.auc = TRUE )
RandomForest:
Accuracy 80.3%, Sensitivity 85.3% Specificity 64%
str(filtered_joined_churn_df)
## 'data.frame': 7043 obs. of 22 variables:
## $ Senior.Citizen : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 2 1 2 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 1 2 2 1 2 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 2 ...
## $ Tenure.Months : num 9 9 4 13 3 9 71 63 7 65 ...
## $ Phone.Service : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Internet.Service : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 2 2 2 1 2 2 1 1 ...
## $ Online.Security : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 1 3 3 3 3 ...
## $ Online.Backup : Factor w/ 3 levels "No","No internet service",..: 3 1 1 3 1 1 3 1 1 3 ...
## $ Tech.Support : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 3 3 3 3 1 3 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 2 1 1 1 1 1 3 3 1 3 ...
## $ Paperless.Billing : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ Payment.Method : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 4 3 3 4 2 1 2 3 4 ...
## $ Monthly.Charges : num 65.6 59.9 73.9 98 83.9 ...
## $ Churn.Label : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 1 1 1 1 1 ...
## $ Churn.Reason : Factor w/ 20 levels "Attitude of service provider",..: NA NA 18 19 15 NA NA NA NA NA ...
## $ Satisfaction.Score : Factor w/ 5 levels "1","2","3","4",..: 3 5 1 1 1 3 3 4 3 3 ...
## $ Churn.Category : Factor w/ 5 levels "Attitude","Competitor",..: NA NA 2 3 3 NA NA NA NA NA ...
## $ Avg.Monthly.Long.Distance.Charges: num 42.39 10.69 33.65 27.82 7.38 ...
## $ Avg.Monthly.GB.Download : num 16 10 30 4 11 73 14 7 21 14 ...
## $ Unlimited.Data : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 1 2 2 ...
## $ Total.Extra.Data.Charges : num 0 10 0 0 0 0 0 20 0 0 ...
## $ prob : num 0.119 0.431 0.493 0.618 0.548 ...
view(filtered_joined_churn_df)
my_data <- data.frame(filtered_joined_churn_df$Senior.Citizen,filtered_joined_churn_df$Churn.Label,filtered_joined_churn_df$Partner,filtered_joined_churn_df$Dependents,filtered_joined_churn_df$Tenure.Months,filtered_joined_churn_df$Phone.Service,filtered_joined_churn_df$Online.Security,filtered_joined_churn_df$Online.Backup,filtered_joined_churn_df$Tech.Support,filtered_joined_churn_df$Paperless.Billing,filtered_joined_churn_df$Unlimited.Data,filtered_joined_churn_df$Avg.Monthly.Long.Distance.Charges,filtered_joined_churn_df$Avg.Monthly.GB.Download,filtered_joined_churn_df$Monthly.Charges)
str(my_data)
## 'data.frame': 7043 obs. of 14 variables:
## $ filtered_joined_churn_df.Senior.Citizen : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 2 1 2 1 ...
## $ filtered_joined_churn_df.Churn.Label : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 1 1 1 1 1 ...
## $ filtered_joined_churn_df.Partner : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 1 2 2 1 2 ...
## $ filtered_joined_churn_df.Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 2 ...
## $ filtered_joined_churn_df.Tenure.Months : num 9 9 4 13 3 9 71 63 7 65 ...
## $ filtered_joined_churn_df.Phone.Service : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ filtered_joined_churn_df.Online.Security : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 1 3 3 3 3 ...
## $ filtered_joined_churn_df.Online.Backup : Factor w/ 3 levels "No","No internet service",..: 3 1 1 3 1 1 3 1 1 3 ...
## $ filtered_joined_churn_df.Tech.Support : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 3 3 3 3 1 3 ...
## $ filtered_joined_churn_df.Paperless.Billing : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ filtered_joined_churn_df.Unlimited.Data : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 1 2 2 ...
## $ filtered_joined_churn_df.Avg.Monthly.Long.Distance.Charges: num 42.39 10.69 33.65 27.82 7.38 ...
## $ filtered_joined_churn_df.Avg.Monthly.GB.Download : num 16 10 30 4 11 73 14 7 21 14 ...
## $ filtered_joined_churn_df.Monthly.Charges : num 65.6 59.9 73.9 98 83.9 ...
### Libraries used
library(caret)
library(class)
## Partition of data
train_sample<-createDataPartition(y = my_data$filtered_joined_churn_df.Churn.Label, p= 0.7, list = FALSE)
train_data<-my_data[train_sample, ]
test_data<-my_data[-train_sample, ]
train_label<-my_data[train_sample,2]
test_label<-my_data[-train_sample,2]
### Linear SVM
svm_linear <- train(filtered_joined_churn_df.Churn.Label ~., data = train_data, method = "svmLinear", trControl=trainControl(method = "repeatedcv", number = 10, repeats = 3), preProcess = c("center", "scale"), tuneLength = 10)
svm_linear
## Support Vector Machines with Linear Kernel
##
## 4931 samples
## 13 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (16), scaled (16)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 4438, 4437, 4437, 4438, 4438, 4438, ...
## Resampling results:
##
## Accuracy Kappa
## 0.802 0.467
##
## Tuning parameter 'C' was held constant at a value of 1
It’s a linear model therefore, it just tested at value “C” =1.
Now, our model is trained with C value as 1. We are ready to predict classes for our test set and check the accuracy of our model using confusion matrix.
### Confusion Matrix
test_pred<-predict(svm_linear,test_data)
confusionMatrix(table(test_pred,test_data$filtered_joined_churn_df.Churn.Label))
## Confusion Matrix and Statistics
##
##
## test_pred No Yes
## No 1381 245
## Yes 171 315
##
## Accuracy : 0.803
## 95% CI : (0.785, 0.82)
## No Information Rate : 0.735
## P-Value [Acc > NIR] : 1.5e-13
##
## Kappa : 0.472
##
## Mcnemar's Test P-Value : 0.000345
##
## Sensitivity : 0.890
## Specificity : 0.562
## Pos Pred Value : 0.849
## Neg Pred Value : 0.648
## Prevalence : 0.735
## Detection Rate : 0.654
## Detection Prevalence : 0.770
## Balanced Accuracy : 0.726
##
## 'Positive' Class : No
##
The output shows that our model accuracy for test set is 79.8%
We can also do some customization for selecting C value(Cost) in Linear classifier.
### griding for obtaining best C value
svm_grid<-train(filtered_joined_churn_df.Churn.Label~., data = train_data, method = "svmLinear", trControl=trainControl("cv", number = 10), tuneGrid = expand.grid(C=seq(0,2,length=20)), preProcess = c("center", "scale"))
# plot model accuracy vs different values of cost
plot(svm_grid)
svm_grid$bestTune
## C
## 5 0.421
test_predgrid<-predict(svm_grid,test_data)
The above plot is showing that our classifier is giving best accuracy on C = 0.2.
checking its accuracy using confusion -matrix.
# Accuracy after finding best parameter
confusionMatrix(table(test_predgrid,test_data$filtered_joined_churn_df.Churn.Label))
## Confusion Matrix and Statistics
##
##
## test_predgrid No Yes
## No 1381 244
## Yes 171 316
##
## Accuracy : 0.804
## 95% CI : (0.786, 0.82)
## No Information Rate : 0.735
## P-Value [Acc > NIR] : 1.01e-13
##
## Kappa : 0.474
##
## Mcnemar's Test P-Value : 0.000409
##
## Sensitivity : 0.890
## Specificity : 0.564
## Pos Pred Value : 0.850
## Neg Pred Value : 0.649
## Prevalence : 0.735
## Detection Rate : 0.654
## Detection Prevalence : 0.769
## Balanced Accuracy : 0.727
##
## 'Positive' Class : No
##
The results of the confusion matrix show that this time the accuracy on the test set is 79.9 %
### ROC Curve
library(pROC)
roc_svm_test <-plot(roc(as.numeric(test_data$filtered_joined_churn_df.Churn.Label),as.numeric(test_pred)),main="Comparaison ",col="#1c61b6")
plot(roc_svm_test, add = TRUE,col = "red", print.auc=TRUE, print.auc.x = 0.5, print.auc.y = 0.3)
legend(0.3, 0.2, legend = c("test-svm"), lty = c(1), col = c("blue"))
The area under the curve(AUC) is 72.4%
Omit NA values
We have also tried to build the decision tree mode.A decision tree is a type of supervised machine learning used to categorize or make predictions based on how a previous set of questions were answered. The model is a form of supervised learning, meaning that the model is trained and tested on a set of data that contains the desired categorization
before building the actual tree , since the data is not so clean, we have tried to clean categorical variables, changing into factors.standardized numerical variables. I have also cleaned the na values we have a used label encoding for categorical variables.
We got the insight from our previous eda that tota extra data charges , avg monthly long dist chanrges has null on whether a customer to churn or not so we removed those variables from the model building.
loadPkg("rpart")
library(caret)
library(tidyr)
install.packages("dplyr")
library("dplyr")
filtered_joined_churn_df %>% na.omit(filtered_joined_churn_df)
filtered_joined_churn_df[complete.cases(filtered_joined_churn_df), ]
filtered_joined_churn_df %>% drop_na()
control1 <- rfeControl(functions = rpart, # random forest
method = "repeatedcv", # repeated cv
repeats = 5, # number of repeats
number = 10) # number of folds
Label encoding the categorical variables
head(filtered_joined_churn_df)
## Senior.Citizen Partner Dependents Tenure.Months Phone.Service
## 1 No Yes No 9 Yes
## 2 No No No 9 Yes
## 3 No No No 4 Yes
## 4 Yes Yes No 13 Yes
## 5 Yes Yes No 3 Yes
## 6 No No Yes 9 Yes
## Internet.Service Online.Security Online.Backup Tech.Support Contract
## 1 DSL No Yes Yes One year
## 2 DSL No No No Month-to-month
## 3 Fiber optic No No No Month-to-month
## 4 Fiber optic No Yes No Month-to-month
## 5 Fiber optic No No Yes Month-to-month
## 6 DSL No No Yes Month-to-month
## Paperless.Billing Payment.Method Monthly.Charges Churn.Label
## 1 Yes Mailed check 65.6 No
## 2 No Mailed check 59.9 No
## 3 Yes Electronic check 73.9 Yes
## 4 Yes Electronic check 98.0 Yes
## 5 Yes Mailed check 83.9 Yes
## 6 Yes Credit card (automatic) 69.4 No
## Churn.Reason Satisfaction.Score Churn.Category
## 1 <NA> 3 <NA>
## 2 <NA> 5 <NA>
## 3 Price too high 1 Competitor
## 4 Product dissatisfaction 1 Dissatisfaction
## 5 Network reliability 1 Dissatisfaction
## 6 <NA> 3 <NA>
## Avg.Monthly.Long.Distance.Charges Avg.Monthly.GB.Download Unlimited.Data
## 1 42.39 16 Yes
## 2 10.69 10 No
## 3 33.65 30 Yes
## 4 27.82 4 Yes
## 5 7.38 11 Yes
## 6 16.77 73 Yes
## Total.Extra.Data.Charges prob
## 1 0 0.119
## 2 10 0.431
## 3 0 0.493
## 4 0 0.618
## 5 0 0.548
## 6 0 0.145
library(superml)
label <- LabelEncoder$new()
filtered_joined_churn_df$Internet.Service <- label$fit_transform(filtered_joined_churn_df$Internet.Service)
filtered_joined_churn_df$Contract <- label$fit_transform(filtered_joined_churn_df$Contract)
filtered_joined_churn_df$Payment.Method <- label$fit_transform(filtered_joined_churn_df$Payment.Method)
filtered_joined_churn_df$Churn.Label <- ifelse(filtered_joined_churn_df$Churn.Label == "Yes",1,0)
filtered_joined_churn_df$Dependents <- ifelse(filtered_joined_churn_df$Dependents == "Yes",1,0)
filtered_joined_churn_df$Senior.Citizen <- ifelse(filtered_joined_churn_df$Senior.Citizen== "Yes",1,0)
filtered_joined_churn_df$Partner <- ifelse(filtered_joined_churn_df$Partner== "Yes",1,0)
#filtered_joined_churn_df$Tenure.Months <- ifelse(filtered_joined_churn_df$Tenure.Months == "Yes",1,0)
filtered_joined_churn_df$Phone.Service <- ifelse(filtered_joined_churn_df$Phone.Service == "Yes",1,0)
filtered_joined_churn_df$Online.Security <- ifelse(filtered_joined_churn_df$Online.Security == "Yes",1,0)
filtered_joined_churn_df$Online.Backup <- ifelse(filtered_joined_churn_df$Online.Backup == "Yes",1,0)
filtered_joined_churn_df$Tech.Support <- ifelse(filtered_joined_churn_df$Tech.Support == "Yes",1,0)
filtered_joined_churn_df$Paperless.Billing <- ifelse(filtered_joined_churn_df$Paperless.Billing == "Yes",1,0)
filtered_joined_churn_df$Unlimited.Data <- ifelse(filtered_joined_churn_df$Unlimited.Data == "Yes",1,0)
head(filtered_joined_churn_df)
## Senior.Citizen Partner Dependents Tenure.Months Phone.Service
## 1 0 1 0 9 1
## 2 0 0 0 9 1
## 3 0 0 0 4 1
## 4 1 1 0 13 1
## 5 1 1 0 3 1
## 6 0 0 1 9 1
## Internet.Service Online.Security Online.Backup Tech.Support Contract
## 1 0 0 1 1 1
## 2 0 0 0 0 0
## 3 1 0 0 0 0
## 4 1 0 1 0 0
## 5 1 0 0 1 0
## 6 0 0 0 1 0
## Paperless.Billing Payment.Method Monthly.Charges Churn.Label
## 1 1 3 65.6 0
## 2 0 3 59.9 0
## 3 1 2 73.9 1
## 4 1 2 98.0 1
## 5 1 3 83.9 1
## 6 1 1 69.4 0
## Churn.Reason Satisfaction.Score Churn.Category
## 1 <NA> 3 <NA>
## 2 <NA> 5 <NA>
## 3 Price too high 1 Competitor
## 4 Product dissatisfaction 1 Dissatisfaction
## 5 Network reliability 1 Dissatisfaction
## 6 <NA> 3 <NA>
## Avg.Monthly.Long.Distance.Charges Avg.Monthly.GB.Download Unlimited.Data
## 1 42.39 16 1
## 2 10.69 10 0
## 3 33.65 30 1
## 4 27.82 4 1
## 5 7.38 11 1
## 6 16.77 73 1
## Total.Extra.Data.Charges prob
## 1 0 0.119
## 2 10 0.431
## 3 0 0.493
## 4 0 0.618
## 5 0 0.548
## 6 0 0.145
myvars <- c("Churn.Label","Senior.Citizen", "Partner", "Dependents","Tenure.Months","Phone.Service","Online.Security","Online.Backup","Tech.Support","Paperless.Billing","Unlimited.Data","Avg.Monthly.Long.Distance.Charges","Avg.Monthly.GB.Download","Monthly.Charges","Tenure.Months")
n <- filtered_joined_churn_df[myvars]
my <- c("Senior.Citizen", "Partner", "Dependents","Tenure.Months","Phone.Service","Online.Security")
u = filtered_joined_churn_df[my]
a <- n[myvars]
typeof(a)
## [1] "list"
nrow(a)
## [1] 7043
head(a)
## Churn.Label Senior.Citizen Partner Dependents Tenure.Months Phone.Service
## 1 0 0 1 0 9 1
## 2 0 0 0 0 9 1
## 3 1 0 0 0 4 1
## 4 1 1 1 0 13 1
## 5 1 1 1 0 3 1
## 6 0 0 0 1 9 1
## Online.Security Online.Backup Tech.Support Paperless.Billing Unlimited.Data
## 1 0 1 1 1 1
## 2 0 0 0 0 0
## 3 0 0 0 1 1
## 4 0 1 0 1 1
## 5 0 0 1 1 1
## 6 0 0 1 1 1
## Avg.Monthly.Long.Distance.Charges Avg.Monthly.GB.Download Monthly.Charges
## 1 42.39 16 65.6
## 2 10.69 10 59.9
## 3 33.65 30 73.9
## 4 27.82 4 98.0
## 5 7.38 11 83.9
## 6 16.77 73 69.4
## Tenure.Months.1
## 1 9
## 2 9
## 3 4
## 4 13
## 5 3
## 6 9
myvars <- c("Senior.Citizen","Churn.Label", "Partner", "Dependents","Tenure.Months","Phone.Service","Online.Security","Online.Backup","Tech.Support","Paperless.Billing","Unlimited.Data","Avg.Monthly.Long.Distance.Charges","Avg.Monthly.GB.Download","Monthly.Charges","Tenure.Months")
n$Churn.Label <- as.factor(n$Churn.Label)
head(n$Churn.Label)
## [1] 0 0 1 1 1 0
## Levels: 0 1
typeof(n$Churn.Label)
## [1] "integer"
b = as.factor(n["Churn.Label"])
Features of variable importance
fit_dt = rpart(Churn.Label~., data=filtered_joined_churn_df, method="class", control = list(maxdepth = 4))
varImp(fit_dt)
## Overall
## Contract 451
## Dependents 170
## prob 590
## Satisfaction.Score 1954
## Tenure.Months 292
## Senior.Citizen 0
## Partner 0
## Phone.Service 0
## Internet.Service 0
## Online.Security 0
## Online.Backup 0
## Tech.Support 0
## Paperless.Billing 0
## Payment.Method 0
## Monthly.Charges 0
## Churn.Reason 0
## Churn.Category 0
## Avg.Monthly.Long.Distance.Charges 0
## Avg.Monthly.GB.Download 0
## Unlimited.Data 0
## Total.Extra.Data.Charges 0
Decision Tree with Gini
myvars <- c("Senior.Citizen","Churn.Label", "Partner", "Dependents","Tenure.Months","Phone.Service","Online.Security","Online.Backup","Tech.Support","Paperless.Billing","Unlimited.Data","Avg.Monthly.GB.Download","Tenure.Months","Internet.Service","Contract")
model_df <- filtered_joined_churn_df[myvars]
We have removed Total.Extra.Data.Charges and Avg.Monthly.Long.Distance.Charges , as it is not significant from the test we performed above
set.seed(56)
library(caret)
split_train_test <- createDataPartition(model_df$Churn.Label,p=0.7,list=FALSE)
dtrain<- model_df[split_train_test,]
dtest<- model_df[-split_train_test,]
# Remove Total Charges from the training dataset
Train-Test data
head(dtrain)
## Senior.Citizen Churn.Label Partner Dependents Tenure.Months Phone.Service
## 2 0 0 0 0 9 1
## 3 0 1 0 0 4 1
## 4 1 1 1 0 13 1
## 5 1 1 1 0 3 1
## 6 0 0 0 1 9 1
## 7 1 0 1 0 71 1
## Online.Security Online.Backup Tech.Support Paperless.Billing Unlimited.Data
## 2 0 0 0 0 0
## 3 0 0 0 1 1
## 4 0 1 0 1 1
## 5 0 0 1 1 1
## 6 0 0 1 1 1
## 7 1 1 1 1 1
## Avg.Monthly.GB.Download Tenure.Months.1 Internet.Service Contract
## 2 10 9 0 0
## 3 30 4 1 0
## 4 4 13 1 0
## 5 11 3 1 0
## 6 73 9 0 0
## 7 14 71 1 2
head(dtest)
## Senior.Citizen Churn.Label Partner Dependents Tenure.Months Phone.Service
## 1 0 0 1 0 9 1
## 8 0 0 1 0 63 1
## 10 0 0 1 1 65 1
## 11 0 0 0 0 54 0
## 12 0 0 1 1 72 1
## 16 0 0 1 1 71 1
## Online.Security Online.Backup Tech.Support Paperless.Billing Unlimited.Data
## 1 0 1 1 1 1
## 8 1 0 1 1 0
## 10 1 1 1 1 1
## 11 1 0 1 0 1
## 12 1 1 1 1 1
## 16 0 1 0 1 1
## Avg.Monthly.GB.Download Tenure.Months.1 Internet.Service Contract
## 1 16 9 0 1
## 8 7 63 1 2
## 10 14 65 0 2
## 11 10 54 0 2
## 12 59 72 1 2
## 16 12 71 1 2
loadPkg("rpart")
library(caret)
library(tidyr)
install.packages("dplyr")
library("dplyr")
library(superml)
loadPkg("caret")
loadPkg("rpart.plot")
loadPkg("rattle")
tr_fit <- rpart(Churn.Label ~., data = dtrain, method="class",na.action = na.exclude)
rpart.plot(tr_fit)
summary(tr_fit)
## Call:
## rpart(formula = Churn.Label ~ ., data = dtrain, na.action = na.exclude,
## method = "class")
## n= 4931
##
## CP nsplit rel error xerror xstd
## 1 0.0468 0 1.000 1.000 0.0236
## 2 0.0333 4 0.813 0.915 0.0229
## 3 0.0205 5 0.780 0.825 0.0221
## 4 0.0129 6 0.759 0.775 0.0216
## 5 0.0100 9 0.719 0.766 0.0215
##
## Variable importance
## Contract Tenure.Months Tenure.Months.1
## 25 19 19
## Internet.Service Tech.Support Dependents
## 7 6 6
## Partner Online.Security Avg.Monthly.GB.Download
## 6 5 4
## Phone.Service Online.Backup
## 1 1
##
## Node number 1: 4931 observations, complexity param=0.0468
## predicted class=0 expected loss=0.268 P(node) =1
## class counts: 3611 1320
## probabilities: 0.732 0.268
## left son=2 (2217 obs) right son=3 (2714 obs)
## Primary splits:
## Contract < 0.5 to the right, improve=311.0, (0 missing)
## Tenure.Months < 16.5 to the right, improve=194.0, (0 missing)
## Tenure.Months.1 < 16.5 to the right, improve=194.0, (0 missing)
## Dependents < 0.5 to the right, improve=123.0, (0 missing)
## Avg.Monthly.GB.Download < 1 to the left, improve= 96.9, (0 missing)
## Surrogate splits:
## Tenure.Months < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
## Tenure.Months.1 < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
## Tech.Support < 0.5 to the right, agree=0.650, adj=0.222, (0 split)
## Partner < 0.5 to the right, agree=0.640, adj=0.200, (0 split)
## Online.Security < 0.5 to the right, agree=0.630, adj=0.178, (0 split)
##
## Node number 2: 2217 observations
## predicted class=0 expected loss=0.0713 P(node) =0.45
## class counts: 2059 158
## probabilities: 0.929 0.071
##
## Node number 3: 2714 observations, complexity param=0.0468
## predicted class=0 expected loss=0.428 P(node) =0.55
## class counts: 1552 1162
## probabilities: 0.572 0.428
## left son=6 (443 obs) right son=7 (2271 obs)
## Primary splits:
## Dependents < 0.5 to the right, improve=76.0, (0 missing)
## Tenure.Months < 1.5 to the right, improve=49.5, (0 missing)
## Tenure.Months.1 < 1.5 to the right, improve=49.5, (0 missing)
## Avg.Monthly.GB.Download < 1 to the left, improve=43.5, (0 missing)
## Internet.Service < 1.5 to the right, improve=43.5, (0 missing)
##
## Node number 6: 443 observations
## predicted class=0 expected loss=0.16 P(node) =0.0898
## class counts: 372 71
## probabilities: 0.840 0.160
##
## Node number 7: 2271 observations, complexity param=0.0468
## predicted class=0 expected loss=0.48 P(node) =0.461
## class counts: 1180 1091
## probabilities: 0.520 0.480
## left son=14 (270 obs) right son=15 (2001 obs)
## Primary splits:
## Avg.Monthly.GB.Download < 1 to the left, improve=38.5, (0 missing)
## Internet.Service < 1.5 to the right, improve=38.5, (0 missing)
## Tenure.Months < 1.5 to the right, improve=37.8, (0 missing)
## Tenure.Months.1 < 1.5 to the right, improve=37.8, (0 missing)
## Paperless.Billing < 0.5 to the left, improve=32.3, (0 missing)
## Surrogate splits:
## Internet.Service < 1.5 to the right, agree=1, adj=1, (0 split)
##
## Node number 14: 270 observations
## predicted class=0 expected loss=0.23 P(node) =0.0548
## class counts: 208 62
## probabilities: 0.770 0.230
##
## Node number 15: 2001 observations, complexity param=0.0468
## predicted class=1 expected loss=0.486 P(node) =0.406
## class counts: 972 1029
## probabilities: 0.486 0.514
## left son=30 (656 obs) right son=31 (1345 obs)
## Primary splits:
## Internet.Service < 0.5 to the left, improve=49.4, (0 missing)
## Tenure.Months < 16.5 to the right, improve=46.5, (0 missing)
## Tenure.Months.1 < 16.5 to the right, improve=46.5, (0 missing)
## Online.Security < 0.5 to the right, improve=32.6, (0 missing)
## Tech.Support < 0.5 to the right, improve=25.4, (0 missing)
## Surrogate splits:
## Phone.Service < 0.5 to the left, agree=0.771, adj=0.302, (0 split)
##
## Node number 30: 656 observations, complexity param=0.0205
## predicted class=0 expected loss=0.355 P(node) =0.133
## class counts: 423 233
## probabilities: 0.645 0.355
## left son=60 (451 obs) right son=61 (205 obs)
## Primary splits:
## Tenure.Months < 3.5 to the right, improve=26.50, (0 missing)
## Tenure.Months.1 < 3.5 to the right, improve=26.50, (0 missing)
## Online.Security < 0.5 to the right, improve= 6.06, (0 missing)
## Online.Backup < 0.5 to the right, improve= 5.24, (0 missing)
## Tech.Support < 0.5 to the right, improve= 4.21, (0 missing)
## Surrogate splits:
## Tenure.Months.1 < 3.5 to the right, agree=1, adj=1, (0 split)
##
## Node number 31: 1345 observations, complexity param=0.0333
## predicted class=1 expected loss=0.408 P(node) =0.273
## class counts: 549 796
## probabilities: 0.408 0.592
## left son=62 (660 obs) right son=63 (685 obs)
## Primary splits:
## Tenure.Months < 16.5 to the right, improve=40.6, (0 missing)
## Tenure.Months.1 < 16.5 to the right, improve=40.6, (0 missing)
## Online.Security < 0.5 to the right, improve=16.9, (0 missing)
## Tech.Support < 0.5 to the right, improve=12.8, (0 missing)
## Paperless.Billing < 0.5 to the left, improve= 6.6, (0 missing)
## Surrogate splits:
## Tenure.Months.1 < 16.5 to the right, agree=1.000, adj=1.000, (0 split)
## Online.Backup < 0.5 to the right, agree=0.619, adj=0.223, (0 split)
## Partner < 0.5 to the right, agree=0.599, adj=0.182, (0 split)
## Online.Security < 0.5 to the right, agree=0.575, adj=0.135, (0 split)
## Tech.Support < 0.5 to the right, agree=0.558, adj=0.098, (0 split)
##
## Node number 60: 451 observations
## predicted class=0 expected loss=0.259 P(node) =0.0915
## class counts: 334 117
## probabilities: 0.741 0.259
##
## Node number 61: 205 observations
## predicted class=1 expected loss=0.434 P(node) =0.0416
## class counts: 89 116
## probabilities: 0.434 0.566
##
## Node number 62: 660 observations, complexity param=0.0129
## predicted class=0 expected loss=0.467 P(node) =0.134
## class counts: 352 308
## probabilities: 0.533 0.467
## left son=124 (115 obs) right son=125 (545 obs)
## Primary splits:
## Tenure.Months < 53.5 to the right, improve=9.00, (0 missing)
## Tenure.Months.1 < 53.5 to the right, improve=9.00, (0 missing)
## Tech.Support < 0.5 to the right, improve=7.76, (0 missing)
## Online.Security < 0.5 to the right, improve=5.07, (0 missing)
## Avg.Monthly.GB.Download < 28.5 to the right, improve=4.80, (0 missing)
## Surrogate splits:
## Tenure.Months.1 < 53.5 to the right, agree=1, adj=1, (0 split)
##
## Node number 63: 685 observations
## predicted class=1 expected loss=0.288 P(node) =0.139
## class counts: 197 488
## probabilities: 0.288 0.712
##
## Node number 124: 115 observations
## predicted class=0 expected loss=0.287 P(node) =0.0233
## class counts: 82 33
## probabilities: 0.713 0.287
##
## Node number 125: 545 observations, complexity param=0.0129
## predicted class=1 expected loss=0.495 P(node) =0.111
## class counts: 270 275
## probabilities: 0.495 0.505
## left son=250 (91 obs) right son=251 (454 obs)
## Primary splits:
## Avg.Monthly.GB.Download < 29.5 to the right, improve=5.87, (0 missing)
## Tech.Support < 0.5 to the right, improve=4.85, (0 missing)
## Partner < 0.5 to the left, improve=4.62, (0 missing)
## Paperless.Billing < 0.5 to the left, improve=3.92, (0 missing)
## Senior.Citizen < 0.5 to the left, improve=3.54, (0 missing)
##
## Node number 250: 91 observations
## predicted class=0 expected loss=0.341 P(node) =0.0185
## class counts: 60 31
## probabilities: 0.659 0.341
##
## Node number 251: 454 observations, complexity param=0.0129
## predicted class=1 expected loss=0.463 P(node) =0.0921
## class counts: 210 244
## probabilities: 0.463 0.537
## left son=502 (83 obs) right son=503 (371 obs)
## Primary splits:
## Tech.Support < 0.5 to the right, improve=4.69, (0 missing)
## Partner < 0.5 to the left, improve=3.67, (0 missing)
## Paperless.Billing < 0.5 to the left, improve=2.97, (0 missing)
## Online.Security < 0.5 to the right, improve=1.94, (0 missing)
## Senior.Citizen < 0.5 to the left, improve=1.64, (0 missing)
##
## Node number 502: 83 observations
## predicted class=0 expected loss=0.386 P(node) =0.0168
## class counts: 51 32
## probabilities: 0.614 0.386
##
## Node number 503: 371 observations
## predicted class=1 expected loss=0.429 P(node) =0.0752
## class counts: 159 212
## probabilities: 0.429 0.571
From this decision tree, we can interpret the following:
The contract variable is the most important. Customers with month-to-month contracts are more likely to churn.Customers who has a dependents (value=1) has the higher chance to churn. Customers with DSL internet service are less likely to churn. Customers who have stayed longer than 54 months are less likely to churn. Now let’s assess the prediction accuracy of the decision tree model by investigating how well it predicts churn in the test subset. We will begin with the confustion matrix, which is a useful display of classification accuracy. It displays the following information:
true positives (TP): These are cases in which we predicted yes (they churned), and they did churn. true negatives (TN): We predicted no, and they didn’t churn. false positives (FP): We predicted yes, but they didn’t actually churn. (Also known as a “Type I error.”) false negatives (FN): We predicted no, but they actually churned. (Also known as a “Type II error.”) Let’s examine the confusion matrix for our decision tree model.
tr_prob1 <- predict(tr_fit, dtest)
tr_pred1 <- ifelse(tr_prob1[,2] > 0.5,"Yes","No")
table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
## Actual
## Predicted 0 1
## No 1354 208
## Yes 209 341
From this confusion matrix, we can see that the model performs well at predicting non-churning customers (1354 correct vs. 208 incorrect) and predicting churning customers 341 correct vs. 209 incorrect).
tr_prob2 <- predict(tr_fit, dtrain)
tr_pred2 <- ifelse(tr_prob2[,2] > 0.5,"Yes","No")
tr_tab1 <- table(Predicted = tr_pred2, Actual = dtrain$Churn.Label)
tr_tab2 <- table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
# # Train
# confusionMatrix(
# as.factor(tr_pred2),
# as.factor(dtrain$Churn.Label),na.action = na.pass,
# positive = "Yes"
# )
#
# #Test
# confusionMatrix(
# as.factor(tr_pred1),
# as.factor(dtest$Churn.Label),
# positive = "Yes"
# )
tr_acc <- sum(diag(tr_tab2))/sum(tr_tab2)
tr_acc
## [1] 0.803
The overall accuracy of the model is 80.26%
library(pROC)
ROC_rf <- roc(dtest$Churn.Label, tr_prob1[,2])
ROC_rf_auc <- auc(ROC_rf)
plot(ROC_rf, col = "green", main = "ROC Curve for the decision tree with gini")
paste("Accuracy decision tree with gini: ", mean(dtest$Churn.Labe == round(tr_prob1[,2], digits = 0)))
## [1] "Accuracy decision tree with gini: 0.802556818181818"
paste("Area under curve for decision tree with gini is: ", ROC_rf_auc)
## [1] "Area under curve for decision tree with gini is: 0.824659387684466"
This is the auc roc curve for the decision tree with gini parameter
The Area Under the Curve (AUC) is the measure of the ability of a classifier to distinguish between classes and is used as a summary of the ROC curve. The higher the AUC, the better the performance of the model at distinguishing between the positive and negative classes.
The overall accuracy of the decision tree with gini model is 80.26% The AUC of the decision tree with gini model is 0.82
Decision Tree with Entropy- rpart
myvars <- c("Senior.Citizen","Churn.Label", "Partner", "Dependents","Tenure.Months","Phone.Service","Online.Security","Online.Backup","Tech.Support","Paperless.Billing","Unlimited.Data","Avg.Monthly.GB.Download","Tenure.Months","Internet.Service","Contract")
model_df <- filtered_joined_churn_df[myvars]
We have removed Total.Extra.Data.Charges and Avg.Monthly.Long.Distance.Charges , as it is not significant from the test we performed above
set.seed(56)
library(caret)
split_train_test <- createDataPartition(model_df$Churn.Label,p=0.7,list=FALSE)
dtrain<- model_df[split_train_test,]
dtest<- model_df[-split_train_test,]
# Remove Total Charges from the training dataset
Train-Test data
head(dtrain)
## Senior.Citizen Churn.Label Partner Dependents Tenure.Months Phone.Service
## 2 0 0 0 0 9 1
## 3 0 1 0 0 4 1
## 4 1 1 1 0 13 1
## 5 1 1 1 0 3 1
## 6 0 0 0 1 9 1
## 7 1 0 1 0 71 1
## Online.Security Online.Backup Tech.Support Paperless.Billing Unlimited.Data
## 2 0 0 0 0 0
## 3 0 0 0 1 1
## 4 0 1 0 1 1
## 5 0 0 1 1 1
## 6 0 0 1 1 1
## 7 1 1 1 1 1
## Avg.Monthly.GB.Download Tenure.Months.1 Internet.Service Contract
## 2 10 9 0 0
## 3 30 4 1 0
## 4 4 13 1 0
## 5 11 3 1 0
## 6 73 9 0 0
## 7 14 71 1 2
head(dtest)
## Senior.Citizen Churn.Label Partner Dependents Tenure.Months Phone.Service
## 1 0 0 1 0 9 1
## 8 0 0 1 0 63 1
## 10 0 0 1 1 65 1
## 11 0 0 0 0 54 0
## 12 0 0 1 1 72 1
## 16 0 0 1 1 71 1
## Online.Security Online.Backup Tech.Support Paperless.Billing Unlimited.Data
## 1 0 1 1 1 1
## 8 1 0 1 1 0
## 10 1 1 1 1 1
## 11 1 0 1 0 1
## 12 1 1 1 1 1
## 16 0 1 0 1 1
## Avg.Monthly.GB.Download Tenure.Months.1 Internet.Service Contract
## 1 16 9 0 1
## 8 7 63 1 2
## 10 14 65 0 2
## 11 10 54 0 2
## 12 59 72 1 2
## 16 12 71 1 2
loadPkg("rpart")
library(caret)
library(tidyr)
install.packages("dplyr")
library("dplyr")
library(superml)
loadPkg("caret")
loadPkg("rpart.plot")
loadPkg("rattle")
tr_fit <- rpart(Churn.Label ~., data = dtrain, method="class",na.action = na.exclude,parms=list(split='entropy'))
rpart.plot(tr_fit)
summary(tr_fit)
## Call:
## rpart(formula = Churn.Label ~ ., data = dtrain, na.action = na.exclude,
## method = "class", parms = list(split = "entropy"))
## n= 4931
##
## CP nsplit rel error xerror xstd
## 1 0.0468 0 1.000 1.000 0.0236
## 2 0.0333 4 0.813 0.915 0.0229
## 3 0.0205 5 0.780 0.825 0.0221
## 4 0.0129 6 0.759 0.775 0.0216
## 5 0.0100 9 0.719 0.766 0.0215
##
## Variable importance
## Contract Tenure.Months Tenure.Months.1
## 25 19 19
## Internet.Service Tech.Support Dependents
## 7 6 6
## Partner Online.Security Avg.Monthly.GB.Download
## 6 5 4
## Phone.Service Online.Backup
## 1 1
##
## Node number 1: 4931 observations, complexity param=0.0468
## predicted class=0 expected loss=0.268 P(node) =1
## class counts: 3611 1320
## probabilities: 0.732 0.268
## left son=2 (2217 obs) right son=3 (2714 obs)
## Primary splits:
## Contract < 0.5 to the right, improve=311.0, (0 missing)
## Tenure.Months < 16.5 to the right, improve=194.0, (0 missing)
## Tenure.Months.1 < 16.5 to the right, improve=194.0, (0 missing)
## Dependents < 0.5 to the right, improve=123.0, (0 missing)
## Avg.Monthly.GB.Download < 1 to the left, improve= 96.9, (0 missing)
## Surrogate splits:
## Tenure.Months < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
## Tenure.Months.1 < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
## Tech.Support < 0.5 to the right, agree=0.650, adj=0.222, (0 split)
## Partner < 0.5 to the right, agree=0.640, adj=0.200, (0 split)
## Online.Security < 0.5 to the right, agree=0.630, adj=0.178, (0 split)
##
## Node number 2: 2217 observations
## predicted class=0 expected loss=0.0713 P(node) =0.45
## class counts: 2059 158
## probabilities: 0.929 0.071
##
## Node number 3: 2714 observations, complexity param=0.0468
## predicted class=0 expected loss=0.428 P(node) =0.55
## class counts: 1552 1162
## probabilities: 0.572 0.428
## left son=6 (443 obs) right son=7 (2271 obs)
## Primary splits:
## Dependents < 0.5 to the right, improve=76.0, (0 missing)
## Tenure.Months < 1.5 to the right, improve=49.5, (0 missing)
## Tenure.Months.1 < 1.5 to the right, improve=49.5, (0 missing)
## Avg.Monthly.GB.Download < 1 to the left, improve=43.5, (0 missing)
## Internet.Service < 1.5 to the right, improve=43.5, (0 missing)
##
## Node number 6: 443 observations
## predicted class=0 expected loss=0.16 P(node) =0.0898
## class counts: 372 71
## probabilities: 0.840 0.160
##
## Node number 7: 2271 observations, complexity param=0.0468
## predicted class=0 expected loss=0.48 P(node) =0.461
## class counts: 1180 1091
## probabilities: 0.520 0.480
## left son=14 (270 obs) right son=15 (2001 obs)
## Primary splits:
## Avg.Monthly.GB.Download < 1 to the left, improve=38.5, (0 missing)
## Internet.Service < 1.5 to the right, improve=38.5, (0 missing)
## Tenure.Months < 1.5 to the right, improve=37.8, (0 missing)
## Tenure.Months.1 < 1.5 to the right, improve=37.8, (0 missing)
## Paperless.Billing < 0.5 to the left, improve=32.3, (0 missing)
## Surrogate splits:
## Internet.Service < 1.5 to the right, agree=1, adj=1, (0 split)
##
## Node number 14: 270 observations
## predicted class=0 expected loss=0.23 P(node) =0.0548
## class counts: 208 62
## probabilities: 0.770 0.230
##
## Node number 15: 2001 observations, complexity param=0.0468
## predicted class=1 expected loss=0.486 P(node) =0.406
## class counts: 972 1029
## probabilities: 0.486 0.514
## left son=30 (656 obs) right son=31 (1345 obs)
## Primary splits:
## Internet.Service < 0.5 to the left, improve=49.4, (0 missing)
## Tenure.Months < 16.5 to the right, improve=46.5, (0 missing)
## Tenure.Months.1 < 16.5 to the right, improve=46.5, (0 missing)
## Online.Security < 0.5 to the right, improve=32.6, (0 missing)
## Tech.Support < 0.5 to the right, improve=25.4, (0 missing)
## Surrogate splits:
## Phone.Service < 0.5 to the left, agree=0.771, adj=0.302, (0 split)
##
## Node number 30: 656 observations, complexity param=0.0205
## predicted class=0 expected loss=0.355 P(node) =0.133
## class counts: 423 233
## probabilities: 0.645 0.355
## left son=60 (451 obs) right son=61 (205 obs)
## Primary splits:
## Tenure.Months < 3.5 to the right, improve=26.50, (0 missing)
## Tenure.Months.1 < 3.5 to the right, improve=26.50, (0 missing)
## Online.Security < 0.5 to the right, improve= 6.06, (0 missing)
## Online.Backup < 0.5 to the right, improve= 5.24, (0 missing)
## Tech.Support < 0.5 to the right, improve= 4.21, (0 missing)
## Surrogate splits:
## Tenure.Months.1 < 3.5 to the right, agree=1, adj=1, (0 split)
##
## Node number 31: 1345 observations, complexity param=0.0333
## predicted class=1 expected loss=0.408 P(node) =0.273
## class counts: 549 796
## probabilities: 0.408 0.592
## left son=62 (660 obs) right son=63 (685 obs)
## Primary splits:
## Tenure.Months < 16.5 to the right, improve=40.6, (0 missing)
## Tenure.Months.1 < 16.5 to the right, improve=40.6, (0 missing)
## Online.Security < 0.5 to the right, improve=16.9, (0 missing)
## Tech.Support < 0.5 to the right, improve=12.8, (0 missing)
## Paperless.Billing < 0.5 to the left, improve= 6.6, (0 missing)
## Surrogate splits:
## Tenure.Months.1 < 16.5 to the right, agree=1.000, adj=1.000, (0 split)
## Online.Backup < 0.5 to the right, agree=0.619, adj=0.223, (0 split)
## Partner < 0.5 to the right, agree=0.599, adj=0.182, (0 split)
## Online.Security < 0.5 to the right, agree=0.575, adj=0.135, (0 split)
## Tech.Support < 0.5 to the right, agree=0.558, adj=0.098, (0 split)
##
## Node number 60: 451 observations
## predicted class=0 expected loss=0.259 P(node) =0.0915
## class counts: 334 117
## probabilities: 0.741 0.259
##
## Node number 61: 205 observations
## predicted class=1 expected loss=0.434 P(node) =0.0416
## class counts: 89 116
## probabilities: 0.434 0.566
##
## Node number 62: 660 observations, complexity param=0.0129
## predicted class=0 expected loss=0.467 P(node) =0.134
## class counts: 352 308
## probabilities: 0.533 0.467
## left son=124 (115 obs) right son=125 (545 obs)
## Primary splits:
## Tenure.Months < 53.5 to the right, improve=9.00, (0 missing)
## Tenure.Months.1 < 53.5 to the right, improve=9.00, (0 missing)
## Tech.Support < 0.5 to the right, improve=7.76, (0 missing)
## Online.Security < 0.5 to the right, improve=5.07, (0 missing)
## Avg.Monthly.GB.Download < 28.5 to the right, improve=4.80, (0 missing)
## Surrogate splits:
## Tenure.Months.1 < 53.5 to the right, agree=1, adj=1, (0 split)
##
## Node number 63: 685 observations
## predicted class=1 expected loss=0.288 P(node) =0.139
## class counts: 197 488
## probabilities: 0.288 0.712
##
## Node number 124: 115 observations
## predicted class=0 expected loss=0.287 P(node) =0.0233
## class counts: 82 33
## probabilities: 0.713 0.287
##
## Node number 125: 545 observations, complexity param=0.0129
## predicted class=1 expected loss=0.495 P(node) =0.111
## class counts: 270 275
## probabilities: 0.495 0.505
## left son=250 (91 obs) right son=251 (454 obs)
## Primary splits:
## Avg.Monthly.GB.Download < 29.5 to the right, improve=5.87, (0 missing)
## Tech.Support < 0.5 to the right, improve=4.85, (0 missing)
## Partner < 0.5 to the left, improve=4.62, (0 missing)
## Paperless.Billing < 0.5 to the left, improve=3.92, (0 missing)
## Senior.Citizen < 0.5 to the left, improve=3.54, (0 missing)
##
## Node number 250: 91 observations
## predicted class=0 expected loss=0.341 P(node) =0.0185
## class counts: 60 31
## probabilities: 0.659 0.341
##
## Node number 251: 454 observations, complexity param=0.0129
## predicted class=1 expected loss=0.463 P(node) =0.0921
## class counts: 210 244
## probabilities: 0.463 0.537
## left son=502 (83 obs) right son=503 (371 obs)
## Primary splits:
## Tech.Support < 0.5 to the right, improve=4.69, (0 missing)
## Partner < 0.5 to the left, improve=3.67, (0 missing)
## Paperless.Billing < 0.5 to the left, improve=2.97, (0 missing)
## Online.Security < 0.5 to the right, improve=1.94, (0 missing)
## Senior.Citizen < 0.5 to the left, improve=1.64, (0 missing)
##
## Node number 502: 83 observations
## predicted class=0 expected loss=0.386 P(node) =0.0168
## class counts: 51 32
## probabilities: 0.614 0.386
##
## Node number 503: 371 observations
## predicted class=1 expected loss=0.429 P(node) =0.0752
## class counts: 159 212
## probabilities: 0.429 0.571
The overall accuracy of the tree model with entropy is 80.26%
From this decision tree, we can interpret the following:
The contract variable is the most important. Customers with month-to-month contracts are more likely to churn.Customers who has a dependents (value=1) has the higher chance to churn. Customers with DSL internet service are less likely to churn. Customers who have stayed longer than 54 months are less likely to churn. Now let’s assess the prediction accuracy of the decision tree model by investigating how well it predicts churn in the test subset. We will begin with the confustion matrix, which is a useful display of classification accuracy. It displays the following information:
true positives (TP): These are cases in which we predicted yes (they churned), and they did churn. true negatives (TN): We predicted no, and they didn’t churn. false positives (FP): We predicted yes, but they didn’t actually churn. (Also known as a “Type I error.”) false negatives (FN): We predicted no, but they actually churned. (Also known as a “Type II error.”) Let’s examine the confusion matrix for our decision tree model.
tr_prob1 <- predict(tr_fit, dtest)
tr_pred1 <- ifelse(tr_prob1[,2] > 0.5,"Yes","No")
table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
## Actual
## Predicted 0 1
## No 1354 208
## Yes 209 341
tr_prob2 <- predict(tr_fit, dtrain)
tr_pred2 <- ifelse(tr_prob2[,2] > 0.5,"Yes","No")
tr_tab1 <- table(Predicted = tr_pred2, Actual = dtrain$Churn.Label)
tr_tab2 <- table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
# # Train
# confusionMatrix(
# as.factor(tr_pred2),
# as.factor(dtrain$Churn.Label),na.action = na.pass,
# positive = "Yes"
# )
#
# #Test
# confusionMatrix(
# as.factor(tr_pred1),
# as.factor(dtest$Churn.Label),
# positive = "Yes"
# )
tr_acc <- sum(diag(tr_tab2))/sum(tr_tab2)
tr_acc
## [1] 0.803
library(pROC)
ROC_rf <- roc(dtest$Churn.Label, tr_prob1[,2])
ROC_rf_auc <- auc(ROC_rf)
plot(ROC_rf, col = "green", main = "ROC Curve for the decision tree with entropy")
paste("Accuracy decision tree with entropy: ", mean(dtest$Churn.Labe == round(tr_prob1[,2], digits = 0)))
## [1] "Accuracy decision tree with entropy: 0.802556818181818"
paste("Area under curve for decision tree with entropy is: ", ROC_rf_auc)
## [1] "Area under curve for decision tree with entropy is: 0.824659387684466"
The overall accuracy of the decision tree with entropy model is 80.26% The AUC of the decision tree with entropy model is 0.82
We have build the decision tree with entropy parameter but surprisingly it gave the same result as given by gini decision tree model.
Prune trees
Pruning is a data compression technique in machine learning and search algorithms that reduces the size of decision trees by removing sections of the tree that are non-critical and redundant to classify instances.
we have used the control paramteres for the tree for prunning as :
cp = 0.02,maxcompete = 2, maxsurrogate = 5, usesurrogate = 2, xval = 10, surrogatestyle = 0, maxdepth = 10.
cp:The complexity parameter (cp) in rpart is the minimum improvement in the model needed at each node. It’s based on the cost complexity of the model maxsurrogate :number of surrogate splits to evaluate.
maxdepth : Set the maximum depth of any node of the final tree,xval: number of cross-validations
tr_fit <- rpart(Churn.Label ~., data = dtrain, method="class",na.action = na.exclude,control = rpart.control(cp = 0.02,maxcompete = 2, maxsurrogate = 5, usesurrogate = 2, xval = 10, surrogatestyle = 0, maxdepth = 10 ))
rpart.plot(tr_fit)
summary(tr_fit)
## Call:
## rpart(formula = Churn.Label ~ ., data = dtrain, na.action = na.exclude,
## method = "class", control = rpart.control(cp = 0.02, maxcompete = 2,
## maxsurrogate = 5, usesurrogate = 2, xval = 10, surrogatestyle = 0,
## maxdepth = 10))
## n= 4931
##
## CP nsplit rel error xerror xstd
## 1 0.0468 0 1.000 1.000 0.0236
## 2 0.0333 4 0.813 0.906 0.0228
## 3 0.0205 5 0.780 0.804 0.0219
## 4 0.0200 6 0.759 0.795 0.0218
##
## Variable importance
## Contract Tenure.Months Tenure.Months.1
## 26 19 19
## Internet.Service Dependents Tech.Support
## 7 6 6
## Partner Online.Security Avg.Monthly.GB.Download
## 6 5 3
## Phone.Service Online.Backup
## 1 1
##
## Node number 1: 4931 observations, complexity param=0.0468
## predicted class=0 expected loss=0.268 P(node) =1
## class counts: 3611 1320
## probabilities: 0.732 0.268
## left son=2 (2217 obs) right son=3 (2714 obs)
## Primary splits:
## Contract < 0.5 to the right, improve=311, (0 missing)
## Tenure.Months < 16.5 to the right, improve=194, (0 missing)
## Tenure.Months.1 < 16.5 to the right, improve=194, (0 missing)
## Surrogate splits:
## Tenure.Months < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
## Tenure.Months.1 < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
## Tech.Support < 0.5 to the right, agree=0.650, adj=0.222, (0 split)
## Partner < 0.5 to the right, agree=0.640, adj=0.200, (0 split)
## Online.Security < 0.5 to the right, agree=0.630, adj=0.178, (0 split)
##
## Node number 2: 2217 observations
## predicted class=0 expected loss=0.0713 P(node) =0.45
## class counts: 2059 158
## probabilities: 0.929 0.071
##
## Node number 3: 2714 observations, complexity param=0.0468
## predicted class=0 expected loss=0.428 P(node) =0.55
## class counts: 1552 1162
## probabilities: 0.572 0.428
## left son=6 (443 obs) right son=7 (2271 obs)
## Primary splits:
## Dependents < 0.5 to the right, improve=76.0, (0 missing)
## Tenure.Months < 1.5 to the right, improve=49.5, (0 missing)
## Tenure.Months.1 < 1.5 to the right, improve=49.5, (0 missing)
##
## Node number 6: 443 observations
## predicted class=0 expected loss=0.16 P(node) =0.0898
## class counts: 372 71
## probabilities: 0.840 0.160
##
## Node number 7: 2271 observations, complexity param=0.0468
## predicted class=0 expected loss=0.48 P(node) =0.461
## class counts: 1180 1091
## probabilities: 0.520 0.480
## left son=14 (270 obs) right son=15 (2001 obs)
## Primary splits:
## Avg.Monthly.GB.Download < 1 to the left, improve=38.5, (0 missing)
## Internet.Service < 1.5 to the right, improve=38.5, (0 missing)
## Tenure.Months < 1.5 to the right, improve=37.8, (0 missing)
## Surrogate splits:
## Internet.Service < 1.5 to the right, agree=1, adj=1, (0 split)
##
## Node number 14: 270 observations
## predicted class=0 expected loss=0.23 P(node) =0.0548
## class counts: 208 62
## probabilities: 0.770 0.230
##
## Node number 15: 2001 observations, complexity param=0.0468
## predicted class=1 expected loss=0.486 P(node) =0.406
## class counts: 972 1029
## probabilities: 0.486 0.514
## left son=30 (656 obs) right son=31 (1345 obs)
## Primary splits:
## Internet.Service < 0.5 to the left, improve=49.4, (0 missing)
## Tenure.Months < 16.5 to the right, improve=46.5, (0 missing)
## Tenure.Months.1 < 16.5 to the right, improve=46.5, (0 missing)
## Surrogate splits:
## Phone.Service < 0.5 to the left, agree=0.771, adj=0.302, (0 split)
##
## Node number 30: 656 observations, complexity param=0.0205
## predicted class=0 expected loss=0.355 P(node) =0.133
## class counts: 423 233
## probabilities: 0.645 0.355
## left son=60 (451 obs) right son=61 (205 obs)
## Primary splits:
## Tenure.Months < 3.5 to the right, improve=26.50, (0 missing)
## Tenure.Months.1 < 3.5 to the right, improve=26.50, (0 missing)
## Online.Security < 0.5 to the right, improve= 6.06, (0 missing)
## Surrogate splits:
## Tenure.Months.1 < 3.5 to the right, agree=1, adj=1, (0 split)
##
## Node number 31: 1345 observations, complexity param=0.0333
## predicted class=1 expected loss=0.408 P(node) =0.273
## class counts: 549 796
## probabilities: 0.408 0.592
## left son=62 (660 obs) right son=63 (685 obs)
## Primary splits:
## Tenure.Months < 16.5 to the right, improve=40.6, (0 missing)
## Tenure.Months.1 < 16.5 to the right, improve=40.6, (0 missing)
## Online.Security < 0.5 to the right, improve=16.9, (0 missing)
## Surrogate splits:
## Tenure.Months.1 < 16.5 to the right, agree=1.000, adj=1.000, (0 split)
## Online.Backup < 0.5 to the right, agree=0.619, adj=0.223, (0 split)
## Partner < 0.5 to the right, agree=0.599, adj=0.182, (0 split)
## Online.Security < 0.5 to the right, agree=0.575, adj=0.135, (0 split)
## Tech.Support < 0.5 to the right, agree=0.558, adj=0.098, (0 split)
##
## Node number 60: 451 observations
## predicted class=0 expected loss=0.259 P(node) =0.0915
## class counts: 334 117
## probabilities: 0.741 0.259
##
## Node number 61: 205 observations
## predicted class=1 expected loss=0.434 P(node) =0.0416
## class counts: 89 116
## probabilities: 0.434 0.566
##
## Node number 62: 660 observations
## predicted class=0 expected loss=0.467 P(node) =0.134
## class counts: 352 308
## probabilities: 0.533 0.467
##
## Node number 63: 685 observations
## predicted class=1 expected loss=0.288 P(node) =0.139
## class counts: 197 488
## probabilities: 0.288 0.712
From this decision tree, we can interpret the following:
The contract variable is the most important. Customers with month-to-month contracts are more likely to churn.Customers who has a dependents (value=1) has the higher chance to churn. Customers with DSL internet service are less likely to churn. Customers who have stayed longer than 16 months(previously 54) are less likely to churn. Now let’s assess the prediction accuracy of the decision tree model by investigating how well it predicts churn in the test subset. We will begin with the confustion matrix, which is a useful display of classification accuracy. It displays the following information:
true positives (TP): These are cases in which we predicted yes (they churned), and they did churn. true negatives (TN): We predicted no, and they didn’t churn. false positives (FP): We predicted yes, but they didn’t actually churn. (Also known as a “Type I error.”) false negatives (FN): We predicted no, but they actually churned. (Also known as a “Type II error.”) Let’s examine the confusion matrix for our decision tree model.From this confusion matrix, we can see that the model performs well at predicting non-churning customers (1426 correct vs. 272 incorrect) and predicting churning customers 277 correct vs. 137 incorrect).
tr_prob1 <- predict(tr_fit, dtest)
tr_pred1 <- ifelse(tr_prob1[,2] > 0.5,"Yes","No")
table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
## Actual
## Predicted 0 1
## No 1426 272
## Yes 137 277
tr_prob2 <- predict(tr_fit, dtrain)
tr_pred2 <- ifelse(tr_prob2[,2] > 0.5,"Yes","No")
tr_tab1 <- table(Predicted = tr_pred2, Actual = dtrain$Churn.Label)
tr_tab2 <- table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
# # Train
# confusionMatrix(
# as.factor(tr_pred2),
# as.factor(dtrain$Churn.Label),na.action = na.pass,
# positive = "Yes"
# )
#
# #Test
# confusionMatrix(
# as.factor(tr_pred1),
# as.factor(dtest$Churn.Label),
# positive = "Yes"
# )
tr_acc <- sum(diag(tr_tab2))/sum(tr_tab2)
tr_acc
## [1] 0.806
library(pROC)
ROC_rf <- roc(dtest$Churn.Label, tr_prob1[,2])
ROC_rf_auc <- auc(ROC_rf)
plot(ROC_rf, col = "green", main = "ROC Curve for the prune tree")
paste("Accuracy : ", mean(dtest$Churn.Labe == round(tr_prob1[,2], digits = 0)))
## [1] "Accuracy : 0.806344696969697"
paste("Area under curve for prune tree model is: ", ROC_rf_auc)
## [1] "Area under curve for prune tree model is: 0.824739799111279"
The overall accuracy of the prune tree model is 80.63% The AUC of the prune tree model is 0.82
Prune model is best than other models in terms of accuracy, efficiency and roc value
Accuracy of each model: Logistic Regression :73.5% Random Forest Model: 80.3% SVM: 80.9% Decision Tree: 80.63%
The Prune Tree Model had the best combined accuracy and AUC
After comparing the accuracy, sensitivity, specificity, and AUC value, the prune tree has the best performence in all cases. Therefore the prune tree is selected as the final model.
The overall accuracy of the decision tree with entropy model is 80.26% The AUC of the decision tree with entropy model is 0.82
The overall accuracy of the prune tree model is 80.63% The AUC of the prune tree model is 0.82.
Sensity : 50.45% Specificity : 91.23%
Prune gives higher accuacy with lower levels than decision tree with entropy or gini parametre.
As this is a binary classification, we have used auc score for comparing the model as a parametre . Random forest has close accuracy to the prune tree , but lower auc score than prune tree,also the decision tree model gives high importance to a particular set of features. But the random forest chooses features randomly during the training process. Therefore, it does not depend highly on any specific set of features,making to opt for prune tree over random forest.
Decision tree has 9 levels where prune tree has only 6 levels. Prune tree gives more clear picture of feature ‘Tenure months’ and ‘Average monthly gb downloaded’
We have two questions:
Customers with what behaviors and conditions would likely leave the platform?
What services are important to deliver to a customer to keep them with the company?
According to the model analysis, we found a couple of important features that would affect whether customers decide to churn or not churn. For example,vfeatures from the random forest model would make impact on whether customers decide to churn or not churn include total charges, tenure months, monthly charges, internet service with fiber optic, dependents, payment method with an electronic check, contract two years, contract one year, tech support, and paperless billing. Since total charges, tenure months, and monthly charges rank the top 3 in importance, therefore a reasonable price plan will be important to keep customer with plaforms.
Another example, as explored in the Logistic Model, would be developing special deals to offer customers in order to get them to agree to a 1-year or 2-year contract, as being in a long-term contract significantly reduced the risk of churning. Several of the models found this feature to be an important one.
It is evident from prune tree that customers who has 16 months or more contract has low chance of churning than others, if the company need to keep their customers it is advisable to devise attractive 2 year long contract plans