#Load data sets

#Main data set
churn_df <- data.frame(read_csv('Data Files/Telco_customer_churn.csv', col_types=cols()))

#additional data sets
churn_status_df <- data.frame(read_csv('Data Files/Telco_customer_churn_status.csv', col_types = cols()))
churn_services_df <- data.frame(read_csv('Data Files/Telco_customer_churn_services.csv', col_types = cols()))

##################################################

#Join and filter

#Filter columns to join from status_df
status_join <- churn_status_df %>%
  select(c(Customer.ID, Satisfaction.Score, Churn.Category))
#status_join

#Filer columns to join from services)df
service_join <- churn_services_df %>%
  select(c(Customer.ID, Avg.Monthly.Long.Distance.Charges, Avg.Monthly.GB.Download, Unlimited.Data, Total.Extra.Data.Charges))
#service_join

#Join status with churn
joined_churn_df <- merge(churn_df, status_join, by.x="CustomerID", by.y="Customer.ID")
#head(joined_churn_df)

#Join service with churn
joined_churn_df <- merge(joined_churn_df, service_join, by.x="CustomerID", by.y="Customer.ID")
#head(joined_churn_df)

##################################################

#drop unwanted columns from final df
filtered_joined_churn_df <- joined_churn_df %>%
  select(-c(1:10, 16, 20, 22:23, 28, 30:32))

##################################################

#Convert categorical to factor
cols <- c(1:3, 5:12, 14:17, 20)
filtered_joined_churn_df[cols] <- lapply(filtered_joined_churn_df[cols], as.factor)

##################################################

#Separate into churn and non-churn groupings for later comparison
only_churn <- joined_churn_df %>%
  filter(Churn.Value==1)

no_churn <- joined_churn_df %>%
  filter(Churn.Value==0)

##################################################

#Functions

#Create updated xkable_summary function to delete unwanted text
xkablesum_updated <- function (df, title = "Table: Statistics summary.", digits = 4,
          pos = "left", bso = "striped")
{
    s = summary(df)

    # Including RE NA's strip
    # Needed to add a dim check because including NA strip when no NA row made a new 7th row with text Min.:
    if (dim(s)[1] == 6) {
      strip_vector = c("Min.\\s*:\\s*", "1st Qu.\\s*:\\s*", "Median\\s*:\\s*", "Mean\\s*:\\s*",
                       "3rd Qu.\\s*:\\s*", "Max.\\s*:\\s*")
    }
    # NA's strip RE added here
    else if (dim(s)[1] == 7) {
      strip_vector = c("Min.\\s*:\\s*", "1st Qu.\\s*:\\s*", "Median\\s*:\\s*", "Mean\\s*:\\s*",
                       "3rd Qu.\\s*:\\s*", "Max.\\s*:\\s*", "NA's\\s*:\\s*")
    }

    # Made s = apply() -- without, didn't apply changes to table
    s <- apply(s, 2, function(x) stringr::str_remove_all(x, strip_vector))

    # Made s = apply()
    s <- apply(s, 2, function(x) stringr::str_trim(x, "right"))
    colnames(s) <- stringr::str_trim(colnames(s))

    if (dim(s)[1] == 6) {
        rownames(s) <- c("Min", "Q1", "Median",
                         "Mean", "Q3", "Max")
    }
    else if (dim(s)[1] == 7) {
        rownames(s) <- c("Min", "Q1", "Median",
                         "Mean", "Q3", "Max", "NA")
    }
    xkabledply(s, title = title, digits = digits, pos = pos,
               bso = bso)
}

#Better looking version of 2-sample t-test results than what the object itself displays
ttest2sample_info <- function(test) {
  if (test[["p.value"]] <= 0.05) {
  result = 'Reject the Null Hypothesis'
  } else {
    result = 'Do not reject the Null Hypothesis'
  }

  cat(c('\t', test$method, '\n\n',
      'Data:                       ', '|   ', test$data.name, '\n',
      'Null Hypothesis:            ', '|   true difference in means = ', test$null.value[1], '\n',
      'Alternative Hypothesis:     ', '|   true difference in means != ', test$null.value[1], '\n',
      'Confidence Level:           ', '|   ', attributes(test$conf.int)$conf.level, '\n',
      'Confidence Interval:        ', '|   [', round(test$conf.int[1], 2), ', ', round(test$conf.int[2], 2)), ']\n',
      'Sample Estimates of Mean:   ', '|   [X = ', test$estimate[1], ', Y = ', test$estimate[2], ']\n',
      'Test Values:                ', '|   [t = ', test$statistic,
                                   ', df = ', test$parameter[1],
                                   ', p-value = ', test[["p.value"]], ']\n',
      'Result:                     ', '|   ', result, sep='')
}

Ready to continue…



1. Quick Review of Data Set

This data set is a fictional data set provided by IBM. It includes the demographic information on the customers of a telecommunications company, data on the services each customer used, charges paid (etc.), as well as whether the customer “churned” or not – meaning, whether the customer left the company in the past month. It also includes data on the reason(s) each customer who did leave gave for leaving.


Data Set Structure

str(filtered_joined_churn_df)
## 'data.frame':    7043 obs. of  21 variables:
##  $ Senior.Citizen                   : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 2 1 2 1 ...
##  $ Partner                          : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 1 2 2 1 2 ...
##  $ Dependents                       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 2 ...
##  $ Tenure.Months                    : num  9 9 4 13 3 9 71 63 7 65 ...
##  $ Phone.Service                    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Internet.Service                 : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 2 2 2 1 2 2 1 1 ...
##  $ Online.Security                  : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 1 3 3 3 3 ...
##  $ Online.Backup                    : Factor w/ 3 levels "No","No internet service",..: 3 1 1 3 1 1 3 1 1 3 ...
##  $ Tech.Support                     : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 3 3 3 3 1 3 ...
##  $ Contract                         : Factor w/ 3 levels "Month-to-month",..: 2 1 1 1 1 1 3 3 1 3 ...
##  $ Paperless.Billing                : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ Payment.Method                   : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 4 3 3 4 2 1 2 3 4 ...
##  $ Monthly.Charges                  : num  65.6 59.9 73.9 98 83.9 ...
##  $ Churn.Label                      : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 1 1 1 1 1 ...
##  $ Churn.Reason                     : Factor w/ 20 levels "Attitude of service provider",..: NA NA 18 19 15 NA NA NA NA NA ...
##  $ Satisfaction.Score               : Factor w/ 5 levels "1","2","3","4",..: 3 5 1 1 1 3 3 4 3 3 ...
##  $ Churn.Category                   : Factor w/ 5 levels "Attitude","Competitor",..: NA NA 2 3 3 NA NA NA NA NA ...
##  $ Avg.Monthly.Long.Distance.Charges: num  42.39 10.69 33.65 27.82 7.38 ...
##  $ Avg.Monthly.GB.Download          : num  16 10 30 4 11 73 14 7 21 14 ...
##  $ Unlimited.Data                   : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 1 2 2 ...
##  $ Total.Extra.Data.Charges         : num  0 10 0 0 0 0 0 20 0 0 ...


From the graph below, we see that around 27% of customer left the platform within the last month.

options(repr.plot.width = 6, repr.plot.height = 4)
filtered_joined_churn_df %>%
group_by(Churn.Label) %>%
summarise(Count = n())%>%
mutate(percent = prop.table(Count)*100)%>%
ggplot(aes(reorder(Churn.Label, -percent), percent), fill = Churn)+
geom_col(fill = c("#FC4E07", "#E7B800"))+
geom_text(aes(label = sprintf("%.2f%%", percent)), hjust = 0.01,vjust = -0.5, size =3)+
theme_bw()+  
xlab("Churn") +
ylab("Percent")+
ggtitle("Churn Percent")


The graph below details the reasons customers gave for leaving. Although some customers may have selected the same reason for leaving, they may have categorized how they associated that reason differently. The churn category is used as a fill to provide further detail in understanding the customer perspective. We will use these reasons to help us formulate our questions, select appropriate variables to investigate, and contextualize the results:

joined_churn_df %>%
  filter(!is.na(Churn.Category)) %>%
  ggplot() +
  geom_bar(mapping=aes(y=Churn.Reason, fill=Churn.Category), show.legend=TRUE) +
  labs(title='Reasons for Leaving by Answer', y=('Reason'), x=('Count')) +
  theme(plot.title = element_text(hjust = 0.5))

2. Quick Review of EDA and SMART QUESTIONS

Answering SMART Question 1

  1. Which of the variables in the data set show a statistical difference between the churn and non-churn groupings?

To answer this question, we divided the data into churn and non-churn groups, then visually explored how the data set’s variables differed when divided into these groups. We then tested the differences using 2-sample t-tests, goodness of fit chi squared tests, and chi squared tests of independence. From this we determined that the following 15 variables showed a statistical difference between the churn and non-churn groups:

  • Tenure
  • Avg. Monthly Charges
  • Avg. Monthly GB Download
  • Senior Citizen
  • Partner
  • Dependents
  • Internet Service
  • Online Security
  • Online Backup
  • Tech Support
  • Contract Type
  • Paperless Billing
  • Payment Method
  • Satisfaction Score
  • Unlimited Data

Smart Questions to answer with modeling:

  • Customers with what behaviors and conditions would likely leave the platform?
  • What services are important to deliver to a customer to keep them with the company?

3. Models

3.1 Logistic Model


3.1.1 Satisfaction Logistic Model


First, let’s explore how Satisfcation Score is best understood and used. Is it as a factor or conceptual framework?

sat_logit_model <- glm(Churn.Label ~ Satisfaction.Score, data=filtered_joined_churn_df, family="binomial")
#summary(sat_logit_model)
xkabledply(sat_logit_model)
Model: Churn.Label ~ Satisfaction.Score
Estimate Std. Error z value Pr(>|z|)
(Intercept) 20.6 584 0.0352 0.972
Satisfaction.Score2 0.0 974 0.0000 1.000
Satisfaction.Score3 -22.2 584 -0.0380 0.970
Satisfaction.Score4 -41.1 719 -0.0572 0.954
Satisfaction.Score5 -41.1 784 -0.0525 0.958


Exponentiate both sides of equation to get growth and decay factors…

#exponentiate both sides of the equation
expcoeff <- exp(coef(sat_logit_model))
xkabledply(as.table(expcoeff), title='Growth and Decay Factors After Exponentiation') 
Growth and Decay Factors After Exponentiation
x
(Intercept) 8.55e+08
Satisfaction.Score2 1.00e+00
Satisfaction.Score3 0.00e+00
Satisfaction.Score4 0.00e+00
Satisfaction.Score5 0.00e+00


Exponentiated Equation Satisfaction


\(\frac{p}{q} = 854535380 * (1)^{sat2} * (2.2452e-10)^{sat3} * (1.36943e-18)^{sat4} * (1.36943e-18)^{sat5}\)


Predictions: Satisfaction Model

#Create new DF of 5 rows, one for each level of satisfaction in order to make predictions
pred_df <- data.frame(Satisfaction.Score=as.factor(c(1,2,3,4,5)))
#Determine likelihood of Churn using Satisfaction Model
pred_response <- predict(sat_logit_model, newdata = pred_df, type = "response")

#Create prediction response df with results of model calculations
pred_response_df <- data.frame(pred_response)
#Clean up formatting of DF
pred_response_df <- pred_response_df %>%
  format(scientific=FALSE) %>%
  cbind(newColName = rownames(pred_response_df), pred_response_df) %>%
  select(c(2,3)) %>%
  mutate(
    Prediction=ifelse(pred_response > 0.5, "Churn", "Non-Churn")
  )
colnames(pred_response_df) <- c("Satisfaction Score", "Churn Likelihood", "Prediction")

#display DF as table
xkabledply(pred_response_df, title="Churn Predictions Using Only Satisfaction Score and a Cut Off of 0.5")
Churn Predictions Using Only Satisfaction Score and a Cut Off of 0.5
Satisfaction Score Churn Likelihood Prediction
1 1.000 Churn
2 1.000 Churn
3 0.161 Non-Churn
4 0.000 Non-Churn
5 0.000 Non-Churn


Satisfaction Model Tests


Coefficient P-values: Satisfaction Model

All p-values are high


Confusion Matrix: Satisfaction Model

#Create and display the confusion matrix
confusion_matrix_sat <- xkabledply(confusion_matrix(sat_logit_model), title = "Confusion Matrix for Satisfcation Logistic Model" )
confusion_matrix_sat
Confusion Matrix for Satisfcation Logistic Model
Predicted No Predicted Yes Total
Actual No 5174 0 5174
Actual Yes 429 1440 1869
Total 5603 1440 7043
#Turn from kable into data frame (was having trouble accessing elements as a kable)
confusion_matrix_sat_df <- data.frame(confusion_matrix(sat_logit_model))
#confusion_matrix_sat_df

accuracy <- (confusion_matrix_sat_df[1,1] + confusion_matrix_sat_df[2,2])/confusion_matrix_sat_df[3,3]
precision <- confusion_matrix_sat_df[2,2]/(confusion_matrix_sat_df[2,2] + confusion_matrix_sat_df[1,2])
recall_rate <- confusion_matrix_sat_df[2,2]/(confusion_matrix_sat_df[2,2] + confusion_matrix_sat_df[2,1])
specificity <- confusion_matrix_sat_df[1,1]/(confusion_matrix_sat_df[1,1] + confusion_matrix_sat_df[1,2])
f1_score <- 2*(precision)*(recall_rate)/(precision + recall_rate)

# 1. **Accuracy:** *(TP + TN)/Total*
# * `r (confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3]`
# * The model correctly predicts survived or died `r round((confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3], digits=3)*100`% of the time.
# 2. **Precision:** *TP/(TP + FP)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2])`
# * Of the values that are predicted true by the model, `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them actually are true.
# 3. **Recall Rate:** *TP/(TP + FN)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1])`
# * Of the values that actually are true, the model correctly predicts `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1]), digits=3)*100`% of them as true.
# 4. **Specificity:** *TN/(TN + FP)*
# * `r confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2])`
# * Of the values that are actually false, the model correctly predicts `r round(confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them as false.
# 5. **F1 Score:** *2(Precision)(Recall)/(Precision + Recall)*
# * `r 2*(0.753)*(0.714)/(0.753 + 0.714)`
# * The harmonic mean of Precision and Recall, a more balanced view of how good the model is.
  1. Accuracy: (TP + TN)/Total
  • 0.939
  • The model correctly predicts churn or non-churn 93.9% of the time.
  1. Precision: TP/(TP + FP)
  • 1
  • Of the values that are predicted true by the model, 100% of them actually are true.
  1. Recall Rate: TP/(TP + FN)
  • 0.77
  • Of the values that actually are true, the model correctly predicts 77% of them as true.
  1. Specificity: TN/(TN + FP)
  • 1
  • Of the values that are actually false, the model correctly predicts, 100% of them as false.
  1. F1 Score: 2(Precision)(Recall)/(Precision + Recall)
  • 0.87
  • The harmonic mean of Precision and Recall, a more balanced view of how good the model is.

ROC-AUC: Satisfaction Model

Area under the curve, testing the true positive rate (or recall) against the false positive rate (or specificity).

#Get the probability of survival for each passenger from the model
prob_test=predict(sat_logit_model, type = "response" )
#Create new column in df of predicted probability of survival 
filtered_joined_churn_df$prob=prob_test
#Determine the ratio of true-positive-rate/false-positive rate as a curve between 0.5 and 1
roc_data <- roc(Churn.Label~prob, data=filtered_joined_churn_df)
#Plot as graph
plot(roc_data)

#Value of area under the curve, prefer 0.8 or higher.
#auc(roc_data2) # area-under-curve 
# unloadPkg("pROC")

Comments

  • This model produces a score of 0.95
  • This is above the threshold of 0.8 for a good model fit
  • a value of 1 would be a perfect model

McFadden: Satisfaction Model

#calculate McFadden statistics, the pseudo r^2
sat_pr2 = pR2(sat_logit_model)
fitting null model for pseudo-r2
#sat_pr2[4]

The pseudo \(r^2\) value for the Satisfaction model, using the McFadden method of calculation, is 0.711. This means that 71.1% of the variance seen in the data is accounted for by this model.


AIC/BIC and Deviance: Satisfaction Model

  • The AIC of the Satisfaction model is: 2362.044
  • The Residual Deviance of the Satisfaction model is: 2352.044

Satisfaction Model Takeaways

  • A customer’s satisfaction score almost perfectly predicts Churn and Non-Churn
  • All 1-2 satisfaction score customers were Churn
  • All 4-5 satisfaction score customer were Non-Churn
  • Most 3 satisfaction score customers were Non-Churn
  • However, what this really means is that satisfaction is basically a stand in for Churn/Non-Churn. It does not provide new information useful for determining what factors are associated with the decision to Churn.
  • Satisfaction is better used as the conceptual framework for understanding the customer’s motivation in making the decision to Churn or not.


3.1.2 Full Logistic Model


Next, let’s look at a baseline model that uses all potential explanatory factors in the data set, with the exception of Average Monthly Long Distance Charges and Total Extra Data charges, which were shown in the EDA to not have statistical significance between their different levels.

#Everything model - those factors dropped in EDA

#Senior.Citizen + Partner + Dependents + Tenure.Months + Phone.Service + Internet.Service + Online.Security + Online.Backup + Tech.Support + Contract + Paperless.Billing + Payment.Method + Monthly.Charges + Avg.Monthly.GB.Download + Unlimited.Data

#NOT USING:
#Average Monthly Long Distance
#Total Extra Data Charges
#Satisfaction Score

all_logit_model <- glm(Churn.Label ~ Senior.Citizen + Partner + Dependents + Tenure.Months + Phone.Service + Internet.Service + Online.Security + Online.Backup + Tech.Support + Contract + Paperless.Billing + Payment.Method + Monthly.Charges + Avg.Monthly.GB.Download + Unlimited.Data, data=filtered_joined_churn_df, family="binomial")
#summary(all_logit_model)
xkabledply(all_logit_model, title="Churn.Label ~ All Factors")
Churn.Label ~ All Factors
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.7084 0.2117 -3.346 0.0008
Senior.CitizenYes 0.0903 0.0866 1.043 0.2970
PartnerYes 0.2674 0.0758 3.529 0.0004
DependentsYes -1.5157 0.1195 -12.684 0.0000
Tenure.Months -0.0342 0.0024 -14.511 0.0000
Phone.ServiceYes -0.9433 0.1504 -6.272 0.0000
Internet.ServiceFiber optic 0.2196 0.1500 1.464 0.1432
Internet.ServiceNo -0.1758 0.2106 -0.835 0.4038
Online.SecurityYes -0.4633 0.0879 -5.273 0.0000
Online.BackupYes -0.2219 0.0808 -2.748 0.0060
Tech.SupportYes -0.4741 0.0915 -5.183 0.0000
ContractOne year -0.7284 0.1076 -6.769 0.0000
ContractTwo year -1.3590 0.1760 -7.721 0.0000
Paperless.BillingYes 0.3410 0.0755 4.515 0.0000
Payment.MethodCredit card (automatic) -0.0808 0.1158 -0.698 0.4855
Payment.MethodElectronic check 0.3126 0.0963 3.247 0.0012
Payment.MethodMailed check 0.0004 0.1157 0.003 0.9976
Monthly.Charges 0.0259 0.0041 6.358 0.0000
Avg.Monthly.GB.Download -0.0014 0.0019 -0.724 0.4688
Unlimited.DataYes -0.0458 0.1001 -0.458 0.6469


Exponentiate both sides of equation to get growth and decay factors…

#exponentiate both sides of the equation
expcoeff <- exp(coef(all_logit_model))
xkabledply(as.table(expcoeff), title='Growth and Decay Factors After Exponentiation') 
Growth and Decay Factors After Exponentiation
x
(Intercept) 0.492
Senior.CitizenYes 1.095
PartnerYes 1.307
DependentsYes 0.220
Tenure.Months 0.966
Phone.ServiceYes 0.389
Internet.ServiceFiber optic 1.246
Internet.ServiceNo 0.839
Online.SecurityNo internet service NA
Online.SecurityYes 0.629
Online.BackupNo internet service NA
Online.BackupYes 0.801
Tech.SupportNo internet service NA
Tech.SupportYes 0.623
ContractOne year 0.483
ContractTwo year 0.257
Paperless.BillingYes 1.406
Payment.MethodCredit card (automatic) 0.922
Payment.MethodElectronic check 1.367
Payment.MethodMailed check 1.000
Monthly.Charges 1.026
Avg.Monthly.GB.Download 0.999
Unlimited.DataYes 0.955


Exponentiated Equation


\(\frac{p}{q} = 0.492441 * (1.09454)^{senior} * (1.30655)^{partner} * (0.219654)^{dependents} * (0.96634)^{tenure} * (0.389351)^{phone}\)

\(* (1.24563)^{fiberOptic} * (0.838754)^{noInt} * (0.629209)^{oSecurity} * (0.800973)^{oBackup} * (0.622464)^{techSup} * (0.482686)^{cont1yr}\)

\(* (0.25691)^{cont2yr} * (1.40631)^{paperless} * (0.922403)^{pmAutoCC} * (1.36695)^{pmEcheck} * (1.00035)^{mailCheck}\)

\(* (1.02627)^{monthlyCharge} * (0.998611)^{monthlyGB} * (0.955201)^{unlimited}\)



Full Model Tests


Coefficient P-values: Full Model

High p-values for the following factors:

  • Senior
  • Fiber optic internet
  • No internet service
  • PM auto credit card
  • PM mailed check
  • Monthly GB
  • Unlimited data

Confusion Matrix: Full Model

#Create and display the confusion matrix
confusion_matrix_all <- xkabledply(confusion_matrix(all_logit_model), title = "Confusion Matrix for Full Logistic Model, Cut Off of 0.5" )
confusion_matrix_all
Confusion Matrix for Full Logistic Model, Cut Off of 0.5
Predicted No Predicted Yes Total
Actual No 4633 541 5174
Actual Yes 795 1074 1869
Total 5428 1615 7043
#Turn from kable into data frame (was having trouble accessing elements as a kable)
confusion_matrix_all_df <- data.frame(confusion_matrix(all_logit_model))
#confusion_matrix_all_df

accuracy <- (confusion_matrix_all_df[1,1] + confusion_matrix_all_df[2,2])/confusion_matrix_all_df[3,3]
precision <- confusion_matrix_all_df[2,2]/(confusion_matrix_all_df[2,2] + confusion_matrix_all_df[1,2])
recall_rate <- confusion_matrix_all_df[2,2]/(confusion_matrix_all_df[2,2] + confusion_matrix_all_df[2,1])
specificity <- confusion_matrix_all_df[1,1]/(confusion_matrix_all_df[1,1] + confusion_matrix_all_df[1,2])
f1_score <- 2*(precision)*(recall_rate)/(precision + recall_rate)

# 1. **Accuracy:** *(TP + TN)/Total*
# * `r (confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3]`
# * The model correctly predicts survived or died `r round((confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3], digits=3)*100`% of the time.
# 2. **Precision:** *TP/(TP + FP)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2])`
# * Of the values that are predicted true by the model, `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them actually are true.
# 3. **Recall Rate:** *TP/(TP + FN)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1])`
# * Of the values that actually are true, the model correctly predicts `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1]), digits=3)*100`% of them as true.
# 4. **Specificity:** *TN/(TN + FP)*
# * `r confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2])`
# * Of the values that are actually false, the model correctly predicts `r round(confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them as false.
# 5. **F1 Score:** *2(Precision)(Recall)/(Precision + Recall)*
# * `r 2*(0.753)*(0.714)/(0.753 + 0.714)`
# * The harmonic mean of Precision and Recall, a more balanced view of how good the model is.
  1. Accuracy: (TP + TN)/Total
  • 0.81
  • The model correctly predicts churn or non-churn 81% of the time.
  1. Precision: TP/(TP + FP)
  • 0.665
  • Of the values that are predicted true by the model, 66.5% of them actually are true.
  1. Recall Rate: TP/(TP + FN)
  • 0.575
  • Of the values that actually are true, the model correctly predicts 57.5% of them as true.
  1. Specificity: TN/(TN + FP)
  • 0.895
  • Of the values that are actually false, the model correctly predicts, 89.5% of them as false.
  1. F1 Score: 2(Precision)(Recall)/(Precision + Recall)
  • 0.617
  • The harmonic mean of Precision and Recall, a more balanced view of how good the model is.

ROC-AUC: Full Model

Area under the curve, testing the true positive rate (or recall) against the false positive rate (or specificity).

#Get the probability of survival for each passenger from the model
prob_test=predict(all_logit_model, type = "response" )
#Create new column in df of predicted probability of survival 
filtered_joined_churn_df$prob=prob_test
#Determine the ratio of true-positive-rate/false-positive rate as a curve between 0.5 and 1
roc_data <- roc(Churn.Label~prob, data=filtered_joined_churn_df)
#Plot as graph
plot(roc_data)

#Value of area under the curve, prefer 0.8 or higher.
#auc(roc_data2) # area-under-curve 
# unloadPkg("pROC")

Comments

  • This model produces a score of 0.858
  • This is above the threshold of 0.8 for a good model fit

McFadden: Full Model

#calculate McFadden statistics, the pseudo r^2
all_pr2 = pR2(all_logit_model)
fitting null model for pseudo-r2
#all_pr2[4]

The pseudo \(r^2\) value for the full model, using the McFadden method of calculation, is 0.305. This means that 30.5% of the variance seen in the data is accounted for by this model.


AIC/BIC and Deviance: Full Model

  • The AIC of the full model is: 5700.865
  • The Residual Deviance of the full model is: 5660.865

Full Model Takeaways

  • The high p-values for the coefficients of a number of factors indicates that further feature selection is needed.
  • The relatively low precision and recall rate suggests that a cut off score other than 0.5 needs to be used.
  • However, an AUC of 0.858 indicates that we are on the right track.

3.1.3 Final Logistic Model


To build a better model for the final version, we will drop those variables that had high p-values in the full model. After some experiementation based on dropping high p-value factors, the final model includes the following factors:

  • Dependents
  • Phone Service
  • Contract
  • Monthly Charges
  • Monthly GB
# #Attempt Lasso regression for numeric variables -- encountered code errors that looked too involved to figure out
# full_mod_sel <- filtered_joined_churn_df %>%
#   select(c(4,13,14,19)) #18?
# 
# #x <- as.matrix(full_mod_sel[,-3]) # all X vars
# x <- full_mod_sel[, -3]
# y <- as.double(as.matrix(ifelse(full_mod_sel[, 4]=='Yes', 1, 0))) # Only Churn.Label
# 
# # Fit the LASSO model (Lasso: Alpha = 1)
# set.seed(100)
# cv.lasso <- cv.glmnet(x, y, family='binomial', alpha=1, parallel=TRUE, standardize=TRUE, type.measure='auc')
# 
# # Results
# plot(cv.lasso)

#Factors
#Senior.Citizen + Partner + Dependents + Tenure.Months + Phone.Service + Internet.Service + Online.Security + Online.Backup + Tech.Support + Contract + Paperless.Billing + Payment.Method + Monthly.Charges + Avg.Monthly.GB.Download + Unlimited.Data

#Drop senior, type of internet, type of payment, Monthly GB, unlimited (online security, online backup, tech support, unlimited data)
#+ Tenure.Months, partner

#Build model
final_logit_model <- glm(Churn.Label ~ Dependents + Phone.Service + Contract + Monthly.Charges + Avg.Monthly.GB.Download, data=filtered_joined_churn_df, family="binomial")
#summary(final_logit_model)
xkabledply(final_logit_model, title="Churn.Label ~ All Factors")
Churn.Label ~ All Factors
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.8853 0.1151 -7.688 0.000
DependentsYes -1.6083 0.1114 -14.431 0.000
Phone.ServiceYes -0.5660 0.1154 -4.906 0.000
ContractOne year -1.8305 0.0922 -19.850 0.000
ContractTwo year -3.2073 0.1530 -20.969 0.000
Monthly.Charges 0.0198 0.0014 14.585 0.000
Avg.Monthly.GB.Download -0.0012 0.0017 -0.714 0.475


Exponentiate both sides of equation to get growth and decay factors…

#exponentiate both sides of the equation
expcoeff <- exp(coef(final_logit_model))
xkabledply(as.table(expcoeff), title='Growth and Decay Factors After Exponentiation') 
Growth and Decay Factors After Exponentiation
x
(Intercept) 0.4126
DependentsYes 0.2002
Phone.ServiceYes 0.5678
ContractOne year 0.1603
ContractTwo year 0.0405
Monthly.Charges 1.0200
Avg.Monthly.GB.Download 0.9988

Exponentiated Equation


\(\frac{p}{q} = 0.412593 * (0.200223)^{dependents} * (0.567806)^{phone} * (0.160328)^{cont1yr}\)

\(* (0.0404644)^{cont2yr} * (1.01996)^{monthlyCharge} * (0.998789)^{monthlyGB}\)


Final Model Tests


Coefficient P-values: Final Model

All p-values low.


Confusion Matrix: Final Model, 0.5 Cut Off

#Create and display the confusion matrix
confusion_matrix_final <- xkabledply(confusion_matrix(final_logit_model), title = "Confusion Matrix for Final Logistic Model With a Cut Off Value of 0.5" )
confusion_matrix_final
Confusion Matrix for Final Logistic Model With a Cut Off Value of 0.5
Predicted No Predicted Yes Total
Actual No 4466 708 5174
Actual Yes 925 944 1869
Total 5391 1652 7043
#Turn from kable into data frame (was having trouble accessing elements as a kable)
confusion_matrix_final_df <- data.frame(confusion_matrix(final_logit_model))
#confusion_matrix_final_df

accuracy <- (confusion_matrix_final_df[1,1] + confusion_matrix_final_df[2,2])/confusion_matrix_final_df[3,3]
precision <- confusion_matrix_final_df[2,2]/(confusion_matrix_final_df[2,2] + confusion_matrix_final_df[1,2])
recall_rate <- confusion_matrix_final_df[2,2]/(confusion_matrix_final_df[2,2] + confusion_matrix_final_df[2,1])
specificity <- confusion_matrix_final_df[1,1]/(confusion_matrix_final_df[1,1] + confusion_matrix_final_df[1,2])
f1_score <- 2*(precision)*(recall_rate)/(precision + recall_rate)

# 1. **Accuracy:** *(TP + TN)/Total*
# * `r (confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3]`
# * The model correctly predicts survived or died `r round((confusion_matrix_test_df[1,1] + confusion_matrix_test_df[2,2])/confusion_matrix_test_df[3,3], digits=3)*100`% of the time.
# 2. **Precision:** *TP/(TP + FP)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2])`
# * Of the values that are predicted true by the model, `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them actually are true.
# 3. **Recall Rate:** *TP/(TP + FN)*
# * `r confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1])`
# * Of the values that actually are true, the model correctly predicts `r round(confusion_matrix_test_df[2,2]/(confusion_matrix_test_df[2,2] + confusion_matrix_test_df[2,1]), digits=3)*100`% of them as true.
# 4. **Specificity:** *TN/(TN + FP)*
# * `r confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2])`
# * Of the values that are actually false, the model correctly predicts `r round(confusion_matrix_test_df[1,1]/(confusion_matrix_test_df[1,1] + confusion_matrix_test_df[1,2]), digits=3)*100`% of them as false.
# 5. **F1 Score:** *2(Precision)(Recall)/(Precision + Recall)*
# * `r 2*(0.753)*(0.714)/(0.753 + 0.714)`
# * The harmonic mean of Precision and Recall, a more balanced view of how good the model is.
  1. Accuracy: (TP + TN)/Total
  • 0.768
  • The model correctly predicts churn or non-churn 76.8% of the time.
  1. Precision: TP/(TP + FP)
  • 0.571
  • Of the values that are predicted true by the model, 57.1% of them actually are true.
  1. Recall Rate: TP/(TP + FN)
  • 0.505
  • Of the values that actually are true, the model correctly predicts 50.5% of them as true.
  1. Specificity: TN/(TN + FP)
  • 0.863
  • Of the values that are actually false, the model correctly predicts, 86.3% of them as false.
  1. F1 Score: 2(Precision)(Recall)/(Precision + Recall)
  • 0.536
  • The harmonic mean of Precision and Recall, a more balanced view of how good the model is.

ROC-AUC: Final Model

Area under the curve, testing the true positive rate (or recall) against the false positive rate (or specificity)

#Get the probability of survival for each passenger from the model
prob_test=predict(final_logit_model, type = "response" )
#Create new column in df of predicted probability of survival 
filtered_joined_churn_df$prob=prob_test
#Determine the ratio of true-positive-rate/false-positive rate as a curve between 0.5 and 1
roc_data <- roc(Churn.Label~prob, data=filtered_joined_churn_df)
#Plot as graph
plot(roc_data)

#Value of area under the curve, prefer 0.8 or higher.
#auc(roc_data2) # area-under-curve 
# unloadPkg("pROC")

Comments

  • This model produces a score of 0.818
  • This is above the threshold of 0.08 for a good fit model

McFadden: Final Model

#calculate McFadden statistics, the pseudo r^2
final_pr2 = pR2(final_logit_model)
fitting null model for pseudo-r2
#final_pr2[4]

The pseudo \(r^2\) value for the final model, using the McFadden method of calculation, is 0.243. This means that 24.3% of the variance seen in the data is accounted for by this model.


AIC/BIC and Deviance: Final Model

  • The AIC of the final model is: 6186.792
  • The Residual Deviance of the final model is: 6172.792

Adjust Cut Off Value: Final Model


Confusion Matrix: Final Model, 0.26 Cut Off

############################################################

#HELP WITH MAKING CUTOFF
# The code for the logistic regression model and the predictions is given below
#log_model_full <- glm(loan_status ~ ., family = "binomial", data = training_set)
#predictions_all_full <- predict(log_model_full, newdata = test_set, type = "response")

# Make a binary predictions-vector using a cut-off of 15%
#pred_cutoff_15 <- ifelse(predictions_all_full > 0.15, 1, 0)

# Construct a confusion matrix
#table(test_set$loan_status, pred_cutoff_15)

############################################################

#0.361
#0.263
#0.265
#0.28 -- 80% recall

#Get actual values of churn/non-churn
predictions_final_model <- final_logit_model$fitted.values
#Get predictions based off of selected cut off value
pred_cutoff <- ifelse(predictions_final_model > 0.26, 1, 0)

############################################################

#Turn from kable into data frame (was having trouble accessing elements as a kable)
confusion_matrix_cutoff_df <- data.frame(table(filtered_joined_churn_df$Churn.Label, pred_cutoff))
#Convert format of df to matrix style
confusion_matrix_cutoff_df <- reshape(confusion_matrix_cutoff_df, idvar = "Var1", timevar = "pred_cutoff", direction = "wide")
#Drop first column and add totals column
confusion_matrix_cutoff_df <- select(confusion_matrix_cutoff_df, c(2,3)) %>%
  mutate(
    Total=Freq.0+Freq.1
  )
#Add total row to capture prediction totals
confusion_matrix_cutoff_df[nrow(confusion_matrix_cutoff_df)+ 1 , ] <- c(sum(confusion_matrix_cutoff_df$Freq.0), sum(confusion_matrix_cutoff_df$Freq.1), sum(confusion_matrix_cutoff_df$Total))
#Add Column at the start of DF to identify the actual values
Index=c("Actual No", "Actual Yes", "Total")
confusion_matrix_cutoff_df <- cbind(Index, confusion_matrix_cutoff_df)
#Rename columns to identify predicted values
colnames(confusion_matrix_cutoff_df) <- c("", "Predicted No", "Predicted Yes", "Total")
#Display Cut Off Confusion Matrix
xkabledply(confusion_matrix_cutoff_df, title="Confusion Matrix for Final Model With a Cut Off Value of 0.26")
Confusion Matrix for Final Model With a Cut Off Value of 0.26
Predicted No Predicted Yes Total
Actual No 3693 1481 5174
Actual Yes 382 1487 1869
Total 4075 2968 7043
########################

#Second value is column

#calculate matric scores with custom cut off
accuracy <- (confusion_matrix_cutoff_df[1,2] + confusion_matrix_cutoff_df[2,3])/confusion_matrix_cutoff_df[3,4]
precision <- confusion_matrix_cutoff_df[2,3]/(confusion_matrix_cutoff_df[2,3] + confusion_matrix_cutoff_df[1,3])
recall_rate <- confusion_matrix_cutoff_df[2,3]/(confusion_matrix_cutoff_df[2,3] + confusion_matrix_cutoff_df[2,2])
specificity <- confusion_matrix_cutoff_df[1,2]/(confusion_matrix_cutoff_df[1,2] + confusion_matrix_cutoff_df[1,3])
f1_score <- 2*(precision)*(recall_rate)/(precision + recall_rate)

# cat('\naccuracy:     ', accuracy, '\n')
# cat('precision:    ', precision, '\n')
# cat('recal:        ', recall_rate, '\n')
# cat('specificity:  ', specificity, '\n')
# cat('f1:           ', f1_score, '\n')

#accuracy
#precision
#recall_rate
#specificity
#f1_score
  1. Accuracy: (TP + TN)/Total
  • 0.735
  • The model correctly predicts churn or non-churn 73.5% of the time.
  1. Precision: TP/(TP + FP)
  • 0.501
  • Of the values that are predicted true by the model, 50.1% of them actually are true.
  1. Recall Rate: TP/(TP + FN)
  • 0.796
  • Of the values that actually are true, the model correctly predicts 79.6% of them as true.
  1. Specificity: TN/(TN + FP)
  • 0.714
  • Of the values that are actually false, the model correctly predicts, 71.4% of them as false.
  1. F1 Score: 2(Precision)(Recall)/(Precision + Recall)
  • 0.615
  • The harmonic mean of Precision and Recall, a more balanced view of how good the model is.

Log Loss, Regular and Brier Score

#Grab only Actual Churn and Predicted Churn
logloss_df <- filtered_joined_churn_df %>%
  select(c(14, 22)) %>%
  mutate(
    corrected_prob=ifelse(Churn.Label=="Yes", prob, 1-prob) #Correct pred based on actual churn
  )
xkabledply(head(logloss_df), title="Example of Model Predictions Corrected for Log Loss Calculation")
Example of Model Predictions Corrected for Log Loss Calculation
Churn.Label prob corrected_prob
No 0.119 0.881
No 0.431 0.569
Yes 0.493 0.493
Yes 0.618 0.618
Yes 0.548 0.548
No 0.145 0.855
logloss_value <- LogLoss(y_pred= final_logit_model$fitted.values, y_true=churn_df$Churn.Value)
#logloss_value
  
#Get the probability of survival for each passenger from the model
prob_test=predict(sat_logit_model, type = "response" )
#Create new column in df of predicted probability of survival 
churn_df$prob=prob_test

#brier_test <- brierscore(data=churn_df, Churn.Value ~ prob)
brier_test <- BrierScore(churn_df$Churn.Value, final_logit_model$fitted.values)
#brier_test

When the regular Log Loss function is used, the result is:

  • 0.806

When the Brier Test Method is used for an unbalanced Data Set, the result is:

  • 0.244

Final Logistic Model Takeaways

  • Customers with avg Monthly Charge (~$65) and no other mitigating factors have a churn risk of about ~60%
  • Long-term contracts biggest factor in keeping customers with the company
  • Roughly every ~100 GB of monthly download decays odds-ratio of customer churning by 10%
  • The overall avg Monthly Charge of ~$65, has a 3.6-fold growth effect on the odds-ratio of a customer churning
  • Reducing the Monthly Charge to $55 decreases the growth effect from 3.6 to 3
  • Adjusting the cutoff significantly improves recall rate (+29.1%) while only suffering a minor hit to precision (-7%)
  • Brier Test for Log Loss (0.244) shows a model that performs reasonably well

3.1.4 Suggestion Based on Model

Nearly 89% of customers who churned were on month-to-month contracts. It is suggested that the company offer an additional $10 off monthly charges for customers on month-to-month contracts who would be willing to sign a 2-year contract.

For a customer who pays the avg Monthly Charge of ~$65 and who has no other mitigating factors, this would reduce the risk of churning from about ~60% to about ~5%.

Assuming the ideal scenario were all would-be churners who have month-to-month contracts take the deal and sign a 2-year contract, this theoretically reduces the number of customers who churn in this group by about ~91.7%.

This would result in the following monetary benefits to the company:

  • Using plan with adjusted cut off model saves ~$794K a year by retaining ~65% of would-be churners (91.7% of the 88.6% who have month-to-month contracts of the 79.6% of churners the model catches)
  • Using plan with adjusted cut off model loses an extra ~$76k by offering discounts to customers that wouldn’t have churned
  • Total value of implementing retainment plan according to model with adjusted cut off: ~$718k a year

3.2 Random Forest Model

EDA feature selection

From the EDA, features we’ve found out will be useful and will be used in random forest model:

Categorical: Partner Tech.Support Online.Security Online.Backup Paperless.Billing Payment.Method Senior Citizen Contract Type Satisfaction Score Dependents

Continuous: Tenure Avg. Monthly Charges Total.Months

Data Preparation

clean categorical feature

From the previous EDA, we know that there are some categorical features that have ‘No’ and ‘No Internet Service’ or ‘No Phone Service’ as a category, we can make them as ‘No’ and clean these features.

tg <- data.frame(lapply(filtered_joined_churn_df, function(x){ gsub("No internet service", "No", x)}))
tg <- data.frame(lapply(filtered_joined_churn_df, function(x){ gsub("No phone service", "No", x)}))
glimpse(tg)
## Rows: 7,043
## Columns: 22
## $ Senior.Citizen                    <chr> "No", "No", "No", "Yes", "Yes", "No"~
## $ Partner                           <chr> "Yes", "No", "No", "Yes", "Yes", "No~
## $ Dependents                        <chr> "No", "No", "No", "No", "No", "Yes",~
## $ Tenure.Months                     <chr> "9", "9", "4", "13", "3", "9", "71",~
## $ Phone.Service                     <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "~
## $ Internet.Service                  <chr> "DSL", "DSL", "Fiber optic", "Fiber ~
## $ Online.Security                   <chr> "No", "No", "No", "No", "No", "No", ~
## $ Online.Backup                     <chr> "Yes", "No", "No", "Yes", "No", "No"~
## $ Tech.Support                      <chr> "Yes", "No", "No", "No", "Yes", "Yes~
## $ Contract                          <chr> "One year", "Month-to-month", "Month~
## $ Paperless.Billing                 <chr> "Yes", "No", "Yes", "Yes", "Yes", "Y~
## $ Payment.Method                    <chr> "Mailed check", "Mailed check", "Ele~
## $ Monthly.Charges                   <chr> "65.6", "59.9", "73.9", "98", "83.9"~
## $ Churn.Label                       <chr> "No", "No", "Yes", "Yes", "Yes", "No~
## $ Churn.Reason                      <chr> NA, NA, "Price too high", "Product d~
## $ Satisfaction.Score                <chr> "3", "5", "1", "1", "1", "3", "3", "~
## $ Churn.Category                    <chr> NA, NA, "Competitor", "Dissatisfacti~
## $ Avg.Monthly.Long.Distance.Charges <chr> "42.39", "10.69", "33.65", "27.82", ~
## $ Avg.Monthly.GB.Download           <chr> "16", "10", "30", "4", "11", "73", "~
## $ Unlimited.Data                    <chr> "Yes", "No", "Yes", "Yes", "Yes", "Y~
## $ Total.Extra.Data.Charges          <chr> "0", "10", "0", "0", "0", "0", "0", ~
## $ prob                              <chr> "0.118715221855491", "0.430575377275~

Standardizing Continuous features

Continuous festures: Tenure Avg. Monthly Charges Total.Months

num_columns <- c("Tenure.Months", "Monthly.Charges", "Total.Extra.Data.Charges")
tg[num_columns] <- sapply(tg[num_columns], as.numeric)

tg_int <- tg[,c("Tenure.Months", "Monthly.Charges", "Total.Extra.Data.Charges")]
tg_int <- data.frame(scale(tg_int))
head(tg_int,5)
##   Tenure.Months Monthly.Charges Total.Extra.Data.Charges
## 1        -0.952          0.0279                   -0.273
## 2        -0.952         -0.1616                    0.125
## 3        -1.155          0.3037                   -0.273
## 4        -0.789          1.1046                   -0.273
## 5        -1.196          0.6360                   -0.273

Create discrete options of features Tenure is highy correlate with target variable, therefore I am trying to create a discrete feature from tenure, where I have made different bins of tenure(which is in months) such as ‘0-1 year’, ‘2-3 years’, ‘3-4 years’ etc. to show how will trnure.months actually affect whether customer choose to churn or not churn.

tg <- mutate(tg, tenure_bin = Tenure.Months)

tg$tenure_bin[tg$tenure_bin >=0 & tg$tenure_bin <= 12] <- '0-1 year'
tg$tenure_bin[tg$tenure_bin > 12 & tg$tenure_bin <= 24] <- '1-2 years'
tg$tenure_bin[tg$tenure_bin > 24 & tg$tenure_bin <= 36] <- '2-3 years'
tg$tenure_bin[tg$tenure_bin > 36 & tg$tenure_bin <= 48] <- '3-4 years'
tg$tenure_bin[tg$tenure_bin > 48 & tg$tenure_bin <= 60] <- '4-5 years'
tg$tenure_bin[tg$tenure_bin > 60 & tg$tenure_bin <= 72] <- '5-6 years'

tg$tenure_bin <- as.factor(tg$tenure_bin)

After checking the distribution of data in each tenure bin, we found that maximum number of customers have a tenure of either 0-1 years and followed by 5-6 years.

options(repr.plot.width =6, repr.plot.height = 3)
ggplot(tg, aes(tenure_bin, fill = tenure_bin)) + geom_bar()

Create dummy

tg_c1 <- tg[,c("Paperless.Billing", "Payment.Method","tenure_bin", "Tech.Support","Online.Backup","Contract","Internet.Service","Partner","Senior.Citizen","Dependents","Churn.Label")]
dummy<- data.frame(sapply(tg_c1,function(x) data.frame(model.matrix(~x-1,data =tg_c1))[,-1]))
tail(dummy, n=5)
##      Paperless.Billing Payment.Method.xCredit.card..automatic.
## 7039                 0                                       0
## 7040                 1                                       0
## 7041                 1                                       0
## 7042                 0                                       0
## 7043                 0                                       0
##      Payment.Method.xElectronic.check Payment.Method.xMailed.check
## 7039                                0                            1
## 7040                                1                            0
## 7041                                0                            1
## 7042                                0                            1
## 7043                                1                            0
##      tenure_bin.x1.2.years tenure_bin.x2.3.years tenure_bin.x3.4.years
## 7039                     1                     0                     0
## 7040                     1                     0                     0
## 7041                     0                     0                     0
## 7042                     0                     0                     0
## 7043                     0                     0                     0
##      tenure_bin.x4.5.years tenure_bin.x5.6.years
## 7039                     0                     0
## 7040                     0                     0
## 7041                     0                     0
## 7042                     0                     1
## 7043                     0                     1
##      Tech.Support.xNo.internet.service Tech.Support.xYes
## 7039                                 0                 1
## 7040                                 0                 0
## 7041                                 0                 0
## 7042                                 0                 1
## 7043                                 0                 0
##      Online.Backup.xNo.internet.service Online.Backup.xYes Contract.xOne.year
## 7039                                  0                  0                  1
## 7040                                  0                  0                  0
## 7041                                  0                  1                  0
## 7042                                  0                  0                  0
## 7043                                  0                  1                  0
##      Contract.xTwo.year Internet.Service.xFiber.optic Internet.Service.xNo
## 7039                  0                             0                    0
## 7040                  0                             1                    0
## 7041                  0                             0                    0
## 7042                  1                             0                    0
## 7043                  1                             0                    0
##      Partner Senior.Citizen Dependents Churn.Label
## 7039       0              0          0           0
## 7040       1              0          0           1
## 7041       0              0          0           0
## 7042       1              0          0           0
## 7043       1              0          0           0

Create final dataset

Creating the final dataset by combining the numeric and dummy data frames.

#Combining the data
tg_final <- cbind(tg_int,dummy)
head(tg_final)
##   Tenure.Months Monthly.Charges Total.Extra.Data.Charges Paperless.Billing
## 1        -0.952          0.0279                   -0.273                 1
## 2        -0.952         -0.1616                    0.125                 0
## 3        -1.155          0.3037                   -0.273                 1
## 4        -0.789          1.1046                   -0.273                 1
## 5        -1.196          0.6360                   -0.273                 1
## 6        -0.952          0.1541                   -0.273                 1
##   Payment.Method.xCredit.card..automatic. Payment.Method.xElectronic.check
## 1                                       0                                0
## 2                                       0                                0
## 3                                       0                                1
## 4                                       0                                1
## 5                                       0                                0
## 6                                       1                                0
##   Payment.Method.xMailed.check tenure_bin.x1.2.years tenure_bin.x2.3.years
## 1                            1                     0                     0
## 2                            1                     0                     0
## 3                            0                     0                     0
## 4                            0                     1                     0
## 5                            1                     0                     0
## 6                            0                     0                     0
##   tenure_bin.x3.4.years tenure_bin.x4.5.years tenure_bin.x5.6.years
## 1                     0                     0                     0
## 2                     0                     0                     0
## 3                     0                     0                     0
## 4                     0                     0                     0
## 5                     0                     0                     0
## 6                     0                     0                     0
##   Tech.Support.xNo.internet.service Tech.Support.xYes
## 1                                 0                 1
## 2                                 0                 0
## 3                                 0                 0
## 4                                 0                 0
## 5                                 0                 1
## 6                                 0                 1
##   Online.Backup.xNo.internet.service Online.Backup.xYes Contract.xOne.year
## 1                                  0                  1                  1
## 2                                  0                  0                  0
## 3                                  0                  0                  0
## 4                                  0                  1                  0
## 5                                  0                  0                  0
## 6                                  0                  0                  0
##   Contract.xTwo.year Internet.Service.xFiber.optic Internet.Service.xNo Partner
## 1                  0                             0                    0       1
## 2                  0                             0                    0       0
## 3                  0                             1                    0       0
## 4                  0                             1                    0       1
## 5                  0                             1                    0       1
## 6                  0                             0                    0       0
##   Senior.Citizen Dependents Churn.Label
## 1              0          0           0
## 2              0          0           0
## 3              0          0           1
## 4              1          0           1
## 5              1          0           1
## 6              0          1           0
tg_final$Churn.Label<- as.factor(tg_final$Churn.Label)
head(tg_final,5)
##   Tenure.Months Monthly.Charges Total.Extra.Data.Charges Paperless.Billing
## 1        -0.952          0.0279                   -0.273                 1
## 2        -0.952         -0.1616                    0.125                 0
## 3        -1.155          0.3037                   -0.273                 1
## 4        -0.789          1.1046                   -0.273                 1
## 5        -1.196          0.6360                   -0.273                 1
##   Payment.Method.xCredit.card..automatic. Payment.Method.xElectronic.check
## 1                                       0                                0
## 2                                       0                                0
## 3                                       0                                1
## 4                                       0                                1
## 5                                       0                                0
##   Payment.Method.xMailed.check tenure_bin.x1.2.years tenure_bin.x2.3.years
## 1                            1                     0                     0
## 2                            1                     0                     0
## 3                            0                     0                     0
## 4                            0                     1                     0
## 5                            1                     0                     0
##   tenure_bin.x3.4.years tenure_bin.x4.5.years tenure_bin.x5.6.years
## 1                     0                     0                     0
## 2                     0                     0                     0
## 3                     0                     0                     0
## 4                     0                     0                     0
## 5                     0                     0                     0
##   Tech.Support.xNo.internet.service Tech.Support.xYes
## 1                                 0                 1
## 2                                 0                 0
## 3                                 0                 0
## 4                                 0                 0
## 5                                 0                 1
##   Online.Backup.xNo.internet.service Online.Backup.xYes Contract.xOne.year
## 1                                  0                  1                  1
## 2                                  0                  0                  0
## 3                                  0                  0                  0
## 4                                  0                  1                  0
## 5                                  0                  0                  0
##   Contract.xTwo.year Internet.Service.xFiber.optic Internet.Service.xNo Partner
## 1                  0                             0                    0       1
## 2                  0                             0                    0       0
## 3                  0                             1                    0       0
## 4                  0                             1                    0       1
## 5                  0                             1                    0       1
##   Senior.Citizen Dependents Churn.Label
## 1              0          0           0
## 2              0          0           0
## 3              0          0           1
## 4              1          0           1
## 5              1          0           1

Random Forest Model

Split data

Splitting data into train and validation

#Splitting the data
library(randomForest)
set.seed(123)
indices = sample.split(tg_final$Churn.Label, SplitRatio = 0.7)
train = tg_final[indices,]
validation = tg_final[!(indices),]
xkablesummary(train)
Table: Statistics summary.
Tenure.Months Monthly.Charges Total.Extra.Data.Charges Paperless.Billing Payment.Method.xCredit.card..automatic. Payment.Method.xElectronic.check Payment.Method.xMailed.check tenure_bin.x1.2.years tenure_bin.x2.3.years tenure_bin.x3.4.years tenure_bin.x4.5.years tenure_bin.x5.6.years Tech.Support.xNo.internet.service Tech.Support.xYes Online.Backup.xNo.internet.service Online.Backup.xYes Contract.xOne.year Contract.xTwo.year Internet.Service.xFiber.optic Internet.Service.xNo Partner Senior.Citizen Dependents Churn.Label
Min Min. :-1.318 Min. :-1.546 Min. :-0.27 Min. :0.000 Min. :0.00 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.00 0:3622
Q1 1st Qu.:-0.952 1st Qu.:-0.965 1st Qu.:-0.27 1st Qu.:0.000 1st Qu.:0.00 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.00 1:1308
Median Median :-0.178 Median : 0.186 Median :-0.27 Median :1.000 Median :0.00 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.00 NA
Mean Mean :-0.010 Mean : 0.001 Mean : 0.00 Mean :0.589 Mean :0.22 Mean :0.328 Mean :0.236 Mean :0.151 Mean :0.123 Mean :0.103 Mean :0.114 Mean :0.199 Mean :0.217 Mean :0.291 Mean :0.217 Mean :0.348 Mean :0.207 Mean :0.238 Mean :0.442 Mean :0.217 Mean :0.489 Mean :0.165 Mean :0.23 NA
Q3 3rd Qu.: 0.921 3rd Qu.: 0.834 3rd Qu.:-0.27 3rd Qu.:1.000 3rd Qu.:0.00 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.00 NA
Max Max. : 1.614 Max. : 1.794 Max. : 5.70 Max. :1.000 Max. :1.00 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.00 NA
xkablesummary(validation)
Table: Statistics summary.
Tenure.Months Monthly.Charges Total.Extra.Data.Charges Paperless.Billing Payment.Method.xCredit.card..automatic. Payment.Method.xElectronic.check Payment.Method.xMailed.check tenure_bin.x1.2.years tenure_bin.x2.3.years tenure_bin.x3.4.years tenure_bin.x4.5.years tenure_bin.x5.6.years Tech.Support.xNo.internet.service Tech.Support.xYes Online.Backup.xNo.internet.service Online.Backup.xYes Contract.xOne.year Contract.xTwo.year Internet.Service.xFiber.optic Internet.Service.xNo Partner Senior.Citizen Dependents Churn.Label
Min Min. :-1.318 Min. :-1.541 Min. :-0.27 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 0:1552
Q1 1st Qu.:-0.992 1st Qu.:-0.984 1st Qu.:-0.27 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1: 561
Median Median :-0.056 Median : 0.184 Median :-0.27 Median :1.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 Median :0.000 NA
Mean Mean : 0.024 Mean :-0.002 Mean : 0.01 Mean :0.601 Mean :0.206 Mean :0.354 Mean :0.212 Mean :0.132 Mean :0.108 Mean :0.121 Mean :0.128 Mean :0.201 Mean :0.216 Mean :0.288 Mean :0.216 Mean :0.338 Mean :0.213 Mean :0.247 Mean :0.435 Mean :0.216 Mean :0.469 Mean :0.156 Mean :0.233 NA
Q3 3rd Qu.: 0.962 3rd Qu.: 0.835 3rd Qu.:-0.27 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.000 NA
Max Max. : 1.614 Max. : 1.776 Max. : 5.70 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.000 NA

Train randomforest model

#Training the RandomForest Model

model.rf <- randomForest(Churn.Label ~ ., data=train, proximity=F,importance = F,ntree=500,mtry=4, do.trace=F, na.action=na.roughfix)
model.rf
## 
## Call:
##  randomForest(formula = Churn.Label ~ ., data = train, proximity = F,      importance = F, ntree = 500, mtry = 4, do.trace = F, na.action = na.roughfix) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 20.1%
## Confusion matrix:
##      0   1 class.error
## 0 3207 415       0.115
## 1  577 731       0.441

The OOB error estimate comes to around 20%, so the model has around 80% out of sample accuracy for the training set. Let’s check the prediction and accuracy on validation data.

testPred <- predict(model.rf, newdata=validation[,-24])
#table(testPred2, validation2$Churn.Label)

confusionMatrix(validation$Churn.Label, testPred)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1371  181
##          1  236  325
##                                         
##                Accuracy : 0.803         
##                  95% CI : (0.785, 0.819)
##     No Information Rate : 0.761         
##     P-Value [Acc > NIR] : 2.06e-06      
##                                         
##                   Kappa : 0.478         
##                                         
##  Mcnemar's Test P-Value : 0.00818       
##                                         
##             Sensitivity : 0.853         
##             Specificity : 0.642         
##          Pos Pred Value : 0.883         
##          Neg Pred Value : 0.579         
##              Prevalence : 0.761         
##          Detection Rate : 0.649         
##    Detection Prevalence : 0.735         
##       Balanced Accuracy : 0.748         
##                                         
##        'Positive' Class : 0             
## 

The basic RandomForest model gives an accuracy of 80.3%( almost close enough to the OOB estimate), Sensitivity 85.3% and Specificity 64%.,

Variable Importance Plot

Below is the variable importance plot, that shows the most significant attribute in decreasing order by mean decrease in Gini. The Mean decrease Gini measures how pure the nodes are at the end of the tree. Higher the Gini Index, better is the homogeneity.

#Checking the variable Importance Plot
varImpPlot(model.rf)

Checking the AUC

model.rf.roc <- roc(response = validation$Churn.Label, predictor = as.numeric(testPred))
#print()
#plot(rf.roc, col = "red" , add = TRUE, print.auc.y = 0.85, print.auc = TRUE)
plot(model.rf.roc,col = "red",print.auc.y = 0.85, print.auc = TRUE )

Brief summary of Model

RandomForest:

Accuracy 80.3%, Sensitivity 85.3% Specificity 64%


3.3 SVM Model

str(filtered_joined_churn_df)
## 'data.frame':    7043 obs. of  22 variables:
##  $ Senior.Citizen                   : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 2 1 2 1 ...
##  $ Partner                          : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 1 2 2 1 2 ...
##  $ Dependents                       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 2 ...
##  $ Tenure.Months                    : num  9 9 4 13 3 9 71 63 7 65 ...
##  $ Phone.Service                    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Internet.Service                 : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 2 2 2 1 2 2 1 1 ...
##  $ Online.Security                  : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 1 3 3 3 3 ...
##  $ Online.Backup                    : Factor w/ 3 levels "No","No internet service",..: 3 1 1 3 1 1 3 1 1 3 ...
##  $ Tech.Support                     : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 3 3 3 3 1 3 ...
##  $ Contract                         : Factor w/ 3 levels "Month-to-month",..: 2 1 1 1 1 1 3 3 1 3 ...
##  $ Paperless.Billing                : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ Payment.Method                   : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 4 3 3 4 2 1 2 3 4 ...
##  $ Monthly.Charges                  : num  65.6 59.9 73.9 98 83.9 ...
##  $ Churn.Label                      : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 1 1 1 1 1 ...
##  $ Churn.Reason                     : Factor w/ 20 levels "Attitude of service provider",..: NA NA 18 19 15 NA NA NA NA NA ...
##  $ Satisfaction.Score               : Factor w/ 5 levels "1","2","3","4",..: 3 5 1 1 1 3 3 4 3 3 ...
##  $ Churn.Category                   : Factor w/ 5 levels "Attitude","Competitor",..: NA NA 2 3 3 NA NA NA NA NA ...
##  $ Avg.Monthly.Long.Distance.Charges: num  42.39 10.69 33.65 27.82 7.38 ...
##  $ Avg.Monthly.GB.Download          : num  16 10 30 4 11 73 14 7 21 14 ...
##  $ Unlimited.Data                   : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 1 2 2 ...
##  $ Total.Extra.Data.Charges         : num  0 10 0 0 0 0 0 20 0 0 ...
##  $ prob                             : num  0.119 0.431 0.493 0.618 0.548 ...
view(filtered_joined_churn_df)
my_data <- data.frame(filtered_joined_churn_df$Senior.Citizen,filtered_joined_churn_df$Churn.Label,filtered_joined_churn_df$Partner,filtered_joined_churn_df$Dependents,filtered_joined_churn_df$Tenure.Months,filtered_joined_churn_df$Phone.Service,filtered_joined_churn_df$Online.Security,filtered_joined_churn_df$Online.Backup,filtered_joined_churn_df$Tech.Support,filtered_joined_churn_df$Paperless.Billing,filtered_joined_churn_df$Unlimited.Data,filtered_joined_churn_df$Avg.Monthly.Long.Distance.Charges,filtered_joined_churn_df$Avg.Monthly.GB.Download,filtered_joined_churn_df$Monthly.Charges)

str(my_data)
## 'data.frame':    7043 obs. of  14 variables:
##  $ filtered_joined_churn_df.Senior.Citizen                   : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 2 1 2 1 ...
##  $ filtered_joined_churn_df.Churn.Label                      : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 1 1 1 1 1 ...
##  $ filtered_joined_churn_df.Partner                          : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 1 2 2 1 2 ...
##  $ filtered_joined_churn_df.Dependents                       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 2 ...
##  $ filtered_joined_churn_df.Tenure.Months                    : num  9 9 4 13 3 9 71 63 7 65 ...
##  $ filtered_joined_churn_df.Phone.Service                    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ filtered_joined_churn_df.Online.Security                  : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 1 3 3 3 3 ...
##  $ filtered_joined_churn_df.Online.Backup                    : Factor w/ 3 levels "No","No internet service",..: 3 1 1 3 1 1 3 1 1 3 ...
##  $ filtered_joined_churn_df.Tech.Support                     : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 3 3 3 3 1 3 ...
##  $ filtered_joined_churn_df.Paperless.Billing                : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ filtered_joined_churn_df.Unlimited.Data                   : Factor w/ 2 levels "No","Yes": 2 1 2 2 2 2 2 1 2 2 ...
##  $ filtered_joined_churn_df.Avg.Monthly.Long.Distance.Charges: num  42.39 10.69 33.65 27.82 7.38 ...
##  $ filtered_joined_churn_df.Avg.Monthly.GB.Download          : num  16 10 30 4 11 73 14 7 21 14 ...
##  $ filtered_joined_churn_df.Monthly.Charges                  : num  65.6 59.9 73.9 98 83.9 ...
### Libraries used
library(caret)
library(class)
## Partition of data
train_sample<-createDataPartition(y = my_data$filtered_joined_churn_df.Churn.Label, p= 0.7, list = FALSE)
train_data<-my_data[train_sample, ]
test_data<-my_data[-train_sample, ]
train_label<-my_data[train_sample,2]
test_label<-my_data[-train_sample,2]
### Linear SVM
svm_linear <- train(filtered_joined_churn_df.Churn.Label ~., data = train_data, method = "svmLinear", trControl=trainControl(method = "repeatedcv", number = 10, repeats = 3), preProcess = c("center", "scale"), tuneLength = 10)
svm_linear
## Support Vector Machines with Linear Kernel 
## 
## 4931 samples
##   13 predictor
##    2 classes: 'No', 'Yes' 
## 
## Pre-processing: centered (16), scaled (16) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 4438, 4437, 4437, 4438, 4438, 4438, ... 
## Resampling results:
## 
##   Accuracy  Kappa
##   0.802     0.467
## 
## Tuning parameter 'C' was held constant at a value of 1

It’s a linear model therefore, it just tested at value “C” =1.

Now, our model is trained with C value as 1. We are ready to predict classes for our test set and check the accuracy of our model using confusion matrix.

### Confusion Matrix
test_pred<-predict(svm_linear,test_data)
confusionMatrix(table(test_pred,test_data$filtered_joined_churn_df.Churn.Label))
## Confusion Matrix and Statistics
## 
##          
## test_pred   No  Yes
##       No  1381  245
##       Yes  171  315
##                                        
##                Accuracy : 0.803        
##                  95% CI : (0.785, 0.82)
##     No Information Rate : 0.735        
##     P-Value [Acc > NIR] : 1.5e-13      
##                                        
##                   Kappa : 0.472        
##                                        
##  Mcnemar's Test P-Value : 0.000345     
##                                        
##             Sensitivity : 0.890        
##             Specificity : 0.562        
##          Pos Pred Value : 0.849        
##          Neg Pred Value : 0.648        
##              Prevalence : 0.735        
##          Detection Rate : 0.654        
##    Detection Prevalence : 0.770        
##       Balanced Accuracy : 0.726        
##                                        
##        'Positive' Class : No           
## 

The output shows that our model accuracy for test set is 79.8%

We can also do some customization for selecting C value(Cost) in Linear classifier.

### griding for obtaining best C value
svm_grid<-train(filtered_joined_churn_df.Churn.Label~., data = train_data, method = "svmLinear", trControl=trainControl("cv", number = 10), tuneGrid = expand.grid(C=seq(0,2,length=20)), preProcess = c("center", "scale"))
# plot model accuracy vs different values of cost
plot(svm_grid)

svm_grid$bestTune
##       C
## 5 0.421
test_predgrid<-predict(svm_grid,test_data)

The above plot is showing that our classifier is giving best accuracy on C = 0.2.

checking its accuracy using confusion -matrix.

# Accuracy after finding best parameter
confusionMatrix(table(test_predgrid,test_data$filtered_joined_churn_df.Churn.Label))
## Confusion Matrix and Statistics
## 
##              
## test_predgrid   No  Yes
##           No  1381  244
##           Yes  171  316
##                                        
##                Accuracy : 0.804        
##                  95% CI : (0.786, 0.82)
##     No Information Rate : 0.735        
##     P-Value [Acc > NIR] : 1.01e-13     
##                                        
##                   Kappa : 0.474        
##                                        
##  Mcnemar's Test P-Value : 0.000409     
##                                        
##             Sensitivity : 0.890        
##             Specificity : 0.564        
##          Pos Pred Value : 0.850        
##          Neg Pred Value : 0.649        
##              Prevalence : 0.735        
##          Detection Rate : 0.654        
##    Detection Prevalence : 0.769        
##       Balanced Accuracy : 0.727        
##                                        
##        'Positive' Class : No           
## 

The results of the confusion matrix show that this time the accuracy on the test set is 79.9 %

### ROC Curve
library(pROC)
roc_svm_test <-plot(roc(as.numeric(test_data$filtered_joined_churn_df.Churn.Label),as.numeric(test_pred)),main="Comparaison ",col="#1c61b6")
plot(roc_svm_test, add = TRUE,col = "red", print.auc=TRUE, print.auc.x = 0.5, print.auc.y = 0.3)
legend(0.3, 0.2, legend = c("test-svm"), lty = c(1), col = c("blue"))

The area under the curve(AUC) is 72.4%


3.4 Decision Tree Model


Omit NA values


We have also tried to build the decision tree mode.A decision tree is a type of supervised machine learning used to categorize or make predictions based on how a previous set of questions were answered. The model is a form of supervised learning, meaning that the model is trained and tested on a set of data that contains the desired categorization

before building the actual tree , since the data is not so clean, we have tried to clean categorical variables, changing into factors.standardized numerical variables. I have also cleaned the na values we have a used label encoding for categorical variables.

We got the insight from our previous eda that tota extra data charges , avg monthly long dist chanrges has null on whether a customer to churn or not so we removed those variables from the model building.

loadPkg("rpart")

library(caret)

library(tidyr)

install.packages("dplyr")

library("dplyr")

filtered_joined_churn_df %>% na.omit(filtered_joined_churn_df)





filtered_joined_churn_df[complete.cases(filtered_joined_churn_df), ]

filtered_joined_churn_df %>% drop_na()
control1 <- rfeControl(functions = rpart, # random forest
                      method = "repeatedcv", # repeated cv
                      repeats = 5, # number of repeats
                      number = 10) # number of folds

Label encoding the categorical variables

head(filtered_joined_churn_df)
##   Senior.Citizen Partner Dependents Tenure.Months Phone.Service
## 1             No     Yes         No             9           Yes
## 2             No      No         No             9           Yes
## 3             No      No         No             4           Yes
## 4            Yes     Yes         No            13           Yes
## 5            Yes     Yes         No             3           Yes
## 6             No      No        Yes             9           Yes
##   Internet.Service Online.Security Online.Backup Tech.Support       Contract
## 1              DSL              No           Yes          Yes       One year
## 2              DSL              No            No           No Month-to-month
## 3      Fiber optic              No            No           No Month-to-month
## 4      Fiber optic              No           Yes           No Month-to-month
## 5      Fiber optic              No            No          Yes Month-to-month
## 6              DSL              No            No          Yes Month-to-month
##   Paperless.Billing          Payment.Method Monthly.Charges Churn.Label
## 1               Yes            Mailed check            65.6          No
## 2                No            Mailed check            59.9          No
## 3               Yes        Electronic check            73.9         Yes
## 4               Yes        Electronic check            98.0         Yes
## 5               Yes            Mailed check            83.9         Yes
## 6               Yes Credit card (automatic)            69.4          No
##              Churn.Reason Satisfaction.Score  Churn.Category
## 1                    <NA>                  3            <NA>
## 2                    <NA>                  5            <NA>
## 3          Price too high                  1      Competitor
## 4 Product dissatisfaction                  1 Dissatisfaction
## 5     Network reliability                  1 Dissatisfaction
## 6                    <NA>                  3            <NA>
##   Avg.Monthly.Long.Distance.Charges Avg.Monthly.GB.Download Unlimited.Data
## 1                             42.39                      16            Yes
## 2                             10.69                      10             No
## 3                             33.65                      30            Yes
## 4                             27.82                       4            Yes
## 5                              7.38                      11            Yes
## 6                             16.77                      73            Yes
##   Total.Extra.Data.Charges  prob
## 1                        0 0.119
## 2                       10 0.431
## 3                        0 0.493
## 4                        0 0.618
## 5                        0 0.548
## 6                        0 0.145
library(superml)




label <- LabelEncoder$new()


filtered_joined_churn_df$Internet.Service <- label$fit_transform(filtered_joined_churn_df$Internet.Service)



filtered_joined_churn_df$Contract <- label$fit_transform(filtered_joined_churn_df$Contract)



filtered_joined_churn_df$Payment.Method <- label$fit_transform(filtered_joined_churn_df$Payment.Method)
filtered_joined_churn_df$Churn.Label <- ifelse(filtered_joined_churn_df$Churn.Label == "Yes",1,0)

filtered_joined_churn_df$Dependents  <- ifelse(filtered_joined_churn_df$Dependents  == "Yes",1,0)

filtered_joined_churn_df$Senior.Citizen <- ifelse(filtered_joined_churn_df$Senior.Citizen== "Yes",1,0)

filtered_joined_churn_df$Partner <- ifelse(filtered_joined_churn_df$Partner== "Yes",1,0)

#filtered_joined_churn_df$Tenure.Months <- ifelse(filtered_joined_churn_df$Tenure.Months == "Yes",1,0)

filtered_joined_churn_df$Phone.Service <- ifelse(filtered_joined_churn_df$Phone.Service == "Yes",1,0)

filtered_joined_churn_df$Online.Security <- ifelse(filtered_joined_churn_df$Online.Security == "Yes",1,0)

filtered_joined_churn_df$Online.Backup <- ifelse(filtered_joined_churn_df$Online.Backup == "Yes",1,0)

filtered_joined_churn_df$Tech.Support <- ifelse(filtered_joined_churn_df$Tech.Support == "Yes",1,0)

filtered_joined_churn_df$Paperless.Billing <- ifelse(filtered_joined_churn_df$Paperless.Billing == "Yes",1,0)

filtered_joined_churn_df$Unlimited.Data <- ifelse(filtered_joined_churn_df$Unlimited.Data == "Yes",1,0)





head(filtered_joined_churn_df)
##   Senior.Citizen Partner Dependents Tenure.Months Phone.Service
## 1              0       1          0             9             1
## 2              0       0          0             9             1
## 3              0       0          0             4             1
## 4              1       1          0            13             1
## 5              1       1          0             3             1
## 6              0       0          1             9             1
##   Internet.Service Online.Security Online.Backup Tech.Support Contract
## 1                0               0             1            1        1
## 2                0               0             0            0        0
## 3                1               0             0            0        0
## 4                1               0             1            0        0
## 5                1               0             0            1        0
## 6                0               0             0            1        0
##   Paperless.Billing Payment.Method Monthly.Charges Churn.Label
## 1                 1              3            65.6           0
## 2                 0              3            59.9           0
## 3                 1              2            73.9           1
## 4                 1              2            98.0           1
## 5                 1              3            83.9           1
## 6                 1              1            69.4           0
##              Churn.Reason Satisfaction.Score  Churn.Category
## 1                    <NA>                  3            <NA>
## 2                    <NA>                  5            <NA>
## 3          Price too high                  1      Competitor
## 4 Product dissatisfaction                  1 Dissatisfaction
## 5     Network reliability                  1 Dissatisfaction
## 6                    <NA>                  3            <NA>
##   Avg.Monthly.Long.Distance.Charges Avg.Monthly.GB.Download Unlimited.Data
## 1                             42.39                      16              1
## 2                             10.69                      10              0
## 3                             33.65                      30              1
## 4                             27.82                       4              1
## 5                              7.38                      11              1
## 6                             16.77                      73              1
##   Total.Extra.Data.Charges  prob
## 1                        0 0.119
## 2                       10 0.431
## 3                        0 0.493
## 4                        0 0.618
## 5                        0 0.548
## 6                        0 0.145
myvars <- c("Churn.Label","Senior.Citizen", "Partner", "Dependents","Tenure.Months","Phone.Service","Online.Security","Online.Backup","Tech.Support","Paperless.Billing","Unlimited.Data","Avg.Monthly.Long.Distance.Charges","Avg.Monthly.GB.Download","Monthly.Charges","Tenure.Months")



n <- filtered_joined_churn_df[myvars]



my <- c("Senior.Citizen", "Partner", "Dependents","Tenure.Months","Phone.Service","Online.Security")

u = filtered_joined_churn_df[my]




a <- n[myvars]

typeof(a)
## [1] "list"
nrow(a)
## [1] 7043
head(a)
##   Churn.Label Senior.Citizen Partner Dependents Tenure.Months Phone.Service
## 1           0              0       1          0             9             1
## 2           0              0       0          0             9             1
## 3           1              0       0          0             4             1
## 4           1              1       1          0            13             1
## 5           1              1       1          0             3             1
## 6           0              0       0          1             9             1
##   Online.Security Online.Backup Tech.Support Paperless.Billing Unlimited.Data
## 1               0             1            1                 1              1
## 2               0             0            0                 0              0
## 3               0             0            0                 1              1
## 4               0             1            0                 1              1
## 5               0             0            1                 1              1
## 6               0             0            1                 1              1
##   Avg.Monthly.Long.Distance.Charges Avg.Monthly.GB.Download Monthly.Charges
## 1                             42.39                      16            65.6
## 2                             10.69                      10            59.9
## 3                             33.65                      30            73.9
## 4                             27.82                       4            98.0
## 5                              7.38                      11            83.9
## 6                             16.77                      73            69.4
##   Tenure.Months.1
## 1               9
## 2               9
## 3               4
## 4              13
## 5               3
## 6               9
myvars <- c("Senior.Citizen","Churn.Label", "Partner", "Dependents","Tenure.Months","Phone.Service","Online.Security","Online.Backup","Tech.Support","Paperless.Billing","Unlimited.Data","Avg.Monthly.Long.Distance.Charges","Avg.Monthly.GB.Download","Monthly.Charges","Tenure.Months")





n$Churn.Label <- as.factor(n$Churn.Label)
                              




head(n$Churn.Label)
## [1] 0 0 1 1 1 0
## Levels: 0 1
typeof(n$Churn.Label)
## [1] "integer"
b = as.factor(n["Churn.Label"])

Features of variable importance

fit_dt = rpart(Churn.Label~., data=filtered_joined_churn_df, method="class", control = list(maxdepth = 4))

varImp(fit_dt)
##                                   Overall
## Contract                              451
## Dependents                            170
## prob                                  590
## Satisfaction.Score                   1954
## Tenure.Months                         292
## Senior.Citizen                          0
## Partner                                 0
## Phone.Service                           0
## Internet.Service                        0
## Online.Security                         0
## Online.Backup                           0
## Tech.Support                            0
## Paperless.Billing                       0
## Payment.Method                          0
## Monthly.Charges                         0
## Churn.Reason                            0
## Churn.Category                          0
## Avg.Monthly.Long.Distance.Charges       0
## Avg.Monthly.GB.Download                 0
## Unlimited.Data                          0
## Total.Extra.Data.Charges                0

Decision Tree with Gini

myvars <- c("Senior.Citizen","Churn.Label", "Partner", "Dependents","Tenure.Months","Phone.Service","Online.Security","Online.Backup","Tech.Support","Paperless.Billing","Unlimited.Data","Avg.Monthly.GB.Download","Tenure.Months","Internet.Service","Contract")


model_df <- filtered_joined_churn_df[myvars]

We have removed Total.Extra.Data.Charges and Avg.Monthly.Long.Distance.Charges , as it is not significant from the test we performed above

set.seed(56)
library(caret)
split_train_test <- createDataPartition(model_df$Churn.Label,p=0.7,list=FALSE)
dtrain<- model_df[split_train_test,]
dtest<-  model_df[-split_train_test,]

# Remove Total Charges from the training dataset

Train-Test data


head(dtrain)
##   Senior.Citizen Churn.Label Partner Dependents Tenure.Months Phone.Service
## 2              0           0       0          0             9             1
## 3              0           1       0          0             4             1
## 4              1           1       1          0            13             1
## 5              1           1       1          0             3             1
## 6              0           0       0          1             9             1
## 7              1           0       1          0            71             1
##   Online.Security Online.Backup Tech.Support Paperless.Billing Unlimited.Data
## 2               0             0            0                 0              0
## 3               0             0            0                 1              1
## 4               0             1            0                 1              1
## 5               0             0            1                 1              1
## 6               0             0            1                 1              1
## 7               1             1            1                 1              1
##   Avg.Monthly.GB.Download Tenure.Months.1 Internet.Service Contract
## 2                      10               9                0        0
## 3                      30               4                1        0
## 4                       4              13                1        0
## 5                      11               3                1        0
## 6                      73               9                0        0
## 7                      14              71                1        2
head(dtest)
##    Senior.Citizen Churn.Label Partner Dependents Tenure.Months Phone.Service
## 1               0           0       1          0             9             1
## 8               0           0       1          0            63             1
## 10              0           0       1          1            65             1
## 11              0           0       0          0            54             0
## 12              0           0       1          1            72             1
## 16              0           0       1          1            71             1
##    Online.Security Online.Backup Tech.Support Paperless.Billing Unlimited.Data
## 1                0             1            1                 1              1
## 8                1             0            1                 1              0
## 10               1             1            1                 1              1
## 11               1             0            1                 0              1
## 12               1             1            1                 1              1
## 16               0             1            0                 1              1
##    Avg.Monthly.GB.Download Tenure.Months.1 Internet.Service Contract
## 1                       16               9                0        1
## 8                        7              63                1        2
## 10                      14              65                0        2
## 11                      10              54                0        2
## 12                      59              72                1        2
## 16                      12              71                1        2
loadPkg("rpart")

library(caret)

library(tidyr)

install.packages("dplyr")

library("dplyr")

library(superml)

loadPkg("caret")

loadPkg("rpart.plot")

loadPkg("rattle")

tr_fit <- rpart(Churn.Label ~., data = dtrain, method="class",na.action = na.exclude)
rpart.plot(tr_fit)

summary(tr_fit)
## Call:
## rpart(formula = Churn.Label ~ ., data = dtrain, na.action = na.exclude, 
##     method = "class")
##   n= 4931 
## 
##       CP nsplit rel error xerror   xstd
## 1 0.0468      0     1.000  1.000 0.0236
## 2 0.0333      4     0.813  0.915 0.0229
## 3 0.0205      5     0.780  0.825 0.0221
## 4 0.0129      6     0.759  0.775 0.0216
## 5 0.0100      9     0.719  0.766 0.0215
## 
## Variable importance
##                Contract           Tenure.Months         Tenure.Months.1 
##                      25                      19                      19 
##        Internet.Service            Tech.Support              Dependents 
##                       7                       6                       6 
##                 Partner         Online.Security Avg.Monthly.GB.Download 
##                       6                       5                       4 
##           Phone.Service           Online.Backup 
##                       1                       1 
## 
## Node number 1: 4931 observations,    complexity param=0.0468
##   predicted class=0  expected loss=0.268  P(node) =1
##     class counts:  3611  1320
##    probabilities: 0.732 0.268 
##   left son=2 (2217 obs) right son=3 (2714 obs)
##   Primary splits:
##       Contract                < 0.5  to the right, improve=311.0, (0 missing)
##       Tenure.Months           < 16.5 to the right, improve=194.0, (0 missing)
##       Tenure.Months.1         < 16.5 to the right, improve=194.0, (0 missing)
##       Dependents              < 0.5  to the right, improve=123.0, (0 missing)
##       Avg.Monthly.GB.Download < 1    to the left,  improve= 96.9, (0 missing)
##   Surrogate splits:
##       Tenure.Months   < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
##       Tenure.Months.1 < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
##       Tech.Support    < 0.5  to the right, agree=0.650, adj=0.222, (0 split)
##       Partner         < 0.5  to the right, agree=0.640, adj=0.200, (0 split)
##       Online.Security < 0.5  to the right, agree=0.630, adj=0.178, (0 split)
## 
## Node number 2: 2217 observations
##   predicted class=0  expected loss=0.0713  P(node) =0.45
##     class counts:  2059   158
##    probabilities: 0.929 0.071 
## 
## Node number 3: 2714 observations,    complexity param=0.0468
##   predicted class=0  expected loss=0.428  P(node) =0.55
##     class counts:  1552  1162
##    probabilities: 0.572 0.428 
##   left son=6 (443 obs) right son=7 (2271 obs)
##   Primary splits:
##       Dependents              < 0.5  to the right, improve=76.0, (0 missing)
##       Tenure.Months           < 1.5  to the right, improve=49.5, (0 missing)
##       Tenure.Months.1         < 1.5  to the right, improve=49.5, (0 missing)
##       Avg.Monthly.GB.Download < 1    to the left,  improve=43.5, (0 missing)
##       Internet.Service        < 1.5  to the right, improve=43.5, (0 missing)
## 
## Node number 6: 443 observations
##   predicted class=0  expected loss=0.16  P(node) =0.0898
##     class counts:   372    71
##    probabilities: 0.840 0.160 
## 
## Node number 7: 2271 observations,    complexity param=0.0468
##   predicted class=0  expected loss=0.48  P(node) =0.461
##     class counts:  1180  1091
##    probabilities: 0.520 0.480 
##   left son=14 (270 obs) right son=15 (2001 obs)
##   Primary splits:
##       Avg.Monthly.GB.Download < 1    to the left,  improve=38.5, (0 missing)
##       Internet.Service        < 1.5  to the right, improve=38.5, (0 missing)
##       Tenure.Months           < 1.5  to the right, improve=37.8, (0 missing)
##       Tenure.Months.1         < 1.5  to the right, improve=37.8, (0 missing)
##       Paperless.Billing       < 0.5  to the left,  improve=32.3, (0 missing)
##   Surrogate splits:
##       Internet.Service < 1.5  to the right, agree=1, adj=1, (0 split)
## 
## Node number 14: 270 observations
##   predicted class=0  expected loss=0.23  P(node) =0.0548
##     class counts:   208    62
##    probabilities: 0.770 0.230 
## 
## Node number 15: 2001 observations,    complexity param=0.0468
##   predicted class=1  expected loss=0.486  P(node) =0.406
##     class counts:   972  1029
##    probabilities: 0.486 0.514 
##   left son=30 (656 obs) right son=31 (1345 obs)
##   Primary splits:
##       Internet.Service < 0.5  to the left,  improve=49.4, (0 missing)
##       Tenure.Months    < 16.5 to the right, improve=46.5, (0 missing)
##       Tenure.Months.1  < 16.5 to the right, improve=46.5, (0 missing)
##       Online.Security  < 0.5  to the right, improve=32.6, (0 missing)
##       Tech.Support     < 0.5  to the right, improve=25.4, (0 missing)
##   Surrogate splits:
##       Phone.Service < 0.5  to the left,  agree=0.771, adj=0.302, (0 split)
## 
## Node number 30: 656 observations,    complexity param=0.0205
##   predicted class=0  expected loss=0.355  P(node) =0.133
##     class counts:   423   233
##    probabilities: 0.645 0.355 
##   left son=60 (451 obs) right son=61 (205 obs)
##   Primary splits:
##       Tenure.Months   < 3.5  to the right, improve=26.50, (0 missing)
##       Tenure.Months.1 < 3.5  to the right, improve=26.50, (0 missing)
##       Online.Security < 0.5  to the right, improve= 6.06, (0 missing)
##       Online.Backup   < 0.5  to the right, improve= 5.24, (0 missing)
##       Tech.Support    < 0.5  to the right, improve= 4.21, (0 missing)
##   Surrogate splits:
##       Tenure.Months.1 < 3.5  to the right, agree=1, adj=1, (0 split)
## 
## Node number 31: 1345 observations,    complexity param=0.0333
##   predicted class=1  expected loss=0.408  P(node) =0.273
##     class counts:   549   796
##    probabilities: 0.408 0.592 
##   left son=62 (660 obs) right son=63 (685 obs)
##   Primary splits:
##       Tenure.Months     < 16.5 to the right, improve=40.6, (0 missing)
##       Tenure.Months.1   < 16.5 to the right, improve=40.6, (0 missing)
##       Online.Security   < 0.5  to the right, improve=16.9, (0 missing)
##       Tech.Support      < 0.5  to the right, improve=12.8, (0 missing)
##       Paperless.Billing < 0.5  to the left,  improve= 6.6, (0 missing)
##   Surrogate splits:
##       Tenure.Months.1 < 16.5 to the right, agree=1.000, adj=1.000, (0 split)
##       Online.Backup   < 0.5  to the right, agree=0.619, adj=0.223, (0 split)
##       Partner         < 0.5  to the right, agree=0.599, adj=0.182, (0 split)
##       Online.Security < 0.5  to the right, agree=0.575, adj=0.135, (0 split)
##       Tech.Support    < 0.5  to the right, agree=0.558, adj=0.098, (0 split)
## 
## Node number 60: 451 observations
##   predicted class=0  expected loss=0.259  P(node) =0.0915
##     class counts:   334   117
##    probabilities: 0.741 0.259 
## 
## Node number 61: 205 observations
##   predicted class=1  expected loss=0.434  P(node) =0.0416
##     class counts:    89   116
##    probabilities: 0.434 0.566 
## 
## Node number 62: 660 observations,    complexity param=0.0129
##   predicted class=0  expected loss=0.467  P(node) =0.134
##     class counts:   352   308
##    probabilities: 0.533 0.467 
##   left son=124 (115 obs) right son=125 (545 obs)
##   Primary splits:
##       Tenure.Months           < 53.5 to the right, improve=9.00, (0 missing)
##       Tenure.Months.1         < 53.5 to the right, improve=9.00, (0 missing)
##       Tech.Support            < 0.5  to the right, improve=7.76, (0 missing)
##       Online.Security         < 0.5  to the right, improve=5.07, (0 missing)
##       Avg.Monthly.GB.Download < 28.5 to the right, improve=4.80, (0 missing)
##   Surrogate splits:
##       Tenure.Months.1 < 53.5 to the right, agree=1, adj=1, (0 split)
## 
## Node number 63: 685 observations
##   predicted class=1  expected loss=0.288  P(node) =0.139
##     class counts:   197   488
##    probabilities: 0.288 0.712 
## 
## Node number 124: 115 observations
##   predicted class=0  expected loss=0.287  P(node) =0.0233
##     class counts:    82    33
##    probabilities: 0.713 0.287 
## 
## Node number 125: 545 observations,    complexity param=0.0129
##   predicted class=1  expected loss=0.495  P(node) =0.111
##     class counts:   270   275
##    probabilities: 0.495 0.505 
##   left son=250 (91 obs) right son=251 (454 obs)
##   Primary splits:
##       Avg.Monthly.GB.Download < 29.5 to the right, improve=5.87, (0 missing)
##       Tech.Support            < 0.5  to the right, improve=4.85, (0 missing)
##       Partner                 < 0.5  to the left,  improve=4.62, (0 missing)
##       Paperless.Billing       < 0.5  to the left,  improve=3.92, (0 missing)
##       Senior.Citizen          < 0.5  to the left,  improve=3.54, (0 missing)
## 
## Node number 250: 91 observations
##   predicted class=0  expected loss=0.341  P(node) =0.0185
##     class counts:    60    31
##    probabilities: 0.659 0.341 
## 
## Node number 251: 454 observations,    complexity param=0.0129
##   predicted class=1  expected loss=0.463  P(node) =0.0921
##     class counts:   210   244
##    probabilities: 0.463 0.537 
##   left son=502 (83 obs) right son=503 (371 obs)
##   Primary splits:
##       Tech.Support      < 0.5  to the right, improve=4.69, (0 missing)
##       Partner           < 0.5  to the left,  improve=3.67, (0 missing)
##       Paperless.Billing < 0.5  to the left,  improve=2.97, (0 missing)
##       Online.Security   < 0.5  to the right, improve=1.94, (0 missing)
##       Senior.Citizen    < 0.5  to the left,  improve=1.64, (0 missing)
## 
## Node number 502: 83 observations
##   predicted class=0  expected loss=0.386  P(node) =0.0168
##     class counts:    51    32
##    probabilities: 0.614 0.386 
## 
## Node number 503: 371 observations
##   predicted class=1  expected loss=0.429  P(node) =0.0752
##     class counts:   159   212
##    probabilities: 0.429 0.571

From this decision tree, we can interpret the following:

The contract variable is the most important. Customers with month-to-month contracts are more likely to churn.Customers who has a dependents (value=1) has the higher chance to churn. Customers with DSL internet service are less likely to churn. Customers who have stayed longer than 54 months are less likely to churn. Now let’s assess the prediction accuracy of the decision tree model by investigating how well it predicts churn in the test subset. We will begin with the confustion matrix, which is a useful display of classification accuracy. It displays the following information:


true positives (TP): These are cases in which we predicted yes (they churned), and they did churn. true negatives (TN): We predicted no, and they didn’t churn. false positives (FP): We predicted yes, but they didn’t actually churn. (Also known as a “Type I error.”) false negatives (FN): We predicted no, but they actually churned. (Also known as a “Type II error.”) Let’s examine the confusion matrix for our decision tree model.


tr_prob1 <- predict(tr_fit, dtest)
tr_pred1 <- ifelse(tr_prob1[,2] > 0.5,"Yes","No")
table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
##          Actual
## Predicted    0    1
##       No  1354  208
##       Yes  209  341

From this confusion matrix, we can see that the model performs well at predicting non-churning customers (1354 correct vs. 208 incorrect) and predicting churning customers 341 correct vs. 209 incorrect).


tr_prob2 <- predict(tr_fit, dtrain)
tr_pred2 <- ifelse(tr_prob2[,2] > 0.5,"Yes","No")
tr_tab1 <- table(Predicted = tr_pred2, Actual = dtrain$Churn.Label)
tr_tab2 <- table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
# # Train
# confusionMatrix(
#   as.factor(tr_pred2),
#   as.factor(dtrain$Churn.Label),na.action = na.pass,
#   positive = "Yes" 
# )
# 
# #Test
# confusionMatrix(
# as.factor(tr_pred1),
#   as.factor(dtest$Churn.Label),
#   positive = "Yes" 
# )

tr_acc <- sum(diag(tr_tab2))/sum(tr_tab2)
tr_acc
## [1] 0.803

The overall accuracy of the model is 80.26%


library(pROC)
ROC_rf <- roc(dtest$Churn.Label, tr_prob1[,2])


ROC_rf_auc <- auc(ROC_rf)

plot(ROC_rf, col = "green", main = "ROC Curve for the decision tree with gini")

paste("Accuracy decision tree with gini: ", mean(dtest$Churn.Labe == round(tr_prob1[,2], digits = 0)))
## [1] "Accuracy decision tree with gini:  0.802556818181818"
paste("Area under curve for decision tree with gini is: ", ROC_rf_auc)
## [1] "Area under curve for decision tree with gini is:  0.824659387684466"

This is the auc roc curve for the decision tree with gini parameter

The Area Under the Curve (AUC) is the measure of the ability of a classifier to distinguish between classes and is used as a summary of the ROC curve. The higher the AUC, the better the performance of the model at distinguishing between the positive and negative classes.


The overall accuracy of the decision tree with gini model is 80.26% The AUC of the decision tree with gini model is 0.82


Decision Tree with Entropy- rpart

myvars <- c("Senior.Citizen","Churn.Label", "Partner", "Dependents","Tenure.Months","Phone.Service","Online.Security","Online.Backup","Tech.Support","Paperless.Billing","Unlimited.Data","Avg.Monthly.GB.Download","Tenure.Months","Internet.Service","Contract")


model_df <- filtered_joined_churn_df[myvars]

We have removed Total.Extra.Data.Charges and Avg.Monthly.Long.Distance.Charges , as it is not significant from the test we performed above

set.seed(56)
library(caret)
split_train_test <- createDataPartition(model_df$Churn.Label,p=0.7,list=FALSE)
dtrain<- model_df[split_train_test,]
dtest<-  model_df[-split_train_test,]

# Remove Total Charges from the training dataset

Train-Test data


head(dtrain)
##   Senior.Citizen Churn.Label Partner Dependents Tenure.Months Phone.Service
## 2              0           0       0          0             9             1
## 3              0           1       0          0             4             1
## 4              1           1       1          0            13             1
## 5              1           1       1          0             3             1
## 6              0           0       0          1             9             1
## 7              1           0       1          0            71             1
##   Online.Security Online.Backup Tech.Support Paperless.Billing Unlimited.Data
## 2               0             0            0                 0              0
## 3               0             0            0                 1              1
## 4               0             1            0                 1              1
## 5               0             0            1                 1              1
## 6               0             0            1                 1              1
## 7               1             1            1                 1              1
##   Avg.Monthly.GB.Download Tenure.Months.1 Internet.Service Contract
## 2                      10               9                0        0
## 3                      30               4                1        0
## 4                       4              13                1        0
## 5                      11               3                1        0
## 6                      73               9                0        0
## 7                      14              71                1        2
head(dtest)
##    Senior.Citizen Churn.Label Partner Dependents Tenure.Months Phone.Service
## 1               0           0       1          0             9             1
## 8               0           0       1          0            63             1
## 10              0           0       1          1            65             1
## 11              0           0       0          0            54             0
## 12              0           0       1          1            72             1
## 16              0           0       1          1            71             1
##    Online.Security Online.Backup Tech.Support Paperless.Billing Unlimited.Data
## 1                0             1            1                 1              1
## 8                1             0            1                 1              0
## 10               1             1            1                 1              1
## 11               1             0            1                 0              1
## 12               1             1            1                 1              1
## 16               0             1            0                 1              1
##    Avg.Monthly.GB.Download Tenure.Months.1 Internet.Service Contract
## 1                       16               9                0        1
## 8                        7              63                1        2
## 10                      14              65                0        2
## 11                      10              54                0        2
## 12                      59              72                1        2
## 16                      12              71                1        2
loadPkg("rpart")

library(caret)

library(tidyr)

install.packages("dplyr")

library("dplyr")

library(superml)

loadPkg("caret")

loadPkg("rpart.plot")

loadPkg("rattle")

tr_fit <- rpart(Churn.Label ~., data = dtrain, method="class",na.action = na.exclude,parms=list(split='entropy'))
rpart.plot(tr_fit)

summary(tr_fit)
## Call:
## rpart(formula = Churn.Label ~ ., data = dtrain, na.action = na.exclude, 
##     method = "class", parms = list(split = "entropy"))
##   n= 4931 
## 
##       CP nsplit rel error xerror   xstd
## 1 0.0468      0     1.000  1.000 0.0236
## 2 0.0333      4     0.813  0.915 0.0229
## 3 0.0205      5     0.780  0.825 0.0221
## 4 0.0129      6     0.759  0.775 0.0216
## 5 0.0100      9     0.719  0.766 0.0215
## 
## Variable importance
##                Contract           Tenure.Months         Tenure.Months.1 
##                      25                      19                      19 
##        Internet.Service            Tech.Support              Dependents 
##                       7                       6                       6 
##                 Partner         Online.Security Avg.Monthly.GB.Download 
##                       6                       5                       4 
##           Phone.Service           Online.Backup 
##                       1                       1 
## 
## Node number 1: 4931 observations,    complexity param=0.0468
##   predicted class=0  expected loss=0.268  P(node) =1
##     class counts:  3611  1320
##    probabilities: 0.732 0.268 
##   left son=2 (2217 obs) right son=3 (2714 obs)
##   Primary splits:
##       Contract                < 0.5  to the right, improve=311.0, (0 missing)
##       Tenure.Months           < 16.5 to the right, improve=194.0, (0 missing)
##       Tenure.Months.1         < 16.5 to the right, improve=194.0, (0 missing)
##       Dependents              < 0.5  to the right, improve=123.0, (0 missing)
##       Avg.Monthly.GB.Download < 1    to the left,  improve= 96.9, (0 missing)
##   Surrogate splits:
##       Tenure.Months   < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
##       Tenure.Months.1 < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
##       Tech.Support    < 0.5  to the right, agree=0.650, adj=0.222, (0 split)
##       Partner         < 0.5  to the right, agree=0.640, adj=0.200, (0 split)
##       Online.Security < 0.5  to the right, agree=0.630, adj=0.178, (0 split)
## 
## Node number 2: 2217 observations
##   predicted class=0  expected loss=0.0713  P(node) =0.45
##     class counts:  2059   158
##    probabilities: 0.929 0.071 
## 
## Node number 3: 2714 observations,    complexity param=0.0468
##   predicted class=0  expected loss=0.428  P(node) =0.55
##     class counts:  1552  1162
##    probabilities: 0.572 0.428 
##   left son=6 (443 obs) right son=7 (2271 obs)
##   Primary splits:
##       Dependents              < 0.5  to the right, improve=76.0, (0 missing)
##       Tenure.Months           < 1.5  to the right, improve=49.5, (0 missing)
##       Tenure.Months.1         < 1.5  to the right, improve=49.5, (0 missing)
##       Avg.Monthly.GB.Download < 1    to the left,  improve=43.5, (0 missing)
##       Internet.Service        < 1.5  to the right, improve=43.5, (0 missing)
## 
## Node number 6: 443 observations
##   predicted class=0  expected loss=0.16  P(node) =0.0898
##     class counts:   372    71
##    probabilities: 0.840 0.160 
## 
## Node number 7: 2271 observations,    complexity param=0.0468
##   predicted class=0  expected loss=0.48  P(node) =0.461
##     class counts:  1180  1091
##    probabilities: 0.520 0.480 
##   left son=14 (270 obs) right son=15 (2001 obs)
##   Primary splits:
##       Avg.Monthly.GB.Download < 1    to the left,  improve=38.5, (0 missing)
##       Internet.Service        < 1.5  to the right, improve=38.5, (0 missing)
##       Tenure.Months           < 1.5  to the right, improve=37.8, (0 missing)
##       Tenure.Months.1         < 1.5  to the right, improve=37.8, (0 missing)
##       Paperless.Billing       < 0.5  to the left,  improve=32.3, (0 missing)
##   Surrogate splits:
##       Internet.Service < 1.5  to the right, agree=1, adj=1, (0 split)
## 
## Node number 14: 270 observations
##   predicted class=0  expected loss=0.23  P(node) =0.0548
##     class counts:   208    62
##    probabilities: 0.770 0.230 
## 
## Node number 15: 2001 observations,    complexity param=0.0468
##   predicted class=1  expected loss=0.486  P(node) =0.406
##     class counts:   972  1029
##    probabilities: 0.486 0.514 
##   left son=30 (656 obs) right son=31 (1345 obs)
##   Primary splits:
##       Internet.Service < 0.5  to the left,  improve=49.4, (0 missing)
##       Tenure.Months    < 16.5 to the right, improve=46.5, (0 missing)
##       Tenure.Months.1  < 16.5 to the right, improve=46.5, (0 missing)
##       Online.Security  < 0.5  to the right, improve=32.6, (0 missing)
##       Tech.Support     < 0.5  to the right, improve=25.4, (0 missing)
##   Surrogate splits:
##       Phone.Service < 0.5  to the left,  agree=0.771, adj=0.302, (0 split)
## 
## Node number 30: 656 observations,    complexity param=0.0205
##   predicted class=0  expected loss=0.355  P(node) =0.133
##     class counts:   423   233
##    probabilities: 0.645 0.355 
##   left son=60 (451 obs) right son=61 (205 obs)
##   Primary splits:
##       Tenure.Months   < 3.5  to the right, improve=26.50, (0 missing)
##       Tenure.Months.1 < 3.5  to the right, improve=26.50, (0 missing)
##       Online.Security < 0.5  to the right, improve= 6.06, (0 missing)
##       Online.Backup   < 0.5  to the right, improve= 5.24, (0 missing)
##       Tech.Support    < 0.5  to the right, improve= 4.21, (0 missing)
##   Surrogate splits:
##       Tenure.Months.1 < 3.5  to the right, agree=1, adj=1, (0 split)
## 
## Node number 31: 1345 observations,    complexity param=0.0333
##   predicted class=1  expected loss=0.408  P(node) =0.273
##     class counts:   549   796
##    probabilities: 0.408 0.592 
##   left son=62 (660 obs) right son=63 (685 obs)
##   Primary splits:
##       Tenure.Months     < 16.5 to the right, improve=40.6, (0 missing)
##       Tenure.Months.1   < 16.5 to the right, improve=40.6, (0 missing)
##       Online.Security   < 0.5  to the right, improve=16.9, (0 missing)
##       Tech.Support      < 0.5  to the right, improve=12.8, (0 missing)
##       Paperless.Billing < 0.5  to the left,  improve= 6.6, (0 missing)
##   Surrogate splits:
##       Tenure.Months.1 < 16.5 to the right, agree=1.000, adj=1.000, (0 split)
##       Online.Backup   < 0.5  to the right, agree=0.619, adj=0.223, (0 split)
##       Partner         < 0.5  to the right, agree=0.599, adj=0.182, (0 split)
##       Online.Security < 0.5  to the right, agree=0.575, adj=0.135, (0 split)
##       Tech.Support    < 0.5  to the right, agree=0.558, adj=0.098, (0 split)
## 
## Node number 60: 451 observations
##   predicted class=0  expected loss=0.259  P(node) =0.0915
##     class counts:   334   117
##    probabilities: 0.741 0.259 
## 
## Node number 61: 205 observations
##   predicted class=1  expected loss=0.434  P(node) =0.0416
##     class counts:    89   116
##    probabilities: 0.434 0.566 
## 
## Node number 62: 660 observations,    complexity param=0.0129
##   predicted class=0  expected loss=0.467  P(node) =0.134
##     class counts:   352   308
##    probabilities: 0.533 0.467 
##   left son=124 (115 obs) right son=125 (545 obs)
##   Primary splits:
##       Tenure.Months           < 53.5 to the right, improve=9.00, (0 missing)
##       Tenure.Months.1         < 53.5 to the right, improve=9.00, (0 missing)
##       Tech.Support            < 0.5  to the right, improve=7.76, (0 missing)
##       Online.Security         < 0.5  to the right, improve=5.07, (0 missing)
##       Avg.Monthly.GB.Download < 28.5 to the right, improve=4.80, (0 missing)
##   Surrogate splits:
##       Tenure.Months.1 < 53.5 to the right, agree=1, adj=1, (0 split)
## 
## Node number 63: 685 observations
##   predicted class=1  expected loss=0.288  P(node) =0.139
##     class counts:   197   488
##    probabilities: 0.288 0.712 
## 
## Node number 124: 115 observations
##   predicted class=0  expected loss=0.287  P(node) =0.0233
##     class counts:    82    33
##    probabilities: 0.713 0.287 
## 
## Node number 125: 545 observations,    complexity param=0.0129
##   predicted class=1  expected loss=0.495  P(node) =0.111
##     class counts:   270   275
##    probabilities: 0.495 0.505 
##   left son=250 (91 obs) right son=251 (454 obs)
##   Primary splits:
##       Avg.Monthly.GB.Download < 29.5 to the right, improve=5.87, (0 missing)
##       Tech.Support            < 0.5  to the right, improve=4.85, (0 missing)
##       Partner                 < 0.5  to the left,  improve=4.62, (0 missing)
##       Paperless.Billing       < 0.5  to the left,  improve=3.92, (0 missing)
##       Senior.Citizen          < 0.5  to the left,  improve=3.54, (0 missing)
## 
## Node number 250: 91 observations
##   predicted class=0  expected loss=0.341  P(node) =0.0185
##     class counts:    60    31
##    probabilities: 0.659 0.341 
## 
## Node number 251: 454 observations,    complexity param=0.0129
##   predicted class=1  expected loss=0.463  P(node) =0.0921
##     class counts:   210   244
##    probabilities: 0.463 0.537 
##   left son=502 (83 obs) right son=503 (371 obs)
##   Primary splits:
##       Tech.Support      < 0.5  to the right, improve=4.69, (0 missing)
##       Partner           < 0.5  to the left,  improve=3.67, (0 missing)
##       Paperless.Billing < 0.5  to the left,  improve=2.97, (0 missing)
##       Online.Security   < 0.5  to the right, improve=1.94, (0 missing)
##       Senior.Citizen    < 0.5  to the left,  improve=1.64, (0 missing)
## 
## Node number 502: 83 observations
##   predicted class=0  expected loss=0.386  P(node) =0.0168
##     class counts:    51    32
##    probabilities: 0.614 0.386 
## 
## Node number 503: 371 observations
##   predicted class=1  expected loss=0.429  P(node) =0.0752
##     class counts:   159   212
##    probabilities: 0.429 0.571

The overall accuracy of the tree model with entropy is 80.26%

From this decision tree, we can interpret the following:

The contract variable is the most important. Customers with month-to-month contracts are more likely to churn.Customers who has a dependents (value=1) has the higher chance to churn. Customers with DSL internet service are less likely to churn. Customers who have stayed longer than 54 months are less likely to churn. Now let’s assess the prediction accuracy of the decision tree model by investigating how well it predicts churn in the test subset. We will begin with the confustion matrix, which is a useful display of classification accuracy. It displays the following information:


true positives (TP): These are cases in which we predicted yes (they churned), and they did churn. true negatives (TN): We predicted no, and they didn’t churn. false positives (FP): We predicted yes, but they didn’t actually churn. (Also known as a “Type I error.”) false negatives (FN): We predicted no, but they actually churned. (Also known as a “Type II error.”) Let’s examine the confusion matrix for our decision tree model.


tr_prob1 <- predict(tr_fit, dtest)
tr_pred1 <- ifelse(tr_prob1[,2] > 0.5,"Yes","No")
table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
##          Actual
## Predicted    0    1
##       No  1354  208
##       Yes  209  341
tr_prob2 <- predict(tr_fit, dtrain)
tr_pred2 <- ifelse(tr_prob2[,2] > 0.5,"Yes","No")
tr_tab1 <- table(Predicted = tr_pred2, Actual = dtrain$Churn.Label)
tr_tab2 <- table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
# # Train
# confusionMatrix(
#   as.factor(tr_pred2),
#   as.factor(dtrain$Churn.Label),na.action = na.pass,
#   positive = "Yes" 
# )
# 
# #Test
# confusionMatrix(
# as.factor(tr_pred1),
#   as.factor(dtest$Churn.Label),
#   positive = "Yes" 
# )

tr_acc <- sum(diag(tr_tab2))/sum(tr_tab2)
tr_acc
## [1] 0.803
library(pROC)
ROC_rf <- roc(dtest$Churn.Label, tr_prob1[,2])


ROC_rf_auc <- auc(ROC_rf)

plot(ROC_rf, col = "green", main = "ROC Curve for the decision tree with entropy")

paste("Accuracy decision tree with entropy: ", mean(dtest$Churn.Labe == round(tr_prob1[,2], digits = 0)))
## [1] "Accuracy decision tree with entropy:  0.802556818181818"
paste("Area under curve for decision tree with entropy is: ", ROC_rf_auc)
## [1] "Area under curve for decision tree with entropy is:  0.824659387684466"

The overall accuracy of the decision tree with entropy model is 80.26% The AUC of the decision tree with entropy model is 0.82


We have build the decision tree with entropy parameter but surprisingly it gave the same result as given by gini decision tree model.


Prune trees

Pruning is a data compression technique in machine learning and search algorithms that reduces the size of decision trees by removing sections of the tree that are non-critical and redundant to classify instances.

we have used the control paramteres for the tree for prunning as :

cp = 0.02,maxcompete = 2, maxsurrogate = 5, usesurrogate = 2, xval = 10, surrogatestyle = 0, maxdepth = 10.

cp:The complexity parameter (cp) in rpart is the minimum improvement in the model needed at each node. It’s based on the cost complexity of the model maxsurrogate :number of surrogate splits to evaluate.

maxdepth : Set the maximum depth of any node of the final tree,xval: number of cross-validations


tr_fit <- rpart(Churn.Label ~., data = dtrain, method="class",na.action = na.exclude,control = rpart.control(cp = 0.02,maxcompete = 2, maxsurrogate = 5, usesurrogate = 2, xval = 10, surrogatestyle = 0, maxdepth = 10 ))

rpart.plot(tr_fit)

summary(tr_fit)
## Call:
## rpart(formula = Churn.Label ~ ., data = dtrain, na.action = na.exclude, 
##     method = "class", control = rpart.control(cp = 0.02, maxcompete = 2, 
##         maxsurrogate = 5, usesurrogate = 2, xval = 10, surrogatestyle = 0, 
##         maxdepth = 10))
##   n= 4931 
## 
##       CP nsplit rel error xerror   xstd
## 1 0.0468      0     1.000  1.000 0.0236
## 2 0.0333      4     0.813  0.906 0.0228
## 3 0.0205      5     0.780  0.804 0.0219
## 4 0.0200      6     0.759  0.795 0.0218
## 
## Variable importance
##                Contract           Tenure.Months         Tenure.Months.1 
##                      26                      19                      19 
##        Internet.Service              Dependents            Tech.Support 
##                       7                       6                       6 
##                 Partner         Online.Security Avg.Monthly.GB.Download 
##                       6                       5                       3 
##           Phone.Service           Online.Backup 
##                       1                       1 
## 
## Node number 1: 4931 observations,    complexity param=0.0468
##   predicted class=0  expected loss=0.268  P(node) =1
##     class counts:  3611  1320
##    probabilities: 0.732 0.268 
##   left son=2 (2217 obs) right son=3 (2714 obs)
##   Primary splits:
##       Contract        < 0.5  to the right, improve=311, (0 missing)
##       Tenure.Months   < 16.5 to the right, improve=194, (0 missing)
##       Tenure.Months.1 < 16.5 to the right, improve=194, (0 missing)
##   Surrogate splits:
##       Tenure.Months   < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
##       Tenure.Months.1 < 34.5 to the right, agree=0.786, adj=0.524, (0 split)
##       Tech.Support    < 0.5  to the right, agree=0.650, adj=0.222, (0 split)
##       Partner         < 0.5  to the right, agree=0.640, adj=0.200, (0 split)
##       Online.Security < 0.5  to the right, agree=0.630, adj=0.178, (0 split)
## 
## Node number 2: 2217 observations
##   predicted class=0  expected loss=0.0713  P(node) =0.45
##     class counts:  2059   158
##    probabilities: 0.929 0.071 
## 
## Node number 3: 2714 observations,    complexity param=0.0468
##   predicted class=0  expected loss=0.428  P(node) =0.55
##     class counts:  1552  1162
##    probabilities: 0.572 0.428 
##   left son=6 (443 obs) right son=7 (2271 obs)
##   Primary splits:
##       Dependents      < 0.5  to the right, improve=76.0, (0 missing)
##       Tenure.Months   < 1.5  to the right, improve=49.5, (0 missing)
##       Tenure.Months.1 < 1.5  to the right, improve=49.5, (0 missing)
## 
## Node number 6: 443 observations
##   predicted class=0  expected loss=0.16  P(node) =0.0898
##     class counts:   372    71
##    probabilities: 0.840 0.160 
## 
## Node number 7: 2271 observations,    complexity param=0.0468
##   predicted class=0  expected loss=0.48  P(node) =0.461
##     class counts:  1180  1091
##    probabilities: 0.520 0.480 
##   left son=14 (270 obs) right son=15 (2001 obs)
##   Primary splits:
##       Avg.Monthly.GB.Download < 1    to the left,  improve=38.5, (0 missing)
##       Internet.Service        < 1.5  to the right, improve=38.5, (0 missing)
##       Tenure.Months           < 1.5  to the right, improve=37.8, (0 missing)
##   Surrogate splits:
##       Internet.Service < 1.5  to the right, agree=1, adj=1, (0 split)
## 
## Node number 14: 270 observations
##   predicted class=0  expected loss=0.23  P(node) =0.0548
##     class counts:   208    62
##    probabilities: 0.770 0.230 
## 
## Node number 15: 2001 observations,    complexity param=0.0468
##   predicted class=1  expected loss=0.486  P(node) =0.406
##     class counts:   972  1029
##    probabilities: 0.486 0.514 
##   left son=30 (656 obs) right son=31 (1345 obs)
##   Primary splits:
##       Internet.Service < 0.5  to the left,  improve=49.4, (0 missing)
##       Tenure.Months    < 16.5 to the right, improve=46.5, (0 missing)
##       Tenure.Months.1  < 16.5 to the right, improve=46.5, (0 missing)
##   Surrogate splits:
##       Phone.Service < 0.5  to the left,  agree=0.771, adj=0.302, (0 split)
## 
## Node number 30: 656 observations,    complexity param=0.0205
##   predicted class=0  expected loss=0.355  P(node) =0.133
##     class counts:   423   233
##    probabilities: 0.645 0.355 
##   left son=60 (451 obs) right son=61 (205 obs)
##   Primary splits:
##       Tenure.Months   < 3.5  to the right, improve=26.50, (0 missing)
##       Tenure.Months.1 < 3.5  to the right, improve=26.50, (0 missing)
##       Online.Security < 0.5  to the right, improve= 6.06, (0 missing)
##   Surrogate splits:
##       Tenure.Months.1 < 3.5  to the right, agree=1, adj=1, (0 split)
## 
## Node number 31: 1345 observations,    complexity param=0.0333
##   predicted class=1  expected loss=0.408  P(node) =0.273
##     class counts:   549   796
##    probabilities: 0.408 0.592 
##   left son=62 (660 obs) right son=63 (685 obs)
##   Primary splits:
##       Tenure.Months   < 16.5 to the right, improve=40.6, (0 missing)
##       Tenure.Months.1 < 16.5 to the right, improve=40.6, (0 missing)
##       Online.Security < 0.5  to the right, improve=16.9, (0 missing)
##   Surrogate splits:
##       Tenure.Months.1 < 16.5 to the right, agree=1.000, adj=1.000, (0 split)
##       Online.Backup   < 0.5  to the right, agree=0.619, adj=0.223, (0 split)
##       Partner         < 0.5  to the right, agree=0.599, adj=0.182, (0 split)
##       Online.Security < 0.5  to the right, agree=0.575, adj=0.135, (0 split)
##       Tech.Support    < 0.5  to the right, agree=0.558, adj=0.098, (0 split)
## 
## Node number 60: 451 observations
##   predicted class=0  expected loss=0.259  P(node) =0.0915
##     class counts:   334   117
##    probabilities: 0.741 0.259 
## 
## Node number 61: 205 observations
##   predicted class=1  expected loss=0.434  P(node) =0.0416
##     class counts:    89   116
##    probabilities: 0.434 0.566 
## 
## Node number 62: 660 observations
##   predicted class=0  expected loss=0.467  P(node) =0.134
##     class counts:   352   308
##    probabilities: 0.533 0.467 
## 
## Node number 63: 685 observations
##   predicted class=1  expected loss=0.288  P(node) =0.139
##     class counts:   197   488
##    probabilities: 0.288 0.712

From this decision tree, we can interpret the following:

The contract variable is the most important. Customers with month-to-month contracts are more likely to churn.Customers who has a dependents (value=1) has the higher chance to churn. Customers with DSL internet service are less likely to churn. Customers who have stayed longer than 16 months(previously 54) are less likely to churn. Now let’s assess the prediction accuracy of the decision tree model by investigating how well it predicts churn in the test subset. We will begin with the confustion matrix, which is a useful display of classification accuracy. It displays the following information:


true positives (TP): These are cases in which we predicted yes (they churned), and they did churn. true negatives (TN): We predicted no, and they didn’t churn. false positives (FP): We predicted yes, but they didn’t actually churn. (Also known as a “Type I error.”) false negatives (FN): We predicted no, but they actually churned. (Also known as a “Type II error.”) Let’s examine the confusion matrix for our decision tree model.From this confusion matrix, we can see that the model performs well at predicting non-churning customers (1426 correct vs. 272 incorrect) and predicting churning customers 277 correct vs. 137 incorrect).


tr_prob1 <- predict(tr_fit, dtest)
tr_pred1 <- ifelse(tr_prob1[,2] > 0.5,"Yes","No")
table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
##          Actual
## Predicted    0    1
##       No  1426  272
##       Yes  137  277
tr_prob2 <- predict(tr_fit, dtrain)
tr_pred2 <- ifelse(tr_prob2[,2] > 0.5,"Yes","No")
tr_tab1 <- table(Predicted = tr_pred2, Actual = dtrain$Churn.Label)
tr_tab2 <- table(Predicted = tr_pred1, Actual = dtest$Churn.Label)
# # Train
# confusionMatrix(
#   as.factor(tr_pred2),
#   as.factor(dtrain$Churn.Label),na.action = na.pass,
#   positive = "Yes" 
# )
# 
# #Test
# confusionMatrix(
# as.factor(tr_pred1),
#   as.factor(dtest$Churn.Label),
#   positive = "Yes" 
# )

tr_acc <- sum(diag(tr_tab2))/sum(tr_tab2)
tr_acc
## [1] 0.806
library(pROC)
ROC_rf <- roc(dtest$Churn.Label, tr_prob1[,2])


ROC_rf_auc <- auc(ROC_rf)

plot(ROC_rf, col = "green", main = "ROC Curve for the prune tree")

paste("Accuracy : ", mean(dtest$Churn.Labe == round(tr_prob1[,2], digits = 0)))
## [1] "Accuracy :  0.806344696969697"
paste("Area under curve for prune tree model is: ", ROC_rf_auc)
## [1] "Area under curve for prune tree model is:  0.824739799111279"

The overall accuracy of the prune tree model is 80.63% The AUC of the prune tree model is 0.82


Prune model is best than other models in terms of accuracy, efficiency and roc value


4. Model Comparisons

4.1 Accuarcy of Models

Accuracy of each model: Logistic Regression :73.5% Random Forest Model: 80.3% SVM: 80.9% Decision Tree: 80.63%

The Prune Tree Model had the best combined accuracy and AUC


4.3 Final Model Selection

After comparing the accuracy, sensitivity, specificity, and AUC value, the prune tree has the best performence in all cases. Therefore the prune tree is selected as the final model.

The overall accuracy of the decision tree with entropy model is 80.26% The AUC of the decision tree with entropy model is 0.82

The overall accuracy of the prune tree model is 80.63% The AUC of the prune tree model is 0.82.

Sensity : 50.45% Specificity : 91.23%

Prune gives higher accuacy with lower levels than decision tree with entropy or gini parametre.

As this is a binary classification, we have used auc score for comparing the model as a parametre . Random forest has close accuracy to the prune tree , but lower auc score than prune tree,also the decision tree model gives high importance to a particular set of features. But the random forest chooses features randomly during the training process. Therefore, it does not depend highly on any specific set of features,making to opt for prune tree over random forest.

Decision tree has 9 levels where prune tree has only 6 levels. Prune tree gives more clear picture of feature ‘Tenure months’ and ‘Average monthly gb downloaded’


5. Answering SMART Questions

We have two questions:

Customers with what behaviors and conditions would likely leave the platform?

  • From the prune tree, Customers with month-to-month contracts are more likely to churn. Customers who has a dependents (value=1) has the higher chance to churn. Customers with DSL internet service are less likely to churn. Customers who have stayed longer than 16 months(previously 54) are less likely to churn.

What services are important to deliver to a customer to keep them with the company?

  • According to the model analysis, we found a couple of important features that would affect whether customers decide to churn or not churn. For example,vfeatures from the random forest model would make impact on whether customers decide to churn or not churn include total charges, tenure months, monthly charges, internet service with fiber optic, dependents, payment method with an electronic check, contract two years, contract one year, tech support, and paperless billing. Since total charges, tenure months, and monthly charges rank the top 3 in importance, therefore a reasonable price plan will be important to keep customer with plaforms.

  • Another example, as explored in the Logistic Model, would be developing special deals to offer customers in order to get them to agree to a 1-year or 2-year contract, as being in a long-term contract significantly reduced the risk of churning. Several of the models found this feature to be an important one.

  • It is evident from prune tree that customers who has 16 months or more contract has low chance of churning than others, if the company need to keep their customers it is advisable to devise attractive 2 year long contract plans