Objectives:

In this project, I intend to use different machine learning algorithms to analyze customer churn analysis.

Multiple different models are employed to accurately predict those churn customers in the data set. These models are:

  1. Logistic Regression
  2. Support Vector Machine
  3. Random Forest
  4. Gradient Boosting Trees

Besides these machine learning techniques, the project also explores different ways to represent the data as shown in the exploratory data analysis section.

Last but not least, the project uses the “GoodmanKrusal” method to study the correlation between multiple categorical variables in the data set.

ChurnData <- read.csv("/Users/vidarithchan/Desktop/DuringBreakDataScience/CustomerChurnAnalysis/TelcoChurn.csv", header=T)

colnames(ChurnData)
##  [1] "customerID"       "gender"           "SeniorCitizen"   
##  [4] "Partner"          "Dependents"       "tenure"          
##  [7] "PhoneService"     "MultipleLines"    "InternetService" 
## [10] "OnlineSecurity"   "OnlineBackup"     "DeviceProtection"
## [13] "TechSupport"      "StreamingTV"      "StreamingMovies" 
## [16] "Contract"         "PaperlessBilling" "PaymentMethod"   
## [19] "MonthlyCharges"   "TotalCharges"     "Churn"
nrow(ChurnData)
## [1] 7043
ncol(ChurnData)
## [1] 21
summary(ChurnData$Churn) # look at how many churns we have. 
##   No  Yes 
## 5174 1869
summary(ChurnData$gender)
## Female   Male 
##   3488   3555

Exploratory Data Analysis:

## Let's explore and clean the data:
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
ChurnData %>% mutate(ChurnBinary=ifelse(Churn=="Yes",1,0)) %>% group_by(gender) %>% summarise(ChurnTotal=sum(ChurnBinary))
## # A tibble: 2 x 2
##   gender ChurnTotal
##   <fctr>      <dbl>
## 1 Female        939
## 2   Male        930
## OR
library(gmodels)
CrossTable(ChurnData$gender,ChurnData$Churn)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  7043 
## 
##  
##                  | ChurnData$Churn 
## ChurnData$gender |        No |       Yes | Row Total | 
## -----------------|-----------|-----------|-----------|
##           Female |      2549 |       939 |      3488 | 
##                  |     0.070 |     0.194 |           | 
##                  |     0.731 |     0.269 |     0.495 | 
##                  |     0.493 |     0.502 |           | 
##                  |     0.362 |     0.133 |           | 
## -----------------|-----------|-----------|-----------|
##             Male |      2625 |       930 |      3555 | 
##                  |     0.069 |     0.190 |           | 
##                  |     0.738 |     0.262 |     0.505 | 
##                  |     0.507 |     0.498 |           | 
##                  |     0.373 |     0.132 |           | 
## -----------------|-----------|-----------|-----------|
##     Column Total |      5174 |      1869 |      7043 | 
##                  |     0.735 |     0.265 |           | 
## -----------------|-----------|-----------|-----------|
## 
## 
## OR
ChurnData %>% mutate(Churn = case_when (Churn=="Yes" ~ 1,Churn =="No" ~0)) %>% 
  select(gender,Churn) %>% group_by(gender) %>% 
  summarise(TotalChurn = sum(Churn))
## # A tibble: 2 x 2
##   gender TotalChurn
##   <fctr>      <dbl>
## 1 Female        939
## 2   Male        930
sapply(ChurnData, function(x) sum(is.na(x))) 
##       customerID           gender    SeniorCitizen          Partner 
##                0                0                0                0 
##       Dependents           tenure     PhoneService    MultipleLines 
##                0                0                0                0 
##  InternetService   OnlineSecurity     OnlineBackup DeviceProtection 
##                0                0                0                0 
##      TechSupport      StreamingTV  StreamingMovies         Contract 
##                0                0                0                0 
## PaperlessBilling    PaymentMethod   MonthlyCharges     TotalCharges 
##                0                0                0               11 
##            Churn 
##                0
# find out the blank columns
# TotalCharges is the only column that has blank cell. Let's median-fill those cells.

mean(ChurnData$TotalCharges,na.rm=TRUE)
## [1] 2283.3
median(ChurnData$TotalCharges,na.rm=TRUE)
## [1] 1397.475
ChurnData$TotalCharges[which(is.na(ChurnData$TotalCharges))] <- median(ChurnData$TotalCharges,na.rm=TRUE)

########################################################################################################################################################
## Now that the data is relatively clean. Let's explore some of the fields. Mainly, let's find out characteristics of high risk customers. 
## Questions to consider after looking at the data: 

## Is it true that younger citizen has a higher Churn rate?
ChurnData %>% mutate(Churn = case_when(Churn=="Yes" ~ 1, Churn == "No" ~ 0)) %>% 
  select(SeniorCitizen,Churn) %>% 
  group_by(SeniorCitizen) %>% 
  count(Churn) %>% 
  mutate(ChurnRate = 100*round(n/sum(n),2)) # Senior citizens are indeed have a higher churn rate than younger customers.
## # A tibble: 4 x 4
## # Groups:   SeniorCitizen [2]
##   SeniorCitizen Churn     n ChurnRate
##           <int> <dbl> <int>     <dbl>
## 1             0     0  4508        76
## 2             0     1  1393        24
## 3             1     0   666        58
## 4             1     1   476        42
## Find out proportion of Churn by Internet Serivice. Graph. There are two types of internet service. 
ChurnData %>% mutate(Churn= case_when(Churn=="Yes" ~1, Churn=="No" ~ 0)) %>%
  select(InternetService,Churn) %>%
  group_by(InternetService) %>% count(Churn) %>% mutate(ChurnRate = 100*round(n/sum(n),2)) # Fiber optic customers have very high churn rate. Why? 
## # A tibble: 6 x 4
## # Groups:   InternetService [3]
##   InternetService Churn     n ChurnRate
##            <fctr> <dbl> <int>     <dbl>
## 1             DSL     0  1962        81
## 2             DSL     1   459        19
## 3     Fiber optic     0  1799        58
## 4     Fiber optic     1  1297        42
## 5              No     0  1413        93
## 6              No     1   113         7
##  Is "fiber optic" more costly than other types of internet service?
ChurnData %>% group_by(InternetService) %>% summarise (avgcost = mean(MonthlyCharges)) # Fiber optic has the highest monthly charge.
## # A tibble: 3 x 2
##   InternetService  avgcost
##            <fctr>    <dbl>
## 1             DSL 58.10217
## 2     Fiber optic 91.50013
## 3              No 21.07919
# What types of contract have the highest churn rate? 
ChurnData %>% mutate(Churn= case_when(Churn=="Yes" ~1, Churn=="No" ~ 0)) %>% 
  select(Contract, Churn) %>% group_by(Contract) %>% count(Churn) %>% mutate(ChurnRate = 100*round(n/sum(n),2)) # Month-to-month has the highest churn rate. 
## # A tibble: 6 x 4
## # Groups:   Contract [3]
##         Contract Churn     n ChurnRate
##           <fctr> <dbl> <int>     <dbl>
## 1 Month-to-month     0  2220        57
## 2 Month-to-month     1  1655        43
## 3       One year     0  1307        89
## 4       One year     1   166        11
## 5       Two year     0  1647        97
## 6       Two year     1    48         3
# Payment method Vs Churn rate
ChurnData %>% mutate(Churn = case_when(Churn=="Yes" ~1, Churn=="No" ~0)) %>%
  select(PaymentMethod, Churn) %>% group_by(PaymentMethod) %>% count(Churn) %>% mutate(ChurnRate = 100* round(n/sum(n),2)) %>% arrange(desc(ChurnRate)) # Electronic check has the highest churn rate.
## # A tibble: 8 x 4
## # Groups:   PaymentMethod [4]
##               PaymentMethod Churn     n ChurnRate
##                      <fctr> <dbl> <int>     <dbl>
## 1   Credit card (automatic)     0  1290        85
## 2 Bank transfer (automatic)     0  1286        83
## 3              Mailed check     0  1304        81
## 4          Electronic check     0  1294        55
## 5          Electronic check     1  1071        45
## 6              Mailed check     1   308        19
## 7 Bank transfer (automatic)     1   258        17
## 8   Credit card (automatic)     1   232        15
# Average tenure of those who churnned. 
subset(ChurnData, ChurnData$Churn=="Yes") %>% summarise(mean=mean(tenure)) # Average tenure for churnned customer is around 18 (month? year?). 
##       mean
## 1 17.97913
# Average monthly charge of those who churned. 
subset(ChurnData, ChurnData$Churn=="Yes") %>% summarise(mean=mean(MonthlyCharges)) # Average monthly charge is $74.
##       mean
## 1 74.44133
# Average total charge of those who churnned. 
subset(ChurnData,ChurnData$Churn=="Yes") %>% summarise(mean=mean(TotalCharges)) # Average total charge is $1,531.
##       mean
## 1 1531.796
# Average monthly charges by internet service types for churnned customer:
subset(ChurnData, ChurnData$Churn=="Yes") %>% group_by(InternetService) %>% summarise(mean=mean(MonthlyCharges)) %>%
  ggplot(aes(InternetService,mean)) + geom_bar(stat="identity", color="blue", fill="#DD8888", width=.5)+
  labs(x= "Internet Service",
       y= "Average Monthly Charge ($)",
       title = "Average Monthly Charge for Churnned Customers by Internet Service Type")

#######################################################################################################################################################
# Correlation between categorical variables: 
library(GoodmanKruskal)
GKtau(ChurnData$gender,ChurnData$InternetService)
##              xName                     yName Nx Ny tauxy tauyx
## 1 ChurnData$gender ChurnData$InternetService  2  3     0     0
GKtau(ChurnData$InternetService,ChurnData$Contract, dgts=5)
##                       xName              yName Nx Ny   tauxy   tauyx
## 1 ChurnData$InternetService ChurnData$Contract  3  3 0.05158 0.04222
GKtau(ChurnData$SeniorCitizen,ChurnData$Partner, dgts=5)
##                     xName             yName Nx Ny   tauxy   tauyx
## 1 ChurnData$SeniorCitizen ChurnData$Partner  2  2 0.00027 0.00027
# https://cran.r-project.org/web/packages/GoodmanKruskal/vignettes/GoodmanKruskal.html: Very good and insightful read.
varSet2 <- c("MultipleLines","InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies","Contract")
ChurnData2 <- subset(ChurnData,select= varSet2)
plot(GKtauDataframe(ChurnData2))

# Extremely useful graph to explore relationships between categorical variables. 
# Looks like the only highly correlated variables are "streaming movies" and "streaming tv" which is expected. 

TRAIN vs TEST SETS:

nrow(ChurnData)
## [1] 7043
ChurnData$Churn <- ifelse(ChurnData$Churn =="Yes",1,0)
set.seed(42)
random <- sample(1:nrow(ChurnData),nrow(ChurnData)*.75)
Trainset <- ChurnData[random,]
Testset <- ChurnData[-random,]

PREPARATION FOR THE MODEL BUILDING:

## Models: 
# 1- Logistic Regression, 
# 2- SVM, 
# 3- Random Forest, 
# 4- Gradient Boosted Trees, 

## Variable selection: We know that we should not include "streaming movies" and "streaming tv" in the same equation. Their correlation from 
## the above section is fairly high. 
as.formula(paste('Churn ~', paste(colnames(ChurnData[2:(which(colnames(ChurnData)=="Churn")-2)]),collapse='+')))
## Churn ~ gender + SeniorCitizen + Partner + Dependents + tenure + 
##     PhoneService + MultipleLines + InternetService + OnlineSecurity + 
##     OnlineBackup + DeviceProtection + TechSupport + StreamingTV + 
##     StreamingMovies + Contract + PaperlessBilling + PaymentMethod + 
##     MonthlyCharges

LOGISTIC REGRESSION FOR TRAIN SET:

trainLRmodel<- glm(Churn ~ gender + SeniorCitizen + Partner + Dependents + tenure + 
      PhoneService + MultipleLines + InternetService + OnlineSecurity + 
      OnlineBackup + DeviceProtection + TechSupport + StreamingTV + 
      StreamingMovies + Contract + PaperlessBilling + PaymentMethod + 
      MonthlyCharges, data= Trainset, family=binomial)
# shocking results. Many of these variables are not statistically significant.
# Let's choose those that significant and re-run the regression. 

trainLRmodel1 <- glm(Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines + Contract + 
                       PaperlessBilling + PaymentMethod, data = Trainset, family = binomial)

library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
Trainset$predictedLR1 <- predict.glm(trainLRmodel1, newdata=Trainset,type="response")
ROCpred <- prediction(Trainset$predictedLR1, Trainset$Churn)
ROCperf <- performance(ROCpred,'tpr','fpr')
plot(ROCperf)

auc <- performance(ROCpred, measure="auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8286128
# 82.86% which is not bad for a logistic regression. 

SUPPORT VECTOR MACHINE (SVM)

library(e1071)
trainSVMmodel1 <- svm(Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines + Contract + 
                        PaperlessBilling + PaymentMethod, Trainset)
Trainset$predSVM <- predict(trainSVMmodel1,Trainset)
ROCRpredSVM <- prediction(Trainset$predSVM,Trainset$Churn)
ROCRperfSVM <- performance(ROCRpredSVM,'tpr','fpr')
plot(ROCRperfSVM)

aucSVM <- performance(ROCRpredSVM,measure="auc")
aucSVM <- aucSVM@y.values[[1]]
aucSVM
## [1] 0.7975093
# 79.75% not very good prediction relatives to other methods. 

RANDOM FOREST (RF)

library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
trainRFmodel1 <- randomForest(as.factor(Churn) ~SeniorCitizen + Dependents + tenure 
                              + MultipleLines + Contract + PaperlessBilling + PaymentMethod, 
                              data = Trainset, importance= TRUE, ntree=1000)
summary(trainRFmodel1)
##                 Length Class  Mode     
## call                5  -none- call     
## type                1  -none- character
## predicted        5282  factor numeric  
## err.rate         3000  -none- numeric  
## confusion           6  -none- numeric  
## votes           10564  matrix numeric  
## oob.times        5282  -none- numeric  
## classes             2  -none- character
## importance         28  -none- numeric  
## importanceSD       21  -none- numeric  
## localImportance     0  -none- NULL     
## proximity           0  -none- NULL     
## ntree               1  -none- numeric  
## mtry                1  -none- numeric  
## forest             14  -none- list     
## y                5282  factor numeric  
## test                0  -none- NULL     
## inbag               0  -none- NULL     
## terms               3  terms  call
predictions <- as.vector(trainRFmodel1$votes[,2])
Trainset$predictRF <- predictions
ROCRpredwholeRandom <- prediction(Trainset$predictRF,Trainset$Churn)
ROCRperfwholeRandom <- performance(ROCRpredwholeRandom,'tpr','fpr')
plot(ROCRperfwholeRandom)

aucwholeRandom <- performance(ROCRpredwholeRandom,measure='auc')
aucwholeRandom <- aucwholeRandom@y.values[[1]]
aucwholeRandom
## [1] 0.8124204
# 81.24% is better than the logistic regression's AUC.

GRADIENT BOOSTING TREES (GBT)

library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
trainGBMmodel1 <- gbm(Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines + Contract + 
                        PaperlessBilling + PaymentMethod, data = Trainset,
                      distribution= "gaussian",n.trees=1000,shrinkage = 0.01,interaction.depth=4)
summary(trainGBMmodel1)

##                               var   rel.inf
## Contract                 Contract 41.132851
## tenure                     tenure 26.806113
## PaymentMethod       PaymentMethod 14.940637
## PaperlessBilling PaperlessBilling  6.453567
## MultipleLines       MultipleLines  5.730687
## SeniorCitizen       SeniorCitizen  3.492444
## Dependents             Dependents  1.443702
# the most important factor to predict churn is "Contract" followed by "tenure". 
Trainset$predGBM <- predict(trainGBMmodel1,Trainset,n.trees=1000)
ROCRpredGBM <- prediction(Trainset$predGBM, Trainset$Churn)
ROCRperfGBM <- performance(ROCRpredGBM,'tpr','fpr')
plot(ROCRperfGBM)

aucGBM <- performance(ROCRpredGBM,measure='auc')
aucGBM <- aucGBM@y.values[[1]]
aucGBM
## [1] 0.8456603

Performance Comparisons for the train set

m <- matrix(c(auc,aucSVM,aucwholeRandom,aucGBM),nrow=4,ncol=1)
colnames(m) <- c("AUC Value")
rownames(m) <- c("Logistic Regression","SVM","Random Forest","Gradient Boosting")
m
##                     AUC Value
## Logistic Regression 0.8286128
## SVM                 0.7975093
## Random Forest       0.8124204
## Gradient Boosting   0.8456603
plot(ROCperf, col="red",colorsize = TRUE, text.adj = c(-.2,1.7), main="AUC Curves - 4 Models ")
plot(ROCRperfwholeRandom,add=TRUE,col="green", colorsize = TRUE, text.adj = c(-.2,1.7))
plot(ROCRperfSVM,add=TRUE,col="blue", colorsize = TRUE, text.adj = c(-.2,1.7))
plot(ROCRperfGBM,add=TRUE,col="black", colorsize = TRUE, text.adj = c(-.2,1.7))
labels <- c("GLM: AUC=82.86%","Random Forest: AUC=81.24%","SVM: AUC=79.75%", "Gradient Boosting: AUC=84.57%")
legend("bottom",xpd=TRUE,inset=c(0,0),labels,bty="n",pch=1,col=c("red","green","blue","black"))

CHOOSING MODEL:

According to the AUC curves, the method that gives us the most accurate model is gradient boosting with AUC value of 84.57%.

Performance analysis for the test set using the best model.

# Let apply GBM to the test set.

testGBMmodel1 <- gbm(Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines + Contract + 
                       PaperlessBilling + PaymentMethod, data = Testset,
                     distribution= "gaussian",n.trees=1000,shrinkage = 0.01,interaction.depth=4)
summary(testGBMmodel1)

##                               var   rel.inf
## tenure                     tenure 33.430286
## Contract                 Contract 32.130401
## PaymentMethod       PaymentMethod 13.208323
## MultipleLines       MultipleLines  8.102637
## PaperlessBilling PaperlessBilling  5.891407
## SeniorCitizen       SeniorCitizen  4.348326
## Dependents             Dependents  2.888619
Testset$predGBM <- predict(testGBMmodel1,Testset,n.trees=1000)
ROCRpredTestGBM <- prediction(Testset$predGBM,Testset$Churn)
ROCRperfTestGBM <- performance(ROCRpredTestGBM,'tpr','fpr')
plot(ROCRperfTestGBM)

aucTestGBM <- performance(ROCRpredTestGBM,measure="auc")
aucTestGBM <- aucTestGBM@y.values[[1]]
aucTestGBM
## [1] 0.8599272
# 86% the best!

Conclusion:

a <- matrix(c("The Best Model","Average Model","Average Model","Underperforming"),nrow=4,ncol=1)
colnames(a) <- c("General Performance (accuracy) of the algorithms")
rownames(a) <- c("Gradient Boosting","Logistic Regression","Random Forest","Support Vector Machine")
a
##                        General Performance (accuracy) of the algorithms
## Gradient Boosting      "The Best Model"                                
## Logistic Regression    "Average Model"                                 
## Random Forest          "Average Model"                                 
## Support Vector Machine "Underperforming"