In this project, I intend to use different machine learning algorithms to analyze customer churn analysis.
Multiple different models are employed to accurately predict those churn customers in the data set. These models are:
Besides these machine learning techniques, the project also explores different ways to represent the data as shown in the exploratory data analysis section.
Last but not least, the project uses the “GoodmanKrusal” method to study the correlation between multiple categorical variables in the data set.
ChurnData <- read.csv("/Users/vidarithchan/Desktop/DuringBreakDataScience/CustomerChurnAnalysis/TelcoChurn.csv", header=T)
colnames(ChurnData)
## [1] "customerID" "gender" "SeniorCitizen"
## [4] "Partner" "Dependents" "tenure"
## [7] "PhoneService" "MultipleLines" "InternetService"
## [10] "OnlineSecurity" "OnlineBackup" "DeviceProtection"
## [13] "TechSupport" "StreamingTV" "StreamingMovies"
## [16] "Contract" "PaperlessBilling" "PaymentMethod"
## [19] "MonthlyCharges" "TotalCharges" "Churn"
nrow(ChurnData)
## [1] 7043
ncol(ChurnData)
## [1] 21
summary(ChurnData$Churn) # look at how many churns we have.
## No Yes
## 5174 1869
summary(ChurnData$gender)
## Female Male
## 3488 3555
## Let's explore and clean the data:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
ChurnData %>% mutate(ChurnBinary=ifelse(Churn=="Yes",1,0)) %>% group_by(gender) %>% summarise(ChurnTotal=sum(ChurnBinary))
## # A tibble: 2 x 2
## gender ChurnTotal
## <fctr> <dbl>
## 1 Female 939
## 2 Male 930
## OR
library(gmodels)
CrossTable(ChurnData$gender,ChurnData$Churn)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 7043
##
##
## | ChurnData$Churn
## ChurnData$gender | No | Yes | Row Total |
## -----------------|-----------|-----------|-----------|
## Female | 2549 | 939 | 3488 |
## | 0.070 | 0.194 | |
## | 0.731 | 0.269 | 0.495 |
## | 0.493 | 0.502 | |
## | 0.362 | 0.133 | |
## -----------------|-----------|-----------|-----------|
## Male | 2625 | 930 | 3555 |
## | 0.069 | 0.190 | |
## | 0.738 | 0.262 | 0.505 |
## | 0.507 | 0.498 | |
## | 0.373 | 0.132 | |
## -----------------|-----------|-----------|-----------|
## Column Total | 5174 | 1869 | 7043 |
## | 0.735 | 0.265 | |
## -----------------|-----------|-----------|-----------|
##
##
## OR
ChurnData %>% mutate(Churn = case_when (Churn=="Yes" ~ 1,Churn =="No" ~0)) %>%
select(gender,Churn) %>% group_by(gender) %>%
summarise(TotalChurn = sum(Churn))
## # A tibble: 2 x 2
## gender TotalChurn
## <fctr> <dbl>
## 1 Female 939
## 2 Male 930
sapply(ChurnData, function(x) sum(is.na(x)))
## customerID gender SeniorCitizen Partner
## 0 0 0 0
## Dependents tenure PhoneService MultipleLines
## 0 0 0 0
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0 0 0 0
## TechSupport StreamingTV StreamingMovies Contract
## 0 0 0 0
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0 0 0 11
## Churn
## 0
# find out the blank columns
# TotalCharges is the only column that has blank cell. Let's median-fill those cells.
mean(ChurnData$TotalCharges,na.rm=TRUE)
## [1] 2283.3
median(ChurnData$TotalCharges,na.rm=TRUE)
## [1] 1397.475
ChurnData$TotalCharges[which(is.na(ChurnData$TotalCharges))] <- median(ChurnData$TotalCharges,na.rm=TRUE)
########################################################################################################################################################
## Now that the data is relatively clean. Let's explore some of the fields. Mainly, let's find out characteristics of high risk customers.
## Questions to consider after looking at the data:
## Is it true that younger citizen has a higher Churn rate?
ChurnData %>% mutate(Churn = case_when(Churn=="Yes" ~ 1, Churn == "No" ~ 0)) %>%
select(SeniorCitizen,Churn) %>%
group_by(SeniorCitizen) %>%
count(Churn) %>%
mutate(ChurnRate = 100*round(n/sum(n),2)) # Senior citizens are indeed have a higher churn rate than younger customers.
## # A tibble: 4 x 4
## # Groups: SeniorCitizen [2]
## SeniorCitizen Churn n ChurnRate
## <int> <dbl> <int> <dbl>
## 1 0 0 4508 76
## 2 0 1 1393 24
## 3 1 0 666 58
## 4 1 1 476 42
## Find out proportion of Churn by Internet Serivice. Graph. There are two types of internet service.
ChurnData %>% mutate(Churn= case_when(Churn=="Yes" ~1, Churn=="No" ~ 0)) %>%
select(InternetService,Churn) %>%
group_by(InternetService) %>% count(Churn) %>% mutate(ChurnRate = 100*round(n/sum(n),2)) # Fiber optic customers have very high churn rate. Why?
## # A tibble: 6 x 4
## # Groups: InternetService [3]
## InternetService Churn n ChurnRate
## <fctr> <dbl> <int> <dbl>
## 1 DSL 0 1962 81
## 2 DSL 1 459 19
## 3 Fiber optic 0 1799 58
## 4 Fiber optic 1 1297 42
## 5 No 0 1413 93
## 6 No 1 113 7
## Is "fiber optic" more costly than other types of internet service?
ChurnData %>% group_by(InternetService) %>% summarise (avgcost = mean(MonthlyCharges)) # Fiber optic has the highest monthly charge.
## # A tibble: 3 x 2
## InternetService avgcost
## <fctr> <dbl>
## 1 DSL 58.10217
## 2 Fiber optic 91.50013
## 3 No 21.07919
# What types of contract have the highest churn rate?
ChurnData %>% mutate(Churn= case_when(Churn=="Yes" ~1, Churn=="No" ~ 0)) %>%
select(Contract, Churn) %>% group_by(Contract) %>% count(Churn) %>% mutate(ChurnRate = 100*round(n/sum(n),2)) # Month-to-month has the highest churn rate.
## # A tibble: 6 x 4
## # Groups: Contract [3]
## Contract Churn n ChurnRate
## <fctr> <dbl> <int> <dbl>
## 1 Month-to-month 0 2220 57
## 2 Month-to-month 1 1655 43
## 3 One year 0 1307 89
## 4 One year 1 166 11
## 5 Two year 0 1647 97
## 6 Two year 1 48 3
# Payment method Vs Churn rate
ChurnData %>% mutate(Churn = case_when(Churn=="Yes" ~1, Churn=="No" ~0)) %>%
select(PaymentMethod, Churn) %>% group_by(PaymentMethod) %>% count(Churn) %>% mutate(ChurnRate = 100* round(n/sum(n),2)) %>% arrange(desc(ChurnRate)) # Electronic check has the highest churn rate.
## # A tibble: 8 x 4
## # Groups: PaymentMethod [4]
## PaymentMethod Churn n ChurnRate
## <fctr> <dbl> <int> <dbl>
## 1 Credit card (automatic) 0 1290 85
## 2 Bank transfer (automatic) 0 1286 83
## 3 Mailed check 0 1304 81
## 4 Electronic check 0 1294 55
## 5 Electronic check 1 1071 45
## 6 Mailed check 1 308 19
## 7 Bank transfer (automatic) 1 258 17
## 8 Credit card (automatic) 1 232 15
# Average tenure of those who churnned.
subset(ChurnData, ChurnData$Churn=="Yes") %>% summarise(mean=mean(tenure)) # Average tenure for churnned customer is around 18 (month? year?).
## mean
## 1 17.97913
# Average monthly charge of those who churned.
subset(ChurnData, ChurnData$Churn=="Yes") %>% summarise(mean=mean(MonthlyCharges)) # Average monthly charge is $74.
## mean
## 1 74.44133
# Average total charge of those who churnned.
subset(ChurnData,ChurnData$Churn=="Yes") %>% summarise(mean=mean(TotalCharges)) # Average total charge is $1,531.
## mean
## 1 1531.796
# Average monthly charges by internet service types for churnned customer:
subset(ChurnData, ChurnData$Churn=="Yes") %>% group_by(InternetService) %>% summarise(mean=mean(MonthlyCharges)) %>%
ggplot(aes(InternetService,mean)) + geom_bar(stat="identity", color="blue", fill="#DD8888", width=.5)+
labs(x= "Internet Service",
y= "Average Monthly Charge ($)",
title = "Average Monthly Charge for Churnned Customers by Internet Service Type")
#######################################################################################################################################################
# Correlation between categorical variables:
library(GoodmanKruskal)
GKtau(ChurnData$gender,ChurnData$InternetService)
## xName yName Nx Ny tauxy tauyx
## 1 ChurnData$gender ChurnData$InternetService 2 3 0 0
GKtau(ChurnData$InternetService,ChurnData$Contract, dgts=5)
## xName yName Nx Ny tauxy tauyx
## 1 ChurnData$InternetService ChurnData$Contract 3 3 0.05158 0.04222
GKtau(ChurnData$SeniorCitizen,ChurnData$Partner, dgts=5)
## xName yName Nx Ny tauxy tauyx
## 1 ChurnData$SeniorCitizen ChurnData$Partner 2 2 0.00027 0.00027
# https://cran.r-project.org/web/packages/GoodmanKruskal/vignettes/GoodmanKruskal.html: Very good and insightful read.
varSet2 <- c("MultipleLines","InternetService","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies","Contract")
ChurnData2 <- subset(ChurnData,select= varSet2)
plot(GKtauDataframe(ChurnData2))
# Extremely useful graph to explore relationships between categorical variables.
# Looks like the only highly correlated variables are "streaming movies" and "streaming tv" which is expected.
nrow(ChurnData)
## [1] 7043
ChurnData$Churn <- ifelse(ChurnData$Churn =="Yes",1,0)
set.seed(42)
random <- sample(1:nrow(ChurnData),nrow(ChurnData)*.75)
Trainset <- ChurnData[random,]
Testset <- ChurnData[-random,]
## Models:
# 1- Logistic Regression,
# 2- SVM,
# 3- Random Forest,
# 4- Gradient Boosted Trees,
## Variable selection: We know that we should not include "streaming movies" and "streaming tv" in the same equation. Their correlation from
## the above section is fairly high.
as.formula(paste('Churn ~', paste(colnames(ChurnData[2:(which(colnames(ChurnData)=="Churn")-2)]),collapse='+')))
## Churn ~ gender + SeniorCitizen + Partner + Dependents + tenure +
## PhoneService + MultipleLines + InternetService + OnlineSecurity +
## OnlineBackup + DeviceProtection + TechSupport + StreamingTV +
## StreamingMovies + Contract + PaperlessBilling + PaymentMethod +
## MonthlyCharges
trainLRmodel<- glm(Churn ~ gender + SeniorCitizen + Partner + Dependents + tenure +
PhoneService + MultipleLines + InternetService + OnlineSecurity +
OnlineBackup + DeviceProtection + TechSupport + StreamingTV +
StreamingMovies + Contract + PaperlessBilling + PaymentMethod +
MonthlyCharges, data= Trainset, family=binomial)
# shocking results. Many of these variables are not statistically significant.
# Let's choose those that significant and re-run the regression.
trainLRmodel1 <- glm(Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines + Contract +
PaperlessBilling + PaymentMethod, data = Trainset, family = binomial)
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:gmodels':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
Trainset$predictedLR1 <- predict.glm(trainLRmodel1, newdata=Trainset,type="response")
ROCpred <- prediction(Trainset$predictedLR1, Trainset$Churn)
ROCperf <- performance(ROCpred,'tpr','fpr')
plot(ROCperf)
auc <- performance(ROCpred, measure="auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8286128
# 82.86% which is not bad for a logistic regression.
library(e1071)
trainSVMmodel1 <- svm(Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines + Contract +
PaperlessBilling + PaymentMethod, Trainset)
Trainset$predSVM <- predict(trainSVMmodel1,Trainset)
ROCRpredSVM <- prediction(Trainset$predSVM,Trainset$Churn)
ROCRperfSVM <- performance(ROCRpredSVM,'tpr','fpr')
plot(ROCRperfSVM)
aucSVM <- performance(ROCRpredSVM,measure="auc")
aucSVM <- aucSVM@y.values[[1]]
aucSVM
## [1] 0.7975093
# 79.75% not very good prediction relatives to other methods.
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
trainRFmodel1 <- randomForest(as.factor(Churn) ~SeniorCitizen + Dependents + tenure
+ MultipleLines + Contract + PaperlessBilling + PaymentMethod,
data = Trainset, importance= TRUE, ntree=1000)
summary(trainRFmodel1)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 5282 factor numeric
## err.rate 3000 -none- numeric
## confusion 6 -none- numeric
## votes 10564 matrix numeric
## oob.times 5282 -none- numeric
## classes 2 -none- character
## importance 28 -none- numeric
## importanceSD 21 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 5282 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
predictions <- as.vector(trainRFmodel1$votes[,2])
Trainset$predictRF <- predictions
ROCRpredwholeRandom <- prediction(Trainset$predictRF,Trainset$Churn)
ROCRperfwholeRandom <- performance(ROCRpredwholeRandom,'tpr','fpr')
plot(ROCRperfwholeRandom)
aucwholeRandom <- performance(ROCRpredwholeRandom,measure='auc')
aucwholeRandom <- aucwholeRandom@y.values[[1]]
aucwholeRandom
## [1] 0.8124204
# 81.24% is better than the logistic regression's AUC.
library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
trainGBMmodel1 <- gbm(Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines + Contract +
PaperlessBilling + PaymentMethod, data = Trainset,
distribution= "gaussian",n.trees=1000,shrinkage = 0.01,interaction.depth=4)
summary(trainGBMmodel1)
## var rel.inf
## Contract Contract 41.132851
## tenure tenure 26.806113
## PaymentMethod PaymentMethod 14.940637
## PaperlessBilling PaperlessBilling 6.453567
## MultipleLines MultipleLines 5.730687
## SeniorCitizen SeniorCitizen 3.492444
## Dependents Dependents 1.443702
# the most important factor to predict churn is "Contract" followed by "tenure".
Trainset$predGBM <- predict(trainGBMmodel1,Trainset,n.trees=1000)
ROCRpredGBM <- prediction(Trainset$predGBM, Trainset$Churn)
ROCRperfGBM <- performance(ROCRpredGBM,'tpr','fpr')
plot(ROCRperfGBM)
aucGBM <- performance(ROCRpredGBM,measure='auc')
aucGBM <- aucGBM@y.values[[1]]
aucGBM
## [1] 0.8456603
m <- matrix(c(auc,aucSVM,aucwholeRandom,aucGBM),nrow=4,ncol=1)
colnames(m) <- c("AUC Value")
rownames(m) <- c("Logistic Regression","SVM","Random Forest","Gradient Boosting")
m
## AUC Value
## Logistic Regression 0.8286128
## SVM 0.7975093
## Random Forest 0.8124204
## Gradient Boosting 0.8456603
plot(ROCperf, col="red",colorsize = TRUE, text.adj = c(-.2,1.7), main="AUC Curves - 4 Models ")
plot(ROCRperfwholeRandom,add=TRUE,col="green", colorsize = TRUE, text.adj = c(-.2,1.7))
plot(ROCRperfSVM,add=TRUE,col="blue", colorsize = TRUE, text.adj = c(-.2,1.7))
plot(ROCRperfGBM,add=TRUE,col="black", colorsize = TRUE, text.adj = c(-.2,1.7))
labels <- c("GLM: AUC=82.86%","Random Forest: AUC=81.24%","SVM: AUC=79.75%", "Gradient Boosting: AUC=84.57%")
legend("bottom",xpd=TRUE,inset=c(0,0),labels,bty="n",pch=1,col=c("red","green","blue","black"))
# Let apply GBM to the test set.
testGBMmodel1 <- gbm(Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines + Contract +
PaperlessBilling + PaymentMethod, data = Testset,
distribution= "gaussian",n.trees=1000,shrinkage = 0.01,interaction.depth=4)
summary(testGBMmodel1)
## var rel.inf
## tenure tenure 33.430286
## Contract Contract 32.130401
## PaymentMethod PaymentMethod 13.208323
## MultipleLines MultipleLines 8.102637
## PaperlessBilling PaperlessBilling 5.891407
## SeniorCitizen SeniorCitizen 4.348326
## Dependents Dependents 2.888619
Testset$predGBM <- predict(testGBMmodel1,Testset,n.trees=1000)
ROCRpredTestGBM <- prediction(Testset$predGBM,Testset$Churn)
ROCRperfTestGBM <- performance(ROCRpredTestGBM,'tpr','fpr')
plot(ROCRperfTestGBM)
aucTestGBM <- performance(ROCRpredTestGBM,measure="auc")
aucTestGBM <- aucTestGBM@y.values[[1]]
aucTestGBM
## [1] 0.8599272
# 86% the best!
a <- matrix(c("The Best Model","Average Model","Average Model","Underperforming"),nrow=4,ncol=1)
colnames(a) <- c("General Performance (accuracy) of the algorithms")
rownames(a) <- c("Gradient Boosting","Logistic Regression","Random Forest","Support Vector Machine")
a
## General Performance (accuracy) of the algorithms
## Gradient Boosting "The Best Model"
## Logistic Regression "Average Model"
## Random Forest "Average Model"
## Support Vector Machine "Underperforming"