---
title: "Telecom Customer Churn - Model Evaluation"
output:
flexdashboard::flex_dashboard:
orientation: columns
vertical_layout: fill
social: [ "twitter", "facebook","menu" ]
source_code: embed
---
```{r setup, include=FALSE}
library(flexdashboard)
library(MachineShop)
library(survival)
library(MASS)
library(magrittr)
library(stats)
library(mboost)
library(plyr)
library(corrplot)
library(ggplot2)
library(gridExtra)
library(ggthemes)
library(caret)
```
```{r include=FALSE}
churn <- read.csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
# str(churn)
## Use sapply to check the number if missing values in each columns
# sapply(churn, function(x) sum(is.na(x)))
churn <- churn[complete.cases(churn), ]
# Look at the variables, we can see that we have some wrangling to do.
cols_recode1 <- c(10:15)
for(i in 1:ncol(churn[,cols_recode1])) {
churn[,cols_recode1][,i] <- as.factor(mapvalues
(churn[,cols_recode1][,i], from =c("No internet service"),to=c("No")))
}
churn$MultipleLines <- as.factor(mapvalues(churn$MultipleLines,
from=c("No phone service"),
to=c("No")))
min(churn$tenure); max(churn$tenure)
group_tenure <- function(tenure){
if (tenure >= 0 & tenure <= 12){
return('0-12 Month')
}else if(tenure > 12 & tenure <= 24){
return('12-24 Month')
}else if (tenure > 24 & tenure <= 48){
return('24-48 Month')
}else if (tenure > 48 & tenure <=60){
return('48-60 Month')
}else if (tenure > 60){
return('> 60 Month')
}
}
churn$tenure_group <- sapply(churn$tenure,group_tenure)
churn$tenure_group <- as.factor(churn$tenure_group)
churn$SeniorCitizen <- as.factor(mapvalues(churn$SeniorCitizen,
from=c("0","1"),
to=c("No", "Yes")))
churn$customerID <- NULL
churn$tenure <- NULL
## Exploratory data analysis and feature selection
## Correlation between numeric variables
numeric.var <- sapply(churn, is.numeric)
# corr.matrix <- cor(churn[,numeric.var])
# corrplot(corr.matrix, main="\n\nCorrelation Plot for Numerical Variables", method="number")
## The Monthly Charges and Total Charges are correlated. So one of them will be
## removed from the model. We remove Total Charges.
churn$TotalCharges <- NULL
# p1 <- ggplot(churn, aes(x=gender)) + ggtitle("Gender") + xlab("Gender") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p2 <- ggplot(churn, aes(x=SeniorCitizen)) + ggtitle("Senior Citizen") + xlab("Senior Citizen") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p3 <- ggplot(churn, aes(x=Partner)) + ggtitle("Partner") + xlab("Partner") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p4 <- ggplot(churn, aes(x=Dependents)) + ggtitle("Dependents") + xlab("Dependents") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# grid.arrange(p1, p2, p3, p4, ncol=2)
# p5 <- ggplot(churn, aes(x=PhoneService)) + ggtitle("Phone Service") + xlab("Phone Service") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p6 <- ggplot(churn, aes(x=MultipleLines)) + ggtitle("Multiple Lines") + xlab("Multiple Lines") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p7 <- ggplot(churn, aes(x=InternetService)) + ggtitle("Internet Service") + xlab("Internet Service") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p8 <- ggplot(churn, aes(x=OnlineSecurity)) + ggtitle("Online Security") + xlab("Online Security") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# grid.arrange(p5, p6, p7, p8, ncol=2)
# p9 <- ggplot(churn, aes(x=OnlineBackup)) + ggtitle("Online Backup") + xlab("Online Backup") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p10 <- ggplot(churn, aes(x=DeviceProtection)) + ggtitle("Device Protection") + xlab("Device Protection") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p11 <- ggplot(churn, aes(x=TechSupport)) + ggtitle("Tech Support") + xlab("Tech Support") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p12 <- ggplot(churn, aes(x=StreamingTV)) + ggtitle("Streaming TV") + xlab("Streaming TV") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# grid.arrange(p9, p10, p11, p12, ncol=2)
# p13 <- ggplot(churn, aes(x=StreamingMovies)) + ggtitle("Streaming Movies") + xlab("Streaming Movies") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p14 <- ggplot(churn, aes(x=Contract)) + ggtitle("Contract") + xlab("Contract") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p15 <- ggplot(churn, aes(x=PaperlessBilling)) + ggtitle("Paperless Billing") + xlab("Paperless Billing") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p16 <- ggplot(churn, aes(x=PaymentMethod)) + ggtitle("Payment Method") + xlab("Payment Method") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# p17 <- ggplot(churn, aes(x=tenure_group)) + ggtitle("Tenure Group") + xlab("Tenure Group") +
# geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
# grid.arrange(p13, p14, p15, p16, p17, ncol=2)
## All of the categorical variables seem to have a reasonably broad distribution,
## therefore, all of them will be kept for the further analysis.
## logistic regression
## Split the dataset into training and testing sets:
intrain<- createDataPartition(churn$Churn,p=0.7,list=FALSE)
set.seed(123)
surv_df <- churn
surv_train<- surv_df[intrain,]
surv_test<- surv_df[-intrain,]
## Global formula for the analysis
surv_fo <- Churn ~ .
## All available models
# modelinfo() %>% names
## Model-specific information
# modelinfo(RandomForestModel)
## Control parameters for K-fold cross-validation
## Prediction of survival means
surv_means_control <- CVControl(folds = 5, repeats = 3, seed = 123)
## Prediction of survival probabilities
surv_probs_control <- CVControl(folds = 5, repeats = 3, seed = 123)
```
GLM Model
========================
Row {.tabset .tabset-fade}
-------------------------
### Predicted Survival Means and Probability
```{r}
## Model function
surv_fit_GLM <- fit(surv_fo, data = surv_train, model = GLMModel)
## Observed responses
obs <- response(surv_fo, surv_test)
## Predicted survival means
pred_means_GLM <- predict(surv_fit_GLM, newdata = surv_test)
performance(obs, pred_means_GLM)
## Predicted survival probabilities
pred_probs_GLM <- predict(surv_fit_GLM, newdata = surv_test, type = "prob")
performance(obs, pred_probs_GLM)
(res_probs_GLM <- resample(surv_fo, data = surv_df, model = GLMModel, control = surv_probs_control))
```
### GLM Model
```{r}
## Confusion matrices
(conf_GLM <- confusion(res_probs_GLM, cutoff = 0.5))
performance(conf_GLM, metrics = c("Accuracy" = accuracy,
"Sensitivity" = sensitivity,
"Specificity" = specificity))
```
### Confusion Matrix using GLM Model
```{r}
summary(conf_GLM)
```
### Confusion Matrix Plot using GLM Model
```{r}
plot(conf_GLM)
```
### ROC curves using GLM Model
```{r}
roc_GLM <- performance_curve(res_probs_GLM)
plot(roc_GLM, diagonal = TRUE)
```
### ROC Curve Cutoffs
```{r}
plot(roc_GLM, type = "cutoffs")
```
### AUC using GLM MOdel
```{r}
auc(roc_GLM)
```
GLM Boost Model
=======================
Row {.tabset .tabset-fade}
-------------------------
### Predicted Survival Means and Probablity
```{r}
## Model function
surv_fit_GLMB <- fit(surv_fo, data = surv_train, model = GLMBoostModel)
## Observed responses
obs <- response(surv_fo, surv_test)
## Predicted survival means
pred_means_GLMB <- predict(surv_fit_GLMB, newdata = surv_test)
performance(obs, pred_means_GLMB)
## Predicted survival probabilities
pred_probs_GLMB <- predict(surv_fit_GLMB, newdata = surv_test, type = "prob")
performance(obs, pred_probs_GLMB)
(res_probs_GLMB <- resample(surv_fo, data = surv_df, model = GLMBoostModel, control = surv_probs_control))
```
### GLM Boost
```{r}
## Confusion matrices
(conf_GLMB <- confusion(res_probs_GLMB, cutoff = 0.5))
performance(conf_GLMB, metrics = c("Accuracy" = accuracy,
"Sensitivity" = sensitivity,
"Specificity" = specificity))
```
### Confusion Matrix using GLM Boost Model
```{r}
summary(conf_GLMB)
```
### Confusion Matrix Plot using GLM Model
```{r}
plot(conf_GLMB)
```
### ROC curves using GLM Boost Model
```{r}
roc_GLMB <- performance_curve(res_probs_GLMB)
plot(roc_GLMB, diagonal = TRUE)
```
### ROC Curve Cutoffs
```{r}
plot(roc_GLMB, type = "cutoffs")
```
### AUC using GLM Boost MOdel
```{r}
auc(roc_GLMB)
```
References
====================
https://cran.r-project.org/web/packages/MachineShop/vignettes/Introduction.html