Team Members:
Inputting the various libraries
library(tidyverse)
library(MASS)
library(ggcorrplot)
library(olsrr)
library(caret)
library('car')
library(randomForest)
df<- read.table(file = "C:/Users/17che/OneDrive - Georgia Institute of Technology/Desktop/telco.csv"
, sep=",", header=T,stringsAsFactors=T)
df <- subset(df, select = -customerID)
(head(df));
Get names of the columns that have NA values in them
colnames(df)[colSums(is.na(df)) > 0]
## [1] "TotalCharges"
Omitting the 11 rows that have NA values in them
df <- na.omit(df)
Seeing the variation of churn with respect to the continuous variables
library(ggpubr)
a <- ggplot(df, aes(y= tenure, x = "", fill = Churn)) +
geom_boxplot()+ theme_bw()+ ylab("Tenure ")+ xlab(" ")
b <- ggplot(df, aes(y= MonthlyCharges, x = "", fill = Churn, )) +
geom_boxplot()+ theme_bw()+ ylab("Monthly Charges")+ xlab(" ")
c <- ggplot(df, aes(y= TotalCharges, x = "", fill = Churn, colors ='b')) +
geom_boxplot()+ theme_bw()+ ylab("Total Charges")+ xlab(" ")
ggarrange(a,b,c, ncol = 3, nrow = 1)
Visualizing churn with respect to the categorical variables (Part 1)
color <- scale_fill_manual(values=c("chartreuse3","red"))
d <- ggplot(df,aes(x=gender,fill=Churn))+geom_bar(position = 'fill')+xlab("Gender ")+
ylab("") +color+theme_bw()
e <- ggplot(df,aes(x=Dependents,fill=Churn))+geom_bar(position = 'fill') +xlab("Dependents")+ ylab("")+color+theme_bw()
f <- ggplot(df,aes(x=Partner,fill=Churn))+geom_bar(position = 'fill') + xlab("Partner")+ ylab("")+color+theme_bw()
g <- ggplot(df,aes(x=InternetService,fill=Churn))+geom_bar(position = 'fill')+ xlab("Internet Service")+ ylab("")+color+theme_bw()
ggarrange(d,e,f,g, ncol = 2, nrow = 2)
(Part 2)
h <- ggplot(df,aes(x=OnlineSecurity,fill=Churn))+geom_bar(position = 'fill')+ xlab("Online Security")+ ylab("")+ scale_x_discrete(labels=function(x) str_wrap(x,width=5))+color+theme_bw()
i <- ggplot(df,aes(x=PaymentMethod,fill=Churn))+geom_bar(position = 'fill')+ xlab("Payment Method")+ ylab("") + scale_x_discrete(labels=function(x) str_wrap(x,width=10))+color+theme_bw()
j <- ggplot(df,aes(x=Contract,fill=Churn))+geom_bar(position = 'fill')+ xlab("Contract")+ ylab("")+ scale_x_discrete(labels=function(x) str_wrap(x,width=5))+color+theme_bw()
k <- ggplot(df,aes(x= DeviceProtection,fill=Churn))+ geom_bar(position = 'fill')+ xlab("Device Protection")+ ylab("") + scale_x_discrete(labels=function(x) str_wrap(x,width=5))+color+theme_bw()
ggarrange(h,i,j,k, ncol = 2, nrow = 2)
(Part 3)
l <- ggplot(df,aes(x=TechSupport,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Tech Support")+ ylab("")+color
m <- ggplot(df,aes(x=StreamingTV,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Streaming TV")+ ylab("")+color
n <- m <- ggplot(df,aes(x=PaperlessBilling,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Paperless Billing")+ ylab("")+color
o <- ggplot(df,aes(x=StreamingMovies,fill=Churn))+geom_bar(position = 'fill')+theme_bw()+ xlab("Streaming Movies")+ ylab("")+color
ggarrange(l,m,n,o, ncol = 2, nrow = 2)
Visualizing the correlations between the continuous variables
corr <- cor(df[,c('tenure', 'MonthlyCharges', 'TotalCharges')])
ggcorrplot(corr,hc.order = TRUE, outline.col = "white")
Considering No phone service and No internet as No, thus cleaning the data Also setting ‘Male’ as 1 and ‘Female’ as 0
df <- data.frame(lapply(df, function(x) {
gsub("No internet service", "No", x)}))
df <- data.frame(lapply(df, function(x) {
gsub("No phone service", "No", x)}))
df$gender<-ifelse(df$gender=="Male",1,0)
Pre-processing the data by replacing all the data that has ‘No’ as 0 and ‘Yes’ as 1
df <- data.frame(lapply(df, function(x) {
gsub("No", 0, x)}))
df <- data.frame(lapply(df, function(x) {
gsub("Yes", 1, x)}))
Creating dummy variables for the categorical variables that have multiple levels
library('fastDummies')
categorical<- dummy_cols(df, select_columns = c('InternetService', 'Contract', 'PaymentMethod'), remove_first_dummy = FALSE)
cat<- categorical[,-c(5,8,15,17,18,19)]
Combining the categorical variables, dummy variables and continuous variables in one dataframe
cont <- df[,c(5,18,19)]
df_final <- cbind(cont,cat)
df_final <- sapply(df_final, as.numeric)
df_final <- as.data.frame(df_final)
head(df_final)
Renaming the variables for ease of understanding Splitting the data into testing and training data
library(caTools)
colnames(df_final)[20] <- "FiberOptic"
colnames(df_final)[21] <- "Contract_month_to_month"
colnames(df_final)[22] <- "Contract_one_year"
colnames(df_final)[23] <- "Contract_two_year"
colnames(df_final)[24] <- "Payment_bank_transfer"
colnames(df_final)[25] <- "Payment_credit_card"
colnames(df_final)[26] <- "Payment_electronic_check"
colnames(df_final)[27] <- "Payment_mailed_check"
set.seed(123)
x=sample.split (df_final, SplitRatio= 0.7)
Train= df_final[x, 1:27]
Test= df_final[!x, 1:27]
Fitting all the variables on the training data to get the full model
model <- glm(Churn ~ ., family = binomial(link="logit"), data = Train);
summary(model)
##
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = Train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8969 -0.6780 -0.3014 0.7238 3.3806
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.059e+00 1.942e+00 1.575 0.115150
## tenure -5.623e-02 7.535e-03 -7.463 8.43e-14 ***
## MonthlyCharges -7.101e-02 3.837e-02 -1.850 0.064260 .
## TotalCharges 2.558e-04 8.512e-05 3.006 0.002649 **
## gender -4.326e-02 7.932e-02 -0.545 0.585512
## SeniorCitizen 1.630e-01 1.041e-01 1.566 0.117422
## Partner 1.383e-01 9.563e-02 1.446 0.148129
## Dependents -2.338e-01 1.100e-01 -2.125 0.033547 *
## PhoneService 9.277e-01 7.870e-01 1.179 0.238490
## MultipleLines 6.221e-01 2.146e-01 2.899 0.003740 **
## OnlineSecurity -8.279e-02 2.159e-01 -0.383 0.701367
## OnlineBackup 2.309e-01 2.120e-01 1.089 0.276151
## DeviceProtection 3.475e-01 2.145e-01 1.620 0.105216
## TechSupport 4.375e-02 2.168e-01 0.202 0.840046
## StreamingTV 8.913e-01 3.936e-01 2.265 0.023538 *
## StreamingMovies 9.470e-01 3.959e-01 2.392 0.016753 *
## PaperlessBilling 3.137e-01 9.160e-02 3.425 0.000615 ***
## InternetService_0 -5.244e+00 1.927e+00 -2.721 0.006501 **
## InternetService_DSL -2.615e+00 9.644e-01 -2.711 0.006700 **
## FiberOptic NA NA NA NA
## Contract_month_to_month 1.167e+00 2.094e-01 5.575 2.47e-08 ***
## Contract_one_year 5.844e-01 2.066e-01 2.828 0.004682 **
## Contract_two_year NA NA NA NA
## Payment_bank_transfer 6.516e-02 1.429e-01 0.456 0.648497
## Payment_credit_card 6.787e-02 1.425e-01 0.476 0.633881
## Payment_electronic_check 3.879e-01 1.202e-01 3.228 0.001247 **
## Payment_mailed_check NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5428.3 on 4686 degrees of freedom
## Residual deviance: 3909.5 on 4663 degrees of freedom
## AIC: 3957.5
##
## Number of Fisher Scoring iterations: 6
Predicting the churn using the fitted model
Test$model_prob <- predict(model, Test, type = "response")
Test <- Test %>% mutate(model_pred = 1*(model_prob > .5) + 0,
visit_binary = 1*(Churn == 1) + 0)
Checking the accuracy of the full model
Test <- Test %>% mutate(accurate = 1*(model_pred == visit_binary))
sum(Test$accurate, na.rm = T)/nrow(Test)
## [1] 0.7991471
Hence the accuracy of the logistic regression- full model is 79.9%
Calculating the accuracy for the reduced model determined using AIC
Test$model_prob <- predict(model_AIC, Test, type = "response")
Test <- Test %>% mutate(model_pred = 1*(model_prob > .5) + 0,
visit_binary = 1*(Churn == 1) + 0)
Test <- Test %>% mutate(accurate = 1*(model_pred == visit_binary))
sum(Test$accurate, na.rm = T)/nrow(Test)
## [1] 0.7987207
The accuracy of the model is now 79.87% when only retaining the variables decided by AIC
VIF is used for checking the multicollinearity between the variables selected by AIC
vif(model_AIC)
## tenure MonthlyCharges TotalCharges
## 15.490829 82.076936 20.278638
## SeniorCitizen Partner Dependents
## 1.138209 1.401656 1.300379
## PhoneService MultipleLines OnlineBackup
## 5.431540 2.120849 1.893234
## DeviceProtection StreamingTV StreamingMovies
## 1.986773 4.251463 4.426197
## PaperlessBilling InternetService_0 InternetService_DSL
## 1.118060 26.599814 14.879846
## Contract_month_to_month Contract_one_year Payment_electronic_check
## 4.475069 3.451895 1.140161
model_vif <- glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen +
InternetService_0 + OnlineBackup + DeviceProtection + StreamingTV+
OnlineSecurity + OnlineBackup + TechSupport + MultipleLines +
+ `Contract_month_to_month` + PaperlessBilling +
`Payment_electronic_check`, family = "binomial", data = Train)
summary(model_vif)
##
## Call:
## glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen +
## InternetService_0 + OnlineBackup + DeviceProtection + StreamingTV +
## OnlineSecurity + OnlineBackup + TechSupport + MultipleLines +
## +Contract_month_to_month + PaperlessBilling + Payment_electronic_check,
## family = "binomial", data = Train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9639 -0.6508 -0.3329 0.6998 3.0207
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.318793 0.214849 -10.793 < 2e-16 ***
## tenure -0.033959 0.002724 -12.465 < 2e-16 ***
## MonthlyCharges 0.023140 0.002862 8.086 6.15e-16 ***
## SeniorCitizen 0.264976 0.101608 2.608 0.009112 **
## InternetService_0 -0.677124 0.187495 -3.611 0.000305 ***
## OnlineBackup -0.197324 0.092709 -2.128 0.033302 *
## DeviceProtection -0.060840 0.096472 -0.631 0.528269
## StreamingTV 0.071674 0.101641 0.705 0.480705
## OnlineSecurity -0.576618 0.100596 -5.732 9.92e-09 ***
## TechSupport -0.445492 0.101351 -4.396 1.11e-05 ***
## MultipleLines 0.144893 0.098591 1.470 0.141659
## Contract_month_to_month 0.757402 0.122975 6.159 7.32e-10 ***
## PaperlessBilling 0.354271 0.090412 3.918 8.91e-05 ***
## Payment_electronic_check 0.403909 0.083875 4.816 1.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5428.3 on 4686 degrees of freedom
## Residual deviance: 3971.5 on 4673 degrees of freedom
## AIC: 3999.5
##
## Number of Fisher Scoring iterations: 5
vif(model_vif)
## tenure MonthlyCharges SeniorCitizen
## 2.098769 3.570934 1.082891
## InternetService_0 OnlineBackup DeviceProtection
## 1.879425 1.220498 1.325454
## StreamingTV OnlineSecurity TechSupport
## 1.621973 1.110586 1.169765
## MultipleLines Contract_month_to_month PaperlessBilling
## 1.545203 1.596986 1.107892
## Payment_electronic_check
## 1.128538
Fitting another model determined using the variance inflation factor
model_vif2 <- glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen +
InternetService_0 + OnlineBackup + OnlineSecurity + OnlineBackup + TechSupport +
+ `Contract_month_to_month` + PaperlessBilling +
`Payment_electronic_check`, family = "binomial", data = Train)
final <- model_vif2
summary(model_vif2)
##
## Call:
## glm(formula = Churn ~ tenure + MonthlyCharges + SeniorCitizen +
## InternetService_0 + OnlineBackup + OnlineSecurity + OnlineBackup +
## TechSupport + +Contract_month_to_month + PaperlessBilling +
## Payment_electronic_check, family = "binomial", data = Train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9193 -0.6541 -0.3314 0.7139 3.0074
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.421909 0.204018 -11.871 < 2e-16 ***
## tenure -0.033237 0.002615 -12.710 < 2e-16 ***
## MonthlyCharges 0.025167 0.002268 11.098 < 2e-16 ***
## SeniorCitizen 0.268023 0.101434 2.642 0.00823 **
## InternetService_0 -0.615371 0.183543 -3.353 0.00080 ***
## OnlineBackup -0.205191 0.092483 -2.219 0.02651 *
## OnlineSecurity -0.583323 0.100285 -5.817 6.00e-09 ***
## TechSupport -0.459340 0.100753 -4.559 5.14e-06 ***
## Contract_month_to_month 0.769366 0.121407 6.337 2.34e-10 ***
## PaperlessBilling 0.361056 0.090321 3.997 6.40e-05 ***
## Payment_electronic_check 0.405412 0.083700 4.844 1.27e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5428.3 on 4686 degrees of freedom
## Residual deviance: 3974.4 on 4676 degrees of freedom
## AIC: 3996.4
##
## Number of Fisher Scoring iterations: 5
vif(model_vif2)
## tenure MonthlyCharges SeniorCitizen
## 1.934739 2.245331 1.081630
## InternetService_0 OnlineBackup OnlineSecurity
## 1.800568 1.215665 1.104441
## TechSupport Contract_month_to_month PaperlessBilling
## 1.156389 1.556317 1.105920
## Payment_electronic_check
## 1.124904
predicted <- predict(final, type = "response", newdata = Test[,-17])
Test$prob <- predicted
predicted_churn <- factor(ifelse(predicted >= 0.50, "Yes", "No"))
actual_churn <- factor(ifelse(Test$Churn==1,"Yes","No"))
Get the values of accuracy, sensitivity and specificity for cutoff = 0.5
cutoff_churn <- factor(ifelse(predicted >=0.50, "Yes", "No"))
confusionMatrix_final <- confusionMatrix(cutoff_churn, actual_churn, positive = "Yes")
accuracy <- confusionMatrix_final$overall[1]
sensitivity <- confusionMatrix_final$byClass[1]
specificity <- confusionMatrix_final$byClass[2]
accuracy
## Accuracy
## 0.7970149
sensitivity
## Sensitivity
## 0.5280899
specificity
## Specificity
## 0.8943089
perform_fn <- function(cutoff)
{
predicted_churn <- factor(ifelse(predicted >= cutoff, "Yes", "No"))
conf <- confusionMatrix(predicted_churn, actual_churn, positive = "Yes")
accuray <- conf$overall[1]
sensitivity <- conf$byClass[1]
specificity <- conf$byClass[2]
out <- t(as.matrix(c(sensitivity, specificity, accuray)))
colnames(out) <- c("sensitivity", "specificity", "accuracy")
return(out)
}
Plotting the value of these three metrics for different cutoff values
s = seq(0.01,0.90,length=100)
OUT = matrix(0,100,3)
for(i in 1:100)
{
OUT[i,] = perform_fn(s[i])
}
plot(s, OUT[,1],xlab="Cutoff",ylab="Value",axes=FALSE,col=2,type="l")
axis(1,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
axis(2,seq(0,1,length=5),seq(0,1,length=5),cex.lab=1.5)
lines(s,OUT[,2],col='black',lwd=2)
lines(s,OUT[,3],col=4,lwd=2)
box()
legend("bottom",col=c(2,"black",4,"red")
,lwd=c(2,2,2,2),c("Sensitivity","Specificity","Accuracy"))
axis(1, at = seq(0.1, 1, by = 0.1))
Hence, the cutoff value is taken to be = 0.3 as that is where accuracy, specificity and sensitivity intersect. This gives a high value for all three parameters
cutoff_churn <- factor(ifelse(predicted >=0.3, "Yes", "No"))
conf_final <- confusionMatrix(cutoff_churn, actual_churn, positive = "Yes")
accuracy <- conf_final$overall[1]
sensitivity <- conf_final$byClass[1]
specificity <- conf_final$byClass[2]
accuracy
## Accuracy
## 0.7688699
sensitivity
## Sensitivity
## 0.7495987
specificity
## Specificity
## 0.775842
Get the Area under the curve value
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
model_final.roc <- roc(response = Test$Churn, predictor = as.numeric(predicted))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(model_final.roc, print.auc = TRUE,legacy.axes = TRUE, col='blue')
Training a random forest model
Train$Churn <- as.character(Train$Churn)
Train$Churn <- as.factor(Train$Churn)
model.rf <- randomForest(Churn ~ ., data=Train, proximity=FALSE,importance = FALSE,
ntree=500,mtry=4, do.trace=FALSE)
model.rf
##
## Call:
## randomForest(formula = Churn ~ ., data = Train, proximity = FALSE, importance = FALSE, ntree = 500, mtry = 4, do.trace = FALSE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 20.03%
## Confusion matrix:
## 0 1 class.error
## 0 3075 366 0.1063644
## 1 573 673 0.4598716
The ten most important variables according to the Random Forest
varImpPlot(model.rf, main="Random Forest Variable importance", n.var = 10)
Plotting the ROC curve for the random forest
predicted <- predict(model.rf, type = "response", newdata = Test[,-17])
model.rf.roc <- roc(response = Test$Churn, predictor = as.numeric(predicted))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(model.rf.roc, print.auc = TRUE,legacy.axes = TRUE, col='red')
Area under ROC curve is lesser for a random forest model as compared to the logistic regression case
Hence, logistic regression model with cutoff = 0.3 is the best model for this data