# Setting the working directory
setwd("G:\\IIMK DABS\\Session Files\\Datasets and R files")
getwd()
## [1] "G:/IIMK DABS/Session Files/Datasets and R files"
# Loading data and converting variables into proper classes
data <- read.csv("telecom_churn.csv", header = T)
sum(is.na(data))
## [1] 0
data$Churn <- as.factor(data$Churn)
data$ContractRenewal <- as.factor(data$ContractRenewal)
data$DataPlan <- as.factor(data$DataPlan)
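As a quick sanity check that the conversions took effect, the structure of the recoded columns can be inspected; a one-line sketch, output omitted here:
str(data[, c("Churn", "ContractRenewal", "DataPlan")])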
# Loading necessary packages
library(caret)
library(ROSE)
library(car)
library(pROC)
# Estimation using the data
model <- glm(Churn~., data = data, family = binomial)
summary(model)
##
## Call:
## glm(formula = Churn ~ ., family = binomial, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0058 -0.5112 -0.3477 -0.2093 2.9981
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.9510252 0.5486763 -10.846 < 2e-16 ***
## AccountWeeks 0.0006525 0.0013873 0.470 0.638112
## ContractRenewal1 -1.9855172 0.1436107 -13.826 < 2e-16 ***
## DataPlan1 -1.1841611 0.5363668 -2.208 0.027262 *
## DataUsage 0.3636565 1.9231751 0.189 0.850021
## CustServCalls 0.5081349 0.0389682 13.040 < 2e-16 ***
## DayMins 0.0174407 0.0324841 0.537 0.591337
## DayCalls 0.0036523 0.0027497 1.328 0.184097
## MonthlyCharge -0.0275526 0.1909074 -0.144 0.885244
## OverageFee 0.1868114 0.3256902 0.574 0.566248
## RoamMins 0.0789226 0.0220522 3.579 0.000345 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2758.3 on 3332 degrees of freedom
## Residual deviance: 2188.4 on 3322 degrees of freedom
## AIC: 2210.4
##
## Number of Fisher Scoring iterations: 5
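The null and residual deviances reported above also give an overall likelihood-ratio test of the model against the intercept-only model; a minimal sketch, not part of the original session output:
# Drop in deviance compared against a chi-squared distribution with
# df equal to the number of estimated slope coefficients
with(model, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))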
# Stepwise Regression to identify significant variables
model1 <- step(model, trace = F)
summary(model1)
##
## Call:
## glm(formula = Churn ~ ContractRenewal + DataPlan + CustServCalls +
## DayMins + OverageFee + RoamMins, family = binomial, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9932 -0.5154 -0.3480 -0.2095 2.9906
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.552897 0.432757 -12.831 < 2e-16 ***
## ContractRenewal1 -1.989219 0.143452 -13.867 < 2e-16 ***
## DataPlan1 -0.934814 0.144015 -6.491 8.52e-11 ***
## CustServCalls 0.505651 0.038834 13.021 < 2e-16 ***
## DayMins 0.012774 0.001073 11.907 < 2e-16 ***
## OverageFee 0.138612 0.022648 6.120 9.34e-10 ***
## RoamMins 0.083476 0.020304 4.111 3.93e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2758.3 on 3332 degrees of freedom
## Residual deviance: 2190.6 on 3326 degrees of freedom
## AIC: 2204.6
##
## Number of Fisher Scoring iterations: 5
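The stepwise model trades about two deviance points (2190.6 vs. 2188.4) for four fewer parameters, which is why its AIC is lower (2204.6 vs. 2210.4). A one-line comparison:
AIC(model, model1)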
# Obtaining model coefficients to understand the direction of independent variables
coefficients(model1)
## (Intercept) ContractRenewal1 DataPlan1 CustServCalls
## -5.55289657 -1.98921921 -0.93481403 0.50565134
## DayMins OverageFee RoamMins
## 0.01277352 0.13861170 0.08347648
# Obtaining odds ratio to understand the magnitude of impact of independent variables
exp(coefficients(model1))
## (Intercept) ContractRenewal1 DataPlan1 CustServCalls
## 0.003876213 0.136802198 0.392658882 1.658065131
## DayMins OverageFee RoamMins
## 1.012855446 1.148677979 1.087059649
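Profile-likelihood confidence intervals on the odds ratios can be obtained the same way; a minimal sketch, not part of the original session output (profiling a glm can take a moment):
exp(confint(model1))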
# Checking for multicollinearity (variance inflation factor)
vif(model1)
## ContractRenewal DataPlan CustServCalls DayMins OverageFee
## 1.056179 1.018476 1.076219 1.039028 1.022948
## RoamMins
## 1.010395
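All values are close to 1, well below the common rule-of-thumb cutoffs of 5 or 10, so multicollinearity is not a concern for this model. A one-line check, added here as a sketch:
any(vif(model1) > 5)  # expect FALSE: no predictor exceeds the cutoff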
# Using caret package to create data partition
set.seed(1234)
index <- createDataPartition(data$Churn, p=0.8, list = F)
train_data <- data[index, ]
prop.table(table(train_data$Churn))
##
## 0 1
## 0.8548931 0.1451069
test_data <- data[-index, ]
prop.table(table(test_data$Churn))
##
## 0 1
## 0.8558559 0.1441441
# Training the model on the training data
model3 <- glm(Churn~., data = train_data, family = binomial)
# Stepwise Regression to identify significant variables
model4 <- step(model3, trace = F)
# Predicting the test data using the model; the result is stored as 'pred'
# rather than 'predict' so that the base predict() function is not masked
pred <- predict(model4, newdata = test_data, type = "response")
test_data$predicted <- pred
test_data$class <- ifelse(test_data$predicted >= 0.5, 1, 0)
test_data$Churn <- as.factor(test_data$Churn)
test_data$class <- as.factor(test_data$class)
# Creating confusion matrix to identify accuracy and sensitivity
confusionMatrix(test_data$class, test_data$Churn, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 561 85
## 1 9 11
##
## Accuracy : 0.8589
## 95% CI : (0.8301, 0.8844)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 0.4393
##
## Kappa : 0.1473
##
## Mcnemar's Test P-Value : 1.029e-14
##
## Sensitivity : 0.11458
## Specificity : 0.98421
## Pos Pred Value : 0.55000
## Neg Pred Value : 0.86842
## Prevalence : 0.14414
## Detection Rate : 0.01652
## Detection Prevalence : 0.03003
## Balanced Accuracy : 0.54940
##
## 'Positive' Class : 1
##
The sensitivity value is very low. Sensitivity is the true positive rate, and it is low here because the data are imbalanced: the event class (churners) and the non-event class occur in very different proportions in the dependent variable. To correct this, the data need to be balanced, which can be done with the ROSE package by creating over-sampled, under-sampled and 'both'-sampled versions of the training data.
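The N arguments passed to ovun.sample() below are not arbitrary; they follow from the class counts in the training data, as this sketch shows:
n0 <- sum(train_data$Churn == 0)  # majority class count (2280 with the seed above)
n1 <- sum(train_data$Churn == 1)  # minority class count (387)
2 * n0            # N for over-sampling: minority replicated up to the majority size
2 * n1            # N for under-sampling: majority thinned down to the minority size
nrow(train_data)  # N for 'both': the original training-set size (2667)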
set.seed(1234)
over <- ovun.sample(Churn~., data = train_data, method = "over", N = 4560)$data
table(over$Churn)
##
## 0 1
## 2280 2280
set.seed(1234)
under <- ovun.sample(Churn~., data = train_data, method = "under", N = 774)$data
table(under$Churn)
##
## 0 1
## 387 387
both <- ovun.sample(Churn~., data = train_data, method = "both", p = 0.50, seed = 1234, N = 2667)$data
prop.table(table(both$Churn))
##
## 0 1
## 0.4983127 0.5016873
# Creating model using over-sampled data, predicting test data using the model and creating confusion matrix
model5 <- glm(Churn~., family = binomial, data = over)
predicted1 <- predict(model5, newdata = test_data, type = "response")
class1 <- ifelse(predicted1 >= 0.50, 1, 0)
class1 <- factor(class1)
confusionMatrix(class1, test_data$Churn, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 455 24
## 1 115 72
##
## Accuracy : 0.7913
## 95% CI : (0.7584, 0.8216)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3933
##
## Mcnemar's Test P-Value : 2.281e-14
##
## Sensitivity : 0.7500
## Specificity : 0.7982
## Pos Pred Value : 0.3850
## Neg Pred Value : 0.9499
## Prevalence : 0.1441
## Detection Rate : 0.1081
## Detection Prevalence : 0.2808
## Balanced Accuracy : 0.7741
##
## 'Positive' Class : 1
##
# Creating model using under-sampled data, predicting test data using the model and creating confusion matrix
model6 <- glm(Churn~., family = binomial, data = under)
predicted2 <- predict(model6, newdata = test_data, type = "response")
class2 <- ifelse(predicted2 >= 0.50, 1, 0)
class2 <- factor(class2)
confusionMatrix(class2, test_data$Churn, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 457 26
## 1 113 70
##
## Accuracy : 0.7913
## 95% CI : (0.7584, 0.8216)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3856
##
## Mcnemar's Test P-Value : 2.999e-13
##
## Sensitivity : 0.7292
## Specificity : 0.8018
## Pos Pred Value : 0.3825
## Neg Pred Value : 0.9462
## Prevalence : 0.1441
## Detection Rate : 0.1051
## Detection Prevalence : 0.2748
## Balanced Accuracy : 0.7655
##
## 'Positive' Class : 1
##
# Creating model using 'both'-sampled data, predicting test data using the model and creating confusion matrix
model7 <- glm(Churn~., family = binomial, data = both)
predicted3 <- predict(model7, newdata = test_data, type = "response")
class3 <- ifelse(predicted3 >= 0.50, 1, 0)
class3 <- factor(class3)
confusionMatrix(class3, test_data$Churn, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 455 24
## 1 115 72
##
## Accuracy : 0.7913
## 95% CI : (0.7584, 0.8216)
## No Information Rate : 0.8559
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3933
##
## Mcnemar's Test P-Value : 2.281e-14
##
## Sensitivity : 0.7500
## Specificity : 0.7982
## Pos Pred Value : 0.3850
## Neg Pred Value : 0.9499
## Prevalence : 0.1441
## Detection Rate : 0.1081
## Detection Prevalence : 0.2808
## Balanced Accuracy : 0.7741
##
## 'Positive' Class : 1
##
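Before turning to the ROC curves, the three sampling schemes can be compared side by side by storing the caret confusion-matrix objects and extracting the key statistics; a sketch, not part of the original output:
cm_over  <- confusionMatrix(class1, test_data$Churn, positive = "1")
cm_under <- confusionMatrix(class2, test_data$Churn, positive = "1")
cm_both  <- confusionMatrix(class3, test_data$Churn, positive = "1")
sapply(list(over = cm_over, under = cm_under, both = cm_both),
       function(cm) cm$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")])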
# Using the pROC package, creating ROC curves and calculating AUC values for all 3 models
roc(test_data$Churn, predicted1, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Positive Percentage", col="green4", lwd=2, print.auc=TRUE)
##
## Call:
## roc.default(response = test_data$Churn, predictor = predicted1, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Positive Percentage", col = "green4", lwd = 2, print.auc = TRUE)
##
## Data: predicted1 in 570 controls (test_data$Churn 0) < 96 cases (test_data$Churn 1).
## Area under the curve: 82.93%
plot.roc(test_data$Churn, predicted2, percent=TRUE, col="red", lwd=2, print.auc=TRUE, add=TRUE, print.auc.y=30)
plot.roc(test_data$Churn, predicted3, percent=TRUE, col ="blue", lwd=2,print.auc=TRUE, add=TRUE,print.auc.y=40)
legend("topright", bg ="transparent", legend=c("Over sampled", "Under sampled", "Both"), col=c("green4", "red","blue"), cex = .5,lwd=2)
