library(readxl)
# Loading package
library(caTools)
library(ROCR)
# Correlogram in R
# required packages
library(corrplot)
## corrplot 0.90 loaded
library(ROCR)
library(DataExplorer)
library(car)
## Loading required package: carData
getwd()
## [1] "D:/APGCBAA/III/RProg/Group-Assignment/Group-Assignment"
# Splitting dataset
Cellphone <- read_excel("Cellphone.xlsx",sheet = 2)
summary(Cellphone)
## Churn AccountWeeks ContractRenewal DataPlan
## Min. :0.0000 Min. : 1.0 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 74.0 1st Qu.:1.0000 1st Qu.:0.0000
## Median :0.0000 Median :101.0 Median :1.0000 Median :0.0000
## Mean :0.1449 Mean :101.1 Mean :0.9031 Mean :0.2766
## 3rd Qu.:0.0000 3rd Qu.:127.0 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :243.0 Max. :1.0000 Max. :1.0000
## DataUsage CustServCalls DayMins DayCalls
## Min. :0.0000 Min. :0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:143.7 1st Qu.: 87.0
## Median :0.0000 Median :1.000 Median :179.4 Median :101.0
## Mean :0.8165 Mean :1.563 Mean :179.8 Mean :100.4
## 3rd Qu.:1.7800 3rd Qu.:2.000 3rd Qu.:216.4 3rd Qu.:114.0
## Max. :5.4000 Max. :9.000 Max. :350.8 Max. :165.0
## MonthlyCharge OverageFee RoamMins
## Min. : 14.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 45.00 1st Qu.: 8.33 1st Qu.: 8.50
## Median : 53.50 Median :10.07 Median :10.30
## Mean : 56.31 Mean :10.05 Mean :10.24
## 3rd Qu.: 66.20 3rd Qu.:11.77 3rd Qu.:12.10
## Max. :111.30 Max. :18.19 Max. :20.00
dim(Cellphone)
## [1] 3333 11
str(Cellphone)
## tibble [3,333 x 11] (S3: tbl_df/tbl/data.frame)
## $ Churn : num [1:3333] 0 0 0 0 0 0 0 0 0 0 ...
## $ AccountWeeks : num [1:3333] 128 107 137 84 75 118 121 147 117 141 ...
## $ ContractRenewal: num [1:3333] 1 1 1 0 0 0 1 0 1 0 ...
## $ DataPlan : num [1:3333] 1 1 0 0 0 0 1 0 0 1 ...
## $ DataUsage : num [1:3333] 2.7 3.7 0 0 0 0 2.03 0 0.19 3.02 ...
## $ CustServCalls : num [1:3333] 1 1 0 2 3 0 3 0 1 0 ...
## $ DayMins : num [1:3333] 265 162 243 299 167 ...
## $ DayCalls : num [1:3333] 110 123 114 71 113 98 88 79 97 84 ...
## $ MonthlyCharge : num [1:3333] 89 82 52 57 41 57 87.3 36 63.9 93.2 ...
## $ OverageFee : num [1:3333] 9.87 9.78 6.06 3.1 7.42 ...
## $ RoamMins : num [1:3333] 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
plot_intro(Cellphone)

### Histogram of variables
#DataExplorer::plot_histogram(Cellphone,geom_histogram_args = list(fill="blue"),
# theme_config = list(axis.line = element_line(size = 1, colour = "black"), strip.background = element_rect(color = "black", fill = "grey"))) ## checking the distribution of variables
DataExplorer::plot_histogram(Cellphone)

plot_density(Cellphone)

plot_boxplot(Cellphone, by="Churn")

plot_bar(Cellphone)

#******** Correlation Matrix **********************
head(Cellphone)
## # A tibble: 6 x 11
## Churn AccountWeeks ContractRenewal DataPlan DataUsage CustServCalls DayMins
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 128 1 1 2.7 1 265.
## 2 0 107 1 1 3.7 1 162.
## 3 0 137 1 0 0 0 243.
## 4 0 84 0 0 0 2 299.
## 5 0 75 0 0 0 3 167.
## 6 0 118 0 0 0 0 223.
## # ... with 4 more variables: DayCalls <dbl>, MonthlyCharge <dbl>,
## # OverageFee <dbl>, RoamMins <dbl>
#correlation matrix
M<-cor(Cellphone)
head(round(M,2))
## Churn AccountWeeks ContractRenewal DataPlan DataUsage
## Churn 1.00 0.02 -0.26 -0.10 -0.09
## AccountWeeks 0.02 1.00 -0.02 0.00 0.01
## ContractRenewal -0.26 -0.02 1.00 -0.01 -0.02
## DataPlan -0.10 0.00 -0.01 1.00 0.95
## DataUsage -0.09 0.01 -0.02 0.95 1.00
## CustServCalls 0.21 0.00 0.02 -0.02 -0.02
## CustServCalls DayMins DayCalls MonthlyCharge OverageFee
## Churn 0.21 0.21 0.02 0.07 0.09
## AccountWeeks 0.00 0.01 0.04 0.01 -0.01
## ContractRenewal 0.02 -0.05 0.00 -0.05 -0.02
## DataPlan -0.02 0.00 -0.01 0.74 0.02
## DataUsage -0.02 0.00 -0.01 0.78 0.02
## CustServCalls 1.00 -0.01 -0.02 -0.03 -0.01
## RoamMins
## Churn 0.07
## AccountWeeks 0.01
## ContractRenewal -0.05
## DataPlan 0.00
## DataUsage 0.16
## CustServCalls -0.01
corrplot(M, method="number")

split <- sample.split(Cellphone$Churn, SplitRatio = 0.7)
train_reg <- subset(Cellphone, split == TRUE)
test_reg <- subset(Cellphone, split == FALSE)
logistic_model = glm(Churn ~ ., data = Cellphone, family = "binomial")
summary(logistic_model)
##
## Call:
## glm(formula = Churn ~ ., family = "binomial", data = Cellphone)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0058 -0.5112 -0.3477 -0.2093 2.9981
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.9510252 0.5486763 -10.846 < 2e-16 ***
## AccountWeeks 0.0006525 0.0013873 0.470 0.638112
## ContractRenewal -1.9855172 0.1436107 -13.826 < 2e-16 ***
## DataPlan -1.1841611 0.5363668 -2.208 0.027262 *
## DataUsage 0.3636565 1.9231751 0.189 0.850021
## CustServCalls 0.5081349 0.0389682 13.040 < 2e-16 ***
## DayMins 0.0174407 0.0324841 0.537 0.591337
## DayCalls 0.0036523 0.0027497 1.328 0.184097
## MonthlyCharge -0.0275526 0.1909074 -0.144 0.885244
## OverageFee 0.1868114 0.3256902 0.574 0.566248
## RoamMins 0.0789226 0.0220522 3.579 0.000345 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2758.3 on 3332 degrees of freedom
## Residual deviance: 2188.4 on 3322 degrees of freedom
## AIC: 2210.4
##
## Number of Fisher Scoring iterations: 5
vif(logistic_model)
## AccountWeeks ContractRenewal DataPlan DataUsage CustServCalls
## 1.003246 1.058705 14.087816 1601.163095 1.081250
## DayMins DayCalls MonthlyCharge OverageFee RoamMins
## 952.539781 1.004592 2829.804947 211.716226 1.193368
logistic_model1 = glm(Churn ~ . - MonthlyCharge - DataUsage, data = Cellphone, family = "binomial")
summary(logistic_model1)
##
## Call:
## glm(formula = Churn ~ . - MonthlyCharge - DataUsage, family = "binomial",
## data = Cellphone)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0113 -0.5099 -0.3496 -0.2100 2.9978
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.9945437 0.5377103 -11.148 < 2e-16 ***
## AccountWeeks 0.0006621 0.0013873 0.477 0.633
## ContractRenewal -1.9880114 0.1435423 -13.850 < 2e-16 ***
## DataPlan -0.9353165 0.1441298 -6.489 8.62e-11 ***
## CustServCalls 0.5072934 0.0389173 13.035 < 2e-16 ***
## DayMins 0.0127543 0.0010725 11.892 < 2e-16 ***
## DayCalls 0.0036213 0.0027486 1.318 0.188
## OverageFee 0.1398147 0.0226568 6.171 6.79e-10 ***
## RoamMins 0.0831284 0.0203211 4.091 4.30e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2758.3 on 3332 degrees of freedom
## Residual deviance: 2188.6 on 3324 degrees of freedom
## AIC: 2206.6
##
## Number of Fisher Scoring iterations: 5
vif(logistic_model1)
## AccountWeeks ContractRenewal DataPlan CustServCalls DayMins
## 1.002458 1.057352 1.018550 1.078674 1.038782
## DayCalls OverageFee RoamMins
## 1.004109 1.025100 1.010804
logistic_model2 = glm(Churn ~ . - MonthlyCharge - DataUsage -AccountWeeks -DayCalls, data = Cellphone, family = "binomial")
summary(logistic_model2)
##
## Call:
## glm(formula = Churn ~ . - MonthlyCharge - DataUsage - AccountWeeks -
## DayCalls, family = "binomial", data = Cellphone)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9932 -0.5154 -0.3480 -0.2095 2.9906
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.552897 0.432757 -12.831 < 2e-16 ***
## ContractRenewal -1.989219 0.143452 -13.867 < 2e-16 ***
## DataPlan -0.934814 0.144015 -6.491 8.52e-11 ***
## CustServCalls 0.505651 0.038834 13.021 < 2e-16 ***
## DayMins 0.012774 0.001073 11.907 < 2e-16 ***
## OverageFee 0.138612 0.022648 6.120 9.34e-10 ***
## RoamMins 0.083476 0.020304 4.111 3.93e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2758.3 on 3332 degrees of freedom
## Residual deviance: 2190.6 on 3326 degrees of freedom
## AIC: 2204.6
##
## Number of Fisher Scoring iterations: 5
vif(logistic_model2)
## ContractRenewal DataPlan CustServCalls DayMins OverageFee
## 1.056179 1.018476 1.076219 1.039028 1.022948
## RoamMins
## 1.010395
# ******** Calculation confusion Matrix *****************
Pred_tr <- predict(logistic_model2, newdata = train_reg[,-1], type="response")
pred_trb <- table(train_reg$Churn,Pred_tr > 0.5 )
pred_trb
##
## FALSE TRUE
## 0 1950 45
## 1 271 67
tr_accuracy <- sum(diag(pred_trb)) / sum(pred_trb)
tr_accuracy
## [1] 0.8645521
tr_errorrate <- 1 - tr_accuracy
tr_errorrate
## [1] 0.1354479
tr_sensitivity <- pred_trb[2,2] / ( pred_trb[2,2] + pred_trb[1,2])
tr_sensitivity
## [1] 0.5982143
tr_specificity <- pred_trb[1,1] / ( pred_trb[1,1] + pred_trb[2,1])
tr_specificity
## [1] 0.8779829
Pred_tt <- predict(logistic_model2, newdata = test_reg[,-1], type="response")
pred_ttb <- table(test_reg$Churn,Pred_tt > 0.5 )
pred_ttb
##
## FALSE TRUE
## 0 830 25
## 1 119 26
tt_accuracy <- sum(diag(pred_ttb)) / sum(pred_ttb)
tt_accuracy
## [1] 0.856
tt_errorrate <- 1 - tt_accuracy
tt_errorrate
## [1] 0.144
tt_sensitivity <- pred_ttb[2,2] / ( pred_ttb[2,2] + pred_ttb[1,2])
tt_sensitivity
## [1] 0.5098039
tt_specificity <- pred_ttb[1,1] / ( pred_ttb[1,1] + pred_ttb[2,1])
tt_specificity
## [1] 0.8746048
# ************** ROC Train & Test Data *********************
ROCRpredtrain <- prediction(Pred_tr, train_reg$Churn)
ROCRpredtest <- prediction(Pred_tt, test_reg$Churn)
par(mfrow=c(1, 2)) # divide graph area in 2 columns
ROCRperftr <- performance(ROCRpredtrain, 'tpr','fpr')
plot(ROCRperftr, colorize = TRUE, text.adj = c(-0.2,1.7))
title(main = "Train Dataset (ROC)",
cex.main = 2, font.main= 4, col.main= "black",
)
auctr <- performance(ROCRpredtrain, measure = "auc")
auctr <- auctr@y.values[[1]]
auctr
## [1] 0.8288932
abline(a = 0, b = 1)
auctr <- round(auctr, 4)
legend(.6, .4, auctr, title = "AUC", cex = 1)
ROCRperftt <- performance(ROCRpredtest, 'tpr','fpr')
plot(ROCRperftt, colorize = TRUE, text.adj = c(-0.2,1.7))
title(main = "Test Dataset (ROC)",
cex.main = 2, font.main= 4, col.main= "black",
)
auctt <- performance(ROCRpredtest, measure = "auc")
auctt <- auctt@y.values[[1]]
auctt
## [1] 0.7907481
abline(a = 0, b = 1)
auctt <- round(auctt, 4)
legend(.6, .4, auctt, title = "AUC", cex = 1)
