library(readxl)

# Loading package
library(caTools)
library(ROCR)
# Correlogram in R
# required packages
library(corrplot)
## corrplot 0.90 loaded
library(ROCR)
library(DataExplorer)
library(car)
## Loading required package: carData
getwd()
## [1] "D:/APGCBAA/III/RProg/Group-Assignment/Group-Assignment"
# Splitting dataset

Cellphone <- read_excel("Cellphone.xlsx",sheet = 2)
summary(Cellphone)
##      Churn         AccountWeeks   ContractRenewal     DataPlan     
##  Min.   :0.0000   Min.   :  1.0   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 74.0   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :101.0   Median :1.0000   Median :0.0000  
##  Mean   :0.1449   Mean   :101.1   Mean   :0.9031   Mean   :0.2766  
##  3rd Qu.:0.0000   3rd Qu.:127.0   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :243.0   Max.   :1.0000   Max.   :1.0000  
##    DataUsage      CustServCalls      DayMins         DayCalls    
##  Min.   :0.0000   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:143.7   1st Qu.: 87.0  
##  Median :0.0000   Median :1.000   Median :179.4   Median :101.0  
##  Mean   :0.8165   Mean   :1.563   Mean   :179.8   Mean   :100.4  
##  3rd Qu.:1.7800   3rd Qu.:2.000   3rd Qu.:216.4   3rd Qu.:114.0  
##  Max.   :5.4000   Max.   :9.000   Max.   :350.8   Max.   :165.0  
##  MonthlyCharge      OverageFee       RoamMins    
##  Min.   : 14.00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 45.00   1st Qu.: 8.33   1st Qu.: 8.50  
##  Median : 53.50   Median :10.07   Median :10.30  
##  Mean   : 56.31   Mean   :10.05   Mean   :10.24  
##  3rd Qu.: 66.20   3rd Qu.:11.77   3rd Qu.:12.10  
##  Max.   :111.30   Max.   :18.19   Max.   :20.00
dim(Cellphone)
## [1] 3333   11
str(Cellphone)
## tibble [3,333 x 11] (S3: tbl_df/tbl/data.frame)
##  $ Churn          : num [1:3333] 0 0 0 0 0 0 0 0 0 0 ...
##  $ AccountWeeks   : num [1:3333] 128 107 137 84 75 118 121 147 117 141 ...
##  $ ContractRenewal: num [1:3333] 1 1 1 0 0 0 1 0 1 0 ...
##  $ DataPlan       : num [1:3333] 1 1 0 0 0 0 1 0 0 1 ...
##  $ DataUsage      : num [1:3333] 2.7 3.7 0 0 0 0 2.03 0 0.19 3.02 ...
##  $ CustServCalls  : num [1:3333] 1 1 0 2 3 0 3 0 1 0 ...
##  $ DayMins        : num [1:3333] 265 162 243 299 167 ...
##  $ DayCalls       : num [1:3333] 110 123 114 71 113 98 88 79 97 84 ...
##  $ MonthlyCharge  : num [1:3333] 89 82 52 57 41 57 87.3 36 63.9 93.2 ...
##  $ OverageFee     : num [1:3333] 9.87 9.78 6.06 3.1 7.42 ...
##  $ RoamMins       : num [1:3333] 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
plot_intro(Cellphone)

### Histogram of variables
#DataExplorer::plot_histogram(Cellphone,geom_histogram_args = list(fill="blue"),
#               theme_config = list(axis.line = element_line(size = 1, colour = "black"), strip.background = element_rect(color = "black", fill = "grey")))  ## checking the distribution of variables
DataExplorer::plot_histogram(Cellphone)

plot_density(Cellphone)

plot_boxplot(Cellphone, by="Churn")

plot_bar(Cellphone)

#******** Correlation Matrix **********************

head(Cellphone)
## # A tibble: 6 x 11
##   Churn AccountWeeks ContractRenewal DataPlan DataUsage CustServCalls DayMins
##   <dbl>        <dbl>           <dbl>    <dbl>     <dbl>         <dbl>   <dbl>
## 1     0          128               1        1       2.7             1    265.
## 2     0          107               1        1       3.7             1    162.
## 3     0          137               1        0       0               0    243.
## 4     0           84               0        0       0               2    299.
## 5     0           75               0        0       0               3    167.
## 6     0          118               0        0       0               0    223.
## # ... with 4 more variables: DayCalls <dbl>, MonthlyCharge <dbl>,
## #   OverageFee <dbl>, RoamMins <dbl>
#correlation matrix
M<-cor(Cellphone)
head(round(M,2))
##                 Churn AccountWeeks ContractRenewal DataPlan DataUsage
## Churn            1.00         0.02           -0.26    -0.10     -0.09
## AccountWeeks     0.02         1.00           -0.02     0.00      0.01
## ContractRenewal -0.26        -0.02            1.00    -0.01     -0.02
## DataPlan        -0.10         0.00           -0.01     1.00      0.95
## DataUsage       -0.09         0.01           -0.02     0.95      1.00
## CustServCalls    0.21         0.00            0.02    -0.02     -0.02
##                 CustServCalls DayMins DayCalls MonthlyCharge OverageFee
## Churn                    0.21    0.21     0.02          0.07       0.09
## AccountWeeks             0.00    0.01     0.04          0.01      -0.01
## ContractRenewal          0.02   -0.05     0.00         -0.05      -0.02
## DataPlan                -0.02    0.00    -0.01          0.74       0.02
## DataUsage               -0.02    0.00    -0.01          0.78       0.02
## CustServCalls            1.00   -0.01    -0.02         -0.03      -0.01
##                 RoamMins
## Churn               0.07
## AccountWeeks        0.01
## ContractRenewal    -0.05
## DataPlan            0.00
## DataUsage           0.16
## CustServCalls      -0.01
corrplot(M, method="number")

split <- sample.split(Cellphone$Churn, SplitRatio = 0.7)

train_reg <- subset(Cellphone,  split == TRUE)
test_reg <- subset(Cellphone, split == FALSE)

logistic_model = glm(Churn ~ ., data = Cellphone, family = "binomial")
summary(logistic_model)
## 
## Call:
## glm(formula = Churn ~ ., family = "binomial", data = Cellphone)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0058  -0.5112  -0.3477  -0.2093   2.9981  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -5.9510252  0.5486763 -10.846  < 2e-16 ***
## AccountWeeks     0.0006525  0.0013873   0.470 0.638112    
## ContractRenewal -1.9855172  0.1436107 -13.826  < 2e-16 ***
## DataPlan        -1.1841611  0.5363668  -2.208 0.027262 *  
## DataUsage        0.3636565  1.9231751   0.189 0.850021    
## CustServCalls    0.5081349  0.0389682  13.040  < 2e-16 ***
## DayMins          0.0174407  0.0324841   0.537 0.591337    
## DayCalls         0.0036523  0.0027497   1.328 0.184097    
## MonthlyCharge   -0.0275526  0.1909074  -0.144 0.885244    
## OverageFee       0.1868114  0.3256902   0.574 0.566248    
## RoamMins         0.0789226  0.0220522   3.579 0.000345 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2758.3  on 3332  degrees of freedom
## Residual deviance: 2188.4  on 3322  degrees of freedom
## AIC: 2210.4
## 
## Number of Fisher Scoring iterations: 5
vif(logistic_model)
##    AccountWeeks ContractRenewal        DataPlan       DataUsage   CustServCalls 
##        1.003246        1.058705       14.087816     1601.163095        1.081250 
##         DayMins        DayCalls   MonthlyCharge      OverageFee        RoamMins 
##      952.539781        1.004592     2829.804947      211.716226        1.193368
logistic_model1 = glm(Churn ~ . - MonthlyCharge - DataUsage, data = Cellphone, family = "binomial")
summary(logistic_model1)
## 
## Call:
## glm(formula = Churn ~ . - MonthlyCharge - DataUsage, family = "binomial", 
##     data = Cellphone)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0113  -0.5099  -0.3496  -0.2100   2.9978  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -5.9945437  0.5377103 -11.148  < 2e-16 ***
## AccountWeeks     0.0006621  0.0013873   0.477    0.633    
## ContractRenewal -1.9880114  0.1435423 -13.850  < 2e-16 ***
## DataPlan        -0.9353165  0.1441298  -6.489 8.62e-11 ***
## CustServCalls    0.5072934  0.0389173  13.035  < 2e-16 ***
## DayMins          0.0127543  0.0010725  11.892  < 2e-16 ***
## DayCalls         0.0036213  0.0027486   1.318    0.188    
## OverageFee       0.1398147  0.0226568   6.171 6.79e-10 ***
## RoamMins         0.0831284  0.0203211   4.091 4.30e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2758.3  on 3332  degrees of freedom
## Residual deviance: 2188.6  on 3324  degrees of freedom
## AIC: 2206.6
## 
## Number of Fisher Scoring iterations: 5
vif(logistic_model1)
##    AccountWeeks ContractRenewal        DataPlan   CustServCalls         DayMins 
##        1.002458        1.057352        1.018550        1.078674        1.038782 
##        DayCalls      OverageFee        RoamMins 
##        1.004109        1.025100        1.010804
logistic_model2 = glm(Churn ~ . - MonthlyCharge - DataUsage -AccountWeeks -DayCalls, data = Cellphone, family = "binomial")
summary(logistic_model2)
## 
## Call:
## glm(formula = Churn ~ . - MonthlyCharge - DataUsage - AccountWeeks - 
##     DayCalls, family = "binomial", data = Cellphone)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9932  -0.5154  -0.3480  -0.2095   2.9906  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -5.552897   0.432757 -12.831  < 2e-16 ***
## ContractRenewal -1.989219   0.143452 -13.867  < 2e-16 ***
## DataPlan        -0.934814   0.144015  -6.491 8.52e-11 ***
## CustServCalls    0.505651   0.038834  13.021  < 2e-16 ***
## DayMins          0.012774   0.001073  11.907  < 2e-16 ***
## OverageFee       0.138612   0.022648   6.120 9.34e-10 ***
## RoamMins         0.083476   0.020304   4.111 3.93e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2758.3  on 3332  degrees of freedom
## Residual deviance: 2190.6  on 3326  degrees of freedom
## AIC: 2204.6
## 
## Number of Fisher Scoring iterations: 5
vif(logistic_model2)
## ContractRenewal        DataPlan   CustServCalls         DayMins      OverageFee 
##        1.056179        1.018476        1.076219        1.039028        1.022948 
##        RoamMins 
##        1.010395
# ********    Calculation confusion Matrix    *****************

Pred_tr <- predict(logistic_model2, newdata = train_reg[,-1], type="response")
pred_trb <- table(train_reg$Churn,Pred_tr > 0.5 )
pred_trb
##    
##     FALSE TRUE
##   0  1950   45
##   1   271   67
tr_accuracy <- sum(diag(pred_trb)) / sum(pred_trb)
tr_accuracy
## [1] 0.8645521
tr_errorrate <-  1 - tr_accuracy
tr_errorrate
## [1] 0.1354479
tr_sensitivity <-   pred_trb[2,2] / ( pred_trb[2,2] +  pred_trb[1,2])
tr_sensitivity
## [1] 0.5982143
tr_specificity <-  pred_trb[1,1] / ( pred_trb[1,1] +  pred_trb[2,1])
tr_specificity
## [1] 0.8779829
Pred_tt <- predict(logistic_model2, newdata = test_reg[,-1], type="response")
pred_ttb <- table(test_reg$Churn,Pred_tt > 0.5 )
pred_ttb
##    
##     FALSE TRUE
##   0   830   25
##   1   119   26
tt_accuracy <- sum(diag(pred_ttb)) / sum(pred_ttb)
tt_accuracy
## [1] 0.856
tt_errorrate <-  1 - tt_accuracy
tt_errorrate
## [1] 0.144
tt_sensitivity <-   pred_ttb[2,2] / ( pred_ttb[2,2] +  pred_ttb[1,2])
tt_sensitivity
## [1] 0.5098039
tt_specificity <-  pred_ttb[1,1] / ( pred_ttb[1,1] +  pred_ttb[2,1])
tt_specificity
## [1] 0.8746048
# ************** ROC Train & Test Data   *********************

ROCRpredtrain <- prediction(Pred_tr, train_reg$Churn)
ROCRpredtest <- prediction(Pred_tt, test_reg$Churn)
par(mfrow=c(1, 2))  # divide graph area in 2 columns
ROCRperftr <- performance(ROCRpredtrain, 'tpr','fpr')
plot(ROCRperftr, colorize = TRUE, text.adj = c(-0.2,1.7))
title(main = "Train Dataset (ROC)",
      cex.main = 2,   font.main= 4, col.main= "black",
)
auctr <- performance(ROCRpredtrain, measure = "auc")
auctr <- auctr@y.values[[1]]
auctr
## [1] 0.8288932
abline(a = 0, b = 1)
auctr <- round(auctr, 4)
legend(.6, .4, auctr, title = "AUC", cex = 1)

ROCRperftt <- performance(ROCRpredtest, 'tpr','fpr')
plot(ROCRperftt, colorize = TRUE, text.adj = c(-0.2,1.7))
title(main = "Test Dataset (ROC)",
      cex.main = 2,   font.main= 4, col.main= "black",
)
auctt <- performance(ROCRpredtest, measure = "auc")
auctt <- auctt@y.values[[1]]
auctt
## [1] 0.7907481
abline(a = 0, b = 1)
auctt <- round(auctt, 4)
legend(.6, .4, auctt, title = "AUC", cex = 1)