churn_predictivemodel

library(plyr)

## Warning: package 'plyr' was built under R version 4.0.5

library(corrplot)

## corrplot 0.90 loaded

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.0.5

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 4.0.5

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.0.5

library(caret)

## Warning: package 'caret' was built under R version 4.0.5

## Loading required package: lattice

library(MASS)
library(randomForest)

## Warning: package 'randomForest' was built under R version 4.0.5

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(party)

## Warning: package 'party' was built under R version 4.0.5

## Loading required package: grid

## Loading required package: mvtnorm

## Warning: package 'mvtnorm' was built under R version 4.0.5

## Loading required package: modeltools

## Loading required package: stats4

## 
## Attaching package: 'modeltools'

## The following object is masked from 'package:plyr':
## 
##     empty

## Loading required package: strucchange

## Warning: package 'strucchange' was built under R version 4.0.5

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 4.0.5

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Warning: package 'sandwich' was built under R version 4.0.5

churn <- read.csv('churn.csv')

str(churn)

## 'data.frame':    3333 obs. of  22 variables:
##  $ Sl.No.        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ State         : chr  "KS" "OH" "NJ" "OH" ...
##  $ Account.Length: int  128 107 137 84 75 118 121 147 117 141 ...
##  $ Area.Code     : int  415 415 415 408 415 510 510 415 408 415 ...
##  $ Phone         : chr  "382-4657" "371-7191" "358-1921" "375-9999" ...
##  $ Int.l.Plan    : chr  "no" "no" "no" "yes" ...
##  $ VMail.Plan    : chr  "yes" "yes" "no" "no" ...
##  $ VMail.Message : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ Day.Mins      : num  265 162 243 299 167 ...
##  $ Day.Calls     : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ Day.Charge    : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ Eve.Mins      : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ Eve.Calls     : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ Eve.Charge    : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ Night.Mins    : num  245 254 163 197 187 ...
##  $ Night.Calls   : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ Night.Charge  : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ Intl.Mins     : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ Intl.Calls    : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ Intl.Charge   : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ CustServ.Calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ Churn         : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

sapply(churn, function(x) sum(is.na(x)))

##         Sl.No.          State Account.Length      Area.Code          Phone 
##              0              0              0              0              0 
##     Int.l.Plan     VMail.Plan  VMail.Message       Day.Mins      Day.Calls 
##              0              0              0              0              0 
##     Day.Charge       Eve.Mins      Eve.Calls     Eve.Charge     Night.Mins 
##              0              0              0              0              0 
##    Night.Calls   Night.Charge      Intl.Mins     Intl.Calls    Intl.Charge 
##              0              0              0              0              0 
## CustServ.Calls          Churn 
##              0              0

range(churn$Account.Length)

## [1]   1 243

range(churn$CustServ.Calls)

## [1] 0 9

range(churn$Intl.Calls)

## [1]  0 20

group_Account.Length <- function(Account.Length){
  if (Account.Length >= 0 & Account.Length <= 120){
    return('0-120 group')
  }else if (Account.Length > 120){
    return('120 and above')
  }
}

churn$Account.Length.group <- sapply(churn$Account.Length, group_Account.Length)
churn$Account.Length.group <- as.factor(churn$Account.Length.group)

str(churn)

## 'data.frame':    3333 obs. of  23 variables:
##  $ Sl.No.              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ State               : chr  "KS" "OH" "NJ" "OH" ...
##  $ Account.Length      : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ Area.Code           : int  415 415 415 408 415 510 510 415 408 415 ...
##  $ Phone               : chr  "382-4657" "371-7191" "358-1921" "375-9999" ...
##  $ Int.l.Plan          : chr  "no" "no" "no" "yes" ...
##  $ VMail.Plan          : chr  "yes" "yes" "no" "no" ...
##  $ VMail.Message       : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ Day.Mins            : num  265 162 243 299 167 ...
##  $ Day.Calls           : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ Day.Charge          : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ Eve.Mins            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ Eve.Calls           : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ Eve.Charge          : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ Night.Mins          : num  245 254 163 197 187 ...
##  $ Night.Calls         : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ Night.Charge        : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ Intl.Mins           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ Intl.Calls          : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ Intl.Charge         : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ CustServ.Calls      : int  1 1 0 2 3 0 3 0 1 0 ...
##  $ Churn               : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Account.Length.group: Factor w/ 2 levels "0-120 group",..: 2 1 2 1 1 1 2 2 1 2 ...

churn$Int.l.Plan <- as.factor(churn$Int.l.Plan)
churn$VMail.Plan <- as.factor(churn$VMail.Plan)
churn$Churn <- as.factor(churn$Churn)

churn$Sl.No. <- NULL
churn$State <- NULL
churn$Area.Code <- NULL
churn$Phone <- NULL
churn$Account.Length <- NULL

str(churn)

## 'data.frame':    3333 obs. of  18 variables:
##  $ Int.l.Plan          : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ VMail.Plan          : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ VMail.Message       : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ Day.Mins            : num  265 162 243 299 167 ...
##  $ Day.Calls           : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ Day.Charge          : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ Eve.Mins            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ Eve.Calls           : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ Eve.Charge          : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ Night.Mins          : num  245 254 163 197 187 ...
##  $ Night.Calls         : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ Night.Charge        : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ Intl.Mins           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ Intl.Calls          : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ Intl.Charge         : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ CustServ.Calls      : int  1 1 0 2 3 0 3 0 1 0 ...
##  $ Churn               : Factor w/ 2 levels "FALSE","TRUE": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Account.Length.group: Factor w/ 2 levels "0-120 group",..: 2 1 2 1 1 1 2 2 1 2 ...

numeric.var <- sapply(churn, is.numeric)
corr.matrix <- cor(churn[,numeric.var])
corrplot(corr.matrix, main="\n\nCorrelation Plot for Numerical Variables", method="number")

churn$Day.Mins <- NULL
churn$Day.Charge <- NULL
churn$Eve.Mins <- NULL
churn$Eve.Charge <- NULL
churn$Night.Mins <- NULL
churn$Night.Charge <- NULL
churn$Intl.Mins <- NULL
churn$Intl.Charge <- NULL

numeric.var <- sapply(churn, is.numeric)
corr.matrix <- cor(churn[,numeric.var])
corrplot(corr.matrix, main="\n\nCorrelation Plot for Numerical Variables", method="number")

p1 <- ggplot(churn, aes(x=churn$Int.l.Plan)) + ggtitle("Int.I.Plan") + xlab("Int.I.Plan") +
  geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
p2 <- ggplot(churn, aes(x=churn$VMail.Plan)) + ggtitle("VMail.Plan") + xlab("VMail.Plan") +
  geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
p3 <- ggplot(churn, aes(x=churn$Account.Length.group)) + ggtitle("Account.Length.group") + xlab("Account.Length.group") +
  geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()

grid.arrange(p1, p2, p3, ncol=2)

## Warning: Use of `churn$Int.l.Plan` is discouraged. Use `Int.l.Plan` instead.

## Warning: Use of `churn$VMail.Plan` is discouraged. Use `VMail.Plan` instead.

## Warning: Use of `churn$Account.Length.group` is discouraged. Use
## `Account.Length.group` instead.

intrain <- createDataPartition(churn$Churn, p=0.8,list=FALSE)
set.seed(2021)
training<- churn[intrain,]
testing<- churn[-intrain,]

dim(training); dim(testing)

## [1] 2667   10

## [1] 666  10

LogModel <- glm(Churn ~ .,family=binomial(link="logit"),data=training)
print(summary(LogModel))

## 
## Call:
## glm(formula = Churn ~ ., family = binomial(link = "logit"), data = training)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7545  -0.5428  -0.4011  -0.2768   2.8970  
## 
## Coefficients:
##                                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -2.7121946  0.5496958  -4.934 8.06e-07 ***
## Int.l.Planyes                      2.0646118  0.1555187  13.276  < 2e-16 ***
## VMail.Planyes                     -1.4345782  0.6201156  -2.313 0.020700 *  
## VMail.Message                      0.0159856  0.0198061   0.807 0.419607    
## Day.Calls                          0.0018448  0.0029562   0.624 0.532596    
## Eve.Calls                          0.0003704  0.0030253   0.122 0.902543    
## Night.Calls                        0.0015930  0.0030439   0.523 0.600740    
## Intl.Calls                        -0.0980823  0.0271046  -3.619 0.000296 ***
## CustServ.Calls                     0.4760230  0.0418160  11.384  < 2e-16 ***
## Account.Length.group120 and above -0.0174273  0.1297663  -0.134 0.893168    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2209.0  on 2666  degrees of freedom
## Residual deviance: 1871.3  on 2657  degrees of freedom
## AIC: 1891.3
## 
## Number of Fisher Scoring iterations: 5

anova(LogModel, test="Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Churn
## 
## Terms added sequentially (first to last)
## 
## 
##                      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                  2666     2208.9              
## Int.l.Plan            1  147.687      2665     2061.3 < 2.2e-16 ***
## VMail.Plan            1   39.645      2664     2021.6 3.045e-10 ***
## VMail.Message         1    1.234      2663     2020.4 0.2667098    
## Day.Calls             1    0.090      2662     2020.3 0.7639656    
## Eve.Calls             1    0.047      2661     2020.2 0.8276522    
## Night.Calls           1    0.062      2660     2020.2 0.8035225    
## Intl.Calls            1   14.929      2659     2005.3 0.0001116 ***
## CustServ.Calls        1  133.960      2658     1871.3 < 2.2e-16 ***
## Account.Length.group  1    0.018      2657     1871.3 0.8931007    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

testing$Churn <- as.character(testing$Churn)
testing$Churn[testing$Churn=="No"]

## character(0)

testing$Churn[testing$Churn=="Yes"]

## character(0)

fitted.results <- predict(LogModel,newdata=testing,type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != testing$Churn)
print(paste('Logistic Regression Accuracy',1-misClasificError))

## [1] "Logistic Regression Accuracy 0"

print("Confusion Matrix for Logistic Regression"); table(testing$Churn, fitted.results > 0.5)

## [1] "Confusion Matrix for Logistic Regression"

##        
##         FALSE TRUE
##   FALSE   558   12
##   TRUE     82   14

library(MASS)
exp(cbind(OR=coef(LogModel), confint(LogModel)))

## Waiting for profiling to be done...

##                                           OR      2.5 %     97.5 %
## (Intercept)                       0.06639095 0.02243988  0.1937691
## Int.l.Planyes                     7.88223714 5.81341606 10.7011725
## VMail.Planyes                     0.23821581 0.06810138  0.7763525
## VMail.Message                     1.01611402 0.97760473  1.0566258
## Day.Calls                         1.00184652 0.99606778  1.0076840
## Eve.Calls                         1.00037051 0.99445917  1.0063284
## Night.Calls                       1.00159424 0.99563394  1.0075905
## Intl.Calls                        0.90657429 0.85885734  0.9551539
## CustServ.Calls                    1.60966008 1.48385355  1.7483754
## Account.Length.group120 and above 0.98272372 0.76006606  1.2645707

tree <- ctree(Churn~Int.l.Plan+VMail.Plan+CustServ.Calls+Intl.Calls+VMail.Message, training)
plot(tree)

pred_tree <- predict(tree, testing)
print("Confusion Matrix for Decision Tree"); table(Predicted = pred_tree, Actual = testing$Churn)

## [1] "Confusion Matrix for Decision Tree"

##          Actual
## Predicted FALSE TRUE
##     FALSE   538   63
##     TRUE     32   33

p1 <- predict(tree, training)
tab1 <- table(Predicted = p1, Actual = training$Churn)
tab2 <- table(Predicted = pred_tree, Actual = testing$Churn)
print(paste('Decision Tree Accuracy',sum(diag(tab2))/sum(tab2)))

## [1] "Decision Tree Accuracy 0.857357357357357"

rfModel <- randomForest(Churn ~., data = training)
print(rfModel)

## 
## Call:
##  randomForest(formula = Churn ~ ., data = training) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 12.3%
## Confusion matrix:
##       FALSE TRUE class.error
## FALSE  2224   56   0.0245614
## TRUE    272  115   0.7028424

pred_rf <- predict(rfModel, testing)

library(caret)
confusionMatrix(pred_rf, as.factor(testing$Churn))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   552   68
##      TRUE     18   28
##                                          
##                Accuracy : 0.8709         
##                  95% CI : (0.843, 0.8954)
##     No Information Rate : 0.8559         
##     P-Value [Acc > NIR] : 0.1469         
##                                          
##                   Kappa : 0.332          
##                                          
##  Mcnemar's Test P-Value : 1.265e-07      
##                                          
##             Sensitivity : 0.9684         
##             Specificity : 0.2917         
##          Pos Pred Value : 0.8903         
##          Neg Pred Value : 0.6087         
##              Prevalence : 0.8559         
##          Detection Rate : 0.8288         
##    Detection Prevalence : 0.9309         
##       Balanced Accuracy : 0.6300         
##                                          
##        'Positive' Class : FALSE          
##

plot(rfModel)

t <- tuneRF(training[, -9], training[, 9], stepFactor = .5, plot = TRUE, ntreeTry = 100, trace = TRUE, improve = 0.05)

## mtry = 3  OOB error = 12.26% 
## Searching left ...
## mtry = 6     OOB error = 13.01% 
## -0.06116208 0.05 
## Searching right ...
## mtry = 1     OOB error = 14.36% 
## -0.1712538 0.05

rfModel_new <- randomForest(Churn ~., data = training, ntree = 100, mtry = 3, importance = TRUE, proximity = TRUE)
print(rfModel_new)

## 
## Call:
##  randomForest(formula = Churn ~ ., data = training, ntree = 100,      mtry = 3, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 12.26%
## Confusion matrix:
##       FALSE TRUE class.error
## FALSE  2227   53  0.02324561
## TRUE    274  113  0.70801034

pred_rf_new <- predict(rfModel_new, testing)
confusionMatrix(pred_rf_new, as.factor(testing$Churn))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE   553   67
##      TRUE     17   29
##                                           
##                Accuracy : 0.8739          
##                  95% CI : (0.8462, 0.8981)
##     No Information Rate : 0.8559          
##     P-Value [Acc > NIR] : 0.1007          
##                                           
##                   Kappa : 0.3475          
##                                           
##  Mcnemar's Test P-Value : 8.975e-08       
##                                           
##             Sensitivity : 0.9702          
##             Specificity : 0.3021          
##          Pos Pred Value : 0.8919          
##          Neg Pred Value : 0.6304          
##              Prevalence : 0.8559          
##          Detection Rate : 0.8303          
##    Detection Prevalence : 0.9309          
##       Balanced Accuracy : 0.6361          
##                                           
##        'Positive' Class : FALSE           
##

varImpPlot(rfModel_new, sort=T, n.var = 3, main = 'Feature Importance')

churn_predictivemodel_method1.R

tariqm

2021-08-07