The objective is to build a machine learning model that predicts whether a customer churns (Yes/No). The data is imported from the telecom dataset.
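
The chunks below assume the following packages are attached; the data import is not shown in the original, so the file name here is a placeholder:

library(plyr)          # mapvalues (attach before dplyr to avoid masking)
library(tidyverse)     # glimpse, mutate_if, map, %>%
library(pillar)        # dim_desc
library(caret)         # createDataPartition, confusionMatrix
library(ROCR)          # prediction, performance
library(rpart)         # decision tree
library(rpart.plot)    # rpart.plot
library(randomForest)  # randomForest, varImpPlot

Telecom <- read_csv("telecom.csv") # hypothetical file name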

#Exploratory Analysis

Exploring the dimensions and column names.

dim_desc(Telecom)
## [1] "[1,200 x 12]"
names(Telecom)
##  [1] "X1"                 "Customer.ID"        "Years.customer"    
##  [4] "Months.customer"    "Minutes.in.2018"    "Minutes.onnet"     
##  [7] "Minutes.offnet"     "Number.of.SMS"      "KBs.used"          
## [10] "Total.Unique.Calls" "Previous.provider"  "Churn.Status"

Glimpsing the feature details of the first record.

glimpse(head(Telecom, n=1))
## Rows: 1
## Columns: 12
## $ X1                 <dbl> 1
## $ Customer.ID        <chr> "ADF1259"
## $ Years.customer     <dbl> 3805
## $ Months.customer    <dbl> 126.83
## $ Minutes.in.2018    <dbl> 4091.616
## $ Minutes.onnet      <dbl> 1436.325
## $ Minutes.offnet     <dbl> 2655.291
## $ Number.of.SMS      <dbl> 81
## $ KBs.used           <dbl> 3624.375
## $ Total.Unique.Calls <dbl> 117
## $ Previous.provider  <chr> "KPN"
## $ Churn.Status       <dbl> 0

#Cleaning Data

Converting character variables to factors.

df <- Telecom %>% mutate_if(is.character, as.factor)

Inspecting missing values (NA).

df %>% map(~ sum(is.na(.)))
## $X1
## [1] 0
## 
## $Customer.ID
## [1] 0
## 
## $Years.customer
## [1] 0
## 
## $Months.customer
## [1] 0
## 
## $Minutes.in.2018
## [1] 0
## 
## $Minutes.onnet
## [1] 0
## 
## $Minutes.offnet
## [1] 0
## 
## $Number.of.SMS
## [1] 0
## 
## $KBs.used
## [1] 0
## 
## $Total.Unique.Calls
## [1] 0
## 
## $Previous.provider
## [1] 6
## 
## $Churn.Status
## [1] 0

df$Previous.provider has 6 missing values, so the column will be removed. Customer.ID is unique per customer and therefore not a useful feature, so it will be removed as well.

df <- df %>% select(-Customer.ID, -Previous.provider)
head(df)
## # A tibble: 6 x 10
##      X1 Years.customer Months.customer Minutes.in.2018 Minutes.onnet
##   <dbl>          <dbl>           <dbl>           <dbl>         <dbl>
## 1     1           3805           127.            4092.         1436.
## 2     2           2905            96.8           3179.         1950.
## 3     3           3819           127.            7233.          949.
## 4     4           3643           121.            7336.         1765.
## 5     5           3809           127.            5391.         3519.
## 6     6           2260            75.3            901.          160.
## # … with 5 more variables: Minutes.offnet <dbl>, Number.of.SMS <dbl>,
## #   KBs.used <dbl>, Total.Unique.Calls <dbl>, Churn.Status <dbl>
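
If Previous.provider were expected to carry signal, an alternative (not used here) would be to keep the column and drop the six incomplete rows instead, e.g. with tidyr:

# Alternative: keep the column, drop the 6 rows with missing Previous.provider
# Telecom %>% drop_na(Previous.provider)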

Changing Churn.Status values from 0/1 to No/Yes.

df$Churn.Status <- as.factor(mapvalues(df$Churn.Status,
                                       from = c(0, 1),
                                       to = c("No", "Yes")))
head(df)
## # A tibble: 6 x 10
##      X1 Years.customer Months.customer Minutes.in.2018 Minutes.onnet
##   <dbl>          <dbl>           <dbl>           <dbl>         <dbl>
## 1     1           3805           127.            4092.         1436.
## 2     2           2905            96.8           3179.         1950.
## 3     3           3819           127.            7233.          949.
## 4     4           3643           121.            7336.         1765.
## 5     5           3809           127.            5391.         3519.
## 6     6           2260            75.3            901.          160.
## # … with 5 more variables: Minutes.offnet <dbl>, Number.of.SMS <dbl>,
## #   KBs.used <dbl>, Total.Unique.Calls <dbl>, Churn.Status <fct>
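
As an aside, the same recode can be done in base R without the plyr dependency (applied to the original 0/1 column, so not run here):

# Base-R equivalent of the mapvalues() recode
# df$Churn.Status <- factor(df$Churn.Status, levels = c(0, 1), labels = c("No", "Yes"))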

#Partitioning

Splitting the data into a 75% training set and a 25% test set (hold-out validation).

set.seed(1)
inTrain <- createDataPartition(y = df$Churn.Status, p = 0.75, list = FALSE)
# createDataPartition returns a matrix; convert to a vector for tibble-safe indexing
train <- df[as.vector(inTrain), ]
test  <- df[-as.vector(inTrain), ]
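
createDataPartition stratifies on the outcome, so both sets should preserve the overall churn rate; a quick check:

# Verify the class balance is preserved in both splits
prop.table(table(train$Churn.Status))
prop.table(table(test$Churn.Status))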

#GLM

Fitting a logistic regression model (binomial GLM).

fit <- glm(Churn.Status~., data=train, family=binomial)
fit
## 
## Call:  glm(formula = Churn.Status ~ ., family = binomial, data = train)
## 
## Coefficients:
##        (Intercept)                  X1      Years.customer     Months.customer  
##          4.196e-01           3.383e-06           3.889e-03          -1.212e-01  
##    Minutes.in.2018       Minutes.onnet      Minutes.offnet       Number.of.SMS  
##         -3.699e+05           3.699e+05           3.699e+05           5.396e-03  
##           KBs.used  Total.Unique.Calls  
##         -5.535e-08          -1.268e-03  
## 
## Degrees of Freedom: 900 Total (i.e. Null);  891 Residual
## Null Deviance:       1249 
## Residual Deviance: 1150  AIC: 1170
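
Note the extreme, offsetting coefficients on the three minute variables (±3.699e+05): in the data, Minutes.in.2018 appears to equal Minutes.onnet + Minutes.offnet (row 1: 1436.325 + 2655.291 = 4091.616), so these predictors are essentially perfectly collinear and their individual estimates are not interpretable. A quick check of the suspected identity:

# Check whether total minutes is the sum of on-net and off-net minutes
all.equal(df$Minutes.in.2018, df$Minutes.onnet + df$Minutes.offnet)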

Making predictions on the test set. The character responses are then converted to factor type so that the encoding matches the reference labels in the evaluation.

set.seed(1)
churn.probs <- predict(fit, test, type="response")
head(churn.probs, n = 10)
##          1          2          3          4          5          6          7 
## 0.52975201 0.43020542 0.39418607 0.27744830 0.59813793 0.23279180 0.62859986 
##          8          9         10 
## 0.04271687 0.56925716 0.60507800
glm.pred <- rep("No", length(churn.probs))
glm.pred[churn.probs > 0.4] <- "Yes" # classify as "Yes" above the 0.4 probability threshold
glm.pred <- as.factor(glm.pred)
head(glm.pred, n=5)
## [1] Yes Yes No  No  Yes
## Levels: No Yes
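
The 0.4 cutoff is a modelling choice rather than a given; a sketch of how test-set accuracy varies over candidate thresholds:

# Scan candidate thresholds and compute test-set accuracy for each
thresholds <- seq(0.1, 0.9, by = 0.05)
accuracy <- sapply(thresholds, function(t) {
  pred <- ifelse(churn.probs > t, "Yes", "No")
  mean(pred == test$Churn.Status)
})
plot(thresholds, accuracy, type = "b", xlab = "Threshold", ylab = "Accuracy")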

Evaluating the model with a confusion matrix.

confusionMatrix(glm.pred, test$Churn.Status, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No   64  14
##        Yes  88 133
##                                           
##                Accuracy : 0.6589          
##                  95% CI : (0.6021, 0.7125)
##     No Information Rate : 0.5084          
##     P-Value [Acc > NIR] : 1.028e-07       
##                                           
##                   Kappa : 0.3231          
##                                           
##  Mcnemar's Test P-Value : 4.899e-13       
##                                           
##             Sensitivity : 0.9048          
##             Specificity : 0.4211          
##          Pos Pred Value : 0.6018          
##          Neg Pred Value : 0.8205          
##              Prevalence : 0.4916          
##          Detection Rate : 0.4448          
##    Detection Prevalence : 0.7391          
##       Balanced Accuracy : 0.6629          
##                                           
##        'Positive' Class : Yes             
## 

Plotting the ROC curve and computing the AUC.

#create prediction object
pr <- prediction(churn.probs, test$Churn.Status)

#plot curve
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

# AUC value
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.7188955
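
Optionally, the curve can be redrawn with a random-classifier reference line and the AUC in the title:

# Redraw with the chance diagonal and the AUC in the title
plot(prf, main = paste0("ROC curve (AUC = ", round(auc, 3), ")"))
abline(a = 0, b = 1, lty = 2)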

#Decision Tree

Training the model and plotting the fitted tree.

set.seed(35)
# Train model
tree.model <- rpart(Churn.Status ~ .,
                    data = train,
                    method = "class",
                    control = rpart.control(xval = 10))

# Plot
rpart.plot(tree.model)
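
The xval = 10 control makes rpart run 10-fold cross-validation internally; the resulting complexity table can be inspected and, if desired, the tree pruned at the cp with the lowest cross-validated error (a sketch):

# Inspect cross-validated error per complexity parameter
printcp(tree.model)

# Optionally prune at the cp with the lowest xerror
best.cp <- tree.model$cptable[which.min(tree.model$cptable[, "xerror"]), "CP"]
pruned  <- prune(tree.model, cp = best.cp)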

Evaluating model metrics. Note that without a positive argument, confusionMatrix() uses the first factor level ("No") as the positive class here, unlike the GLM evaluation above where positive = "Yes" was set.

tree.pred      <- predict(tree.model, newdata = test, type = "class")
tree.result    <- confusionMatrix(data = tree.pred, test$Churn.Status)
tree.result
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  128  37
##        Yes  24 110
##                                           
##                Accuracy : 0.796           
##                  95% CI : (0.7458, 0.8402)
##     No Information Rate : 0.5084          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5913          
##                                           
##  Mcnemar's Test P-Value : 0.1244          
##                                           
##             Sensitivity : 0.8421          
##             Specificity : 0.7483          
##          Pos Pred Value : 0.7758          
##          Neg Pred Value : 0.8209          
##              Prevalence : 0.5084          
##          Detection Rate : 0.4281          
##    Detection Prevalence : 0.5518          
##       Balanced Accuracy : 0.7952          
##                                           
##        'Positive' Class : No              
## 
tree.precision <- tree.result$byClass['Pos Pred Value']
tree.precision
## Pos Pred Value 
##      0.7757576
tree.recall    <- tree.result$byClass['Sensitivity']
tree.recall
## Sensitivity 
##   0.8421053
tree.F1        <- tree.result$byClass['F1']
tree.F1
##       F1 
## 0.807571

#Random Forest

Training the model with 200 trees.

set.seed(7)
# Train model
forest.model <- randomForest(Churn.Status ~ .,
                             data = train,
                             ntree = 200) # number of trees; a factor response implies classification

# See error reduction with the number of trees (not much gained beyond ~25 trees)
plot(forest.model)
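
Since the response is a factor, randomForest performs classification automatically. The mtry parameter (variables tried per split) could optionally be tuned on the out-of-bag error; a sketch with tuneRF:

# Optional: tune mtry using the OOB error
set.seed(7)
tuneRF(x = train[, names(train) != "Churn.Status"],
       y = train$Churn.Status,
       ntreeTry = 200, stepFactor = 1.5, improve = 0.01)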

Plotting variable importance for the random forest.

varImpPlot(forest.model, sort = T, main="Variable Importance")

Evaluating model metrics.

forest.pred      <- predict(forest.model, newdata = test, type = "class")
forest.result    <- confusionMatrix(data = forest.pred, test$Churn.Status)
forest.result
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  132  30
##        Yes  20 117
##                                           
##                Accuracy : 0.8328          
##                  95% CI : (0.7856, 0.8733)
##     No Information Rate : 0.5084          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6651          
##                                           
##  Mcnemar's Test P-Value : 0.2031          
##                                           
##             Sensitivity : 0.8684          
##             Specificity : 0.7959          
##          Pos Pred Value : 0.8148          
##          Neg Pred Value : 0.8540          
##              Prevalence : 0.5084          
##          Detection Rate : 0.4415          
##    Detection Prevalence : 0.5418          
##       Balanced Accuracy : 0.8322          
##                                           
##        'Positive' Class : No              
## 
forest.precision <- forest.result$byClass['Pos Pred Value']
forest.precision
## Pos Pred Value 
##      0.8148148
forest.recall    <- forest.result$byClass['Sensitivity']
forest.recall
## Sensitivity 
##   0.8684211
forest.F1        <- forest.result$byClass['F1']
forest.F1
##        F1 
## 0.8407643
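
To wrap up, the accuracies computed above can be pulled into one table. Note that the tree and forest matrices used "No" as the positive class while the GLM evaluation used "Yes", so precision and recall are not directly comparable across all three, but accuracy is. A sketch:

# Side-by-side accuracy of the three models
glm.result <- confusionMatrix(glm.pred, test$Churn.Status, positive = "Yes")
data.frame(model    = c("GLM", "Decision tree", "Random forest"),
           accuracy = c(glm.result$overall["Accuracy"],
                        tree.result$overall["Accuracy"],
                        forest.result$overall["Accuracy"]))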