Read Library

library(tidyverse)
library(caret)
library(MASS)
library(class)
library(plotly)
library(kableExtra)

Explanation

Hi!! Welcome to my LBB in this LBB i’m gonna use Telco Customer Churn dataset and make a classification whether if its churn or not churn using logistic regression and K-Nearest Neighbor. Enjoy!!

Input Data

data <- read_csv("Telco-Customer-Churn.csv")

Data Inspections

(head(data))
dim(data)
#> [1] 7043   21
names(data)
#>  [1] "customerID"       "gender"           "SeniorCitizen"    "Partner"         
#>  [5] "Dependents"       "tenure"           "PhoneService"     "MultipleLines"   
#>  [9] "InternetService"  "OnlineSecurity"   "OnlineBackup"     "DeviceProtection"
#> [13] "TechSupport"      "StreamingTV"      "StreamingMovies"  "Contract"        
#> [17] "PaperlessBilling" "PaymentMethod"    "MonthlyCharges"   "TotalCharges"    
#> [21] "Churn"

Data Cleansing & Coertions

The data type still isn’t correct

glimpse(data)
#> Rows: 7,043
#> Columns: 21
#> $ customerID       <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOC…
#> $ gender           <chr> "Female", "Male", "Male", "Male", "Female", "Female"…
#> $ SeniorCitizen    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
#> $ Partner          <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Ye…
#> $ Dependents       <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No…
#> $ tenure           <dbl> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, …
#> $ PhoneService     <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No",…
#> $ MultipleLines    <chr> "No phone service", "No", "No", "No phone service", …
#> $ InternetService  <chr> "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber op…
#> $ OnlineSecurity   <chr> "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes", …
#> $ OnlineBackup     <chr> "Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "…
#> $ DeviceProtection <chr> "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "…
#> $ TechSupport      <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Ye…
#> $ StreamingTV      <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "No", "Y…
#> $ StreamingMovies  <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "Ye…
#> $ Contract         <chr> "Month-to-month", "One year", "Month-to-month", "One…
#> $ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No",…
#> $ PaymentMethod    <chr> "Electronic check", "Mailed check", "Mailed check", …
#> $ MonthlyCharges   <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.…
#> $ TotalCharges     <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 194…
#> $ Churn            <chr> "No", "No", "Yes", "No", "Yes", "Yes", "No", "No", "…

we’re not using CustomerID columm

data <- data %>% select(-customerID)

Change the data type to factor

data <- data %>% mutate_if(is_character, as_factor)

Lets check if there is any missing value

colSums(is.na(data))
#>           gender    SeniorCitizen          Partner       Dependents 
#>                0                0                0                0 
#>           tenure     PhoneService    MultipleLines  InternetService 
#>                0                0                0                0 
#>   OnlineSecurity     OnlineBackup DeviceProtection      TechSupport 
#>                0                0                0                0 
#>      StreamingTV  StreamingMovies         Contract PaperlessBilling 
#>                0                0                0                0 
#>    PaymentMethod   MonthlyCharges     TotalCharges            Churn 
#>                0                0               11                0
data[is.na(data)] <- 0
anyNA(data)
#> [1] FALSE

great!! there is no missing value

Cross Validation

Split data to data train and data test

set.seed(123)
index <- sample(nrow(data), 0.8*nrow(data))

data_train <- data[index,]
data_test <- data[-index,]

check proportion data train

prop.table(table(data_train$Churn))
#> 
#>        No       Yes 
#> 0.7305644 0.2694356

Check proportion data test

prop.table(table(data_test$Churn))
#> 
#>        No       Yes 
#> 0.7508872 0.2491128

Because the proportion is inbalance so we proceed to do downsampling for the data train

data_train_down <- downSample(x = data_train %>% select(-Churn),
                               y = data_train$Churn,
                               yname = "Churn")
prop.table(table(data_train_down$Churn))
#> 
#>  No Yes 
#> 0.5 0.5

Great! the proportion is now balance now we can proceed to build the model

Data Modeling

Logistic Regresion

Modeling

we use all the predictors available in the data

model_glm <- glm(Churn~.,data_train_down, family = "binomial")
summary(model_glm)
#> 
#> Call:
#> glm(formula = Churn ~ ., family = "binomial", data = data_train_down)
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -2.2191  -0.7729   0.1546   0.7645   3.0628  
#> 
#> Coefficients: (7 not defined because of singularities)
#>                                           Estimate  Std. Error z value
#> (Intercept)                             3.52054273  1.38026677   2.551
#> genderMale                             -0.04184073  0.09024822  -0.464
#> SeniorCitizen                           0.24157998  0.12150817   1.988
#> PartnerNo                              -0.01722219  0.10936909  -0.157
#> DependentsYes                          -0.18500418  0.12150220  -1.523
#> tenure                                 -0.05413926  0.00768639  -7.044
#> PhoneServiceYes                         1.07832884  1.14430260   0.942
#> MultipleLinesNo                        -0.45409564  0.25212014  -1.801
#> MultipleLinesYes                                NA          NA      NA
#> InternetServiceFiber optic              2.19940906  1.13675618   1.935
#> InternetServiceNo                      -2.47129737  1.37052324  -1.803
#> OnlineSecurityYes                      -0.10055271  0.25176066  -0.399
#> OnlineSecurityNo internet service               NA          NA      NA
#> OnlineBackupNo                         -0.16118902  0.25198318  -0.640
#> OnlineBackupNo internet service                 NA          NA      NA
#> DeviceProtectionYes                     0.23924528  0.25078268   0.954
#> DeviceProtectionNo internet service             NA          NA      NA
#> TechSupportYes                          0.02687458  0.25532682   0.105
#> TechSupportNo internet service                  NA          NA      NA
#> StreamingTVYes                          0.75148380  0.46659468   1.611
#> StreamingTVNo internet service                  NA          NA      NA
#> StreamingMoviesYes                      0.84013281  0.46327069   1.813
#> StreamingMoviesNo internet service              NA          NA      NA
#> ContractOne year                       -0.73878992  0.13851227  -5.334
#> ContractTwo year                       -1.57732657  0.22123086  -7.130
#> PaperlessBillingNo                     -0.37418187  0.10139684  -3.690
#> PaymentMethodMailed check              -0.29778960  0.13499433  -2.206
#> PaymentMethodBank transfer (automatic) -0.30394119  0.12836486  -2.368
#> PaymentMethodCredit card (automatic)   -0.42369733  0.13356329  -3.172
#> MonthlyCharges                         -0.06284044  0.04522148  -1.390
#> TotalCharges                            0.00033104  0.00008915   3.713
#>                                                Pr(>|z|)    
#> (Intercept)                                    0.010753 *  
#> genderMale                                     0.642921    
#> SeniorCitizen                                  0.046792 *  
#> PartnerNo                                      0.874876    
#> DependentsYes                                  0.127849    
#> tenure                                 0.00000000000187 ***
#> PhoneServiceYes                                0.346016    
#> MultipleLinesNo                                0.071686 .  
#> MultipleLinesYes                                     NA    
#> InternetServiceFiber optic                     0.053013 .  
#> InternetServiceNo                              0.071360 .  
#> OnlineSecurityYes                              0.689600    
#> OnlineSecurityNo internet service                    NA    
#> OnlineBackupNo                                 0.522380    
#> OnlineBackupNo internet service                      NA    
#> DeviceProtectionYes                            0.340086    
#> DeviceProtectionNo internet service                  NA    
#> TechSupportYes                                 0.916173    
#> TechSupportNo internet service                       NA    
#> StreamingTVYes                                 0.107273    
#> StreamingTVNo internet service                       NA    
#> StreamingMoviesYes                             0.069758 .  
#> StreamingMoviesNo internet service                   NA    
#> ContractOne year                       0.00000009620453 ***
#> ContractTwo year                       0.00000000000101 ***
#> PaperlessBillingNo                             0.000224 ***
#> PaymentMethodMailed check                      0.027388 *  
#> PaymentMethodBank transfer (automatic)         0.017895 *  
#> PaymentMethodCredit card (automatic)           0.001513 ** 
#> MonthlyCharges                                 0.164646    
#> TotalCharges                                   0.000205 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 4208.8  on 3035  degrees of freedom
#> Residual deviance: 2996.4  on 3012  degrees of freedom
#> AIC: 3044.4
#> 
#> Number of Fisher Scoring iterations: 5

Prediction

determine the threshold by looking at the distribution of churn

hist(model_glm$fitted.values)

We’re gonna test using threshold 0.5

pred_test <- predict(model_glm, data_test, type = "response")
pred_class <- ifelse(pred_test > 0.5, "Yes", "No") %>% 
  as.factor()
head(pred_class)
#>   1   2   3   4   5   6 
#> Yes  No Yes Yes  No  No 
#> Levels: No Yes
head(data_test$Churn)
#> [1] Yes No  No  No  No  No 
#> Levels: No Yes

Model Perfomance

confusionMatrix(pred_class,data_test$Churn, positive = "Yes")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  No Yes
#>        No  777  62
#>        Yes 281 289
#>                                              
#>                Accuracy : 0.7566             
#>                  95% CI : (0.7333, 0.7788)   
#>     No Information Rate : 0.7509             
#>     P-Value [Acc > NIR] : 0.3235             
#>                                              
#>                   Kappa : 0.4615             
#>                                              
#>  Mcnemar's Test P-Value : <0.0000000000000002
#>                                              
#>             Sensitivity : 0.8234             
#>             Specificity : 0.7344             
#>          Pos Pred Value : 0.5070             
#>          Neg Pred Value : 0.9261             
#>              Prevalence : 0.2491             
#>          Detection Rate : 0.2051             
#>    Detection Prevalence : 0.4045             
#>       Balanced Accuracy : 0.7789             
#>                                              
#>        'Positive' Class : Yes                
#> 

The recall value we get is 0.82

K-Nearest Neighbor

Feature Selection

in this model we’re gonna do one-hot encoding method to all factor type data in the predictors variable

data_scale <- data %>% select(-Churn)
dummy <- dummyVars(~gender + SeniorCitizen + Partner + Dependents + tenure + PhoneService + InternetService + Contract + MonthlyCharges + TotalCharges + PaymentMethod, data_scale, fullRank = T)

Change dummy class to data frame

dummy.df <- data.frame(predict(dummy, data_scale))
data_scale <- cbind("Churn" = data$Churn, dummy.df)
head(data_scale)

after we done doing one-hot encoding to our data, now we scale all the predictors variable

data_scale <-  data_scale %>% 
  mutate_if(is.numeric, scale)

Cross Validation

Split data to data train and data test

set.seed(123)
index <- sample(nrow(data_scale), 0.8*nrow(data_scale))

data_train_scale <- data_scale[index,]
data_test_scale <- data_scale[-index,]

Check the proportion data

prop.table(table(data_train_scale$Churn))
#> 
#>        No       Yes 
#> 0.7305644 0.2694356

Because the proportion is inbalance so we proceed to do downsampling for the data train

data_train_scale_down <- downSample(x = data_train_scale %>% select(-Churn),
                               y = data_train_scale$Churn,
                               yname = "Churn")
prop.table(table(data_train_scale_down$Churn))
#> 
#>  No Yes 
#> 0.5 0.5

Great! the proportion is now balance now we can proceed to build the model

k_choose <- sqrt(nrow(data_train_scale_down)) %>% round()

K = 55

Modeling

model_knn <- knn3(x = data_train_scale_down %>% select(-Churn),  # prediktor data train
                  y = data_train_scale_down$Churn,  # target variabel data train
                  k = k_choose # Jumlah K
                  )

Prediction

pred_knn <- predict(model_knn, newdata = data_test_scale %>% select(-Churn), type = "class")
head(pred_knn)
#> [1] Yes No  Yes Yes No  No 
#> Levels: No Yes
head(data_test_scale$Churn)
#> [1] Yes No  No  No  No  No 
#> Levels: No Yes

Model Performance

confusionMatrix(pred_knn, data_test_scale$Churn, positive = "Yes")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  No Yes
#>        No  728  58
#>        Yes 330 293
#>                                              
#>                Accuracy : 0.7246             
#>                  95% CI : (0.7005, 0.7478)   
#>     No Information Rate : 0.7509             
#>     P-Value [Acc > NIR] : 0.989              
#>                                              
#>                   Kappa : 0.4153             
#>                                              
#>  Mcnemar's Test P-Value : <0.0000000000000002
#>                                              
#>             Sensitivity : 0.8348             
#>             Specificity : 0.6881             
#>          Pos Pred Value : 0.4703             
#>          Neg Pred Value : 0.9262             
#>              Prevalence : 0.2491             
#>          Detection Rate : 0.2079             
#>    Detection Prevalence : 0.4422             
#>       Balanced Accuracy : 0.7614             
#>                                              
#>        'Positive' Class : Yes                
#> 

The recall value we get is 0.84

Conclusion

Recall Value :
- Logistic Regression is 82%
- K-Nearest Neighbors (with scalling) is 84%

Based on the result, it can be concluded that recall value from KNN model is higher than Logistic Regression Model.