dat<-read.csv("C:/Users/user/Downloads/archive (1)/TelcoCustomerChurn.csv", stringsAsFactors = TRUE)
dat <- dat[, !names(dat) %in% c("customerID", "gender")]
dat$SeniorCitizen = as.factor(dat$SeniorCitizen)
dat$tenure = as.numeric(dat$tenure)
str(dat)
## 'data.frame':    7043 obs. of  19 variables:
##  $ SeniorCitizen   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Partner         : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
##  $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
##  $ tenure          : num  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
##  $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
##  $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
##  $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
##  $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
##  $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
##  $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
##  $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
##  $ Contract        : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
##  $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
##  $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
summary(dat)
##  SeniorCitizen Partner    Dependents     tenure      PhoneService
##  0:5901        No :3641   No :4933   Min.   : 0.00   No : 682    
##  1:1142        Yes:3402   Yes:2110   1st Qu.: 9.00   Yes:6361    
##                                      Median :29.00               
##                                      Mean   :32.37               
##                                      3rd Qu.:55.00               
##                                      Max.   :72.00               
##                                                                  
##           MultipleLines     InternetService             OnlineSecurity
##  No              :3390   DSL        :2421   No                 :3498  
##  No phone service: 682   Fiber optic:3096   No internet service:1526  
##  Yes             :2971   No         :1526   Yes                :2019  
##                                                                       
##                                                                       
##                                                                       
##                                                                       
##               OnlineBackup             DeviceProtection
##  No                 :3088   No                 :3095   
##  No internet service:1526   No internet service:1526   
##  Yes                :2429   Yes                :2422   
##                                                        
##                                                        
##                                                        
##                                                        
##               TechSupport                StreamingTV  
##  No                 :3473   No                 :2810  
##  No internet service:1526   No internet service:1526  
##  Yes                :2044   Yes                :2707  
##                                                       
##                                                       
##                                                       
##                                                       
##             StreamingMovies           Contract    PaperlessBilling
##  No                 :2785   Month-to-month:3875   No :2872        
##  No internet service:1526   One year      :1473   Yes:4171        
##  Yes                :2732   Two year      :1695                   
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##                    PaymentMethod  MonthlyCharges    TotalCharges    Churn     
##  Bank transfer (automatic):1544   Min.   : 18.25   Min.   :  18.8   No :5174  
##  Credit card (automatic)  :1522   1st Qu.: 35.50   1st Qu.: 401.4   Yes:1869  
##  Electronic check         :2365   Median : 70.35   Median :1397.5             
##  Mailed check             :1612   Mean   : 64.76   Mean   :2283.3             
##                                   3rd Qu.: 89.85   3rd Qu.:3794.7             
##                                   Max.   :118.75   Max.   :8684.8             
##                                                    NA's   :11
library(corrplot)
## corrplot 0.92 loaded
num_cols <- sapply(dat, is.numeric)
cor_matrix <- cor(dat[, num_cols], use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8)

library(ggplot2)
# Histogram
ggplot(dat, aes(x = tenure)) + 
  geom_histogram(binwidth = 5, fill = "steelblue", color = "black") +
  labs(title = "Distribusi Tenure")

# Boxplot
ggplot(dat, aes(y = MonthlyCharges)) + 
  geom_boxplot(fill = "tomato") +
  labs(title = "Boxplot Monthly Charges")

# Scatter Plot
ggplot(dat, aes(x = tenure, y = MonthlyCharges, color = Churn)) +
  geom_point(alpha = 0.6) +
  labs(title = "Scatter Plot: Tenure vs Monthly Charges by Churn")

colSums(is.na(dat))
##    SeniorCitizen          Partner       Dependents           tenure 
##                0                0                0                0 
##     PhoneService    MultipleLines  InternetService   OnlineSecurity 
##                0                0                0                0 
##     OnlineBackup DeviceProtection      TechSupport      StreamingTV 
##                0                0                0                0 
##  StreamingMovies         Contract PaperlessBilling    PaymentMethod 
##                0                0                0                0 
##   MonthlyCharges     TotalCharges            Churn 
##                0               11                0
dat$TotalCharges <- median(dat$TotalCharges, na.rm = TRUE)
colSums(is.na(dat))
##    SeniorCitizen          Partner       Dependents           tenure 
##                0                0                0                0 
##     PhoneService    MultipleLines  InternetService   OnlineSecurity 
##                0                0                0                0 
##     OnlineBackup DeviceProtection      TechSupport      StreamingTV 
##                0                0                0                0 
##  StreamingMovies         Contract PaperlessBilling    PaymentMethod 
##                0                0                0                0 
##   MonthlyCharges     TotalCharges            Churn 
##                0                0                0
dat <- dat[!duplicated(dat), ]
sum(duplicated(dat))
## [1] 0
prop.table(table(dat$Churn)) * 100
## 
##       No      Yes 
## 73.58626 26.41374
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.4.3
## Loaded ROSE 0.0-4
data_balanced <- ROSE(Churn ~ ., data = dat, seed = 123)$dat
prop.table(table(data_balanced$Churn)) * 100
## 
##      No     Yes 
## 50.5655 49.4345
str(data_balanced)
## 'data.frame':    6985 obs. of  19 variables:
##  $ SeniorCitizen   : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
##  $ Partner         : Factor w/ 2 levels "No","Yes": 1 2 1 1 2 2 1 1 2 1 ...
##  $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
##  $ tenure          : num  19.3 42.1 76.2 14 63 ...
##  $ PhoneService    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 3 1 3 1 3 1 1 1 3 3 ...
##  $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 3 1 2 1 3 1 1 1 1 3 ...
##  $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 2 3 3 3 2 1 1 1 3 2 ...
##  $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 2 1 3 1 2 1 1 3 1 2 ...
##  $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 2 3 3 1 2 1 1 3 3 2 ...
##  $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 2 1 3 3 2 1 1 3 3 2 ...
##  $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 2 1 3 1 2 1 1 3 3 2 ...
##  $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 2 1 3 1 2 1 1 3 3 2 ...
##  $ Contract        : Factor w/ 3 levels "Month-to-month",..: 1 3 3 2 3 1 1 3 3 3 ...
##  $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 2 1 ...
##  $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 2 1 4 2 1 2 2 2 1 ...
##  $ MonthlyCharges  : num  11 52.4 124.3 61.8 25.7 ...
##  $ TotalCharges    : num  1397 1397 1397 1397 1397 ...
##  $ Churn           : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
library(class)
library(caret)
## Loading required package: lattice
set.seed(123) 
train_proportion = 0.7
 
train_index = createDataPartition(data_balanced$Churn, p = train_proportion, list = FALSE, times = 1)
train_set <- data_balanced[train_index, ]
test_set  <- data_balanced[-train_index, ]
sum(is.na(train_set))
## [1] 0
sum(is.na(test_set))
## [1] 0
str(train_set)
## 'data.frame':    4891 obs. of  19 variables:
##  $ SeniorCitizen   : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
##  $ Partner         : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 1 2 1 1 1 ...
##  $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ tenure          : num  19.31 42.06 14.02 62.97 4.81 ...
##  $ PhoneService    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 2 ...
##  $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 3 1 1 3 1 1 3 3 2 3 ...
##  $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 3 1 1 3 1 1 1 3 1 1 ...
##  $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 2 3 3 2 1 1 3 2 1 3 ...
##  $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 2 1 1 2 1 3 1 2 1 3 ...
##  $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 2 3 1 2 1 3 3 2 1 3 ...
##  $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 2 1 3 2 1 3 3 2 3 1 ...
##  $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 2 1 1 2 1 3 3 2 1 3 ...
##  $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 2 1 1 2 1 3 3 2 1 3 ...
##  $ Contract        : Factor w/ 3 levels "Month-to-month",..: 1 3 2 3 1 3 3 3 2 2 ...
##  $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 1 1 2 ...
##  $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 2 4 2 2 2 2 1 2 2 ...
##  $ MonthlyCharges  : num  11 52.4 61.8 25.7 50 ...
##  $ TotalCharges    : num  1397 1397 1397 1397 1397 ...
##  $ Churn           : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
str(test_set)
## 'data.frame':    2094 obs. of  19 variables:
##  $ SeniorCitizen   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Partner         : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 1 2 ...
##  $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 2 2 1 1 1 ...
##  $ tenure          : num  76.2 3.36 82.48 28.54 49.2 ...
##  $ PhoneService    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 2 ...
##  $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 3 1 3 1 3 1 3 1 2 3 ...
##  $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 2 1 1 1 3 1 2 1 1 3 ...
##  $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 2 3 1 3 1 2 ...
##  $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 2 3 3 3 1 2 ...
##  $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 2 1 3 1 1 2 ...
##  $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 2 3 1 1 1 2 ...
##  $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 2 1 3 3 1 2 ...
##  $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 2 3 3 3 1 2 ...
##  $ Contract        : Factor w/ 3 levels "Month-to-month",..: 3 1 3 1 3 1 3 2 1 3 ...
##  $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 1 1 1 2 1 2 2 2 2 2 ...
##  $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 1 1 2 2 4 2 1 2 2 2 ...
##  $ MonthlyCharges  : num  124.35 46.24 69.8 57.21 5.77 ...
##  $ TotalCharges    : num  1397 1397 1397 1397 1397 ...
##  $ Churn           : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
k=sqrt(NROW(data_balanced))
k
## [1] 83.57631
knn.1 <- knn(train=train_set[, c("tenure", "MonthlyCharges", "TotalCharges")], test=test_set[, c("tenure", "MonthlyCharges", "TotalCharges")], cl=train_set$Churn, k=floor(k))
# use confusion matrix to calculate accuracy
cf.1 <- confusionMatrix(test_set$Churn,knn.1) 
cf.1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  713 346
##        Yes 243 792
##                                           
##                Accuracy : 0.7187          
##                  95% CI : (0.6989, 0.7379)
##     No Information Rate : 0.5435          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.438           
##                                           
##  Mcnemar's Test P-Value : 2.636e-05       
##                                           
##             Sensitivity : 0.7458          
##             Specificity : 0.6960          
##          Pos Pred Value : 0.6733          
##          Neg Pred Value : 0.7652          
##              Prevalence : 0.4565          
##          Detection Rate : 0.3405          
##    Detection Prevalence : 0.5057          
##       Balanced Accuracy : 0.7209          
##                                           
##        'Positive' Class : No              
## 
knn.2 <- knn(train=train_set[, c("tenure", "MonthlyCharges", "TotalCharges")], test=test_set[, c("tenure", "MonthlyCharges", "TotalCharges")], cl=train_set$Churn, k=ceiling(k))
# use confusion matrix to calculate accuracy
cf.2 <- confusionMatrix(test_set$Churn,knn.2) 
cf.2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  711 348
##        Yes 248 787
##                                           
##                Accuracy : 0.7154          
##                  95% CI : (0.6955, 0.7346)
##     No Information Rate : 0.542           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4313          
##                                           
##  Mcnemar's Test P-Value : 5.009e-05       
##                                           
##             Sensitivity : 0.7414          
##             Specificity : 0.6934          
##          Pos Pred Value : 0.6714          
##          Neg Pred Value : 0.7604          
##              Prevalence : 0.4580          
##          Detection Rate : 0.3395          
##    Detection Prevalence : 0.5057          
##       Balanced Accuracy : 0.7174          
##                                           
##        'Positive' Class : No              
## 
library(caret)
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
library(rpart)
library(rpart.plot)
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(ISLR)
set.seed(123) 
churn_class <- rpart(formula = Churn ~ ., data = train_set, method = "class") 
churn_class$control
## $minsplit
## [1] 20
## 
## $minbucket
## [1] 7
## 
## $cp
## [1] 0.01
## 
## $maxcompete
## [1] 4
## 
## $maxsurrogate
## [1] 5
## 
## $usesurrogate
## [1] 2
## 
## $surrogatestyle
## [1] 0
## 
## $maxdepth
## [1] 30
## 
## $xval
## [1] 10
rpart.plot(churn_class, yesno = TRUE)

prediksi_test <- predict(churn_class, test_set, type = "class")
plot(test_set$Churn, prediksi_test, 
     main = "Simple Classification: Predicted vs. Actual",
     xlab = "Actual",
     ylab = "Predicted")

confusionMatrix(data = prediksi_test, reference = test_set$Churn) 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  724 212
##        Yes 335 823
##                                           
##                Accuracy : 0.7388          
##                  95% CI : (0.7194, 0.7575)
##     No Information Rate : 0.5057          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4782          
##                                           
##  Mcnemar's Test P-Value : 1.825e-07       
##                                           
##             Sensitivity : 0.6837          
##             Specificity : 0.7952          
##          Pos Pred Value : 0.7735          
##          Neg Pred Value : 0.7107          
##              Prevalence : 0.5057          
##          Detection Rate : 0.3457          
##    Detection Prevalence : 0.4470          
##       Balanced Accuracy : 0.7394          
##                                           
##        'Positive' Class : No              
## 
library(knitr)
library(tidyr)
library(caret)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
library(ROCR)
NBClassifier <- naiveBayes(Churn ~., data = train_set) 
NBClassifier 
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##        No       Yes 
## 0.5056226 0.4943774 
## 
## Conditional probabilities:
##      SeniorCitizen
## Y             0         1
##   No  0.8697938 0.1302062
##   Yes 0.7419355 0.2580645
## 
##      Partner
## Y            No       Yes
##   No  0.4690659 0.5309341
##   Yes 0.6368900 0.3631100
## 
##      Dependents
## Y            No       Yes
##   No  0.6595228 0.3404772
##   Yes 0.8283706 0.1716294
## 
##      tenure
## Y         [,1]     [,2]
##   No  37.96104 25.01407
##   Yes 18.20087 20.38663
## 
##      PhoneService
## Y             No        Yes
##   No  0.09987869 0.90012131
##   Yes 0.09181141 0.90818859
## 
##      MultipleLines
## Y             No No phone service        Yes
##   No  0.47957946       0.09987869 0.42054185
##   Yes 0.43465674       0.09181141 0.47353184
## 
##      InternetService
## Y            DSL Fiber optic         No
##   No  0.38576628  0.35341690 0.26081682
##   Yes 0.24855252  0.69437552 0.05707196
## 
##      OnlineSecurity
## Y             No No internet service        Yes
##   No  0.40113223          0.26081682 0.33805095
##   Yes 0.78370554          0.05707196 0.15922250
## 
##      OnlineBackup
## Y             No No internet service        Yes
##   No  0.36554792          0.26081682 0.37363526
##   Yes 0.65012407          0.05707196 0.29280397
## 
##      DeviceProtection
## Y             No No internet service        Yes
##   No  0.36190861          0.26081682 0.37727457
##   Yes 0.65136476          0.05707196 0.29156328
## 
##      TechSupport
## Y             No No internet service        Yes
##   No  0.39142742          0.26081682 0.34775576
##   Yes 0.77005790          0.05707196 0.17287014
## 
##      StreamingTV
## Y             No No internet service        Yes
##   No  0.35584311          0.26081682 0.38334007
##   Yes 0.50289495          0.05707196 0.44003309
## 
##      StreamingMovies
## Y             No No internet service        Yes
##   No  0.35503437          0.26081682 0.38414881
##   Yes 0.51157982          0.05707196 0.43134822
## 
##      Contract
## Y     Month-to-month   One year   Two year
##   No      0.43914274 0.25030328 0.31055398
##   Yes     0.88751034 0.08602151 0.02646816
## 
##      PaperlessBilling
## Y            No       Yes
##   No  0.4577436 0.5422564
##   Yes 0.2518610 0.7481390
## 
##      PaymentMethod
## Y     Bank transfer (automatic) Credit card (automatic) Electronic check
##   No                  0.2547513               0.2511120        0.2527295
##   Yes                 0.1381307               0.1261373        0.5636890
##      PaymentMethod
## Y     Mailed check
##   No     0.2414072
##   Yes    0.1720430
## 
##      MonthlyCharges
## Y         [,1]     [,2]
##   No  62.31154 32.22484
##   Yes 74.67350 25.82264
## 
##      TotalCharges
## Y         [,1] [,2]
##   No  1397.475    0
##   Yes 1397.475    0
test_set$predicted <- predict(NBClassifier,test_set)
test_set$actual <- test_set$Churn
confusionMatrix(factor(test_set$predicted),
                factor(test_set$actual))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  655 158
##        Yes 404 877
##                                           
##                Accuracy : 0.7316          
##                  95% CI : (0.7121, 0.7505)
##     No Information Rate : 0.5057          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4646          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6185          
##             Specificity : 0.8473          
##          Pos Pred Value : 0.8057          
##          Neg Pred Value : 0.6846          
##              Prevalence : 0.5057          
##          Detection Rate : 0.3128          
##    Detection Prevalence : 0.3883          
##       Balanced Accuracy : 0.7329          
##                                           
##        'Positive' Class : No              
##