dat<-read.csv("C:/Users/user/Downloads/archive (1)/TelcoCustomerChurn.csv", stringsAsFactors = TRUE)
dat <- dat[, !names(dat) %in% c("customerID", "gender")]
dat$SeniorCitizen = as.factor(dat$SeniorCitizen)
dat$tenure = as.numeric(dat$tenure)
str(dat)
## 'data.frame': 7043 obs. of 19 variables:
## $ SeniorCitizen : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
## $ tenure : num 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
summary(dat)
## SeniorCitizen Partner Dependents tenure PhoneService
## 0:5901 No :3641 No :4933 Min. : 0.00 No : 682
## 1:1142 Yes:3402 Yes:2110 1st Qu.: 9.00 Yes:6361
## Median :29.00
## Mean :32.37
## 3rd Qu.:55.00
## Max. :72.00
##
## MultipleLines InternetService OnlineSecurity
## No :3390 DSL :2421 No :3498
## No phone service: 682 Fiber optic:3096 No internet service:1526
## Yes :2971 No :1526 Yes :2019
##
##
##
##
## OnlineBackup DeviceProtection
## No :3088 No :3095
## No internet service:1526 No internet service:1526
## Yes :2429 Yes :2422
##
##
##
##
## TechSupport StreamingTV
## No :3473 No :2810
## No internet service:1526 No internet service:1526
## Yes :2044 Yes :2707
##
##
##
##
## StreamingMovies Contract PaperlessBilling
## No :2785 Month-to-month:3875 No :2872
## No internet service:1526 One year :1473 Yes:4171
## Yes :2732 Two year :1695
##
##
##
##
## PaymentMethod MonthlyCharges TotalCharges Churn
## Bank transfer (automatic):1544 Min. : 18.25 Min. : 18.8 No :5174
## Credit card (automatic) :1522 1st Qu.: 35.50 1st Qu.: 401.4 Yes:1869
## Electronic check :2365 Median : 70.35 Median :1397.5
## Mailed check :1612 Mean : 64.76 Mean :2283.3
## 3rd Qu.: 89.85 3rd Qu.:3794.7
## Max. :118.75 Max. :8684.8
## NA's :11
library(corrplot)
## corrplot 0.92 loaded
num_cols <- sapply(dat, is.numeric)
cor_matrix <- cor(dat[, num_cols], use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8)

library(ggplot2)
# Histogram
ggplot(dat, aes(x = tenure)) +
geom_histogram(binwidth = 5, fill = "steelblue", color = "black") +
labs(title = "Distribusi Tenure")

# Boxplot
ggplot(dat, aes(y = MonthlyCharges)) +
geom_boxplot(fill = "tomato") +
labs(title = "Boxplot Monthly Charges")

# Scatter Plot
ggplot(dat, aes(x = tenure, y = MonthlyCharges, color = Churn)) +
geom_point(alpha = 0.6) +
labs(title = "Scatter Plot: Tenure vs Monthly Charges by Churn")

colSums(is.na(dat))
## SeniorCitizen Partner Dependents tenure
## 0 0 0 0
## PhoneService MultipleLines InternetService OnlineSecurity
## 0 0 0 0
## OnlineBackup DeviceProtection TechSupport StreamingTV
## 0 0 0 0
## StreamingMovies Contract PaperlessBilling PaymentMethod
## 0 0 0 0
## MonthlyCharges TotalCharges Churn
## 0 11 0
dat$TotalCharges <- median(dat$TotalCharges, na.rm = TRUE)
colSums(is.na(dat))
## SeniorCitizen Partner Dependents tenure
## 0 0 0 0
## PhoneService MultipleLines InternetService OnlineSecurity
## 0 0 0 0
## OnlineBackup DeviceProtection TechSupport StreamingTV
## 0 0 0 0
## StreamingMovies Contract PaperlessBilling PaymentMethod
## 0 0 0 0
## MonthlyCharges TotalCharges Churn
## 0 0 0
dat <- dat[!duplicated(dat), ]
sum(duplicated(dat))
## [1] 0
prop.table(table(dat$Churn)) * 100
##
## No Yes
## 73.58626 26.41374
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.4.3
## Loaded ROSE 0.0-4
data_balanced <- ROSE(Churn ~ ., data = dat, seed = 123)$dat
prop.table(table(data_balanced$Churn)) * 100
##
## No Yes
## 50.5655 49.4345
str(data_balanced)
## 'data.frame': 6985 obs. of 19 variables:
## $ SeniorCitizen : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 1 2 1 1 2 2 1 1 2 1 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
## $ tenure : num 19.3 42.1 76.2 14 63 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 3 1 3 1 3 1 1 1 3 3 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 3 1 2 1 3 1 1 1 1 3 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 2 3 3 3 2 1 1 1 3 2 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 2 1 3 1 2 1 1 3 1 2 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 2 3 3 1 2 1 1 3 3 2 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 2 1 3 3 2 1 1 3 3 2 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 2 1 3 1 2 1 1 3 3 2 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 2 1 3 1 2 1 1 3 3 2 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 3 3 2 3 1 1 3 3 3 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 2 2 1 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 2 1 4 2 1 2 2 2 1 ...
## $ MonthlyCharges : num 11 52.4 124.3 61.8 25.7 ...
## $ TotalCharges : num 1397 1397 1397 1397 1397 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
library(class)
library(caret)
## Loading required package: lattice
set.seed(123)
train_proportion = 0.7
train_index = createDataPartition(data_balanced$Churn, p = train_proportion, list = FALSE, times = 1)
train_set <- data_balanced[train_index, ]
test_set <- data_balanced[-train_index, ]
sum(is.na(train_set))
## [1] 0
sum(is.na(test_set))
## [1] 0
str(train_set)
## 'data.frame': 4891 obs. of 19 variables:
## $ SeniorCitizen : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 1 2 1 2 1 1 2 1 1 1 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ tenure : num 19.31 42.06 14.02 62.97 4.81 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 3 1 1 3 1 1 3 3 2 3 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 3 1 1 3 1 1 1 3 1 1 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 2 3 3 2 1 1 3 2 1 3 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 2 1 1 2 1 3 1 2 1 3 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 2 3 1 2 1 3 3 2 1 3 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 2 1 3 2 1 3 3 2 3 1 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 2 1 1 2 1 3 3 2 1 3 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 2 1 1 2 1 3 3 2 1 3 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 3 2 3 1 3 3 3 2 2 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 1 1 1 1 2 2 2 1 1 2 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 4 2 4 2 2 2 2 1 2 2 ...
## $ MonthlyCharges : num 11 52.4 61.8 25.7 50 ...
## $ TotalCharges : num 1397 1397 1397 1397 1397 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
str(test_set)
## 'data.frame': 2094 obs. of 19 variables:
## $ SeniorCitizen : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 1 2 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 2 2 1 1 1 ...
## $ tenure : num 76.2 3.36 82.48 28.54 49.2 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 1 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 3 1 3 1 3 1 3 1 2 3 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 2 1 1 1 3 1 2 1 1 3 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 2 3 1 3 1 2 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 2 3 3 3 1 2 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 2 1 3 1 1 2 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 2 3 1 1 1 2 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 2 1 3 3 1 2 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 3 1 1 1 2 3 3 3 1 2 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 3 1 3 1 3 1 3 2 1 3 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 1 1 1 2 1 2 2 2 2 2 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 1 1 2 2 4 2 1 2 2 2 ...
## $ MonthlyCharges : num 124.35 46.24 69.8 57.21 5.77 ...
## $ TotalCharges : num 1397 1397 1397 1397 1397 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
k=sqrt(NROW(data_balanced))
k
## [1] 83.57631
knn.1 <- knn(train=train_set[, c("tenure", "MonthlyCharges", "TotalCharges")], test=test_set[, c("tenure", "MonthlyCharges", "TotalCharges")], cl=train_set$Churn, k=floor(k))
# use confusion matrix to calculate accuracy
cf.1 <- confusionMatrix(test_set$Churn,knn.1)
cf.1
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 713 346
## Yes 243 792
##
## Accuracy : 0.7187
## 95% CI : (0.6989, 0.7379)
## No Information Rate : 0.5435
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.438
##
## Mcnemar's Test P-Value : 2.636e-05
##
## Sensitivity : 0.7458
## Specificity : 0.6960
## Pos Pred Value : 0.6733
## Neg Pred Value : 0.7652
## Prevalence : 0.4565
## Detection Rate : 0.3405
## Detection Prevalence : 0.5057
## Balanced Accuracy : 0.7209
##
## 'Positive' Class : No
##
knn.2 <- knn(train=train_set[, c("tenure", "MonthlyCharges", "TotalCharges")], test=test_set[, c("tenure", "MonthlyCharges", "TotalCharges")], cl=train_set$Churn, k=ceiling(k))
# use confusion matrix to calculate accuracy
cf.2 <- confusionMatrix(test_set$Churn,knn.2)
cf.2
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 711 348
## Yes 248 787
##
## Accuracy : 0.7154
## 95% CI : (0.6955, 0.7346)
## No Information Rate : 0.542
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4313
##
## Mcnemar's Test P-Value : 5.009e-05
##
## Sensitivity : 0.7414
## Specificity : 0.6934
## Pos Pred Value : 0.6714
## Neg Pred Value : 0.7604
## Prevalence : 0.4580
## Detection Rate : 0.3395
## Detection Prevalence : 0.5057
## Balanced Accuracy : 0.7174
##
## 'Positive' Class : No
##
library(caret)
library(DescTools)
##
## Attaching package: 'DescTools'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
library(rpart)
library(rpart.plot)
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(ISLR)
set.seed(123)
churn_class <- rpart(formula = Churn ~ ., data = train_set, method = "class")
churn_class$control
## $minsplit
## [1] 20
##
## $minbucket
## [1] 7
##
## $cp
## [1] 0.01
##
## $maxcompete
## [1] 4
##
## $maxsurrogate
## [1] 5
##
## $usesurrogate
## [1] 2
##
## $surrogatestyle
## [1] 0
##
## $maxdepth
## [1] 30
##
## $xval
## [1] 10
rpart.plot(churn_class, yesno = TRUE)

prediksi_test <- predict(churn_class, test_set, type = "class")
plot(test_set$Churn, prediksi_test,
main = "Simple Classification: Predicted vs. Actual",
xlab = "Actual",
ylab = "Predicted")

confusionMatrix(data = prediksi_test, reference = test_set$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 724 212
## Yes 335 823
##
## Accuracy : 0.7388
## 95% CI : (0.7194, 0.7575)
## No Information Rate : 0.5057
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4782
##
## Mcnemar's Test P-Value : 1.825e-07
##
## Sensitivity : 0.6837
## Specificity : 0.7952
## Pos Pred Value : 0.7735
## Neg Pred Value : 0.7107
## Prevalence : 0.5057
## Detection Rate : 0.3457
## Detection Prevalence : 0.4470
## Balanced Accuracy : 0.7394
##
## 'Positive' Class : No
##
library(knitr)
library(tidyr)
library(caret)
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
library(ROCR)
NBClassifier <- naiveBayes(Churn ~., data = train_set)
NBClassifier
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## No Yes
## 0.5056226 0.4943774
##
## Conditional probabilities:
## SeniorCitizen
## Y 0 1
## No 0.8697938 0.1302062
## Yes 0.7419355 0.2580645
##
## Partner
## Y No Yes
## No 0.4690659 0.5309341
## Yes 0.6368900 0.3631100
##
## Dependents
## Y No Yes
## No 0.6595228 0.3404772
## Yes 0.8283706 0.1716294
##
## tenure
## Y [,1] [,2]
## No 37.96104 25.01407
## Yes 18.20087 20.38663
##
## PhoneService
## Y No Yes
## No 0.09987869 0.90012131
## Yes 0.09181141 0.90818859
##
## MultipleLines
## Y No No phone service Yes
## No 0.47957946 0.09987869 0.42054185
## Yes 0.43465674 0.09181141 0.47353184
##
## InternetService
## Y DSL Fiber optic No
## No 0.38576628 0.35341690 0.26081682
## Yes 0.24855252 0.69437552 0.05707196
##
## OnlineSecurity
## Y No No internet service Yes
## No 0.40113223 0.26081682 0.33805095
## Yes 0.78370554 0.05707196 0.15922250
##
## OnlineBackup
## Y No No internet service Yes
## No 0.36554792 0.26081682 0.37363526
## Yes 0.65012407 0.05707196 0.29280397
##
## DeviceProtection
## Y No No internet service Yes
## No 0.36190861 0.26081682 0.37727457
## Yes 0.65136476 0.05707196 0.29156328
##
## TechSupport
## Y No No internet service Yes
## No 0.39142742 0.26081682 0.34775576
## Yes 0.77005790 0.05707196 0.17287014
##
## StreamingTV
## Y No No internet service Yes
## No 0.35584311 0.26081682 0.38334007
## Yes 0.50289495 0.05707196 0.44003309
##
## StreamingMovies
## Y No No internet service Yes
## No 0.35503437 0.26081682 0.38414881
## Yes 0.51157982 0.05707196 0.43134822
##
## Contract
## Y Month-to-month One year Two year
## No 0.43914274 0.25030328 0.31055398
## Yes 0.88751034 0.08602151 0.02646816
##
## PaperlessBilling
## Y No Yes
## No 0.4577436 0.5422564
## Yes 0.2518610 0.7481390
##
## PaymentMethod
## Y Bank transfer (automatic) Credit card (automatic) Electronic check
## No 0.2547513 0.2511120 0.2527295
## Yes 0.1381307 0.1261373 0.5636890
## PaymentMethod
## Y Mailed check
## No 0.2414072
## Yes 0.1720430
##
## MonthlyCharges
## Y [,1] [,2]
## No 62.31154 32.22484
## Yes 74.67350 25.82264
##
## TotalCharges
## Y [,1] [,2]
## No 1397.475 0
## Yes 1397.475 0
test_set$predicted <- predict(NBClassifier,test_set)
test_set$actual <- test_set$Churn
confusionMatrix(factor(test_set$predicted),
factor(test_set$actual))
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 655 158
## Yes 404 877
##
## Accuracy : 0.7316
## 95% CI : (0.7121, 0.7505)
## No Information Rate : 0.5057
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4646
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6185
## Specificity : 0.8473
## Pos Pred Value : 0.8057
## Neg Pred Value : 0.6846
## Prevalence : 0.5057
## Detection Rate : 0.3128
## Detection Prevalence : 0.3883
## Balanced Accuracy : 0.7329
##
## 'Positive' Class : No
##