telco <- read.csv("telcochurn.csv", stringsAsFactors = TRUE)
glimpse(telco)
#> Rows: 7,043
#> Columns: 22
#> $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16~
#> $ customerID <fct> 7590-VHVEG, 5575-GNVDE, 3668-QPYBK, 7795-CFOCW, 9237-~
#> $ gender <fct> Female, Male, Male, Male, Female, Female, Male, Femal~
#> $ SeniorCitizen <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
#> $ Partner <fct> Yes, No, No, No, No, No, No, No, Yes, No, Yes, No, Ye~
#> $ Dependents <fct> No, No, No, No, No, No, Yes, No, No, Yes, Yes, No, No~
#> $ tenure <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, 2~
#> $ PhoneService <fct> No, Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes, Yes, Y~
#> $ MultipleLines <fct> No, No, No, No, No, Yes, Yes, No, Yes, No, No, No, Ye~
#> $ InternetService <fct> DSL, DSL, DSL, DSL, Fiber optic, Fiber optic, Fiber o~
#> $ OnlineSecurity <fct> No, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Yes, No,~
#> $ OnlineBackup <fct> Yes, No, Yes, No, No, No, Yes, No, No, Yes, No, No, N~
#> $ DeviceProtection <fct> No, Yes, No, Yes, No, Yes, No, No, Yes, No, No, No, Y~
#> $ TechSupport <fct> No, No, No, Yes, No, No, No, No, Yes, No, No, No, No,~
#> $ StreamingTV <fct> No, No, No, No, No, Yes, Yes, No, Yes, No, No, No, Ye~
#> $ StreamingMovies <fct> No, No, No, No, No, Yes, No, No, Yes, No, No, No, Yes~
#> $ Contract <fct> Month-to-month, One year, Month-to-month, One year, M~
#> $ PaperlessBilling <fct> Yes, No, Yes, No, Yes, Yes, Yes, No, Yes, No, Yes, No~
#> $ PaymentMethod <fct> Electronic check, Mailed check, Mailed check, Bank tr~
#> $ MonthlyCharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.7~
#> $ TotalCharges <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1949~
#> $ Churn <fct> No, No, Yes, No, Yes, Yes, No, No, Yes, No, No, No, N~
Data Explanation:
Overall, the column in dataset is self explanatory. Maybe only a few that needed further explanation.
Business Question:
We assume ourself to be a data analyst in a Telco company. We were asked to build the best model to predict the customer that intent to stop using our services, so we could ask them for their feedback of our services in order to improve our services.
telco <- telco %>%
select(-c(X, customerID))
colSums(is.na(telco))
#> gender SeniorCitizen Partner Dependents
#> 0 0 0 0
#> tenure PhoneService MultipleLines InternetService
#> 0 0 0 0
#> OnlineSecurity OnlineBackup DeviceProtection TechSupport
#> 0 0 0 0
#> StreamingTV StreamingMovies Contract PaperlessBilling
#> 0 0 0 0
#> PaymentMethod MonthlyCharges TotalCharges Churn
#> 0 0 11 0
telco_clean <- telco %>%
na.omit()
colSums(is.na(telco_clean))
#> gender SeniorCitizen Partner Dependents
#> 0 0 0 0
#> tenure PhoneService MultipleLines InternetService
#> 0 0 0 0
#> OnlineSecurity OnlineBackup DeviceProtection TechSupport
#> 0 0 0 0
#> StreamingTV StreamingMovies Contract PaperlessBilling
#> 0 0 0 0
#> PaymentMethod MonthlyCharges TotalCharges Churn
#> 0 0 0 0
summary(telco_clean)
#> gender SeniorCitizen Partner Dependents tenure
#> Female:3483 Min. :0.0000 No :3639 No :4933 Min. : 1.00
#> Male :3549 1st Qu.:0.0000 Yes:3393 Yes:2099 1st Qu.: 9.00
#> Median :0.0000 Median :29.00
#> Mean :0.1624 Mean :32.42
#> 3rd Qu.:0.0000 3rd Qu.:55.00
#> Max. :1.0000 Max. :72.00
#> PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup
#> No : 680 No :4065 DSL :2416 No :5017 No :4607
#> Yes:6352 Yes:2967 Fiber optic:3096 Yes:2015 Yes:2425
#> No :1520
#>
#>
#>
#> DeviceProtection TechSupport StreamingTV StreamingMovies Contract
#> No :4614 No :4992 No :4329 No :4301 Month-to-month:3875
#> Yes:2418 Yes:2040 Yes:2703 Yes:2731 One year :1472
#> Two year :1685
#>
#>
#>
#> PaperlessBilling PaymentMethod MonthlyCharges
#> No :2864 Bank transfer (automatic):1542 Min. : 18.25
#> Yes:4168 Credit card (automatic) :1521 1st Qu.: 35.59
#> Electronic check :2365 Median : 70.35
#> Mailed check :1604 Mean : 64.80
#> 3rd Qu.: 89.86
#> Max. :118.75
#> TotalCharges Churn
#> Min. : 18.8 No :5163
#> 1st Qu.: 401.4 Yes:1869
#> Median :1397.5
#> Mean :2283.3
#> 3rd Qu.:3794.7
#> Max. :8684.8
prop.table(table(telco_clean$Churn))
#>
#> No Yes
#> 0.734215 0.265785
RNGkind(sample.kind= "Rounding")
set.seed(417)
idx <- sample(nrow(telco_clean), nrow(telco_clean)*0.8)
telco_train <- telco[idx,]
telco_test <- telco[-idx,]
prop.table(table(telco_train$Churn))
#>
#> No Yes
#> 0.7365333 0.2634667
model_telco <- glm(Churn ~ tenure + PhoneService + InternetService + OnlineSecurity + DeviceProtection + TechSupport + Contract + MonthlyCharges + TotalCharges, data = telco_train, family = "binomial")
summary(model_telco)
#>
#> Call:
#> glm(formula = Churn ~ tenure + PhoneService + InternetService +
#> OnlineSecurity + DeviceProtection + TechSupport + Contract +
#> MonthlyCharges + TotalCharges, family = "binomial", data = telco_train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.7654 -0.7018 -0.3025 0.7975 3.4207
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -0.19403178 0.22138474 -0.876 0.380788
#> tenure -0.05788874 0.00676739 -8.554 < 0.0000000000000002
#> PhoneServiceYes -1.08649453 0.16786811 -6.472 0.00000000009651560
#> InternetServiceFiber optic 0.39881947 0.16714590 2.386 0.017030
#> InternetServiceNo -0.37205905 0.21180474 -1.757 0.078984
#> OnlineSecurityYes -0.55407445 0.09401083 -5.894 0.00000000377573224
#> DeviceProtectionYes -0.11960999 0.09276209 -1.289 0.197250
#> TechSupportYes -0.49959432 0.09785774 -5.105 0.00000033024861226
#> ContractOne year -0.71815862 0.11756214 -6.109 0.00000000100409694
#> ContractTwo year -1.52371416 0.19555406 -7.792 0.00000000000000661
#> MonthlyCharges 0.02230609 0.00503005 4.435 0.00000922571294063
#> TotalCharges 0.00028404 0.00007751 3.665 0.000248
#>
#> (Intercept)
#> tenure ***
#> PhoneServiceYes ***
#> InternetServiceFiber optic *
#> InternetServiceNo .
#> OnlineSecurityYes ***
#> DeviceProtectionYes
#> TechSupportYes ***
#> ContractOne year ***
#> ContractTwo year ***
#> MonthlyCharges ***
#> TotalCharges ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 6481.2 on 5614 degrees of freedom
#> Residual deviance: 4749.3 on 5603 degrees of freedom
#> (10 observations deleted due to missingness)
#> AIC: 4773.3
#>
#> Number of Fisher Scoring iterations: 6
telco_train_up <- upSample(x = telco_train %>% select(-Churn), y = telco_train$Churn, yname = "Churn")
prop.table(table(telco_train_up$Churn))
#>
#> No Yes
#> 0.5 0.5
model_telco_up <- glm(Churn ~ tenure + PhoneService + InternetService + OnlineSecurity + DeviceProtection + TechSupport + Contract + MonthlyCharges + TotalCharges, data = telco_train_up, family = "binomial")
summary(model_telco_up)
#>
#> Call:
#> glm(formula = Churn ~ tenure + PhoneService + InternetService +
#> OnlineSecurity + DeviceProtection + TechSupport + Contract +
#> MonthlyCharges + TotalCharges, family = "binomial", data = telco_train_up)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.2142 -0.7720 0.4394 0.7619 3.0438
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 0.71657833 0.17458872 4.104 0.0000405401620
#> tenure -0.05189657 0.00447150 -11.606 < 0.0000000000000002
#> PhoneServiceYes -1.08677613 0.12687293 -8.566 < 0.0000000000000002
#> InternetServiceFiber optic 0.34886696 0.12975330 2.689 0.00717
#> InternetServiceNo -0.35329247 0.15737167 -2.245 0.02477
#> OnlineSecurityYes -0.66165649 0.06986900 -9.470 < 0.0000000000000002
#> DeviceProtectionYes -0.11689012 0.07143866 -1.636 0.10179
#> TechSupportYes -0.47447157 0.07291791 -6.507 0.0000000000767
#> ContractOne year -0.78519553 0.08461996 -9.279 < 0.0000000000000002
#> ContractTwo year -1.56402805 0.12918918 -12.106 < 0.0000000000000002
#> MonthlyCharges 0.02389724 0.00395189 6.047 0.0000000014754
#> TotalCharges 0.00024041 0.00005271 4.561 0.0000050843990
#>
#> (Intercept) ***
#> tenure ***
#> PhoneServiceYes ***
#> InternetServiceFiber optic **
#> InternetServiceNo *
#> OnlineSecurityYes ***
#> DeviceProtectionYes
#> TechSupportYes ***
#> ContractOne year ***
#> ContractTwo year ***
#> MonthlyCharges ***
#> TotalCharges ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 11473.0 on 8275 degrees of freedom
#> Residual deviance: 8130.3 on 8264 degrees of freedom
#> (10 observations deleted due to missingness)
#> AIC: 8154.3
#>
#> Number of Fisher Scoring iterations: 5
telco_train_down <- downSample(x = telco_train %>% select(-Churn), y = telco_train$Churn, yname = "Churn")
prop.table(table(telco_train_down$Churn))
#>
#> No Yes
#> 0.5 0.5
model_telco_down <- glm(Churn ~ tenure + PhoneService + InternetService + OnlineSecurity + DeviceProtection + TechSupport + Contract + MonthlyCharges + TotalCharges, data = telco_train_down, family = "binomial")
summary(model_telco_down)
#>
#> Call:
#> glm(formula = Churn ~ tenure + PhoneService + InternetService +
#> OnlineSecurity + DeviceProtection + TechSupport + Contract +
#> MonthlyCharges + TotalCharges, family = "binomial", data = telco_train_down)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.1406 -0.7898 0.4750 0.7702 3.0887
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 0.66164377 0.28225478 2.344 0.01907
#> tenure -0.05308601 0.00742503 -7.150 0.000000000000870
#> PhoneServiceYes -0.86844702 0.20401968 -4.257 0.000020748259572
#> InternetServiceFiber optic 0.53177243 0.21533579 2.470 0.01353
#> InternetServiceNo -0.41466893 0.25946005 -1.598 0.11000
#> OnlineSecurityYes -0.47881179 0.11652209 -4.109 0.000039704352676
#> DeviceProtectionYes -0.05013473 0.11903054 -0.421 0.67361
#> TechSupportYes -0.33989323 0.12382565 -2.745 0.00605
#> ContractOne year -0.66320532 0.14284377 -4.643 0.000003435993715
#> ContractTwo year -1.65914013 0.22050595 -7.524 0.000000000000053
#> MonthlyCharges 0.01838505 0.00646965 2.842 0.00449
#> TotalCharges 0.00026861 0.00008729 3.077 0.00209
#>
#> (Intercept) *
#> tenure ***
#> PhoneServiceYes ***
#> InternetServiceFiber optic *
#> InternetServiceNo
#> OnlineSecurityYes ***
#> DeviceProtectionYes
#> TechSupportYes **
#> ContractOne year ***
#> ContractTwo year ***
#> MonthlyCharges **
#> TotalCharges **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 4104.8 on 2960 degrees of freedom
#> Residual deviance: 2959.3 on 2949 degrees of freedom
#> (3 observations deleted due to missingness)
#> AIC: 2983.3
#>
#> Number of Fisher Scoring iterations: 5
Note:
From 3 data above (original, upsampling, and downsampling) we know that downsampling data train is the best for making model, since that data generated minimal AIC (information loss), which is 2983.3. Next, we make an evaluation for this model.
telco_test$prob <- predict(model_telco_down, newdata = telco_test, type = "response")
telco_test$prediction <- ifelse(telco_test$prob > 0.5, "Yes", "No")
telco_test <- telco_test %>%
mutate(prediction = as.factor(prediction))
confusionMatrix(data = as.factor(telco_test$prediction), reference = as.factor(telco_test$Churn), positive = "Yes")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction No Yes
#> No 751 77
#> Yes 279 310
#>
#> Accuracy : 0.7488
#> 95% CI : (0.7253, 0.7712)
#> No Information Rate : 0.7269
#> P-Value [Acc > NIR] : 0.03367
#>
#> Kappa : 0.4559
#>
#> Mcnemar's Test P-Value : < 0.0000000000000002
#>
#> Sensitivity : 0.8010
#> Specificity : 0.7291
#> Pos Pred Value : 0.5263
#> Neg Pred Value : 0.9070
#> Prevalence : 0.2731
#> Detection Rate : 0.2188
#> Detection Prevalence : 0.4157
#> Balanced Accuracy : 0.7651
#>
#> 'Positive' Class : Yes
#>
Note:
model_telco_down :Since we get better model after we downsampling the data train, we are going to use this data from now on.
Now we will make a logistic regression model by stepwise regression, so we can get a better model with all significant predictor variables.
model_telco_all_down <- glm(Churn ~ ., data = telco_train_down, family = "binomial")
model_telco_back_down <- step(model_telco_all_down, direction = "backward", trace = F)
summary(model_telco_back_down)
#>
#> Call:
#> glm(formula = Churn ~ SeniorCitizen + Partner + tenure + MultipleLines +
#> InternetService + OnlineSecurity + DeviceProtection + StreamingTV +
#> StreamingMovies + Contract + PaperlessBilling + PaymentMethod +
#> MonthlyCharges + TotalCharges, family = "binomial", data = telco_train_down)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.2458 -0.7883 0.3969 0.7571 3.0671
#>
#> Coefficients:
#> Estimate Std. Error z value
#> (Intercept) 1.56859513 0.38747226 4.048
#> SeniorCitizen 0.18174743 0.12040907 1.509
#> PartnerYes -0.16422904 0.09812947 -1.674
#> tenure -0.05267663 0.00761300 -6.919
#> MultipleLinesYes 0.37847866 0.12609509 3.002
#> InternetServiceFiber optic 1.54063568 0.25828128 5.965
#> InternetServiceNo -1.38680854 0.24404416 -5.683
#> OnlineSecurityYes -0.18501777 0.12240354 -1.512
#> DeviceProtectionYes 0.18403691 0.11804460 1.559
#> StreamingTVYes 0.31536176 0.13538638 2.329
#> StreamingMoviesYes 0.56834362 0.13549890 4.194
#> ContractOne year -0.58186109 0.14436729 -4.030
#> ContractTwo year -1.54601666 0.22051729 -7.011
#> PaperlessBillingYes 0.25666137 0.10375234 2.474
#> PaymentMethodCredit card (automatic) 0.00915159 0.15341122 0.060
#> PaymentMethodElectronic check 0.34967129 0.13031856 2.683
#> PaymentMethodMailed check 0.01029899 0.15715691 0.066
#> MonthlyCharges -0.02799523 0.00792970 -3.530
#> TotalCharges 0.00028121 0.00008807 3.193
#> Pr(>|z|)
#> (Intercept) 0.00005159600013 ***
#> SeniorCitizen 0.131192
#> PartnerYes 0.094210 .
#> tenure 0.00000000000454 ***
#> MultipleLinesYes 0.002686 **
#> InternetServiceFiber optic 0.00000000244704 ***
#> InternetServiceNo 0.00000001326521 ***
#> OnlineSecurityYes 0.130651
#> DeviceProtectionYes 0.118986
#> StreamingTVYes 0.019841 *
#> StreamingMoviesYes 0.00002735321185 ***
#> ContractOne year 0.00005567679449 ***
#> ContractTwo year 0.00000000000237 ***
#> PaperlessBillingYes 0.013369 *
#> PaymentMethodCredit card (automatic) 0.952431
#> PaymentMethodElectronic check 0.007292 **
#> PaymentMethodMailed check 0.947749
#> MonthlyCharges 0.000415 ***
#> TotalCharges 0.001407 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 4104.8 on 2960 degrees of freedom
#> Residual deviance: 2923.4 on 2942 degrees of freedom
#> (3 observations deleted due to missingness)
#> AIC: 2961.4
#>
#> Number of Fisher Scoring iterations: 5
telco_test$Prob <- predict(model_telco_back_down, newdata = telco_test, type = "response")
telco_test$Prediction <- ifelse(telco_test$Prob > 0.5, "Yes", "No")
telco_test <- telco_test %>%
mutate(Prediction = as.factor(Prediction))
confusionMatrix(data = as.factor(telco_test$Prediction), reference = as.factor(telco_test$Churn), positive = "Yes")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction No Yes
#> No 757 79
#> Yes 273 308
#>
#> Accuracy : 0.7516
#> 95% CI : (0.7282, 0.7739)
#> No Information Rate : 0.7269
#> P-Value [Acc > NIR] : 0.01912
#>
#> Kappa : 0.459
#>
#> Mcnemar's Test P-Value : < 0.0000000000000002
#>
#> Sensitivity : 0.7959
#> Specificity : 0.7350
#> Pos Pred Value : 0.5301
#> Neg Pred Value : 0.9055
#> Prevalence : 0.2731
#> Detection Rate : 0.2174
#> Detection Prevalence : 0.4100
#> Balanced Accuracy : 0.7654
#>
#> 'Positive' Class : Yes
#>
Note:
Model summary of model_telco_back_down :
Logistic model or model_telco_down is better than stepwise regression model or model_telco_back_down, because it generated better Recall / Sensitivity, which is 0.8010.
We use Recall / Sensitivity as an Evaluation Matrix because we want to have a model that is good in predicting class for true positive, as described in our business question.
telco_knn <- telco_clean %>%
select(tenure, MonthlyCharges, TotalCharges, Churn) %>%
mutate(tenure = as.numeric(tenure))
glimpse(telco_knn)
#> Rows: 7,032
#> Columns: 4
#> $ tenure <dbl> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, 25,~
#> $ MonthlyCharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.75,~
#> $ TotalCharges <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1949.4~
#> $ Churn <fct> No, No, Yes, No, Yes, Yes, No, No, Yes, No, No, No, No,~
RNGkind(sample.kind= "Rounding")
set.seed(417)
idx_knn <- sample(nrow(telco_knn), nrow(telco_knn)*0.8)
telco_knn_train <- telco_knn[idx_knn,]
telco_knn_test <- telco_knn[-idx_knn,]
table(telco_knn_train$Churn)
#>
#> No Yes
#> 4133 1492
telco_knn_train_down <- downSample(x = telco_knn_train %>% select(-Churn), y = telco_knn_train$Churn, yname = "Churn")
# cara base R
# train_up <- upSample(x = train[,-1], y = train$label, yname = "label")
table(telco_knn_train_down$Churn)
#>
#> No Yes
#> 1492 1492
prop.table(table(telco_knn_train_down$Churn))
#>
#> No Yes
#> 0.5 0.5
telco_train_x_scaled <- telco_knn_train_down %>%
select_if(is.numeric) %>%
scale()
telco_test_x_scaled <- telco_knn_test %>%
select_if(is.numeric) %>%
scale(center = attr(telco_train_x_scaled, "scaled:center"),
scale = attr(telco_train_x_scaled, "scaled:scale"))
# y train
telco_train_y <- telco_knn_train_down$Churn
# y test
telco_test_y <- telco_knn_test$Churn
k <- sqrt(nrow(telco_train_x_scaled))
k
#> [1] 54.626
Note:
Based on computation, K optimum is 54.626. Because target variable is binomial, we rounding the K value to be an odd number, which is 55.
knn_telco <- knn(train = telco_train_x_scaled, test = telco_test_x_scaled, cl = telco_train_y, k = 55)
confusionMatrix(data = knn_telco, reference = telco_test_y, positive = "Yes")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction No Yes
#> No 734 75
#> Yes 296 302
#>
#> Accuracy : 0.7363
#> 95% CI : (0.7125, 0.7592)
#> No Information Rate : 0.7321
#> P-Value [Acc > NIR] : 0.3719
#>
#> Kappa : 0.4332
#>
#> Mcnemar's Test P-Value : <0.0000000000000002
#>
#> Sensitivity : 0.8011
#> Specificity : 0.7126
#> Pos Pred Value : 0.5050
#> Neg Pred Value : 0.9073
#> Prevalence : 0.2679
#> Detection Rate : 0.2146
#> Detection Prevalence : 0.4250
#> Balanced Accuracy : 0.7568
#>
#> 'Positive' Class : Yes
#>
Note:
kNN_telco:Here’s some conclusions we can take from this study case:
kNN_telco is the best model for this study case. It generate sensitivity of 0.8011.