This analysis was initially published on Datascienceplus.com. I have used the concepts from there and remodeled the whole analysis
library(dplyr)
library(ggplot2)
library(MASS)
library(randomForest)
library(corrplot)
library(gridExtra)
library(rpart)
library(rpart.plot)
library(party)
library(caret)
telco <- read.csv("Telco_customer_churn.csv")
colSums(is.na(telco))
telco <- na.omit(telco)
str(telco)
summary(telco)
telco$MultipleLines <- ifelse(telco$MultipleLines == "No phone service","No", "Yes")
telco$OnlineSecurity <- ifelse(telco$OnlineSecurity == "No internet service","No", "Yes")
telco$OnlineBackup <- ifelse(telco$OnlineBackup == "No internet service","No", "Yes")
telco$DeviceProtection <- ifelse(telco$DeviceProtection == "No internet service","No", "Yes")
telco$TechSupport <- ifelse(telco$TechSupport == "No internet service","No", "Yes")
telco$StreamingTV <- ifelse(telco$StreamingTV == "No internet service","No", "Yes")
telco$StreamingMovies <- ifelse(telco$StreamingMovies == "No internet service","No", "Yes")
In total we have 7032 observation and 21 variable
table(telco$Churn)
##
## No Yes
## 5163 1869
p1 <- ggplot(data = telco, aes(x = gender, fill = Churn)) +
geom_bar(position = "fill")
p2 <- ggplot(data = telco, aes(x = SeniorCitizen, fill = Churn)) +
geom_bar(position = "fill")
p3 <- ggplot(data = telco, aes(x = Partner, fill = Churn)) +
geom_bar(position = "fill")
p4 <- ggplot(data = telco, aes(x = Dependents, fill = Churn)) +
geom_bar(position = "fill")
grid.arrange(p1, p2, p3, p4, ncol=2)
ggplot(data = telco, aes(x = tenure, fill = Churn)) +
geom_histogram(position = "fill")
p5 <- ggplot(data = telco, aes(x = PhoneService, fill = Churn)) +
geom_bar(position = "fill")
p6 <- ggplot(data = telco, aes(x = MultipleLines, fill = Churn)) +
geom_bar(position = "fill")
p7 <- ggplot(data = telco, aes(x = InternetService, fill = Churn)) +
geom_bar(position = "fill")
p8 <- ggplot(data = telco, aes(x = OnlineSecurity, fill = Churn)) +
geom_bar(position = "fill")
grid.arrange(p5, p6, p7, p8, ncol=2)
p9 <- ggplot(data = telco, aes(x = OnlineBackup, fill = Churn)) +
geom_bar(position = "fill")
p10 <- ggplot(data = telco, aes(x = DeviceProtection, fill = Churn)) +
geom_bar(position = "fill")
p11 <- ggplot(data = telco, aes(x = TechSupport, fill = Churn)) +
geom_bar(position = "fill")
p12 <- ggplot(data = telco, aes(x = StreamingTV, fill = Churn)) +
geom_bar(position = "fill")
grid.arrange(p9, p10, p11, p12, ncol=2)
p13 <- ggplot(data = telco, aes(x = StreamingMovies, fill = Churn)) +
geom_bar(position = "fill")
p14 <- ggplot(data = telco, aes(x = Contract, fill = Churn)) +
geom_bar(position = "fill")
p15 <- ggplot(data = telco, aes(x = PaperlessBilling, fill = Churn)) +
geom_bar(position = "fill")
p16 <- ggplot(data = telco, aes(x = PaymentMethod, fill = Churn)) +
geom_bar(position = "fill")
grid.arrange(p13, p14, p15, p16, ncol=2)
ggplot(data = telco, aes(x = MonthlyCharges, fill = Churn)) +
geom_histogram(position = "fill")
ggplot(data = telco, aes(x = TotalCharges, fill = Churn)) +
geom_histogram(position = "fill")
table(telco$Partner, telco$Dependents)
##
## No Yes
## No 3280 359
## Yes 1653 1740
table(telco$StreamingTV, telco$StreamingMovies)
##
## No Yes
## No 1520 0
## Yes 0 5512
table(telco$InternetService, telco$OnlineSecurity)
##
## No Yes
## DSL 0 2416
## Fiber optic 0 3096
## No 1520 0
table(telco$InternetService, telco$DeviceProtection)
##
## No Yes
## DSL 0 2416
## Fiber optic 0 3096
## No 1520 0
numeric.var <- sapply(telco, is.numeric)
corr.matrix <- cor(telco[,numeric.var])
corrplot(corr.matrix, main="\n\nCorrelation Plot for Numerical Variables", method="number")
set.seed(12345)
index <- sample(1:nrow(telco), 0.7*nrow(telco))
train <- telco[index,2:21]
test <- telco[-index,2:21]
telco_log <- glm(Churn ~ ., data = train, family = "binomial")
summary(telco_log)
##
## Call:
## glm(formula = Churn ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6773 -0.6888 -0.3120 0.7801 3.3103
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -0.1998005 0.2669884 -0.748
## genderMale -0.0421137 0.0764787 -0.551
## SeniorCitizen 0.1819262 0.0994083 1.830
## PartnerYes -0.0001958 0.0919891 -0.002
## DependentsYes -0.1986362 0.1056846 -1.880
## tenure -0.0537164 0.0071279 -7.536
## PhoneServiceYes -0.6542136 0.1751012 -3.736
## MultipleLinesYes NA NA NA
## InternetServiceFiber optic 0.8627519 0.1577810 5.468
## InternetServiceNo -0.4630047 0.2236187 -2.071
## OnlineSecurityYes NA NA NA
## OnlineBackupYes NA NA NA
## DeviceProtectionYes NA NA NA
## TechSupportYes NA NA NA
## StreamingTVYes NA NA NA
## StreamingMoviesYes NA NA NA
## ContractOne year -0.7576200 0.1261411 -6.006
## ContractTwo year -1.4140798 0.1986139 -7.120
## PaperlessBillingYes 0.4092732 0.0881269 4.644
## PaymentMethodCredit card (automatic) -0.1191094 0.1335949 -0.892
## PaymentMethodElectronic check 0.3192109 0.1124708 2.838
## PaymentMethodMailed check -0.0834132 0.1369695 -0.609
## MonthlyCharges 0.0033448 0.0047726 0.701
## TotalCharges 0.0002640 0.0000814 3.244
## Pr(>|z|)
## (Intercept) 0.454250
## genderMale 0.581868
## SeniorCitizen 0.067236 .
## PartnerYes 0.998302
## DependentsYes 0.060174 .
## tenure 4.84e-14 ***
## PhoneServiceYes 0.000187 ***
## MultipleLinesYes NA
## InternetServiceFiber optic 4.55e-08 ***
## InternetServiceNo 0.038405 *
## OnlineSecurityYes NA
## OnlineBackupYes NA
## DeviceProtectionYes NA
## TechSupportYes NA
## StreamingTVYes NA
## StreamingMoviesYes NA
## ContractOne year 1.90e-09 ***
## ContractTwo year 1.08e-12 ***
## PaperlessBillingYes 3.42e-06 ***
## PaymentMethodCredit card (automatic) 0.372623
## PaymentMethodElectronic check 0.004537 **
## PaymentMethodMailed check 0.542530
## MonthlyCharges 0.483404
## TotalCharges 0.001181 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5652.2 on 4921 degrees of freedom
## Residual deviance: 4165.2 on 4905 degrees of freedom
## AIC: 4199.2
##
## Number of Fisher Scoring iterations: 6
predict_1_prob <- predict(telco_log, newdata = test, type = "response")
predict_1_class <- ifelse(predict_1_prob > 0.5, "Yes", "No")
mean(predict_1_class == test$Churn)
## [1] 0.8047393
Now lets make a model only with important variables i.e. variables with less p-value
telco_log <- glm(Churn ~ SeniorCitizen + tenure + PhoneService + InternetService +
Contract + PaperlessBilling + PaymentMethod + TotalCharges,
data = train, family = "binomial")
summary(telco_log)
##
## Call:
## glm(formula = Churn ~ SeniorCitizen + tenure + PhoneService +
## InternetService + Contract + PaperlessBilling + PaymentMethod +
## TotalCharges, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6816 -0.6960 -0.3129 0.7830 3.2881
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -1.359e-01 1.854e-01 -0.733
## SeniorCitizen 2.175e-01 9.727e-02 2.236
## tenure -5.612e-02 6.773e-03 -8.286
## PhoneServiceYes -6.037e-01 1.548e-01 -3.900
## InternetServiceFiber optic 9.472e-01 1.133e-01 8.361
## InternetServiceNo -5.761e-01 1.572e-01 -3.665
## ContractOne year -7.600e-01 1.248e-01 -6.087
## ContractTwo year -1.424e+00 1.975e-01 -7.207
## PaperlessBillingYes 4.173e-01 8.792e-02 4.747
## PaymentMethodCredit card (automatic) -1.119e-01 1.334e-01 -0.838
## PaymentMethodElectronic check 3.276e-01 1.123e-01 2.917
## PaymentMethodMailed check -8.593e-02 1.366e-01 -0.629
## TotalCharges 2.967e-04 7.162e-05 4.142
## Pr(>|z|)
## (Intercept) 0.463603
## SeniorCitizen 0.025372 *
## tenure < 2e-16 ***
## PhoneServiceYes 9.62e-05 ***
## InternetServiceFiber optic < 2e-16 ***
## InternetServiceNo 0.000247 ***
## ContractOne year 1.15e-09 ***
## ContractTwo year 5.70e-13 ***
## PaperlessBillingYes 2.07e-06 ***
## PaymentMethodCredit card (automatic) 0.401900
## PaymentMethodElectronic check 0.003535 **
## PaymentMethodMailed check 0.529271
## TotalCharges 3.44e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5652.2 on 4921 degrees of freedom
## Residual deviance: 4170.3 on 4909 degrees of freedom
## AIC: 4196.3
##
## Number of Fisher Scoring iterations: 6
predict_1_prob <- predict(telco_log, newdata = test, type = "response")
predict_1_class <- ifelse(predict_1_prob > 0.5, "Yes", "No")
mean(predict_1_class == test$Churn)
## [1] 0.8056872
We get almost the same accuracy as we got in our previous Logistic Model
telco_tree <- rpart(Churn ~ SeniorCitizen + tenure + PhoneService + InternetService +
Contract + PaperlessBilling + PaymentMethod + TotalCharges,
data = train)
rpart.plot(telco_tree)
telco_tree_predict <- predict(telco_tree, newdata = test, type = "class")
mean(test$Churn == telco_tree_predict)
## [1] 0.7862559
telco_rf <- randomForest(Churn ~ SeniorCitizen + tenure + PhoneService +
InternetService + Contract + PaperlessBilling + PaymentMethod
+ TotalCharges, data = train)
plot(telco_rf)
varImpPlot(telco_rf)
telco_rf
##
## Call:
## randomForest(formula = Churn ~ SeniorCitizen + tenure + PhoneService + InternetService + Contract + PaperlessBilling + PaymentMethod + TotalCharges, data = train)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 20.78%
## Confusion matrix:
## No Yes class.error
## No 3263 374 0.1028320
## Yes 649 636 0.5050584
telco_rf_predict <- predict(telco_rf, newdata = test)
confusionMatrix(telco_rf_predict, test$Churn)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1388 285
## Yes 138 299
##
## Accuracy : 0.7995
## 95% CI : (0.7818, 0.8164)
## No Information Rate : 0.7232
## P-Value [Acc > NIR] : 3.612e-16
##
## Kappa : 0.4571
## Mcnemar's Test P-Value : 1.259e-12
##
## Sensitivity : 0.9096
## Specificity : 0.5120
## Pos Pred Value : 0.8296
## Neg Pred Value : 0.6842
## Prevalence : 0.7232
## Detection Rate : 0.6578
## Detection Prevalence : 0.7929
## Balanced Accuracy : 0.7108
##
## 'Positive' Class : No
##