Goal : predict behavior to retain customers
The data set includes information about:
data source https://www.kaggle.com/blastchar/telco-customer-churn
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(boot)
library(caret)
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
library(e1071)
library(SDMTools)
##
## Attaching package: 'SDMTools'
## The following objects are masked from 'package:caret':
##
## sensitivity, specificity
#import data (default stringAsFactor = TRUE )
customer <- read.csv("WA_Fn-UseC_-Telco-Customer-Churn.csv",stringsAsFactors = T)
View(customer)
# Make ID back to correct format
customer$customerID <- as.character(customer$customerID)
#reorder
customer <- customer[c(1,21,2:20)]
# scan missing value
colSums(is.na(customer))
## customerID Churn gender SeniorCitizen
## 0 0 0 0
## Partner Dependents tenure PhoneService
## 0 0 0 0
## MultipleLines InternetService OnlineSecurity OnlineBackup
## 0 0 0 0
## DeviceProtection TechSupport StreamingTV StreamingMovies
## 0 0 0 0
## Contract PaperlessBilling PaymentMethod MonthlyCharges
## 0 0 0 0
## TotalCharges
## 11
#observation with missing values
missing <- filter(customer, is.na(customer$TotalCharges) == TRUE )
# since TotalCharges is roughly equals to tenure * monthly charges, I replace the missing value accordingly.
customer_m <- customer %>% mutate(TotalCharges = ifelse(is.na(customer$TotalCharges), customer$MonthlyCharges*customer$tenure, TotalCharges) )
boxplot(customer_m$tenure)
boxplot(customer_m$MonthlyCharges)
# not outliers
ggplot(customer_m, aes(x = Churn))+
geom_histogram(stat = "count", fill = c("sky blue", "orange"))
## Warning: Ignoring unknown parameters: binwidth, bins, pad
# churn yes : 1869 , no: 5174
# churn mostly happens in short tenure
customer_m %>% filter(customer_m$Churn == "Yes") %>%
ggplot( aes(x= tenure))+
geom_bar(fill = "orange" )
variables <- list( 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport','StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling','PaymentMethod' )
plotG <- list() #x
for (i in variables){
plotG <- ggplot(customer_m, aes_string(x = i, fill = as.factor(customer_m$Churn)))+
geom_bar( position = "stack")+ scale_fill_discrete(name = "churn")
print(plotG)
}
# full logistic model
logisticModelFull <- glm(Churn ~ gender + SeniorCitizen + Partner + Dependents + tenure + PhoneService + MultipleLines + InternetService + OnlineSecurity + OnlineBackup + DeviceProtection + TechSupport + StreamingTV + StreamingMovies + Contract + PaperlessBilling + PaymentMethod + MonthlyCharges , family = "binomial", customer_m)
summary(logisticModelFull)
##
## Call:
## glm(formula = Churn ~ gender + SeniorCitizen + Partner + Dependents +
## tenure + PhoneService + MultipleLines + InternetService +
## OnlineSecurity + OnlineBackup + DeviceProtection + TechSupport +
## StreamingTV + StreamingMovies + Contract + PaperlessBilling +
## PaymentMethod + MonthlyCharges, family = "binomial", data = customer_m)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9780 -0.6707 -0.2946 0.6918 3.1454
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.612080 0.811986 0.754 0.45097
## genderMale -0.020514 0.064885 -0.316 0.75189
## SeniorCitizen 0.217015 0.084920 2.556 0.01060
## PartnerYes -0.002440 0.077741 -0.031 0.97496
## DependentsYes -0.167071 0.089678 -1.863 0.06246
## tenure -0.034172 0.002366 -14.443 < 2e-16
## PhoneServiceYes 0.165499 0.652460 0.254 0.79976
## MultipleLinesNo phone service NA NA NA NA
## MultipleLinesYes 0.462796 0.178054 2.599 0.00934
## InternetServiceFiber optic 1.720069 0.803709 2.140 0.03234
## InternetServiceNo -1.622325 0.811846 -1.998 0.04568
## OnlineSecurityNo internet service NA NA NA NA
## OnlineSecurityYes -0.199497 0.179719 -1.110 0.26698
## OnlineBackupNo internet service NA NA NA NA
## OnlineBackupYes 0.049975 0.176251 0.284 0.77676
## DeviceProtectionNo internet service NA NA NA NA
## DeviceProtectionYes 0.162576 0.177303 0.917 0.35918
## TechSupportNo internet service NA NA NA NA
## TechSupportYes -0.168836 0.181586 -0.930 0.35248
## StreamingTVNo internet service NA NA NA NA
## StreamingTVYes 0.593806 0.328488 1.808 0.07065
## StreamingMoviesNo internet service NA NA NA NA
## StreamingMoviesYes 0.608397 0.328840 1.850 0.06429
## ContractOne year -0.666321 0.106644 -6.248 4.15e-10
## ContractTwo year -1.356836 0.173956 -7.800 6.20e-15
## PaperlessBillingYes 0.335906 0.074277 4.522 6.12e-06
## PaymentMethodCredit card (automatic) -0.086598 0.114085 -0.759 0.44782
## PaymentMethodElectronic check 0.314319 0.094582 3.323 0.00089
## PaymentMethodMailed check -0.005299 0.113719 -0.047 0.96283
## MonthlyCharges -0.032716 0.031940 -1.024 0.30570
##
## (Intercept)
## genderMale
## SeniorCitizen *
## PartnerYes
## DependentsYes .
## tenure ***
## PhoneServiceYes
## MultipleLinesNo phone service
## MultipleLinesYes **
## InternetServiceFiber optic *
## InternetServiceNo *
## OnlineSecurityNo internet service
## OnlineSecurityYes
## OnlineBackupNo internet service
## OnlineBackupYes
## DeviceProtectionNo internet service
## DeviceProtectionYes
## TechSupportNo internet service
## TechSupportYes
## StreamingTVNo internet service
## StreamingTVYes .
## StreamingMoviesNo internet service
## StreamingMoviesYes .
## ContractOne year ***
## ContractTwo year ***
## PaperlessBillingYes ***
## PaymentMethodCredit card (automatic)
## PaymentMethodElectronic check ***
## PaymentMethodMailed check
## MonthlyCharges
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8150.1 on 7042 degrees of freedom
## Residual deviance: 5851.0 on 7020 degrees of freedom
## AIC: 5897
##
## Number of Fisher Scoring iterations: 6
logisticModelNew <- stepAIC(logisticModelFull, trace = 0)
summary(logisticModelNew)
##
## Call:
## glm(formula = Churn ~ SeniorCitizen + Dependents + tenure + MultipleLines +
## InternetService + OnlineSecurity + TechSupport + StreamingTV +
## StreamingMovies + Contract + PaperlessBilling + PaymentMethod,
## family = "binomial", data = customer_m)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9707 -0.6703 -0.2945 0.6969 3.1704
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.721834 0.124473 -5.799 6.67e-09
## SeniorCitizen 0.215385 0.084324 2.554 0.010642
## DependentsYes -0.170407 0.081230 -2.098 0.035920
## tenure -0.035008 0.002232 -15.681 < 2e-16
## MultipleLinesNo phone service 0.488421 0.126689 3.855 0.000116
## MultipleLinesYes 0.297143 0.078485 3.786 0.000153
## InternetServiceFiber optic 0.903428 0.091324 9.893 < 2e-16
## InternetServiceNo -0.774775 0.135591 -5.714 1.10e-08
## OnlineSecurityNo internet service NA NA NA NA
## OnlineSecurityYes -0.364489 0.084334 -4.322 1.55e-05
## TechSupportNo internet service NA NA NA NA
## TechSupportYes -0.336474 0.085088 -3.954 7.67e-05
## StreamingTVNo internet service NA NA NA NA
## StreamingTVYes 0.265842 0.079022 3.364 0.000768
## StreamingMoviesNo internet service NA NA NA NA
## StreamingMoviesYes 0.282268 0.078875 3.579 0.000345
## ContractOne year -0.668486 0.106275 -6.290 3.17e-10
## ContractTwo year -1.355075 0.173584 -7.806 5.88e-15
## PaperlessBillingYes 0.335235 0.074208 4.518 6.26e-06
## PaymentMethodCredit card (automatic) -0.087953 0.113990 -0.772 0.440360
## PaymentMethodElectronic check 0.314533 0.094484 3.329 0.000872
## PaymentMethodMailed check -0.006216 0.113561 -0.055 0.956347
##
## (Intercept) ***
## SeniorCitizen *
## DependentsYes *
## tenure ***
## MultipleLinesNo phone service ***
## MultipleLinesYes ***
## InternetServiceFiber optic ***
## InternetServiceNo ***
## OnlineSecurityNo internet service
## OnlineSecurityYes ***
## TechSupportNo internet service
## TechSupportYes ***
## StreamingTVNo internet service
## StreamingTVYes ***
## StreamingMoviesNo internet service
## StreamingMoviesYes ***
## ContractOne year ***
## ContractTwo year ***
## PaperlessBillingYes ***
## PaymentMethodCredit card (automatic)
## PaymentMethodElectronic check ***
## PaymentMethodMailed check
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8150.1 on 7042 degrees of freedom
## Residual deviance: 5854.3 on 7025 degrees of freedom
## AIC: 5890.3
##
## Number of Fisher Scoring iterations: 6
interpretation of odds : eg. user with Multiple Lines (MultipleLinesYes) increase the odd of churn by 35%. Being a senior citizen is 24% more likely to churn than non-senior citizen group.
# transform Coefficient to Odds
odds <- coef(logisticModelNew) %>% exp() %>% round(2)
odds
## (Intercept) SeniorCitizen
## 0.49 1.24
## DependentsYes tenure
## 0.84 0.97
## MultipleLinesNo phone service MultipleLinesYes
## 1.63 1.35
## InternetServiceFiber optic InternetServiceNo
## 2.47 0.46
## OnlineSecurityNo internet service OnlineSecurityYes
## NA 0.69
## TechSupportNo internet service TechSupportYes
## NA 0.71
## StreamingTVNo internet service StreamingTVYes
## NA 1.30
## StreamingMoviesNo internet service StreamingMoviesYes
## NA 1.33
## ContractOne year ContractTwo year
## 0.51 0.26
## PaperlessBillingYes PaymentMethodCredit card (automatic)
## 1.40 0.92
## PaymentMethodElectronic check PaymentMethodMailed check
## 1.37 0.99
#generating random index for training & testing set
set.seed(27345)
customer_m$isTrain <- rbinom(nrow(customer_m),1,0.66)
train <- customer_m %>% filter(customer_m$isTrain =="1")
test <- customer_m %>% filter(customer_m$isTrain == "0")
# modeling
LogisticTrainNew <- glm( Churn ~ gender + SeniorCitizen + Partner + Dependents + tenure + PhoneService + MultipleLines + InternetService + OnlineSecurity + OnlineBackup + DeviceProtection + TechSupport + StreamingTV + StreamingMovies + Contract + PaperlessBilling + PaymentMethod + MonthlyCharges , family = "binomial", train)
#prediciton
test$predictNew <- predict(LogisticTrainNew , type = "response" , newdata = test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
test$Churn <- ifelse(test$Churn == "Yes", 1, 0)
# calculating confusion matrix
confMatrix <- confusion.matrix(test$Churn, test$predictNew ,threshold = 0.5) # tested different thresholds where 0.5 got highest accuracy
confMatrix
## obs
## pred 0 1
## 0 1636 300
## 1 168 354
## attr(,"class")
## [1] "confusion.matrix"
accurracyNew <- sum(diag(confMatrix))/sum(confMatrix)
accurracyNew
## [1] 0.8096013
# Accuracy function with threshold 0.5
Accu <- function(r , pi = 0){
cm <- confusion.matrix( r , pi , 0.5)
acc <- sum(diag(cm))/sum(cm)
return(acc)
}
# cross-validation
set.seed(34238)
crossAcc <- cv.glm(customer_m, logisticModelNew, cost = Accu , K =8 )$delta
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
crossAcc
## [1] 0.8010791 0.8020909