library(readxl)
#get current directory path
getwd()
## [1] "D:/APGCBAA/III/RProg/workspace/DataAnalysis"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.2 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(repr)
library(caTools)
library(rpart)
library(rpart.plot)
library(ggpubr)
#************************************************************************
# Real Teleco Customer Dataset
#************************************************************************
tcc <- read_excel("Telco-Customer-Churn.xlsx",sheet = 1)
glimpse(tcc)
## Rows: 7,043
## Columns: 21
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOCW~
## $ gender <chr> "Female", "Male", "Male", "Male", "Female", "Female",~
## $ SeniorCitizen <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ Partner <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes~
## $ Dependents <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No"~
## $ tenure <dbl> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49, 2~
## $ PhoneService <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No", ~
## $ MultipleLines <chr> "No phone service", "No", "No", "No phone service", "~
## $ InternetService <chr> "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fiber opt~
## $ OnlineSecurity <chr> "No", "Yes", "Yes", "Yes", "No", "No", "No", "Yes", "~
## $ OnlineBackup <chr> "Yes", "No", "Yes", "No", "No", "No", "Yes", "No", "N~
## $ DeviceProtection <chr> "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "Y~
## $ TechSupport <chr> "No", "No", "No", "Yes", "No", "No", "No", "No", "Yes~
## $ StreamingTV <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "No", "Ye~
## $ StreamingMovies <chr> "No", "No", "No", "No", "No", "Yes", "No", "No", "Yes~
## $ Contract <chr> "Month-to-month", "One year", "Month-to-month", "One ~
## $ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No", ~
## $ PaymentMethod <chr> "Electronic check", "Mailed check", "Mailed check", "~
## $ MonthlyCharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 29.7~
## $ TotalCharges <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1949~
## $ Churn <chr> "No", "No", "Yes", "No", "Yes", "Yes", "No", "No", "Y~
#Check for missing/NULL/NA Data
sumdata <- colSums(is.na(tcc))
sumdata
## customerID gender SeniorCitizen Partner
## 0 0 0 0
## Dependents tenure PhoneService MultipleLines
## 0 0 0 0
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0 0 0 0
## TechSupport StreamingTV StreamingMovies Contract
## 0 0 0 0
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0 0 0 11
## Churn
## 0
colSums(is.na(tcc))/nrow(tcc)
## customerID gender SeniorCitizen Partner
## 0.000000000 0.000000000 0.000000000 0.000000000
## Dependents tenure PhoneService MultipleLines
## 0.000000000 0.000000000 0.000000000 0.000000000
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0.000000000 0.000000000 0.000000000 0.000000000
## TechSupport StreamingTV StreamingMovies Contract
## 0.000000000 0.000000000 0.000000000 0.000000000
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0.000000000 0.000000000 0.000000000 0.001561834
## Churn
## 0.000000000
barplot(colSums(is.na(tcc)),
las = 2,
col = rainbow(7),cex.names=0.7,
ylab = 'count',
horiz = FALSE,
)

#Terms like “No internet service” and “No phone service” should be changed to “No” for convenience.
data <- data.frame(lapply(tcc, function(x) {
gsub("No internet service", "No", x)}))
data <- data.frame(lapply(tcc, function(x) {
gsub("No phone service", "No", x)}))
# 2. SeniorCitizen responds in 1 and 0, which is changed to “Yes” or “No”.
data$SeniorCitizen <- as.factor(ifelse(data$SeniorCitizen==1, 'YES', 'NO'))
# 3. Convert double type variables into numeric type and store them in a variable as a data frame.
num_columns <- c("tenure", "MonthlyCharges", "TotalCharges")
data[num_columns] <- sapply(data[num_columns], as.numeric)
data_int <- data[,c("tenure", "MonthlyCharges", "TotalCharges")]
data_int <- data.frame(scale(data_int))
#4 imputing Total Charges with mean
colSums(is.na(data))
## customerID gender SeniorCitizen Partner
## 0 0 0 0
## Dependents tenure PhoneService MultipleLines
## 0 0 0 0
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0 0 0 0
## TechSupport StreamingTV StreamingMovies Contract
## 0 0 0 0
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0 0 0 11
## Churn
## 0
data$TotalCharges[is.na(data$TotalCharges)] <- mean(data$TotalCharges,na.rm=TRUE)
#Check for missing/NULL/NA Data
colSums(is.na(data))
## customerID gender SeniorCitizen Partner
## 0 0 0 0
## Dependents tenure PhoneService MultipleLines
## 0 0 0 0
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## 0 0 0 0
## TechSupport StreamingTV StreamingMovies Contract
## 0 0 0 0
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## 0 0 0 0
## Churn
## 0
#5. Prepare the final data for analysis excluding selected attributes which we don’t need throughout the analysis process.
data_req <- data[,-c(1,6,19,20)]
x <- data.frame(sapply(data_req,function(x) data.frame(model.matrix(~x-1,data =data_req))[,-1]))
colSums(is.na(x))
## gender SeniorCitizen
## 0 0
## Partner Dependents
## 0 0
## PhoneService MultipleLines
## 0 0
## InternetService.xFiber.optic InternetService.xNo
## 0 0
## OnlineSecurity.xNo.internet.service OnlineSecurity.xYes
## 0 0
## OnlineBackup.xNo.internet.service OnlineBackup.xYes
## 0 0
## DeviceProtection.xNo.internet.service DeviceProtection.xYes
## 0 0
## TechSupport.xNo.internet.service TechSupport.xYes
## 0 0
## StreamingTV.xNo.internet.service StreamingTV.xYes
## 0 0
## StreamingMovies.xNo.internet.service StreamingMovies.xYes
## 0 0
## Contract.xOne.year Contract.xTwo.year
## 0 0
## PaperlessBilling PaymentMethod.xCredit.card..automatic.
## 0 0
## PaymentMethod.xElectronic.check PaymentMethod.xMailed.check
## 0 0
## Churn
## 0
colSums(is.na(data_int))
## tenure MonthlyCharges TotalCharges
## 0 0 11
data_int$TotalCharges[is.na(data_int$TotalCharges)] <- mean(data_int$TotalCharges,na.rm=TRUE)
colSums(is.na(data_int))
## tenure MonthlyCharges TotalCharges
## 0 0 0
############# Cleaning Further in any Further NA's ###################################
x <- na.omit(x) # omit the NA's
data_int <- na.omit(data_int) # omit the NA's
colSums(is.na(data_int))
## tenure MonthlyCharges TotalCharges
## 0 0 0
Clean_Data <- cbind(x, data_int)
##################### Exploratory Analysis ##############################
#Correlation plot between the numeric variables:
#Correlation plot between the numeric variables:
library(corrplot)
## corrplot 0.90 loaded
corr = cor(data_int)
corr
## tenure MonthlyCharges TotalCharges
## tenure 1.0000000 0.2478999 0.8247573
## MonthlyCharges 0.2478999 1.0000000 0.6504680
## TotalCharges 0.8247573 0.6504680 1.0000000
library("PerformanceAnalytics")
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
chart.Correlation(data_int, histogram=TRUE, pch=19)

# 2. Churn percentage tells around 26.58% of the customer left during last month.
churn <- data %>%
group_by(Churn) %>%
summarise(Count = n())%>%
mutate(percentage = prop.table(Count)*100)
ggplot(churn, aes(reorder(Churn, -percentage), percentage), fill = Churn)+
geom_col(fill = c("blue", "orange"))+
geom_text(aes(label = sprintf("%.2f%%", percentage)))+
xlab("Churn") +
ylab("Percent")+
ggtitle("Churn Percentage")

#################### Logistic Regression #######################
#Split the data in train and test sets
set.seed(123)
split <- sample.split(Clean_Data$Churn, SplitRatio = 0.70)
train <- Clean_Data[split,]
test <- Clean_Data[!(split),]
# 2. Calculate the baseline accuracy
prop.table(table(train$Churn))
##
## 0 1
## 0.7346856 0.2653144
# 3. Fitting the model using glm() function
model <- glm(Churn ~., data = train, family = "binomial")
summary(model)
##
## Call:
## glm(formula = Churn ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9703 -0.6667 -0.2740 0.6976 3.4427
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.8993427 1.5190886 -2.567 0.010261
## gender 0.0007079 0.0782945 0.009 0.992786
## SeniorCitizen 0.2280666 0.1021463 2.233 0.025566
## Partner 0.0380986 0.0938620 0.406 0.684816
## Dependents -0.1881251 0.1085452 -1.733 0.083069
## PhoneService 0.8674040 0.7720755 1.123 0.261238
## MultipleLines 0.6201966 0.2109465 2.940 0.003281
## InternetService.xFiber.optic 2.3386261 0.9486119 2.465 0.013689
## InternetService.xNo -2.4804426 0.9588639 -2.587 0.009686
## OnlineSecurity.xNo.internet.service NA NA NA NA
## OnlineSecurity.xYes -0.0572797 0.2146295 -0.267 0.789564
## OnlineBackup.xNo.internet.service NA NA NA NA
## OnlineBackup.xYes 0.1488093 0.2095328 0.710 0.477583
## DeviceProtection.xNo.internet.service NA NA NA NA
## DeviceProtection.xYes 0.2282995 0.2122399 1.076 0.282076
## TechSupport.xNo.internet.service NA NA NA NA
## TechSupport.xYes -0.1066490 0.2171769 -0.491 0.623377
## StreamingTV.xNo.internet.service NA NA NA NA
## StreamingTV.xYes 0.8858400 0.3899902 2.271 0.023120
## StreamingMovies.xNo.internet.service NA NA NA NA
## StreamingMovies.xYes 0.8601929 0.3898006 2.207 0.027331
## Contract.xOne.year -0.7130903 0.1326759 -5.375 7.67e-08
## Contract.xTwo.year -1.3852640 0.2120211 -6.534 6.42e-11
## PaperlessBilling 0.3479089 0.0897581 3.876 0.000106
## PaymentMethod.xCredit.card..automatic. 0.0036981 0.1382275 0.027 0.978656
## PaymentMethod.xElectronic.check 0.3835965 0.1151436 3.331 0.000864
## PaymentMethod.xMailed.check -0.0185990 0.1397057 -0.133 0.894091
## tenure -1.4985032 0.1835281 -8.165 3.21e-16
## MonthlyCharges -1.9939708 1.1354690 -1.756 0.079075
## TotalCharges 0.7512229 0.1925731 3.901 9.58e-05
##
## (Intercept) *
## gender
## SeniorCitizen *
## Partner
## Dependents .
## PhoneService
## MultipleLines **
## InternetService.xFiber.optic *
## InternetService.xNo **
## OnlineSecurity.xNo.internet.service
## OnlineSecurity.xYes
## OnlineBackup.xNo.internet.service
## OnlineBackup.xYes
## DeviceProtection.xNo.internet.service
## DeviceProtection.xYes
## TechSupport.xNo.internet.service
## TechSupport.xYes
## StreamingTV.xNo.internet.service
## StreamingTV.xYes *
## StreamingMovies.xNo.internet.service
## StreamingMovies.xYes *
## Contract.xOne.year ***
## Contract.xTwo.year ***
## PaperlessBilling ***
## PaymentMethod.xCredit.card..automatic.
## PaymentMethod.xElectronic.check ***
## PaymentMethod.xMailed.check
## tenure ***
## MonthlyCharges .
## TotalCharges ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5704.4 on 4929 degrees of freedom
## Residual deviance: 4009.0 on 4906 degrees of freedom
## AIC: 4057
##
## Number of Fisher Scoring iterations: 6
################# bservations on the Train set ##########################
pred_train <- predict(model, data = train, type = "response")
summary(pred_train)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001097 0.039613 0.188353 0.265314 0.466860 0.868097
# confusion matrix on training set
cmatrix_train <- table(train$Churn, pred_train >= 0.5)
cmatrix_train
##
## FALSE TRUE
## 0 3254 368
## 1 559 749
#Accuracy
mod_train_acc <- ((cmatrix_train[1,1]+cmatrix_train[2,2]))/nrow(train)
# Misclassification rate or Error Rate
mod_train_mr <- 1-mod_train_acc
# True positive rate / Sensitivity , tpr = tp/(tp+fn)
mod_train_tpr <- cmatrix_train[2,2] / (cmatrix_train[2,2] + cmatrix_train[1,2])
#False positive rate fpr = fp/(fp+tn)
mod_train_fpr <- cmatrix_train[2,1] / (cmatrix_train[2,1] + cmatrix_train[1,1])
# Specificity Spec tnr = tn/(tn+fp)
mod_train_tnr <- cmatrix_train[1,1] / (cmatrix_train[1,1] + cmatrix_train[2,1])
#False Negative rate fnr = fn/(fn+tp)
mod_train_fnr <- cmatrix_train[1,2] / (cmatrix_train[1,2] + cmatrix_train[2,2])
#Negative predictive value (NPV) = tn/(tn+fn)
mod_train_npv <- cmatrix_train[1,1] / (cmatrix_train[1,1] + cmatrix_train[1,2])
#Positive predictive value / Precision ppv = tp/(tp+fp)
mod_train_ppv <- cmatrix_train[2,2] / (cmatrix_train[2,2] + cmatrix_train[2,1])
#Positive DLR(Diagnostic likelihood ratio) = tpr/fpr
mod_train_pdlr <- ( mod_train_tpr / mod_train_fpr )
#Negative DLR(Diagnostic likelihood ratio) = tnr/fnr
mod_train_ndlr <- ( mod_train_tnr / mod_train_fnr )
#F-Score = 2 x (ppv x tpr)/(ppv + tpr)
mod_train_fscore <- 2 * ((mod_train_ppv * mod_train_tpr) / (mod_train_ppv + mod_train_tpr ))
cat("************* Observations on the Train set ************* \n",
"Accuracy : ", mod_train_acc,"\n" ,
"Misclassification rate or Error Rate : ", mod_train_mr,"\n" ,
"Sensitivity (TPR) : ", mod_train_tpr,"\n" ,
"Specificity (TNR) : ", mod_train_tnr,"\n" ,
"False Negative Rate (FNR) : ", mod_train_fnr,"\n" ,
"False positive rate (FPR) : ", mod_train_fpr,"\n" ,
"Negative predictive value (NPV) : ", mod_train_npv,"\n" ,
"Positive predictive value / Precision (PPV) : ", mod_train_ppv,"\n" ,
"Positive DLR(Diagnostic likelihood ratio) : ", mod_train_pdlr,"\n" ,
"Negative DLR(Diagnostic likelihood ratio) : ", mod_train_ndlr,"\n" ,
"F-Score : ", mod_train_fscore,"\n" )
## ************* Observations on the Train set *************
## Accuracy : 0.8119675
## Misclassification rate or Error Rate : 0.1880325
## Sensitivity (TPR) : 0.6705461
## Specificity (TNR) : 0.8533963
## False Negative Rate (FNR) : 0.3294539
## False positive rate (FPR) : 0.1466037
## Negative predictive value (NPV) : 0.8983987
## Positive predictive value / Precision (PPV) : 0.57263
## Positive DLR(Diagnostic likelihood ratio) : 4.573868
## Negative DLR(Diagnostic likelihood ratio) : 2.590336
## F-Score : 0.617732
################# Observations on the test set ##########################
pred_test <- predict(model, newdata = test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# confusion matrix on training set
cmatrix_test <- table(test$Churn, pred_test >= 0.5)
cmatrix_test
##
## FALSE TRUE
## 0 1387 165
## 1 257 304
#Accuracy = (TP+TN)/(TN+FN+TP+FP)
mod_test_acc <- ((cmatrix_test[1,1]+cmatrix_test[2,2]))/nrow(test)
# Misclassification rate or Error Rate
mod_test_mr <- 1-mod_test_acc
# True positive rate / Sensitivity , tpr = tp/(tp+fn)
mod_test_tpr <- cmatrix_test[2,2] / (cmatrix_test[2,2] + cmatrix_test[1,2])
#False positive rate fpr = fp/(fp+tn)
mod_test_fpr <- cmatrix_test[2,1] / (cmatrix_test[2,1] + cmatrix_test[1,1])
# Specificity Spec tnr = tn/(tn+fp)
mod_test_tnr <- cmatrix_test[1,1] / (cmatrix_test[1,1] + cmatrix_test[2,1])
#False Negative rate fnr = fn/(fn+tp)
mod_test_fnr <- cmatrix_test[1,2] / (cmatrix_test[1,2] + cmatrix_test[2,2])
#Negative predictive value (NPV) = tn/(tn+fn)
mod_test_npv <- cmatrix_test[1,1] / (cmatrix_test[1,1] + cmatrix_test[1,2])
#Positive predictive value / Precision ppv = tp/(tp+fp)
mod_test_ppv <- cmatrix_test[2,2] / (cmatrix_test[2,2] + cmatrix_test[2,1])
#Positive DLR(Diagnostic likelihood ratio) = tpr/fpr
mod_test_pdlr <- ( mod_test_tpr / mod_test_fpr )
#Negative DLR(Diagnostic likelihood ratio) = tnr/fnr
mod_test_ndlr <- ( mod_test_tnr / mod_test_fnr )
#F-Score = 2 x (ppv x tpr)/(ppv + tpr)
mod_test_fscore <- 2 * ((mod_test_ppv * mod_test_tpr) / (mod_test_ppv + mod_test_tpr ))
cat("************* Observations on the Test set ************* \n",
"Accuracy : ", mod_test_acc,"\n" ,
"Misclassification rate or Error Rate : ", mod_test_mr,"\n" ,
"Sensitivity (TPR) : ", mod_test_tpr,"\n" ,
"Specificity (TNR) : ", mod_test_tnr,"\n" ,
"False Negative Rate (FNR) : ", mod_test_fnr,"\n" ,
"False positive rate (FPR) : ", mod_test_fpr,"\n" ,
"Negative predictive value (NPV) : ", mod_test_npv,"\n" ,
"Positive predictive value / Precision (PPV) : ", mod_test_ppv,"\n" ,
"Positive DLR(Diagnostic likelihood ratio) : ", mod_test_pdlr,"\n" ,
"Negative DLR(Diagnostic likelihood ratio) : ", mod_test_ndlr,"\n" ,
"F-Score : ", mod_test_fscore,"\n" )
## ************* Observations on the Test set *************
## Accuracy : 0.800284
## Misclassification rate or Error Rate : 0.199716
## Sensitivity (TPR) : 0.6481876
## Specificity (TNR) : 0.843674
## False Negative Rate (FNR) : 0.3518124
## False positive rate (FPR) : 0.156326
## Negative predictive value (NPV) : 0.8936856
## Positive predictive value / Precision (PPV) : 0.5418895
## Positive DLR(Diagnostic likelihood ratio) : 4.146383
## Negative DLR(Diagnostic likelihood ratio) : 2.398079
## F-Score : 0.5902913