#Customer churn
#abstract-
#The biggest problem a Telco company faces is of Churned Customers.
#Churning is a term used in this industry to describe whether the consumer or the user is going to continue the service with the company any further or not.
#Churning has a huge impact on Revenue of a company. If the company predicts the churn rate of the customers with high accuracy,
#it gives the company a guesstimate of how its revenues would look like and in turn give it freedom to plan finances ahead.
#problem statement-
#n this project we are more concerned about the customers who are going to get churned.
#We are going to predict whether the customer be Churned or not on the basis of its billing information and customer demographics.
#1
par(mfrow=c(1,1))
df<- read.csv("C:\\Users\\xholi\\Downloads\\Telco-Customer-Churn.csv",
header = TRUE)
head(df)
## customerID gender SeniorCitizen Partner Dependents tenure PhoneService
## 1 7590-VHVEG Female 0 Yes No 1 No
## 2 5575-GNVDE Male 0 No No 34 Yes
## 3 3668-QPYBK Male 0 No No 2 Yes
## 4 7795-CFOCW Male 0 No No 45 No
## 5 9237-HQITU Female 0 No No 2 Yes
## 6 9305-CDSKC Female 0 No No 8 Yes
## MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection
## 1 No phone service DSL No Yes No
## 2 No DSL Yes No Yes
## 3 No DSL Yes Yes No
## 4 No phone service DSL Yes No Yes
## 5 No Fiber optic No No No
## 6 Yes Fiber optic No No Yes
## TechSupport StreamingTV StreamingMovies Contract PaperlessBilling
## 1 No No No Month-to-month Yes
## 2 No No No One year No
## 3 No No No Month-to-month Yes
## 4 Yes No No One year No
## 5 No No No Month-to-month Yes
## 6 No Yes Yes Month-to-month Yes
## PaymentMethod MonthlyCharges TotalCharges Churn
## 1 Electronic check 29.85 29.85 No
## 2 Mailed check 56.95 1889.50 No
## 3 Mailed check 53.85 108.15 Yes
## 4 Bank transfer (automatic) 42.30 1840.75 No
## 5 Electronic check 70.70 151.65 Yes
## 6 Electronic check 99.65 820.50 Yes
#loading libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(rpart)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(ROCR)
library(rpart.plot)
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(ggplot2)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(DT)
pacman::p_load(pacman,dplyr,GGally,ggvis,ggthemes,ggplot2,
rio,lubridate,shiny,tidyr,plotly,psych,rmarkdown,httr,tidyverse,stringr)
churn<-df
#converting the variables to factors
churn$gender<-as.factor(churn$gender)
churn$Partner<-as.factor(churn$Partner)
churn$Dependents<-as.factor(churn$Dependents)
churn$PhoneService<-as.factor(churn$PhoneService)
churn$MultipleLines<-as.factor(churn$MultipleLines)
churn$InternetService<-as.factor(churn$InternetService)
churn$OnlineBackup<-as.factor(churn$OnlineBackup)
churn$OnlineSecurity<-as.factor(churn$OnlineSecurity)
churn$DeviceProtection<-as.factor(churn$DeviceProtection)
churn$TechSupport<-as.factor(churn$TechSupport)
churn$StreamingTV<-as.factor(churn$StreamingTV)
churn$StreamingMovies<-as.factor(churn$StreamingMovies)
churn$Contract<-as.factor(churn$Contract)
churn$PaperlessBilling<-as.factor(churn$PaperlessBilling)
churn$PaymentMethod<-as.factor(churn$PaymentMethod)
churn$SeniorCitizen<-as.factor(churn$SeniorCitizen)
churn$Churn<-as.factor(churn$Churn)
str(churn)
## 'data.frame': 7043 obs. of 21 variables:
## $ customerID : chr "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
## $ gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
## $ SeniorCitizen : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
## $ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
## $ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
## $ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...
#create a copy of the data set but without the 1st variable customer id
churn_copy <- churn[,-1]
sum(is.na(churn_copy))
## [1] 11
#We can see that there are 11 entries in our data set which are empty or
#have NA values. The number of Na's as compared to the whole data set is relatively less,
#and hence we would drop these 11 values.
#CLEANING THE DATA SET AND CREATING TRAIN-TEST SPLITS
set.seed(999)
churn_final <- churn_copy[complete.cases(churn_copy),] # THIS RETURNS ONLY THE COMPLETE CASES IN OUR STUDY
intrain <- createDataPartition(y = churn_final$Churn, p = 0.8, list = FALSE)
training <- churn_final[intrain, ]
testing <- churn_final[-intrain, ]
#Visualizing the data by specific variables#
#number of customers by gender
p<-churn_final$gender
plot(p,col=rainbow(2),
main="NUMBER OF CUSTOMERS BY GENDER",
ylab="Number of Customers",
xlab="Gender")
box()

summary(churn_final$gender)
## Female Male
## 3483 3549
#there are 3549 Males and 3483 Females in our data set
# gender population almost if not evenly balanced
#number of customers who churned by gender
p<-churn_final$gender[churn_final$Churn=="Yes"]
plot(p,col=rainbow(2),
main="NUMBER OF CUSTOMERS WHO CHURNED BY GENDER",
ylab="Number of Customers",
xlab="Gender")
box()

summary(churn_final$gender[churn_final$Churn=="Yes"])
## Female Male
## 939 930
#939 Females churned and 930 Males also churned
# in total 1869 Customers churned away from Teleco Company
#that is around 26.5% of our customers churned and this is a huge deal since it means that our revenue will be 26.5 slightly short
#the churning with respect to gender is almost evenly balanced
#number of customers who churned by Senior Citizen
p<-churn_final$SeniorCitizen[churn_final$Churn=="Yes"]
plot(p,
main="NUMBER OF CUSTOMERS CHURNED BY SENIOR CITIZEN",
ylab="Number of Customers",
xlab="Senior Citizen",col=rainbow(2))
box()

summary(churn_final$SeniorCitizen[churn_final$Churn=="Yes"])
## 0 1
## 1393 476
#there are about 5890 Senior Citizen in our data and 1142 which are not Senior Citizen
# and 1393 of those Senior Citizens Churned while only 476 non Senior Citizens also Churned
#its clear that Senior Citizens are most likely to churn.
#number of customers who churned by Partner
p<-churn_final$Partner[churn_final$Churn=="Yes"]
plot(p,
main="NUMBER OF CUSTOMERS CHURNED BY PARTNER",
ylab="Number of Customers",
xlab="Partner",col=rainbow(2))
box()

summary(churn_final$Partner[churn_final$Churn=="Yes"])
## No Yes
## 1200 669
summary(churn_final$Partner)
## No Yes
## 3639 3393
#There are 3393 Partners with Teleco and 3639 which are not
#among the Partners, 669 Churned while 1200 non-Partners also Churned
#its clear that non partners are more likly to churn
#number of customers who churned by Dependents
p<-churn_final$Dependents[churn_final$Churn=="Yes"]
plot(p,
main="NUMBER OF CUSTOMERS CHURNED BY DEPENDENTS",
ylab="Number of Customers",
xlab="Dependents",col=rainbow(2))
box()

summary(churn_final$Dependents[churn_final$Churn=="Yes"])
## No Yes
## 1543 326
summary(churn_final$Dependents)
## No Yes
## 4933 2099
#There are 4933 Customers with no dependents and 2099 with dependents
#1543 Customers with no Dependents churned while the Customers who have Dependents only churned about 326
# its apparent that Customers with no Dependents are most likely to Churn than those with Dependents
par(mfrow=c(2,1))
#number of customers who churned by TENURE
p<-churn_final$tenure
boxplot(p,
main="BOX PLOT OF TENURE",
xlab="TENURE",col="red",horizontal = T)
box()
summary(churn_final$tenure[churn_final$Churn=="Yes"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 10.00 17.98 29.00 72.00
p<-churn_final$tenure[churn_final$Churn=="Yes"]
boxplot(p,horizontal = T,col = "blue",
xlab="Tenure in years",
main="BOX PLOT OF TENURE BY CHURNED CUSTOMERS")

par(mfrow=c(1,1))
#from the box plot we see that the average tenure of Customers who Churned is about 10 yrs
#while that of those who have not Churned is at about 28 years
#so to help minimize the number of customers who leave, i suggest we start targeting
#customers with tenures of > 3 years and less than 10
# if we target this group of Customers we might reduce the Percentage of Churn significantly
#number of customers who churned by Contract
p<-churn_final$Contract[churn_final$Churn=="Yes"]
plot(p,
main="NUMBER OF CUSTOMERS CHURNED BY TYPE OF CONTRACT",
ylab="Number of Customers",
xlab="Contract type",col=rainbow(3))
box()

summary(churn_final$Contract[churn_final$Churn=="Yes"])
## Month-to-month One year Two year
## 1655 166 48
summary(churn_final$Contract)
## Month-to-month One year Two year
## 3875 1472 1685
#There are three types of Contracts, Month-to-Month, One year, Two years
#We have a total of 3875 Month-to-Month Customers, 1472 One year, and 1685 Two year Contract Customers
#from the Monthly basis we had 1655 Customers who Churned,from the ONE YEAR, we had 166 Customers and lastly
#From the TWO year we had 48 Customers who Churned
# its clear that Month-to-Month Customers are more likely to Churn than others ,
#followed by One year Contract Customers.
#this are the target market if we want to reduce the number of Churns
par(mfrow=c(2,1))
#number of customers who churned by Total Charges
p<-churn_final$TotalCharges[churn_final$Churn=="Yes"]
boxplot(p,
main="BOX PLOT OF TOTAL CHARGES FOR THOSE WHO CHURNED",
xlab="TOTAL CHARGES",horizontal = T,col = "pink")
box()
p<-churn_final$TotalCharges
boxplot(p,
main="BOX PLOT OF TOTAL CHARGES",
xlab="TOTAL CHARGES",horizontal = T,col = "purple")

summary(churn_final$TotalCharges[churn_final$Churn=="Yes"])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.85 134.50 703.55 1531.80 2331.30 8684.80
summary(churn_final$TotalCharges)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.8 401.4 1397.5 2283.3 3794.7 8684.8
#the average Charge for those who churned is $1531.80 while that of the entire population is
# $2283.3
#this implies a profit loss of about $2862934 or $2.86M since the average Charge for those who churned is $1531.80
# this is a lot of revenue loss, due to loss of customers.
summary(churn_final$Churn)
## No Yes
## 5163 1869
par(mfrow=c(1,1))
#Modeling
#Model 1
#Stepwise Logistic Regression Model
md_glm <- glm(as.factor(Churn) ~ ., data = training, family = binomial(logit))
md_step <- step(md_glm, direction = "both", steps = 10000, trace = FALSE)
predictions_step <- predict(md_step, testing, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
#To decide the cut off point
pred_step <- prediction(predictions_step, testing$Churn)
plot(performance(pred_step, "tpr", "fpr"), colorize = TRUE)

auc_step <- performance(pred_step, "auc")
auc_step
## A performance instance
## 'Area under the ROC curve'
roc_step <- roc(response = testing$Churn, predictor = predictions_step)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
d <- coords(roc_step, "best", "threshold",transpose = T)
d
## threshold specificity sensitivity
## 0.1962807 0.6375969 0.8981233
#stepwise logistic reg
# threshold specificity sensitivity
# 0.1962807 0.6375969 0.8981233
#We find that the point 0.1962807 is the best cut-off point to distinguish the class probabilities into 0's and 1's.
#We now form the Predicted scores using ifelse loops.
str(testing)
## 'data.frame': 1405 obs. of 20 variables:
## $ gender : Factor w/ 2 levels "Female","Male": 1 2 1 2 2 1 1 2 1 1 ...
## $ SeniorCitizen : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
## $ Partner : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 2 1 2 2 1 ...
## $ Dependents : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 2 1 1 ...
## $ tenure : int 1 2 58 1 1 72 11 10 13 15 ...
## $ PhoneService : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
## $ MultipleLines : Factor w/ 3 levels "No","No phone service",..: 2 1 3 1 1 3 3 1 3 1 ...
## $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 3 1 2 2 1 1 2 ...
## $ OnlineSecurity : Factor w/ 3 levels "No","No internet service",..: 1 3 1 2 1 3 1 1 3 3 ...
## $ OnlineBackup : Factor w/ 3 levels "No","No internet service",..: 3 3 3 2 1 3 1 3 3 3 ...
## $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 1 1 2 1 1 3 1 1 1 ...
## $ TechSupport : Factor w/ 3 levels "No","No internet service",..: 1 1 3 2 1 3 1 1 3 1 ...
## $ StreamingTV : Factor w/ 3 levels "No","No internet service",..: 1 1 1 2 1 3 3 1 3 3 ...
## $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 2 1 1 3 1 1 3 ...
## $ Contract : Factor w/ 3 levels "Month-to-month",..: 1 1 3 1 1 3 1 2 1 1 ...
## $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 2 2 1 1 1 2 1 2 2 ...
## $ PaymentMethod : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 2 1 1 1 1 4 3 2 ...
## $ MonthlyCharges : num 29.9 53.9 59.9 20.2 45.2 ...
## $ TotalCharges : num 29.9 108.2 3505.1 20.2 45.2 ...
## $ Churn : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 2 1 1 2 ...
#Forming Confusion-matrix from the decided cut-off point
Churn_step <- ifelse(predictions_step >= d[[1]],
"Yes", "No")
cm_1 <- confusionMatrix(table(testing$Churn,Churn_step))
cm_1
## Confusion Matrix and Statistics
##
## Churn_step
## No Yes
## No 658 374
## Yes 38 335
##
## Accuracy : 0.7068
## 95% CI : (0.6822, 0.7305)
## No Information Rate : 0.5046
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4161
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9454
## Specificity : 0.4725
## Pos Pred Value : 0.6376
## Neg Pred Value : 0.8981
## Prevalence : 0.4954
## Detection Rate : 0.4683
## Detection Prevalence : 0.7345
## Balanced Accuracy : 0.7089
##
## 'Positive' Class : No
##
#From the confusion matrix for stepwise logisitic model we find that the Accuracy of the model is 0.7068
#& Specificity is 0.4725.
#Model 2
#Random-Forest modeling
tc_rf <- trainControl(method = "repeatedcv",repeats = 2,number = 3, search = "random")
rf_train <- train(Churn ~ ., data = training, method = "rf", trainControl = tc_rf)
plot(varImp(rf_train, scale = F))

predict_rftrain <- predict(rf_train, testing)
cm_2 <- confusionMatrix(predict_rftrain, testing$Churn)
cm_2
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 980 252
## Yes 52 121
##
## Accuracy : 0.7836
## 95% CI : (0.7612, 0.8049)
## No Information Rate : 0.7345
## P-Value [Acc > NIR] : 1.204e-05
##
## Kappa : 0.3306
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9496
## Specificity : 0.3244
## Pos Pred Value : 0.7955
## Neg Pred Value : 0.6994
## Prevalence : 0.7345
## Detection Rate : 0.6975
## Detection Prevalence : 0.8769
## Balanced Accuracy : 0.6370
##
## 'Positive' Class : No
##
# we find that the accuracy of model 2 is 0.7836 and the sensitivity is 0.9496
#Monthly charge as a function of Payment Method per Churn Status
ggplot(training,aes(x=PaymentMethod ,
y=MonthlyCharges,col=Churn))+
geom_point()+
labs(title = "Monthly Charge by Payment Method as a func of Churn")

#it seem that when we examine Monthly charges as a function of Payment method,
# most Customers that payed by Electronic check based o their monthly charges are more likely to churn
#from the plot it suggests that Customers who pay by Mailed check are Charged averagely more than the other methods of payment
#but that is in terms of the base price not the max price possible to charge
#in terms of Max Charge ,Bank transfers takes the spot
# tenure as a function of payment method per Churn status
ggplot(training,aes(x=PaymentMethod ,
y=tenure,col=Churn))+
geom_point()+
labs(title = "Tunure by Payment Method as a func of Churn")

#its clear that Customers who paid by Mailed Check have the lowest churn rate
# it seems like those that pay by Electronic check churned the most followed by
#Credit card and Bank transfer
# tenure as a function of monthly charge per Churn status
ggplot(training,aes(x=tenure ,
y=MonthlyCharges,col=Contract ))+
geom_point()+
labs(title = "Tunure by Monthly Charge ")

#it seems like there are more Month-to-Month Customers and they are the ones more likely to Churn
#suggestion, look at ways to convert Month-to-Month customers into One or Two Year contracts,
#this will reduce the number of Churns and increase Revenues
#maybe offer loyalty points and discounts to Monthly Customers with options to increase their Contract tenure
#Model 3
#Decision Trees - RPART
rp <- rpart(Churn ~ ., data = training, method = "class")
predictions_rp <- predict(rp, testing, type = "class")
rpart.plot(rp, type=1,
extra=100,
branch.lty=3,
box.palette="RdYlGn",
tweak = 1.2,
fallen.leaves = FALSE)

cm_rpart <- confusionMatrix(predictions_rp, testing$Churn)
cm_rpart
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 938 211
## Yes 94 162
##
## Accuracy : 0.7829
## 95% CI : (0.7604, 0.8042)
## No Information Rate : 0.7345
## P-Value [Acc > NIR] : 1.591e-05
##
## Kappa : 0.3814
##
## Mcnemar's Test P-Value : 3.092e-11
##
## Sensitivity : 0.9089
## Specificity : 0.4343
## Pos Pred Value : 0.8164
## Neg Pred Value : 0.6328
## Prevalence : 0.7345
## Detection Rate : 0.6676
## Detection Prevalence : 0.8178
## Balanced Accuracy : 0.6716
##
## 'Positive' Class : No
##
# the accuracy of model 3 is 0.7829 and the sensitivity is 0.9089
#Model 4
#Principal Component Analysis + Rpart modeling
#As majority of our variables are Factor variables, we create dummy variables for our data to perform Principal Component Analysis.
dummy.data <- dummy.data.frame(churn_final)
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
intrain.pca <- createDataPartition(y = dummy.data$ChurnYes, p = 0.8, list = FALSE)
pca.train <- dummy.data[intrain.pca, ]
pca.test <- dummy.data[-intrain.pca, ]
pca.train1 <- pca.train[,-c(47,48)] #Removing the dependent variables
pca.test1 <- pca.test[,-c(47,48)] #Removing the dependent variables
pr_train <- prcomp(pca.train1, center = T, scale. = T)
summary(pr_train)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 3.3871 2.4623 2.0783 1.60761 1.50172 1.41525 1.36926
## Proportion of Variance 0.2494 0.1318 0.0939 0.05618 0.04902 0.04354 0.04076
## Cumulative Proportion 0.2494 0.3812 0.4751 0.53129 0.58031 0.62385 0.66461
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 1.25810 1.22258 1.16020 1.14358 1.1268 1.09126 1.06319
## Proportion of Variance 0.03441 0.03249 0.02926 0.02843 0.0276 0.02589 0.02457
## Cumulative Proportion 0.69902 0.73151 0.76078 0.78921 0.8168 0.84269 0.86727
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 1.05986 1.00686 0.96760 0.95985 0.93065 0.90264 0.60848
## Proportion of Variance 0.02442 0.02204 0.02035 0.02003 0.01883 0.01771 0.00805
## Cumulative Proportion 0.89169 0.91372 0.93408 0.95411 0.97293 0.99065 0.99870
## PC22 PC23 PC24 PC25 PC26 PC27
## Standard deviation 0.24303 0.03080 2.479e-14 2.401e-14 1.894e-14 8.08e-15
## Proportion of Variance 0.00128 0.00002 0.000e+00 0.000e+00 0.000e+00 0.00e+00
## Cumulative Proportion 0.99998 1.00000 1.000e+00 1.000e+00 1.000e+00 1.00e+00
## PC28 PC29 PC30 PC31 PC32
## Standard deviation 6.515e-15 5.491e-15 4.756e-15 3.763e-15 3.514e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC33 PC34 PC35 PC36 PC37
## Standard deviation 2.865e-15 2.8e-15 2.344e-15 1.642e-15 1.373e-15
## Proportion of Variance 0.000e+00 0.0e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.0e+00 1.000e+00 1.000e+00 1.000e+00
## PC38 PC39 PC40 PC41 PC42
## Standard deviation 1.179e-15 5.429e-16 2.754e-16 2.754e-16 2.754e-16
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
## PC43 PC44 PC45 PC46
## Standard deviation 2.754e-16 2.754e-16 2.754e-16 1.628e-16
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion 1.000e+00 1.000e+00 1.000e+00 1.000e+00
plot(pr_train, type = "l")
box()

biplot(pr_train, scale = T)

#principal component as a function of variance
pca_var <- pr_train$sdev ^ 2
pca_var_prop <- pca_var / sum(pca_var)
plot(cumsum(pca_var_prop),
type = 'b',
ylab = "Cummulative Variance",
xlab = "Principal Components",
main = "Principal Components VS Variance Explained")

#The summary of the Principal components tells us that the first 15 components explain 89% variance and hence we select the first 15 PCs and move forward to perform Decision trees prediction.
#We feed these 15 PCs to the RPART model to predict whether the customer be churned or not.
#The biplot suggests the directions of all the variables. It states that Tenure, Contract, Partner etc convey the same data value and explain the same variances as these vectors are in the same direction.
#The Cumulative variance plot states that first 15 PCs explain almost 90% variance and PC's after 23 are redundant and should be dropped as 23 PC's explain 100% variance.
# model with only the fist 15 variables
pca.final.train <- data.frame(ChurnYes = pca.train$ChurnYes, pr_train$x)
pca.final.train1 <- pca.final.train[,1:15] #Selecting the first 15 Principal components
pca.final.test <- predict(pr_train, pca.test1)
pca.final.test <- as.data.frame(pca.final.test)
pca.final.test1 <- pca.final.test[,1:15]
rpart_pca <- rpart(ChurnYes ~ ., data = pca.final.train1)
predict_rpart.pca <- predict(rpart_pca, newdata = pca.final.test1)
pred_rpart_pca <- prediction(predict_rpart.pca, pca.test$ChurnNo)
perf_rpart_pca <- performance(pred_rpart_pca, "tpr", "fpr")
plot(perf_rpart_pca, colorize = TRUE)

myroc <- roc(pca.test$ChurnNo, predict_rpart.pca)
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
c_pca <- coords(myroc, "best", ret = "threshold",transpose = T)
churned <- ifelse(predict_rpart.pca > c_pca[[1]], 1, 0)
cm_pca <- confusionMatrix(table(churned, pca.test$ChurnYes))
cm_pca
## Confusion Matrix and Statistics
##
##
## churned 0 1
## 0 704 65
## 1 336 301
##
## Accuracy : 0.7148
## 95% CI : (0.6904, 0.7383)
## No Information Rate : 0.7397
## P-Value [Acc > NIR] : 0.9839
##
## Kappa : 0.4027
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.6769
## Specificity : 0.8224
## Pos Pred Value : 0.9155
## Neg Pred Value : 0.4725
## Prevalence : 0.7397
## Detection Rate : 0.5007
## Detection Prevalence : 0.5469
## Balanced Accuracy : 0.7497
##
## 'Positive' Class : 0
##
#the accuracy of model 4 is 0.7148 and the sensitivity is 0.6769
#Model Specificty Accuracy
#Logistic 0.4725 0.7068
#Random Forest 0.3244 0.7836
#RPART 0.4343 0.7829
#PCA + RPART 0.8224 0.7148
##Conclusion
#According to our problem statement,
#PCA + Decisison trees yeilds SPECIFICITY of 82%,
#better than any other model and hence we select it as a final model.
#If Accuracy is taken into account, then Random Forest is our model of choice
#Model 5
#with 5 variables to predict Churn
#tenure,Total Charge,Monthly Charge,Internet Service Fiber optic,Contract Two year
#Stepwise Logistic Regression Model
md2_glm <- glm(as.factor(Churn) ~ tenure+TotalCharges+
MonthlyCharges+InternetService+Contract, data = training, family = binomial(logit))
md2_step <- step(md2_glm, direction = "both", steps = 10000, trace = FALSE)
predictions_step2 <- predict(md2_step, testing, type = "response")
#To decide the cut off point
pred_step2 <- prediction(predictions_step2, testing$Churn)
plot(performance(pred_step2, "tpr", "fpr"), colorize = TRUE)

auc_step2 <- performance(pred_step2, "auc")
auc_step2
## A performance instance
## 'Area under the ROC curve'
roc_step2<- roc(response = testing$Churn, predictor = predictions_step)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
z <- coords(roc_step2, "best", "threshold",transpose = T)
z
## threshold specificity sensitivity
## 0.1962807 0.6375969 0.8981233
#stepwise logistic reg with the 1st 5 variables
# threshold specificity sensitivity
# 0.1962807 0.6375969 0.8981233
#the same as model 1
#We find that the point is the best cut-off point to distinguish the class probabilities into 0's and 1's.
#We now form the Predicted scores using ifelse loops.
#Forming Confusion-matrix from the decided cut-off point
Churn_step <- ifelse(predictions_step2 >= z[[1]],
"Yes", "No")
cm_5 <- confusionMatrix(table(testing$Churn,Churn_step))
cm_5
## Confusion Matrix and Statistics
##
## Churn_step
## No Yes
## No 641 391
## Yes 48 325
##
## Accuracy : 0.6875
## 95% CI : (0.6626, 0.7117)
## No Information Rate : 0.5096
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3807
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9303
## Specificity : 0.4539
## Pos Pred Value : 0.6211
## Neg Pred Value : 0.8713
## Prevalence : 0.4904
## Detection Rate : 0.4562
## Detection Prevalence : 0.7345
## Balanced Accuracy : 0.6921
##
## 'Positive' Class : No
##
# the model Accuracy is 0.7148
# Sensitivity 0.6769
# and Specificity 0.8224
# Model 5 has the same Specificity as PCA+RPART Model