Customer-churn-Telco.R

#Customer churn 

#abstract-
#The biggest problem a Telco company faces is of Churned Customers.
#Churning is a term used in this industry to describe whether the consumer or the user is going to continue the service with the company any further or not.
#Churning has a huge impact on Revenue of a company. If the company predicts the churn rate of the customers with high accuracy, 
#it gives the company a guesstimate of how its revenues would look like and in turn give it freedom to plan finances ahead.



#problem statement-
#n this project we are more concerned about the customers who are going to get churned.
#We are going to predict whether the customer be Churned or not on the basis of its billing information and customer demographics.


#1
par(mfrow=c(1,1))
df<- read.csv("C:\\Users\\xholi\\Downloads\\Telco-Customer-Churn.csv",
              header = TRUE)             
head(df)

##   customerID gender SeniorCitizen Partner Dependents tenure PhoneService
## 1 7590-VHVEG Female             0     Yes         No      1           No
## 2 5575-GNVDE   Male             0      No         No     34          Yes
## 3 3668-QPYBK   Male             0      No         No      2          Yes
## 4 7795-CFOCW   Male             0      No         No     45           No
## 5 9237-HQITU Female             0      No         No      2          Yes
## 6 9305-CDSKC Female             0      No         No      8          Yes
##      MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection
## 1 No phone service             DSL             No          Yes               No
## 2               No             DSL            Yes           No              Yes
## 3               No             DSL            Yes          Yes               No
## 4 No phone service             DSL            Yes           No              Yes
## 5               No     Fiber optic             No           No               No
## 6              Yes     Fiber optic             No           No              Yes
##   TechSupport StreamingTV StreamingMovies       Contract PaperlessBilling
## 1          No          No              No Month-to-month              Yes
## 2          No          No              No       One year               No
## 3          No          No              No Month-to-month              Yes
## 4         Yes          No              No       One year               No
## 5          No          No              No Month-to-month              Yes
## 6          No         Yes             Yes Month-to-month              Yes
##               PaymentMethod MonthlyCharges TotalCharges Churn
## 1          Electronic check          29.85        29.85    No
## 2              Mailed check          56.95      1889.50    No
## 3              Mailed check          53.85       108.15   Yes
## 4 Bank transfer (automatic)          42.30      1840.75    No
## 5          Electronic check          70.70       151.65   Yes
## 6          Electronic check          99.65       820.50   Yes

#loading libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(rpart)
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(ROCR)
library(rpart.plot)
library(dummies)

## dummies-1.5.6 provided by Decision Patterns

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

library(ggplot2)
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(DT)
pacman::p_load(pacman,dplyr,GGally,ggvis,ggthemes,ggplot2,
               rio,lubridate,shiny,tidyr,plotly,psych,rmarkdown,httr,tidyverse,stringr)

churn<-df


#converting the variables to factors
churn$gender<-as.factor(churn$gender)
churn$Partner<-as.factor(churn$Partner)
churn$Dependents<-as.factor(churn$Dependents)
churn$PhoneService<-as.factor(churn$PhoneService)
churn$MultipleLines<-as.factor(churn$MultipleLines)
churn$InternetService<-as.factor(churn$InternetService)
churn$OnlineBackup<-as.factor(churn$OnlineBackup)
churn$OnlineSecurity<-as.factor(churn$OnlineSecurity)
churn$DeviceProtection<-as.factor(churn$DeviceProtection)
churn$TechSupport<-as.factor(churn$TechSupport)
churn$StreamingTV<-as.factor(churn$StreamingTV)
churn$StreamingMovies<-as.factor(churn$StreamingMovies)
churn$Contract<-as.factor(churn$Contract)
churn$PaperlessBilling<-as.factor(churn$PaperlessBilling)
churn$PaymentMethod<-as.factor(churn$PaymentMethod)
churn$SeniorCitizen<-as.factor(churn$SeniorCitizen)
churn$Churn<-as.factor(churn$Churn)
str(churn)

## 'data.frame':    7043 obs. of  21 variables:
##  $ customerID      : chr  "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
##  $ gender          : Factor w/ 2 levels "Female","Male": 1 2 2 2 1 1 2 1 1 2 ...
##  $ SeniorCitizen   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Partner         : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 2 1 ...
##  $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 2 ...
##  $ tenure          : int  1 34 2 45 2 8 22 10 28 62 ...
##  $ PhoneService    : Factor w/ 2 levels "No","Yes": 1 2 2 1 2 2 2 1 2 2 ...
##  $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 2 1 1 2 1 3 3 2 3 1 ...
##  $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 1 2 2 2 1 2 1 ...
##  $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 1 3 3 3 1 1 1 3 1 3 ...
##  $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 3 1 3 1 1 1 3 1 1 3 ...
##  $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 3 1 3 1 3 1 1 3 1 ...
##  $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 3 1 3 1 ...
##  $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 1 1 3 1 1 3 1 ...
##  $ Contract        : Factor w/ 3 levels "Month-to-month",..: 1 2 1 2 1 1 1 1 1 2 ...
##  $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 1 2 1 ...
##  $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 4 1 3 3 2 4 3 1 ...
##  $ MonthlyCharges  : num  29.9 57 53.9 42.3 70.7 ...
##  $ TotalCharges    : num  29.9 1889.5 108.2 1840.8 151.7 ...
##  $ Churn           : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 2 1 1 2 1 ...

#create a copy of the data set but without the 1st variable customer id
churn_copy <- churn[,-1] 
sum(is.na(churn_copy))

## [1] 11

#We can see that there are 11 entries in our data set which are empty or
#have NA values. The number of Na's as compared to the whole data set is relatively less,
#and hence we would drop these 11 values.

#CLEANING THE DATA SET AND CREATING TRAIN-TEST SPLITS
set.seed(999)
churn_final <- churn_copy[complete.cases(churn_copy),] # THIS RETURNS ONLY THE COMPLETE CASES IN OUR STUDY
intrain <- createDataPartition(y = churn_final$Churn, p = 0.8, list = FALSE)
training <- churn_final[intrain, ]
testing <- churn_final[-intrain, ]


#Visualizing the data by specific variables#

#number of customers by gender
p<-churn_final$gender

plot(p,col=rainbow(2),
     main="NUMBER OF CUSTOMERS BY GENDER",
     ylab="Number of Customers",
     xlab="Gender")
box()

summary(churn_final$gender)

## Female   Male 
##   3483   3549

#there are 3549 Males and 3483 Females in our data set
# gender population almost if not evenly balanced

#number of customers who churned by gender
p<-churn_final$gender[churn_final$Churn=="Yes"]

plot(p,col=rainbow(2),
     main="NUMBER OF CUSTOMERS WHO CHURNED BY GENDER",
     ylab="Number of Customers",
     xlab="Gender")
box()

summary(churn_final$gender[churn_final$Churn=="Yes"])

## Female   Male 
##    939    930

#939 Females churned and 930 Males also churned
# in total 1869 Customers churned away from Teleco Company
#that is around 26.5% of our customers churned and this is a huge deal since it means that our revenue will be 26.5 slightly short
#the churning with respect to gender is almost evenly balanced

#number of customers who churned  by Senior Citizen
p<-churn_final$SeniorCitizen[churn_final$Churn=="Yes"]

plot(p,
     main="NUMBER OF CUSTOMERS CHURNED BY SENIOR CITIZEN",
     ylab="Number of Customers",
     xlab="Senior Citizen",col=rainbow(2))
box()

summary(churn_final$SeniorCitizen[churn_final$Churn=="Yes"])

##    0    1 
## 1393  476

#there are about 5890 Senior Citizen in our data and 1142 which are not Senior Citizen
# and 1393 of those Senior Citizens Churned while only 476 non Senior Citizens also Churned
#its clear that Senior Citizens are most likely to churn.

#number of customers who churned  by Partner
p<-churn_final$Partner[churn_final$Churn=="Yes"]

plot(p,
     main="NUMBER OF CUSTOMERS CHURNED BY PARTNER",
     ylab="Number of Customers",
     xlab="Partner",col=rainbow(2))
box()

summary(churn_final$Partner[churn_final$Churn=="Yes"])

##   No  Yes 
## 1200  669

summary(churn_final$Partner)

##   No  Yes 
## 3639 3393

#There are 3393 Partners with Teleco and 3639 which are not
#among the Partners, 669 Churned while 1200 non-Partners also Churned
#its clear that non partners are more likly to churn

#number of customers who churned  by Dependents
p<-churn_final$Dependents[churn_final$Churn=="Yes"]

plot(p,
     main="NUMBER OF CUSTOMERS CHURNED BY DEPENDENTS",
     ylab="Number of Customers",
     xlab="Dependents",col=rainbow(2))
box()

summary(churn_final$Dependents[churn_final$Churn=="Yes"])

##   No  Yes 
## 1543  326

summary(churn_final$Dependents)

##   No  Yes 
## 4933 2099

#There are 4933 Customers with no dependents and 2099 with dependents
#1543 Customers with no Dependents churned while the Customers who have Dependents only churned about 326
# its apparent that Customers with no Dependents are most likely to Churn than those with Dependents


par(mfrow=c(2,1))
#number of customers who churned  by TENURE
p<-churn_final$tenure

boxplot(p,
     main="BOX PLOT OF TENURE",
     xlab="TENURE",col="red",horizontal = T)
box()
summary(churn_final$tenure[churn_final$Churn=="Yes"])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00   10.00   17.98   29.00   72.00

p<-churn_final$tenure[churn_final$Churn=="Yes"]
boxplot(p,horizontal = T,col = "blue",
        xlab="Tenure in years",
        main="BOX PLOT OF TENURE BY CHURNED CUSTOMERS")

par(mfrow=c(1,1))
#from the box plot we see that the average tenure of Customers who Churned is about 10 yrs
#while that of those who have not Churned is at about 28 years
#so to help minimize the number of customers who leave, i suggest we start targeting 
#customers with tenures of > 3 years and less than 10
# if we target this group of Customers we might reduce the Percentage of Churn significantly

#number of customers who churned  by Contract
p<-churn_final$Contract[churn_final$Churn=="Yes"]

plot(p,
     main="NUMBER OF CUSTOMERS CHURNED BY TYPE OF CONTRACT",
     ylab="Number of Customers",
     xlab="Contract type",col=rainbow(3))
box()

summary(churn_final$Contract[churn_final$Churn=="Yes"])

## Month-to-month       One year       Two year 
##           1655            166             48

summary(churn_final$Contract)

## Month-to-month       One year       Two year 
##           3875           1472           1685

#There are three types of Contracts, Month-to-Month, One year, Two years
#We have a total of 3875 Month-to-Month Customers, 1472 One year, and 1685 Two year Contract Customers
#from the Monthly basis we had 1655 Customers who Churned,from the ONE YEAR, we had 166 Customers and lastly 
#From the TWO year we had 48 Customers who Churned
# its clear that Month-to-Month Customers are more likely to Churn than others ,
#followed by One year Contract Customers.
#this are the target market if we want to reduce the number of Churns

par(mfrow=c(2,1))
#number of customers who churned  by Total Charges
p<-churn_final$TotalCharges[churn_final$Churn=="Yes"]

boxplot(p,
     main="BOX PLOT OF TOTAL CHARGES FOR THOSE WHO CHURNED",
     xlab="TOTAL CHARGES",horizontal = T,col = "pink")
box()
p<-churn_final$TotalCharges

boxplot(p,
        main="BOX PLOT OF TOTAL CHARGES",
        xlab="TOTAL CHARGES",horizontal = T,col = "purple")

summary(churn_final$TotalCharges[churn_final$Churn=="Yes"])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.85  134.50  703.55 1531.80 2331.30 8684.80

summary(churn_final$TotalCharges)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    18.8   401.4  1397.5  2283.3  3794.7  8684.8

#the average Charge for those who churned is $1531.80 while that of the entire population is
# $2283.3 
#this implies a profit loss of about $2862934 or $2.86M since the average Charge for those who churned is $1531.80
# this is a lot of revenue loss, due to loss of customers.

summary(churn_final$Churn)

##   No  Yes 
## 5163 1869

par(mfrow=c(1,1))
#Modeling
#Model 1

#Stepwise Logistic Regression Model
md_glm <- glm(as.factor(Churn) ~ ., data = training, family = binomial(logit))
md_step <- step(md_glm, direction = "both", steps = 10000, trace = FALSE)
predictions_step <- predict(md_step, testing, type = "response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

#To decide the cut off point
pred_step <- prediction(predictions_step, testing$Churn)
plot(performance(pred_step, "tpr", "fpr"), colorize = TRUE)

auc_step <- performance(pred_step, "auc")
auc_step

## A performance instance
##   'Area under the ROC curve'

roc_step <- roc(response = testing$Churn, predictor = predictions_step)

## Setting levels: control = No, case = Yes

## Setting direction: controls < cases

d <- coords(roc_step, "best", "threshold",transpose = T)
d

##   threshold specificity sensitivity 
##   0.1962807   0.6375969   0.8981233

#stepwise logistic reg
# threshold specificity sensitivity
# 0.1962807   0.6375969   0.8981233

#We find that the point 0.1962807 is the best cut-off point to distinguish the class probabilities into 0's and 1's.
#We now form the Predicted scores using ifelse loops.

str(testing)

## 'data.frame':    1405 obs. of  20 variables:
##  $ gender          : Factor w/ 2 levels "Female","Male": 1 2 1 2 2 1 1 2 1 1 ...
##  $ SeniorCitizen   : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
##  $ Partner         : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 2 1 2 2 1 ...
##  $ Dependents      : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 2 1 1 ...
##  $ tenure          : int  1 2 58 1 1 72 11 10 13 15 ...
##  $ PhoneService    : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 2 2 2 2 2 ...
##  $ MultipleLines   : Factor w/ 3 levels "No","No phone service",..: 2 1 3 1 1 3 3 1 3 1 ...
##  $ InternetService : Factor w/ 3 levels "DSL","Fiber optic",..: 1 1 1 3 1 2 2 1 1 2 ...
##  $ OnlineSecurity  : Factor w/ 3 levels "No","No internet service",..: 1 3 1 2 1 3 1 1 3 3 ...
##  $ OnlineBackup    : Factor w/ 3 levels "No","No internet service",..: 3 3 3 2 1 3 1 3 3 3 ...
##  $ DeviceProtection: Factor w/ 3 levels "No","No internet service",..: 1 1 1 2 1 1 3 1 1 1 ...
##  $ TechSupport     : Factor w/ 3 levels "No","No internet service",..: 1 1 3 2 1 3 1 1 3 1 ...
##  $ StreamingTV     : Factor w/ 3 levels "No","No internet service",..: 1 1 1 2 1 3 3 1 3 3 ...
##  $ StreamingMovies : Factor w/ 3 levels "No","No internet service",..: 1 1 1 2 1 1 3 1 1 3 ...
##  $ Contract        : Factor w/ 3 levels "Month-to-month",..: 1 1 3 1 1 3 1 2 1 1 ...
##  $ PaperlessBilling: Factor w/ 2 levels "No","Yes": 2 2 2 1 1 1 2 1 2 2 ...
##  $ PaymentMethod   : Factor w/ 4 levels "Bank transfer (automatic)",..: 3 4 2 1 1 1 1 4 3 2 ...
##  $ MonthlyCharges  : num  29.9 53.9 59.9 20.2 45.2 ...
##  $ TotalCharges    : num  29.9 108.2 3505.1 20.2 45.2 ...
##  $ Churn           : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 1 2 1 1 2 ...

#Forming Confusion-matrix from the decided cut-off point
Churn_step <- ifelse(predictions_step >= d[[1]], 
                     "Yes", "No")
cm_1 <- confusionMatrix(table(testing$Churn,Churn_step))
cm_1

## Confusion Matrix and Statistics
## 
##      Churn_step
##        No Yes
##   No  658 374
##   Yes  38 335
##                                           
##                Accuracy : 0.7068          
##                  95% CI : (0.6822, 0.7305)
##     No Information Rate : 0.5046          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4161          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9454          
##             Specificity : 0.4725          
##          Pos Pred Value : 0.6376          
##          Neg Pred Value : 0.8981          
##              Prevalence : 0.4954          
##          Detection Rate : 0.4683          
##    Detection Prevalence : 0.7345          
##       Balanced Accuracy : 0.7089          
##                                           
##        'Positive' Class : No              
##

#From the confusion matrix for stepwise logisitic model we find that the Accuracy of the model is 0.7068 
#& Specificity is 0.4725.


#Model 2

#Random-Forest modeling
tc_rf <- trainControl(method = "repeatedcv",repeats = 2,number = 3, search = "random")
rf_train <- train(Churn ~ ., data = training, method = "rf", trainControl = tc_rf)
plot(varImp(rf_train, scale = F))

predict_rftrain <- predict(rf_train, testing)
cm_2 <- confusionMatrix(predict_rftrain, testing$Churn)
cm_2

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  980 252
##        Yes  52 121
##                                           
##                Accuracy : 0.7836          
##                  95% CI : (0.7612, 0.8049)
##     No Information Rate : 0.7345          
##     P-Value [Acc > NIR] : 1.204e-05       
##                                           
##                   Kappa : 0.3306          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9496          
##             Specificity : 0.3244          
##          Pos Pred Value : 0.7955          
##          Neg Pred Value : 0.6994          
##              Prevalence : 0.7345          
##          Detection Rate : 0.6975          
##    Detection Prevalence : 0.8769          
##       Balanced Accuracy : 0.6370          
##                                           
##        'Positive' Class : No              
##

# we find that the accuracy of model 2 is 0.7836 and the sensitivity is 0.9496

#Monthly charge as a function of Payment Method per Churn Status
ggplot(training,aes(x=PaymentMethod ,
                    y=MonthlyCharges,col=Churn))+
  geom_point()+
  labs(title = "Monthly Charge by Payment Method as a func of Churn")

#it seem that when we examine Monthly charges as a function of Payment method,
# most Customers that payed by Electronic check based o their monthly charges are more likely to churn
#from the plot it suggests that Customers who pay by Mailed check are Charged averagely more than the other methods of payment
#but that is in terms of the base price not the max price possible to charge
#in terms of Max Charge ,Bank transfers takes the spot

# tenure as a function of payment method per Churn status
ggplot(training,aes(x=PaymentMethod ,
                    y=tenure,col=Churn))+
  geom_point()+
  labs(title = "Tunure by Payment Method as a func of Churn")

#its clear that Customers who paid by Mailed Check have the lowest churn rate
# it seems like those that pay by Electronic check churned the most followed by 
#Credit card and Bank transfer

# tenure as a function of monthly charge per Churn status
ggplot(training,aes(x=tenure ,
                    y=MonthlyCharges,col=Contract ))+
  geom_point()+
  labs(title = "Tunure by Monthly Charge ")

#it seems like there are more Month-to-Month Customers and they are the ones more likely to Churn
#suggestion, look at ways to convert Month-to-Month customers into One or Two Year contracts,
#this will reduce the number of Churns and increase Revenues
#maybe offer loyalty points and discounts to Monthly Customers with options to increase their Contract tenure


#Model 3

#Decision Trees - RPART
rp <- rpart(Churn ~ ., data = training, method = "class")
predictions_rp <- predict(rp, testing, type = "class")
rpart.plot(rp, type=1, 
           extra=100,
           branch.lty=3,
           box.palette="RdYlGn",
           tweak = 1.2,
           fallen.leaves = FALSE)

cm_rpart <- confusionMatrix(predictions_rp, testing$Churn)
cm_rpart

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  938 211
##        Yes  94 162
##                                           
##                Accuracy : 0.7829          
##                  95% CI : (0.7604, 0.8042)
##     No Information Rate : 0.7345          
##     P-Value [Acc > NIR] : 1.591e-05       
##                                           
##                   Kappa : 0.3814          
##                                           
##  Mcnemar's Test P-Value : 3.092e-11       
##                                           
##             Sensitivity : 0.9089          
##             Specificity : 0.4343          
##          Pos Pred Value : 0.8164          
##          Neg Pred Value : 0.6328          
##              Prevalence : 0.7345          
##          Detection Rate : 0.6676          
##    Detection Prevalence : 0.8178          
##       Balanced Accuracy : 0.6716          
##                                           
##        'Positive' Class : No              
##

# the accuracy of model 3 is 0.7829 and the sensitivity is 0.9089


#Model 4

#Principal Component Analysis + Rpart modeling
#As majority of our variables are Factor variables, we create dummy variables for our data to perform Principal Component Analysis.

dummy.data <- dummy.data.frame(churn_final)

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored

intrain.pca <- createDataPartition(y = dummy.data$ChurnYes, p = 0.8, list = FALSE)
pca.train <- dummy.data[intrain.pca, ]
pca.test <- dummy.data[-intrain.pca, ]
pca.train1 <- pca.train[,-c(47,48)] #Removing the dependent variables
pca.test1 <- pca.test[,-c(47,48)] #Removing the dependent variables
pr_train <- prcomp(pca.train1, center = T, scale. = T)
summary(pr_train)

## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     3.3871 2.4623 2.0783 1.60761 1.50172 1.41525 1.36926
## Proportion of Variance 0.2494 0.1318 0.0939 0.05618 0.04902 0.04354 0.04076
## Cumulative Proportion  0.2494 0.3812 0.4751 0.53129 0.58031 0.62385 0.66461
##                            PC8     PC9    PC10    PC11   PC12    PC13    PC14
## Standard deviation     1.25810 1.22258 1.16020 1.14358 1.1268 1.09126 1.06319
## Proportion of Variance 0.03441 0.03249 0.02926 0.02843 0.0276 0.02589 0.02457
## Cumulative Proportion  0.69902 0.73151 0.76078 0.78921 0.8168 0.84269 0.86727
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     1.05986 1.00686 0.96760 0.95985 0.93065 0.90264 0.60848
## Proportion of Variance 0.02442 0.02204 0.02035 0.02003 0.01883 0.01771 0.00805
## Cumulative Proportion  0.89169 0.91372 0.93408 0.95411 0.97293 0.99065 0.99870
##                           PC22    PC23      PC24      PC25      PC26     PC27
## Standard deviation     0.24303 0.03080 2.479e-14 2.401e-14 1.894e-14 8.08e-15
## Proportion of Variance 0.00128 0.00002 0.000e+00 0.000e+00 0.000e+00 0.00e+00
## Cumulative Proportion  0.99998 1.00000 1.000e+00 1.000e+00 1.000e+00 1.00e+00
##                             PC28      PC29      PC30      PC31      PC32
## Standard deviation     6.515e-15 5.491e-15 4.756e-15 3.763e-15 3.514e-15
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
##                             PC33    PC34      PC35      PC36      PC37
## Standard deviation     2.865e-15 2.8e-15 2.344e-15 1.642e-15 1.373e-15
## Proportion of Variance 0.000e+00 0.0e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.0e+00 1.000e+00 1.000e+00 1.000e+00
##                             PC38      PC39      PC40      PC41      PC42
## Standard deviation     1.179e-15 5.429e-16 2.754e-16 2.754e-16 2.754e-16
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00 1.000e+00
##                             PC43      PC44      PC45      PC46
## Standard deviation     2.754e-16 2.754e-16 2.754e-16 1.628e-16
## Proportion of Variance 0.000e+00 0.000e+00 0.000e+00 0.000e+00
## Cumulative Proportion  1.000e+00 1.000e+00 1.000e+00 1.000e+00

plot(pr_train, type = "l")
box()

biplot(pr_train, scale = T)

#principal component as a function of variance
pca_var <- pr_train$sdev ^ 2
pca_var_prop <- pca_var / sum(pca_var)
plot(cumsum(pca_var_prop),
     type = 'b',
     ylab = "Cummulative Variance",
     xlab = "Principal Components",
     main = "Principal Components VS Variance Explained")

#The summary of the Principal components tells us that the first 15 components explain 89% variance and hence we select the first 15 PCs and move forward to perform Decision trees prediction.
#We feed these 15 PCs to the RPART model to predict whether the customer be churned or not.
#The biplot suggests the directions of all the variables. It states that Tenure, Contract, Partner etc convey the same data value and explain the same variances as these vectors are in the same direction.
#The Cumulative variance plot states that first 15 PCs explain almost 90% variance and PC's after 23 are redundant and should be dropped as 23 PC's explain 100% variance.

# model with only the fist 15 variables
pca.final.train <- data.frame(ChurnYes = pca.train$ChurnYes, pr_train$x)
pca.final.train1 <- pca.final.train[,1:15] #Selecting the first 15 Principal components
pca.final.test <- predict(pr_train, pca.test1)
pca.final.test <- as.data.frame(pca.final.test)
pca.final.test1 <- pca.final.test[,1:15]
rpart_pca <- rpart(ChurnYes ~ ., data = pca.final.train1)
predict_rpart.pca <- predict(rpart_pca, newdata = pca.final.test1)
pred_rpart_pca <- prediction(predict_rpart.pca, pca.test$ChurnNo)
perf_rpart_pca <- performance(pred_rpart_pca, "tpr", "fpr")
plot(perf_rpart_pca, colorize = TRUE)

myroc <- roc(pca.test$ChurnNo, predict_rpart.pca)

## Setting levels: control = 0, case = 1

## Setting direction: controls > cases

c_pca <- coords(myroc, "best", ret = "threshold",transpose = T)
churned <- ifelse(predict_rpart.pca > c_pca[[1]], 1, 0)
cm_pca <- confusionMatrix(table(churned, pca.test$ChurnYes))
cm_pca

## Confusion Matrix and Statistics
## 
##        
## churned   0   1
##       0 704  65
##       1 336 301
##                                           
##                Accuracy : 0.7148          
##                  95% CI : (0.6904, 0.7383)
##     No Information Rate : 0.7397          
##     P-Value [Acc > NIR] : 0.9839          
##                                           
##                   Kappa : 0.4027          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.6769          
##             Specificity : 0.8224          
##          Pos Pred Value : 0.9155          
##          Neg Pred Value : 0.4725          
##              Prevalence : 0.7397          
##          Detection Rate : 0.5007          
##    Detection Prevalence : 0.5469          
##       Balanced Accuracy : 0.7497          
##                                           
##        'Positive' Class : 0               
##

#the accuracy of model 4 is 0.7148 and the sensitivity is 0.6769



#Model  Specificty  Accuracy

#Logistic   0.4725  0.7068
#Random Forest  0.3244  0.7836
#RPART  0.4343  0.7829
#PCA + RPART    0.8224  0.7148

##Conclusion

#According to our problem statement,
#PCA + Decisison trees yeilds SPECIFICITY of 82%,
#better than any other model and hence we select it as a final model.

#If Accuracy is taken into account, then Random Forest is our model of choice

#Model 5
#with 5 variables to predict Churn
#tenure,Total Charge,Monthly Charge,Internet Service Fiber optic,Contract Two year

#Stepwise Logistic Regression Model
md2_glm <- glm(as.factor(Churn) ~ tenure+TotalCharges+
                 MonthlyCharges+InternetService+Contract, data = training, family = binomial(logit))
md2_step <- step(md2_glm, direction = "both", steps = 10000, trace = FALSE)
predictions_step2 <- predict(md2_step, testing, type = "response")

#To decide the cut off point
pred_step2 <- prediction(predictions_step2, testing$Churn)
plot(performance(pred_step2, "tpr", "fpr"), colorize = TRUE)

auc_step2 <- performance(pred_step2, "auc")
auc_step2

## A performance instance
##   'Area under the ROC curve'

roc_step2<- roc(response = testing$Churn, predictor = predictions_step)

## Setting levels: control = No, case = Yes

## Setting direction: controls < cases

z <- coords(roc_step2, "best", "threshold",transpose = T)
z

##   threshold specificity sensitivity 
##   0.1962807   0.6375969   0.8981233

#stepwise logistic reg with the 1st 5 variables
# threshold specificity sensitivity
# 0.1962807   0.6375969   0.8981233 
#the same as model 1

#We find that the point  is the best cut-off point to distinguish the class probabilities into 0's and 1's.
#We now form the Predicted scores using ifelse loops.


#Forming Confusion-matrix from the decided cut-off point
Churn_step <- ifelse(predictions_step2 >= z[[1]], 
                     "Yes", "No")
cm_5 <- confusionMatrix(table(testing$Churn,Churn_step))
cm_5

## Confusion Matrix and Statistics
## 
##      Churn_step
##        No Yes
##   No  641 391
##   Yes  48 325
##                                           
##                Accuracy : 0.6875          
##                  95% CI : (0.6626, 0.7117)
##     No Information Rate : 0.5096          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3807          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9303          
##             Specificity : 0.4539          
##          Pos Pred Value : 0.6211          
##          Neg Pred Value : 0.8713          
##              Prevalence : 0.4904          
##          Detection Rate : 0.4562          
##    Detection Prevalence : 0.7345          
##       Balanced Accuracy : 0.6921          
##                                           
##        'Positive' Class : No              
##

# the model Accuracy is  0.7148
# Sensitivity  0.6769
# and Specificity 0.8224
# Model 5 has the same Specificity as PCA+RPART Model

Customer-churn-Telco.R

xholi

2021-01-13