Group 1
# Importing Data
library(ISLR)
# reading inbuilt data as data frame
churn <- read.csv(paste("ChurnData.csv", sep =))
nrow(churn)
[1] 6347
ncol(churn)
[1] 13
summary(churn)
ID Age Churn CHIDec CHIChange
Min. : 1 Min. : 1.0 No :6024 Min. : 1.0 Min. : 1.00
1st Qu.:1588 1st Qu.: 8.0 Yes: 323 1st Qu.: 16.0 1st Qu.: 2.00
Median :3174 Median :18.0 Median : 73.0 Median : 49.00
Mean :3174 Mean :24.5 Mean :103.1 Mean : 80.39
3rd Qu.:4760 3rd Qu.:38.0 3rd Qu.:204.0 3rd Qu.:160.50
Max. :6347 Max. :61.0 Max. :263.0 Max. :242.00
SupportCasesDec SupportCasesChange SupportPriorityDec
Min. : 1.000 Min. : 1.000 Min. : 1.000
1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 1.000
Median : 1.000 Median : 1.000 Median : 1.000
Mean : 3.032 Mean : 6.419 Mean : 3.604
3rd Qu.: 2.000 3rd Qu.:11.000 3rd Qu.: 8.000
Max. :21.000 Max. :37.000 Max. :17.000
SupportPriorityChange LoginsChange BlogPostsChange ViewsChange
Min. : 1.000 Min. : 1.0 Min. : 1.000 Min. : 1.0
1st Qu.: 1.000 1st Qu.: 12.0 1st Qu.: 1.000 1st Qu.: 1.0
Median : 1.000 Median : 98.0 Median : 1.000 Median : 372.0
Mean : 8.794 Mean :115.9 Mean : 9.944 Mean : 471.7
3rd Qu.:17.000 3rd Qu.:210.0 3rd Qu.:15.000 3rd Qu.: 877.0
Max. :46.000 Max. :294.0 Max. :57.000 Max. :1360.0
DaysSinceLoginChange
Min. : 1.0
1st Qu.: 1.0
Median : 11.0
Mean : 52.8
3rd Qu.:121.0
Max. :143.0
table(churn$Churn)
No Yes
6024 323
count <-table(churn$Churn)
View(count)
str(churn)
'data.frame': 6347 obs. of 13 variables:
$ ID : int 1 1112 2223 3334 4445 5556 6015 6126 6237 1002 ...
$ Age : int 58 58 52 57 54 55 54 42 53 53 ...
$ Churn : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ CHIDec : int 1 223 1 149 202 45 92 21 240 240 ...
$ CHIChange : int 1 179 1 95 2 3 53 8 75 39 ...
$ SupportCasesDec : int 1 1 1 2 1 1 2 1 2 1 ...
$ SupportCasesChange : int 1 1 1 2 1 1 20 1 11 1 ...
$ SupportPriorityDec : int 1 1 1 11 1 1 11 1 11 1 ...
$ SupportPriorityChange: int 1 1 1 1 1 1 42 1 1 1 ...
$ LoginsChange : int 1 1 1 152 1 228 120 1 84 72 ...
$ BlogPostsChange : int 1 1 1 23 1 1 2 1 25 1 ...
$ ViewsChange : int 1 145 1 887 1331 319 1334 1044 1 989 ...
$ DaysSinceLoginChange : int 134 134 134 1 134 1 1 139 141 115 ...
count
No Yes
6024 323
count_percent <-count*100/6347
count_percent
No Yes
94.910982 5.089018
pie(count_percent, col = c("red","blue"))
barplot(count_percent, xlab = "Churn", ylab = "Percentage", las = 1, col = c("red","blue") )
boxplot(Age ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "Age", ylab = "churn")
boxplot(CHIDec ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "CHI", ylab = "churn")
boxplot(CHIChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "CHI Change", ylab = "churn")
boxplot(SupportCasesDec ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "NO. of Suport Cases", ylab = "churn")
boxplot(SupportCasesChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "CHnage in Support Cases", ylab = "churn")
boxplot(SupportPriorityDec ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "Support priority type", ylab = "churn")
boxplot(SupportPriorityChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "Support priority Change", ylab = "churn")
boxplot(LoginsChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "No. of Logins change", ylab = "churn")
boxplot(BlogPostsChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "Blog Posts Change", ylab = "churn")
boxplot(ViewsChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "No. of views change", ylab = "churn")
boxplot(DaysSinceLoginChange ~ Churn, data = churn, horizontal = TRUE, col = "blue", xlab = "No. of days since login change", ylab = "churn")
library(caTools)
# use set.seed to use the same random number sequence
set.seed(123)
# craeting 70% data for training
split <- sample.split(churn$Churn, SplitRatio = 0.70)
trainData <- subset(churn, split == TRUE)
# dimensions of training data
dim(trainData)
[1] 4443 13
# creating 30% data for testing
testData <- subset(churn, split == FALSE)
# dimensions of testing data
dim(testData)
[1] 1904 13
logitClassifier1 <- glm(Churn ~ DaysSinceLoginChange,
data = trainData,
family = binomial())
summary(logitClassifier1)
Call:
glm(formula = Churn ~ DaysSinceLoginChange, family = binomial(),
data = trainData)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.4393 -0.3991 -0.2508 -0.2498 2.6392
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.459623 0.113939 -30.364 < 2e-16 ***
DaysSinceLoginChange 0.008183 0.001163 7.037 1.96e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1786.6 on 4442 degrees of freedom
Residual deviance: 1735.7 on 4441 degrees of freedom
AIC: 1739.7
Number of Fisher Scoring iterations: 6
logitClassifier2 <- glm(Churn ~ Age,
data = trainData,
family = binomial())
summary(logitClassifier2)
Call:
glm(formula = Churn ~ Age, family = binomial(), data = trainData)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.3506 -0.3405 -0.3259 -0.2972 2.5495
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.750736 0.105856 -25.986 <2e-16 ***
Age -0.007537 0.003664 -2.057 0.0397 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1786.6 on 4442 degrees of freedom
Residual deviance: 1782.2 on 4441 degrees of freedom
AIC: 1786.2
Number of Fisher Scoring iterations: 5
logitClassifier3 <- glm(Churn ~ Age + CHIDec + CHIChange + SupportCasesDec + SupportCasesChange + SupportPriorityDec + SupportPriorityChange + LoginsChange + BlogPostsChange + ViewsChange + DaysSinceLoginChange,
data = trainData,
family = binomial())
summary(logitClassifier3)
Call:
glm(formula = Churn ~ Age + CHIDec + CHIChange + SupportCasesDec +
SupportCasesChange + SupportPriorityDec + SupportPriorityChange +
LoginsChange + BlogPostsChange + ViewsChange + DaysSinceLoginChange,
family = binomial(), data = trainData)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.5818 -0.3653 -0.2924 -0.2260 2.9497
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.0004030 0.1661893 -18.054 < 2e-16 ***
Age -0.0078014 0.0037800 -2.064 0.0390 *
CHIDec 0.0002225 0.0007690 0.289 0.7724
CHIChange -0.0024404 0.0011366 -2.147 0.0318 *
SupportCasesDec -0.0164333 0.0314598 -0.522 0.6014
SupportCasesChange -0.0067348 0.0191888 -0.351 0.7256
SupportPriorityDec -0.0133276 0.0289308 -0.461 0.6450
SupportPriorityChange 0.0069379 0.0093564 0.742 0.4584
LoginsChange -0.0008033 0.0008359 -0.961 0.3365
BlogPostsChange -0.0050106 0.0062331 -0.804 0.4215
ViewsChange 0.0001539 0.0001613 0.954 0.3400
DaysSinceLoginChange 0.0080484 0.0012157 6.620 3.58e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1786.6 on 4442 degrees of freedom
Residual deviance: 1709.8 on 4431 degrees of freedom
AIC: 1733.8
Number of Fisher Scoring iterations: 6
predProbClass1 <- predict(logitClassifier1, type = 'response', newdata = testData[-3])
#table(predProbClass1)
yPred1 <- ifelse(predProbClass1 > 0.5, "Yes", "No")
table(yPred1)
yPred1
No
1904
predProbClass2 <- predict(logitClassifier2, type = 'response', newdata = testData[-3])
yPred2 <- ifelse(predProbClass2 > 0.5, "Yes", "No")
table(yPred2)
yPred2
No
1904
predProbClass3 <- predict(logitClassifier3, type = 'response', newdata = testData[-3])
yPred3 <- ifelse(predProbClass3 > 0.5, "Yes", "No")
table(yPred3)
yPred3
No
1904
confMatrix1 <- table(yActual = testData[, 3], yPred1)
confMatrix1
yPred1
yActual No
No 1807
Yes 97
confMatrix2 <- table(yActual = testData[, 3], yPred2)
confMatrix2
yPred2
yActual No
No 1807
Yes 97
confMatrix3 <- table(yActual = testData[, 3], yPred3)
confMatrix3
yPred3
yActual No
No 1807
Yes 97
#Model 1
library(MLmetrics)
Accuracy(y_pred = yPred1, y_true = testData$Churn)
[1] 0.9490546
#Model 2
Accuracy(y_pred = yPred2, y_true = testData$Churn)
[1] 0.9490546
#Model 3
Accuracy(y_pred = yPred3, y_true = testData$Churn)
[1] 0.9490546
library(ROCR)
#Every classifier evaluation using ROCR starts with creating a prediction object. This function is used to transform the input data into a standardized format.
PredictObject1 <- prediction(predProbClass3, testData$Churn)
# All kinds of predictor evaluations are performed using the performance function
PerformObject1 <- performance(PredictObject1, "tpr","fpr")
# Plot the ROC Curve for Credit Card Default
plot(PerformObject1, main = "ROC Curve for Churn", col = "black", lwd = 2)
abline(a = 0,b = 1, lwd = 2, lty = 3, col = "black")
library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(churn$Churn, p = 0.70, list = FALSE)
# 70% training data
trainChurn <- churn[trainIndex, ]
# 30% testing data
testChurn <- churn[-trainIndex, ]
table(trainChurn$Churn)
No Yes
4217 227
table(testChurn$Churn)
No Yes
1807 96
dTreeInfoGain <- train(Churn ~ .,
data = trainChurn,
method = "rpart",
parms = list(split = "information"),
trControl = trainControl(method = "cv"))
dTreeInfoGain
CART
4444 samples
12 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 3999, 3999, 4000, 4000, 4000, 3999, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.002202643 0.9446436 0.11174868
0.002936858 0.9450931 0.11335721
0.004405286 0.9475716 0.07675207
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.004405286.
# viasulaziation
library(rpart.plot)
prp(dTreeInfoGain$finalModel, box.palette = "Reds", tweak = 1.2, varlen = 20)
# prediction of churn = {no, yes} on test data (information gain)
predClassInfoGain <- predict(dTreeInfoGain, testChurn, type = 'raw')
table(predClassInfoGain)
predClassInfoGain
No Yes
1891 12
# prediction on test data
predClassTestInfoGain <- predict(dTreeInfoGain,
testChurn[, 1:13],
type = 'raw')
# confusion matrix
confusionMatrix(predClassTestInfoGain, testChurn$Churn,
positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1801 90
Yes 6 6
Accuracy : 0.9496
95% CI : (0.9387, 0.9589)
No Information Rate : 0.9496
P-Value [Acc > NIR] : 0.5271
Kappa : 0.101
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.062500
Specificity : 0.996680
Pos Pred Value : 0.500000
Neg Pred Value : 0.952406
Prevalence : 0.050447
Detection Rate : 0.003153
Detection Prevalence : 0.006306
Balanced Accuracy : 0.529590
'Positive' Class : Yes
library(ROCR)
predClassTestInfoGain2 <- predict(dTreeInfoGain,
testChurn,
type = 'prob')
PredictObject2 <- prediction(predClassTestInfoGain2[2], testChurn$Churn)
PerformObject2 <- performance(PredictObject2, "tpr","fpr")
plot(PerformObject2, main = "ROC Curve for Churn", col = "black", lwd = 2)
abline(a = 0,b = 1, lwd = 2, lty = 3, col = "black")
aucInfoGain <- performance(PredictObject2, measure = "auc")
aucInfoGain <- aucInfoGain@y.values[[1]]
aucInfoGain
[1] 0.6458016
# predicted probabilities (information gain)
predTestProbInfoGain <- predict(dTreeInfoGain, testChurn, type = "prob")
# plot of probabilities
plot(predTestProbInfoGain[,2],
main = "Scatterplot of Probabilities of churn (test data)",
xlab = "Customer ID", ylab = "Predicted Probability of Churn")
set.seed(4321)
dTreeGiniIndex <- train(Churn ~.,
data = trainChurn,
method = "rpart",
parms = list(split = "gini"))
dTreeGiniIndex
CART
4444 samples
12 predictor
2 classes: 'No', 'Yes'
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 4444, 4444, 4444, 4444, 4444, 4444, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.002202643 0.9343059 0.08866110
0.002936858 0.9366505 0.08679528
0.004405286 0.9402744 0.08634553
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.004405286.
library(rpart.plot)
prp(dTreeGiniIndex$finalModel, box.palette = "Reds", tweak = 1.2, varlen = 20)
# prediction of Churn = {no, yes} on test data (gini Index)
predClassGiniIndex <- predict(dTreeGiniIndex, testChurn, type = 'raw')
table(predClassGiniIndex)
predClassGiniIndex
No Yes
1903 0
#confusion matrix (gini index)
confusionMatrix(predClassGiniIndex, testChurn$Churn, positive = "Yes")
Confusion Matrix and Statistics
Reference
Prediction No Yes
No 1807 96
Yes 0 0
Accuracy : 0.9496
95% CI : (0.9387, 0.9589)
No Information Rate : 0.9496
P-Value [Acc > NIR] : 0.5271
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.00000
Specificity : 1.00000
Pos Pred Value : NaN
Neg Pred Value : 0.94955
Prevalence : 0.05045
Detection Rate : 0.00000
Detection Prevalence : 0.00000
Balanced Accuracy : 0.50000
'Positive' Class : Yes
library(ROCR)
# prediction
predTestProbGiniIndex <- predict(dTreeGiniIndex, testChurn, type = 'prob')
PredictObjectGiniIndex <- prediction(predTestProbGiniIndex[2], testChurn$Churn)
# performance
PerformObjectGiniIndex <- performance(PredictObjectGiniIndex, "tpr","fpr")
# plot of the ROC curve for Customer Churn
plot(PerformObjectGiniIndex,
main = "ROC Curve for Customer Churn (Gini Index)",
col = "red",
lwd = 2)
abline(a = 0,b = 1,lwd = 2,lty = 3,col = "black")
aucGiniIndex <- performance(PredictObjectGiniIndex, measure = "auc")
aucGiniIndex <- aucGiniIndex@y.values[[1]]
aucGiniIndex
[1] 0.5
predTestProbGiniIndex <- predict(dTreeGiniIndex, testChurn, type = "prob")
# plot of probabilities
plot(predTestProbGiniIndex[,2],
main = "Scatterplot of Probabilities of Customer Churn (test data)",
xlab = "Customer ID", ylab = "Predicted Probability of Customer Churn")