library(ggplot2)
library(ROCR)
library(popbio)
churnData = read.csv("churn.csv")
churnData = churnData[,2:6]
head(churnData)
## Churned Age Married Cust_years Churned_contacts
## 1 0 61 1 3 1
## 2 0 50 1 3 2
## 3 0 47 1 2 0
## 4 0 50 1 3 3
## 5 0 29 1 1 3
## 6 0 43 1 4 3
sum(churnData$Churned)
## [1] 1743
qplot(Age, Churned, data=churnData) + geom_point(colour = "#3366FF", size = 3)
Computes and plots conditional densities describing how the conditional distribution of a categorical variable Chruned changes over a numerical variable Age.
cdplot(factor(Churned) ~ Age, data=churnData, main="Estimated categ prob", ylab='Churned')
qplot(Age, ..count.., data=churnData, geom="density", fill=factor(Churned), position="fill") +
ylab('Probability')+theme(legend.position='bottom')
Function glm is used to fit generalized linear models, specified by giving a symbolic description of the linear predictor and a description of the error distribution.
churnLogreg = glm(Churned ~ ., data=churnData, family=binomial(link="logit"))
summary(churnLogreg)
##
## Call:
## glm(formula = Churned ~ ., family = binomial(link = "logit"),
## data = churnData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4542 -0.5206 -0.1971 -0.0728 3.3786
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.415201 0.163734 20.858 <2e-16 ***
## Age -0.156643 0.004088 -38.320 <2e-16 ***
## Married 0.066432 0.068302 0.973 0.331
## Cust_years 0.017857 0.030497 0.586 0.558
## Churned_contacts 0.382324 0.027313 13.998 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8387.3 on 7999 degrees of freedom
## Residual deviance: 5357.9 on 7995 degrees of freedom
## AIC: 5367.9
##
## Number of Fisher Scoring iterations: 6
confint(churnLogreg)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 3.09678395 3.73872531
## Age -0.16478103 -0.14875424
## Married -0.06740737 0.20037431
## Cust_years -0.04202648 0.07755425
## Churned_contacts 0.32902320 0.43611325
set.seed(2015)
splitChurn = caret::createDataPartition(churnData[,1], p = 0.8, list=F, times=1)
head(splitChurn)
## Resample1
## [1,] 2
## [2,] 3
## [3,] 4
## [4,] 7
## [5,] 8
## [6,] 9
trainChurn = churnData[splitChurn,]
head(trainChurn)
## Churned Age Married Cust_years Churned_contacts
## 2 0 50 1 3 2
## 3 0 47 1 2 0
## 4 0 50 1 3 3
## 7 0 50 0 2 1
## 8 0 29 0 3 2
## 9 0 32 1 3 3
testChurn = churnData[!row.names(churnData) %in% row.names(trainChurn),]
head(testChurn)
## Churned Age Married Cust_years Churned_contacts
## 1 0 61 1 3 1
## 5 0 29 1 1 3
## 6 0 43 1 4 3
## 11 0 59 1 5 2
## 15 0 34 1 2 2
## 16 0 59 1 2 1
trainChurnLR = glm(Churned ~ ., data=trainChurn, family=binomial(link="logit"))
summary(trainChurnLR)
##
## Call:
## glm(formula = Churned ~ ., family = binomial(link = "logit"),
## data = trainChurn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4067 -0.5190 -0.1940 -0.0736 3.3821
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.442674 0.180464 19.077 <2e-16 ***
## Age -0.156591 0.004551 -34.405 <2e-16 ***
## Married 0.136188 0.076362 1.783 0.0745 .
## Cust_years 0.010409 0.033916 0.307 0.7589
## Churned_contacts 0.362264 0.030413 11.912 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6726.7 on 6399 degrees of freedom
## Residual deviance: 4290.3 on 6395 degrees of freedom
## AIC: 4300.3
##
## Number of Fisher Scoring iterations: 6
testChurn$Predicted = round(predict(trainChurnLR, testChurn[,2:5], type="response"), 2)
head(testChurn)
## Churned Age Married Cust_years Churned_contacts Predicted
## 1 0 61 1 3 1 0.00
## 5 0 29 1 1 3 0.53
## 6 0 43 1 4 3 0.12
## 11 0 59 1 5 2 0.01
## 15 0 34 1 2 2 0.27
## 16 0 59 1 2 1 0.01
popbio packageplot(trainChurn$Age, trainChurn$Churned, xlab="Age", ylab="P(Churned)")
trainLR = glm(Churned ~ Age, data=trainChurn, family=binomial(link="logit"))
curve(predict(trainLR,data.frame(Age=x),type="resp"),add=TRUE)
points(trainChurn$Age,fitted(trainLR),pch=20)
popbio::logi.hist.plot(trainChurn$Age, trainChurn$Churned, boxp=FALSE,type="hist",col="gray")
visreg packagelogReg = glm(Churned ~ ., data=trainChurn, family=binomial(link="logit"))
visreg::visreg(logReg, "Age", scale="response", partial=FALSE, xlab="Age", ylab="P(Churned)", rug=2)
Every classifier evaluation using ROCR starts with creating a prediction object. This function is used to transform the input data into a standardized format.
pred = predict(trainChurnLR, testChurn[,2:5], type="response")
pObject = ROCR::prediction(pred, testChurn$Churned )
All kinds of predictor evaluations are performed using performance function
rocObj = ROCR::performance(pObject, measure="tpr", x.measure="fpr")
aucObj = ROCR::performance(pObject, measure="auc")
plot(rocObj, main = paste("Area under the curve:", round(aucObj@y.values[[1]] ,4)))
rTest = data.frame(
'Age' = round(rnorm(nrow(testChurn), mean(testChurn$Age), sd(testChurn$Age))),
'Married' = sample(c(1,0), nrow(testChurn), replace=T),
'Cust_years' = round(rnorm(nrow(testChurn), mean(testChurn$Cust_years), sd(testChurn$Cust_years))),
'Churned_contacts' = round(rnorm(nrow(testChurn), mean(testChurn$Churned_contacts), sd(testChurn$Churned_contacts))),
'Churned' = sample(c(1,0), nrow(testChurn), replace=T)
)
rand_pred = predict(trainChurnLR, rTest[,1:4], type="response")
randObject = ROCR::prediction(rand_pred, rTest$Churned )
rocRandObj = ROCR::performance(randObject, measure="tpr", x.measure="fpr")
aucRandObj = ROCR::performance(randObject, measure="auc")
plot(rocRandObj, main = paste("Area under the curve:", round(aucRandObj@y.values[[1]] ,4)))
Phat = predict(trainChurnLR,testChurn,type="response")
head(Phat)
## 1 5 6 11 15 16
## 0.003759550 0.533681236 0.116488135 0.007514151 0.268964943 0.005082195
prop.table(xtabs(~ Churned, data=testChurn))
## Churned
## 0 1
## 0.78625 0.21375
thresh = 0.5
facHat = cut(Phat, breaks=c(-Inf, thresh, Inf), labels=c(0, 1))
cTab = xtabs(~ Churned + facHat, data=testChurn)
addmargins(cTab)
## facHat
## Churned 0 1 Sum
## 0 1157 101 1258
## 1 148 194 342
## Sum 1305 295 1600
CCR = sum(diag(cTab)) / sum(cTab)
CCR
## [1] 0.844375
If you want to modify a model you may consider using the special function update
lrUpdate = update(trainChurnLR, ~ . -Married-Churned_contacts-Cust_years)
summary(lrUpdate)
##
## Call:
## glm(formula = Churned ~ Age, family = binomial(link = "logit"),
## data = trainChurn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7484 -0.5360 -0.2084 -0.0917 3.3087
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.985340 0.139029 28.66 <2e-16 ***
## Age -0.150076 0.004357 -34.45 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6726.7 on 6399 degrees of freedom
## Residual deviance: 4441.4 on 6398 degrees of freedom
## AIC: 4445.4
##
## Number of Fisher Scoring iterations: 6
pred = predict(lrUpdate, testChurn[,2:5], type="response")
pObject = ROCR::prediction(pred, testChurn$Churned )
rocObj = ROCR::performance(pObject, measure="tpr", x.measure="fpr")
aucObj = ROCR::performance(pObject, measure="auc")
plot(rocObj, main = paste("Area under the curve:", round(aucObj@y.values[[1]] ,4)))