Nowadays churn prediction has become popular and important for many companies: analyzing which customers are likely to stop using their services lets a company adjust its business and marketing accordingly.
In this project, the dataset Churn_Modelling.csv is used to analyze churn prediction.
The following models will be introduced: 1. Logistic Regression, 2. Decision Tree, 3. Random Forest.
This report is also available online at http://rpubs.com/jz8/Churnprediction.
library(ggplot2)       # plotting
library(gridExtra)     # grid.arrange() for plot grids
library(ggthemes)
library(caret)         # createDataPartition()
library(plyr)          # mapvalues()
library(corrplot)      # correlation plot
library(MASS)
library(randomForest)  # random forests
library(party)
library(dplyr)         # data manipulation
library(rpart)         # decision trees
library(rpart.plot)    # prp()
churn <- read.csv('Churn_Modelling.csv')
dim(churn)
[1] 10000 14
names(churn)
[1] "RowNumber" "CustomerId" "Surname" "CreditScore" "Geography" "Gender" "Age"
[8] "Tenure" "Balance" "NumOfProducts" "HasCrCard" "IsActiveMember" "EstimatedSalary" "Exited"
str(churn)
'data.frame': 10000 obs. of 14 variables:
$ RowNumber : int 1 2 3 4 5 6 7 8 9 10 ...
$ CustomerId : int 15634602 15647311 15619304 15701354 15737888 15574012 15592531 15656148 15792365 15592389 ...
$ Surname : Factor w/ 2932 levels "Abazu","Abbie",..: 1116 1178 2041 290 1823 538 178 2001 1147 1082 ...
$ CreditScore : int 619 608 502 699 850 645 822 376 501 684 ...
$ Geography : Factor w/ 3 levels "France","Germany",..: 1 3 1 1 3 3 1 2 1 1 ...
$ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
$ Age : int 42 41 42 39 43 44 50 29 44 27 ...
$ Tenure : int 2 1 8 1 2 8 7 4 4 2 ...
$ Balance : num 0 83808 159661 0 125511 ...
$ NumOfProducts : int 1 1 3 2 1 2 2 4 2 1 ...
$ HasCrCard : int 1 0 1 0 1 1 1 1 0 1 ...
$ IsActiveMember : int 1 1 0 0 1 0 1 0 1 1 ...
$ EstimatedSalary: num 101349 112543 113932 93827 79084 ...
$ Exited : int 1 0 1 0 0 1 0 1 0 0 ...
head(churn)
sapply(churn, function(x) sum(is.na(x)))
RowNumber CustomerId Surname CreditScore Geography Gender Age Tenure
0 0 0 0 0 0 0 0
Balance NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
0 0 0 0 0 0
# Drop the row index and keep one record per customer
churnData <- churn %>% select(-c(RowNumber)) %>% subset(!duplicated(churn$CustomerId))
dim(churnData)
[1] 10000 13
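The subset() call above keeps only the first row per CustomerId; whether any duplicates exist at all can be checked directly. A quick sketch:
# Count duplicated customer IDs; the result is 0 here, which is why
# dim(churnData) still reports 10000 rows after the subset()
sum(duplicated(churn$CustomerId))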
summary(churnData)
CustomerId Surname CreditScore Geography Gender Age Tenure Balance
Min. :15565701 Smith : 32 Min. :350.0 France :5014 Female:4543 Min. :18.00 Min. : 0.000 Min. : 0
1st Qu.:15628528 Martin : 29 1st Qu.:584.0 Germany:2509 Male :5457 1st Qu.:32.00 1st Qu.: 3.000 1st Qu.: 0
Median :15690738 Scott : 29 Median :652.0 Spain :2477 Median :37.00 Median : 5.000 Median : 97199
Mean :15690941 Walker : 28 Mean :650.5 Mean :38.92 Mean : 5.013 Mean : 76486
3rd Qu.:15753234 Brown : 26 3rd Qu.:718.0 3rd Qu.:44.00 3rd Qu.: 7.000 3rd Qu.:127644
Max. :15815690 Genovese: 25 Max. :850.0 Max. :92.00 Max. :10.000 Max. :250898
(Other) :9831
NumOfProducts HasCrCard IsActiveMember EstimatedSalary Exited
Min. :1.00 Min. :0.0000 Min. :0.0000 Min. : 11.58 Min. :0.0000
1st Qu.:1.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 51002.11 1st Qu.:0.0000
Median :1.00 Median :1.0000 Median :1.0000 Median :100193.91 Median :0.0000
Mean :1.53 Mean :0.7055 Mean :0.5151 Mean :100090.24 Mean :0.2037
3rd Qu.:2.00 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:149388.25 3rd Qu.:0.0000
Max. :4.00 Max. :1.0000 Max. :1.0000 Max. :199992.48 Max. :1.0000
churnData$HasCrCard <- as.factor(mapvalues(churnData$HasCrCard, from = c("0", "1"), to = c("No", "Yes")))
churnData$IsActiveMember <- as.factor(mapvalues(churnData$IsActiveMember, from = c("0", "1"), to = c("No", "Yes")))
Remove variables that are not useful for prediction:
churnData$CustomerId <- NULL
churnData$Surname <- NULL
numeric_var <- sapply(churnData, is.numeric)
corr_matrix <- cor(churnData[, numeric_var])
corrplot(corr_matrix, main = "\n\nCorrelation Plot for Numerical Variables", method = "number")
These numeric variables are only weakly correlated with one another, so all of them are kept.
str(churnData)
'data.frame': 10000 obs. of 11 variables:
$ CreditScore : int 619 608 502 699 850 645 822 376 501 684 ...
$ Geography : Factor w/ 3 levels "France","Germany",..: 1 3 1 1 3 3 1 2 1 1 ...
$ Gender : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 2 2 1 2 2 ...
$ Age : int 42 41 42 39 43 44 50 29 44 27 ...
$ Tenure : int 2 1 8 1 2 8 7 4 4 2 ...
$ Balance : num 0 83808 159661 0 125511 ...
$ NumOfProducts : int 1 1 3 2 1 2 2 4 2 1 ...
$ HasCrCard : Factor w/ 2 levels "No","Yes": 2 1 2 1 2 2 2 2 1 2 ...
$ IsActiveMember : Factor w/ 2 levels "No","Yes": 2 2 1 1 2 1 2 1 2 2 ...
$ EstimatedSalary: num 101349 112543 113932 93827 79084 ...
$ Exited : int 1 0 1 0 0 1 0 1 0 0 ...
p1 <- ggplot(churnData, aes(x=CreditScore )) + ggtitle("Credit Score ") + xlab("Credit Score ") +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) +
ylab("Percentage") + coord_flip() + theme_minimal()
p2 <- ggplot(churnData, aes(x=Geography)) + ggtitle("Geography") +
xlab("Geography") + geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) + ylab("Percentage") + coord_flip() + theme_minimal()
p3 <- ggplot(churnData, aes(x=Gender)) + ggtitle("Gender") + xlab("Gender") +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) +
ylab("Percentage") + coord_flip() + theme_minimal()
p4 <- ggplot(churnData, aes(x=Age)) + ggtitle("Age") + xlab("Age") +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) +
ylab("Percentage") + coord_flip() + theme_minimal()
grid.arrange(p1, p2, p3, p4, ncol=2)
p5 <- ggplot(churnData, aes(x=Tenure)) + ggtitle("Tenure") +
xlab("Tenure") +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) +
ylab("Percentage") + coord_flip() + theme_minimal()
p6 <- ggplot(churnData, aes(x=NumOfProducts)) + ggtitle("Num Of Products") +
xlab("Num Of Products") +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) +
ylab("Percentage") + coord_flip() + theme_minimal()
p7 <- ggplot(churnData, aes(x=HasCrCard)) + ggtitle("Has Credit Card") +
xlab("Has Credit Card") +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) +
ylab("Percentage") + coord_flip() + theme_minimal()
p8 <- ggplot(churnData, aes(x=IsActiveMember)) + ggtitle("Is Active Member ") +
xlab("Is Active Member ") +
geom_bar(aes(y = 100*(..count..)/sum(..count..)), width = 0.5) +
ylab("Percentage") + coord_flip() + theme_minimal()
grid.arrange(p5, p6, p7, p8, ncol=2)
boxplot(churnData$EstimatedSalary, col = grey(0.9), main = "Estimated Salary",
xlab = "Estimated Salary", ylab = "Salary")
abline(h = median(churnData$EstimatedSalary, na.rm = TRUE), col = "navy", lty = 2)
text(1.35, median(churnData$EstimatedSalary, na.rm = TRUE) + 0.15, "Median", col = "navy")
Q1 <- quantile(churnData$EstimatedSalary, probs = 0.25, na.rm = TRUE)
abline(h = Q1, col = "darkred")
text(1.35, Q1 + 0.15, "Q1: first quartile", col = "darkred")
Q3 <- quantile(churnData$EstimatedSalary, probs = 0.75, na.rm = TRUE)
abline(h = Q3, col = "darkred")
text(1.35, Q3 + 0.15, "Q3: third quartile", col = "darkred")
arrows(x0 = 0.7, y0 = Q3, x1 = 0.7, y1 = Q1, length = 0.1, code = 3)
text(0.7, Q1 + (Q3 - Q1)/2 + 0.15, "h", pos = 2)
mtext("The interquartile range h is about 100000", side = 1)
Split data into training and testing sets:
set.seed(2019)
trainIndex <- createDataPartition(churnData$Exited, p = .75, list = FALSE, times = 1)
training <- churnData[trainIndex, ]
testing <- churnData[-trainIndex, ]
Verify the two sets:
dim(training)
[1] 7500 11
dim(testing)
[1] 2500 11
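Because createDataPartition samples within each outcome class, the churn rate should be close to the overall 20.37% in both sets; a quick check (a sketch, output not shown):
# Proportion of churners (Exited = 1) in each split; both should be
# near the overall rate of about 0.204
prop.table(table(training$Exited))
prop.table(table(testing$Exited))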
Fitting the Logistic Regression Model:
LogModel <- glm(Exited ~ ., family = binomial(link = "logit"), data = training)
print(summary(LogModel))
Call:
glm(formula = Exited ~ ., family = binomial(link = "logit"),
data = training)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.3148 -0.6540 -0.4537 -0.2661 3.0098
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.481e+00 2.858e-01 -12.179 < 2e-16 ***
CreditScore -6.625e-04 3.254e-04 -2.036 0.0418 *
GeographyGermany 7.196e-01 7.840e-02 9.178 < 2e-16 ***
GeographySpain 5.360e-03 8.144e-02 0.066 0.9475
GenderMale -5.477e-01 6.315e-02 -8.673 < 2e-16 ***
Age 7.358e-02 2.991e-03 24.605 < 2e-16 ***
Tenure -1.146e-02 1.079e-02 -1.063 0.2879
Balance 3.074e-06 5.943e-07 5.173 2.3e-07 ***
NumOfProducts -7.875e-02 5.494e-02 -1.433 0.1517
HasCrCardYes -6.955e-02 6.844e-02 -1.016 0.3095
IsActiveMemberYes -1.088e+00 6.704e-02 -16.235 < 2e-16 ***
EstimatedSalary 5.072e-07 5.493e-07 0.923 0.3558
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 7558.4 on 7499 degrees of freedom
Residual deviance: 6383.0 on 7488 degrees of freedom
AIC: 6407
Number of Fisher Scoring iterations: 5
Feature Analysis:
anova(LogModel, test = "Chisq")
Analysis of Deviance Table
Model: binomial, link: logit
Response: Exited
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 7499 7558.4
CreditScore 1 6.18 7498 7552.2 0.01289 *
Geography 2 203.77 7496 7348.5 < 2.2e-16 ***
Gender 1 83.52 7495 7264.9 < 2.2e-16 ***
Age 1 557.53 7494 6707.4 < 2.2e-16 ***
Tenure 1 0.19 7493 6707.2 0.66618
Balance 1 34.69 7492 6672.5 3.872e-09 ***
NumOfProducts 1 3.79 7491 6668.7 0.05161 .
HasCrCard 1 0.67 7490 6668.1 0.41444
IsActiveMember 1 284.22 7489 6383.9 < 2.2e-16 ***
EstimatedSalary 1 0.85 7488 6383.0 0.35575
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
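The deviance table suggests Tenure, HasCrCard and EstimatedSalary add little. Since MASS is already loaded, one way to drop such terms is backward selection by AIC; a sketch, not run in the original report:
# Backward stepwise selection: repeatedly drop the term whose removal
# lowers AIC, stopping when no further removal helps
LogModelStep <- stepAIC(LogModel, direction = "backward", trace = FALSE)
summary(LogModelStep)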
fitted_results <- predict(LogModel, newdata = testing, type = "response")
tableLR <- table(Actual = testing$Exited, prediction = fitted_results > 0.5)
tableLR
prediction
Actual FALSE TRUE
0 1835 147
1 476 42
print(paste('Logistic Regression Accuracy',sum(diag(tableLR))/sum(tableLR)))
[1] "Logistic Regression Accuracy 0.7508"
print(paste('Logistic Regression precision',p1<-tableLR[2,2]/(tableLR[2,2]+tableLR[1,2])))
[1] "Logistic Regression precision 0.222222222222222"
print(paste('Logistic Regression recall',r1<-tableLR[2,2]/(tableLR[2,2]+tableLR[2,1])))
[1] "Logistic Regression recall 0.0810810810810811"
print(paste('Logistic Regression F1Score',2*p1*r1/(r1+p1)))
[1] "Logistic Regression F1Score 0.118811881188119"
Odds ratios: in logistic regression, exp(coefficient) gives the multiplicative change in the odds of churn for a one-unit increase in a predictor. For example, the Age odds ratio of about 1.08 below means each additional year of age multiplies the odds of exiting by roughly 1.08, holding the other variables fixed.
exp(cbind(OR = coef(LogModel), confint(LogModel)))
Waiting for profiling to be done...
OR 2.5 % 97.5 %
(Intercept) 0.03076797 0.01753066 0.05376236
CreditScore 0.99933769 0.99870005 0.99997513
GeographyGermany 2.05353738 1.76136254 2.39523675
GeographySpain 1.00537473 0.85637590 1.17854048
GenderMale 0.57825994 0.51082063 0.65432453
Age 1.07635596 1.07010304 1.08272393
Tenure 0.98860261 0.96791050 1.00972076
Balance 1.00000307 1.00000191 1.00000424
NumOfProducts 0.92427503 0.82958615 1.02898358
HasCrCardYes 0.93281029 0.81609227 1.06727095
IsActiveMemberYes 0.33673319 0.29503972 0.38373937
EstimatedSalary 1.00000051 0.99999943 1.00000158
set.seed(91)
DTree <- rpart(Exited ~ ., training, method="class")
plotcp(DTree)
prp(DTree,extra=1)
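plotcp() plots the cross-validated error against the complexity parameter cp. If pruning were desired, a common recipe (a sketch, not part of the original analysis) picks the cp that minimizes the cross-validated error:
# Prune at the cp value with the lowest cross-validated error (xerror)
bestcp <- DTree$cptable[which.min(DTree$cptable[, "xerror"]), "CP"]
DTreePruned <- prune(DTree, cp = bestcp)
prp(DTreePruned, extra = 1)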
tableDT <- table(Actual = testing$Exited, Predicted = predict(DTree, testing, type="class"))
tableDT
Predicted
Actual 0 1
0 1935 47
1 324 194
print(paste('Decision Tree Accuracy',sum(diag(tableDT))/sum(tableDT)))
[1] "Decision Tree Accuracy 0.8516"
print(paste('Decision Tree precision',p2<-tableDT[2,2]/(tableDT[2,2]+tableDT[1,2])))
[1] "Decision Tree precision 0.804979253112033"
print(paste('Decision Tree recall',r2<-tableDT[2,2]/(tableDT[2,2]+tableDT[2,1])))
[1] "Decision Tree recall 0.374517374517374"
print(paste('Decision Tree F1Score',2*p2*r2/(r2+p2)))
[1] "Decision Tree F1Score 0.511198945981555"
rfModel <- randomForest(Exited ~., data = training)
The response has five or fewer unique values. Are you sure you want to do regression?
This warning appears because Exited is stored as an integer, so randomForest fits a regression forest; its numeric predictions are therefore thresholded at 0.5 below to obtain class labels (a classification alternative is sketched after the model summary).
print(rfModel)
Call:
randomForest(formula = Exited ~ ., data = training)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 3
Mean of squared residuals: 0.1042103
% Var explained: 35.48
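If a classification forest is preferred over thresholding regression output, the response can be converted to a factor before fitting; a minimal sketch (trainingCls and rfCls are illustrative names; refitting would produce its own output, not shown here):
# Fit a classification forest: a factor response makes randomForest
# switch to classification, so predict() returns class labels directly
trainingCls <- training
trainingCls$Exited <- as.factor(trainingCls$Exited)
rfCls <- randomForest(Exited ~ ., data = trainingCls)
tableCls <- table(Actual = testing$Exited, Predicted = predict(rfCls, testing))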
tableRF <- table(Actual = testing$Exited, Predicted = ifelse(predict(rfModel, testing) > 0.5, 1, 0))
tableRF
Predicted
Actual 0 1
0 1914 68
1 280 238
print(paste('Random Forest Accuracy',sum(diag(tableRF))/sum(tableRF)))
[1] "Random Forest Accuracy 0.8608"
print(paste('Random Forest precision',p3<-tableRF[2,2]/(tableRF[2,2]+tableRF[1,2])))
[1] "Random Forest precision 0.777777777777778"
print(paste('Random Forest recall',r3<-tableRF[2,2]/(tableRF[2,2]+tableRF[2,1])))
[1] "Random Forest recall 0.459459459459459"
print(paste('Random Forest F1Score',2*p3*r3/(r3+p3)))
[1] "Random Forest F1Score 0.577669902912621"
plot(rfModel)
Exited is the 11th column of training, so the predictors and response are columns -11 and 11:
t <- tuneRF(training[, -11], training[, 11], stepFactor = 0.5, plot = TRUE,
ntreeTry = 100, trace = TRUE, improve = 0.05)
tuneRF searches left and right of the default mtry and reports the out-of-bag error at each value; the refit below uses mtry = 1.
rfModel_new <- randomForest(Exited ~., data = training, ntree = 100,
mtry = 1, importance = TRUE, proximity = TRUE)
The response has five or fewer unique values. Are you sure you want to do regression?
print(rfModel_new)
Call:
randomForest(formula = Exited ~ ., data = training, ntree = 100, mtry = 1, importance = TRUE, proximity = TRUE)
Type of random forest: regression
Number of trees: 100
No. of variables tried at each split: 1
Mean of squared residuals: 0.1228001
% Var explained: 23.97
tableRFN <- table(Actual = testing$Exited, Predicted = ifelse(predict(rfModel_new, testing) > 0.5, 1, 0))
tableRFN
Predicted
Actual 0 1
0 1978 4
1 440 78
print(paste('New Random Forest Accuracy',sum(diag(tableRFN))/sum(tableRFN)))
[1] "New Random Forest Accuracy 0.8224"
print(paste('New Random Forest precision',p4<-tableRFN[2,2]/(tableRFN[2,2]+tableRFN[1,2])))
[1] "New Random Forest precision 0.951219512195122"
print(paste('New Random Forest recall',r4<-tableRFN[2,2]/(tableRFN[2,2]+tableRFN[2,1])))
[1] "New Random Forest recall 0.150579150579151"
print(paste('New Random Forest F1Score',2*p4*r4/(r4+p4)))
[1] "New Random Forest F1Score 0.26"
varImpPlot(rfModel_new, sort=T, n.var = 5, main = 'Top 5 Feature Importance')
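The numeric scores behind the plot can be pulled out with importance(); a quick sketch (for a regression forest fit with importance = TRUE the columns are %IncMSE and IncNodePurity):
# Importance scores sorted by permutation importance (%IncMSE)
imp <- importance(rfModel_new)
imp[order(imp[, "%IncMSE"], decreasing = TRUE), ]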
In this project, we used Logistic Regression, a Decision Tree and Random Forest to analyze customer churn on this dataset. Measured by precision, the tuned Random Forest performs best (0.95); measured by recall, the original Random Forest performs best (0.46); measured by F1 score, the original Random Forest again leads (0.58), ahead of the Decision Tree (0.51); and measured by accuracy, the original Random Forest (0.86) narrowly beats the Decision Tree (0.85).
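Since the precision values (p1..p4) and recall values (r1..r4) were assigned inside the print() calls above, the comparison can be collected into a single table; a sketch:
# Side-by-side comparison of the four models, reusing the confusion
# tables and the p*/r* values computed above
results <- data.frame(
  Model     = c("Logistic Regression", "Decision Tree",
                "Random Forest", "Tuned Random Forest"),
  Accuracy  = c(sum(diag(tableLR)) / sum(tableLR),
                sum(diag(tableDT)) / sum(tableDT),
                sum(diag(tableRF)) / sum(tableRF),
                sum(diag(tableRFN)) / sum(tableRFN)),
  Precision = c(p1, p2, p3, p4),
  Recall    = c(r1, r2, r3, r4),
  F1        = c(2*p1*r1/(p1+r1), 2*p2*r2/(p2+r2),
                2*p3*r3/(p3+r3), 2*p4*r4/(p4+r4))
)
results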