This report describes Customer Churn Analysis using classification model. The dataset used in this project is data from kaggle. https://www.kaggle.com/barun2104/telecom-churn
Report Outline:
1. Data Extraction
2. Exploratory Data Analysis
3. Data Preparation
4. Modelling
5. Evaluation
6. Recommendation
Extract data in csv format into dataframe in R.
churn_df <- read.csv("telecom_churn.csv")
See the structure of dataframe. There are 3333 observation and 11 variables.
str(churn_df)
## 'data.frame': 3333 obs. of 11 variables:
## $ Churn : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AccountWeeks : int 128 107 137 84 75 118 121 147 117 141 ...
## $ ContractRenewal: int 1 1 1 0 0 0 1 0 1 0 ...
## $ DataPlan : int 1 1 0 0 0 0 1 0 0 1 ...
## $ DataUsage : num 2.7 3.7 0 0 0 0 2.03 0 0.19 3.02 ...
## $ CustServCalls : int 1 1 0 2 3 0 3 0 1 0 ...
## $ DayMins : num 265 162 243 299 167 ...
## $ DayCalls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ MonthlyCharge : num 89 82 52 57 41 57 87.3 36 63.9 93.2 ...
## $ OverageFee : num 9.87 9.78 6.06 3.1 7.42 ...
## $ RoamMins : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
Calculate statistical summary of numerical variables.
summary(churn_df)
## Churn AccountWeeks ContractRenewal DataPlan
## Min. :0.0000 Min. : 1.0 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 74.0 1st Qu.:1.0000 1st Qu.:0.0000
## Median :0.0000 Median :101.0 Median :1.0000 Median :0.0000
## Mean :0.1449 Mean :101.1 Mean :0.9031 Mean :0.2766
## 3rd Qu.:0.0000 3rd Qu.:127.0 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :243.0 Max. :1.0000 Max. :1.0000
## DataUsage CustServCalls DayMins DayCalls
## Min. :0.0000 Min. :0.000 Min. : 0.0 Min. : 0.0
## 1st Qu.:0.0000 1st Qu.:1.000 1st Qu.:143.7 1st Qu.: 87.0
## Median :0.0000 Median :1.000 Median :179.4 Median :101.0
## Mean :0.8165 Mean :1.563 Mean :179.8 Mean :100.4
## 3rd Qu.:1.7800 3rd Qu.:2.000 3rd Qu.:216.4 3rd Qu.:114.0
## Max. :5.4000 Max. :9.000 Max. :350.8 Max. :165.0
## MonthlyCharge OverageFee RoamMins
## Min. : 14.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 45.00 1st Qu.: 8.33 1st Qu.: 8.50
## Median : 53.50 Median :10.07 Median :10.30
## Mean : 56.31 Mean :10.05 Mean :10.24
## 3rd Qu.: 66.20 3rd Qu.:11.77 3rd Qu.:12.10
## Max. :111.30 Max. :18.19 Max. :20.00
Change to factor (categorical)
churn_df$ContractRenewal <- factor(churn_df$ContractRenewal,
levels = c(0,1),
labels = c("Stop", "Continue"))
churn_df$DataPlan <- factor(churn_df$DataPlan,
levels = c(0,1),
labels = c("No", "Yes"))
churn_df$Churn <- factor(churn_df$Churn,
levels = c(0,1),
labels = c("No", "Yes"))
In here i wanna saw some graphic for analyst Customer Churn using single variable with bar chart
p1 <- ggplot(churn_df, aes(x = AccountWeeks)) +
geom_bar(color="#d45087",fill = "white") +
labs(x = "AccountWeeks",
y = "Count")
p2 <- ggplot(churn_df, aes(x = DataPlan)) +
geom_bar(color="white",fill = "#d45087") +
labs(x = "Data Plan",
y = "Count")
p3 <- ggplot(churn_df, aes(x = ContractRenewal)) +
geom_bar(color="white",fill = "#d45087") +
labs(x = "contract Renewal",
y = "Count")
p4 <- ggplot(churn_df, aes(x = DataUsage)) +
geom_bar(color="#d45087",fill = "white") +
scale_y_continuous(breaks = c(50, 30, 10),
labels = c("50", "30", "10"),
limits = c(0,50)) +
labs(x = "Data Usage",
y = "Count")
p5 <- ggplot(churn_df, aes(x = CustServCalls)) +
geom_bar(color="white",fill = "#d45087") +
labs(x = "Customer Service Calls",
y = "Count")
p6 <- ggplot(churn_df, aes(x = DayMins)) +
geom_bar(color="#d45087",fill = "white") +
labs(x = "Day Mins",
y = "Count")
p7 <- ggplot(churn_df, aes(x = DayCalls)) +
geom_bar(color="#d45087",fill = "white") +
labs(x = "Day Calls",
y = "Count")
p8 <- ggplot(churn_df, aes(x = MonthlyCharge)) +
geom_bar(color="#d45087",fill = "#d45087") +
labs(x = "Monthly Charge",
y = "Count")
p9 <- ggplot(churn_df, aes(x = OverageFee)) +
geom_bar(color="#d45087",fill = "white") +
labs(x = "OverageFee",
y = "Count")
p10 <- ggplot(churn_df, aes(x = RoamMins)) +
geom_bar(color="#d45087",fill = "white") +
labs(x = "Roaming Mins",
y = "Count")
p11 <- ggplot(churn_df, aes(x = Churn)) +
geom_bar(color="white",fill = "#d45087") +
labs(x = "churn",
y = "Count")
And then to display some bar charts that have been created using the from function Gridextra. The chart is displayed in 3 columns.
library(gridExtra)
grid.arrange( p1,p2,p3,p4,p5,p6,p7,p8,p9,p10, p11, ncol = 3)
In here i wanna saw some graphic for analyst Customer Churn using two variable with density and bar chart
b1<-ggplot(data = churn_df, aes(x = AccountWeeks, fill = Churn)) +
geom_bar(position = position_dodge(preserve = "single"))+
labs(x = "Account Weeks",
y = "Count", title = "Analysis by Account Weeks")
b2<-ggplot(data = churn_df, aes(x = ContractRenewal, fill = Churn)) +
geom_bar(position = position_dodge(preserve = "single"))+
labs(x = "Contact Renewal",
y = "Count", title = "Analysis by Contract Renewal")
b3<-ggplot(data = churn_df, aes(x = DataPlan, fill = Churn)) +
geom_bar(position = position_dodge(preserve = "single"))+
labs(x = "Data Plan",
y = "Count", title = "Analysis by Data Plan")
b4<-ggplot(data=churn_df, aes(x= DataUsage, fill= Churn)) +
geom_density(alpha=.6)+
labs(title="Analysis by Data Usage",
x="Data Usage")
b5<-ggplot(data = churn_df, aes(x = CustServCalls, fill = Churn)) +
geom_bar(position = position_dodge(preserve = "single"))+
labs(x = "Customer Service Calls",
y = "Count", title = "Analysis by Cust.Serv.Calls")
b6<- ggplot(data=churn_df, aes(x= DayMins, fill= Churn)) +
geom_density(alpha=.6)+
labs(title="Analysis by Day Mins",
x="Day Mins")
b7<- ggplot(data = churn_df, aes(x = DayCalls, fill = Churn)) +
geom_bar(position = position_dodge(preserve = "single"))+
labs(x = "Day Calls",
y = "Count", title = "Analysis by Day Calls")
b8<- ggplot(data=churn_df, aes(x= MonthlyCharge, fill= Churn)) +
geom_density(alpha=.6)+
labs(title="Analysis by Monthly Charge",
x="Monthly Charge")
b9<- ggplot(data=churn_df, aes(x= OverageFee, fill= Churn)) +
geom_density(alpha=.6)+
labs(title="Analaysis by Overage Fee",
x="Overage Fee")
b10<- ggplot(data = churn_df, aes(x = RoamMins, fill = Churn)) +
geom_bar(position = position_dodge(preserve = "single"))+
labs(x = "Roam Mins",
y = "Count", title = "Analysis by Roam Mins")
And then to display some bar charts that have been created using the from function Gridextra. The chart is displayed in 2 columns.
library(gridExtra)
grid.arrange( b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,ncol = 2)
Based on this graph, it can be seen that customers who don't use data plan, have less data usage, and high monthly costs tending to churn.
Compute and visualize Pearson’s Correlation Coefficient (R) between numerical variables.
library(corrgram)
corrgram(churn_df[1:11], order = TRUE,
upper.panel = panel.pie)
Based on diagram above, Variable Monthly Charge, Data Usage, and Day Mins have the highest correlation with the target variable (Churn).
m1<-ggplot(churn_df,
aes(x = DataUsage,
y = MonthlyCharge,
color = Churn)) +
geom_point(alpha = .6,
size = 3) +
geom_smooth(se=FALSE,
method = "lm",
formula = y~poly(x,2),
size = 1.5) +
labs(x = "Data Usage", y = "Monthly Charge",
title = "Customer Churn Analyst",
color = "Churn") +
theme_minimal()
m2 <- ggplot(churn_df, aes(x=MonthlyCharge, y=DayMins, color=Churn,
shape=ContractRenewal)) +
geom_point() +
geom_jitter() +
facet_grid(~DataPlan)
Based on this graph, it can be seen that the number of customer churn will be directly proportional to Monthly Charge, Data Usage, and Day Mins. It can be also be seen that the churn of customers who use Data Plan is less than those who don't.
m = nrow(churn_df)
set.seed(2021)
train_idx <- sample(m, 0.6*m)
train_df <- churn_df[ train_idx, ]
test_df <- churn_df[ -train_idx, ]
Amount of Training Data is 70% (1999 observation) and Testing Data is 30% (1334 observation)
fit.log <- glm(formula = Churn ~ .,
data = train_df,
family = binomial)
summary(fit.log)
##
## Call:
## glm(formula = Churn ~ ., family = binomial, data = train_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8438 -0.5281 -0.3548 -0.2163 3.0029
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.3088728 0.7071436 -8.922 < 2e-16 ***
## AccountWeeks -0.0007961 0.0017391 -0.458 0.64712
## ContractRenewalContinue -1.9778014 0.1897152 -10.425 < 2e-16 ***
## DataPlanYes -1.2992494 0.6898780 -1.883 0.05966 .
## DataUsage 1.0342599 2.4731136 0.418 0.67580
## CustServCalls 0.5023753 0.0510192 9.847 < 2e-16 ***
## DayMins 0.0282999 0.0417790 0.677 0.49817
## DayCalls 0.0092181 0.0034921 2.640 0.00830 **
## MonthlyCharge -0.0894548 0.2454705 -0.364 0.71554
## OverageFee 0.2698143 0.4186154 0.645 0.51923
## RoamMins 0.0939108 0.0291313 3.224 0.00127 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1662.5 on 1998 degrees of freedom
## Residual deviance: 1337.5 on 1988 degrees of freedom
## AIC: 1359.5
##
## Number of Fisher Scoring iterations: 5
prob <- predict(fit.log,test_df,type = "response")
log.pred <- factor(prob>0.5, levels = c(FALSE, TRUE),
labels = c("NO", "YES"))
library(party)
set.seed(2021)
fit.ctree <- ctree(formula = Churn ~ .,
data = train_df,
control = ctree_control(maxdept = 3))
ctree.pred <- predict(fit.ctree, test_df, type = "response")
library(randomForest)
set.seed(2021)
fit.forest <- randomForest(formula = Churn~.,
data = train_df, importance=TRUE)
fit.forest
##
## Call:
## randomForest(formula = Churn ~ ., data = train_df, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 6.35%
## Confusion matrix:
## No Yes class.error
## No 1678 29 0.01698887
## Yes 98 194 0.33561644
importance(fit.forest)
## No Yes MeanDecreaseAccuracy MeanDecreaseGini
## AccountWeeks -2.4491753 2.130693 -1.176247 32.26186
## ContractRenewal 27.5080315 50.209070 48.202287 33.57199
## DataPlan 11.6130205 12.653960 13.118663 16.52698
## DataUsage 21.8358697 18.063430 24.210707 41.55346
## CustServCalls 50.0470940 75.212236 79.332684 66.30263
## DayMins 22.2954935 38.596029 40.265581 96.26619
## DayCalls 0.2962053 4.083924 2.358852 34.50497
## MonthlyCharge 27.8700141 43.053403 36.557543 84.82238
## OverageFee 16.1740370 16.905621 23.274860 46.06384
## RoamMins 20.2313641 24.835491 30.119859 44.84486
forest.pred <- predict(fit.forest, test_df, type = "response")
Visualize
varImpPlot(fit.forest)
Based on graph , the most important variables are Day Mins, Monthly Charge and Customer Service Calls
library(e1071)
set.seed(2021)
fit.svm <- svm(formula = Churn ~ ., data= train_df)
svm.pred <- predict(fit.svm, na.omit(test_df))
Used to calculate value of Accuracy, Precission, Recall and F1 Score
Logistic Regression
log.perf <- table (test_df$Churn, log.pred,
dnn = c("Actual", "Predicted"))
log.perf
## Predicted
## Actual NO YES
## No 1109 34
## Yes 141 50
Classification Tree
ctree.perf <- table(test_df$Churn, ctree.pred,
dnn = c("Actual", "Predicted" ))
ctree.perf
## Predicted
## Actual No Yes
## No 1099 44
## Yes 97 94
Random Forest
forest.perf <- table(test_df$Churn, forest.pred,
dnn = c("Actual", "Predited" ))
forest.perf
## Predited
## Actual No Yes
## No 1129 14
## Yes 70 121
Support Vector Machine (SVM)
svm.perf <- table(na.omit(test_df)$Churn, svm.pred,
dnn = c("Actual", "Predicted"))
svm.perf
## Predicted
## Actual No Yes
## No 1134 9
## Yes 111 80
This code is for Evaluation result of data. We compute the accuracy, preccision, recall and F1 Score.
actual <- test_df$Churn
performance <- function(table, n=2){
TN = table[1,1]
TP = table[2,2]
FN = table[2,1]
FP = table[1,2]
accuracy = (TP+TN)/(TP+TN+FP+FN)
preccision = TP/(FP+TP)
recall = TP/(FN+TP)
f1 = 2 * preccision * recall/(preccision + recall)
result <- paste("Accuracy =", round(accuracy,n),
"Precission = ", round(preccision,n),
"Recall = ", round(recall,n),
"F1 Score = ", round(f1,n))
result
}
** Accuracy, Preccision, Recall, and F1 Score**
performance(log.perf) # Logistic Regression
## [1] "Accuracy = 0.87 Precission = 0.6 Recall = 0.26 F1 Score = 0.36"
performance(ctree.perf) # Classification Tree
## [1] "Accuracy = 0.89 Precission = 0.68 Recall = 0.49 F1 Score = 0.57"
performance(forest.perf) # Random Forest
## [1] "Accuracy = 0.94 Precission = 0.9 Recall = 0.63 F1 Score = 0.74"
performance(svm.perf) # Support Vector Machine (SVM)
## [1] "Accuracy = 0.91 Precission = 0.9 Recall = 0.42 F1 Score = 0.57"
Random Forest algorithm the best among all the tested algorithms.
However, the current result surpass human level performance (94% accurracy).
Based Random Forest Model , the most important variables are Monthly Charge, Day Mins, and Customer Service Calls.
The Result can be improved by better data preparation or using other algorithms.