Description

This report describes Customer Churn Analysis using classification model. The dataset used in this project is data from kaggle. https://www.kaggle.com/barun2104/telecom-churn

Report Outline:
1. Data Extraction
2. Exploratory Data Analysis
3. Data Preparation
4. Modelling
5. Evaluation
6. Recommendation

1. Data Extraction

Extract data in csv format into dataframe in R.

churn_df <- read.csv("telecom_churn.csv")

See the structure of dataframe. There are 3333 observation and 11 variables.

str(churn_df)
## 'data.frame':    3333 obs. of  11 variables:
##  $ Churn          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AccountWeeks   : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ ContractRenewal: int  1 1 1 0 0 0 1 0 1 0 ...
##  $ DataPlan       : int  1 1 0 0 0 0 1 0 0 1 ...
##  $ DataUsage      : num  2.7 3.7 0 0 0 0 2.03 0 0.19 3.02 ...
##  $ CustServCalls  : int  1 1 0 2 3 0 3 0 1 0 ...
##  $ DayMins        : num  265 162 243 299 167 ...
##  $ DayCalls       : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ MonthlyCharge  : num  89 82 52 57 41 57 87.3 36 63.9 93.2 ...
##  $ OverageFee     : num  9.87 9.78 6.06 3.1 7.42 ...
##  $ RoamMins       : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...

Calculate statistical summary of numerical variables.

summary(churn_df)
##      Churn         AccountWeeks   ContractRenewal     DataPlan     
##  Min.   :0.0000   Min.   :  1.0   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 74.0   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :101.0   Median :1.0000   Median :0.0000  
##  Mean   :0.1449   Mean   :101.1   Mean   :0.9031   Mean   :0.2766  
##  3rd Qu.:0.0000   3rd Qu.:127.0   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :243.0   Max.   :1.0000   Max.   :1.0000  
##    DataUsage      CustServCalls      DayMins         DayCalls    
##  Min.   :0.0000   Min.   :0.000   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.:0.0000   1st Qu.:1.000   1st Qu.:143.7   1st Qu.: 87.0  
##  Median :0.0000   Median :1.000   Median :179.4   Median :101.0  
##  Mean   :0.8165   Mean   :1.563   Mean   :179.8   Mean   :100.4  
##  3rd Qu.:1.7800   3rd Qu.:2.000   3rd Qu.:216.4   3rd Qu.:114.0  
##  Max.   :5.4000   Max.   :9.000   Max.   :350.8   Max.   :165.0  
##  MonthlyCharge      OverageFee       RoamMins    
##  Min.   : 14.00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 45.00   1st Qu.: 8.33   1st Qu.: 8.50  
##  Median : 53.50   Median :10.07   Median :10.30  
##  Mean   : 56.31   Mean   :10.05   Mean   :10.24  
##  3rd Qu.: 66.20   3rd Qu.:11.77   3rd Qu.:12.10  
##  Max.   :111.30   Max.   :18.19   Max.   :20.00

2. Exploratory Data Analysis

Change to factor (categorical)

churn_df$ContractRenewal <- factor(churn_df$ContractRenewal, 
                       levels = c(0,1),
                       labels = c("Stop", "Continue"))

churn_df$DataPlan <- factor(churn_df$DataPlan,
                       levels = c(0,1),
                       labels = c("No", "Yes"))

churn_df$Churn <- factor(churn_df$Churn,
                            levels = c(0,1),
                            labels = c("No", "Yes"))

2.1 Univariate Analysis

In here i wanna saw some graphic for analyst Customer Churn using single variable with bar chart

p1 <- ggplot(churn_df, aes(x = AccountWeeks)) + 
  geom_bar(color="#d45087",fill = "white") +
  labs(x = "AccountWeeks", 
       y = "Count")

p2 <- ggplot(churn_df, aes(x = DataPlan)) + 
  geom_bar(color="white",fill = "#d45087") +
  labs(x = "Data Plan", 
       y = "Count")

p3 <- ggplot(churn_df, aes(x = ContractRenewal)) + 
  geom_bar(color="white",fill = "#d45087") +
  labs(x = "contract Renewal", 
       y = "Count")

p4 <- ggplot(churn_df, aes(x = DataUsage)) + 
  geom_bar(color="#d45087",fill = "white") +
  scale_y_continuous(breaks = c(50, 30, 10),
                     labels = c("50", "30", "10"),
                     limits = c(0,50)) +
  labs(x = "Data Usage", 
       y = "Count")

p5 <- ggplot(churn_df, aes(x = CustServCalls)) + 
  geom_bar(color="white",fill = "#d45087") +
  labs(x = "Customer Service Calls", 
       y = "Count")

p6 <- ggplot(churn_df, aes(x = DayMins)) + 
  geom_bar(color="#d45087",fill = "white") +
  labs(x = "Day Mins", 
       y = "Count")

p7 <- ggplot(churn_df, aes(x = DayCalls)) + 
  geom_bar(color="#d45087",fill = "white") +
  labs(x = "Day Calls", 
       y = "Count")

p8 <- ggplot(churn_df, aes(x = MonthlyCharge)) + 
  geom_bar(color="#d45087",fill = "#d45087") +
  labs(x = "Monthly Charge", 
       y = "Count")

p9 <- ggplot(churn_df, aes(x = OverageFee)) + 
  geom_bar(color="#d45087",fill = "white") +
  labs(x = "OverageFee", 
       y = "Count")

p10 <- ggplot(churn_df, aes(x = RoamMins)) + 
  geom_bar(color="#d45087",fill = "white") +
  labs(x = "Roaming Mins", 
       y = "Count")

p11 <- ggplot(churn_df, aes(x = Churn)) + 
  geom_bar(color="white",fill = "#d45087") +
  labs(x = "churn", 
       y = "Count")

And then to display some bar charts that have been created using the from function Gridextra. The chart is displayed in 3 columns.

library(gridExtra)
grid.arrange( p1,p2,p3,p4,p5,p6,p7,p8,p9,p10, p11, ncol = 3)

2.2 Bivariate Data Analysis

In here i wanna saw some graphic for analyst Customer Churn using two variable with density and bar chart

b1<-ggplot(data = churn_df, aes(x = AccountWeeks, fill = Churn)) + 
  geom_bar(position = position_dodge(preserve = "single"))+
  labs(x = "Account Weeks", 
       y = "Count", title = "Analysis by Account Weeks")

b2<-ggplot(data = churn_df, aes(x = ContractRenewal, fill = Churn)) + 
  geom_bar(position = position_dodge(preserve = "single"))+
  labs(x = "Contact Renewal", 
       y = "Count", title = "Analysis by Contract Renewal")

b3<-ggplot(data = churn_df, aes(x = DataPlan, fill = Churn)) + 
  geom_bar(position = position_dodge(preserve = "single"))+
  labs(x = "Data Plan", 
       y = "Count", title = "Analysis by Data Plan")

b4<-ggplot(data=churn_df, aes(x= DataUsage, fill= Churn)) +
  geom_density(alpha=.6)+
  labs(title="Analysis by Data Usage", 
       x="Data Usage")

b5<-ggplot(data = churn_df, aes(x = CustServCalls, fill = Churn)) + 
  geom_bar(position = position_dodge(preserve = "single"))+
  labs(x = "Customer Service Calls", 
       y = "Count", title = "Analysis by Cust.Serv.Calls")

b6<- ggplot(data=churn_df, aes(x= DayMins, fill= Churn)) +
  geom_density(alpha=.6)+
  labs(title="Analysis by Day Mins", 
       x="Day Mins")

b7<- ggplot(data = churn_df, aes(x = DayCalls, fill = Churn)) + 
  geom_bar(position = position_dodge(preserve = "single"))+
  labs(x = "Day Calls", 
       y = "Count", title = "Analysis by Day Calls")

b8<- ggplot(data=churn_df, aes(x= MonthlyCharge, fill= Churn)) +
  geom_density(alpha=.6)+
  labs(title="Analysis by Monthly Charge", 
       x="Monthly Charge")

b9<- ggplot(data=churn_df, aes(x= OverageFee, fill= Churn)) +
  geom_density(alpha=.6)+
  labs(title="Analaysis by Overage Fee", 
       x="Overage Fee")

b10<- ggplot(data = churn_df, aes(x = RoamMins, fill = Churn)) +
  geom_bar(position = position_dodge(preserve = "single"))+
  labs(x = "Roam Mins", 
       y = "Count", title = "Analysis by Roam Mins")

And then to display some bar charts that have been created using the from function Gridextra. The chart is displayed in 2 columns.

library(gridExtra)
grid.arrange( b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,ncol = 2)

Based on this graph, it can be seen that customers who don't use data plan, have less data usage, and high monthly costs tending to churn.

2.3 Multivariate Data Analysis

Compute and visualize Pearson’s Correlation Coefficient (R) between numerical variables.

library(corrgram)

corrgram(churn_df[1:11], order = TRUE,
         upper.panel = panel.pie)

Based on diagram above, Variable Monthly Charge, Data Usage, and Day Mins have the highest correlation with the target variable (Churn).

m1<-ggplot(churn_df, 
           aes(x = DataUsage, 
               y = MonthlyCharge, 
               color = Churn)) +
  geom_point(alpha = .6, 
             size = 3) +
  geom_smooth(se=FALSE, 
              method = "lm", 
              formula = y~poly(x,2), 
              size = 1.5) +
  labs(x = "Data Usage", y = "Monthly Charge",
       title = "Customer Churn Analyst",
       color = "Churn") +
  theme_minimal()

m2 <- ggplot(churn_df, aes(x=MonthlyCharge, y=DayMins, color=Churn,
                        shape=ContractRenewal)) +
  geom_point() +
  geom_jitter() +
  facet_grid(~DataPlan)

Based on this graph, it can be seen that the number of customer churn will be directly proportional to Monthly Charge, Data Usage, and Day Mins. It can be also be seen that the churn of customers who use Data Plan is less than those who don't.

3. Data Preparation

m = nrow(churn_df)
set.seed(2021) 

train_idx <- sample(m, 0.6*m)
train_df <- churn_df[ train_idx, ]
test_df <- churn_df[ -train_idx, ]

Amount of Training Data is 70% (1999 observation) and Testing Data is 30% (1334 observation)

4. Modeling

4.1 Logistic Regression

fit.log <- glm(formula = Churn ~ .,
                 data = train_df, 
                 family = binomial)
summary(fit.log)
## 
## Call:
## glm(formula = Churn ~ ., family = binomial, data = train_df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8438  -0.5281  -0.3548  -0.2163   3.0029  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -6.3088728  0.7071436  -8.922  < 2e-16 ***
## AccountWeeks            -0.0007961  0.0017391  -0.458  0.64712    
## ContractRenewalContinue -1.9778014  0.1897152 -10.425  < 2e-16 ***
## DataPlanYes             -1.2992494  0.6898780  -1.883  0.05966 .  
## DataUsage                1.0342599  2.4731136   0.418  0.67580    
## CustServCalls            0.5023753  0.0510192   9.847  < 2e-16 ***
## DayMins                  0.0282999  0.0417790   0.677  0.49817    
## DayCalls                 0.0092181  0.0034921   2.640  0.00830 ** 
## MonthlyCharge           -0.0894548  0.2454705  -0.364  0.71554    
## OverageFee               0.2698143  0.4186154   0.645  0.51923    
## RoamMins                 0.0939108  0.0291313   3.224  0.00127 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1662.5  on 1998  degrees of freedom
## Residual deviance: 1337.5  on 1988  degrees of freedom
## AIC: 1359.5
## 
## Number of Fisher Scoring iterations: 5
prob <- predict(fit.log,test_df,type = "response")
log.pred <- factor(prob>0.5, levels = c(FALSE, TRUE),
                     labels = c("NO", "YES"))

4.2 Classification Tree

library(party)
set.seed(2021)
fit.ctree <- ctree(formula = Churn ~ .,
                   data = train_df,
                   control = ctree_control(maxdept = 3))

ctree.pred <- predict(fit.ctree, test_df, type = "response")

4.3 Random Forest

library(randomForest)
set.seed(2021)
fit.forest <- randomForest(formula = Churn~.,
                           data = train_df,  importance=TRUE)
fit.forest
## 
## Call:
##  randomForest(formula = Churn ~ ., data = train_df, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 6.35%
## Confusion matrix:
##       No Yes class.error
## No  1678  29  0.01698887
## Yes   98 194  0.33561644
importance(fit.forest)
##                         No       Yes MeanDecreaseAccuracy MeanDecreaseGini
## AccountWeeks    -2.4491753  2.130693            -1.176247         32.26186
## ContractRenewal 27.5080315 50.209070            48.202287         33.57199
## DataPlan        11.6130205 12.653960            13.118663         16.52698
## DataUsage       21.8358697 18.063430            24.210707         41.55346
## CustServCalls   50.0470940 75.212236            79.332684         66.30263
## DayMins         22.2954935 38.596029            40.265581         96.26619
## DayCalls         0.2962053  4.083924             2.358852         34.50497
## MonthlyCharge   27.8700141 43.053403            36.557543         84.82238
## OverageFee      16.1740370 16.905621            23.274860         46.06384
## RoamMins        20.2313641 24.835491            30.119859         44.84486
forest.pred <- predict(fit.forest, test_df, type = "response")

Visualize

varImpPlot(fit.forest)

Based on graph , the most important variables are Day Mins, Monthly Charge and Customer Service Calls

4.4 Support Vector Machine(SVM)

library(e1071)
set.seed(2021)
fit.svm <- svm(formula = Churn ~ ., data= train_df)

svm.pred <- predict(fit.svm, na.omit(test_df))

5. Evaluation

5.1 Confusion Matrix

Used to calculate value of Accuracy, Precission, Recall and F1 Score

Logistic Regression

log.perf <- table (test_df$Churn, log.pred,
                     dnn = c("Actual", "Predicted"))
log.perf
##       Predicted
## Actual   NO  YES
##    No  1109   34
##    Yes  141   50

Classification Tree

ctree.perf <- table(test_df$Churn, ctree.pred, 
                    dnn = c("Actual", "Predicted" ))
ctree.perf
##       Predicted
## Actual   No  Yes
##    No  1099   44
##    Yes   97   94

Random Forest

forest.perf <- table(test_df$Churn, forest.pred,
                     dnn = c("Actual", "Predited" ))
forest.perf
##       Predited
## Actual   No  Yes
##    No  1129   14
##    Yes   70  121

Support Vector Machine (SVM)

svm.perf <- table(na.omit(test_df)$Churn, svm.pred,
                  dnn = c("Actual", "Predicted"))
svm.perf
##       Predicted
## Actual   No  Yes
##    No  1134    9
##    Yes  111   80

5.2 Performance

This code is for Evaluation result of data. We compute the accuracy, preccision, recall and F1 Score.

actual <- test_df$Churn

performance <- function(table, n=2){
  
  TN = table[1,1]
  TP = table[2,2]
  FN = table[2,1]
  FP = table[1,2]
  
  accuracy = (TP+TN)/(TP+TN+FP+FN)
  preccision = TP/(FP+TP)
  recall = TP/(FN+TP)
  f1 = 2 * preccision * recall/(preccision + recall)
  
  result <- paste("Accuracy =", round(accuracy,n),
                  "Precission = ", round(preccision,n),
                  "Recall = ", round(recall,n),
                  "F1 Score = ", round(f1,n))
  result
  
}

** Accuracy, Preccision, Recall, and F1 Score**

performance(log.perf)     # Logistic Regression
## [1] "Accuracy = 0.87 Precission =  0.6 Recall =  0.26 F1 Score =  0.36"
performance(ctree.perf)   # Classification Tree
## [1] "Accuracy = 0.89 Precission =  0.68 Recall =  0.49 F1 Score =  0.57"
performance(forest.perf)  # Random Forest
## [1] "Accuracy = 0.94 Precission =  0.9 Recall =  0.63 F1 Score =  0.74"
performance(svm.perf)     # Support Vector Machine (SVM)
## [1] "Accuracy = 0.91 Precission =  0.9 Recall =  0.42 F1 Score =  0.57"

6. Recommendation

  1. Random Forest algorithm the best among all the tested algorithms.

  2. However, the current result surpass human level performance (94% accurracy).

  3. Based Random Forest Model , the most important variables are Monthly Charge, Day Mins, and Customer Service Calls.

  4. The Result can be improved by better data preparation or using other algorithms.