Load the Packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#library(lubridate)
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## funModeling v.1.9.3 :)
## Examples and tutorials at livebook.datascienceheroes.com
##  / Now in Spanish: librovivodecienciadedatos.ai

Read the Data

library(readxl) # Excel workbook reader
# Load the "Case Data" worksheet of the churn workbook into a tibble.
churn <- read_excel(
  path = "Churn.xlsx",
  sheet = "Case Data"
)

Show the raw data

# Disabled alternative: render the whole table with knitr (the package that
# renders R Markdown and has some good additional functionality).
# library(knitr)
# kable(churn)
# Print one line per column showing its type and first few values.
churn %>% glimpse()
## Observations: 6,347
## Variables: 13
## $ ID                          <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13…
## $ `Customer Age (in months)`  <dbl> 67, 67, 55, 63, 57, 58, 57, 46, 56, 56, 5…
## $ `Churn (1 = Yes, 0 = No)`   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `CHI Score Month 0`         <dbl> 0, 62, 0, 231, 43, 138, 180, 116, 78, 78,…
## $ `CHI Score 0-1`             <dbl> 0, 4, 0, 1, -1, -10, -5, -11, -7, -37, -1…
## $ `Support Cases Month 0`     <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,…
## $ `Support Cases 0-1`         <dbl> 0, 0, 0, -1, 0, 0, 1, 0, -2, 0, 0, 0, 0, …
## $ `SP Month 0`                <dbl> 0, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0,…
## $ `SP 0-1`                    <dbl> 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0,…
## $ `Logins 0-1`                <dbl> 0, 0, 0, 167, 0, 43, 13, 0, -9, -7, 14, 0…
## $ `Blog Articles 0-1`         <dbl> 0, 0, 0, -8, 0, 0, -1, 0, 1, 0, 3, 0, 9, …
## $ `Views 0-1`                 <dbl> 0, -16, 0, 21996, 9, -33, 907, 38, 0, 30,…
## $ `Days Since Last Login 0-1` <dbl> 31, 31, 31, 0, 31, 0, 0, 6, 7, 14, 0, 31,…

Checking Missing Values

# Size of the data set: number of rows, then number of columns.
dim(churn)
## [1] 6347   13
# Compact listing of every column's type and first few values.
str(churn)
## Classes 'tbl_df', 'tbl' and 'data.frame':    6347 obs. of  13 variables:
##  $ ID                       : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Customer Age (in months) : num  67 67 55 63 57 58 57 46 56 56 ...
##  $ Churn (1 = Yes, 0 = No)  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CHI Score Month 0        : num  0 62 0 231 43 138 180 116 78 78 ...
##  $ CHI Score 0-1            : num  0 4 0 1 -1 -10 -5 -11 -7 -37 ...
##  $ Support Cases Month 0    : num  0 0 0 1 0 0 1 0 1 0 ...
##  $ Support Cases 0-1        : num  0 0 0 -1 0 0 1 0 -2 0 ...
##  $ SP Month 0               : num  0 0 0 3 0 0 3 0 3 0 ...
##  $ SP 0-1                   : num  0 0 0 0 0 0 3 0 0 0 ...
##  $ Logins 0-1               : num  0 0 0 167 0 43 13 0 -9 -7 ...
##  $ Blog Articles 0-1        : num  0 0 0 -8 0 0 -1 0 1 0 ...
##  $ Views 0-1                : num  0 -16 0 21996 9 ...
##  $ Days Since Last Login 0-1: num  31 31 31 0 31 0 0 6 7 14 ...
# Per-column summary statistics (min, quartiles, median, mean, max).
summary(churn)
##        ID       Customer Age (in months) Churn (1 = Yes, 0 = No)
##  Min.   :   1   Min.   : 0.0             Min.   :0.00000        
##  1st Qu.:1588   1st Qu.: 5.0             1st Qu.:0.00000        
##  Median :3174   Median :11.0             Median :0.00000        
##  Mean   :3174   Mean   :13.9             Mean   :0.05089        
##  3rd Qu.:4760   3rd Qu.:20.0             3rd Qu.:0.00000        
##  Max.   :6347   Max.   :67.0             Max.   :1.00000        
##  CHI Score Month 0 CHI Score 0-1      Support Cases Month 0
##  Min.   :  0.00    Min.   :-125.000   Min.   : 0.0000      
##  1st Qu.: 24.50    1st Qu.:  -8.000   1st Qu.: 0.0000      
##  Median : 87.00    Median :   0.000   Median : 0.0000      
##  Mean   : 87.32    Mean   :   5.059   Mean   : 0.7063      
##  3rd Qu.:139.00    3rd Qu.:  15.000   3rd Qu.: 1.0000      
##  Max.   :298.00    Max.   : 208.000   Max.   :32.0000      
##  Support Cases 0-1      SP Month 0         SP 0-1           Logins 0-1     
##  Min.   :-29.000000   Min.   :0.0000   Min.   :-4.00000   Min.   :-293.00  
##  1st Qu.:  0.000000   1st Qu.:0.0000   1st Qu.: 0.00000   1st Qu.:  -1.00  
##  Median :  0.000000   Median :0.0000   Median : 0.00000   Median :   2.00  
##  Mean   : -0.006932   Mean   :0.8128   Mean   : 0.03017   Mean   :  15.73  
##  3rd Qu.:  0.000000   3rd Qu.:2.6667   3rd Qu.: 0.00000   3rd Qu.:  23.00  
##  Max.   : 31.000000   Max.   :4.0000   Max.   : 4.00000   Max.   : 865.00  
##  Blog Articles 0-1    Views 0-1         Days Since Last Login 0-1
##  Min.   :-75.0000   Min.   :-28322.00   Min.   :-648.000         
##  1st Qu.:  0.0000   1st Qu.:   -11.00   1st Qu.:   0.000         
##  Median :  0.0000   Median :     0.00   Median :   0.000         
##  Mean   :  0.1572   Mean   :    96.31   Mean   :   1.765         
##  3rd Qu.:  0.0000   3rd Qu.:    27.00   3rd Qu.:   3.000         
##  Max.   :217.0000   Max.   :230414.00   Max.   :  61.000
# Checking Missing Values 
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
# Plot the share of missing values per variable next to the missingness
# patterns, and print the variables sorted by their number of missing values.
aggr_plot <- aggr(
  churn,
  labels = names(churn),
  sortVars = TRUE,
  numbers = TRUE,
  prop = TRUE,
  col = c("red", "blue"),
  ylab = c("Histogram of missing data", "Pattern"),
  cex.axis = 1,
  gap = 0
)

## 
##  Variables sorted by number of missings: 
##                   Variable Count
##                         ID     0
##   Customer Age (in months)     0
##    Churn (1 = Yes, 0 = No)     0
##          CHI Score Month 0     0
##              CHI Score 0-1     0
##      Support Cases Month 0     0
##          Support Cases 0-1     0
##                 SP Month 0     0
##                     SP 0-1     0
##                 Logins 0-1     0
##          Blog Articles 0-1     0
##                  Views 0-1     0
##  Days Since Last Login 0-1     0
# This shows that there are no missing data in our Dataset, Good Start!

Handling the Age variables for the initial Hypothesis

# Bin customer age (in months) into three bands for the initial hypothesis:
#   [0, 5]    -> "Less than 6 Months"
#   (5, 14]   -> "Btw 6 to 14 Months"
#   (14, Inf) -> "More than 14 Months"
# cut() keeps the top band open-ended, so ages above the current maximum
# still land in "More than 14 Months" rather than silently becoming NA.
# Negative ages are outside every band and become NA.
# NOTE: this overwrites the numeric age column with the banded factor.
age_band_labels <- c(
  "Less than 6 Months",
  "Btw 6 to 14 Months",
  "More than 14 Months"
)
churn$`Customer Age (in months)` <- cut(
  churn$`Customer Age (in months)`,
  breaks = c(0, 5, 14, Inf),
  labels = age_band_labels,
  include.lowest = TRUE
)
library(Hmisc)
# Frequency and proportion of customers in each age band.
describe(churn$`Customer Age (in months)`)
## churn$`Customer Age (in months)` 
##        n  missing distinct 
##     6347        0        3 
##                                                                       
## Value       Less than 6 Months  Btw 6 to 14 Months More than 14 Months
## Frequency                 1669                2284                2394
## Proportion               0.263               0.360               0.377

Handling the Dependent Variable

# Convert every column with fewer than 4 distinct values to a factor.
# In the data shown this covers the churn flag (2 values) and the age band
# (3 levels, already a factor, so re-applying factor() leaves it unchanged).

# Logical mask, one entry per column of `churn`: TRUE when the column has
# fewer than 4 distinct values. vapply() guarantees a logical vector even
# when `churn` has no columns.
col_names <- vapply(
  churn,
  function(col) length(unique(col)) < 4,
  logical(1)
)
churn[, col_names] <- lapply(churn[, col_names], factor)


library(Hmisc)
# Frequency and proportion of each churn outcome.
describe(churn$`Churn (1 = Yes, 0 = No)`)
## churn$`Churn (1 = Yes, 0 = No)` 
##        n  missing distinct 
##     6347        0        2 
##                       
## Value          0     1
## Frequency   6024   323
## Proportion 0.949 0.051

Question1

Is Wall’s belief about the dependence of churn rates on customer age supported by the data? Let us check it through visualisation.

# Bar chart of customer counts per age band, one panel per churn outcome.
age_churn_plot <- ggplot(
  data = churn,
  mapping = aes(x = `Customer Age (in months)`)
) +
  geom_bar(color = "black", fill = "orange") +
  facet_wrap(~ `Churn (1 = Yes, 0 = No)`) +
  ggtitle("Visualising Age Categories over Churn") +
  xlab("Customer Age - in months") +
  theme_bw()

# Write the chart to a PNG file. A ggplot object is only drawn when printed,
# and the file is only finalized once the device is closed; leaving the
# device open would also redirect every later plot into this file.
png(
  filename = "Age Over Churn Rate.png",
  width = 888,
  height = 571
)
print(age_churn_plot)
invisible(dev.off())

# Also draw the chart on the active device so it appears in the report.
age_churn_plot

# Cross-tabulate churn status (rows) against age band (columns) as raw counts.
xtabs(~`Churn (1 = Yes, 0 = No)` + `Customer Age (in months)`, data = churn)
##                        Customer Age (in months)
## Churn (1 = Yes, 0 = No) Less than 6 Months Btw 6 to 14 Months
##                       0               1643               2119
##                       1                 26                165
##                        Customer Age (in months)
## Churn (1 = Yes, 0 = No) More than 14 Months
##                       0                2262
##                       1                 132
library(Hmisc)
# Total customers per age band; these are the denominators for turning the
# cross-tab counts into churn rates.
describe(churn$`Customer Age (in months)`)
## churn$`Customer Age (in months)` 
##        n  missing distinct 
##     6347        0        3 
##                                                                       
## Value       Less than 6 Months  Btw 6 to 14 Months More than 14 Months
## Frequency                 1669                2284                2394
## Proportion               0.263               0.360               0.377
# Churn - Yes(1) - means that the Customer left the application and No(0) - means that the customer is still using the application 

# Wall's belief holds for customers who are still using the application:
# those who have been with us for more than 14 months are likely to stay.

# As Wall expected, the customers most likely to leave are in the 6 to 14 month range (165 of 2284 churned, about 7.2%). However, customers who have been using the system for more than 14 months (132 of 2394, about 5.5%) are leaving more often than customers who started in the last 6 months (26 of 1669, about 1.6%).

# This is a potential problem, as it contradicts our hypothesis.

Let us check other variables

#View(churn)

# Re-inspect the structure now that the age and churn columns are factors.
str(churn)
## Classes 'tbl_df', 'tbl' and 'data.frame':    6347 obs. of  13 variables:
##  $ ID                       : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Customer Age (in months) : Factor w/ 3 levels "Less than 6 Months",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Churn (1 = Yes, 0 = No)  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ CHI Score Month 0        : num  0 62 0 231 43 138 180 116 78 78 ...
##  $ CHI Score 0-1            : num  0 4 0 1 -1 -10 -5 -11 -7 -37 ...
##  $ Support Cases Month 0    : num  0 0 0 1 0 0 1 0 1 0 ...
##  $ Support Cases 0-1        : num  0 0 0 -1 0 0 1 0 -2 0 ...
##  $ SP Month 0               : num  0 0 0 3 0 0 3 0 3 0 ...
##  $ SP 0-1                   : num  0 0 0 0 0 0 3 0 0 0 ...
##  $ Logins 0-1               : num  0 0 0 167 0 43 13 0 -9 -7 ...
##  $ Blog Articles 0-1        : num  0 0 0 -8 0 0 -1 0 1 0 ...
##  $ Views 0-1                : num  0 -16 0 21996 9 ...
##  $ Days Since Last Login 0-1: num  31 31 31 0 31 0 0 6 7 14 ...

Question 2

Let us build a logistic regression Model

#install.packages("aod")
library(aod)
## 
## Attaching package: 'aod'
## The following object is masked from 'package:survival':
## 
##     rats
# Logistic regression of the churn flag on every other column except ID.
mylogit <- glm(
  formula = `Churn (1 = Yes, 0 = No)` ~ . - ID,
  family = "binomial",
  data = churn
)
# Coefficient table (log-odds scale), deviances and AIC.
summary(mylogit)
## 
## Call:
## glm(formula = `Churn (1 = Yes, 0 = No)` ~ . - ID, family = "binomial", 
##     data = churn)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8402  -0.3603  -0.2518  -0.2048   3.5279  
## 
## Coefficients:
##                                                 Estimate Std. Error z value
## (Intercept)                                   -3.840e+00  1.996e-01 -19.234
## `Customer Age (in months)`Btw 6 to 14 Months   2.152e+00  2.281e-01   9.432
## `Customer Age (in months)`More than 14 Months  1.876e+00  2.371e-01   7.916
## `CHI Score Month 0`                           -9.640e-03  1.278e-03  -7.545
## `CHI Score 0-1`                               -4.792e-03  2.509e-03  -1.910
## `Support Cases Month 0`                       -8.937e-02  1.007e-01  -0.888
## `Support Cases 0-1`                            1.409e-01  8.998e-02   1.566
## `SP Month 0`                                  -4.745e-02  1.004e-01  -0.473
## `SP 0-1`                                      -3.412e-04  7.750e-02  -0.004
## `Logins 0-1`                                   6.634e-04  2.121e-03   0.313
## `Blog Articles 0-1`                           -1.801e-03  2.177e-02  -0.083
## `Views 0-1`                                   -1.085e-04  4.173e-05  -2.600
## `Days Since Last Login 0-1`                    7.178e-03  3.303e-03   2.174
##                                               Pr(>|z|)    
## (Intercept)                                    < 2e-16 ***
## `Customer Age (in months)`Btw 6 to 14 Months   < 2e-16 ***
## `Customer Age (in months)`More than 14 Months 2.45e-15 ***
## `CHI Score Month 0`                           4.54e-14 ***
## `CHI Score 0-1`                                0.05610 .  
## `Support Cases Month 0`                        0.37467    
## `Support Cases 0-1`                            0.11736    
## `SP Month 0`                                   0.63644    
## `SP 0-1`                                       0.99649    
## `Logins 0-1`                                   0.75448    
## `Blog Articles 0-1`                            0.93408    
## `Views 0-1`                                    0.00932 ** 
## `Days Since Last Login 0-1`                    0.02974 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2553.1  on 6346  degrees of freedom
## Residual deviance: 2327.8  on 6334  degrees of freedom
## AIC: 2353.8
## 
## Number of Fisher Scoring iterations: 7
# Exponentiate the log-odds coefficients to read them as odds ratios.
mylogit %>%
  coef() %>%
  exp()
##                                   (Intercept) 
##                                     0.0215033 
##  `Customer Age (in months)`Btw 6 to 14 Months 
##                                     8.6011163 
## `Customer Age (in months)`More than 14 Months 
##                                     6.5305370 
##                           `CHI Score Month 0` 
##                                     0.9904059 
##                               `CHI Score 0-1` 
##                                     0.9952192 
##                       `Support Cases Month 0` 
##                                     0.9145070 
##                           `Support Cases 0-1` 
##                                     1.1513041 
##                                  `SP Month 0` 
##                                     0.9536617 
##                                      `SP 0-1` 
##                                     0.9996588 
##                                  `Logins 0-1` 
##                                     1.0006636 
##                           `Blog Articles 0-1` 
##                                     0.9982006 
##                                   `Views 0-1` 
##                                     0.9998915 
##                   `Days Since Last Login 0-1` 
##                                     1.0072042
# Odds ratios alongside their 2.5% / 97.5% confidence limits: bind the
# coefficients and their intervals on the log-odds scale, then exponentiate.
cbind(OR = coef(mylogit), confint(mylogit)) %>%
  exp()
## Waiting for profiling to be done...
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                                                      OR      2.5 %      97.5 %
## (Intercept)                                   0.0215033 0.01417431  0.03109744
## `Customer Age (in months)`Btw 6 to 14 Months  8.6011163 5.58713431 13.71059992
## `Customer Age (in months)`More than 14 Months 6.5305370 4.16074590 10.57355208
## `CHI Score Month 0`                           0.9904059 0.98790977  0.99287323
## `CHI Score 0-1`                               0.9952192 0.99036435  1.00015120
## `Support Cases Month 0`                       0.9145070 0.73216737  1.08684979
## `Support Cases 0-1`                           1.1513041 0.98713233  1.40367908
## `SP Month 0`                                  0.9536617 0.78385505  1.16236915
## `SP 0-1`                                      0.9996588 0.85830693  1.16330783
## `Logins 0-1`                                  1.0006636 0.99611913  1.00432867
## `Blog Articles 0-1`                           0.9982006 0.95239243  1.02704612
## `Views 0-1`                                   0.9998915 0.99981279  0.99998422
## `Days Since Last Login 0-1`                   1.0072042 1.00105981  1.01410347