library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#library(lubridate)
library(funModeling)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.9.3 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
library(readxl) # fast excel reader
# Load the "Case Data" worksheet of the churn workbook into a tibble.
churn <- read_excel(
  path = "Churn.xlsx",
  sheet = "Case Data"
)
# Optional: render the whole table with knitr (left disabled).
# library(knitr)
# kable(churn)
# Compact preview of every column: name, type, and leading values.
glimpse(churn)
## Observations: 6,347
## Variables: 13
## $ ID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13…
## $ `Customer Age (in months)` <dbl> 67, 67, 55, 63, 57, 58, 57, 46, 56, 56, 5…
## $ `Churn (1 = Yes, 0 = No)` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ `CHI Score Month 0` <dbl> 0, 62, 0, 231, 43, 138, 180, 116, 78, 78,…
## $ `CHI Score 0-1` <dbl> 0, 4, 0, 1, -1, -10, -5, -11, -7, -37, -1…
## $ `Support Cases Month 0` <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,…
## $ `Support Cases 0-1` <dbl> 0, 0, 0, -1, 0, 0, 1, 0, -2, 0, 0, 0, 0, …
## $ `SP Month 0` <dbl> 0, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0,…
## $ `SP 0-1` <dbl> 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0,…
## $ `Logins 0-1` <dbl> 0, 0, 0, 167, 0, 43, 13, 0, -9, -7, 14, 0…
## $ `Blog Articles 0-1` <dbl> 0, 0, 0, -8, 0, 0, -1, 0, 1, 0, 3, 0, 9, …
## $ `Views 0-1` <dbl> 0, -16, 0, 21996, 9, -33, 907, 38, 0, 30,…
## $ `Days Since Last Login 0-1` <dbl> 31, 31, 31, 0, 31, 0, 0, 6, 7, 14, 0, 31,…
# Row and column counts of the loaded data.
dim(churn)
## [1] 6347 13
# Show each column's storage type together with a sample of its values.
str(churn)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6347 obs. of 13 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Customer Age (in months) : num 67 67 55 63 57 58 57 46 56 56 ...
## $ Churn (1 = Yes, 0 = No) : num 0 0 0 0 0 0 0 0 0 0 ...
## $ CHI Score Month 0 : num 0 62 0 231 43 138 180 116 78 78 ...
## $ CHI Score 0-1 : num 0 4 0 1 -1 -10 -5 -11 -7 -37 ...
## $ Support Cases Month 0 : num 0 0 0 1 0 0 1 0 1 0 ...
## $ Support Cases 0-1 : num 0 0 0 -1 0 0 1 0 -2 0 ...
## $ SP Month 0 : num 0 0 0 3 0 0 3 0 3 0 ...
## $ SP 0-1 : num 0 0 0 0 0 0 3 0 0 0 ...
## $ Logins 0-1 : num 0 0 0 167 0 43 13 0 -9 -7 ...
## $ Blog Articles 0-1 : num 0 0 0 -8 0 0 -1 0 1 0 ...
## $ Views 0-1 : num 0 -16 0 21996 9 ...
## $ Days Since Last Login 0-1: num 31 31 31 0 31 0 0 6 7 14 ...
# Per-column summary statistics (min, quartiles, mean, max).
summary(churn)
## ID Customer Age (in months) Churn (1 = Yes, 0 = No)
## Min. : 1 Min. : 0.0 Min. :0.00000
## 1st Qu.:1588 1st Qu.: 5.0 1st Qu.:0.00000
## Median :3174 Median :11.0 Median :0.00000
## Mean :3174 Mean :13.9 Mean :0.05089
## 3rd Qu.:4760 3rd Qu.:20.0 3rd Qu.:0.00000
## Max. :6347 Max. :67.0 Max. :1.00000
## CHI Score Month 0 CHI Score 0-1 Support Cases Month 0
## Min. : 0.00 Min. :-125.000 Min. : 0.0000
## 1st Qu.: 24.50 1st Qu.: -8.000 1st Qu.: 0.0000
## Median : 87.00 Median : 0.000 Median : 0.0000
## Mean : 87.32 Mean : 5.059 Mean : 0.7063
## 3rd Qu.:139.00 3rd Qu.: 15.000 3rd Qu.: 1.0000
## Max. :298.00 Max. : 208.000 Max. :32.0000
## Support Cases 0-1 SP Month 0 SP 0-1 Logins 0-1
## Min. :-29.000000 Min. :0.0000 Min. :-4.00000 Min. :-293.00
## 1st Qu.: 0.000000 1st Qu.:0.0000 1st Qu.: 0.00000 1st Qu.: -1.00
## Median : 0.000000 Median :0.0000 Median : 0.00000 Median : 2.00
## Mean : -0.006932 Mean :0.8128 Mean : 0.03017 Mean : 15.73
## 3rd Qu.: 0.000000 3rd Qu.:2.6667 3rd Qu.: 0.00000 3rd Qu.: 23.00
## Max. : 31.000000 Max. :4.0000 Max. : 4.00000 Max. : 865.00
## Blog Articles 0-1 Views 0-1 Days Since Last Login 0-1
## Min. :-75.0000 Min. :-28322.00 Min. :-648.000
## 1st Qu.: 0.0000 1st Qu.: -11.00 1st Qu.: 0.000
## Median : 0.0000 Median : 0.00 Median : 0.000
## Mean : 0.1572 Mean : 96.31 Mean : 1.765
## 3rd Qu.: 0.0000 3rd Qu.: 27.00 3rd Qu.: 3.000
## Max. :217.0000 Max. :230414.00 Max. : 61.000
# Checking Missing Values
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Tabulate and plot the missing-value counts and patterns for every column;
# sortVars = TRUE also prints the variables ordered by number of missings.
aggr_plot <- aggr(
  churn,
  col = c("red", "blue"),
  numbers = TRUE,
  prop = TRUE,
  sortVars = TRUE,
  labels = names(churn),
  cex.axis = 1,
  gap = 0,
  ylab = c("Histogram of missing data", "Pattern")
)
##
## Variables sorted by number of missings:
## Variable Count
## ID 0
## Customer Age (in months) 0
## Churn (1 = Yes, 0 = No) 0
## CHI Score Month 0 0
## CHI Score 0-1 0
## Support Cases Month 0 0
## Support Cases 0-1 0
## SP Month 0 0
## SP 0-1 0
## Logins 0-1 0
## Blog Articles 0-1 0
## Views 0-1 0
## Days Since Last Login 0-1 0
# This shows that there is no missing data in our dataset. Good start!
# Bucket customer age (in months) into three bands.
# cut() bins on numeric ranges, so ages above 67 or non-integer ages still
# land in a band instead of silently becoming NA, as they would with an
# enumerated list of level values.
# The half-open intervals [0, 6), [6, 15), [15, Inf) reproduce the
# 0-5 / 6-14 / 15+ grouping for whole-month values.
# Re-running this on the already-bucketed column stops with an error rather
# than quietly overwriting every value with NA.
churn$`Customer Age (in months)` <- cut(
  churn$`Customer Age (in months)`,
  breaks = c(0, 6, 15, Inf),
  labels = c(
    "Less than 6 Months",
    "Btw 6 to 14 Months",
    "More than 14 Months"
  ),
  right = FALSE
)
library(Hmisc)
# Frequency and proportion of customers in each age band.
describe(churn$`Customer Age (in months)`)
## churn$`Customer Age (in months)`
## n missing distinct
## 6347 0 3
##
## Value Less than 6 Months Btw 6 to 14 Months More than 14 Months
## Frequency 1669 2284 2394
## Proportion 0.263 0.360 0.377
# Convert every column with fewer than 4 distinct values to a factor.
# vapply() pins the result to exactly one logical per column, whereas
# sapply() can silently change its return type (e.g. on a zero-column input).
# Note: `col_names` is a logical mask over the columns, not a vector of names.
col_names <- vapply(
  churn,
  function(col) length(unique(col)) < 4,
  logical(1)
)
churn[, col_names] <- lapply(churn[, col_names], factor)
library(Hmisc)
# Class balance of the churn flag (1 = Yes, 0 = No).
describe(churn$`Churn (1 = Yes, 0 = No)`)
## churn$`Churn (1 = Yes, 0 = No)`
## n missing distinct
## 6347 0 2
##
## Value 0 1
## Frequency 6024 323
## Proportion 0.949 0.051
Is Wall’s belief about the dependence of churn rates on customer age supported by the data? Let us check this through visualisation.
# Write the age-band bar chart, faceted by churn flag, to a PNG file.
png(
  filename = "Age Over Churn Rate.png",
  width = 888,
  height = 571
)
# print() draws the ggplot explicitly, so the chart is rendered even when the
# script is run via source(), where top-level values are not auto-printed.
print(
  ggplot(data = churn, mapping = aes(x = `Customer Age (in months)`)) +
    geom_bar(color = "black", fill = "orange") +
    facet_wrap(~ `Churn (1 = Yes, 0 = No)`) +
    ggtitle("Vistualising Age Categories over Churn") +
    xlab("Customer Age - in months") +
    theme_bw()
)
# Close the device: without this the PNG is never finalised on disk and every
# later plot would keep being drawn into the same file.
invisible(dev.off())
# Cross-tabulate the churn flag (rows) against the age band (columns).
xtabs(~`Churn (1 = Yes, 0 = No)` + `Customer Age (in months)`, data = churn)
## Customer Age (in months)
## Churn (1 = Yes, 0 = No) Less than 6 Months Btw 6 to 14 Months
## 0 1643 2119
## 1 26 165
## Customer Age (in months)
## Churn (1 = Yes, 0 = No) More than 14 Months
## 0 2262
## 1 132
library(Hmisc)
# Overall number and share of customers in each age band.
describe(churn$`Customer Age (in months)`)
## churn$`Customer Age (in months)`
## n missing distinct
## 6347 0 3
##
## Value Less than 6 Months Btw 6 to 14 Months More than 14 Months
## Frequency 1669 2284 2394
## Proportion 0.263 0.360 0.377
# Churn = 1 (Yes) means the customer left the application; Churn = 0 (No) means the customer is still using it.
# Wall's belief holds for customers who are still using the application:
# those who have been with us for more than 14 months are still likely to stay.
# Customers aged 6 to 14 months are the most likely to leave (165 of 2,284, about 7.2%), which matches Wall's assessment. However, customers who have been using the system for more than 14 months also leave more often (132 of 2,394, about 5.5%) than those who started recently (26 of 1,669, about 1.6%).
# This is a potential problem, as it contradicts our hypothesis.
#View(churn)
# Confirm the column types after the factor conversions.
str(churn)
## Classes 'tbl_df', 'tbl' and 'data.frame': 6347 obs. of 13 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Customer Age (in months) : Factor w/ 3 levels "Less than 6 Months",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Churn (1 = Yes, 0 = No) : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ CHI Score Month 0 : num 0 62 0 231 43 138 180 116 78 78 ...
## $ CHI Score 0-1 : num 0 4 0 1 -1 -10 -5 -11 -7 -37 ...
## $ Support Cases Month 0 : num 0 0 0 1 0 0 1 0 1 0 ...
## $ Support Cases 0-1 : num 0 0 0 -1 0 0 1 0 -2 0 ...
## $ SP Month 0 : num 0 0 0 3 0 0 3 0 3 0 ...
## $ SP 0-1 : num 0 0 0 0 0 0 3 0 0 0 ...
## $ Logins 0-1 : num 0 0 0 167 0 43 13 0 -9 -7 ...
## $ Blog Articles 0-1 : num 0 0 0 -8 0 0 -1 0 1 0 ...
## $ Views 0-1 : num 0 -16 0 21996 9 ...
## $ Days Since Last Login 0-1: num 31 31 31 0 31 0 0 6 7 14 ...
Let us build a logistic regression model.
#install.packages("aod")
library(aod)
##
## Attaching package: 'aod'
## The following object is masked from 'package:survival':
##
## rats
# Logistic regression of the churn flag on every other column except ID.
mylogit <- glm(
  formula = `Churn (1 = Yes, 0 = No)` ~ . - ID,
  family = "binomial",
  data = churn
)
# Coefficient table, deviances, and AIC of the fitted model.
summary(mylogit)
##
## Call:
## glm(formula = `Churn (1 = Yes, 0 = No)` ~ . - ID, family = "binomial",
## data = churn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8402 -0.3603 -0.2518 -0.2048 3.5279
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -3.840e+00 1.996e-01 -19.234
## `Customer Age (in months)`Btw 6 to 14 Months 2.152e+00 2.281e-01 9.432
## `Customer Age (in months)`More than 14 Months 1.876e+00 2.371e-01 7.916
## `CHI Score Month 0` -9.640e-03 1.278e-03 -7.545
## `CHI Score 0-1` -4.792e-03 2.509e-03 -1.910
## `Support Cases Month 0` -8.937e-02 1.007e-01 -0.888
## `Support Cases 0-1` 1.409e-01 8.998e-02 1.566
## `SP Month 0` -4.745e-02 1.004e-01 -0.473
## `SP 0-1` -3.412e-04 7.750e-02 -0.004
## `Logins 0-1` 6.634e-04 2.121e-03 0.313
## `Blog Articles 0-1` -1.801e-03 2.177e-02 -0.083
## `Views 0-1` -1.085e-04 4.173e-05 -2.600
## `Days Since Last Login 0-1` 7.178e-03 3.303e-03 2.174
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## `Customer Age (in months)`Btw 6 to 14 Months < 2e-16 ***
## `Customer Age (in months)`More than 14 Months 2.45e-15 ***
## `CHI Score Month 0` 4.54e-14 ***
## `CHI Score 0-1` 0.05610 .
## `Support Cases Month 0` 0.37467
## `Support Cases 0-1` 0.11736
## `SP Month 0` 0.63644
## `SP 0-1` 0.99649
## `Logins 0-1` 0.75448
## `Blog Articles 0-1` 0.93408
## `Views 0-1` 0.00932 **
## `Days Since Last Login 0-1` 0.02974 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2553.1 on 6346 degrees of freedom
## Residual deviance: 2327.8 on 6334 degrees of freedom
## AIC: 2353.8
##
## Number of Fisher Scoring iterations: 7
# Exponentiate the log-odds coefficients to read them as odds ratios.
exp(coef(mylogit))
## (Intercept)
## 0.0215033
## `Customer Age (in months)`Btw 6 to 14 Months
## 8.6011163
## `Customer Age (in months)`More than 14 Months
## 6.5305370
## `CHI Score Month 0`
## 0.9904059
## `CHI Score 0-1`
## 0.9952192
## `Support Cases Month 0`
## 0.9145070
## `Support Cases 0-1`
## 1.1513041
## `SP Month 0`
## 0.9536617
## `SP 0-1`
## 0.9996588
## `Logins 0-1`
## 1.0006636
## `Blog Articles 0-1`
## 0.9982006
## `Views 0-1`
## 0.9998915
## `Days Since Last Login 0-1`
## 1.0072042
# Odds ratios alongside the exponentiated 2.5% / 97.5% confidence limits
# returned by confint().
exp(cbind(OR = coef(mylogit), confint(mylogit)))
## Waiting for profiling to be done...
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## OR 2.5 % 97.5 %
## (Intercept) 0.0215033 0.01417431 0.03109744
## `Customer Age (in months)`Btw 6 to 14 Months 8.6011163 5.58713431 13.71059992
## `Customer Age (in months)`More than 14 Months 6.5305370 4.16074590 10.57355208
## `CHI Score Month 0` 0.9904059 0.98790977 0.99287323
## `CHI Score 0-1` 0.9952192 0.99036435 1.00015120
## `Support Cases Month 0` 0.9145070 0.73216737 1.08684979
## `Support Cases 0-1` 1.1513041 0.98713233 1.40367908
## `SP Month 0` 0.9536617 0.78385505 1.16236915
## `SP 0-1` 0.9996588 0.85830693 1.16330783
## `Logins 0-1` 1.0006636 0.99611913 1.00432867
## `Blog Articles 0-1` 0.9982006 0.95239243 1.02704612
## `Views 0-1` 0.9998915 0.99981279 0.99998422
## `Days Since Last Login 0-1` 1.0072042 1.00105981 1.01410347