[1] "/Users/anitaowens/R_Programming/HI Bootcamp"
df <- read.csv("~/R_Programming/HI Bootcamp/Telco-Customer-Churn.csv")
Spot problems
str(df)
'data.frame': 7043 obs. of 21 variables:
$ customerID : chr "7590-VHVEG" "5575-GNVDE" "3668-QPYBK" "7795-CFOCW" ...
$ gender : chr "Female" "Male" "Male" "Male" ...
$ SeniorCitizen : int 0 0 0 0 0 0 0 0 0 0 ...
$ Partner : chr "Yes" "No" "No" "No" ...
$ Dependents : chr "No" "No" "No" "No" ...
$ tenure : int 1 34 2 45 2 8 22 10 28 62 ...
$ PhoneService : chr "No" "Yes" "Yes" "No" ...
$ MultipleLines : chr "No phone service" "No" "No" "No phone service" ...
$ InternetService : chr "DSL" "DSL" "DSL" "DSL" ...
$ OnlineSecurity : chr "No" "Yes" "Yes" "Yes" ...
$ OnlineBackup : chr "Yes" "No" "Yes" "No" ...
$ DeviceProtection: chr "No" "Yes" "No" "Yes" ...
$ TechSupport : chr "No" "No" "No" "Yes" ...
$ StreamingTV : chr "No" "No" "No" "No" ...
$ StreamingMovies : chr "No" "No" "No" "No" ...
$ Contract : chr "Month-to-month" "One year" "Month-to-month" "One year" ...
$ PaperlessBilling: chr "Yes" "No" "Yes" "No" ...
$ PaymentMethod : chr "Electronic check" "Mailed check" "Mailed check" "Bank transfer (automatic)" ...
$ MonthlyCharges : num 29.9 57 53.9 42.3 70.7 ...
$ TotalCharges : num 29.9 1889.5 108.2 1840.8 151.7 ...
$ Churn : chr "No" "No" "Yes" "No" ...
#7043 obs and 21 vars
#Explore data interactively with Explore package
#explore(df) #uncomment to run
#we have several different measures in the dataset
#churn rate baseline
(churn.base <- df %>%
group_by(Churn) %>%
count(Churn) %>%
mutate(perc = n/nrow(df) * 100) %>%
rename(customers = n))
#ordered bar plots - need to arrange the column in descending order before plotting
ggbarplot(churn.base, x = "Churn", y = "perc",
fill = "Churn",
color = "white",
palette = "jco",
sort.val = "desc", # Sort the values in descending order
sort.by.groups = FALSE, # Don't sort inside each group
x.text.angle = 0, # Keep x-axis text horizontal
legend = "right",
xlab = " Churn",
ylab = " Percentage",
label = paste(round(churn.base$perc,0),"%", sep = ""),
label.pos = "out",
title = "Churn rate baseline",
ggtheme = theme_minimal()
)
table(df$Churn)
No Yes
5174 1869
prop.table(table(df$Churn))
No Yes
0.7346301 0.2653699
#Our churn rate is 27%
#gender on x axis
box1 <- ggplot(data = df, aes(x = gender, y = MonthlyCharges, fill = gender))+geom_boxplot() ##+ stat_summary(fun=mean, geom="point", shape=20, size=8, color="red", fill="red")
box2 <- ggplot(data = df, aes(x = gender, y = TotalCharges, fill = gender))+geom_boxplot()
box3 <- ggplot(data = df, aes(x = gender, y = tenure, fill = gender))+geom_boxplot()
#phone service on x axis
box4 <- ggplot(data = df, aes(x = PhoneService, y = MonthlyCharges, fill = PhoneService))+geom_boxplot()
box5 <- ggplot(data = df, aes(x = PhoneService, y = TotalCharges, fill = PhoneService))+geom_boxplot()
box6 <- ggplot(data = df, aes(x = PhoneService, y = tenure, fill = PhoneService))+geom_boxplot()
#contract on x axis
box7 <- ggplot(data = df, aes(x = Contract, y = MonthlyCharges, fill = Contract))+geom_boxplot() + stat_summary(fun=mean, geom="point", shape=20, size=8, color="red", fill="red") + coord_flip()
#Similar avg monthly charges across all the different term plans,
#with the largest variability amongst 1-year contract holders
box8 <- ggplot(data = df, aes(x = Contract, y = TotalCharges, fill = Contract))+geom_boxplot() # some missing values
#higher than avg total charges for those on 2-year contracts
box9 <- ggplot(data = df, aes(x = Contract, y = tenure, fill = Contract))+geom_boxplot()
#The longer the contract, the longer the average customer tenure, with the 2-year contract giving the highest avg tenure. Makes sense!
#sjPlot::plot_grid() expects a list of plots, so passing them individually fails with "unused arguments";
#use cowplot's plot_grid() (used throughout below) instead
plot_grid(box1, box2, box3, box4, labels = "AUTO")
bi1 <- ggplot(data = df, aes(x = factor(Churn), y = tenure, fill = Churn)) +geom_boxplot()
#High churn for those with lower tenures
bi2 <- ggplot(data = df, aes(x = factor(Churn), y = MonthlyCharges, fill = Churn))+geom_boxplot()
#Higher churn for those with higher than avg monthly charges
bi3 <- ggplot(data = df, aes(x = factor(Churn), y = TotalCharges, fill = Churn))+geom_boxplot()
#Lower churn for those with lower than avg total charges
plot_grid(bi1, bi2, bi3, labels = "AUTO")
bibar1 <- ggplot(data = df, aes(x=gender, fill = factor(Churn))) + geom_bar(position = "fill") + scale_fill_manual(values = c("#1b9e77", "#d95f02")) + theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
text=element_text(size=14, family="Helvetica")) + labs(x = " ", title = "Churn by gender")
#no difference by gender
bibar2 <- ggplot(data = df, aes(x=Contract, fill = factor(Churn)))+ geom_bar(position = "fill") + scale_fill_manual(values = c("#1b9e77", "#d95f02")) + theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
text=element_text(size=14, family="Helvetica")) + labs(x = " ", title = "Contract type") + coord_flip()
#We have significantly higher churn for those on month-to-month contracts
bibar3 <- ggplot(data = df, aes(x=factor(SeniorCitizen), fill = factor(Churn)))+ geom_bar(position = "fill") + scale_fill_manual(values = c("#1b9e77", "#d95f02")) + theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
text=element_text(size=14, family="Helvetica")) + labs(x = " ", title = "Is a senior citizen")
#Higher churn for Senior Citizens
bibar4 <- ggplot(data = df, aes(x=PaperlessBilling, fill = factor(Churn))) + geom_bar(position = "fill") + scale_fill_manual(values = c("#1b9e77", "#d95f02")) + theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
text=element_text(size=14, family="Helvetica")) + labs(x = " ", title = "Has paperless billing")
#higher churn rate for those on paperless billing
bibar5 <- ggplot(data = df, aes(x=PaymentMethod, fill = factor(Churn))) + geom_bar(position = "fill") + scale_fill_manual(values = c("#1b9e77", "#d95f02")) + theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
text=element_text(size=12, family="Helvetica")) + labs(x = " ", title = "Payment Method") + coord_flip()
#higher churn rate for those who pay by electronic check;
#lowest churn rates among those on automatic payment methods
bibar6 <- ggplot(data = df, aes(x=PhoneService, fill = factor(Churn))) + geom_bar(position = "fill") + labs(title = "By phone service")
#no difference by phone service
bibar7 <- ggplot(data = df, aes(x=Partner, fill = factor(Churn))) + geom_bar(position = "fill") + scale_fill_manual(values = c("#1b9e77", "#d95f02")) + theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
text=element_text(size=12, family="Helvetica")) + labs(x = " ", title = "Has a partner")
#slightly higher churn rate for singles
bibar8 <-ggplot(data = df, aes(x=Dependents, fill = factor(Churn))) + geom_bar(position = "fill") + scale_fill_manual(values = c("#1b9e77", "#d95f02")) + theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
text=element_text(size=12, family="Helvetica")) + labs(x = " ", title = "Has dependents") + coord_flip()
#higher churn rate for those with no dependents
bibar9 <- ggplot(data = df, aes(x=MultipleLines, fill = factor(Churn))) + geom_bar(position = "fill") + labs(title = "Has multiple phone lines")
##very little difference for those with multiple lines
bibar10 <- ggplot(data = df, aes(x=InternetService, fill = factor(Churn))) + geom_bar(position = "fill") + scale_fill_manual(values = c("#1b9e77", "#d95f02")) + theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
text=element_text(size=12, family="Helvetica")) + labs(x = " ", title = "Internet service") + coord_flip()
#Much higher churn for those with fiber optic internet service
bibar11 <- ggplot(data = df, aes(x=OnlineSecurity, fill = factor(Churn))) + geom_bar(position = "fill") + coord_flip() + labs(title = "Online security")
#higher churn if have no online security
bibar12 <- ggplot(data = df, aes(x=OnlineBackup, fill = factor(Churn))) + geom_bar(position = "fill") + coord_flip() + labs(title = "Online backup")
#higher churn for those who have no online backup
bibar13 <- ggplot(data = df, aes(x=DeviceProtection, fill = factor(Churn))) + geom_bar(position = "fill") + coord_flip() + labs(title = "Device protection")
#higher churn for those who have no device protection
bibar14 <- ggplot(data = df, aes(x=TechSupport, fill = factor(Churn))) + geom_bar(position = "fill") + coord_flip() + labs(title = "Tech support")
#higher churn for those with no tech support
bibar15 <- ggplot(data = df, aes(x=StreamingTV, fill = factor(Churn))) + geom_bar(position = "fill") + labs(title = "Streaming TV")
#no difference among TV streamers - but very low churn among those without internet service
bibar16 <- ggplot(data = df, aes(x=StreamingMovies, fill = factor(Churn))) + geom_bar(position = "fill") + labs(title = "Streaming movies")
#no difference among movie streamers - but very low churn among those without internet service
plot_grid(bibar1, bibar2, bibar3, bibar4,
bibar5, bibar6,
labels = "AUTO")
plot_grid(bibar7, bibar8,
bibar9, bibar10,
labels = "AUTO")
plot_grid(bibar11, bibar12, bibar13,
labels = "AUTO")
plot_grid(bibar14,
bibar15, bibar16,
labels = "AUTO")
sp1 <- ggplot(data = df, aes(x=MonthlyCharges, y=tenure, color=factor(Churn))) + geom_point(alpha = 0.5) + geom_smooth(method=lm) + labs(title = " ")
#lots of vertical banding - no horizontal patterns
#suggests that monthly charges have some association with churn, as churned customers are heavily represented at the higher end of monthly charges
#there is most likely a higher likelihood of churn once the monthly fee passes a certain point
#total charges is likely also a proxy for tenure - hence the triangle-shaped scatterplot below
sp2 <- ggplot(data = df, aes(x=TotalCharges, y=tenure, color=factor(Churn))) + geom_point(alpha = 0.5) +geom_smooth(method=lm) + labs(title = " ")
#histogram of tenure
hist(df$tenure)
#log transformation
sp3 <- ggplot(data = df, aes(x=MonthlyCharges, y=tenure, color=factor(Churn))) + geom_point(alpha = 0.5) + geom_smooth(method=lm) + scale_x_log10() + scale_y_log10() + labs(title = "Log transformation")
sp4 <- ggplot(data = df, aes(x=TotalCharges, y=tenure, color=factor(Churn))) + geom_point(alpha = 0.5) +geom_smooth(method=lm) + scale_x_log10() + labs(title = "Log transformation")
#awesome looking charts
plot_grid(sp1, sp2, sp3, sp4, labels="AUTO")
#Tenure and total charges are possibly collinear or bordering on collinearity!
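A quick numeric check of that suspicion, ahead of the correlation plot further below (TotalCharges still contains a few NAs at this point, hence complete.obs):
cor(df$tenure, df$TotalCharges, use = "complete.obs") #sketch: correlation between tenure and total charges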
#Internet Service
CrossTable(df$InternetService, df$Churn, digits=2, prop.c = TRUE,
prop.r = TRUE, prop.t = FALSE, chisq = FALSE, format = "SAS", expected = FALSE)
#
table(df$InternetService, df$Churn)
round(addmargins(prop.table(table(df$InternetService, df$Churn))),3)
#Statistical test
chisq.test(df$Churn, df$InternetService)
#Contract
CrossTable(df$Contract, df$Churn, digits=2, prop.c = TRUE,
prop.r = TRUE, prop.t = FALSE, chisq = FALSE, format = "SAS", expected = FALSE)
#
table(df$Contract, df$Churn)
round(addmargins(prop.table(table(df$Contract, df$Churn))),3)
#statistical test
chisq.test(df$Churn, df$Contract)
#Let's take monthly charges and put them into buckets
#let's get min and max Monthly Charge
min(df$MonthlyCharges)
[1] 18.25
max(df$MonthlyCharges)
[1] 118.75
mean(df$MonthlyCharges)
[1] 64.79821
median(df$MonthlyCharges)
[1] 70.35
#quartiles
ggplot(df, aes(y=MonthlyCharges)) + geom_boxplot()
#quantiles - takes a vector of proportions
quantile(df$MonthlyCharges, probs = c(0, 0.2, 0.4, 0.6, 0.8,1))
0% 20% 40% 60% 80% 100%
18.25 25.05 58.92 79.15 94.30 118.75
#create monthly fee bands
df$monthly_fee_bin <- cut(df$MonthlyCharges,
breaks= 5,
labels=c("bin1", "bin2", "bin3", "bin4", "bin5"),
right=FALSE)
table(df$monthly_fee_bin) #check results
bin1 bin2 bin3 bin4 bin5
1791 1002 1366 1821 1052
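Note that cut(breaks = 5) creates five equal-width bins, so the quintile cut points from quantile() above aren't actually used. If quantile-based bands were wanted instead, a minimal sketch (monthly_fee_quintile is a hypothetical column name):
df$monthly_fee_quintile <- cut(df$MonthlyCharges,
                               breaks = quantile(df$MonthlyCharges, probs = seq(0, 1, 0.2)),
                               labels = c("q1", "q2", "q3", "q4", "q5"),
                               include.lowest = TRUE)
table(df$monthly_fee_quintile) #roughly equal counts per band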
#check averages across the different fee bands
(fees <- df %>%
group_by(monthly_fee_bin) %>%
summarize(avg_monthly_fee = format(round(mean(MonthlyCharges),2)),
median_monthly_fee = format(round(median(MonthlyCharges),2))))
`summarise()` ungrouping output (override with `.groups` argument)
#visualize fee bands
ggplot(data = df, aes(x=monthly_fee_bin, fill = factor(Churn))) + geom_bar(position = "fill")
##Step 3: Manage data
#any missing data?
#Using VIM package
missing <- aggr(df, prop = TRUE, bars = TRUE) # - some missing values
summary(missing)
Missings per variable / in combinations of variables:
(tables truncated - only TotalCharges has missing values)
## Show cases with missing values
(missing.df <- df[!complete.cases(df),]) #11 rows
#Some total charges missing
#We can remove the rows with missing values - dropping 11 rows won't materially affect our results - the code to remove them is below
df <- df[complete.cases(df),]
#check results
sum(is.na(df$TotalCharges)) #all gone
[1] 0
#visualize entire dataframe again
aggr(df, prop = TRUE, bars = TRUE)
#any duplicate rows -
anyDuplicated(df)
[1] 0
#any duplicate columns -
names(df)[duplicated(names(df))]
character(0)
df[names(df)[!duplicated(names(df))]] #keep only the non-duplicated columns (no change here, since there are none)
There were no duplicates to worry about.
Now we want to dig deeper. We have some ideas of what to look for thanks to our EDA.
#pairs.panels
df %>%
select_if(is.numeric) %>%
scale() %>%
pairs.panels()
#Tenure and TotalCharges have a very high correlation coefficient!
#Also shows us that SeniorCitizen is coded as a numerical variable (0 or 1)
#MonthlyCharges and TotalCharges also have a high correlation coefficient.
#I take these results with a grain of salt.
df %>%
select_if(is.numeric) %>%
cor() %>%
corrplot(type = "upper", insig = "blank", addCoef.col = "black", diag=FALSE)
#monthly charges is highly correlated with total charges (0.65), which makes sense
#total charges and tenure have a high correlation (0.83),
#which is bordering on collinearity
str(df$Churn)
chr [1:7032] "No" "No" "Yes" "No" "Yes" "Yes" "No" "No" ...
(churnRate <- round(prop.table(table(df$Churn)),5))
No Yes
0.73422 0.26578
(churn_rate_overall <- churnRate[[2]])
[1] 0.26578
# recode churn variable into a numeric variable
df <- df %>%
mutate(Churn = ifelse(Churn == "Yes", 1, 0))
#check results
head(df$Churn)
[1] 0 0 1 0 1 1
#data frame grouped by fee
monthly.fee.df <- df %>%
group_by(monthly_fee_bin) %>%
summarize(total_count = n(),
total_churns = sum(Churn)) %>%
mutate(churn_rate = total_churns/total_count)
#data frame grouped by internet service
internet.df <- df %>%
group_by(InternetService) %>%
summarize(total_count = n(),
total_churns = sum(Churn)) %>%
mutate(churn_rate = total_churns/total_count)
head(internet.df) #check results
#check churn columns
sum(internet.df$total_churns)
sum(internet.df$total_count)
#add highlight flag column
internet.df <- internet.df %>%
mutate(highlight_flag =
ifelse(churn_rate > churn_rate_overall, 1, 0))
#check results
head(internet.df$highlight_flag)
#plot churn rate by internet service
ggplot(data=internet.df, aes(x=reorder(InternetService, churn_rate), y=churn_rate,
fill = factor(highlight_flag))) +
geom_bar(stat="identity") +
geom_hline(yintercept=churn_rate_overall, linetype="dashed", color = "black") +
theme(axis.text.x = element_text(angle = 90)) +
coord_flip() +
scale_fill_manual(values = c('#595959', '#e41a1c')) +
labs(x = ' ', y = 'Churn Rate', title = str_c("Churn rate by internet service")) +
theme(legend.position = "none")
#data frame grouped by payment method
pymt.method.df <- df %>%
group_by(PaymentMethod) %>%
summarize(total_count = n(),
total_churns = sum(Churn)) %>%
mutate(churn_rate = total_churns/total_count)
head(pymt.method.df) #check results
#check churn columns
sum(pymt.method.df$total_churns)
sum(pymt.method.df$total_count)
#add highlight flag column
pymt.method.df <- pymt.method.df %>%
mutate(highlight_flag =
ifelse(churn_rate > churn_rate_overall, 1, 0))
#check results
head(pymt.method.df$highlight_flag)
#plot churn rate by payment method
ggplot(data=pymt.method.df, aes(x=reorder(PaymentMethod, churn_rate), y=churn_rate,
fill = factor(highlight_flag))) +
geom_bar(stat="identity") +
geom_hline(yintercept=churn_rate_overall, linetype="dashed", color = "black") +
theme(axis.text.x = element_text(angle = 90)) +
coord_flip() +
scale_fill_manual(values = c('#595959', '#e41a1c')) +
labs(x = ' ', y = 'Churn Rate', title = str_c("Churn rate by payment method")) +
theme(legend.position = "none")
We have uncovered some insights so far. Now we have to decide whether we should do some data modeling.
#let's use a fresh dataset
df_raw <- read.csv("~/R_Programming/HI Bootcamp/Telco-Customer-Churn.csv")
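The step that builds df_dummy from df_raw isn't shown. Below is a minimal sketch of what it plausibly does, judging from the output that follows (customerID dropped, the 11 incomplete rows removed, the Yes/No columns and gender recoded to 0/1 factors, Churn as a factor with levels 0 and 1) - treat the details as assumptions rather than the author's exact code.
#Assumed preparation of df_dummy (not shown in the original)
yes_no_cols <- c("Partner", "Dependents", "PhoneService", "MultipleLines",
                 "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport",
                 "StreamingTV", "StreamingMovies", "PaperlessBilling")
df_dummy <- df_raw %>%
  select(-customerID) %>%                                  #drop the ID column
  filter(!is.na(TotalCharges)) %>%                         #drop the 11 rows with missing TotalCharges (7032 remain)
  mutate(Churn = factor(ifelse(Churn == "Yes", 1, 0)),     #target as a 0/1 factor
         gender = factor(ifelse(gender == "Male", 1, 0)),  #assumed 0/1 coding for gender
         SeniorCitizen = factor(SeniorCitizen),
         across(all_of(yes_no_cols), ~ factor(ifelse(.x == "Yes", 1, 0))),
         across(where(is.character), as.factor))           #Contract, InternetService, PaymentMethod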
str(df_dummy$Churn) #check churn variable
Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 1 2 1 ...
table(df_dummy$Churn)
0 1
5163 1869
# Check total number of rows in our dataset before splitting
nrow(df_dummy)
[1] 7032
#Set seed for reproducibility
set.seed(123)
#Split the dataset into training and test sets at 70:30 ratio using caTools package
split <- sample.split(df_dummy$Churn, SplitRatio = 0.7)
train_data <- subset(df_dummy, split == TRUE)
test_data <- subset(df_dummy, split == FALSE)
#Check if distribution of partition data is correct for train and test set
prop.table(table(train_data$Churn))
0 1
0.7342544 0.2657456
prop.table(table(test_data$Churn))
0 1
0.7341232 0.2658768
set.seed(1) # set seed for reproducibility
#Use the glm function from the base stats package to create the model
model_01_glm <- glm(formula = Churn ~ .,
data = train_data, family = "binomial")
#use to debug if glm function not working
# train_data %>%
# select_if(is.factor) %>%
# lapply(table)
#Print the model output
summary(model_01_glm)
Call:
glm(formula = Churn ~ ., family = "binomial", data = train_data)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9177 -0.6761 -0.2719 0.7253 3.5090
Coefficients:
                                       Estimate Std. Error z value Pr(>|z|)
(Intercept)                           1.445e+00  9.764e-01   1.480  0.13899
gender1                               1.162e-02  7.790e-02   0.149  0.88144
SeniorCitizen1                        3.252e-01  1.010e-01   3.219  0.00129 **
Partner1                              2.019e-02  9.429e-02   0.214  0.83041
Dependents1                          -1.550e-01  1.095e-01  -1.415  0.15702
tenure                               -6.148e-02  7.597e-03  -8.093 5.81e-16 ***
PhoneService1                         5.856e-01  7.788e-01   0.752  0.45206
MultipleLines1                        4.504e-01  2.125e-01   2.120  0.03401 *
InternetServiceFiber optic            2.019e+00  9.538e-01   2.117  0.03424 *
InternetServiceNo                    -2.319e+00  9.678e-01  -2.396  0.01659 *
OnlineSecurity1                      -2.303e-01  2.138e-01  -1.077  0.28136
OnlineBackup1                         1.104e-01  2.093e-01   0.528  0.59782
DeviceProtection1                     2.161e-01  2.123e-01   1.018  0.30871
TechSupport1                         -6.384e-02  2.138e-01  -0.299  0.76528
StreamingTV1                          7.138e-01  3.904e-01   1.828  0.06751 .
StreamingMovies1                      6.613e-01  3.917e-01   1.688  0.09132 .
ContractOne year                     -6.419e-01  1.283e-01  -5.001 5.70e-07 ***
ContractTwo year                     -1.543e+00  2.252e-01  -6.851 7.33e-12 ***
PaperlessBilling1                     3.710e-01  8.956e-02   4.142 3.44e-05 ***
PaymentMethodCredit card (automatic) -4.064e-02  1.371e-01  -0.296  0.76687
PaymentMethodElectronic check         2.866e-01  1.132e-01   2.532  0.01135 *
PaymentMethodMailed check            -2.638e-02  1.367e-01  -0.193  0.84698
MonthlyCharges                       -5.401e-02  3.800e-02  -1.421  0.15526
TotalCharges                          3.415e-04  8.628e-05   3.958 7.55e-05 ***
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5699.5 on 4921 degrees of freedom
Residual deviance: 4041.2 on 4898 degrees of freedom
AIC: 4089.2
Number of Fisher Scoring iterations: 6
The odds ratio and probability for each x variable are calculated with the formulae: Odds Ratio = exp(coefficient estimate), and Probability = Odds Ratio / (1 + Odds Ratio).
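For example, applying those formulae directly to the fitted model (a quick sketch):
#Odds ratios and implied probabilities for each coefficient in model_01_glm
odds_ratios <- exp(coef(model_01_glm))
probabilities <- odds_ratios / (1 + odds_ratios)
round(cbind(odds_ratio = odds_ratios, probability = probabilities), 3)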
##Individual coefficients significance and interpretation
#prints confidence intervals
exp(confint(model_01_glm))
Waiting for profiling to be done...
2.5 % 97.5 %
(Intercept) 0.62585298 28.7897010
gender1 0.86839042 1.1785628
SeniorCitizen1 1.13557648 1.6876014
Partner1 0.84831532 1.2277633
Dependents1 0.69048169 1.0607877
tenure 0.92620175 0.9542076
PhoneService1 0.39040235 8.2748304
MultipleLines1 1.03485063 2.3807466
InternetServiceFiber optic 1.16401499 49.0061508
InternetServiceNo 0.01474087 0.6555454
OnlineSecurity1 0.52200493 1.2072804
OnlineBackup1 0.74097961 1.6836167
DeviceProtection1 0.81873857 1.8824724
TechSupport1 0.61666623 1.4262243
StreamingTV1 0.95055719 4.3940173
StreamingMovies1 0.89964339 4.1788776
ContractOne year 0.40818333 0.6752918
ContractTwo year 0.13509802 0.3274961
PaperlessBilling1 1.21627232 1.7279757
PaymentMethodCredit card (automatic) 0.73366100 1.2559173
PaymentMethodElectronic check 1.06753609 1.6640737
PaymentMethodMailed check 0.74534346 1.2739225
MonthlyCharges 0.87935181 1.0206554
TotalCharges 1.00017446 1.0005129
#plot coefficients on odds ratio using sjPlot package
plot_model(model_01_glm, vline.color = "red",
sort.est = TRUE, show.values = TRUE)
#Calculate Variance Inflation factor using car package
vif(model_01_glm)
GVIF Df GVIF^(1/(2*Df))
gender 1.006295 1 1.003142
SeniorCitizen 1.134530 1 1.065143
Partner 1.398308 1 1.182501
Dependents 1.304214 1 1.142022
tenure 16.009663 1 4.001208
PhoneService 34.990361 1 5.915265
MultipleLines 7.328239 1 2.707072
InternetService 368.911620 2 4.382587
OnlineSecurity 4.940076 1 2.222628
OnlineBackup 6.321778 1 2.514315
DeviceProtection 6.503891 1 2.550273
TechSupport 5.181150 1 2.276214
StreamingTV 24.424063 1 4.942071
StreamingMovies 24.781447 1 4.978097
Contract 1.581055 2 1.121339
PaperlessBilling 1.127862 1 1.062009
PaymentMethod 1.405279 3 1.058345
MonthlyCharges 683.317139 1 26.140335
TotalCharges 20.614481 1 4.540317
# Feature (x) variables with a VIF value above 5 indicate high degree of
# multi-collinearity.
MonthlyCharges has a very high VIF at 600+. Some other variables, e.g. InternetService, PhoneService, StreamingTV, StreamingMovies, tenure, and TotalCharges, also have high VIFs. Normally, we would want to remove variables with high VIF, as multicollinearity can really mess with our model.
pred <- predict(model_01_glm, test_data, type = "response") #predict using test data
#check results
head(pred)
         2          5          7          8          9         12
0.04019090 0.69621075 0.50018862 0.28517147 0.59281756 0.02259621
predicted <- round(pred) #>0.5 will convert to 1
#contingency table
contingency_tab <- table(test_data$Churn, predicted)
contingency_tab
predicted
0 1
0 1373 176
1 238 323
# Confusion Matrix using the caret package
caret::confusionMatrix(contingency_tab)
Confusion Matrix and Statistics
predicted
0 1
0 1373 176
1 238 323
Accuracy : 0.8038
95% CI : (0.7862, 0.8205)
No Information Rate : 0.7635
P-Value [Acc > NIR] : 5.01e-06
Kappa : 0.479
Mcnemar's Test P-Value : 0.002718
Sensitivity : 0.8523
Specificity : 0.6473
Pos Pred Value : 0.8864
Neg Pred Value : 0.5758
Prevalence : 0.7635
Detection Rate : 0.6507
Detection Prevalence : 0.7341
Balanced Accuracy : 0.7498
'Positive' Class : 0
#Plot ROC Curve & Calculate AUC area
library(ROCR)
#ROC Curves are useful for comparing classifiers
#Check data structures first or else the ROC curve won't plot
typeof(predicted)
[1] "double"
typeof(test_data$Churn)
[1] "integer"
pr <- prediction(pred, test_data$Churn)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
#The ideal ROC curve hugs the top left corner, indicating a high
#true positive rate and a low false positive rate.
#True positive rate on y-axis
#False positive rate on the x-axis
#The larger the AUC, the better the classifier
#The AUC line is insufficient to identify a best model
#It's used in combination with qualitative examination
#of the ROC curve
auc <- performance(pr, measure = "auc")
auc
A performance instance
'Area under the ROC curve'
# AUC is ~0.84
as.numeric(performance(pr, measure = "auc")@y.values)
[1] 0.8396119
#Double density plot for explaining and picking thresholds of predicted churn probabilities. Perhaps better alternative than AUC or ROC curve
ggplot(data = test_data) + geom_density(aes(x=pred, color = Churn,linetype = Churn))
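If the density plot suggests a cut-off other than the default 0.5, it can be applied directly; the 0.3 below is purely illustrative:
predicted_30 <- ifelse(pred > 0.3, 1, 0) #hypothetical lower threshold to catch more churners
table(test_data$Churn, predicted_30)     #contingency table at the new cut-off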
AUC Interpretation:
A: Outstanding = 0.9 to 1.0
B: Excellent/Good = 0.8 to 0.9
C: Acceptable/Fair = 0.7 to 0.8
D: Poor = 0.6 to 0.7
E: No Discrimination = 0.5 to 0.6
set.seed(1) # for reproducibility
model_02_glm <- glm(formula = Churn ~ . -MonthlyCharges,
data = train_data, family = "binomial")
summary(model_02_glm)
Call:
glm(formula = Churn ~ . - MonthlyCharges, family = "binomial",
data = train_data)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9155 -0.6758 -0.2734 0.7196 3.5126
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 8.802e-02 2.059e-01 0.428 0.66898
gender1 1.479e-02 7.784e-02 0.190 0.84931
SeniorCitizen1 3.247e-01 1.010e-01 3.216 0.00130 **
Partner1 2.214e-02 9.423e-02 0.235 0.81426
Dependents1 -1.545e-01 1.094e-01 -1.412 0.15802
tenure -6.092e-02 7.567e-03 -8.051 8.21e-16 ***
PhoneService1 -4.981e-01 1.589e-01 -3.134 0.00173 **
MultipleLines1 1.811e-01 9.593e-02 1.888 0.05906 .
InternetServiceFiber optic 6.748e-01 1.172e-01 5.757 8.57e-09 ***
InternetServiceNo -9.634e-01 1.660e-01 -5.804 6.48e-09 ***
OnlineSecurity1 -4.980e-01 1.017e-01 -4.894 9.87e-07 ***
OnlineBackup1 -1.566e-01 9.240e-02 -1.694 0.09020 .
DeviceProtection1 -5.383e-02 9.496e-02 -0.567 0.57078
TechSupport1 -3.309e-01 1.024e-01 -3.232 0.00123 **
StreamingTV1 1.771e-01 9.836e-02 1.800 0.07183 .
StreamingMovies1 1.226e-01 9.808e-02 1.250 0.21136
ContractOne year -6.426e-01 1.283e-01 -5.008 5.50e-07 ***
ContractTwo year -1.542e+00 2.252e-01 -6.848 7.49e-12 ***
PaperlessBilling1 3.729e-01 8.953e-02 4.164 3.12e-05 ***
PaymentMethodCredit card (automatic) -4.020e-02 1.370e-01 -0.293 0.76923
PaymentMethodElectronic check 2.880e-01 1.132e-01 2.544 0.01096 *
PaymentMethodMailed check -2.494e-02 1.367e-01 -0.182 0.85523
TotalCharges 3.343e-04 8.593e-05 3.891 1.00e-04 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 5699.5 on 4921 degrees of freedom
Residual deviance: 4043.2 on 4899 degrees of freedom
AIC: 4089.2
Number of Fisher Scoring iterations: 6
vif(model_02_glm)
GVIF Df GVIF^(1/(2*Df))
gender 1.005342 1 1.002668
SeniorCitizen 1.135116 1 1.065418
Partner 1.397455 1 1.182140
Dependents 1.304336 1 1.142075
tenure 15.872054 1 3.983975
PhoneService 1.460078 1 1.208337
MultipleLines 1.494095 1 1.222332
InternetService 2.649715 2 1.275851
OnlineSecurity 1.119919 1 1.058262
OnlineBackup 1.232516 1 1.110187
DeviceProtection 1.301363 1 1.140773
TechSupport 1.188972 1 1.090400
StreamingTV 1.550164 1 1.245056
StreamingMovies 1.554139 1 1.246651
Contract 1.581378 2 1.121396
PaperlessBilling 1.127640 1 1.061904
PaymentMethod 1.405135 3 1.058327
TotalCharges 20.435807 1 4.520598
#plot coefficients on odds ratio
plot_model(model_02_glm, vline.color = "red",
sort.est = TRUE, show.values = TRUE)
InternetService - Fiber optic still increases the likelihood of churn the most, followed by paperless billing and being a senior citizen.
pred <- predict(model_02_glm, test_data, type = "response") #predict using test data
#check results
head(pred)
2 5 7 8 9 12
0.04440227 0.70457399 0.48898415 0.28325323 0.59169889 0.02143049
predicted <- round(pred) #>0.5 will convert predicted values to 1
#contingency table - manually done with table function
contingency_tab <- table(test_data$Churn, predicted)
contingency_tab
predicted
0 1
0 1372 177
1 237 324
# Confusion Matrix using the caret package
caret::confusionMatrix(contingency_tab)
Confusion Matrix and Statistics
predicted
0 1
0 1372 177
1 237 324
Accuracy : 0.8038
95% CI : (0.7862, 0.8205)
No Information Rate : 0.7626
P-Value [Acc > NIR] : 3.136e-06
Kappa : 0.4796
Mcnemar's Test P-Value : 0.003735
Sensitivity : 0.8527
Specificity : 0.6467
Pos Pred Value : 0.8857
Neg Pred Value : 0.5775
Prevalence : 0.7626
Detection Rate : 0.6502
Detection Prevalence : 0.7341
Balanced Accuracy : 0.7497
'Positive' Class : 0
#Plot ROC Curve & Calculate AUC area
library(ROCR)
#check data structures first
typeof(predicted)
[1] "double"
typeof(test_data$Churn)
[1] "integer"
pr <- prediction(pred, test_data$Churn)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
#The ideal ROC curve hugs the top left corner, indicating a high
#true positive rate and a low false positive rate.
#True positive rate on y-axis
#False positive rate on the x-axis
#The larger the AUC, the better the classifier
#The AUC line is insufficient to identify a best model
#It's used in combination with qualitative examination
#of the ROC curve
auc <- performance(pr, measure = "auc")
auc
A performance instance
'Area under the ROC curve'
as.numeric(performance(pr, measure = "auc")@y.values)
[1] 0.8397523
Comparing logistic regression models 1 and 2: their accuracy is practically identical (0.8038 for both), so we also care about sensitivity and specificity.
Sensitivity is the proportion of actual churners correctly identified.
Specificity is the proportion of actual non-churners correctly identified.
Note that in the caret output above the 'Positive' class is 0 (no churn), and confusionMatrix() assumes predictions in rows and the reference in columns, whereas contingency_tab was built the other way round, so the reported Sensitivity/Pos Pred Value (and Specificity/Neg Pred Value) figures are effectively swapped.
Read: https://www.theanalysisfactor.com/sensitivity-and-specificity/
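A sketch that reads churn-focused numbers straight off the model 2 contingency table above, sidestepping those caveats:
#Sensitivity and specificity for churn (class 1) from contingency_tab (rows = actual, cols = predicted)
tp <- contingency_tab["1", "1"] #churners predicted to churn (324)
fn <- contingency_tab["1", "0"] #churners predicted not to churn (237)
tn <- contingency_tab["0", "0"] #non-churners predicted not to churn (1372)
fp <- contingency_tab["0", "1"] #non-churners predicted to churn (177)
tp / (tp + fn) #sensitivity for churn: ~0.58
tn / (tn + fp) #specificity for churn: ~0.89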
# Train a Random Forest using randomForest package
set.seed(1) # for reproducibility
model_01_rf <- randomForest(formula = Churn ~ ., data = train_data, importance = TRUE, na.action=na.exclude)
# Print the model output
print(model_01_rf)
Number of trees: 500
No. of variables tried at each split: 5
OOB estimate of error rate: 20%
# model object summary (the importance values come from importance(); varImpPlot below plots them)
summary(model_01_rf)
varImpPlot(model_01_rf, main="Variable Importance")
# Train a Random Forest
set.seed(1) # for reproducibility
model_02_rf <- randomForest(formula = Churn ~ . -tenure,
data = train_data, importance = TRUE)
# Print the model output
print(model_02_rf)
varImpPlot(model_02_rf, main="Variable Importance without tenure")
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 4
OOB estimate of error rate: 21.51%
Confusion matrix:
     0   1 class.error
0 3621 499   0.1211165
1  829 677   0.5504648
Our OOB error rate increased to 21.51% from 21.3% - let's tune the first model, model_01_rf.
# Grab OOB error matrix & take a look
err <- model_01_rf$err.rate
head(err)
# Look at final OOB error rate (last row in err matrix)
oob_err <- err[nrow(err), "OOB"]
print(oob_err)
# Plot the model trained
plot(model_01_rf)
# Add a legend since it doesn't have one by default
legend(x = "right",
legend = colnames(err),
fill = 1:ncol(err))
# Generate predicted classes using the model object
class_prediction <- predict(object = model_01_rf, # model object
newdata = test_data, # test dataset
type = "class") # return classification labels
# Calculate the confusion matrix for the test set
cm <- caret::confusionMatrix(data = class_prediction,#predicted classes
reference = test_data$Churn) # actual classes
print(cm)
# Compare test set accuracy to OOB accuracy
paste0("Test Accuracy: ", cm$overall[1])
paste0("OOB Accuracy: ", 1 - oob_err)
[1] "Test Accuracy: 0.809388335704125"
[1] "OOB Accuracy: 0.796836118023462"
# Generate predictions on the test set
pred <-predict(object = model_01_rf,
newdata = test_data,
type = "prob")
# Uncomment to take a look at the pred format - `pred` object is a matrix
#head(pred)
# Compute the AUC (`actual` must be a binary 1/0 numeric vector)
round(auc(actual = ifelse(test_data$Churn == "1", 1, 0),
predicted = pred[,"1"]) ,2)
AUC is good to excellent
CAUTION: Random forest models are computationally heavy to run on your computer, especially the parameter tuning, as the model needs to iterate through all possible combinations in the hyper grid. This could take many hours to run and, if your dataset is large, could crash your computer!
Optimal model: mtry = 4, nodesize = 7, sampsize = 3445.4
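The tuning loop itself isn't shown above. A minimal sketch of the kind of OOB-error grid search the caution refers to - the grid values are illustrative assumptions, although 0.7 * nrow(train_data) does equal the 3445.4 sampsize above:
#Hypothetical grid search over mtry, nodesize and sampsize, picking the combination
#with the lowest out-of-bag (OOB) error
mtry <- seq(4, ncol(train_data) * 0.8, 2)
nodesize <- seq(3, 9, 2)
sampsize <- nrow(train_data) * c(0.7, 0.8)
hyper_grid <- expand.grid(mtry = mtry, nodesize = nodesize, sampsize = sampsize)
oob_err <- c()
for (i in 1:nrow(hyper_grid)) {
  model <- randomForest(formula = Churn ~ .,
                        data = train_data,
                        mtry = hyper_grid$mtry[i],
                        nodesize = hyper_grid$nodesize[i],
                        sampsize = hyper_grid$sampsize[i])
  oob_err[i] <- model$err.rate[nrow(model$err.rate), "OOB"]
}
hyper_grid[which.min(oob_err), ] #best combination by OOB error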
# Train a Random Forest
set.seed(1) # for reproducibility
model_03_rf_final <- randomForest(formula = Churn ~ .,
mtry = 4,
nodesize = 7,
sampsize = 3445.4,
data = train_data, importance = TRUE, keep.forest=TRUE)
# Print the model output
print(model_03_rf_final)
#variable importance
importance(model_03_rf_final)
#variable importance plot
varImpPlot(model_03_rf_final, main="Variable importance on the tuned model")
# Generate predictions on the test set
pred <-predict(object = model_03_rf_final,
newdata = test_data,
type = "prob")
round(auc(actual = ifelse(test_data$Churn == "1", 1, 0),
predicted = pred[,"1"]) ,2)
require(pROC)
rf.roc<-roc(train_data$Churn,model_03_rf_final$votes[,2])
plot(rf.roc)
auc(rf.roc)
# Train the model (to predict 'Churn') using rpart package
tree_mod_01 <- rpart(formula = Churn ~.,
data = train_data,
method = "class")
# Look at the model output
print(tree_mod_01)
# Display the results using rpart.plot package
rpart.plot(x = tree_mod_01, yesno = 2, type = 0, extra = 0)
rpart.plot(tree_mod_01,
yesno = 2,
extra = 104, # show fitted class, probs, percentages
box.palette = "GnBu", # color scheme
branch.lty = 3, # 1= solid, 3 = dotted branch lines
shadow.col = "gray", # shadows under the node boxes
nn = TRUE,
main = "Classification tree on our churn data")
# Train a gini-based model
tree_mod_02<- rpart(formula = Churn ~.,
data = train_data_tree,
method = "class",
parms = list(split = "gini"))
# Display the results
rpart.plot(x = tree_mod_02, yesno = 2, type = 0, extra = 0)
# Train an information-based model
tree_mod_03<- rpart(formula = Churn ~.,
data = train_data_tree,
method = "class",
parms = list(split = "information"))
# Display the results
rpart.plot(x = tree_mod_03, yesno = 2, type = 5, extra = 6)
# Generate predictions on the test set using the gini model
pred1 <- predict(object = tree_mod_02,
newdata = test_data_tree,
type = "class")
# Generate predictions on the test set using the information model
pred2 <- predict(object = tree_mod_03,
newdata = test_data_tree,
type = "class")
# Compare classification error - using ModelMetrics library
ce(actual = test_data$Churn,
predicted = pred1)
ce(actual = test_data$Churn,
predicted = pred2)
What insights have we discovered and can confidently report to our stakeholders?
df %>%
  filter(Churn == 1 & InternetService == "Fiber optic") %>% #Churn was recoded to 1/0 earlier
  summarize(lost_revenue = sum(TotalCharges))
Now you have a good starting point for presenting your findings and insights to stakeholders, but this is only the beginning. Currently you only have strong hypotheses, so you need to validate and test what you have uncovered. You may need to do more research or gather more data to uncover the root causes.