Descobrindo o diretório atual
getwd()
## [1] "C:/Users/ewerson.pimenta.DP6INC/Documents/R/project/Churn_Predict/02.NOTEBOOK"
Configurando o diretório de trabalho
#setwd("/cloud/project/Churn_Predict/")
#getwd()
Os dados foram retirados do site kaggle.com, os dados podem ser acessados em Telecom Churn Data Sets.
Esses dados são de clientes de telecomunicações, produtos de internet adquiridos, pagamentos, e churn (quando ocorre a saída do cliente da companhia de telecom).
require("readr") || install.packages("readr")
## Loading required package: readr
## [1] TRUE
library(readr)
churn_data <- read_csv("~/R/project/Churn_Predict/01.DATA/churn_data.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## customerID = col_character(),
## tenure = col_double(),
## PhoneService = col_character(),
## Contract = col_character(),
## PaperlessBilling = col_character(),
## PaymentMethod = col_character(),
## MonthlyCharges = col_double(),
## TotalCharges = col_double(),
## Churn = col_character()
## )
#View(churn_data)
customer_data <- read_csv("~/R/project/Churn_Predict/01.DATA/customer_data.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## customerID = col_character(),
## gender = col_character(),
## SeniorCitizen = col_double(),
## Partner = col_character(),
## Dependents = col_character()
## )
#View(customer_data)
internet_data <- read_csv("~/R/project/Churn_Predict/01.DATA/internet_data.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## customerID = col_character(),
## MultipleLines = col_character(),
## InternetService = col_character(),
## OnlineSecurity = col_character(),
## OnlineBackup = col_character(),
## DeviceProtection = col_character(),
## TechSupport = col_character(),
## StreamingTV = col_character(),
## StreamingMovies = col_character()
## )
#View(internet_data)
Telecom_Churn_Data_Dictionary <- read_csv("~/R/project/Churn_Predict/01.DATA/Telecom_Churn_Data_Dictionary.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## S.No. = col_double(),
## `Variable Name` = col_character(),
## Meaning = col_character()
## )
#View(Telecom_Churn_Data_Dictionary)
require("tidyverse") || install.packages("tidyverse")
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v dplyr 1.0.2
## v tibble 3.0.4 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.0
## v purrr 0.3.4
## Warning: package 'tibble' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## [1] TRUE
library(tidyverse)
glimpse(churn_data)
## Rows: 7,043
## Columns: 9
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CF...
## $ tenure <dbl> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49...
## $ PhoneService <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No...
## $ Contract <chr> "Month-to-month", "One year", "Month-to-month", "O...
## $ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No...
## $ PaymentMethod <chr> "Electronic check", "Mailed check", "Mailed check"...
## $ MonthlyCharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 2...
## $ TotalCharges <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1...
## $ Churn <chr> "No", "No", "Yes", "No", "Yes", "Yes", "No", "No",...
#glimpse(churn_data)
cols1 <- c("PhoneService", "Contract", "PaperlessBilling", "PaymentMethod", "Churn")
churn_data[cols1] <- lapply(churn_data[cols1], factor)
glimpse(churn_data)
## Rows: 7,043
## Columns: 9
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CF...
## $ tenure <dbl> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49...
## $ PhoneService <fct> No, Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes, Yes...
## $ Contract <fct> Month-to-month, One year, Month-to-month, One year...
## $ PaperlessBilling <fct> Yes, No, Yes, No, Yes, Yes, Yes, No, Yes, No, Yes,...
## $ PaymentMethod <fct> Electronic check, Mailed check, Mailed check, Bank...
## $ MonthlyCharges <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 2...
## $ TotalCharges <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1...
## $ Churn <fct> No, No, Yes, No, Yes, Yes, No, No, Yes, No, No, No...
summary(churn_data)
## customerID tenure PhoneService Contract
## Length:7043 Min. : 0.00 No : 682 Month-to-month:3875
## Class :character 1st Qu.: 9.00 Yes:6361 One year :1473
## Mode :character Median :29.00 Two year :1695
## Mean :32.37
## 3rd Qu.:55.00
## Max. :72.00
##
## PaperlessBilling PaymentMethod MonthlyCharges
## No :2872 Bank transfer (automatic):1544 Min. : 18.25
## Yes:4171 Credit card (automatic) :1522 1st Qu.: 35.50
## Electronic check :2365 Median : 70.35
## Mailed check :1612 Mean : 64.76
## 3rd Qu.: 89.85
## Max. :118.75
##
## TotalCharges Churn
## Min. : 18.8 No :5174
## 1st Qu.: 401.4 Yes:1869
## Median :1397.5
## Mean :2283.3
## 3rd Qu.:3794.7
## Max. :8684.8
## NA's :11
cols2 <- colnames(internet_data)[-1]
internet_data[cols2] <- lapply(internet_data[cols2], factor)
glimpse(internet_data)
## Rows: 7,043
## Columns: 9
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CF...
## $ MultipleLines <fct> No phone service, No, No, No phone service, No, Ye...
## $ InternetService <fct> DSL, DSL, DSL, DSL, Fiber optic, Fiber optic, Fibe...
## $ OnlineSecurity <fct> No, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Yes, ...
## $ OnlineBackup <fct> Yes, No, Yes, No, No, No, Yes, No, No, Yes, No, No...
## $ DeviceProtection <fct> No, Yes, No, Yes, No, Yes, No, No, Yes, No, No, No...
## $ TechSupport <fct> No, No, No, Yes, No, No, No, No, Yes, No, No, No i...
## $ StreamingTV <fct> No, No, No, No, No, Yes, Yes, No, Yes, No, No, No ...
## $ StreamingMovies <fct> No, No, No, No, No, Yes, No, No, Yes, No, No, No i...
summary(internet_data)
## customerID MultipleLines InternetService
## Length:7043 No :3390 DSL :2421
## Class :character No phone service: 682 Fiber optic:3096
## Mode :character Yes :2971 No :1526
## OnlineSecurity OnlineBackup
## No :3498 No :3088
## No internet service:1526 No internet service:1526
## Yes :2019 Yes :2429
## DeviceProtection TechSupport
## No :3095 No :3473
## No internet service:1526 No internet service:1526
## Yes :2422 Yes :2044
## StreamingTV StreamingMovies
## No :2810 No :2785
## No internet service:1526 No internet service:1526
## Yes :2707 Yes :2732
glimpse(customer_data)
## Rows: 7,043
## Columns: 5
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOCW...
## $ gender <chr> "Female", "Male", "Male", "Male", "Female", "Female",...
## $ SeniorCitizen <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes...
## $ Dependents <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No"...
summary(customer_data)
## customerID gender SeniorCitizen Partner
## Length:7043 Length:7043 Min. :0.0000 Length:7043
## Class :character Class :character 1st Qu.:0.0000 Class :character
## Mode :character Mode :character Median :0.0000 Mode :character
## Mean :0.1621
## 3rd Qu.:0.0000
## Max. :1.0000
## Dependents
## Length:7043
## Class :character
## Mode :character
##
##
##
levels(as.factor((customer_data$SeniorCitizen)))
## [1] "0" "1"
customer_data <- customer_data %>%
mutate(SeniorCitizen = ifelse(SeniorCitizen == 0, "No", "Yes"))
glimpse(customer_data)
## Rows: 7,043
## Columns: 5
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOCW...
## $ gender <chr> "Female", "Male", "Male", "Male", "Female", "Female",...
## $ SeniorCitizen <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No",...
## $ Partner <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes...
## $ Dependents <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No"...
cols3 <- colnames(customer_data)[-1]
customer_data[cols3] <- lapply(customer_data[cols3], factor)
glimpse(customer_data)
## Rows: 7,043
## Columns: 5
## $ customerID <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOCW...
## $ gender <fct> Female, Male, Male, Male, Female, Female, Male, Femal...
## $ SeniorCitizen <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N...
## $ Partner <fct> Yes, No, No, No, No, No, No, No, Yes, No, Yes, No, Ye...
## $ Dependents <fct> No, No, No, No, No, No, Yes, No, No, Yes, Yes, No, No...
summary(customer_data)
## customerID gender SeniorCitizen Partner Dependents
## Length:7043 Female:3488 No :5901 No :3641 No :4933
## Class :character Male :3555 Yes:1142 Yes:3402 Yes:2110
## Mode :character
remove(cols1,cols2,cols3)
dim(churn_data)[1] == dim(internet_data)[1]
## [1] TRUE
dim(churn_data)[1] == dim(customer_data)[1]
## [1] TRUE
# Total de colunas da base após merge
dim(churn_data)[2] + dim(internet_data)[2] + dim(customer_data)[2] - 2
## [1] 21
data <- merge(churn_data, merge(internet_data,customer_data,by.x = "customerID"),by.x = "customerID")
glimpse(data)
## Rows: 7,043
## Columns: 21
## $ customerID <chr> "0002-ORFBO", "0003-MKNFE", "0004-TLHLJ", "0011-IG...
## $ tenure <dbl> 9, 9, 4, 13, 3, 9, 71, 63, 7, 65, 54, 72, 5, 72, 5...
## $ PhoneService <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, ...
## $ Contract <fct> One year, Month-to-month, Month-to-month, Month-to...
## $ PaperlessBilling <fct> Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, N...
## $ PaymentMethod <fct> Mailed check, Mailed check, Electronic check, Elec...
## $ MonthlyCharges <dbl> 65.60, 59.90, 73.90, 98.00, 83.90, 69.40, 109.70, ...
## $ TotalCharges <dbl> 593.30, 542.40, 280.85, 1237.85, 267.40, 571.45, 7...
## $ Churn <fct> No, No, Yes, Yes, Yes, No, No, No, No, No, No, No,...
## $ MultipleLines <fct> No, Yes, No, No, No, No, No, Yes, No, Yes, No phon...
## $ InternetService <fct> DSL, DSL, Fiber optic, Fiber optic, Fiber optic, D...
## $ OnlineSecurity <fct> No, No, No, No, No, No, Yes, Yes, Yes, Yes, Yes, Y...
## $ OnlineBackup <fct> Yes, No, No, Yes, No, No, Yes, No, No, Yes, No, Ye...
## $ DeviceProtection <fct> No, No, Yes, Yes, No, No, Yes, No, No, Yes, No, Ye...
## $ TechSupport <fct> Yes, No, No, No, Yes, Yes, Yes, Yes, No, Yes, Yes,...
## $ StreamingTV <fct> Yes, No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes,...
## $ StreamingMovies <fct> No, Yes, No, Yes, No, Yes, Yes, No, No, Yes, No, Y...
## $ gender <fct> Female, Male, Male, Male, Female, Female, Female, ...
## $ SeniorCitizen <fct> No, No, No, Yes, Yes, No, Yes, No, Yes, No, No, No...
## $ Partner <fct> Yes, No, No, Yes, Yes, No, Yes, Yes, No, Yes, No, ...
## $ Dependents <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, No, Yes...
DT::datatable(data)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
summary(data)
## customerID tenure PhoneService Contract
## Length:7043 Min. : 0.00 No : 682 Month-to-month:3875
## Class :character 1st Qu.: 9.00 Yes:6361 One year :1473
## Mode :character Median :29.00 Two year :1695
## Mean :32.37
## 3rd Qu.:55.00
## Max. :72.00
##
## PaperlessBilling PaymentMethod MonthlyCharges
## No :2872 Bank transfer (automatic):1544 Min. : 18.25
## Yes:4171 Credit card (automatic) :1522 1st Qu.: 35.50
## Electronic check :2365 Median : 70.35
## Mailed check :1612 Mean : 64.76
## 3rd Qu.: 89.85
## Max. :118.75
##
## TotalCharges Churn MultipleLines InternetService
## Min. : 18.8 No :5174 No :3390 DSL :2421
## 1st Qu.: 401.4 Yes:1869 No phone service: 682 Fiber optic:3096
## Median :1397.5 Yes :2971 No :1526
## Mean :2283.3
## 3rd Qu.:3794.7
## Max. :8684.8
## NA's :11
## OnlineSecurity OnlineBackup
## No :3498 No :3088
## No internet service:1526 No internet service:1526
## Yes :2019 Yes :2429
##
##
##
##
## DeviceProtection TechSupport
## No :3095 No :3473
## No internet service:1526 No internet service:1526
## Yes :2422 Yes :2044
##
##
##
##
## StreamingTV StreamingMovies gender
## No :2810 No :2785 Female:3488
## No internet service:1526 No internet service:1526 Male :3555
## Yes :2707 Yes :2732
##
##
##
##
## SeniorCitizen Partner Dependents
## No :5901 No :3641 No :4933
## Yes:1142 Yes:3402 Yes:2110
##
##
##
##
##
length(levels(as.factor(data$customerID)))
## [1] 7043
dim(data)
## [1] 7043 21
Existem 7.043 ID’s de clientes únicos.
options(repr.plot.width = 6, repr.plot.height = 4)
missing_data <- data %>% summarise_all(funs(sum(is.na(.))/n()))
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
missing_data <- gather(missing_data, key = "variables", value = "percent_missing")
ggplot(missing_data, aes(x = reorder(variables, percent_missing), y = percent_missing)) +
geom_bar(stat = "identity", fill = "red", aes(color = I('white')), size = 0.3)+
xlab('variables')+
coord_flip()+
theme_bw()
Podemos perceber que a variável TotalCharger possui valores faltantes. Existem apenas \(0,15\%\) de valores faltantes presentes na variável TotalCharges.
options(repr.plot.width = 6, repr.plot.height = 4)
churn_data %>%
group_by(Churn) %>%
summarise(Count = n())%>%
mutate(percent = prop.table(Count)*100)%>%
ggplot(aes(reorder(Churn, -percent), percent), fill = Churn)+
geom_col(fill = c("#b0b0b0", "#E7B800"))+
geom_text(aes(label = sprintf("%.2f%%", percent)), hjust = 0.01,vjust = -0.5, size =3)+
theme_bw()+
xlab("Churn") +
ylab("Percent")+
ggtitle("Churn Percent")
## `summarise()` ungrouping output (override with `.groups` argument)
Aproximadamente \(26\%\) dos clientes da base de fato realizaram churn.
library(cowplot)
theme1 <- theme_bw()+
theme(axis.text.x = element_text(angle = 0, hjust = 1, vjust = 0.5),legend.position="none",axis.text = element_text(colour = "red"))
theme2 <- theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.8),legend.position="none",axis.text = element_text(colour = "red"))
options(repr.plot.width = 12, repr.plot.height = 8)
#update_geom_defaults("bar", list(fill = "#E7B800"))
plot_grid(
ggplot(data, aes(x=gender ,fill=Churn))+
geom_bar(position = 'fill')+ theme1 +
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=SeniorCitizen,fill=Churn))+
geom_bar(position = 'fill')+theme1+
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=Partner,fill=Churn))+
geom_bar(position = 'fill')+theme1+
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=Dependents,fill=Churn))+
geom_bar(position = 'fill')+theme1+
scale_fill_manual(values=c('#b0b0b0','#E7B800'))+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
align = "h")
options(repr.plot.width = 12, repr.plot.height = 8)
plot_grid(
ggplot(data, aes(x=PhoneService,fill=Churn))+
geom_bar(position = 'fill')+theme1+
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=MultipleLines,fill=Churn))+
geom_bar(position = 'fill')+theme2+
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=Contract,fill=Churn))+
geom_bar(position = 'fill')+theme2+
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=PaperlessBilling,fill=Churn))+
geom_bar(position = 'fill')+theme1+
scale_fill_manual(values=c('#b0b0b0','#E7B800'))+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
align = "h")
options(repr.plot.width = 12, repr.plot.height = 8)
plot_grid(
ggplot(data, aes(x=PaymentMethod,fill=Churn))+
geom_bar(position = 'fill')+theme2+
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=InternetService,fill=Churn))+
geom_bar(position = 'fill')+theme2+
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=OnlineSecurity,fill=Churn))+
geom_bar(position = 'fill')+theme2+
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=PaymentMethod,fill=Churn))+
geom_bar(position = 'fill')+theme2+
scale_fill_manual(values=c('#b0b0b0','#E7B800'))+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
align = "h")
options(repr.plot.width = 12, repr.plot.height = 8)
plot_grid(
ggplot(data, aes(x=StreamingMovies,fill=Churn))+
geom_bar(position = 'fill')+theme2+
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=StreamingTV,fill=Churn))+
geom_bar(position = 'fill')+theme2+
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=OnlineBackup,fill=Churn))+
geom_bar(position = 'fill')+theme2+
scale_fill_manual(values=c('#b0b0b0','#E7B800')),
ggplot(data, aes(x=DeviceProtection,fill=Churn))+
geom_bar(position = 'fill')+theme2+
scale_fill_manual(values=c('#b0b0b0','#E7B800'))+
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
align = "h")
library(ggthemes)
##
## Attaching package: 'ggthemes'
## The following object is masked from 'package:cowplot':
##
## theme_map
numeric_cols <- c("TotalCharges", "MonthlyCharges", "tenure", "Churn")
numeric_data <- data %>% dplyr::select(numeric_cols) %>% na.omit
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(numeric_cols)` instead of `numeric_cols` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
options(repr.plot.width = 12, repr.plot.height = 8)
plot_grid(
ggplot(numeric_data, aes(tenure, MonthlyCharges, colour = Churn))+
geom_point() + theme_hc()+ scale_colour_hc(),
ggplot(numeric_data, aes(tenure, TotalCharges, colour = Churn))+
geom_point() + theme_hc()+ scale_colour_hc() +
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
align = "h")
library(tidyverse)
data <- data %>%
mutate(censura = ifelse(Churn == "Yes", 1, 0))
summary(data$TotalCharges)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.8 401.4 1397.5 2283.3 3794.7 8684.8 11
summary(data$MonthlyCharges)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.25 35.50 70.35 64.76 89.85 118.75
library(tidyverse)
data <- data %>%
mutate(MonthlyCharges_categ = ifelse(MonthlyCharges < 70, "< 70",
">= 70"),
TotalCharges_categ = ifelse(TotalCharges < 2000, "< 2000",
">= 2000"))
library(tidyverse)
data <- data %>%
mutate(censura = ifelse(Churn == "Yes", 1, 0))
library(survival)
ekm <-survfit(Surv(data$tenure, data$censura)~1)
plot(ekm,ylab="S(t)_KM",xlab="Tempo (meses)",
conf.int=F, col="purple", lwd=2)
library(survival)
#Funcao de sobrevivencia por Kaplan-Meier:
ekm.Contract <-survfit(Surv(data$tenure,data$censura)~data$Contract)
summary(ekm)
## Call: survfit(formula = Surv(data$tenure, data$censura) ~ 1)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 7032 380 0.946 0.00270 0.941 0.951
## 2 6419 123 0.928 0.00310 0.922 0.934
## 3 6181 94 0.914 0.00338 0.907 0.920
## 4 5981 83 0.901 0.00361 0.894 0.908
## 5 5805 64 0.891 0.00377 0.884 0.899
## 6 5672 40 0.885 0.00388 0.877 0.892
## 7 5562 51 0.877 0.00400 0.869 0.885
## 8 5431 42 0.870 0.00411 0.862 0.878
## 9 5308 46 0.862 0.00422 0.854 0.871
## 10 5189 45 0.855 0.00433 0.846 0.863
## 11 5073 31 0.850 0.00440 0.841 0.858
## 12 4974 38 0.843 0.00449 0.834 0.852
## 13 4857 38 0.837 0.00458 0.828 0.846
## 14 4748 24 0.832 0.00464 0.823 0.842
## 15 4672 37 0.826 0.00473 0.817 0.835
## 16 4573 28 0.821 0.00479 0.811 0.830
## 17 4493 26 0.816 0.00486 0.807 0.826
## 18 4406 24 0.812 0.00491 0.802 0.821
## 19 4309 19 0.808 0.00496 0.798 0.818
## 20 4236 18 0.805 0.00501 0.795 0.814
## 21 4165 17 0.801 0.00505 0.791 0.811
## 22 4102 27 0.796 0.00512 0.786 0.806
## 23 4012 13 0.793 0.00515 0.783 0.804
## 24 3927 23 0.789 0.00521 0.779 0.799
## 25 3833 23 0.784 0.00527 0.774 0.794
## 26 3754 15 0.781 0.00531 0.771 0.791
## 27 3675 13 0.778 0.00535 0.768 0.789
## 28 3603 12 0.776 0.00538 0.765 0.786
## 29 3546 15 0.772 0.00542 0.762 0.783
## 30 3474 16 0.769 0.00547 0.758 0.779
## 31 3402 16 0.765 0.00552 0.754 0.776
## 32 3337 19 0.761 0.00558 0.750 0.772
## 33 3268 14 0.757 0.00562 0.747 0.769
## 34 3204 12 0.755 0.00566 0.744 0.766
## 35 3139 15 0.751 0.00571 0.740 0.762
## 36 3051 10 0.749 0.00574 0.737 0.760
## 37 3001 15 0.745 0.00580 0.734 0.756
## 38 2936 13 0.742 0.00584 0.730 0.753
## 39 2877 14 0.738 0.00589 0.726 0.750
## 40 2821 13 0.734 0.00594 0.723 0.746
## 41 2757 14 0.731 0.00599 0.719 0.743
## 42 2687 14 0.727 0.00605 0.715 0.739
## 43 2622 15 0.723 0.00611 0.711 0.735
## 44 2557 6 0.721 0.00613 0.709 0.733
## 45 2506 6 0.719 0.00616 0.707 0.732
## 46 2445 12 0.716 0.00621 0.704 0.728
## 47 2371 14 0.712 0.00628 0.699 0.724
## 48 2303 9 0.709 0.00632 0.697 0.721
## 49 2239 15 0.704 0.00640 0.692 0.717
## 50 2173 10 0.701 0.00645 0.688 0.714
## 51 2105 8 0.698 0.00649 0.686 0.711
## 52 2037 8 0.695 0.00654 0.683 0.708
## 53 1957 14 0.690 0.00663 0.678 0.704
## 54 1887 13 0.686 0.00671 0.673 0.699
## 55 1819 9 0.682 0.00677 0.669 0.696
## 56 1755 10 0.678 0.00684 0.665 0.692
## 57 1675 8 0.675 0.00691 0.662 0.689
## 58 1610 11 0.671 0.00700 0.657 0.684
## 59 1543 8 0.667 0.00707 0.653 0.681
## 60 1483 6 0.664 0.00713 0.651 0.679
## 61 1407 8 0.661 0.00721 0.647 0.675
## 62 1331 5 0.658 0.00727 0.644 0.673
## 63 1261 4 0.656 0.00732 0.642 0.671
## 64 1189 4 0.654 0.00738 0.640 0.668
## 65 1109 9 0.649 0.00753 0.634 0.663
## 66 1033 13 0.640 0.00776 0.625 0.656
## 67 944 10 0.634 0.00797 0.618 0.649
## 68 846 9 0.627 0.00820 0.611 0.643
## 69 746 8 0.620 0.00845 0.604 0.637
## 70 651 11 0.610 0.00888 0.593 0.627
## 71 532 6 0.603 0.00921 0.585 0.621
## 72 362 6 0.593 0.00992 0.574 0.613
plot(ekm.Contract, lty=c(2,1,3),xlab="Customer Tenure (months)",ylab="Customer Survival Chance (%)",lwd=2, col=c(3,6,7))
legend(1,0.4,lty=c(2,1,3),legend = c("Month-to-month","One year", "Two year"),lwd=2,bty="n", col=c(3,6,7),cex=0.7)
survdiff(Surv(data$tenure,data$censura)~data$Contract)
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$Contract)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$Contract=Month-to-month 3875 1655 708 1265 2304
## data$Contract=One year 1473 166 471 197 270
## data$Contract=Two year 1695 48 690 597 1061
##
## Chisq= 2353 on 2 degrees of freedom, p= <2e-16
O teste de logrank parte da hipótese nula de que as curvas, dos modelos com e sem a inclusão da covariável em questão, são iguais.
O \(\text{p-valor}\le 2e^{-16}\), indicando que existe diferença significativa entre as curvas, e que a covariável Contract pode ser relevante para explicar a variável resposta (tempo até ocorrência de churn).
Não entraremos em detalhes de métodos de seleção de covariáveis. Porém, repetindo esse procedimento com as outras coavariáveis existentes, e testando interações (modelos multivariados) pode-se chegar em um modelo que melhor explica a resposta dos dados.
VERIFICANDO SIGNIFICÂNCIA DAS COVAR. INDIVIDUALMENTE
survdiff(Surv(data$tenure,data$censura)~data$gender) # não signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$gender)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$gender=Female 3488 939 923 0.261 0.526
## data$gender=Male 3555 930 946 0.255 0.526
##
## Chisq= 0.5 on 1 degrees of freedom, p= 0.5
survdiff(Surv(data$tenure,data$censura)~data$Partner) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$Partner)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$Partner=No 3641 1200 773 236 424
## data$Partner=Yes 3402 669 1096 166 424
##
## Chisq= 424 on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$Dependents) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$Dependents)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$Dependents=No 4933 1543 1234 77.3 233
## data$Dependents=Yes 2110 326 635 150.3 233
##
## Chisq= 233 on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$SeniorCitizen) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$SeniorCitizen)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$SeniorCitizen=No 5901 1393 1560 17.8 109
## data$SeniorCitizen=Yes 1142 476 309 89.8 109
##
## Chisq= 110 on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$StreamingMovies) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$StreamingMovies)
##
## N Observed Expected (O-E)^2/E
## data$StreamingMovies=No 2785 938 614 171.50
## data$StreamingMovies=No internet service 1526 113 388 195.36
## data$StreamingMovies=Yes 2732 818 867 2.76
## (O-E)^2/V
## data$StreamingMovies=No 264.22
## data$StreamingMovies=No internet service 251.20
## data$StreamingMovies=Yes 5.33
##
## Chisq= 378 on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$StreamingTV) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$StreamingTV)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$StreamingTV=No 2810 942 624 162.47 252.07
## data$StreamingTV=No internet service 1526 113 388 195.36 251.20
## data$StreamingTV=Yes 2707 814 857 2.14 4.09
##
## Chisq= 368 on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$TechSupport) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$TechSupport)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$TechSupport=No 3473 1446 788 549 985
## data$TechSupport=No internet service 1526 113 388 195 251
## data$TechSupport=Yes 2044 310 692 211 349
##
## Chisq= 990 on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$DeviceProtection) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$DeviceProtection)
##
## N Observed Expected (O-E)^2/E
## data$DeviceProtection=No 3095 1211 664 450
## data$DeviceProtection=No internet service 1526 113 388 195
## data$DeviceProtection=Yes 2422 545 816 90
## (O-E)^2/V
## data$DeviceProtection=No 729
## data$DeviceProtection=No internet service 251
## data$DeviceProtection=Yes 167
##
## Chisq= 764 on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$OnlineBackup) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$OnlineBackup)
##
## N Observed Expected (O-E)^2/E
## data$OnlineBackup=No 3088 1233 664 488
## data$OnlineBackup=No internet service 1526 113 388 195
## data$OnlineBackup=Yes 2429 523 817 106
## (O-E)^2/V
## data$OnlineBackup=No 792
## data$OnlineBackup=No internet service 251
## data$OnlineBackup=Yes 197
##
## Chisq= 821 on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$OnlineSecurity) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$OnlineSecurity)
##
## N Observed Expected (O-E)^2/E
## data$OnlineSecurity=No 3498 1461 794 559
## data$OnlineSecurity=No internet service 1526 113 388 195
## data$OnlineSecurity=Yes 2019 295 686 223
## (O-E)^2/V
## data$OnlineSecurity=No 1010
## data$OnlineSecurity=No internet service 251
## data$OnlineSecurity=Yes 367
##
## Chisq= 1014 on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$InternetService) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$InternetService)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$InternetService=DSL 2421 459 649 55.4 86.4
## data$InternetService=Fiber optic 3096 1297 832 260.1 477.1
## data$InternetService=No 1526 113 388 195.4 251.2
##
## Chisq= 520 on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$MultipleLines) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$MultipleLines)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$MultipleLines=No 3390 849 735 17.597 30.276
## data$MultipleLines=No phone service 682 170 178 0.383 0.431
## data$MultipleLines=Yes 2971 850 955 11.646 24.850
##
## Chisq= 31 on 2 degrees of freedom, p= 2e-07
survdiff(Surv(data$tenure,data$censura)~data$TotalCharges_categ) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$TotalCharges_categ)
##
## n=7032, 11 observations deleted due to missingness.
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$TotalCharges_categ=< 2000 4176 1339 778 404 775
## data$TotalCharges_categ=>= 2000 2856 530 1091 288 775
##
## Chisq= 775 on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$MonthlyCharges_categ) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$MonthlyCharges_categ)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$MonthlyCharges_categ=< 70 3452 595 823 63.2 116
## data$MonthlyCharges_categ=>= 70 3591 1274 1046 49.7 116
##
## Chisq= 116 on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$PaperlessBilling) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$PaperlessBilling)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$PaperlessBilling=No 2872 469 759 110.6 190
## data$PaperlessBilling=Yes 4171 1400 1110 75.6 190
##
## Chisq= 190 on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$PhoneService) # Não signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$PhoneService)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$PhoneService=No 682 170 178 0.3828 0.431
## data$PhoneService=Yes 6361 1699 1691 0.0404 0.431
##
## Chisq= 0.4 on 1 degrees of freedom, p= 0.5
survdiff(Surv(data$tenure,data$censura)~data$Contract) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$Contract)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## data$Contract=Month-to-month 3875 1655 708 1265 2304
## data$Contract=One year 1473 166 471 197 270
## data$Contract=Two year 1695 48 690 597 1061
##
## Chisq= 2353 on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$PaymentMethod) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$PaymentMethod)
##
## N Observed Expected (O-E)^2/E
## data$PaymentMethod=Bank transfer (automatic) 1544 258 512 126.12
## data$PaymentMethod=Credit card (automatic) 1522 232 502 145.00
## data$PaymentMethod=Electronic check 2365 1071 528 558.93
## data$PaymentMethod=Mailed check 1612 308 327 1.14
## (O-E)^2/V
## data$PaymentMethod=Bank transfer (automatic) 178.66
## data$PaymentMethod=Credit card (automatic) 203.63
## data$PaymentMethod=Electronic check 803.75
## data$PaymentMethod=Mailed check 1.43
##
## Chisq= 865 on 3 degrees of freedom, p= <2e-16
library(survival)
#Funcao de sobrevivencia por Kaplan-Meier:
ekm.Contract <-survfit(Surv(data$tenure)~data$Contract)
#summary(ekm)
plot(ekm.Contract, lty=c(2,1,3),xlab="Customer Tenure (months)",ylab="Customer Survival Chance (%)",lwd=2, col=c(3,6,7))
legend(1,0.4,lty=c(2,1,3),legend = c("Month-to-month","One year", "Two year"),lwd=2,bty="n", col=c(3,6,7),cex=0.7)
CONSTRUINDO MODELO COMPLETO SEM AS VARIÁVEIS Ñ SIGNIF. DO PASSO ANTERIOR
Modelo de Cox Multivariado
m.comp1<-coxph(formula=Surv(tenure,censura)~Contract+Partner+Dependents+
SeniorCitizen+StreamingMovies+StreamingTV+TechSupport+DeviceProtection+OnlineBackup+OnlineSecurity+InternetService+MultipleLines+TotalCharges_categ+PaperlessBilling,data=data, x=T,method="breslow")
summary(m.comp1)
## Call:
## coxph(formula = Surv(tenure, censura) ~ Contract + Partner +
## Dependents + SeniorCitizen + StreamingMovies + StreamingTV +
## TechSupport + DeviceProtection + OnlineBackup + OnlineSecurity +
## InternetService + MultipleLines + TotalCharges_categ + PaperlessBilling,
## data = data, x = T, method = "breslow")
##
## n= 7032, number of events= 1869
## (11 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z
## ContractOne year -1.581991 0.205565 0.092017 -17.192
## ContractTwo year -3.589254 0.027619 0.173145 -20.730
## PartnerYes -0.400950 0.669684 0.055000 -7.290
## DependentsYes -0.101548 0.903438 0.068500 -1.482
## SeniorCitizenYes -0.006968 0.993057 0.056485 -0.123
## StreamingMoviesNo internet service -1.523324 0.217986 0.130141 -11.705
## StreamingMoviesYes 0.133537 1.142863 0.054367 2.456
## StreamingTVNo internet service NA NA 0.000000 NA
## StreamingTVYes 0.173929 1.189971 0.054222 3.208
## TechSupportNo internet service NA NA 0.000000 NA
## TechSupportYes -0.308813 0.734318 0.065838 -4.690
## DeviceProtectionNo internet service NA NA 0.000000 NA
## DeviceProtectionYes -0.173634 0.840604 0.055061 -3.153
## OnlineBackupNo internet service NA NA 0.000000 NA
## OnlineBackupYes -0.378560 0.684847 0.056043 -6.755
## OnlineSecurityNo internet service NA NA 0.000000 NA
## OnlineSecurityYes -0.495768 0.609103 0.066479 -7.457
## InternetServiceFiber optic 0.658262 1.931433 0.071888 9.157
## InternetServiceNo NA NA 0.000000 NA
## MultipleLinesNo phone service -0.394537 0.673992 0.101844 -3.874
## MultipleLinesYes -0.244447 0.783137 0.053194 -4.595
## TotalCharges_categ>= 2000 -2.363429 0.094097 0.085549 -27.627
## PaperlessBillingYes 0.166501 1.181165 0.056510 2.946
## Pr(>|z|)
## ContractOne year < 2e-16 ***
## ContractTwo year < 2e-16 ***
## PartnerYes 3.10e-13 ***
## DependentsYes 0.138217
## SeniorCitizenYes 0.901828
## StreamingMoviesNo internet service < 2e-16 ***
## StreamingMoviesYes 0.014041 *
## StreamingTVNo internet service NA
## StreamingTVYes 0.001338 **
## TechSupportNo internet service NA
## TechSupportYes 2.73e-06 ***
## DeviceProtectionNo internet service NA
## DeviceProtectionYes 0.001613 **
## OnlineBackupNo internet service NA
## OnlineBackupYes 1.43e-11 ***
## OnlineSecurityNo internet service NA
## OnlineSecurityYes 8.82e-14 ***
## InternetServiceFiber optic < 2e-16 ***
## InternetServiceNo NA
## MultipleLinesNo phone service 0.000107 ***
## MultipleLinesYes 4.32e-06 ***
## TotalCharges_categ>= 2000 < 2e-16 ***
## PaperlessBillingYes 0.003215 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## ContractOne year 0.20557 4.8646 0.17164 0.24619
## ContractTwo year 0.02762 36.2071 0.01967 0.03878
## PartnerYes 0.66968 1.4932 0.60125 0.74591
## DependentsYes 0.90344 1.1069 0.78993 1.03325
## SeniorCitizenYes 0.99306 1.0070 0.88898 1.10931
## StreamingMoviesNo internet service 0.21799 4.5875 0.16891 0.28132
## StreamingMoviesYes 1.14286 0.8750 1.02735 1.27137
## StreamingTVNo internet service NA NA NA NA
## StreamingTVYes 1.18997 0.8404 1.07000 1.32340
## TechSupportNo internet service NA NA NA NA
## TechSupportYes 0.73432 1.3618 0.64542 0.83546
## DeviceProtectionNo internet service NA NA NA NA
## DeviceProtectionYes 0.84060 1.1896 0.75461 0.93640
## OnlineBackupNo internet service NA NA NA NA
## OnlineBackupYes 0.68485 1.4602 0.61361 0.76436
## OnlineSecurityNo internet service NA NA NA NA
## OnlineSecurityYes 0.60910 1.6418 0.53469 0.69387
## InternetServiceFiber optic 1.93143 0.5178 1.67760 2.22367
## InternetServiceNo NA NA NA NA
## MultipleLinesNo phone service 0.67399 1.4837 0.55203 0.82290
## MultipleLinesYes 0.78314 1.2769 0.70560 0.86919
## TotalCharges_categ>= 2000 0.09410 10.6273 0.07957 0.11127
## PaperlessBillingYes 1.18116 0.8466 1.05733 1.31951
##
## Concordance= 0.889 (se = 0.003 )
## Likelihood ratio test= 4275 on 17 df, p=<2e-16
## Wald test = 2355 on 17 df, p=<2e-16
## Score (logrank) test = 3965 on 17 df, p=<2e-16
PASSO XX RETORNANDO VARIÁVEIS NÃO SIGNIF. DO PASSO ANTERIOR
Retornando as variáveis excluídas anteriormente
m.comp.gender<-coxph(formula=Surv(tenure,censura)~Contract+Partner+Dependents+
SeniorCitizen+StreamingMovies+StreamingTV+TechSupport+DeviceProtection+OnlineBackup+OnlineSecurity+InternetService+MultipleLines+TotalCharges_categ+PaperlessBilling+gender,data=data, x=T,method="breslow")
Avaliação de significância da variável
TRV.npartner=2*(m.comp1$loglik-m.comp.gender$loglik)
1-pchisq(TRV.npartner,df=1)
## [1] 1 1
Teste de razão de Verossimilhança, comparando os modelos com e sem a adição da covar. Gender, p-valor = 1, variável não significativa para o modelo.
m.comp.phone<-coxph(formula=Surv(tenure,censura)~Contract+Partner+Dependents+
SeniorCitizen+StreamingMovies+StreamingTV+TechSupport+DeviceProtection+OnlineBackup+OnlineSecurity+InternetService+MultipleLines+TotalCharges_categ+PaperlessBilling+PhoneService,data=data, x=T,method="breslow")
Avaliação de significância da variável
TRV.npartner=2*(m.comp1$loglik-m.comp.phone$loglik)
1-pchisq(TRV.npartner,df=1)
## [1] 1 1
Assim como a variável Gender, pelo Teste de Razão de Verossimilhança, comparando os modelos com e sem a adição da covar. PhoneService, p-valor = 1, variável não significativa para o modelo.
REMOVENDO COVAR DO MODELO COMPLETO
m.sem.contract<-coxph(formula=Surv(tenure,censura)~Partner+Dependents+
SeniorCitizen+StreamingMovies+StreamingTV+TechSupport+DeviceProtection+OnlineBackup+OnlineSecurity+InternetService+MultipleLines+TotalCharges_categ+PaperlessBilling+PhoneService,data=data, x=T,method="breslow")
TRV.npartner=2*(m.comp1$loglik-m.sem.contract$loglik)
1-pchisq(TRV.npartner,df=1)
## [1] 1 0
P-valor = 0, variável Contract signif. para o modelo
#final<-coxph(formula=Surv(tenure)~1,data=data, x=T,method="breslow")
#install.packages("survminer")
#library(survminer)
#test.ph <- cox.zph(final, transform = "identity")