Configurando o ambiente de desenvolvimento

Descobrindo o diretório atual

getwd()
## [1] "C:/Users/ewerson.pimenta.DP6INC/Documents/R/project/Churn_Predict/02.NOTEBOOK"

Configurando o diretório de trabalho

#setwd("/cloud/project/Churn_Predict/")
#getwd()

Base de dados

Sobre os dados

Os dados foram retirados do site kaggle.com, os dados podem ser acessados em Telecom Churn Data Sets.

Esses dados são de clientes de telecomunicações, produtos de internet adquiridos, pagamentos, e churn (quando ocorre a saída do cliente da companhia de telecom).

Leitura das bases de dados

require("readr") || install.packages("readr")
## Loading required package: readr
## [1] TRUE
library(readr)

churn_data <- read_csv("~/R/project/Churn_Predict/01.DATA/churn_data.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   customerID = col_character(),
##   tenure = col_double(),
##   PhoneService = col_character(),
##   Contract = col_character(),
##   PaperlessBilling = col_character(),
##   PaymentMethod = col_character(),
##   MonthlyCharges = col_double(),
##   TotalCharges = col_double(),
##   Churn = col_character()
## )
#View(churn_data)

customer_data <- read_csv("~/R/project/Churn_Predict/01.DATA/customer_data.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   customerID = col_character(),
##   gender = col_character(),
##   SeniorCitizen = col_double(),
##   Partner = col_character(),
##   Dependents = col_character()
## )
#View(customer_data)

internet_data <- read_csv("~/R/project/Churn_Predict/01.DATA/internet_data.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   customerID = col_character(),
##   MultipleLines = col_character(),
##   InternetService = col_character(),
##   OnlineSecurity = col_character(),
##   OnlineBackup = col_character(),
##   DeviceProtection = col_character(),
##   TechSupport = col_character(),
##   StreamingTV = col_character(),
##   StreamingMovies = col_character()
## )
#View(internet_data)

Telecom_Churn_Data_Dictionary <- read_csv("~/R/project/Churn_Predict/01.DATA/Telecom_Churn_Data_Dictionary.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   S.No. = col_double(),
##   `Variable Name` = col_character(),
##   Meaning = col_character()
## )
#View(Telecom_Churn_Data_Dictionary)
require("tidyverse") || install.packages("tidyverse")
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v dplyr   1.0.2
## v tibble  3.0.4     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.0
## v purrr   0.3.4
## Warning: package 'tibble' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## [1] TRUE
library(tidyverse)
glimpse(churn_data)
## Rows: 7,043
## Columns: 9
## $ customerID       <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CF...
## $ tenure           <dbl> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49...
## $ PhoneService     <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "No...
## $ Contract         <chr> "Month-to-month", "One year", "Month-to-month", "O...
## $ PaperlessBilling <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "No...
## $ PaymentMethod    <chr> "Electronic check", "Mailed check", "Mailed check"...
## $ MonthlyCharges   <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 2...
## $ TotalCharges     <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1...
## $ Churn            <chr> "No", "No", "Yes", "No", "Yes", "Yes", "No", "No",...
#glimpse(churn_data)

Transformando variáveis em factor

cols1 <- c("PhoneService", "Contract", "PaperlessBilling", "PaymentMethod", "Churn")
churn_data[cols1] <- lapply(churn_data[cols1], factor)
glimpse(churn_data)
## Rows: 7,043
## Columns: 9
## $ customerID       <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CF...
## $ tenure           <dbl> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, 49...
## $ PhoneService     <fct> No, Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes, Yes...
## $ Contract         <fct> Month-to-month, One year, Month-to-month, One year...
## $ PaperlessBilling <fct> Yes, No, Yes, No, Yes, Yes, Yes, No, Yes, No, Yes,...
## $ PaymentMethod    <fct> Electronic check, Mailed check, Mailed check, Bank...
## $ MonthlyCharges   <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10, 2...
## $ TotalCharges     <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50, 1...
## $ Churn            <fct> No, No, Yes, No, Yes, Yes, No, No, Yes, No, No, No...
summary(churn_data)
##   customerID            tenure      PhoneService           Contract   
##  Length:7043        Min.   : 0.00   No : 682     Month-to-month:3875  
##  Class :character   1st Qu.: 9.00   Yes:6361     One year      :1473  
##  Mode  :character   Median :29.00                Two year      :1695  
##                     Mean   :32.37                                     
##                     3rd Qu.:55.00                                     
##                     Max.   :72.00                                     
##                                                                       
##  PaperlessBilling                   PaymentMethod  MonthlyCharges  
##  No :2872         Bank transfer (automatic):1544   Min.   : 18.25  
##  Yes:4171         Credit card (automatic)  :1522   1st Qu.: 35.50  
##                   Electronic check         :2365   Median : 70.35  
##                   Mailed check             :1612   Mean   : 64.76  
##                                                    3rd Qu.: 89.85  
##                                                    Max.   :118.75  
##                                                                    
##   TotalCharges    Churn     
##  Min.   :  18.8   No :5174  
##  1st Qu.: 401.4   Yes:1869  
##  Median :1397.5             
##  Mean   :2283.3             
##  3rd Qu.:3794.7             
##  Max.   :8684.8             
##  NA's   :11
cols2 <- colnames(internet_data)[-1]
internet_data[cols2] <- lapply(internet_data[cols2], factor)
glimpse(internet_data)
## Rows: 7,043
## Columns: 9
## $ customerID       <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CF...
## $ MultipleLines    <fct> No phone service, No, No, No phone service, No, Ye...
## $ InternetService  <fct> DSL, DSL, DSL, DSL, Fiber optic, Fiber optic, Fibe...
## $ OnlineSecurity   <fct> No, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Yes, ...
## $ OnlineBackup     <fct> Yes, No, Yes, No, No, No, Yes, No, No, Yes, No, No...
## $ DeviceProtection <fct> No, Yes, No, Yes, No, Yes, No, No, Yes, No, No, No...
## $ TechSupport      <fct> No, No, No, Yes, No, No, No, No, Yes, No, No, No i...
## $ StreamingTV      <fct> No, No, No, No, No, Yes, Yes, No, Yes, No, No, No ...
## $ StreamingMovies  <fct> No, No, No, No, No, Yes, No, No, Yes, No, No, No i...
summary(internet_data)
##   customerID                 MultipleLines     InternetService
##  Length:7043        No              :3390   DSL        :2421  
##  Class :character   No phone service: 682   Fiber optic:3096  
##  Mode  :character   Yes             :2971   No         :1526  
##              OnlineSecurity              OnlineBackup 
##  No                 :3498   No                 :3088  
##  No internet service:1526   No internet service:1526  
##  Yes                :2019   Yes                :2429  
##             DeviceProtection              TechSupport  
##  No                 :3095    No                 :3473  
##  No internet service:1526    No internet service:1526  
##  Yes                :2422    Yes                :2044  
##               StreamingTV              StreamingMovies
##  No                 :2810   No                 :2785  
##  No internet service:1526   No internet service:1526  
##  Yes                :2707   Yes                :2732
glimpse(customer_data)
## Rows: 7,043
## Columns: 5
## $ customerID    <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOCW...
## $ gender        <chr> "Female", "Male", "Male", "Male", "Female", "Female",...
## $ SeniorCitizen <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Partner       <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes...
## $ Dependents    <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No"...
summary(customer_data)
##   customerID           gender          SeniorCitizen      Partner         
##  Length:7043        Length:7043        Min.   :0.0000   Length:7043       
##  Class :character   Class :character   1st Qu.:0.0000   Class :character  
##  Mode  :character   Mode  :character   Median :0.0000   Mode  :character  
##                                        Mean   :0.1621                     
##                                        3rd Qu.:0.0000                     
##                                        Max.   :1.0000                     
##   Dependents       
##  Length:7043       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
levels(as.factor((customer_data$SeniorCitizen)))
## [1] "0" "1"
customer_data <- customer_data %>%
      mutate(SeniorCitizen = ifelse(SeniorCitizen == 0, "No", "Yes"))
glimpse(customer_data)
## Rows: 7,043
## Columns: 5
## $ customerID    <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOCW...
## $ gender        <chr> "Female", "Male", "Male", "Male", "Female", "Female",...
## $ SeniorCitizen <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No",...
## $ Partner       <chr> "Yes", "No", "No", "No", "No", "No", "No", "No", "Yes...
## $ Dependents    <chr> "No", "No", "No", "No", "No", "No", "Yes", "No", "No"...
cols3 <- colnames(customer_data)[-1]
customer_data[cols3] <- lapply(customer_data[cols3], factor)
glimpse(customer_data)
## Rows: 7,043
## Columns: 5
## $ customerID    <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-CFOCW...
## $ gender        <fct> Female, Male, Male, Male, Female, Female, Male, Femal...
## $ SeniorCitizen <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N...
## $ Partner       <fct> Yes, No, No, No, No, No, No, No, Yes, No, Yes, No, Ye...
## $ Dependents    <fct> No, No, No, No, No, No, Yes, No, No, Yes, Yes, No, No...
summary(customer_data)
##   customerID           gender     SeniorCitizen Partner    Dependents
##  Length:7043        Female:3488   No :5901      No :3641   No :4933  
##  Class :character   Male  :3555   Yes:1142      Yes:3402   Yes:2110  
##  Mode  :character
remove(cols1,cols2,cols3)

Join Data-bases

dim(churn_data)[1] == dim(internet_data)[1]
## [1] TRUE
dim(churn_data)[1] == dim(customer_data)[1]
## [1] TRUE
# Total de colunas da base após merge
dim(churn_data)[2] + dim(internet_data)[2] + dim(customer_data)[2] - 2 
## [1] 21
data <- merge(churn_data, merge(internet_data,customer_data,by.x = "customerID"),by.x = "customerID")
glimpse(data)
## Rows: 7,043
## Columns: 21
## $ customerID       <chr> "0002-ORFBO", "0003-MKNFE", "0004-TLHLJ", "0011-IG...
## $ tenure           <dbl> 9, 9, 4, 13, 3, 9, 71, 63, 7, 65, 54, 72, 5, 72, 5...
## $ PhoneService     <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, ...
## $ Contract         <fct> One year, Month-to-month, Month-to-month, Month-to...
## $ PaperlessBilling <fct> Yes, No, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, N...
## $ PaymentMethod    <fct> Mailed check, Mailed check, Electronic check, Elec...
## $ MonthlyCharges   <dbl> 65.60, 59.90, 73.90, 98.00, 83.90, 69.40, 109.70, ...
## $ TotalCharges     <dbl> 593.30, 542.40, 280.85, 1237.85, 267.40, 571.45, 7...
## $ Churn            <fct> No, No, Yes, Yes, Yes, No, No, No, No, No, No, No,...
## $ MultipleLines    <fct> No, Yes, No, No, No, No, No, Yes, No, Yes, No phon...
## $ InternetService  <fct> DSL, DSL, Fiber optic, Fiber optic, Fiber optic, D...
## $ OnlineSecurity   <fct> No, No, No, No, No, No, Yes, Yes, Yes, Yes, Yes, Y...
## $ OnlineBackup     <fct> Yes, No, No, Yes, No, No, Yes, No, No, Yes, No, Ye...
## $ DeviceProtection <fct> No, No, Yes, Yes, No, No, Yes, No, No, Yes, No, Ye...
## $ TechSupport      <fct> Yes, No, No, No, Yes, Yes, Yes, Yes, No, Yes, Yes,...
## $ StreamingTV      <fct> Yes, No, No, Yes, Yes, Yes, Yes, No, No, Yes, Yes,...
## $ StreamingMovies  <fct> No, Yes, No, Yes, No, Yes, Yes, No, No, Yes, No, Y...
## $ gender           <fct> Female, Male, Male, Male, Female, Female, Female, ...
## $ SeniorCitizen    <fct> No, No, No, Yes, Yes, No, Yes, No, Yes, No, No, No...
## $ Partner          <fct> Yes, No, No, Yes, Yes, No, Yes, Yes, No, Yes, No, ...
## $ Dependents       <fct> Yes, No, No, No, No, Yes, No, No, No, Yes, No, Yes...

Visualizando Tabela de Dados

DT::datatable(data)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Análise Descritiva

summary(data)
##   customerID            tenure      PhoneService           Contract   
##  Length:7043        Min.   : 0.00   No : 682     Month-to-month:3875  
##  Class :character   1st Qu.: 9.00   Yes:6361     One year      :1473  
##  Mode  :character   Median :29.00                Two year      :1695  
##                     Mean   :32.37                                     
##                     3rd Qu.:55.00                                     
##                     Max.   :72.00                                     
##                                                                       
##  PaperlessBilling                   PaymentMethod  MonthlyCharges  
##  No :2872         Bank transfer (automatic):1544   Min.   : 18.25  
##  Yes:4171         Credit card (automatic)  :1522   1st Qu.: 35.50  
##                   Electronic check         :2365   Median : 70.35  
##                   Mailed check             :1612   Mean   : 64.76  
##                                                    3rd Qu.: 89.85  
##                                                    Max.   :118.75  
##                                                                    
##   TotalCharges    Churn               MultipleLines     InternetService
##  Min.   :  18.8   No :5174   No              :3390   DSL        :2421  
##  1st Qu.: 401.4   Yes:1869   No phone service: 682   Fiber optic:3096  
##  Median :1397.5              Yes             :2971   No         :1526  
##  Mean   :2283.3                                                        
##  3rd Qu.:3794.7                                                        
##  Max.   :8684.8                                                        
##  NA's   :11                                                            
##              OnlineSecurity              OnlineBackup 
##  No                 :3498   No                 :3088  
##  No internet service:1526   No internet service:1526  
##  Yes                :2019   Yes                :2429  
##                                                       
##                                                       
##                                                       
##                                                       
##             DeviceProtection              TechSupport  
##  No                 :3095    No                 :3473  
##  No internet service:1526    No internet service:1526  
##  Yes                :2422    Yes                :2044  
##                                                        
##                                                        
##                                                        
##                                                        
##               StreamingTV              StreamingMovies    gender    
##  No                 :2810   No                 :2785   Female:3488  
##  No internet service:1526   No internet service:1526   Male  :3555  
##  Yes                :2707   Yes                :2732                
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##  SeniorCitizen Partner    Dependents
##  No :5901      No :3641   No :4933  
##  Yes:1142      Yes:3402   Yes:2110  
##                                     
##                                     
##                                     
##                                     
## 

Contagem de clientes únicos

length(levels(as.factor(data$customerID)))
## [1] 7043
dim(data)
## [1] 7043   21

Existem 7.043 ID’s de clientes únicos.

Contagem de NAs

options(repr.plot.width = 6, repr.plot.height = 4)
missing_data <- data %>% summarise_all(funs(sum(is.na(.))/n()))
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
missing_data <- gather(missing_data, key = "variables", value = "percent_missing")
ggplot(missing_data, aes(x = reorder(variables, percent_missing), y = percent_missing)) +
geom_bar(stat = "identity", fill = "red", aes(color = I('white')), size = 0.3)+
xlab('variables')+
coord_flip()+ 
theme_bw()

Podemos perceber que a variável TotalCharger possui valores faltantes. Existem apenas \(0,15\%\) de valores faltantes presentes na variável TotalCharges.

Contagem de churns na base

options(repr.plot.width = 6, repr.plot.height = 4)
churn_data %>% 
group_by(Churn) %>% 
summarise(Count = n())%>% 
mutate(percent = prop.table(Count)*100)%>%
ggplot(aes(reorder(Churn, -percent), percent), fill = Churn)+
geom_col(fill = c("#b0b0b0", "#E7B800"))+
geom_text(aes(label = sprintf("%.2f%%", percent)), hjust = 0.01,vjust = -0.5, size =3)+ 
theme_bw()+  
xlab("Churn") + 
ylab("Percent")+
ggtitle("Churn Percent")
## `summarise()` ungrouping output (override with `.groups` argument)

Aproximadamente \(26\%\) dos clientes da base de fato realizaram churn.

Visualização de variáveis categóricas

library(cowplot)

theme1 <- theme_bw()+
theme(axis.text.x = element_text(angle = 0, hjust = 1, vjust = 0.5),legend.position="none",axis.text = element_text(colour = "red"))
theme2 <- theme_bw()+
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 0.8),legend.position="none",axis.text = element_text(colour = "red"))

options(repr.plot.width = 12, repr.plot.height = 8)
#update_geom_defaults("bar",   list(fill = "#E7B800"))
plot_grid(
          ggplot(data, aes(x=gender ,fill=Churn))+ 
            geom_bar(position = 'fill')+ theme1 +
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=SeniorCitizen,fill=Churn))+ 
            geom_bar(position = 'fill')+theme1+
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=Partner,fill=Churn))+ 
            geom_bar(position = 'fill')+theme1+
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=Dependents,fill=Churn))+ 
            geom_bar(position = 'fill')+theme1+
            scale_fill_manual(values=c('#b0b0b0','#E7B800'))+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          align = "h") 

options(repr.plot.width = 12, repr.plot.height = 8)
plot_grid(
          ggplot(data, aes(x=PhoneService,fill=Churn))+ 
            geom_bar(position = 'fill')+theme1+
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=MultipleLines,fill=Churn))+ 
            geom_bar(position = 'fill')+theme2+
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=Contract,fill=Churn))+
            geom_bar(position = 'fill')+theme2+
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=PaperlessBilling,fill=Churn))+
            geom_bar(position = 'fill')+theme1+
            scale_fill_manual(values=c('#b0b0b0','#E7B800'))+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          align = "h") 

options(repr.plot.width = 12, repr.plot.height = 8)
plot_grid(
          ggplot(data, aes(x=PaymentMethod,fill=Churn))+
            geom_bar(position = 'fill')+theme2+
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=InternetService,fill=Churn))+
            geom_bar(position = 'fill')+theme2+
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=OnlineSecurity,fill=Churn))+
            geom_bar(position = 'fill')+theme2+
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=PaymentMethod,fill=Churn))+
            geom_bar(position = 'fill')+theme2+
            scale_fill_manual(values=c('#b0b0b0','#E7B800'))+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          align = "h") 

options(repr.plot.width = 12, repr.plot.height = 8)
plot_grid(
          ggplot(data, aes(x=StreamingMovies,fill=Churn))+
            geom_bar(position = 'fill')+theme2+
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=StreamingTV,fill=Churn))+
            geom_bar(position = 'fill')+theme2+
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=OnlineBackup,fill=Churn))+
            geom_bar(position = 'fill')+theme2+
            scale_fill_manual(values=c('#b0b0b0','#E7B800')),
          ggplot(data, aes(x=DeviceProtection,fill=Churn))+
            geom_bar(position = 'fill')+theme2+
            scale_fill_manual(values=c('#b0b0b0','#E7B800'))+
    scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          align = "h") 

Visualização de variáveis numéricas

library(ggthemes)
## 
## Attaching package: 'ggthemes'
## The following object is masked from 'package:cowplot':
## 
##     theme_map
numeric_cols <- c("TotalCharges", "MonthlyCharges", "tenure", "Churn")
numeric_data <- data %>% dplyr::select(numeric_cols) %>% na.omit
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(numeric_cols)` instead of `numeric_cols` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
options(repr.plot.width = 12, repr.plot.height = 8)
plot_grid(
  ggplot(numeric_data, aes(tenure, MonthlyCharges, colour = Churn))+
    geom_point() + theme_hc()+ scale_colour_hc(),
  ggplot(numeric_data, aes(tenure, TotalCharges, colour = Churn))+
    geom_point() + theme_hc()+ scale_colour_hc() +
    scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
  align = "h") 

Análise de Sobrevivência

Transformação da Variável Churn

library(tidyverse)
data <- data %>%
  mutate(censura = ifelse(Churn == "Yes", 1, 0))

Transformando as variáveis contínuas em categóricas

summary(data$TotalCharges)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    18.8   401.4  1397.5  2283.3  3794.7  8684.8      11
summary(data$MonthlyCharges)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.25   35.50   70.35   64.76   89.85  118.75
library(tidyverse)
data <- data %>%
  mutate(MonthlyCharges_categ = ifelse(MonthlyCharges < 70, "< 70",
                                       ">= 70"),
         TotalCharges_categ = ifelse(TotalCharges < 2000, "< 2000",
                                     ">= 2000"))
library(tidyverse)
data <- data %>%
  mutate(censura = ifelse(Churn == "Yes", 1, 0))

Funcao de sobrevivencia por Kaplan-Meier

library(survival)
ekm <-survfit(Surv(data$tenure, data$censura)~1)
plot(ekm,ylab="S(t)_KM",xlab="Tempo (meses)",
conf.int=F, col="purple", lwd=2)

Função de Sobrev. por Covariável

library(survival)
#Funcao de sobrevivencia por Kaplan-Meier:
ekm.Contract <-survfit(Surv(data$tenure,data$censura)~data$Contract)
summary(ekm)
## Call: survfit(formula = Surv(data$tenure, data$censura) ~ 1)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1   7032     380    0.946 0.00270        0.941        0.951
##     2   6419     123    0.928 0.00310        0.922        0.934
##     3   6181      94    0.914 0.00338        0.907        0.920
##     4   5981      83    0.901 0.00361        0.894        0.908
##     5   5805      64    0.891 0.00377        0.884        0.899
##     6   5672      40    0.885 0.00388        0.877        0.892
##     7   5562      51    0.877 0.00400        0.869        0.885
##     8   5431      42    0.870 0.00411        0.862        0.878
##     9   5308      46    0.862 0.00422        0.854        0.871
##    10   5189      45    0.855 0.00433        0.846        0.863
##    11   5073      31    0.850 0.00440        0.841        0.858
##    12   4974      38    0.843 0.00449        0.834        0.852
##    13   4857      38    0.837 0.00458        0.828        0.846
##    14   4748      24    0.832 0.00464        0.823        0.842
##    15   4672      37    0.826 0.00473        0.817        0.835
##    16   4573      28    0.821 0.00479        0.811        0.830
##    17   4493      26    0.816 0.00486        0.807        0.826
##    18   4406      24    0.812 0.00491        0.802        0.821
##    19   4309      19    0.808 0.00496        0.798        0.818
##    20   4236      18    0.805 0.00501        0.795        0.814
##    21   4165      17    0.801 0.00505        0.791        0.811
##    22   4102      27    0.796 0.00512        0.786        0.806
##    23   4012      13    0.793 0.00515        0.783        0.804
##    24   3927      23    0.789 0.00521        0.779        0.799
##    25   3833      23    0.784 0.00527        0.774        0.794
##    26   3754      15    0.781 0.00531        0.771        0.791
##    27   3675      13    0.778 0.00535        0.768        0.789
##    28   3603      12    0.776 0.00538        0.765        0.786
##    29   3546      15    0.772 0.00542        0.762        0.783
##    30   3474      16    0.769 0.00547        0.758        0.779
##    31   3402      16    0.765 0.00552        0.754        0.776
##    32   3337      19    0.761 0.00558        0.750        0.772
##    33   3268      14    0.757 0.00562        0.747        0.769
##    34   3204      12    0.755 0.00566        0.744        0.766
##    35   3139      15    0.751 0.00571        0.740        0.762
##    36   3051      10    0.749 0.00574        0.737        0.760
##    37   3001      15    0.745 0.00580        0.734        0.756
##    38   2936      13    0.742 0.00584        0.730        0.753
##    39   2877      14    0.738 0.00589        0.726        0.750
##    40   2821      13    0.734 0.00594        0.723        0.746
##    41   2757      14    0.731 0.00599        0.719        0.743
##    42   2687      14    0.727 0.00605        0.715        0.739
##    43   2622      15    0.723 0.00611        0.711        0.735
##    44   2557       6    0.721 0.00613        0.709        0.733
##    45   2506       6    0.719 0.00616        0.707        0.732
##    46   2445      12    0.716 0.00621        0.704        0.728
##    47   2371      14    0.712 0.00628        0.699        0.724
##    48   2303       9    0.709 0.00632        0.697        0.721
##    49   2239      15    0.704 0.00640        0.692        0.717
##    50   2173      10    0.701 0.00645        0.688        0.714
##    51   2105       8    0.698 0.00649        0.686        0.711
##    52   2037       8    0.695 0.00654        0.683        0.708
##    53   1957      14    0.690 0.00663        0.678        0.704
##    54   1887      13    0.686 0.00671        0.673        0.699
##    55   1819       9    0.682 0.00677        0.669        0.696
##    56   1755      10    0.678 0.00684        0.665        0.692
##    57   1675       8    0.675 0.00691        0.662        0.689
##    58   1610      11    0.671 0.00700        0.657        0.684
##    59   1543       8    0.667 0.00707        0.653        0.681
##    60   1483       6    0.664 0.00713        0.651        0.679
##    61   1407       8    0.661 0.00721        0.647        0.675
##    62   1331       5    0.658 0.00727        0.644        0.673
##    63   1261       4    0.656 0.00732        0.642        0.671
##    64   1189       4    0.654 0.00738        0.640        0.668
##    65   1109       9    0.649 0.00753        0.634        0.663
##    66   1033      13    0.640 0.00776        0.625        0.656
##    67    944      10    0.634 0.00797        0.618        0.649
##    68    846       9    0.627 0.00820        0.611        0.643
##    69    746       8    0.620 0.00845        0.604        0.637
##    70    651      11    0.610 0.00888        0.593        0.627
##    71    532       6    0.603 0.00921        0.585        0.621
##    72    362       6    0.593 0.00992        0.574        0.613
plot(ekm.Contract, lty=c(2,1,3),xlab="Customer Tenure (months)",ylab="Customer Survival Chance (%)",lwd=2, col=c(3,6,7))
legend(1,0.4,lty=c(2,1,3),legend = c("Month-to-month","One year", "Two year"),lwd=2,bty="n", col=c(3,6,7),cex=0.7)

  • Teste logrank para covariável Contract
survdiff(Surv(data$tenure,data$censura)~data$Contract)
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$Contract)
## 
##                                 N Observed Expected (O-E)^2/E (O-E)^2/V
## data$Contract=Month-to-month 3875     1655      708      1265      2304
## data$Contract=One year       1473      166      471       197       270
## data$Contract=Two year       1695       48      690       597      1061
## 
##  Chisq= 2353  on 2 degrees of freedom, p= <2e-16

O teste de logrank parte da hipótese nula de que as curvas, dos modelos com e sem a inclusão da covariável em questão, são iguais.

O \(\text{p-valor}\le 2e^{-16}\), indicando que existe diferença significativa entre as curvas, e que a covariável Contract pode ser relevante para explicar a variável resposta (tempo até ocorrência de churn).

Não entraremos em detalhes de métodos de seleção de covariáveis. Porém, repetindo esse procedimento com as outras coavariáveis existentes, e testando interações (modelos multivariados) pode-se chegar em um modelo que melhor explica a resposta dos dados.

VERIFICANDO SIGNIFICÂNCIA DAS COVAR. INDIVIDUALMENTE

survdiff(Surv(data$tenure,data$censura)~data$gender)               # não signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$gender)
## 
##                       N Observed Expected (O-E)^2/E (O-E)^2/V
## data$gender=Female 3488      939      923     0.261     0.526
## data$gender=Male   3555      930      946     0.255     0.526
## 
##  Chisq= 0.5  on 1 degrees of freedom, p= 0.5
survdiff(Surv(data$tenure,data$censura)~data$Partner)              # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$Partner)
## 
##                     N Observed Expected (O-E)^2/E (O-E)^2/V
## data$Partner=No  3641     1200      773       236       424
## data$Partner=Yes 3402      669     1096       166       424
## 
##  Chisq= 424  on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$Dependents)           # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$Dependents)
## 
##                        N Observed Expected (O-E)^2/E (O-E)^2/V
## data$Dependents=No  4933     1543     1234      77.3       233
## data$Dependents=Yes 2110      326      635     150.3       233
## 
##  Chisq= 233  on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$SeniorCitizen)        # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$SeniorCitizen)
## 
##                           N Observed Expected (O-E)^2/E (O-E)^2/V
## data$SeniorCitizen=No  5901     1393     1560      17.8       109
## data$SeniorCitizen=Yes 1142      476      309      89.8       109
## 
##  Chisq= 110  on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$StreamingMovies)      # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$StreamingMovies)
## 
##                                             N Observed Expected (O-E)^2/E
## data$StreamingMovies=No                  2785      938      614    171.50
## data$StreamingMovies=No internet service 1526      113      388    195.36
## data$StreamingMovies=Yes                 2732      818      867      2.76
##                                          (O-E)^2/V
## data$StreamingMovies=No                     264.22
## data$StreamingMovies=No internet service    251.20
## data$StreamingMovies=Yes                      5.33
## 
##  Chisq= 378  on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$StreamingTV)          # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$StreamingTV)
## 
##                                         N Observed Expected (O-E)^2/E (O-E)^2/V
## data$StreamingTV=No                  2810      942      624    162.47    252.07
## data$StreamingTV=No internet service 1526      113      388    195.36    251.20
## data$StreamingTV=Yes                 2707      814      857      2.14      4.09
## 
##  Chisq= 368  on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$TechSupport)          # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$TechSupport)
## 
##                                         N Observed Expected (O-E)^2/E (O-E)^2/V
## data$TechSupport=No                  3473     1446      788       549       985
## data$TechSupport=No internet service 1526      113      388       195       251
## data$TechSupport=Yes                 2044      310      692       211       349
## 
##  Chisq= 990  on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$DeviceProtection)     # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$DeviceProtection)
## 
##                                              N Observed Expected (O-E)^2/E
## data$DeviceProtection=No                  3095     1211      664       450
## data$DeviceProtection=No internet service 1526      113      388       195
## data$DeviceProtection=Yes                 2422      545      816        90
##                                           (O-E)^2/V
## data$DeviceProtection=No                        729
## data$DeviceProtection=No internet service       251
## data$DeviceProtection=Yes                       167
## 
##  Chisq= 764  on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$OnlineBackup)         # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$OnlineBackup)
## 
##                                          N Observed Expected (O-E)^2/E
## data$OnlineBackup=No                  3088     1233      664       488
## data$OnlineBackup=No internet service 1526      113      388       195
## data$OnlineBackup=Yes                 2429      523      817       106
##                                       (O-E)^2/V
## data$OnlineBackup=No                        792
## data$OnlineBackup=No internet service       251
## data$OnlineBackup=Yes                       197
## 
##  Chisq= 821  on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$OnlineSecurity)       # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$OnlineSecurity)
## 
##                                            N Observed Expected (O-E)^2/E
## data$OnlineSecurity=No                  3498     1461      794       559
## data$OnlineSecurity=No internet service 1526      113      388       195
## data$OnlineSecurity=Yes                 2019      295      686       223
##                                         (O-E)^2/V
## data$OnlineSecurity=No                       1010
## data$OnlineSecurity=No internet service       251
## data$OnlineSecurity=Yes                       367
## 
##  Chisq= 1014  on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$InternetService)      # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$InternetService)
## 
##                                     N Observed Expected (O-E)^2/E (O-E)^2/V
## data$InternetService=DSL         2421      459      649      55.4      86.4
## data$InternetService=Fiber optic 3096     1297      832     260.1     477.1
## data$InternetService=No          1526      113      388     195.4     251.2
## 
##  Chisq= 520  on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$MultipleLines)        # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$MultipleLines)
## 
##                                        N Observed Expected (O-E)^2/E (O-E)^2/V
## data$MultipleLines=No               3390      849      735    17.597    30.276
## data$MultipleLines=No phone service  682      170      178     0.383     0.431
## data$MultipleLines=Yes              2971      850      955    11.646    24.850
## 
##  Chisq= 31  on 2 degrees of freedom, p= 2e-07
survdiff(Surv(data$tenure,data$censura)~data$TotalCharges_categ)   # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$TotalCharges_categ)
## 
## n=7032, 11 observations deleted due to missingness.
## 
##                                    N Observed Expected (O-E)^2/E (O-E)^2/V
## data$TotalCharges_categ=< 2000  4176     1339      778       404       775
## data$TotalCharges_categ=>= 2000 2856      530     1091       288       775
## 
##  Chisq= 775  on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$MonthlyCharges_categ) # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$MonthlyCharges_categ)
## 
##                                    N Observed Expected (O-E)^2/E (O-E)^2/V
## data$MonthlyCharges_categ=< 70  3452      595      823      63.2       116
## data$MonthlyCharges_categ=>= 70 3591     1274     1046      49.7       116
## 
##  Chisq= 116  on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$PaperlessBilling)     # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$PaperlessBilling)
## 
##                              N Observed Expected (O-E)^2/E (O-E)^2/V
## data$PaperlessBilling=No  2872      469      759     110.6       190
## data$PaperlessBilling=Yes 4171     1400     1110      75.6       190
## 
##  Chisq= 190  on 1 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$PhoneService)         # Não signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$PhoneService)
## 
##                          N Observed Expected (O-E)^2/E (O-E)^2/V
## data$PhoneService=No   682      170      178    0.3828     0.431
## data$PhoneService=Yes 6361     1699     1691    0.0404     0.431
## 
##  Chisq= 0.4  on 1 degrees of freedom, p= 0.5
survdiff(Surv(data$tenure,data$censura)~data$Contract)             # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$Contract)
## 
##                                 N Observed Expected (O-E)^2/E (O-E)^2/V
## data$Contract=Month-to-month 3875     1655      708      1265      2304
## data$Contract=One year       1473      166      471       197       270
## data$Contract=Two year       1695       48      690       597      1061
## 
##  Chisq= 2353  on 2 degrees of freedom, p= <2e-16
survdiff(Surv(data$tenure,data$censura)~data$PaymentMethod)        # signif
## Call:
## survdiff(formula = Surv(data$tenure, data$censura) ~ data$PaymentMethod)
## 
##                                                 N Observed Expected (O-E)^2/E
## data$PaymentMethod=Bank transfer (automatic) 1544      258      512    126.12
## data$PaymentMethod=Credit card (automatic)   1522      232      502    145.00
## data$PaymentMethod=Electronic check          2365     1071      528    558.93
## data$PaymentMethod=Mailed check              1612      308      327      1.14
##                                              (O-E)^2/V
## data$PaymentMethod=Bank transfer (automatic)    178.66
## data$PaymentMethod=Credit card (automatic)      203.63
## data$PaymentMethod=Electronic check             803.75
## data$PaymentMethod=Mailed check                   1.43
## 
##  Chisq= 865  on 3 degrees of freedom, p= <2e-16
library(survival)
#Funcao de sobrevivencia por Kaplan-Meier:
ekm.Contract <-survfit(Surv(data$tenure)~data$Contract)
#summary(ekm)
plot(ekm.Contract, lty=c(2,1,3),xlab="Customer Tenure (months)",ylab="Customer Survival Chance (%)",lwd=2, col=c(3,6,7))
legend(1,0.4,lty=c(2,1,3),legend = c("Month-to-month","One year", "Two year"),lwd=2,bty="n", col=c(3,6,7),cex=0.7)

CONSTRUINDO MODELO COMPLETO SEM AS VARIÁVEIS Ñ SIGNIF. DO PASSO ANTERIOR

Modelo de Cox Multivariado

  • Modelo Completo
m.comp1<-coxph(formula=Surv(tenure,censura)~Contract+Partner+Dependents+
SeniorCitizen+StreamingMovies+StreamingTV+TechSupport+DeviceProtection+OnlineBackup+OnlineSecurity+InternetService+MultipleLines+TotalCharges_categ+PaperlessBilling,data=data, x=T,method="breslow")
summary(m.comp1)
## Call:
## coxph(formula = Surv(tenure, censura) ~ Contract + Partner + 
##     Dependents + SeniorCitizen + StreamingMovies + StreamingTV + 
##     TechSupport + DeviceProtection + OnlineBackup + OnlineSecurity + 
##     InternetService + MultipleLines + TotalCharges_categ + PaperlessBilling, 
##     data = data, x = T, method = "breslow")
## 
##   n= 7032, number of events= 1869 
##    (11 observations deleted due to missingness)
## 
##                                          coef exp(coef)  se(coef)       z
## ContractOne year                    -1.581991  0.205565  0.092017 -17.192
## ContractTwo year                    -3.589254  0.027619  0.173145 -20.730
## PartnerYes                          -0.400950  0.669684  0.055000  -7.290
## DependentsYes                       -0.101548  0.903438  0.068500  -1.482
## SeniorCitizenYes                    -0.006968  0.993057  0.056485  -0.123
## StreamingMoviesNo internet service  -1.523324  0.217986  0.130141 -11.705
## StreamingMoviesYes                   0.133537  1.142863  0.054367   2.456
## StreamingTVNo internet service             NA        NA  0.000000      NA
## StreamingTVYes                       0.173929  1.189971  0.054222   3.208
## TechSupportNo internet service             NA        NA  0.000000      NA
## TechSupportYes                      -0.308813  0.734318  0.065838  -4.690
## DeviceProtectionNo internet service        NA        NA  0.000000      NA
## DeviceProtectionYes                 -0.173634  0.840604  0.055061  -3.153
## OnlineBackupNo internet service            NA        NA  0.000000      NA
## OnlineBackupYes                     -0.378560  0.684847  0.056043  -6.755
## OnlineSecurityNo internet service          NA        NA  0.000000      NA
## OnlineSecurityYes                   -0.495768  0.609103  0.066479  -7.457
## InternetServiceFiber optic           0.658262  1.931433  0.071888   9.157
## InternetServiceNo                          NA        NA  0.000000      NA
## MultipleLinesNo phone service       -0.394537  0.673992  0.101844  -3.874
## MultipleLinesYes                    -0.244447  0.783137  0.053194  -4.595
## TotalCharges_categ>= 2000           -2.363429  0.094097  0.085549 -27.627
## PaperlessBillingYes                  0.166501  1.181165  0.056510   2.946
##                                     Pr(>|z|)    
## ContractOne year                     < 2e-16 ***
## ContractTwo year                     < 2e-16 ***
## PartnerYes                          3.10e-13 ***
## DependentsYes                       0.138217    
## SeniorCitizenYes                    0.901828    
## StreamingMoviesNo internet service   < 2e-16 ***
## StreamingMoviesYes                  0.014041 *  
## StreamingTVNo internet service            NA    
## StreamingTVYes                      0.001338 ** 
## TechSupportNo internet service            NA    
## TechSupportYes                      2.73e-06 ***
## DeviceProtectionNo internet service       NA    
## DeviceProtectionYes                 0.001613 ** 
## OnlineBackupNo internet service           NA    
## OnlineBackupYes                     1.43e-11 ***
## OnlineSecurityNo internet service         NA    
## OnlineSecurityYes                   8.82e-14 ***
## InternetServiceFiber optic           < 2e-16 ***
## InternetServiceNo                         NA    
## MultipleLinesNo phone service       0.000107 ***
## MultipleLinesYes                    4.32e-06 ***
## TotalCharges_categ>= 2000            < 2e-16 ***
## PaperlessBillingYes                 0.003215 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                     exp(coef) exp(-coef) lower .95 upper .95
## ContractOne year                      0.20557     4.8646   0.17164   0.24619
## ContractTwo year                      0.02762    36.2071   0.01967   0.03878
## PartnerYes                            0.66968     1.4932   0.60125   0.74591
## DependentsYes                         0.90344     1.1069   0.78993   1.03325
## SeniorCitizenYes                      0.99306     1.0070   0.88898   1.10931
## StreamingMoviesNo internet service    0.21799     4.5875   0.16891   0.28132
## StreamingMoviesYes                    1.14286     0.8750   1.02735   1.27137
## StreamingTVNo internet service             NA         NA        NA        NA
## StreamingTVYes                        1.18997     0.8404   1.07000   1.32340
## TechSupportNo internet service             NA         NA        NA        NA
## TechSupportYes                        0.73432     1.3618   0.64542   0.83546
## DeviceProtectionNo internet service        NA         NA        NA        NA
## DeviceProtectionYes                   0.84060     1.1896   0.75461   0.93640
## OnlineBackupNo internet service            NA         NA        NA        NA
## OnlineBackupYes                       0.68485     1.4602   0.61361   0.76436
## OnlineSecurityNo internet service          NA         NA        NA        NA
## OnlineSecurityYes                     0.60910     1.6418   0.53469   0.69387
## InternetServiceFiber optic            1.93143     0.5178   1.67760   2.22367
## InternetServiceNo                          NA         NA        NA        NA
## MultipleLinesNo phone service         0.67399     1.4837   0.55203   0.82290
## MultipleLinesYes                      0.78314     1.2769   0.70560   0.86919
## TotalCharges_categ>= 2000             0.09410    10.6273   0.07957   0.11127
## PaperlessBillingYes                   1.18116     0.8466   1.05733   1.31951
## 
## Concordance= 0.889  (se = 0.003 )
## Likelihood ratio test= 4275  on 17 df,   p=<2e-16
## Wald test            = 2355  on 17 df,   p=<2e-16
## Score (logrank) test = 3965  on 17 df,   p=<2e-16

PASSO XX RETORNANDO VARIÁVEIS NÃO SIGNIF. DO PASSO ANTERIOR

Retornando as variáveis excluídas anteriormente

  • Covariável gender
m.comp.gender<-coxph(formula=Surv(tenure,censura)~Contract+Partner+Dependents+
SeniorCitizen+StreamingMovies+StreamingTV+TechSupport+DeviceProtection+OnlineBackup+OnlineSecurity+InternetService+MultipleLines+TotalCharges_categ+PaperlessBilling+gender,data=data, x=T,method="breslow")

Avaliação de significância da variável

TRV.npartner=2*(m.comp1$loglik-m.comp.gender$loglik)
1-pchisq(TRV.npartner,df=1)
## [1] 1 1

Teste de razão de Verossimilhança, comparando os modelos com e sem a adição da covar. Gender, p-valor = 1, variável não significativa para o modelo.

  • Covariável PhoneService
m.comp.phone<-coxph(formula=Surv(tenure,censura)~Contract+Partner+Dependents+
SeniorCitizen+StreamingMovies+StreamingTV+TechSupport+DeviceProtection+OnlineBackup+OnlineSecurity+InternetService+MultipleLines+TotalCharges_categ+PaperlessBilling+PhoneService,data=data, x=T,method="breslow")

Avaliação de significância da variável

TRV.npartner=2*(m.comp1$loglik-m.comp.phone$loglik)
1-pchisq(TRV.npartner,df=1)
## [1] 1 1

Assim como a variável Gender, pelo Teste de Razão de Verossimilhança, comparando os modelos com e sem a adição da covar. PhoneService, p-valor = 1, variável não significativa para o modelo.

REMOVENDO COVAR DO MODELO COMPLETO

  • Removendo Covariável Contract
m.sem.contract<-coxph(formula=Surv(tenure,censura)~Partner+Dependents+
SeniorCitizen+StreamingMovies+StreamingTV+TechSupport+DeviceProtection+OnlineBackup+OnlineSecurity+InternetService+MultipleLines+TotalCharges_categ+PaperlessBilling+PhoneService,data=data, x=T,method="breslow")

TRV.npartner=2*(m.comp1$loglik-m.sem.contract$loglik)
1-pchisq(TRV.npartner,df=1)
## [1] 1 0

P-valor = 0, variável Contract signif. para o modelo

#final<-coxph(formula=Surv(tenure)~1,data=data, x=T,method="breslow")

#install.packages("survminer")
#library(survminer)

#test.ph <- cox.zph(final, transform = "identity")