Pacotes

#install.packages('tinytex')

#tinytex::install_tinytex()

#install.packages("dplyr")

#install.packages("survival")

library(tidyverse) 
library(MASS)
library(car)
library(e1071)
library(caret)
library(cowplot)
library(caTools)
library(pROC)
library(ggcorrplot)
library(car)
library(lmtest)
library(xtable) 
library(dplyr)
library(dbplyr)
library(flexsurv)
library(readxl)
library(summarytools)
library(dplyr)
library(survival)
library(survminer)
library(ggplot2)
library(ggExtra)
library(ggfortify)
library(survival)
library(ggplot2)
library(pander)
library(tidyverse)
library(survminer)
library(gridExtra)
library(xtable)

Leitura dos dados

#install.packages("reticulate")

library(readr)

dados_tel=read_csv("Telecom_Churn.csv")

Legenda das Colunas:

customerID: Identificação única do cliente.

gender: Gênero do cliente.

SeniorCitizen: Indicação se o cliente é idoso (1 para sim, 0 para não).

Partner: Indicação se o cliente tem parceiro (1 para sim, 0 para não).

Dependents: Indicação se o cliente tem dependentes (1 para sim, 0 para não).

tenure: Tempo que o cliente permaneceu como cliente da empresa (em meses).

PhoneService: Indicação se o cliente possui serviço de telefone (1 para sim, 0 para não).

MultipleLines: Indicação se o cliente tem múltiplas linhas de telefone (por exemplo, celular e fixo) (Sim, Não, Sem serviço de telefone).

InternetService: Tipo de serviço de internet do cliente ou se nao tem internet (por exemplo, DSL, fibra ótica).

OnlineSecurity: Indicação se o cliente possui segurança online (Sim, Não, Sem serviço de internet).

OnlineBackup: Indicação se o cliente possui backup online (Sim, Não, Sem serviço de internet).

DeviceProtection: Indicação se o cliente possui proteção de dispositivo (Sim, Não, Sem serviço de internet).

TechSupport: Indicação se o cliente possui suporte técnico (Sim, Não, Sem serviço de internet).

StreamingTV: Indicação se o cliente possui serviço de streaming de TV (Sim, Não, Sem serviço de internet).

StreamingMovies: Indicação se o cliente possui serviço de streaming de filmes (Sim, Não, Sem serviço de internet).

Contract: Tipo de contrato do cliente (por exemplo, mensal, anual).

PaperlessBilling: Indicação se o cliente recebe faturas em formato eletrônico (1 para sim, 0 para não).

PaymentMethod: Método de pagamento do cliente.

MonthlyCharges: Valor da cobrança mensal do cliente.

TotalCharges: Valor total cobrado ao cliente.

Churn: Indicação se o cliente cancelou o serviço (1 para sim, 0 para não).

Tratamento dos dados

dados_tel$Churn=ifelse(dados_tel$Churn=="Yes",1,0)


# Novos nomes das colunas em português
novos_nomes <- c("IDCliente", "Genero", "Idoso", "Parceiro", "Dependentes",
                 "Tempo", "ServicoTelefone", "MultiplasLinhas", "ServicoInternet", 
                 "SegurancaOnline", "BackupOnline", "ProtecaoDispositivo", 
                 "SuporteTecnico", "StreamingTV", "StreamingFilmes", 
                 "Contrato", "FaturamentoEletronico", "MetodoPagamento", 
                 "CobrancaMensal", "CobrancaTotal", "Cancelamento")



# Renomeando as colunas
colnames(dados_tel) <- novos_nomes

dados_tel$Idoso <- ifelse(dados_tel$Idoso==1,"Yes","No")

# Verificando se os nomes foram alterados
print(colnames(dados_tel))

##  [1] "IDCliente"             "Genero"                "Idoso"                
##  [4] "Parceiro"              "Dependentes"           "Tempo"                
##  [7] "ServicoTelefone"       "MultiplasLinhas"       "ServicoInternet"      
## [10] "SegurancaOnline"       "BackupOnline"          "ProtecaoDispositivo"  
## [13] "SuporteTecnico"        "StreamingTV"           "StreamingFilmes"      
## [16] "Contrato"              "FaturamentoEletronico" "MetodoPagamento"      
## [19] "CobrancaMensal"        "CobrancaTotal"         "Cancelamento"

# mundado para 0 ou 1 valores das colunas
# dados_tel$Parceiro=ifelse(dados_tel$Parceiro=="Yes","Sim","Nao")

# qtd de dados igual a 0
summary(dados_tel$Tempo)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   29.00   32.37   55.00   72.00

table(dados_tel$Tempo <= 0)

## 
## FALSE  TRUE 
##  7032    11

DT::datatable(dados_tel)

Tranformando em fator as variaveis

library(dplyr)
colnames(dados_tel)

##  [1] "IDCliente"             "Genero"                "Idoso"                
##  [4] "Parceiro"              "Dependentes"           "Tempo"                
##  [7] "ServicoTelefone"       "MultiplasLinhas"       "ServicoInternet"      
## [10] "SegurancaOnline"       "BackupOnline"          "ProtecaoDispositivo"  
## [13] "SuporteTecnico"        "StreamingTV"           "StreamingFilmes"      
## [16] "Contrato"              "FaturamentoEletronico" "MetodoPagamento"      
## [19] "CobrancaMensal"        "CobrancaTotal"         "Cancelamento"

head(dados_tel)

## # A tibble: 6 × 21
##   IDCliente  Genero Idoso Parceiro Dependentes Tempo ServicoTelefone
##   <chr>      <chr>  <chr> <chr>    <chr>       <dbl> <chr>          
## 1 7590-VHVEG Female No    Yes      No              1 No             
## 2 5575-GNVDE Male   No    No       No             34 Yes            
## 3 3668-QPYBK Male   No    No       No              2 Yes            
## 4 7795-CFOCW Male   No    No       No             45 No             
## 5 9237-HQITU Female No    No       No              2 Yes            
## 6 9305-CDSKC Female No    No       No              8 Yes            
## # ℹ 14 more variables: MultiplasLinhas <chr>, ServicoInternet <chr>,
## #   SegurancaOnline <chr>, BackupOnline <chr>, ProtecaoDispositivo <chr>,
## #   SuporteTecnico <chr>, StreamingTV <chr>, StreamingFilmes <chr>,
## #   Contrato <chr>, FaturamentoEletronico <chr>, MetodoPagamento <chr>,
## #   CobrancaMensal <dbl>, CobrancaTotal <dbl>, Cancelamento <dbl>

cols1 <- colnames(dados_tel)[c(-6,-19,-20,-21)]

dados_tel[cols1] <- lapply(dados_tel[cols1], as.character)

glimpse(dados_tel)

## Rows: 7,043
## Columns: 21
## $ IDCliente             <chr> "7590-VHVEG", "5575-GNVDE", "3668-QPYBK", "7795-…
## $ Genero                <chr> "Female", "Male", "Male", "Male", "Female", "Fem…
## $ Idoso                 <chr> "No", "No", "No", "No", "No", "No", "No", "No", …
## $ Parceiro              <chr> "Yes", "No", "No", "No", "No", "No", "No", "No",…
## $ Dependentes           <chr> "No", "No", "No", "No", "No", "No", "Yes", "No",…
## $ Tempo                 <dbl> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, …
## $ ServicoTelefone       <chr> "No", "Yes", "Yes", "No", "Yes", "Yes", "Yes", "…
## $ MultiplasLinhas       <chr> "No phone service", "No", "No", "No phone servic…
## $ ServicoInternet       <chr> "DSL", "DSL", "DSL", "DSL", "Fiber optic", "Fibe…
## $ SegurancaOnline       <chr> "No", "Yes", "Yes", "Yes", "No", "No", "No", "Ye…
## $ BackupOnline          <chr> "Yes", "No", "Yes", "No", "No", "No", "Yes", "No…
## $ ProtecaoDispositivo   <chr> "No", "Yes", "No", "Yes", "No", "Yes", "No", "No…
## $ SuporteTecnico        <chr> "No", "No", "No", "Yes", "No", "No", "No", "No",…
## $ StreamingTV           <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "No"…
## $ StreamingFilmes       <chr> "No", "No", "No", "No", "No", "Yes", "No", "No",…
## $ Contrato              <chr> "Month-to-month", "One year", "Month-to-month", …
## $ FaturamentoEletronico <chr> "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "…
## $ MetodoPagamento       <chr> "Electronic check", "Mailed check", "Mailed chec…
## $ CobrancaMensal        <dbl> 29.85, 56.95, 53.85, 42.30, 70.70, 99.65, 89.10,…
## $ CobrancaTotal         <dbl> 29.85, 1889.50, 108.15, 1840.75, 151.65, 820.50,…
## $ Cancelamento          <dbl> 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, …

head(dados_tel)

## # A tibble: 6 × 21
##   IDCliente  Genero Idoso Parceiro Dependentes Tempo ServicoTelefone
##   <chr>      <chr>  <chr> <chr>    <chr>       <dbl> <chr>          
## 1 7590-VHVEG Female No    Yes      No              1 No             
## 2 5575-GNVDE Male   No    No       No             34 Yes            
## 3 3668-QPYBK Male   No    No       No              2 Yes            
## 4 7795-CFOCW Male   No    No       No             45 No             
## 5 9237-HQITU Female No    No       No              2 Yes            
## 6 9305-CDSKC Female No    No       No              8 Yes            
## # ℹ 14 more variables: MultiplasLinhas <chr>, ServicoInternet <chr>,
## #   SegurancaOnline <chr>, BackupOnline <chr>, ProtecaoDispositivo <chr>,
## #   SuporteTecnico <chr>, StreamingTV <chr>, StreamingFilmes <chr>,
## #   Contrato <chr>, FaturamentoEletronico <chr>, MetodoPagamento <chr>,
## #   CobrancaMensal <dbl>, CobrancaTotal <dbl>, Cancelamento <dbl>

dados_tel$Cancelamento=as.numeric( dados_tel$Cancelamento)
dados_tel$CobrancaMensal=as.numeric(dados_tel$CobrancaMensal)
dados_tel$CobrancaTotal=as.numeric(dados_tel$CobrancaTotal)
dados_tel$Tempo=as.numeric(dados_tel$Tempo)

Removendo tempo zero para testar exp, log, weibull…

#dados_tel <- dados_tel[dados_tel$Tempo > 0, ]
head(dados_tel)

## # A tibble: 6 × 21
##   IDCliente  Genero Idoso Parceiro Dependentes Tempo ServicoTelefone
##   <chr>      <chr>  <chr> <chr>    <chr>       <dbl> <chr>          
## 1 7590-VHVEG Female No    Yes      No              1 No             
## 2 5575-GNVDE Male   No    No       No             34 Yes            
## 3 3668-QPYBK Male   No    No       No              2 Yes            
## 4 7795-CFOCW Male   No    No       No             45 No             
## 5 9237-HQITU Female No    No       No              2 Yes            
## 6 9305-CDSKC Female No    No       No              8 Yes            
## # ℹ 14 more variables: MultiplasLinhas <chr>, ServicoInternet <chr>,
## #   SegurancaOnline <chr>, BackupOnline <chr>, ProtecaoDispositivo <chr>,
## #   SuporteTecnico <chr>, StreamingTV <chr>, StreamingFilmes <chr>,
## #   Contrato <chr>, FaturamentoEletronico <chr>, MetodoPagamento <chr>,
## #   CobrancaMensal <dbl>, CobrancaTotal <dbl>, Cancelamento <dbl>

Alterando os dados

summary(dados_tel)

##   IDCliente            Genero             Idoso             Parceiro        
##  Length:7043        Length:7043        Length:7043        Length:7043       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Dependentes            Tempo       ServicoTelefone    MultiplasLinhas   
##  Length:7043        Min.   : 0.00   Length:7043        Length:7043       
##  Class :character   1st Qu.: 9.00   Class :character   Class :character  
##  Mode  :character   Median :29.00   Mode  :character   Mode  :character  
##                     Mean   :32.37                                        
##                     3rd Qu.:55.00                                        
##                     Max.   :72.00                                        
##                                                                          
##  ServicoInternet    SegurancaOnline    BackupOnline       ProtecaoDispositivo
##  Length:7043        Length:7043        Length:7043        Length:7043        
##  Class :character   Class :character   Class :character   Class :character   
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character   
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  SuporteTecnico     StreamingTV        StreamingFilmes      Contrato        
##  Length:7043        Length:7043        Length:7043        Length:7043       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  FaturamentoEletronico MetodoPagamento    CobrancaMensal   CobrancaTotal   
##  Length:7043           Length:7043        Min.   : 18.25   Min.   :  18.8  
##  Class :character      Class :character   1st Qu.: 35.50   1st Qu.: 401.4  
##  Mode  :character      Mode  :character   Median : 70.35   Median :1397.5  
##                                           Mean   : 64.76   Mean   :2283.3  
##                                           3rd Qu.: 89.85   3rd Qu.:3794.7  
##                                           Max.   :118.75   Max.   :8684.8  
##                                                            NA's   :11      
##   Cancelamento   
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.2654  
##  3rd Qu.:1.0000  
##  Max.   :1.0000  
##

library(dplyr)

## 
## Anexando pacote: 'dplyr'

## Os seguintes objetos são mascarados por 'package:stats':
## 
##     filter, lag

## Os seguintes objetos são mascarados por 'package:base':
## 
##     intersect, setdiff, setequal, union

library(dbplyr)

## 
## Anexando pacote: 'dbplyr'

## Os seguintes objetos são mascarados por 'package:dplyr':
## 
##     ident, sql

dados_tel$Cancelamento=as.numeric( dados_tel$Cancelamento)
dados_tel$CobrancaMensal=as.numeric(dados_tel$CobrancaMensal)
dados_tel$CobrancaTotal=as.numeric(dados_tel$CobrancaTotal)
dados_tel$Tempo=as.integer(dados_tel$Tempo)



dados_tel$CobrancaMensal=ifelse(dados_tel$CobrancaMensal < 70, "< 70",
                                       ">= 70")
dados_tel$CobrancaTotal=ifelse(dados_tel$CobrancaTotal< 2000, "< 2000"," >= 2000")



dados_tel$Tempo_Categoria <- cut(dados_tel$Tempo,breaks = c(1, 3, 6, 12,18, 24, Inf),labels = c("1-3 Meses","3-6 Meses" ,"6-12 Meses", "12-18 Meses", "18-24","Acima de 24 Meses"),right = FALSE)

valores_unicos <- lapply(dados_tel[, c(2:5,7:18,21,22)], unique)
valores_unicos

## $Genero
## [1] "Female" "Male"  
## 
## $Idoso
## [1] "No"  "Yes"
## 
## $Parceiro
## [1] "Yes" "No" 
## 
## $Dependentes
## [1] "No"  "Yes"
## 
## $ServicoTelefone
## [1] "No"  "Yes"
## 
## $MultiplasLinhas
## [1] "No phone service" "No"               "Yes"             
## 
## $ServicoInternet
## [1] "DSL"         "Fiber optic" "No"         
## 
## $SegurancaOnline
## [1] "No"                  "Yes"                 "No internet service"
## 
## $BackupOnline
## [1] "Yes"                 "No"                  "No internet service"
## 
## $ProtecaoDispositivo
## [1] "No"                  "Yes"                 "No internet service"
## 
## $SuporteTecnico
## [1] "No"                  "Yes"                 "No internet service"
## 
## $StreamingTV
## [1] "No"                  "Yes"                 "No internet service"
## 
## $StreamingFilmes
## [1] "No"                  "Yes"                 "No internet service"
## 
## $Contrato
## [1] "Month-to-month" "One year"       "Two year"      
## 
## $FaturamentoEletronico
## [1] "Yes" "No" 
## 
## $MetodoPagamento
## [1] "Electronic check"          "Mailed check"             
## [3] "Bank transfer (automatic)" "Credit card (automatic)"  
## 
## $Cancelamento
## [1] 0 1
## 
## $Tempo_Categoria
## [1] 1-3 Meses         Acima de 24 Meses 6-12 Meses        18-24            
## [5] 12-18 Meses       3-6 Meses         <NA>             
## 6 Levels: 1-3 Meses 3-6 Meses 6-12 Meses 12-18 Meses ... Acima de 24 Meses

dados_tel[cols1] <- lapply(dados_tel[cols1], as.factor)
glimpse(dados_tel)

## Rows: 7,043
## Columns: 22
## $ IDCliente             <fct> 7590-VHVEG, 5575-GNVDE, 3668-QPYBK, 7795-CFOCW, …
## $ Genero                <fct> Female, Male, Male, Male, Female, Female, Male, …
## $ Idoso                 <fct> No, No, No, No, No, No, No, No, No, No, No, No, …
## $ Parceiro              <fct> Yes, No, No, No, No, No, No, No, Yes, No, Yes, N…
## $ Dependentes           <fct> No, No, No, No, No, No, Yes, No, No, Yes, Yes, N…
## $ Tempo                 <int> 1, 34, 2, 45, 2, 8, 22, 10, 28, 62, 13, 16, 58, …
## $ ServicoTelefone       <fct> No, Yes, Yes, No, Yes, Yes, Yes, No, Yes, Yes, Y…
## $ MultiplasLinhas       <fct> No phone service, No, No, No phone service, No, …
## $ ServicoInternet       <fct> DSL, DSL, DSL, DSL, Fiber optic, Fiber optic, Fi…
## $ SegurancaOnline       <fct> No, Yes, Yes, Yes, No, No, No, Yes, No, Yes, Yes…
## $ BackupOnline          <fct> Yes, No, Yes, No, No, No, Yes, No, No, Yes, No, …
## $ ProtecaoDispositivo   <fct> No, Yes, No, Yes, No, Yes, No, No, Yes, No, No, …
## $ SuporteTecnico        <fct> No, No, No, Yes, No, No, No, No, Yes, No, No, No…
## $ StreamingTV           <fct> No, No, No, No, No, Yes, Yes, No, Yes, No, No, N…
## $ StreamingFilmes       <fct> No, No, No, No, No, Yes, No, No, Yes, No, No, No…
## $ Contrato              <fct> Month-to-month, One year, Month-to-month, One ye…
## $ FaturamentoEletronico <fct> Yes, No, Yes, No, Yes, Yes, Yes, No, Yes, No, Ye…
## $ MetodoPagamento       <fct> Electronic check, Mailed check, Mailed check, Ba…
## $ CobrancaMensal        <chr> "< 70", "< 70", "< 70", "< 70", ">= 70", ">= 70"…
## $ CobrancaTotal         <chr> "< 2000", "< 2000", "< 2000", "< 2000", "< 2000"…
## $ Cancelamento          <dbl> 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, …
## $ Tempo_Categoria       <fct> 1-3 Meses, Acima de 24 Meses, 1-3 Meses, Acima d…

Análise exploratória desses dados

cHURN PLOT

# Calcular a taxa de churn para cada período
library(dplyr)

# Agrupar os dados por período e calcular a taxa de churn
taxa_churn_por_periodo <- dados_tel %>%
  group_by(Tempo) %>%
  summarise(
    Total = n(),
    Churn = sum(Cancelamento == 1),
    Taxa_Churn = (Churn / Total) * 100
  )

# Plotar a taxa de churn
plot(taxa_churn_por_periodo$Tempo, taxa_churn_por_periodo$Taxa_Churn,
     type = "o", col = "blue", lwd = 2,
     xlab = "Tempo", ylab = "Taxa de Churn (%)",
     main = "Taxa de Churn ao Longo do Tempo")

Grafico clientes

library(ggplot2)
head(dados_tel[,c(1,6,21)])

## # A tibble: 6 × 3
##   IDCliente  Tempo Cancelamento
##   <fct>      <int>        <dbl>
## 1 7590-VHVEG     1            0
## 2 5575-GNVDE    34            0
## 3 3668-QPYBK     2            1
## 4 7795-CFOCW    45            0
## 5 9237-HQITU     2            1
## 6 9305-CDSKC     8            1

dados_tel$Tempo=as.numeric( dados_tel$Tempo)

ggplot(dados_tel[1:20,], aes(x = 0, xend = Tempo, y = IDCliente, yend = IDCliente)) +geom_segment(size = 1) + # Adiciona segmentos de linha para cada cliente
  theme(axis.text.y = element_text(angle = 0, hjust = 1)) + # Ajusta os rótulos do eixo y para melhor visualização
  labs(x = "Tempo", y = "ID do Cliente", title = "Linha do Tempo dos Clientes-Primeiros 20 cliente") + 
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Contagem de Censura na base

options(repr.plot.width = 6, repr.plot.height = 4)

dados_tel %>% 
group_by(Cancelamento) %>% 
summarise(Count = n())%>% 
mutate(percent = prop.table(Count)*100)%>%
ggplot(aes(reorder(Cancelamento, -percent), percent), fill = Cancelamento)+
geom_col(fill = c("#b0b0b0", "#E7B800"))+
geom_text(aes(label = sprintf("%.2f%%", percent)), hjust = 0.01,vjust = -0.5, size =3)+ 
theme_bw()+  
xlab("Churn") + 
ylab("Percentual")+
ggtitle("Censura percentual 1-Sim e 0-Nao")

Churn Total

summary(dados_tel$Tempo)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   29.00   32.37   55.00   72.00

table(dados_tel$Cancelamento)

## 
##    0    1 
## 5174 1869

Taxa_Churn=(
  sum((dados_tel$Cancelamento==1))/
  sum((dados_tel$Cancelamento==0))
  )*100

Taxa_Churn

## [1] 36.12292

Churn de 36% considerando o periodo total

% cancelamento Covariaveis

library(ggplot2)
library(cowplot)
library(stringr)

telco <- dados_tel
telco$Cancelamento <- factor(telco$Cancelamento)
attach(telco)
theme1 <- theme_bw()+theme(axis.text.x = element_text(angle = 0, hjust = 1, vjust = 0.5),legend.position="none")
theme2 <- theme_bw()+theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),legend.position="none")

options(repr.plot.width = 12, repr.plot.height = 8)
plot_grid(ggplot(telco, aes(x=Genero,fill=Cancelamento))+ geom_bar()+ theme1, 
          ggplot(telco, aes(x=Idoso,fill=Cancelamento))+ geom_bar(position = 'fill')+theme1,
          ggplot(telco, aes(x=Parceiro,fill=Cancelamento))+ geom_bar(position = 'fill')+theme1,
          ggplot(telco, aes(x=Dependentes,fill=Cancelamento))+ geom_bar(position = 'fill')+theme1,
          ggplot(telco, aes(x=ServicoTelefone,fill=Cancelamento))+ geom_bar(position = 'fill')+theme1,
          ggplot(telco, aes(x=MultiplasLinhas,fill=Cancelamento))+ geom_bar(position = 'fill')+theme_bw()+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          align = "h")

options(repr.plot.width = 12, repr.plot.height = 8)
plot_grid(ggplot(telco, aes(x=ServicoInternet,fill=Cancelamento))+ geom_bar(position = 'fill')+ theme1+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)), 
          ggplot(telco, aes(x=SegurancaOnline,fill=Cancelamento))+ geom_bar(position = 'fill')+theme1+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          ggplot(telco, aes(x=BackupOnline,fill=Cancelamento))+ geom_bar(position = 'fill')+theme1+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          ggplot(telco, aes(x=ProtecaoDispositivo,fill=Cancelamento))+ geom_bar(position = 'fill')+theme1+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          ggplot(telco, aes(x=SuporteTecnico,fill=Cancelamento))+ geom_bar(position = 'fill')+theme1+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          ggplot(telco, aes(x=StreamingTV,fill=Cancelamento))+ geom_bar(position = 'fill')+theme_bw()+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          align = "h")

plot_grid(ggplot(telco, aes(x=StreamingFilmes,fill=Cancelamento))+ 
          geom_bar(position = 'fill')+ theme1+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)), 
          ggplot(telco, aes(x=Contrato,fill=Cancelamento))+ 
          geom_bar(position = 'fill')+theme1+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          ggplot(telco, aes(x=FaturamentoEletronico,fill=Cancelamento))+ 
          geom_bar(position = 'fill')+theme1+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          ggplot(telco, aes(x=CobrancaTotal,fill=Cancelamento))+ 
          geom_bar(position = 'fill')+theme1+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          ggplot(telco, aes(x=MetodoPagamento,fill=Cancelamento))+
          geom_bar(position = 'fill')+theme_bw()+
          scale_x_discrete(labels = function(x) str_wrap(x, width = 10)),
          align = "h")

Curva de sobrevivencia

attach(dados_tel)
library(survival)
library(ggplot2)
library(ggfortify) # Para usar autoplot()

km_Tel <- survfit(Surv(dados_tel$Tempo, dados_tel$Cancelamento) ~ 1, data = dados_tel)

plot(km_Tel, main = "Kaplan-Meier",
     xlab = "Tempos", ylab = "S(t) estimada", 
     mark.time = TRUE)

# Loop para plotar Kaplan-Meier com nome da variável na legenda
for (i in c(2:5, 7:20, 21, 22)) {
  NomesColunas_i <- colnames(dados_tel)[i]  # Nome da coluna
  coluna_atual <- dados_tel[[NomesColunas_i]]  # Valores da coluna


km_Tel_teste <-survfit(Surv(dados_tel$Tempo, dados_tel$Cancelamento) ~ coluna_atual, data = dados_tel)

plot <- autoplot(km_Tel_teste) + 
  labs(x = "Tempo de Sobrevivencia (Meses)", 
       y = "Probabilidade de Sobrevivencia (%)", 
       title = paste("Kaplan-Meier -", NomesColunas_i)) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1), expand = c(0, 0)) +  # Eixo Y em percentuais
  theme_minimal() +
  theme(legend.title = element_blank())  # Remove o título da legenda

  print(plot)
}

KM Para Contrato

attach(dados_tel)
library(survival)
#Funcao de sobrevivencia por Kaplan-Meier:
ekm.Contract <-survfit(Surv(dados_tel$Tempo, dados_tel$Cancelamento) ~ dados_tel$CobrancaMensal)

plot(ekm.Contract, lty=c(2,1,3),xlab="Customer Tenure (months)",ylab="Customer Survival Chance (%)",lwd=2, col=c(3,6,7))
legend(1,0.4,lty=c(2,1,3),legend = c("Month-to-month","One year", "Two year"),lwd=2,bty="n", col=c(3,6,7),cex=0.7)

library(survival)
#Funcao de sobrevivencia por Kaplan-Meier:
ekm.Contract <-survfit(Surv(dados_tel$Tempo)~dados_tel$Contrato)
#summary(ekm)
plot(ekm.Contract, lty=c(2,1,3),xlab="Customer Tenure (months)",ylab="Customer Survival Chance (%)",lwd=2, col=c(3,6,7))
legend(1,0.4,lty=c(2,1,3),legend = c("Month-to-month","One year", "Two year"),lwd=2,bty="n", col=c(3,6,7),cex=0.7)

O p-valor≤2e−16, indicando que existe diferença significativa entre as curvas, e que a covariável Contract pode ser relevante para explicar a variável resposta (tempo até ocorrência de churn).

Teste logrank Para colunas

Esse teste é utilizado para comparar as curvas de Kaplan-Meier de dois grupos e utilizamos seu p-valor para determinar se essas curvas podem ou não ser consideradas diferentes com significância estatística.

logrankTabela <- data.frame(
  Variavel = character(),
  Estatistica_Teste = numeric(),
  p_value = numeric(),
  significativo=character(),
  stringsAsFactors = FALSE
)


for (i in c(2:5, 7:20)) {

NomesColunas_i <- colnames(dados_tel)[i]

coluna_atual <- dados_tel[[NomesColunas_i]]

LogRank=survdiff(Surv(dados_tel$Tempo, dados_tel$Cancelamento) ~ coluna_atual)

significativo=ifelse(LogRank$pvalue<0.05,"significativo","N significativo")


logrankTabela <- rbind(logrankTabela, data.frame(
    Variavel = NomesColunas_i,
    Estatistica_Teste = round(LogRank$chisq,1),
    p_value = round(LogRank$pvalue,6),
    significativo=significativo
  ))

}

logrankTabela

##                 Variavel Estatistica_Teste  p_value   significativo
## 1                 Genero               0.5 0.468417 N significativo
## 2                  Idoso             109.5 0.000000   significativo
## 3               Parceiro             423.5 0.000000   significativo
## 4            Dependentes             232.7 0.000000   significativo
## 5        ServicoTelefone               0.4 0.511588 N significativo
## 6        MultiplasLinhas              31.0 0.000000   significativo
## 7        ServicoInternet             520.1 0.000000   significativo
## 8        SegurancaOnline            1013.9 0.000000   significativo
## 9           BackupOnline             821.3 0.000000   significativo
## 10   ProtecaoDispositivo             763.5 0.000000   significativo
## 11        SuporteTecnico             989.6 0.000000   significativo
## 12           StreamingTV             368.3 0.000000   significativo
## 13       StreamingFilmes             378.4 0.000000   significativo
## 14              Contrato            2352.9 0.000000   significativo
## 15 FaturamentoEletronico             189.5 0.000000   significativo
## 16       MetodoPagamento             865.2 0.000000   significativo
## 17        CobrancaMensal             115.7 0.000000   significativo
## 18         CobrancaTotal             775.0 0.000000   significativo

cox

colnames(dados_tel)

##  [1] "IDCliente"             "Genero"                "Idoso"                
##  [4] "Parceiro"              "Dependentes"           "Tempo"                
##  [7] "ServicoTelefone"       "MultiplasLinhas"       "ServicoInternet"      
## [10] "SegurancaOnline"       "BackupOnline"          "ProtecaoDispositivo"  
## [13] "SuporteTecnico"        "StreamingTV"           "StreamingFilmes"      
## [16] "Contrato"              "FaturamentoEletronico" "MetodoPagamento"      
## [19] "CobrancaMensal"        "CobrancaTotal"         "Cancelamento"         
## [22] "Tempo_Categoria"

# Carregar pacotes necessários
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dbplyr::ident() masks dplyr::ident()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dbplyr::sql()   masks dplyr::sql()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(survival)

# Definir a fórmula do modelo com as variáveis significativas
cox_formula <- Surv(Tempo, dados_tel$Cancelamento) ~ 
  Genero+Idoso + Parceiro + Dependentes +ServicoTelefone+
  MultiplasLinhas + ServicoInternet + SegurancaOnline +
  BackupOnline + ProtecaoDispositivo + SuporteTecnico +
  StreamingTV + StreamingFilmes + Contrato + FaturamentoEletronico +
  MetodoPagamento + CobrancaMensal + CobrancaTotal

# Ajustar o Modelo de Cox Completo
m.comp1 <- coxph(formula = cox_formula, data = dados_tel, x = TRUE, method = "breslow")

cox_formula2 <- Surv(Tempo, dados_tel$Cancelamento) ~ 
  Idoso + Parceiro + Dependentes +ServicoTelefone+
  MultiplasLinhas + ServicoInternet + SegurancaOnline +
  BackupOnline + ProtecaoDispositivo + SuporteTecnico +
  StreamingTV + StreamingFilmes + Contrato + FaturamentoEletronico +
  MetodoPagamento + CobrancaMensal + CobrancaTotal

# Ajustar o Modelo de Cox Completo
m.comp2 <- coxph(formula = cox_formula, data = dados_tel, x = TRUE, method = "breslow")

TRV.npartner=2*(m.comp1$loglik-m.comp2$loglik)
1-pchisq(TRV.npartner,df=1)

## [1] 1 1

# Definir a fórmula do modelo com as variáveis significativas
cox_formula <- Surv(Tempo, dados_tel$Cancelamento) ~
  Genero+Parceiro + Dependentes +ServicoTelefone+
  MultiplasLinhas + ServicoInternet + SegurancaOnline +
  BackupOnline + ProtecaoDispositivo + SuporteTecnico +
  StreamingTV + StreamingFilmes + Contrato + FaturamentoEletronico +
  MetodoPagamento + CobrancaMensal + CobrancaTotal

# Ajustar o Modelo de Cox Completo
m.comp1 <- coxph(formula = cox_formula, data = dados_tel, x = TRUE, method = "breslow")

cox_formula2 <- Surv(Tempo, dados_tel$Cancelamento) ~ 
  Parceiro + Dependentes +ServicoTelefone+
  MultiplasLinhas + ServicoInternet + SegurancaOnline +
  BackupOnline + ProtecaoDispositivo + SuporteTecnico +
  StreamingTV + StreamingFilmes + Contrato + FaturamentoEletronico +
  MetodoPagamento + CobrancaMensal + CobrancaTotal

# Ajustar o Modelo de Cox Completo
m.comp2 <- coxph(formula = cox_formula, data = dados_tel, x = TRUE, method = "breslow")

TRV.npartner=2*(m.comp1$loglik-m.comp2$loglik)
1-pchisq(TRV.npartner,df=1)

## [1] 1 1

# Definir a fórmula do modelo com as variáveis significativas
cox_formula <- Surv(Tempo, dados_tel$Cancelamento) ~
  Parceiro + Dependentes +ServicoTelefone+
  MultiplasLinhas + ServicoInternet + SegurancaOnline +
  BackupOnline + ProtecaoDispositivo + SuporteTecnico +
  StreamingTV + StreamingFilmes + Contrato + FaturamentoEletronico +
  MetodoPagamento + CobrancaMensal + CobrancaTotal

# Ajustar o Modelo de Cox Completo
m.comp1 <- coxph(formula = cox_formula, data = dados_tel, x = TRUE, method = "breslow")

cox_formula2 <- Surv(Tempo, dados_tel$Cancelamento) ~ 
  Parceiro + Dependentes +
  MultiplasLinhas + ServicoInternet + SegurancaOnline +
  BackupOnline + ProtecaoDispositivo + SuporteTecnico +
  StreamingTV + StreamingFilmes + Contrato + FaturamentoEletronico +
  MetodoPagamento + CobrancaMensal + CobrancaTotal

# Ajustar o Modelo de Cox Completo
m.comp2 <- coxph(formula = cox_formula, data = dados_tel, x = TRUE, method = "breslow")

TRV.npartner=2*(m.comp1$loglik-m.comp2$loglik)
1-pchisq(TRV.npartner,df=1)

## [1] 1 1

Random Forest para Sobrevivência

# ## Carregar pacotes necessários
# library(ranger)
# library(survival)
# 
# # Verificar valores ausentes
# colSums(is.na(dados_limpos))
# 
# # Remover linhas com valores ausentes
# dados_limpos <- na.omit(dados_limpos)
# 
# dados_tel <- na.omit(dados_tel)
# 
# dados_1 <- dados_tel[ , -c(19, 20)]
# 
# # Remover a variável IDCliente e quaisquer variáveis irrelevantes
# dados_limpos <- dados_1[, !(names(dados_1) %in% c("IDCliente"))]
# 
# dados_limpos <- dados_tel[ , -c(19)]
# 
# # Ajuste do modelo Random Forests para sobrevivência com todas as variáveis limpas
# ranger_fit <- ranger(
#   Surv(Tempo, Cancelamento) ~ .,   # Todas as variáveis restantes
#   data = dados_limpos,             # Dados limpos
#   mtry = floor(sqrt(ncol(dados_limpos) - 2)),  # Número ideal de variáveis
#   importance = "permutation",      # Calcular a importância
#   splitrule = "logrank",           # Regra de divisão compatível
#   verbose = TRUE                   # Exibe o progresso do modelo
# )
# 
# # Avaliar a importância das variáveis
# importancia <- ranger_fit$variable.importance
# 
# # Ordenar a importância em ordem decrescente
# importancia <- sort(importancia, decreasing = TRUE)
# print(importancia)
# 
# # Gráfico da importância das variáveis
# library(ggplot2)
# importancia_df <- data.frame(Variavel = names(importancia), Importancia = importancia)
# 
# ggplot(importancia_df, aes(x = reorder(Variavel, Importancia), y = Importancia)) +
#   geom_bar(stat = "identity", fill = "steelblue") +
#   coord_flip() +
#   labs(title = "Importancia das Variaveis no Modelo Random Forests",  # Sem acento
#        x = "Variaveis",  # Sem acento
#        y = "Importancia") +
#   theme_minimal()
#

funcoes

# Dagum ----------------------------------------------------------------

ddagum <- function(t, par) {
 a <- par[1]
 b <- par[2] 
 c <- par[3]
if (any(a < 0)) stop("a must be positive")
  if (any(b < 0)) stop("b must be positive")
  #if (any(c < 0) | any(c > 1)) stop("c must be in the interval (0,1)")
 # if (any(t <= 0)) stop("t must be greater than 0")


 p0 <- 1 - c

 density<- (a * b * c^2 *  t^(-a-1)) / (b + c * t ^(-a))^2 
 return(density)
return(p0)
 
}

## Dagum survival function
sdagum <- function(y, par, log.p = FALSE) {
 a <- par[1]
 b <- par[2] 
 c <- par[3]
  if (any(a < 0)) stop("a must be positive")
  if (any(b < 0)) stop("b must be positive")
  #if (any(c < 0) | any(c > 1)) stop("c must be in the interval (0,1)")
  #if (any(y <= 0)) stop("y must be greater than 0")
  
  p0 <- 1 - c
  
  surv   <- (b + c * y ^ (-a) - c * b)/(b + c * y ^ (-a))
  
  if (log.p == FALSE)
    surv <- surv
  else
    surv <- surv
  
  return(surv)
return(p0)
}

loglik_dagum <- function(par, D){
  if (length(D$Tempo[D$Cancelamento == 1]) > 0) {
    aux1 <- ddagum(D$Tempo[D$Cancelamento == 1], par)
  } else {
    aux1 <- 1
  }
  if (length(D$Tempo[D$Cancelamento == 0]) > 0) {
    aux2 <- sdagum(D$Tempo[D$Cancelamento == 0], par, log.p = FALSE)
  } else {
    aux2 <- 1
  }
  sum(log(aux1)) + sum(log(aux2))
}




# Gamma-dagum ----------------------------------------------------------

dGdagum <- function(t, par) {
  g <- ddagum(t, par[2:4]) 
  G <- 1-sdagum(t, par[2:4])
  g*(-log(1-G))^(par[1]-1)/gamma(par[1])
}

pGdagum <- function(t, par) {
 g <- ddagum(t, par[2:4]) 
  G <- 1-sdagum(t, par[2:4])
  pgamma(-log(1-G), par[1], lower = T)

}

curegd<-function(par){
pgamma(-log(1-par[4]),par[1],lower=T)  }





loglik_Gdagum <- function(par, D){
  if (length(D$Tempo[D$Cancelamento == 1]) > 0) {
    aux1 <- dGdagum(D$Tempo[D$Cancelamento == 1], par)
  } else {
    aux1 <- 1
  }
  if (length(D$Tempo[D$Cancelamento == 0]) > 0) {
    aux2 <- 1 - pGdagum(D$Tempo[D$Cancelamento == 0], par)
  } else {
    aux2 <- 1
  }
  sum(log(aux1)) + sum(log(aux2))
}





# Random samples ----------------------------------------------------------

rGdagum <- function (n, par, tmax = 1e+03) {
  out <- data.frame(time = rep(NA, n), status = rep(NA, n))
  
  for (i in 1:n) {
    u <- runif(1)
    ff <- function(z) pGdagum(z, par) - u
    y <- try(uniroot(ff, c(0, 100))$root, silent = T)
    y <- ifelse(is.numeric(y), y, tmax)
    y_star <- runif(1, 0, tmax)
    temp <- min(y, y_star)
    out$time[i] <- temp
    out$status[i] <- ifelse(temp == tmax | temp == y_star, 0, 1)
  }
  
  out[out$time > 0, ]
}


# Delta method ------------------------------------------------------------

deltamethod_Gdagum <- function(par, cov) {
  myfun <- function(z) {
    c(z[1], z[2], z[3], pgamma(-log(1 - z[4]), z[1], lower = TRUE))
  }
  j <- jacobian(myfun, par)  # Calcular a Jacobiana
  v <- j %*% cov %*% t(j)    # Matriz de variância-covariância ajustada
  return(v)
}


deltamethod_invgauss <- function (mod, cov) {
  myfun <- function (z) {
    c(z[1], z[2], z[3], pgamma(-log(1 - exp(-2 * z[3] / z[2])), z[1], lower = F))
  }
  j <- jacobian(myfun, mod)
  v <- j %*% cov %*% t(j)
  return(v)
}

gama dagum defectiva

# 
# library(foreign)
# library(dplyr)
# library(survival)
# library(maxLik)
# library(numDeriv)
# #source("gammaG.R")
# 
# # Dataset -----------------------------------------------------------------
# library(readxl)
# dados=read_excel("dados2.xlsx")
# dados$Parceiro=ifelse(dados$Parceiro=="Yes",1,0)
# attach(dados)
# ####https://rpubs.com/ewersonpimenta/676770
# # Groups para a variável  ------------------------------------------------------------------
# dados_1=dados %>% filter(Contrato == "Month-to-month") %>% select(Tempo, Cancelamento)
# dados_2=dados %>% filter(Contrato == "One year") %>% select(Tempo, Cancelamento)
# dados_3=dados %>% filter(Contrato == "Two year") %>% select(Tempo, Cancelamento)
# 
# # Groups para a variável  ------------------------------------------------------------------
# dados_1=dados %>% filter( Parceiro=="1") %>% select(Tempo, Cancelamento)
# dados_2=dados %>% filter(Parceiro == "0") %>% select(Tempo, Cancelamento)
# 
# # Loglikelihood -----------------------------------------------------------
# 
# l_dagum_1 <- function(z) loglik_dagum(z, dados_1)
# l_dagum_2 <- function(z) loglik_dagum(z, dados_2)
# #l_dagum_3 <- function(z) loglik_dagum(z, dados_3)
# #l_dagum <- function(z) loglik_dagum(z, mela)
# 
# l_gd_1 <- function(z) loglik_Gdagum(z, dados_1)
# l_gd_2 <- function(z) loglik_Gdagum(z, dados_2)
# #l_gd_3 <- function(z) loglik_Gdagum(z, dados_3)
# #l_gd <- function(z) loglik_Gdagum(z, mela)
# 
# 
# 
# # Estimation --------------------------------------------------------------
# 
# A2 <- diag(c(1, 1,0.1))
# d2 <- c(0, 0, 0)
# 
# A3 <- diag(c(1, 1, 1,1))
# d3 <- c(0, 0, 0,0)
# 
# A<-diag(c(1, 1, 1,1))
# d <- c(0, 0, 0,0)
# 
# 
# fit_dagum_1 <- maxSANN(l_dagum_1, start = c(1.3, 1.5,0.8),
#                      constraints = list(ineqA = A2, ineqB = d2))
# fit_dagum_2 <- maxSANN(l_dagum_2, start = c(1.3, 1.5,0.8),
#                      constraints = list(ineqA = A2, ineqB = d2))
# 
# fit_dagum_1 <- maxSANN(l_dagum_1, start = c(0.5, 0.5,0.1),
#                      constraints = list(ineqA = A2, ineqB = d2))
# fit_dagum_2 <- maxSANN(l_dagum_2, start = c(0.5, 0.5,0.1),
#                      constraints = list(ineqA = A2, ineqB = d2))
# 
# fit_dagum_11<-maxBFGS(l_dagum_1, start = fit_dagum_1$estimate,
#                       constraints = list(ineqA = A2, ineqB = d2))
# fit_dagum_22<-maxBFGS(l_dagum_2, start = fit_dagum_2$estimate,
#                       constraints = list(ineqA = A2, ineqB = d2))
# 
# #fit_dagum_3 <- maxSANN(l_dagum_3, start = c(1.3, 1.5,0.8),constraints = list(ineqA = A2, ineqB = d2))
# 
# fit_Gdagum_1 <- maxSANN(l_gd_1, start = c(1.3, 1.5,0.3,1),
#                       constraints = list(ineqA = A3, ineqB = d3))
# fit_Gdagum_2 <- maxSANN(l_gd_2, start = c(1.3, 1.5,0.3,1),
#                       constraints = list(ineqA = A3, ineqB = d3))
# #fit_Gdagum_3 <- maxSANN(l_gd_3, start = c(1.3, 1.5,0.8,1),constraints = list(ineqA = A3, ineqB = d3))
# 
# # fit_Gdagum <- maxSANN(l_gd, start = c(1.3, 1.5,0.8,1),constraints = list(ineqA = A, ineqB = d))
# 
# fit_Gdagum_11<-maxBFGS(l_gd_1, start = fit_Gdagum_1$estimate,
#                       constraints = list(ineqA = A3, ineqB = d3))
# fit_Gdagum_22<-maxBFGS(l_gd_2, start = fit_Gdagum_2$estimate,
#                       constraints = list(ineqA = A3, ineqB = d3))
# 
# # AIC ---------------------------------------------------------------------
# 
# #group 1
# 2 * 3 - 2 * l_dagum_1(fit_dagum_1$estimate) #selected
# #2 * 2 - 2 * l_inv_1(fit_inv_1$estimate)
# 2 * 4 - 2 * l_gd_1(fit_Gdagum_1$estimate)
# #2 * 3 - 2 * l_Ginv_1(fit_Ginv_1$estimate)
# 
# #group 2
# 2 * 3 - 2 * l_gd_1(fit_Gdagum_2$estimate) #selected
# #2 * 2 - 2 * l_inv_2(fit_inv_2$estimate)
# 2 * 4 - 2 * l_gd_2(fit_Gdagum_2$estimate)
# #2 * 3 - 2 * l_Ginv_2(fit_Ginv_2$estimate)
# 
# 
# 
# # Figures -----------------------------------------------------------------
# 
# s_dagum_1 <- Vectorize(function(z) sdagum(z, fit_dagum_11$estimate))
# s_dagum_2 <- Vectorize(function(z) sdagum(z, fit_dagum_22$estimate))
# #s_dagum_3 <- Vectorize(function(z) 1 - pdagum(z, fit_dagum_3$estimate))
# 
# s_Gdagum_1 <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum_11$estimate))
# s_Gdagum_2 <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum_22$estimate))
# #s_Gdagum_3 <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum_3$estimate))
# 
# #s_Gdagum <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum$estimate))
# 
# 
# 
# ##Categorizado por contrato
# grid <- seq(0, 75, 0.05)
# plot(survfit(Surv(Tempo, Cancelamento) ~ Contrato, dados),
#      ylab = "survival", xlab = "time")
# lines(grid, s_dagum_1(grid), col = "darkblue", lwd = 2)
# lines(grid, s_dagum_2(grid), col = "darkgreen", lwd = 2)
# lines(grid, s_dagum_3(grid), col = "darkred", lwd = 2)
# 
# lines(grid, s_Gdagum_1(grid), col = "red", lwd =3,lty=4)
# lines(grid, s_Gdagum_2(grid), col = "green", lwd = 3,lty=4)
# lines(grid, s_Gdagum_3(grid), col = "blue", lwd = 3,lty=4)
# 
# 
# ##Categorizado por Parceiro
# grid <- seq(0, 75, 0.05)
# plot(survfit(Surv(Tempo, Cancelamento) ~ Parceiro, dados),
#      ylab = "survival", xlab = "time",main = "KM Parceiro")
# 
# # Linhas de sobrevivência
# lines(grid, s_dagum_1(grid), col = "blue", lwd = 2)
# lines(grid, s_dagum_2(grid), col = "purple", lwd = 2)
# lines(grid, s_Gdagum_1(grid), col = "red", lwd = 3, lty = 4)
# lines(grid, s_Gdagum_2(grid), col = "green", lwd = 3, lty = 4)
# 
# # Legenda corrigida
# legend("bottomright",
#        legend = c("Dagum Defectiva - Yes", 
#                   "Dagum Defectiva - No", 
#                   "Gamma-Dagum - Yes", 
#                   "Gamma-Dagum - No"),
#        col = c("blue", "purple", "red", "green"),
#        lty = c(1, 1, 4, 4), 
#        lwd = c(2, 2, 3, 3))
# 
# # Coefficients ------------------------------------------------------------
# 
# #group 1
# #c(fit_Gdagum_1$estimate, (fit_Gdagum_1$estimate)) #puntal
# #diag(deltamethod_gompertz(fit_Ggom_1$estimate, solve(-fit_Ggom_1$hessian))) #standard errors
# ##dagum
# c(fit_dagum_1$estimate, 1-fit_dagum_1$estimate[3]) #puntal
# c(fit_dagum_2$estimate, 1-fit_dagum_2$estimate[3]) #puntal
# #c(fit_dagum_3$estimate, curedagum(fit_dagum_3$estimate)) #puntal
# 
# ##Gdagum
# c(fit_Gdagum_11$estimate, curegd(fit_Gdagum_11$estimate)) #puntal
# c(fit_Gdagum_22$estimate, curegd(fit_Gdagum_22$estimate)) #puntal
# #c(fit_Gdagum_3$estimate, curegd(fit_Gdagum_3$estimate)) #puntal

gama dagum defectiva parceiro

# 
# library(foreign)
# library(dplyr)
# library(survival)
# library(maxLik)
# library(numDeriv)
# #source("gammaG.R")
# 
# # Dataset -----------------------------------------------------------------
# library(readxl)
# dados=read_excel("dados2.xlsx")
# dados$Parceiro=ifelse(dados$Parceiro=="Yes",1,0)
# attach(dados)
# 
# # Groups para a variável  ------------------------------------------------------------------
# 
# dados_1=dados %>%
#   filter(Parceiro == "1") %>%
#   dplyr::select(Tempo, Cancelamento)
# 
# dados_2=dados %>%
#   filter(Parceiro == "0") %>%
#   dplyr::select(Tempo, Cancelamento)
# 
# # Loglikelihood -----------------------------------------------------------
# 
# l_dagum_1 <- function(z) loglik_dagum(z, dados_1)
# l_dagum_2 <- function(z) loglik_dagum(z, dados_2)
# 
# l_gd_1 <- function(z) loglik_Gdagum(z, dados_1)
# l_gd_2 <- function(z) loglik_Gdagum(z, dados_2)
# 
# 
# 
# # Estimation --------------------------------------------------------------
# 
# A2 <- diag(c(1, 1,0.1))
# d2 <- c(0, 0, 0)
# 
# A3 <- diag(c(1, 1, 1,1))
# d3 <- c(0, 0, 0,0)
# 
# A<-diag(c(1, 1, 1,1))
# d <- c(0, 0, 0,0)
# 
# 
# fit_dagum_1 <- maxSANN(l_dagum_1, start = c(0.5, 0.5,0.1),
#                      constraints = list(ineqA = A2, ineqB = d2))
# fit_dagum_2 <- maxSANN(l_dagum_2, start = c(0.5, 0.5,0.1),
#                      constraints = list(ineqA = A2, ineqB = d2))
# 
# fit_dagum_11<-maxBFGS(l_dagum_1, start = fit_dagum_1$estimate,
#                       constraints = list(ineqA = A2, ineqB = d2))
# fit_dagum_22<-maxBFGS(l_dagum_2, start = fit_dagum_2$estimate,
#                       constraints = list(ineqA = A2, ineqB = d2))
# # Gama Dagum Defectivo
# 
# fit_Gdagum_1 <- maxSANN(l_gd_1, start = c(1.3, 1.5,0.3,1),
#                       constraints = list(ineqA = A3, ineqB = d3))
# fit_Gdagum_2 <- maxSANN(l_gd_2, start = c(1.3, 1.5,0.3,1),
#                       constraints = list(ineqA = A3, ineqB = d3))
# 
# fit_Gdagum_11<-maxBFGS(l_gd_1, start = fit_Gdagum_1$estimate,
#                       constraints = list(ineqA = A3, ineqB = d3))
# fit_Gdagum_22<-maxBFGS(l_gd_2, start = fit_Gdagum_2$estimate,
#                       constraints = list(ineqA = A3, ineqB = d3))
# 
# 
# # AIC ---------------------------------------------------------------------
# 
# #Para o Dagum Defectiva: 𝑘=3.
# 
# #Para o Gama-Dagum Defectiva: 𝑘=4.
# 
# #group 1
# 2 * 3 - 2 * l_dagum_1(fit_dagum_1$estimate) #selected
# 2 * 4 - 2 * l_gd_1(fit_Gdagum_1$estimate)
# 
# 
# # Figures -----------------------------------------------------------------
# 
# s_dagum_1 <- Vectorize(function(z) sdagum(z, fit_dagum_11$estimate))
# s_dagum_2 <- Vectorize(function(z) sdagum(z, fit_dagum_22$estimate))
# #s_dagum_3 <- Vectorize(function(z) 1 - pdagum(z, fit_dagum_3$estimate))
# 
# s_Gdagum_1 <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum_11$estimate))
# s_Gdagum_2 <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum_22$estimate))
# #s_Gdagum_3 <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum_3$estimate))
# 
# #s_Gdagum <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum$estimate))
# 
# 
# ##Categorizado por contrato
# grid <- seq(0, 75, 0.05)
# plot(survfit(Surv(Tempo, Cancelamento) ~ Contrato, dados),
#      ylab = "survival", xlab = "time")
# lines(grid, s_dagum_1(grid), col = "darkblue", lwd = 2)
# lines(grid, s_dagum_2(grid), col = "darkgreen", lwd = 2)
# #lines(grid, s_dagum_3(grid), col = "darkred", lwd = 2)
# 
# lines(grid, s_Gdagum_1(grid), col = "red", lwd =3,lty=4)
# lines(grid, s_Gdagum_2(grid), col = "green", lwd = 3,lty=4)
# #lines(grid, s_Gdagum_3(grid), col = "blue", lwd = 3,lty=4)
# 
# 
# ##Categorizado por Parceiro
# grid <- seq(0, 75, 0.05)
# plot(survfit(Surv(Tempo, Cancelamento) ~ Parceiro, dados),
#      ylab = "survival", xlab = "time")
# lines(grid, s_dagum_1(grid), col = "darkblue", lwd = 2)
# lines(grid, s_dagum_2(grid), col = "purple", lwd = 2)
# #lines(grid, s_dagum_3(grid), col = "darkred", lwd = 2)
# 
# lines(grid, s_Gdagum_1(grid), col = "red", lwd =3,lty=4)
# lines(grid, s_Gdagum_2(grid), col = "green", lwd = 3,lty=4)
# #lines(grid, s_Gdagum_3(grid), col = "blue", lwd = 3,lty=4)
# legend("bottomright", c("Gamma-Dagum - Yes", "Gamma-Dagum - No"),
#        col = c("red", "green"), lty=c(4,4,2))
# 
# 
# 
# # Coefficients ------------------------------------------------------------
# 
# #group 1
# #c(fit_Gdagum_1$estimate, (fit_Gdagum_1$estimate)) #puntal
# #diag(deltamethod_gompertz(fit_Ggom_1$estimate, solve(-fit_Ggom_1$hessian))) #standard errors
# ##dagum
# c(fit_dagum_1$estimate, 1-fit_dagum_1$estimate[3]) #puntal
# c(fit_dagum_2$estimate, 1-fit_dagum_2$estimate[3]) #puntal
# #c(fit_dagum_3$estimate, curedagum(fit_dagum_3$estimate)) #puntal
# 
# ##Gdagum
# c(fit_Gdagum_11$estimate, curegd(fit_Gdagum_11$estimate)) #puntal
# c(fit_Gdagum_22$estimate, curegd(fit_Gdagum_22$estimate)) #puntal
# #c(fit_Gdagum_3$estimate, curegd(fit_Gdagum_3$estimate)) #puntal

gama dagum defectiva Geral

# 
# library(readxl)
# dados=read_excel("dados2.xlsx")
# 
# # Loglikelihood sem covariáveis -----------------------------------------------------------
# 
# l_dagum <- function(z) loglik_dagum(z, dados)       # Log-verossimilhança para Dagum
# l_gd <- function(z) loglik_Gdagum(z, dados)         # Log-verossimilhança para Gama-Dagum
# 
# # Estimation ------------------------------------------------------------------------------
# 
# # Restrições para os modelos
# A2 <- diag(c(1, 1, 0.1))
# d2 <- c(0, 0, 0)
# 
# A3 <- diag(c(1, 1, 1, 1))
# d3 <- c(0, 0, 0, 0)
# 
# # Ajuste do Modelo Dagum Defectiva
# fit_dagum <- maxSANN(l_dagum, start = c(0.5, 0.5, 0.1),
#                      constraints = list(ineqA = A2, ineqB = d2))
# fit_dagum_ref <- maxBFGS(l_dagum, start = fit_dagum$estimate,
#                          constraints = list(ineqA = A2, ineqB = d2))
# 
# # Ajuste do Modelo Gama-Dagum Defectiva
# fit_Gdagum <- maxSANN(l_gd, start = c(1.3, 1.5, 0.3, 1),
#                       constraints = list(ineqA = A3, ineqB = d3))
# fit_Gdagum_ref <- maxBFGS(l_gd, start = fit_Gdagum$estimate,
#                           constraints = list(ineqA = A3, ineqB = d3))
# 
# # AIC ---------------------------------------------------------------------
# 
# # Para o Dagum Defectiva: k = 3
# aic_dagum <- 2 * 3 - 2 * l_dagum(fit_dagum$estimate)
# 
# # Para o Gama-Dagum Defectiva: k = 4
# aic_gd <- 2 * 4 - 2 * l_gd(fit_Gdagum$estimate)
# 
# # Resultados
# cat("AIC - Dagum Defectiva: ", aic_dagum, "\n")
# cat("AIC - Gama-Dagum Defectiva: ", aic_gd, "\n")
# 
# # Figures -----------------------------------------------------------------
# 
# # Funções de sobrevivência estimadas
# s_dagum_1 <- Vectorize(function(z) sdagum(z, fit_dagum_ref$estimate))
# s_Gdagum_1 <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum_ref$estimate))
# 
# # Grid de tempo
# grid <- seq(0, 75, 0.05)
# 
# 
# plot(survfit(Surv(Tempo, Cancelamento) ~ 1, dados),
#      ylab = "Survival", xlab = "Time", main = "Kaplan−Meier")
# lines(grid, s_dagum_1(grid), col = "darkblue", lwd = 2)
# lines(grid, s_Gdagum_1(grid), col = "red", lwd = 3, lty = 4)
# legend("bottomright", legend = c("Dagum", "Gama-Dagum"), 
#        col = c("darkblue", "red"), lty = c(1, 4), lwd = c(2, 3))
# 
# 
# # Coefficients ------------------------------------------------------------
# 
# ## Coeficientes ajustados - Modelos Dagum
# cat("Coeficientes - Dagum Defectiva:\n", 
#     c(fit_dagum_ref$estimate, 1 - fit_dagum_ref$estimate[3]), "\n")
# 
# ## Coeficientes ajustados - Modelos Gama-Dagum
# cat("Coeficientes - Gama-Dagum Defectiva:\n", 
#     c(fit_Gdagum_ref$estimate, curegd(fit_Gdagum_ref$estimate)), "\n")
#

gama dagum defectiva servico internet

# 
# library(foreign)
# library(dplyr)
# library(survival)
# library(maxLik)
# library(numDeriv)
# #source("gammaG.R")
# 
# # Groups para a variável ServicoInternet ---------------------------------
# dados_1 <- dados %>% filter(ServicoInternet == "DSL") %>% select(Tempo, Cancelamento)
# dados_2 <- dados %>% filter(ServicoInternet == "Fiber optic") %>% select(Tempo, Cancelamento)
# dados_3 <- dados %>% filter(ServicoInternet == "No") %>% select(Tempo, Cancelamento)
# 
# # Loglikelihood -----------------------------------------------------------
# l_dagum_1 <- function(z) loglik_dagum(z, dados_1)
# l_dagum_2 <- function(z) loglik_dagum(z, dados_2)
# l_dagum_3 <- function(z) loglik_dagum(z, dados_3)
# 
# l_gd_1 <- function(z) loglik_Gdagum(z, dados_1)
# l_gd_2 <- function(z) loglik_Gdagum(z, dados_2)
# l_gd_3 <- function(z) loglik_Gdagum(z, dados_3)
# 
# # Estimation --------------------------------------------------------------
# 
# A2 <- diag(c(1, 1, 0.1))
# d2 <- c(0, 0, 0)
# 
# A3 <- diag(c(1, 1, 1, 1))
# d3 <- c(0, 0, 0, 0)
# 
# # Ajuste do Modelo Dagum Defectiva
# fit_dagum_1 <- maxSANN(l_dagum_1, start = c(0.5, 0.5, 0.1),
#                        constraints = list(ineqA = A2, ineqB = d2))
# fit_dagum_2 <- maxSANN(l_dagum_2, start = c(0.5, 0.5, 0.1),
#                        constraints = list(ineqA = A2, ineqB = d2))
# fit_dagum_3 <- maxSANN(l_dagum_3, start = c(0.5, 0.5, 0.1),
#                        constraints = list(ineqA = A2, ineqB = d2))
# 
# fit_dagum_11 <- maxBFGS(l_dagum_1, start = fit_dagum_1$estimate, constraints = list(ineqA = A2, ineqB = d2))
# fit_dagum_22 <- maxBFGS(l_dagum_2, start = fit_dagum_2$estimate, constraints = list(ineqA = A2, ineqB = d2))
# fit_dagum_33 <- maxBFGS(l_dagum_3, start = fit_dagum_3$estimate, constraints = list(ineqA = A2, ineqB = d2))
# 
# # Ajuste do Modelo Gama-Dagum Defectiva
# fit_Gdagum_1 <- maxSANN(l_gd_1, start = c(1.3, 1.5, 0.3, 1),
#                         constraints = list(ineqA = A3, ineqB = d3))
# fit_Gdagum_2 <- maxSANN(l_gd_2, start = c(1.3, 1.5, 0.3, 1),
#                         constraints = list(ineqA = A3, ineqB = d3))
# fit_Gdagum_3 <- maxSANN(l_gd_3, start = c(1.3, 1.5, 0.3, 1),
#                         constraints = list(ineqA = A3, ineqB = d3))
# 
# fit_Gdagum_11 <- maxBFGS(l_gd_1, start = fit_Gdagum_1$estimate, constraints = list(ineqA = A3, ineqB = d3))
# fit_Gdagum_22 <- maxBFGS(l_gd_2, start = fit_Gdagum_2$estimate, constraints = list(ineqA = A3, ineqB = d3))
# fit_Gdagum_33 <- maxBFGS(l_gd_3, start = fit_Gdagum_3$estimate, constraints = list(ineqA = A3, ineqB = d3))
# 
# # Figures -----------------------------------------------------------------
# 
# s_dagum_1 <- Vectorize(function(z) sdagum(z, fit_dagum_11$estimate))
# s_dagum_2 <- Vectorize(function(z) sdagum(z, fit_dagum_22$estimate))
# s_dagum_3 <- Vectorize(function(z) sdagum(z, fit_dagum_33$estimate))
# 
# s_Gdagum_1 <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum_11$estimate))
# s_Gdagum_2 <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum_22$estimate))
# s_Gdagum_3 <- Vectorize(function(z) 1 - pGdagum(z, fit_Gdagum_33$estimate))
# 
# grid <- seq(0, 75, 0.05)
# 
# ## Categorizado por ServicoInternet
# plot(survfit(Surv(Tempo, Cancelamento) ~ ServicoInternet, dados),
#      ylab = "Survival", xlab = "Time", main = "Curvas por Tipo de Serviço de Internet")
# lines(grid, s_dagum_1(grid), col = "darkblue", lwd = 2)
# lines(grid, s_dagum_2(grid), col = "darkgreen", lwd = 2)
# lines(grid, s_dagum_3(grid), col = "darkred", lwd = 2)
# 
# lines(grid, s_Gdagum_1(grid), col = "red", lwd = 3, lty = 4)
# lines(grid, s_Gdagum_2(grid), col = "green", lwd = 3, lty = 4)
# lines(grid, s_Gdagum_3(grid), col = "blue", lwd = 3, lty = 4)
# 
# legend("bottomright", 
#        legend = c("Dagum - DSL", "Dagum - Fiber optic", "Dagum - No",
#                   "Gama-Dagum - DSL", "Gama-Dagum - Fiber optic", "Gama-Dagum - No"), 
#        col = c("darkblue", "darkgreen", "darkred", "red", "green", "blue"), 
#        lty = c(1, 1, 1, 4, 4, 4), lwd = c(2, 2, 2, 3, 3, 3))
# 
# 
# # Coefficients ------------------------------------------------------------
# 
# ## Coeficientes ajustados - Modelo Dagum Defectiva
# cat("Coeficientes - Dagum Defectiva (DSL):\n", 
#     c(fit_dagum_11$estimate, 1 - fit_dagum_11$estimate[3]), "\n")
# cat("Coeficientes - Dagum Defectiva (Fiber optic):\n", 
#     c(fit_dagum_22$estimate, 1 - fit_dagum_22$estimate[3]), "\n")
# cat("Coeficientes - Dagum Defectiva (No):\n", 
#     c(fit_dagum_33$estimate, 1 - fit_dagum_33$estimate[3]), "\n")
# 
# ## Coeficientes ajustados - Modelo Gama-Dagum Defectiva
# cat("Coeficientes - Gama-Dagum Defectiva (DSL):\n", 
#     c(fit_Gdagum_11$estimate, curegd(fit_Gdagum_11$estimate)), "\n")
# cat("Coeficientes - Gama-Dagum Defectiva (Fiber optic):\n", 
#     c(fit_Gdagum_22$estimate, curegd(fit_Gdagum_22$estimate)), "\n")
# cat("Coeficientes - Gama-Dagum Defectiva (No):\n", 
#     c(fit_Gdagum_33$estimate, curegd(fit_Gdagum_33$estimate)), "\n")
# 
# # Erros padrão usando Delta Method
# ## Gama-Dagum (Fiber optic)
# cat("Erros padrão - Gama-Dagum Defectiva (Fiber optic):\n")
# print(diag(deltamethod_Gdagum(fit_Gdagum_11$estimate, solve(-fit_Gdagum_11$hessian))))
# 
# cat("Erros padrão - Gama-Dagum Defectiva (Fiber optic):\n")
# print(diag(deltamethod_Gdagum(fit_Gdagum_22$estimate, solve(-fit_Gdagum_22$hessian))))
# 
# ## Gama-Dagum (No)
# cat("Erros padrão - Gama-Dagum Defectiva (No):\n")
# print(diag(deltamethod_Gdagum(fit_Gdagum_33$estimate, solve(-fit_Gdagum_33$hessian))))

ic

1.5311 + c(-1, 1) * 1.96 * 0.0181

## [1] 1.495624 1.566576

estimativas_yes <- c(0.7220,0.02975,0.3594,0.6406) 
EP_yes <- c(1.2717,0.05596,0.1759)   

estimativas_yes - 1.96 * EP_yes

## Warning in estimativas_yes - 1.96 * EP_yes: comprimento do objeto maior não é
## múltiplo do comprimento do objeto menor

## [1] -1.7705320 -0.0799316  0.0146360 -1.8519320

estimativas_yes + 1.96 * EP_yes

## Warning in estimativas_yes + 1.96 * EP_yes: comprimento do objeto maior não é
## múltiplo do comprimento do objeto menor

## [1] 3.2145320 0.1394316 0.7041640 3.1331320

trabalho sobrevivencia

WILLIAM IRINEU

2024-04-23