Variáveis - Dicionário

Regressão Logística Binária

Passo 1: Carregar os pacotes que serão usados

if(!require(pacman)) install.packages("pacman")
## Loading required package: pacman
library(pacman)
pacman::p_load(dplyr, psych, car, MASS, DescTools, QuantPsyc, ggplot2,readxl,descriptr,esquisse,e1071, caret, hmeasure, pROC, tidyverse, broom, readr,mfx,ResourceSelection,modEvA,foreign,stargazer)

theme_set (theme_bw())

Passo 2: Carregar o banco de dados

dados <- read_excel("dados.xlsx")
#View(dados) # Visualização dos dados em janela separada
#glimpse(dados) # Visualização de um resumo dos dados

dados$om <- factor(dados$om,
                   label = c("EsSA", "CIJF", "DCMun", "AMAN","EspCEx_Bda_CoudCamp"),
                   levels = 1:5, order = F)

dados$area <- factor(dados$area)

dados$prescar <- factor(dados$prescar ,
                     label = c("não", "sim"),
                     levels = 0:1, order = F)

dados$prescar_asl_asn <- factor(dados$prescar_asl_asn ,
                     label = c("não", "sim"),
                     levels = 0:1, order = F)

dados$vegeta <- factor(dados$vegeta ,
                       label = c("primária", "secundária", "ciliar_secundária", 
                       "campos_sujos_graminosos_e_capoeiras","gramado", "pastagem"),
                       levels = 1:6, order = F)

dados$vestsil <- factor(dados$vestsil,
                        label = c("não", "sim"),
                        levels = 0:1, order = F)

dados$capivara <- factor(dados$capivara,
                        label = c("não", "sim"),
                        levels = 0:1, order = F)

dados$vestdom <- factor(dados$vestdom,
                         label = c("não", "sim"),
                         levels = 0:1, order = F)

dados$avistadom <- factor(dados$avistadom ,
                       label = c("não", "equideo", "bovino", 
                                 "equideo_bovino","canino"),
                       levels = 0:4, order = F)

dados$agua <- factor(dados$agua,
                        label = c("não", "sim"),
                        levels = 0:1, order = F)

dados$tipoagua <- factor(dados$tipoagua ,
                          label = c("não", "rio", "lagoa", 
                                    "ribeirao","brejo"),
                          levels = 0:4, order = F)
dados$queimada <- factor(dados$queimada,
                        label = c("não", "sim"),
                        levels = 0:1, order = F)
#View(dados)# Visualização dos dados em janela separada
glimpse(dados)# Visualização de um resumo dos dados
## Rows: 66
## Columns: 31
## $ om              <fct> EsSA, EsSA, EsSA, EsSA, EsSA, EsSA, EsSA, EsSA, EsS...
## $ area            <fct> 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 1...
## $ altitude        <dbl> 827, 827, 827, 827, 1148, 970, 963, 989, 979, 970, ...
## $ vegeta          <fct> ciliar_secundária, gramado, ciliar_secundária, cili...
## $ prescar         <fct> sim, sim, sim, sim, sim, sim, sim, sim, sim, sim, s...
## $ totcar          <dbl> 199, 23, 98, 77, 124, 78, 89, 34, 41, 39, 119, 12, ...
## $ totamb          <dbl> 199, 23, 98, 77, 11, 71, 46, 34, 41, 39, 119, 12, 4...
## $ totas           <dbl> 199, 23, 96, 77, 11, 71, 45, 34, 41, 39, 118, 12, 4...
## $ asl             <dbl> 6, 7, 23, 4, 0, 0, 29, 0, 0, 0, 0, 0, 0, 47, 2, 2, ...
## $ asn             <dbl> 191, 15, 71, 70, 10, 70, 15, 33, 40, 38, 115, 11, 4...
## $ prescar_asl_asn <fct> sim, sim, sim, sim, sim, sim, sim, sim, sim, sim, s...
## $ asa             <dbl> 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, ...
## $ totad           <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 1, 25,...
## $ adn             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 1, 24,...
## $ ada             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ abn             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ aba             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ aln             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ aan             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ dnl             <dbl> 0, 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ rml             <dbl> 0, 0, 0, 0, 113, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ ixol            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...
## $ hael            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ vestsil         <fct> sim, sim, sim, sim, não, não, sim, sim, sim, não, s...
## $ capivara        <fct> sim, sim, sim, sim, não, não, não, não, não, não, s...
## $ vestdom         <fct> sim, sim, não, sim, sim, sim, sim, sim, sim, sim, n...
## $ avistadom       <fct> equideo, não, não, canino, equideo_bovino, equideo_...
## $ agua            <fct> sim, sim, sim, sim, não, sim, sim, sim, sim, sim, s...
## $ tipoagua        <fct> rio, rio, rio, rio, não, ribeirao, ribeirao, ribeir...
## $ queimada        <fct> não, não, não, não, não, não, não, não, não, não, n...
## $ tot_asl_asn     <dbl> 197, 22, 94, 74, 10, 70, 44, 33, 40, 38, 115, 11, 4...

Passo 3: Análise das frequências das categorias da VD e resultados descritivos

#table(dados$prescar)
table(dados$prescar_asl_asn)
## 
## não sim 
##  15  51
#summary(dados)
# descritiva 

#ds_auto_cross_table()
#ds_auto_freq_table()
#ds_auto_group_summary()
#ds_auto_summary_stats()

#ds_auto_summary_stats(dados)

# cont x discreta
#ds_auto_group_summary(dados, tot_asl_asn, capivara)
#ds_auto_group_summary(dados, tot_asl_asn, om)
#ds_auto_group_summary(dados, tot_asl_asn, agua)

# disc x disc
#ds_auto_cross_table (dados,prescar_asl_asn,capivara,agua,om)

#ds_cross_table(dados, prescar_asl_asn,capivara)
#ds_cross_table(dados, prescar_asl_asn,agua)
#ds_cross_table(dados, prescar_asl_asn,om)

#ds_auto_freq_table(dados,prescar_asl_asn,capivara,agua,om,vegeta)
# pacote para criar gáfico com ggplot
# install.packages("esquisse")
#library(esquisse)
  # esquisser(viewer = "browser")  
# om x capivara x vegeta
ggplot(dados) +
 aes(x = om, fill = capivara) +
 geom_bar() +
 scale_fill_hue() +
 labs(x = "OM", y = "Frequência", title = "Título", subtitle = "Subtítulo", fill = "Capivara") +
 coord_flip() +
 ggthemes::theme_base() +
 facet_wrap(vars(vegeta))

# capivara x presença_asl_e_asn x OM 

ggplot(dados) +
 aes(x = om, fill = capivara) +
 geom_bar() +
 scale_fill_hue() +
 labs(x = "OM", y = "Frequência", title = "Título", subtitle = "Subtítulo", caption = "Fonte: xxx", fill = "Capivara") +
 coord_flip() +
 theme_minimal() +
 facet_wrap(vars(prescar_asl_asn))

Passo 4: Checagem das categorias de referência

#Como modificar as categorias de referência?
#ATENÇÃO: é necessário rodar o modelo novamente!
dados$prescar_asl_asn <- relevel(dados$prescar_asl_asn, ref = "não")
dados$capivara <- relevel(dados$capivara, ref = "não")
dados$vestdom <- relevel(dados$vestdom, ref = "não")
dados$tipoagua <- relevel(dados$tipoagua, ref = "não")
dados$om <- relevel(dados$om, ref = "AMAN")

levels(dados$prescar_asl_asn)  
## [1] "não" "sim"
levels(dados$capivara)  
## [1] "não" "sim"
levels(dados$vestdom)  
## [1] "não" "sim"
levels(dados$agua)  
## [1] "não" "sim"
levels(dados$tipoagua)
## [1] "não"      "rio"      "lagoa"    "ribeirao" "brejo"
levels(dados$om)
## [1] "AMAN"                "EsSA"                "CIJF"               
## [4] "DCMun"               "EspCEx_Bda_CoudCamp"

Passo 5: Modelagem

#1. Variável dependente dicotômica (categorias mutuamente exclusivas)
#2. Independência das observações (sem medidas repetidas)
#3. Ausência de outliers/ pontos de alavancagem (in the continuous predictors)
#4. Ausência de multicolinearidade
#5. Relação linear entre cada VI contínua e o logito logit(p) = log(p/(1-p)) da VD

## Construção do modelo:  

#mod <- glm(prescar_asl_asn ~ capivara + agua + vestdom + om,
#           family = binomial(link = 'logit'), data = dados)
#summary(mod)
#anova(mod, test="Chisq")

mod <- glm(prescar_asl_asn ~ capivara + agua + vestdom,
           family = binomial(link = 'logit'), data = dados)
summary(mod)
## 
## Call:
## glm(formula = prescar_asl_asn ~ capivara + agua + vestdom, family = binomial(link = "logit"), 
##     data = dados)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5783   0.1192   0.2708   0.4668   2.1612  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -2.2336     1.0080  -2.216   0.0267 *
## capivarasim   3.0601     1.1975   2.555   0.0106 *
## aguasim       2.4608     0.9652   2.549   0.0108 *
## vestdomsim    1.6559     0.9236   1.793   0.0730 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70.747  on 65  degrees of freedom
## Residual deviance: 40.716  on 62  degrees of freedom
## AIC: 48.716
## 
## Number of Fisher Scoring iterations: 6
anova(mod, test="Chisq")
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL NA NA 65 70.74671 NA
capivara 1 18.980509 64 51.76620 0.0000132
agua 1 7.287607 63 44.47859 0.0069432
vestdom 1 3.762148 62 40.71644 0.0524252
Anova(mod, type = 'II', test = "Wald")
Df Chisq Pr(>Chisq)
capivara 1 6.529635 0.0106092
agua 1 6.499812 0.0107886
vestdom 1 3.214476 0.0729896
#step.model <- mod %>% stepAIC(trace =FALSE)
#coef(step.model)
#step.model

#Criação de um segundo modelo
mod2 <- glm(prescar_asl_asn ~ capivara + agua,
            family = binomial(link = 'logit'), data = dados)
anova(mod2, test="Chisq")
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL NA NA 65 70.74671 NA
capivara 1 18.980509 64 51.76620 0.0000132
agua 1 7.287607 63 44.47859 0.0069432
Anova(mod2, type = 'II', test = "Wald")
Df Chisq Pr(>Chisq)
capivara 1 4.792092 0.0285907
agua 1 6.327058 0.0118909
anova(mod,mod2)
Resid. Df Resid. Dev Df Deviance
62 40.71644 NA NA
63 44.47859 -1 -3.762148

#Análise do modelo

## Overall effects
Anova(mod, type = 'II', test = "Wald")
Df Chisq Pr(>Chisq)
capivara 1 6.529635 0.0106092
agua 1 6.499812 0.0107886
vestdom 1 3.214476 0.0729896
## Efeitos específicos
summary(mod)
## 
## Call:
## glm(formula = prescar_asl_asn ~ capivara + agua + vestdom, family = binomial(link = "logit"), 
##     data = dados)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5783   0.1192   0.2708   0.4668   2.1612  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -2.2336     1.0080  -2.216   0.0267 *
## capivarasim   3.0601     1.1975   2.555   0.0106 *
## aguasim       2.4608     0.9652   2.549   0.0108 *
## vestdomsim    1.6559     0.9236   1.793   0.0730 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70.747  on 65  degrees of freedom
## Residual deviance: 40.716  on 62  degrees of freedom
## AIC: 48.716
## 
## Number of Fisher Scoring iterations: 6
## Obtenção das razões de chance com IC 95% (usando log-likelihood)
exp(cbind(OR = coef(mod), confint(mod)))
## Waiting for profiling to be done...
##                     OR      2.5 %      97.5 %
## (Intercept)  0.1071382 0.01059831   0.6086829
## capivarasim 21.3296503 2.73934437 458.7842053
## aguasim     11.7143499 2.07846846 103.7606635
## vestdomsim   5.2376978 0.98381710  42.2201756
## Obtenção das razões de chance com IC 95% (usando erro padrão = SPSS)
exp(cbind(OR = coef(mod), confint.default(mod)))
##                     OR      2.5 %      97.5 %
## (Intercept)  0.1071382 0.01485799   0.7725541
## capivarasim 21.3296503 2.04001691 223.0148098
## aguasim     11.7143499 1.76651485  77.6817670
## vestdomsim   5.2376978 0.85701434  32.0105238
hoslem.test(dados$prescar_asl_asn, fitted(mod),g=5)
## Warning in Ops.factor(1, y): '-' not meaningful for factors
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  dados$prescar_asl_asn, fitted(mod)
## X-squared = 66, df = 3, p-value = 3.064e-14
## Overall effects
Anova(mod2, type="II", test="Wald")
Df Chisq Pr(>Chisq)
capivara 1 4.792092 0.0285907
agua 1 6.327058 0.0118909
## Efeitos específicos
summary(mod2)
## 
## Call:
## glm(formula = prescar_asl_asn ~ capivara + agua, family = binomial(link = "logit"), 
##     data = dados)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6666   0.2408   0.2408   0.6463   1.6651  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -1.0986     0.6667  -1.648   0.0994 .
## capivarasim   2.4967     1.1405   2.189   0.0286 *
## aguasim       2.1282     0.8461   2.515   0.0119 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70.747  on 65  degrees of freedom
## Residual deviance: 44.479  on 63  degrees of freedom
## AIC: 50.479
## 
## Number of Fisher Scoring iterations: 6
## Obtenção das razões de chance com IC 95% (usando log-likelihood)
exp(cbind(OR = coef(mod2), confint(mod2)))
## Waiting for profiling to be done...
##                     OR     2.5 %     97.5 %
## (Intercept)  0.3333333 0.0739549   1.117189
## capivarasim 12.1428571 1.7543842 244.104446
## aguasim      8.4000000 1.7532008  51.540358
## Obtenção das razões de chance com IC 95% (usando erro padrão = SPSS)
exp(cbind(OR = coef(mod2), confint.default(mod2)))
##                     OR      2.5 %     97.5 %
## (Intercept)  0.3333333 0.09024249   1.231251
## capivarasim 12.1428571 1.29864533 113.540607
## aguasim      8.4000000 1.59986668  44.103675
hoslem.test(dados$prescar_asl_asn, fitted(mod2),g=5)
## Warning in Ops.factor(1, y): '-' not meaningful for factors
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  dados$prescar_asl_asn, fitted(mod2)
## X-squared = 66, df = 3, p-value = 3.064e-14
#Avaliação da qualidade e comparação entre modelos
## Pseudo R-quadrado
PseudoR2(mod, which = "Nagelkerke")
## Nagelkerke 
##  0.5558503
PseudoR2(mod, which = "Nagelkerke")
## Nagelkerke 
##  0.5558503
PseudoR2(mod, which = "all")
##        McFadden     McFaddenAdj        CoxSnell      Nagelkerke   AldrichNelson 
##       0.4244758       0.3113963       0.3655546       0.5558503       0.3127167 
## VeallZimmermann           Efron McKelveyZavoina            Tjur             AIC 
##       0.6044518       0.4620491       0.5650231       0.4614323      48.7164411 
##             BIC          logLik         logLik0              G2 
##      57.4750601     -20.3582206     -35.3733527      30.0302642
PseudoR2(mod2, which = "all")
##        McFadden     McFaddenAdj        CoxSnell      Nagelkerke   AldrichNelson 
##       0.3712981       0.2864885       0.3283392       0.4992617       0.2846933 
## VeallZimmermann           Efron McKelveyZavoina            Tjur             AIC 
##       0.5502853       0.4042194       0.4970415       0.4042194      50.4785890 
##             BIC          logLik         logLik0              G2 
##      57.0475532     -22.2392945     -35.3733527      26.2681164
# Comparação de modelos
## AIC e BIC
AIC(mod, mod2)
df AIC
mod 4 48.71644
mod2 3 50.47859
BIC(mod, mod2)
df BIC
mod 4 57.47506
mod2 3 57.04755
## Qui-quadrado
anova(mod2, mod, test="Chisq")
Resid. Df Resid. Dev Df Deviance Pr(>Chi)
63 44.47859 NA NA NA
62 40.71644 1 3.762148 0.0524252
# Tabela de classificação
ClassLog(mod, dados$prescar_asl_asn)
## $rawtab
##        resp
##         não sim
##   FALSE   9   3
##   TRUE    6  48
## 
## $classtab
##        resp
##                não        sim
##   FALSE 0.60000000 0.05882353
##   TRUE  0.40000000 0.94117647
## 
## $overall
## [1] 0.8636364
## 
## $mcFadden
## [1] 0.4244758
ClassLog(mod2, dados$prescar_asl_asn)
## $rawtab
##        resp
##         não sim
##   FALSE   9   3
##   TRUE    6  48
## 
## $classtab
##        resp
##                não        sim
##   FALSE 0.60000000 0.05882353
##   TRUE  0.40000000 0.94117647
## 
## $overall
## [1] 0.8636364
## 
## $mcFadden
## [1] 0.3712981
levels(dados$prescar_asl_asn)
## [1] "não" "sim"
contrasts(dados$prescar_asl_asn)
##     sim
## não   0
## sim   1
# Make predictions
probabilities <- mod %>%
  predict(dados, type ="response")

predicted.classes <- ifelse(probabilities >0.5, "sim", "não")
#predicted.classes

dados$predicted <- factor(predicted.classes)
                             
dados$probabilities  <- probabilities

# Model accuracy
mean(dados$predicted==dados$prescar_asl_asn)
## [1] 0.8636364
# Código similar para calcular prob e classificar o ou 1
#predicoes <- predict(mod)
#probEstimada <- 1/(1+exp(-predicoes))
#classeEstimada <- round(probEstimada,0)
#dados$classeEstimada <- factor(classeEstimada,
#                         label = c("não", "sim"),
#                         levels = 0:1, order = F)
#dados$prescar_asl_asn 
#Model accuracy
#mean(dados$classeEstimada == dados$prescar_asl_asn)

Logistic regression diagnostics #Variável dependente dicotômica e ME #Linearity assumption

# 1.Remove qualitative variables from the original data frame and bind the logit values to the data: # Select only numeric predictors (apenas para veriável independente contínua) #If the scatter plot shows non-linearity, you need other methods to build the model #such as including 2 or 3-power terms, fractional polynomials and spline function

 #não é o caso em nossa análise, pois não há Xi numeric. Exemplo de Cód. 
mydata <- dados %>%
  dplyr::select_if(is.numeric) 
predictors <- colnames(mydata)
# Bind the logit and tidying the data for plot
mydata <- mydata %>%
  mutate(logit =log(probabilities/(1-probabilities))) %>%
  gather(key ="predictors", value ="predictor.value", -logit)
## Warning: attributes are not identical across measure variables;
## they will be dropped
  # 2.Create the scatter plots:

ggplot(mydata, aes(logit, predictor.value))+
  geom_point(size =0.5, alpha =0.5) +
  geom_smooth(method ="loess") +
  theme_bw() +
  facet_wrap(~predictors, scales ="free_y")
## `geom_smooth()` using formula 'y ~ x'

#Influential values

plot(mod, which =4, id.n =3)

#The most extreme values in the data can be examined by visualizing the Cook's distance values. Here we label the top 3 largest values
plot(mod, which = 5)

summary(stdres(mod))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -5.34800  0.08625  0.19978  0.02338  0.35808  3.24796
# Extract model results
model.data <- augment(mod) %>%
  mutate(index =1:n())

#The data for the top 3 largest values, according to the Cook's distance,
# can be displayed as follow:
model.data %>%
  top_n(5, .cooksd)
prescar_asl_asn capivara agua vestdom .fitted .resid .std.resid .hat .sigma .cooksd index
sim não não sim -0.5777534 1.430511 1.526075 0.1213204 0.7932867 0.0700048 5
sim não não sim -0.5777534 1.430511 1.526075 0.1213204 0.7932867 0.0700048 30
não não sim sim 1.8830612 -2.012296 -2.088720 0.0718389 0.7719858 0.1370428 33
sim não não não -2.2336354 2.161210 2.264077 0.0888045 0.7638384 0.2495785 36
não sim sim não 3.2872773 -2.578353 -2.626149 0.0360686 0.7445957 0.2597907 45
#Plot the standardized residuals:
  ggplot(model.data, aes(index, .std.resid)) +
    geom_point(aes(color =dados$prescar_asl_asn),size=4 ,alpha =.5) +
    theme_bw()

#Filter potential influential data points with abs(.std.res) > 3 :

model.data %>%
  filter(abs(.std.resid) >3)  #There is no influential observations in our data.
prescar_asl_asn capivara agua vestdom .fitted .resid .std.resid .hat .sigma .cooksd index
#When you have outliers in a continuous predictor, potential solutions include:
#Removing the concerned records
#Transform the data into log scale
#Use non parametric methods

#Multicollinearity

car::vif(mod)
## capivara     agua  vestdom 
## 1.171065 1.283017 1.327852
#As a rule of thumb, a VIF value that exceeds 5 or 10 indicates a problematic amount of collinearity. In our example, there is no collinearity: all variables have a value of VIF well below 5.

#Confusion matrix

#The R function table() can be used to produce a confusion matrix in order to determine how many observations were correctly or incorrectly classified. It compares the observed and the predicted outcome values and shows the number of correct and incorrect predictions categorized by type of outcome.

table(dados$prescar_asl_asn, dados$predicted)
##      
##       não sim
##   não   9   6
##   sim   3  48
# Confusion matrix, proportion of cases
table(dados$prescar_asl_asn, dados$predicted) %>%
  prop.table() %>%
  round(digits =3)
##      
##         não   sim
##   não 0.136 0.091
##   sim 0.045 0.727
# Confusion matrix, proportion of cases (linha)
table(dados$prescar_asl_asn, dados$predicted) %>%
  prop.table(margin =1 ) %>%
  round(digits =3)
##      
##         não   sim
##   não 0.600 0.400
##   sim 0.059 0.941
# Confusion matrix, proportion of cases (coluna)
table(dados$prescar_asl_asn, dados$predicted) %>%
  prop.table(margin =2 ) %>%
  round(digits =3)
##      
##         não   sim
##   não 0.750 0.111
##   sim 0.250 0.889
#The diagonal elements of the confusion matrix indicate correct predictions, while the off-diagonals represent
#incorrect predictions. So, the correct classification rate is the sum of the number on the diagonal divided by the sample size in the test data. In our example, that is (48 + 15)/78 = 81%.


#True positives (d): these are cases in which we predicted the individuals would be diabetes-positive and they were.
#True negatives (a): We predicted diabetes-negative, and the individuals were diabetes-negative.
#False positives (b): We predicted diabetes-positive, but the individuals didn't actually have diabetes. (Also known as a Type I error .)
#False negatives (c): We predicted diabetes-negative, but they did have diabetes. (Also known as a Type II error .)
#Technically the raw prediction accuracy of the model is defined as (TruePositives + TrueNegatives)/SampleSize .


#Precision, Recall and Specificity
#In addition to the raw classification accuracy, there are many other metrics that are widely used to examine the performance of a classification model, including:
#Precision , which is the proportion of true positives among all the individuals that have been predicted to be diabetes-positive by the model. This represents the accuracy of a predicted positive outcome. Precision = TruePositives/(TruePositives + FalsePositives) .
#Sensitivity (or Recall ), which is the True Positive Rate (TPR) or the proportion of identified positives among the diabetes-positive population (class = 1). Sensitivity = TruePositives/(TruePositives + FalseNegatives) .
#Specificity , which measures the True Negative Rate (TNR), that is the proportion of identified negatives among the diabetes-negative population (class = 0). Specificity = TrueNegatives/(TrueNegatives + FalseNegatives) .
#False Positive Rate (FPR), which represents the proportion of identified positives among the healthy individuals (i.e. diabetes-negative). This can be seen as a false alarm. The FPR can be also calculated as 1-specificity . When positives are rare, the FPR can be high, leading to the situation where a predicted positive is most likely a negative.
#Sensitivy and Specificity are commonly used to measure the performance of a predictive model.


confusionMatrix(dados$prescar_asl_asn, dados$predicted, positive ="sim")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction não sim
##        não   9   6
##        sim   3  48
##                                           
##                Accuracy : 0.8636          
##                  95% CI : (0.7569, 0.9357)
##     No Information Rate : 0.8182          
##     P-Value [Acc > NIR] : 0.2161          
##                                           
##                   Kappa : 0.5823          
##                                           
##  Mcnemar's Test P-Value : 0.5050          
##                                           
##             Sensitivity : 0.8889          
##             Specificity : 0.7500          
##          Pos Pred Value : 0.9412          
##          Neg Pred Value : 0.6000          
##              Prevalence : 0.8182          
##          Detection Rate : 0.7273          
##    Detection Prevalence : 0.7727          
##       Balanced Accuracy : 0.8194          
##                                           
##        'Positive' Class : sim             
## 

Suppose a 2x2 table with notation Reference
Predicted Event No Event Event A B No Event C D The formulas used here are: Sensitivity = A/(A+C) Specificity = D/(B+D) Prevalence = (A+C)/(A+B+C+D) PPV = (sensitivity * prevalence)/((sensitivityprevalence) + ((1-specificity)(1-prevalence))) NPV = (specificity * (1-prevalence))/(((1-sensitivity)prevalence) + ((specificity)(1-prevalence))) Detection Rate = A/(A+B+C+D) Detection Prevalence = (A+B)/(A+B+C+D) Balanced Accuracy = (sensitivity+specificity)/2 Precision = A/(A+B) Recall = A/(A+C) F1 = (1+beta2)precisionrecall/((beta2 * precision)+recall)

#ROC curve
#The ROC curve (or receiver operating characteristics curve ) is a popular graphical measure for assessing the performance or the accuracy of a classifier, which corresponds to the total proportion of correctly classified observations.
#For example, the accuracy of a medical diagnostic test can be assessed by considering the two possible types of errors: false positives, and false negatives. In classification point of view, the test will be declared positive when the corresponding predicted probability, returned by the classifier algorithm, is above a fixed threshold. This threshold is generally set to 0.5 (i.e., 50%), which corresponds to the random guessing probability.
#So, in reference to our diabetes data example, for a given fixed probability cutoff:
# the true positive rate (or fraction) is the proportion of identified positives among the diabetes-positive population. Recall that, this is also known as the sensitivity of the predictive classifier model.
#and the false positive rate is the proportion of identified positives among the healthy (i.e. diabetes-negative) individuals. This is also defined as 1-specificity , where specificity measures the true negative rate , that is the proportion of identified negatives among the diabetes-negative population.
#Since we don't usually know the probability cutoff in advance, the ROC curve is typically used to plot the true positive rate (or sensitivity on y-axis) against the false positive rate (or "1-specificity" on x-axis) at all possible probability cutoffs. This shows the trade off between the rate at which you can correctly predict something with the rate of incorrectly predicting something. Another visual representation of the ROC plot is to simply display the sensitive against the specificity.
#The Area Under the Curve (AUC ) summarizes the overall performance of the classifier, over all possible probability cutoffs. It represents the ability of a classification algorithm to distinguish 1s from 0s (i.e, events from non-events or positives from negatives).

#For a good model, the ROC curve should rise steeply, indicating that the true positive rate (y-axis) increases faster than the false positive rate (x-axis) as the probability threshold decreases.
#So, the "ideal point" is the top left corner of the graph, that is a false positive rate of zero, and a true positive rate of one. This is not very realistic, but it does mean that the larger the AUC the better the classifier.
#The AUC metric varies between 0.50 (random classifier) and 1.00. Values above 0.80 is an indication of a good classifier.

# Compute roc
res.roc <- roc(dados$prescar_asl_asn, dados$probabilities)
## Setting levels: control = não, case = sim
## Setting direction: controls < cases
plot.roc(res.roc, print.auc =TRUE,col="black",lwd=2)

#summary(res.roc)

#The gray diagonal line represents a classifier no better than random chance.
#A highly performant classifier will have an ROC that rises steeply to the top-left corner, that is it will correctly identify lots of positives without misclassifying lots of negatives as positives.
#In our example, the AUC is 0.85, which is close to the maximum ( max = 1). So, our classifier can be considered as very good. A classifier that performs no better than chance is expected to have an AUC of 0.5 when evaluated on an independent test set not used to train the model.
#If we want a classifier model with a specificity of at least 60%, then the sensitivity is about 0.88%. The corresponding probability threshold can be extract as follow:
 
# Extract some interesting results
roc.data <-data_frame(thresholds =res.roc$thresholds,sensitivity =res.roc$sensitivities,specificity =res.roc$specificities)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Get the probability threshold for specificity = 0.6
roc.data %>%filter(specificity >=0.6)
thresholds sensitivity specificity
0.4580007 0.9411765 0.6000000
0.7122571 0.8627451 0.8666667
0.9159761 0.6666667 0.9333333
0.9784541 0.1764706 1.0000000
Inf 0.0000000 1.0000000
roc(dados$prescar_asl_asn,dados$probabilities) %>%
    coords(ret = "all", transpose = FALSE) %>%
    filter(sensitivity >= 0.7,
           specificity >= 0.7)
## Setting levels: control = não, case = sim
## Setting direction: controls < cases
threshold specificity sensitivity accuracy tn tp fn fp npv ppv fdr fpr tpr tnr fnr 1-specificity 1-sensitivity 1-accuracy 1-npv 1-ppv precision recall youden closest.topleft
0.7122571 0.8666667 0.8627451 0.8636364 13 44 7 2 0.65 0.9565217 0.0434783 0.1333333 0.8627451 0.8666667 0.1372549 0.1333333 0.1372549 0.1363636 0.35 0.0434783 0.9565217 0.8627451 1.729412 0.0366167
#The best threshold with the highest sum sensitivity + specificity can be printed as follow. There might be more than one threshold.

plot.roc(res.roc, print.auc =TRUE, print.thres ="best")

#Here, the best probability cutoff is xxx resulting to a predictive classifier with a specificity of xxx and a sensitivity of xxx.

#Note that, print.thres can be also a numeric vector containing a direct definition of the thresholds to display:

plot.roc(res.roc, print.thres =c(0.3, 0.5, 0.7))