Regressão Logística Binária
Passo 1: Carregar os pacotes que serão usados
Passo 2: Carregar o banco de dados
Passo 3: Análise das frequências das categorias da VD e resultados descritivos
Passo 4: Checagem das categorias de referência
Passo 5: Modelagem

Variáveis - Dicionário

om Organização militar da coleta
area Área de instrução (numerei como casa da centena sendo o número da OM e dezena e unidade o número da área na OM)
altitude altimetria metros em relação ao nível do mar
prescar presença de carrapatos na área de instrução
totcar total de carrapatos coletados na área de instrução
totamb Total de carrapatos do gênero Amblyomma coletados na área
totas total de carrapatos da espécie Amblyomma sculptum coletados na área
asl total de carrapatos Amblyomma na fase larval coletados na área
asn total de carrapatos Amblyomma sculptum na fase ninfal coletados na área
asa total de carrapatos Amblyomma sculptum na fase adulta coletados na área
totad Total de carrapatos Amblyomma dubitatum coletados na área
adn total de carrapatos Amblyomma dubitatum na fase ninfal coletados na área
ada total de carrapatos Amblyomma dubitatum na fase adulta coletados na área
abn total de carrapatos Amblyomma brasiliense na fase ninfal coletados na área
aba total de carrapatos Amblyomma brasiliense na fase adulta coletados na área
aln total de carrapatos Amblyomma longirostre na fase ninfal coletados na área
aan total de carrapatos Amblyomma aureolatum na fase ninfal coletados na área
dnl total de carrapatos Dermacentor nittens na fase larval coletados na área
rml total de carrapatos Rhipicephalus microplus na fase larval coletados na área
ixol total de carrapatos Ixodes na fase larval coletados na área
hael total de carrapatos Haemaphysalis na fase larval coletados na área
vegeta tipo de vegetação predominante na área de instrução
vestsil vestígio (fezes, pegadas, trilhas) ou avistamento de animais silvestres (que não seja capivara) verificados na área
capivara vestígio ou avistamento de capivara verificados na área
vestdom vestígio de animais domésticos verificados na área
avistdom avistamento de animais domésticos por espécie
agua presença ou ausência de curso d’água nas proximidades da área
tipoagua tipo de curso de água verificado na área
queimada presença de vestígio de queimada recente na área
prescar_asl_asn presença de carrapatos asl e asn na área de instrução
tot_asl_asn total de carrapatos asl e asn na área de instrução

Regressão Logística Binária

Passo 1: Carregar os pacotes que serão usados

if(!require(pacman)) install.packages("pacman")

## Loading required package: pacman

library(pacman)
pacman::p_load(dplyr, psych, car, MASS, DescTools, QuantPsyc, ggplot2,readxl,descriptr,esquisse,e1071, caret, hmeasure, pROC, tidyverse, broom, readr,mfx,ResourceSelection,modEvA,foreign,stargazer)

theme_set (theme_bw())

Passo 2: Carregar o banco de dados

dados <- read_excel("dados.xlsx")
#View(dados) # Visualização dos dados em janela separada
#glimpse(dados) # Visualização de um resumo dos dados

dados$om <- factor(dados$om,
                   label = c("EsSA", "CIJF", "DCMun", "AMAN","EspCEx_Bda_CoudCamp"),
                   levels = 1:5, order = F)

dados$area <- factor(dados$area)

dados$prescar <- factor(dados$prescar ,
                     label = c("não", "sim"),
                     levels = 0:1, order = F)

dados$prescar_asl_asn <- factor(dados$prescar_asl_asn ,
                     label = c("não", "sim"),
                     levels = 0:1, order = F)

dados$vegeta <- factor(dados$vegeta ,
                       label = c("primária", "secundária", "ciliar_secundária", 
                       "campos_sujos_graminosos_e_capoeiras","gramado", "pastagem"),
                       levels = 1:6, order = F)

dados$vestsil <- factor(dados$vestsil,
                        label = c("não", "sim"),
                        levels = 0:1, order = F)

dados$capivara <- factor(dados$capivara,
                        label = c("não", "sim"),
                        levels = 0:1, order = F)

dados$vestdom <- factor(dados$vestdom,
                         label = c("não", "sim"),
                         levels = 0:1, order = F)

dados$avistadom <- factor(dados$avistadom ,
                       label = c("não", "equideo", "bovino", 
                                 "equideo_bovino","canino"),
                       levels = 0:4, order = F)

dados$agua <- factor(dados$agua,
                        label = c("não", "sim"),
                        levels = 0:1, order = F)

dados$tipoagua <- factor(dados$tipoagua ,
                          label = c("não", "rio", "lagoa", 
                                    "ribeirao","brejo"),
                          levels = 0:4, order = F)
dados$queimada <- factor(dados$queimada,
                        label = c("não", "sim"),
                        levels = 0:1, order = F)
#View(dados)# Visualização dos dados em janela separada
glimpse(dados)# Visualização de um resumo dos dados

## Rows: 66
## Columns: 31
## $ om              <fct> EsSA, EsSA, EsSA, EsSA, EsSA, EsSA, EsSA, EsSA, EsS...
## $ area            <fct> 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 1...
## $ altitude        <dbl> 827, 827, 827, 827, 1148, 970, 963, 989, 979, 970, ...
## $ vegeta          <fct> ciliar_secundária, gramado, ciliar_secundária, cili...
## $ prescar         <fct> sim, sim, sim, sim, sim, sim, sim, sim, sim, sim, s...
## $ totcar          <dbl> 199, 23, 98, 77, 124, 78, 89, 34, 41, 39, 119, 12, ...
## $ totamb          <dbl> 199, 23, 98, 77, 11, 71, 46, 34, 41, 39, 119, 12, 4...
## $ totas           <dbl> 199, 23, 96, 77, 11, 71, 45, 34, 41, 39, 118, 12, 4...
## $ asl             <dbl> 6, 7, 23, 4, 0, 0, 29, 0, 0, 0, 0, 0, 0, 47, 2, 2, ...
## $ asn             <dbl> 191, 15, 71, 70, 10, 70, 15, 33, 40, 38, 115, 11, 4...
## $ prescar_asl_asn <fct> sim, sim, sim, sim, sim, sim, sim, sim, sim, sim, s...
## $ asa             <dbl> 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, ...
## $ totad           <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 1, 25,...
## $ adn             <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 1, 24,...
## $ ada             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ abn             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ aba             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ aln             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ aan             <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ dnl             <dbl> 0, 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ rml             <dbl> 0, 0, 0, 0, 113, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ ixol            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...
## $ hael            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ vestsil         <fct> sim, sim, sim, sim, não, não, sim, sim, sim, não, s...
## $ capivara        <fct> sim, sim, sim, sim, não, não, não, não, não, não, s...
## $ vestdom         <fct> sim, sim, não, sim, sim, sim, sim, sim, sim, sim, n...
## $ avistadom       <fct> equideo, não, não, canino, equideo_bovino, equideo_...
## $ agua            <fct> sim, sim, sim, sim, não, sim, sim, sim, sim, sim, s...
## $ tipoagua        <fct> rio, rio, rio, rio, não, ribeirao, ribeirao, ribeir...
## $ queimada        <fct> não, não, não, não, não, não, não, não, não, não, n...
## $ tot_asl_asn     <dbl> 197, 22, 94, 74, 10, 70, 44, 33, 40, 38, 115, 11, 4...

Passo 3: Análise das frequências das categorias da VD e resultados descritivos

#table(dados$prescar)
table(dados$prescar_asl_asn)

## 
## não sim 
##  15  51

#summary(dados)

# descritiva 

#ds_auto_cross_table()
#ds_auto_freq_table()
#ds_auto_group_summary()
#ds_auto_summary_stats()

#ds_auto_summary_stats(dados)

# cont x discreta
#ds_auto_group_summary(dados, tot_asl_asn, capivara)
#ds_auto_group_summary(dados, tot_asl_asn, om)
#ds_auto_group_summary(dados, tot_asl_asn, agua)

# disc x disc
#ds_auto_cross_table (dados,prescar_asl_asn,capivara,agua,om)

#ds_cross_table(dados, prescar_asl_asn,capivara)
#ds_cross_table(dados, prescar_asl_asn,agua)
#ds_cross_table(dados, prescar_asl_asn,om)

#ds_auto_freq_table(dados,prescar_asl_asn,capivara,agua,om,vegeta)

# pacote para criar gáfico com ggplot
# install.packages("esquisse")
#library(esquisse)
  # esquisser(viewer = "browser")

# om x capivara x vegeta
ggplot(dados) +
 aes(x = om, fill = capivara) +
 geom_bar() +
 scale_fill_hue() +
 labs(x = "OM", y = "Frequência", title = "Título", subtitle = "Subtítulo", fill = "Capivara") +
 coord_flip() +
 ggthemes::theme_base() +
 facet_wrap(vars(vegeta))

# capivara x presença_asl_e_asn x OM 

ggplot(dados) +
 aes(x = om, fill = capivara) +
 geom_bar() +
 scale_fill_hue() +
 labs(x = "OM", y = "Frequência", title = "Título", subtitle = "Subtítulo", caption = "Fonte: xxx", fill = "Capivara") +
 coord_flip() +
 theme_minimal() +
 facet_wrap(vars(prescar_asl_asn))

Passo 4: Checagem das categorias de referência

#Como modificar as categorias de referência?
#ATENÇÃO: é necessário rodar o modelo novamente!
dados$prescar_asl_asn <- relevel(dados$prescar_asl_asn, ref = "não")
dados$capivara <- relevel(dados$capivara, ref = "não")
dados$vestdom <- relevel(dados$vestdom, ref = "não")
dados$tipoagua <- relevel(dados$tipoagua, ref = "não")
dados$om <- relevel(dados$om, ref = "AMAN")

levels(dados$prescar_asl_asn)

## [1] "não" "sim"

levels(dados$capivara)

## [1] "não" "sim"

levels(dados$vestdom)

## [1] "não" "sim"

levels(dados$agua)

## [1] "não" "sim"

levels(dados$tipoagua)

## [1] "não"      "rio"      "lagoa"    "ribeirao" "brejo"

levels(dados$om)

## [1] "AMAN"                "EsSA"                "CIJF"               
## [4] "DCMun"               "EspCEx_Bda_CoudCamp"

Passo 5: Modelagem

#1. Variável dependente dicotômica (categorias mutuamente exclusivas)
#2. Independência das observações (sem medidas repetidas)
#3. Ausência de outliers/ pontos de alavancagem (in the continuous predictors)
#4. Ausência de multicolinearidade
#5. Relação linear entre cada VI contínua e o logito logit(p) = log(p/(1-p)) da VD

## Construção do modelo:  

#mod <- glm(prescar_asl_asn ~ capivara + agua + vestdom + om,
#           family = binomial(link = 'logit'), data = dados)
#summary(mod)
#anova(mod, test="Chisq")

mod <- glm(prescar_asl_asn ~ capivara + agua + vestdom,
           family = binomial(link = 'logit'), data = dados)
summary(mod)

## 
## Call:
## glm(formula = prescar_asl_asn ~ capivara + agua + vestdom, family = binomial(link = "logit"), 
##     data = dados)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5783   0.1192   0.2708   0.4668   2.1612  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -2.2336     1.0080  -2.216   0.0267 *
## capivarasim   3.0601     1.1975   2.555   0.0106 *
## aguasim       2.4608     0.9652   2.549   0.0108 *
## vestdomsim    1.6559     0.9236   1.793   0.0730 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70.747  on 65  degrees of freedom
## Residual deviance: 40.716  on 62  degrees of freedom
## AIC: 48.716
## 
## Number of Fisher Scoring iterations: 6

anova(mod, test="Chisq")

	Df	Deviance	Resid. Df	Resid. Dev	Pr(>Chi)
NULL	NA	NA	65	70.74671	NA
capivara	1	18.980509	64	51.76620	0.0000132
agua	1	7.287607	63	44.47859	0.0069432
vestdom	1	3.762148	62	40.71644	0.0524252

Anova(mod, type = 'II', test = "Wald")

	Df	Chisq	Pr(>Chisq)
capivara	1	6.529635	0.0106092
agua	1	6.499812	0.0107886
vestdom	1	3.214476	0.0729896

#step.model <- mod %>% stepAIC(trace =FALSE)
#coef(step.model)
#step.model

#Criação de um segundo modelo
mod2 <- glm(prescar_asl_asn ~ capivara + agua,
            family = binomial(link = 'logit'), data = dados)
anova(mod2, test="Chisq")

	Df	Deviance	Resid. Df	Resid. Dev	Pr(>Chi)
NULL	NA	NA	65	70.74671	NA
capivara	1	18.980509	64	51.76620	0.0000132
agua	1	7.287607	63	44.47859	0.0069432

Anova(mod2, type = 'II', test = "Wald")

	Df	Chisq	Pr(>Chisq)
capivara	1	4.792092	0.0285907
agua	1	6.327058	0.0118909

anova(mod,mod2)

Resid. Df	Resid. Dev	Df	Deviance
62	40.71644	NA	NA
63	44.47859	-1	-3.762148

#Análise do modelo

## Overall effects
Anova(mod, type = 'II', test = "Wald")

	Df	Chisq	Pr(>Chisq)
capivara	1	6.529635	0.0106092
agua	1	6.499812	0.0107886
vestdom	1	3.214476	0.0729896

## Efeitos específicos
summary(mod)

## 
## Call:
## glm(formula = prescar_asl_asn ~ capivara + agua + vestdom, family = binomial(link = "logit"), 
##     data = dados)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5783   0.1192   0.2708   0.4668   2.1612  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -2.2336     1.0080  -2.216   0.0267 *
## capivarasim   3.0601     1.1975   2.555   0.0106 *
## aguasim       2.4608     0.9652   2.549   0.0108 *
## vestdomsim    1.6559     0.9236   1.793   0.0730 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70.747  on 65  degrees of freedom
## Residual deviance: 40.716  on 62  degrees of freedom
## AIC: 48.716
## 
## Number of Fisher Scoring iterations: 6

## Obtenção das razões de chance com IC 95% (usando log-likelihood)
exp(cbind(OR = coef(mod), confint(mod)))

## Waiting for profiling to be done...

##                     OR      2.5 %      97.5 %
## (Intercept)  0.1071382 0.01059831   0.6086829
## capivarasim 21.3296503 2.73934437 458.7842053
## aguasim     11.7143499 2.07846846 103.7606635
## vestdomsim   5.2376978 0.98381710  42.2201756

## Obtenção das razões de chance com IC 95% (usando erro padrão = SPSS)
exp(cbind(OR = coef(mod), confint.default(mod)))

##                     OR      2.5 %      97.5 %
## (Intercept)  0.1071382 0.01485799   0.7725541
## capivarasim 21.3296503 2.04001691 223.0148098
## aguasim     11.7143499 1.76651485  77.6817670
## vestdomsim   5.2376978 0.85701434  32.0105238

hoslem.test(dados$prescar_asl_asn, fitted(mod),g=5)

## Warning in Ops.factor(1, y): '-' not meaningful for factors

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  dados$prescar_asl_asn, fitted(mod)
## X-squared = 66, df = 3, p-value = 3.064e-14

## Overall effects
Anova(mod2, type="II", test="Wald")

	Df	Chisq	Pr(>Chisq)
capivara	1	4.792092	0.0285907
agua	1	6.327058	0.0118909

## Efeitos específicos
summary(mod2)

## 
## Call:
## glm(formula = prescar_asl_asn ~ capivara + agua, family = binomial(link = "logit"), 
##     data = dados)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6666   0.2408   0.2408   0.6463   1.6651  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -1.0986     0.6667  -1.648   0.0994 .
## capivarasim   2.4967     1.1405   2.189   0.0286 *
## aguasim       2.1282     0.8461   2.515   0.0119 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 70.747  on 65  degrees of freedom
## Residual deviance: 44.479  on 63  degrees of freedom
## AIC: 50.479
## 
## Number of Fisher Scoring iterations: 6

## Obtenção das razões de chance com IC 95% (usando log-likelihood)
exp(cbind(OR = coef(mod2), confint(mod2)))

## Waiting for profiling to be done...

##                     OR     2.5 %     97.5 %
## (Intercept)  0.3333333 0.0739549   1.117189
## capivarasim 12.1428571 1.7543842 244.104446
## aguasim      8.4000000 1.7532008  51.540358

## Obtenção das razões de chance com IC 95% (usando erro padrão = SPSS)
exp(cbind(OR = coef(mod2), confint.default(mod2)))

##                     OR      2.5 %     97.5 %
## (Intercept)  0.3333333 0.09024249   1.231251
## capivarasim 12.1428571 1.29864533 113.540607
## aguasim      8.4000000 1.59986668  44.103675

hoslem.test(dados$prescar_asl_asn, fitted(mod2),g=5)

## Warning in Ops.factor(1, y): '-' not meaningful for factors

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  dados$prescar_asl_asn, fitted(mod2)
## X-squared = 66, df = 3, p-value = 3.064e-14

#Avaliação da qualidade e comparação entre modelos
## Pseudo R-quadrado
PseudoR2(mod, which = "Nagelkerke")

## Nagelkerke 
##  0.5558503

PseudoR2(mod, which = "Nagelkerke")

## Nagelkerke 
##  0.5558503

PseudoR2(mod, which = "all")

##        McFadden     McFaddenAdj        CoxSnell      Nagelkerke   AldrichNelson 
##       0.4244758       0.3113963       0.3655546       0.5558503       0.3127167 
## VeallZimmermann           Efron McKelveyZavoina            Tjur             AIC 
##       0.6044518       0.4620491       0.5650231       0.4614323      48.7164411 
##             BIC          logLik         logLik0              G2 
##      57.4750601     -20.3582206     -35.3733527      30.0302642

PseudoR2(mod2, which = "all")

##        McFadden     McFaddenAdj        CoxSnell      Nagelkerke   AldrichNelson 
##       0.3712981       0.2864885       0.3283392       0.4992617       0.2846933 
## VeallZimmermann           Efron McKelveyZavoina            Tjur             AIC 
##       0.5502853       0.4042194       0.4970415       0.4042194      50.4785890 
##             BIC          logLik         logLik0              G2 
##      57.0475532     -22.2392945     -35.3733527      26.2681164

# Comparação de modelos
## AIC e BIC
AIC(mod, mod2)

	df	AIC
mod	4	48.71644
mod2	3	50.47859

BIC(mod, mod2)

	df	BIC
mod	4	57.47506
mod2	3	57.04755

## Qui-quadrado
anova(mod2, mod, test="Chisq")

Resid. Df	Resid. Dev	Df	Deviance	Pr(>Chi)
63	44.47859	NA	NA	NA
62	40.71644	1	3.762148	0.0524252

# Tabela de classificação
ClassLog(mod, dados$prescar_asl_asn)

## $rawtab
##        resp
##         não sim
##   FALSE   9   3
##   TRUE    6  48
## 
## $classtab
##        resp
##                não        sim
##   FALSE 0.60000000 0.05882353
##   TRUE  0.40000000 0.94117647
## 
## $overall
## [1] 0.8636364
## 
## $mcFadden
## [1] 0.4244758

ClassLog(mod2, dados$prescar_asl_asn)

## $rawtab
##        resp
##         não sim
##   FALSE   9   3
##   TRUE    6  48
## 
## $classtab
##        resp
##                não        sim
##   FALSE 0.60000000 0.05882353
##   TRUE  0.40000000 0.94117647
## 
## $overall
## [1] 0.8636364
## 
## $mcFadden
## [1] 0.3712981

levels(dados$prescar_asl_asn)

## [1] "não" "sim"

contrasts(dados$prescar_asl_asn)

##     sim
## não   0
## sim   1

# Make predictions
probabilities <- mod %>%
  predict(dados, type ="response")

predicted.classes <- ifelse(probabilities >0.5, "sim", "não")
#predicted.classes

dados$predicted <- factor(predicted.classes)
                             
dados$probabilities  <- probabilities

# Model accuracy
mean(dados$predicted==dados$prescar_asl_asn)

## [1] 0.8636364

# Código similar para calcular prob e classificar o ou 1
#predicoes <- predict(mod)
#probEstimada <- 1/(1+exp(-predicoes))
#classeEstimada <- round(probEstimada,0)
#dados$classeEstimada <- factor(classeEstimada,
#                         label = c("não", "sim"),
#                         levels = 0:1, order = F)
#dados$prescar_asl_asn 
#Model accuracy
#mean(dados$classeEstimada == dados$prescar_asl_asn)

Logistic regression diagnostics #Variável dependente dicotômica e ME #Linearity assumption

# 1.Remove qualitative variables from the original data frame and bind the logit values to the data: # Select only numeric predictors (apenas para veriável independente contínua) #If the scatter plot shows non-linearity, you need other methods to build the model #such as including 2 or 3-power terms, fractional polynomials and spline function

 #não é o caso em nossa análise, pois não há Xi numeric. Exemplo de Cód. 
mydata <- dados %>%
  dplyr::select_if(is.numeric) 
predictors <- colnames(mydata)
# Bind the logit and tidying the data for plot
mydata <- mydata %>%
  mutate(logit =log(probabilities/(1-probabilities))) %>%
  gather(key ="predictors", value ="predictor.value", -logit)

## Warning: attributes are not identical across measure variables;
## they will be dropped

  # 2.Create the scatter plots:

ggplot(mydata, aes(logit, predictor.value))+
  geom_point(size =0.5, alpha =0.5) +
  geom_smooth(method ="loess") +
  theme_bw() +
  facet_wrap(~predictors, scales ="free_y")

## `geom_smooth()` using formula 'y ~ x'

#Influential values

plot(mod, which =4, id.n =3)

#The most extreme values in the data can be examined by visualizing the Cook's distance values. Here we label the top 3 largest values
plot(mod, which = 5)

summary(stdres(mod))

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -5.34800  0.08625  0.19978  0.02338  0.35808  3.24796

# Extract model results
model.data <- augment(mod) %>%
  mutate(index =1:n())

#The data for the top 3 largest values, according to the Cook's distance,
# can be displayed as follow:
model.data %>%
  top_n(5, .cooksd)

prescar_asl_asn	capivara	agua	vestdom	.fitted	.resid	.std.resid	.hat	.sigma	.cooksd	index
sim	não	não	sim	-0.5777534	1.430511	1.526075	0.1213204	0.7932867	0.0700048	5
sim	não	não	sim	-0.5777534	1.430511	1.526075	0.1213204	0.7932867	0.0700048	30
não	não	sim	sim	1.8830612	-2.012296	-2.088720	0.0718389	0.7719858	0.1370428	33
sim	não	não	não	-2.2336354	2.161210	2.264077	0.0888045	0.7638384	0.2495785	36
não	sim	sim	não	3.2872773	-2.578353	-2.626149	0.0360686	0.7445957	0.2597907	45

#Plot the standardized residuals:
  ggplot(model.data, aes(index, .std.resid)) +
    geom_point(aes(color =dados$prescar_asl_asn),size=4 ,alpha =.5) +
    theme_bw()

#Filter potential influential data points with abs(.std.res) > 3 :

model.data %>%
  filter(abs(.std.resid) >3)  #There is no influential observations in our data.

prescar_asl_asn	capivara	agua	vestdom	.fitted	.resid	.std.resid	.hat	.sigma	.cooksd	index

#When you have outliers in a continuous predictor, potential solutions include:
#Removing the concerned records
#Transform the data into log scale
#Use non parametric methods

#Multicollinearity

car::vif(mod)

## capivara     agua  vestdom 
## 1.171065 1.283017 1.327852

#As a rule of thumb, a VIF value that exceeds 5 or 10 indicates a problematic amount of collinearity. In our example, there is no collinearity: all variables have a value of VIF well below 5.

#Confusion matrix

#The R function table() can be used to produce a confusion matrix in order to determine how many observations were correctly or incorrectly classified. It compares the observed and the predicted outcome values and shows the number of correct and incorrect predictions categorized by type of outcome.

table(dados$prescar_asl_asn, dados$predicted)

##      
##       não sim
##   não   9   6
##   sim   3  48

# Confusion matrix, proportion of cases
table(dados$prescar_asl_asn, dados$predicted) %>%
  prop.table() %>%
  round(digits =3)

##      
##         não   sim
##   não 0.136 0.091
##   sim 0.045 0.727

# Confusion matrix, proportion of cases (linha)
table(dados$prescar_asl_asn, dados$predicted) %>%
  prop.table(margin =1 ) %>%
  round(digits =3)

##      
##         não   sim
##   não 0.600 0.400
##   sim 0.059 0.941

# Confusion matrix, proportion of cases (coluna)
table(dados$prescar_asl_asn, dados$predicted) %>%
  prop.table(margin =2 ) %>%
  round(digits =3)

##      
##         não   sim
##   não 0.750 0.111
##   sim 0.250 0.889

#The diagonal elements of the confusion matrix indicate correct predictions, while the off-diagonals represent
#incorrect predictions. So, the correct classification rate is the sum of the number on the diagonal divided by the sample size in the test data. In our example, that is (48 + 15)/78 = 81%.


#True positives (d): these are cases in which we predicted the individuals would be diabetes-positive and they were.
#True negatives (a): We predicted diabetes-negative, and the individuals were diabetes-negative.
#False positives (b): We predicted diabetes-positive, but the individuals didn't actually have diabetes. (Also known as a Type I error .)
#False negatives (c): We predicted diabetes-negative, but they did have diabetes. (Also known as a Type II error .)
#Technically the raw prediction accuracy of the model is defined as (TruePositives + TrueNegatives)/SampleSize .


#Precision, Recall and Specificity
#In addition to the raw classification accuracy, there are many other metrics that are widely used to examine the performance of a classification model, including:
#Precision , which is the proportion of true positives among all the individuals that have been predicted to be diabetes-positive by the model. This represents the accuracy of a predicted positive outcome. Precision = TruePositives/(TruePositives + FalsePositives) .
#Sensitivity (or Recall ), which is the True Positive Rate (TPR) or the proportion of identified positives among the diabetes-positive population (class = 1). Sensitivity = TruePositives/(TruePositives + FalseNegatives) .
#Specificity , which measures the True Negative Rate (TNR), that is the proportion of identified negatives among the diabetes-negative population (class = 0). Specificity = TrueNegatives/(TrueNegatives + FalseNegatives) .
#False Positive Rate (FPR), which represents the proportion of identified positives among the healthy individuals (i.e. diabetes-negative). This can be seen as a false alarm. The FPR can be also calculated as 1-specificity . When positives are rare, the FPR can be high, leading to the situation where a predicted positive is most likely a negative.
#Sensitivy and Specificity are commonly used to measure the performance of a predictive model.


confusionMatrix(dados$prescar_asl_asn, dados$predicted, positive ="sim")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction não sim
##        não   9   6
##        sim   3  48
##                                           
##                Accuracy : 0.8636          
##                  95% CI : (0.7569, 0.9357)
##     No Information Rate : 0.8182          
##     P-Value [Acc > NIR] : 0.2161          
##                                           
##                   Kappa : 0.5823          
##                                           
##  Mcnemar's Test P-Value : 0.5050          
##                                           
##             Sensitivity : 0.8889          
##             Specificity : 0.7500          
##          Pos Pred Value : 0.9412          
##          Neg Pred Value : 0.6000          
##              Prevalence : 0.8182          
##          Detection Rate : 0.7273          
##    Detection Prevalence : 0.7727          
##       Balanced Accuracy : 0.8194          
##                                           
##        'Positive' Class : sim             
##

Suppose a 2x2 table with notation Reference
Predicted Event No Event Event A B No Event C D The formulas used here are: Sensitivity = A/(A+C) Specificity = D/(B+D) Prevalence = (A+C)/(A+B+C+D) PPV = (sensitivity * prevalence)/((sensitivityprevalence) + ((1-specificity)(1-prevalence))) NPV = (specificity * (1-prevalence))/(((1-sensitivity)prevalence) + ((specificity)(1-prevalence))) Detection Rate = A/(A+B+C+D) Detection Prevalence = (A+B)/(A+B+C+D) Balanced Accuracy = (sensitivity+specificity)/2 Precision = A/(A+B) Recall = A/(A+C) F1 = (1+beta^{2)precisionrecall/((beta}2 * precision)+recall)

#ROC curve
#The ROC curve (or receiver operating characteristics curve ) is a popular graphical measure for assessing the performance or the accuracy of a classifier, which corresponds to the total proportion of correctly classified observations.
#For example, the accuracy of a medical diagnostic test can be assessed by considering the two possible types of errors: false positives, and false negatives. In classification point of view, the test will be declared positive when the corresponding predicted probability, returned by the classifier algorithm, is above a fixed threshold. This threshold is generally set to 0.5 (i.e., 50%), which corresponds to the random guessing probability.
#So, in reference to our diabetes data example, for a given fixed probability cutoff:
# the true positive rate (or fraction) is the proportion of identified positives among the diabetes-positive population. Recall that, this is also known as the sensitivity of the predictive classifier model.
#and the false positive rate is the proportion of identified positives among the healthy (i.e. diabetes-negative) individuals. This is also defined as 1-specificity , where specificity measures the true negative rate , that is the proportion of identified negatives among the diabetes-negative population.
#Since we don't usually know the probability cutoff in advance, the ROC curve is typically used to plot the true positive rate (or sensitivity on y-axis) against the false positive rate (or "1-specificity" on x-axis) at all possible probability cutoffs. This shows the trade off between the rate at which you can correctly predict something with the rate of incorrectly predicting something. Another visual representation of the ROC plot is to simply display the sensitive against the specificity.
#The Area Under the Curve (AUC ) summarizes the overall performance of the classifier, over all possible probability cutoffs. It represents the ability of a classification algorithm to distinguish 1s from 0s (i.e, events from non-events or positives from negatives).

#For a good model, the ROC curve should rise steeply, indicating that the true positive rate (y-axis) increases faster than the false positive rate (x-axis) as the probability threshold decreases.
#So, the "ideal point" is the top left corner of the graph, that is a false positive rate of zero, and a true positive rate of one. This is not very realistic, but it does mean that the larger the AUC the better the classifier.
#The AUC metric varies between 0.50 (random classifier) and 1.00. Values above 0.80 is an indication of a good classifier.

# Compute roc
res.roc <- roc(dados$prescar_asl_asn, dados$probabilities)

## Setting levels: control = não, case = sim

## Setting direction: controls < cases

plot.roc(res.roc, print.auc =TRUE,col="black",lwd=2)

#summary(res.roc)

#The gray diagonal line represents a classifier no better than random chance.
#A highly performant classifier will have an ROC that rises steeply to the top-left corner, that is it will correctly identify lots of positives without misclassifying lots of negatives as positives.
#In our example, the AUC is 0.85, which is close to the maximum ( max = 1). So, our classifier can be considered as very good. A classifier that performs no better than chance is expected to have an AUC of 0.5 when evaluated on an independent test set not used to train the model.
#If we want a classifier model with a specificity of at least 60%, then the sensitivity is about 0.88%. The corresponding probability threshold can be extract as follow:
 
# Extract some interesting results
roc.data <-data_frame(thresholds =res.roc$thresholds,sensitivity =res.roc$sensitivities,specificity =res.roc$specificities)

## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

# Get the probability threshold for specificity = 0.6
roc.data %>%filter(specificity >=0.6)

thresholds	sensitivity	specificity
0.4580007	0.9411765	0.6000000
0.7122571	0.8627451	0.8666667
0.9159761	0.6666667	0.9333333
0.9784541	0.1764706	1.0000000
Inf	0.0000000	1.0000000

roc(dados$prescar_asl_asn,dados$probabilities) %>%
    coords(ret = "all", transpose = FALSE) %>%
    filter(sensitivity >= 0.7,
           specificity >= 0.7)

## Setting levels: control = não, case = sim
## Setting direction: controls < cases

threshold	specificity	sensitivity	accuracy	tn	tp	fn	fp	npv	ppv	fdr	fpr	tpr	tnr	fnr	1-specificity	1-sensitivity	1-accuracy	1-npv	1-ppv	precision	recall	youden	closest.topleft
0.7122571	0.8666667	0.8627451	0.8636364	13	44	7	2	0.65	0.9565217	0.0434783	0.1333333	0.8627451	0.8666667	0.1372549	0.1333333	0.1372549	0.1363636	0.35	0.0434783	0.9565217	0.8627451	1.729412	0.0366167

#The best threshold with the highest sum sensitivity + specificity can be printed as follow. There might be more than one threshold.

plot.roc(res.roc, print.auc =TRUE, print.thres ="best")

#Here, the best probability cutoff is xxx resulting to a predictive classifier with a specificity of xxx and a sensitivity of xxx.

#Note that, print.thres can be also a numeric vector containing a direct definition of the thresholds to display:

plot.roc(res.roc, print.thres =c(0.3, 0.5, 0.7))

Carrapatos - Fabiano HVet-AMAN