Passo 2: Carregar o banco de dados
dados <- read_excel("dados.xlsx")
#View(dados) # Visualização dos dados em janela separada
#glimpse(dados) # Visualização de um resumo dos dados
dados$om <- factor(dados$om,
label = c("EsSA", "CIJF", "DCMun", "AMAN","EspCEx_Bda_CoudCamp"),
levels = 1:5, order = F)
dados$area <- factor(dados$area)
dados$prescar <- factor(dados$prescar ,
label = c("não", "sim"),
levels = 0:1, order = F)
dados$prescar_asl_asn <- factor(dados$prescar_asl_asn ,
label = c("não", "sim"),
levels = 0:1, order = F)
dados$vegeta <- factor(dados$vegeta ,
label = c("primária", "secundária", "ciliar_secundária",
"campos_sujos_graminosos_e_capoeiras","gramado", "pastagem"),
levels = 1:6, order = F)
dados$vestsil <- factor(dados$vestsil,
label = c("não", "sim"),
levels = 0:1, order = F)
dados$capivara <- factor(dados$capivara,
label = c("não", "sim"),
levels = 0:1, order = F)
dados$vestdom <- factor(dados$vestdom,
label = c("não", "sim"),
levels = 0:1, order = F)
dados$avistadom <- factor(dados$avistadom ,
label = c("não", "equideo", "bovino",
"equideo_bovino","canino"),
levels = 0:4, order = F)
dados$agua <- factor(dados$agua,
label = c("não", "sim"),
levels = 0:1, order = F)
dados$tipoagua <- factor(dados$tipoagua ,
label = c("não", "rio", "lagoa",
"ribeirao","brejo"),
levels = 0:4, order = F)
dados$queimada <- factor(dados$queimada,
label = c("não", "sim"),
levels = 0:1, order = F)
#View(dados)# Visualização dos dados em janela separada
glimpse(dados)# Visualização de um resumo dos dados
## Rows: 66
## Columns: 31
## $ om <fct> EsSA, EsSA, EsSA, EsSA, EsSA, EsSA, EsSA, EsSA, EsS...
## $ area <fct> 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 1...
## $ altitude <dbl> 827, 827, 827, 827, 1148, 970, 963, 989, 979, 970, ...
## $ vegeta <fct> ciliar_secundária, gramado, ciliar_secundária, cili...
## $ prescar <fct> sim, sim, sim, sim, sim, sim, sim, sim, sim, sim, s...
## $ totcar <dbl> 199, 23, 98, 77, 124, 78, 89, 34, 41, 39, 119, 12, ...
## $ totamb <dbl> 199, 23, 98, 77, 11, 71, 46, 34, 41, 39, 119, 12, 4...
## $ totas <dbl> 199, 23, 96, 77, 11, 71, 45, 34, 41, 39, 118, 12, 4...
## $ asl <dbl> 6, 7, 23, 4, 0, 0, 29, 0, 0, 0, 0, 0, 0, 47, 2, 2, ...
## $ asn <dbl> 191, 15, 71, 70, 10, 70, 15, 33, 40, 38, 115, 11, 4...
## $ prescar_asl_asn <fct> sim, sim, sim, sim, sim, sim, sim, sim, sim, sim, s...
## $ asa <dbl> 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, ...
## $ totad <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 1, 25,...
## $ adn <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 1, 24,...
## $ ada <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
## $ abn <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ aba <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ aln <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ aan <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ dnl <dbl> 0, 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ rml <dbl> 0, 0, 0, 0, 113, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ ixol <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...
## $ hael <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ vestsil <fct> sim, sim, sim, sim, não, não, sim, sim, sim, não, s...
## $ capivara <fct> sim, sim, sim, sim, não, não, não, não, não, não, s...
## $ vestdom <fct> sim, sim, não, sim, sim, sim, sim, sim, sim, sim, n...
## $ avistadom <fct> equideo, não, não, canino, equideo_bovino, equideo_...
## $ agua <fct> sim, sim, sim, sim, não, sim, sim, sim, sim, sim, s...
## $ tipoagua <fct> rio, rio, rio, rio, não, ribeirao, ribeirao, ribeir...
## $ queimada <fct> não, não, não, não, não, não, não, não, não, não, n...
## $ tot_asl_asn <dbl> 197, 22, 94, 74, 10, 70, 44, 33, 40, 38, 115, 11, 4...
Passo 5: Modelagem
#1. Variável dependente dicotômica (categorias mutuamente exclusivas)
#2. Independência das observações (sem medidas repetidas)
#3. Ausência de outliers/ pontos de alavancagem (in the continuous predictors)
#4. Ausência de multicolinearidade
#5. Relação linear entre cada VI contínua e o logito logit(p) = log(p/(1-p)) da VD
## Construção do modelo:
#mod <- glm(prescar_asl_asn ~ capivara + agua + vestdom + om,
# family = binomial(link = 'logit'), data = dados)
#summary(mod)
#anova(mod, test="Chisq")
mod <- glm(prescar_asl_asn ~ capivara + agua + vestdom,
family = binomial(link = 'logit'), data = dados)
summary(mod)
##
## Call:
## glm(formula = prescar_asl_asn ~ capivara + agua + vestdom, family = binomial(link = "logit"),
## data = dados)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5783 0.1192 0.2708 0.4668 2.1612
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.2336 1.0080 -2.216 0.0267 *
## capivarasim 3.0601 1.1975 2.555 0.0106 *
## aguasim 2.4608 0.9652 2.549 0.0108 *
## vestdomsim 1.6559 0.9236 1.793 0.0730 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 70.747 on 65 degrees of freedom
## Residual deviance: 40.716 on 62 degrees of freedom
## AIC: 48.716
##
## Number of Fisher Scoring iterations: 6
| NULL |
NA |
NA |
65 |
70.74671 |
NA |
| capivara |
1 |
18.980509 |
64 |
51.76620 |
0.0000132 |
| agua |
1 |
7.287607 |
63 |
44.47859 |
0.0069432 |
| vestdom |
1 |
3.762148 |
62 |
40.71644 |
0.0524252 |
Anova(mod, type = 'II', test = "Wald")
| capivara |
1 |
6.529635 |
0.0106092 |
| agua |
1 |
6.499812 |
0.0107886 |
| vestdom |
1 |
3.214476 |
0.0729896 |
#step.model <- mod %>% stepAIC(trace =FALSE)
#coef(step.model)
#step.model
#Criação de um segundo modelo
mod2 <- glm(prescar_asl_asn ~ capivara + agua,
family = binomial(link = 'logit'), data = dados)
anova(mod2, test="Chisq")
| NULL |
NA |
NA |
65 |
70.74671 |
NA |
| capivara |
1 |
18.980509 |
64 |
51.76620 |
0.0000132 |
| agua |
1 |
7.287607 |
63 |
44.47859 |
0.0069432 |
Anova(mod2, type = 'II', test = "Wald")
| capivara |
1 |
4.792092 |
0.0285907 |
| agua |
1 |
6.327058 |
0.0118909 |
| 62 |
40.71644 |
NA |
NA |
| 63 |
44.47859 |
-1 |
-3.762148 |
#Análise do modelo
## Overall effects
Anova(mod, type = 'II', test = "Wald")
| capivara |
1 |
6.529635 |
0.0106092 |
| agua |
1 |
6.499812 |
0.0107886 |
| vestdom |
1 |
3.214476 |
0.0729896 |
## Efeitos específicos
summary(mod)
##
## Call:
## glm(formula = prescar_asl_asn ~ capivara + agua + vestdom, family = binomial(link = "logit"),
## data = dados)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5783 0.1192 0.2708 0.4668 2.1612
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.2336 1.0080 -2.216 0.0267 *
## capivarasim 3.0601 1.1975 2.555 0.0106 *
## aguasim 2.4608 0.9652 2.549 0.0108 *
## vestdomsim 1.6559 0.9236 1.793 0.0730 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 70.747 on 65 degrees of freedom
## Residual deviance: 40.716 on 62 degrees of freedom
## AIC: 48.716
##
## Number of Fisher Scoring iterations: 6
## Obtenção das razões de chance com IC 95% (usando log-likelihood)
exp(cbind(OR = coef(mod), confint(mod)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.1071382 0.01059831 0.6086829
## capivarasim 21.3296503 2.73934437 458.7842053
## aguasim 11.7143499 2.07846846 103.7606635
## vestdomsim 5.2376978 0.98381710 42.2201756
## Obtenção das razões de chance com IC 95% (usando erro padrão = SPSS)
exp(cbind(OR = coef(mod), confint.default(mod)))
## OR 2.5 % 97.5 %
## (Intercept) 0.1071382 0.01485799 0.7725541
## capivarasim 21.3296503 2.04001691 223.0148098
## aguasim 11.7143499 1.76651485 77.6817670
## vestdomsim 5.2376978 0.85701434 32.0105238
hoslem.test(dados$prescar_asl_asn, fitted(mod),g=5)
## Warning in Ops.factor(1, y): '-' not meaningful for factors
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: dados$prescar_asl_asn, fitted(mod)
## X-squared = 66, df = 3, p-value = 3.064e-14
## Overall effects
Anova(mod2, type="II", test="Wald")
| capivara |
1 |
4.792092 |
0.0285907 |
| agua |
1 |
6.327058 |
0.0118909 |
## Efeitos específicos
summary(mod2)
##
## Call:
## glm(formula = prescar_asl_asn ~ capivara + agua, family = binomial(link = "logit"),
## data = dados)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6666 0.2408 0.2408 0.6463 1.6651
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0986 0.6667 -1.648 0.0994 .
## capivarasim 2.4967 1.1405 2.189 0.0286 *
## aguasim 2.1282 0.8461 2.515 0.0119 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 70.747 on 65 degrees of freedom
## Residual deviance: 44.479 on 63 degrees of freedom
## AIC: 50.479
##
## Number of Fisher Scoring iterations: 6
## Obtenção das razões de chance com IC 95% (usando log-likelihood)
exp(cbind(OR = coef(mod2), confint(mod2)))
## Waiting for profiling to be done...
## OR 2.5 % 97.5 %
## (Intercept) 0.3333333 0.0739549 1.117189
## capivarasim 12.1428571 1.7543842 244.104446
## aguasim 8.4000000 1.7532008 51.540358
## Obtenção das razões de chance com IC 95% (usando erro padrão = SPSS)
exp(cbind(OR = coef(mod2), confint.default(mod2)))
## OR 2.5 % 97.5 %
## (Intercept) 0.3333333 0.09024249 1.231251
## capivarasim 12.1428571 1.29864533 113.540607
## aguasim 8.4000000 1.59986668 44.103675
hoslem.test(dados$prescar_asl_asn, fitted(mod2),g=5)
## Warning in Ops.factor(1, y): '-' not meaningful for factors
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: dados$prescar_asl_asn, fitted(mod2)
## X-squared = 66, df = 3, p-value = 3.064e-14
#Avaliação da qualidade e comparação entre modelos
## Pseudo R-quadrado
PseudoR2(mod, which = "Nagelkerke")
## Nagelkerke
## 0.5558503
PseudoR2(mod, which = "Nagelkerke")
## Nagelkerke
## 0.5558503
PseudoR2(mod, which = "all")
## McFadden McFaddenAdj CoxSnell Nagelkerke AldrichNelson
## 0.4244758 0.3113963 0.3655546 0.5558503 0.3127167
## VeallZimmermann Efron McKelveyZavoina Tjur AIC
## 0.6044518 0.4620491 0.5650231 0.4614323 48.7164411
## BIC logLik logLik0 G2
## 57.4750601 -20.3582206 -35.3733527 30.0302642
PseudoR2(mod2, which = "all")
## McFadden McFaddenAdj CoxSnell Nagelkerke AldrichNelson
## 0.3712981 0.2864885 0.3283392 0.4992617 0.2846933
## VeallZimmermann Efron McKelveyZavoina Tjur AIC
## 0.5502853 0.4042194 0.4970415 0.4042194 50.4785890
## BIC logLik logLik0 G2
## 57.0475532 -22.2392945 -35.3733527 26.2681164
# Comparação de modelos
## AIC e BIC
AIC(mod, mod2)
| mod |
4 |
48.71644 |
| mod2 |
3 |
50.47859 |
| mod |
4 |
57.47506 |
| mod2 |
3 |
57.04755 |
## Qui-quadrado
anova(mod2, mod, test="Chisq")
| 63 |
44.47859 |
NA |
NA |
NA |
| 62 |
40.71644 |
1 |
3.762148 |
0.0524252 |
# Tabela de classificação
ClassLog(mod, dados$prescar_asl_asn)
## $rawtab
## resp
## não sim
## FALSE 9 3
## TRUE 6 48
##
## $classtab
## resp
## não sim
## FALSE 0.60000000 0.05882353
## TRUE 0.40000000 0.94117647
##
## $overall
## [1] 0.8636364
##
## $mcFadden
## [1] 0.4244758
ClassLog(mod2, dados$prescar_asl_asn)
## $rawtab
## resp
## não sim
## FALSE 9 3
## TRUE 6 48
##
## $classtab
## resp
## não sim
## FALSE 0.60000000 0.05882353
## TRUE 0.40000000 0.94117647
##
## $overall
## [1] 0.8636364
##
## $mcFadden
## [1] 0.3712981
levels(dados$prescar_asl_asn)
## [1] "não" "sim"
contrasts(dados$prescar_asl_asn)
## sim
## não 0
## sim 1
# Make predictions
probabilities <- mod %>%
predict(dados, type ="response")
predicted.classes <- ifelse(probabilities >0.5, "sim", "não")
#predicted.classes
dados$predicted <- factor(predicted.classes)
dados$probabilities <- probabilities
# Model accuracy
mean(dados$predicted==dados$prescar_asl_asn)
## [1] 0.8636364
# Código similar para calcular prob e classificar o ou 1
#predicoes <- predict(mod)
#probEstimada <- 1/(1+exp(-predicoes))
#classeEstimada <- round(probEstimada,0)
#dados$classeEstimada <- factor(classeEstimada,
# label = c("não", "sim"),
# levels = 0:1, order = F)
#dados$prescar_asl_asn
#Model accuracy
#mean(dados$classeEstimada == dados$prescar_asl_asn)
Logistic regression diagnostics #Variável dependente dicotômica e ME #Linearity assumption
# 1.Remove qualitative variables from the original data frame and bind the logit values to the data: # Select only numeric predictors (apenas para veriável independente contínua) #If the scatter plot shows non-linearity, you need other methods to build the model #such as including 2 or 3-power terms, fractional polynomials and spline function
#não é o caso em nossa análise, pois não há Xi numeric. Exemplo de Cód.
mydata <- dados %>%
dplyr::select_if(is.numeric)
predictors <- colnames(mydata)
# Bind the logit and tidying the data for plot
mydata <- mydata %>%
mutate(logit =log(probabilities/(1-probabilities))) %>%
gather(key ="predictors", value ="predictor.value", -logit)
## Warning: attributes are not identical across measure variables;
## they will be dropped
# 2.Create the scatter plots:
ggplot(mydata, aes(logit, predictor.value))+
geom_point(size =0.5, alpha =0.5) +
geom_smooth(method ="loess") +
theme_bw() +
facet_wrap(~predictors, scales ="free_y")
## `geom_smooth()` using formula 'y ~ x'

#Influential values
plot(mod, which =4, id.n =3)

#The most extreme values in the data can be examined by visualizing the Cook's distance values. Here we label the top 3 largest values
plot(mod, which = 5)

## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.34800 0.08625 0.19978 0.02338 0.35808 3.24796
# Extract model results
model.data <- augment(mod) %>%
mutate(index =1:n())
#The data for the top 3 largest values, according to the Cook's distance,
# can be displayed as follow:
model.data %>%
top_n(5, .cooksd)
| sim |
não |
não |
sim |
-0.5777534 |
1.430511 |
1.526075 |
0.1213204 |
0.7932867 |
0.0700048 |
5 |
| sim |
não |
não |
sim |
-0.5777534 |
1.430511 |
1.526075 |
0.1213204 |
0.7932867 |
0.0700048 |
30 |
| não |
não |
sim |
sim |
1.8830612 |
-2.012296 |
-2.088720 |
0.0718389 |
0.7719858 |
0.1370428 |
33 |
| sim |
não |
não |
não |
-2.2336354 |
2.161210 |
2.264077 |
0.0888045 |
0.7638384 |
0.2495785 |
36 |
| não |
sim |
sim |
não |
3.2872773 |
-2.578353 |
-2.626149 |
0.0360686 |
0.7445957 |
0.2597907 |
45 |
#Plot the standardized residuals:
ggplot(model.data, aes(index, .std.resid)) +
geom_point(aes(color =dados$prescar_asl_asn),size=4 ,alpha =.5) +
theme_bw()

#Filter potential influential data points with abs(.std.res) > 3 :
model.data %>%
filter(abs(.std.resid) >3) #There is no influential observations in our data.
#When you have outliers in a continuous predictor, potential solutions include:
#Removing the concerned records
#Transform the data into log scale
#Use non parametric methods
#Multicollinearity
## capivara agua vestdom
## 1.171065 1.283017 1.327852
#As a rule of thumb, a VIF value that exceeds 5 or 10 indicates a problematic amount of collinearity. In our example, there is no collinearity: all variables have a value of VIF well below 5.
#Confusion matrix
#The R function table() can be used to produce a confusion matrix in order to determine how many observations were correctly or incorrectly classified. It compares the observed and the predicted outcome values and shows the number of correct and incorrect predictions categorized by type of outcome.
table(dados$prescar_asl_asn, dados$predicted)
##
## não sim
## não 9 6
## sim 3 48
# Confusion matrix, proportion of cases
table(dados$prescar_asl_asn, dados$predicted) %>%
prop.table() %>%
round(digits =3)
##
## não sim
## não 0.136 0.091
## sim 0.045 0.727
# Confusion matrix, proportion of cases (linha)
table(dados$prescar_asl_asn, dados$predicted) %>%
prop.table(margin =1 ) %>%
round(digits =3)
##
## não sim
## não 0.600 0.400
## sim 0.059 0.941
# Confusion matrix, proportion of cases (coluna)
table(dados$prescar_asl_asn, dados$predicted) %>%
prop.table(margin =2 ) %>%
round(digits =3)
##
## não sim
## não 0.750 0.111
## sim 0.250 0.889
#The diagonal elements of the confusion matrix indicate correct predictions, while the off-diagonals represent
#incorrect predictions. So, the correct classification rate is the sum of the number on the diagonal divided by the sample size in the test data. In our example, that is (48 + 15)/78 = 81%.
#True positives (d): these are cases in which we predicted the individuals would be diabetes-positive and they were.
#True negatives (a): We predicted diabetes-negative, and the individuals were diabetes-negative.
#False positives (b): We predicted diabetes-positive, but the individuals didn't actually have diabetes. (Also known as a Type I error .)
#False negatives (c): We predicted diabetes-negative, but they did have diabetes. (Also known as a Type II error .)
#Technically the raw prediction accuracy of the model is defined as (TruePositives + TrueNegatives)/SampleSize .
#Precision, Recall and Specificity
#In addition to the raw classification accuracy, there are many other metrics that are widely used to examine the performance of a classification model, including:
#Precision , which is the proportion of true positives among all the individuals that have been predicted to be diabetes-positive by the model. This represents the accuracy of a predicted positive outcome. Precision = TruePositives/(TruePositives + FalsePositives) .
#Sensitivity (or Recall ), which is the True Positive Rate (TPR) or the proportion of identified positives among the diabetes-positive population (class = 1). Sensitivity = TruePositives/(TruePositives + FalseNegatives) .
#Specificity , which measures the True Negative Rate (TNR), that is the proportion of identified negatives among the diabetes-negative population (class = 0). Specificity = TrueNegatives/(TrueNegatives + FalseNegatives) .
#False Positive Rate (FPR), which represents the proportion of identified positives among the healthy individuals (i.e. diabetes-negative). This can be seen as a false alarm. The FPR can be also calculated as 1-specificity . When positives are rare, the FPR can be high, leading to the situation where a predicted positive is most likely a negative.
#Sensitivy and Specificity are commonly used to measure the performance of a predictive model.
confusionMatrix(dados$prescar_asl_asn, dados$predicted, positive ="sim")
## Confusion Matrix and Statistics
##
## Reference
## Prediction não sim
## não 9 6
## sim 3 48
##
## Accuracy : 0.8636
## 95% CI : (0.7569, 0.9357)
## No Information Rate : 0.8182
## P-Value [Acc > NIR] : 0.2161
##
## Kappa : 0.5823
##
## Mcnemar's Test P-Value : 0.5050
##
## Sensitivity : 0.8889
## Specificity : 0.7500
## Pos Pred Value : 0.9412
## Neg Pred Value : 0.6000
## Prevalence : 0.8182
## Detection Rate : 0.7273
## Detection Prevalence : 0.7727
## Balanced Accuracy : 0.8194
##
## 'Positive' Class : sim
##
Suppose a 2x2 table with notation Reference
Predicted Event No Event Event A B No Event C D The formulas used here are: Sensitivity = A/(A+C) Specificity = D/(B+D) Prevalence = (A+C)/(A+B+C+D) PPV = (sensitivity * prevalence)/((sensitivityprevalence) + ((1-specificity)(1-prevalence))) NPV = (specificity * (1-prevalence))/(((1-sensitivity)prevalence) + ((specificity)(1-prevalence))) Detection Rate = A/(A+B+C+D) Detection Prevalence = (A+B)/(A+B+C+D) Balanced Accuracy = (sensitivity+specificity)/2 Precision = A/(A+B) Recall = A/(A+C) F1 = (1+beta2)precisionrecall/((beta2 * precision)+recall)
#ROC curve
#The ROC curve (or receiver operating characteristics curve ) is a popular graphical measure for assessing the performance or the accuracy of a classifier, which corresponds to the total proportion of correctly classified observations.
#For example, the accuracy of a medical diagnostic test can be assessed by considering the two possible types of errors: false positives, and false negatives. In classification point of view, the test will be declared positive when the corresponding predicted probability, returned by the classifier algorithm, is above a fixed threshold. This threshold is generally set to 0.5 (i.e., 50%), which corresponds to the random guessing probability.
#So, in reference to our diabetes data example, for a given fixed probability cutoff:
# the true positive rate (or fraction) is the proportion of identified positives among the diabetes-positive population. Recall that, this is also known as the sensitivity of the predictive classifier model.
#and the false positive rate is the proportion of identified positives among the healthy (i.e. diabetes-negative) individuals. This is also defined as 1-specificity , where specificity measures the true negative rate , that is the proportion of identified negatives among the diabetes-negative population.
#Since we don't usually know the probability cutoff in advance, the ROC curve is typically used to plot the true positive rate (or sensitivity on y-axis) against the false positive rate (or "1-specificity" on x-axis) at all possible probability cutoffs. This shows the trade off between the rate at which you can correctly predict something with the rate of incorrectly predicting something. Another visual representation of the ROC plot is to simply display the sensitive against the specificity.
#The Area Under the Curve (AUC ) summarizes the overall performance of the classifier, over all possible probability cutoffs. It represents the ability of a classification algorithm to distinguish 1s from 0s (i.e, events from non-events or positives from negatives).
#For a good model, the ROC curve should rise steeply, indicating that the true positive rate (y-axis) increases faster than the false positive rate (x-axis) as the probability threshold decreases.
#So, the "ideal point" is the top left corner of the graph, that is a false positive rate of zero, and a true positive rate of one. This is not very realistic, but it does mean that the larger the AUC the better the classifier.
#The AUC metric varies between 0.50 (random classifier) and 1.00. Values above 0.80 is an indication of a good classifier.
# Compute roc
res.roc <- roc(dados$prescar_asl_asn, dados$probabilities)
## Setting levels: control = não, case = sim
## Setting direction: controls < cases
plot.roc(res.roc, print.auc =TRUE,col="black",lwd=2)

#summary(res.roc)
#The gray diagonal line represents a classifier no better than random chance.
#A highly performant classifier will have an ROC that rises steeply to the top-left corner, that is it will correctly identify lots of positives without misclassifying lots of negatives as positives.
#In our example, the AUC is 0.85, which is close to the maximum ( max = 1). So, our classifier can be considered as very good. A classifier that performs no better than chance is expected to have an AUC of 0.5 when evaluated on an independent test set not used to train the model.
#If we want a classifier model with a specificity of at least 60%, then the sensitivity is about 0.88%. The corresponding probability threshold can be extract as follow:
# Extract some interesting results
roc.data <-data_frame(thresholds =res.roc$thresholds,sensitivity =res.roc$sensitivities,specificity =res.roc$specificities)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Get the probability threshold for specificity = 0.6
roc.data %>%filter(specificity >=0.6)
| 0.4580007 |
0.9411765 |
0.6000000 |
| 0.7122571 |
0.8627451 |
0.8666667 |
| 0.9159761 |
0.6666667 |
0.9333333 |
| 0.9784541 |
0.1764706 |
1.0000000 |
| Inf |
0.0000000 |
1.0000000 |
roc(dados$prescar_asl_asn,dados$probabilities) %>%
coords(ret = "all", transpose = FALSE) %>%
filter(sensitivity >= 0.7,
specificity >= 0.7)
## Setting levels: control = não, case = sim
## Setting direction: controls < cases
| 0.7122571 |
0.8666667 |
0.8627451 |
0.8636364 |
13 |
44 |
7 |
2 |
0.65 |
0.9565217 |
0.0434783 |
0.1333333 |
0.8627451 |
0.8666667 |
0.1372549 |
0.1333333 |
0.1372549 |
0.1363636 |
0.35 |
0.0434783 |
0.9565217 |
0.8627451 |
1.729412 |
0.0366167 |
#The best threshold with the highest sum sensitivity + specificity can be printed as follow. There might be more than one threshold.
plot.roc(res.roc, print.auc =TRUE, print.thres ="best")

#Here, the best probability cutoff is xxx resulting to a predictive classifier with a specificity of xxx and a sensitivity of xxx.
#Note that, print.thres can be also a numeric vector containing a direct definition of the thresholds to display:
plot.roc(res.roc, print.thres =c(0.3, 0.5, 0.7))
