1 Aula 11 - Regressão com Variável Dependente Binária

  • Carregando dataset disponibilizado pela UCLA.
#Pacotes utilizados
pacotes <- c("rgl","car", "tidyverse",
             "reshape2","jtools","lmtest","caret","pROC","ROCR","nnet")

if(sum(as.numeric(!pacotes %in% installed.packages())) != 0){
  instalador <- pacotes[!pacotes %in% installed.packages()]
  for(i in 1:length(instalador)) {
    install.packages(instalador, dependencies = T)
    break()}
  sapply(pacotes, require, character = T) 
} else {
  sapply(pacotes, require, character = T) 
}
## Loading required package: rgl
## Loading required package: car
## Loading required package: carData
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()
## Loading required package: reshape2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## Loading required package: jtools
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: caret
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
## Loading required package: pROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## Loading required package: ROCR
## Loading required package: nnet
##       rgl       car tidyverse  reshape2    jtools    lmtest     caret      pROC 
##      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE 
##      ROCR      nnet 
##      TRUE      TRUE
link <- 'https://stats.idre.ucla.edu/wp-content/uploads/2016/02/sample.csv'
ucla <- read.csv(link)
glimpse(ucla)
## Rows: 200
## Columns: 6
## $ female      <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ read        <int> 57, 68, 44, 63, 47, 44, 50, 34, 63, 57, 60, 57, 73, 54,...
## $ write       <int> 52, 59, 33, 44, 52, 52, 59, 46, 57, 55, 46, 65, 60, 63,...
## $ math        <int> 41, 53, 54, 47, 57, 51, 42, 45, 54, 52, 51, 51, 71, 57,...
## $ hon         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1...
## $ femalexmath <int> 0, 53, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...

1.1 Calculando log-odds da variável dependente (hon)

  • Regressão logística com apenas uma constante.
modelo1 <- glm(hon ~1,
               data = ucla, 
               family = "binomial")

#visualizando os resultados
summ(modelo1, confint = T, digits = 3, ci.width = .95) 
Observations 200
Dependent variable hon
Type Generalized linear model
Family binomial
Link logit
χ²(0) -0.000
Pseudo-R² (Cragg-Uhler) 0.000
Pseudo-R² (McFadden) 0.000
AIC 224.710
BIC 228.008
Est. 2.5% 97.5% z val. p
(Intercept) -1.125 -1.448 -0.803 -6.845 0.000
Standard errors: MLE
  • Obtendo o Log Likelihood
logLik(modelo1)
## 'log Lik.' -111.355 (df=1)
  • Frequência da variável depedente (hon)
table(ucla$hon)
## 
##   0   1 
## 151  49
  • Frequência relativa
table(ucla$hon)/nrow(ucla)*100
## 
##    0    1 
## 75.5 24.5
  • calculando a probabilidade
p <- 49/200

print(p)
## [1] 0.245
#ou transformar ou log odds em probabilidade de novo
p1 <- exp(-1.125)/ (1 + exp(-1.125))

print(p1)
## [1] 0.245085
  • Calculando a chance (odds)
odds <- p/(1-p)
print(odds)
## [1] 0.3245033
  • Calculando log odds
log(odds)
## [1] -1.12546

1.2 Regressão com uma variável independente dummy

modelo2 <- glm(hon ~ female, 
               data = ucla,
               family = "binomial")

#visualizando os resultados
summ(modelo2, confint = T, digits = 3, ci.width = .95) 
Observations 200
Dependent variable hon
Type Generalized linear model
Family binomial
Link logit
χ²(1) 3.104
Pseudo-R² (Cragg-Uhler) 0.023
Pseudo-R² (McFadden) 0.014
AIC 223.606
BIC 230.203
Est. 2.5% 97.5% z val. p
(Intercept) -1.471 -1.998 -0.944 -5.469 0.000
female 0.593 -0.076 1.262 1.736 0.083
Standard errors: MLE
logLik(modelo2)
## 'log Lik.' -109.8031 (df=2)
  • Tabele de Contigência
xtabs(~hon + female, data = ucla)
##    female
## hon  0  1
##   0 74 77
##   1 17 32

1.3 Calculando Log odss manualmente

  • chance (odds) dos homens estarem na categoria hon
#pertencer/ nao pertencer
odds_homens <- (17/ (74 + 17)) / (74/(74 + 17))
print(odds_homens)
## [1] 0.2297297
  • chance (odds) das mulheres estarem na categoria hon
#pertencer/ nao pertencer
odds_mulheres <- (32/(32 + 77)) / (77/(32 + 77))
print(odds_mulheres)
## [1] 0.4155844
  • Odds Mulheres / Odds Homens
odds_mulheres/odds_homens
## [1] 1.809015
#ou
(32*74)/(77*17)
## [1] 1.809015
  • Intercepto = log odds homens
log(odds_homens)
## [1] -1.470852
  • Beta_1 = log odds mulheres / odds homens
log(odds_mulheres/odds_homens)
## [1] 0.5927822

1.4 Regressão logística com uma variável indepedente contínua

modelo3 <- glm(hon ~ math,
               data = ucla,
               family = "binomial")

summ(modelo3, confint = T, digits = 3, ci.width = .95)
Observations 200
Dependent variable hon
Type Generalized linear model
Family binomial
Link logit
χ²(1) 55.637
Pseudo-R² (Cragg-Uhler) 0.362
Pseudo-R² (McFadden) 0.250
AIC 171.073
BIC 177.670
Est. 2.5% 97.5% z val. p
(Intercept) -9.794 -12.698 -6.890 -6.610 0.000
math 0.156 0.106 0.207 6.105 0.000
Standard errors: MLE
logLik(modelo3)
## 'log Lik.' -83.53662 (df=2)
  • Beta_0 (Intercepto) = Log odds de um estudante com nota 0 em matemática está na categoria hon.

*Odds de um estudante com nota 0 em matemática está na categoria hon = exp(beta_0)

exp(modelo3$coefficients[1])
##  (Intercept) 
## 5.578854e-05
  • Beta_1

\[log(p/(1-p)) = logit(p) = \beta_0 + \beta_1 * math\] * Logito condicional de um aluno com nota 54 em matemática está na categoria hon

\[log(p/(1-p)) = logit(p) = – 9.793942 + .1563404* 54\]

logit_mat1 <-  -9.793942 + .1563404 * 54
print(logit_mat1)
## [1] -1.35156
  • Logito condicional de um aluno com nota 55 em matemática está na categoria hon

\[log(p/(1-p)) = logit(p) = – 9.793942 + .1563404* 55\]

logit_mat2 <-  -9.793942 + .1563404 * 55
print(logit_mat2)
## [1] -1.19522
  • Beta_1 = diferença entre os logitos acima
beta_1 <- logit_mat2 - logit_mat1
beta_1
## [1] 0.1563404
  • Logito -> Odds
exp(beta_1)
## [1] 1.169224

1.5 Regressão Logística Multivariada

\[logit(p) = log(p/(1-p))= β0 + β1*x1 + … + βk*x\]

modelo4 <- glm(hon ~ math + female + read,
                data = ucla,
                family = "binomial")

summ(modelo4, confint = T, digits = 3, ci.width = .95)
Observations 200
Dependent variable hon
Type Generalized linear model
Family binomial
Link logit
χ²(3) 66.540
Pseudo-R² (Cragg-Uhler) 0.421
Pseudo-R² (McFadden) 0.299
AIC 164.170
BIC 177.363
Est. 2.5% 97.5% z val. p
(Intercept) -11.770 -15.123 -8.417 -6.880 0.000
math 0.123 0.062 0.184 3.931 0.000
female 0.980 0.154 1.806 2.324 0.020
read 0.059 0.007 0.111 2.224 0.026
Standard errors: MLE
logLik(modelo4)
## 'log Lik.' -78.08478 (df=4)

1.6 Regressão Logística com termo de interação

\[logit(p) = log(p/(1-p))= β0 + β1*female + β2*math + β3*female*math\]

modelo5 <- glm(hon ~ female + math + female * math,
               data = ucla,
               family = "binomial")
summ(modelo5, confint = T, digits = 3, ci.width = .95)
Observations 200
Dependent variable hon
Type Generalized linear model
Family binomial
Link logit
χ²(3) 62.943
Pseudo-R² (Cragg-Uhler) 0.402
Pseudo-R² (McFadden) 0.283
AIC 167.767
BIC 180.960
Est. 2.5% 97.5% z val. p
(Intercept) -8.746 -12.919 -4.573 -4.108 0.000
female -2.900 -8.964 3.165 -0.937 0.349
math 0.129 0.059 0.200 3.606 0.000
female:math 0.067 -0.038 0.172 1.253 0.210
Standard errors: MLE
logLik(modelo5)
## 'log Lik.' -79.8833 (df=4)

1.7 Referências

Bruin, J. 2006. newtest: command to compute new test. UCLA:
Statistical Consulting Group. https://stats.idre.ucla.edu/stata/ado/analysis/.