1 Aula 11 - Regressão com Variável Dependente Binária

Carregando dataset disponibilizado pela UCLA.

#Pacotes utilizados
pacotes <- c("rgl","car", "tidyverse",
             "reshape2","jtools","lmtest","caret","pROC","ROCR","nnet")

if(sum(as.numeric(!pacotes %in% installed.packages())) != 0){
  instalador <- pacotes[!pacotes %in% installed.packages()]
  for(i in 1:length(instalador)) {
    install.packages(instalador, dependencies = T)
    break()}
  sapply(pacotes, require, character = T) 
} else {
  sapply(pacotes, require, character = T) 
}

## Loading required package: rgl

## Loading required package: car

## Loading required package: carData

## Loading required package: tidyverse

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some()   masks car::some()

## Loading required package: reshape2

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

## Loading required package: jtools

## Loading required package: lmtest

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: caret

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

## Loading required package: pROC

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

## Loading required package: ROCR

## Loading required package: nnet

##       rgl       car tidyverse  reshape2    jtools    lmtest     caret      pROC 
##      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE 
##      ROCR      nnet 
##      TRUE      TRUE

link <- 'https://stats.idre.ucla.edu/wp-content/uploads/2016/02/sample.csv'
ucla <- read.csv(link)

glimpse(ucla)

## Rows: 200
## Columns: 6
## $ female      <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ read        <int> 57, 68, 44, 63, 47, 44, 50, 34, 63, 57, 60, 57, 73, 54,...
## $ write       <int> 52, 59, 33, 44, 52, 52, 59, 46, 57, 55, 46, 65, 60, 63,...
## $ math        <int> 41, 53, 54, 47, 57, 51, 42, 45, 54, 52, 51, 51, 71, 57,...
## $ hon         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1...
## $ femalexmath <int> 0, 53, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...

1.1 Calculando log-odds da variável dependente (hon)

Regressão logística com apenas uma constante.

modelo1 <- glm(hon ~1,
               data = ucla, 
               family = "binomial")

#visualizando os resultados
summ(modelo1, confint = T, digits = 3, ci.width = .95)

Observations	200
Dependent variable	hon
Type	Generalized linear model
Family	binomial
Link	logit

χ²(0)	-0.000
Pseudo-R² (Cragg-Uhler)	0.000
Pseudo-R² (McFadden)	0.000
AIC	224.710
BIC	228.008

	Est.	2.5%	97.5%	z val.	p
(Intercept)	-1.125	-1.448	-0.803	-6.845	0.000
Standard errors: MLE

Obtendo o Log Likelihood

logLik(modelo1)

## 'log Lik.' -111.355 (df=1)

Frequência da variável depedente (hon)

table(ucla$hon)

## 
##   0   1 
## 151  49

Frequência relativa

table(ucla$hon)/nrow(ucla)*100

## 
##    0    1 
## 75.5 24.5

calculando a probabilidade

p <- 49/200

print(p)

## [1] 0.245

#ou transformar ou log odds em probabilidade de novo
p1 <- exp(-1.125)/ (1 + exp(-1.125))

print(p1)

## [1] 0.245085

Calculando a chance (odds)

odds <- p/(1-p)
print(odds)

## [1] 0.3245033

Calculando log odds

log(odds)

## [1] -1.12546

1.2 Regressão com uma variável independente dummy

modelo2 <- glm(hon ~ female, 
               data = ucla,
               family = "binomial")

#visualizando os resultados
summ(modelo2, confint = T, digits = 3, ci.width = .95)

Observations	200
Dependent variable	hon
Type	Generalized linear model
Family	binomial
Link	logit

χ²(1)	3.104
Pseudo-R² (Cragg-Uhler)	0.023
Pseudo-R² (McFadden)	0.014
AIC	223.606
BIC	230.203

	Est.	2.5%	97.5%	z val.	p
(Intercept)	-1.471	-1.998	-0.944	-5.469	0.000
female	0.593	-0.076	1.262	1.736	0.083
Standard errors: MLE

logLik(modelo2)

## 'log Lik.' -109.8031 (df=2)

Tabele de Contigência

xtabs(~hon + female, data = ucla)

##    female
## hon  0  1
##   0 74 77
##   1 17 32

1.3 Calculando Log odss manualmente

chance (odds) dos homens estarem na categoria hon

#pertencer/ nao pertencer
odds_homens <- (17/ (74 + 17)) / (74/(74 + 17))
print(odds_homens)

## [1] 0.2297297

chance (odds) das mulheres estarem na categoria hon

#pertencer/ nao pertencer
odds_mulheres <- (32/(32 + 77)) / (77/(32 + 77))
print(odds_mulheres)

## [1] 0.4155844

Odds Mulheres / Odds Homens

odds_mulheres/odds_homens

## [1] 1.809015

#ou
(32*74)/(77*17)

## [1] 1.809015

Intercepto = log odds homens

log(odds_homens)

## [1] -1.470852

Beta_1 = log odds mulheres / odds homens

log(odds_mulheres/odds_homens)

## [1] 0.5927822

1.4 Regressão logística com uma variável indepedente contínua

modelo3 <- glm(hon ~ math,
               data = ucla,
               family = "binomial")

summ(modelo3, confint = T, digits = 3, ci.width = .95)

Observations	200
Dependent variable	hon
Type	Generalized linear model
Family	binomial
Link	logit

χ²(1)	55.637
Pseudo-R² (Cragg-Uhler)	0.362
Pseudo-R² (McFadden)	0.250
AIC	171.073
BIC	177.670

	Est.	2.5%	97.5%	z val.	p
(Intercept)	-9.794	-12.698	-6.890	-6.610	0.000
math	0.156	0.106	0.207	6.105	0.000
Standard errors: MLE

logLik(modelo3)

## 'log Lik.' -83.53662 (df=2)

Beta_0 (Intercepto) = Log odds de um estudante com nota 0 em matemática está na categoria hon.

*Odds de um estudante com nota 0 em matemática está na categoria hon = exp(beta_0)

exp(modelo3$coefficients[1])

##  (Intercept) 
## 5.578854e-05

Beta_1

\[log(p/(1-p)) = logit(p) = \beta_0 + \beta_1 * math\] * Logito condicional de um aluno com nota 54 em matemática está na categoria hon

\[log(p/(1-p)) = logit(p) = – 9.793942 + .1563404* 54\]

logit_mat1 <-  -9.793942 + .1563404 * 54
print(logit_mat1)

## [1] -1.35156

Logito condicional de um aluno com nota 55 em matemática está na categoria hon

\[log(p/(1-p)) = logit(p) = – 9.793942 + .1563404* 55\]

logit_mat2 <-  -9.793942 + .1563404 * 55
print(logit_mat2)

## [1] -1.19522

Beta_1 = diferença entre os logitos acima

beta_1 <- logit_mat2 - logit_mat1
beta_1

## [1] 0.1563404

Logito -> Odds

exp(beta_1)

## [1] 1.169224

1.5 Regressão Logística Multivariada

\[logit(p) = log(p/(1-p))= β0 + β1*x1 + … + βk*x\]

modelo4 <- glm(hon ~ math + female + read,
                data = ucla,
                family = "binomial")

summ(modelo4, confint = T, digits = 3, ci.width = .95)

Observations	200
Dependent variable	hon
Type	Generalized linear model
Family	binomial
Link	logit

χ²(3)	66.540
Pseudo-R² (Cragg-Uhler)	0.421
Pseudo-R² (McFadden)	0.299
AIC	164.170
BIC	177.363

	Est.	2.5%	97.5%	z val.	p
(Intercept)	-11.770	-15.123	-8.417	-6.880	0.000
math	0.123	0.062	0.184	3.931	0.000
female	0.980	0.154	1.806	2.324	0.020
read	0.059	0.007	0.111	2.224	0.026
Standard errors: MLE

logLik(modelo4)

## 'log Lik.' -78.08478 (df=4)

1.6 Regressão Logística com termo de interação

\[logit(p) = log(p/(1-p))= β0 + β1*female + β2*math + β3*female*math\]

modelo5 <- glm(hon ~ female + math + female * math,
               data = ucla,
               family = "binomial")
summ(modelo5, confint = T, digits = 3, ci.width = .95)

Observations	200
Dependent variable	hon
Type	Generalized linear model
Family	binomial
Link	logit

χ²(3)	62.943
Pseudo-R² (Cragg-Uhler)	0.402
Pseudo-R² (McFadden)	0.283
AIC	167.767
BIC	180.960

	Est.	2.5%	97.5%	z val.	p
(Intercept)	-8.746	-12.919	-4.573	-4.108	0.000
female	-2.900	-8.964	3.165	-0.937	0.349
math	0.129	0.059	0.200	3.606	0.000
female:math	0.067	-0.038	0.172	1.253	0.210
Standard errors: MLE

logLik(modelo5)

## 'log Lik.' -79.8833 (df=4)

1.7 Referências

Bruin, J. 2006. newtest: command to compute new test. UCLA:
Statistical Consulting Group. https://stats.idre.ucla.edu/stata/ado/analysis/.

Logit

Camila

25/11/2020