Heterocedasticidade

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Warning: package 'lmtest' was built under R version 4.0.5

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Warning: package 'stargazer' was built under R version 4.0.5

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

df.despesas_alimentos <- read.csv("POF08_DespesaAlimentos.csv")

Dados

As variáveis de interesse são:

rendapc: renda mensal domiciliar per capita (R$)

despalimpc: despesa mensal domiciliar per capita com alimentos (R$)

anosestudo: anos de escolaridade da pessoa responsável pelo domicílio

As questões relacionam-se à identificação, análise e correção da heterocedasticidade no modelo:

\[ ln(despalimpc) = \alpha + \beta_1:; ln(rendapc) + \beta_2 anosestudo + erro \]

summary(df.despesas_alimentos)

##        uf            fator             n_morador          renda         
##  Min.   :11.00   Min.   :    3.827   Min.   : 1.000   Min.   :    17.0  
##  1st Qu.:23.00   1st Qu.:  330.019   1st Qu.: 2.000   1st Qu.:   808.3  
##  Median :31.00   Median :  681.833   Median : 3.000   Median :  1368.8  
##  Mean   :31.13   Mean   : 1048.095   Mean   : 3.434   Mean   :  2325.6  
##  3rd Qu.:41.00   3rd Qu.: 1256.922   3rd Qu.: 4.000   3rd Qu.:  2517.6  
##  Max.   :53.00   Max.   :19521.859   Max.   :20.000   Max.   :117219.2  
##     rendapc             idade          anosestudo        mulher      
##  Min.   :    3.47   Min.   : 12.00   Min.   : 0.00   Min.   :0.0000  
##  1st Qu.:  253.03   1st Qu.: 35.00   1st Qu.: 3.00   1st Qu.:0.0000  
##  Median :  477.98   Median : 45.00   Median : 6.00   Median :0.0000  
##  Mean   :  838.42   Mean   : 47.23   Mean   : 7.05   Mean   :0.3225  
##  3rd Qu.:  891.83   3rd Qu.: 58.00   3rd Qu.:11.00   3rd Qu.:1.0000  
##  Max.   :87430.75   Max.   :103.00   Max.   :88.00   Max.   :1.0000  
##       ano        despalimtot          despalimpc      
##  Min.   :2008   Min.   :    0.867   Min.   :   0.368  
##  1st Qu.:2008   1st Qu.:  135.590   1st Qu.:  43.333  
##  Median :2008   Median :  272.025   Median :  87.179  
##  Mean   :2008   Mean   :  378.697   Mean   : 131.064  
##  3rd Qu.:2008   3rd Qu.:  492.873   3rd Qu.: 165.383  
##  Max.   :2008   Max.   :15634.450   Max.   :3547.657

Modelo RLM-MQO

model.mqo <- lm(log(despalimpc) ~ log(rendapc) + anosestudo, df.despesas_alimentos)
summary(model.mqo)

## 
## Call:
## lm(formula = log(despalimpc) ~ log(rendapc) + anosestudo, data = df.despesas_alimentos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4743 -0.4939  0.1296  0.6385  3.5703 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.2573714  0.0261836  48.021  < 2e-16 ***
## log(rendapc) 0.5023812  0.0042787 117.416  < 2e-16 ***
## anosestudo   0.0022899  0.0005286   4.332 1.48e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9526 on 52615 degrees of freedom
## Multiple R-squared:  0.2197, Adjusted R-squared:  0.2197 
## F-statistic:  7409 on 2 and 52615 DF,  p-value: < 2.2e-16

model <- lm(log(despalimpc) ~ log(rendapc) + 
              anosestudo, data = df.despesas_alimentos)
stargazer::stargazer(model.mqo,
 title = "Resultado",
 dep.var.labels = c("log(despesalimppc)"),
 out="models.htm", type = "text", header=FALSE, 
 covariate.labels = c("", "log(rendapc)",
 "anosestudo"))

## 
## Resultado
## ================================================
##                         Dependent variable:     
##                     ----------------------------
##                          log(despesalimppc)     
## ------------------------------------------------
##                               0.502***          
##                               (0.004)           
##                                                 
## log(rendapc)                  0.002***          
##                               (0.001)           
##                                                 
## anosestudo                    1.257***          
##                               (0.026)           
##                                                 
## ------------------------------------------------
## Observations                   52,618           
## R2                             0.220            
## Adjusted R2                    0.220            
## Residual Std. Error      0.953 (df = 52615)     
## F Statistic         7,409.039*** (df = 2; 52615)
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01

resid_auxpanel(residuals = resid(model.mqo), 
 predicted = fitted(model), 
 plots = c("resid", "index"))

ggplot(df.despesas_alimentos, aes(x=log(rendapc)))+geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

df.despesas_alimentos%>% ggplot(aes(sample = log(rendapc))) + geom_qq()

ggplot(df.despesas_alimentos, aes(x=rendapc))+geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

df.despesas_alimentos%>% ggplot(aes(sample = rendapc)) + geom_qq()

Plots

echapeu <- resid(model.mqo)
echapeu2 <- echapeu^2
plot(log(df.despesas_alimentos$rendapc), echapeu2)

plot(df.despesas_alimentos$anosestudo, echapeu2)

Modelo auxiliar para teste de Breusch-Pagan

bp <- lm(echapeu2 ~ log(rendapc) + anosestudo, data=df.despesas_alimentos)
summary(bp)

## 
## Call:
## lm(formula = echapeu2 ~ log(rendapc) + anosestudo, data = df.despesas_alimentos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9907 -0.8259 -0.5626  0.0974 29.0939 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.0213989  0.0466970  21.873  < 2e-16 ***
## log(rendapc) -0.0117967  0.0076307  -1.546    0.122    
## anosestudo   -0.0058356  0.0009428  -6.190 6.07e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.699 on 52615 degrees of freedom
## Multiple R-squared:  0.0009038,  Adjusted R-squared:  0.0008658 
## F-statistic:  23.8 on 2 and 52615 DF,  p-value: 4.667e-11

Calcula estatística LM (n*R2)

Teste de BP

n <- length(echapeu2)              # tamanho da amostra
r2_bp <- summary(bp)$r.squared     # R2 do ajuste auxiliar
lm_bp <- n * r2_bp                 # estatística LM
lm_bp

## [1] 47.55692

Probabilidade de erro ao rejeitar homocedasticidade

p_bp <- pchisq(lm_bp, 2, lower.tail = FALSE)# valor p, em qui-quadrado com 2 gl (variáveis independentes do modelo auxiliar)
p_bp

## [1] 4.711361e-11

Modelo auxiliar para teste de White

white <- lm(echapeu2 ~ log(rendapc) + anosestudo + 
              I((log(rendapc))^2) + I(anosestudo^2) + 
              I((log(rendapc))*anosestudo), data=df.despesas_alimentos)
summary(white)

## 
## Call:
## lm(formula = echapeu2 ~ log(rendapc) + anosestudo + I((log(rendapc))^2) + 
##     I(anosestudo^2) + I((log(rendapc)) * anosestudo), data = df.despesas_alimentos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8130 -0.8117 -0.5601  0.1005 29.1309 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     1.257e+00  1.868e-01   6.726 1.77e-11 ***
## log(rendapc)                   -1.236e-01  6.114e-02  -2.022  0.04319 *  
## anosestudo                      1.300e-02  7.055e-03   1.843  0.06533 .  
## I((log(rendapc))^2)             1.344e-02  5.073e-03   2.648  0.00809 ** 
## I(anosestudo^2)                 1.485e-04  2.635e-05   5.638 1.73e-08 ***
## I((log(rendapc)) * anosestudo) -4.856e-03  1.075e-03  -4.518 6.26e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.698 on 52612 degrees of freedom
## Multiple R-squared:  0.002014,   Adjusted R-squared:  0.001919 
## F-statistic: 21.24 on 5 and 52612 DF,  p-value: < 2.2e-16

Calcula estatística LM (n*R2) - Teste de White

r2_white <- summary(white)$r.squared      # R2 do ajuste auxiliar
lm_white <- n * r2_white                  # estatística LM
lm_white

## [1] 105.9873

Probabilidade de erro ao rejeitar homocedasticidade

p_white <- pchisq(lm_white, 5, lower.tail = FALSE)  # valor p, em qui-quadrado com 5 gl
p_white

## [1] 2.884581e-21

Teste de hipótese para betas usando estimador robusto da variância de White

# "HC0": Eicker-Huber-White standard errors

coeftest(model.mqo, vcov = vcovHC(model.mqo, type = "HC0"))

## 
## t test of coefficients:
## 
##                Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  1.25737137 0.02629352  47.821 < 2.2e-16 ***
## log(rendapc) 0.50238125 0.00429123 117.072 < 2.2e-16 ***
## anosestudo   0.00228985 0.00049499   4.626 3.736e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# compute the square root of the diagonal elements in vcov
robust_se <- sqrt(diag(coeftest(model.mqo, vcov = vcovHC(model.mqo, type = "HC0"))))
robust_se

## [1] 1.12132572 0.06550745 2.15081868

# "HC1":  MacKinnon and White(1985) standard errors
# default in the statistics package STATA

coeftest(model.mqo, vcov = vcovHC(model.mqo, type = "HC1"))

## 
## t test of coefficients:
## 
##                Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)  1.25737137 0.02629427  47.8192 < 2.2e-16 ***
## log(rendapc) 0.50238125 0.00429135 117.0684 < 2.2e-16 ***
## anosestudo   0.00228985 0.00049501   4.6259 3.739e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# compute the square root of the diagonal elements in vcov
robust_se <- sqrt(diag(coeftest(model.mqo, vcov = vcovHC(model.mqo, type = "HC1"))))
robust_se

## [1] 1.12132572 0.06550839 2.15078802

Modelo por MQP, usando opção weights da função lm

Pressuposto:

\[ var(e)=log(rendapc)*\sigma^2 \]

, ou seja, \[v=log(rendapc)\]

pois, \[weights = 1/v\],

logo, \[weights=1/log(rendapc)\]

mqp <- lm(log(despalimpc) ~ log(rendapc) + 
            anosestudo, data=df.despesas_alimentos, weights=1/log(rendapc))
summary(mqp)

## 
## Call:
## lm(formula = log(despalimpc) ~ log(rendapc) + anosestudo, data = df.despesas_alimentos, 
##     weights = 1/log(rendapc))
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.14446 -0.19991  0.05245  0.25742  1.99450 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.3321908  0.0253041   52.65  < 2e-16 ***
## log(rendapc) 0.4901035  0.0042367  115.68  < 2e-16 ***
## anosestudo   0.0024435  0.0005266    4.64 3.49e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3894 on 52615 degrees of freedom
## Multiple R-squared:  0.2136, Adjusted R-squared:  0.2136 
## F-statistic:  7145 on 2 and 52615 DF,  p-value: < 2.2e-16

# load scales package for adjusting color opacities
library(scales)

# generate some heteroskedastic data:

# set seed for reproducibility
set.seed(123) 

# set up vector of x coordinates
x <- rep(c(10, 15, 20, 25), each = 25)

# initialize vector of errors
e <- c()

# sample 100 errors such that the variance increases with x
e[1:25] <- rnorm(25, sd = 10)
e[26:50] <- rnorm(25, sd = 15)
e[51:75] <- rnorm(25, sd = 20)
e[76:100] <- rnorm(25, sd = 25)

# set up y
y <- 720 - 3.3 * x + e

# Estimate the model 
mod <- lm(y ~ x)

# Plot the data
plot(x = x, 
     y = y, 
     main = "An Example of Heteroskedasticity",
     xlab = "Student-Teacher Ratio",
     ylab = "Test Score",
     cex = 0.5, 
     pch = 19, 
     xlim = c(8, 27), 
     ylim = c(600, 710))

# Add the regression line to the plot
abline(mod, col = "darkred")

# Add boxplots to the plot
boxplot(formula = y ~ x, 
        add = TRUE, 
        at = c(10, 15, 20, 25), 
        col = alpha("gray", 0.4), 
        border = "black")

# load package and attach data
#install.packages("AER")
library(AER)

## Loading required package: car

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.0.5

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## Loading required package: survival

## Warning: package 'survival' was built under R version 4.0.5

data("CPSSWEducation")
attach(CPSSWEducation)

# get an overview
summary(CPSSWEducation)

##       age          gender        earnings        education    
##  Min.   :29.0   female:1202   Min.   : 2.137   Min.   : 6.00  
##  1st Qu.:29.0   male  :1748   1st Qu.:10.577   1st Qu.:12.00  
##  Median :29.0                 Median :14.615   Median :13.00  
##  Mean   :29.5                 Mean   :16.743   Mean   :13.55  
##  3rd Qu.:30.0                 3rd Qu.:20.192   3rd Qu.:16.00  
##  Max.   :30.0                 Max.   :97.500   Max.   :18.00

# estimate a simple regression model
labor_model <- lm(earnings ~ education)

# plot observations and add the regression line
plot(education, 
     earnings, 
     ylim = c(0, 150))

abline(labor_model, 
       col = "steelblue", 
       lwd = 2)

# print the contents of labor_model to the console
labor_model

## 
## Call:
## lm(formula = earnings ~ education)
## 
## Coefficients:
## (Intercept)    education  
##      -3.134        1.467

# compute a 95% confidence interval for the coefficients in the model
confint(labor_model)

##                 2.5 %    97.5 %
## (Intercept) -5.015248 -1.253495
## education    1.330098  1.603753

# Store model summary in 'model'
model <- summary(labor_model)

# Extract the standard error of the regression from model summary
SER <- model$sigma

# Compute the variation in 'education'
V <- (nrow(CPSSWEducation)-1) * var(education)

# Compute the standard error of the slope parameter's estimator and print it
SE.beta_1.hat <- sqrt(SER^2/V)
SE.beta_1.hat

## [1] 0.06978281

# Use logical operators to see if the value computed by hand matches the one provided 
# in mod$coefficients. Round estimates to four decimal places
round(model$coefficients[2, 2], 4) == round(SE.beta_1.hat, 4)

## [1] TRUE

# compute heteroskedasticity-robust standard errors
linear_model <- model.mqo
vcov <- vcovHC(linear_model, type = "HC0")
vcov

##                (Intercept)  log(rendapc)    anosestudo
## (Intercept)   6.913493e-04 -1.103298e-04  1.437754e-06
## log(rendapc) -1.103298e-04  1.841463e-05 -5.299338e-07
## anosestudo    1.437754e-06 -5.299338e-07  2.450193e-07

# compute the square root of the diagonal elements in vcov
robust_se <- sqrt(diag(vcov))
robust_se

##  (Intercept) log(rendapc)   anosestudo 
## 0.0262935227 0.0042912266 0.0004949943

# we invoke the function `coeftest()` on our model
coeftest(linear_model, vcov. = vcov)

## 
## t test of coefficients:
## 
##                Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  1.25737137 0.02629352  47.821 < 2.2e-16 ***
## log(rendapc) 0.50238125 0.00429123 117.072 < 2.2e-16 ***
## anosestudo   0.00228985 0.00049499   4.626 3.736e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

vcov <- vcovHC(linear_model, type = "HC1")
vcov

##                (Intercept)  log(rendapc)    anosestudo
## (Intercept)   6.913888e-04 -1.103361e-04  1.437836e-06
## log(rendapc) -1.103361e-04  1.841568e-05 -5.299641e-07
## anosestudo    1.437836e-06 -5.299641e-07  2.450333e-07

# compute the square root of the diagonal elements in vcov
robust_se <- sqrt(diag(vcov))
robust_se

##  (Intercept) log(rendapc)   anosestudo 
## 0.0262942723 0.0042913489 0.0004950084

# we invoke the function `coeftest()` on our model
coeftest(linear_model, vcov. = vcov)

## 
## t test of coefficients:
## 
##                Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)  1.25737137 0.02629427  47.8192 < 2.2e-16 ***
## log(rendapc) 0.50238125 0.00429135 117.0684 < 2.2e-16 ***
## anosestudo   0.00228985 0.00049501   4.6259 3.739e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1