##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Warning: package 'stargazer' was built under R version 4.0.5
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
df.despesas_alimentos <- read.csv("POF08_DespesaAlimentos.csv")
As variáveis de interesse são:
rendapc: renda mensal domiciliar per capita (R$)
despalimpc: despesa mensal domiciliar per capita com alimentos (R$)
anosestudo: anos de escolaridade da pessoa responsável pelo domicÃlio
As questões relacionam-se à identificação, análise e correção da heterocedasticidade no modelo:
\[ ln(despalimpc) = \alpha + \beta_1:; ln(rendapc) + \beta_2 anosestudo + erro \]
summary(df.despesas_alimentos)
## uf fator n_morador renda
## Min. :11.00 Min. : 3.827 Min. : 1.000 Min. : 17.0
## 1st Qu.:23.00 1st Qu.: 330.019 1st Qu.: 2.000 1st Qu.: 808.3
## Median :31.00 Median : 681.833 Median : 3.000 Median : 1368.8
## Mean :31.13 Mean : 1048.095 Mean : 3.434 Mean : 2325.6
## 3rd Qu.:41.00 3rd Qu.: 1256.922 3rd Qu.: 4.000 3rd Qu.: 2517.6
## Max. :53.00 Max. :19521.859 Max. :20.000 Max. :117219.2
## rendapc idade anosestudo mulher
## Min. : 3.47 Min. : 12.00 Min. : 0.00 Min. :0.0000
## 1st Qu.: 253.03 1st Qu.: 35.00 1st Qu.: 3.00 1st Qu.:0.0000
## Median : 477.98 Median : 45.00 Median : 6.00 Median :0.0000
## Mean : 838.42 Mean : 47.23 Mean : 7.05 Mean :0.3225
## 3rd Qu.: 891.83 3rd Qu.: 58.00 3rd Qu.:11.00 3rd Qu.:1.0000
## Max. :87430.75 Max. :103.00 Max. :88.00 Max. :1.0000
## ano despalimtot despalimpc
## Min. :2008 Min. : 0.867 Min. : 0.368
## 1st Qu.:2008 1st Qu.: 135.590 1st Qu.: 43.333
## Median :2008 Median : 272.025 Median : 87.179
## Mean :2008 Mean : 378.697 Mean : 131.064
## 3rd Qu.:2008 3rd Qu.: 492.873 3rd Qu.: 165.383
## Max. :2008 Max. :15634.450 Max. :3547.657
model.mqo <- lm(log(despalimpc) ~ log(rendapc) + anosestudo, df.despesas_alimentos)
summary(model.mqo)
##
## Call:
## lm(formula = log(despalimpc) ~ log(rendapc) + anosestudo, data = df.despesas_alimentos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.4743 -0.4939 0.1296 0.6385 3.5703
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.2573714 0.0261836 48.021 < 2e-16 ***
## log(rendapc) 0.5023812 0.0042787 117.416 < 2e-16 ***
## anosestudo 0.0022899 0.0005286 4.332 1.48e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9526 on 52615 degrees of freedom
## Multiple R-squared: 0.2197, Adjusted R-squared: 0.2197
## F-statistic: 7409 on 2 and 52615 DF, p-value: < 2.2e-16
model <- lm(log(despalimpc) ~ log(rendapc) +
anosestudo, data = df.despesas_alimentos)
stargazer::stargazer(model.mqo,
title = "Resultado",
dep.var.labels = c("log(despesalimppc)"),
out="models.htm", type = "text", header=FALSE,
covariate.labels = c("", "log(rendapc)",
"anosestudo"))
##
## Resultado
## ================================================
## Dependent variable:
## ----------------------------
## log(despesalimppc)
## ------------------------------------------------
## 0.502***
## (0.004)
##
## log(rendapc) 0.002***
## (0.001)
##
## anosestudo 1.257***
## (0.026)
##
## ------------------------------------------------
## Observations 52,618
## R2 0.220
## Adjusted R2 0.220
## Residual Std. Error 0.953 (df = 52615)
## F Statistic 7,409.039*** (df = 2; 52615)
## ================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
resid_auxpanel(residuals = resid(model.mqo),
predicted = fitted(model),
plots = c("resid", "index"))
ggplot(df.despesas_alimentos, aes(x=log(rendapc)))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df.despesas_alimentos%>% ggplot(aes(sample = log(rendapc))) + geom_qq()
ggplot(df.despesas_alimentos, aes(x=rendapc))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df.despesas_alimentos%>% ggplot(aes(sample = rendapc)) + geom_qq()
echapeu <- resid(model.mqo)
echapeu2 <- echapeu^2
plot(log(df.despesas_alimentos$rendapc), echapeu2)
plot(df.despesas_alimentos$anosestudo, echapeu2)
bp <- lm(echapeu2 ~ log(rendapc) + anosestudo, data=df.despesas_alimentos)
summary(bp)
##
## Call:
## lm(formula = echapeu2 ~ log(rendapc) + anosestudo, data = df.despesas_alimentos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9907 -0.8259 -0.5626 0.0974 29.0939
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0213989 0.0466970 21.873 < 2e-16 ***
## log(rendapc) -0.0117967 0.0076307 -1.546 0.122
## anosestudo -0.0058356 0.0009428 -6.190 6.07e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.699 on 52615 degrees of freedom
## Multiple R-squared: 0.0009038, Adjusted R-squared: 0.0008658
## F-statistic: 23.8 on 2 and 52615 DF, p-value: 4.667e-11
Teste de BP
n <- length(echapeu2) # tamanho da amostra
r2_bp <- summary(bp)$r.squared # R2 do ajuste auxiliar
lm_bp <- n * r2_bp # estatÃstica LM
lm_bp
## [1] 47.55692
p_bp <- pchisq(lm_bp, 2, lower.tail = FALSE)# valor p, em qui-quadrado com 2 gl (variáveis independentes do modelo auxiliar)
p_bp
## [1] 4.711361e-11
white <- lm(echapeu2 ~ log(rendapc) + anosestudo +
I((log(rendapc))^2) + I(anosestudo^2) +
I((log(rendapc))*anosestudo), data=df.despesas_alimentos)
summary(white)
##
## Call:
## lm(formula = echapeu2 ~ log(rendapc) + anosestudo + I((log(rendapc))^2) +
## I(anosestudo^2) + I((log(rendapc)) * anosestudo), data = df.despesas_alimentos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8130 -0.8117 -0.5601 0.1005 29.1309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.257e+00 1.868e-01 6.726 1.77e-11 ***
## log(rendapc) -1.236e-01 6.114e-02 -2.022 0.04319 *
## anosestudo 1.300e-02 7.055e-03 1.843 0.06533 .
## I((log(rendapc))^2) 1.344e-02 5.073e-03 2.648 0.00809 **
## I(anosestudo^2) 1.485e-04 2.635e-05 5.638 1.73e-08 ***
## I((log(rendapc)) * anosestudo) -4.856e-03 1.075e-03 -4.518 6.26e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.698 on 52612 degrees of freedom
## Multiple R-squared: 0.002014, Adjusted R-squared: 0.001919
## F-statistic: 21.24 on 5 and 52612 DF, p-value: < 2.2e-16
r2_white <- summary(white)$r.squared # R2 do ajuste auxiliar
lm_white <- n * r2_white # estatÃstica LM
lm_white
## [1] 105.9873
p_white <- pchisq(lm_white, 5, lower.tail = FALSE) # valor p, em qui-quadrado com 5 gl
p_white
## [1] 2.884581e-21
# "HC0": Eicker-Huber-White standard errors
coeftest(model.mqo, vcov = vcovHC(model.mqo, type = "HC0"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.25737137 0.02629352 47.821 < 2.2e-16 ***
## log(rendapc) 0.50238125 0.00429123 117.072 < 2.2e-16 ***
## anosestudo 0.00228985 0.00049499 4.626 3.736e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# compute the square root of the diagonal elements in vcov
robust_se <- sqrt(diag(coeftest(model.mqo, vcov = vcovHC(model.mqo, type = "HC0"))))
robust_se
## [1] 1.12132572 0.06550745 2.15081868
# "HC1": MacKinnon and White(1985) standard errors
# default in the statistics package STATA
coeftest(model.mqo, vcov = vcovHC(model.mqo, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.25737137 0.02629427 47.8192 < 2.2e-16 ***
## log(rendapc) 0.50238125 0.00429135 117.0684 < 2.2e-16 ***
## anosestudo 0.00228985 0.00049501 4.6259 3.739e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# compute the square root of the diagonal elements in vcov
robust_se <- sqrt(diag(coeftest(model.mqo, vcov = vcovHC(model.mqo, type = "HC1"))))
robust_se
## [1] 1.12132572 0.06550839 2.15078802
Pressuposto:
\[ var(e)=log(rendapc)*\sigma^2 \]
, ou seja, \[v=log(rendapc)\]
pois, \[weights = 1/v\],
logo, \[weights=1/log(rendapc)\]
mqp <- lm(log(despalimpc) ~ log(rendapc) +
anosestudo, data=df.despesas_alimentos, weights=1/log(rendapc))
summary(mqp)
##
## Call:
## lm(formula = log(despalimpc) ~ log(rendapc) + anosestudo, data = df.despesas_alimentos,
## weights = 1/log(rendapc))
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.14446 -0.19991 0.05245 0.25742 1.99450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.3321908 0.0253041 52.65 < 2e-16 ***
## log(rendapc) 0.4901035 0.0042367 115.68 < 2e-16 ***
## anosestudo 0.0024435 0.0005266 4.64 3.49e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3894 on 52615 degrees of freedom
## Multiple R-squared: 0.2136, Adjusted R-squared: 0.2136
## F-statistic: 7145 on 2 and 52615 DF, p-value: < 2.2e-16
# load scales package for adjusting color opacities
library(scales)
# generate some heteroskedastic data:
# set seed for reproducibility
set.seed(123)
# set up vector of x coordinates
x <- rep(c(10, 15, 20, 25), each = 25)
# initialize vector of errors
e <- c()
# sample 100 errors such that the variance increases with x
e[1:25] <- rnorm(25, sd = 10)
e[26:50] <- rnorm(25, sd = 15)
e[51:75] <- rnorm(25, sd = 20)
e[76:100] <- rnorm(25, sd = 25)
# set up y
y <- 720 - 3.3 * x + e
# Estimate the model
mod <- lm(y ~ x)
# Plot the data
plot(x = x,
y = y,
main = "An Example of Heteroskedasticity",
xlab = "Student-Teacher Ratio",
ylab = "Test Score",
cex = 0.5,
pch = 19,
xlim = c(8, 27),
ylim = c(600, 710))
# Add the regression line to the plot
abline(mod, col = "darkred")
# Add boxplots to the plot
boxplot(formula = y ~ x,
add = TRUE,
at = c(10, 15, 20, 25),
col = alpha("gray", 0.4),
border = "black")
# load package and attach data
#install.packages("AER")
library(AER)
## Loading required package: car
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.0.5
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.0.5
data("CPSSWEducation")
attach(CPSSWEducation)
# get an overview
summary(CPSSWEducation)
## age gender earnings education
## Min. :29.0 female:1202 Min. : 2.137 Min. : 6.00
## 1st Qu.:29.0 male :1748 1st Qu.:10.577 1st Qu.:12.00
## Median :29.0 Median :14.615 Median :13.00
## Mean :29.5 Mean :16.743 Mean :13.55
## 3rd Qu.:30.0 3rd Qu.:20.192 3rd Qu.:16.00
## Max. :30.0 Max. :97.500 Max. :18.00
# estimate a simple regression model
labor_model <- lm(earnings ~ education)
# plot observations and add the regression line
plot(education,
earnings,
ylim = c(0, 150))
abline(labor_model,
col = "steelblue",
lwd = 2)
# print the contents of labor_model to the console
labor_model
##
## Call:
## lm(formula = earnings ~ education)
##
## Coefficients:
## (Intercept) education
## -3.134 1.467
# compute a 95% confidence interval for the coefficients in the model
confint(labor_model)
## 2.5 % 97.5 %
## (Intercept) -5.015248 -1.253495
## education 1.330098 1.603753
# Store model summary in 'model'
model <- summary(labor_model)
# Extract the standard error of the regression from model summary
SER <- model$sigma
# Compute the variation in 'education'
V <- (nrow(CPSSWEducation)-1) * var(education)
# Compute the standard error of the slope parameter's estimator and print it
SE.beta_1.hat <- sqrt(SER^2/V)
SE.beta_1.hat
## [1] 0.06978281
# Use logical operators to see if the value computed by hand matches the one provided
# in mod$coefficients. Round estimates to four decimal places
round(model$coefficients[2, 2], 4) == round(SE.beta_1.hat, 4)
## [1] TRUE
# compute heteroskedasticity-robust standard errors
linear_model <- model.mqo
vcov <- vcovHC(linear_model, type = "HC0")
vcov
## (Intercept) log(rendapc) anosestudo
## (Intercept) 6.913493e-04 -1.103298e-04 1.437754e-06
## log(rendapc) -1.103298e-04 1.841463e-05 -5.299338e-07
## anosestudo 1.437754e-06 -5.299338e-07 2.450193e-07
# compute the square root of the diagonal elements in vcov
robust_se <- sqrt(diag(vcov))
robust_se
## (Intercept) log(rendapc) anosestudo
## 0.0262935227 0.0042912266 0.0004949943
# we invoke the function `coeftest()` on our model
coeftest(linear_model, vcov. = vcov)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.25737137 0.02629352 47.821 < 2.2e-16 ***
## log(rendapc) 0.50238125 0.00429123 117.072 < 2.2e-16 ***
## anosestudo 0.00228985 0.00049499 4.626 3.736e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vcov <- vcovHC(linear_model, type = "HC1")
vcov
## (Intercept) log(rendapc) anosestudo
## (Intercept) 6.913888e-04 -1.103361e-04 1.437836e-06
## log(rendapc) -1.103361e-04 1.841568e-05 -5.299641e-07
## anosestudo 1.437836e-06 -5.299641e-07 2.450333e-07
# compute the square root of the diagonal elements in vcov
robust_se <- sqrt(diag(vcov))
robust_se
## (Intercept) log(rendapc) anosestudo
## 0.0262942723 0.0042913489 0.0004950084
# we invoke the function `coeftest()` on our model
coeftest(linear_model, vcov. = vcov)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.25737137 0.02629427 47.8192 < 2.2e-16 ***
## log(rendapc) 0.50238125 0.00429135 117.0684 < 2.2e-16 ***
## anosestudo 0.00228985 0.00049501 4.6259 3.739e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1