knitr::opts _chunk$set(echo =VERDADERO)
req_pkgs <- c("wooldridge","tidyverse","car","lmtest","sandwich",
"broom","MASS","ggpubr","moments","performance","knitr","modelsummary")
inst <- req_pkgs[!req_pkgs %in% installed.packages()[,"Package"]]
if(length(inst)) install.packages(inst, dependencies = TRUE)
invisible(lapply(req_pkgs, library, character.only = TRUE))
## Warning: package 'wooldridge' was built under R version 4.4.3
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.2
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.2
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'stringr' was built under R version 4.4.2
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'car' was built under R version 4.4.3
## Cargando paquete requerido: carData
## Warning: package 'carData' was built under R version 4.4.2
##
## Adjuntando el paquete: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
## Warning: package 'lmtest' was built under R version 4.4.3
## Cargando paquete requerido: zoo
## Warning: package 'zoo' was built under R version 4.4.2
##
## Adjuntando el paquete: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Warning: package 'sandwich' was built under R version 4.4.3
## Warning: package 'broom' was built under R version 4.4.3
##
## Adjuntando el paquete: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## The following object is masked from 'package:wooldridge':
##
## cement
## Warning: package 'ggpubr' was built under R version 4.4.3
## Warning: package 'performance' was built under R version 4.4.3
## Warning: package 'knitr' was built under R version 4.4.3
## Warning: package 'modelsummary' was built under R version 4.4.3
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
library(dplyr)
##Base de datos
data("wage1")
df <- wage1
y_var <- "wage"
x_vars <- c("educ", "exper", "tenure", "female", "married", "expersq")
##Cagar y preparar datos
df_selected <- df[, c(y_var, x_vars)] # selecciona columnas
df_selected <- na.omit(df_selected) # elimina NA
df_selected <- df_selected %>% mutate(across(everything(), as.numeric))
head(df_selected)
## wage educ exper tenure female married expersq
## 1 3.10 11 2 0 1 0 4
## 2 3.24 12 22 2 1 1 484
## 3 3.00 11 2 0 0 0 4
## 4 6.00 8 44 28 0 1 1936
## 5 5.30 12 7 2 0 1 49
## 6 8.75 16 9 8 0 1 81
##Análisis exploratorio: media, mediana, sd, min, max, skewness, kurtosis
tabla_exploratoria <- df_selected %>%
summarise(
media = sapply(., mean, na.rm = TRUE),
mediana = sapply(., median, na.rm = TRUE),
sd = sapply(., sd, na.rm = TRUE),
minimo = sapply(., min, na.rm = TRUE),
maximo = sapply(., max, na.rm = TRUE),
asimetria = sapply(., skewness, na.rm = TRUE),
kurtosis = sapply(., kurtosis, na.rm = TRUE)
) %>%
t() %>%
as.data.frame()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
colnames(tabla_exploratoria) <- names(df_selected)
rownames(tabla_exploratoria) <- c("Media", "Mediana", "Desv.Est.", "Mínimo", "Máximo", "Asimetría", "Curtosis")
##Mostrar tabla en Markdown/Rpubs
kable(tabla_exploratoria, caption = "Estadísticos descriptivos de wage, educ, exper, tenure")
| wage | educ | exper | tenure | female | married | expersq | |
|---|---|---|---|---|---|---|---|
| Media | 5.896103 | 12.5627376 | 17.0171103 | 5.104563 | 0.4790875 | 0.6083650 | 473.435361 |
| Mediana | 4.650000 | 12.0000000 | 13.5000000 | 2.000000 | 0.0000000 | 1.0000000 | 182.500000 |
| Desv.Est. | 3.693086 | 2.7690224 | 13.5721596 | 7.224462 | 0.5000380 | 0.4885804 | 616.044772 |
| Mínimo | 0.530000 | 0.0000000 | 1.0000000 | 0.000000 | 0.0000000 | 0.0000000 | 1.000000 |
| Máximo | 24.980000 | 18.0000000 | 51.0000000 | 44.000000 | 1.0000000 | 1.0000000 | 2601.000000 |
| Asimetría | 2.007325 | -0.6195741 | 0.7068652 | 2.110273 | 0.0837235 | -0.4440136 | 1.492432 |
| Curtosis | 7.970083 | 4.8842450 | 2.3573176 | 7.658076 | 1.0070096 | 1.1971481 | 4.296473 |
##Fórmula y estimación OLS
fml <- as.formula(paste("log(", y_var, ") ~", paste(x_vars, collapse = " + ")))
m_log <- lm(fml, data = df)
##Resumen y R^2
summary(m_log)
##
## Call:
## lm(formula = fml, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.81854 -0.25682 -0.02526 0.24755 1.18148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4158424 0.0993372 4.186 3.33e-05 ***
## educ 0.0798322 0.0068273 11.693 < 2e-16 ***
## exper 0.0300995 0.0051931 5.796 1.18e-08 ***
## tenure 0.0160739 0.0028801 5.581 3.86e-08 ***
## female -0.2911303 0.0362832 -8.024 6.88e-15 ***
## married 0.0564494 0.0409259 1.379 0.168
## expersq -0.0006012 0.0001099 -5.472 6.95e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4014 on 519 degrees of freedom
## Multiple R-squared: 0.4361, Adjusted R-squared: 0.4296
## F-statistic: 66.91 on 6 and 519 DF, p-value: < 2.2e-16
#Tabla del summary
modelsummary(m_log, statistic = "({std.error})", title = "Resultados del modelo OLS")
| (1) | |
|---|---|
| (Intercept) | 0.416 |
| (0.099) | |
| educ | 0.080 |
| (0.007) | |
| exper | 0.030 |
| (0.005) | |
| tenure | 0.016 |
| (0.003) | |
| female | -0.291 |
| (0.036) | |
| married | 0.056 |
| (0.041) | |
| expersq | -0.001 |
| (0.000) | |
| Num.Obs. | 526 |
| R2 | 0.436 |
| R2 Adj. | 0.430 |
| AIC | 2249.2 |
| BIC | 2283.3 |
| Log.Lik. | -262.755 |
| F | 66.906 |
| RMSE | 0.40 |
#Linealidad (gràficos y pruebas)
car::crPlots(m_log)
##Residuos parciales con ggplot
hacer_grafico_pr <- function(modelo, x){
aug <- broom::augment(modelo)
beta_x <- coef(modelo)[x]
if(is.na(beta_x)) stop("La variable no está en el modelo.")
aug |>
mutate(partial = .resid + !!sym(x)*beta_x) |>
ggplot(aes_string(x = x, y = "partial")) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE) +
labs(title = paste("Gráfico de residuos parciales:", x),
y = paste("e +", round(beta_x,4), "*", x))
}
print(hacer_grafico_pr(m_log, x_vars[1]))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
reset_lm <- lmtest::resettest(m_log, power = 2:3, type = "fitted")
reset_lm
##
## RESET test
##
## data: m_log
## RESET = 7.0291, df1 = 2, df2 = 517, p-value = 0.0009729
#Homoedasticidad
bp <- lmtest::bptest(m_log)
bp
##
## studentized Breusch-Pagan test
##
## data: m_log
## BP = 17.023, df = 6, p-value = 0.009197
X <- model.matrix(m_log)
white <- lmtest::bptest(m_log, ~ fitted(m_log) + I(fitted(m_log)^2))
white
##
## studentized Breusch-Pagan test
##
## data: m_log
## BP = 5.1463, df = 2, p-value = 0.0763
broom::augment(m_log) |>
ggplot(aes(.fitted, .resid)) +
geom_hline(yintercept = 0, linetype = 2) +
geom_point(alpha = 0.6) +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Residuos vs Ajustados", x = "Ajustados", y = "Residuos")
## `geom_smooth()` using formula = 'y ~ x'
#Normalidad en los residuos
if(nrow(df) <= 5000){
print(shapiro.test(residuals(m_log)))
}
##
## Shapiro-Wilk normality test
##
## data: residuals(m_log)
## W = 0.98685, p-value = 0.0001111
jb <- tseries::jarque.bera.test(residuals(m_log))
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
jb
##
## Jarque Bera Test
##
## data: residuals(m_log)
## X-squared = 26.035, df = 2, p-value = 2.221e-06
ggpubr::ggqqplot(residuals(m_log)) + ggtitle("QQ-plot de residuos")
#Multicolinealidad
vifs <- car::vif(m_log)
vifs
## educ exper tenure female married expersq
## 1.164312 16.183797 1.410458 1.072362 1.302549 14.924497
kappa_X <- kappa(model.matrix(m_log))
kappa_X
## [1] 4635.054
#Soluciones econométricas si los supuestos fallan # SE robustos a heterocedasticidad
coeftest(m_log, vcov = sandwich::vcovHC(m_log, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.41584244 0.10658752 3.9014 0.0001082 ***
## educ 0.07983221 0.00765428 10.4298 < 2.2e-16 ***
## exper 0.03009952 0.00488331 6.1638 1.428e-09 ***
## tenure 0.01607388 0.00348398 4.6137 4.991e-06 ***
## female -0.29113027 0.03686784 -7.8966 1.720e-14 ***
## married 0.05644940 0.04170194 1.3536 0.1764406
## expersq -0.00060115 0.00010185 -5.9023 6.478e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
w <- 1 / (fitted(m_log)^2 + 1e-8)
m_wls <- lm(fml, data = df, weights = w)
summary(m_wls)
##
## Call:
## lm(formula = fml, data = df, weights = w)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -1.55214 -0.15953 -0.01655 0.15536 0.78938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.164e-01 7.966e-02 8.994 < 2e-16 ***
## educ 5.279e-02 5.409e-03 9.760 < 2e-16 ***
## exper 3.039e-02 4.847e-03 6.271 7.57e-10 ***
## tenure 1.462e-02 3.142e-03 4.654 4.13e-06 ***
## female -2.479e-01 3.551e-02 -6.982 8.97e-12 ***
## married 7.861e-02 3.888e-02 2.022 0.0437 *
## expersq -6.044e-04 9.913e-05 -6.098 2.10e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2571 on 519 degrees of freedom
## Multiple R-squared: 0.3953, Adjusted R-squared: 0.3884
## F-statistic: 56.56 on 6 and 519 DF, p-value: < 2.2e-16
bptest(m_log)
##
## studentized Breusch-Pagan test
##
## data: m_log
## BP = 17.023, df = 6, p-value = 0.009197
bptest(m_wls)
##
## studentized Breusch-Pagan test
##
## data: m_wls
## BP = 23.328, df = 6, p-value = 0.0006939
plot(fitted(m_log), residuals(m_log),
main = "Residuos vs Ajustados (¨LOG)",
xlab = "Valores ajustados", ylab = "Residuos")
abline(h = 0, col = "red")
plot(fitted(m_wls), residuals(m_wls),
main = "Residuos vs Ajustados (WLS)",
xlab = "Valores ajustados", ylab = "Residuos ponderados")
abline(h = 0, col = "red")
#Informe ordenado (tabla de coeficientes)
tab_log <- broom::tidy(m_log, conf.int = TRUE)
tab_rob <- broom::tidy(coeftest(m_log, vcov = sandwich::vcovHC(m_log, type="HC1")))
list(
resumen_log = broom::glance(m_log),
coef_ci_log = tab_log,
coef_se_rob = tab_rob,
prueba_bp = bp,
reset_test = reset_lm,
vifs = vifs,
kappa = kappa_X
)
## $resumen_log
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.436 0.430 0.401 66.9 1.76e-61 6 -263. 542. 576.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
##
## $coef_ci_log
## # A tibble: 7 × 7
## term estimate std.error statistic p.value conf.low conf.high
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.416 0.0993 4.19 3.33e- 5 0.221 0.611
## 2 educ 0.0798 0.00683 11.7 3.37e-28 0.0664 0.0932
## 3 exper 0.0301 0.00519 5.80 1.18e- 8 0.0199 0.0403
## 4 tenure 0.0161 0.00288 5.58 3.86e- 8 0.0104 0.0217
## 5 female -0.291 0.0363 -8.02 6.88e-15 -0.362 -0.220
## 6 married 0.0564 0.0409 1.38 1.68e- 1 -0.0240 0.137
## 7 expersq -0.000601 0.000110 -5.47 6.95e- 8 -0.000817 -0.000385
##
## $coef_se_rob
## # A tibble: 7 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.416 0.107 3.90 1.08e- 4
## 2 educ 0.0798 0.00765 10.4 2.99e-23
## 3 exper 0.0301 0.00488 6.16 1.43e- 9
## 4 tenure 0.0161 0.00348 4.61 4.99e- 6
## 5 female -0.291 0.0369 -7.90 1.72e-14
## 6 married 0.0564 0.0417 1.35 1.76e- 1
## 7 expersq -0.000601 0.000102 -5.90 6.48e- 9
##
## $prueba_bp
##
## studentized Breusch-Pagan test
##
## data: m_log
## BP = 17.023, df = 6, p-value = 0.009197
##
##
## $reset_test
##
## RESET test
##
## data: m_log
## RESET = 7.0291, df1 = 2, df2 = 517, p-value = 0.0009729
##
##
## $vifs
## educ exper tenure female married expersq
## 1.164312 16.183797 1.410458 1.072362 1.302549 14.924497
##
## $kappa
## [1] 4635.054