knitr::opts _chunk$set(echo =VERDADERO)

Instalación de Paquetes y Carga de Librerías

req_pkgs <- c("wooldridge","tidyverse","car","lmtest","sandwich",
              "broom","MASS","ggpubr","moments","performance","knitr","modelsummary")
inst <- req_pkgs[!req_pkgs %in% installed.packages()[,"Package"]]
if(length(inst)) install.packages(inst, dependencies = TRUE)
invisible(lapply(req_pkgs, library, character.only = TRUE))
## Warning: package 'wooldridge' was built under R version 4.4.3
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.2
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.2
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'stringr' was built under R version 4.4.2
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'car' was built under R version 4.4.3
## Cargando paquete requerido: carData
## Warning: package 'carData' was built under R version 4.4.2
## 
## Adjuntando el paquete: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
## Warning: package 'lmtest' was built under R version 4.4.3
## Cargando paquete requerido: zoo
## Warning: package 'zoo' was built under R version 4.4.2
## 
## Adjuntando el paquete: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Warning: package 'sandwich' was built under R version 4.4.3
## Warning: package 'broom' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## The following object is masked from 'package:wooldridge':
## 
##     cement
## Warning: package 'ggpubr' was built under R version 4.4.3
## Warning: package 'performance' was built under R version 4.4.3
## Warning: package 'knitr' was built under R version 4.4.3
## Warning: package 'modelsummary' was built under R version 4.4.3
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
library(dplyr)

##Base de datos

data("wage1")
df <- wage1
y_var <- "wage"
x_vars <- c("educ", "exper", "tenure", "female", "married", "expersq")

##Cagar y preparar datos

df_selected <- df[, c(y_var, x_vars)]  # selecciona columnas
df_selected <- na.omit(df_selected)    # elimina NA
df_selected <- df_selected %>% mutate(across(everything(), as.numeric))

head(df_selected)
##   wage educ exper tenure female married expersq
## 1 3.10   11     2      0      1       0       4
## 2 3.24   12    22      2      1       1     484
## 3 3.00   11     2      0      0       0       4
## 4 6.00    8    44     28      0       1    1936
## 5 5.30   12     7      2      0       1      49
## 6 8.75   16     9      8      0       1      81

##Análisis exploratorio: media, mediana, sd, min, max, skewness, kurtosis

tabla_exploratoria <- df_selected %>%
  summarise(
    media = sapply(., mean, na.rm = TRUE),
    mediana = sapply(., median, na.rm = TRUE),
    sd = sapply(., sd, na.rm = TRUE),
    minimo = sapply(., min, na.rm = TRUE),
    maximo = sapply(., max, na.rm = TRUE),
    asimetria = sapply(., skewness, na.rm = TRUE),
    kurtosis = sapply(., kurtosis, na.rm = TRUE)
  ) %>%
  t() %>%                   
  as.data.frame()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
colnames(tabla_exploratoria) <- names(df_selected)
rownames(tabla_exploratoria) <- c("Media", "Mediana", "Desv.Est.", "Mínimo", "Máximo", "Asimetría", "Curtosis")

##Mostrar tabla en Markdown/Rpubs

kable(tabla_exploratoria, caption = "Estadísticos descriptivos de wage, educ, exper, tenure")
Estadísticos descriptivos de wage, educ, exper, tenure
wage educ exper tenure female married expersq
Media 5.896103 12.5627376 17.0171103 5.104563 0.4790875 0.6083650 473.435361
Mediana 4.650000 12.0000000 13.5000000 2.000000 0.0000000 1.0000000 182.500000
Desv.Est. 3.693086 2.7690224 13.5721596 7.224462 0.5000380 0.4885804 616.044772
Mínimo 0.530000 0.0000000 1.0000000 0.000000 0.0000000 0.0000000 1.000000
Máximo 24.980000 18.0000000 51.0000000 44.000000 1.0000000 1.0000000 2601.000000
Asimetría 2.007325 -0.6195741 0.7068652 2.110273 0.0837235 -0.4440136 1.492432
Curtosis 7.970083 4.8842450 2.3573176 7.658076 1.0070096 1.1971481 4.296473

##Fórmula y estimación OLS

fml <- as.formula(paste("log(", y_var, ") ~", paste(x_vars, collapse = " + ")))
m_log <- lm(fml, data = df)

##Resumen y R^2

summary(m_log)
## 
## Call:
## lm(formula = fml, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81854 -0.25682 -0.02526  0.24755  1.18148 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.4158424  0.0993372   4.186 3.33e-05 ***
## educ         0.0798322  0.0068273  11.693  < 2e-16 ***
## exper        0.0300995  0.0051931   5.796 1.18e-08 ***
## tenure       0.0160739  0.0028801   5.581 3.86e-08 ***
## female      -0.2911303  0.0362832  -8.024 6.88e-15 ***
## married      0.0564494  0.0409259   1.379    0.168    
## expersq     -0.0006012  0.0001099  -5.472 6.95e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4014 on 519 degrees of freedom
## Multiple R-squared:  0.4361, Adjusted R-squared:  0.4296 
## F-statistic: 66.91 on 6 and 519 DF,  p-value: < 2.2e-16

#Tabla del summary

modelsummary(m_log, statistic = "({std.error})", title = "Resultados del modelo OLS")
Resultados del modelo OLS
(1)
(Intercept) 0.416
(0.099)
educ 0.080
(0.007)
exper 0.030
(0.005)
tenure 0.016
(0.003)
female -0.291
(0.036)
married 0.056
(0.041)
expersq -0.001
(0.000)
Num.Obs. 526
R2 0.436
R2 Adj. 0.430
AIC 2249.2
BIC 2283.3
Log.Lik. -262.755
F 66.906
RMSE 0.40

#Linealidad (gràficos y pruebas)

car::crPlots(m_log)

##Residuos parciales con ggplot

hacer_grafico_pr <- function(modelo, x){
  aug <- broom::augment(modelo)
  beta_x <- coef(modelo)[x]
  if(is.na(beta_x)) stop("La variable no está en el modelo.")
  aug |>
    mutate(partial = .resid + !!sym(x)*beta_x) |>
    ggplot(aes_string(x = x, y = "partial")) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "loess", se = FALSE) +
    labs(title = paste("Gráfico de residuos parciales:", x),
         y = paste("e +", round(beta_x,4), "*", x))
}
print(hacer_grafico_pr(m_log, x_vars[1]))
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

RESET test

reset_lm <- lmtest::resettest(m_log, power = 2:3, type = "fitted")
reset_lm
## 
##  RESET test
## 
## data:  m_log
## RESET = 7.0291, df1 = 2, df2 = 517, p-value = 0.0009729

#Homoedasticidad

bp <- lmtest::bptest(m_log)
bp
## 
##  studentized Breusch-Pagan test
## 
## data:  m_log
## BP = 17.023, df = 6, p-value = 0.009197
X <- model.matrix(m_log)
white <- lmtest::bptest(m_log, ~ fitted(m_log) + I(fitted(m_log)^2))
white
## 
##  studentized Breusch-Pagan test
## 
## data:  m_log
## BP = 5.1463, df = 2, p-value = 0.0763

Gráfico residuos vs ajustados

broom::augment(m_log) |>
  ggplot(aes(.fitted, .resid)) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess", se = FALSE) +
  labs(title = "Residuos vs Ajustados", x = "Ajustados", y = "Residuos")
## `geom_smooth()` using formula = 'y ~ x'

#Normalidad en los residuos

if(nrow(df) <= 5000){
  print(shapiro.test(residuals(m_log)))
}
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(m_log)
## W = 0.98685, p-value = 0.0001111
jb <- tseries::jarque.bera.test(residuals(m_log))
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
jb
## 
##  Jarque Bera Test
## 
## data:  residuals(m_log)
## X-squared = 26.035, df = 2, p-value = 2.221e-06
ggpubr::ggqqplot(residuals(m_log)) + ggtitle("QQ-plot de residuos")

#Multicolinealidad

vifs <- car::vif(m_log)
vifs
##      educ     exper    tenure    female   married   expersq 
##  1.164312 16.183797  1.410458  1.072362  1.302549 14.924497
kappa_X <- kappa(model.matrix(m_log))
kappa_X
## [1] 4635.054

#Soluciones econométricas si los supuestos fallan # SE robustos a heterocedasticidad

coeftest(m_log, vcov = sandwich::vcovHC(m_log, type = "HC1"))
## 
## t test of coefficients:
## 
##                Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)  0.41584244  0.10658752  3.9014 0.0001082 ***
## educ         0.07983221  0.00765428 10.4298 < 2.2e-16 ***
## exper        0.03009952  0.00488331  6.1638 1.428e-09 ***
## tenure       0.01607388  0.00348398  4.6137 4.991e-06 ***
## female      -0.29113027  0.03686784 -7.8966 1.720e-14 ***
## married      0.05644940  0.04170194  1.3536 0.1764406    
## expersq     -0.00060115  0.00010185 -5.9023 6.478e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

WLS

w <- 1 / (fitted(m_log)^2 + 1e-8)
m_wls <- lm(fml, data = df, weights = w)
summary(m_wls)
## 
## Call:
## lm(formula = fml, data = df, weights = w)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.55214 -0.15953 -0.01655  0.15536  0.78938 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.164e-01  7.966e-02   8.994  < 2e-16 ***
## educ         5.279e-02  5.409e-03   9.760  < 2e-16 ***
## exper        3.039e-02  4.847e-03   6.271 7.57e-10 ***
## tenure       1.462e-02  3.142e-03   4.654 4.13e-06 ***
## female      -2.479e-01  3.551e-02  -6.982 8.97e-12 ***
## married      7.861e-02  3.888e-02   2.022   0.0437 *  
## expersq     -6.044e-04  9.913e-05  -6.098 2.10e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2571 on 519 degrees of freedom
## Multiple R-squared:  0.3953, Adjusted R-squared:  0.3884 
## F-statistic: 56.56 on 6 and 519 DF,  p-value: < 2.2e-16
bptest(m_log)
## 
##  studentized Breusch-Pagan test
## 
## data:  m_log
## BP = 17.023, df = 6, p-value = 0.009197
bptest(m_wls)
## 
##  studentized Breusch-Pagan test
## 
## data:  m_wls
## BP = 23.328, df = 6, p-value = 0.0006939
plot(fitted(m_log), residuals(m_log),
     main = "Residuos vs Ajustados (¨LOG)",
     xlab = "Valores ajustados", ylab = "Residuos")
abline(h = 0, col = "red")

plot(fitted(m_wls), residuals(m_wls),
     main = "Residuos vs Ajustados (WLS)",
     xlab = "Valores ajustados", ylab = "Residuos ponderados")
abline(h = 0, col = "red")

#Informe ordenado (tabla de coeficientes)

tab_log <- broom::tidy(m_log, conf.int = TRUE)
tab_rob <- broom::tidy(coeftest(m_log, vcov = sandwich::vcovHC(m_log, type="HC1")))
list(
  resumen_log = broom::glance(m_log),
  coef_ci_log = tab_log,
  coef_se_rob = tab_rob,
  prueba_bp = bp,
  reset_test = reset_lm,
  vifs = vifs,
  kappa = kappa_X
)
## $resumen_log
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.436         0.430 0.401      66.9 1.76e-61     6  -263.  542.  576.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
## 
## $coef_ci_log
## # A tibble: 7 × 7
##   term         estimate std.error statistic  p.value  conf.low conf.high
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>     <dbl>     <dbl>
## 1 (Intercept)  0.416     0.0993        4.19 3.33e- 5  0.221     0.611   
## 2 educ         0.0798    0.00683      11.7  3.37e-28  0.0664    0.0932  
## 3 exper        0.0301    0.00519       5.80 1.18e- 8  0.0199    0.0403  
## 4 tenure       0.0161    0.00288       5.58 3.86e- 8  0.0104    0.0217  
## 5 female      -0.291     0.0363       -8.02 6.88e-15 -0.362    -0.220   
## 6 married      0.0564    0.0409        1.38 1.68e- 1 -0.0240    0.137   
## 7 expersq     -0.000601  0.000110     -5.47 6.95e- 8 -0.000817 -0.000385
## 
## $coef_se_rob
## # A tibble: 7 × 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)  0.416     0.107         3.90 1.08e- 4
## 2 educ         0.0798    0.00765      10.4  2.99e-23
## 3 exper        0.0301    0.00488       6.16 1.43e- 9
## 4 tenure       0.0161    0.00348       4.61 4.99e- 6
## 5 female      -0.291     0.0369       -7.90 1.72e-14
## 6 married      0.0564    0.0417        1.35 1.76e- 1
## 7 expersq     -0.000601  0.000102     -5.90 6.48e- 9
## 
## $prueba_bp
## 
##  studentized Breusch-Pagan test
## 
## data:  m_log
## BP = 17.023, df = 6, p-value = 0.009197
## 
## 
## $reset_test
## 
##  RESET test
## 
## data:  m_log
## RESET = 7.0291, df1 = 2, df2 = 517, p-value = 0.0009729
## 
## 
## $vifs
##      educ     exper    tenure    female   married   expersq 
##  1.164312 16.183797  1.410458  1.072362  1.302549 14.924497 
## 
## $kappa
## [1] 4635.054