Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(lmtest)     # bptest, dwtest, resettest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)   # vcovHC, NeweyWest
library(car)      # vif
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(nlme)       # gls
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse

Reading the Data

database <- read.csv(
  "database.csv",
  header = TRUE,
  na.strings = c("NA", "", " ")
)

==========================================================

PART 1

TOTAL EMPLOYMENT IN MEXICO (1980–2020)

==========================================================

# Prepare the dataset
database_clean <- database %>%
  mutate(
    year = as.integer(year),
    employment_total = as.numeric(employment_total)
  ) %>%
  filter(!is.na(year), !is.na(employment_total)) %>%
  arrange(year)

1) Evolution in levels

# PLOT: Total employment (levels)
g_empleo_niveles <- ggplot(database_clean, aes(x = year, y = employment_total)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    title = "México: Evolución del empleo total (1980–2020)",
    x = "Año",
    y = "Empleo total (miles de personas)"
  ) +
  scale_x_continuous(breaks = pretty_breaks(n = 8)) +
  scale_y_continuous(labels = label_comma()) +
  theme_minimal(base_size = 12)

print(g_empleo_niveles)

2) Evolution in percentage terms (annual growth, %)

# Computation: annual growth (%)
database_clean <- database_clean %>%
  mutate(emp_growth_pct = (employment_total / lag(employment_total) - 1) * 100)

# PLOT: Annual percentage change
g_empleo_pct <- ggplot(database_clean, aes(x = year, y = emp_growth_pct)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_line(linewidth = 1, na.rm = TRUE) +
  geom_point(size = 2, na.rm = TRUE) +
  labs(
    title = "México: Variación porcentual anual del empleo total (1980–2020)",
    x = "Año",
    y = "Crecimiento anual del empleo (%)"
  ) +
  scale_x_continuous(breaks = pretty_breaks(n = 8)) +
  theme_minimal(base_size = 12)

print(g_empleo_pct)

# Numerical summary (for interpretation)
resumen_empleo <- database_clean %>%
  summarise(
    anio_inicio = min(year),
    anio_fin = max(year),
    empleo_inicio = first(employment_total),
    empleo_fin = last(employment_total),
    cambio_absoluto = empleo_fin - empleo_inicio,
    crecimiento_total_pct = (empleo_fin / empleo_inicio - 1) * 100,
    crecimiento_promedio_anual_pct = mean(emp_growth_pct, na.rm = TRUE)
  )

print(resumen_empleo)
##   anio_inicio anio_fin empleo_inicio empleo_fin cambio_absoluto
## 1        1980     2020      21188.27   50927.07         29738.8
##   crecimiento_total_pct crecimiento_promedio_anual_pct
## 1               140.355                        2.23603
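
The average annual growth reported above is an arithmetic mean of the yearly rates. As a complementary check (a sketch, not part of the original summary), the compound annual growth rate (CAGR) over 1980–2020 can be computed directly from resumen_empleo:

# Sketch: compound annual growth rate (CAGR), 1980-2020
cagr_pct <- with(
  resumen_empleo,
  ((empleo_fin / empleo_inicio)^(1 / (anio_fin - anio_inicio)) - 1) * 100
)
cagr_pct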

==========================================================

PART 2. SIMPLE LINEAR REGRESSION

Y = employment_total

X = escolaridad

==========================================================

# Prepare data
data_simple <- database %>%
  mutate(
    year = as.integer(year),
    unemployment_rate = as.numeric(unemployment_rate),
    employment_total = as.numeric(employment_total),
    escolaridad = as.numeric(escolaridad)
  ) %>%
  filter(!is.na(year), !is.na(escolaridad)) %>%
  arrange(year)

Model

\[ employment\_total_t = \beta_0 + \beta_1 \, escolaridad_t + u_t \]

# Dependent (Y) and explanatory (X) variables
Y_var <- "employment_total"
X_var <- "escolaridad"

data_model <- data_simple %>%
  filter(!is.na(.data[[Y_var]]))

Ordinary Least Squares (OLS / MCO)

form_mco <- as.formula(paste(Y_var, "~", X_var))
mco <- lm(form_mco, data = data_model)
summary(mco)
## 
## Call:
## lm(formula = form_mco, data = data_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2117.0  -784.0  -264.9   851.3  2371.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11720.7      913.2  -12.84  1.4e-15 ***
## escolaridad   6676.8      120.4   55.44  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1094 on 39 degrees of freedom
## Multiple R-squared:  0.9875, Adjusted R-squared:  0.9871 
## F-statistic:  3074 on 1 and 39 DF,  p-value: < 2.2e-16
# Heteroskedasticity-robust standard errors (HC1)
coeftest(mco, vcov = vcovHC(mco, type = "HC1"))
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -11720.72    1183.27 -9.9053 3.348e-12 ***
## escolaridad   6676.79     156.97 42.5363 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Basic diagnostics
# - Heteroskedasticity: Breusch-Pagan
# - Autocorrelation: Durbin-Watson
# - Specification: Ramsey RESET
bp <- bptest(mco)                 # H0: homoskedasticity
dw <- dwtest(mco)                 # H0: no autocorrelation (approx. AR(1))
reset <- resettest(mco, power = 2:3, type = "fitted")  # H0: correct specification

bp; dw; reset
## 
##  studentized Breusch-Pagan test
## 
## data:  mco
## BP = 0.0047301, df = 1, p-value = 0.9452
## 
##  Durbin-Watson test
## 
## data:  mco
## DW = 0.63656, p-value = 5.89e-08
## alternative hypothesis: true autocorrelation is greater than 0
## 
##  RESET test
## 
## data:  mco
## RESET = 13.359, df1 = 2, df2 = 37, p-value = 4.295e-05
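
The Durbin-Watson statistic only targets first-order autocorrelation. As an optional complement (a sketch, not in the original script), lmtest's Breusch-Godfrey test can check higher-order serial correlation:

# Sketch: Breusch-Godfrey test for serial correlation up to order 2
# (H0: no serial correlation up to the chosen order)
bgtest(mco, order = 2)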
# If there is autocorrelation: Newey-West (HAC) standard errors
coeftest(mco, vcov = NeweyWest(mco, lag = 1, prewhite = FALSE))
## 
## t test of coefficients:
## 
##              Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -11720.72    1422.13 -8.2417 4.518e-10 ***
## escolaridad   6676.79     184.16 36.2558 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
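
As a robustness check on the fixed lag = 1 choice (a sketch, not part of the original script), sandwich can also select the Newey-West bandwidth automatically:

# Sketch: Newey-West HAC errors with automatic bandwidth selection
coeftest(mco, vcov = NeweyWest(mco))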

Generalized Least Squares (GLS / MCG)

mcg_ar1 <- gls(
  form_mco,
  data = data_model,
  correlation = corAR1(form = ~ year),
  method = "ML"
)

summary(mcg_ar1)
## Generalized least squares fit by maximum likelihood
##   Model: form_mco 
##   Data: data_model 
##        AIC      BIC    logLik
##   673.0246 679.8789 -332.5123
## 
## Correlation Structure: AR(1)
##  Formula: ~year 
##  Parameter estimate(s):
##       Phi 
## 0.7276341 
## 
## Coefficients:
##                 Value Std.Error   t-value p-value
## (Intercept) -9486.610 2043.0314 -4.643399       0
## escolaridad  6376.076  269.0495 23.698520       0
## 
##  Correlation: 
##             (Intr)
## escolaridad -0.976
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -1.6361523 -0.7304805 -0.1836343  0.8487153  2.5950736 
## 
## Residual standard error: 1163.109 
## Degrees of freedom: 41 total; 39 residual
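
To see how the AR(1) correction shifts the estimates (a brief sketch, not in the original script), the OLS and GLS point estimates can be placed side by side:

# Sketch: compare OLS (MCO) and GLS AR(1) point estimates
cbind(MCO = coef(mco), MCG_AR1 = coef(mcg_ar1))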

Prediction for 2022

# a) Proxy: use escolaridad from the last available year
escolaridad_last <- data_model %>%
  filter(year == max(year)) %>%
  summarise(val = first(escolaridad)) %>%
  pull(val)

new_2022 <- data.frame(
  year = 2022,
  escolaridad = escolaridad_last
)
# Prediction with OLS (MCO)
pred_mco_2022 <- predict(mco, newdata = new_2022, interval = "prediction", level = 0.95)
pred_mco_2022
##        fit     lwr      upr
## 1 53044.11 50737.9 55350.32
# Prediction with GLS (MCG)
pred_mcg_2022 <- predict(mcg_ar1, newdata = new_2022)
pred_mcg_2022
## [1] 52361.33
## attr(,"label")
## [1] "Predicted values"

==========================================================

PART 3. MULTIPLE LINEAR REGRESSION (R)

==========================================================

# Oxford Economics 2022 value for total employment (thousands of persons):
oxford_2022 <- 57322.4
Y_var <- "employment_total"   

data_all <- database %>%
  mutate(
    year = as.integer(year),
    employment_total = as.numeric(employment_total),
    unemployment_rate = as.numeric(unemployment_rate),
    escolaridad = as.numeric(escolaridad),
    population_growth = as.numeric(population_growth),
    gdp_real = as.numeric(gdp_real),
    investment = as.numeric(investment),
    inflation = as.numeric(inflation),
    disposable_income = as.numeric(disposable_income),
    consumer_spending = as.numeric(consumer_spending),
    population_working_age = as.numeric(population_working_age)
  ) %>%
  arrange(year)

1) Multiple regression: Y ~ escolaridad + population_growth

Model:

\[ Employment\_total_t = \beta_0 + \beta_1\,Escolaridad_t + \beta_2\,Population\_growth_t + u_t \]

data_m1 <- data_all %>%
  filter(!is.na(.data[[Y_var]]),
         !is.na(escolaridad),
         !is.na(population_growth))

m1 <- lm(as.formula(paste(Y_var, "~ escolaridad + population_growth")), data = data_m1)

# OLS (MCO) results
summary(m1)
## 
## Call:
## lm(formula = as.formula(paste(Y_var, "~ escolaridad + population_growth")), 
##     data = data_m1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2490.7  -719.4  -162.8   743.6  2153.0 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -6805.6     7168.8  -0.949    0.349    
## escolaridad         6336.0      553.8  11.441 1.03e-13 ***
## population_growth  -1507.2     1906.5  -0.791    0.434    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1046 on 37 degrees of freedom
## Multiple R-squared:  0.9882, Adjusted R-squared:  0.9876 
## F-statistic:  1551 on 2 and 37 DF,  p-value: < 2.2e-16
# Robust standard errors (HC1)
coeftest(m1, vcov = vcovHC(m1, type = "HC1"))
## 
## t test of coefficients:
## 
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6805.57    6869.92 -0.9906   0.3283    
## escolaridad        6336.01     496.97 12.7492 4.15e-15 ***
## population_growth -1507.22    1950.56 -0.7727   0.4446    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Diagnostics
bptest(m1)                       # heteroskedasticity
## 
##  studentized Breusch-Pagan test
## 
## data:  m1
## BP = 5.9252, df = 2, p-value = 0.05168
dwtest(m1)                       # autocorrelation
## 
##  Durbin-Watson test
## 
## data:  m1
## DW = 0.77185, p-value = 1.123e-06
## alternative hypothesis: true autocorrelation is greater than 0
resettest(m1, power = 2:3, type = "fitted")  # specification
## 
##  RESET test
## 
## data:  m1
## RESET = 8.7362, df1 = 2, df2 = 35, p-value = 0.0008364
# (Optional) Newey-West HAC standard errors
coeftest(m1, vcov = NeweyWest(m1, lag = 1, prewhite = FALSE))
## 
## t test of coefficients:
## 
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6805.57    7482.24 -0.9096   0.3689    
## escolaridad        6336.01     557.89 11.3570 1.28e-13 ***
## population_growth -1507.22    2058.25 -0.7323   0.4686    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (Optional) GLS AR(1) if DW suggests autocorrelation
m1_gls <- gls(
  as.formula(paste(Y_var, "~ escolaridad + population_growth")),
  data = data_m1,
  correlation = corAR1(form = ~ year),
  method = "ML"
)
summary(m1_gls)
## Generalized least squares fit by maximum likelihood
##   Model: as.formula(paste(Y_var, "~ escolaridad + population_growth")) 
##   Data: data_m1 
##        AIC      BIC    logLik
##   656.3664 664.8108 -323.1832
## 
## Correlation Structure: AR(1)
##  Formula: ~year 
##  Parameter estimate(s):
##       Phi 
## 0.7821947 
## 
## Coefficients:
##                        Value Std.Error   t-value p-value
## (Intercept)       -29888.000 11833.624 -2.525685  0.0160
## escolaridad         7948.610   954.229  8.329877  0.0000
## population_growth   5377.599  3042.792  1.767324  0.0854
## 
##  Correlation: 
##                   (Intr) esclrd
## escolaridad       -0.989       
## population_growth -0.978  0.940
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -1.7399918 -0.6272118 -0.2909866  0.5652343  2.6202153 
## 
## Residual standard error: 1238.885 
## Degrees of freedom: 40 total; 37 residual

2) Multicollinearity (VIF) and what to do if present

vif_vals <- vif(m1)
vif_vals
##       escolaridad population_growth 
##          20.79223          20.79223
# Rule of thumb:
# * VIF between 1 and 5: no serious problem
# * VIF between 5 and 10: moderate multicollinearity
# * VIF above 10: severe multicollinearity

If VIF is high, the usual options are:
- Drop or combine highly correlated variables
- Use transformations (e.g., growth rates or logs); see the sketch below
- Keep them but interpret with caution (if the goal is prediction)
- Increase the sample size (not feasible here)
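
As an illustration of the transformation option above (a sketch only; d_escolaridad is a hypothetical differenced regressor, not part of the original analysis):

# Sketch: the identical VIFs come from the single pairwise correlation
cor(data_m1$escolaridad, data_m1$population_growth)

# Sketch: replace the level of escolaridad with its annual change and re-check VIF
data_m1_alt <- data_m1 %>%
  arrange(year) %>%
  mutate(d_escolaridad = escolaridad - lag(escolaridad)) %>%
  filter(!is.na(d_escolaridad))

m1_alt <- lm(employment_total ~ d_escolaridad + population_growth, data = data_m1_alt)
vif(m1_alt)   # compare with the VIF values above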

3) Alternative multiple regression:

Model:

\[ Employment\_total_t = \beta_0 + \beta_1 Population\_working\_age_t + \beta_2 Unemployment\_rate_t + u_t \]

X2a <- "population_working_age"
X2b <- "unemployment_rate"

data_m2 <- data_all %>%
  filter(!is.na(.data[[Y_var]]),
         !is.na(.data[[X2a]]),
         !is.na(.data[[X2b]]))

m2 <- lm(as.formula(paste(Y_var, "~", X2a, "+", X2b)), data = data_m2)

summary(m2)
## 
## Call:
## lm(formula = as.formula(paste(Y_var, "~", X2a, "+", X2b)), data = data_m2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2401.3  -417.3  -251.1   138.0  1999.6 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.747e+03  7.057e+02   2.475  0.01789 *  
## population_working_age  6.394e-01  9.028e-03  70.827  < 2e-16 ***
## unemployment_rate      -5.257e+02  1.520e+02  -3.458  0.00136 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 844.1 on 38 degrees of freedom
## Multiple R-squared:  0.9927, Adjusted R-squared:  0.9924 
## F-statistic:  2596 on 2 and 38 DF,  p-value: < 2.2e-16
coeftest(m2, vcov = vcovHC(m2, type = "HC1"))
## 
## t test of coefficients:
## 
##                           Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)             1.7467e+03  6.8572e+02  2.5472  0.015029 *  
## population_working_age  6.3943e-01  8.9339e-03 71.5743 < 2.2e-16 ***
## unemployment_rate      -5.2567e+02  1.6618e+02 -3.1633  0.003065 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Diagnostics
bptest(m2)
## 
##  studentized Breusch-Pagan test
## 
## data:  m2
## BP = 0.92732, df = 2, p-value = 0.629
dwtest(m2)
## 
##  Durbin-Watson test
## 
## data:  m2
## DW = 0.77724, p-value = 1.168e-06
## alternative hypothesis: true autocorrelation is greater than 0
resettest(m2, power = 2:3, type = "fitted")
## 
##  RESET test
## 
## data:  m2
## RESET = 3.6184, df1 = 2, df2 = 36, p-value = 0.03699
coeftest(m2, vcov = NeweyWest(m2, lag = 1, prewhite = FALSE))
## 
## t test of coefficients:
## 
##                           Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)             1.7467e+03  7.2540e+02  2.4079  0.021006 *  
## population_working_age  6.3943e-01  8.4067e-03 76.0622 < 2.2e-16 ***
## unemployment_rate      -5.2567e+02  1.9173e+02 -2.7417  0.009268 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# GLS AR(1)
m2_gls <- gls(
  as.formula(paste(Y_var, "~", X2a, "+", X2b)),
  data = data_m2,
  correlation = corAR1(form = ~ year),
  method = "ML"
)
summary(m2_gls)
## Generalized least squares fit by maximum likelihood
##   Model: as.formula(paste(Y_var, "~", X2a, "+", X2b)) 
##   Data: data_m2 
##        AIC      BIC    logLik
##   659.7513 668.3191 -324.8756
## 
## Correlation Structure: AR(1)
##  Formula: ~year 
##  Parameter estimate(s):
##       Phi 
## 0.6351418 
## 
## Coefficients:
##                            Value Std.Error  t-value p-value
## (Intercept)            2355.4942 1224.5737  1.92352  0.0619
## population_working_age    0.6322    0.0176 35.90209  0.0000
## unemployment_rate      -600.0197  181.6849 -3.30253  0.0021
## 
##  Correlation: 
##                        (Intr) pplt__
## population_working_age -0.791       
## unemployment_rate      -0.466 -0.120
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -2.4053565 -0.4739263 -0.1383676  0.4632173  2.3846429 
## 
## Residual standard error: 859.9006 
## Degrees of freedom: 41 total; 38 residual

4) Prediction for 2022 and comparison with Oxford Economics

# Oxford Economics value (total employment, thousands of persons)
oxford_2022 <- 57322.4

# Build 2022 newdata using the last observed values (2020 proxy)
make_new_2022 <- function(df, vars, year_col = "year") {
  last_year <- max(df[[year_col]], na.rm = TRUE)
  last_vals <- df %>%
    dplyr::filter(.data[[year_col]] == last_year) %>%
    dplyr::summarise(dplyr::across(dplyr::all_of(vars), ~ dplyr::first(.x))) %>%
    as.data.frame()
  last_vals[[year_col]] <- 2022
  last_vals
}

# 2022 newdata for model 2
new_2022_m2 <- make_new_2022(
  data_m2,
  vars = c("population_working_age", "unemployment_rate")
)

# OLS (MCO) prediction (with 95% prediction interval)
pred_m2_mco_2022 <- predict(
  m2,
  newdata = new_2022_m2,
  interval = "prediction",
  level = 0.95
)

# GLS AR(1) prediction (point forecast)
pred_m2_gls_2022 <- as.numeric(predict(m2_gls, newdata = new_2022_m2))

# Show results
pred_m2_mco_2022
##        fit      lwr      upr
## 1 53328.41 51541.23 55115.59
pred_m2_gls_2022
## [1] 52995.44

Comparison with Oxford Economics

comp_m2 <- data.frame(
  modelo = c("M2_MCO", "M2_GLS_AR1", "Oxford"),
  pred_2022 = c(
    as.numeric(pred_m2_mco_2022[1, "fit"]),
    pred_m2_gls_2022,
    oxford_2022
  )
) %>%
  mutate(
    diferencia_vs_oxford = pred_2022 - oxford_2022,
    diferencia_pct_vs_oxford = (pred_2022 / oxford_2022 - 1) * 100
  )

print(comp_m2)
##       modelo pred_2022 diferencia_vs_oxford diferencia_pct_vs_oxford
## 1     M2_MCO  53328.41            -3993.989                -6.967588
## 2 M2_GLS_AR1  52995.44            -4326.962                -7.548467
## 3     Oxford  57322.40                0.000                 0.000000
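
For completeness (a sketch, not part of the original script), the Part 2 simple-regression predictions can be appended to the same comparison table:

# Sketch: add the Part 2 simple-regression predictions to the comparison
comp_simple <- data.frame(
  modelo = c("Simple_MCO", "Simple_MCG_AR1"),
  pred_2022 = c(as.numeric(pred_mco_2022[1, "fit"]), as.numeric(pred_mcg_2022))
) %>%
  mutate(
    diferencia_vs_oxford = pred_2022 - oxford_2022,
    diferencia_pct_vs_oxford = (pred_2022 / oxford_2022 - 1) * 100
  )

print(bind_rows(comp_simple, comp_m2))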