library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
library(lmtest) # bptest, dwtest, resettest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich) # vcovHC, NeweyWest
library(car) # vif
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(nlme) # gls
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
database <- read.csv(
"database.csv",
header = TRUE,
na.strings = c("NA", "", " ")
)
# Preparar base
database_clean <- database %>%
mutate(
year = as.integer(year),
employment_total = as.numeric(employment_total)
) %>%
filter(!is.na(year), !is.na(employment_total)) %>%
arrange(year)
# GRÁFICA: Empleo total (niveles)
g_empleo_niveles <- ggplot(database_clean, aes(x = year, y = employment_total)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
labs(
title = "México: Evolución del empleo total (1980–2020)",
x = "Año",
y = "Empleo total (miles de personas)"
) +
scale_x_continuous(breaks = pretty_breaks(n = 8)) +
scale_y_continuous(labels = label_comma()) +
theme_minimal(base_size = 12)
print(g_empleo_niveles)
# Cálculo: crecimiento anual (%)
database_clean <- database_clean %>%
mutate(emp_growth_pct = (employment_total / lag(employment_total) - 1) * 100)
# GRÁFICA: Variación porcentual anual
g_empleo_pct <- ggplot(database_clean, aes(x = year, y = emp_growth_pct)) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_line(linewidth = 1, na.rm = TRUE) +
geom_point(size = 2, na.rm = TRUE) +
labs(
title = "México: Variación porcentual anual del empleo total (1980–2020)",
x = "Año",
y = "Crecimiento anual del empleo (%)"
) +
scale_x_continuous(breaks = pretty_breaks(n = 8)) +
theme_minimal(base_size = 12)
print(g_empleo_pct)
# Resumen numérico (para interpretación)
resumen_empleo <- database_clean %>%
summarise(
anio_inicio = min(year),
anio_fin = max(year),
empleo_inicio = first(employment_total),
empleo_fin = last(employment_total),
cambio_absoluto = empleo_fin - empleo_inicio,
crecimiento_total_pct = (empleo_fin / empleo_inicio - 1) * 100,
crecimiento_promedio_anual_pct = mean(emp_growth_pct, na.rm = TRUE)
)
print(resumen_empleo)
## anio_inicio anio_fin empleo_inicio empleo_fin cambio_absoluto
## 1 1980 2020 21188.27 50927.07 29738.8
## crecimiento_total_pct crecimiento_promedio_anual_pct
## 1 140.355 2.23603
# Preparar datos
data_simple <- database %>%
mutate(
year = as.integer(year),
unemployment_rate = as.numeric(unemployment_rate),
employment_total = as.numeric(employment_total),
escolaridad = as.numeric(escolaridad)
) %>%
filter(!is.na(year), !is.na(escolaridad)) %>%
arrange(year)
\[ employment\_total_t = \beta_0 + \beta_1 \, escolaridad_t + u_t \]
# employment_total
Y_var <- "employment_total"
X_var <- "escolaridad"
data_model <- data_simple %>%
filter(!is.na(.data[[Y_var]]))
form_mco <- as.formula(paste(Y_var, "~", X_var))
mco <- lm(form_mco, data = data_model)
summary(mco)
##
## Call:
## lm(formula = form_mco, data = data_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2117.0 -784.0 -264.9 851.3 2371.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11720.7 913.2 -12.84 1.4e-15 ***
## escolaridad 6676.8 120.4 55.44 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1094 on 39 degrees of freedom
## Multiple R-squared: 0.9875, Adjusted R-squared: 0.9871
## F-statistic: 3074 on 1 and 39 DF, p-value: < 2.2e-16
# Errores robustos a heterocedasticidad (HC1)
coeftest(mco, vcov = vcovHC(mco, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11720.72 1183.27 -9.9053 3.348e-12 ***
## escolaridad 6676.79 156.97 42.5363 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Diagnósticos básicos
# - Heterocedasticidad: Breusch-Pagan
# - Autocorrelación: Durbin-Watson
# - Especificación: Ramsey RESET
bp <- bptest(mco) # H0: homocedasticidad
dw <- dwtest(mco) # H0: no autocorrelación (AR(1) aprox)
reset <- resettest(mco, power = 2:3, type = "fitted") # H0: bien especificado
bp; dw; reset
##
## studentized Breusch-Pagan test
##
## data: mco
## BP = 0.0047301, df = 1, p-value = 0.9452
##
## Durbin-Watson test
##
## data: mco
## DW = 0.63656, p-value = 5.89e-08
## alternative hypothesis: true autocorrelation is greater than 0
##
## RESET test
##
## data: mco
## RESET = 13.359, df1 = 2, df2 = 37, p-value = 4.295e-05
# Si hay autocorrelación: errores Newey-West (HAC)
coeftest(mco, vcov = NeweyWest(mco, lag = 1, prewhite = FALSE))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11720.72 1422.13 -8.2417 4.518e-10 ***
## escolaridad 6676.79 184.16 36.2558 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mcg_ar1 <- gls(
form_mco,
data = data_model,
correlation = corAR1(form = ~ year),
method = "ML"
)
summary(mcg_ar1)
## Generalized least squares fit by maximum likelihood
## Model: form_mco
## Data: data_model
## AIC BIC logLik
## 673.0246 679.8789 -332.5123
##
## Correlation Structure: AR(1)
## Formula: ~year
## Parameter estimate(s):
## Phi
## 0.7276341
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -9486.610 2043.0314 -4.643399 0
## escolaridad 6376.076 269.0495 23.698520 0
##
## Correlation:
## (Intr)
## escolaridad -0.976
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.6361523 -0.7304805 -0.1836343 0.8487153 2.5950736
##
## Residual standard error: 1163.109
## Degrees of freedom: 41 total; 39 residual
# a) Proxy: usar escolaridad del último año disponible
escolaridad_last <- data_model %>%
filter(year == max(year)) %>%
summarise(val = first(escolaridad)) %>%
pull(val)
new_2022 <- data.frame(
year = 2022,
escolaridad = escolaridad_last
)
# Predicción con MCO
pred_mco_2022 <- predict(mco, newdata = new_2022, interval = "prediction", level = 0.95)
pred_mco_2022
## fit lwr upr
## 1 53044.11 50737.9 55350.32
# Predicción con MCG
pred_mcg_2022 <- predict(mcg_ar1, newdata = new_2022)
pred_mcg_2022
## [1] 52361.33
## attr(,"label")
## [1] "Predicted values"
# Oxford 2022 para empleo total (miles de personas):
oxford_2022 <- 57322.4
Y_var <- "employment_total"
data_all <- database %>%
mutate(
year = as.integer(year),
employment_total = as.numeric(employment_total),
unemployment_rate = as.numeric(unemployment_rate),
escolaridad = as.numeric(escolaridad),
population_growth = as.numeric(population_growth),
gdp_real = as.numeric(gdp_real),
investment = as.numeric(investment),
inflation = as.numeric(inflation),
disposable_income = as.numeric(disposable_income),
consumer_spending = as.numeric(consumer_spending),
population_working_age = as.numeric(population_working_age)
) %>%
arrange(year)
\[ Employment\_total_t = \beta_0 + \beta_1\,Escolaridad_t + \beta_2\,Population\_growth_t + u_t \]
data_m1 <- data_all %>%
filter(!is.na(.data[[Y_var]]),
!is.na(escolaridad),
!is.na(population_growth))
m1 <- lm(as.formula(paste(Y_var, "~ escolaridad + population_growth")), data = data_m1)
# Resultados MCO
summary(m1)
##
## Call:
## lm(formula = as.formula(paste(Y_var, "~ escolaridad + population_growth")),
## data = data_m1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2490.7 -719.4 -162.8 743.6 2153.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6805.6 7168.8 -0.949 0.349
## escolaridad 6336.0 553.8 11.441 1.03e-13 ***
## population_growth -1507.2 1906.5 -0.791 0.434
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1046 on 37 degrees of freedom
## Multiple R-squared: 0.9882, Adjusted R-squared: 0.9876
## F-statistic: 1551 on 2 and 37 DF, p-value: < 2.2e-16
# Errores robustos (HC1)
coeftest(m1, vcov = vcovHC(m1, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6805.57 6869.92 -0.9906 0.3283
## escolaridad 6336.01 496.97 12.7492 4.15e-15 ***
## population_growth -1507.22 1950.56 -0.7727 0.4446
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Diagnósticos
bptest(m1) # heterocedasticidad
##
## studentized Breusch-Pagan test
##
## data: m1
## BP = 5.9252, df = 2, p-value = 0.05168
dwtest(m1) # autocorrelación
##
## Durbin-Watson test
##
## data: m1
## DW = 0.77185, p-value = 1.123e-06
## alternative hypothesis: true autocorrelation is greater than 0
resettest(m1, power = 2:3, type = "fitted") # especificación
##
## RESET test
##
## data: m1
## RESET = 8.7362, df1 = 2, df2 = 35, p-value = 0.0008364
# (Opcional) Errores HAC Newey-West
coeftest(m1, vcov = NeweyWest(m1, lag = 1, prewhite = FALSE))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6805.57 7482.24 -0.9096 0.3689
## escolaridad 6336.01 557.89 11.3570 1.28e-13 ***
## population_growth -1507.22 2058.25 -0.7323 0.4686
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (Opcional) GLS AR(1) si DW sugiere autocorrelación
m1_gls <- gls(
as.formula(paste(Y_var, "~ escolaridad + population_growth")),
data = data_m1,
correlation = corAR1(form = ~ year),
method = "ML"
)
summary(m1_gls)
## Generalized least squares fit by maximum likelihood
## Model: as.formula(paste(Y_var, "~ escolaridad + population_growth"))
## Data: data_m1
## AIC BIC logLik
## 656.3664 664.8108 -323.1832
##
## Correlation Structure: AR(1)
## Formula: ~year
## Parameter estimate(s):
## Phi
## 0.7821947
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -29888.000 11833.624 -2.525685 0.0160
## escolaridad 7948.610 954.229 8.329877 0.0000
## population_growth 5377.599 3042.792 1.767324 0.0854
##
## Correlation:
## (Intr) esclrd
## escolaridad -0.989
## population_growth -0.978 0.940
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -1.7399918 -0.6272118 -0.2909866 0.5652343 2.6202153
##
## Residual standard error: 1238.885
## Degrees of freedom: 40 total; 37 residual
vif_vals <- vif(m1)
vif_vals
## escolaridad population_growth
## 20.79223 20.79223
# Regla:
# * Si el VIF es entre 1-5, no hay problema
# * Si está entre 5-10 es multicolinealidad moderada
# * Si es mayor que 10 hay multicolinealidad
Si hay VIF alto, opciones típicas: - Quitar/combinar variables muy correlacionadas - Usar transformaciones (p. ej., tasas o logs) - Mantenerlas pero interpretar con cautela (si el objetivo es predicción) - Aumentar tamaño muestral (no aplica aquí)
\[ Employment\_total_t = \beta_0 + \beta_1 Population\_working\_age_t + \beta_2 Unemployment\_rate_t + u_t \]
X2a <- "population_working_age"
X2b <- "unemployment_rate"
data_m2 <- data_all %>%
filter(!is.na(.data[[Y_var]]),
!is.na(.data[[X2a]]),
!is.na(.data[[X2b]]))
m2 <- lm(as.formula(paste(Y_var, "~", X2a, "+", X2b)), data = data_m2)
summary(m2)
##
## Call:
## lm(formula = as.formula(paste(Y_var, "~", X2a, "+", X2b)), data = data_m2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2401.3 -417.3 -251.1 138.0 1999.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.747e+03 7.057e+02 2.475 0.01789 *
## population_working_age 6.394e-01 9.028e-03 70.827 < 2e-16 ***
## unemployment_rate -5.257e+02 1.520e+02 -3.458 0.00136 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 844.1 on 38 degrees of freedom
## Multiple R-squared: 0.9927, Adjusted R-squared: 0.9924
## F-statistic: 2596 on 2 and 38 DF, p-value: < 2.2e-16
coeftest(m2, vcov = vcovHC(m2, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.7467e+03 6.8572e+02 2.5472 0.015029 *
## population_working_age 6.3943e-01 8.9339e-03 71.5743 < 2.2e-16 ***
## unemployment_rate -5.2567e+02 1.6618e+02 -3.1633 0.003065 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Diagnósticos
bptest(m2)
##
## studentized Breusch-Pagan test
##
## data: m2
## BP = 0.92732, df = 2, p-value = 0.629
dwtest(m2)
##
## Durbin-Watson test
##
## data: m2
## DW = 0.77724, p-value = 1.168e-06
## alternative hypothesis: true autocorrelation is greater than 0
resettest(m2, power = 2:3, type = "fitted")
##
## RESET test
##
## data: m2
## RESET = 3.6184, df1 = 2, df2 = 36, p-value = 0.03699
coeftest(m2, vcov = NeweyWest(m2, lag = 1, prewhite = FALSE))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.7467e+03 7.2540e+02 2.4079 0.021006 *
## population_working_age 6.3943e-01 8.4067e-03 76.0622 < 2.2e-16 ***
## unemployment_rate -5.2567e+02 1.9173e+02 -2.7417 0.009268 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# GLS AR(1)
m2_gls <- gls(
as.formula(paste(Y_var, "~", X2a, "+", X2b)),
data = data_m2,
correlation = corAR1(form = ~ year),
method = "ML"
)
summary(m2_gls)
## Generalized least squares fit by maximum likelihood
## Model: as.formula(paste(Y_var, "~", X2a, "+", X2b))
## Data: data_m2
## AIC BIC logLik
## 659.7513 668.3191 -324.8756
##
## Correlation Structure: AR(1)
## Formula: ~year
## Parameter estimate(s):
## Phi
## 0.6351418
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) 2355.4942 1224.5737 1.92352 0.0619
## population_working_age 0.6322 0.0176 35.90209 0.0000
## unemployment_rate -600.0197 181.6849 -3.30253 0.0021
##
## Correlation:
## (Intr) pplt__
## population_working_age -0.791
## unemployment_rate -0.466 -0.120
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.4053565 -0.4739263 -0.1383676 0.4632173 2.3846429
##
## Residual standard error: 859.9006
## Degrees of freedom: 41 total; 38 residual
# Valor Oxford Economics (empleo total, miles de personas)
oxford_2022 <- 57322.4
# Crear newdata 2022 usando último valor observado (proxy 2020)
make_new_2022 <- function(df, vars, year_col = "year") {
last_year <- max(df[[year_col]], na.rm = TRUE)
last_vals <- df %>%
dplyr::filter(.data[[year_col]] == last_year) %>%
dplyr::summarise(dplyr::across(dplyr::all_of(vars), ~ dplyr::first(.x))) %>%
as.data.frame()
last_vals[[year_col]] <- 2022
last_vals
}
# Newdata 2022 para el modelo 2
new_2022_m2 <- make_new_2022(
data_m2,
vars = c("population_working_age", "unemployment_rate")
)
# Predicción MCO (con intervalo de predicción 95%)
pred_m2_mco_2022 <- predict(
m2,
newdata = new_2022_m2,
interval = "prediction",
level = 0.95
)
# Predicción GLS AR(1) (puntual)
pred_m2_gls_2022 <- as.numeric(predict(m2_gls, newdata = new_2022_m2))
# Mostrar resultados
pred_m2_mco_2022
## fit lwr upr
## 1 53328.41 51541.23 55115.59
pred_m2_gls_2022
## [1] 52995.44
comp_m2 <- data.frame(
modelo = c("M2_MCO", "M2_GLS_AR1", "Oxford"),
pred_2022 = c(
as.numeric(pred_m2_mco_2022[1, "fit"]),
pred_m2_gls_2022,
oxford_2022
)
) %>%
mutate(
diferencia_vs_oxford = pred_2022 - oxford_2022,
diferencia_pct_vs_oxford = (pred_2022 / oxford_2022 - 1) * 100
)
print(comp_m2)
## modelo pred_2022 diferencia_vs_oxford diferencia_pct_vs_oxford
## 1 M2_MCO 53328.41 -3993.989 -6.967588
## 2 M2_GLS_AR1 52995.44 -4326.962 -7.548467
## 3 Oxford 57322.40 0.000 0.000000