library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(corrplot)
library(stringr)
library(plm)
library(psych)
library(ggthemes)
library(caret)
library(glmnet)
library(car)
library(regclass)
library(lmtest)
library(mctest)
library(caret)
library(lubridate)
library(forecast)
library(tseries)
library(urca)
library(reshape2)
exports <- read_excel("inegi_exports_dataset.xlsx",
sheet = "exports")
ts_exports <- read_excel("~/Proyecto R/CONCENTRACION/inegi_exports_dataset.xlsx",
sheet = "ts")
data <- read_excel("inegi_exports_dataset.xlsx",
sheet = "data")
fdi <- read_excel("inegi_exports_dataset.xlsx",
sheet = "fdi")
exports <- exports %>%
mutate(state = str_trim(state))
fdi <- fdi %>%
mutate(state = str_trim(state))
data <- data %>%
mutate(state = str_trim(state))
# EXPORTS
exports_long <- exports %>%
pivot_longer(
cols = starts_with("exports_"),
names_to = "year",
values_to = "real_exports"
) %>%
mutate(
year = str_extract(year, "\\d{4}") %>% as.numeric()
) %>%
select(state, region, year, real_exports)
# FDI
fdi_long <- fdi %>%
pivot_longer(
cols = starts_with("fdi_"),
names_to = "year",
values_to = "fdi"
) %>%
mutate(
year = str_extract(year, "\\d{4}") %>% as.numeric()
) %>%
select(state, region, year, fdi)
# Verificar duplicados en exports
dup_exports <- exports_long %>%
count(state, year) %>%
filter(n > 1)
# Verificar duplicados en fdi
dup_fdi <- fdi_long %>%
count(state, year) %>%
filter(n > 1)
# Verificar duplicados en base principal
dup_data <- data %>%
count(state, year) %>%
filter(n > 1)
# Si cualquiera tiene filas, hay problema estructural
print(dup_exports)
## # A tibble: 0 × 3
## # ℹ 3 variables: state <chr>, year <dbl>, n <int>
print(dup_fdi)
## # A tibble: 0 × 3
## # ℹ 3 variables: state <chr>, year <dbl>, n <int>
print(dup_data)
## # A tibble: 0 × 3
## # ℹ 3 variables: state <chr>, year <dbl>, n <int>
ts_exports_annual <- ts_exports %>%
mutate(
year = substr(date, 1, 4) %>% as.numeric()
) %>%
group_by(year) %>%
summarise(
ts_exports = sum(ts_exports, na.rm = TRUE),
.groups = "drop"
)
panel_data <- data %>%
left_join(exports_long, by = c("state", "year")) %>%
left_join(fdi_long %>% select(-region),
by = c("state", "year")) %>%
left_join(ts_exports_annual, by = "year")
psych::describe(panel_data)
## vars n mean sd median
## state* 1 288 16.50 9.25 16.50
## year 2 288 2019.00 2.59 2019.00
## pop_density 3 288 308.82 1063.93 65.02
## gdp_per_capita_2018 4 288 1986.91 1062.05 1788.31
## lq_primary 5 288 1.06 0.88 0.84
## lq_secondary 6 288 1.00 0.38 0.97
## lq_tertiary 7 288 1.00 0.09 1.00
## average_daily_salary 8 288 332.49 46.96 324.12
## real_public_investment_pc 9 288 627.38 446.39 506.42
## border_economic_activity 10 288 -1.77 0.87 -1.99
## crime_rate 11 256 27.43 23.99 18.36
## college_education 12 256 0.25 0.05 0.25
## exchange_rate 13 256 19.49 1.05 19.72
## border_distance 14 256 704.92 274.27 751.64
## inpc 15 256 105.17 11.77 104.47
## region* 16 256 4.09 1.51 4.00
## real_exports 17 256 228822083.78 274331648.02 97535120.79
## fdi 18 256 18535.61 26141.08 10478.15
## ts_exports 19 288 7822529.33 606507.99 7997670.57
## trimmed mad min max
## state* 16.50 11.86 1.00 3.200000e+01
## year 2019.00 2.97 2015.00 2.023000e+03
## pop_density 96.05 67.14 9.60 6.233410e+03
## gdp_per_capita_2018 1831.36 728.79 603.71 8.216560e+03
## lq_primary 0.92 0.64 0.01 4.630000e+00
## lq_secondary 0.97 0.45 0.37 1.990000e+00
## lq_tertiary 1.00 0.10 0.79 1.180000e+00
## average_daily_salary 328.56 43.29 249.97 5.057300e+02
## real_public_investment_pc 570.97 384.18 4.14 2.423360e+03
## border_economic_activity -1.92 0.26 -2.77 2.530000e+00
## crime_rate 23.29 14.07 1.98 1.169500e+02
## college_education 0.25 0.05 0.15 4.400000e-01
## exchange_rate 19.66 0.85 17.07 2.052000e+01
## border_distance 720.83 223.45 8.83 1.252660e+03
## inpc 104.54 13.82 89.05 1.264800e+02
## region* 4.16 1.48 1.00 6.000000e+00
## real_exports 178098052.86 136535980.71 268782.64 1.163117e+09
## fdi 13034.20 11111.04 -7412.68 1.712389e+05
## ts_exports 7860733.19 558612.44 6535587.76 8.792925e+06
## range skew kurtosis se
## state* 3.100000e+01 0.00 -1.21 0.55
## year 8.000000e+00 0.00 -1.24 0.15
## pop_density 6.223810e+03 5.20 25.70 62.69
## gdp_per_capita_2018 7.612850e+03 2.49 9.25 62.58
## lq_primary 4.620000e+00 1.46 1.96 0.05
## lq_secondary 1.620000e+00 0.42 -0.58 0.02
## lq_tertiary 4.000000e-01 -0.13 -0.57 0.01
## average_daily_salary 2.557600e+02 0.83 0.55 2.77
## real_public_investment_pc 2.419220e+03 1.32 2.08 26.30
## border_economic_activity 5.300000e+00 3.60 14.55 0.05
## crime_rate 1.149700e+02 1.50 1.66 1.50
## college_education 2.900000e-01 0.50 0.37 0.00
## exchange_rate 3.450000e+00 -1.29 0.80 0.07
## border_distance 1.243830e+03 -0.54 0.04 17.14
## inpc 3.743000e+01 0.36 -0.86 0.74
## region* 5.000000e+00 -0.32 -1.14 0.09
## real_exports 1.162848e+09 1.38 0.99 17145728.00
## fdi 1.786516e+05 3.26 12.47 1633.82
## ts_exports 2.257337e+06 -0.56 0.08 35738.83
panel_data$region <- as.factor(panel_data$region)
panel_data$state <- as.factor(panel_data$state)
summary(panel_data)
## state year pop_density
## Aguascalientes : 9 Min. :2015 Min. : 9.601
## Baja California : 9 1st Qu.:2017 1st Qu.: 42.067
## Baja California Sur: 9 Median :2019 Median : 65.017
## Campeche : 9 Mean :2019 Mean : 308.816
## Chiapas : 9 3rd Qu.:2021 3rd Qu.: 161.291
## Chihuahua : 9 Max. :2023 Max. :6233.409
## (Other) :234
## gdp_per_capita_2018 lq_primary lq_secondary lq_tertiary
## Min. : 603.7 Min. :0.01493 Min. :0.3685 Min. :0.7867
## 1st Qu.:1307.2 1st Qu.:0.43199 1st Qu.:0.6470 1st Qu.:0.9360
## Median :1788.3 Median :0.83619 Median :0.9693 Median :0.9993
## Mean :1986.9 Mean :1.05887 Mean :0.9969 Mean :0.9973
## 3rd Qu.:2314.7 3rd Qu.:1.32423 3rd Qu.:1.2569 3rd Qu.:1.0669
## Max. :8216.6 Max. :4.63480 Max. :1.9911 Max. :1.1847
##
## average_daily_salary real_public_investment_pc border_economic_activity
## Min. :250.0 Min. : 4.139 Min. :-2.772
## 1st Qu.:297.5 1st Qu.: 289.064 1st Qu.:-2.137
## Median :324.1 Median : 506.424 Median :-1.985
## Mean :332.5 Mean : 627.380 Mean :-1.765
## 3rd Qu.:360.2 3rd Qu.: 879.463 3rd Qu.:-1.791
## Max. :505.7 Max. :2423.360 Max. : 2.530
##
## crime_rate college_education exchange_rate border_distance
## Min. : 1.984 Min. :0.1453 Min. :17.07 Min. : 8.83
## 1st Qu.: 10.560 1st Qu.:0.2077 1st Qu.:19.16 1st Qu.: 613.26
## Median : 18.355 Median :0.2460 Median :19.72 Median : 751.64
## Mean : 27.433 Mean :0.2482 Mean :19.49 Mean : 704.92
## 3rd Qu.: 37.002 3rd Qu.:0.2798 3rd Qu.:20.20 3rd Qu.: 875.76
## Max. :116.950 Max. :0.4376 Max. :20.52 Max. :1252.66
## NA's :32 NA's :32 NA's :32 NA's :32
## inpc region real_exports fdi
## Min. : 89.05 CdMx : 8 Min. :2.688e+05 Min. : -7413
## 1st Qu.: 96.71 Centro_Sur_Oriente:48 1st Qu.:2.152e+07 1st Qu.: 4048
## Median :104.47 Noreste :32 Median :9.754e+07 Median : 10478
## Mean :105.17 Noroeste :48 Mean :2.288e+08 Mean : 18536
## 3rd Qu.:111.28 Occidente_Bajio :64 3rd Qu.:3.502e+08 3rd Qu.: 22572
## Max. :126.48 Sur :56 Max. :1.163e+09 Max. :171239
## NA's :32 NA's :32 NA's :32 NA's :32
## ts_exports
## Min. :6535588
## 1st Qu.:7588156
## Median :7997671
## Mean :7822529
## 3rd Qu.:8098039
## Max. :8792925
##
panel_data <- panel_data[-(1:32), , drop = FALSE]
str(panel_data)
## tibble [256 × 19] (S3: tbl_df/tbl/data.frame)
## $ state : Factor w/ 32 levels "Aguascalientes",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ year : num [1:256] 2016 2016 2016 2016 2016 ...
## $ pop_density : num [1:256] 235.01 50.85 9.79 14.92 70.76 ...
## $ gdp_per_capita_2018 : num [1:256] 2329 2281 2254 7731 713 ...
## $ lq_primary : num [1:256] 0.403 0.845 0.599 0.486 0.731 ...
## $ lq_secondary : num [1:256] 1.191 1.508 0.651 0.709 0.965 ...
## $ lq_tertiary : num [1:256] 0.992 0.866 1.097 1.091 1.024 ...
## $ average_daily_salary : num [1:256] 314 333 318 424 307 ...
## $ real_public_investment_pc: num [1:256] 1027 275 1360 887 1022 ...
## $ border_economic_activity : num [1:256] -1.85 2.42 -2.16 -2.15 -2.36 ...
## $ crime_rate : num [1:256] 3.71 31.68 32.89 10.84 10.64 ...
## $ college_education : num [1:256] 0.249 0.258 0.287 0.229 0.146 ...
## $ exchange_rate : num [1:256] 20.5 20.5 20.5 20.5 20.5 ...
## $ border_distance : num [1:256] 625.59 8.83 800.32 978.33 1111.82 ...
## $ inpc : num [1:256] 92 92 92 92 92 ...
## $ region : Factor w/ 6 levels "CdMx","Centro_Sur_Oriente",..: 5 4 4 6 6 4 1 3 5 4 ...
## $ real_exports : num [1:256] 1.75e+08 8.03e+08 5.30e+06 2.12e+08 1.31e+07 ...
## $ fdi : num [1:256] 13225 33796 11420 3162 3091 ...
## $ ts_exports : num [1:256] 7399443 7399443 7399443 7399443 7399443 ...
variables <- c("year","real_public_investment_pc","fdi","real_exports","border_distance","inpc")
panel_data %>%
pivot_longer(cols = all_of(variables), names_to = "variable", values_to = "valor") %>%
ggplot(aes(x = valor)) +
geom_histogram(bins = 20) +
facet_wrap(~ variable, scales = "free") +
labs(title = "Distribucion de exportaciones",
x = "Valor de exportaciones", y = "Frecuencia") +
theme_fivethirtyeight()
panel_data %>%
pivot_longer(cols = all_of(variables), names_to = "variable", values_to = "valor") %>%
ggplot(aes(x = region, y = valor)) +
geom_boxplot(outlier.alpha = 0.6) +
facet_wrap(~ variable, scales = "free_y") +
labs(title = "Dispersion de exportaciones por region (boxplots)",
x = "Region", y = "Valor de exportaciones") +
theme_fivethirtyeight() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
corr_mat <- panel_data %>%
select(all_of(variables)) %>%
cor(use = "complete.obs")
corr_df <- melt(corr_mat)
ggplot(corr_df, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = round(value, 2)), size = 3) +
scale_fill_gradient2(low = "#3B4CC0",
mid = "white",
high = "#B40426",
midpoint = 0,
limits = c(-1,1),
name = "Correlacion") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
panel.grid = element_blank()) +
labs(title = "Matriz de correlacion: Variables Clave",
x = NULL, y = NULL)
scatter_data <- panel_data %>%
select(real_exports, all_of(variables)) %>%
pivot_longer(
cols = -real_exports,
names_to = "variable",
values_to = "value"
)
ggplot(scatter_data, aes(x = value, y = real_exports)) +
geom_point(alpha = 0.5, color = "#2C3E50") +
geom_smooth(method = "lm", se = FALSE, color = "#E74C3C") +
facet_wrap(~ variable, scales = "free_x")
modelo_base <- lm(real_exports ~ fdi + inpc, data = panel_data)
rmse_base <- sqrt(mean(residuals(modelo_base)^2, na.rm = TRUE))
summary(modelo_base)
##
## Call:
## lm(formula = real_exports ~ fdi + inpc, data = panel_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -681702219 -162790837 -107996862 103162435 878848333
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.285e+07 1.800e+08 0.238 0.812
## fdi 3.163e+03 6.784e+02 4.662 5.41e-06 ***
## inpc 1.189e+06 1.656e+06 0.718 0.473
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 265600000 on 221 degrees of freedom
## (32 observations deleted due to missingness)
## Multiple R-squared: 0.09038, Adjusted R-squared: 0.08215
## F-statistic: 10.98 on 2 and 221 DF, p-value: 2.844e-05
rmse_base
## [1] 263766989
modelo_full <- lm(real_exports ~ state + fdi + real_public_investment_pc + border_distance + inpc + year, data = panel_data)
rmse_full <- sqrt(mean(residuals(modelo_full)^2, na.rm = TRUE))
summary(modelo_full)
##
## Call:
## lm(formula = real_exports ~ state + fdi + real_public_investment_pc +
## border_distance + inpc + year, data = panel_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -103824886 -11651720 1555925 9832202 124781237
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.223e+10 1.085e+10 2.049 0.041873 *
## stateBaja California 5.945e+08 1.677e+07 35.438 < 2e-16 ***
## stateBaja California Sur -1.824e+08 1.566e+07 -11.649 < 2e-16 ***
## stateCampeche 7.464e+07 1.613e+07 4.626 6.91e-06 ***
## stateChiapas -1.689e+08 1.605e+07 -10.522 < 2e-16 ***
## stateChihuahua 8.353e+08 1.683e+07 49.641 < 2e-16 ***
## stateCiudad de Mexico -1.443e+08 3.300e+07 -4.372 2.03e-05 ***
## stateCoahuila 6.636e+08 1.619e+07 40.985 < 2e-16 ***
## stateColima -1.733e+08 1.609e+07 -10.772 < 2e-16 ***
## stateDurango -1.432e+08 1.578e+07 -9.076 < 2e-16 ***
## stateGuanajuato 2.860e+08 1.625e+07 17.601 < 2e-16 ***
## stateGuerrero -1.669e+08 1.619e+07 -10.307 < 2e-16 ***
## stateHidalgo -1.444e+08 1.617e+07 -8.936 3.73e-16 ***
## stateJalisco 2.071e+08 1.670e+07 12.402 < 2e-16 ***
## stateMexico 1.508e+08 1.760e+07 8.571 3.71e-15 ***
## stateMichoacan -1.066e+08 1.629e+07 -6.541 5.63e-10 ***
## stateMorelos -1.221e+08 1.598e+07 -7.638 1.08e-12 ***
## stateNayarit -1.796e+08 1.603e+07 -11.206 < 2e-16 ***
## stateNuevo Leon 5.409e+08 2.066e+07 26.178 < 2e-16 ***
## stateOaxaca -1.668e+08 1.621e+07 -10.291 < 2e-16 ***
## statePuebla 9.566e+07 1.609e+07 5.946 1.32e-08 ***
## stateQueretaro 4.235e+07 1.569e+07 2.700 0.007572 **
## stateQuintana Roo -1.820e+08 1.608e+07 -11.319 < 2e-16 ***
## stateSan Luis Potosi 7.217e+07 1.572e+07 4.590 8.09e-06 ***
## stateSinaloa -1.496e+08 1.568e+07 -9.545 < 2e-16 ***
## stateSonora 1.722e+08 1.600e+07 10.765 < 2e-16 ***
## stateTabasco -6.202e+07 1.600e+07 -3.877 0.000146 ***
## stateTamaulipas 3.374e+08 1.582e+07 21.326 < 2e-16 ***
## stateTlaxcala -1.576e+08 1.585e+07 -9.942 < 2e-16 ***
## stateVeracruz -6.757e+07 1.628e+07 -4.150 5.04e-05 ***
## stateYucatan -1.591e+08 1.661e+07 -9.577 < 2e-16 ***
## stateZacatecas -1.209e+08 1.600e+07 -7.555 1.77e-12 ***
## fdi 1.605e+01 2.431e+02 0.066 0.947425
## real_public_investment_pc 1.159e+04 7.000e+03 1.656 0.099357 .
## border_distance NA NA NA NA
## inpc 2.922e+06 9.961e+05 2.934 0.003764 **
## year -1.108e+07 5.426e+06 -2.042 0.042558 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29280000 on 188 degrees of freedom
## (32 observations deleted due to missingness)
## Multiple R-squared: 0.9906, Adjusted R-squared: 0.9888
## F-statistic: 565.7 on 35 and 188 DF, p-value: < 2.2e-16
rmse_full
## [1] 26821007
modelo_log <- lm(real_exports ~ state + log(1+fdi) + log(real_public_investment_pc) + log(border_distance) + log(inpc) + year, data = panel_data)
rmse_log <- sqrt(mean(residuals(modelo_log)^2, na.rm = TRUE))
summary(modelo_log)
##
## Call:
## lm(formula = real_exports ~ state + log(1 + fdi) + log(real_public_investment_pc) +
## log(border_distance) + log(inpc) + year, data = panel_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -105020500 -11490402 866747 10714873 127159981
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.758e+10 1.274e+10 2.164 0.03174 *
## stateBaja California 5.933e+08 1.665e+07 35.626 < 2e-16 ***
## stateBaja California Sur -1.807e+08 1.595e+07 -11.328 < 2e-16 ***
## stateCampeche 7.702e+07 1.615e+07 4.768 3.76e-06 ***
## stateChiapas -1.710e+08 1.628e+07 -10.506 < 2e-16 ***
## stateChihuahua 8.341e+08 1.668e+07 49.994 < 2e-16 ***
## stateCiudad de Mexico -1.399e+08 1.707e+07 -8.200 3.92e-14 ***
## stateCoahuila 6.639e+08 1.603e+07 41.412 < 2e-16 ***
## stateColima -1.755e+08 1.731e+07 -10.139 < 2e-16 ***
## stateDurango -1.432e+08 1.591e+07 -9.001 2.69e-16 ***
## stateGuanajuato 2.848e+08 1.613e+07 17.652 < 2e-16 ***
## stateGuerrero -1.695e+08 1.620e+07 -10.461 < 2e-16 ***
## stateHidalgo -1.460e+08 1.637e+07 -8.920 4.49e-16 ***
## stateJalisco 2.066e+08 1.625e+07 12.708 < 2e-16 ***
## stateMexico 1.529e+08 1.628e+07 9.392 < 2e-16 ***
## stateMichoacan -1.087e+08 1.643e+07 -6.621 3.76e-10 ***
## stateMorelos -1.237e+08 1.616e+07 -7.652 1.06e-12 ***
## stateNayarit -1.822e+08 1.612e+07 -11.303 < 2e-16 ***
## stateNuevo Leon 5.428e+08 1.782e+07 30.457 < 2e-16 ***
## stateOaxaca -1.692e+08 1.692e+07 -10.002 < 2e-16 ***
## statePuebla 9.486e+07 1.631e+07 5.815 2.62e-08 ***
## stateQueretaro 4.382e+07 1.596e+07 2.746 0.00662 **
## stateQuintana Roo -1.845e+08 1.608e+07 -11.475 < 2e-16 ***
## stateSan Luis Potosi 7.171e+07 1.591e+07 4.508 1.16e-05 ***
## stateSinaloa -1.492e+08 1.584e+07 -9.419 < 2e-16 ***
## stateSonora 1.705e+08 1.611e+07 10.582 < 2e-16 ***
## stateTabasco -6.451e+07 1.609e+07 -4.009 8.85e-05 ***
## stateTamaulipas 3.379e+08 1.597e+07 21.154 < 2e-16 ***
## stateTlaxcala -1.591e+08 1.604e+07 -9.918 < 2e-16 ***
## stateVeracruz -6.971e+07 1.650e+07 -4.224 3.76e-05 ***
## stateYucatan -1.588e+08 1.776e+07 -8.941 3.94e-16 ***
## stateZacatecas -1.253e+08 1.671e+07 -7.496 2.65e-12 ***
## log(1 + fdi) -4.755e+05 2.401e+06 -0.198 0.84324
## log(real_public_investment_pc) 4.486e+06 3.502e+06 1.281 0.20179
## log(border_distance) NA NA NA NA
## log(inpc) 3.782e+08 1.333e+08 2.837 0.00507 **
## year -1.445e+07 6.614e+06 -2.185 0.03013 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29630000 on 185 degrees of freedom
## (35 observations deleted due to missingness)
## Multiple R-squared: 0.9904, Adjusted R-squared: 0.9886
## F-statistic: 548.1 on 35 and 185 DF, p-value: < 2.2e-16
rmse_log
## [1] 27112395
set.seed(123)
idx <- createDataPartition(panel_data$real_exports, p = 0.75, list = FALSE)
train_data <- panel_data[idx, ]
test_data <- panel_data[-idx, ]
# --- TRAIN: x e y alineados (mismas filas) ---
mf_train <- model.frame(real_exports ~ ., data = train_data, na.action = na.omit)
y_train <- model.response(mf_train)
x_train <- model.matrix(real_exports ~ ., data = mf_train)[, -1]
# --- TEST: x e y alineados (mismas filas) ---
mf_test <- model.frame(real_exports ~ ., data = test_data, na.action = na.omit)
y_test <- model.response(mf_test)
x_test <- model.matrix(real_exports ~ ., data = mf_test)[, -1]
set.seed(123)
cv_lasso <- cv.glmnet(x_train, y_train, alpha = 1)
lasso_model <- glmnet(x_train, y_train, alpha = 1, lambda = cv_lasso$lambda.min)
pred_lasso <- as.vector(predict(lasso_model, newx = x_test))
RMSE_lasso <- RMSE(pred_lasso, y_test)
R2_lasso <- R2(pred_lasso, y_test)
RMSE_lasso
## [1] 32077848
R2_lasso
## [1] 0.9858817
coef(lasso_model)
## 53 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 2.434832e+10
## stateBaja California 1.646669e+08
## stateBaja California Sur -9.417562e+07
## stateCampeche 1.740530e+08
## stateChiapas .
## stateChihuahua 7.184962e+08
## stateCiudad de Mexico -1.519623e+08
## stateCoahuila 5.336884e+08
## stateColima -1.150422e+07
## stateDurango -8.313408e+07
## stateGuanajuato 3.148007e+08
## stateGuerrero -1.339583e+07
## stateHidalgo -1.034491e+08
## stateJalisco 2.590956e+08
## stateMexico 2.084174e+08
## stateMichoacan -3.059761e+07
## stateMorelos -4.885840e+07
## stateNayarit -5.774609e+07
## stateNuevo Leon 3.214024e+08
## stateOaxaca -7.960520e+06
## statePuebla 1.724382e+08
## stateQueretaro 2.445335e+07
## stateQuintana Roo 6.838772e+07
## stateSan Luis Potosi .
## stateSinaloa -3.404641e+07
## stateSonora 5.810535e+07
## stateTabasco 7.354367e+07
## stateTamaulipas 1.734331e+08
## stateTlaxcala -1.107964e+08
## stateVeracruz -3.546283e+07
## stateYucatan -4.593071e+07
## stateZacatecas -1.023260e+08
## year -1.221207e+07
## pop_density .
## gdp_per_capita_2018 -5.817087e+03
## lq_primary -7.187881e+06
## lq_secondary 2.631034e+07
## lq_tertiary .
## average_daily_salary 5.493408e+05
## real_public_investment_pc 4.211319e+03
## border_economic_activity 6.272532e+07
## crime_rate -1.670094e+05
## college_education -1.463661e+07
## exchange_rate 2.700079e+06
## border_distance -2.245104e+05
## inpc .
## regionCentro_Sur_Oriente .
## regionNoreste 2.964960e+07
## regionNoroeste .
## regionOccidente_Bajio .
## regionSur .
## fdi 1.080623e+02
## ts_exports 6.138800e+01
resultados <- data.frame(
Regression_Model = c("a) Modelo Base", "b) Modelo Full", "c) Modelo log", "d) LASSO"),
Adjusted_R2 = c(summary(modelo_base)$adj.r.squared,
summary(modelo_full)$adj.r.squared,
summary(modelo_log)$adj.r.squared,
R2_lasso),
RMSE = c(rmse_base, rmse_full, rmse_log, RMSE_lasso)
)
resultados
## Regression_Model Adjusted_R2 RMSE
## 1 a) Modelo Base 0.08214863 263766989
## 2 b) Modelo Full 0.98884381 26821007
## 3 c) Modelo log 0.98864172 27112395
## 4 d) LASSO 0.98588168 32077848
VIF(modelo_base)
## fdi inpc
## 1.002812 1.002812
bptest(modelo_base)
##
## studentized Breusch-Pagan test
##
## data: modelo_base
## BP = 43.961, df = 2, p-value = 2.845e-10
bptest(modelo_full)
##
## studentized Breusch-Pagan test
##
## data: modelo_full
## BP = 76.177, df = 35, p-value = 6.952e-05
bptest(modelo_log)
##
## studentized Breusch-Pagan test
##
## data: modelo_log
## BP = 72.379, df = 35, p-value = 0.0002069
hist(modelo_base$residuals,xlab="Residuales estimados de la regresion", main="Distribucion de Residuales en Modelo Base", col="lightgreen", border="white")
hist(modelo_full$residuals,xlab="Residuales estimados de la regresion", main="Distribucion de Residuales en Modelo Full", col="lightgreen", border="white")
hist(modelo_log$residuals,xlab="Residuales estimados de la regresion", main="Distribucion de Residuales en Modelo logaritmico", col="lightgreen", border="white")
library(tseries)
# Jarque-Bera test
jb_test_base <- jarque.bera.test(modelo_base$residuals)
jb_test_base
##
## Jarque Bera Test
##
## data: modelo_base$residuals
## X-squared = 64.302, df = 2, p-value = 1.088e-14
jb_test_full <- jarque.bera.test(modelo_full$residuals)
jb_test_full
##
## Jarque Bera Test
##
## data: modelo_full$residuals
## X-squared = 141.43, df = 2, p-value < 2.2e-16
jb_test_log <- jarque.bera.test(modelo_log$residuals)
jb_test_log
##
## Jarque Bera Test
##
## data: modelo_log$residuals
## X-squared = 148.02, df = 2, p-value < 2.2e-16
AIC(modelo_base, modelo_full, modelo_log)
## df AIC
## modelo_base 4 9330.663
## modelo_full 37 8372.588
## modelo_log 37 8266.223
ts_df <- ts_exports %>%
mutate(
year = as.integer(substr(date, 1, 4)),
month = as.integer(substr(date, 6, 7)),
date_m = as.Date(paste0(year, "-", month, "-01"))
) %>%
arrange(date_m)
p_ts <- ggplot(ts_df, aes(x = date_m, y = ts_exports)) +
geom_line() +
theme_minimal() +
labs(
title = "Exportaciones a traves del tiempo",
x = "Date",
y = "ts_exports"
)
p_ts
ts_y <- ts(
ts_df$ts_exports,
start = c(min(ts_df$year), min(ts_df$month[ts_df$year == min(ts_df$year)])),
frequency = 12
)
decomp_classic <- decompose(ts_y, type = "additive")
plot(decomp_classic)
adf_level <- tseries::adf.test(ts_y)
adf_level
##
## Augmented Dickey-Fuller Test
##
## data: ts_y
## Dickey-Fuller = -2.7908, Lag order = 7, p-value = 0.2433
## alternative hypothesis: stationary
Acf(ts_y, main = "ACF: ts_exports")
Pacf(ts_y, main = "PACF: ts_exports")
ts_y_d1 <- diff(ts_y, differences = 1)
modelo_arma <- Arima(
ts_y_d1,
order = c(1,0,1),
include.constant = FALSE
)
summary(modelo_arma)
## Series: ts_y_d1
## ARIMA(1,0,1) with zero mean
##
## Coefficients:
## ar1 ma1
## 0.1397 -0.7685
## s.e. 0.0648 0.0388
##
## sigma^2 = 1.969e+09: log likelihood = -4931.87
## AIC=9869.73 AICc=9869.79 BIC=9881.76
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 6095.921 44259.69 28757.12 11.34027 211.5769 0.8993409
## ACF1
## Training set -0.005080321
checkresiduals(modelo_arma)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,1) with zero mean
## Q* = 195.82, df = 22, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 24
modelo_arima <- Arima(
ts_y,
order = c(1,1,1),
include.constant = TRUE
)
summary(modelo_arima)
## Series: ts_y
## ARIMA(1,1,1) with drift
##
## Coefficients:
## ar1 ma1 drift
## 0.1647 -0.8148 1652.2861
## s.e. 0.0649 0.0382 485.8929
##
## sigma^2 = 1.928e+09: log likelihood = -4927.24
## AIC=9862.47 AICc=9862.57 BIC=9878.51
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 1.093527 43697.63 28231.01 -0.9572928 7.338796 0.6850747
## ACF1
## Training set 0.01418405
checkresiduals(modelo_arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1) with drift
## Q* = 185.54, df = 22, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 24
modelo_sarima <- Arima(
ts_y,
order = c(1,1,1),
seasonal = c(1,1,1),
include.constant = TRUE
)
summary(modelo_sarima)
## Series: ts_y
## ARIMA(1,1,1)(1,1,1)[12]
##
## Coefficients:
## ar1 ma1 sar1 sma1
## 0.1122 -0.7140 0.0104 -0.8390
## s.e. 0.0757 0.0523 0.0585 0.0306
##
## sigma^2 = 1.369e+09: log likelihood = -4720.77
## AIC=9451.55 AICc=9451.7 BIC=9471.44
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -97.56791 36215.02 22759.48 -0.4223074 5.68124 0.5522986
## ACF1
## Training set 0.01070795
checkresiduals(modelo_sarima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)(1,1,1)[12]
## Q* = 150.49, df = 20, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 24
library(forecast)
fc_5 <- forecast(modelo_sarima, h = (5*12))
fc_5
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jan 2026 626786.3 579376.6 674196.0 554279.4 699293.1
## Feb 2026 701484.9 650453.8 752516.0 623439.6 779530.2
## Mar 2026 770569.9 717184.5 823955.3 688924.0 852215.8
## Apr 2026 714768.1 659228.2 770308.0 629827.1 799709.0
## May 2026 727821.7 670218.5 785424.9 639725.2 815918.3
## Jun 2026 738255.1 678661.1 797849.1 647114.0 829396.3
## Jul 2026 731630.7 670110.4 793150.9 637543.6 825717.8
## Aug 2026 751347.6 687959.6 814735.6 654404.1 848291.1
## Sep 2026 749738.1 684535.9 814940.3 650019.9 849456.3
## Oct 2026 802541.5 735574.2 869508.8 700123.8 904959.2
## Nov 2026 753991.8 685304.7 822678.9 648943.9 859039.6
## Dec 2026 762075.9 691711.0 832440.7 654462.1 869689.6
## Jan 2027 643327.4 569173.6 717481.2 529919.0 756735.8
## Feb 2027 718024.4 641595.3 794453.5 601136.1 834912.6
## Mar 2027 786930.0 708418.7 865441.3 666857.4 907002.7
## Apr 2027 731014.2 650488.3 811540.1 607860.4 854168.0
## May 2027 744186.0 661696.1 826675.9 618028.6 870343.4
## Jun 2027 755060.9 670652.9 839468.8 625970.0 884151.7
## Jul 2027 748059.9 661776.4 834343.3 616100.8 880019.0
## Aug 2027 768174.3 680055.3 856293.2 633407.9 902940.6
## Sep 2027 766603.5 676686.5 856520.6 629087.3 904119.8
## Oct 2027 818583.8 726904.0 910263.7 678371.6 958796.1
## Nov 2027 770978.1 677568.7 864387.5 628120.7 913835.5
## Dec 2027 778723.7 683616.2 873831.2 633269.3 924178.1
## Jan 2028 659963.4 561468.0 758458.8 509327.7 810599.2
## Feb 2028 734660.4 633958.2 835362.5 580649.8 888671.0
## Mar 2028 803564.2 700807.6 906320.7 646411.6 960716.7
## Apr 2028 747647.2 642888.0 852406.4 587431.8 907862.6
## May 2028 760820.2 654097.1 867543.3 597601.3 924039.0
## Jun 2028 771699.6 663048.3 880350.9 605531.7 937867.5
## Jul 2028 764694.7 654148.8 875240.6 595629.3 933760.1
## Aug 2028 784813.2 672404.6 897221.8 612899.1 956727.3
## Sep 2028 783242.9 669002.0 897483.8 608526.5 957959.3
## Oct 2028 835214.7 719170.4 951258.9 657740.3 1012689.1
## Nov 2028 787618.7 669798.7 905438.8 607428.5 967809.0
## Dec 2028 795360.8 675791.4 914930.3 612495.1 978226.5
## Jan 2029 676600.4 553775.0 799425.8 488755.2 864445.6
## Feb 2029 751297.4 626253.3 876341.5 560059.0 942535.8
## Mar 2029 820201.1 693072.1 947330.2 625774.0 1014628.3
## Apr 2029 764284.1 635114.1 893454.2 566735.6 961832.7
## May 2029 777457.2 646279.0 908635.3 576837.5 978076.8
## Jun 2029 788336.6 655180.8 921492.4 584692.4 991980.9
## Jul 2029 781331.7 646227.2 916436.2 574707.2 987956.2
## Aug 2029 801450.3 664424.7 938475.8 591887.8 1011012.7
## Sep 2029 799879.9 660960.0 938799.9 587420.2 1012339.7
## Oct 2029 851851.6 711062.7 992640.5 636533.5 1067169.7
## Nov 2029 804255.8 661622.4 946889.2 586116.9 1022394.7
## Dec 2029 811997.8 667543.5 956452.1 591074.1 1032921.6
## Jan 2030 693237.4 545582.1 840892.7 467418.1 919056.7
## Feb 2030 767934.4 618018.5 917850.3 538657.8 997210.9
## Mar 2030 836838.1 684783.2 988893.1 604290.2 1069386.1
## Apr 2030 780921.1 626766.5 935075.8 545161.9 1016680.4
## May 2030 794094.2 637869.0 950319.3 555168.4 1033019.9
## Jun 2030 804973.6 646705.3 963242.0 562923.0 1047024.3
## Jul 2030 797968.7 637683.1 958254.3 552833.0 1043104.4
## Aug 2030 818087.3 655809.5 980365.0 569904.9 1066269.6
## Sep 2030 816516.9 652271.2 980762.6 565324.8 1067709.0
## Oct 2030 868488.6 702298.3 1034679.0 614322.4 1122654.8
## Nov 2030 820892.8 652780.3 989005.3 563786.9 1077998.7
## Dec 2030 828634.8 658621.9 998647.8 568622.5 1088647.2
autoplot(fc_5) +
theme_minimal() +
labs(
title = "SARIMA Forecast: Proximos 5 periodos",
x = "Time",
y = "ts_exports"
)