Introducción

  • Ángel Santiago Díaz Castillo
  • Angie Michelle Zerón Monge
  • Eli Gabriel Hernánez Medina
  • Patricio Javier Pérez Fajardo
library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(spdep)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
library(spatialreg)
## Loading required package: Matrix
## 
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
## 
##     get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
##     set.coresOption, set.mcOption, set.VerboseOption,
##     set.ZeroPolicyOption
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## Loading required package: lattice
library(ggplot2)
# Leer base principal de turismo
tourism_data <- read_excel("/Users/gabrielmedina/Downloads/mx_spatial_data-2/tourism_state_data.xlsx")
## New names:
## • `region` -> `region...17`
## • `region` -> `region...18`
# Leer base de cuartos ocupados por extranjeros (equipo Beta)
beta_data <- read_excel("/Users/gabrielmedina/Downloads/bd22.xlsx")
names(beta_data) <- trimws(names(beta_data))

# Selección y renombramiento
beta_data <- beta_data %>% 
  rename(state = `Row Labels`, extranjeros = `Cuartos Ocupados Extranjeros`) %>% 
  select(state, extranjeros) %>% 
  mutate(extranjeros = as.numeric(extranjeros))

# Filtrar año más reciente en turismo y unir con base Beta
data_merged <- tourism_data %>% 
  filter(year == 2021) %>% 
  select(state, state_id, tourism_gdp, crime_rate, college_education, business_activity, pop_density) %>% 
  left_join(beta_data, by = "state") %>% 
  na.omit()

# Cargar shapefile
mx_map <- st_read("/Users/gabrielmedina/Downloads/mx_spatial_data/mx_maps/mx_states/mexlatlong.shp")
## Reading layer `mexlatlong' from data source 
##   `/Users/gabrielmedina/Downloads/mx_spatial_data/mx_maps/mx_states/mexlatlong.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 32 features and 19 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -118.4042 ymin: 14.55055 xmax: -86.73862 ymax: 32.71846
## Geodetic CRS:  WGS 84
# Unir shapefile con datos manualmente
geo_data <- mx_map %>% 
  left_join(data_merged, by = c("OBJECTID" = "state_id")) %>% 
  na.omit()
# CREAR MATRIZ DE VECINDAD -------------------------------------------
nb <- poly2nb(geo_data)
lw <- nb2listw(nb, style = "W", zero.policy = TRUE)
# MODELO NO ESPACIAL ------------------------------------------------
modelo_lm <- lm(extranjeros ~ tourism_gdp + crime_rate + college_education + business_activity + pop_density, data = geo_data)
summary(modelo_lm)
## 
## Call:
## lm(formula = extranjeros ~ tourism_gdp + crime_rate + college_education + 
##     business_activity + pop_density, data = geo_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -512146 -135959    6985  131697  909183 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.377e+06  5.023e+05  -2.742 0.012970 *  
## tourism_gdp        6.791e+00  1.723e+00   3.941 0.000876 ***
## crime_rate         8.070e+02  2.352e+03   0.343 0.735330    
## college_education  3.696e+06  1.475e+06   2.505 0.021496 *  
## business_activity -1.517e+05  7.650e+04  -1.983 0.061992 .  
## pop_density       -7.942e+02  5.947e+02  -1.335 0.197502    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 304700 on 19 degrees of freedom
## Multiple R-squared:  0.5828, Adjusted R-squared:  0.473 
## F-statistic: 5.308 on 5 and 19 DF,  p-value: 0.003223
# AUTOCORRELACION ESPACIAL DE RESIDUALES ----------------------------
moran.test(residuals(modelo_lm), lw, zero.policy = TRUE)
## 
##  Moran I test under randomisation
## 
## data:  residuals(modelo_lm)  
## weights: lw    
## 
## Moran I statistic standard deviate = 0.28251, p-value = 0.3888
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##     -0.0001942027     -0.0416666667      0.0215494956
# MODELOS ESPACIALES ------------------------------------------------
# SAR
modelo_sar <- lagsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + business_activity + pop_density,
                       data = geo_data, listw = lw, zero.policy = TRUE)
## Warning in lagsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 6.14852e-23 - using numerical Hessian.
summary(modelo_sar)
## 
## Call:
## lagsarlm(formula = extranjeros ~ tourism_gdp + crime_rate + college_education + 
##     business_activity + pop_density, data = geo_data, listw = lw, 
##     zero.policy = TRUE)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -511250 -141722   15573  127338  902078 
## 
## Type: lag 
## Coefficients: (numerical Hessian approximate standard errors) 
##                      Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)       -1.3851e+06  4.3500e+05 -3.1842  0.001452
## tourism_gdp        6.6752e+00  1.5074e+00  4.4283 9.498e-06
## crime_rate         8.4363e+02  2.0363e+03  0.4143  0.678661
## college_education  3.7715e+06  1.2847e+06  2.9358  0.003327
## business_activity -1.5498e+05  6.6483e+04 -2.3311  0.019751
## pop_density       -8.3319e+02  5.1998e+02 -1.6023  0.109079
## 
## Rho: -0.11211, LR test value: 0.27153, p-value: 0.60231
## Approximate (numerical Hessian) standard error: 0.21473
##     z-value: -0.52209, p-value: 0.60161
## Wald statistic: 0.27258, p-value: 0.60161
## 
## Log likelihood: -347.5878 for lag model
## ML residual variance (sigma squared): 6.9537e+10, (sigma: 263700)
## Number of observations: 25 
## Number of parameters estimated: 8 
## AIC: 711.18, (AIC for lm: 709.45)
# SEM
modelo_sem <- errorsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + business_activity + pop_density,
                         data = geo_data, listw = lw, zero.policy = TRUE)
## Warning in errorsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 1.32256e-22 - using numerical Hessian.
## Warning in sqrt(fdHess[1, 1]): NaNs produced
summary(modelo_sem)
## 
## Call:errorsarlm(formula = extranjeros ~ tourism_gdp + crime_rate + 
##     college_education + business_activity + pop_density, data = geo_data, 
##     listw = lw, zero.policy = TRUE)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -512090 -136024    7046  131566  909238 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                      Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)       -1.3768e+06  4.3783e+05 -3.1446  0.001663
## tourism_gdp        6.7902e+00  1.5023e+00  4.5198 6.191e-06
## crime_rate         8.0352e+02  2.0509e+03  0.3918  0.695212
## college_education  3.6956e+06  1.2858e+06  2.8741  0.004051
## business_activity -1.5173e+05  6.6700e+04 -2.2749  0.022912
## pop_density       -7.9461e+02  5.1838e+02 -1.5329  0.125307
## 
## Lambda: -0.00075619, LR test value: 3.6709e-06, p-value: 0.99847
## Approximate (numerical Hessian) standard error: NaN
##     z-value: NaN, p-value: NA
## Wald statistic: NaN, p-value: NA
## 
## Log likelihood: -347.7235 for error model
## ML residual variance (sigma squared): 7.058e+10, (sigma: 265670)
## Number of observations: 25 
## Number of parameters estimated: 8 
## AIC: 711.45, (AIC for lm: 709.45)
# SDM
modelo_sdm <- lagsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + business_activity + pop_density,
                       data = geo_data, listw = lw, type = "mixed", zero.policy = TRUE)
## Warning in lagsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16 
##   reciprocal condition number = 1.01794e-22 - using numerical Hessian.
summary(modelo_sdm)
## 
## Call:
## lagsarlm(formula = extranjeros ~ tourism_gdp + crime_rate + college_education + 
##     business_activity + pop_density, data = geo_data, listw = lw, 
##     type = "mixed", zero.policy = TRUE)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -400758.2  -70581.4   -3525.8  124108.7  546317.6 
## 
## Type: mixed 
## Coefficients: (numerical Hessian approximate standard errors) 
##                          Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)           -2.0591e+06  1.0929e+06 -1.8840 0.0595634
## tourism_gdp            6.3418e+00  1.3326e+00  4.7589 1.947e-06
## crime_rate             4.8704e+02  1.9410e+03  0.2509 0.8018705
## college_education      2.8647e+06  1.5181e+06  1.8870 0.0591604
## business_activity     -2.1136e+05  6.1588e+04 -3.4319 0.0005994
## pop_density           -6.9569e+01  5.4473e+02 -0.1277 0.8983771
## lag.tourism_gdp       -3.8493e+00  2.6551e+00 -1.4498 0.1471155
## lag.crime_rate        -6.6186e+03  3.0768e+03 -2.1512 0.0314622
## lag.college_education  4.6221e+06  3.8881e+06  1.1888 0.2345289
## lag.business_activity -3.0240e+04  9.4032e+04 -0.3216 0.7477622
## lag.pop_density       -1.6027e+03  9.9942e+02 -1.6037 0.1087871
## 
## Rho: -0.13467, LR test value: 0.24133, p-value: 0.62325
## Approximate (numerical Hessian) standard error: 0.27279
##     z-value: -0.49368, p-value: 0.62153
## Wald statistic: 0.24372, p-value: 0.62153
## 
## Log likelihood: -342.049 for mixed model
## ML residual variance (sigma squared): 4.4566e+10, (sigma: 211110)
## Number of observations: 25 
## Number of parameters estimated: 13 
## AIC: 710.1, (AIC for lm: 708.34)
# MACHINE LEARNING - RANDOM FOREST ----------------------------------
set.seed(123)
rf <- randomForest(extranjeros ~ tourism_gdp + crime_rate + college_education + business_activity + pop_density,
                   data = geo_data, importance = TRUE, ntree = 500)
print(rf)
## 
## Call:
##  randomForest(formula = extranjeros ~ tourism_gdp + crime_rate +      college_education + business_activity + pop_density, data = geo_data,      importance = TRUE, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 174382137088
##                     % Var explained: -3.08
# COMPARACION CON RMSE ----------------------------------------------
pred_lm <- predict(modelo_lm)
pred_sar <- predict(modelo_sar)
## This method assumes the response is known - see manual page
pred_rf <- predict(rf)

rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))

rmse_lm <- rmse(geo_data$extranjeros, pred_lm)
rmse_sar <- rmse(geo_data$extranjeros, pred_sar)
rmse_rf <- rmse(geo_data$extranjeros, pred_rf)

cat("\n--- COMPARACIÓN DE RMSE ---\n")
## 
## --- COMPARACIÓN DE RMSE ---
cat("Modelo LM:", rmse_lm, "\n")
## Modelo LM: 265668.2
cat("Modelo SAR:", rmse_sar, "\n")
## Modelo SAR: 263698.5
cat("Random Forest:", rmse_rf, "\n")
## Random Forest: 417590.9