Introducción
- Ángel Santiago Díaz Castillo
- Angie Michelle Zerón Monge
- Eli Gabriel Hernánez Medina
- Patricio Javier Pérez Fajardo
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: Matrix
##
## Attaching package: 'spatialreg'
## The following objects are masked from 'package:spdep':
##
## get.ClusterOption, get.coresOption, get.mcOption,
## get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
## set.coresOption, set.mcOption, set.VerboseOption,
## set.ZeroPolicyOption
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## Loading required package: lattice
# Leer base principal de turismo
tourism_data <- read_excel("/Users/gabrielmedina/Downloads/mx_spatial_data-2/tourism_state_data.xlsx")
## New names:
## • `region` -> `region...17`
## • `region` -> `region...18`
# Leer base de cuartos ocupados por extranjeros (equipo Beta)
beta_data <- read_excel("/Users/gabrielmedina/Downloads/bd22.xlsx")
names(beta_data) <- trimws(names(beta_data))
# Selección y renombramiento
beta_data <- beta_data %>%
rename(state = `Row Labels`, extranjeros = `Cuartos Ocupados Extranjeros`) %>%
select(state, extranjeros) %>%
mutate(extranjeros = as.numeric(extranjeros))
# Filtrar año más reciente en turismo y unir con base Beta
data_merged <- tourism_data %>%
filter(year == 2021) %>%
select(state, state_id, tourism_gdp, crime_rate, college_education, business_activity, pop_density) %>%
left_join(beta_data, by = "state") %>%
na.omit()
# Cargar shapefile
mx_map <- st_read("/Users/gabrielmedina/Downloads/mx_spatial_data/mx_maps/mx_states/mexlatlong.shp")
## Reading layer `mexlatlong' from data source
## `/Users/gabrielmedina/Downloads/mx_spatial_data/mx_maps/mx_states/mexlatlong.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 32 features and 19 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -118.4042 ymin: 14.55055 xmax: -86.73862 ymax: 32.71846
## Geodetic CRS: WGS 84
# Unir shapefile con datos manualmente
geo_data <- mx_map %>%
left_join(data_merged, by = c("OBJECTID" = "state_id")) %>%
na.omit()
# CREAR MATRIZ DE VECINDAD -------------------------------------------
nb <- poly2nb(geo_data)
lw <- nb2listw(nb, style = "W", zero.policy = TRUE)
# MODELO NO ESPACIAL ------------------------------------------------
modelo_lm <- lm(extranjeros ~ tourism_gdp + crime_rate + college_education + business_activity + pop_density, data = geo_data)
summary(modelo_lm)
##
## Call:
## lm(formula = extranjeros ~ tourism_gdp + crime_rate + college_education +
## business_activity + pop_density, data = geo_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -512146 -135959 6985 131697 909183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.377e+06 5.023e+05 -2.742 0.012970 *
## tourism_gdp 6.791e+00 1.723e+00 3.941 0.000876 ***
## crime_rate 8.070e+02 2.352e+03 0.343 0.735330
## college_education 3.696e+06 1.475e+06 2.505 0.021496 *
## business_activity -1.517e+05 7.650e+04 -1.983 0.061992 .
## pop_density -7.942e+02 5.947e+02 -1.335 0.197502
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 304700 on 19 degrees of freedom
## Multiple R-squared: 0.5828, Adjusted R-squared: 0.473
## F-statistic: 5.308 on 5 and 19 DF, p-value: 0.003223
# AUTOCORRELACION ESPACIAL DE RESIDUALES ----------------------------
moran.test(residuals(modelo_lm), lw, zero.policy = TRUE)
##
## Moran I test under randomisation
##
## data: residuals(modelo_lm)
## weights: lw
##
## Moran I statistic standard deviate = 0.28251, p-value = 0.3888
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.0001942027 -0.0416666667 0.0215494956
# MODELOS ESPACIALES ------------------------------------------------
# SAR
modelo_sar <- lagsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + business_activity + pop_density,
data = geo_data, listw = lw, zero.policy = TRUE)
## Warning in lagsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## reciprocal condition number = 6.14852e-23 - using numerical Hessian.
##
## Call:
## lagsarlm(formula = extranjeros ~ tourism_gdp + crime_rate + college_education +
## business_activity + pop_density, data = geo_data, listw = lw,
## zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -511250 -141722 15573 127338 902078
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.3851e+06 4.3500e+05 -3.1842 0.001452
## tourism_gdp 6.6752e+00 1.5074e+00 4.4283 9.498e-06
## crime_rate 8.4363e+02 2.0363e+03 0.4143 0.678661
## college_education 3.7715e+06 1.2847e+06 2.9358 0.003327
## business_activity -1.5498e+05 6.6483e+04 -2.3311 0.019751
## pop_density -8.3319e+02 5.1998e+02 -1.6023 0.109079
##
## Rho: -0.11211, LR test value: 0.27153, p-value: 0.60231
## Approximate (numerical Hessian) standard error: 0.21473
## z-value: -0.52209, p-value: 0.60161
## Wald statistic: 0.27258, p-value: 0.60161
##
## Log likelihood: -347.5878 for lag model
## ML residual variance (sigma squared): 6.9537e+10, (sigma: 263700)
## Number of observations: 25
## Number of parameters estimated: 8
## AIC: 711.18, (AIC for lm: 709.45)
# SEM
modelo_sem <- errorsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + business_activity + pop_density,
data = geo_data, listw = lw, zero.policy = TRUE)
## Warning in errorsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## reciprocal condition number = 1.32256e-22 - using numerical Hessian.
## Warning in sqrt(fdHess[1, 1]): NaNs produced
##
## Call:errorsarlm(formula = extranjeros ~ tourism_gdp + crime_rate +
## college_education + business_activity + pop_density, data = geo_data,
## listw = lw, zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -512090 -136024 7046 131566 909238
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.3768e+06 4.3783e+05 -3.1446 0.001663
## tourism_gdp 6.7902e+00 1.5023e+00 4.5198 6.191e-06
## crime_rate 8.0352e+02 2.0509e+03 0.3918 0.695212
## college_education 3.6956e+06 1.2858e+06 2.8741 0.004051
## business_activity -1.5173e+05 6.6700e+04 -2.2749 0.022912
## pop_density -7.9461e+02 5.1838e+02 -1.5329 0.125307
##
## Lambda: -0.00075619, LR test value: 3.6709e-06, p-value: 0.99847
## Approximate (numerical Hessian) standard error: NaN
## z-value: NaN, p-value: NA
## Wald statistic: NaN, p-value: NA
##
## Log likelihood: -347.7235 for error model
## ML residual variance (sigma squared): 7.058e+10, (sigma: 265670)
## Number of observations: 25
## Number of parameters estimated: 8
## AIC: 711.45, (AIC for lm: 709.45)
# SDM
modelo_sdm <- lagsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + business_activity + pop_density,
data = geo_data, listw = lw, type = "mixed", zero.policy = TRUE)
## Warning in lagsarlm(extranjeros ~ tourism_gdp + crime_rate + college_education + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## reciprocal condition number = 1.01794e-22 - using numerical Hessian.
##
## Call:
## lagsarlm(formula = extranjeros ~ tourism_gdp + crime_rate + college_education +
## business_activity + pop_density, data = geo_data, listw = lw,
## type = "mixed", zero.policy = TRUE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -400758.2 -70581.4 -3525.8 124108.7 546317.6
##
## Type: mixed
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.0591e+06 1.0929e+06 -1.8840 0.0595634
## tourism_gdp 6.3418e+00 1.3326e+00 4.7589 1.947e-06
## crime_rate 4.8704e+02 1.9410e+03 0.2509 0.8018705
## college_education 2.8647e+06 1.5181e+06 1.8870 0.0591604
## business_activity -2.1136e+05 6.1588e+04 -3.4319 0.0005994
## pop_density -6.9569e+01 5.4473e+02 -0.1277 0.8983771
## lag.tourism_gdp -3.8493e+00 2.6551e+00 -1.4498 0.1471155
## lag.crime_rate -6.6186e+03 3.0768e+03 -2.1512 0.0314622
## lag.college_education 4.6221e+06 3.8881e+06 1.1888 0.2345289
## lag.business_activity -3.0240e+04 9.4032e+04 -0.3216 0.7477622
## lag.pop_density -1.6027e+03 9.9942e+02 -1.6037 0.1087871
##
## Rho: -0.13467, LR test value: 0.24133, p-value: 0.62325
## Approximate (numerical Hessian) standard error: 0.27279
## z-value: -0.49368, p-value: 0.62153
## Wald statistic: 0.24372, p-value: 0.62153
##
## Log likelihood: -342.049 for mixed model
## ML residual variance (sigma squared): 4.4566e+10, (sigma: 211110)
## Number of observations: 25
## Number of parameters estimated: 13
## AIC: 710.1, (AIC for lm: 708.34)
# MACHINE LEARNING - RANDOM FOREST ----------------------------------
set.seed(123)
rf <- randomForest(extranjeros ~ tourism_gdp + crime_rate + college_education + business_activity + pop_density,
data = geo_data, importance = TRUE, ntree = 500)
print(rf)
##
## Call:
## randomForest(formula = extranjeros ~ tourism_gdp + crime_rate + college_education + business_activity + pop_density, data = geo_data, importance = TRUE, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 174382137088
## % Var explained: -3.08
# COMPARACION CON RMSE ----------------------------------------------
pred_lm <- predict(modelo_lm)
pred_sar <- predict(modelo_sar)
## This method assumes the response is known - see manual page
pred_rf <- predict(rf)
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))
rmse_lm <- rmse(geo_data$extranjeros, pred_lm)
rmse_sar <- rmse(geo_data$extranjeros, pred_sar)
rmse_rf <- rmse(geo_data$extranjeros, pred_rf)
cat("\n--- COMPARACIÓN DE RMSE ---\n")
##
## --- COMPARACIÓN DE RMSE ---
cat("Modelo LM:", rmse_lm, "\n")
## Modelo LM: 265668.2
cat("Modelo SAR:", rmse_sar, "\n")
## Modelo SAR: 263698.5
cat("Random Forest:", rmse_rf, "\n")
## Random Forest: 417590.9