Equipo 3 - Gamma.
Integrantes:
* César Romeo Vega
* Gamaliel Ostos
* Luis Carlos Borbón
* Rodrigo Arroyo
#install.packages("readxl")
#install.packages("ggplot2")
#install.packages("dplyr")
#install.packages("spdep")
#install.packages("sp")
#install.packages("sf")
#install.packages("spatialreg")
#install.packages("stargazer")
#install.packages("tidyverse")
#install.packages("randomForest")
#install.packages("caret")
#install.packages("car")
#install.packages("GWmodel")
library(readxl)
library(ggplot2)
library(dplyr)
library(spdep)
library(tmap)
library(sf)
library(sp)
library(tmaptools)
library(tmap)
library(spatialreg)
library(stargazer)
library(tidyverse)
library(car)
library(caret)
library(foreign)
library(GWmodel)
# Read the CSV file
data <- read.csv("tourism_state_data.csv")
# Mexico's states (32)
map <- st_read("mexlatlong.shp")
## Reading layer `mexlatlong' from data source
## `C:\Users\rodio\OneDrive\Escritorio\8semestre\Bloque Final\Actividad 3\mexlatlong.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 32 features and 19 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -118.4042 ymin: 14.55055 xmax: -86.73862 ymax: 32.71846
## Geodetic CRS: WGS 84
map_a <- read_sf("mexlatlong.shp")
data <- data %>%
mutate(tourist_night_foreign = as.numeric(tourist_night_foreign))
data$region <- as.factor(data$region)
# Read the Excel file
data <- filter(data, year == 2018)
# Merging dataset
data <- map %>%
left_join(data, by = c("ADMIN_NAME" = "state"))
# Queen contiguity
swm_queen <- poly2nb(map, queen=TRUE)
rswm_queen <- nb2listw(swm_queen, style = "W", zero.policy = TRUE)
# Rook contiguity
swm_rook <- poly2nb(map, queen = FALSE)
rswm_rook <- nb2listw(swm_rook, style = "W", zero.policy = TRUE)
# Knn Means
## Obtener coordinadas de los estados de Mexico
map_a <- as(map, "Spatial")
coords <- coordinates(map_a)
knn5_nb <- knn2nb(knearneigh(coords, k = 5))
rswm_knn5 <- nb2listw(knn5_nb, style = "W", zero.policy = TRUE)
# Distance-based neighbors (max first-neighbor distance)
d1_nb <- knn2nb(knearneigh(coords, k = 1))
d1_dists <- nbdists(d1_nb, coords, longlat = FALSE)
max_d1 <- max(unlist(d1_dists))
swm_dist <- dnearneigh(coords, 0, max_d1, longlat = FALSE)
rswm_dist <- nb2listw(swm_dist, style = "W", zero.policy = TRUE)
# Modelo de regresión no espacial
model_NS <- lm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data)
summary(model_NS)
##
## Call:
## lm(formula = tourism_gdp ~ crime_rate + employment + real_wage +
## pop_density + college_education + log(tourist_night_foreign),
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46881 -24046 -10522 12065 100331
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.600e+05 6.542e+05 0.703 0.4884
## crime_rate 2.486e+02 3.392e+02 0.733 0.4703
## employment -6.026e+05 6.913e+05 -0.872 0.3917
## real_wage 2.398e+02 2.176e+02 1.102 0.2811
## pop_density 4.962e+01 8.857e+00 5.602 7.92e-06 ***
## college_education -2.800e+05 2.181e+05 -1.284 0.2109
## log(tourist_night_foreign) 1.183e+04 4.350e+03 2.719 0.0117 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38340 on 25 degrees of freedom
## Multiple R-squared: 0.7617, Adjusted R-squared: 0.7045
## F-statistic: 13.32 on 6 and 25 DF, p-value: 9.721e-07
# Valores Reales
valores_reales <- data$tourism_gdp
# Predicciones del modelo
predicciones_NS <- predict(model_NS)
# Calcular el RMSE y AIC
mse_NS <- mean((valores_reales - predicciones_NS)^2)
RMSE_NS <- sqrt(mse_NS)
AIC_NS <- AIC(model_NS)
En el modelo no espacial, la variable más significativa al 99% es pop_density, seguida de tourist_night_foreign a una significancia del 90%. Por otro lado, employment es la que más alto p-value tiene, haciendola la menos significativa para el modelo, seguido por crime_rate. La estimación del modelo cuenta con una R^2 ajustada del 0.698, lo que implica que casi el 70% del comportamiento de tourism_gdp se explica con las variables independientes.
# SAR - Spatial Autoregressive Model
model_SAR <- lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_queen)
summary(model_SAR)
##
## Call:lagsarlm(formula = tourism_gdp ~ crime_rate + employment + real_wage +
## pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_queen)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43871.3 -23346.3 -8512.2 8815.1 103862.6
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 323120.923 563294.349 0.5736 0.566220
## crime_rate 259.232 289.558 0.8953 0.370644
## employment -430025.156 599483.598 -0.7173 0.473173
## real_wage 242.747 185.708 1.3071 0.191164
## pop_density 52.666 7.899 6.6674 2.603e-11
## college_education -254852.282 186757.616 -1.3646 0.172374
## log(tourist_night_foreign) 10126.378 3924.662 2.5802 0.009875
##
## Rho: -0.26186, LR test value: 1.6875, p-value: 0.19393
## Approximate (numerical Hessian) standard error: 0.19782
## z-value: -1.3237, p-value: 0.18559
## Wald statistic: 1.7522, p-value: 0.18559
##
## Log likelihood: -378.3493 for lag model
## ML residual variance (sigma squared): 1071300000, (sigma: 32731)
## Number of observations: 32
## Number of parameters estimated: 9
## AIC: 774.7, (AIC for lm: 774.39)
# Predicciones del modelo
predicciones_SAR <- predict(model_SAR)
# Calcular el RMSE y AIC
mse_SAR <- mean((valores_reales - predicciones_SAR)^2)
RMSE_SAR <- sqrt(mse_SAR)
AIC_SAR <- AIC(model_SAR)
model_SEM <- errorsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_queen)
summary(model_SEM)
##
## Call:errorsarlm(formula = tourism_gdp ~ crime_rate + employment +
## real_wage + pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_queen)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37312.2 -20694.4 -8696.5 11578.7 86078.3
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 241891.546 484003.604 0.4998 0.617235
## crime_rate 271.212 255.445 1.0617 0.288361
## employment -354025.182 512370.943 -0.6910 0.489594
## real_wage 305.852 156.656 1.9524 0.050894
## pop_density 54.755 6.110 8.9615 < 2.2e-16
## college_education -290745.223 142153.137 -2.0453 0.040826
## log(tourist_night_foreign) 8720.516 3233.856 2.6966 0.007004
##
## Lambda: -0.69002, LR test value: 5.4827, p-value: 0.019206
## Approximate (numerical Hessian) standard error: 0.25319
## z-value: -2.7254, p-value: 0.006423
## Wald statistic: 7.4276, p-value: 0.006423
##
## Log likelihood: -376.4517 for error model
## ML residual variance (sigma squared): 858310000, (sigma: 29297)
## Number of observations: 32
## Number of parameters estimated: 9
## AIC: 770.9, (AIC for lm: 774.39)
# Predicciones del modelo
predicciones_SEM <- predict(model_SEM)
# Calcular el RMSE y AIC
mse_SEM <- mean((valores_reales - predicciones_SEM)^2)
RMSE_SEM <- sqrt(mse_SEM)
AIC_SEM <- AIC(model_SEM)
model_DURB <- lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_queen, type="mixed")
## Warning in lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## número de condición recíproco = 1.33635e-19 - using numerical Hessian.
summary(model_DURB)
##
## Call:lagsarlm(formula = tourism_gdp ~ crime_rate + employment + real_wage +
## pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_queen, type = "mixed")
##
## Residuals:
## Min 1Q Median 3Q Max
## -38209.0 -19122.1 -8476.1 9803.9 74259.6
##
## Type: mixed
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.3853e+04 6.6662e+05 0.1108 0.911786
## crime_rate 3.0232e+02 2.4741e+02 1.2220 0.221726
## employment -3.1956e+05 5.5866e+05 -0.5720 0.567314
## real_wage 2.4139e+02 1.6097e+02 1.4996 0.133715
## pop_density 5.0762e+01 6.8953e+00 7.3618 1.814e-13
## college_education -2.2233e+05 1.8579e+05 -1.1967 0.231427
## log(tourist_night_foreign) 9.7495e+03 3.3388e+03 2.9201 0.003499
## lag.crime_rate 2.3918e+01 3.8988e+02 0.0613 0.951083
## lag.employment 4.2037e+04 5.9648e+05 0.0705 0.943815
## lag.real_wage 5.3748e+02 3.1460e+02 1.7084 0.087555
## lag.pop_density 4.5492e+01 2.2108e+01 2.0577 0.039619
## lag.college_education -1.4868e+05 3.3681e+05 -0.4414 0.658907
## lag.log(tourist_night_foreign) -1.5701e+03 8.5333e+03 -0.1840 0.854013
##
## Rho: -0.68268, LR test value: 6.4493, p-value: 0.0111
## Approximate (numerical Hessian) standard error: 0.23365
## z-value: -2.9218, p-value: 0.0034804
## Wald statistic: 8.5368, p-value: 0.0034804
##
## Log likelihood: -374.4107 for mixed model
## ML residual variance (sigma squared): 757560000, (sigma: 27524)
## Number of observations: 32
## Number of parameters estimated: 15
## AIC: 778.82, (AIC for lm: 783.27)
# Predicciones del modelo
predicciones_DURB <- predict(model_DURB)
# Calcular el RMSE y AIC
mse_DURB <- mean((valores_reales - predicciones_DURB)^2)
RMSE_DURB <- sqrt(mse_DURB)
AIC_DURB <- AIC(model_DURB)
table1 <- data.frame(Variable = c("No Espacial", "SAR", "SEM", "DURB"),
RMSE = c(RMSE_NS, RMSE_SAR, RMSE_SEM, RMSE_DURB),
AIC = c(AIC_NS,AIC_SAR,AIC_SEM,AIC_DURB))
table1
## Variable RMSE AIC
## 1 No Espacial 33888.88 774.3860
## 2 SAR 32730.67 774.6985
## 3 SEM 29296.90 770.9033
## 4 DURB 27523.84 778.8214
Como se puede apreciar, de los modelos espaciales y no espacial estimados con la matriz de contigüidad, el modelo de BURBIN mostró un menor RMSE y un valor de AIC ligeramente mayor a los demás modelos. Esto significa que el modelo de DURBIN es el mejor para la predicción de la variable dependiente que las otras especificaciones realizadas.
# SAR - Spatial Autoregressive Model
model_SAR2 <- lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_rook)
summary(model_SAR2)
##
## Call:lagsarlm(formula = tourism_gdp ~ crime_rate + employment + real_wage +
## pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_rook)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44975.7 -23575.3 -9513.8 8108.0 102869.7
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.7944e+05 5.7323e+05 0.6619 0.508016
## crime_rate 2.5719e+02 2.9418e+02 0.8743 0.381965
## employment -4.9843e+05 6.0889e+05 -0.8186 0.413024
## real_wage 2.4380e+02 1.8888e+02 1.2907 0.196800
## pop_density 5.1710e+01 7.9888e+00 6.4727 9.624e-11
## college_education -2.6390e+05 1.9002e+05 -1.3888 0.164891
## log(tourist_night_foreign) 1.0703e+04 3.9520e+03 2.7083 0.006763
##
## Rho: -0.18837, LR test value: 0.8992, p-value: 0.343
## Approximate (numerical Hessian) standard error: 0.19688
## z-value: -0.9568, p-value: 0.33867
## Wald statistic: 0.91547, p-value: 0.33867
##
## Log likelihood: -378.7434 for lag model
## ML residual variance (sigma squared): 1106400000, (sigma: 33263)
## Number of observations: 32
## Number of parameters estimated: 9
## AIC: 775.49, (AIC for lm: 774.39)
# Predicciones del modelo
predicciones_SAR2 <- predict(model_SAR2)
# Calcular el RMSE y AIC
mse_SAR2 <- mean((valores_reales - predicciones_SAR2)^2)
RMSE_SAR2 <- sqrt(mse_SAR2)
AIC_SAR2 <- AIC(model_SAR2)
model_SEM2 <- errorsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_rook)
summary(model_SEM2)
##
## Call:errorsarlm(formula = tourism_gdp ~ crime_rate + employment +
## real_wage + pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_rook)
##
## Residuals:
## Min 1Q Median 3Q Max
## -44010.9 -22705.7 -6728.2 11799.3 89679.6
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.5599e+05 5.1408e+05 0.6925 0.488644
## crime_rate 2.7595e+02 2.7567e+02 1.0010 0.316818
## employment -4.8519e+05 5.4302e+05 -0.8935 0.371585
## real_wage 3.1124e+02 1.6887e+02 1.8430 0.065329
## pop_density 5.3401e+01 6.5794e+00 8.1163 4.441e-16
## college_education -3.0312e+05 1.5688e+05 -1.9321 0.053341
## log(tourist_night_foreign) 9.8128e+03 3.4334e+03 2.8580 0.004263
##
## Lambda: -0.5458, LR test value: 3.4906, p-value: 0.061717
## Approximate (numerical Hessian) standard error: 0.27189
## z-value: -2.0075, p-value: 0.044699
## Wald statistic: 4.03, p-value: 0.044699
##
## Log likelihood: -377.4477 for error model
## ML residual variance (sigma squared): 953220000, (sigma: 30874)
## Number of observations: 32
## Number of parameters estimated: 9
## AIC: 772.9, (AIC for lm: 774.39)
# Predicciones del modelo
predicciones_SEM2 <- predict(model_SEM2)
# Calcular el RMSE y AIC
mse_SEM2 <- mean((valores_reales - predicciones_SEM2)^2)
RMSE_SEM2 <- sqrt(mse_SEM2)
AIC_SEM2 <- AIC(model_SEM2)
model_DURB2 <- lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_rook, type="mixed")
## Warning in lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## número de condición recíproco = 1.24911e-19 - using numerical Hessian.
summary(model_DURB2)
##
## Call:lagsarlm(formula = tourism_gdp ~ crime_rate + employment + real_wage +
## pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_rook, type = "mixed")
##
## Residuals:
## Min 1Q Median 3Q Max
## -41304 -18971 -8368 11541 77688
##
## Type: mixed
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.9438e+05 1.7129e+06 0.3470 0.728583
## crime_rate 3.1671e+02 2.7209e+02 1.1640 0.244431
## employment -3.9438e+05 5.8461e+05 -0.6746 0.499920
## real_wage 2.3919e+02 1.7255e+02 1.3862 0.165681
## pop_density 5.0231e+01 7.2218e+00 6.9554 3.515e-12
## college_education -2.2740e+05 1.9772e+05 -1.1501 0.250083
## log(tourist_night_foreign) 1.0436e+04 3.5357e+03 2.9515 0.003162
## lag.crime_rate 1.0851e+02 6.0544e+02 0.1792 0.857757
## lag.employment -4.6425e+05 1.7566e+06 -0.2643 0.791561
## lag.real_wage 6.0912e+02 4.0002e+02 1.5227 0.127832
## lag.pop_density 3.8327e+01 2.3743e+01 1.6143 0.106467
## lag.college_education -2.1102e+05 3.9950e+05 -0.5282 0.597349
## lag.log(tourist_night_foreign) -7.9465e+01 6.8140e+03 -0.0117 0.990695
##
## Rho: -0.56102, LR test value: 4.2658, p-value: 0.038888
## Approximate (numerical Hessian) standard error: 0.25159
## z-value: -2.2299, p-value: 0.025754
## Wald statistic: 4.9724, p-value: 0.025754
##
## Log likelihood: -375.6062 for mixed model
## ML residual variance (sigma squared): 845770000, (sigma: 29082)
## Number of observations: 32
## Number of parameters estimated: 15
## AIC: 781.21, (AIC for lm: 783.48)
# Predicciones del modelo
predicciones_DURB2 <- predict(model_DURB2)
# Calcular el RMSE y AIC
mse_DURB2 <- mean((valores_reales - predicciones_DURB2)^2)
RMSE_DURB2 <- sqrt(mse_DURB2)
AIC_DURB2 <- AIC(model_DURB2)
table2 <- data.frame(Variable = c("No Espacial", "SAR", "SEM", "DURB"),
RMSE = c(RMSE_NS, RMSE_SAR2, RMSE_SEM2, RMSE_DURB2),
AIC = c(AIC_NS,AIC_SAR2,AIC_SEM2,AIC_DURB2))
table2
## Variable RMSE AIC
## 1 No Espacial 33888.88 774.3860
## 2 SAR 33263.24 775.4868
## 3 SEM 30874.22 772.8954
## 4 DURB 29082.12 781.2124
Como se puede apreciar, de los modelos espaciales y no espacial
estimados con la matriz de contigüidad rook, el modelo de
BURBIN mostró un menor RMSE y un valor de AIC ligeramente mayor
a los demás modelos. Esto significa que el modelo de
DURBIN, nuevamente, es el mejor para la predicción de la
variable dependiente que las otras especificaciones realizadas.
En comparación de Queen, hay un mayor RMSE en los modelos
especificados.
# SAR - Spatial Autoregressive Model
model_SAR3 <- lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_knn5)
summary(model_SAR3)
##
## Call:lagsarlm(formula = tourism_gdp ~ crime_rate + employment + real_wage +
## pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_knn5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46871.4 -23101.6 -8360.6 12806.5 98780.0
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.0451e+05 5.8672e+05 0.8599 0.389861
## crime_rate 2.6056e+02 3.0030e+02 0.8677 0.385580
## employment -6.6068e+05 6.2622e+05 -1.0550 0.291410
## real_wage 2.4020e+02 1.9174e+02 1.2528 0.210296
## pop_density 4.9175e+01 7.8877e+00 6.2343 4.537e-10
## college_education -2.8691e+05 1.9313e+05 -1.4856 0.137385
## log(tourist_night_foreign) 1.2483e+04 4.1959e+03 2.9751 0.002928
##
## Rho: 0.066695, LR test value: 0.14734, p-value: 0.70109
## Approximate (numerical Hessian) standard error: 0.17328
## z-value: 0.38489, p-value: 0.70032
## Wald statistic: 0.14814, p-value: 0.70032
##
## Log likelihood: -379.1193 for lag model
## ML residual variance (sigma squared): 1142500000, (sigma: 33800)
## Number of observations: 32
## Number of parameters estimated: 9
## AIC: 776.24, (AIC for lm: 774.39)
# Predicciones del modelo
predicciones_SAR3 <- predict(model_SAR3)
# Calcular el RMSE y AIC
mse_SAR3 <- mean((valores_reales - predicciones_SAR3)^2)
RMSE_SAR3 <- sqrt(mse_SAR3)
AIC_SAR3 <- AIC(model_SAR3)
model_SEM3 <- errorsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_knn5)
summary(model_SEM3)
##
## Call:errorsarlm(formula = tourism_gdp ~ crime_rate + employment +
## real_wage + pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_knn5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42118 -23381 -8469 12494 87823
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 471533.111 548491.780 0.8597 0.389960
## crime_rate 325.902 290.502 1.1219 0.261924
## employment -615874.061 573798.247 -1.0733 0.283124
## real_wage 326.605 177.992 1.8349 0.066514
## pop_density 53.297 6.892 7.7332 1.044e-14
## college_education -316513.506 172606.132 -1.8337 0.066694
## log(tourist_night_foreign) 10295.093 3752.203 2.7437 0.006074
##
## Lambda: -0.61493, LR test value: 1.795, p-value: 0.18032
## Approximate (numerical Hessian) standard error: 0.46981
## z-value: -1.3089, p-value: 0.19057
## Wald statistic: 1.7132, p-value: 0.19057
##
## Log likelihood: -378.2955 for error model
## ML residual variance (sigma squared): 1037700000, (sigma: 32214)
## Number of observations: 32
## Number of parameters estimated: 9
## AIC: 774.59, (AIC for lm: 774.39)
# Predicciones del modelo
predicciones_SEM3 <- predict(model_SEM3)
# Calcular el RMSE y AIC
mse_SEM3 <- mean((valores_reales - predicciones_SEM3)^2)
RMSE_SEM3 <- sqrt(mse_SEM3)
AIC_SEM3 <- AIC(model_SEM3)
model_DURB3 <- lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_knn5, type="mixed")
## Warning in lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## número de condición recíproco = 9.54213e-20 - using numerical Hessian.
summary(model_DURB3)
##
## Call:lagsarlm(formula = tourism_gdp ~ crime_rate + employment + real_wage +
## pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_knn5, type = "mixed")
##
## Residuals:
## Min 1Q Median 3Q Max
## -46341.9 -18714.3 -7250.8 16871.0 67447.5
##
## Type: mixed
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.1274e+06 2.2492e+06 1.3905 0.16439
## crime_rate 6.1119e+02 2.8809e+02 2.1215 0.03388
## employment -1.0378e+06 6.2155e+05 -1.6696 0.09500
## real_wage 3.0585e+02 1.7220e+02 1.7761 0.07572
## pop_density 4.9363e+01 7.4804e+00 6.5990 4.14e-11
## college_education -3.0483e+05 1.8446e+05 -1.6525 0.09843
## log(tourist_night_foreign) 1.0410e+04 4.7062e+03 2.2120 0.02696
## lag.crime_rate 1.0081e+03 9.0441e+02 1.1147 0.26498
## lag.employment -2.4089e+06 1.9607e+06 -1.2286 0.21923
## lag.real_wage 8.6774e+02 6.4886e+02 1.3373 0.18112
## lag.pop_density 5.1579e+01 3.0388e+01 1.6974 0.08963
## lag.college_education -6.8145e+05 5.4434e+05 -1.2519 0.21061
## lag.log(tourist_night_foreign) -2.2121e+03 1.7255e+04 -0.1282 0.89799
##
## Rho: -0.8262, LR test value: 3.5159, p-value: 0.060782
## Approximate (numerical Hessian) standard error: 0.41664
## z-value: -1.983, p-value: 0.047366
## Wald statistic: 3.9323, p-value: 0.047366
##
## Log likelihood: -375.1031 for mixed model
## ML residual variance (sigma squared): 821700000, (sigma: 28665)
## Number of observations: 32
## Number of parameters estimated: 15
## AIC: 780.21, (AIC for lm: 781.72)
# Predicciones del modelo
predicciones_DURB3 <- predict(model_DURB3)
# Calcular el RMSE y AIC
mse_DURB3 <- mean((valores_reales - predicciones_DURB3)^2)
RMSE_DURB3 <- sqrt(mse_DURB3)
AIC_DURB3 <- AIC(model_DURB3)
table3 <- data.frame(Variable = c("No Espacial", "SAR", "SEM", "DURB"),
RMSE = c(RMSE_NS, RMSE_SAR3, RMSE_SEM3, RMSE_DURB3),
AIC = c(AIC_NS,AIC_SAR3,AIC_SEM3,AIC_DURB3))
table3
## Variable RMSE AIC
## 1 No Espacial 33888.88 774.3860
## 2 SAR 33800.17 776.2386
## 3 SEM 32213.55 774.5910
## 4 DURB 28665.31 780.2063
Como se puede apreciar, de los modelos espaciales y no espacial
estimados con la matriz de vecinos cercanos (Knn Means), el
modelo de BURBIN mostró un menor RMSE y un valor de AIC
ligeramente mayor a los demás modelos. Esto significa que el
modelo de DURBIN, nuevamente, es el mejor para la
predicción de la variable dependiente que las otras especificaciones
realizadas.
En comparación de los modelos con matriz de contigüidad, hay un
ligero incremento en el RMSE en los modelos especificados. Aunque el
modelo SAR tuvo su menor RMSE hasta el momento.
# SAR - Spatial Autoregressive Model
model_SAR4 <- lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_dist)
summary(model_SAR4)
##
## Call:lagsarlm(formula = tourism_gdp ~ crime_rate + employment + real_wage +
## pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_dist)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46692 -23531 -11286 11177 100645
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.4203e+05 6.0376e+05 0.7321 0.464089
## crime_rate 2.4404e+02 3.0303e+02 0.8053 0.420633
## employment -5.8005e+05 6.4809e+05 -0.8950 0.370782
## real_wage 2.3870e+02 1.9259e+02 1.2394 0.215199
## pop_density 4.9803e+01 8.0267e+00 6.2047 5.481e-10
## college_education -2.7985e+05 1.9262e+05 -1.4529 0.146258
## log(tourist_night_foreign) 1.1683e+04 4.0662e+03 2.8732 0.004063
##
## Rho: -0.025751, LR test value: 0.011411, p-value: 0.91493
## Approximate (numerical Hessian) standard error: 0.23479
## z-value: -0.10968, p-value: 0.91267
## Wald statistic: 0.012029, p-value: 0.91267
##
## Log likelihood: -379.1873 for lag model
## ML residual variance (sigma squared): 1147900000, (sigma: 33881)
## Number of observations: 32
## Number of parameters estimated: 9
## AIC: 776.37, (AIC for lm: 774.39)
# Predicciones del modelo
predicciones_SAR4 <- predict(model_SAR4)
# Calcular el RMSE y AIC
mse_SAR4 <- mean((valores_reales - predicciones_SAR4)^2)
RMSE_SAR4 <- sqrt(mse_SAR4)
AIC_SAR4 <- AIC(model_SAR4)
model_SEM4 <- errorsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_dist)
summary(model_SEM4)
##
## Call:errorsarlm(formula = tourism_gdp ~ crime_rate + employment +
## real_wage + pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_dist)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41931.8 -20737.4 -7422.9 9901.5 99842.2
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.5357e+04 5.3059e+05 -0.0666 0.946872
## crime_rate 2.6260e+02 2.8615e+02 0.9177 0.358777
## employment -1.0831e+05 5.5940e+05 -0.1936 0.846475
## real_wage 3.0816e+02 1.8039e+02 1.7082 0.087591
## pop_density 5.1563e+01 6.8196e+00 7.5611 3.997e-14
## college_education -2.3035e+05 1.6033e+05 -1.4367 0.150800
## log(tourist_night_foreign) 1.0505e+04 3.5817e+03 2.9331 0.003356
##
## Lambda: -0.55116, LR test value: 1.7563, p-value: 0.18508
## Approximate (numerical Hessian) standard error: 0.34633
## z-value: -1.5914, p-value: 0.11152
## Wald statistic: 2.5326, p-value: 0.11152
##
## Log likelihood: -378.3148 for error model
## ML residual variance (sigma squared): 1023600000, (sigma: 31994)
## Number of observations: 32
## Number of parameters estimated: 9
## AIC: 774.63, (AIC for lm: 774.39)
# Predicciones del modelo
predicciones_SEM4 <- predict(model_SEM4)
# Calcular el RMSE y AIC
mse_SEM4 <- mean((valores_reales - predicciones_SEM4)^2)
RMSE_SEM4 <- sqrt(mse_SEM4)
AIC_SEM4 <- AIC(model_SEM4)
model_DURB4 <- lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign), data = data, listw = rswm_dist, type="mixed")
## Warning in lagsarlm(tourism_gdp ~ crime_rate + employment + real_wage + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## número de condición recíproco = 1.24399e-19 - using numerical Hessian.
summary(model_DURB4)
##
## Call:lagsarlm(formula = tourism_gdp ~ crime_rate + employment + real_wage +
## pop_density + college_education + log(tourist_night_foreign),
## data = data, listw = rswm_dist, type = "mixed")
##
## Residuals:
## Min 1Q Median 3Q Max
## -35682.3 -20846.8 -4137.6 6153.5 91803.0
##
## Type: mixed
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.7874e+06 2.0273e+06 -1.3749 0.1691533
## crime_rate 3.0612e+02 2.8939e+02 1.0578 0.2901401
## employment -5.1375e+05 6.7434e+05 -0.7619 0.4461463
## real_wage 3.3645e+02 1.7497e+02 1.9229 0.0544908
## pop_density 4.7058e+01 7.1023e+00 6.6257 3.455e-11
## college_education -3.1821e+05 1.9667e+05 -1.6180 0.1056588
## log(tourist_night_foreign) 1.3161e+04 3.8976e+03 3.3766 0.0007339
## lag.crime_rate -8.5698e+02 1.0461e+03 -0.8192 0.4126507
## lag.employment 3.1073e+06 2.0022e+06 1.5519 0.1206753
## lag.real_wage 1.1676e+02 5.8610e+02 0.1992 0.8420988
## lag.pop_density 4.3217e+01 3.4352e+01 1.2580 0.2083741
## lag.college_education 3.6723e+05 4.3219e+05 0.8497 0.3954930
## lag.log(tourist_night_foreign) 2.4575e+03 1.0252e+04 0.2397 0.8105567
##
## Rho: -0.48655, LR test value: 1.7917, p-value: 0.18072
## Approximate (numerical Hessian) standard error: 0.3336
## z-value: -1.4585, p-value: 0.14471
## Wald statistic: 2.1271, p-value: 0.14471
##
## Log likelihood: -374.7001 for mixed model
## ML residual variance (sigma squared): 8.28e+08, (sigma: 28775)
## Number of observations: 32
## Number of parameters estimated: 15
## AIC: 779.4, (AIC for lm: 779.19)
# Predicciones del modelo
predicciones_DURB4 <- predict(model_DURB4)
# Calcular el RMSE y AIC
mse_DURB4 <- mean((valores_reales - predicciones_DURB4)^2)
RMSE_DURB4 <- sqrt(mse_DURB4)
AIC_DURB4 <- AIC(model_DURB4)
table4 <- data.frame(Variable = c("No Espacial", "SAR", "SEM", "DURB"),
RMSE = c(RMSE_NS, RMSE_SAR4, RMSE_SEM4, RMSE_DURB4),
AIC = c(AIC_NS,AIC_SAR4,AIC_SEM4,AIC_DURB4))
table4
## Variable RMSE AIC
## 1 No Espacial 33888.88 774.3860
## 2 SAR 33880.62 776.3746
## 3 SEM 31993.53 774.6296
## 4 DURB 28775.04 779.4002
Como se puede apreciar, de los modelos espaciales y no espacial
estimados con la matriz de distancias inversas, el modelo de
BURBIN mostró un menor RMSE y un valor de AIC ligeramente mayor
a los demás modelos. Esto significa que el modelo de
DURBIN, nuevamente, es el mejor para la predicción de la
variable dependiente que las otras especificaciones realizadas.
En comparación de los modelos con matriz de contigüidad y vecinos
cercanos, el modelo DURBIN tiene la mejor especificación dado un menor
RMSE. Así mismo, el patrón se repite en el modelo SEM.
Para este análisis de autocorrelación espacial se seleccionarán los modelos más significativos de cada matriz de conectividad utilizada, con el objetivo de visualizar si hay existencia de este fenómeno.
#Generar variable que contenga los residuales de los modelos seleccionados
data2 <- data
data2$non_espacial_error <- model_NS$residuals
data2$queen <- model_DURB$residuals
data2$rook <- model_DURB2$residuals
data2$knn <- model_DURB3$residuals
data2$dist <- model_DURB4$residuals
# Generar mapas de distribución de residuales
map1 <- tm_shape(data2) +
tm_polygons(
fill = "non_espacial_error",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "brewer.yl_or_rd"),
fill.legend = tm_legend(title = "Residuals")
) +
tm_title("Non-Spatial Residuals", position = c("right", "top"))
map2 <- tm_shape(data2) +
tm_polygons(
fill = "queen",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "brewer.yl_or_rd"),
fill.legend = tm_legend(title = "Residuals")
) +
tm_title("Queen - DURB Residuals", position = c("right", "top"))
map3 <- tm_shape(data2) +
tm_polygons(
fill = "rook",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "brewer.yl_or_rd"),
fill.legend = tm_legend(title = "Residuals")
) +
tm_title("Rook - DURB Residuals", position = c("right", "top"))
map4 <- tm_shape(data2) +
tm_polygons(
fill = "knn",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "brewer.yl_or_rd"),
fill.legend = tm_legend(title = "Residuals")
) +
tm_title("Knn Means - DURB Residuals", position = c("right", "top"))
map5 <- tm_shape(data2) +
tm_polygons(
fill = "dist",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "brewer.yl_or_rd"),
fill.legend = tm_legend(title = "Residuals")
) +
tm_title("Inverse Distance - DURB Residuals", position = c("right", "top"))
tmap_arrange(map1, map2, map3, map4, map5, ncol = 2)
# Calcular el Global Moran para los residuales de los modelos
moran.test(model_NS$residuals, rswm_queen)
##
## Moran I test under randomisation
##
## data: model_NS$residuals
## weights: rswm_queen
##
## Moran I statistic standard deviate = -1.6191, p-value = 0.9473
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.22730562 -0.03225806 0.01451221
moran.test(model_DURB$residuals, rswm_queen)
##
## Moran I test under randomisation
##
## data: model_DURB$residuals
## weights: rswm_queen
##
## Moran I statistic standard deviate = -0.35256, p-value = 0.6378
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.07502217 -0.03225806 0.01471266
moran.test(model_DURB2$residuals, rswm_rook)
##
## Moran I test under randomisation
##
## data: model_DURB2$residuals
## weights: rswm_rook
##
## Moran I statistic standard deviate = -0.39758, p-value = 0.6545
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.08150094 -0.03225806 0.01534030
moran.test(model_DURB3$residuals, rswm_knn5)
##
## Moran I test under randomisation
##
## data: model_DURB3$residuals
## weights: rswm_knn5
##
## Moran I statistic standard deviate = -0.37661, p-value = 0.6468
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.06690826 -0.03225806 0.00846522
moran.test(model_DURB4$residuals, rswm_dist)
##
## Moran I test under randomisation
##
## data: model_DURB4$residuals
## weights: rswm_dist
##
## Moran I statistic standard deviate = -0.33995, p-value = 0.6331
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## -0.06729406 -0.03225806 0.01062159
Como se demuestra en los mapas y en el cálculo del Global Moran no parece existir una autocorrelación espacial en los residuales de los 5 modelos seleccionados, teniendo valores menores a 0.1 y sin significancia estadística, a excepción del modelo no espacial cuyo valor supera el 0.2 aunque sigue siendo no significativo.
bw1 <- bw.gwr(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign),
approach = "AIC", adaptive = T, data=data)
## Adaptive bandwidth (number of nearest neighbours): 27 AICc value: 805.213
## Adaptive bandwidth (number of nearest neighbours): 25 AICc value: 816.1958
## Adaptive bandwidth (number of nearest neighbours): 30 AICc value: 792.4966
## Adaptive bandwidth (number of nearest neighbours): 30 AICc value: 792.4966
model_gwr <- gwr.basic(tourism_gdp ~ crime_rate + employment + real_wage + pop_density + college_education+ log(tourist_night_foreign),
adaptive = T, data = data, bw = bw1)
model_gwr
## ***********************************************************************
## * Package GWmodel *
## ***********************************************************************
## Program starts at: 2025-04-23 22:06:34.363769
## Call:
## gwr.basic(formula = tourism_gdp ~ crime_rate + employment + real_wage +
## pop_density + college_education + log(tourist_night_foreign),
## data = data, bw = bw1, adaptive = T)
##
## Dependent (y) variable: tourism_gdp
## Independent variables: crime_rate employment real_wage pop_density college_education tourist_night_foreign
## Number of data points: 32
## ***********************************************************************
## * Results of Global Regression *
## ***********************************************************************
##
## Call:
## lm(formula = formula, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46881 -24046 -10522 12065 100331
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.600e+05 6.542e+05 0.703 0.4884
## crime_rate 2.486e+02 3.392e+02 0.733 0.4703
## employment -6.026e+05 6.913e+05 -0.872 0.3917
## real_wage 2.398e+02 2.176e+02 1.102 0.2811
## pop_density 4.962e+01 8.857e+00 5.602 7.92e-06 ***
## college_education -2.800e+05 2.181e+05 -1.284 0.2109
## log(tourist_night_foreign) 1.183e+04 4.350e+03 2.719 0.0117 *
##
## ---Significance stars
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Residual standard error: 38340 on 25 degrees of freedom
## Multiple R-squared: 0.7617
## Adjusted R-squared: 0.7045
## F-statistic: 13.32 on 6 and 25 DF, p-value: 9.721e-07
## ***Extra Diagnostic information
## Residual sum of squares: 36750606581
## Sigma(hat): 35000.29
## AIC: 774.386
## AICc: 780.6468
## BIC: 781.8377
## ***********************************************************************
## * Results of Geographically Weighted Regression *
## ***********************************************************************
##
## *********************Model calibration information*********************
## Kernel function: bisquare
## Adaptive bandwidth: 30 (number of nearest neighbours)
## Regression points: the same locations as observations are used.
## Distance metric: Euclidean distance metric is used.
##
## ****************Summary of GWR coefficient estimates:******************
## Min. 1st Qu. Median 3rd Qu.
## Intercept 9.1551e+04 3.8794e+05 8.6366e+05 1.3483e+06
## crime_rate 1.9623e+02 2.1899e+02 2.4106e+02 2.9371e+02
## employment -1.6990e+06 -1.5569e+06 -1.0446e+06 -5.4622e+05
## real_wage 8.3253e+01 2.6817e+02 4.1996e+02 4.9481e+02
## pop_density 4.5225e+01 4.5872e+01 4.6700e+01 4.7238e+01
## college_education -5.8120e+05 -5.3894e+05 -4.3566e+05 -2.3010e+05
## log(tourist_night_foreign) 1.1759e+04 1.1904e+04 1.2353e+04 1.3026e+04
## Max.
## Intercept 1499954.749
## crime_rate 405.655
## employment -223087.200
## real_wage 528.709
## pop_density 48.344
## college_education -57599.842
## log(tourist_night_foreign) 13696.467
## ************************Diagnostic information*************************
## Number of data points: 32
## Effective number of parameters (2trace(S) - trace(S'S)): 12.77802
## Effective degrees of freedom (n-2trace(S) + trace(S'S)): 19.22198
## AICc (GWR book, Fotheringham, et al. 2002, p. 61, eq 2.33): 792.4966
## AIC (GWR book, Fotheringham, et al. 2002,GWR p. 96, eq. 4.22): 763.3288
## BIC (GWR book, Fotheringham, et al. 2002,GWR p. 61, eq. 2.34): 758.3125
## Residual sum of squares: 30466725770
## R-square value: 0.802472
## Adjusted R-square value: 0.663957
##
## ***********************************************************************
## Program stops at: 2025-04-23 22:06:34.407365
gwr_sf = st_as_sf(model_gwr$SDF)
#Mapeo de la variable Dependiente
gwr_sf$y_predicted <- gwr_sf$yhat
tm_shape(gwr_sf) +
tm_polygons(
fill = "y_predicted",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "brewer.yl_or_rd"),
fill.legend = tm_legend(title = "MXN")
) +
tm_title("Tourism GDP", position = c("right", "top"))
Los estados con un mayor Tourism GDP basandose en la predicción son Nuevo León, Guanajuato, Ciudad de México y Quintana Roo. Mientras que los estados con menor Tourism GDP son Sonora, Sinaloa, Durango e Hidalgo.
map1 <- tm_shape(gwr_sf) +
tm_polygons(
fill = "crime_rate_TV",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "PuBu")
) +
tm_title("% Crime Rate", position = c("right", "top"))
map2 <- tm_shape(gwr_sf) +
tm_polygons(
fill = "employment_TV",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "PuBu")
) +
tm_title("% Employment Rate", position = c("right", "top"))
map3 <- tm_shape(gwr_sf) +
tm_polygons(
fill = "real_wage_TV",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "PuBu")
) +
tm_title("Real Wage", position = c("right", "top"))
map4 <- tm_shape(gwr_sf) +
tm_polygons(
fill = "college_education_TV",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "PuBu")
) +
tm_title("College Education", position = c("right", "top"))
tmap_arrange(map1, map2, map3, map4, ncol = 2)
Como se puede apreciar, las variables con menor significancia estadística tiene cluster completamente separados, indicando una relación espacial positiva considerando real_wage y crime_rate significativos para la región del norte y oeste del pais; mientra que college_education y employment para la región del centro, sur y este.
Si comparamos todos los RMSE y AIC de los resultados de los modelos tenemos que los dos mejores modelos son:
Entre los principales hallazgos tenemos que:
tm_shape(gwr_sf) +
tm_polygons(
fill = "pop_density_TV",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "PuBu")
) +
tm_title("Population Density", position = c("right", "top"))
La variable pop_density es la más significativa del modelo generado al 99% y a través del mapa se aprecia que los estados de la región sureste cuentan con una mayor significancia que los estados del centro o del norte/oeste. Es decir, Veracruz, Yucatán y Tabasco son estados donde la variable pop_density cobra mayor relevancia en lugar de estados como ambas Baja Californias o Nuevo León.
tm_shape(gwr_sf) +
tm_polygons(
fill = "log(tourist_night_foreign)_TV",
fill.scale = tm_scale_intervals(style = "quantile", n = 8, values = "PuBu")
) +
tm_title("Foreign Tourist Hosted at Night", position = c("right", "top"))
La variable tourist_night_foreign es la segunda más significativa del modelo generado al 95% y a través del mapa se aprecia que los estados de la región sureste y el centro-oeste cuentan con una mayor significancia que los estados del norte y parte del centro. Es decir, existen dos clusters significativos entre los estados de Veracruz, Queretaro y Chiapas; además de los estados de nayarit, Zacatecas y Jalisco.