# ---- Packages ---------------------------------------------------------------
# Everything the report needs: Excel I/O, wrangling, plotting, web maps,
# spatial econometrics, text mining and table rendering.
pkgs <- c("readxl","dplyr","tidyr","stringr","ggplot2","scales","patchwork",
          "leaflet","leaflet.extras","sf","osrm","spdep","spatialreg",
          "tidytext","syuzhet","wordcloud","RColorBrewer","knitr","kableExtra")

# Install whatever is missing, then attach each package.
invisible(lapply(pkgs, function(p) {
  if (!requireNamespace(p, quietly = TRUE)) install.packages(p)
  library(p, character.only = TRUE)
}))

# ---- Input workbooks --------------------------------------------------------
# NOTE(review): absolute, machine-specific paths; the script only runs where
# these files exist.
ruta_airbnb <- "C:\\Users\\almai\\Downloads\\AirBnB_Data.xlsx"
ruta_reviews <- "C:\\Users\\almai\\Downloads\\AirBnB_Raitings_Reviews_Mty.xlsx"
ruta_zmm <- "C:\\Users\\almai\\Downloads\\zmm_data_entretenimiento.xlsx"

# ---- Lodging data -----------------------------------------------------------
df_airbnb <- read_excel(ruta_airbnb, sheet = "Database_Airbnb")
df_hoteles <- read_excel(ruta_airbnb, sheet = "Database_Hoteles")

# skip = 1 because the first row is a sub-header.
# col_types forces the first column to be read as a date directly.
df_ts <- read_excel(ruta_airbnb, sheet = "Database_AirBnB_Time_Series",
                    skip = 1, col_types = c("date", rep("numeric", 8)))
names(df_ts)[1] <- "fecha"
df_ts <- df_ts %>%
  filter(!is.na(fecha)) %>%
  mutate(fecha = as.Date(fecha))

# Keep only rows that actually carry review text.
df_reviews <- read_excel(ruta_reviews) %>% filter(!is.na(reviews))

# ---- Entertainment venues (one sheet per category) --------------------------
df_rest <- read_excel(ruta_zmm, sheet = "restaurant")
df_cafe <- read_excel(ruta_zmm, sheet = "coffee shops")
df_bares <- read_excel(ruta_zmm, sheet = "bares & antros")
df_plazas <- read_excel(ruta_zmm, sheet = "plazas comerciales")

cat("Airbnb :", nrow(df_airbnb), "propiedades\n")
## Airbnb : 250 propiedades
## Hoteles : 60 hoteles
## Reviews : 249 registros
## TS : 45 meses
El mapa combina los 250 Airbnbs, 60 hoteles y los lugares de entretenimiento de la ZMM en una sola vista interactiva. El objetivo es identificar clusters espaciales y entender si hay patrones de concentración diferenciados por tipo de alojamiento.
# Interactive overview map: Airbnb listings, hotels and the four entertainment
# layers, each in its own toggleable overlay group.
ico_ab <- makeAwesomeIcon(icon = "home", markerColor = "blue", library = "fa")
ico_ht <- makeAwesomeIcon(icon = "hotel", markerColor = "red", library = "fa")

leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  # Lodging layers (pin markers with an HTML popup per property)
  addAwesomeMarkers(
    data = df_airbnb, lng = ~lon, lat = ~lat, icon = ico_ab,
    group = "Airbnb",
    popup = ~paste0("<b>Airbnb</b><br>Precio: $", booking_price, " MXN<br>",
                    "Tipo: ", room_type, "<br>Rating: ", overall_raiting,
                    "<br>Municipio: ", Municipio)
  ) %>%
  addAwesomeMarkers(
    data = df_hoteles, lng = ~Lon, lat = ~Lat, icon = ico_ht,
    group = "Hoteles",
    popup = ~paste0("<b>", Hotel, "</b><br>Precio: $", Precio, " MXN<br>",
                    "Estrellas: ", No_Estrellas, "<br>Cal: ", Calificación_1, "/10")
  ) %>%
  # Entertainment layers (small circles); note that the name/rating column
  # names differ between sheets.
  addCircleMarkers(data = df_rest, lng = ~longitud, lat = ~latitud,
                   color = "#e67e22", radius = 4, fillOpacity = 0.7, weight = 1,
                   group = "Restaurantes",
                   popup = ~paste0("<b>", nombre_negocio, "</b><br>⭐ ",
                                   `calificación_open_table`)) %>%
  addCircleMarkers(data = df_cafe, lng = ~longitud, lat = ~latitud,
                   color = "#795548", radius = 4, fillOpacity = 0.7, weight = 1,
                   group = "Cafés",
                   popup = ~paste0("<b>", nombre_negocio, "</b><br>⭐ ", calificación)) %>%
  addCircleMarkers(data = df_bares, lng = ~longitud, lat = ~latitud,
                   color = "#8e44ad", radius = 4, fillOpacity = 0.7, weight = 1,
                   group = "Bares & Antros",
                   popup = ~paste0("<b>", nombre, "</b><br>⭐ ", calificación)) %>%
  addCircleMarkers(data = df_plazas, lng = ~longitud, lat = ~latitud,
                   color = "#27ae60", radius = 5, fillOpacity = 0.8, weight = 1,
                   group = "Plazas Comerciales",
                   popup = ~paste0("<b>", nombre, "</b><br>⭐ ", calificación)) %>%
  # Layer toggle; group names must match the `group =` values above.
  addLayersControl(
    overlayGroups = c("Airbnb","Hoteles","Restaurantes",
                      "Cafés","Bares & Antros","Plazas Comerciales"),
    options = layersControlOptions(collapsed = FALSE)
  ) %>%
  # Static legend: colours are listed by hand, in the same order as labels.
  addLegend(
    position = "bottomright",
    colors = c("#3498db","#e74c3c","#e67e22","#795548","#8e44ad","#27ae60"),
    labels = c("Airbnb","Hotel","Restaurante","Café","Bar/Antro","Plaza"),
    title = "Tipo de lugar"
  )

Interpretación — clusters espaciales:
Las isocronas muestran cuántas propiedades de cada tipo quedan accesibles desde la Macroplaza en 10, 20 y 30 minutos en auto. Esto es más informativo que la distancia en km porque refleja la experiencia real de traslado del viajero.
# Drive-time isochrones (10 / 20 / 30 min) around the Macroplaza, then count
# how many Airbnb listings and hotels fall inside each band.
centro_mty <- c(-100.3161, 25.6694)  # lon, lat
iso <- osrmIsochrone(loc = centro_mty, breaks = c(10, 20, 30), res = 50)
iso <- st_as_sf(iso)

# Detect the band column names (they change between osrm versions).
col_min <- grep("min", names(iso), ignore.case = TRUE, value = TRUE)[1]
col_max <- grep("max", names(iso), ignore.case = TRUE, value = TRUE)[1]
# Fail with a clear message instead of a cryptic rename() error when the
# expected columns are not present.
if (is.na(col_min) || is.na(col_max)) {
  stop("osrmIsochrone() returned no min/max band columns; got: ",
       paste(names(iso), collapse = ", "), call. = FALSE)
}
iso <- iso %>% rename(isomin = all_of(col_min), isomax = all_of(col_max))

# Repair invalid geometries produced by osrm.
iso <- st_make_valid(iso)
# Switches sf to planar (GEOS) operations for the rest of the session; this is
# a global setting and is not restored afterwards.
sf_use_s2(FALSE)

# Point layers in WGS 84 for the spatial join.
sf_ab <- st_as_sf(df_airbnb, coords = c("lon", "lat"), crs = 4326)
sf_ht <- st_as_sf(df_hoteles, coords = c("Lon", "Lat"), crs = 4326)

# Per-band counts and shares (one row per isochrone polygon).
iso <- iso %>%
  mutate(
    n_airbnb = lengths(st_intersects(iso, sf_ab)),
    n_hoteles = lengths(st_intersects(iso, sf_ht)),
    pct_ab = round(n_airbnb / nrow(df_airbnb) * 100, 1),
    pct_ht = round(n_hoteles / nrow(df_hoteles) * 100, 1)
  )
# Isochrone map: one polygon per drive-time band, with the lodging points on
# top and the Macroplaza marked as the origin.
pal_iso <- colorFactor(c("#2ecc71","#f39c12","#e74c3c"), domain = iso$isomin)

leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addPolygons(
    data = iso, fillColor = ~pal_iso(isomin), fillOpacity = 0.20,
    color = ~pal_iso(isomin), weight = 2,
    popup = ~paste0("<b>", isomin, "–", isomax, " min</b><br>",
                    "Airbnb: ", n_airbnb, " (", pct_ab, "%)<br>",
                    "Hoteles: ", n_hoteles, " (", pct_ht, "%)")
  ) %>%
  addCircleMarkers(data = df_airbnb, lng = ~lon, lat = ~lat,
                   radius = 3, color = "#3498db", fillOpacity = 0.6,
                   weight = 0, group = "Airbnb") %>%
  addCircleMarkers(data = df_hoteles, lng = ~Lon, lat = ~Lat,
                   radius = 6, color = "#e74c3c", fillOpacity = 0.9,
                   weight = 1, group = "Hoteles") %>%
  addMarkers(lng = centro_mty[1], lat = centro_mty[2],
             popup = "Macroplaza") %>%
  addLayersControl(overlayGroups = c("Airbnb","Hoteles"),
                   options = layersControlOptions(collapsed = FALSE)) %>%
  # Legend colours are hard-coded in the same order as the palette above.
  addLegend(position = "bottomright",
            colors = c("#2ecc71","#f39c12","#e74c3c"),
            labels = c("0–10 min","10–20 min","20–30 min"),
            title = "Tiempo desde centro")

# Tabla A.1: counts and shares per band. Each row is its own band (rows add up
# to 100%), so the caption must not call the figures cumulative.
iso %>%
  st_drop_geometry() %>%
  select(isomin, isomax, n_airbnb, pct_ab, n_hoteles, pct_ht) %>%
  rename(`Desde (min)` = isomin, `Hasta (min)` = isomax,
         `Airbnb (n)` = n_airbnb, `Airbnb (%)` = pct_ab,
         `Hotel (n)` = n_hoteles, `Hotel (%)` = pct_ht) %>%
  kbl(caption = "Tabla A.1 — Distribución por banda de isocrona (no acumulada)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

| Desde (min) | Hasta (min) | Airbnb (n) | Airbnb (%) | Hotel (n) | Hotel (%) |
|---|---|---|---|---|---|
| 0 | 10 | 189 | 75.6 | 55 | 91.7 |
| 10 | 20 | 58 | 23.2 | 4 | 6.7 |
| 20 | 30 | 3 | 1.2 | 1 | 1.7 |
Interpretación — accesibilidad diferencial:
# Model frame for the Airbnb price regressions: keep the modelling columns,
# drop incomplete rows and add the transformed variables.
df_sar <- df_airbnb %>%
  select(lat, lon, booking_price, overall_raiting, number_reviews,
         max_guests, bedroom, beds, bath, Dist_km_Downtown, room_type) %>%
  drop_na() %>%
  mutate(
    log_price = log(booking_price),          # skewed distribution -> log
    log_dist = log(Dist_km_Downtown + 0.01), # offset avoids log(0)
    # 1 when the listing is an entire place or a cabin, 0 otherwise
    completa = as.integer(str_detect(str_to_lower(room_type), "entire|cabin"))
  )

# Spatial weights: 5 nearest neighbours, row-standardised.
sf_sar <- st_as_sf(df_sar, coords = c("lon","lat"), crs = 4326)
coords_m <- st_coordinates(sf_sar)
# k = 5: the author's choice for urban datasets of ~250 observations.
# NOTE(review): coords_m holds lon/lat degrees and knearneigh() is called
# without longlat = TRUE, so neighbours are ranked by planar distance in
# degrees; confirm this is intended.
knn5 <- knearneigh(coords_m, k = 5)
nb_sar <- knn2nb(knn5)
lw_sar <- nb2listw(nb_sar, style = "W")

cat("Propiedades en el modelo:", nrow(df_sar), "\n")
## Propiedades en el modelo: 250
## Vecinos por propiedad : k = 5
Antes de estimar el SAR verificamos si existe autocorrelación espacial en los residuos del OLS. Si el test no fuera significativo, el OLS sería suficiente.
# Baseline OLS hedonic model for log price; its residuals are tested for
# spatial autocorrelation before fitting the SAR.
ols_ab <- lm(
  log_price ~ overall_raiting + number_reviews +
    max_guests + bedroom + bath + log_dist + completa,
  data = df_sar
)

# Moran's I on the OLS residuals using the kNN weights.
moran_res <- lm.morantest(ols_ab, lw_sar)

# moran_res$estimate is a named vector; without unname()/row.names = NULL those
# names become data-frame row names and leak into the kable as an extra,
# header-less column.
data.frame(
  Estadístico = c("I de Moran", "E[I] bajo H0", "p-valor"),
  Valor = unname(c(round(moran_res$estimate[1], 4),
                   round(moran_res$estimate[2], 4),
                   round(moran_res$p.value, 6))),
  row.names = NULL
) %>%
  kbl(caption = "Tabla B.1 — Moran's I sobre residuos OLS") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

| Estadístico | Valor |
|---|---|
| I de Moran | 0.2157 |
| E[I] bajo H0 | -0.0102 |
| p-valor | 0.0000 |
Interpretación: I de Moran = 0.2157 con p < 0.001 — se rechaza H0 de aleatoriedad espacial. Los residuos del OLS tienen estructura espacial, lo que justifica el uso del modelo SAR.
##
## Call:
## lm(formula = log_price ~ overall_raiting + number_reviews + max_guests +
## bedroom + bath + log_dist + completa, data = df_sar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93453 -0.22142 -0.02623 0.17948 2.03069
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.3830974 0.4982502 12.811 < 2e-16 ***
## overall_raiting 0.0680080 0.1055195 0.645 0.51986
## number_reviews 0.0003028 0.0002212 1.369 0.17234
## max_guests 0.0548547 0.0145743 3.764 0.00021 ***
## bedroom 0.1729213 0.0587392 2.944 0.00356 **
## bath 0.3293654 0.0707045 4.658 5.26e-06 ***
## log_dist 0.0112787 0.0382641 0.295 0.76843
## completa 0.1798605 0.0656871 2.738 0.00664 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3623 on 242 degrees of freedom
## Multiple R-squared: 0.5346, Adjusted R-squared: 0.5211
## F-statistic: 39.71 on 7 and 242 DF, p-value: < 2.2e-16
## R² ajustado: 0.5211
## AIC : 211.65
# Spatial lag (SAR) model for Airbnb log price: same right-hand side as the
# OLS model `ols_ab`, plus a spatially lagged dependent variable built from
# the row-standardised kNN weights `lw_sar`.
# NOTE(review): the summary printed below still reports a significant LM test
# for residual autocorrelation (p ~ 0.0005), so the lag term may not absorb
# all spatial structure — consider checking an error or combined specification.
sar_ab <- lagsarlm(
log_price ~ overall_raiting + number_reviews +
max_guests + bedroom + bath + log_dist + completa,
data = df_sar, listw = lw_sar, method = "eigen"
)
summary(sar_ab)##
## Call:lagsarlm(formula = log_price ~ overall_raiting + number_reviews +
## max_guests + bedroom + bath + log_dist + completa, data = df_sar,
## listw = lw_sar, method = "eigen")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.889041 -0.228171 -0.021593 0.200843 1.979308
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.56954260 0.64972447 7.0330 2.021e-12
## overall_raiting 0.03152807 0.10019784 0.3147 0.753021
## number_reviews 0.00024710 0.00021055 1.1736 0.240557
## max_guests 0.05470543 0.01382840 3.9560 7.621e-05
## bedroom 0.14869486 0.05652825 2.6305 0.008527
## bath 0.32470150 0.06711815 4.8378 1.313e-06
## log_dist 0.00477167 0.03657819 0.1305 0.896209
## completa 0.17727707 0.06236341 2.8426 0.004474
##
## Rho: 0.2633, LR test value: 15.502, p-value: 8.2429e-05
## Asymptotic standard error: 0.062601
## z-value: 4.206, p-value: 2.5998e-05
## Wald statistic: 17.69, p-value: 2.5998e-05
##
## Log likelihood: -89.07332 for lag model
## ML residual variance (sigma squared): 0.11811, (sigma: 0.34368)
## Number of observations: 250
## Number of parameters estimated: 10
## AIC: 198.15, (AIC for lm: 211.65)
## LM test for residual autocorrelation
## test value: 12.181, p-value: 0.00048288
# Hotel counterpart of the Airbnb models: build the model frame, kNN weights,
# an OLS baseline and a spatial lag model for log(Precio).
df_sar_h <- df_hoteles %>%
# Star rating is coerced to numeric; values that cannot be parsed become NA
# (warnings silenced) and those rows are removed by the complete-case filter.
mutate(No_Estrellas_n = suppressWarnings(as.numeric(as.character(No_Estrellas)))) %>%
select(Lat, Lon, Precio, Calificación_1, No_Comentarios,
No_Camas, No_Estrellas_n, Dist_km_Centro) %>%
filter(complete.cases(.)) %>%
# Log transforms; the 0.01 offset avoids log(0) for zero distances.
mutate(log_precio = log(Precio),
log_dist_h = log(Dist_km_Centro + 0.01))
sf_sar_h <- st_as_sf(df_sar_h, coords = c("Lon","Lat"), crs = 4326)
# Row-standardised weights from the 4 nearest neighbours.
knn4_h <- knearneigh(st_coordinates(sf_sar_h), k = 4) # k=4 because n=53
lw_h <- nb2listw(knn2nb(knn4_h), style = "W")
# OLS baseline, used for the AIC/log-likelihood comparison table.
ols_h <- lm(log_precio ~ Calificación_1 + No_Comentarios +
No_Camas + No_Estrellas_n + log_dist_h,
data = df_sar_h)
# Spatial lag model with the same covariates as ols_h.
sar_h <- lagsarlm(log_precio ~ Calificación_1 + No_Comentarios +
No_Camas + No_Estrellas_n + log_dist_h,
data = df_sar_h, listw = lw_h, method = "eigen")
summary(sar_h)##
## Call:lagsarlm(formula = log_precio ~ Calificación_1 + No_Comentarios +
## No_Camas + No_Estrellas_n + log_dist_h, data = df_sar_h,
## listw = lw_h, method = "eigen")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7021658 -0.1229183 0.0019718 0.2115224 0.3317173
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.3736e+00 1.3674e+00 5.3924 6.951e-08
## Calificación_1 1.2990e-01 5.6425e-02 2.3021 0.0213283
## No_Comentarios -8.2380e-05 3.5974e-05 -2.2900 0.0220211
## No_Camas -4.6394e-02 6.3238e-02 -0.7336 0.4631733
## No_Estrellas_n 1.7246e-01 4.7651e-02 3.6192 0.0002956
## log_dist_h 1.3765e-01 5.2544e-02 2.6198 0.0087984
##
## Rho: -0.098465, LR test value: 0.28858, p-value: 0.59113
## Asymptotic standard error: 0.16627
## z-value: -0.59219, p-value: 0.55372
## Wald statistic: 0.35069, p-value: 0.55372
##
## Log likelihood: 0.2959457 for lag model
## ML residual variance (sigma squared): 0.057813, (sigma: 0.24044)
## Number of observations: 53
## Number of parameters estimated: 8
## AIC: 15.408, (AIC for lm: 13.697)
## LM test for residual autocorrelation
## test value: 0.49615, p-value: 0.48119
# Tabla B.2: fit comparison (AIC, log-likelihood, rho) for the four models.
# logLik() is used for all four so OLS and SAR values are extracted the same
# way. Rho is stored as text because the OLS rows show a dash instead.
data.frame(
  Modelo = c("OLS – Airbnb","SAR – Airbnb","OLS – Hoteles","SAR – Hoteles"),
  AIC = round(c(AIC(ols_ab), AIC(sar_ab), AIC(ols_h), AIC(sar_h)), 2),
  LogLik = round(c(as.numeric(logLik(ols_ab)), as.numeric(logLik(sar_ab)),
                   as.numeric(logLik(ols_h)), as.numeric(logLik(sar_h))), 2),
  Rho = c("—", round(sar_ab$rho, 4), "—", round(sar_h$rho, 4))
) %>%
  kbl(caption = "Tabla B.2 — Comparación OLS vs. SAR") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  # Highlight the two SAR rows.
  row_spec(2, background = "#d5f5e3") %>%
  row_spec(4, background = "#d5f5e3")

| Modelo | AIC | LogLik | Rho |
|---|---|---|---|
| OLS – Airbnb | 211.65 | -96.82 | — |
| SAR – Airbnb | 198.15 | -89.07 | 0.2633 |
| OLS – Hoteles | 13.70 | 0.15 | — |
| SAR – Hoteles | 15.41 | 0.30 | -0.0985 |
# Long table of slope coefficients from the Airbnb OLS and SAR fits, with
# display labels, for a side-by-side bar chart.
coef_comp <- data.frame(
  Variable = names(coef(ols_ab)),
  OLS = as.numeric(coef(ols_ab)),
  # Index the SAR coefficients by the OLS names so both columns line up and
  # rho (present only in the SAR fit) is left out.
  SAR = as.numeric(coef(sar_ab)[names(coef(ols_ab))])
) %>%
  filter(Variable != "(Intercept)") %>%
  pivot_longer(c(OLS, SAR), names_to = "Modelo", values_to = "Coef") %>%
  mutate(Variable = recode(Variable,
                           "overall_raiting" = "Calificación",
                           "number_reviews" = "No. reseñas",
                           "max_guests" = "Max. huéspedes",
                           "bedroom" = "Recámaras",
                           "bath" = "Baños",
                           "log_dist" = "log(Distancia)",
                           "completa" = "Propiedad completa"))

# Horizontal dodged bars, ordered by absolute coefficient size.
ggplot(coef_comp, aes(x = reorder(Variable, abs(Coef)), y = Coef, fill = Modelo)) +
  geom_col(position = position_dodge(0.7), width = 0.6) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey60") +
  coord_flip() +
  scale_fill_manual(values = c("OLS" = "#aed6f1", "SAR" = "#2980b9")) +
  labs(title = "Figura B.1 — Coeficientes OLS vs. SAR (Airbnb)",
       subtitle = "Variable dependiente: log(booking_price)",
       x = NULL, y = "Coeficiente") +
  theme_minimal(base_size = 12)

Interpretación — coeficiente ρ (rho):
max_guests, no
bedroom. Esto sugiere que el mercado premia más la
capacidad de alojamiento que el número de habitaciones en sí.

El análisis de sentimiento requiere texto limpio. Sin preprocesamiento, la puntuación, los números y las stopwords distorsionan los conteos y reducen la calidad de los scores.
# Hand-written list of Spanish stopwords (articles, prepositions, pronouns,
# common auxiliaries). Used to drop uninformative tokens before counting word
# frequencies for the word clouds.
stopwords_es <- c(
"de","la","el","en","y","a","que","los","las","del","se","por","con",
"una","un","su","al","es","lo","más","pero","ha","me","muy","fue",
"mi","te","le","no","si","como","para","este","esta","también","todo",
"cuando","hay","ser","estar","tiene","tan","ya","así","donde","había",
"cada","nos","sus","les","era","han","he","o","e","u","ni","aunque",
"porque","pues","desde","hasta","entre","sobre","sin","hacia","sí"
)
# Normalise free text before tokenising / scoring.
#
# x: character vector of raw review text.
# Returns a character vector of the same length: lower-cased, with every
# character that is neither a letter nor whitespace replaced by a space,
# whitespace runs collapsed to one space, and both ends trimmed.
limpiar <- function(x) {
  en_minusculas <- str_to_lower(x)
  solo_letras <- str_replace_all(en_minusculas, "[^[:alpha:][:space:]]", " ")
  espacios_unicos <- str_replace_all(solo_letras, "\\s+", " ")
  str_trim(espacios_unicos)
}
# Airbnb: one row per review, keeping the raw text plus a cleaned copy.
reviews_ab <- df_reviews %>%
  transmute(property_id, lat, lon, booking_price, overall_raiting,
            Dist_km_downtown, Municipio, reviews,
            texto = limpiar(reviews),
            fuente = "Airbnb")

# Hotels: join the 3 comment columns into a single text per hotel (missing
# comments count as empty strings), then clean it.
reviews_ht <- df_hoteles %>%
  transmute(Hotel, Lat, Lon, Precio, Calificación_1,
            No_Estrellas, Dist_km_Centro,
            texto = limpiar(paste(replace_na(Comentarios_1,""),
                                  replace_na(Comentarios_2,""),
                                  replace_na(Comentarios_3,""))),
            fuente = "Hotel")

cat("Reviews Airbnb :", nrow(reviews_ab), "\n")
## Reviews Airbnb : 249
## Reviews Hoteles: 60
El preprocesamiento es necesario porque:
# NRC lexicon in Spanish: 8 emotion counts plus positive/negative per text.
scores_ab <- get_nrc_sentiment(reviews_ab$texto, language = "spanish")
scores_ht <- get_nrc_sentiment(reviews_ht$texto, language = "spanish")

# Append the NRC columns to `df` and derive the net score
# (positive - negative) and its polarity label.
agregar_sentimiento <- function(df, scores) {
  df %>%
    bind_cols(scores) %>%
    mutate(score_neto = positive - negative,
           sentimiento = case_when(score_neto > 0 ~ "Positivo",
                                   score_neto < 0 ~ "Negativo",
                                   TRUE ~ "Neutro"))
}

# One-row summary of polarity shares and mean net score for a scored frame.
# The label argument is deliberately not called `fuente`: that is a column of
# `df` and would shadow the argument inside summarise().
resumir_sentimiento <- function(df, etiqueta) {
  df %>%
    summarise(Fuente = etiqueta,
              `Positivo (%)` = round(mean(sentimiento=="Positivo")*100, 1),
              `Neutro (%)` = round(mean(sentimiento=="Neutro")*100, 1),
              `Negativo (%)` = round(mean(sentimiento=="Negativo")*100, 1),
              `Score prom` = round(mean(score_neto), 2))
}

reviews_ab <- agregar_sentimiento(reviews_ab, scores_ab)
reviews_ht <- agregar_sentimiento(reviews_ht, scores_ht)

# Summary table
bind_rows(
  resumir_sentimiento(reviews_ab, "Airbnb"),
  resumir_sentimiento(reviews_ht, "Hotel")
) %>%
  kbl(caption = "Tabla C.1 — Distribución de sentimiento por tipo de alojamiento") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

| Fuente | Positivo (%) | Neutro (%) | Negativo (%) | Score prom |
|---|---|---|---|---|
| Airbnb | 56.2 | 30.9 | 12.9 | 1.22 |
| Hotel | 80.0 | 10.0 | 10.0 | 3.57 |
# ---- Figura C.1: sentiment composition and net-score distribution -----------
df_sent <- bind_rows(
  reviews_ab %>% select(fuente, sentimiento, score_neto),
  reviews_ht %>% select(fuente, sentimiento, score_neto)
)

# Stacked 100% bars of polarity by source.
p1 <- df_sent %>%
  count(fuente, sentimiento) %>%
  mutate(sentimiento = factor(sentimiento, levels = c("Positivo","Neutro","Negativo"))) %>%
  ggplot(aes(x = fuente, y = n, fill = sentimiento)) +
  geom_col(position = "fill", width = 0.5) +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(values = c("Positivo"="#2ecc71","Neutro"="#bdc3c7","Negativo"="#e74c3c")) +
  labs(title = "Composición de sentimiento", x = NULL, y = "Proporción", fill = NULL) +
  theme_minimal(base_size = 12)

# Box plot of the net score by source.
p2 <- df_sent %>%
  ggplot(aes(x = fuente, y = score_neto, fill = fuente)) +
  geom_boxplot(alpha = 0.7, outlier.shape = 21) +
  scale_fill_manual(values = c("Airbnb"="#3498db","Hotel"="#e74c3c")) +
  labs(title = "Score neto de sentimiento", x = NULL, y = "Positivo – Negativo") +
  theme_minimal(base_size = 12) + theme(legend.position = "none")

p1 + p2 + plot_annotation(
  title = "Figura C.1 — Análisis de Sentimiento: Airbnb vs. Hoteles"
)

# ---- Word clouds ------------------------------------------------------------
# Most frequent words in the cleaned `texto` column: tokenise, drop stopwords
# and words of 3 characters or fewer, keep the `top` most frequent.
top_palabras <- function(df, top = 60) {
  df %>%
    unnest_tokens(word, texto) %>%
    filter(!word %in% stopwords_es, str_length(word) > 3) %>%
    count(word, sort = TRUE) %>%
    head(top)
}

# Two clouds side by side; the previous graphics settings are restored after.
par_previo <- par(mfrow = c(1,2))
tok_ab <- top_palabras(reviews_ab)
wordcloud(tok_ab$word, tok_ab$n, min.freq = 2, max.words = 50,
          random.order = FALSE, colors = brewer.pal(8,"Blues"))
title("Airbnb – Palabras más frecuentes")
tok_ht <- top_palabras(reviews_ht)
wordcloud(tok_ht$word, tok_ht$n, min.freq = 1, max.words = 50,
          random.order = FALSE, colors = brewer.pal(8,"Reds"))
title("Hoteles – Palabras más frecuentes")
par(par_previo)

# ---- Sentiment map ----------------------------------------------------------
# One shared colour scale across Airbnb and hotels so colours are comparable.
scores_todos <- c(reviews_ab$score_neto, reviews_ht$score_neto)
pal_sent <- colorNumeric(
  palette = c("#e74c3c","#f39c12","#2ecc71"),
  domain = range(scores_todos)
)

leaflet() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(
    data = reviews_ab, lng = ~lon, lat = ~lat,
    radius = 5, color = ~pal_sent(score_neto),
    fillOpacity = 0.8, weight = 0.5, group = "Airbnb",
    popup = ~paste0("<b>Airbnb</b><br>Score: ", score_neto,
                    " (", sentimiento, ")<br>Rating: ⭐ ", overall_raiting,
                    "<br><i>", str_trunc(reviews, 100), "</i>")
  ) %>%
  addCircleMarkers(
    data = reviews_ht, lng = ~Lon, lat = ~Lat,
    radius = 8, color = ~pal_sent(score_neto),
    fillOpacity = 0.85, weight = 1, group = "Hoteles",
    popup = ~paste0("<b>", Hotel, "</b><br>Score: ", score_neto,
                    " (", sentimiento, ")<br>Cal: ", Calificación_1, "/10")
  ) %>%
  addLayersControl(overlayGroups = c("Airbnb","Hoteles"),
                   options = layersControlOptions(collapsed = FALSE)) %>%
  # The legend uses the combined scores so it spans the same range as the
  # palette (hotel scores can fall outside the Airbnb range).
  addLegend(position = "bottomright", pal = pal_sent,
            values = scores_todos, title = "Score de sentimiento")

Interpretación — patrones de sentimiento:
# Tabla D.1: Airbnb profile by distance band from downtown.
reviews_ab %>%
  # Bands are evaluated in order; anything not matched by the first two rules
  # (including a missing distance) falls into the peripheral band.
  mutate(zona = case_when(
    Dist_km_downtown <= 4 ~ "Centro (≤4 km)",
    Dist_km_downtown <= 8 ~ "Intermedia (4–8 km)",
    TRUE ~ "Periférica (>8 km)"
  )) %>%
  group_by(zona) %>%
  summarise(
    Propiedades = n(),
    `Precio prom` = round(mean(booking_price)),
    `Rating prom` = round(mean(overall_raiting), 2),
    `Score sentim` = round(mean(score_neto), 2),
    `% positivas` = round(mean(sentimiento=="Positivo")*100, 1),
    .groups = "drop"
  ) %>%
  kbl(caption = "Tabla D.1 — Perfil por zona geográfica (Airbnb)") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

| zona | Propiedades | Precio prom | Rating prom | Score sentim | % positivas |
|---|---|---|---|---|---|
| Centro (≤4 km) | 160 | 2835 | 4.75 | 1.14 | 53.8 |
| Intermedia (4–8 km) | 77 | 2557 | 4.80 | 1.39 | 62.3 |
| Periférica (>8 km) | 12 | 4131 | 4.85 | 1.17 | 50.0 |
Para anfitriones Airbnb:
Para hoteles:
# Estadio BBVA (Guadalupe, N.L.) sits at roughly 25.669 N, 100.244 W.
# The longitude previously used (-100.4547) is about 21 km west of the
# stadium, so every listing was binned into the wrong distance zone.
# The rendered Tabla D.2 rows below predate this fix; re-knit to refresh them.
estadio_lat <- 25.6694
estadio_lon <- -100.2444

# Approximate planar distance (km) to the stadium: 111 km per degree, with
# the longitude difference scaled by cos(latitude).
reviews_ab <- reviews_ab %>%
  mutate(
    dist_estadio = sqrt(((lat - estadio_lat)*111)^2 +
                          ((lon - estadio_lon)*111*cos(lat*pi/180))^2),
    zona_evento = case_when(
      dist_estadio <= 5 ~ "Zona 1 (≤5 km)",
      dist_estadio <= 10 ~ "Zona 2 (5–10 km)",
      dist_estadio <= 15 ~ "Zona 3 (10–15 km)",
      TRUE ~ "Zona 4 (>15 km)"
    )
  )

# Tabla D.2: supply, capacity, price and sentiment per stadium zone.
reviews_ab %>%
  group_by(zona_evento) %>%
  summarise(
    `N propiedades` = n(),
    # max_guests is not carried in reviews_ab, so it is looked up in
    # df_reviews by property_id.
    `Cap. total (PAX)` = sum(df_reviews$max_guests[
      match(property_id, df_reviews$property_id)],
      na.rm = TRUE),
    `Precio prom` = round(mean(booking_price)),
    `Rating prom` = round(mean(overall_raiting), 2),
    `% positivas` = round(mean(sentimiento=="Positivo")*100, 1),
    .groups = "drop"
  ) %>%
  kbl(caption = "Tabla D.2 — Capacidad Airbnb por zona del Estadio BBVA") %>%
  kable_styling(bootstrap_options = c("striped","hover","condensed"))

| zona_evento | N propiedades | Cap. total (PAX) | Precio prom | Rating prom | % positivas |
|---|---|---|---|---|---|
| Zona 2 (5–10 km) | 30 | 114 | 2876 | 4.84 | 60.0 |
| Zona 3 (10–15 km) | 125 | 439 | 2730 | 4.75 | 52.8 |
| Zona 4 (>15 km) | 94 | 341 | 2900 | 4.78 | 59.6 |
# Map of Airbnb listings coloured by distance zone from the stadium.
# Levels are given explicitly so each zone keeps its colour even when a zone
# has no listings.
pal_evento <- colorFactor(
  palette = c("#27ae60","#f39c12","#e67e22","#e74c3c"),
  levels = c("Zona 1 (≤5 km)","Zona 2 (5–10 km)",
             "Zona 3 (10–15 km)","Zona 4 (>15 km)")
)

leaflet() %>%
  addProviderTiles(providers$CartoDB.DarkMatter) %>%
  addCircleMarkers(
    data = reviews_ab, lng = ~lon, lat = ~lat,
    radius = 5, color = ~pal_evento(zona_evento),
    fillOpacity = 0.8, weight = 0.3,
    popup = ~paste0(zona_evento, "<br>Dist: ", round(dist_estadio,1), " km<br>",
                    "Precio: $", booking_price)
  ) %>%
  addMarkers(lng = estadio_lon, lat = estadio_lat, popup = "🏟️ Estadio BBVA") %>%
  addLegend(position = "bottomright", pal = pal_evento,
            values = reviews_ab$zona_evento, title = "Zona del evento")

Recomendaciones para el escenario del evento:
El modelo SAR estima ρ = 0.263 (estadísticamente significativo), lo que confirma dependencia espacial positiva en los precios de Airbnb en la ZMM. Las áreas con mayor dependencia observable son:
Implicación estratégica: en zonas de alta dependencia, los anfitriones deben monitorear a sus vecinos más cercanos y ajustar dentro de rangos razonables. Una subida unilateral del 30% sin que el mercado la acompañe resultará en pérdida de ocupación.
# Tabla E.1: cumulative accessibility from the Macroplaza. The isochrone rows
# are disjoint bands, so the cumulative figures are running totals of the
# per-band counts, ordered by the band's upper limit.
iso %>%
  st_drop_geometry() %>%
  arrange(isomax) %>%
  mutate(n_ab_ac = cumsum(n_airbnb),
         n_ht_ac = cumsum(n_hoteles),
         pct_ab_ac = round(n_ab_ac / nrow(df_airbnb) * 100, 1),
         pct_ht_ac = round(n_ht_ac / nrow(df_hoteles) * 100, 1)) %>%
  select(isomax, n_ab_ac, pct_ab_ac, n_ht_ac, pct_ht_ac) %>%
  rename(`Hasta (min)` = isomax,
         `Airbnb (n)` = n_ab_ac, `Airbnb (%)` = pct_ab_ac,
         `Hotel (n)` = n_ht_ac, `Hotel (%)` = pct_ht_ac) %>%
  kbl(caption = "Tabla E.1 — Accesibilidad acumulada desde la Macroplaza") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE)

| Hasta (min) | Airbnb (n) | Airbnb (%) | Hotel (n) | Hotel (%) |
|---|---|---|---|---|
| 10 | 189 | 75.6 | 55 | 91.7 |
| 20 | 247 | 98.8 | 59 | 98.3 |
| 30 | 250 | 100.0 | 60 | 100.0 |
| Dimensión | Airbnb | Hoteles |
|---|---|---|
| Distribución por isocrona | Más dispersa: 24.4% queda a más de 10 min | Concentrada: 91.7% dentro de 10 min |
| Viajero favorecido | Busca precio o experiencia residencial | Busca centralidad y estándares |
| Zona de ventaja | Periférica / residencial | Central / corporativa |
Los mapas confirman que ambos modelos no son sustitutos perfectos — sirven perfiles distintos de viajero y su accesibilidad diferencial es parte de su propuesta de valor, no una limitación.
# Figura E.1: price vs. net sentiment, split into quadrants at the medians.
# The medians are computed once and reused for the classification and for the
# reference lines.
med_precio <- median(reviews_ab$booking_price)
med_score <- median(reviews_ab$score_neto)

reviews_ab %>%
  # Ties with the median count as "low" on that axis.
  mutate(cuadrante = case_when(
    booking_price > med_precio & score_neto > med_score
    ~ "Precio alto · Sentimiento alto",
    booking_price <= med_precio & score_neto > med_score
    ~ "Precio bajo · Sentimiento alto",
    booking_price > med_precio & score_neto <= med_score
    ~ "Precio alto · Sentimiento bajo",
    TRUE ~ "Precio bajo · Sentimiento bajo"
  )) %>%
  ggplot(aes(x = booking_price, y = score_neto, color = cuadrante)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_vline(xintercept = med_precio,
             linetype = "dashed", color = "grey50") +
  geom_hline(yintercept = med_score,
             linetype = "dashed", color = "grey50") +
  scale_color_manual(values = c(
    "Precio alto · Sentimiento alto" = "#27ae60",
    "Precio bajo · Sentimiento alto" = "#2980b9",
    "Precio alto · Sentimiento bajo" = "#e67e22",
    "Precio bajo · Sentimiento bajo" = "#e74c3c")) +
  scale_x_continuous(labels = dollar_format(prefix = "$", suffix = " MXN")) +
  labs(title = "Figura E.1 — Matriz Precio vs. Sentimiento (Airbnb)",
       subtitle = "Líneas punteadas = mediana de cada variable",
       x = "Precio de reserva", y = "Score de sentimiento neto",
       color = NULL) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "bottom")

Acciones prescriptivas por cuadrante:
El análisis integra visualización espacial, regresión SAR y minería de texto para generar una visión prescriptiva del mercado de alojamiento de la ZMM.
Hallazgos principales:
Airbnb y hoteles no compiten en el mismo submercado geográfico. Los hoteles dominan los corredores centrales; Airbnb tiene presencia en zonas residenciales donde un hotel no sería viable. Las isocronas confirman la separación: el 91.7% de los hoteles queda dentro de los 10 minutos del centro, frente al 75.6% de los Airbnb.
Los precios de Airbnb tienen estructura espacial (ρ > 0). Los anfitriones fijan precios con referencia a sus vecinos más cercanos, no de forma aislada. Este resultado justifica el SAR sobre el OLS y tiene implicaciones directas para estrategias de pricing dinámico.
La satisfacción del cliente tiene patrones geográficos. Las zonas con reseñas consistentemente positivas coinciden con áreas de mejor infraestructura urbana y entretenimiento. Esto sugiere que la calidad del entorno influye en la percepción del alojamiento, independientemente de sus atributos físicos.
La prescripción más accionable: propiedades con sentimiento positivo y precio bajo están dejando margen sin aprovechar. La combinación de SAR (dependencia espacial) y análisis de sentimiento permite identificar cuándo y dónde subir precios sin sacrificar calificación.
## R version 4.5.2 (2025-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=Spanish_Mexico.utf8 LC_CTYPE=Spanish_Mexico.utf8
## [3] LC_MONETARY=Spanish_Mexico.utf8 LC_NUMERIC=C
## [5] LC_TIME=Spanish_Mexico.utf8
##
## time zone: America/Mexico_City
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.4.0 knitr_1.51 wordcloud_2.6
## [4] RColorBrewer_1.1-3 syuzhet_1.0.7 tidytext_0.4.3
## [7] spatialreg_1.4-3 Matrix_1.7-4 spdep_1.4-2
## [10] spData_2.3.4 osrm_5.0.0 sf_1.1-1
## [13] leaflet.extras_2.0.2 leaflet_2.2.3 patchwork_1.3.2
## [16] scales_1.4.0 ggplot2_4.0.3 stringr_1.6.0
## [19] tidyr_1.3.2 dplyr_1.2.0 readxl_1.4.5
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.3 farver_2.1.2
## [4] S7_0.2.1 fastmap_1.2.0 TH.data_1.1-5
## [7] janeaustenr_1.0.0 digest_0.6.39 lifecycle_1.0.5
## [10] tokenizers_0.3.0 LearnBayes_2.15.2 survival_3.8-3
## [13] magrittr_2.0.4 compiler_4.5.2 mapiso_0.3.0
## [16] rlang_1.1.7 sass_0.4.10 tools_4.5.2
## [19] igraph_2.2.1 yaml_2.3.12 data.table_1.18.2.1
## [22] labeling_0.4.3 htmlwidgets_1.6.4 sp_2.2-1
## [25] classInt_0.4-11 xml2_1.5.2 multcomp_1.4-30
## [28] KernSmooth_2.23-26 withr_3.0.2 purrr_1.2.1
## [31] grid_4.5.2 e1071_1.7-17 MASS_7.3-65
## [34] isoband_0.3.0 cli_3.6.5 mvtnorm_1.3-7
## [37] rmarkdown_2.30 generics_0.1.4 otel_0.2.0
## [40] rstudioapi_0.18.0 DBI_1.2.3 cachem_1.1.0
## [43] proxy_0.4-29 splines_4.5.2 s2_1.1.9
## [46] cellranger_1.1.0 marginaleffects_0.32.0 vctrs_0.7.1
## [49] boot_1.3-32 sandwich_3.1-1 jsonlite_2.0.0
## [52] systemfonts_1.3.1 crosstalk_1.2.2 jquerylib_0.1.4
## [55] units_1.0-0 glue_1.8.0 leaflet.providers_3.0.0
## [58] codetools_0.2-20 stringi_1.8.7 gtable_0.3.6
## [61] deldir_2.0-4 tibble_3.3.1 pillar_1.11.1
## [64] htmltools_0.5.9 R6_2.6.1 textshaping_1.0.4
## [67] wk_0.9.5 evaluate_1.0.5 lattice_0.22-7
## [70] backports_1.5.0 SnowballC_0.7.1 bslib_0.10.0
## [73] class_7.3-23 Rcpp_1.1.1 svglite_2.2.2
## [76] coda_0.19-4.1 nlme_3.1-168 xfun_0.56
## [79] zoo_1.8-15 pkgconfig_2.0.3