library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)
library(leaflet)
library(viridis)
library(knitr)
library(caret)
library(car)
library(lmtest)# ── AirBnB (scraper propio, 1 780 registros) ─────────────────────────────────
airbnb_raw <- read_csv("AirBnB_Monterrey_FINAL_ES (3) (1).csv")
# ── Hoteles Booking (90 registros con geo) ────────────────────────────────────
hoteles_raw <- read_csv("hoteles_booking_limpio (3).csv")
# ── Etiquetas de zona ─────────────────────────────────────────────────────────
zona_labs <- c("Centro (<3 km)", "Intermedia (3-7 km)",
"Periférica (7-15 km)", "Exterior (>15 km)")
# ── AirBnB limpio ─────────────────────────────────────────────────────────────
airbnb <- airbnb_raw %>%
mutate(
zona = factor(
case_when(
dist_km_downtown < 3 ~ "Centro (<3 km)",
dist_km_downtown < 7 ~ "Intermedia (3-7 km)",
dist_km_downtown < 15 ~ "Periférica (7-15 km)",
TRUE ~ "Exterior (>15 km)"
),
levels = zona_labs
),
tipo = "AirBnB",
overall_raiting = calificacion_satisfaccion_huesped,
booking_price = precio_booking / num_noches,
number_reviews = num_resenas,
Dist_km_Downtown = dist_km_downtown,
lat = latitud,
lon = longitud,
rating_10 = overall_raiting * 2
)
# ── Hoteles limpio ────────────────────────────────────────────────────────────
hoteles <- hoteles_raw %>%
mutate(
zona = factor(zona, levels = zona_labs),
tipo = "Hotel",
overall_raiting = Calificacion_1,
booking_price = Precio,
number_reviews = No_Comentarios,
Dist_km_Downtown = Dist_km_Centro,
lat = Lat,
lon = Lon,
rating_10 = overall_raiting
)
# ── Dataset combinado ─────────────────────────────────────────────────────────
combined <- bind_rows(
airbnb %>% select(tipo, zona, booking_price, rating_10, number_reviews,
Dist_km_Downtown, lat, lon),
hoteles %>% select(tipo, zona, booking_price, rating_10, number_reviews,
Dist_km_Downtown, lat, lon)
)ggplot(combined, aes(x = Dist_km_Downtown, y = booking_price, color = tipo)) +
geom_point(alpha = 0.4, size = 2) +
geom_smooth(method = "lm", se = TRUE, linewidth = 1.2) +
scale_color_manual(values = c("AirBnB" = "#FF5A5F", "Hotel" = "#00A699")) +
scale_y_continuous(
labels = comma_format(prefix = "$"),
limits = c(0, quantile(combined$booking_price, 0.97, na.rm = TRUE))
) +
labs(title = "Precio por noche vs Distancia al centro – AirBnB y Hoteles",
subtitle = "Línea de regresión lineal por plataforma",
x = "Distancia al centro (km)", y = "Precio por noche (MXN)", color = NULL) +
theme_minimal()cor_ab <- cor(airbnb$Dist_km_Downtown, airbnb$booking_price, use = "complete.obs")
cor_hot <- cor(hoteles$Dist_km_Downtown, hoteles$booking_price, use = "complete.obs")
tibble(
Plataforma = c("AirBnB", "Hotel"),
`Correlación (Dist vs Precio/noche)` = round(c(cor_ab, cor_hot), 3),
Interpretación = c(
ifelse(cor_ab < 0, "A mayor distancia, menor precio", "A mayor distancia, mayor precio"),
ifelse(cor_hot < 0, "A mayor distancia, menor precio", "A mayor distancia, mayor precio")
)
) %>% kable(caption = "Correlación de Pearson: Distancia al centro vs Precio por noche")| Plataforma | Correlación (Dist vs Precio/noche) | Interpretación |
|---|---|---|
| AirBnB | 0.134 | A mayor distancia, mayor precio |
| Hotel | 0.300 | A mayor distancia, mayor precio |
combined %>%
group_by(zona, tipo) %>%
summarise(
precio_med = median(booking_price, na.rm = TRUE),
rating_med = median(rating_10, na.rm = TRUE),
resenas_med = median(number_reviews, na.rm = TRUE),
n = n(),
.groups = "drop"
) %>%
kable(digits = 2,
col.names = c("Zona", "Tipo", "Precio/noche med (MXN)",
"Rating med (/10)", "Reseñas medianas", "N"),
caption = "Precio por noche, satisfacción y volumen por zona y plataforma")| Zona | Tipo | Precio/noche med (MXN) | Rating med (/10) | Reseñas medianas | N |
|---|---|---|---|---|---|
| Centro (<3 km) | AirBnB | 794.85 | 9.90 | 14 | 405 |
| Centro (<3 km) | Hotel | 1622.11 | 8.50 | 111 | 85 |
| Intermedia (3-7 km) | AirBnB | 987.19 | 9.86 | 9 | 480 |
| Intermedia (3-7 km) | Hotel | 2758.54 | 8.80 | 368 | 5 |
| Periférica (7-15 km) | AirBnB | 1089.62 | 9.91 | 1 | 376 |
| Exterior (>15 km) | AirBnB | 1198.58 | 9.86 | 6 | 519 |
combined %>%
group_by(zona, tipo) %>%
summarise(precio_mean = mean(booking_price, na.rm = TRUE), .groups = "drop") %>%
ggplot(aes(x = tipo, y = zona, fill = precio_mean)) +
geom_tile(color = "white", linewidth = 0.8) +
geom_text(aes(label = comma(round(precio_mean))),
size = 4, color = "white", fontface = "bold") +
scale_fill_viridis(option = "inferno", name = "Precio\npromedio\n(MXN)") +
labs(title = "Heatmap de precios promedio por noche – zona y plataforma",
x = NULL, y = NULL) +
theme_minimal()airbnb_q <- airbnb %>%
filter(!is.na(overall_raiting), !is.na(booking_price)) %>%
mutate(
precio_alto = booking_price > median(booking_price, na.rm = TRUE),
rating_alto = overall_raiting > median(overall_raiting, na.rm = TRUE),
cuadrante = case_when(
precio_alto & rating_alto ~ "Alto precio / Alto rating",
precio_alto & !rating_alto ~ "Alto precio / Bajo rating",
!precio_alto & rating_alto ~ "Bajo precio / Alto rating ⭐",
TRUE ~ "Bajo precio / Bajo rating"
)
)
airbnb_q %>%
count(zona, cuadrante) %>%
group_by(zona) %>%
mutate(pct = percent(n / sum(n), accuracy = 1)) %>%
arrange(zona, desc(n)) %>%
kable(caption = "Cuadrantes precio/noche-rating por zona – AirBnB")| zona | cuadrante | n | pct |
|---|---|---|---|
| Centro (<3 km) | Bajo precio / Alto rating ⭐ | 122 | 36% |
| Centro (<3 km) | Alto precio / Bajo rating | 87 | 25% |
| Centro (<3 km) | Bajo precio / Bajo rating | 82 | 24% |
| Centro (<3 km) | Alto precio / Alto rating | 52 | 15% |
| Intermedia (3-7 km) | Bajo precio / Bajo rating | 106 | 29% |
| Intermedia (3-7 km) | Bajo precio / Alto rating ⭐ | 89 | 25% |
| Intermedia (3-7 km) | Alto precio / Bajo rating | 88 | 24% |
| Intermedia (3-7 km) | Alto precio / Alto rating | 80 | 22% |
| Periférica (7-15 km) | Alto precio / Alto rating | 59 | 27% |
| Periférica (7-15 km) | Bajo precio / Bajo rating | 57 | 26% |
| Periférica (7-15 km) | Bajo precio / Alto rating ⭐ | 54 | 25% |
| Periférica (7-15 km) | Alto precio / Bajo rating | 48 | 22% |
| Exterior (>15 km) | Alto precio / Bajo rating | 128 | 32% |
| Exterior (>15 km) | Alto precio / Alto rating | 116 | 29% |
| Exterior (>15 km) | Bajo precio / Bajo rating | 91 | 23% |
| Exterior (>15 km) | Bajo precio / Alto rating ⭐ | 59 | 15% |
bind_rows(
airbnb %>% select(zona, number_reviews, tipo),
hoteles %>% select(zona, number_reviews, tipo)
) %>%
group_by(zona, tipo) %>%
summarise(
total_resenas = sum(number_reviews, na.rm = TRUE),
resenas_median = median(number_reviews, na.rm = TRUE),
n_propiedades = n(),
resenas_por_propiedad = round(sum(number_reviews, na.rm = TRUE) / n(), 1),
.groups = "drop"
) %>%
arrange(tipo, desc(total_resenas)) %>%
kable(caption = "Volumen de reseñas por zona y plataforma")| zona | tipo | total_resenas | resenas_median | n_propiedades | resenas_por_propiedad |
|---|---|---|---|---|---|
| Exterior (>15 km) | AirBnB | 20772 | 6 | 519 | 40.0 |
| Intermedia (3-7 km) | AirBnB | 18397 | 9 | 480 | 38.3 |
| Centro (<3 km) | AirBnB | 15905 | 14 | 405 | 39.3 |
| Periférica (7-15 km) | AirBnB | 8490 | 1 | 376 | 22.6 |
| Centro (<3 km) | Hotel | 52110 | 111 | 85 | 613.1 |
| Intermedia (3-7 km) | Hotel | 1696 | 368 | 5 | 339.2 |
precio_zona_ab <- airbnb %>%
group_by(zona) %>%
summarise(
ab_p25 = quantile(booking_price, 0.25, na.rm = TRUE),
ab_med = median(booking_price, na.rm = TRUE),
ab_p75 = quantile(booking_price, 0.75, na.rm = TRUE),
ab_rating_med = median(overall_raiting, na.rm = TRUE),
.groups = "drop"
)
precio_zona_hot <- hoteles %>%
group_by(zona) %>%
summarise(
hot_p25 = quantile(booking_price, 0.25, na.rm = TRUE),
hot_med = median(booking_price, na.rm = TRUE),
hot_p75 = quantile(booking_price, 0.75, na.rm = TRUE),
hot_rating_med = median(overall_raiting, na.rm = TRUE),
.groups = "drop"
)
estrategia <- left_join(precio_zona_ab, precio_zona_hot, by = "zona") %>%
mutate(
brecha_precio = hot_med - ab_med,
brecha_rating = hot_rating_med - (ab_rating_med * 2),
precio_optimo = round(ab_med * 1.05, 0),
recomendacion = case_when(
brecha_precio > 300 & brecha_rating < 0 ~
"Reducir precio + mejorar servicio urgente",
brecha_precio > 300 & brecha_rating >= 0 ~
"Reducir precio – el servicio ya es competitivo",
brecha_precio <= 0 ~
"Precio ya competitivo – mantener y mejorar diferenciadores",
TRUE ~
"Ajuste moderado de precio + reforzar amenidades"
)
)
kable(
estrategia %>%
select(zona, ab_med, hot_med, brecha_precio,
ab_rating_med, hot_rating_med, precio_optimo, recomendacion),
digits = 0,
col.names = c("Zona", "AirBnB Med/noche", "Hotel Med/noche", "Brecha",
"Rating AB (/5)", "Rating Hot (/10)",
"Precio Óptimo Hotel", "Recomendación"),
caption = "Estrategia de precios por noche: hoteles vs AirBnB"
)| Zona | AirBnB Med/noche | Hotel Med/noche | Brecha | Rating AB (/5) | Rating Hot (/10) | Precio Óptimo Hotel | Recomendación |
|---|---|---|---|---|---|---|---|
| Centro (<3 km) | 795 | 1622 | 827 | 5 | 8 | 835 | Reducir precio + mejorar servicio urgente |
| Intermedia (3-7 km) | 987 | 2759 | 1771 | 5 | 9 | 1037 | Reducir precio + mejorar servicio urgente |
| Periférica (7-15 km) | 1090 | NA | NA | 5 | NA | 1144 | Ajuste moderado de precio + reforzar amenidades |
| Exterior (>15 km) | 1199 | NA | NA | 5 | NA | 1259 | Ajuste moderado de precio + reforzar amenidades |
datos_log <- combined %>%
filter(!is.na(booking_price), !is.na(Dist_km_Downtown), !is.na(rating_10)) %>%
mutate(es_airbnb = as.integer(tipo == "AirBnB"))
modelo_log <- glm(es_airbnb ~ booking_price + Dist_km_Downtown + rating_10,
data = datos_log, family = binomial(link = "logit"))
cat("═════════════════════════════════════════════════════════════════════════════\n")## ═════════════════════════════════════════════════════════════════════════════
## MODELO LOGÍSTICO — CLASIFICACIÓN AirBnB vs HOTEL
## ═════════════════════════════════════════════════════════════════════════════
##
## Call:
## glm(formula = es_airbnb ~ booking_price + Dist_km_Downtown +
## rating_10, family = binomial(link = "logit"), data = datos_log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.081e+01 1.427e+00 -7.577 3.55e-14 ***
## booking_price -6.444e-04 1.186e-04 -5.432 5.59e-08 ***
## Dist_km_Downtown 1.437e+00 1.838e-01 7.822 5.18e-15 ***
## rating_10 1.149e+00 1.384e-01 8.301 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 669.14 on 1407 degrees of freedom
## Residual deviance: 288.75 on 1404 degrees of freedom
## AIC: 296.75
##
## Number of Fisher Scoring iterations: 11
set.seed(42)
folds <- caret::createFolds(datos_log$es_airbnb, k = 5, returnTrain = TRUE)
cv_results <- sapply(seq_along(folds), function(i) {
train_idx <- folds[[i]]
test_idx <- setdiff(seq_len(nrow(datos_log)), train_idx)
datos_train <- datos_log[train_idx, ]
datos_test <- datos_log[test_idx, ]
mod_cv <- glm(es_airbnb ~ booking_price + Dist_km_Downtown + rating_10,
data = datos_train, family = binomial(link = "logit"))
pred_prob <- predict(mod_cv, datos_test, type = "response")
pred_cv <- ifelse(pred_prob > 0.5, 1, 0)
accuracy <- mean(pred_cv == datos_test$es_airbnb, na.rm = TRUE)
tp <- sum(pred_cv == 1 & datos_test$es_airbnb == 1, na.rm = TRUE)
fp <- sum(pred_cv == 1 & datos_test$es_airbnb == 0, na.rm = TRUE)
tn <- sum(pred_cv == 0 & datos_test$es_airbnb == 0, na.rm = TRUE)
fn <- sum(pred_cv == 0 & datos_test$es_airbnb == 1, na.rm = TRUE)
sensitivity <- if (tp + fn > 0) tp / (tp + fn) else NA
specificity <- if (tn + fp > 0) tn / (tn + fp) else NA
c(Accuracy = accuracy, Sensitivity = sensitivity, Specificity = specificity)
})
cv_summary <- data.frame(
Fold = paste0("Fold ", 1:5),
Accuracy = round(cv_results["Accuracy", ], 4),
Sensitivity = round(cv_results["Sensitivity", ], 4),
Specificity = round(cv_results["Specificity", ], 4)
)
kable(cv_summary, caption = "Validación cruzada 5-fold — Modelo logístico")| Fold | Accuracy | Sensitivity | Specificity |
|---|---|---|---|
| Fold 1 | 0.9468 | 0.9885 | 0.4286 |
| Fold 2 | 0.9609 | 0.9703 | 0.7500 |
| Fold 3 | 0.9573 | 0.9848 | 0.5556 |
| Fold 4 | 0.9539 | 0.9885 | 0.5000 |
| Fold 5 | 0.9574 | 0.9924 | 0.4737 |
##
## ┌─────────────────────────────────────────────────────────────────────────────┐
## │ RESUMEN VALIDACIÓN CRUZADA │
## ├─────────────────────────────────────────────────────────────────────────────┤
cat("│ Accuracy media: ", sprintf("%6.2f%%", mean(cv_results["Accuracy", ], na.rm = TRUE) * 100), " │\n")## │ Accuracy media: 95.53% │
cat("│ Sensitivity media: ", sprintf("%6.2f%%", mean(cv_results["Sensitivity", ], na.rm = TRUE) * 100), " (capacidad detectar AirBnB) │\n")## │ Sensitivity media: 98.49% (capacidad detectar AirBnB) │
cat("│ Specificity media: ", sprintf("%6.2f%%", mean(cv_results["Specificity", ], na.rm = TRUE) * 100), " (capacidad detectar Hotel) │\n")## │ Specificity media: 54.16% (capacidad detectar Hotel) │
## └─────────────────────────────────────────────────────────────────────────────┘
datos_ols <- combined %>%
filter(!is.na(booking_price), !is.na(Dist_km_Downtown), !is.na(rating_10)) %>%
mutate(log_precio = log(booking_price))
modelo_ols <- lm(log_precio ~ Dist_km_Downtown + rating_10 + tipo + zona,
data = datos_ols)
cat("═════════════════════════════════════════════════════════════════════════════\n")## ═════════════════════════════════════════════════════════════════════════════
## DIAGNÓSTICO DE SUPUESTOS — MODELO OLS
## ═════════════════════════════════════════════════════════════════════════════
## 1. HOMOCEDASTICIDAD (Breusch-Pagan)
## H0: Varianza constante
## Estadístico: 34.5583
## p-valor: 5.25e-06
if (bp_test$p.value < 0.05) {
cat(" ⚠️ RECHAZAR H0 → Heterocedasticidad presente\n")
} else {
cat(" ✓ ACEPTAR H0 → Homocedasticidad satisfecha\n")
}## ⚠️ RECHAZAR H0 → Heterocedasticidad presente
resid_muestra <- if (nrow(datos_ols) > 5000) {
sample(residuals(modelo_ols), 5000)
} else {
residuals(modelo_ols)
}
sw_test <- shapiro.test(resid_muestra)
cat("\n2. NORMALIDAD (Shapiro-Wilk)\n")##
## 2. NORMALIDAD (Shapiro-Wilk)
## H0: Residuos distribuidos normalmente
## Estadístico: 0.9855
## p-valor: 1.15e-10
if (sw_test$p.value < 0.05) {
cat(" ⚠️ RECHAZAR H0 → Residuos no normales\n")
} else {
cat(" ✓ ACEPTAR H0 → Normalidad satisfecha\n")
}## ⚠️ RECHAZAR H0 → Residuos no normales
##
## 3. MULTICOLINEALIDAD (Variance Inflation Factor)
## GVIF Df GVIF^(1/(2*Df))
## Dist_km_Downtown 5.775995 1 2.403330
## rating_10 1.197320 1 1.094221
## tipo 1.362168 1 1.167120
## zona 6.379700 3 1.361863
if (max(vif_vals) < 5) {
cat(" ✓ Todos VIF < 5 → Sin problemas de multicolinealidad\n")
} else {
cat(" ⚠️ Algunos VIF ≥ 5 → Considerar remover variables\n")
}## ⚠️ Algunos VIF ≥ 5 → Considerar remover variables
##
## 4. BONDAD DE AJUSTE
## R² ajustado: 0.0992
## F-statistic: 26.84
cat(" p-valor F:", format.pval(pf(summary(modelo_ols)$fstatistic[1],
summary(modelo_ols)$fstatistic[2],
summary(modelo_ols)$fstatistic[3],
lower.tail = FALSE), digits = 3), "\n")## p-valor F: <2e-16
supuestos_tabla <- data.frame(
Supuesto = c(
"Homocedasticidad",
"Normalidad residuos",
"Multicolinealidad",
"Autocorrelación espacial"
),
Test = c(
"Breusch-Pagan",
"Shapiro-Wilk",
"VIF máximo",
"Moran's I (próx.)"
),
Estadístico = c(
round(bp_test$statistic, 3),
round(sw_test$statistic, 3),
round(max(vif_vals), 2),
"—"
),
p_valor = c(
format.pval(bp_test$p.value, 2),
format.pval(sw_test$p.value, 2),
"< 5",
"—"
),
Estado = c(
if (bp_test$p.value < 0.05) "⚠️ Hetero" else "✓ OK",
if (sw_test$p.value < 0.05) "⚠️ No normal" else "✓ OK",
if (max(vif_vals) < 5) "✓ OK" else "⚠️ Alto",
"Pendiente"
),
Remedio = c(
"HC3 errors | GLM",
"Log transf. ✓ | Box-Cox",
"Revisar correlaciones",
"SAR/GWR (Etapa 3)"
),
stringsAsFactors = FALSE
)
kable(supuestos_tabla,
caption = "Tabla resumen: Supuestos OLS, diagnóstico y remedios")| Supuesto | Test | Estadístico | p_valor | Estado | Remedio |
|---|---|---|---|---|---|
| Homocedasticidad | Breusch-Pagan | 34.558 | 5.2e-06 | ⚠️ Hetero | HC3 errors | GLM |
| Normalidad residuos | Shapiro-Wilk | 0.985 | 1.2e-10 | ⚠️ No normal | Log transf. ✓ | Box-Cox |
| Multicolinealidad | VIF máximo | 6.38 | < 5 | ⚠️ Alto | Revisar correlaciones |
| Autocorrelación espacial | Moran’s I (próx.) | — | — | Pendiente | SAR/GWR (Etapa 3) |
##
## ┌─────────────────────────────────────────────────────────────────────────────┐
## │ REMEDIOS RECOMENDADOS PARA SUPUESTOS NO SATISFECHOS │
## ├─────────────────────────────────────────────────────────────────────────────┤
## │ │
## │ PROBLEMA 1: Heterocedasticidad (Breusch-Pagan p < 0.05) │
## │ │
## │ SOLUCIÓN A: Errores robustos HC3 (mantener OLS, ajustar errores) │
## │ library(sandwich) │
## │ library(lmtest) │
## │ coeftest(modelo_ols, vcov = vcovHC(modelo_ols, type = 'HC3')) │
## │ │
## │ SOLUCIÓN B: Usar GLM con link function │
## │ modelo_glm <- glm(booking_price ~ ... , family = gaussian(link='log')) │
## │ │
## ├─────────────────────────────────────────────────────────────────────────────┤
## │ │
## │ PROBLEMA 2: Residuos no normales (Shapiro-Wilk p < 0.05) │
## │ │
## │ SOLUCIÓN A: Transformación log ya aplicada (precio → log(precio)) ✓ │
## │ │
## │ SOLUCIÓN B: Box-Cox para transformación óptima │
## │ library(MASS) │
## │ bc <- boxcox(modelo_ols) │
## │ lambda <- bc$x[which.max(bc$y)] │
## │ │
## ├─────────────────────────────────────────────────────────────────────────────┤
## │ │
## │ ALTERNATIVA GLOBAL: Modelo SAR (Spatial Autoregressive) — Etapa 3 │
## │ │
## │ Captura autocorrelación espacial detectada en Moran's I │
## │ Corrije simultaneously heterocedasticidad y correlación espacial │
## │ │
## └─────────────────────────────────────────────────────────────────────────────┘
# Recargar datos por si acaso
airbnb_raw <- read_csv("AirBnB_Monterrey_FINAL_ES (3) (1).csv")
hoteles_raw <- read_csv("hoteles_booking_limpio (3).csv")
zona_labs <- c("Centro (<3 km)", "Intermedia (3-7 km)",
"Periférica (7-15 km)", "Exterior (>15 km)")
airbnb <- airbnb_raw %>%
mutate(
zona = factor(
case_when(
dist_km_downtown < 3 ~ "Centro (<3 km)",
dist_km_downtown < 7 ~ "Intermedia (3-7 km)",
dist_km_downtown < 15 ~ "Periférica (7-15 km)",
TRUE ~ "Exterior (>15 km)"
),
levels = zona_labs
),
tipo = "AirBnB",
overall_raiting = calificacion_satisfaccion_huesped,
booking_price = precio_booking / num_noches,
number_reviews = num_resenas,
Dist_km_Downtown = dist_km_downtown,
lat = latitud,
lon = longitud,
rating_10 = overall_raiting * 2
)
hoteles <- hoteles_raw %>%
mutate(
zona = factor(zona, levels = zona_labs),
tipo = "Hotel",
overall_raiting = Calificacion_1,
booking_price = Precio,
number_reviews = No_Comentarios,
Dist_km_Downtown = Dist_km_Centro,
lat = Lat,
lon = Lon,
rating_10 = overall_raiting
)
# ═══════════════════════════════════════════════════════════════════════════════
# TEST t DE WELCH
# ═══════════════════════════════════════════════════════════════════════════════
precio_airbnb <- airbnb$booking_price[!is.na(airbnb$booking_price)]
precio_hotel <- hoteles$booking_price[!is.na(hoteles$booking_price)]
cat("═════════════════════════════════════════════════════════════════════════════\n")## ═════════════════════════════════════════════════════════════════════════════
## TEST t DE WELCH — Diferencia de precios AirBnB vs Hotel
## ═════════════════════════════════════════════════════════════════════════════
## H0: μ(AirBnB) = μ(Hotel)
## H1: μ(AirBnB) ≠ μ(Hotel)
## Media AirBnB: $ 1330 /noche
## Media Hotel: $ 1837 /noche
cat("Diferencia: $", round(mean(precio_airbnb, na.rm = TRUE) - mean(precio_hotel, na.rm = TRUE), 0), "\n\n")## Diferencia: $ -507
t_welch <- t.test(precio_airbnb, precio_hotel, var.equal = FALSE)
cat("t-estadístico: ", round(t_welch$statistic, 4), "\n")## t-estadístico: -4.4094
## gl (aprox): 100.4
## p-valor: 2.61e-05
if (t_welch$p.value < 0.05) {
cat("✓ RECHAZAR H0 → Diferencia SIGNIFICATIVA (p < 0.05)\n")
} else {
cat("⚠️ ACEPTAR H0 → Diferencia NO significativa (p ≥ 0.05)\n")
}## ✓ RECHAZAR H0 → Diferencia SIGNIFICATIVA (p < 0.05)
# ═══════════════════════════════════════════════════════════════════════════════
# TEST DE WILCOXON
# ═══════════════════════════════════════════════════════════════════════════════
rating_airbnb <- airbnb$rating_10[!is.na(airbnb$rating_10)]
rating_hotel <- hoteles$rating_10[!is.na(hoteles$rating_10)]
cat("\n═════════════════════════════════════════════════════════════════════════════\n")##
## ═════════════════════════════════════════════════════════════════════════════
## TEST DE WILCOXON (MANN-WHITNEY) — Diferencia de ratings AirBnB vs Hotel
## ═════════════════════════════════════════════════════════════════════════════
## H0: Distribuciones de rating son iguales
## H1: Distribuciones son diferentes
## Mediana AirBnB: 9.88 /10
## Mediana Hotel: 8.55 /10
cat("Diferencia: ", round(median(rating_airbnb, na.rm = TRUE) - median(rating_hotel, na.rm = TRUE), 2), "\n\n")## Diferencia: 1.33
wilcoxon_test <- wilcox.test(rating_airbnb, rating_hotel, alternative = "two.sided")
cat("Estadístico U: ", round(wilcoxon_test$statistic, 0), "\n")## Estadístico U: 102168
## p-valor: <2e-16
if (wilcoxon_test$p.value < 0.05) {
cat("✓ RECHAZAR H0 → Distribuciones DIFERENTES (p < 0.05)\n")
} else {
cat("⚠️ ACEPTAR H0 → Distribuciones iguales (p ≥ 0.05)\n")
}## ✓ RECHAZAR H0 → Distribuciones DIFERENTES (p < 0.05)
# ═══════════════════════════════════════════════════════════════════════════════
# TABLA RESUMEN
# ═══════════════════════════════════════════════════════════════════════════════
tests_tabla <- data.frame(
Test = c("t-Welch", "Wilcoxon"),
Contraste = c(
"μ(AirBnB precio) = μ(Hotel precio)",
"Me(AirBnB rating) = Me(Hotel rating)"
),
Estadístico = c(
round(t_welch$statistic, 3),
round(wilcoxon_test$statistic, 0)
),
p_valor = c(
format.pval(t_welch$p.value, digits = 2),
format.pval(wilcoxon_test$p.value, digits = 2)
),
Decisión = c(
if (t_welch$p.value < 0.05) "Rechazar H0" else "Aceptar H0",
if (wilcoxon_test$p.value < 0.05) "Rechazar H0" else "Aceptar H0"
),
Conclusión = c(
"Precios significativamente diferentes",
"Ratings significativamente diferentes"
),
stringsAsFactors = FALSE
)
kable(tests_tabla,
caption = "Resumen de tests estadísticos (α = 0.05)")| Test | Contraste | Estadístico | p_valor | Decisión | Conclusión | |
|---|---|---|---|---|---|---|
| t | t-Welch | μ(AirBnB precio) = μ(Hotel precio) | -4.409 | 2.6e-05 | Rechazar H0 | Precios significativamente diferentes |
| W | Wilcoxon | Me(AirBnB rating) = Me(Hotel rating) | 102168.000 | <2e-16 | Rechazar H0 | Ratings significativamente diferentes |
Equipo Épsilon · Tec de Monterrey · AD3003B Planeación Estratégica Basada en Analítica Prescriptiva Socio Formador: DATLAS / DASHA MTY · Asesor: Dr. Pedro Vallejo