1 1. Carga de Datos y Librerías

library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)
library(leaflet)
library(viridis)
library(knitr)
library(caret)
library(car)
library(lmtest)

# ── AirBnB (scraper propio, 1 780 registros) ─────────────────────────────────
airbnb_raw <- read_csv("AirBnB_Monterrey_FINAL_ES (3) (1).csv")

# ── Hoteles Booking (90 registros con geo) ────────────────────────────────────
hoteles_raw <- read_csv("hoteles_booking_limpio (3).csv")

# ── Etiquetas de zona ─────────────────────────────────────────────────────────
zona_labs <- c("Centro (<3 km)", "Intermedia (3-7 km)",
               "Periférica (7-15 km)", "Exterior (>15 km)")

# ── AirBnB limpio ─────────────────────────────────────────────────────────────
airbnb <- airbnb_raw %>%
  mutate(
    zona = factor(
      case_when(
        dist_km_downtown <  3  ~ "Centro (<3 km)",
        dist_km_downtown <  7  ~ "Intermedia (3-7 km)",
        dist_km_downtown < 15  ~ "Periférica (7-15 km)",
        TRUE                   ~ "Exterior (>15 km)"
      ),
      levels = zona_labs
    ),
    tipo             = "AirBnB",
    overall_raiting  = calificacion_satisfaccion_huesped,
    booking_price    = precio_booking / num_noches,
    number_reviews   = num_resenas,
    Dist_km_Downtown = dist_km_downtown,
    lat              = latitud,
    lon              = longitud,
    rating_10        = overall_raiting * 2
  )

# ── Hoteles limpio ────────────────────────────────────────────────────────────
hoteles <- hoteles_raw %>%
  mutate(
    zona = factor(zona, levels = zona_labs),
    tipo             = "Hotel",
    overall_raiting  = Calificacion_1,
    booking_price    = Precio,
    number_reviews   = No_Comentarios,
    Dist_km_Downtown = Dist_km_Centro,
    lat              = Lat,
    lon              = Lon,
    rating_10        = overall_raiting
  )

# ── Dataset combinado ─────────────────────────────────────────────────────────
combined <- bind_rows(
  airbnb  %>% select(tipo, zona, booking_price, rating_10, number_reviews,
                     Dist_km_Downtown, lat, lon),
  hoteles %>% select(tipo, zona, booking_price, rating_10, number_reviews,
                     Dist_km_Downtown, lat, lon)
)

2 2. Pregunta A – Distancia al centro vs Precio

2.1 Scatter + Regresión

ggplot(combined, aes(x = Dist_km_Downtown, y = booking_price, color = tipo)) +
  geom_point(alpha = 0.4, size = 2) +
  geom_smooth(method = "lm", se = TRUE, linewidth = 1.2) +
  scale_color_manual(values = c("AirBnB" = "#FF5A5F", "Hotel" = "#00A699")) +
  scale_y_continuous(
    labels = comma_format(prefix = "$"),
    limits = c(0, quantile(combined$booking_price, 0.97, na.rm = TRUE))
  ) +
  labs(title = "Precio por noche vs Distancia al centro – AirBnB y Hoteles",
       subtitle = "Línea de regresión lineal por plataforma",
       x = "Distancia al centro (km)", y = "Precio por noche (MXN)", color = NULL) +
  theme_minimal()

2.2 Correlación Pearson

cor_ab  <- cor(airbnb$Dist_km_Downtown,  airbnb$booking_price,  use = "complete.obs")
cor_hot <- cor(hoteles$Dist_km_Downtown, hoteles$booking_price, use = "complete.obs")

tibble(
  Plataforma = c("AirBnB", "Hotel"),
  `Correlación (Dist vs Precio/noche)` = round(c(cor_ab, cor_hot), 3),
  Interpretación = c(
    ifelse(cor_ab  < 0, "A mayor distancia, menor precio", "A mayor distancia, mayor precio"),
    ifelse(cor_hot < 0, "A mayor distancia, menor precio", "A mayor distancia, mayor precio")
  )
) %>% kable(caption = "Correlación de Pearson: Distancia al centro vs Precio por noche")

Correlación de Pearson: Distancia al centro vs Precio por noche
Plataforma	Correlación (Dist vs Precio/noche)	Interpretación
AirBnB	0.134	A mayor distancia, mayor precio
Hotel	0.300	A mayor distancia, mayor precio

3 3. Pregunta B – Localización, precio, satisfacción y gasto total

3.1 Tabla Resumen

combined %>%
  group_by(zona, tipo) %>%
  summarise(
    precio_med  = median(booking_price,   na.rm = TRUE),
    rating_med  = median(rating_10,       na.rm = TRUE),
    resenas_med = median(number_reviews,  na.rm = TRUE),
    n           = n(),
    .groups = "drop"
  ) %>%
  kable(digits = 2,
        col.names = c("Zona", "Tipo", "Precio/noche med (MXN)",
                      "Rating med (/10)", "Reseñas medianas", "N"),
        caption = "Precio por noche, satisfacción y volumen por zona y plataforma")

Precio por noche, satisfacción y volumen por zona y plataforma
Zona	Tipo	Precio/noche med (MXN)	Rating med (/10)	Reseñas medianas	N
Centro (<3 km)	AirBnB	794.85	9.90	14	405
Centro (<3 km)	Hotel	1622.11	8.50	111	85
Intermedia (3-7 km)	AirBnB	987.19	9.86	9	480
Intermedia (3-7 km)	Hotel	2758.54	8.80	368	5
Periférica (7-15 km)	AirBnB	1089.62	9.91	1	376
Exterior (>15 km)	AirBnB	1198.58	9.86	6	519

4 4. Pregunta C – Zonas con precios extremos

4.1 Heatmap

combined %>%
  group_by(zona, tipo) %>%
  summarise(precio_mean = mean(booking_price, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = tipo, y = zona, fill = precio_mean)) +
  geom_tile(color = "white", linewidth = 0.8) +
  geom_text(aes(label = comma(round(precio_mean))),
            size = 4, color = "white", fontface = "bold") +
  scale_fill_viridis(option = "inferno", name = "Precio\npromedio\n(MXN)") +
  labs(title = "Heatmap de precios promedio por noche – zona y plataforma",
       x = NULL, y = NULL) +
  theme_minimal()

5 5. Pregunta D – Correlación espacial

5.1 Cuadrantes

airbnb_q <- airbnb %>%
  filter(!is.na(overall_raiting), !is.na(booking_price)) %>%
  mutate(
    precio_alto = booking_price   > median(booking_price,    na.rm = TRUE),
    rating_alto = overall_raiting > median(overall_raiting,  na.rm = TRUE),
    cuadrante = case_when(
       precio_alto &  rating_alto ~ "Alto precio / Alto rating",
       precio_alto & !rating_alto ~ "Alto precio / Bajo rating",
      !precio_alto &  rating_alto ~ "Bajo precio / Alto rating ⭐",
      TRUE                        ~ "Bajo precio / Bajo rating"
    )
  )

airbnb_q %>%
  count(zona, cuadrante) %>%
  group_by(zona) %>%
  mutate(pct = percent(n / sum(n), accuracy = 1)) %>%
  arrange(zona, desc(n)) %>%
  kable(caption = "Cuadrantes precio/noche-rating por zona – AirBnB")

Cuadrantes precio/noche-rating por zona – AirBnB
zona	cuadrante	n	pct
Centro (<3 km)	Bajo precio / Alto rating ⭐	122	36%
Centro (<3 km)	Alto precio / Bajo rating	87	25%
Centro (<3 km)	Bajo precio / Bajo rating	82	24%
Centro (<3 km)	Alto precio / Alto rating	52	15%
Intermedia (3-7 km)	Bajo precio / Bajo rating	106	29%
Intermedia (3-7 km)	Bajo precio / Alto rating ⭐	89	25%
Intermedia (3-7 km)	Alto precio / Bajo rating	88	24%
Intermedia (3-7 km)	Alto precio / Alto rating	80	22%
Periférica (7-15 km)	Alto precio / Alto rating	59	27%
Periférica (7-15 km)	Bajo precio / Bajo rating	57	26%
Periférica (7-15 km)	Bajo precio / Alto rating ⭐	54	25%
Periférica (7-15 km)	Alto precio / Bajo rating	48	22%
Exterior (>15 km)	Alto precio / Bajo rating	128	32%
Exterior (>15 km)	Alto precio / Alto rating	116	29%
Exterior (>15 km)	Bajo precio / Bajo rating	91	23%
Exterior (>15 km)	Bajo precio / Alto rating ⭐	59	15%

6 6. Pregunta E – Diferencias en reseñas

6.1 Volumen de Reseñas

bind_rows(
  airbnb  %>% select(zona, number_reviews, tipo),
  hoteles %>% select(zona, number_reviews, tipo)
) %>%
  group_by(zona, tipo) %>%
  summarise(
    total_resenas         = sum(number_reviews, na.rm = TRUE),
    resenas_median        = median(number_reviews, na.rm = TRUE),
    n_propiedades         = n(),
    resenas_por_propiedad = round(sum(number_reviews, na.rm = TRUE) / n(), 1),
    .groups = "drop"
  ) %>%
  arrange(tipo, desc(total_resenas)) %>%
  kable(caption = "Volumen de reseñas por zona y plataforma")

Volumen de reseñas por zona y plataforma
zona	tipo	total_resenas	resenas_median	n_propiedades	resenas_por_propiedad
Exterior (>15 km)	AirBnB	20772	6	519	40.0
Intermedia (3-7 km)	AirBnB	18397	9	480	38.3
Centro (<3 km)	AirBnB	15905	14	405	39.3
Periférica (7-15 km)	AirBnB	8490	1	376	22.6
Centro (<3 km)	Hotel	52110	111	85	613.1
Intermedia (3-7 km)	Hotel	1696	368	5	339.2

7 7. Pregunta F – Estrategia óptima de precios

7.1 Tabla Estrategia

precio_zona_ab <- airbnb %>%
  group_by(zona) %>%
  summarise(
    ab_p25        = quantile(booking_price, 0.25, na.rm = TRUE),
    ab_med        = median(booking_price,         na.rm = TRUE),
    ab_p75        = quantile(booking_price, 0.75, na.rm = TRUE),
    ab_rating_med = median(overall_raiting,       na.rm = TRUE),
    .groups = "drop"
  )

precio_zona_hot <- hoteles %>%
  group_by(zona) %>%
  summarise(
    hot_p25        = quantile(booking_price, 0.25, na.rm = TRUE),
    hot_med        = median(booking_price,         na.rm = TRUE),
    hot_p75        = quantile(booking_price, 0.75, na.rm = TRUE),
    hot_rating_med = median(overall_raiting,       na.rm = TRUE),
    .groups = "drop"
  )

estrategia <- left_join(precio_zona_ab, precio_zona_hot, by = "zona") %>%
  mutate(
    brecha_precio = hot_med - ab_med,
    brecha_rating = hot_rating_med - (ab_rating_med * 2),
    precio_optimo = round(ab_med * 1.05, 0),
    recomendacion = case_when(
      brecha_precio > 300 & brecha_rating < 0 ~
        "Reducir precio + mejorar servicio urgente",
      brecha_precio > 300 & brecha_rating >= 0 ~
        "Reducir precio – el servicio ya es competitivo",
      brecha_precio <= 0 ~
        "Precio ya competitivo – mantener y mejorar diferenciadores",
      TRUE ~
        "Ajuste moderado de precio + reforzar amenidades"
    )
  )

kable(
  estrategia %>%
    select(zona, ab_med, hot_med, brecha_precio,
           ab_rating_med, hot_rating_med, precio_optimo, recomendacion),
  digits = 0,
  col.names = c("Zona", "AirBnB Med/noche", "Hotel Med/noche", "Brecha",
                "Rating AB (/5)", "Rating Hot (/10)",
                "Precio Óptimo Hotel", "Recomendación"),
  caption = "Estrategia de precios por noche: hoteles vs AirBnB"
)

Estrategia de precios por noche: hoteles vs AirBnB
Zona	AirBnB Med/noche	Hotel Med/noche	Brecha	Rating AB (/5)	Rating Hot (/10)	Precio Óptimo Hotel	Recomendación
Centro (<3 km)	795	1622	827	5	8	835	Reducir precio + mejorar servicio urgente
Intermedia (3-7 km)	987	2759	1771	5	9	1037	Reducir precio + mejorar servicio urgente
Periférica (7-15 km)	1090	NA	NA	5	NA	1144	Ajuste moderado de precio + reforzar amenidades
Exterior (>15 km)	1199	NA	NA	5	NA	1259	Ajuste moderado de precio + reforzar amenidades

8 8. Modelo Logístico con Validación Cruzada

8.1 Especificación

datos_log <- combined %>%
  filter(!is.na(booking_price), !is.na(Dist_km_Downtown), !is.na(rating_10)) %>%
  mutate(es_airbnb = as.integer(tipo == "AirBnB"))

modelo_log <- glm(es_airbnb ~ booking_price + Dist_km_Downtown + rating_10,
                  data = datos_log, family = binomial(link = "logit"))

cat("═════════════════════════════════════════════════════════════════════════════\n")

## ═════════════════════════════════════════════════════════════════════════════

cat("MODELO LOGÍSTICO — CLASIFICACIÓN AirBnB vs HOTEL\n")

## MODELO LOGÍSTICO — CLASIFICACIÓN AirBnB vs HOTEL

cat("═════════════════════════════════════════════════════════════════════════════\n\n")

## ═════════════════════════════════════════════════════════════════════════════

print(summary(modelo_log))

## 
## Call:
## glm(formula = es_airbnb ~ booking_price + Dist_km_Downtown + 
##     rating_10, family = binomial(link = "logit"), data = datos_log)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.081e+01  1.427e+00  -7.577 3.55e-14 ***
## booking_price    -6.444e-04  1.186e-04  -5.432 5.59e-08 ***
## Dist_km_Downtown  1.437e+00  1.838e-01   7.822 5.18e-15 ***
## rating_10         1.149e+00  1.384e-01   8.301  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 669.14  on 1407  degrees of freedom
## Residual deviance: 288.75  on 1404  degrees of freedom
## AIC: 296.75
## 
## Number of Fisher Scoring iterations: 11

8.2 Validación Cruzada 5-fold

set.seed(42)
folds <- caret::createFolds(datos_log$es_airbnb, k = 5, returnTrain = TRUE)

cv_results <- sapply(seq_along(folds), function(i) {
  train_idx <- folds[[i]]
  test_idx  <- setdiff(seq_len(nrow(datos_log)), train_idx)
  
  datos_train <- datos_log[train_idx, ]
  datos_test  <- datos_log[test_idx, ]
  
  mod_cv <- glm(es_airbnb ~ booking_price + Dist_km_Downtown + rating_10,
                data = datos_train, family = binomial(link = "logit"))
  
  pred_prob <- predict(mod_cv, datos_test, type = "response")
  pred_cv <- ifelse(pred_prob > 0.5, 1, 0)
  
  accuracy <- mean(pred_cv == datos_test$es_airbnb, na.rm = TRUE)
  
  tp <- sum(pred_cv == 1 & datos_test$es_airbnb == 1, na.rm = TRUE)
  fp <- sum(pred_cv == 1 & datos_test$es_airbnb == 0, na.rm = TRUE)
  tn <- sum(pred_cv == 0 & datos_test$es_airbnb == 0, na.rm = TRUE)
  fn <- sum(pred_cv == 0 & datos_test$es_airbnb == 1, na.rm = TRUE)
  
  sensitivity <- if (tp + fn > 0) tp / (tp + fn) else NA
  specificity <- if (tn + fp > 0) tn / (tn + fp) else NA
  
  c(Accuracy = accuracy, Sensitivity = sensitivity, Specificity = specificity)
})

cv_summary <- data.frame(
  Fold = paste0("Fold ", 1:5),
  Accuracy = round(cv_results["Accuracy", ], 4),
  Sensitivity = round(cv_results["Sensitivity", ], 4),
  Specificity = round(cv_results["Specificity", ], 4)
)

kable(cv_summary, caption = "Validación cruzada 5-fold — Modelo logístico")

Validación cruzada 5-fold — Modelo logístico
Fold	Accuracy	Sensitivity	Specificity
Fold 1	0.9468	0.9885	0.4286
Fold 2	0.9609	0.9703	0.7500
Fold 3	0.9573	0.9848	0.5556
Fold 4	0.9539	0.9885	0.5000
Fold 5	0.9574	0.9924	0.4737

cat("\n┌─────────────────────────────────────────────────────────────────────────────┐\n")

## 
## ┌─────────────────────────────────────────────────────────────────────────────┐

cat("│ RESUMEN VALIDACIÓN CRUZADA                                                  │\n")

## │ RESUMEN VALIDACIÓN CRUZADA                                                  │

cat("├─────────────────────────────────────────────────────────────────────────────┤\n")

## ├─────────────────────────────────────────────────────────────────────────────┤

cat("│ Accuracy media:    ", sprintf("%6.2f%%", mean(cv_results["Accuracy", ], na.rm = TRUE) * 100), "                                           │\n")

## │ Accuracy media:      95.53%                                            │

cat("│ Sensitivity media: ", sprintf("%6.2f%%", mean(cv_results["Sensitivity", ], na.rm = TRUE) * 100), "  (capacidad detectar AirBnB)         │\n")

## │ Sensitivity media:   98.49%   (capacidad detectar AirBnB)         │

cat("│ Specificity media: ", sprintf("%6.2f%%", mean(cv_results["Specificity", ], na.rm = TRUE) * 100), "  (capacidad detectar Hotel)          │\n")

## │ Specificity media:   54.16%   (capacidad detectar Hotel)          │

cat("└─────────────────────────────────────────────────────────────────────────────┘\n")

## └─────────────────────────────────────────────────────────────────────────────┘

9 9. Diagnóstico de Supuestos OLS + Remedios

9.1 Pruebas de Supuestos

datos_ols <- combined %>%
  filter(!is.na(booking_price), !is.na(Dist_km_Downtown), !is.na(rating_10)) %>%
  mutate(log_precio = log(booking_price))

modelo_ols <- lm(log_precio ~ Dist_km_Downtown + rating_10 + tipo + zona,
                 data = datos_ols)

cat("═════════════════════════════════════════════════════════════════════════════\n")

## ═════════════════════════════════════════════════════════════════════════════

cat("DIAGNÓSTICO DE SUPUESTOS — MODELO OLS\n")

## DIAGNÓSTICO DE SUPUESTOS — MODELO OLS

cat("═════════════════════════════════════════════════════════════════════════════\n\n")

## ═════════════════════════════════════════════════════════════════════════════

bp_test <- bptest(modelo_ols)
cat("1. HOMOCEDASTICIDAD (Breusch-Pagan)\n")

## 1. HOMOCEDASTICIDAD (Breusch-Pagan)

cat("   H0: Varianza constante\n")

##    H0: Varianza constante

cat("   Estadístico:", round(bp_test$statistic, 4), "\n")

##    Estadístico: 34.5583

cat("   p-valor:", format.pval(bp_test$p.value, digits = 3), "\n")

##    p-valor: 5.25e-06

if (bp_test$p.value < 0.05) {
  cat("   ⚠️  RECHAZAR H0 → Heterocedasticidad presente\n")
} else {
  cat("   ✓ ACEPTAR H0 → Homocedasticidad satisfecha\n")
}

##    ⚠️  RECHAZAR H0 → Heterocedasticidad presente

resid_muestra <- if (nrow(datos_ols) > 5000) {
  sample(residuals(modelo_ols), 5000)
} else {
  residuals(modelo_ols)
}
sw_test <- shapiro.test(resid_muestra)
cat("\n2. NORMALIDAD (Shapiro-Wilk)\n")

## 
## 2. NORMALIDAD (Shapiro-Wilk)

cat("   H0: Residuos distribuidos normalmente\n")

##    H0: Residuos distribuidos normalmente

cat("   Estadístico:", round(sw_test$statistic, 4), "\n")

##    Estadístico: 0.9855

cat("   p-valor:", format.pval(sw_test$p.value, digits = 3), "\n")

##    p-valor: 1.15e-10

if (sw_test$p.value < 0.05) {
  cat("   ⚠️  RECHAZAR H0 → Residuos no normales\n")
} else {
  cat("   ✓ ACEPTAR H0 → Normalidad satisfecha\n")
}

##    ⚠️  RECHAZAR H0 → Residuos no normales

cat("\n3. MULTICOLINEALIDAD (Variance Inflation Factor)\n")

## 
## 3. MULTICOLINEALIDAD (Variance Inflation Factor)

vif_vals <- car::vif(modelo_ols)
print(vif_vals)

##                      GVIF Df GVIF^(1/(2*Df))
## Dist_km_Downtown 5.775995  1        2.403330
## rating_10        1.197320  1        1.094221
## tipo             1.362168  1        1.167120
## zona             6.379700  3        1.361863

if (max(vif_vals) < 5) {
  cat("   ✓ Todos VIF < 5 → Sin problemas de multicolinealidad\n")
} else {
  cat("   ⚠️  Algunos VIF ≥ 5 → Considerar remover variables\n")
}

##    ⚠️  Algunos VIF ≥ 5 → Considerar remover variables

cat("\n4. BONDAD DE AJUSTE\n")

## 
## 4. BONDAD DE AJUSTE

cat("   R² ajustado:", round(summary(modelo_ols)$adj.r.squared, 4), "\n")

##    R² ajustado: 0.0992

cat("   F-statistic:", round(summary(modelo_ols)$fstatistic[1], 2), "\n")

##    F-statistic: 26.84

cat("   p-valor F:", format.pval(pf(summary(modelo_ols)$fstatistic[1],
                                     summary(modelo_ols)$fstatistic[2],
                                     summary(modelo_ols)$fstatistic[3],
                                     lower.tail = FALSE), digits = 3), "\n")

##    p-valor F: <2e-16

9.2 Tabla Resumen de Supuestos

supuestos_tabla <- data.frame(
  Supuesto = c(
    "Homocedasticidad",
    "Normalidad residuos",
    "Multicolinealidad",
    "Autocorrelación espacial"
  ),
  Test = c(
    "Breusch-Pagan",
    "Shapiro-Wilk",
    "VIF máximo",
    "Moran's I (próx.)"
  ),
  Estadístico = c(
    round(bp_test$statistic, 3),
    round(sw_test$statistic, 3),
    round(max(vif_vals), 2),
    "—"
  ),
  p_valor = c(
    format.pval(bp_test$p.value, 2),
    format.pval(sw_test$p.value, 2),
    "< 5",
    "—"
  ),
  Estado = c(
    if (bp_test$p.value < 0.05) "⚠️ Hetero" else "✓ OK",
    if (sw_test$p.value < 0.05) "⚠️ No normal" else "✓ OK",
    if (max(vif_vals) < 5) "✓ OK" else "⚠️ Alto",
    "Pendiente"
  ),
  Remedio = c(
    "HC3 errors | GLM",
    "Log transf. ✓ | Box-Cox",
    "Revisar correlaciones",
    "SAR/GWR (Etapa 3)"
  ),
  stringsAsFactors = FALSE
)

kable(supuestos_tabla,
      caption = "Tabla resumen: Supuestos OLS, diagnóstico y remedios")

Tabla resumen: Supuestos OLS, diagnóstico y remedios
Supuesto	Test	Estadístico	p_valor	Estado	Remedio
Homocedasticidad	Breusch-Pagan	34.558	5.2e-06	⚠️ Hetero	HC3 errors \| GLM
Normalidad residuos	Shapiro-Wilk	0.985	1.2e-10	⚠️ No normal	Log transf. ✓ \| Box-Cox
Multicolinealidad	VIF máximo	6.38	< 5	⚠️ Alto	Revisar correlaciones
Autocorrelación espacial	Moran’s I (próx.)	—	—	Pendiente	SAR/GWR (Etapa 3)

9.3 Remedios Recomendados

cat("\n┌─────────────────────────────────────────────────────────────────────────────┐\n")

## 
## ┌─────────────────────────────────────────────────────────────────────────────┐

cat("│ REMEDIOS RECOMENDADOS PARA SUPUESTOS NO SATISFECHOS                        │\n")

## │ REMEDIOS RECOMENDADOS PARA SUPUESTOS NO SATISFECHOS                        │

cat("├─────────────────────────────────────────────────────────────────────────────┤\n")

## ├─────────────────────────────────────────────────────────────────────────────┤

cat("│                                                                             │\n")

## │                                                                             │

cat("│ PROBLEMA 1: Heterocedasticidad (Breusch-Pagan p < 0.05)                   │\n")

## │ PROBLEMA 1: Heterocedasticidad (Breusch-Pagan p < 0.05)                   │

cat("│                                                                             │\n")

## │                                                                             │

cat("│ SOLUCIÓN A: Errores robustos HC3 (mantener OLS, ajustar errores)          │\n")

## │ SOLUCIÓN A: Errores robustos HC3 (mantener OLS, ajustar errores)          │

cat("│   library(sandwich)                                                         │\n")

## │   library(sandwich)                                                         │

cat("│   library(lmtest)                                                           │\n")

## │   library(lmtest)                                                           │

cat("│   coeftest(modelo_ols, vcov = vcovHC(modelo_ols, type = 'HC3'))           │\n")

## │   coeftest(modelo_ols, vcov = vcovHC(modelo_ols, type = 'HC3'))           │

cat("│                                                                             │\n")

## │                                                                             │

cat("│ SOLUCIÓN B: Usar GLM con link function                                    │\n")

## │ SOLUCIÓN B: Usar GLM con link function                                    │

cat("│   modelo_glm <- glm(booking_price ~ ... , family = gaussian(link='log'))  │\n")

## │   modelo_glm <- glm(booking_price ~ ... , family = gaussian(link='log'))  │

cat("│                                                                             │\n")

## │                                                                             │

cat("├─────────────────────────────────────────────────────────────────────────────┤\n")

## ├─────────────────────────────────────────────────────────────────────────────┤

cat("│                                                                             │\n")

## │                                                                             │

cat("│ PROBLEMA 2: Residuos no normales (Shapiro-Wilk p < 0.05)                 │\n")

## │ PROBLEMA 2: Residuos no normales (Shapiro-Wilk p < 0.05)                 │

cat("│                                                                             │\n")

## │                                                                             │

cat("│ SOLUCIÓN A: Transformación log ya aplicada (precio → log(precio)) ✓       │\n")

## │ SOLUCIÓN A: Transformación log ya aplicada (precio → log(precio)) ✓       │

cat("│                                                                             │\n")

## │                                                                             │

cat("│ SOLUCIÓN B: Box-Cox para transformación óptima                            │\n")

## │ SOLUCIÓN B: Box-Cox para transformación óptima                            │

cat("│   library(MASS)                                                             │\n")

## │   library(MASS)                                                             │

cat("│   bc <- boxcox(modelo_ols)                                                 │\n")

## │   bc <- boxcox(modelo_ols)                                                 │

cat("│   lambda <- bc$x[which.max(bc$y)]                                          │\n")

## │   lambda <- bc$x[which.max(bc$y)]                                          │

cat("│                                                                             │\n")

## │                                                                             │

cat("├─────────────────────────────────────────────────────────────────────────────┤\n")

## ├─────────────────────────────────────────────────────────────────────────────┤

cat("│                                                                             │\n")

## │                                                                             │

cat("│ ALTERNATIVA GLOBAL: Modelo SAR (Spatial Autoregressive) — Etapa 3         │\n")

## │ ALTERNATIVA GLOBAL: Modelo SAR (Spatial Autoregressive) — Etapa 3         │

cat("│                                                                             │\n")

## │                                                                             │

cat("│   Captura autocorrelación espacial detectada en Moran's I                  │\n")

## │   Captura autocorrelación espacial detectada en Moran's I                  │

cat("│   Corrije simultaneously heterocedasticidad y correlación espacial         │\n")

## │   Corrije simultaneously heterocedasticidad y correlación espacial         │

cat("│                                                                             │\n")

## │                                                                             │

cat("└─────────────────────────────────────────────────────────────────────────────┘\n")

## └─────────────────────────────────────────────────────────────────────────────┘

10 10. Tests Estadísticos Explícitos

10.1 Tests t-Welch y Wilcoxon

# Recargar datos por si acaso
airbnb_raw <- read_csv("AirBnB_Monterrey_FINAL_ES (3) (1).csv")
hoteles_raw <- read_csv("hoteles_booking_limpio (3).csv")

zona_labs <- c("Centro (<3 km)", "Intermedia (3-7 km)",
               "Periférica (7-15 km)", "Exterior (>15 km)")

airbnb <- airbnb_raw %>%
  mutate(
    zona = factor(
      case_when(
        dist_km_downtown <  3  ~ "Centro (<3 km)",
        dist_km_downtown <  7  ~ "Intermedia (3-7 km)",
        dist_km_downtown < 15  ~ "Periférica (7-15 km)",
        TRUE                   ~ "Exterior (>15 km)"
      ),
      levels = zona_labs
    ),
    tipo             = "AirBnB",
    overall_raiting  = calificacion_satisfaccion_huesped,
    booking_price    = precio_booking / num_noches,
    number_reviews   = num_resenas,
    Dist_km_Downtown = dist_km_downtown,
    lat              = latitud,
    lon              = longitud,
    rating_10        = overall_raiting * 2
  )

hoteles <- hoteles_raw %>%
  mutate(
    zona = factor(zona, levels = zona_labs),
    tipo             = "Hotel",
    overall_raiting  = Calificacion_1,
    booking_price    = Precio,
    number_reviews   = No_Comentarios,
    Dist_km_Downtown = Dist_km_Centro,
    lat              = Lat,
    lon              = Lon,
    rating_10        = overall_raiting
  )

# ═══════════════════════════════════════════════════════════════════════════════
# TEST t DE WELCH
# ═══════════════════════════════════════════════════════════════════════════════

precio_airbnb <- airbnb$booking_price[!is.na(airbnb$booking_price)]
precio_hotel  <- hoteles$booking_price[!is.na(hoteles$booking_price)]

cat("═════════════════════════════════════════════════════════════════════════════\n")

## ═════════════════════════════════════════════════════════════════════════════

cat("TEST t DE WELCH — Diferencia de precios AirBnB vs Hotel\n")

## TEST t DE WELCH — Diferencia de precios AirBnB vs Hotel

cat("═════════════════════════════════════════════════════════════════════════════\n\n")

## ═════════════════════════════════════════════════════════════════════════════

cat("H0: μ(AirBnB) = μ(Hotel)\n")

## H0: μ(AirBnB) = μ(Hotel)

cat("H1: μ(AirBnB) ≠ μ(Hotel)\n\n")

## H1: μ(AirBnB) ≠ μ(Hotel)

cat("Media AirBnB:  $", round(mean(precio_airbnb, na.rm = TRUE), 0), "/noche\n")

## Media AirBnB:  $ 1330 /noche

cat("Media Hotel:   $", round(mean(precio_hotel, na.rm = TRUE), 0), "/noche\n")

## Media Hotel:   $ 1837 /noche

cat("Diferencia:    $", round(mean(precio_airbnb, na.rm = TRUE) - mean(precio_hotel, na.rm = TRUE), 0), "\n\n")

## Diferencia:    $ -507

t_welch <- t.test(precio_airbnb, precio_hotel, var.equal = FALSE)

cat("t-estadístico: ", round(t_welch$statistic, 4), "\n")

## t-estadístico:  -4.4094

cat("gl (aprox):    ", round(t_welch$parameter, 1), "\n")

## gl (aprox):     100.4

cat("p-valor:       ", format.pval(t_welch$p.value, digits = 3), "\n\n")

## p-valor:        2.61e-05

if (t_welch$p.value < 0.05) {
  cat("✓ RECHAZAR H0 → Diferencia SIGNIFICATIVA (p < 0.05)\n")
} else {
  cat("⚠️  ACEPTAR H0 → Diferencia NO significativa (p ≥ 0.05)\n")
}

## ✓ RECHAZAR H0 → Diferencia SIGNIFICATIVA (p < 0.05)

# ═══════════════════════════════════════════════════════════════════════════════
# TEST DE WILCOXON
# ═══════════════════════════════════════════════════════════════════════════════

rating_airbnb <- airbnb$rating_10[!is.na(airbnb$rating_10)]
rating_hotel  <- hoteles$rating_10[!is.na(hoteles$rating_10)]

cat("\n═════════════════════════════════════════════════════════════════════════════\n")

## 
## ═════════════════════════════════════════════════════════════════════════════

cat("TEST DE WILCOXON (MANN-WHITNEY) — Diferencia de ratings AirBnB vs Hotel\n")

## TEST DE WILCOXON (MANN-WHITNEY) — Diferencia de ratings AirBnB vs Hotel

cat("═════════════════════════════════════════════════════════════════════════════\n\n")

## ═════════════════════════════════════════════════════════════════════════════

cat("H0: Distribuciones de rating son iguales\n")

## H0: Distribuciones de rating son iguales

cat("H1: Distribuciones son diferentes\n\n")

## H1: Distribuciones son diferentes

cat("Mediana AirBnB:  ", round(median(rating_airbnb, na.rm = TRUE), 2), "/10\n")

## Mediana AirBnB:   9.88 /10

cat("Mediana Hotel:   ", round(median(rating_hotel, na.rm = TRUE), 2), "/10\n")

## Mediana Hotel:    8.55 /10

cat("Diferencia:      ", round(median(rating_airbnb, na.rm = TRUE) - median(rating_hotel, na.rm = TRUE), 2), "\n\n")

## Diferencia:       1.33

wilcoxon_test <- wilcox.test(rating_airbnb, rating_hotel, alternative = "two.sided")

cat("Estadístico U:   ", round(wilcoxon_test$statistic, 0), "\n")

## Estadístico U:    102168

cat("p-valor:         ", format.pval(wilcoxon_test$p.value, digits = 3), "\n\n")

## p-valor:          <2e-16

if (wilcoxon_test$p.value < 0.05) {
  cat("✓ RECHAZAR H0 → Distribuciones DIFERENTES (p < 0.05)\n")
} else {
  cat("⚠️  ACEPTAR H0 → Distribuciones iguales (p ≥ 0.05)\n")
}

## ✓ RECHAZAR H0 → Distribuciones DIFERENTES (p < 0.05)

# ═══════════════════════════════════════════════════════════════════════════════
# TABLA RESUMEN
# ═══════════════════════════════════════════════════════════════════════════════

tests_tabla <- data.frame(
  Test = c("t-Welch", "Wilcoxon"),
  Contraste = c(
    "μ(AirBnB precio) = μ(Hotel precio)",
    "Me(AirBnB rating) = Me(Hotel rating)"
  ),
  Estadístico = c(
    round(t_welch$statistic, 3),
    round(wilcoxon_test$statistic, 0)
  ),
  p_valor = c(
    format.pval(t_welch$p.value, digits = 2),
    format.pval(wilcoxon_test$p.value, digits = 2)
  ),
  Decisión = c(
    if (t_welch$p.value < 0.05) "Rechazar H0" else "Aceptar H0",
    if (wilcoxon_test$p.value < 0.05) "Rechazar H0" else "Aceptar H0"
  ),
  Conclusión = c(
    "Precios significativamente diferentes",
    "Ratings significativamente diferentes"
  ),
  stringsAsFactors = FALSE
)

kable(tests_tabla,
      caption = "Resumen de tests estadísticos (α = 0.05)")

Resumen de tests estadísticos (α = 0.05)
	Test	Contraste	Estadístico	p_valor	Decisión	Conclusión
t	t-Welch	μ(AirBnB precio) = μ(Hotel precio)	-4.409	2.6e-05	Rechazar H0	Precios significativamente diferentes
W	Wilcoxon	Me(AirBnB rating) = Me(Hotel rating)	102168.000	<2e-16	Rechazar H0	Ratings significativamente diferentes

Equipo Épsilon · Tec de Monterrey · AD3003B Planeación Estratégica Basada en Analítica Prescriptiva Socio Formador: DATLAS / DASHA MTY · Asesor: Dr. Pedro Vallejo

Análisis de Negocio – Inteligencia Competitiva AirBnB vs Hoteles | AMM

Equipo Épsilon – AD3003B

2026-04-30

1 1. Carga de Datos y Librerías

2 2. Pregunta A – Distancia al centro vs Precio

2.1 Scatter + Regresión

2.2 Correlación Pearson

3 3. Pregunta B – Localización, precio, satisfacción y gasto total

3.1 Tabla Resumen

4 4. Pregunta C – Zonas con precios extremos

4.1 Heatmap

5 5. Pregunta D – Correlación espacial

5.1 Cuadrantes

6 6. Pregunta E – Diferencias en reseñas

6.1 Volumen de Reseñas

7 7. Pregunta F – Estrategia óptima de precios

7.1 Tabla Estrategia

8 8. Modelo Logístico con Validación Cruzada

8.1 Especificación

8.2 Validación Cruzada 5-fold

9 9. Diagnóstico de Supuestos OLS + Remedios

9.1 Pruebas de Supuestos

9.2 Tabla Resumen de Supuestos

9.3 Remedios Recomendados

10 10. Tests Estadísticos Explícitos

10.1 Tests t-Welch y Wilcoxon