library(ggplot2)
library(lubridate)
## 
## Adjuntando el paquete: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(forecast)
## Warning: package 'forecast' was built under R version 4.4.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.4.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(textdata)
## Warning: package 'textdata' was built under R version 4.4.3
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.1
## ✔ purrr   1.0.4     ✔ tibble  3.2.1
## ✔ readr   2.1.5     ✔ tidyr   1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.4.3
## Cargando paquete requerido: RColorBrewer
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(stopwords)
## Warning: package 'stopwords' was built under R version 4.4.3
library(RColorBrewer)
library(stringr)

1 Cargar datos

PS <- read_excel("C:\\Users\\Luis Mendoza\\Downloads\\Encuesta Perfil y SatisfacciĂ³n 2024.xlsx")
## Warning: Expecting numeric in T1172 / R1172C20: got 'Cada semana por trabajo'
colnames(PS)
##  [1] "Mes"                              "Procedencia PaĂ­s"                
##  [3] "Procedencia Ciudad"               "Edad"                            
##  [5] "Motivo Visita"                    "Motivo Visita Otro"              
##  [7] "Noches Hospedadas"                "Hospedaje en Hotel"              
##  [9] "Hospedaje en Extra Hotelero"      "Hospedaje en Casa Familia/amigos"
## [11] "No se hospedĂ³"                    "Hospedaje en Otro"               
## [13] "Transporte AviĂ³n"                 "Transporte AutobĂºs"              
## [15] "Transporte AutomĂ³vil"             "Transporte Motocicleta"          
## [17] "Transporte Otro"                  "Gasto total en pesos"            
## [19] "Primera Visita"                   "Visitas Previas"                 
## [21] "EV Experiencia"                   "EV Sostenibilidad"               
## [23] "Experiencia"                      "Si Recomienda"                   
## [25] "No Recomienda"                    "Comentario RecomendaciĂ³n"

2 I. Transporte mĂ¡s utilizado

PS_transporte <- PS %>%
  pivot_longer(cols = starts_with("Transporte "), 
               names_to = "TipoTransporte", 
               values_to = "valor") %>%
  filter(!is.na(valor) & valor > 0)

transporte_total <- PS_transporte %>%
  group_by(TipoTransporte) %>%
  summarise(Cantidad = n()) %>%
  mutate(TipoTransporte = str_remove(TipoTransporte, "Transporte "))

ggplot(transporte_total, aes(x = reorder(TipoTransporte, -Cantidad), y = Cantidad)) +
  geom_col(fill = "steelblue") +
  labs(title = "Transporte mĂ¡s utilizado (Todos los visitantes)", x = "Transporte", y = "Cantidad de personas") +
  theme_minimal()

PS_extranjeros <- PS %>% filter(`Procedencia País` != "México" & !is.na(`Procedencia País`))

transporte_extranjeros <- PS_extranjeros %>%
  pivot_longer(cols = starts_with("Transporte "), 
               names_to = "TipoTransporte", 
               values_to = "valor") %>%
  filter(!is.na(valor) & valor > 0) %>%
  group_by(TipoTransporte) %>%
  summarise(Cantidad = n()) %>%
  mutate(TipoTransporte = str_remove(TipoTransporte, "Transporte "))

ggplot(transporte_extranjeros, aes(x = reorder(TipoTransporte, -Cantidad), y = Cantidad)) +
  geom_col(fill = "darkorange") +
  labs(title = "Transporte mĂ¡s utilizado (Visitantes extranjeros)", x = "Transporte", y = "Cantidad de personas") +
  theme_minimal()

3 II. AnĂ¡lisis Exploratorio de Datos

# 1. Definir nombres de meses vĂ¡lidos
meses_es <- c("enero", "febrero", "marzo", "abril", "mayo", "junio",
              "julio", "agosto", "septiembre", "octubre", "noviembre", "diciembre")

# 2. Normalizar y convertir "Mes" a fecha
PS$Mes <- tolower(trimws(PS$Mes))                # limpiar texto
PS$MesNum <- match(PS$Mes, meses_es)             # convertir a nĂºmero de mes
PS$MesFecha <- as.Date(ISOdate(2024, PS$MesNum, 1))  # crear fecha segura

# 3. Filtrar registros vĂ¡lidos
PS <- PS[!is.na(PS$MesFecha), ]

# 4. Clasificar visitantes como "Normal" u "Outlier" segĂºn gasto
q3 <- quantile(PS$`Gasto total en pesos`, 0.75, na.rm = TRUE)
iqr <- IQR(PS$`Gasto total en pesos`, na.rm = TRUE)
limite_superior <- q3 + 1.5 * iqr

PS <- PS %>%
  mutate(GrupoGasto = ifelse(`Gasto total en pesos` > limite_superior, "Outlier", "Normal"))

# 5. Calcular gasto total mensual por grupo
gasto_mensual_segmentado <- PS %>%
  group_by(MesFecha, GrupoGasto) %>%
  summarise(GastoTotal = sum(`Gasto total en pesos`, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'MesFecha'. You can override using the
## `.groups` argument.
# 6. Graficar tendencia mensual por grupo
ggplot(gasto_mensual_segmentado, aes(x = MesFecha, y = GastoTotal, color = GrupoGasto)) +
  geom_line(size = 1.2) +
  geom_smooth(se = FALSE, linetype = "dashed") +
  scale_x_date(date_labels = "%b", date_breaks = "1 month") +
  labs(title = "Tendencia mensual del gasto total por grupo de gasto",
       x = "Mes", y = "Gasto total (MXN)", color = "Grupo de gasto") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Agregar una etiqueta para distinguir grupos
PS <- PS %>%
  mutate(
    GrupoGasto = ifelse(`Gasto total en pesos` > limite_superior, "Outliers", "Normales")
  )

# Eliminar valores NA en gasto
PS_filtrado <- PS %>% filter(!is.na(`Gasto total en pesos`))

# Boxplot comparativo
library(ggplot2)
ggplot(PS_filtrado, aes(x = GrupoGasto, y = `Gasto total en pesos`, fill = GrupoGasto)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 16) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "ComparaciĂ³n del Gasto Total",
    x = "Grupo",
    y = "Gasto total en pesos (MXN)"
  ) +
  theme_minimal()

# ------------------------
# Definir nombres legibles para las variables
# ------------------------
nombre_vars <- c("Gasto", "Edad", "Noches", "Visitas")

# ------------------------
# 1. Matriz general
# ------------------------
vars_cor <- PS[, c("Gasto total en pesos", "Edad", "Noches Hospedadas", "Visitas Previas")] %>%
  mutate_all(as.numeric)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Edad = .Primitive("as.double")(Edad)`.
## Caused by warning:
## ! NAs introducidos por coerciĂ³n
colnames(vars_cor) <- nombre_vars
cor_mat <- cor(vars_cor, use = "complete.obs")

corrplot(cor_mat,
         method = "circle",
         type = "lower",
         tl.col = "black",
         tl.cex = 1,
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.7,
         col = colorRampPalette(c("firebrick", "white", "steelblue"))(200),
         title = "Matriz de CorrelaciĂ³n General",
         mar = c(0, 0, 2, 0))

# ------------------------
# 2. Matriz para visitantes normales
# ------------------------
normales <- PS %>%
  filter(GrupoGasto == "Normales") %>%
  select(`Gasto total en pesos`, Edad, `Noches Hospedadas`, `Visitas Previas`) %>%
  mutate_all(as.numeric)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Edad = .Primitive("as.double")(Edad)`.
## Caused by warning:
## ! NAs introducidos por coerciĂ³n
colnames(normales) <- nombre_vars
cor_normales <- cor(normales, use = "complete.obs")

corrplot(cor_normales,
         method = "circle",
         type = "lower",
         tl.col = "black",
         tl.cex = 1,
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.7,
         col = colorRampPalette(c("firebrick", "white", "steelblue"))(200),
         title = "CorrelaciĂ³n - Visitantes Normales",
         mar = c(0, 0, 2, 0))

# ------------------------
# 3. Matriz para outliers
# ------------------------
outliers <- PS %>%
  filter(GrupoGasto == "Outliers") %>%
  select(`Gasto total en pesos`, Edad, `Noches Hospedadas`, `Visitas Previas`) %>%
  mutate_all(as.numeric)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Edad = .Primitive("as.double")(Edad)`.
## Caused by warning:
## ! NAs introducidos por coerciĂ³n
colnames(outliers) <- nombre_vars
cor_outliers <- cor(outliers, use = "complete.obs")

corrplot(cor_outliers,
         method = "circle",
         type = "lower",
         tl.col = "black",
         tl.cex = 1,
         tl.srt = 45,
         addCoef.col = "black",
         number.cex = 0.7,
         col = colorRampPalette(c("firebrick", "white", "steelblue"))(200),
         title = "CorrelaciĂ³n - Outliers",
         mar = c(0, 0, 2, 0))

4 V. AnĂ¡lisis de Sentimiento

comentarios <- PS %>%
  filter(!is.na(`Comentario RecomendaciĂ³n`)) %>%
  tidytext::unnest_tokens(input = `Comentario RecomendaciĂ³n`, output = word) %>%
  anti_join(get_stopwords(language = "es"))
## Joining with `by = join_by(word)`
lexico_es <- tibble(
  word = c("bueno", "excelente", "agradable", "recomendado", "feliz", "malo", "terrible", "sucio", "caro", "pésimo"),
  sentiment = c("positivo", "positivo", "positivo", "positivo", "positivo", "negativo", "negativo", "negativo", "negativo", "negativo")
)

sentimientos <- comentarios %>%
  inner_join(lexico_es, by = "word") %>%
  count(sentiment, sort = TRUE)

ggplot(sentimientos, aes(x = reorder(sentiment, n), y = n, fill = sentiment)) +
  geom_col() +
  coord_flip() +
  labs(title = "AnĂ¡lisis de sentimiento de los comentarios (Español)", x = "Sentimiento", y = "Frecuencia") +
  theme_minimal()

5 VI. Nube de Palabras por RecomendaciĂ³n

# -----------------------------
# Limpieza y procesamiento - SĂ­ Recomienda
# -----------------------------
recomienda <- PS %>%
  filter(!is.na(`Si Recomienda`)) %>%
  select(`Si Recomienda`) %>%
  rename(texto = `Si Recomienda`) %>%
  unnest_tokens(word, texto) %>%
  filter(!word %in% stopwords::stopwords("es"),
         !str_detect(word, "\\d"),         # elimina nĂºmeros
         str_length(word) > 2,             # evita palabras cortas irrelevantes
         !is.na(word)) %>%
  mutate(word = case_when(                # corregir sinĂ³nimos o variantes
    word == "banio" ~ "baño",
    word == "limpia" ~ "limpio",
    word == "recomiendo" ~ "recomendar",
    TRUE ~ word
  )) %>%
  count(word, sort = TRUE)

# -----------------------------
# Limpieza y procesamiento - No Recomienda
# -----------------------------
no_recomienda <- PS %>%
  filter(!is.na(`No Recomienda`)) %>%
  select(`No Recomienda`) %>%
  rename(texto = `No Recomienda`) %>%
  unnest_tokens(word, texto) %>%
  filter(!word %in% stopwords::stopwords("es"),
         !str_detect(word, "\\d"),
         str_length(word) > 2,
         !is.na(word)) %>%
  mutate(word = case_when(
    word == "banio" ~ "baño",
    word == "sucio" ~ "suciedad",
    TRUE ~ word
  )) %>%
  count(word, sort = TRUE)

# -----------------------------
# Graficar nubes de palabras mejoradas
# -----------------------------
par(mfrow = c(1, 2))  # panel de 2 grĂ¡ficos

# Wordcloud para SĂ­ Recomienda
if (nrow(recomienda) > 0) {
  wordcloud(words = recomienda$word,
            freq = recomienda$n,
            scale = c(4, 0.8),
            max.words = 100,
            colors = brewer.pal(8, "Dark2"),
            random.order = FALSE)
  title("Lugares que SĂ­ Recomiendan")
} else {
  plot.new(); title("Sin datos: SĂ­ Recomienda")
}

# Wordcloud para No Recomienda
if (nrow(no_recomienda) > 0) {
  wordcloud(words = no_recomienda$word,
            freq = no_recomienda$n,
            scale = c(4, 0.8),
            max.words = 100,
            colors = brewer.pal(8, "Reds"),
            random.order = FALSE)
  title("Lugares que No Recomiendan")
} else {
  plot.new(); title("Sin datos: No Recomienda")
}

par(mfrow = c(1, 1))  # restaurar panel normal