1. Configuración Inicial

1.1 Limpieza del Entorno

# Release unreferenced memory before starting and print the usage table.
# closeAllConnections() was removed on purpose: it closes every user
# connection and resets sink diversions, which can interfere with the
# connections a knitr render relies on to capture chunk output.
gc()

##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  618329 33.1    1415461 75.6   702048 37.5
## Vcells 1156215  8.9    8388608 64.0  1927429 14.8

1.2 Instalación de Paquetes

# Install only the packages that are not yet available, so re-running this
# chunk does not reinstall everything. tidyr and scales are included because
# later chunks call tidyr::pivot_longer() and scales::comma().
paquetes_requeridos <- c(
  "rvest", "dplyr", "lubridate", "stringr", "knitr", "DT", "ggplot2",
  "tidyr", "scales"
)
paquetes_disponibles <- vapply(
  paquetes_requeridos, requireNamespace, logical(1), quietly = TRUE
)
paquetes_faltantes <- paquetes_requeridos[!paquetes_disponibles]
if (length(paquetes_faltantes) > 0) {
  install.packages(paquetes_faltantes, dependencies = TRUE)
}

1.3 Carga de Librerías

# Attach the scraping (rvest), wrangling (dplyr, lubridate, stringr) and
# reporting (knitr, DT, ggplot2) packages; startup banners are suppressed.
suppressPackageStartupMessages({
  library(rvest)
  library(dplyr)
  library(lubridate)
  library(stringr)
  library(knitr)
  library(DT)
  library(ggplot2)
})

# Only reached when every library() call above succeeded, since library()
# raises an error for a missing package.
cat("✓ Todos los paquetes cargados correctamente\n")

## ✓ Todos los paquetes cargados correctamente

2. Funciones de Scraping

2.1 Función: Identificar Fuente

#' Map an article URL to the display name of its news outlet.
#'
#' The match is done on the host part of the URL only (exact host or any
#' subdomain of a known domain, case-insensitive). Matching the whole URL, as
#' a plain substring search would, misattributes links that merely mention a
#' known domain in their path or query string.
#'
#' @param url A single URL string; the scheme is optional.
#' @return The outlet name, or "Medio desconocido" when the host is not one of
#'   the known domains or `url` is missing / not a single string.
obtener_fuente <- function(url) {
  dominios <- c(
    "mvsnoticias.com" = "MVS Noticias",
    "yucatan.com.mx" = "Yucatán.com.mx",
    "eluniversal.com.mx" = "El Universal"
  )

  if (length(url) != 1L || is.na(url)) {
    return("Medio desconocido")
  }

  # Reduce the URL to its bare host: drop scheme, path/query/fragment,
  # userinfo and port, in that order.
  host <- tolower(sub("^[a-zA-Z][a-zA-Z0-9+.-]*://", "", url))
  host <- sub("[/?#].*$", "", host)
  host <- sub("^[^@]*@", "", host)
  host <- sub(":[0-9]+$", "", host)

  for (dominio in names(dominios)) {
    # The leading dot keeps "notmvsnoticias.com" from matching.
    if (host == dominio || endsWith(host, paste0(".", dominio))) {
      return(dominios[[dominio]])
    }
  }

  "Medio desconocido"
}

2.2 Función: Extraer Fecha

#' Extract the publication date of an article as raw text.
#'
#' Selectors are tried in order. For each matched node the function looks at
#' the `datetime` attribute (used by `<time>`), then the `content` attribute
#' (used by `<meta property="article:published_time">`, which has neither a
#' `datetime` attribute nor text), and finally the node's trimmed text.
#'
#' @param pagina Parsed HTML document, as returned by `read_html()`.
#' @param selectores Character vector of CSS selectors, in priority order.
#' @return The first non-empty value found, unparsed, or `NA_character_` when
#'   no selector yields one.
extraer_fecha <- function(pagina, selectores) {
  es_util <- function(x) length(x) == 1L && !is.na(x) && nchar(x) > 0

  for (selector in selectores) {
    # A malformed selector or a parsing problem only skips this selector;
    # the remaining ones are still tried (deliberate best-effort).
    hallada <- tryCatch({
      nodo <- html_element(pagina, selector)
      valor <- NULL

      for (atributo in c("datetime", "content")) {
        candidato <- html_attr(nodo, atributo)
        if (es_util(candidato)) {
          valor <- candidato
          break
        }
      }

      # No machine-readable attribute: fall back to the visible text.
      if (is.null(valor)) {
        candidato <- str_trim(html_text(nodo))
        if (es_util(candidato)) {
          valor <- candidato
        }
      }

      valor
    }, error = function(e) NULL)

    if (!is.null(hallada)) {
      return(hallada)
    }
  }

  NA_character_
}

2.3 Función: Extraer Autor

#' Extract the article's author name.
#'
#' Tries each CSS selector in order and returns the trimmed text of the first
#' match whose length is between 1 and 199 characters; longer matches are
#' rejected. Errors raised while querying a selector are ignored and the next
#' selector is tried.
#'
#' @param pagina Parsed HTML document, as returned by `read_html()`.
#' @param selectores Character vector of CSS selectors, in priority order.
#' @return The author text, or `NA_character_` when nothing suitable is found.
extraer_autor <- function(pagina, selectores) {
  for (sel in selectores) {
    encontrado <- tryCatch({
      candidato <- str_trim(html_text(html_element(pagina, sel)))

      if (is.na(candidato)) {
        NULL
      } else {
        largo <- nchar(candidato)
        if (largo > 0 && largo < 200) candidato else NULL
      }
    }, error = function(e) NULL)

    if (!is.null(encontrado)) {
      return(encontrado)
    }
  }

  return(NA_character_)
}

2.4 Función: Extraer Texto del Artículo

#' Extract the body text of an article.
#'
#' First tries each CSS selector in order and accepts the first match whose
#' whitespace-normalised text is longer than 100 characters. If none
#' qualifies, all `<p>` elements of the page are joined with single spaces and
#' accepted under the same length rule. Errors in any attempt are ignored.
#'
#' @param pagina Parsed HTML document, as returned by `read_html()`.
#' @param selectores Character vector of CSS selectors, in priority order.
#' @return The normalised text, or `NA_character_` when nothing exceeds the
#'   100-character threshold.
extraer_texto <- function(pagina, selectores) {
  es_suficiente <- function(x) !is.na(x) && nchar(x) > 100

  # Pass 1: outlet-specific containers.
  for (sel in selectores) {
    hallado <- tryCatch({
      nodo <- html_element(pagina, sel)
      cuerpo <- str_squish(str_trim(html_text(nodo)))
      if (es_suficiente(cuerpo)) cuerpo else NULL
    }, error = function(e) NULL)

    if (!is.null(hallado)) {
      return(hallado)
    }
  }

  # Pass 2: every paragraph on the page, concatenated.
  respaldo <- tryCatch({
    parrafos <- str_trim(html_text(html_elements(pagina, "p")))
    unido <- str_squish(paste(parrafos, collapse = " "))
    if (es_suficiente(unido)) unido else NULL
  }, error = function(e) NULL)

  if (is.null(respaldo)) {
    return(NA_character_)
  }
  respaldo
}

2.5 Función Principal: Scraping de Noticia Individual

#' Download one news article and extract its metadata and body text.
#'
#' @param url Article URL.
#' @return A one-row tibble with columns `Fuente`, `URL`, `Fecha_Redaccion`,
#'   `Autor`, `Texto_Crudo`, `Fecha_Scraping` and `Caracteres`.
#'   When the page loads but a field cannot be found, that text field is
#'   "No disponible"; `Caracteres` is the length of the extracted text and is
#'   `NA` when no text was extracted (it is never the length of the
#'   placeholder). When the download or parsing fails, a warning is raised and
#'   the text fields and `Caracteres` are all `NA`.
scrape_noticia <- function(url) {
  # Selector lookup without `%||%`, so the function does not depend on
  # R >= 4.4; an unknown outlet falls back to the generic selectors.
  elegir_selectores <- function(tabla, fuente, por_defecto) {
    elegidos <- tabla[[fuente]]
    if (is.null(elegidos)) por_defecto else elegidos
  }

  # Scalar replacement of missing/empty values by the report placeholder.
  normalizar <- function(valor) {
    if (is.na(valor) || nchar(valor) == 0) "No disponible" else valor
  }

  tryCatch({
    # Download the page
    pagina <- read_html(url)
    fuente <- obtener_fuente(url)

    # Selectors per outlet, in priority order
    selectores_fecha <- list(
      "MVS Noticias" = c("time", ".date", ".published-date",
                         "meta[property='article:published_time']"),
      "Yucatán.com.mx" = c("time[datetime]", ".article-date",
                           ".published-date", ".entry-date"),
      "El Universal" = c("time[datetime]", ".publish-date", "time",
                         ".date-published")
    )

    selectores_autor <- list(
      "MVS Noticias" = c(".author-name", ".autor", "[rel='author']", ".byline"),
      "Yucatán.com.mx" = c(".author", "[rel='author']", ".author-name",
                           ".entry-author"),
      "El Universal" = c(".author-name", ".byline", "[rel='author']", ".autor")
    )

    selectores_texto <- list(
      "MVS Noticias" = c("article", ".article-content", ".entry-content",
                         ".post-content"),
      "Yucatán.com.mx" = c("article", ".entry-content", ".post-content",
                           ".article-body"),
      "El Universal" = c("article", ".article-body", ".story-body",
                         ".entry-content")
    )

    sel_fecha <- elegir_selectores(
      selectores_fecha, fuente,
      c("time", ".date", "meta[property='article:published_time']")
    )
    sel_autor <- elegir_selectores(
      selectores_autor, fuente,
      c("[rel='author']", ".author", ".byline")
    )
    sel_texto <- elegir_selectores(
      selectores_texto, fuente,
      c("article", ".entry-content", ".post-content")
    )

    # Extract the fields
    fecha <- extraer_fecha(pagina, sel_fecha)
    autor <- extraer_autor(pagina, sel_autor)
    texto <- extraer_texto(pagina, sel_texto)

    # Measure the real text BEFORE substituting the placeholder; otherwise a
    # missing article would be reported as 13 characters long.
    caracteres <- if (is.na(texto)) NA_integer_ else nchar(texto)

    tibble(
      Fuente = fuente,
      URL = url,
      Fecha_Redaccion = normalizar(fecha),
      Autor = normalizar(autor),
      Texto_Crudo = normalizar(texto),
      Fecha_Scraping = as.character(Sys.time()),
      Caracteres = caracteres
    )
  }, error = function(e) {
    # Keep the batch going, but do not hide why this URL produced no data.
    warning(
      sprintf("No se pudo procesar '%s': %s", url, conditionMessage(e)),
      call. = FALSE
    )
    tibble(
      Fuente = obtener_fuente(url),
      URL = url,
      Fecha_Redaccion = NA_character_,
      Autor = NA_character_,
      Texto_Crudo = NA_character_,
      Fecha_Scraping = as.character(Sys.time()),
      Caracteres = NA_integer_
    )
  })
}

2.6 Función: Scraping por Lote

#' Scrape a batch of article URLs sequentially.
#'
#' Prints a progress line per URL, calls `scrape_noticia()` on each one and
#' sleeps `pausa` seconds between consecutive requests (not after the last).
#'
#' @param urls Character vector of article URLs.
#' @param pausa Seconds to wait between requests.
#' @return A tibble with one row per URL, in input order.
scrape_noticias_lote <- function(urls, pausa = 2) {
  n_urls <- length(urls)
  cat(sprintf("\n=== Iniciando scraping de %d noticias ===\n\n", n_urls))

  procesar_una <- function(idx) {
    cat(sprintf("[%d/%d] Procesando: %s\n", idx, n_urls, urls[idx]))
    fila <- scrape_noticia(urls[idx])

    # No pause after the final URL.
    if (idx < n_urls) {
      Sys.sleep(pausa)
    }
    fila
  }

  filas <- lapply(seq_len(n_urls), procesar_una)

  cat("\n=== Scraping completado ===\n\n")

  bind_rows(filas)
}

3. Extracción de Datos

3.1 URLs a Procesar

# Article URLs to scrape: one from mvsnoticias.com, one from
# eluniversal.com.mx and two from yucatan.com.mx.
urls_yucatan <- c(
  "https://mvsnoticias.com/yucatan/2025/10/13/investigacion-en-curso-por-fallecimiento-de-una-persona-en-oxkutzcab-715216.html",
  "https://www.yucatan.com.mx/merida/2025/10/12/feminicidios-en-yucatan-ante-una-preocupante-tendencia.html",
  "https://www.eluniversal.com.mx/estados/reportan-nuevo-envenenamiento-de-animales-en-yucatan/",
  "https://www.yucatan.com.mx/central-9/2025/10/02/mas-violencia-en-yucatan-suben-28-los-heridos-a-balazos-y-25-los-homicidios-en-un-ano.html"
)

# Report how many URLs will be processed.
cat("Total de URLs a procesar:", length(urls_yucatan), "\n")

## Total de URLs a procesar: 4

3.2 Ejecutar Scraping

# Scrape every URL in sequence, waiting 2 seconds between requests.
noticias_yucatan <- scrape_noticias_lote(urls_yucatan, pausa = 2)

## 
## === Iniciando scraping de 4 noticias ===
## 
## [1/4] Procesando: https://mvsnoticias.com/yucatan/2025/10/13/investigacion-en-curso-por-fallecimiento-de-una-persona-en-oxkutzcab-715216.html
## [2/4] Procesando: https://www.yucatan.com.mx/merida/2025/10/12/feminicidios-en-yucatan-ante-una-preocupante-tendencia.html
## [3/4] Procesando: https://www.eluniversal.com.mx/estados/reportan-nuevo-envenenamiento-de-animales-en-yucatan/
## [4/4] Procesando: https://www.yucatan.com.mx/central-9/2025/10/02/mas-violencia-en-yucatan-suben-28-los-heridos-a-balazos-y-25-los-homicidios-en-un-ano.html
## 
## === Scraping completado ===

3.3 Vista Previa de Datos

# Preview: source, author and character count for the first rows (head()
# default, i.e. at most six).
kable(
  head(select(noticias_yucatan, Fuente, Autor, Caracteres)),
  caption = "Primeras filas del dataset"
)

Primeras filas del dataset
Fuente	Autor	Caracteres
MVS Noticias	No disponible	1273
Yucatán.com.mx	David Domínguez Massa	15212
El Universal	No disponible	2060
Yucatán.com.mx	Flor Estrella Santana	2166

4. Análisis de Datos

4.1 Resumen General

cat("=== ESTADÍSTICAS GENERALES ===\n")

## === ESTADÍSTICAS GENERALES ===

cat("Total de noticias procesadas:", nrow(noticias_yucatan), "\n")

## Total de noticias procesadas: 4

cat("Noticias con texto extraído:", 
    sum(noticias_yucatan$Caracteres > 100, na.rm = TRUE), "\n")

## Noticias con texto extraído: 4

# Count rows whose author was extracted. na.rm = TRUE keeps the count defined
# when a failed download left Autor as NA (scrape_noticia's error path);
# without it a single failure turns the whole sum into NA.
cat("Noticias con autor:",
    sum(noticias_yucatan$Autor != "No disponible", na.rm = TRUE), "\n")

## Noticias con autor: 2

# Count rows whose publication date was extracted. na.rm = TRUE keeps the
# count defined when a failed download left Fecha_Redaccion as NA.
cat("Noticias con fecha:",
    sum(noticias_yucatan$Fecha_Redaccion != "No disponible", na.rm = TRUE),
    "\n")

## Noticias con fecha: 3

4.2 Resumen por Fuente

# Per-outlet summary: article count, character statistics and how many
# articles have an author / a date.
# Rows from failed downloads carry NA in Autor, Fecha_Redaccion and
# Caracteres, so every aggregate is made NA-safe: the sums drop NA
# comparisons, and min/max return NA (instead of +/-Inf plus a warning) for
# an outlet where no article has a character count.
resumen_fuente <- noticias_yucatan %>%
  group_by(Fuente) %>%
  summarise(
    Cantidad = n(),
    Promedio_Caracteres = round(mean(Caracteres, na.rm = TRUE), 0),
    Min_Caracteres = if (all(is.na(Caracteres))) {
      NA_integer_
    } else {
      min(Caracteres, na.rm = TRUE)
    },
    Max_Caracteres = if (all(is.na(Caracteres))) {
      NA_integer_
    } else {
      max(Caracteres, na.rm = TRUE)
    },
    Con_Autor = sum(Autor != "No disponible", na.rm = TRUE),
    Con_Fecha = sum(Fecha_Redaccion != "No disponible", na.rm = TRUE),
    .groups = "drop"
  )

resumen_fuente %>%
  kable(caption = "Resumen por fuente de noticias",
        col.names = c("Fuente", "Cantidad", "Promedio Car.", "Min Car.",
                      "Max Car.", "Con Autor", "Con Fecha"))

Resumen por fuente de noticias
Fuente	Cantidad	Promedio Car.	Min Car.	Max Car.	Con Autor	Con Fecha
El Universal	1	2060	2060	2060	0	0
MVS Noticias	1	1273	1273	1273	0	1
Yucatán.com.mx	2	8689	2166	15212	2	2

4.3 Tabla Interactiva

# Interactive table with per-column filters; the article body is shortened
# to its first 100 characters in the Preview column.
datatable(
  transmute(
    noticias_yucatan,
    Fuente,
    Autor,
    Fecha_Redaccion,
    Caracteres,
    Preview = substr(Texto_Crudo, 1, 100)
  ),
  filter = "top",
  caption = "Datos completos de las noticias",
  options = list(pageLength = 10, scrollX = TRUE)
)

5. Visualizaciones

5.1 Distribución de Caracteres por Fuente

# Total characters extracted per outlet.
# The data are aggregated to one row per outlet first: with several articles
# per outlet, geom_col() stacks the bars while geom_text() would place each
# label at the individual article's value, so labels and bar heights would
# disagree. NA counts (failed downloads) are ignored in the totals.
noticias_yucatan %>%
  group_by(Fuente) %>%
  summarise(Caracteres = sum(Caracteres, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = Fuente, y = Caracteres, fill = Fuente)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = scales::comma(Caracteres)),
            vjust = -0.5, size = 4) +
  labs(
    title = "Longitud de Artículos por Medio",
    subtitle = "Número total de caracteres extraídos",
    x = "Medio de Comunicación",
    y = "Caracteres"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  # Extra head-room so the label above the tallest bar is not clipped.
  scale_y_continuous(labels = scales::comma,
                     expand = expansion(mult = c(0, 0.1)))

Cantidad de caracteres por medio

5.2 Completitud de Datos

# Number of articles for which each field was successfully extracted.
# na.rm = TRUE on every sum: rows from failed downloads hold NA in Autor and
# Fecha_Redaccion, and one NA comparison would otherwise turn the whole count
# into NA and drop that bar from the chart.
datos_completitud <- noticias_yucatan %>%
  summarise(
    `Texto Extraído` = sum(Caracteres > 100, na.rm = TRUE),
    `Con Autor` = sum(Autor != "No disponible", na.rm = TRUE),
    `Con Fecha` = sum(Fecha_Redaccion != "No disponible", na.rm = TRUE)
  ) %>%
  tidyr::pivot_longer(everything(), names_to = "Campo", values_to = "Cantidad")

ggplot(datos_completitud, aes(x = Campo, y = Cantidad, fill = Campo)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = Cantidad), vjust = -0.5, size = 5) +
  labs(
    title = "Completitud de Datos Extraídos",
    subtitle = paste("Total de", nrow(noticias_yucatan), "noticias procesadas"),
    x = "",
    y = "Cantidad"
  ) +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(face = "bold", size = 16)) +
  # Upper limit slightly above the article count leaves room for the labels.
  ylim(0, nrow(noticias_yucatan) + 0.5)

Completitud de los datos extraídos

6. Exportación de Datos

6.1 Crear Directorio de Salida

# Create the "output" directory (relative to the working directory) on first
# run; nothing happens, and nothing is printed, when it already exists.
if (!dir.exists("output")) {
  dir.create("output")
  cat("✓ Directorio 'output' creado\n")
}

6.2 Exportar a Múltiples Formatos

# CSV export, UTF-8 encoded, without the row-name column.
archivo_csv <- "output/noticias_yucatan.csv"
write.csv(noticias_yucatan, archivo_csv, row.names = FALSE, fileEncoding = "UTF-8")
cat(sprintf("✓ Datos exportados a: %s\n", archivo_csv))

## ✓ Datos exportados a: output/noticias_yucatan.csv

# TSV export: same data, tab-separated, UTF-8 encoded, without row names.
archivo_tsv <- "output/noticias_yucatan.tsv"
write.table(noticias_yucatan, archivo_tsv, row.names = FALSE, 
            sep = "\t", fileEncoding = "UTF-8")
cat(sprintf("✓ Datos exportados a: %s\n", archivo_tsv))

## ✓ Datos exportados a: output/noticias_yucatan.tsv

# RDS export (R's native serialization; reload with readRDS()).
archivo_rds <- "output/noticias_yucatan.rds"
saveRDS(noticias_yucatan, archivo_rds)
cat(sprintf("✓ Datos exportados a: %s\n", archivo_rds))

## ✓ Datos exportados a: output/noticias_yucatan.rds

# Excel export (optional): written only when the writexl package is
# installed; otherwise this step is skipped silently.
if (requireNamespace("writexl", quietly = TRUE)) {
  archivo_xlsx <- "output/noticias_yucatan.xlsx"
  writexl::write_xlsx(noticias_yucatan, archivo_xlsx)
  cat(sprintf("✓ Datos exportados a: %s\n", archivo_xlsx))
}

## ✓ Datos exportados a: output/noticias_yucatan.xlsx

7. Conclusiones

cat("\n## Resumen del Análisis\n\n")

Resumen del Análisis

cat("- **Total de noticias analizadas:**", nrow(noticias_yucatan), "\n")

Total de noticias analizadas: 4

cat("- **Medios consultados:**", n_distinct(noticias_yucatan$Fuente), "\n")

Medios consultados: 3

cat("- **Tasa de éxito en extracción:**", 
    round(sum(noticias_yucatan$Caracteres > 100, na.rm = TRUE) / 
            nrow(noticias_yucatan) * 100, 1), "%\n")

Tasa de éxito en extracción: 100 %

cat("- **Promedio de caracteres por noticia:**", 
    round(mean(noticias_yucatan$Caracteres, na.rm = TRUE), 0), "\n")

Promedio de caracteres por noticia: 5178

cat("\n## Recomendaciones\n\n")

Recomendaciones

cat("1. Verificar manualmente las noticias con texto corto\n")

Verificar manualmente las noticias con texto corto

cat("2. Actualizar selectores CSS si la estructura web cambia\n")

Actualizar selectores CSS si la estructura web cambia

cat("3. Considerar aumentar el tiempo de pausa entre solicitudes\n")

Considerar aumentar el tiempo de pausa entre solicitudes

8. Información de Sesión

sessionInfo()

## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=Spanish_Mexico.utf8  LC_CTYPE=Spanish_Mexico.utf8   
## [3] LC_MONETARY=Spanish_Mexico.utf8 LC_NUMERIC=C                   
## [5] LC_TIME=Spanish_Mexico.utf8    
## 
## time zone: America/Mexico_City
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_4.0.0   DT_0.34.0       knitr_1.50      stringr_1.5.2  
## [5] lubridate_1.9.4 dplyr_1.1.4     rvest_1.0.5    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     selectr_0.4-2      compiler_4.5.1    
##  [5] tidyselect_1.2.1   xml2_1.4.0         tidyr_1.3.1        jquerylib_0.1.4   
##  [9] scales_1.4.0       yaml_2.3.10        fastmap_1.2.0      R6_2.6.1          
## [13] labeling_0.4.3     generics_0.1.4     curl_7.0.0         htmlwidgets_1.6.4 
## [17] tibble_3.3.0       bslib_0.9.0        pillar_1.11.1      RColorBrewer_1.1-3
## [21] rlang_1.1.6        cachem_1.1.0       stringi_1.8.7      xfun_0.53         
## [25] S7_0.2.0           sass_0.4.10        timechange_0.3.0   cli_3.6.5         
## [29] withr_3.0.2        magrittr_2.0.4     crosstalk_1.2.2    digest_0.6.37     
## [33] grid_4.5.1         rstudioapi_0.17.1  lifecycle_1.0.4    vctrs_0.6.5       
## [37] writexl_1.5.4      evaluate_1.0.5     glue_1.8.0         farver_2.1.2      
## [41] purrr_1.1.0        rmarkdown_2.30     httr_1.4.7         tools_4.5.1       
## [45] pkgconfig_2.0.3    htmltools_0.5.8.1

Documento generado el: 2025-10-14 12:48:13.079503

Web Scraper de Noticias - Yucatán

A01738675

`14_10_25`

1. Configuración Inicial

1.1 Limpieza del Entorno

1.2 Instalación de Paquetes

1.3 Carga de Librerías

2. Funciones de Scraping

2.1 Función: Identificar Fuente

2.2 Función: Extraer Fecha

2.3 Función: Extraer Autor

2.4 Función: Extraer Texto del Artículo

2.5 Función Principal: Scraping de Noticia Individual

2.6 Función: Scraping por Lote

3. Extracción de Datos

3.1 URLs a Procesar

3.2 Ejecutar Scraping

3.3 Vista Previa de Datos

4. Análisis de Datos

4.1 Resumen General

4.2 Resumen por Fuente

4.3 Tabla Interactiva

5. Visualizaciones

5.1 Distribución de Caracteres por Fuente

5.2 Completitud de Datos

6. Exportación de Datos

6.1 Crear Directorio de Salida

6.2 Exportar a Múltiples Formatos

7. Conclusiones

Resumen del Análisis

Recomendaciones

8. Información de Sesión