Scraping Web

Librerías

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(xml2)
library(rvest)

## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding

library(lubridate)
library(stringr)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

Definir búsqueda

# Search phrase sent to the Google News RSS search endpoint.
query <- "inteligencia artificial"
# Percent-encode the whole phrase so that accented letters and reserved
# characters such as "&", "#" or "+" cannot corrupt the query string.
# Spaces (encoded as "%20") are then written as "+", which keeps the URL
# for the current phrase identical to the previous hand-built one.
query_clean <- gsub(
  "%20", "+",
  URLencode(enc2utf8(query), reserved = TRUE),
  fixed = TRUE
)
url <- paste0("https://news.google.com/rss/search?q=", query_clean)
url

## [1] "https://news.google.com/rss/search?q=inteligencia+artificial"

Request RSS

# Download the feed located at `url` and parse it into an xml2 document.
rss <- xml2::read_xml(url)
# Display the parsed document.
print(rss)

## {xml_document}
## <rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
## [1] <channel>\n  <generator>NFE/5.0</generator>\n  <title>"inteligencia artif ...

Extraer nodos

# Select every <item> element found anywhere in the parsed feed.
items <- rss %>% xml_find_all("//item")
# Number of <item> elements that were found.
length(items)

## [1] 100

Extraer campos

# Read one value per <item> for each field.
# xml_find_first() always returns exactly one result per input node (a
# missing node, read as NA by xml_text(), when the child element is absent).
# xml_find_all() on the node set would instead silently skip items lacking a
# child, producing vectors of different lengths or rows shifted out of
# alignment with each other.
titles <- xml_text(xml_find_first(items, "./title"))
links  <- xml_text(xml_find_first(items, "./link"))
dates  <- xml_text(xml_find_first(items, "./pubDate"))
desc   <- xml_text(xml_find_first(items, "./description"))

Crear DataFrame

# Assemble the extracted character vectors into one tibble (one row per
# feed item) and normalise the column names with janitor::clean_names().
df <- clean_names(
  tibble(title = titles, link = links, date = dates, description = desc)
)

Limpieza de fecha

# Convert the raw pubDate strings to date-times.
# RSS dates are written day-month-year with English names, e.g.
# "Thu, 01 May 2025 10:00:00 GMT", so the day-first parser is required;
# ymd_hms() fails on every value and leaves the whole column NA.
# locale = "C" forces English month/weekday names even when the session
# runs under a non-English locale.
df <- df %>%
  mutate(
    date = dmy_hms(date, tz = "UTC", locale = "C")
  )

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date = ymd_hms(date)`.
## Caused by warning:
## ! All formats failed to parse. No formats found.

Feature engineering

# Derive simple features from the headline text.
# NOTE: at this point `title` still carries the trailing " - source" suffix
# (it is stripped in a later step), so both features include it.
df <- df %>%
  mutate(
    # Number of characters in the title.
    title_length = nchar(title),
    # TRUE when the title contains at least one digit. The pattern must be
    # written "\\d" in an R string; "\\\\d+" is the regex \\d+, which looks
    # for a literal backslash followed by "d" and therefore never matches.
    has_number = str_detect(title, "\\d")
  )

Extraer fuente

# Extract the publisher name that follows the headline as " - source".
# The leading greedy ".*" makes the LAST " - " the separator, so a headline
# that itself contains " - " no longer leaks into `source`. Titles without
# any " - " separator yield NA.
df <- df %>%
  mutate(
    source = str_match(title, "^.* - (.+)$")[, 2]
  )

Limpieza título

# Strip the trailing " - source" suffix from the headline.
# The greedy capture keeps everything up to the LAST " - ", so headlines
# that contain " - " themselves are no longer truncated at the first one.
# Titles without a " - " separator are left unchanged.
df <- df %>%
  mutate(
    title = str_replace(title, "^(.*) - .*$", "\\1")
  )

EDA

# Per-column summary of the data frame.
df %>% summary()

##     title               link                date     description       
##  Length:100         Length:100         Min.   :NA    Length:100        
##  Class :character   Class :character   1st Qu.:NA    Class :character  
##  Mode  :character   Mode  :character   Median :NA    Mode  :character  
##                                        Mean   :NaN                     
##                                        3rd Qu.:NA                      
##                                        Max.   :NA                      
##                                        NA's   :100                     
##   title_length    has_number         source         
##  Min.   : 33.00   Mode :logical   Length:100        
##  1st Qu.: 64.00   FALSE:100       Class :character  
##  Median : 83.00                   Mode  :character  
##  Mean   : 86.12                                     
##  3rd Qu.:104.00                                     
##  Max.   :199.00                                     
##

# Histogram of title lengths, using geom_histogram()'s default binning.
ggplot(data = df, mapping = aes(x = title_length)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Frecuencia por fuente

# Number of rows per source, most frequent first.
count(df, source, sort = TRUE)

## # A tibble: 77 × 2
##    source                              n
##    <chr>                           <int>
##  1 UNESCO                              6
##  2 The New York Times                  4
##  3 Universidad Pontificia Comillas     4
##  4 KFF Health News                     3
##  5 Atalayar                            2
##  6 CalMatters                          2
##  7 Capital-Riesgo.es                   2
##  8 Esade                               2
##  9 European Parliament                 2
## 10 Frontiers                           2
## # ℹ 67 more rows

Top noticias

# First ten rows after ordering by date, latest first.
head(arrange(df, desc(date)), 10)

## # A tibble: 10 × 7
##    title    link  date   description              title_length has_number source
##    <chr>    <chr> <dttm> <chr>                           <int> <lgl>      <chr> 
##  1 delitos… http… NA     "<a href=\"https://news…           58 FALSE      WFMZ.…
##  2 Inicio … http… NA     "<a href=\"https://news…          122 FALSE      contx…
##  3 Most ph… http… NA     "<a href=\"https://news…           57 FALSE      there…
##  4 UNESCO … http… NA     "<a href=\"https://news…           78 FALSE      UNESCO
##  5 Not onl… http… NA     "<a href=\"https://news…          104 FALSE      Diari…
##  6 Adolesc… http… NA     "<a href=\"https://news…           72 FALSE      KGET.…
##  7 Cómo de… http… NA     "<a href=\"https://news…           81 FALSE      Undet…
##  8 Intel s… http… NA     "<a href=\"https://news…           93 FALSE      Tradi…
##  9 Artific… http… NA     "<a href=\"https://news…          103 FALSE      Ayunt…
## 10 AI vs c… http… NA     "<a href=\"https://news…           34 FALSE      Atala…

Análisis de texto

library(tidytext)

# Split each title into one lower-cased word per row, then drop stop words.
# The join key is stated explicitly so the result does not depend on which
# columns the two tables happen to share (and no "Joining with" message is
# emitted).
# NOTE(review): tidytext's `stop_words` holds English lexicons only, so
# Spanish function words such as "de", "la" or "en" are not removed here.
words <- df %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word")

## Joining with `by = join_by(word)`

# Occurrences of each remaining word, most frequent first.
count(words, word, sort = TRUE)

## # A tibble: 453 × 2
##    word             n
##    <chr>        <int>
##  1 artificial      57
##  2 ai              46
##  3 intelligence    36
##  4 inteligencia    19
##  5 de              18
##  6 la              14
##  7 en               7
##  8 los              6
##  9 2026             4
## 10 act              4
## # ℹ 443 more rows

Guardar

# Save the data frame as CSV in the working directory, without the
# row-name column.
write.csv(x = df, file = "google_news.csv", row.names = FALSE)

Validación

# Abort with an error if the data frame ended up with no rows.
# NOTE(review): this check runs after the CSV has already been written;
# consider moving it before the save step.
stopifnot(nrow(df) > 0)

Scraping Web

Marcelo Callao Pimentel

2025-05-01

Librerías

Definir búsqueda

Request RSS

Extraer nodos

Extraer campos

Crear DataFrame

Limpieza de fecha

Feature engineering

Extraer fuente

Limpieza título

EDA

Frecuencia por fuente

Top noticias

Análisis de texto

Guardar

Validación