Librerías
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(xml2)
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(lubridate)
library(stringr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
Definir búsqueda
# Build the Google News RSS search URL for `query`.
# The query is percent-encoded first so that accents and reserved characters
# ("&", "#", "+", ...) cannot corrupt the query string. Encoded spaces ("%20")
# are then written as "+", so a plain multi-word query yields the same URL
# that a simple space-to-plus substitution would give.
query <- "inteligencia artificial"
query_clean <- gsub(
  "%20", "+",
  utils::URLencode(query, reserved = TRUE),
  fixed = TRUE
)
url <- paste0("https://news.google.com/rss/search?q=", query_clean)
url
## [1] "https://news.google.com/rss/search?q=inteligencia+artificial"
Crear DataFrame
# Combine the vectors `titles`, `links`, `dates` and `desc` (all created
# outside this section) into the columns of a tibble, then pass the result
# through janitor's clean_names() to normalise the column names.
df <- clean_names(
  tibble(
    title = titles,
    link = links,
    date = dates,
    description = desc
  )
)
Limpieza de fecha
# Parse the publication timestamp into a date-time column.
# NOTE(review): assumes `date` still holds the raw RSS <pubDate> strings,
# e.g. "Mon, 06 Jan 2025 08:00:00 GMT" -- confirm against the extraction step.
# That layout is day-month-year, so a year-first parser cannot read it and
# yields NA for every row; dmy_hms() matches the field order.
# `locale = "C"` makes the English month abbreviations parse regardless of
# the session's LC_TIME setting.
df <- df %>%
  mutate(
    date = dmy_hms(date, tz = "UTC", locale = "C")
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `date = ymd_hms(date)`.
## Caused by warning:
## ! All formats failed to parse. No formats found.
Feature engineering
# Derive simple per-title features:
#   title_length - number of characters in `title` as it stands at this point
#   has_number   - TRUE when the title contains at least one digit
# The digit pattern must be the R string "\\d" (regex \d). With four
# backslashes the regex becomes a literal backslash followed by "d", which
# never occurs in a headline, so the flag would be FALSE for every row.
df <- df %>%
  mutate(
    title_length = nchar(title),
    has_number = str_detect(title, "\\d")
  )
Limpieza del título
# Strip the trailing " - <publisher>" suffix from each title.
# NOTE(review): assumes the feed appends the publisher after the last " - "
# in the title -- confirm against the raw data.
# Only the final " - " segment is removed: the negative lookahead rejects any
# " - " that is followed by another one, so headlines that themselves contain
# a dash separator are no longer cut at the first occurrence. Titles without
# " - " are left unchanged.
df <- df %>%
  mutate(
    title = str_remove(title, " - (?!.* - ).*$")
  )
EDA
# Print per-column summary statistics for a quick overview of the data.
df %>% summary()
## title link date description
## Length:100 Length:100 Min. :NA Length:100
## Class :character Class :character 1st Qu.:NA Class :character
## Mode :character Mode :character Median :NA Mode :character
## Mean :NaN
## 3rd Qu.:NA
## Max. :NA
## NA's :100
## title_length has_number source
## Min. : 33.00 Mode :logical Length:100
## 1st Qu.: 64.00 FALSE:100 Class :character
## Median : 83.00 Mode :character
## Mean : 86.12
## 3rd Qu.:104.00
## Max. :199.00
##
# Histogram of title lengths. The bin count is stated explicitly (30 is the
# value stat_bin() would otherwise fall back to), so the plot is unchanged but
# ggplot2 no longer emits its "pick better value" message.
ggplot(df, aes(x = title_length)) +
  geom_histogram(bins = 30)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Frecuencia por fuente
# Number of rows per `source`, most frequent first.
# (`source` is a column added to `df` outside this section.)
count(df, source, sort = TRUE)
## # A tibble: 77 × 2
## source n
## <chr> <int>
## 1 UNESCO 6
## 2 The New York Times 4
## 3 Universidad Pontificia Comillas 4
## 4 KFF Health News 3
## 5 Atalayar 2
## 6 CalMatters 2
## 7 Capital-Riesgo.es 2
## 8 Esade 2
## 9 European Parliament 2
## 10 Frontiers 2
## # ℹ 67 more rows
Top noticias
# Show the ten rows with the most recent `date`; arrange() places rows with a
# missing date after all dated rows.
head(arrange(df, desc(date)), n = 10)
## # A tibble: 10 × 7
## title link date description title_length has_number source
## <chr> <chr> <dttm> <chr> <int> <lgl> <chr>
## 1 delitos… http… NA "<a href=\"https://news… 58 FALSE WFMZ.…
## 2 Inicio … http… NA "<a href=\"https://news… 122 FALSE contx…
## 3 Most ph… http… NA "<a href=\"https://news… 57 FALSE there…
## 4 UNESCO … http… NA "<a href=\"https://news… 78 FALSE UNESCO
## 5 Not onl… http… NA "<a href=\"https://news… 104 FALSE Diari…
## 6 Adolesc… http… NA "<a href=\"https://news… 72 FALSE KGET.…
## 7 Cómo de… http… NA "<a href=\"https://news… 81 FALSE Undet…
## 8 Intel s… http… NA "<a href=\"https://news… 93 FALSE Tradi…
## 9 Artific… http… NA "<a href=\"https://news… 103 FALSE Ayunt…
## 10 AI vs c… http… NA "<a href=\"https://news… 34 FALSE Atala…
Análisis de texto
library(tidytext)
# Split each title into one lower-cased word per row and drop stop words.
# The join key is named explicitly so the result does not depend on which
# columns happen to be shared, and dplyr no longer prints its "Joining with"
# message.
# NOTE(review): `stop_words` is tidytext's English list, so Spanish function
# words ("de", "la", "en", ...) stay in `words`; add a Spanish list here if
# those should be excluded too.
words <- df %>%
  unnest_tokens(word, title) %>%
  anti_join(stop_words, by = "word")
## Joining with `by = join_by(word)`
# Frequency of each remaining word, most common first.
count(words, word, sort = TRUE)
## # A tibble: 453 × 2
## word n
## <chr> <int>
## 1 artificial 57
## 2 ai 46
## 3 intelligence 36
## 4 inteligencia 19
## 5 de 18
## 6 la 14
## 7 en 7
## 8 los 6
## 9 2026 4
## 10 act 4
## # ℹ 443 more rows
Guardar
# Write the final table to a CSV file in the working directory, without the
# row-name column.
write.csv(x = df, file = "google_news.csv", row.names = FALSE)
Validación
# Sanity check: stop with an error if `df` has no rows.
# NOTE(review): this only checks the row count, and it runs after the CSV has
# already been written -- consider checking earlier and also verifying that
# `date` is not entirely NA.
stopifnot(nrow(df) > 0)