Webscraping

## Webscraping

Este documento presenta ejemplos prácticos de cómo utilizar R y la librería rvest para realizar scraping de datos de páginas web.

# Cargar las librerías necesarias
library(rvest)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

### Ejemplo 1: Scraping de Países y Datos Geográficos

#### Extraer Datos de Países

# Download and parse the countries page, then print the parsed document
# to inspect its top-level nodes
doc <- "https://www.scrapethissite.com/pages/simple/" %>%
  read_html()
print(doc)

## {html_document}
## <html lang="en">
##  [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF- ...
##  [2] <body>\n    <nav id="site-nav"><div class="container">\n                 ...
##  [3] <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery. ...
##  [4] <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/js/bootstra ...
##  [5] <script src="https://cdnjs.cloudflare.com/ajax/libs/pnotify/2.1.0/pnotif ...
##  [6] <link href="https://cdnjs.cloudflare.com/ajax/libs/pnotify/2.1.0/pnotify ...
##  [7] <script type="text/javascript">\n    \n    PNotify.prototype.options.sty ...
##  [8] <script type="text/javascript">\n    $("video").hover(function() {\n     ...
##  [9] <script>\n    (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r] ...
## [10] <script>\n  !function(f,b,e,v,n,t,s){if(f.fbq)return;n=f.fbq=function(){ ...
## [11] <noscript><img height="1" width="1" style="display:none" src="https://ww ...
## [12] <script type="text/javascript">\n    /* <![CDATA[ */\n    var google_con ...
## [13] <script type="text/javascript" src="//www.googleadservices.com/pagead/co ...
## [14] <noscript>\n    <div style="display:inline;">\n    <img height="1" width ...
## [15] <script async src="https://www.googletagmanager.com/gtag/js?id=AW-950945 ...
## [16] <script>\n   window.dataLayer = window.dataLayer || [];\n   function gta ...

# Return the rendered text of every node in `pagina` that matches the CSS
# selector `selector`, as a character vector.
extraer_texto <- function(pagina, selector) {
  pagina %>%
    html_elements(selector) %>%
    html_text2()
}

# Country names
paises <- extraer_texto(doc, ".country-name")

# Capitals
capitales <- extraer_texto(doc, ".country-capital")

# Population, converted to numeric so it can be sorted and summarised as a
# number (it used to stay as character, unlike `area`)
poblacion <- as.numeric(extraer_texto(doc, ".country-population"))

# Area, converted to numeric
area <- as.numeric(extraer_texto(doc, ".country-area"))

# Assemble the extracted vectors into a single data frame, one row per
# scraped entry
df_paises <- data.frame(
  pais = paises,
  capital = capitales,
  poblacion = poblacion,
  area = area
)

# Preview the first six rows
head(df_paises, n = 6)

##                   pais          capital poblacion   area
## 1              Andorra Andorra la Vella     84000    468
## 2 United Arab Emirates        Abu Dhabi   4975593  82880
## 3          Afghanistan            Kabul  29121286 647500
## 4  Antigua and Barbuda       St. John's     86754    443
## 5             Anguilla       The Valley     13254    102
## 6              Albania           Tirana   2986952  28748

### Ejemplo 2: Scraping de Hockey

#### Extraer Datos de Equipos de Hockey

# Download and parse the hockey teams page, then print the parsed document
# to inspect its top-level nodes
doc <- "https://www.scrapethissite.com/pages/forms/" %>%
  read_html()
print(doc)

## {html_document}
## <html lang="en">
##  [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF- ...
##  [2] <body>\n    <nav id="site-nav"><div class="container">\n                 ...
##  [3] <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery. ...
##  [4] <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/js/bootstra ...
##  [5] <script type="text/javascript">\n        $(document).ready( function(){\ ...
##  [6] <script src="https://cdnjs.cloudflare.com/ajax/libs/pnotify/2.1.0/pnotif ...
##  [7] <link href="https://cdnjs.cloudflare.com/ajax/libs/pnotify/2.1.0/pnotify ...
##  [8] <script type="text/javascript">\n    \n    PNotify.prototype.options.sty ...
##  [9] <script type="text/javascript">\n    $("video").hover(function() {\n     ...
## [10] <script>\n    (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r] ...
## [11] <script>\n  !function(f,b,e,v,n,t,s){if(f.fbq)return;n=f.fbq=function(){ ...
## [12] <noscript><img height="1" width="1" style="display:none" src="https://ww ...
## [13] <script type="text/javascript">\n    /* <![CDATA[ */\n    var google_con ...
## [14] <script type="text/javascript" src="//www.googleadservices.com/pagead/co ...
## [15] <noscript>\n    <div style="display:inline;">\n    <img height="1" width ...
## [16] <script async src="https://www.googletagmanager.com/gtag/js?id=AW-950945 ...
## [17] <script>\n   window.dataLayer = window.dataLayer || [];\n   function gta ...

# Extract the first table found on the page that is already loaded in `doc`
df_hockey <- html_table(doc)[[1]]

# Number of result pages to download and the URL prefix they share
n_paginas <- 24L
url_base <- "https://www.scrapethissite.com/pages/forms/?page_num="

# Download every page and keep its first table in a preallocated list.
# Collecting the pieces and binding them once avoids re-copying the
# accumulated data frame on every iteration, as rbind() inside the loop did.
paginas <- vector("list", n_paginas)
for (i in seq_len(n_paginas)) {
  url <- paste0(url_base, i)
  doc <- read_html(url)

  # First table of the current page
  df_pagina <- html_table(doc)[[1]]
  paginas[[i]] <- df_pagina
}

# Stack the per-page tables into a single data frame
df_hockey_total <- bind_rows(paginas)

# Preview the first rows of the combined data
head(df_hockey_total)

## # A tibble: 6 × 9
##   `Team Name`         Year  Wins Losses `OT Losses` `Win %` `Goals For (GF)`
##   <chr>              <int> <int>  <int>       <int>   <dbl>            <int>
## 1 Boston Bruins       1990    44     24          NA   0.55               299
## 2 Buffalo Sabres      1990    31     30          NA   0.388              292
## 3 Calgary Flames      1990    46     26          NA   0.575              344
## 4 Chicago Blackhawks  1990    49     23          NA   0.613              284
## 5 Detroit Red Wings   1990    34     38          NA   0.425              273
## 6 Edmonton Oilers     1990    37     37          NA   0.463              272
## # ℹ 2 more variables: `Goals Against (GA)` <int>, `+ / -` <int>

# Total number of rows gathered across all downloaded pages
dim(df_hockey_total)[1]

## [1] 582

### Ejemplo 3: Scraping de Películas en Filmaffinity

#### Extraer Datos de Películas

# Download and parse the Filmaffinity listing page (topcat, id = new_netflix)
doc <- read_html("https://filmaffinity.com/es/topcat.php?id=new_netflix")

# Release date text of each entry, kept as character
estreno <- doc %>%
  html_elements(".date") %>%
  html_text2()

# Running-time text of each entry
duracion <- doc %>%
  html_elements(".duration") %>%
  html_text2()

# Remove the literal " min." suffix and convert to numeric.
# `fixed = TRUE` makes the "." a literal dot; read as a regular expression
# it would match any character.
duraciones <- gsub(" min.", "", duracion, fixed = TRUE) %>%
  as.numeric()

# Titles: text of the <h3> elements nested inside each ".mc-right" node
titulos <- html_text2(html_elements(html_elements(doc, ".mc-right"), "h3"))

# Classify each entry: a title containing "(" is labelled "Serie de TV",
# any other title "Película".
# NOTE(review): in the rendered output every row shown is labelled
# "Película"; confirm that the scraped titles really carry a parenthesised
# marker.
tiene_parentesis <- regexpr("(", titulos, fixed = TRUE) > 0
tipos <- ifelse(tiene_parentesis, "Serie de TV", "Película")

# Nodes with class "mc-data"; both the year and the country are read from them
fichas <- html_elements(doc, ".mc-data")

# Year: first four characters of each node's text, converted to numeric
año <- as.numeric(substr(html_text2(fichas), 1, 4))

# Country: "alt" attribute of the ".nflag" elements inside those nodes.
# Note: `paises` is not included as a column when `df_peliculas` is built.
paises <- html_attr(html_elements(fichas, ".nflag"), "alt")

# Average rating: replace the decimal comma with a dot, then convert to numeric
puntuaciones <- doc %>%
  html_elements(".avg-rating") %>%
  html_text2()

puntuaciones <- gsub(",", ".", puntuaciones, fixed = TRUE) %>%
  as.numeric()

# Vote count: remove the dots (presumably thousands separators) and convert
# to numeric. The value used to stay as character even though the dots were
# stripped, so it could not be sorted or summarised as a number.
votos <- doc %>%
  html_elements(".rat-count") %>%
  html_text2()

votos <- gsub(".", "", votos, fixed = TRUE) %>%
  as.numeric()

# Assemble the extracted vectors into a single data frame, one row per
# scraped entry
df_peliculas <- data.frame(
  titulo = titulos,
  año = año,
  estreno = estreno,
  tipo = tipos,
  puntuacion = puntuaciones,
  votos = votos,
  duracion = duraciones
)

# Preview the first six rows
head(df_peliculas, n = 6)

##                          titulo  año                 estreno     tipo
## 1     Arcane: League of Legends 2021  7 de noviembre de 2021 Película
## 2                  Black Mirror 2011     15 de junio de 2023 Película
## 3            Nuestro planeta II 2023     14 de junio de 2023 Película
## 4        Samurái de ojos azules 2023  3 de noviembre de 2023 Película
## 5 Cortar por la línea de puntos 2021 17 de noviembre de 2021 Película
## 6                       Monster 2004      1 de enero de 2023 Película
##   puntuacion votos duracion
## 1        8.2 13353       40
## 2        8.2 36543       60
## 3        8.1   461       50
## 4        8.1  6340       45
## 5        8.1  8501       18
## 6        7.9  8815       22

Webscraping

2024-10-02