SCRAPING: se va a usar la función “read_html” -> librerías necesarias

library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(rvest)

OPCION 1

PRIMER PASO: definir la url

# Wikipedia article that holds the Democracy Index tables.
link1 <- "https://en.wikipedia.org/wiki/The_Economist_Democracy_Index"

SEGUNDO PASO: leer el contenido del link

# Fetch the page at link1 and parse its HTML for later querying.
webpage <- read_html(link1)

TERCER PASO: extraer la tabla usando un XPath o CSS

# Locate the index table inside the parsed article and convert it to a data
# frame.
# NOTE(review): the XPath is positional (ninth <div> inside the first content
# <div>), so it stops matching if the article layout changes — confirm it
# still points at the intended table.
tabla_xpath <- '//*[@id="mw-content-text"]/div[1]/div[9]/table'

# html_element() is the current rvest name for the superseded html_node().
tabla_nodo <- webpage |>
  html_element(xpath = tabla_xpath)

# html_element() yields an "xml_missing" object when nothing matches; fail
# here with a clear message instead of passing a missing node on to
# html_table().
if (inherits(tabla_nodo, "xml_missing")) {
  stop("No table found at XPath: ", tabla_xpath, call. = FALSE)
}

data1 <- html_table(tabla_nodo)

head(data1)

## # A tibble: 6 × 20
##   Region    `2023 rank` Country `Regime type` `2023` `2022` `2021` `2020` `2019`
##   <chr>           <int> <chr>   <chr>          <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 North Am…          13 Canada  Full democra…   8.69   8.88   8.87   9.24   9.22
## 2 North Am…          29 United… Flawed democ…   7.85   7.85   7.85   7.92   7.96
## 3 Western …          19 Austria Full democra…   8.28   8.2    8.07   8.16   8.29
## 4 Western …          36 Belgium Flawed democ…   7.64   7.64   7.51   7.51   7.64
## 5 Western …          37 Cyprus  Flawed democ…   7.38   7.38   7.43   7.56   7.59
## 6 Western …           6 Denmark Full democra…   9.28   9.28   9.09   9.15   9.22
## # ℹ 11 more variables: `2018` <dbl>, `2017` <dbl>, `2016` <dbl>, `2015` <dbl>,
## #   `2014` <dbl>, `2013` <dbl>, `2012` <dbl>, `2011` <dbl>, `2010` <dbl>,
## #   `2008` <dbl>, `2006` <dbl>

OPCION 2

RVEST: ¿Qué pasa si los datos que quiero sacar no están en formato de tabla?

# Officials listing of the Peruvian presidency: store the URL, then fetch and
# parse the page.
link2 <- "https://www.gob.pe/institucion/presidencia/funcionarios"
webpage2 <- read_html(link2)

PRIMER PASO: CSS DEL NOMBRE

# CSS selector used to pick the headings that hold each official's name.
css_nombre <- "h3.text-2xl"

# html_elements() is the current rvest name for the superseded html_nodes().
nombre_html <- html_elements(webpage2, css_nombre)

# Extract the text content of every matched heading.
name_text <- html_text(nombre_html)
head(name_text)

## [1] "Dina Ercilia Boluarte Zegarra"   "Enrique Ernesto Vilchez Vilchez"
## [3] "Antonio Mirril Ramos Bernaola"   "Fredy Hernán Hinojosa Angulo"   
## [5] "Gabriela Sedano Barreto"         "José Joshua Curay Ferrer"

AHORA CON EL CARGO:

# CSS selector used to pick the paragraphs that hold each official's position.
# NOTE(review): "p" matches every <p> element on the page, not only the
# position lines — confirm the result lines up one-to-one with the names.
css_cargo <- "p"

# html_elements() is the current rvest name for the superseded html_nodes().
cargo_html <- html_elements(webpage2, css_cargo)

# Extract the text content of every matched paragraph.
cargo_text <- html_text(cargo_html)
head(cargo_text)

## [1] "Presidenta de la República del Perú"                        
## [2] "Secretario General"                                         
## [3] "Subsecretario General"                                      
## [4] "Jefe del Gabinete Técnico de la Presidencia de la República"
## [5] "Jefa del Órgano de Control Institucional"                   
## [6] "Director General de la Oficina de Protocolo"

PASO 2: ARMAMOS LA BASE

# Names and positions come from two independent selectors, so check that they
# pair up one-to-one: data.frame() would otherwise silently recycle the
# shorter vector whenever one length is a multiple of the other.
stopifnot(length(name_text) == length(cargo_text))

# Assemble the scraped vectors into a two-column table.
data2 <- data.frame(NOMBRE = name_text, CARGO = cargo_text)
head(data2)

##                            NOMBRE
## 1   Dina Ercilia Boluarte Zegarra
## 2 Enrique Ernesto Vilchez Vilchez
## 3   Antonio Mirril Ramos Bernaola
## 4    Fredy Hernán Hinojosa Angulo
## 5         Gabriela Sedano Barreto
## 6        José Joshua Curay Ferrer
##                                                         CARGO
## 1                         Presidenta de la República del Perú
## 2                                          Secretario General
## 3                                       Subsecretario General
## 4 Jefe del Gabinete Técnico de la Presidencia de la República
## 5                    Jefa del Órgano de Control Institucional
## 6                 Director General de la Oficina de Protocolo

PRACTICA

Daniel Sánchez

2024-10-07

OPCION 1

OPCION 2