Di dalam package rvest terdapat fungsi html_table yang berguna untuk parsing tabel HTML. Fungsi ini mampu mengubah data berupa tabel di sebuah website HTML menjadi sebuah dataframe. Berikut ini adalah contoh scraping tabel virus corona di berbagai negara di dunia pada web target https://www.worldometers.info/coronavirus/

loading library

library(rvest)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose

membaca file HTML dari halaman website

# Target page that hosts the per-country coronavirus statistics table.
url <- "https://www.worldometers.info/coronavirus/"
# Download the page and parse it into a document that rvest can query.
html <- read_html(url)

membuat data frame

# Select the first element with class "main_table_countries_div" and
# convert it to a data frame with html_table().
# html_element() is used because rvest 1.0.0 superseded html_node().
corona_table <- html %>%
  html_element(".main_table_countries_div") %>%
  html_table()

menampilkan strukturnya dan merapikan

# Inspect the column names and column types of the scraped table.
utils::str(corona_table)
## tibble [238 × 22] (S3: tbl_df/tbl/data.frame)
##  $ #                  : int [1:238] NA NA NA NA NA NA NA NA 1 2 ...
##  $ Country,Other      : chr [1:238] "North America" "Asia" "South America" "Europe" ...
##  $ TotalCases         : chr [1:238] "40,229,771" "53,913,017" "30,921,662" "47,283,948" ...
##  $ NewCases           : chr [1:238] "+7,586" "+113,020" "+2,571" "+22,757" ...
##  $ TotalDeaths        : chr [1:238] "909,007" "753,648" "951,683" "1,088,919" ...
##  $ NewDeaths          : chr [1:238] "+330" "+1,045" "+124" "+776" ...
##  $ TotalRecovered     : chr [1:238] "33,494,556" "51,134,915" "28,028,123" "44,717,303" ...
##  $ NewRecovered       : chr [1:238] "+6,296" "+137,816" "+2,456" "+37,659" ...
##  $ ActiveCases        : chr [1:238] "5,826,208" "2,024,454" "1,941,856" "1,477,726" ...
##  $ Serious,Critical   : chr [1:238] "11,735" "26,928" "32,194" "9,644" ...
##  $ Tot Cases/1M pop   : chr [1:238] "" "" "" "" ...
##  $ Deaths/1M pop      : chr [1:238] "" "" "" "" ...
##  $ TotalTests         : chr [1:238] "" "" "" "" ...
##  $ Tests/1M pop       : chr [1:238] "" "" "" "" ...
##  $ Population         : chr [1:238] "" "" "" "" ...
##  $ Continent          : chr [1:238] "North America" "Asia" "South America" "Europe" ...
##  $ 1 Caseevery X ppl  : chr [1:238] "" "" "" "" ...
##  $ 1 Deathevery X ppl : chr [1:238] "" "" "" "" ...
##  $ 1 Testevery X ppl  : int [1:238] NA NA NA NA NA NA NA NA 1 4 ...
##  $ New Cases/1M pop   : num [1:238] NA NA NA NA NA NA NA NA 3 37 ...
##  $ New Deaths/1M pop  : num [1:238] NA NA NA NA NA NA NA NA 0.2 NA ...
##  $ Active Cases/1M pop: chr [1:238] "" "" "" "" ...
# Auto-print the table to preview its first rows before cleaning.
corona_table
## # A tibble: 238 x 22
##      `#` `Country,Other` TotalCases  NewCases   TotalDeaths NewDeaths
##    <int> <chr>           <chr>       <chr>      <chr>       <chr>    
##  1    NA "North America" 40,229,771  "+7,586"   909,007     "+330"   
##  2    NA "Asia"          53,913,017  "+113,020" 753,648     "+1,045" 
##  3    NA "South America" 30,921,662  "+2,571"   951,683     "+124"   
##  4    NA "Europe"        47,283,948  "+22,757"  1,088,919   "+776"   
##  5    NA "Africa"        5,126,718   "+742"     135,876     "+4"     
##  6    NA "Oceania"       70,782      "+136"     1,256       ""       
##  7    NA ""              721         ""         15          ""       
##  8    NA "World"         177,546,619 "+146,812" 3,840,404   "+2,279" 
##  9     1 "USA"           34,353,097  "+912"     615,770     "+53"    
## 10     2 "India"         29,684,104  "+51,843"  379,601     ""       
## # … with 228 more rows, and 16 more variables: TotalRecovered <chr>,
## #   NewRecovered <chr>, ActiveCases <chr>, Serious,Critical <chr>,
## #   Tot\U00a0Cases/1M pop <chr>, Deaths/1M pop <chr>, TotalTests <chr>,
## #   Tests/1M pop <chr>, Population <chr>, Continent <chr>,
## #   1 Caseevery X ppl <chr>, 1 Deathevery X ppl <chr>, 1 Testevery X ppl <int>,
## #   New Cases/1M pop <dbl>, New Deaths/1M pop <dbl>, Active Cases/1M pop <chr>
# Drop the "#" rank column by name rather than by position, so that
# re-running this step cannot remove a different column once "#" is gone.
corona_table <- corona_table[setdiff(names(corona_table), "#")]
# Auto-print the table to check the result.
corona_table
## # A tibble: 238 x 21
##    `Country,Other` TotalCases  NewCases   TotalDeaths NewDeaths TotalRecovered
##    <chr>           <chr>       <chr>      <chr>       <chr>     <chr>         
##  1 "North America" 40,229,771  "+7,586"   909,007     "+330"    33,494,556    
##  2 "Asia"          53,913,017  "+113,020" 753,648     "+1,045"  51,134,915    
##  3 "South America" 30,921,662  "+2,571"   951,683     "+124"    28,028,123    
##  4 "Europe"        47,283,948  "+22,757"  1,088,919   "+776"    44,717,303    
##  5 "Africa"        5,126,718   "+742"     135,876     "+4"      4,573,060     
##  6 "Oceania"       70,782      "+136"     1,256       ""        67,669        
##  7 ""              721         ""         15          ""        706           
##  8 "World"         177,546,619 "+146,812" 3,840,404   "+2,279"  162,016,332   
##  9 "USA"           34,353,097  "+912"     615,770     "+53"     28,579,129    
## 10 "India"         29,684,104  "+51,843"  379,601     ""        28,454,938    
## # … with 228 more rows, and 15 more variables: NewRecovered <chr>,
## #   ActiveCases <chr>, Serious,Critical <chr>, Tot\U00a0Cases/1M pop <chr>,
## #   Deaths/1M pop <chr>, TotalTests <chr>, Tests/1M pop <chr>,
## #   Population <chr>, Continent <chr>, 1 Caseevery X ppl <chr>,
## #   1 Deathevery X ppl <chr>, 1 Testevery X ppl <int>, New Cases/1M pop <dbl>,
## #   New Deaths/1M pop <dbl>, Active Cases/1M pop <chr>
# Discard the first eight rows (the continent aggregates, one row with an
# empty name, and "World") and keep the rows that follow them.
corona_table <- corona_table[-seq_len(8), ]

mencetak data frame

# Print the cleaned table explicitly.
print(corona_table)
## # A tibble: 230 x 21
##    `Country,Other` TotalCases NewCases  TotalDeaths NewDeaths TotalRecovered
##    <chr>           <chr>      <chr>     <chr>       <chr>     <chr>         
##  1 USA             34,353,097 "+912"    615,770     "+53"     28,579,129    
##  2 India           29,684,104 "+51,843" 379,601     ""        28,454,938    
##  3 Brazil          17,543,853 ""        491,164     ""        15,944,646    
##  4 France          5,744,589  ""        110,530     ""        5,511,166     
##  5 Turkey          5,342,028  ""        48,879      ""        5,211,022     
##  6 Russia          5,249,990  "+13,397" 127,576     "+396"    4,828,500     
##  7 UK              4,581,006  ""        127,917     ""        4,292,182     
##  8 Italy           4,248,432  "+1,400"  127,153     "+52"     4,019,424     
##  9 Argentina       4,172,742  ""        86,615      ""        3,771,968     
## 10 Colombia        3,802,052  ""        96,965      ""        3,539,442     
## # … with 220 more rows, and 15 more variables: NewRecovered <chr>,
## #   ActiveCases <chr>, Serious,Critical <chr>, Tot\U00a0Cases/1M pop <chr>,
## #   Deaths/1M pop <chr>, TotalTests <chr>, Tests/1M pop <chr>,
## #   Population <chr>, Continent <chr>, 1 Caseevery X ppl <chr>,
## #   1 Deathevery X ppl <chr>, 1 Testevery X ppl <int>, New Cases/1M pop <dbl>,
## #   New Deaths/1M pop <dbl>, Active Cases/1M pop <chr>

DAFTAR PUSTAKA:

https://www.worldometers.info/coronavirus/

https://www.nurandi.id/blog/web-scraping-dengan-r-dan-rvest-parsing-tabel-html/