Di dalam package rvest terdapat fungsi html_table yang berguna untuk parsing tabel HTML. Fungsi ini mampu mengubah data berupa tabel di sebuah website HTML menjadi sebuah data frame. Berikut ini adalah scraping tabel virus corona di berbagai negara di dunia pada web target https://www.worldometers.info/coronavirus/
loading library
library(rvest)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
membaca file HTML dari halaman website
# Address of the Worldometers coronavirus page to scrape.
url <- "https://www.worldometers.info/coronavirus/"
# Read the page at that address into a parsed HTML document.
html <- read_html(url)
membuat data frame
# Locate the container that holds the per-country statistics table.
# html_element() is the current rvest (>= 1.0.0) name for the superseded
# html_node().
table_node <- html %>%
  html_element(".main_table_countries_div")

# Stop with a clear message if the selector matches nothing (for example
# after a change in the page layout) rather than continuing with a missing
# node.
if (inherits(table_node, "xml_missing")) {
  stop(
    "No element matches '.main_table_countries_div' on the page.",
    call. = FALSE
  )
}

# Parse the HTML table inside that container into a data frame (tibble).
corona_table <- table_node %>%
  html_table()
menampilkan strukturnya dan merapikan
# Show the structure of the scraped table: its dimensions plus the name and
# type of every column.
str(corona_table)
## tibble [238 × 22] (S3: tbl_df/tbl/data.frame)
## $ # : int [1:238] NA NA NA NA NA NA NA NA 1 2 ...
## $ Country,Other : chr [1:238] "North America" "Asia" "South America" "Europe" ...
## $ TotalCases : chr [1:238] "40,229,771" "53,913,017" "30,921,662" "47,283,948" ...
## $ NewCases : chr [1:238] "+7,586" "+113,020" "+2,571" "+22,757" ...
## $ TotalDeaths : chr [1:238] "909,007" "753,648" "951,683" "1,088,919" ...
## $ NewDeaths : chr [1:238] "+330" "+1,045" "+124" "+776" ...
## $ TotalRecovered : chr [1:238] "33,494,556" "51,134,915" "28,028,123" "44,717,303" ...
## $ NewRecovered : chr [1:238] "+6,296" "+137,816" "+2,456" "+37,659" ...
## $ ActiveCases : chr [1:238] "5,826,208" "2,024,454" "1,941,856" "1,477,726" ...
## $ Serious,Critical : chr [1:238] "11,735" "26,928" "32,194" "9,644" ...
## $ Tot Cases/1M pop : chr [1:238] "" "" "" "" ...
## $ Deaths/1M pop : chr [1:238] "" "" "" "" ...
## $ TotalTests : chr [1:238] "" "" "" "" ...
## $ Tests/1M pop : chr [1:238] "" "" "" "" ...
## $ Population : chr [1:238] "" "" "" "" ...
## $ Continent : chr [1:238] "North America" "Asia" "South America" "Europe" ...
## $ 1 Caseevery X ppl : chr [1:238] "" "" "" "" ...
## $ 1 Deathevery X ppl : chr [1:238] "" "" "" "" ...
## $ 1 Testevery X ppl : int [1:238] NA NA NA NA NA NA NA NA 1 4 ...
## $ New Cases/1M pop : num [1:238] NA NA NA NA NA NA NA NA 3 37 ...
## $ New Deaths/1M pop : num [1:238] NA NA NA NA NA NA NA NA 0.2 NA ...
## $ Active Cases/1M pop: chr [1:238] "" "" "" "" ...
# Auto-print the table to preview its first rows before any clean-up.
corona_table
## # A tibble: 238 x 22
## `#` `Country,Other` TotalCases NewCases TotalDeaths NewDeaths
## <int> <chr> <chr> <chr> <chr> <chr>
## 1 NA "North America" 40,229,771 "+7,586" 909,007 "+330"
## 2 NA "Asia" 53,913,017 "+113,020" 753,648 "+1,045"
## 3 NA "South America" 30,921,662 "+2,571" 951,683 "+124"
## 4 NA "Europe" 47,283,948 "+22,757" 1,088,919 "+776"
## 5 NA "Africa" 5,126,718 "+742" 135,876 "+4"
## 6 NA "Oceania" 70,782 "+136" 1,256 ""
## 7 NA "" 721 "" 15 ""
## 8 NA "World" 177,546,619 "+146,812" 3,840,404 "+2,279"
## 9 1 "USA" 34,353,097 "+912" 615,770 "+53"
## 10 2 "India" 29,684,104 "+51,843" 379,601 ""
## # … with 228 more rows, and 16 more variables: TotalRecovered <chr>,
## # NewRecovered <chr>, ActiveCases <chr>, Serious,Critical <chr>,
## # Tot\U00a0Cases/1M pop <chr>, Deaths/1M pop <chr>, TotalTests <chr>,
## # Tests/1M pop <chr>, Population <chr>, Continent <chr>,
## # 1 Caseevery X ppl <chr>, 1 Deathevery X ppl <chr>, 1 Testevery X ppl <int>,
## # New Cases/1M pop <dbl>, New Deaths/1M pop <dbl>, Active Cases/1M pop <chr>
# Remove the first column (the `#` row-number column) and preview the result.
corona_table[[1]] <- NULL
corona_table
## # A tibble: 238 x 21
## `Country,Other` TotalCases NewCases TotalDeaths NewDeaths TotalRecovered
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 "North America" 40,229,771 "+7,586" 909,007 "+330" 33,494,556
## 2 "Asia" 53,913,017 "+113,020" 753,648 "+1,045" 51,134,915
## 3 "South America" 30,921,662 "+2,571" 951,683 "+124" 28,028,123
## 4 "Europe" 47,283,948 "+22,757" 1,088,919 "+776" 44,717,303
## 5 "Africa" 5,126,718 "+742" 135,876 "+4" 4,573,060
## 6 "Oceania" 70,782 "+136" 1,256 "" 67,669
## 7 "" 721 "" 15 "" 706
## 8 "World" 177,546,619 "+146,812" 3,840,404 "+2,279" 162,016,332
## 9 "USA" 34,353,097 "+912" 615,770 "+53" 28,579,129
## 10 "India" 29,684,104 "+51,843" 379,601 "" 28,454,938
## # … with 228 more rows, and 15 more variables: NewRecovered <chr>,
## # ActiveCases <chr>, Serious,Critical <chr>, Tot\U00a0Cases/1M pop <chr>,
## # Deaths/1M pop <chr>, TotalTests <chr>, Tests/1M pop <chr>,
## # Population <chr>, Continent <chr>, 1 Caseevery X ppl <chr>,
## # 1 Deathevery X ppl <chr>, 1 Testevery X ppl <int>, New Cases/1M pop <dbl>,
## # New Deaths/1M pop <dbl>, Active Cases/1M pop <chr>
# Discard the first eight rows, which in the printed preview are the
# continent aggregates, one unnamed row, and the "World" total, so that the
# table starts at the first country.
corona_table <- corona_table[-seq_len(8L), ]
mencetak data frame
# Print the table after the column and row clean-up above.
print(corona_table)
## # A tibble: 230 x 21
## `Country,Other` TotalCases NewCases TotalDeaths NewDeaths TotalRecovered
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 USA 34,353,097 "+912" 615,770 "+53" 28,579,129
## 2 India 29,684,104 "+51,843" 379,601 "" 28,454,938
## 3 Brazil 17,543,853 "" 491,164 "" 15,944,646
## 4 France 5,744,589 "" 110,530 "" 5,511,166
## 5 Turkey 5,342,028 "" 48,879 "" 5,211,022
## 6 Russia 5,249,990 "+13,397" 127,576 "+396" 4,828,500
## 7 UK 4,581,006 "" 127,917 "" 4,292,182
## 8 Italy 4,248,432 "+1,400" 127,153 "+52" 4,019,424
## 9 Argentina 4,172,742 "" 86,615 "" 3,771,968
## 10 Colombia 3,802,052 "" 96,965 "" 3,539,442
## # … with 220 more rows, and 15 more variables: NewRecovered <chr>,
## # ActiveCases <chr>, Serious,Critical <chr>, Tot\U00a0Cases/1M pop <chr>,
## # Deaths/1M pop <chr>, TotalTests <chr>, Tests/1M pop <chr>,
## # Population <chr>, Continent <chr>, 1 Caseevery X ppl <chr>,
## # 1 Deathevery X ppl <chr>, 1 Testevery X ppl <int>, New Cases/1M pop <dbl>,
## # New Deaths/1M pop <dbl>, Active Cases/1M pop <chr>
DAFTAR PUSTAKA:
https://www.worldometers.info/coronavirus/
https://www.nurandi.id/blog/web-scraping-dengan-r-dan-rvest-parsing-tabel-html/