Web scraping Wikipedia

# Load libraries and define the URL to scrape

library(tidyverse)
library(rvest)

# Wikipedia page holding the COVID-19 statistics by country and territory
url <- "https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory"

# Fetch the page and parse it into a document that rvest can query
html_data <- url %>% read_html()

Extract data from website

# Pull the statistics table out of the parsed page.
# - html_node() returns a single match (html_nodes() would return all of them).
# - The selector was found with the browser's "inspect element"; a leading "#"
#   selects by element ID.
# - html_table() converts the node to a data frame, and as_tibble() tidies it.
# - .name_repair = "unique" works around blank / duplicated column headers.
df_raw <- as_tibble(
  html_table(html_node(html_data, "#table65150380")),
  .name_repair = "unique"
)

## New names:
## * Country -> Country...1
## * Country -> Country...2
## * `` -> ...6
## * `` -> ...7
## * `` -> ...8

Clean the data

# The final row of the scraped table is footer text rather than a country, so
# it is dropped. slice() keeps rows for positive indices and drops them for
# negative ones; to drop the last two rows use slice(-((n() - 1):n())).
df_clean <- df_raw %>%
  # keep the second column (renamed to Country) plus columns 3 to 5
  select(Country = 2, 3:5) %>%
  slice(-n()) %>%
  # strip bracketed footnote markers, e.g. "World[a]" -> "World"; only the
  # bracketed text itself is removed, so anything after a marker is kept
  mutate(Country = str_remove_all(Country, "\\[[^\\]]*\\]")) %>%
  # the counts arrive as text with thousands separators: drop the commas, then
  # convert to integer (across() replaces the superseded mutate_at())
  mutate(across(
    all_of(c("Deaths/million", "Deaths", "Cases")),
    function(x) as.integer(str_remove_all(x, ","))
  ))

head(df_clean)

## # A tibble: 6 x 4
##   Country                `Deaths/million`  Deaths     Cases
##   <chr>                             <int>   <int>     <int>
## 1 World                               633 4989532 246025709
## 2 Peru                               6001  200197   2199876
## 3 Bosnia and Herzegovina             3522   11495    252758
## 4 Bulgaria                           3461   23872    598199
## 5 North Macedonia                    3412    7108    201708
## 6 Montenegro                         3335    2095    143322

Web scraping Wikipedia

R Glover

31/10/2021