# Load libraries and define the source URL
library(tidyverse)
library(rvest)
# Wikipedia page for the COVID-19 pandemic by country and territory.
url <- "https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory"
# Download and parse the page into an HTML document object.
html_data <- read_html(url)
# Extract data from website ----
# Extract the table from the parsed page.
# html_node() selects a single node (html_nodes() would select all matches);
# a leading "#" in the selector matches an element by its id (found by
# inspecting the page elements).
# The node is converted to a table and then to a tibble; .name_repair =
# "unique" works around the column name issues in the scraped header.
df_raw <- as_tibble(
  html_table(html_node(html_data, "#table65150380")),
  .name_repair = "unique"
)
## New names:
## * Country -> Country...1
## * Country -> Country...2
## * `` -> ...6
## * `` -> ...7
## * `` -> ...8
# Clean the data ----
# Tidy the scraped table into df_clean and preview the first rows.
df_clean <- df_raw %>%
  # Keep the second column (renamed to Country) plus columns 3 to 5.
  select(Country = 2, 3:5) %>%
  # The final row is just footer info, so drop it. Negative indices remove
  # rows; to remove the last two rows use slice(-((n() - 1):n())).
  slice(-n()) %>%
  mutate(
    # Remove everything from the first square bracket onward,
    # e.g. "World[a]" becomes "World".
    Country = str_remove_all(Country, "\\[.+"),
    # The numbers are read as character; strip the commas, then convert to
    # integer. across() replaces the superseded mutate_at().
    across(
      all_of(c("Deaths/million", "Deaths", "Cases")),
      function(x) as.integer(str_remove_all(x, ","))
    )
  )
head(df_clean)
## # A tibble: 6 x 4
## Country `Deaths/million` Deaths Cases
## <chr> <int> <int> <int>
## 1 World 633 4989532 246025709
## 2 Peru 6001 200197 2199876
## 3 Bosnia and Herzegovina 3522 11495 252758
## 4 Bulgaria 3461 23872 598199
## 5 North Macedonia 3412 7108 201708
## 6 Montenegro 3335 2095 143322