#install.packages("httr")
#install.packages("rvest")
library(httr)
library(rvest)
First, let’s write a function that uses an HTTP request to get a public COVID-19 Wiki page. Before you write the function, you can open this public page at the URL https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country in a web browser.
The goal of task 1 is to get the HTML page using an HTTP request (with the httr library).
#' Fetch the archived COVID-19 testing-by-country Wikipedia page.
#'
#' Sends a GET request to a web.archive.org snapshot of Wikipedia's
#' `index.php`, passing the template name as the `title` URL parameter.
#'
#' @return The httr response object for the request.
get_wiki_covid19_page <- function() {
  # The request URL has two parts, joined by a question mark:
  # 1) the base URL: an archived copy of https://en.wikipedia.org/w/index.php
  archive_base_url <- "http://web.archive.org/web/20221025155918/https://en.wikipedia.org/w/index.php"
  # 2) the URL parameter `title`, naming the wiki page to retrieve
  page_query <- list(title = "Template:COVID-19_testing_by_country")
  httr::GET(archive_base_url, query = page_query)
}
# Request the page and print a summary of the HTTP response
response <- get_wiki_covid19_page()
print(response)
## Response [http://web.archive.org/web/20221025155918/https://en.wikipedia.org/w/index.php?title=Template%3ACOVID-19_testing_by_country]
## Date: 2023-04-16 14:58
## Status: 200
## Content-Type: text/html; charset=UTF-8
## Size: 464 kB
## <!DOCTYPE html>
## <html class="client-nojs" lang="en" dir="ltr">
## <head><script type="text/javascript" src="/_static/js/bundle-playback.js?v=TP...
## <script type="text/javascript" src="/_static/js/wombat.js?v=txqj7nKC" charset...
## <script type="text/javascript">
## __wm.init("http://web.archive.org/web");
## __wm.wombat("https://en.wikipedia.org/w/index.php?title=Template:COVID-19_t...
## "1666713558");
## </script>
## <link rel="stylesheet" type="text/css" href="/_static/css/banner-styles.css?v...
## ...
# Now use the read_html function in rvest library to get the root html node from response
root_node <- rvest::read_html(response)
# Select a single <table> element from the root node; html_element returns only the first match
table_node <- rvest::html_element(root_node, "table")
# Read that table with the `html_table` function and convert it into a data frame using `as.data.frame`.
# NOTE(review): this relies on the first <table> on the page being the testing table
# (rather than picking a table by index from `html_nodes`) - confirm if the page layout changes.
covid_table <- rvest::html_table(table_node)
covid_df <- as.data.frame(covid_table)
# Render the first 10 rows of the raw table
knitr::kable(covid_df[1:10,])
Country or region | Date[a] | Tested | Units[b] | Confirmed(cases) | Confirmed /tested,% | Tested /population,% | Confirmed /population,% | Ref. |
---|---|---|---|---|---|---|---|---|
Afghanistan | 17 Dec 2020 | 154,767 | samples | 49,621 | 32.1 | 0.40 | 0.13 | [1] |
Albania | 18 Feb 2021 | 428,654 | samples | 96,838 | 22.6 | 15.0 | 3.4 | [2] |
Algeria | 2 Nov 2020 | 230,553 | samples | 58,574 | 25.4 | 0.53 | 0.13 | [3][4] |
Andorra | 23 Feb 2022 | 300,307 | samples | 37,958 | 12.6 | 387 | 49.0 | [5] |
Angola | 2 Feb 2021 | 399,228 | samples | 20,981 | 5.3 | 1.3 | 0.067 | [6] |
Antigua and Barbuda | 6 Mar 2021 | 15,268 | samples | 832 | 5.4 | 15.9 | 0.86 | [7] |
Argentina | 16 Apr 2022 | 35,716,069 | samples | 9,060,495 | 25.4 | 78.3 | 20.0 | [8] |
Armenia | 29 May 2022 | 3,099,602 | samples | 422,963 | 13.6 | 105 | 14.3 | [9] |
Australia | 9 Sep 2022 | 78,548,492 | samples | 10,112,229 | 12.9 | 313 | 40.3 | [10] |
Austria | 21 Oct 2022 | 199,625,374 | samples | 5,392,347 | 2.7 | 2,242 | 60.6 | [11] |
The goal of task 3 is to pre-process the data frame extracted in the previous step and export it as a CSV file.
#' Clean the scraped COVID-19 testing table.
#'
#' Keeps the leading rows, drops the `Units[b]` and `Ref.` columns, renames
#' the remaining seven columns and converts them to factor / numeric types.
#'
#' @param data_frame Data frame holding the scraped table. After the
#'   `Units[b]` and `Ref.` columns are removed it must have exactly seven
#'   columns, in the order country, date, tested, confirmed and the three
#'   ratio columns.
#' @param n_rows Maximum number of leading rows to keep. Defaults to 172,
#'   the value that used to be hard-coded; any rows after it are dropped.
#' @return A data frame with columns `country` and `date` (factors) and
#'   `tested`, `confirmed`, `confirmed.tested.ratio`,
#'   `tested.population.ratio`, `confirmed.population.ratio` (numeric).
preprocess_covid_data_frame <- function(data_frame, n_rows = 172L) {
  # Keep at most `n_rows` leading rows. Capping at nrow() avoids the all-NA
  # padding rows that indexing with 1:172 produces when the input is shorter.
  keep_rows <- seq_len(min(n_rows, nrow(data_frame)))
  data_frame <- data_frame[keep_rows, , drop = FALSE]

  # We don't need the Units and Ref columns, so they can be removed
  kept_cols <- setdiff(names(data_frame), c("Ref.", "Units[b]"))
  data_frame <- data_frame[, kept_cols, drop = FALSE]

  # Renaming the columns (positional, so the column count must match)
  new_names <- c(
    "country", "date", "tested", "confirmed",
    "confirmed.tested.ratio", "tested.population.ratio",
    "confirmed.population.ratio"
  )
  if (ncol(data_frame) != length(new_names)) {
    stop(
      "Expected ", length(new_names), " columns after dropping Units/Ref, got ",
      ncol(data_frame),
      call. = FALSE
    )
  }
  names(data_frame) <- new_names

  # Convert column data types
  data_frame$country <- as.factor(data_frame$country)
  data_frame$date <- as.factor(data_frame$date)

  # The figures use "," as a thousands separator; strip it before parsing
  numeric_cols <- setdiff(new_names, c("country", "date"))
  data_frame[numeric_cols] <- lapply(
    data_frame[numeric_cols],
    function(values) as.numeric(gsub(",", "", values))
  )

  data_frame
}
# Clean the scraped table and render the first 10 cleaned rows
df <- preprocess_covid_data_frame(covid_df)
knitr::kable(df[1:10,])
country | date | tested | confirmed | confirmed.tested.ratio | tested.population.ratio | confirmed.population.ratio |
---|---|---|---|---|---|---|
Afghanistan | 17 Dec 2020 | 154767 | 49621 | 32.1 | 0.40 | 0.130 |
Albania | 18 Feb 2021 | 428654 | 96838 | 22.6 | 15.00 | 3.400 |
Algeria | 2 Nov 2020 | 230553 | 58574 | 25.4 | 0.53 | 0.130 |
Andorra | 23 Feb 2022 | 300307 | 37958 | 12.6 | 387.00 | 49.000 |
Angola | 2 Feb 2021 | 399228 | 20981 | 5.3 | 1.30 | 0.067 |
Antigua and Barbuda | 6 Mar 2021 | 15268 | 832 | 5.4 | 15.90 | 0.860 |
Argentina | 16 Apr 2022 | 35716069 | 9060495 | 25.4 | 78.30 | 20.000 |
Armenia | 29 May 2022 | 3099602 | 422963 | 13.6 | 105.00 | 14.300 |
Australia | 9 Sep 2022 | 78548492 | 10112229 | 12.9 | 313.00 | 40.300 |
Austria | 21 Oct 2022 | 199625374 | 5392347 | 2.7 | 2242.00 | 60.600 |
# Export the data frame to a csv file
# (row names are written as well, so the file gains a leading `X` column when read back)
write.csv(df, "covid_data.csv")
# Read covid_data_frame_csv from the csv file
data <- read.csv("covid_data.csv")
# Get the 5th to 10th rows, keeping only the "country" and "confirmed" columns
subset <- data[5:10, c("country", "confirmed")]
subset
## country confirmed
## 5 Angola 20981
## 6 Antigua and Barbuda 832
## 7 Argentina 9060495
## 8 Armenia 422963
## 9 Australia 10112229
## 10 Austria 5392347
The goal of task 5 is to get the total confirmed and tested cases worldwide, and to work out the overall positive ratio as confirmed cases / tested cases.
# Worldwide confirmed cases: add up the `confirmed` column
total_conf <- sum(data[["confirmed"]])
total_conf
## [1] 424336298
# Worldwide tested cases: add up the `tested` column
total_test <- sum(data[["tested"]])
total_test
## [1] 5341818173
# Overall positive ratio: confirmed total divided by tested total
conf.test.ratio <- total_conf / total_test
conf.test.ratio
## [1] 0.07943668
The goal of task 6 is to get a catalog, or sorted list, of the countries that have reported their COVID-19 testing data.
# Get the `country` column
countries <- data$country
# Check its class. It is character here rather than factor, because read.csv
# did not convert the strings to factors (its default since R 4.0).
class(countries)
## [1] "character"
# Make sure the country column is character so that it sorts as plain text.
# The converted vector is assigned back (not discarded) and then displayed.
countries <- as.character(countries)
countries
## [1] "Afghanistan" "Albania" "Algeria"
## [4] "Andorra" "Angola" "Antigua and Barbuda"
## [7] "Argentina" "Armenia" "Australia"
## [10] "Austria" "Azerbaijan" "Bahamas"
## [13] "Bahrain" "Bangladesh" "Barbados"
## [16] "Belarus" "Belgium" "Belize"
## [19] "Benin" "Bhutan" "Bolivia"
## [22] "Bosnia and Herzegovina" "Botswana" "Brazil"
## [25] "Brunei" "Bulgaria" "Burkina Faso"
## [28] "Burundi" "Cambodia" "Cameroon"
## [31] "Canada" "Chad" "Chile"
## [34] "China[c]" "Colombia" "Costa Rica"
## [37] "Croatia" "Cuba" "Cyprus[d]"
## [40] "Czechia" "Denmark[e]" "Djibouti"
## [43] "Dominica" "Dominican Republic" "DR Congo"
## [46] "Ecuador" "Egypt" "El Salvador"
## [49] "Equatorial Guinea" "Estonia" "Eswatini"
## [52] "Ethiopia" "Faroe Islands" "Fiji"
## [55] "Finland" "France[f][g]" "Gabon"
## [58] "Gambia" "Georgia[h]" "Germany"
## [61] "Ghana" "Greece" "Greenland"
## [64] "Grenada" "Guatemala" "Guinea"
## [67] "Guinea-Bissau" "Guyana" "Haiti"
## [70] "Honduras" "Hungary" "Iceland"
## [73] "India" "Indonesia" "Iran"
## [76] "Iraq" "Ireland" "Israel"
## [79] "Italy" "Ivory Coast" "Jamaica"
## [82] "Japan" "Jordan" "Kazakhstan"
## [85] "Kenya" "Kosovo" "Kuwait"
## [88] "Kyrgyzstan" "Laos" "Latvia"
## [91] "Lebanon" "Lesotho" "Liberia"
## [94] "Libya" "Lithuania" "Luxembourg[i]"
## [97] "Madagascar" "Malawi" "Malaysia"
## [100] "Maldives" "Mali" "Malta"
## [103] "Mauritania" "Mauritius" "Mexico"
## [106] "Moldova[j]" "Mongolia" "Montenegro"
## [109] "Morocco" "Mozambique" "Myanmar"
## [112] "Namibia" "Nepal" "Netherlands"
## [115] "New Caledonia" "New Zealand" "Niger"
## [118] "Nigeria" "North Korea" "North Macedonia"
## [121] "Northern Cyprus[k]" "Norway" "Oman"
## [124] "Pakistan" "Palestine" "Panama"
## [127] "Papua New Guinea" "Paraguay" "Peru"
## [130] "Philippines" "Poland" "Portugal"
## [133] "Qatar" "Romania" "Russia"
## [136] "Rwanda" "Saint Kitts and Nevis" "Saint Lucia"
## [139] "Saint Vincent" "San Marino" "Saudi Arabia"
## [142] "Senegal" "Serbia" "Singapore"
## [145] "Slovakia" "Slovenia" "South Africa"
## [148] "South Korea" "South Sudan" "Spain"
## [151] "Sri Lanka" "Sudan" "Sweden"
## [154] "Switzerland[l]" "Taiwan[m]" "Tanzania"
## [157] "Thailand" "Togo" "Trinidad and Tobago"
## [160] "Tunisia" "Turkey" "Uganda"
## [163] "Ukraine" "United Arab Emirates" "United Kingdom"
## [166] "United States" "Uruguay" "Uzbekistan"
## [169] "Venezuela" "Vietnam" "Zambia"
## [172] "Zimbabwe"
# Ascending (A to Z) ordering of the country names
countries.sorted <- sort(countries, decreasing = FALSE)
# Descending (Z to A) ordering of the country names
countries.sorted.desc <- sort(countries, decreasing = TRUE)
# Show the Z-to-A list
print(countries.sorted.desc)
## [1] "Zimbabwe" "Zambia" "Vietnam"
## [4] "Venezuela" "Uzbekistan" "Uruguay"
## [7] "United States" "United Kingdom" "United Arab Emirates"
## [10] "Ukraine" "Uganda" "Turkey"
## [13] "Tunisia" "Trinidad and Tobago" "Togo"
## [16] "Thailand" "Tanzania" "Taiwan[m]"
## [19] "Switzerland[l]" "Sweden" "Sudan"
## [22] "Sri Lanka" "Spain" "South Sudan"
## [25] "South Korea" "South Africa" "Slovenia"
## [28] "Slovakia" "Singapore" "Serbia"
## [31] "Senegal" "Saudi Arabia" "San Marino"
## [34] "Saint Vincent" "Saint Lucia" "Saint Kitts and Nevis"
## [37] "Rwanda" "Russia" "Romania"
## [40] "Qatar" "Portugal" "Poland"
## [43] "Philippines" "Peru" "Paraguay"
## [46] "Papua New Guinea" "Panama" "Palestine"
## [49] "Pakistan" "Oman" "Norway"
## [52] "Northern Cyprus[k]" "North Macedonia" "North Korea"
## [55] "Nigeria" "Niger" "New Zealand"
## [58] "New Caledonia" "Netherlands" "Nepal"
## [61] "Namibia" "Myanmar" "Mozambique"
## [64] "Morocco" "Montenegro" "Mongolia"
## [67] "Moldova[j]" "Mexico" "Mauritius"
## [70] "Mauritania" "Malta" "Mali"
## [73] "Maldives" "Malaysia" "Malawi"
## [76] "Madagascar" "Luxembourg[i]" "Lithuania"
## [79] "Libya" "Liberia" "Lesotho"
## [82] "Lebanon" "Latvia" "Laos"
## [85] "Kyrgyzstan" "Kuwait" "Kosovo"
## [88] "Kenya" "Kazakhstan" "Jordan"
## [91] "Japan" "Jamaica" "Ivory Coast"
## [94] "Italy" "Israel" "Ireland"
## [97] "Iraq" "Iran" "Indonesia"
## [100] "India" "Iceland" "Hungary"
## [103] "Honduras" "Haiti" "Guyana"
## [106] "Guinea-Bissau" "Guinea" "Guatemala"
## [109] "Grenada" "Greenland" "Greece"
## [112] "Ghana" "Germany" "Georgia[h]"
## [115] "Gambia" "Gabon" "France[f][g]"
## [118] "Finland" "Fiji" "Faroe Islands"
## [121] "Ethiopia" "Eswatini" "Estonia"
## [124] "Equatorial Guinea" "El Salvador" "Egypt"
## [127] "Ecuador" "DR Congo" "Dominican Republic"
## [130] "Dominica" "Djibouti" "Denmark[e]"
## [133] "Czechia" "Cyprus[d]" "Cuba"
## [136] "Croatia" "Costa Rica" "Colombia"
## [139] "China[c]" "Chile" "Chad"
## [142] "Canada" "Cameroon" "Cambodia"
## [145] "Burundi" "Burkina Faso" "Bulgaria"
## [148] "Brunei" "Brazil" "Botswana"
## [151] "Bosnia and Herzegovina" "Bolivia" "Bhutan"
## [154] "Benin" "Belize" "Belgium"
## [157] "Belarus" "Barbados" "Bangladesh"
## [160] "Bahrain" "Bahamas" "Azerbaijan"
## [163] "Austria" "Australia" "Armenia"
## [166] "Argentina" "Antigua and Barbuda" "Angola"
## [169] "Andorra" "Algeria" "Albania"
## [172] "Afghanistan"
The goal of task 7 is to use a regular expression to find any countries whose names start with United.
# Use the regular expression `^United.+` to find matches. The leading `^`
# anchors the pattern so only names that START with "United" are matched;
# without it, "United" appearing later in a name would match as well.
matched_indeces <- grep("^United.+", data$country)
matched_indeces
## [1] 164 165 166
# Print the matched country names, one per line
for (i in matched_indeces) {
  print(data$country[i])
}
## [1] "United Arab Emirates"
## [1] "United Kingdom"
## [1] "United States"
The goal of task 8 is to compare the COVID-19 test data between two countries. You will need to select two rows from the data frame, and select the country, confirmed, and confirmed-population-ratio columns.
# Select the row for the first country to compare (expected to be a single row); every column is kept
subset1 <- data[data$country == "Ukraine",]
# Select the row for the second country to compare (expected to be a single row); every column is kept
subset2 <- data[data$country == "United Kingdom",]
The goal of task 9 is to find out which of the two countries you selected has the larger ratio of confirmed cases to population, which may indicate that the country has a higher COVID-19 infection risk.
# Use an if-else statement to report which of the two selected countries has
# the larger confirmed-cases-to-population ratio. Both sides of the comparison
# must read the same column, `confirmed.population.ratio`.
# NOTE(review): the echoed output after this chunk was rendered from an earlier
# comparison of two different columns; re-run the chunk to refresh it.
# Each selection must be exactly one row, otherwise the `if` condition is not a single value.
stopifnot(nrow(subset1) == 1L, nrow(subset2) == 1L)
if (subset1$confirmed.population.ratio > subset2$confirmed.population.ratio) {
  print(paste0(subset1$country, " has higher COVID-19 infection risk."))
} else {
  print(paste0(subset2$country, " has higher COVID-19 infection risk."))
}
## [1] "Ukraine has higher COVID-19 infection risk."
The goal of task 10 is to find out which countries have a confirmed-to-population ratio of less than 1%, which may indicate that the risk in those countries is relatively low.
# Names of the countries whose `confirmed.population.ratio` is below the 1% threshold
subset3 <- data$country[data$confirmed.population.ratio < 1]
subset3
## [1] "Afghanistan" "Algeria" "Angola"
## [4] "Antigua and Barbuda" "Bangladesh" "Benin"
## [7] "Brunei" "Burkina Faso" "Burundi"
## [10] "Cambodia" "Cameroon" "Chad"
## [13] "China[c]" "DR Congo" "Egypt"
## [16] "Ethiopia" "Gabon" "Gambia"
## [19] "Ghana" "Grenada" "Guinea"
## [22] "Guinea-Bissau" "Haiti" "Ivory Coast"
## [25] "Japan" "Kenya" "Laos"
## [28] "Liberia" "Madagascar" "Malawi"
## [31] "Mali" "Mauritania" "Mauritius"
## [34] "Mozambique" "Myanmar" "New Caledonia"
## [37] "Niger" "Nigeria" "North Korea"
## [40] "Pakistan" "Papua New Guinea" "Rwanda"
## [43] "Senegal" "South Korea" "South Sudan"
## [46] "Sri Lanka" "Sudan" "Tanzania"
## [49] "Thailand" "Togo" "Uganda"
## [52] "Uzbekistan" "Venezuela"