Load required packages

#install.packages("httr")
#install.packages("rvest")
library(httr)
library(rvest)

Task 1: Get a COVID-19 pandemic Wiki page using HTTP request

First, let’s write a function that uses an HTTP request to get a public COVID-19 Wiki page. Before you write the function, you can open this public page at https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country in a web browser.

The goal of task 1 is to get the html page using HTTP request (httr library)

get_wiki_covid19_page <- function() {
  # Fetch the "COVID-19 testing by country" wiki template page over HTTP.
  #
  # The live page lives at
  #   https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country
  # i.e. a base URL (.../w/index.php) plus a `title` query parameter after `?`.
  # The request below is sent to a web.archive.org snapshot of that base URL
  # (timestamp 20221025155918 in the path), not to the live wiki.
  #
  # Returns: the response object produced by httr::GET().

  # Query string: a named list whose `title` element selects the wiki page.
  query_params <- list(title = "Template:COVID-19_testing_by_country")

  # Archived copy of the wiki's index.php endpoint.
  snapshot_url <- "http://web.archive.org/web/20221025155918/https://en.wikipedia.org/w/index.php"

  httr::GET(url = snapshot_url, query = query_params)
}

# Fetch the page once and print the response summary (URL, date, status,
# content type, size and the beginning of the body) as a quick sanity check.
response <- get_wiki_covid19_page()
print(response)
## Response [http://web.archive.org/web/20221025155918/https://en.wikipedia.org/w/index.php?title=Template%3ACOVID-19_testing_by_country]
##   Date: 2023-04-16 14:58
##   Status: 200
##   Content-Type: text/html; charset=UTF-8
##   Size: 464 kB
## <!DOCTYPE html>
## <html class="client-nojs" lang="en" dir="ltr">
## <head><script type="text/javascript" src="/_static/js/bundle-playback.js?v=TP...
## <script type="text/javascript" src="/_static/js/wombat.js?v=txqj7nKC" charset...
## <script type="text/javascript">
##   __wm.init("http://web.archive.org/web");
##   __wm.wombat("https://en.wikipedia.org/w/index.php?title=Template:COVID-19_t...
##        "1666713558");
## </script>
## <link rel="stylesheet" type="text/css" href="/_static/css/banner-styles.css?v...
## ...

Task 2: Extract COVID-19 testing data table from the wiki HTML page

# Parse the HTTP response into an HTML document (the root node) with rvest
root_node <- rvest::read_html(response)

# Select a single <table> node from the document. Note that html_element()
# yields one node (the first match), unlike html_nodes()/html_elements(),
# which return every matching table.
table_node <- rvest::html_element(root_node, "table")

# Convert the selected table with `html_table`, then turn the result into a
# plain data frame using `as.data.frame`. No indexing such as table_node[2]
# is applied here because `table_node` already holds exactly one table.
covid_table <- rvest::html_table(table_node)
covid_df <- as.data.frame(covid_table)
# Preview the first ten rows as a formatted table
knitr::kable(covid_df[1:10,])
Country or region Date[a] Tested Units[b] Confirmed(cases) Confirmed /tested,% Tested /population,% Confirmed /population,% Ref.
Afghanistan 17 Dec 2020 154,767 samples 49,621 32.1 0.40 0.13 [1]
Albania 18 Feb 2021 428,654 samples 96,838 22.6 15.0 3.4 [2]
Algeria 2 Nov 2020 230,553 samples 58,574 25.4 0.53 0.13 [3][4]
Andorra 23 Feb 2022 300,307 samples 37,958 12.6 387 49.0 [5]
Angola 2 Feb 2021 399,228 samples 20,981 5.3 1.3 0.067 [6]
Antigua and Barbuda 6 Mar 2021 15,268 samples 832 5.4 15.9 0.86 [7]
Argentina 16 Apr 2022 35,716,069 samples 9,060,495 25.4 78.3 20.0 [8]
Armenia 29 May 2022 3,099,602 samples 422,963 13.6 105 14.3 [9]
Australia 9 Sep 2022 78,548,492 samples 10,112,229 12.9 313 40.3 [10]
Austria 21 Oct 2022 199,625,374 samples 5,392,347 2.7 2,242 60.6 [11]

Task 3: Pre-process and export the extracted data frame

The goal of task 3 is to pre-process the extracted data frame from the previous step, and export it as a csv file

preprocess_covid_data_frame <- function(data_frame, max_rows = 172L) {
  # Clean the raw COVID-19 testing table extracted from the wiki page.
  #
  # Args:
  #   data_frame: raw table as a data frame. After the "Ref." and "Units[b]"
  #               columns are dropped it must have exactly seven columns, in
  #               the order country, date, tested, confirmed and the three
  #               ratio columns.
  #   max_rows:   number of leading rows to keep; anything after them is
  #               discarded. Defaults to 172, the row count this script was
  #               originally hard-coded to keep.
  #
  # Returns: a data frame with columns country and date (factors) and
  #   tested, confirmed, confirmed.tested.ratio, tested.population.ratio,
  #   confirmed.population.ratio (numeric, thousands separators removed).
  #
  # Raises: an error if the input is not a data frame, `max_rows` is not a
  #   single non-negative number, or the column count is not as expected.
  stopifnot(
    is.data.frame(data_frame),
    is.numeric(max_rows), length(max_rows) == 1L, max_rows >= 0
  )

  # Keep only the leading rows. The count is clamped to nrow() because
  # indexing past the end of a data frame would append all-NA rows.
  keep_rows <- seq_len(min(max_rows, nrow(data_frame)))
  data_frame <- data_frame[keep_rows, , drop = FALSE]

  # We don't need the Units and Ref columns, so they are removed
  unwanted <- c("Ref.", "Units[b]")
  data_frame <- data_frame[, !(names(data_frame) %in% unwanted), drop = FALSE]

  # Renaming the columns (positional, so the column count must match)
  new_names <- c(
    "country", "date", "tested", "confirmed",
    "confirmed.tested.ratio", "tested.population.ratio",
    "confirmed.population.ratio"
  )
  if (ncol(data_frame) != length(new_names)) {
    stop(
      "Expected ", length(new_names), " columns after dropping ",
      paste(unwanted, collapse = " and "), ", got ", ncol(data_frame), ".",
      call. = FALSE
    )
  }
  names(data_frame) <- new_names

  # Convert column data types
  data_frame$country <- as.factor(data_frame$country)
  data_frame$date <- as.factor(data_frame$date)

  # Every remaining column is numeric once the "," thousands separators
  # (e.g. "2,242") are stripped.
  for (col in setdiff(new_names, c("country", "date"))) {
    data_frame[[col]] <- as.numeric(
      gsub(",", "", data_frame[[col]], fixed = TRUE)
    )
  }

  data_frame
}

# Clean the scraped table and preview the first ten rows of the result
df <- preprocess_covid_data_frame(covid_df)
knitr::kable(df[1:10,])
country date tested confirmed confirmed.tested.ratio tested.population.ratio confirmed.population.ratio
Afghanistan 17 Dec 2020 154767 49621 32.1 0.40 0.130
Albania 18 Feb 2021 428654 96838 22.6 15.00 3.400
Algeria 2 Nov 2020 230553 58574 25.4 0.53 0.130
Andorra 23 Feb 2022 300307 37958 12.6 387.00 49.000
Angola 2 Feb 2021 399228 20981 5.3 1.30 0.067
Antigua and Barbuda 6 Mar 2021 15268 832 5.4 15.90 0.860
Argentina 16 Apr 2022 35716069 9060495 25.4 78.30 20.000
Armenia 29 May 2022 3099602 422963 13.6 105.00 14.300
Australia 9 Sep 2022 78548492 10112229 12.9 313.00 40.300
Austria 21 Oct 2022 199625374 5392347 2.7 2242.00 60.600
# Export the data frame to a csv file in the current working directory.
# NOTE(review): write.csv() also writes the row names by default, which is
# why the file shows an extra leading `X` column when it is read back with
# read.csv(); pass row.names = FALSE if that column is not wanted.
write.csv(df, "covid_data.csv")

Task 4: Get a subset of the extracted data frame

# Read the exported data back from the csv file
data <- read.csv("covid_data.csv")

# Get the 5th to 10th rows, keeping only the "country" and "confirmed"
# columns (the task asks for just these two, so they are selected by name)
subset <- data[5:10, c("country", "confirmed")]
subset
##                country confirmed
## 5               Angola     20981
## 6  Antigua and Barbuda       832
## 7            Argentina   9060495
## 8              Armenia    422963
## 9            Australia  10112229
## 10             Austria   5392347

Task 5: Calculate worldwide COVID testing positive ratio

The goal of task 5 is to get the total confirmed and tested cases worldwide, and to work out the overall positive ratio as confirmed cases / tested cases

# Worldwide confirmed cases: add up the `confirmed` column
total_conf <- sum(data[["confirmed"]])
total_conf
## [1] 424336298
# Worldwide tested cases: add up the `tested` column
total_test <- sum(data[["tested"]])
total_test
## [1] 5341818173
# Overall positive ratio = total confirmed / total tested
conf.test.ratio <- total_conf / total_test
conf.test.ratio
## [1] 0.07943668

Task 6: Get a sorted name list of countries that reported their testing data

The goal of task 6 is to get a catalog or sorted list of countries who have reported their COVID-19 testing data

# Get the `country` column
countries <- data$country

# Check its class. read.csv() does not convert strings to factors by default
# since R 4.0, so this is "character" here; it would be "factor" only with
# stringsAsFactors = TRUE (the default in older R versions).
class(countries)
## [1] "character"
# Make sure the country column is a character vector so it sorts as plain
# text. The result is assigned back (a bare as.character() call would be
# discarded) and then printed.
countries <- as.character(countries)
countries
##   [1] "Afghanistan"            "Albania"                "Algeria"               
##   [4] "Andorra"                "Angola"                 "Antigua and Barbuda"   
##   [7] "Argentina"              "Armenia"                "Australia"             
##  [10] "Austria"                "Azerbaijan"             "Bahamas"               
##  [13] "Bahrain"                "Bangladesh"             "Barbados"              
##  [16] "Belarus"                "Belgium"                "Belize"                
##  [19] "Benin"                  "Bhutan"                 "Bolivia"               
##  [22] "Bosnia and Herzegovina" "Botswana"               "Brazil"                
##  [25] "Brunei"                 "Bulgaria"               "Burkina Faso"          
##  [28] "Burundi"                "Cambodia"               "Cameroon"              
##  [31] "Canada"                 "Chad"                   "Chile"                 
##  [34] "China[c]"               "Colombia"               "Costa Rica"            
##  [37] "Croatia"                "Cuba"                   "Cyprus[d]"             
##  [40] "Czechia"                "Denmark[e]"             "Djibouti"              
##  [43] "Dominica"               "Dominican Republic"     "DR Congo"              
##  [46] "Ecuador"                "Egypt"                  "El Salvador"           
##  [49] "Equatorial Guinea"      "Estonia"                "Eswatini"              
##  [52] "Ethiopia"               "Faroe Islands"          "Fiji"                  
##  [55] "Finland"                "France[f][g]"           "Gabon"                 
##  [58] "Gambia"                 "Georgia[h]"             "Germany"               
##  [61] "Ghana"                  "Greece"                 "Greenland"             
##  [64] "Grenada"                "Guatemala"              "Guinea"                
##  [67] "Guinea-Bissau"          "Guyana"                 "Haiti"                 
##  [70] "Honduras"               "Hungary"                "Iceland"               
##  [73] "India"                  "Indonesia"              "Iran"                  
##  [76] "Iraq"                   "Ireland"                "Israel"                
##  [79] "Italy"                  "Ivory Coast"            "Jamaica"               
##  [82] "Japan"                  "Jordan"                 "Kazakhstan"            
##  [85] "Kenya"                  "Kosovo"                 "Kuwait"                
##  [88] "Kyrgyzstan"             "Laos"                   "Latvia"                
##  [91] "Lebanon"                "Lesotho"                "Liberia"               
##  [94] "Libya"                  "Lithuania"              "Luxembourg[i]"         
##  [97] "Madagascar"             "Malawi"                 "Malaysia"              
## [100] "Maldives"               "Mali"                   "Malta"                 
## [103] "Mauritania"             "Mauritius"              "Mexico"                
## [106] "Moldova[j]"             "Mongolia"               "Montenegro"            
## [109] "Morocco"                "Mozambique"             "Myanmar"               
## [112] "Namibia"                "Nepal"                  "Netherlands"           
## [115] "New Caledonia"          "New Zealand"            "Niger"                 
## [118] "Nigeria"                "North Korea"            "North Macedonia"       
## [121] "Northern Cyprus[k]"     "Norway"                 "Oman"                  
## [124] "Pakistan"               "Palestine"              "Panama"                
## [127] "Papua New Guinea"       "Paraguay"               "Peru"                  
## [130] "Philippines"            "Poland"                 "Portugal"              
## [133] "Qatar"                  "Romania"                "Russia"                
## [136] "Rwanda"                 "Saint Kitts and Nevis"  "Saint Lucia"           
## [139] "Saint Vincent"          "San Marino"             "Saudi Arabia"          
## [142] "Senegal"                "Serbia"                 "Singapore"             
## [145] "Slovakia"               "Slovenia"               "South Africa"          
## [148] "South Korea"            "South Sudan"            "Spain"                 
## [151] "Sri Lanka"              "Sudan"                  "Sweden"                
## [154] "Switzerland[l]"         "Taiwan[m]"              "Tanzania"              
## [157] "Thailand"               "Togo"                   "Trinidad and Tobago"   
## [160] "Tunisia"                "Turkey"                 "Uganda"                
## [163] "Ukraine"                "United Arab Emirates"   "United Kingdom"        
## [166] "United States"          "Uruguay"                "Uzbekistan"            
## [169] "Venezuela"              "Vietnam"                "Zambia"                
## [172] "Zimbabwe"
# Sort the countries AtoZ (ascending order is sort()'s default)
countries.sorted <- sort(countries)

# Sort the countries ZtoA by asking sort() for decreasing order
countries.sorted.desc <- sort(countries, decreasing = TRUE)

# Print the sorted ZtoA list
print(countries.sorted.desc)
##   [1] "Zimbabwe"               "Zambia"                 "Vietnam"               
##   [4] "Venezuela"              "Uzbekistan"             "Uruguay"               
##   [7] "United States"          "United Kingdom"         "United Arab Emirates"  
##  [10] "Ukraine"                "Uganda"                 "Turkey"                
##  [13] "Tunisia"                "Trinidad and Tobago"    "Togo"                  
##  [16] "Thailand"               "Tanzania"               "Taiwan[m]"             
##  [19] "Switzerland[l]"         "Sweden"                 "Sudan"                 
##  [22] "Sri Lanka"              "Spain"                  "South Sudan"           
##  [25] "South Korea"            "South Africa"           "Slovenia"              
##  [28] "Slovakia"               "Singapore"              "Serbia"                
##  [31] "Senegal"                "Saudi Arabia"           "San Marino"            
##  [34] "Saint Vincent"          "Saint Lucia"            "Saint Kitts and Nevis" 
##  [37] "Rwanda"                 "Russia"                 "Romania"               
##  [40] "Qatar"                  "Portugal"               "Poland"                
##  [43] "Philippines"            "Peru"                   "Paraguay"              
##  [46] "Papua New Guinea"       "Panama"                 "Palestine"             
##  [49] "Pakistan"               "Oman"                   "Norway"                
##  [52] "Northern Cyprus[k]"     "North Macedonia"        "North Korea"           
##  [55] "Nigeria"                "Niger"                  "New Zealand"           
##  [58] "New Caledonia"          "Netherlands"            "Nepal"                 
##  [61] "Namibia"                "Myanmar"                "Mozambique"            
##  [64] "Morocco"                "Montenegro"             "Mongolia"              
##  [67] "Moldova[j]"             "Mexico"                 "Mauritius"             
##  [70] "Mauritania"             "Malta"                  "Mali"                  
##  [73] "Maldives"               "Malaysia"               "Malawi"                
##  [76] "Madagascar"             "Luxembourg[i]"          "Lithuania"             
##  [79] "Libya"                  "Liberia"                "Lesotho"               
##  [82] "Lebanon"                "Latvia"                 "Laos"                  
##  [85] "Kyrgyzstan"             "Kuwait"                 "Kosovo"                
##  [88] "Kenya"                  "Kazakhstan"             "Jordan"                
##  [91] "Japan"                  "Jamaica"                "Ivory Coast"           
##  [94] "Italy"                  "Israel"                 "Ireland"               
##  [97] "Iraq"                   "Iran"                   "Indonesia"             
## [100] "India"                  "Iceland"                "Hungary"               
## [103] "Honduras"               "Haiti"                  "Guyana"                
## [106] "Guinea-Bissau"          "Guinea"                 "Guatemala"             
## [109] "Grenada"                "Greenland"              "Greece"                
## [112] "Ghana"                  "Germany"                "Georgia[h]"            
## [115] "Gambia"                 "Gabon"                  "France[f][g]"          
## [118] "Finland"                "Fiji"                   "Faroe Islands"         
## [121] "Ethiopia"               "Eswatini"               "Estonia"               
## [124] "Equatorial Guinea"      "El Salvador"            "Egypt"                 
## [127] "Ecuador"                "DR Congo"               "Dominican Republic"    
## [130] "Dominica"               "Djibouti"               "Denmark[e]"            
## [133] "Czechia"                "Cyprus[d]"              "Cuba"                  
## [136] "Croatia"                "Costa Rica"             "Colombia"              
## [139] "China[c]"               "Chile"                  "Chad"                  
## [142] "Canada"                 "Cameroon"               "Cambodia"              
## [145] "Burundi"                "Burkina Faso"           "Bulgaria"              
## [148] "Brunei"                 "Brazil"                 "Botswana"              
## [151] "Bosnia and Herzegovina" "Bolivia"                "Bhutan"                
## [154] "Benin"                  "Belize"                 "Belgium"               
## [157] "Belarus"                "Barbados"               "Bangladesh"            
## [160] "Bahrain"                "Bahamas"                "Azerbaijan"            
## [163] "Austria"                "Australia"              "Armenia"               
## [166] "Argentina"              "Antigua and Barbuda"    "Angola"                
## [169] "Andorra"                "Algeria"                "Albania"               
## [172] "Afghanistan"

Task 7: Identify country names with a specific pattern

The goal of task 7 is to use a regular expression to find any countries whose names start with United

# Use the anchored regular expression `^United.+` to find matches. The `^`
# restricts matches to names that *start* with "United"; the unanchored
# `United.+` would also match "United" appearing in the middle of a name.
matched_indeces <- grep("^United.+", data$country)
matched_indeces
## [1] 164 165 166
# Print the matched country names, one per line
for (i in matched_indeces) {
  print(data$country[i])
}
## [1] "United Arab Emirates"
## [1] "United Kingdom"
## [1] "United States"

Task 8: Pick two countries you are interested in, and then review their testing data

The goal of task 8 is to compare the COVID-19 test data between two countries. You will need to select two rows from the data frame, along with the country, confirmed, and confirmed.population.ratio columns

# Select the row(s) of the data frame whose country is "Ukraine" (expected to
# be a single row). No column selection is applied: all columns are kept.
subset1 <- data[data$country == "Ukraine",]

# Same selection for "United Kingdom" (expected to be a single row), again
# keeping all columns.
subset2 <- data[data$country == "United Kingdom",]

Task 9: Compare which one of the selected countries has a larger ratio of confirmed cases to population

The goal of task 9 is to find out which of the two countries you selected has the larger ratio of confirmed cases to population, which may indicate that the country has a higher COVID-19 infection risk

# Compare the two selected countries on the same metric: the ratio of
# confirmed cases to population. (The condition used to compare subset1's
# tested/population ratio with subset2's confirmed/tested ratio, which are
# two different quantities.)
# NOTE(review): output rendered before this fix should be regenerated.

# `if` needs a scalar condition, so each subset must hold exactly one row.
stopifnot(nrow(subset1) == 1L, nrow(subset2) == 1L)

# On a tie the else branch runs, i.e. subset2's country is reported.
if (subset1$confirmed.population.ratio > subset2$confirmed.population.ratio) {
  print(paste0(subset1$country, " has higher COVID-19 infection risk."))
} else {
  print(paste0(subset2$country, " has higher COVID-19 infection risk."))
}
## [1] "Ukraine has higher COVID-19 infection risk."

Task 10: Find countries with confirmed cases to population ratio rate less than a threshold

The goal of task 10 is to find out which countries have a confirmed-to-population ratio of less than 1%, which may indicate that the risk in those countries is relatively low

# Get the countries whose `confirmed.population.ratio` is less than the
# threshold. The ratio column is expressed in percent, so 1 means 1%.
ratio_threshold <- 1

# which() keeps only rows where the comparison is TRUE, so a missing ratio
# is skipped instead of producing an NA entry in the result.
subset3 <- data$country[which(data$confirmed.population.ratio < ratio_threshold)]
subset3
##  [1] "Afghanistan"         "Algeria"             "Angola"             
##  [4] "Antigua and Barbuda" "Bangladesh"          "Benin"              
##  [7] "Brunei"              "Burkina Faso"        "Burundi"            
## [10] "Cambodia"            "Cameroon"            "Chad"               
## [13] "China[c]"            "DR Congo"            "Egypt"              
## [16] "Ethiopia"            "Gabon"               "Gambia"             
## [19] "Ghana"               "Grenada"             "Guinea"             
## [22] "Guinea-Bissau"       "Haiti"               "Ivory Coast"        
## [25] "Japan"               "Kenya"               "Laos"               
## [28] "Liberia"             "Madagascar"          "Malawi"             
## [31] "Mali"                "Mauritania"          "Mauritius"          
## [34] "Mozambique"          "Myanmar"             "New Caledonia"      
## [37] "Niger"               "Nigeria"             "North Korea"        
## [40] "Pakistan"            "Papua New Guinea"    "Rwanda"             
## [43] "Senegal"             "South Korea"         "South Sudan"        
## [46] "Sri Lanka"           "Sudan"               "Tanzania"           
## [49] "Thailand"            "Togo"                "Uganda"             
## [52] "Uzbekistan"          "Venezuela"