# Echo the source code of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Use this folder as the root directory when evaluating the chunks.
# NOTE(review): this is an absolute, user-specific Windows path; presumably it
# has to be adapted before the document can be knitted on another machine.
knitr::opts_knit$set(root.dir = "C:/Users/USER/Desktop/R folder")

COVID-19 edX Project

Load the necessary libraries

library(httr)
library(rvest)

TASK 1: Get a COVID-19 pandemic Wiki page using HTTP request

#' Fetch the "COVID-19 testing by country" Wikipedia template page.
#'
#' Sends an HTTP GET request to the Wikipedia `index.php` endpoint, passing the
#' template title as the `title` query parameter.
#'
#' @return The httr response object returned by `GET()`.
#'   An error is raised if the server answers with an HTTP error status, so
#'   that an error page is never handed to the HTML parsing steps.
get_wiki_covid19_page <- function() {
  wiki_base_url <- "https://en.wikipedia.org/w/index.php"
  url_param <- list(title = "Template:COVID-19_testing_by_country")

  # Use the `GET` function in the httr library with a `url` argument and a
  # `query` argument to get an HTTP response
  result <- GET(url = wiki_base_url, query = url_param)

  # Stop here, with the HTTP status in the message, when the request failed
  stop_for_status(result)

  result
}
# Call get_wiki_covid19_page(), keep the HTTP response in `result` and print
# it to inspect the status and the beginning of the page
result <- get_wiki_covid19_page()
result
## Response [https://en.wikipedia.org/w/index.php?title=Template%3ACOVID-19_testing_by_country]
##   Date: 2025-11-20 11:04
##   Status: 200
##   Content-Type: text/html; charset=UTF-8
##   Size: 456 kB
## <!DOCTYPE html>
## <html class="client-nojs vector-feature-language-in-header-enabled vector-fea...
## <head>
## <meta charset="UTF-8">
## <title>Template:COVID-19 testing by country - Wikipedia</title>
## <script>(function(){var className="client-js vector-feature-language-in-heade...
## RLSTATE={"ext.globalCssJs.user.styles":"ready","site.styles":"ready","user.st...
## <script>(RLQ=window.RLQ||[]).push(function(){mw.loader.impl(function(){return...
## }];});});</script>
## <link rel="stylesheet" href="/w/load.php?lang=en&amp;modules=ext.cite.styles%...
## ...

TASK 2: Extract COVID-19 testing data table from the wiki HTML page

# Parse the HTTP response from Task 1 into an HTML document and keep its root
# node, then print it
root<- read_html(result)
root
## {html_document}
## <html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" lang="en" dir="ltr">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="skin--responsive skin-vector skin-vector-search-vue mediawik ...
# Collect every <table> node found under the root HTML node and print the
# resulting node set
# NOTE(review): the variable shares its name with base R's table() function
table <- html_nodes(root, "table")
table
## {xml_nodeset (4)}
## [1] <table class="box-Update plainlinks ombox ombox-content ambox-Update" rol ...
## [2] <table class="wikitable plainrowheaders sortable collapsible autocollapse ...
## [3] <table class="plainlinks ombox mbox-small ombox-notice" role="presentatio ...
## [4] <table class="wikitable mw-templatedata-doc-params">\n<caption><p class=" ...
# Convert the second table node into a data frame. In the node set printed
# above, node [2] is the sortable "wikitable".
# NOTE(review): the node is selected by position, so this presumably needs
# checking whenever the layout of the Wikipedia page changes.
Covid19 <- as.data.frame(html_table(table[2]))
Covid19
# Inspect the data with head(), tail() and summary()
head(Covid19) # first six rows of the data set
tail(Covid19) # last six rows of the data set
summary(Covid19) # per-column overview of the data
##  Country.or.region    Date.a.             Tested            Units.b.        
##  Length:173         Length:173         Length:173         Length:173        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Confirmed.cases.   Confirmed..tested.. Tested..population..
##  Length:173         Length:173          Length:173          
##  Class :character   Class :character    Class :character    
##  Mode  :character   Mode  :character    Mode  :character    
##  Confirmed..population..     Ref.          
##  Length:173              Length:173        
##  Class :character        Class :character  
##  Mode  :character        Mode  :character
# Show the structure of the data frame: column names, types and first values
str(Covid19)
## 'data.frame':    173 obs. of  9 variables:
##  $ Country.or.region      : chr  "Afghanistan" "Albania" "Algeria" "Andorra" ...
##  $ Date.a.                : chr  "17 Dec 2020" "18 Feb 2021" "2 Nov 2020" "23 Feb 2022" ...
##  $ Tested                 : chr  "154,767" "428,654" "230,553" "300,307" ...
##  $ Units.b.               : chr  "samples" "samples" "samples" "samples" ...
##  $ Confirmed.cases.       : chr  "49,621" "96,838" "58,574" "37,958" ...
##  $ Confirmed..tested..    : chr  "32.1" "22.6" "25.4" "12.6" ...
##  $ Tested..population..   : chr  "0.40" "15.0" "0.53" "387" ...
##  $ Confirmed..population..: chr  "0.13" "3.4" "0.13" "49.0" ...
##  $ Ref.                   : chr  "[1]" "[2]" "[3][4]" "[5]" ...

TASK 3: Pre-process the extracted data frame from the previous step and export it as a CSV file

#(3a)
#' Clean the raw COVID-19 testing table scraped from Wikipedia.
#'
#' Drops the last row of the table, removes the `Units.b.` and `Ref.` columns,
#' renames the seven remaining columns, converts `country` and `date` to
#' factors, and converts the count and ratio columns to numeric after removing
#' the thousands separators.
#'
#' @param Covid19 Data frame holding the scraped table: the seven data columns
#'   in their original order plus the `Units.b.` and `Ref.` columns.
#' @return A data frame with the columns `country`, `date`, `tested`,
#'   `confirmed`, `confirmed.tested.ratio`, `tested.population.ratio` and
#'   `confirmed.population.ratio`, with one row fewer than the input.
preprocess_covid_data_frame <- function(Covid19) {
  # Remove the last row whatever the length of the table; a fixed 1:172 range
  # would pad shorter tables with NA rows and keep the last row of longer ones
  covid_frame <- head(Covid19, -1)

  # We don't need the Units and Ref columns, so they are removed
  kept_columns <- setdiff(names(covid_frame), c("Ref.", "Units.b."))
  covid_frame <- covid_frame[kept_columns]

  # Renaming the columns (the new names are assigned by position)
  new_names <- c(
    "country", "date", "tested", "confirmed", "confirmed.tested.ratio",
    "tested.population.ratio", "confirmed.population.ratio"
  )
  stopifnot(ncol(covid_frame) == length(new_names))
  names(covid_frame) <- new_names

  # Convert column data types
  covid_frame$country <- as.factor(covid_frame$country)
  covid_frame$date <- as.factor(covid_frame$date)

  # Counts and ratios arrive as text such as "154,767": strip the commas first
  numeric_columns <- setdiff(new_names, c("country", "date"))
  covid_frame[numeric_columns] <- lapply(
    covid_frame[numeric_columns],
    function(values) as.numeric(gsub(",", "", values))
  )

  covid_frame
}


# Run the pre-processing on the scraped table and print the cleaned data frame
Covid19_frame <-  preprocess_covid_data_frame(Covid19) 
Covid19_frame 
#(3b)
# Export the data frame to a file in the current working directory.
# NOTE(review): write.table() is called with its default separator, so the
# file is not comma-separated despite its .csv extension; Task 4 reads it back
# with read.table(), which matches this format.
write.table(Covid19_frame ,"covid_data.csv")
# Record the current working directory and print it
vid<-getwd()
vid
## [1] "C:/Users/USER/Desktop/R folder"
# Build the path of the exported file from `vid`, the working directory
# recorded right after the export. No setwd() call is needed: the file was
# written relative to that directory, and forcing a hard-coded folder would
# fail on any machine where it does not exist.
file_path <- file.path(vid, "covid_data.csv")
# Show the file path
file_path
## [1] "C:/Users/USER/Desktop/R folder/covid_data.csv"
# Confirm that the exported file exists
file.exists(file_path)
## [1] TRUE

TASK 4: Get a subset of the extracted data frame

# The goal of task 4 is to get the 5th to 10th rows from the data frame with
# only the country and confirmed columns selected.
# Read the exported file back through `file_path` (built in step 3b) instead
# of a hard-coded absolute Windows path.
Covid19_data_frame_csv <- read.table(file_path, header = TRUE, sep = "")
Covid19_data_frame_csv
head(Covid19_data_frame_csv)
# Rows 5 to 10, country and confirmed columns only, as the task requires
subset <- Covid19_data_frame_csv[5:10, c("country", "confirmed")]
subset

TASK 5: Calculate worldwide COVID testing positive ratio

# Task 5 goal: add up the confirmed and tested counts over all countries, then
# estimate the overall positive ratio as confirmed cases / tested cases
total_confirmed_cases <- sum(Covid19_data_frame_csv[["confirmed"]])
total_confirmed_cases
## [1] 431434555
total_tested_cases <- sum(Covid19_data_frame_csv[["tested"]])
total_tested_cases
## [1] 5396881644
positive_ratio <- total_confirmed_cases / total_tested_cases
positive_ratio
## [1] 0.07994145

TASK 6: Get a country list which reported their testing data

# Task 6 goal: build a sorted catalog of the countries that have reported
# their COVID-19 testing data
# Start by printing the `country` column
Covid19_data_frame_csv[["country"]]
##   [1] "Afghanistan"            "Albania"                "Algeria"               
##   [4] "Andorra"                "Angola"                 "Antigua and Barbuda"   
##   [7] "Argentina"              "Armenia"                "Australia"             
##  [10] "Austria"                "Azerbaijan"             "Bahamas"               
##  [13] "Bahrain"                "Bangladesh"             "Barbados"              
##  [16] "Belarus"                "Belgium"                "Belize"                
##  [19] "Benin"                  "Bhutan"                 "Bolivia"               
##  [22] "Bosnia and Herzegovina" "Botswana"               "Brazil"                
##  [25] "Brunei"                 "Bulgaria"               "Burkina Faso"          
##  [28] "Burundi"                "Cambodia"               "Cameroon"              
##  [31] "Canada"                 "Chad"                   "Chile"                 
##  [34] "China[c]"               "Colombia"               "Costa Rica"            
##  [37] "Croatia"                "Cuba"                   "Cyprus[d]"             
##  [40] "Czechia"                "Denmark[e]"             "Djibouti"              
##  [43] "Dominica"               "Dominican Republic"     "DR Congo"              
##  [46] "Ecuador"                "Egypt"                  "El Salvador"           
##  [49] "Equatorial Guinea"      "Estonia"                "Eswatini"              
##  [52] "Ethiopia"               "Faroe Islands"          "Fiji"                  
##  [55] "Finland"                "France[f][g]"           "Gabon"                 
##  [58] "Gambia"                 "Georgia[h]"             "Germany"               
##  [61] "Ghana"                  "Greece"                 "Greenland"             
##  [64] "Grenada"                "Guatemala"              "Guinea"                
##  [67] "Guinea-Bissau"          "Guyana"                 "Haiti"                 
##  [70] "Honduras"               "Hungary"                "Iceland"               
##  [73] "India"                  "Indonesia"              "Iran"                  
##  [76] "Iraq"                   "Ireland"                "Israel"                
##  [79] "Italy"                  "Ivory Coast"            "Jamaica"               
##  [82] "Japan"                  "Jordan"                 "Kazakhstan"            
##  [85] "Kenya"                  "Kosovo"                 "Kuwait"                
##  [88] "Kyrgyzstan"             "Laos"                   "Latvia"                
##  [91] "Lebanon"                "Lesotho"                "Liberia"               
##  [94] "Libya"                  "Lithuania"              "Luxembourg[i]"         
##  [97] "Madagascar"             "Malawi"                 "Malaysia"              
## [100] "Maldives"               "Mali"                   "Malta"                 
## [103] "Mauritania"             "Mauritius"              "Mexico"                
## [106] "Moldova[j]"             "Mongolia"               "Montenegro"            
## [109] "Morocco"                "Mozambique"             "Myanmar"               
## [112] "Namibia"                "Nepal"                  "Netherlands"           
## [115] "New Caledonia"          "New Zealand"            "Niger"                 
## [118] "Nigeria"                "North Korea"            "North Macedonia"       
## [121] "Northern Cyprus[k]"     "Norway"                 "Oman"                  
## [124] "Pakistan"               "Palestine"              "Panama"                
## [127] "Papua New Guinea"       "Paraguay"               "Peru"                  
## [130] "Philippines"            "Poland"                 "Portugal"              
## [133] "Qatar"                  "Romania"                "Russia"                
## [136] "Rwanda"                 "Saint Kitts and Nevis"  "Saint Lucia"           
## [139] "Saint Vincent"          "San Marino"             "Saudi Arabia"          
## [142] "Senegal"                "Serbia"                 "Singapore"             
## [145] "Slovakia"               "Slovenia"               "South Africa"          
## [148] "South Korea"            "South Sudan"            "Spain"                 
## [151] "Sri Lanka"              "Sudan"                  "Sweden"                
## [154] "Switzerland[l]"         "Taiwan[m]"              "Tanzania"              
## [157] "Thailand"               "Togo"                   "Trinidad and Tobago"   
## [160] "Tunisia"                "Turkey"                 "Uganda"                
## [163] "Ukraine"                "United Arab Emirates"   "United Kingdom"        
## [166] "United States"          "Uruguay"                "Uzbekistan"            
## [169] "Venezuela"              "Vietnam"                "Zambia"                
## [172] "Zimbabwe"
# Confirm the type of the column (expected: character)
class(Covid19_data_frame_csv[["country"]])
## [1] "character"
# Sort the countries in ascending (A to Z) order and print them
A_to_Z <- sort(Covid19_data_frame_csv[["country"]])
A_to_Z
##   [1] "Afghanistan"            "Albania"                "Algeria"               
##   [4] "Andorra"                "Angola"                 "Antigua and Barbuda"   
##   [7] "Argentina"              "Armenia"                "Australia"             
##  [10] "Austria"                "Azerbaijan"             "Bahamas"               
##  [13] "Bahrain"                "Bangladesh"             "Barbados"              
##  [16] "Belarus"                "Belgium"                "Belize"                
##  [19] "Benin"                  "Bhutan"                 "Bolivia"               
##  [22] "Bosnia and Herzegovina" "Botswana"               "Brazil"                
##  [25] "Brunei"                 "Bulgaria"               "Burkina Faso"          
##  [28] "Burundi"                "Cambodia"               "Cameroon"              
##  [31] "Canada"                 "Chad"                   "Chile"                 
##  [34] "China[c]"               "Colombia"               "Costa Rica"            
##  [37] "Croatia"                "Cuba"                   "Cyprus[d]"             
##  [40] "Czechia"                "Denmark[e]"             "Djibouti"              
##  [43] "Dominica"               "Dominican Republic"     "DR Congo"              
##  [46] "Ecuador"                "Egypt"                  "El Salvador"           
##  [49] "Equatorial Guinea"      "Estonia"                "Eswatini"              
##  [52] "Ethiopia"               "Faroe Islands"          "Fiji"                  
##  [55] "Finland"                "France[f][g]"           "Gabon"                 
##  [58] "Gambia"                 "Georgia[h]"             "Germany"               
##  [61] "Ghana"                  "Greece"                 "Greenland"             
##  [64] "Grenada"                "Guatemala"              "Guinea"                
##  [67] "Guinea-Bissau"          "Guyana"                 "Haiti"                 
##  [70] "Honduras"               "Hungary"                "Iceland"               
##  [73] "India"                  "Indonesia"              "Iran"                  
##  [76] "Iraq"                   "Ireland"                "Israel"                
##  [79] "Italy"                  "Ivory Coast"            "Jamaica"               
##  [82] "Japan"                  "Jordan"                 "Kazakhstan"            
##  [85] "Kenya"                  "Kosovo"                 "Kuwait"                
##  [88] "Kyrgyzstan"             "Laos"                   "Latvia"                
##  [91] "Lebanon"                "Lesotho"                "Liberia"               
##  [94] "Libya"                  "Lithuania"              "Luxembourg[i]"         
##  [97] "Madagascar"             "Malawi"                 "Malaysia"              
## [100] "Maldives"               "Mali"                   "Malta"                 
## [103] "Mauritania"             "Mauritius"              "Mexico"                
## [106] "Moldova[j]"             "Mongolia"               "Montenegro"            
## [109] "Morocco"                "Mozambique"             "Myanmar"               
## [112] "Namibia"                "Nepal"                  "Netherlands"           
## [115] "New Caledonia"          "New Zealand"            "Niger"                 
## [118] "Nigeria"                "North Korea"            "North Macedonia"       
## [121] "Northern Cyprus[k]"     "Norway"                 "Oman"                  
## [124] "Pakistan"               "Palestine"              "Panama"                
## [127] "Papua New Guinea"       "Paraguay"               "Peru"                  
## [130] "Philippines"            "Poland"                 "Portugal"              
## [133] "Qatar"                  "Romania"                "Russia"                
## [136] "Rwanda"                 "Saint Kitts and Nevis"  "Saint Lucia"           
## [139] "Saint Vincent"          "San Marino"             "Saudi Arabia"          
## [142] "Senegal"                "Serbia"                 "Singapore"             
## [145] "Slovakia"               "Slovenia"               "South Africa"          
## [148] "South Korea"            "South Sudan"            "Spain"                 
## [151] "Sri Lanka"              "Sudan"                  "Sweden"                
## [154] "Switzerland[l]"         "Taiwan[m]"              "Tanzania"              
## [157] "Thailand"               "Togo"                   "Trinidad and Tobago"   
## [160] "Tunisia"                "Turkey"                 "Uganda"                
## [163] "Ukraine"                "United Arab Emirates"   "United Kingdom"        
## [166] "United States"          "Uruguay"                "Uzbekistan"            
## [169] "Venezuela"              "Vietnam"                "Zambia"                
## [172] "Zimbabwe"
# Sort the countries in descending (Z to A) order
Z_to_A <- sort(Covid19_data_frame_csv[["country"]], decreasing = TRUE)
# Print the Z-to-A list
Z_to_A
##   [1] "Zimbabwe"               "Zambia"                 "Vietnam"               
##   [4] "Venezuela"              "Uzbekistan"             "Uruguay"               
##   [7] "United States"          "United Kingdom"         "United Arab Emirates"  
##  [10] "Ukraine"                "Uganda"                 "Turkey"                
##  [13] "Tunisia"                "Trinidad and Tobago"    "Togo"                  
##  [16] "Thailand"               "Tanzania"               "Taiwan[m]"             
##  [19] "Switzerland[l]"         "Sweden"                 "Sudan"                 
##  [22] "Sri Lanka"              "Spain"                  "South Sudan"           
##  [25] "South Korea"            "South Africa"           "Slovenia"              
##  [28] "Slovakia"               "Singapore"              "Serbia"                
##  [31] "Senegal"                "Saudi Arabia"           "San Marino"            
##  [34] "Saint Vincent"          "Saint Lucia"            "Saint Kitts and Nevis" 
##  [37] "Rwanda"                 "Russia"                 "Romania"               
##  [40] "Qatar"                  "Portugal"               "Poland"                
##  [43] "Philippines"            "Peru"                   "Paraguay"              
##  [46] "Papua New Guinea"       "Panama"                 "Palestine"             
##  [49] "Pakistan"               "Oman"                   "Norway"                
##  [52] "Northern Cyprus[k]"     "North Macedonia"        "North Korea"           
##  [55] "Nigeria"                "Niger"                  "New Zealand"           
##  [58] "New Caledonia"          "Netherlands"            "Nepal"                 
##  [61] "Namibia"                "Myanmar"                "Mozambique"            
##  [64] "Morocco"                "Montenegro"             "Mongolia"              
##  [67] "Moldova[j]"             "Mexico"                 "Mauritius"             
##  [70] "Mauritania"             "Malta"                  "Mali"                  
##  [73] "Maldives"               "Malaysia"               "Malawi"                
##  [76] "Madagascar"             "Luxembourg[i]"          "Lithuania"             
##  [79] "Libya"                  "Liberia"                "Lesotho"               
##  [82] "Lebanon"                "Latvia"                 "Laos"                  
##  [85] "Kyrgyzstan"             "Kuwait"                 "Kosovo"                
##  [88] "Kenya"                  "Kazakhstan"             "Jordan"                
##  [91] "Japan"                  "Jamaica"                "Ivory Coast"           
##  [94] "Italy"                  "Israel"                 "Ireland"               
##  [97] "Iraq"                   "Iran"                   "Indonesia"             
## [100] "India"                  "Iceland"                "Hungary"               
## [103] "Honduras"               "Haiti"                  "Guyana"                
## [106] "Guinea-Bissau"          "Guinea"                 "Guatemala"             
## [109] "Grenada"                "Greenland"              "Greece"                
## [112] "Ghana"                  "Germany"                "Georgia[h]"            
## [115] "Gambia"                 "Gabon"                  "France[f][g]"          
## [118] "Finland"                "Fiji"                   "Faroe Islands"         
## [121] "Ethiopia"               "Eswatini"               "Estonia"               
## [124] "Equatorial Guinea"      "El Salvador"            "Egypt"                 
## [127] "Ecuador"                "DR Congo"               "Dominican Republic"    
## [130] "Dominica"               "Djibouti"               "Denmark[e]"            
## [133] "Czechia"                "Cyprus[d]"              "Cuba"                  
## [136] "Croatia"                "Costa Rica"             "Colombia"              
## [139] "China[c]"               "Chile"                  "Chad"                  
## [142] "Canada"                 "Cameroon"               "Cambodia"              
## [145] "Burundi"                "Burkina Faso"           "Bulgaria"              
## [148] "Brunei"                 "Brazil"                 "Botswana"              
## [151] "Bosnia and Herzegovina" "Bolivia"                "Bhutan"                
## [154] "Benin"                  "Belize"                 "Belgium"               
## [157] "Belarus"                "Barbados"               "Bangladesh"            
## [160] "Bahrain"                "Bahamas"                "Azerbaijan"            
## [163] "Austria"                "Australia"              "Armenia"               
## [166] "Argentina"              "Antigua and Barbuda"    "Angola"                
## [169] "Andorra"                "Algeria"                "Albania"               
## [172] "Afghanistan"

TASK 7: Identify countries names with a specific pattern

# The goal of task 7 is to use a regular expression to find the countries
# whose names start with "United". The pattern is anchored with ^ so that a
# name merely containing "United" somewhere else is not reported.
matches <- regexpr("^United.+", Covid19_data_frame_csv$country)
regmatches(Covid19_data_frame_csv$country, matches)
## [1] "United Arab Emirates" "United Kingdom"       "United States"

TASK 8: Pick two countries you are interested in, and then review their testing data

# The goal of task 8 is to compare the COVID-19 test data between two
# countries: select two rows from the data frame, with the country, confirmed
# and confirmed-population-ratio columns.
# Rows and columns are selected by name instead of by position (rows 118 and
# 80, columns 1, 4 and 7) so the selection stays correct if the table changes.
comparison_columns <- c("country", "confirmed", "confirmed.population.ratio")
country_a <- Covid19_data_frame_csv[
  which(Covid19_data_frame_csv$country == "Nigeria"),
  comparison_columns
]
country_a
country_b <- Covid19_data_frame_csv[
  which(Covid19_data_frame_csv$country == "Ivory Coast"),
  comparison_columns
]
country_b

TASK 9: Compare which one of the selected countries has a larger ratio of confirmed cases to population

# The goal of task 9 is to find out which of the two selected countries has
# the larger ratio of confirmed cases to population, which may indicate a
# higher COVID-19 infection risk.
# Compare the confirmed.population.ratio column; the second column holds the
# absolute confirmed count, which is not what the task asks for.
ratio_a <- country_a[["confirmed.population.ratio"]]
ratio_b <- country_b[["confirmed.population.ratio"]]
# Use if-else statement
if (ratio_a > ratio_b) {
  print("Nigeria has more populations afected with covid19 hence is not safer than Ivory Coast")
} else {
  print("Ivory Coast has more populations afected with covid19 hence is not safer than Nigeria")
}
## [1] "Nigeria has more populations afected with covid19 hence is not safer than Ivory Coast"

TASK 10: Find countries with confirmed to population ratio rate less than a threshold

# The goal of task 10 is to find the countries whose confirmed-to-population
# ratio is below 1%, which may indicate a relatively low risk there
threshold <- 1
threshold
## [1] 1
# which() keeps only the rows where the comparison is TRUE, so a country with
# a missing ratio is left out instead of showing up as NA in the result
low_ratio_rows <- which(
  Covid19_data_frame_csv$confirmed.population.ratio < threshold
)
Covid19_data_frame_csv[low_ratio_rows, "country"]
##  [1] "Afghanistan"         "Algeria"             "Angola"             
##  [4] "Antigua and Barbuda" "Bangladesh"          "Benin"              
##  [7] "Brunei"              "Burkina Faso"        "Burundi"            
## [10] "Cambodia"            "Cameroon"            "Chad"               
## [13] "China[c]"            "DR Congo"            "Egypt"              
## [16] "Ethiopia"            "Gabon"               "Gambia"             
## [19] "Ghana"               "Grenada"             "Guinea"             
## [22] "Guinea-Bissau"       "Haiti"               "Ivory Coast"        
## [25] "Japan"               "Kenya"               "Laos"               
## [28] "Liberia"             "Madagascar"          "Malawi"             
## [31] "Mali"                "Mauritania"          "Mauritius"          
## [34] "Mozambique"          "Myanmar"             "New Caledonia"      
## [37] "Niger"               "Nigeria"             "North Korea"        
## [40] "Pakistan"            "Papua New Guinea"    "Rwanda"             
## [43] "Senegal"             "South Korea"         "South Sudan"        
## [46] "Sri Lanka"           "Sudan"               "Tanzania"           
## [49] "Thailand"            "Togo"                "Uganda"             
## [52] "Uzbekistan"          "Venezuela"

Visualizing the countries with the highest number of confirmed cases

library(ggplot2)
# Pick the ten countries with the most confirmed cases so the chart shows what
# its title announces, rather than an arbitrary slice of the table
confirmed_order <- order(Covid19_data_frame_csv$confirmed, decreasing = TRUE)
top_confirmed <- head(Covid19_data_frame_csv[confirmed_order, ], 10)

# Horizontal bar chart, bars ordered by the number of confirmed cases
ggplot(
  top_confirmed,
  aes(x = reorder(country, confirmed), y = confirmed, fill = country)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "highest country with confirmed cases",
    x = "country",
    y = "confirmed"
  ) +
  theme_minimal()