Analysis of Global COVID-19 Pandemic Data

First, load the necessary libraries.

library(httr)
library(rvest)
library(dplyr)
library(knitr)
library(ggplot2)

TASK 1: Get a COVID-19 pandemic Wiki page using HTTP request

# Request the Wikipedia "COVID-19 testing by country" template page.
# Takes no arguments; returns the response object produced by httr's GET().
get_wiki_covid19_page <- function() {
  GET(
    url = "https://en.wikipedia.org/w/index.php",
    query = list(title = "Template:COVID-19_testing_by_country")
  )
}
# NOTE: without parentheses this prints the function's definition; it does not
# send the HTTP request. Use get_wiki_covid19_page() to actually fetch the page.
get_wiki_covid19_page

function() {
  wiki_base_url <- "https://en.wikipedia.org/w/index.php"
  query_params <- list(title = "Template:COVID-19_testing_by_country")
  response <- GET(url = wiki_base_url, query = query_params)
  return(response)
}

TASK 2: Extract COVID-19 testing data table from the wiki HTML page

We use the read_html function to get the root HTML node of the wiki page.

# Address of the Wikipedia template page, parsed into an HTML document.
url <- "https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country"
page <- url %>% read_html()

Then we get the tables in the HTML root node using html_nodes function.

# Parse the template page and collect every <table> node it contains.
# Note that `response` holds the parsed HTML document, not an httr response.
response <- "https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country" %>%
  read_html()
tables <- response %>% html_nodes("table")

Read a table from the extracted table nodes using the html_table function, e.g. the table with index 2.

# Convert the second <table> node into a data frame and preview its first rows.
covid_table <- tables[[2]] %>% html_table(fill = TRUE)
covid_df <- covid_table %>% as.data.frame()
covid_df %>%
  head() %>%
  kable()
Country or region Date[a] Tested Units[b] Confirmed(cases) Confirmed /tested,% Tested /population,% Confirmed /population,% Ref.
Afghanistan 17 Dec 2020 154,767 samples 49,621 32.1 0.40 0.13 [1]
Albania 18 Feb 2021 428,654 samples 96,838 22.6 15.0 3.4 [2]
Algeria 2 Nov 2020 230,553 samples 58,574 25.4 0.53 0.13 [3][4]
Andorra 23 Feb 2022 300,307 samples 37,958 12.6 387 49.0 [5]
Angola 2 Feb 2021 399,228 samples 20,981 5.3 1.3 0.067 [6]
Antigua and Barbuda 6 Mar 2021 15,268 samples 832 5.4 15.9 0.86 [7]

TASK 3: Pre-process and export the extracted data frame

# Summarise the raw table; every column is still character at this point.
summary(covid_df)
##  Country or region    Date[a]             Tested            Units[b]        
##  Length:173         Length:173         Length:173         Length:173        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##  Confirmed(cases)   Confirmed /tested,% Tested /population,%
##  Length:173         Length:173          Length:173          
##  Class :character   Class :character    Class :character    
##  Mode  :character   Mode  :character    Mode  :character    
##  Confirmed /population,%     Ref.          
##  Length:173              Length:173        
##  Class :character        Class :character  
##  Mode  :character        Mode  :character
# Clean the raw COVID-19 testing table scraped from Wikipedia.
#
# Arguments:
#   covid_df  Raw data frame whose columns are the seven data columns plus
#             "Units[b]" and "Ref." (and possibly a column named "NA").
#   n_rows    Maximum number of leading rows to keep. The default of 172 keeps
#             the previous behaviour for the 173-row table (the last row is
#             presumably a non-data footer -- confirm against the wiki page).
#
# Returns a data frame with columns country (factor), date (Date), tested,
# confirmed, confirmed.tested.ratio, tested.population.ratio and
# confirmed.population.ratio (all numeric). Emits a warning when any numeric
# column contains NA after conversion.
preprocess_covid_data_frame <- function(covid_df, n_rows = 172) {
  # Strip thousands separators before converting. Cells that still are not
  # numeric become NA; the summary warning below reports that once, so the
  # per-cell coercion warnings are suppressed here.
  to_number <- function(x) {
    suppressWarnings(as.numeric(gsub(",", "", x, fixed = TRUE)))
  }

  # Clamp the row count so a shorter table is not padded with all-NA rows.
  keep_rows <- seq_len(min(n_rows, nrow(covid_df)))
  keep_cols <- !(names(covid_df) %in% c("Ref.", "Units[b]", "NA"))
  covid_df <- covid_df[keep_rows, keep_cols, drop = FALSE]

  names(covid_df) <- c("country", "date", "tested", "confirmed",
                       "confirmed.tested.ratio",
                       "tested.population.ratio",
                       "confirmed.population.ratio")

  covid_df$country <- as.factor(covid_df$country)
  # %b matches abbreviated month names in the current locale.
  covid_df$date <- as.Date(covid_df$date, format = "%d %b %Y")

  # Ratio columns can also carry thousands separators (e.g. "1,257"), so they
  # go through the same comma-stripping conversion as the count columns.
  numeric_cols <- c("tested", "confirmed", "confirmed.tested.ratio",
                    "tested.population.ratio", "confirmed.population.ratio")
  covid_df[numeric_cols] <- lapply(covid_df[numeric_cols], to_number)

  if (any(is.na(covid_df[numeric_cols]))) {
    warning("Some numeric columns contain NA values after conversion.")
  }

  covid_df
}

# Run the cleaning step on the scraped table and preview the first rows.
processed_covid_df <- covid_df %>% preprocess_covid_data_frame()
processed_covid_df %>%
  head() %>%
  kable()
country date tested confirmed confirmed.tested.ratio tested.population.ratio confirmed.population.ratio
Afghanistan 2020-12-17 154767 49621 32.1 0.40 0.130
Albania 2021-02-18 428654 96838 22.6 15.00 3.400
Algeria 2020-11-02 230553 58574 25.4 0.53 0.130
Andorra 2022-02-23 300307 37958 12.6 387.00 49.000
Angola 2021-02-02 399228 20981 5.3 1.30 0.067
Antigua and Barbuda 2021-03-06 15268 832 5.4 15.90 0.860
# Summarise the cleaned table to check the converted column types and NA counts.
summary(processed_covid_df)
##                 country         date                tested         
##  Afghanistan        :  1   Min.   :2020-07-31   Min.   :     3880  
##  Albania            :  1   1st Qu.:2021-05-30   1st Qu.:   512037  
##  Algeria            :  1   Median :2022-01-25   Median :  3029859  
##  Andorra            :  1   Mean   :2022-01-17   Mean   : 31377219  
##  Angola             :  1   3rd Qu.:2022-09-27   3rd Qu.: 12386725  
##  Antigua and Barbuda:  1   Max.   :2023-07-03   Max.   :929349291  
##  (Other)            :166                                           
##    confirmed        confirmed.tested.ratio tested.population.ratio
##  Min.   :       0   Min.   : 0.00          Min.   :  0.0065       
##  1st Qu.:   37839   1st Qu.: 5.00          1st Qu.:  8.5000       
##  Median :  281196   Median :10.05          Median : 40.9500       
##  Mean   : 2508340   Mean   :11.25          Mean   :106.9261       
##  3rd Qu.: 1278105   3rd Qu.:15.25          3rd Qu.:135.0000       
##  Max.   :90749469   Max.   :46.80          Max.   :943.0000       
##                                            NA's   :6              
##  confirmed.population.ratio
##  Min.   : 0.000            
##  1st Qu.: 0.425            
##  Median : 6.100            
##  Mean   :12.769            
##  3rd Qu.:16.250            
##  Max.   :74.400            
## 
# Export the cleaned table to newcovid.csv (working directory), without row names.
write.csv(processed_covid_df, file = "newcovid.csv", row.names = FALSE)

# Build the path of the exported CSV inside the current working directory.
# file.path() joins the components with "/" and replaces the paste() call
# whose `sep` argument was buried between the pieces being joined.
wd <- getwd()
file_path <- file.path(wd, "newcovid.csv")
print(file_path)

[1] “C:/Users/HP/Documents/Olamide’s R Program/newcovid.csv”

# Confirm that the exported CSV is present at the path built above.
file.exists(file_path)

[1] TRUE

# Set mirror to avoid CRAN error
options(repos = c(CRAN = "https://cloud.r-project.org/"))

# Install necessary packages
# requireNamespace() only checks whether a package is available; require()
# would also attach it as a side effect. Attaching is left to the library()
# calls that follow.
map_packages <- c("leaflet", "dplyr", "rnaturalearth", "rnaturalearthdata", "sf")
missing_packages <- map_packages[
  !vapply(map_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Load libraries
library(leaflet)
library(dplyr)
library(rnaturalearth)
library(rnaturalearthdata)
library(sf)

# Read the exported COVID-19 table back in
covid_data <- read.csv("newcovid.csv")

# Normalise the column names to lower case
names(covid_data) <- tolower(names(covid_data))

# Load country centroids (as spatial data)
# ne_countries() is asked for medium-scale country data as an sf object;
# st_centroid() is then applied to it (presumably giving one point per
# country -- confirm against the sf documentation).
countries_sf <- ne_countries(scale = "medium", returnclass = "sf")
centroids <- st_centroid(countries_sf)

# Extract coordinates and convert to data frame
# Columns 1 and 2 of st_coordinates() are stored as lon and lat (presumably the
# X/longitude and Y/latitude of each centroid -- TODO confirm). The geometry
# column is dropped at the end, leaving name, lon and lat.
centroids_df <- centroids %>%
  select(name, geometry) %>%
  mutate(
    lon = st_coordinates(geometry)[,1],
    lat = st_coordinates(geometry)[,2]
  ) %>%
  st_drop_geometry()

# Merge centroids with your COVID data
# Trailing Wikipedia footnote markers (e.g. "China[c]", "France[f][g]") are
# removed from the country names first; with the marker attached those rows
# can never equal a map name and would be dropped by the inner join.
covid_map_data <- covid_data %>%
  mutate(country = gsub("(\\[[a-z]+\\])+$", "", country)) %>%
  inner_join(centroids_df, by = c("country" = "name"))

# Create popup text for each location
# The text is built column-wise. apply() over a data frame first coerces it to
# a character matrix, which formats every numeric column to a common width and
# precision, so the popups showed padded values such as "0.13000".
format_popup_value <- function(x) {
  if (is.numeric(x)) {
    # Format each number on its own so no padding or shared precision is added.
    vapply(x, format, character(1), scientific = FALSE, USE.NAMES = FALSE)
  } else {
    as.character(x)
  }
}

popup_info <- with(covid_map_data, paste(
  sprintf("<b>Country:</b> %s", format_popup_value(country)),
  sprintf("<b>Date:</b> %s", format_popup_value(date)),
  sprintf("<b>Tested:</b> %s", format_popup_value(tested)),
  sprintf("<b>Confirmed:</b> %s", format_popup_value(confirmed)),
  sprintf("<b>Confirmed/Tested Ratio:</b> %s",
          format_popup_value(confirmed.tested.ratio)),
  sprintf("<b>Tested/Population Ratio:</b> %s",
          format_popup_value(tested.population.ratio)),
  sprintf("<b>Confirmed/Population Ratio:</b> %s",
          format_popup_value(confirmed.population.ratio)),
  sep = "<br>"
))

# Draw the interactive map: base tiles, one marker per matched country (with
# its popup text), and a legend in the bottom-right corner.
covid_marker_map <- leaflet(covid_map_data) %>% addTiles()
covid_marker_map <- addMarkers(
  covid_marker_map,
  lng = ~lon,
  lat = ~lat,
  popup = popup_info
)
covid_marker_map <- addLegend(
  covid_marker_map,
  "bottomright",
  title = "COVID-19 Marker",
  colors = "blue",
  labels = "Location Markers"
)
covid_marker_map

TASK 4: Get a subset of the extracted data frame

# Re-read the exported CSV, then keep rows 5 to 10 of the country and
# confirmed columns.
covid_data <- read.csv("newcovid.csv")
subset_data <- covid_data[seq(5, 10), c("country", "confirmed")]
subset_data %>% kable()
country confirmed
5 Angola 20981
6 Antigua and Barbuda 832
7 Argentina 9060495
8 Armenia 422963
9 Australia 10112229
10 Austria 5789991

TASK 5: Calculate worldwide COVID testing positive ratio

# Worldwide totals (NA entries ignored) and the share of tests that were positive.
total_confirmed <- sum(processed_covid_df[["confirmed"]], na.rm = TRUE)
total_tested <- sum(processed_covid_df[["tested"]], na.rm = TRUE)
positive_ratio <- total_confirmed / total_tested
cat("Worldwide Total Confirmed Cases:", total_confirmed, "\n")

Worldwide Total Confirmed Cases: 431434555

# Print the worldwide number of tests summed above.
cat("Worldwide Total Tested Cases:", total_tested, "\n")

Worldwide Total Tested Cases: 5396881644

# Print the positive ratio as a percentage rounded to two decimals.
cat("Worldwide Positive Ratio:", round(positive_ratio * 100, 2), "%\n")

Worldwide Positive Ratio: 7.99 %

TASK 6: Get a country list which reported their testing data

# Pull out the country column and check its class.
countries <- processed_covid_df[["country"]]
class(countries)

[1] "factor"

# Convert the factor to plain strings and sort them alphabetically.
sorted_atoz <- countries %>%
  as.character() %>%
  sort()
cat("Countries A to Z:\n")

Countries A to Z:

# Render the sorted country names as a one-column table.
kable(sorted_atoz)
x
Afghanistan
Albania
Algeria
Andorra
Angola
Antigua and Barbuda
Argentina
Armenia
Australia
Austria
Azerbaijan
Bahamas
Bahrain
Bangladesh
Barbados
Belarus
Belgium
Belize
Benin
Bhutan
Bolivia
Bosnia and Herzegovina
Botswana
Brazil
Brunei
Bulgaria
Burkina Faso
Burundi
Cambodia
Cameroon
Canada
Chad
Chile
China[c]
Colombia
Costa Rica
Croatia
Cuba
Cyprus[d]
Czechia
Denmark[e]
Djibouti
Dominica
Dominican Republic
DR Congo
Ecuador
Egypt
El Salvador
Equatorial Guinea
Estonia
Eswatini
Ethiopia
Faroe Islands
Fiji
Finland
France[f][g]
Gabon
Gambia
Georgia[h]
Germany
Ghana
Greece
Greenland
Grenada
Guatemala
Guinea
Guinea-Bissau
Guyana
Haiti
Honduras
Hungary
Iceland
India
Indonesia
Iran
Iraq
Ireland
Israel
Italy
Ivory Coast
Jamaica
Japan
Jordan
Kazakhstan
Kenya
Kosovo
Kuwait
Kyrgyzstan
Laos
Latvia
Lebanon
Lesotho
Liberia
Libya
Lithuania
Luxembourg[i]
Madagascar
Malawi
Malaysia
Maldives
Mali
Malta
Mauritania
Mauritius
Mexico
Moldova[j]
Mongolia
Montenegro
Morocco
Mozambique
Myanmar
Namibia
Nepal
Netherlands
New Caledonia
New Zealand
Niger
Nigeria
North Korea
North Macedonia
Northern Cyprus[k]
Norway
Oman
Pakistan
Palestine
Panama
Papua New Guinea
Paraguay
Peru
Philippines
Poland
Portugal
Qatar
Romania
Russia
Rwanda
Saint Kitts and Nevis
Saint Lucia
Saint Vincent
San Marino
Saudi Arabia
Senegal
Serbia
Singapore
Slovakia
Slovenia
South Africa
South Korea
South Sudan
Spain
Sri Lanka
Sudan
Sweden
Switzerland[l]
Taiwan[m]
Tanzania
Thailand
Togo
Trinidad and Tobago
Tunisia
Turkey
Uganda
Ukraine
United Arab Emirates
United Kingdom
United States
Uruguay
Uzbekistan
Venezuela
Vietnam
Zambia
Zimbabwe

TASK 7: Identify countries names with a specific pattern

# Country names that begin with "United". The pattern is anchored with "^" so
# it matches the printed description; an unanchored "United" would also match
# names that merely contain the word.
united_countries <- grep("^United", processed_covid_df$country, value = TRUE)
cat("Countries starting with 'United':\n")

Countries starting with 'United':

# Render the matched country names as a one-column table.
kable(united_countries)
x
United Arab Emirates
United Kingdom
United States

TASK 8: Pick two countries you are interested in, and then review their testing data

# Pull the confirmed count and confirmed/population ratio for each of the two
# chosen countries, then stack the two rows into one table.
country1_data <- processed_covid_df[
  processed_covid_df[["country"]] == "United States",
  c("country", "confirmed", "confirmed.population.ratio")
]
country2_data <- processed_covid_df[
  processed_covid_df[["country"]] == "United Kingdom",
  c("country", "confirmed", "confirmed.population.ratio")
]
comparison <- rbind(country1_data, country2_data)
comparison %>% kable()
country confirmed confirmed.population.ratio
166 United States 90749469 27.4
165 United Kingdom 22232377 32.9

TASK 9: Compare which one of the selected countries has a larger ratio of confirmed cases to population

# Keep only the country and ratio columns for the two chosen countries. Rows
# stay in the table's own order, so row 1 is whichever country appears first
# in the data.
comparison <- processed_covid_df %>%
  select(country, confirmed.population.ratio) %>%
  filter(country %in% c("United States", "United Kingdom"))
ratio1 <- comparison[["confirmed.population.ratio"]][1]
ratio1

[1] 32.9

# Ratio of the second row of the comparison table.
ratio2 <- comparison[["confirmed.population.ratio"]][2]
ratio2

[1] 27.4

# Country (still a factor) in the first row of the comparison table.
country1 <- comparison[["country"]][1]
country1

[1] United Kingdom 172 Levels: Afghanistan Albania Algeria Andorra Angola … Zimbabwe

# Country (still a factor) in the second row of the comparison table.
country2 <- comparison[["country"]][2]
country2

[1] United States 172 Levels: Afghanistan Albania Algeria Andorra Angola … Zimbabwe

# Report which country has the larger confirmed/population ratio, handling the
# tie first and then the two orderings.
if (ratio1 == ratio2) {
  cat("Both countries have the same confirmed-to-population ratio (", 
      ratio1, "%)\n\n\n")
} else if (ratio1 > ratio2) {
  cat(as.character(country1), "has higher COVID-19 infection risk (", ratio1, "%) than", 
      as.character(country2), "(", ratio2, "%)\n\n\n")
} else {
  cat(as.character(country2), "has higher COVID-19 infection risk (", ratio2, "%) than", 
      as.character(country1), "(", ratio1, "%)\n\n\n")
}

United Kingdom has higher COVID-19 infection risk ( 32.9 %) than United States ( 27.4 %)

TASK 10: Find countries with a confirmed-to-population ratio below a threshold

# Countries whose confirmed/population ratio is below 1 (percent), listed from
# the lowest ratio upwards.
low_risk_countries <- processed_covid_df %>%
  select(country, confirmed.population.ratio) %>%
  filter(confirmed.population.ratio < 1) %>%
  arrange(confirmed.population.ratio)
low_risk_countries %>% kable()
country confirmed.population.ratio
North Korea 0.00000
Laos 0.00063
Tanzania 0.00085
China[c] 0.00610
Burundi 0.00740
Papua New Guinea 0.01100
Niger 0.02100
Chad 0.02900
DR Congo 0.02900
Thailand 0.03800
Mauritius 0.03900
New Caledonia 0.05000
Sudan 0.05300
Burkina Faso 0.05800
Angola 0.06700
Benin 0.06700
Mali 0.07100
Brunei 0.07400
Madagascar 0.07600
Nigeria 0.07600
Gabon 0.08200
South Sudan 0.08400
Uganda 0.08700
Liberia 0.11000
Cameroon 0.12000
Afghanistan 0.13000
Algeria 0.13000
Ivory Coast 0.13000
Uzbekistan 0.13000
Grenada 0.14000
South Korea 0.17000
Guinea 0.19000
Gambia 0.21000
Kenya 0.23000
Ethiopia 0.24000
Pakistan 0.27000
Egypt 0.28000
Senegal 0.29000
Haiti 0.30000
Ghana 0.31000
Japan 0.34000
Mozambique 0.34000
Mauritania 0.41000
Sri Lanka 0.43000
Guinea-Bissau 0.45000
Malawi 0.46000
Togo 0.46000
Cambodia 0.48000
Venezuela 0.55000
Bangladesh 0.70000
Rwanda 0.76000
Myanmar 0.81000
Antigua and Barbuda 0.86000