First, load the necessary libraries.
library(httr)
library(rvest)
library(dplyr)
library(knitr)
library(ggplot2)
# Request the Wikipedia "Template:COVID-19_testing_by_country" page.
#
# Sends a GET request to the Wikipedia index.php endpoint with the template
# title as the `title` query parameter and returns the response object from
# GET() unchanged (the HTTP status is not checked here).
get_wiki_covid19_page <- function() {
  GET(
    url = "https://en.wikipedia.org/w/index.php",
    query = list(title = "Template:COVID-19_testing_by_country")
  )
}
get_wiki_covid19_page
function () { wiki_base_url <- “https://en.wikipedia.org/w/index.php” query_params <- list(title = “Template:COVID-19_testing_by_country”) response <- GET(url = wiki_base_url, query = query_params) return(response) }
We use the read_html function to get the root HTML node of the page.
# Address of the Wikipedia template page, parsed into an HTML document.
url <- "https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country"
page <- url %>% read_html()
Then we get the tables in the HTML root node using the html_nodes function.
# Parse the page, then collect every <table> node it contains.
response <- "https://en.wikipedia.org/w/index.php?title=Template:COVID-19_testing_by_country" %>%
  read_html()
tables <- response %>% html_nodes("table")
Read a table from the extracted table nodes into a data frame using the html_table function, e.g. the table at index 2.
# Convert the second <table> node into a data frame and preview its first rows.
covid_table <- tables[[2]] %>% html_table(fill = TRUE)
covid_df <- covid_table %>% as.data.frame()
covid_df %>%
  head() %>%
  kable()
| Country or region | Date[a] | Tested | Units[b] | Confirmed(cases) | Confirmed /tested,% | Tested /population,% | Confirmed /population,% | Ref. |
|---|---|---|---|---|---|---|---|---|
| Afghanistan | 17 Dec 2020 | 154,767 | samples | 49,621 | 32.1 | 0.40 | 0.13 | [1] |
| Albania | 18 Feb 2021 | 428,654 | samples | 96,838 | 22.6 | 15.0 | 3.4 | [2] |
| Algeria | 2 Nov 2020 | 230,553 | samples | 58,574 | 25.4 | 0.53 | 0.13 | [3][4] |
| Andorra | 23 Feb 2022 | 300,307 | samples | 37,958 | 12.6 | 387 | 49.0 | [5] |
| Angola | 2 Feb 2021 | 399,228 | samples | 20,981 | 5.3 | 1.3 | 0.067 | [6] |
| Antigua and Barbuda | 6 Mar 2021 | 15,268 | samples | 832 | 5.4 | 15.9 | 0.86 | [7] |
summary(covid_df)
## Country or region Date[a] Tested Units[b]
## Length:173 Length:173 Length:173 Length:173
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Confirmed(cases) Confirmed /tested,% Tested /population,%
## Length:173 Length:173 Length:173
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## Confirmed /population,% Ref.
## Length:173 Length:173
## Class :character Class :character
## Mode :character Mode :character
#' Clean the scraped COVID-19 testing table.
#'
#' Keeps at most the first 172 rows, drops the "Ref." and "Units[b]" columns
#' (and any column literally named "NA"), renames the remaining seven columns,
#' and converts them from character to factor / Date / numeric.
#'
#' @param covid_df Data frame with the scraped columns, all stored as
#'   character: country, date ("%d %b %Y"), tested, units, confirmed, three
#'   ratio columns and a reference column.
#' @return A data frame with columns `country` (factor), `date` (Date),
#'   `tested`, `confirmed`, `confirmed.tested.ratio`,
#'   `tested.population.ratio` and `confirmed.population.ratio` (numeric).
#'   Values that cannot be parsed become `NA`; a warning is raised when any
#'   numeric column contains `NA`.
preprocess_covid_data_frame <- function(covid_df) {
  # head() caps the table at 172 rows without padding shorter inputs with
  # all-NA rows, which `covid_df[1:172, ]` would do.
  # NOTE(review): presumably row 173 of the scraped table is a non-country
  # footer row -- confirm against the live page.
  covid_df <- head(covid_df, 172)

  dropped_cols <- c("Ref.", "Units[b]", "NA")
  covid_df <- covid_df[, !(names(covid_df) %in% dropped_cols), drop = FALSE]

  count_cols <- c("tested", "confirmed")
  ratio_cols <- c(
    "confirmed.tested.ratio",
    "tested.population.ratio",
    "confirmed.population.ratio"
  )
  names(covid_df) <- c("country", "date", count_cols, ratio_cols)

  covid_df$country <- as.factor(covid_df$country)
  covid_df$date <- as.Date(covid_df$date, format = "%d %b %Y")

  # Counts use "," as a thousands separator; strip it before parsing.
  # Unparseable cells deliberately become NA (reported once below).
  covid_df[count_cols] <- lapply(covid_df[count_cols], function(x) {
    suppressWarnings(as.numeric(gsub(",", "", x)))
  })
  covid_df[ratio_cols] <- lapply(covid_df[ratio_cols], function(x) {
    suppressWarnings(as.numeric(x))
  })

  # Skip the country and date columns when checking for failed conversions.
  if (any(is.na(covid_df[-c(1, 2)]))) {
    warning("Some numeric columns contain NA values after conversion.")
  }

  covid_df
}
# Clean the scraped table and preview the first rows of the result.
processed_covid_df <- covid_df %>% preprocess_covid_data_frame()
processed_covid_df %>%
  head() %>%
  kable()
| country | date | tested | confirmed | confirmed.tested.ratio | tested.population.ratio | confirmed.population.ratio |
|---|---|---|---|---|---|---|
| Afghanistan | 2020-12-17 | 154767 | 49621 | 32.1 | 0.40 | 0.130 |
| Albania | 2021-02-18 | 428654 | 96838 | 22.6 | 15.00 | 3.400 |
| Algeria | 2020-11-02 | 230553 | 58574 | 25.4 | 0.53 | 0.130 |
| Andorra | 2022-02-23 | 300307 | 37958 | 12.6 | 387.00 | 49.000 |
| Angola | 2021-02-02 | 399228 | 20981 | 5.3 | 1.30 | 0.067 |
| Antigua and Barbuda | 2021-03-06 | 15268 | 832 | 5.4 | 15.90 | 0.860 |
summary(processed_covid_df)
## country date tested
## Afghanistan : 1 Min. :2020-07-31 Min. : 3880
## Albania : 1 1st Qu.:2021-05-30 1st Qu.: 512037
## Algeria : 1 Median :2022-01-25 Median : 3029859
## Andorra : 1 Mean :2022-01-17 Mean : 31377219
## Angola : 1 3rd Qu.:2022-09-27 3rd Qu.: 12386725
## Antigua and Barbuda: 1 Max. :2023-07-03 Max. :929349291
## (Other) :166
## confirmed confirmed.tested.ratio tested.population.ratio
## Min. : 0 Min. : 0.00 Min. : 0.0065
## 1st Qu.: 37839 1st Qu.: 5.00 1st Qu.: 8.5000
## Median : 281196 Median :10.05 Median : 40.9500
## Mean : 2508340 Mean :11.25 Mean :106.9261
## 3rd Qu.: 1278105 3rd Qu.:15.25 3rd Qu.:135.0000
## Max. :90749469 Max. :46.80 Max. :943.0000
## NA's :6
## confirmed.population.ratio
## Min. : 0.000
## 1st Qu.: 0.425
## Median : 6.100
## Mean :12.769
## 3rd Qu.:16.250
## Max. :74.400
##
# Export the cleaned data set to the working directory (no row-name column).
write.csv(
  processed_covid_df,
  file = "newcovid.csv",
  row.names = FALSE
)
# Build the full path with file.path() rather than pasting a "/" by hand.
wd <- getwd()
file_path <- file.path(wd, "newcovid.csv")
print(file_path)
[1] “C:/Users/HP/Documents/Olamide’s R Program/newcovid.csv”
file.exists(file_path)
[1] TRUE
# Set mirror to avoid CRAN error
options(repos = c(CRAN = "https://cloud.r-project.org/"))
# Install any package that is not available yet. requireNamespace() only
# checks for availability; it does not attach the package or warn when it is
# missing, so the library() calls that follow do the actual loading.
map_packages <- c("leaflet", "dplyr", "rnaturalearth", "rnaturalearthdata", "sf")
for (pkg in map_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# Load libraries
library(leaflet)
library(dplyr)
library(rnaturalearth)
library(rnaturalearthdata)
library(sf)
# Load COVID-19 data
covid_data <- read.csv("newcovid.csv")
# Clean column names
colnames(covid_data) <- tolower(colnames(covid_data))
# Load country centroids (as spatial data)
countries_sf <- ne_countries(scale = "medium", returnclass = "sf")
centroids <- st_centroid(countries_sf)
# Extract the coordinate matrix once (column 1 = longitude, column 2 =
# latitude) and keep only the country name next to it.
centroid_coords <- st_coordinates(centroids)
centroids_df <- centroids %>%
  st_drop_geometry() %>%
  select(name) %>%
  mutate(
    lon = centroid_coords[, 1],
    lat = centroid_coords[, 2]
  )
# Merge centroids with the COVID data. Country names in the scraped table can
# carry footnote markers such as "China[c]" or "France[f][g]"; strip them
# first, otherwise those rows never match a centroid name and are silently
# dropped by the inner join.
covid_map_data <- covid_data %>%
  mutate(country = trimws(gsub("\\[[^]]*\\]", "", country))) %>%
  inner_join(centroids_df, by = c("country" = "name"))
# Create popup text for each location.
# Built column-wise instead of with apply(): apply() first coerces the data
# frame to a character matrix, which re-formats every numeric column to a
# common width and number of digits before the values reach sprintf().
popup_info <- with(covid_map_data, paste(
  sprintf("<b>Country:</b> %s", country),
  sprintf("<b>Date:</b> %s", date),
  sprintf("<b>Tested:</b> %s", tested),
  sprintf("<b>Confirmed:</b> %s", confirmed),
  sprintf("<b>Confirmed/Tested Ratio:</b> %s", confirmed.tested.ratio),
  sprintf("<b>Tested/Population Ratio:</b> %s", tested.population.ratio),
  sprintf("<b>Confirmed/Population Ratio:</b> %s", confirmed.population.ratio),
  sep = "<br>"
))
# Build the interactive map step by step: base tiles, one marker per country
# centroid (with its popup text), then a legend in the bottom-right corner.
covid_map <- leaflet(covid_map_data)
covid_map <- addTiles(covid_map)
covid_map <- addMarkers(
  covid_map,
  lng = ~lon,
  lat = ~lat,
  popup = popup_info
)
covid_map <- addLegend(
  covid_map,
  "bottomright",
  title = "COVID-19 Marker",
  colors = "blue",
  labels = "Location Markers"
)
# Print the widget so it renders.
covid_map
# Re-read the exported file and show country / confirmed for rows 5 to 10.
covid_data <- read.csv("newcovid.csv")
wanted_rows <- 5:10
wanted_cols <- c("country", "confirmed")
subset_data <- covid_data[wanted_rows, wanted_cols]
kable(subset_data)
| country | confirmed | |
|---|---|---|
| 5 | Angola | 20981 |
| 6 | Antigua and Barbuda | 832 |
| 7 | Argentina | 9060495 |
| 8 | Armenia | 422963 |
| 9 | Australia | 10112229 |
| 10 | Austria | 5789991 |
# Worldwide totals (missing values ignored) and the overall share of tests
# that were confirmed cases.
total_confirmed <- with(processed_covid_df, sum(confirmed, na.rm = TRUE))
total_tested <- with(processed_covid_df, sum(tested, na.rm = TRUE))
positive_ratio <- total_confirmed / total_tested
cat("Worldwide Total Confirmed Cases:", total_confirmed, "\n")
Worldwide Total Confirmed Cases: 431434555
cat("Worldwide Total Tested Cases:", total_tested, "\n")
Worldwide Total Tested Cases: 5396881644
cat("Worldwide Positive Ratio:", round(positive_ratio * 100, 2), "%\n")
Worldwide Positive Ratio: 7.99 %
# Pull out the country column and inspect its class.
countries <- processed_covid_df[["country"]]
class(countries)
[1] “factor”
# Convert the factor to plain strings before sorting alphabetically.
sorted_atoz <- countries %>%
  as.character() %>%
  sort()
cat("Countries A to Z:\n")
Countries A to Z:
kable(sorted_atoz)
| x |
|---|
| Afghanistan |
| Albania |
| Algeria |
| Andorra |
| Angola |
| Antigua and Barbuda |
| Argentina |
| Armenia |
| Australia |
| Austria |
| Azerbaijan |
| Bahamas |
| Bahrain |
| Bangladesh |
| Barbados |
| Belarus |
| Belgium |
| Belize |
| Benin |
| Bhutan |
| Bolivia |
| Bosnia and Herzegovina |
| Botswana |
| Brazil |
| Brunei |
| Bulgaria |
| Burkina Faso |
| Burundi |
| Cambodia |
| Cameroon |
| Canada |
| Chad |
| Chile |
| China[c] |
| Colombia |
| Costa Rica |
| Croatia |
| Cuba |
| Cyprus[d] |
| Czechia |
| Denmark[e] |
| Djibouti |
| Dominica |
| Dominican Republic |
| DR Congo |
| Ecuador |
| Egypt |
| El Salvador |
| Equatorial Guinea |
| Estonia |
| Eswatini |
| Ethiopia |
| Faroe Islands |
| Fiji |
| Finland |
| France[f][g] |
| Gabon |
| Gambia |
| Georgia[h] |
| Germany |
| Ghana |
| Greece |
| Greenland |
| Grenada |
| Guatemala |
| Guinea |
| Guinea-Bissau |
| Guyana |
| Haiti |
| Honduras |
| Hungary |
| Iceland |
| India |
| Indonesia |
| Iran |
| Iraq |
| Ireland |
| Israel |
| Italy |
| Ivory Coast |
| Jamaica |
| Japan |
| Jordan |
| Kazakhstan |
| Kenya |
| Kosovo |
| Kuwait |
| Kyrgyzstan |
| Laos |
| Latvia |
| Lebanon |
| Lesotho |
| Liberia |
| Libya |
| Lithuania |
| Luxembourg[i] |
| Madagascar |
| Malawi |
| Malaysia |
| Maldives |
| Mali |
| Malta |
| Mauritania |
| Mauritius |
| Mexico |
| Moldova[j] |
| Mongolia |
| Montenegro |
| Morocco |
| Mozambique |
| Myanmar |
| Namibia |
| Nepal |
| Netherlands |
| New Caledonia |
| New Zealand |
| Niger |
| Nigeria |
| North Korea |
| North Macedonia |
| Northern Cyprus[k] |
| Norway |
| Oman |
| Pakistan |
| Palestine |
| Panama |
| Papua New Guinea |
| Paraguay |
| Peru |
| Philippines |
| Poland |
| Portugal |
| Qatar |
| Romania |
| Russia |
| Rwanda |
| Saint Kitts and Nevis |
| Saint Lucia |
| Saint Vincent |
| San Marino |
| Saudi Arabia |
| Senegal |
| Serbia |
| Singapore |
| Slovakia |
| Slovenia |
| South Africa |
| South Korea |
| South Sudan |
| Spain |
| Sri Lanka |
| Sudan |
| Sweden |
| Switzerland[l] |
| Taiwan[m] |
| Tanzania |
| Thailand |
| Togo |
| Trinidad and Tobago |
| Tunisia |
| Turkey |
| Uganda |
| Ukraine |
| United Arab Emirates |
| United Kingdom |
| United States |
| Uruguay |
| Uzbekistan |
| Venezuela |
| Vietnam |
| Zambia |
| Zimbabwe |
# Anchor the pattern with "^" so only names that begin with "United" match;
# an unanchored "United" would also match the word anywhere inside a name.
united_countries <- grep("^United", processed_covid_df$country, value = TRUE)
cat("Countries starting with 'United':\n")
Countries starting with ‘United’:
kable(united_countries)
| x |
|---|
| United Arab Emirates |
| United Kingdom |
| United States |
# Side-by-side table of confirmed cases and confirmed/population ratio for
# the United States (first row) and the United Kingdom (second row).
comparison_cols <- c("country", "confirmed", "confirmed.population.ratio")
is_united_states <- processed_covid_df$country == "United States"
is_united_kingdom <- processed_covid_df$country == "United Kingdom"
country1_data <- processed_covid_df[is_united_states, comparison_cols]
country2_data <- processed_covid_df[is_united_kingdom, comparison_cols]
comparison <- rbind(country1_data, country2_data)
kable(comparison)
| country | confirmed | confirmed.population.ratio | |
|---|---|---|---|
| 166 | United States | 90749469 | 27.4 |
| 165 | United Kingdom | 22232377 | 32.9 |
# Keep only the two countries being compared; rows stay in the order they
# appear in processed_covid_df, so index 1 / 2 follow that order.
compared_countries <- c("United States", "United Kingdom")
comparison <- processed_covid_df %>%
  select(country, confirmed.population.ratio) %>%
  filter(country %in% compared_countries)
ratio1 <- comparison[["confirmed.population.ratio"]][1]
ratio1
[1] 32.9
ratio2 <- comparison$confirmed.population.ratio[2]
ratio2
[1] 27.4
country1 <- comparison$country[1]
country1
[1] United Kingdom 172 Levels: Afghanistan Albania Algeria Andorra Angola … Zimbabwe
country2 <- comparison$country[2]
country2
[1] United States 172 Levels: Afghanistan Albania Algeria Andorra Angola … Zimbabwe
# Report which of the two countries has the larger confirmed/population
# ratio, or that the ratios are equal.
risk_labels <- c(as.character(country1), as.character(country2))
risk_ratios <- c(ratio1, ratio2)
if (ratio1 != ratio2) {
  higher_idx <- if (ratio1 > ratio2) 1L else 2L
  lower_idx <- 3L - higher_idx
  cat(risk_labels[higher_idx], "has higher COVID-19 infection risk (",
      risk_ratios[higher_idx], "%) than",
      risk_labels[lower_idx], "(", risk_ratios[lower_idx], "%)\n\n\n")
} else {
  cat("Both countries have the same confirmed-to-population ratio (",
      ratio1, "%)\n\n\n")
}
United Kingdom has higher COVID-19 infection risk ( 32.9 %) than United States ( 27.4 %)
# Countries whose confirmed/population ratio is below 1 (%), lowest first.
low_risk_threshold <- 1
low_risk_countries <- processed_covid_df %>%
  select(country, confirmed.population.ratio) %>%
  filter(confirmed.population.ratio < low_risk_threshold) %>%
  arrange(confirmed.population.ratio)
kable(low_risk_countries)
| country | confirmed.population.ratio |
|---|---|
| North Korea | 0.00000 |
| Laos | 0.00063 |
| Tanzania | 0.00085 |
| China[c] | 0.00610 |
| Burundi | 0.00740 |
| Papua New Guinea | 0.01100 |
| Niger | 0.02100 |
| Chad | 0.02900 |
| DR Congo | 0.02900 |
| Thailand | 0.03800 |
| Mauritius | 0.03900 |
| New Caledonia | 0.05000 |
| Sudan | 0.05300 |
| Burkina Faso | 0.05800 |
| Angola | 0.06700 |
| Benin | 0.06700 |
| Mali | 0.07100 |
| Brunei | 0.07400 |
| Madagascar | 0.07600 |
| Nigeria | 0.07600 |
| Gabon | 0.08200 |
| South Sudan | 0.08400 |
| Uganda | 0.08700 |
| Liberia | 0.11000 |
| Cameroon | 0.12000 |
| Afghanistan | 0.13000 |
| Algeria | 0.13000 |
| Ivory Coast | 0.13000 |
| Uzbekistan | 0.13000 |
| Grenada | 0.14000 |
| South Korea | 0.17000 |
| Guinea | 0.19000 |
| Gambia | 0.21000 |
| Kenya | 0.23000 |
| Ethiopia | 0.24000 |
| Pakistan | 0.27000 |
| Egypt | 0.28000 |
| Senegal | 0.29000 |
| Haiti | 0.30000 |
| Ghana | 0.31000 |
| Japan | 0.34000 |
| Mozambique | 0.34000 |
| Mauritania | 0.41000 |
| Sri Lanka | 0.43000 |
| Guinea-Bissau | 0.45000 |
| Malawi | 0.46000 |
| Togo | 0.46000 |
| Cambodia | 0.48000 |
| Venezuela | 0.55000 |
| Bangladesh | 0.70000 |
| Rwanda | 0.76000 |
| Myanmar | 0.81000 |
| Antigua and Barbuda | 0.86000 |