always_allow_html: true header-includes: -
This document demonstrates all required actions for
Assignment 1 – Data Analysis using R Programming using
the Canada COVID-19 dataset.
Every student can use their own dataset, provided it has a similar
structure (rows = observations, columns = variables).
Each section includes explanations and commented R code for clarity.
# Load essential libraries
library(tidyverse) # For data wrangling and visualization
library(janitor) # For clean column names
library(lubridate) # For handling dates
library(scales) # For number formatting in plots
# Set consistent visual theme
theme_set(theme_minimal(base_size = 13))
# Path to your dataset
# Each student should ensure the dataset is saved inside their R Project's /data folder
data_path <- "data/canada_covid19_dataset.csv"
# Stop execution if the file is not found
stopifnot(file.exists(data_path))
# Load and clean
data_raw <- readr::read_csv(data_path, show_col_types = FALSE)
# Clean column names and convert relevant columns
tidy_covid <- data_raw |>
janitor::clean_names() |>
mutate(
date = as.Date(date, format = "%d-%m-%Y"), # Convert date from character
across(
c(totalcases, numtotal_last7, ratecases_total, ratecases_last7,
numdeaths, numdeaths_last7, ratedeaths, ratedeaths_last7,
numtotal_last14, numdeaths_last14, ratetotal_last14, ratedeaths_last14,
avgcases_last7, avgincidence_last7, avgdeaths_last7, avgratedeaths_last7),
~ suppressWarnings(as.numeric(.x))
)
)
# Rename object to 'covid' for assignment consistency
covid <- tidy_covid
# Check structure and preview
glimpse(covid)
## Rows: 3,630
## Columns: 23
## $ pruid <dbl> 59, 48, 47, 46, 35, 24, 10, 13, 12, 11, 60, 61, 62…
## $ prname <chr> "British Columbia", "Alberta", "Saskatchewan", "Ma…
## $ prname_fr <chr> "Colombie-Britannique", "Alberta", "Saskatchewan",…
## $ date <date> 2020-02-08, 2020-02-08, 2020-02-08, 2020-02-08, 2…
## $ reporting_week <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,…
## $ reporting_year <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 20…
## $ update <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, 1, …
## $ totalcases <dbl> 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 4, 0,…
## $ numtotal_last7 <dbl> 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,…
## $ ratecases_total <dbl> 0.07, 0.00, 0.00, 0.00, 0.03, 0.00, 0.00, 0.00, 0.…
## $ numdeaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ numdeaths_last7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ratedeaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0…
## $ ratecases_last7 <dbl> 0.05, 0.00, 0.00, 0.00, 0.01, 0.00, 0.00, 0.00, 0.…
## $ ratedeaths_last7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0…
## $ numtotal_last14 <dbl> 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 0,…
## $ numdeaths_last14 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ratetotal_last14 <dbl> 0.07, 0.00, 0.00, 0.00, 0.01, 0.00, 0.00, 0.00, 0.…
## $ ratedeaths_last14 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0…
## $ avgcases_last7 <dbl> 0.43, 0.00, 0.00, 0.00, 0.14, 0.00, 0.00, 0.00, 0.…
## $ avgincidence_last7 <dbl> 0.01, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
## $ avgdeaths_last7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ avgratedeaths_last7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0…
head(covid, 10)
## # A tibble: 10 × 23
## pruid prname prname_fr date reporting_week reporting_year update
## <dbl> <chr> <chr> <date> <dbl> <dbl> <dbl>
## 1 59 British Colu… Colombie… 2020-02-08 6 2020 1
## 2 48 Alberta Alberta 2020-02-08 6 2020 1
## 3 47 Saskatchewan Saskatch… 2020-02-08 6 2020 1
## 4 46 Manitoba Manitoba 2020-02-08 6 2020 1
## 5 35 Ontario Ontario 2020-02-08 6 2020 1
## 6 24 Quebec Québec 2020-02-08 6 2020 1
## 7 10 Newfoundland… Terre-Ne… 2020-02-08 6 2020 1
## 8 13 New Brunswick Nouveau-… 2020-02-08 6 2020 1
## 9 12 Nova Scotia Nouvelle… 2020-02-08 6 2020 1
## 10 11 Prince Edwar… Île-du-P… 2020-02-08 6 2020 1
## # ℹ 16 more variables: totalcases <dbl>, numtotal_last7 <dbl>,
## # ratecases_total <dbl>, numdeaths <dbl>, numdeaths_last7 <dbl>,
## # ratedeaths <dbl>, ratecases_last7 <dbl>, ratedeaths_last7 <dbl>,
## # numtotal_last14 <dbl>, numdeaths_last14 <dbl>, ratetotal_last14 <dbl>,
## # ratedeaths_last14 <dbl>, avgcases_last7 <dbl>, avgincidence_last7 <dbl>,
## # avgdeaths_last7 <dbl>, avgratedeaths_last7 <dbl>
# Function to calculate average total cases per province-year
# You can modify the function to calculate other metrics (e.g., avg deaths or rate)
average_cases <- function(data, province, year) {
stopifnot(all(c("prname", "reporting_year", "totalcases") %in% names(data)))
data |>
filter(prname == province, reporting_year == year) |>
summarise(mean_totalcases = mean(totalcases, na.rm = TRUE))
}
# Example usage
average_cases(covid, "Ontario", 2021)
## # A tibble: 1 × 1
## mean_totalcases
## <dbl>
## 1 498149.
# Filter example: Ontario data for 2021 only where total cases > 100,000
# To use another province, change 'Ontario' to e.g., 'Quebec' or 'British Columbia'
ontario_2021 <- covid |>
filter(prname == "Ontario", reporting_year == 2021, totalcases > 100000)
glimpse(ontario_2021)
## Rows: 52
## Columns: 23
## $ pruid <dbl> 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35…
## $ prname <chr> "Ontario", "Ontario", "Ontario", "Ontario", "Ontar…
## $ prname_fr <chr> "Ontario", "Ontario", "Ontario", "Ontario", "Ontar…
## $ date <date> 2021-01-09, 2021-01-16, 2021-01-23, 2021-01-30, 2…
## $ reporting_week <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
## $ reporting_year <dbl> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 20…
## $ update <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ totalcases <dbl> 218835, 240220, 256636, 269410, 279192, 287093, 29…
## $ numtotal_last7 <dbl> 24875, 21385, 16416, 12774, 9782, 7901, 7462, 7682…
## $ ratecases_total <dbl> 1402.04, 1539.05, 1644.22, 1726.06, 1788.73, 1839.…
## $ numdeaths <dbl> 5275, 5643, 6075, 6448, 6710, 6880, 7031, 7128, 72…
## $ numdeaths_last7 <dbl> 371, 368, 432, 373, 262, 170, 151, 97, 87, 88, 82,…
## $ ratedeaths <dbl> 33.80, 36.15, 38.92, 41.31, 42.99, 44.08, 45.05, 4…
## $ ratecases_last7 <dbl> 159.37, 137.01, 105.17, 81.84, 62.67, 50.62, 47.81…
## $ ratedeaths_last7 <dbl> 2.38, 2.36, 2.77, 2.39, 1.68, 1.09, 0.97, 0.62, 0.…
## $ numtotal_last14 <dbl> 45334, 46260, 37801, 29190, 22556, 17683, 15363, 1…
## $ numdeaths_last14 <dbl> 698, 739, 800, 805, 635, 432, 321, 248, 184, 175, …
## $ ratetotal_last14 <dbl> 290.45, 296.38, 242.18, 187.02, 144.51, 113.29, 98…
## $ ratedeaths_last14 <dbl> 4.47, 4.73, 5.13, 5.16, 4.07, 2.77, 2.06, 1.59, 1.…
## $ avgcases_last7 <dbl> 3553.57, 3055.00, 2345.14, 1824.86, 1397.43, 1128.…
## $ avgincidence_last7 <dbl> 22.77, 19.57, 15.02, 11.69, 8.95, 7.23, 6.83, 7.03…
## $ avgdeaths_last7 <dbl> 53.00, 52.57, 61.71, 53.29, 37.43, 24.29, 21.57, 1…
## $ avgratedeaths_last7 <dbl> 0.34, 0.34, 0.40, 0.34, 0.24, 0.16, 0.14, 0.09, 0.…
# Select key columns for summary analysis
covid_selected <- covid |>
select(prname, reporting_year, date, totalcases, numdeaths)
# Summarise mean cases and deaths per province/year
covid_summary <- covid_selected |>
group_by(prname, reporting_year) |>
summarise(
mean_cases = mean(totalcases, na.rm = TRUE),
mean_deaths = mean(numdeaths, na.rm = TRUE),
.groups = "drop"
)
# Join summary back to main dataset
covid_joined <- covid_selected |>
left_join(covid_summary, by = c("prname", "reporting_year"))
glimpse(covid_joined)
## Rows: 3,630
## Columns: 7
## $ prname <chr> "British Columbia", "Alberta", "Saskatchewan", "Manitob…
## $ reporting_year <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2…
## $ date <date> 2020-02-08, 2020-02-08, 2020-02-08, 2020-02-08, 2020-0…
## $ totalcases <dbl> 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 4, 0, 0, 0…
## $ numdeaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ mean_cases <dbl> 9.948000e+03, 2.009021e+04, 2.611250e+03, 3.969042e+03,…
## $ mean_deaths <dbl> 218.5833333, 258.7708333, 24.2916667, 81.5000000, 2372.…
# Drop rows with missing values
covid_nomiss <- covid_joined |> drop_na()
# Remove duplicate rows if any
covid_nodup <- covid_nomiss |> distinct()
# Reorder and rename for readability
covid_sorted <- covid_nodup |> arrange(desc(totalcases)) |>
rename(
province = prname,
year = reporting_year,
total_cases = totalcases,
total_deaths = numdeaths
)
names(covid_sorted)
## [1] "province" "year" "date" "total_cases" "total_deaths"
## [6] "mean_cases" "mean_deaths"
# Create new columns (add or remove as needed)
# For instance, you could compute 'deaths_per_100k' if population data is available
covid_features <- covid_sorted |>
mutate(
fatality_rate = round(100 * (total_deaths / total_cases), 2),
total_cases_x2 = total_cases * 2
)
glimpse(covid_features)
## Rows: 3,375
## Columns: 9
## $ province <chr> "Canada", "Canada", "Canada", "Canada", "Canada", "Cana…
## $ year <dbl> 2024, 2024, 2024, 2024, 2024, 2024, 2024, 2024, 2024, 2…
## $ date <date> 2024-05-25, 2024-05-18, 2024-05-11, 2024-05-04, 2024-0…
## $ total_cases <dbl> 4964587, 4961966, 4959253, 4956770, 4954680, 4952777, 4…
## $ total_deaths <dbl> 59644, 59581, 59521, 59454, 59407, 59349, 59288, 59239,…
## $ mean_cases <dbl> 4940262, 4940262, 4940262, 4940262, 4940262, 4940262, 4…
## $ mean_deaths <dbl> 59562.39, 59562.39, 59562.39, 59562.39, 59562.39, 59562…
## $ fatality_rate <dbl> 1.20, 1.20, 1.20, 1.20, 1.20, 1.20, 1.20, 1.20, 1.20, 1…
## $ total_cases_x2 <dbl> 9929174, 9923932, 9918506, 9913540, 9909360, 9905554, 9…
# Create training/testing split for reproducibility
set.seed(123)
index <- sample(1:nrow(covid_features), 0.8 * nrow(covid_features))
covid_train <- covid_features[index, ]
covid_test <- covid_features[-index, ]
dim(covid_train); dim(covid_test)
## [1] 2700 9
## [1] 675 9
# Summary statistics for key numerical columns
summary(select(covid_features, total_cases, total_deaths, fatality_rate))
## total_cases total_deaths fatality_rate
## Min. : 0 Min. : 0 Min. : 0.000
## 1st Qu.: 1094 1st Qu.: 7 1st Qu.: 0.210
## Median : 49834 Median : 284 Median : 0.990
## Mean : 372628 Mean : 4672 Mean : 1.205
## 3rd Qu.: 190243 3rd Qu.: 3314 3rd Qu.: 1.380
## Max. :4964587 Max. :59644 Max. :14.290
## NA's :95
get_mode <- function(x) {
ux <- na.omit(unique(x))
ux[which.max(tabulate(match(x, ux)))]
}
# Calculates key descriptive statistics
with(covid_features, list(
mean_total_cases = mean(total_cases, na.rm = TRUE),
median_total_cases = median(total_cases, na.rm = TRUE),
mode_total_cases = get_mode(total_cases),
range_total_cases = range(total_cases, na.rm = TRUE)
))
## $mean_total_cases
## [1] 372628.4
##
## $median_total_cases
## [1] 49834
##
## $mode_total_cases
## [1] 13
##
## $range_total_cases
## [1] 0 4964587
# Scatter plot of total cases vs total deaths
# You can change 'color' or remove geom_smooth for a simpler plot
# Adjust the axis labels and scales as desired
ggplot(covid_features, aes(x = total_cases, y = total_deaths)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, linewidth = 0.8, color = "darkred") +
scale_x_continuous(labels = label_number(scale = 1e-6, suffix = " M")) +
scale_y_continuous(labels = label_number(scale = 1e-3, suffix = " K")) +
labs(
title = "Scatter Plot: Total Cases vs Total Deaths",
x = "Total Cases (Millions)",
y = "Total Deaths (Thousands)"
)
# Create bar chart of total COVID-19 cases by province
# To remove 'Canada' as a category, add: filter(province != "Canada") before summarise()
# To change colors, adjust the 'scale_fill_viridis_d' option below
# You can also swap fill = province for another variable (e.g., year)
plot_df <- covid_features |>
filter(province != "Canada") |> # Example filter to remove national aggregate
group_by(province) |>
summarise(total_cases = max(total_cases, na.rm = TRUE), .groups = "drop") |>
arrange(desc(total_cases))
ggplot(plot_df, aes(x = reorder(province, -total_cases), y = total_cases, fill = province)) +
geom_col(width = 0.7, show.legend = TRUE) +
scale_y_continuous(labels = label_number(scale = 1e-6, suffix = " M")) +
scale_fill_viridis_d(option = "turbo") + # Options: 'plasma', 'magma', 'inferno', 'cividis'
labs(
title = "Total COVID-19 Cases by Province in Canada",
subtitle = "Cumulative totals (2020–2024)",
x = "Province",
y = "Total Reported Cases (Millions)",
fill = "Province"
) +
theme(
axis.text.x = element_text(angle = 35, hjust = 1),
plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)
)
# Pearson correlation between total cases and total deaths
# You can replace these variables to test other relationships (e.g., avg_cases_7d vs avg_deaths_7d)
correlation_value <- cor(
covid_features$total_cases,
covid_features$total_deaths,
method = "pearson",
use = "complete.obs"
)
cat("Pearson correlation between total cases and total deaths =", round(correlation_value, 3))
## Pearson correlation between total cases and total deaths = 0.983
cat("All analysis steps completed successfully. Ready to knit or publish on RPubs.")
## All analysis steps completed successfully. Ready to knit or publish on RPubs.
/data/
folder.angle
or n.dodge
inside
theme(axis.text.x = element_text(...))
.color
or
fill
arguments in geom_*
or
scale_*
lines.filter()
statement before plotting.mutate()
section or select additional columns using
select()
.Ctrl + Shift + F10
), then knit again.