always_allow_html: true header-includes: -

Introduction

This document demonstrates all required actions for Assignment 1 – Data Analysis using R Programming using the Canada COVID-19 dataset.
Every student can use their own dataset, provided it has a similar structure (rows = observations, columns = variables).
Each section includes explanations and commented R code for clarity.


1. Setup and Libraries

# Load essential libraries
library(tidyverse)   # For data wrangling and visualization
library(janitor)     # For clean column names
library(lubridate)   # For handling dates
library(scales)      # For number formatting in plots

# Set consistent visual theme
theme_set(theme_minimal(base_size = 13))

2. Load and Clean the Dataset

# Path to your dataset
# Each student should ensure the dataset is saved inside their R Project's /data folder
data_path <- "data/canada_covid19_dataset.csv"

# Stop execution if the file is not found
stopifnot(file.exists(data_path))

# Load and clean
data_raw <- readr::read_csv(data_path, show_col_types = FALSE)

# Clean column names and convert relevant columns
tidy_covid <- data_raw |>
  janitor::clean_names() |>
  mutate(
    date = as.Date(date, format = "%d-%m-%Y"), # Convert date from character
    across(
      c(totalcases, numtotal_last7, ratecases_total, ratecases_last7,
        numdeaths, numdeaths_last7, ratedeaths, ratedeaths_last7,
        numtotal_last14, numdeaths_last14, ratetotal_last14, ratedeaths_last14,
        avgcases_last7, avgincidence_last7, avgdeaths_last7, avgratedeaths_last7),
      ~ suppressWarnings(as.numeric(.x))
    )
  )

# Rename object to 'covid' for assignment consistency
covid <- tidy_covid

# Check structure and preview
glimpse(covid)
## Rows: 3,630
## Columns: 23
## $ pruid               <dbl> 59, 48, 47, 46, 35, 24, 10, 13, 12, 11, 60, 61, 62…
## $ prname              <chr> "British Columbia", "Alberta", "Saskatchewan", "Ma…
## $ prname_fr           <chr> "Colombie-Britannique", "Alberta", "Saskatchewan",…
## $ date                <date> 2020-02-08, 2020-02-08, 2020-02-08, 2020-02-08, 2…
## $ reporting_week      <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,…
## $ reporting_year      <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 20…
## $ update              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, 1, …
## $ totalcases          <dbl> 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 4, 0,…
## $ numtotal_last7      <dbl> 3, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,…
## $ ratecases_total     <dbl> 0.07, 0.00, 0.00, 0.00, 0.03, 0.00, 0.00, 0.00, 0.…
## $ numdeaths           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ numdeaths_last7     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ratedeaths          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0…
## $ ratecases_last7     <dbl> 0.05, 0.00, 0.00, 0.00, 0.01, 0.00, 0.00, 0.00, 0.…
## $ ratedeaths_last7    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0…
## $ numtotal_last14     <dbl> 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 0,…
## $ numdeaths_last14    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ratetotal_last14    <dbl> 0.07, 0.00, 0.00, 0.00, 0.01, 0.00, 0.00, 0.00, 0.…
## $ ratedeaths_last14   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0…
## $ avgcases_last7      <dbl> 0.43, 0.00, 0.00, 0.00, 0.14, 0.00, 0.00, 0.00, 0.…
## $ avgincidence_last7  <dbl> 0.01, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
## $ avgdeaths_last7     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ avgratedeaths_last7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0…
head(covid, 10)
## # A tibble: 10 × 23
##    pruid prname        prname_fr date       reporting_week reporting_year update
##    <dbl> <chr>         <chr>     <date>              <dbl>          <dbl>  <dbl>
##  1    59 British Colu… Colombie… 2020-02-08              6           2020      1
##  2    48 Alberta       Alberta   2020-02-08              6           2020      1
##  3    47 Saskatchewan  Saskatch… 2020-02-08              6           2020      1
##  4    46 Manitoba      Manitoba  2020-02-08              6           2020      1
##  5    35 Ontario       Ontario   2020-02-08              6           2020      1
##  6    24 Quebec        Québec    2020-02-08              6           2020      1
##  7    10 Newfoundland… Terre-Ne… 2020-02-08              6           2020      1
##  8    13 New Brunswick Nouveau-… 2020-02-08              6           2020      1
##  9    12 Nova Scotia   Nouvelle… 2020-02-08              6           2020      1
## 10    11 Prince Edwar… Île-du-P… 2020-02-08              6           2020      1
## # ℹ 16 more variables: totalcases <dbl>, numtotal_last7 <dbl>,
## #   ratecases_total <dbl>, numdeaths <dbl>, numdeaths_last7 <dbl>,
## #   ratedeaths <dbl>, ratecases_last7 <dbl>, ratedeaths_last7 <dbl>,
## #   numtotal_last14 <dbl>, numdeaths_last14 <dbl>, ratetotal_last14 <dbl>,
## #   ratedeaths_last14 <dbl>, avgcases_last7 <dbl>, avgincidence_last7 <dbl>,
## #   avgdeaths_last7 <dbl>, avgratedeaths_last7 <dbl>

3. User-Defined Function

# Function to calculate average total cases per province-year
# You can modify the function to calculate other metrics (e.g., avg deaths or rate)
average_cases <- function(data, province, year) {
  stopifnot(all(c("prname", "reporting_year", "totalcases") %in% names(data)))
  data |>
    filter(prname == province, reporting_year == year) |>
    summarise(mean_totalcases = mean(totalcases, na.rm = TRUE))
}

# Example usage
average_cases(covid, "Ontario", 2021)
## # A tibble: 1 × 1
##   mean_totalcases
##             <dbl>
## 1         498149.

4. Filtering and Selecting Data

# Filter example: Ontario data for 2021 only where total cases > 100,000
# To use another province, change 'Ontario' to e.g., 'Quebec' or 'British Columbia'
ontario_2021 <- covid |>
  filter(prname == "Ontario", reporting_year == 2021, totalcases > 100000)

glimpse(ontario_2021)
## Rows: 52
## Columns: 23
## $ pruid               <dbl> 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35…
## $ prname              <chr> "Ontario", "Ontario", "Ontario", "Ontario", "Ontar…
## $ prname_fr           <chr> "Ontario", "Ontario", "Ontario", "Ontario", "Ontar…
## $ date                <date> 2021-01-09, 2021-01-16, 2021-01-23, 2021-01-30, 2…
## $ reporting_week      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,…
## $ reporting_year      <dbl> 2021, 2021, 2021, 2021, 2021, 2021, 2021, 2021, 20…
## $ update              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ totalcases          <dbl> 218835, 240220, 256636, 269410, 279192, 287093, 29…
## $ numtotal_last7      <dbl> 24875, 21385, 16416, 12774, 9782, 7901, 7462, 7682…
## $ ratecases_total     <dbl> 1402.04, 1539.05, 1644.22, 1726.06, 1788.73, 1839.…
## $ numdeaths           <dbl> 5275, 5643, 6075, 6448, 6710, 6880, 7031, 7128, 72…
## $ numdeaths_last7     <dbl> 371, 368, 432, 373, 262, 170, 151, 97, 87, 88, 82,…
## $ ratedeaths          <dbl> 33.80, 36.15, 38.92, 41.31, 42.99, 44.08, 45.05, 4…
## $ ratecases_last7     <dbl> 159.37, 137.01, 105.17, 81.84, 62.67, 50.62, 47.81…
## $ ratedeaths_last7    <dbl> 2.38, 2.36, 2.77, 2.39, 1.68, 1.09, 0.97, 0.62, 0.…
## $ numtotal_last14     <dbl> 45334, 46260, 37801, 29190, 22556, 17683, 15363, 1…
## $ numdeaths_last14    <dbl> 698, 739, 800, 805, 635, 432, 321, 248, 184, 175, …
## $ ratetotal_last14    <dbl> 290.45, 296.38, 242.18, 187.02, 144.51, 113.29, 98…
## $ ratedeaths_last14   <dbl> 4.47, 4.73, 5.13, 5.16, 4.07, 2.77, 2.06, 1.59, 1.…
## $ avgcases_last7      <dbl> 3553.57, 3055.00, 2345.14, 1824.86, 1397.43, 1128.…
## $ avgincidence_last7  <dbl> 22.77, 19.57, 15.02, 11.69, 8.95, 7.23, 6.83, 7.03…
## $ avgdeaths_last7     <dbl> 53.00, 52.57, 61.71, 53.29, 37.43, 24.29, 21.57, 1…
## $ avgratedeaths_last7 <dbl> 0.34, 0.34, 0.40, 0.34, 0.24, 0.16, 0.14, 0.09, 0.…

5. Summarising and Joining

# Select key columns for summary analysis
covid_selected <- covid |>
  select(prname, reporting_year, date, totalcases, numdeaths)

# Summarise mean cases and deaths per province/year
covid_summary <- covid_selected |>
  group_by(prname, reporting_year) |>
  summarise(
    mean_cases = mean(totalcases, na.rm = TRUE),
    mean_deaths = mean(numdeaths, na.rm = TRUE),
    .groups = "drop"
  )

# Join summary back to main dataset
covid_joined <- covid_selected |>
  left_join(covid_summary, by = c("prname", "reporting_year"))

glimpse(covid_joined)
## Rows: 3,630
## Columns: 7
## $ prname         <chr> "British Columbia", "Alberta", "Saskatchewan", "Manitob…
## $ reporting_year <dbl> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2…
## $ date           <date> 2020-02-08, 2020-02-08, 2020-02-08, 2020-02-08, 2020-0…
## $ totalcases     <dbl> 4, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 4, 0, 0, 0…
## $ numdeaths      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ mean_cases     <dbl> 9.948000e+03, 2.009021e+04, 2.611250e+03, 3.969042e+03,…
## $ mean_deaths    <dbl> 218.5833333, 258.7708333, 24.2916667, 81.5000000, 2372.…

6. Cleaning the Data

# Drop rows with missing values
covid_nomiss <- covid_joined |> drop_na()

# Remove duplicate rows if any
covid_nodup <- covid_nomiss |> distinct()

# Reorder and rename for readability
covid_sorted <- covid_nodup |> arrange(desc(totalcases)) |>
  rename(
    province = prname,
    year = reporting_year,
    total_cases = totalcases,
    total_deaths = numdeaths
  )

names(covid_sorted)
## [1] "province"     "year"         "date"         "total_cases"  "total_deaths"
## [6] "mean_cases"   "mean_deaths"

7. Add Derived Variables

# Create new columns (add or remove as needed)
# For instance, you could compute 'deaths_per_100k' if population data is available
covid_features <- covid_sorted |>
  mutate(
    fatality_rate = round(100 * (total_deaths / total_cases), 2),
    total_cases_x2 = total_cases * 2
  )

glimpse(covid_features)
## Rows: 3,375
## Columns: 9
## $ province       <chr> "Canada", "Canada", "Canada", "Canada", "Canada", "Cana…
## $ year           <dbl> 2024, 2024, 2024, 2024, 2024, 2024, 2024, 2024, 2024, 2…
## $ date           <date> 2024-05-25, 2024-05-18, 2024-05-11, 2024-05-04, 2024-0…
## $ total_cases    <dbl> 4964587, 4961966, 4959253, 4956770, 4954680, 4952777, 4…
## $ total_deaths   <dbl> 59644, 59581, 59521, 59454, 59407, 59349, 59288, 59239,…
## $ mean_cases     <dbl> 4940262, 4940262, 4940262, 4940262, 4940262, 4940262, 4…
## $ mean_deaths    <dbl> 59562.39, 59562.39, 59562.39, 59562.39, 59562.39, 59562…
## $ fatality_rate  <dbl> 1.20, 1.20, 1.20, 1.20, 1.20, 1.20, 1.20, 1.20, 1.20, 1…
## $ total_cases_x2 <dbl> 9929174, 9923932, 9918506, 9913540, 9909360, 9905554, 9…

8. Train/Test Split and Summary Stats

# Create training/testing split for reproducibility
set.seed(123)
index <- sample(1:nrow(covid_features), 0.8 * nrow(covid_features))
covid_train <- covid_features[index, ]
covid_test  <- covid_features[-index, ]

dim(covid_train); dim(covid_test)
## [1] 2700    9
## [1] 675   9
# Summary statistics for key numerical columns
summary(select(covid_features, total_cases, total_deaths, fatality_rate))
##   total_cases       total_deaths   fatality_rate   
##  Min.   :      0   Min.   :    0   Min.   : 0.000  
##  1st Qu.:   1094   1st Qu.:    7   1st Qu.: 0.210  
##  Median :  49834   Median :  284   Median : 0.990  
##  Mean   : 372628   Mean   : 4672   Mean   : 1.205  
##  3rd Qu.: 190243   3rd Qu.: 3314   3rd Qu.: 1.380  
##  Max.   :4964587   Max.   :59644   Max.   :14.290  
##                                    NA's   :95

9. Mean, Median, Mode, and Range

get_mode <- function(x) {
  ux <- na.omit(unique(x))
  ux[which.max(tabulate(match(x, ux)))]
}

# Calculates key descriptive statistics
with(covid_features, list(
  mean_total_cases = mean(total_cases, na.rm = TRUE),
  median_total_cases = median(total_cases, na.rm = TRUE),
  mode_total_cases = get_mode(total_cases),
  range_total_cases = range(total_cases, na.rm = TRUE)
))
## $mean_total_cases
## [1] 372628.4
## 
## $median_total_cases
## [1] 49834
## 
## $mode_total_cases
## [1] 13
## 
## $range_total_cases
## [1]       0 4964587

10. Visualization – Scatter Plot

# Scatter plot of total cases vs total deaths
# You can change 'color' or remove geom_smooth for a simpler plot
# Adjust the axis labels and scales as desired
ggplot(covid_features, aes(x = total_cases, y = total_deaths)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", se = TRUE, linewidth = 0.8, color = "darkred") +
  scale_x_continuous(labels = label_number(scale = 1e-6, suffix = " M")) +
  scale_y_continuous(labels = label_number(scale = 1e-3, suffix = " K")) +
  labs(
    title = "Scatter Plot: Total Cases vs Total Deaths",
    x = "Total Cases (Millions)",
    y = "Total Deaths (Thousands)"
  )


11. Visualization – Bar Plot by Province

# Create bar chart of total COVID-19 cases by province
# To remove 'Canada' as a category, add: filter(province != "Canada") before summarise()
# To change colors, adjust the 'scale_fill_viridis_d' option below
# You can also swap fill = province for another variable (e.g., year)

plot_df <- covid_features |>
  filter(province != "Canada") |>  # Example filter to remove national aggregate
  group_by(province) |>
  summarise(total_cases = max(total_cases, na.rm = TRUE), .groups = "drop") |>
  arrange(desc(total_cases))

ggplot(plot_df, aes(x = reorder(province, -total_cases), y = total_cases, fill = province)) +
  geom_col(width = 0.7, show.legend = TRUE) +
  scale_y_continuous(labels = label_number(scale = 1e-6, suffix = " M")) +
  scale_fill_viridis_d(option = "turbo") +  # Options: 'plasma', 'magma', 'inferno', 'cividis'
  labs(
    title = "Total COVID-19 Cases by Province in Canada",
    subtitle = "Cumulative totals (2020–2024)",
    x = "Province",
    y = "Total Reported Cases (Millions)",
    fill = "Province"
  ) +
  theme(
    axis.text.x = element_text(angle = 35, hjust = 1),
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )


12. Correlation (Pearson)

# Pearson correlation between total cases and total deaths
# You can replace these variables to test other relationships (e.g., avg_cases_7d vs avg_deaths_7d)
correlation_value <- cor(
  covid_features$total_cases,
  covid_features$total_deaths,
  method = "pearson",
  use = "complete.obs"
)

cat("Pearson correlation between total cases and total deaths =", round(correlation_value, 3))
## Pearson correlation between total cases and total deaths = 0.983

13. Final Check

cat("All analysis steps completed successfully. Ready to knit or publish on RPubs.")
## All analysis steps completed successfully. Ready to knit or publish on RPubs.

Appendix: Troubleshooting Tips


End of Report