Semester: Fall 2025
Group members: Sooyeon An, Leo Chow Bello, Andrea Guaderrama,
Raphael Mondragon, Diana Erazo
1. Setup and Libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
library(dplyr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
theme_set(theme_minimal(base_size = 13))
2. Import Data and Print
canada_covid19_dataset <- read.csv("C:/Users/monse/OneDrive/Escritorio/R/canada_covid19_dataset.csv")
Covid19_db <- canada_covid19_dataset
str(Covid19_db)
## 'data.frame': 3630 obs. of 23 variables:
## $ pruid : int 59 48 47 46 35 24 10 13 12 11 ...
## $ prname : chr "British Columbia" "Alberta" "Saskatchewan" "Manitoba" ...
## $ prnameFR : chr "Colombie-Britannique" "Alberta" "Saskatchewan" "Manitoba" ...
## $ date : chr "08-02-2020" "08-02-2020" "08-02-2020" "08-02-2020" ...
## $ reporting_week : int 6 6 6 6 6 6 6 6 6 6 ...
## $ reporting_year : int 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
## $ update : num 1 1 1 1 1 1 1 1 1 1 ...
## $ totalcases : chr "4" "0" "0" "0" ...
## $ numtotal_last7 : chr "3" "0" "0" "0" ...
## $ ratecases_total : chr "0.07" "0" "0" "0" ...
## $ numdeaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ numdeaths_last7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ratedeaths : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ratecases_last7 : chr "0.05" "0" "0" "0" ...
## $ ratedeaths_last7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ numtotal_last14 : chr "4" "0" "0" "0" ...
## $ numdeaths_last14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ratetotal_last14 : chr "0.07" "0" "0" "0" ...
## $ ratedeaths_last14 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ avgcases_last7 : chr "0.43" "0" "0" "0" ...
## $ avgincidence_last7 : chr "0.01" "0" "0" "0" ...
## $ avgdeaths_last7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ avgratedeaths_last7: num 0 0 0 0 0 0 0 0 0 0 ...
3. List the Variables
colnames(Covid19_db)
## [1] "pruid" "prname" "prnameFR"
## [4] "date" "reporting_week" "reporting_year"
## [7] "update" "totalcases" "numtotal_last7"
## [10] "ratecases_total" "numdeaths" "numdeaths_last7"
## [13] "ratedeaths" "ratecases_last7" "ratedeaths_last7"
## [16] "numtotal_last14" "numdeaths_last14" "ratetotal_last14"
## [19] "ratedeaths_last14" "avgcases_last7" "avgincidence_last7"
## [22] "avgdeaths_last7" "avgratedeaths_last7"
4. Print the Top 15 Rows
head(Covid19_db,15)
## pruid prname prnameFR date
## 1 59 British Columbia Colombie-Britannique 08-02-2020
## 2 48 Alberta Alberta 08-02-2020
## 3 47 Saskatchewan Saskatchewan 08-02-2020
## 4 46 Manitoba Manitoba 08-02-2020
## 5 35 Ontario Ontario 08-02-2020
## 6 24 Quebec Québec 08-02-2020
## 7 10 Newfoundland and Labrador Terre-Neuve-et-Labrador 08-02-2020
## 8 13 New Brunswick Nouveau-Brunswick 08-02-2020
## 9 12 Nova Scotia Nouvelle-Écosse 08-02-2020
## 10 11 Prince Edward Island Île-du-Prince-Édouard 08-02-2020
## 11 60 Yukon Yukon 08-02-2020
## 12 61 Northwest Territories Territoires du Nord-Ouest 08-02-2020
## 13 62 Nunavut Nunavut 08-02-2020
## 14 99 Repatriated travellers Voyageurs rapatriés 08-02-2020
## 15 1 Canada Canada 08-02-2020
## reporting_week reporting_year update totalcases numtotal_last7
## 1 6 2020 1 4 3
## 2 6 2020 1 0 0
## 3 6 2020 1 0 0
## 4 6 2020 1 0 0
## 5 6 2020 1 4 1
## 6 6 2020 1 0 0
## 7 6 2020 1 0 0
## 8 6 2020 1 0 0
## 9 6 2020 1 0 0
## 10 6 2020 1 0 0
## 11 6 2020 1 0 0
## 12 6 2020 1 0 0
## 13 6 2020 1 0 0
## 14 6 2020 NA 0 0
## 15 6 2020 NA 8 4
## ratecases_total numdeaths numdeaths_last7 ratedeaths ratecases_last7
## 1 0.07 0 0 0 0.05
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0.03 0 0 0 0.01
## 6 0 0 0 0 0
## 7 0 0 0 0 0
## 8 0 0 0 0 0
## 9 0 0 0 0 0
## 10 0 0 0 0 0
## 11 0 0 0 0 0
## 12 0 0 0 0 0
## 13 0 0 0 0 0
## 14 0 0 NA
## 15 0.02 0 0 0 0.01
## ratedeaths_last7 numtotal_last14 numdeaths_last14 ratetotal_last14
## 1 0 4 0 0.07
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 1 0 0.01
## 6 0 0 0 0
## 7 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
## 10 0 0 0 0
## 11 0 0 0 0
## 12 0 0 0 0
## 13 0 0 0 0
## 14 NA 0 0
## 15 0 5 0 0.01
## ratedeaths_last14 avgcases_last7 avgincidence_last7 avgdeaths_last7
## 1 0 0.43 0.01 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0.14 0 0
## 6 0 0 0 0
## 7 0 0 0 0
## 8 0 0 0 0
## 9 0 0 0 0
## 10 0 0 0 0
## 11 0 0 0 0
## 12 0 0 0 0
## 13 0 0 0 0
## 14 NA 0 0
## 15 0 0.57 0 0
## avgratedeaths_last7
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## 11 0
## 12 0
## 13 0
## 14 NA
## 15 0
5. User-Defined Function
avgDeathsPerCase <- function(dataset) {
last_available_data <- dataset %>%
filter(prname == "Canada", totalcases != "-", numdeaths != "-") %>%
select(totalcases, numdeaths) %>%
tail(1)
casetotal <- suppressWarnings(as.numeric(last_available_data$totalcases))
deathtotal <- suppressWarnings(as.numeric(last_available_data$numdeaths))
if (is.na(casetotal) || is.na(deathtotal) || casetotal == 0) return(NA)
average <- deathtotal / casetotal
return(average)
}
avgDeathsPerCase(Covid19_db)
## [1] 0.01201389
6. Filter Data (Ontario Example)
ontario_data <- Covid19_db %>% filter(prname == "Ontario")
head(ontario_data)
## pruid prname prnameFR date reporting_week reporting_year update
## 1 35 Ontario Ontario 08-02-2020 6 2020 1
## 2 35 Ontario Ontario 15-02-2020 7 2020 1
## 3 35 Ontario Ontario 22-02-2020 8 2020 1
## 4 35 Ontario Ontario 29-02-2020 9 2020 1
## 5 35 Ontario Ontario 07-03-2020 10 2020 1
## 6 35 Ontario Ontario 14-03-2020 11 2020 1
## totalcases numtotal_last7 ratecases_total numdeaths numdeaths_last7
## 1 4 1 0.03 0 0
## 2 4 0 0.03 0 0
## 3 5 1 0.03 0 0
## 4 18 13 0.12 0 0
## 5 33 15 0.21 0 0
## 6 181 148 1.16 1 1
## ratedeaths ratecases_last7 ratedeaths_last7 numtotal_last14 numdeaths_last14
## 1 0.00 0.01 0.00 1 0
## 2 0.00 0 0.00 1 0
## 3 0.00 0.01 0.00 1 0
## 4 0.00 0.08 0.00 14 0
## 5 0.00 0.1 0.00 28 0
## 6 0.01 0.95 0.01 163 1
## ratetotal_last14 ratedeaths_last14 avgcases_last7 avgincidence_last7
## 1 0.01 0.00 0.14 0
## 2 0.01 0.00 0 0
## 3 0.01 0.00 0.14 0
## 4 0.09 0.00 1.86 0.01
## 5 0.18 0.00 2.14 0.01
## 6 1.04 0.01 21.14 0.14
## avgdeaths_last7 avgratedeaths_last7
## 1 0.00 0
## 2 0.00 0
## 3 0.00 0
## 4 0.00 0
## 5 0.00 0
## 6 0.14 0
7. Identify Independent and Dependent Variables
independent_var <- Covid19_db[, c("prname", "date", "reporting_year")]
dependent_var <- Covid19_db[, "numdeaths", drop = FALSE]
reshaped_data <- cbind(independent_var, dependent_var)
head(reshaped_data)
## prname date reporting_year numdeaths
## 1 British Columbia 08-02-2020 2020 0
## 2 Alberta 08-02-2020 2020 0
## 3 Saskatchewan 08-02-2020 2020 0
## 4 Manitoba 08-02-2020 2020 0
## 5 Ontario 08-02-2020 2020 0
## 6 Quebec 08-02-2020 2020 0
8. Remove Missing Values
Covid19_db <- na.omit(Covid19_db)
9. Remove Duplicates
sum(duplicated(Covid19_db))
## [1] 0
Covid19_db <- Covid19_db %>% distinct()
10. Reorder Rows (Descending by Date)
Covid19_ordered <- Covid19_db %>% arrange(desc(date))
head(Covid19_ordered)
## pruid prname prnameFR date reporting_week
## 1 59 British Columbia Colombie-Britannique 31-12-2022 52
## 2 48 Alberta Alberta 31-12-2022 52
## 3 47 Saskatchewan Saskatchewan 31-12-2022 52
## 4 46 Manitoba Manitoba 31-12-2022 52
## 5 35 Ontario Ontario 31-12-2022 52
## 6 24 Quebec Québec 31-12-2022 52
## reporting_year update totalcases numtotal_last7 ratecases_total numdeaths
## 1 2022 1 393145 692 7123.47 4896
## 2 2022 1 623991 870 13289.72 5421
## 3 2022 1 151570 302 12535.7 1822
## 4 2022 1 153784 134 10570.06 2369
## 5 2022 1 1550579 6635 9934.28 16189
## 6 2022 1 1285181 5987 14481.43 17314
## numdeaths_last7 ratedeaths ratecases_last7 ratedeaths_last7 numtotal_last14
## 1 13 88.71 12.54 0.24 1248
## 2 26 115.46 18.53 0.55 1693
## 3 15 150.69 24.98 1.24 628
## 4 0 162.83 9.21 0.00 277
## 5 71 103.72 42.51 0.45 13129
## 6 109 195.09 67.46 1.23 13103
## numdeaths_last14 ratetotal_last14 ratedeaths_last14 avgcases_last7
## 1 90 22.61 1.63 98.86
## 2 55 36.06 1.17 124.29
## 3 22 51.94 1.82 43.14
## 4 38 19.04 2.61 19.14
## 5 158 84.12 1.01 947.86
## 6 218 147.64 2.46 855.29
## avgincidence_last7 avgdeaths_last7 avgratedeaths_last7
## 1 1.79 1.86 0.03
## 2 2.65 3.71 0.08
## 3 3.57 2.14 0.18
## 4 1.32 0.00 0.00
## 5 6.07 10.14 0.06
## 6 9.64 15.57 0.18
11. Rename Columns
Covid19_db <- Covid19_db %>% rename(
Province = prname,
Year = reporting_year,
Total_Cases = totalcases,
Deaths = numdeaths)
colnames(Covid19_db)
## [1] "pruid" "Province" "prnameFR"
## [4] "date" "reporting_week" "Year"
## [7] "update" "Total_Cases" "numtotal_last7"
## [10] "ratecases_total" "Deaths" "numdeaths_last7"
## [13] "ratedeaths" "ratecases_last7" "ratedeaths_last7"
## [16] "numtotal_last14" "numdeaths_last14" "ratetotal_last14"
## [19] "ratedeaths_last14" "avgcases_last7" "avgincidence_last7"
## [22] "avgdeaths_last7" "avgratedeaths_last7"
12. Add New Variables
new_data <- Covid19_db %>% mutate(
Total_Cases = as.numeric(ifelse(Total_Cases == "-", NA, Total_Cases)),
Deaths = as.numeric(ifelse(Deaths == "-", NA, Deaths)),
DeathsPerCase = Deaths / Total_Cases,
PercentDeaths = (Deaths / Total_Cases) * 100)
head(new_data)
## pruid Province prnameFR date reporting_week Year
## 1 59 British Columbia Colombie-Britannique 08-02-2020 6 2020
## 2 48 Alberta Alberta 08-02-2020 6 2020
## 3 47 Saskatchewan Saskatchewan 08-02-2020 6 2020
## 4 46 Manitoba Manitoba 08-02-2020 6 2020
## 5 35 Ontario Ontario 08-02-2020 6 2020
## 6 24 Quebec Québec 08-02-2020 6 2020
## update Total_Cases numtotal_last7 ratecases_total Deaths numdeaths_last7
## 1 1 4 3 0.07 0 0
## 2 1 0 0 0 0 0
## 3 1 0 0 0 0 0
## 4 1 0 0 0 0 0
## 5 1 4 1 0.03 0 0
## 6 1 0 0 0 0 0
## ratedeaths ratecases_last7 ratedeaths_last7 numtotal_last14 numdeaths_last14
## 1 0 0.05 0 4 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0.01 0 1 0
## 6 0 0 0 0 0
## ratetotal_last14 ratedeaths_last14 avgcases_last7 avgincidence_last7
## 1 0.07 0 0.43 0.01
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0.01 0 0.14 0
## 6 0 0 0 0
## avgdeaths_last7 avgratedeaths_last7 DeathsPerCase PercentDeaths
## 1 0 0 0 0
## 2 0 0 NaN NaN
## 3 0 0 NaN NaN
## 4 0 0 NaN NaN
## 5 0 0 0 0
## 6 0 0 NaN NaN
13. Create a Training Set
set.seed(1234)
training_set <- new_data %>% sample_frac(0.75, replace = FALSE)
head(training_set)
## pruid Province prnameFR date
## 1 47 Saskatchewan Saskatchewan 31-07-2021
## 2 61 Northwest Territories Territoires du Nord-Ouest 02-01-2021
## 3 60 Yukon Yukon 19-06-2021
## 4 11 Prince Edward Island Île-du-Prince-Édouard 05-09-2020
## 5 24 Quebec Québec 19-08-2023
## 6 46 Manitoba Manitoba 09-07-2022
## reporting_week Year update Total_Cases numtotal_last7 ratecases_total Deaths
## 1 30 2021 1 50024 368 4137.27 578
## 2 53 2020 1 24 0 53.37 0
## 3 24 2021 1 159 59 353.53 3
## 4 36 2020 1 49 5 28.2 0
## 5 33 2023 1 1356849 1669 15288.99 18468
## 6 27 2022 1 145853 230 10024.94 2053
## numdeaths_last7 ratedeaths ratecases_last7 ratedeaths_last7 numtotal_last14
## 1 0 47.80 30.44 0.00 627
## 2 0 0.00 0 0.00 1
## 3 1 6.67 131.18 2.22 74
## 4 0 0.00 2.88 0.00 5
## 5 20 208.10 18.81 0.23 3030
## 6 6 141.11 15.81 0.41 355
## numdeaths_last14 ratetotal_last14 ratedeaths_last14 avgcases_last7
## 1 3 51.86 0.25 52.57
## 2 0 2.22 0.00 0
## 3 1 164.54 2.22 8.43
## 4 0 2.88 0.00 0.73
## 5 34 34.14 0.38 238.43
## 6 10 24.4 0.69 32.86
## avgincidence_last7 avgdeaths_last7 avgratedeaths_last7 DeathsPerCase
## 1 4.35 0.00 0.00 0.01155445
## 2 0 0.00 0.00 0.00000000
## 3 18.74 0.14 0.32 0.01886792
## 4 0.42 0.00 0.00 0.00000000
## 5 2.69 2.86 0.03 0.01361095
## 6 2.26 0.86 0.06 0.01407582
## PercentDeaths
## 1 1.155445
## 2 0.000000
## 3 1.886792
## 4 0.000000
## 5 1.361095
## 6 1.407582
14. Summary Statistics
summary(new_data)
## pruid Province prnameFR date
## Min. :10.00 Length:2597 Length:2597 Length:2597
## 1st Qu.:13.00 Class :character Class :character Class :character
## Median :46.00 Mode :character Mode :character Mode :character
## Mean :35.77
## 3rd Qu.:48.00
## Max. :62.00
##
## reporting_week Year update Total_Cases
## Min. : 1.00 Min. :2020 Min. :1 Min. : 0
## 1st Qu.:13.00 1st Qu.:2021 1st Qu.:1 1st Qu.: 1141
## Median :25.00 Median :2021 Median :1 Median : 56441
## Mean :25.86 Mean :2022 Mean :1 Mean : 246430
## 3rd Qu.:38.00 3rd Qu.:2023 3rd Qu.:1 3rd Qu.: 240220
## Max. :53.00 Max. :2024 Max. :1 Max. :1719315
## NA's :112
## numtotal_last7 ratecases_total Deaths numdeaths_last7
## Length:2597 Length:2597 Min. : 0 Min. : -1.00
## Class :character Class :character 1st Qu.: 14 1st Qu.: 0.00
## Mode :character Mode :character Median : 586 Median : 4.00
## Mean : 3343 Mean : 23.34
## 3rd Qu.: 4311 3rd Qu.: 22.00
## Max. :20553 Max. :838.00
##
## ratedeaths ratecases_last7 ratedeaths_last7 numtotal_last14
## Min. : 0.00 Length:2597 Min. :-0.0800 Length:2597
## 1st Qu.: 4.31 Class :character 1st Qu.: 0.0000 Class :character
## Median : 50.32 Mode :character Median : 0.2400 Mode :character
## Mean : 63.53 Mean : 0.5628
## 3rd Qu.:111.50 3rd Qu.: 0.7400
## Max. :231.59 Max. :11.2600
##
## numdeaths_last14 ratetotal_last14 ratedeaths_last14 avgcases_last7
## Min. : -1.00 Length:2597 Min. :-0.080 Length:2597
## 1st Qu.: 0.00 Class :character 1st Qu.: 0.000 Class :character
## Median : 10.00 Mode :character Median : 0.560 Mode :character
## Mean : 46.61 Mean : 1.118
## 3rd Qu.: 45.00 3rd Qu.: 1.550
## Max. :1587.00 Max. :17.880
##
## avgincidence_last7 avgdeaths_last7 avgratedeaths_last7 DeathsPerCase
## Length:2597 Min. : -0.140 Min. :-0.01000 Min. :0.000000
## Class :character 1st Qu.: 0.000 1st Qu.: 0.00000 1st Qu.:0.006031
## Mode :character Median : 0.590 Median : 0.04000 Median :0.010673
## Mean : 3.335 Mean : 0.08036 Mean :0.013587
## 3rd Qu.: 3.140 3rd Qu.: 0.11000 3rd Qu.:0.014609
## Max. :119.710 Max. : 1.61000 Max. :0.142857
## NA's :202
## PercentDeaths
## Min. : 0.0000
## 1st Qu.: 0.6031
## Median : 1.0673
## Mean : 1.3587
## 3rd Qu.: 1.4609
## Max. :14.2857
## NA's :202
15. Statistical Calculations
mean_total_cases <- mean(new_data$Total_Cases, na.rm = TRUE)
median_total_cases <- median(new_data$Total_Cases, na.rm = TRUE)
mode_total_cases <- new_data %>% filter(!is.na(Total_Cases)) %>%
count(Total_Cases, sort = TRUE) %>% slice(1) %>% pull(Total_Cases)
range_total_cases <- range(new_data$Total_Cases, na.rm = TRUE)
cat("Mean:", mean_total_cases, "\n", "Median:",
median_total_cases, "\n", "Mode:",
mode_total_cases, "\n", "Range:", range_total_cases, "\n")
## Mean: 246430.4
## Median: 56441
## Mode: 0
## Range: 0 1719315
16. Scatter Plot (Total Cases vs Deaths)
# Convert to numeric (clean data)
Covid19_db$Total_Cases <- as.numeric(ifelse(Covid19_db$Total_Cases == "-", NA, Covid19_db$Total_Cases))
Covid19_db$Deaths <- as.numeric(ifelse(Covid19_db$Deaths == "-", NA, Covid19_db$Deaths))
# Create Scatter Plot
ggplot(Covid19_db, aes(x = Total_Cases, y = Deaths)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, linewidth = 0.8, color = "darkred") +
scale_x_continuous(labels = label_number(scale = 1e-6, suffix = " M")) +
scale_y_continuous(labels = label_number(scale = 1e-3, suffix = " K")) +
labs(
title = "Scatter Plot: Total Cases vs Total Deaths",
x = "Total Cases (Millions)",
y = "Total Deaths (Thousands)")

17. Bar Plot (Total COVID-19 Cases by Province)
# Prepare data for the bar plot
plot_df <- Covid19_db %>%
filter(Province != "Canada") %>%
group_by(Province) %>%
summarise(Total_Cases = max(Total_Cases, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(Total_Cases))
# Create the bar plot
library(viridis)
ggplot(plot_df, aes(x = reorder(Province, -Total_Cases), y = Total_Cases, fill = Province)) +
geom_col(width = 0.7, show.legend = TRUE) +
scale_y_continuous(labels = label_number(scale = 1e-6, suffix = " M")) +
scale_fill_viridis_d(option = "turbo") +
labs(
title = "Total COVID-19 Cases by Province in Canada",
subtitle = "Cumulative totals (2020–2024)",
x = "Province",
y = "Total Reported Cases (Millions)",
fill = "Province") +
theme(
axis.text.x = element_text(angle = 35, hjust = 1),
plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))

18. Correlation (Pearson)
correlation_value <- cor(Covid19_db$Total_Cases,
Covid19_db$Deaths, method = "pearson", use = "complete.obs")
cat("Pearson correlation between Total Cases and Deaths:",
correlation_value, "\n")
## Pearson correlation between Total Cases and Deaths: 0.9601831
19. Final Check
cat("All analysis steps completed successfully. Ready to knit or publish on RPubs.")
## All analysis steps completed successfully. Ready to knit or publish on RPubs.