Semester: Fall 2025

Group members: Sooyeon An, Leo Chow Bello, Andrea Guaderrama, Raphael Mondragon, Diana Erazo


1. Setup and Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
library(dplyr)
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
theme_set(theme_minimal(base_size = 13))

2. Import Data and Print

canada_covid19_dataset <- read.csv("C:/Users/monse/OneDrive/Escritorio/R/canada_covid19_dataset.csv")
Covid19_db <- canada_covid19_dataset
str(Covid19_db)
## 'data.frame':    3630 obs. of  23 variables:
##  $ pruid              : int  59 48 47 46 35 24 10 13 12 11 ...
##  $ prname             : chr  "British Columbia" "Alberta" "Saskatchewan" "Manitoba" ...
##  $ prnameFR           : chr  "Colombie-Britannique" "Alberta" "Saskatchewan" "Manitoba" ...
##  $ date               : chr  "08-02-2020" "08-02-2020" "08-02-2020" "08-02-2020" ...
##  $ reporting_week     : int  6 6 6 6 6 6 6 6 6 6 ...
##  $ reporting_year     : int  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ update             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ totalcases         : chr  "4" "0" "0" "0" ...
##  $ numtotal_last7     : chr  "3" "0" "0" "0" ...
##  $ ratecases_total    : chr  "0.07" "0" "0" "0" ...
##  $ numdeaths          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ numdeaths_last7    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ratedeaths         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ratecases_last7    : chr  "0.05" "0" "0" "0" ...
##  $ ratedeaths_last7   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ numtotal_last14    : chr  "4" "0" "0" "0" ...
##  $ numdeaths_last14   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ ratetotal_last14   : chr  "0.07" "0" "0" "0" ...
##  $ ratedeaths_last14  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ avgcases_last7     : chr  "0.43" "0" "0" "0" ...
##  $ avgincidence_last7 : chr  "0.01" "0" "0" "0" ...
##  $ avgdeaths_last7    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ avgratedeaths_last7: num  0 0 0 0 0 0 0 0 0 0 ...

3. List the Variables

colnames(Covid19_db)
##  [1] "pruid"               "prname"              "prnameFR"           
##  [4] "date"                "reporting_week"      "reporting_year"     
##  [7] "update"              "totalcases"          "numtotal_last7"     
## [10] "ratecases_total"     "numdeaths"           "numdeaths_last7"    
## [13] "ratedeaths"          "ratecases_last7"     "ratedeaths_last7"   
## [16] "numtotal_last14"     "numdeaths_last14"    "ratetotal_last14"   
## [19] "ratedeaths_last14"   "avgcases_last7"      "avgincidence_last7" 
## [22] "avgdeaths_last7"     "avgratedeaths_last7"

5. User-Defined Function

avgDeathsPerCase <- function(dataset) {
last_available_data <- dataset %>%
filter(prname == "Canada", totalcases != "-", numdeaths != "-") %>%
select(totalcases, numdeaths) %>%
tail(1)
casetotal <- suppressWarnings(as.numeric(last_available_data$totalcases))
deathtotal <- suppressWarnings(as.numeric(last_available_data$numdeaths))
if (is.na(casetotal) || is.na(deathtotal) || casetotal == 0) return(NA)
average <- deathtotal / casetotal
return(average)
}
avgDeathsPerCase(Covid19_db)
## [1] 0.01201389

6. Filter Data (Ontario Example)

ontario_data <- Covid19_db %>% filter(prname == "Ontario")
head(ontario_data)
##   pruid  prname prnameFR       date reporting_week reporting_year update
## 1    35 Ontario  Ontario 08-02-2020              6           2020      1
## 2    35 Ontario  Ontario 15-02-2020              7           2020      1
## 3    35 Ontario  Ontario 22-02-2020              8           2020      1
## 4    35 Ontario  Ontario 29-02-2020              9           2020      1
## 5    35 Ontario  Ontario 07-03-2020             10           2020      1
## 6    35 Ontario  Ontario 14-03-2020             11           2020      1
##   totalcases numtotal_last7 ratecases_total numdeaths numdeaths_last7
## 1          4              1            0.03         0               0
## 2          4              0            0.03         0               0
## 3          5              1            0.03         0               0
## 4         18             13            0.12         0               0
## 5         33             15            0.21         0               0
## 6        181            148            1.16         1               1
##   ratedeaths ratecases_last7 ratedeaths_last7 numtotal_last14 numdeaths_last14
## 1       0.00            0.01             0.00               1                0
## 2       0.00               0             0.00               1                0
## 3       0.00            0.01             0.00               1                0
## 4       0.00            0.08             0.00              14                0
## 5       0.00             0.1             0.00              28                0
## 6       0.01            0.95             0.01             163                1
##   ratetotal_last14 ratedeaths_last14 avgcases_last7 avgincidence_last7
## 1             0.01              0.00           0.14                  0
## 2             0.01              0.00              0                  0
## 3             0.01              0.00           0.14                  0
## 4             0.09              0.00           1.86               0.01
## 5             0.18              0.00           2.14               0.01
## 6             1.04              0.01          21.14               0.14
##   avgdeaths_last7 avgratedeaths_last7
## 1            0.00                   0
## 2            0.00                   0
## 3            0.00                   0
## 4            0.00                   0
## 5            0.00                   0
## 6            0.14                   0

7. Identify Independent and Dependent Variables

independent_var <- Covid19_db[, c("prname", "date", "reporting_year")]
dependent_var <- Covid19_db[, "numdeaths", drop = FALSE]
reshaped_data <- cbind(independent_var, dependent_var)
head(reshaped_data)
##             prname       date reporting_year numdeaths
## 1 British Columbia 08-02-2020           2020         0
## 2          Alberta 08-02-2020           2020         0
## 3     Saskatchewan 08-02-2020           2020         0
## 4         Manitoba 08-02-2020           2020         0
## 5          Ontario 08-02-2020           2020         0
## 6           Quebec 08-02-2020           2020         0

8. Remove Missing Values

Covid19_db <- na.omit(Covid19_db)

9. Remove Duplicates

sum(duplicated(Covid19_db))
## [1] 0
Covid19_db <- Covid19_db %>% distinct()

10. Reorder Rows (Descending by Date)

Covid19_ordered <- Covid19_db %>% arrange(desc(date))
head(Covid19_ordered)
##   pruid           prname             prnameFR       date reporting_week
## 1    59 British Columbia Colombie-Britannique 31-12-2022             52
## 2    48          Alberta              Alberta 31-12-2022             52
## 3    47     Saskatchewan         Saskatchewan 31-12-2022             52
## 4    46         Manitoba             Manitoba 31-12-2022             52
## 5    35          Ontario              Ontario 31-12-2022             52
## 6    24           Quebec               Québec 31-12-2022             52
##   reporting_year update totalcases numtotal_last7 ratecases_total numdeaths
## 1           2022      1     393145            692         7123.47      4896
## 2           2022      1     623991            870        13289.72      5421
## 3           2022      1     151570            302         12535.7      1822
## 4           2022      1     153784            134        10570.06      2369
## 5           2022      1    1550579           6635         9934.28     16189
## 6           2022      1    1285181           5987        14481.43     17314
##   numdeaths_last7 ratedeaths ratecases_last7 ratedeaths_last7 numtotal_last14
## 1              13      88.71           12.54             0.24            1248
## 2              26     115.46           18.53             0.55            1693
## 3              15     150.69           24.98             1.24             628
## 4               0     162.83            9.21             0.00             277
## 5              71     103.72           42.51             0.45           13129
## 6             109     195.09           67.46             1.23           13103
##   numdeaths_last14 ratetotal_last14 ratedeaths_last14 avgcases_last7
## 1               90            22.61              1.63          98.86
## 2               55            36.06              1.17         124.29
## 3               22            51.94              1.82          43.14
## 4               38            19.04              2.61          19.14
## 5              158            84.12              1.01         947.86
## 6              218           147.64              2.46         855.29
##   avgincidence_last7 avgdeaths_last7 avgratedeaths_last7
## 1               1.79            1.86                0.03
## 2               2.65            3.71                0.08
## 3               3.57            2.14                0.18
## 4               1.32            0.00                0.00
## 5               6.07           10.14                0.06
## 6               9.64           15.57                0.18

11. Rename Columns

Covid19_db <- Covid19_db %>% rename(
Province = prname,
Year = reporting_year,
Total_Cases = totalcases,
Deaths = numdeaths)
colnames(Covid19_db)
##  [1] "pruid"               "Province"            "prnameFR"           
##  [4] "date"                "reporting_week"      "Year"               
##  [7] "update"              "Total_Cases"         "numtotal_last7"     
## [10] "ratecases_total"     "Deaths"              "numdeaths_last7"    
## [13] "ratedeaths"          "ratecases_last7"     "ratedeaths_last7"   
## [16] "numtotal_last14"     "numdeaths_last14"    "ratetotal_last14"   
## [19] "ratedeaths_last14"   "avgcases_last7"      "avgincidence_last7" 
## [22] "avgdeaths_last7"     "avgratedeaths_last7"

12. Add New Variables

new_data <- Covid19_db %>% mutate(
Total_Cases = as.numeric(ifelse(Total_Cases == "-", NA, Total_Cases)),
Deaths = as.numeric(ifelse(Deaths == "-", NA, Deaths)),
DeathsPerCase = Deaths / Total_Cases,
PercentDeaths = (Deaths / Total_Cases) * 100)
head(new_data)
##   pruid         Province             prnameFR       date reporting_week Year
## 1    59 British Columbia Colombie-Britannique 08-02-2020              6 2020
## 2    48          Alberta              Alberta 08-02-2020              6 2020
## 3    47     Saskatchewan         Saskatchewan 08-02-2020              6 2020
## 4    46         Manitoba             Manitoba 08-02-2020              6 2020
## 5    35          Ontario              Ontario 08-02-2020              6 2020
## 6    24           Quebec               Québec 08-02-2020              6 2020
##   update Total_Cases numtotal_last7 ratecases_total Deaths numdeaths_last7
## 1      1           4              3            0.07      0               0
## 2      1           0              0               0      0               0
## 3      1           0              0               0      0               0
## 4      1           0              0               0      0               0
## 5      1           4              1            0.03      0               0
## 6      1           0              0               0      0               0
##   ratedeaths ratecases_last7 ratedeaths_last7 numtotal_last14 numdeaths_last14
## 1          0            0.05                0               4                0
## 2          0               0                0               0                0
## 3          0               0                0               0                0
## 4          0               0                0               0                0
## 5          0            0.01                0               1                0
## 6          0               0                0               0                0
##   ratetotal_last14 ratedeaths_last14 avgcases_last7 avgincidence_last7
## 1             0.07                 0           0.43               0.01
## 2                0                 0              0                  0
## 3                0                 0              0                  0
## 4                0                 0              0                  0
## 5             0.01                 0           0.14                  0
## 6                0                 0              0                  0
##   avgdeaths_last7 avgratedeaths_last7 DeathsPerCase PercentDeaths
## 1               0                   0             0             0
## 2               0                   0           NaN           NaN
## 3               0                   0           NaN           NaN
## 4               0                   0           NaN           NaN
## 5               0                   0             0             0
## 6               0                   0           NaN           NaN

13. Create a Training Set

set.seed(1234)
training_set <- new_data %>% sample_frac(0.75, replace = FALSE)
head(training_set)
##   pruid              Province                  prnameFR       date
## 1    47          Saskatchewan              Saskatchewan 31-07-2021
## 2    61 Northwest Territories Territoires du Nord-Ouest 02-01-2021
## 3    60                 Yukon                     Yukon 19-06-2021
## 4    11  Prince Edward Island     Île-du-Prince-Édouard 05-09-2020
## 5    24                Quebec                    Québec 19-08-2023
## 6    46              Manitoba                  Manitoba 09-07-2022
##   reporting_week Year update Total_Cases numtotal_last7 ratecases_total Deaths
## 1             30 2021      1       50024            368         4137.27    578
## 2             53 2020      1          24              0           53.37      0
## 3             24 2021      1         159             59          353.53      3
## 4             36 2020      1          49              5            28.2      0
## 5             33 2023      1     1356849           1669        15288.99  18468
## 6             27 2022      1      145853            230        10024.94   2053
##   numdeaths_last7 ratedeaths ratecases_last7 ratedeaths_last7 numtotal_last14
## 1               0      47.80           30.44             0.00             627
## 2               0       0.00               0             0.00               1
## 3               1       6.67          131.18             2.22              74
## 4               0       0.00            2.88             0.00               5
## 5              20     208.10           18.81             0.23            3030
## 6               6     141.11           15.81             0.41             355
##   numdeaths_last14 ratetotal_last14 ratedeaths_last14 avgcases_last7
## 1                3            51.86              0.25          52.57
## 2                0             2.22              0.00              0
## 3                1           164.54              2.22           8.43
## 4                0             2.88              0.00           0.73
## 5               34            34.14              0.38         238.43
## 6               10             24.4              0.69          32.86
##   avgincidence_last7 avgdeaths_last7 avgratedeaths_last7 DeathsPerCase
## 1               4.35            0.00                0.00    0.01155445
## 2                  0            0.00                0.00    0.00000000
## 3              18.74            0.14                0.32    0.01886792
## 4               0.42            0.00                0.00    0.00000000
## 5               2.69            2.86                0.03    0.01361095
## 6               2.26            0.86                0.06    0.01407582
##   PercentDeaths
## 1      1.155445
## 2      0.000000
## 3      1.886792
## 4      0.000000
## 5      1.361095
## 6      1.407582

14. Summary Statistics

summary(new_data)
##      pruid         Province           prnameFR             date          
##  Min.   :10.00   Length:2597        Length:2597        Length:2597       
##  1st Qu.:13.00   Class :character   Class :character   Class :character  
##  Median :46.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :35.77                                                           
##  3rd Qu.:48.00                                                           
##  Max.   :62.00                                                           
##                                                                          
##  reporting_week       Year          update   Total_Cases     
##  Min.   : 1.00   Min.   :2020   Min.   :1   Min.   :      0  
##  1st Qu.:13.00   1st Qu.:2021   1st Qu.:1   1st Qu.:   1141  
##  Median :25.00   Median :2021   Median :1   Median :  56441  
##  Mean   :25.86   Mean   :2022   Mean   :1   Mean   : 246430  
##  3rd Qu.:38.00   3rd Qu.:2023   3rd Qu.:1   3rd Qu.: 240220  
##  Max.   :53.00   Max.   :2024   Max.   :1   Max.   :1719315  
##                                             NA's   :112      
##  numtotal_last7     ratecases_total        Deaths      numdeaths_last7 
##  Length:2597        Length:2597        Min.   :    0   Min.   : -1.00  
##  Class :character   Class :character   1st Qu.:   14   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  586   Median :  4.00  
##                                        Mean   : 3343   Mean   : 23.34  
##                                        3rd Qu.: 4311   3rd Qu.: 22.00  
##                                        Max.   :20553   Max.   :838.00  
##                                                                        
##    ratedeaths     ratecases_last7    ratedeaths_last7  numtotal_last14   
##  Min.   :  0.00   Length:2597        Min.   :-0.0800   Length:2597       
##  1st Qu.:  4.31   Class :character   1st Qu.: 0.0000   Class :character  
##  Median : 50.32   Mode  :character   Median : 0.2400   Mode  :character  
##  Mean   : 63.53                      Mean   : 0.5628                     
##  3rd Qu.:111.50                      3rd Qu.: 0.7400                     
##  Max.   :231.59                      Max.   :11.2600                     
##                                                                          
##  numdeaths_last14  ratetotal_last14   ratedeaths_last14 avgcases_last7    
##  Min.   :  -1.00   Length:2597        Min.   :-0.080    Length:2597       
##  1st Qu.:   0.00   Class :character   1st Qu.: 0.000    Class :character  
##  Median :  10.00   Mode  :character   Median : 0.560    Mode  :character  
##  Mean   :  46.61                      Mean   : 1.118                      
##  3rd Qu.:  45.00                      3rd Qu.: 1.550                      
##  Max.   :1587.00                      Max.   :17.880                      
##                                                                           
##  avgincidence_last7 avgdeaths_last7   avgratedeaths_last7 DeathsPerCase     
##  Length:2597        Min.   : -0.140   Min.   :-0.01000    Min.   :0.000000  
##  Class :character   1st Qu.:  0.000   1st Qu.: 0.00000    1st Qu.:0.006031  
##  Mode  :character   Median :  0.590   Median : 0.04000    Median :0.010673  
##                     Mean   :  3.335   Mean   : 0.08036    Mean   :0.013587  
##                     3rd Qu.:  3.140   3rd Qu.: 0.11000    3rd Qu.:0.014609  
##                     Max.   :119.710   Max.   : 1.61000    Max.   :0.142857  
##                                                           NA's   :202       
##  PercentDeaths    
##  Min.   : 0.0000  
##  1st Qu.: 0.6031  
##  Median : 1.0673  
##  Mean   : 1.3587  
##  3rd Qu.: 1.4609  
##  Max.   :14.2857  
##  NA's   :202

15. Statistical Calculations

mean_total_cases <- mean(new_data$Total_Cases, na.rm = TRUE)
median_total_cases <- median(new_data$Total_Cases, na.rm = TRUE)
mode_total_cases <- new_data %>% filter(!is.na(Total_Cases)) %>% 
count(Total_Cases, sort = TRUE) %>% slice(1) %>% pull(Total_Cases)
range_total_cases <- range(new_data$Total_Cases, na.rm = TRUE)
cat("Mean:", mean_total_cases, "\n", "Median:", 
median_total_cases, "\n", "Mode:", 
mode_total_cases, "\n", "Range:", range_total_cases, "\n")
## Mean: 246430.4 
##  Median: 56441 
##  Mode: 0 
##  Range: 0 1719315

16. Scatter Plot (Total Cases vs Deaths)

# Convert to numeric (clean data)
Covid19_db$Total_Cases <- as.numeric(ifelse(Covid19_db$Total_Cases == "-", NA, Covid19_db$Total_Cases))
Covid19_db$Deaths <- as.numeric(ifelse(Covid19_db$Deaths == "-", NA, Covid19_db$Deaths))

# Create Scatter Plot
ggplot(Covid19_db, aes(x = Total_Cases, y = Deaths)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, linewidth = 0.8, color = "darkred") +
scale_x_continuous(labels = label_number(scale = 1e-6, suffix = " M")) +
scale_y_continuous(labels = label_number(scale = 1e-3, suffix = " K")) +
labs(
title = "Scatter Plot: Total Cases vs Total Deaths",
x = "Total Cases (Millions)",
y = "Total Deaths (Thousands)")

17. Bar Plot (Total COVID-19 Cases by Province)

# Prepare data for the bar plot
plot_df <- Covid19_db %>%
filter(Province != "Canada") %>%
group_by(Province) %>%
summarise(Total_Cases = max(Total_Cases, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(Total_Cases))

# Create the bar plot
library(viridis)

ggplot(plot_df, aes(x = reorder(Province, -Total_Cases), y = Total_Cases, fill = Province)) +
geom_col(width = 0.7, show.legend = TRUE) +
scale_y_continuous(labels = label_number(scale = 1e-6, suffix = " M")) +
scale_fill_viridis_d(option = "turbo") +
labs(
title = "Total COVID-19 Cases by Province in Canada",
subtitle = "Cumulative totals (2020–2024)",
x = "Province",
y = "Total Reported Cases (Millions)",
fill = "Province") +
theme(
axis.text.x = element_text(angle = 35, hjust = 1),
plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))

18. Correlation (Pearson)

correlation_value <- cor(Covid19_db$Total_Cases, 
Covid19_db$Deaths, method = "pearson", use = "complete.obs")
cat("Pearson correlation between Total Cases and Deaths:", 
correlation_value, "\n")
## Pearson correlation between Total Cases and Deaths: 0.9601831

19. Final Check

cat("All analysis steps completed successfully. Ready to knit or publish on RPubs.")
## All analysis steps completed successfully. Ready to knit or publish on RPubs.