# Load necessary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
# Read the raw CSV, parse the `date` column into Date objects, and expose
# `excess_mortality` under the name `excess_deaths` used by the rest of the
# script. Nothing is calculated here: `excess_deaths` is a plain copy.
# NOTE(review): if `excess_mortality` is a percentage rather than a count,
# the "Excess Deaths" labels used in the plots are misleading -- confirm
# the units against the data source.
data <- read.csv("covid.csv") %>%
  mutate(
    date = as.Date(date),
    excess_deaths = excess_mortality
  )

# Count the rows whose excess_deaths value is NA and print the total
missing_values <- data[["excess_deaths"]] %>%
  is.na() %>%
  sum()
cat(
  "Number of missing values in excess_deaths column:",
  missing_values,
  "\n"
)
## Number of missing values in excess_deaths column: 360589
# Count the entries for which is.finite() is FALSE (NA, NaN, Inf, -Inf for a
# numeric column) and print the total
non_finite_values <- sum(!is.finite(data[["excess_deaths"]]))
cat(
  "Number of non-finite values in excess_deaths column:",
  non_finite_values,
  "\n"
)
## Number of non-finite values in excess_deaths column: 360589
# Print the quartiles, mean, and NA count of excess_deaths
data %>%
  pull(excess_deaths) %>%
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   -95.9    -1.5     5.6    11.0    15.7   377.7  360589
# Histogram of excess_deaths using 5-unit-wide bins. Rows with missing values
# are dropped by ggplot2 (with a warning) when the plot is rendered.
histogram <- data %>%
  ggplot(aes(x = excess_deaths)) +
  geom_histogram(
    binwidth = 5,
    fill = "skyblue",
    color = "black"
  ) +
  labs(
    title = "Distribution of Excess Deaths",
    x = "Excess Deaths",
    y = "Frequency"
  )

# Time series plot of total cases, one line per location.
#
# `group = location` keeps each location's series as its own path. Without it
# geom_line() joins every row into a single path ordered by date, so a table
# with several locations per date renders as a zig-zag between locations.
# NOTE(review): assumes `location` identifies one series per value; with a
# single location the grouping is a no-op.
#
# The previous version divided y by 1e6 and then set `scale = 1e6` on the
# labels, which cancels out. The raw counts are plotted directly instead, and
# the axis labels are unchanged (plain numbers, no scientific notation).
total_cases_plot <- ggplot(
  data,
  aes(x = date, y = total_cases, group = location)
) +
  geom_line() +
  scale_y_continuous(labels = scales::label_number()) +
  labs(
    title = "Total COVID Cases Over Time",
    x = "Date",
    y = "Total Cases"
  )

# Time series plot of total deaths, one line per location.
#
# `group = location` keeps each location's series as its own path. Without it
# geom_line() joins every row into a single path ordered by date, so a table
# with several locations per date renders as a zig-zag between locations.
# NOTE(review): assumes `location` identifies one series per value; with a
# single location the grouping is a no-op.
#
# The previous version divided y by 1e6 and then set `scale = 1e6` on the
# labels, which cancels out. The raw counts are plotted directly instead, and
# the axis labels are unchanged (plain numbers, no scientific notation).
total_deaths_plot <- ggplot(
  data,
  aes(x = date, y = total_deaths, group = location)
) +
  geom_line() +
  scale_y_continuous(labels = scales::label_number()) +
  labs(
    title = "Total COVID Deaths Over Time",
    x = "Date",
    y = "Total Deaths"
  )

# Keep only the rows where both aged_65_older and aged_70_older are present,
# then narrow the table to the location, the two age columns, the vaccination
# rate, and deaths per million.
# NOTE(review): age_groups is not referenced by any of the plots built in
# this part of the script -- confirm it is used elsewhere.
age_groups <- data %>%
  filter(
    !is.na(aged_65_older),
    !is.na(aged_70_older)
  ) %>%
  select(
    location,
    aged_65_older,
    aged_70_older,
    people_vaccinated_per_hundred,
    total_deaths_per_million
  )

# Time series plot of the icu_patients column, one line per location.
#
# `group = location` keeps each location's series as its own path; without it
# geom_line() joins rows from different locations into a single zig-zag path.
# NOTE(review): no age filter is applied here -- every row of `data` is
# plotted -- so the "Aged 65 and Older" wording in the title is only correct
# if icu_patients is itself age-restricted. Confirm, or retitle the plot.
icu_patients_plot <- ggplot(
  data,
  aes(x = date, y = icu_patients, group = location)
) +
  geom_line() +
  labs(
    title = "ICU Patients Aged 65 and Older Over Time",
    x = "Date",
    y = "ICU Patients"
  )

# Time series plot of the hosp_patients column, one line per location.
#
# `group = location` keeps each location's series as its own path; without it
# geom_line() joins rows from different locations into a single zig-zag path.
# NOTE(review): no age filter is applied here -- every row of `data` is
# plotted -- so the "Aged 65 and Older" wording in the title is only correct
# if hosp_patients is itself age-restricted. Confirm, or retitle the plot.
hosp_patients_plot <- ggplot(
  data,
  aes(x = date, y = hosp_patients, group = location)
) +
  geom_line() +
  labs(
    title = "Hospitalisations Aged 65 and Older Over Time",
    x = "Date",
    y = "Hospitalisations"
  )

# Time series plot of excess_deaths, one line per location, restricted to
# rows whose aged_65_older value is positive (filter() also drops rows where
# that value is NA).
#
# `group = location` keeps each location's series as its own path; without it
# geom_line() joins rows from different locations into a single zig-zag path.
# NOTE(review): the filter selects rows, it does not restrict excess_deaths
# to an age band -- if aged_65_older is a population share, this plot shows
# all-age excess deaths for locations that report that share. Confirm that
# the title matches the intent.
excess_deaths_over_65_plot <- data %>%
  filter(aged_65_older > 0) %>%
  ggplot(aes(x = date, y = excess_deaths, group = location)) +
  geom_line() +
  labs(
    title = "Excess Deaths in Individuals Aged 65 and Older Over Time",
    x = "Date",
    y = "Excess Deaths"
  )

# Draw the six plots on one page, filled row by row into a two-column grid
gridExtra::grid.arrange(
  grobs = list(
    histogram,
    total_cases_plot,
    total_deaths_plot,
    icu_patients_plot,
    hosp_patients_plot,
    excess_deaths_over_65_plot
  ),
  ncol = 2
)
## Warning: Removed 360589 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 398 rows containing missing values (`geom_line()`).
## Warning: Removed 455 rows containing missing values (`geom_line()`).
## Warning: Removed 12432 rows containing missing values (`geom_line()`).
## Warning: Removed 323 rows containing missing values (`geom_line()`).
## Warning: Removed 4313 rows containing missing values (`geom_line()`).