Download data

# Make the assignment folder the working directory so that the relative file
# names used in this report resolve.
# NOTE(review): this absolute path is specific to one machine; anyone else
# running the report has to adapt it.
setwd("C:/Users/thuyb/OneDrive - Universiteit Antwerpen/University of Antwerp/Semester 1/Data Management/R assignment/Assigment 1")

# Read the comma-separated mortality file; its first row holds the column names.
hosp_dat <- read.table(
  file = "COVID19BE_MORT.csv",
  sep = ",",
  header = TRUE
)
library(tidyverse)
## Warning: package 'forcats' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

Question 1: Produce a plot showing the daily incidence of deaths over time aggregated over age, sex and region and indicate on the x-axis the first day of each month.

# Keep only complete records, then parse DATE so it can drive a date axis.
# NOTE(review): na.omit() removes every row that has an NA in any column, so
# deaths recorded without e.g. an age group or sex would be missing from the
# totals below -- confirm that this is intended.
hosp_dat <- na.omit(hosp_dat)
hosp_dat$DATE <- as.Date(hosp_dat$DATE)

# Total deaths per calendar day: grouping by DATE alone sums over every other
# column of the data.
hosp_dat_date <- summarise(group_by(hosp_dat, DATE), death = sum(DEATHS))

# Daily deaths drawn as a blue line, with one labelled x-axis break per month.
ggplot(hosp_dat_date, aes(DATE, death)) +
  geom_line(colour = "blue") +
  scale_x_date(date_labels = "%e %b %y", date_breaks = "1 month") +
  ggtitle("The daily incidence of deaths over time") +
  xlab("Time") +
  ylab("Incidence of deaths") +
  theme_gray() +
  # Rotate the tick labels so the monthly labels do not overlap.
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

Question 2: Show the cumulative incidence of deaths over time.

# Daily totals over the whole data set plus their running sum.
# group_by() + summarise() returns one row per DATE in ascending order, which
# is the order cumsum() needs. The daily-total column keeps its previous name,
# `sum(DEATHS)`, so existing references to it still work.
cumulative_death <- hosp_dat %>%
  group_by(DATE) %>%
  summarise(`sum(DEATHS)` = sum(DEATHS)) %>%
  mutate(cumulative_deaths = cumsum(`sum(DEATHS)`))

# The line colour is a fixed property, so it is set outside aes(). Inside
# aes() the string "red" is treated as a data value: the line is drawn in the
# default palette colour and a legend appears, which previously had to be
# hidden with theme(legend.position = "none").
ggplot(data = cumulative_death, mapping = aes(x = DATE, y = cumulative_deaths)) +
  geom_line(color = "red") +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
  labs(title = "The cumulative incidence of deaths over time",
       y = "Cumulative_death",
       x = "Time")

Question 3: Produce the same plots for the daily number of deaths over time, by age group, region and sex. Briefly explain the key differences from an epidemiological point of view.

# Deaths per day and sex, summed over the remaining columns.
# .groups = "drop" returns an ungrouped tibble, so no grouping leaks into
# later steps and summarise() no longer prints its regrouping message.
hosp_dat1 <- hosp_dat %>%
  group_by(SEX, DATE) %>%
  summarise(death = sum(DEATHS), .groups = "drop")

# One panel per sex; the colour legend is titled "Gender".
ggplot(data = hosp_dat1, mapping = aes(x = DATE, y = death, color = SEX)) +
  geom_line() +
  labs(title = "Daily incidence of deaths over time by sex",
       y = "Daily incidence of deaths",
       x = "Time",
       color = "Gender") +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %y") +
  theme_gray() +
  facet_wrap(~SEX)

These charts show the daily number of COVID-19 deaths from 2020 to mid-2023. The highest peak occurred in April 2020, followed by an overall decline over the rest of the period. This downward trend is interrupted by notable fluctuations, including peaks in December 2021 and January 2022. Towards July 2023, reported deaths drop sharply and stay consistently below 5 per day.

The gender pattern in COVID-19 deaths is stable over time, with females consistently recording a higher daily number of deaths than males. Both sexes peaked in April 2020: females at around 180 deaths per day and males at approximately 140 deaths per day.

# Deaths per day and region, summed over the remaining columns.
# .groups = "drop" returns an ungrouped tibble, so no grouping leaks into
# later steps and summarise() no longer prints its regrouping message.
hosp_dat2 <- hosp_dat %>%
  group_by(REGION, DATE) %>%
  summarise(death = sum(DEATHS), .groups = "drop")

# One panel per region. The legend belongs to the colour aesthetic, so its
# title is set through `color`; setting it through `fill` has no effect here
# because no fill aesthetic is mapped.
ggplot(data = hosp_dat2, mapping = aes(x = DATE, y = death, color = REGION)) +
  geom_line() +
  labs(title = "Daily incidence of deaths over time by region",
       y = "Daily incidence of deaths",
       x = "Time",
       color = "Region") +
  theme_gray() +
  facet_grid(~REGION)

The charts show the daily number of deaths reported in the three regions. Flanders consistently reported the highest number of deaths, followed by Wallonia and Brussels. Their peak daily counts were about 140, 130 and 40 deaths, respectively.

# Deaths per day and age group, summed over the remaining columns.
# .groups = "drop" returns an ungrouped tibble, so no grouping leaks into
# later steps and summarise() no longer prints its regrouping message.
hosp_dat3 <- hosp_dat %>%
  group_by(AGEGROUP, DATE) %>%
  summarise(death = sum(DEATHS), .groups = "drop")

# One panel per age group, one bar per day.
ggplot(data = hosp_dat3, mapping = aes(x = DATE, y = death, fill = AGEGROUP)) +
  geom_col(width = 1, position = "dodge") +
  labs(title = "Daily incidence of deaths over time by age group",
       y = "Daily incidence of deaths",
       x = "Time") +
  # One fill colour per age group, in the order of the AGEGROUP values.
  scale_fill_manual(
    values = c("lightblue", "pink", "yellow", "purple", "orange", "blue")
  ) +
  guides(fill = guide_legend(title = "Age groups")) +
  theme_gray() +
  # Only the y-axis is freed: the death counts differ a lot between age
  # groups, but every panel keeps the same date axis so that the timing of
  # the peaks can be compared across panels.
  facet_wrap(~AGEGROUP, ncol = 2, scales = "free_y")

The charts show the daily number of deaths in six age groups. The number of deaths rises with advancing age: the 85+ group recorded the highest daily counts, whereas the 0-24 group recorded notably lower ones. Both the youngest group (0-24) and the adult group (25-44) had days without any reported deaths. In contrast, the oldest age groups reported deaths daily, peaking at 150 deaths per day.

Question 4: Create a new data frame containing the total number of deaths per month for the different age categories and by gender, aggregated over region. Write this file out as a .csv file. Show the first lines of the data frame in the R Markdown file.

# Needs the tsibble package; install it once with install.packages("tsibble").
# Tag every record with its year-month so that deaths can be summed per month.
hosp_dat <- hosp_dat %>%
  mutate(month = tsibble::yearmonth(DATE))

# Total deaths per month, sex and age group; REGION is not a grouping column,
# so the sums run over all regions.
# .groups = "drop" returns an ungrouped tibble, so no grouping leaks into
# later steps and summarise() no longer prints its regrouping message.
monthly_deaths <- hosp_dat %>%
  group_by(month, SEX, AGEGROUP) %>%
  summarise(monthly_deaths = sum(DEATHS), .groups = "drop")
print(monthly_deaths)
## # A tibble: 388 × 4
##       month SEX   AGEGROUP monthly_deaths
##       <mth> <chr> <chr>             <int>
##  1 2020 Mar F     0-24                  1
##  2 2020 Mar F     25-44                 5
##  3 2020 Mar F     45-64                28
##  4 2020 Mar F     65-74                54
##  5 2020 Mar F     75-84               205
##  6 2020 Mar F     85+                 383
##  7 2020 Mar M     25-44                 2
##  8 2020 Mar M     45-64                66
##  9 2020 Mar M     65-74               132
## 10 2020 Mar M     75-84               258
## # ℹ 378 more rows
# Show the first lines of the data frame (head() returns six rows by default).
head(monthly_deaths)
## # A tibble: 6 × 4
##      month SEX   AGEGROUP monthly_deaths
##      <mth> <chr> <chr>             <int>
## 1 2020 Mar F     0-24                  1
## 2 2020 Mar F     25-44                 5
## 3 2020 Mar F     45-64                28
## 4 2020 Mar F     65-74                54
## 5 2020 Mar F     75-84               205
## 6 2020 Mar F     85+                 383
# Write the monthly table to the working directory, without row names.
write.csv(monthly_deaths, "monthly_deaths1.csv", row.names = FALSE)