# Work from the assignment folder so that the relative file names used in this
# script resolve against it.
# NOTE(review): this absolute path is machine-specific; the script will only
# run as-is on a machine that has this exact folder.
setwd(
  "C:/Users/thuyb/OneDrive - Universiteit Antwerpen/University of Antwerp/Semester 1/Data Management/R assignment/Assigment 1"
)

# Read the comma-separated mortality file; its first row supplies the column
# names.
hosp_dat <- read.table(
  file = "COVID19BE_MORT.csv",
  sep = ",",
  header = TRUE
)
library(tidyverse)
## Warning: package 'forcats' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.2
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
# ---- Daily incidence of deaths (all groups combined) ----

# Keep only complete rows, then convert DATE to class Date so it can be placed
# on a date axis.
# NOTE(review): na.omit() removes a row when ANY column is missing, so deaths
# recorded with an unknown value in another column are excluded from every
# total below -- confirm this is intended.
hosp_dat <- na.omit(hosp_dat)
hosp_dat[["DATE"]] <- as.Date(hosp_dat[["DATE"]])

# One row per calendar day holding the total number of deaths on that day.
hosp_dat_date <- summarise(group_by(hosp_dat, DATE), death = sum(DEATHS))

# Blue line of daily deaths; monthly tick marks, rotated so the labels do not
# overlap.
ggplot(hosp_dat_date, aes(DATE, death)) +
  geom_line(colour = "blue") +
  scale_x_date(date_labels = "%e %b %y", date_breaks = "1 month") +
  labs(
    x = "Time",
    y = "Incidence of deaths",
    title = "The daily incidence of deaths over time"
  ) +
  theme_gray() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
# ---- Cumulative incidence of deaths ----

# Total the deaths per day, then accumulate them over time.
# The daily total is given an explicit name (`death`, as in the other
# summaries of this script) instead of the auto-generated `sum(DEATHS)`.
# cumsum() depends on row order, so the rows are sorted by date explicitly.
cumulative_death <- hosp_dat %>%
  group_by(DATE) %>%
  summarise(death = sum(DEATHS)) %>%
  arrange(DATE) %>%
  mutate(cumulative_deaths = cumsum(death))

# `color = "red"` is set as a constant outside aes(). Inside aes() it is
# treated as a one-level variable: the line is drawn in the default palette
# colour (not red) and a legend is created, which then had to be hidden.
ggplot(data = cumulative_death, mapping = aes(x = DATE, y = cumulative_deaths)) +
  geom_line(color = "red") +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
  labs(
    title = "The cumulative incidence of deaths over time",
    y = "Cumulative_death",
    x = "Time"
  )
# ---- Daily incidence of deaths by sex ----

# Total deaths per sex and day. `.groups = "drop"` returns an ungrouped table,
# so later operations on hosp_dat1 are not silently performed per SEX, and the
# "summarise() has grouped output" message is no longer emitted.
hosp_dat1 <- hosp_dat %>%
  group_by(SEX, DATE) %>%
  summarise(death = sum(DEATHS), .groups = "drop")

# One panel per sex; the colour legend is titled "Gender".
ggplot(data = hosp_dat1, mapping = aes(x = DATE, y = death, color = SEX)) +
  geom_line() +
  labs(
    title = "Daily incidence of deaths over time by sex",
    y = "Daily incidence of deaths",
    x = "Time",
    color = "Gender"
  ) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %y") +
  theme_gray() +
  facet_wrap(~SEX)
These charts illustrate the daily count of COVID-19 deaths from 2020 to mid-2023. Daily deaths reached their highest peak in April 2020, and a discernible overall decline is noticeable over this timeframe. Notable fluctuations, including peaks in December 2021 and January 2022, interrupt the overall trend. Towards July 2023, reported deaths decrease sharply, with counts consistently below 5 per day.
The data reveal a stable gender distribution in COVID-19 deaths, with females consistently experiencing a higher daily number of deaths than males. The peak was reported in April 2020, when females reached around 180 deaths per day, surpassing males, whose peak was approximately 140 deaths per day.
# ---- Daily incidence of deaths by region ----

# Total deaths per region and day. `.groups = "drop"` returns an ungrouped
# table, so later operations on hosp_dat2 are not silently performed per
# REGION, and the "summarise() has grouped output" message is no longer
# emitted.
hosp_dat2 <- hosp_dat %>%
  group_by(REGION, DATE) %>%
  summarise(death = sum(DEATHS), .groups = "drop")

# One panel per region, side by side.
# The legend title is set through the `color` aesthetic because that is the
# aesthetic mapped to REGION; a guide for `fill` has no effect on this plot.
ggplot(data = hosp_dat2, mapping = aes(x = DATE, y = death, color = REGION)) +
  geom_line() +
  labs(
    title = "Daily incidence of deaths over time by region",
    y = "Daily incidence of deaths",
    x = "Time",
    color = "Region"
  ) +
  theme_gray() +
  facet_grid(~REGION)
The charts provide insight into the daily deaths reported in the three regions. Flanders consistently reported the highest number of deaths, followed by Wallonia and Brussels. The peak daily deaths were 140, 130, and 40, respectively.
# ---- Daily incidence of deaths by age group ----

# Total deaths per age group and day. `.groups = "drop"` returns an ungrouped
# table, so later operations on hosp_dat3 are not silently performed per
# AGEGROUP, and the "summarise() has grouped output" message is no longer
# emitted.
hosp_dat3 <- hosp_dat %>%
  group_by(AGEGROUP, DATE) %>%
  summarise(death = sum(DEATHS), .groups = "drop")

# One panel per age group, two panels per row. `scales = "free"` gives every
# panel its own axes, so bar heights are NOT comparable across panels.
# The manual palette supplies six colours.
# NOTE(review): assumes AGEGROUP has at most six levels -- TODO confirm.
ggplot(data = hosp_dat3, mapping = aes(x = DATE, y = death, fill = AGEGROUP)) +
  geom_col(width = 1, position = "dodge") +
  labs(
    title = "Daily incidence of deaths over time by age group",
    y = "Daily incidence of deaths",
    x = "Time",
    fill = "Age groups"
  ) +
  scale_fill_manual(
    values = c("lightblue", "pink", "yellow", "purple", "orange", "blue")
  ) +
  theme_gray() +
  facet_wrap(~AGEGROUP, ncol = 2, scales = "free")
The charts describe the daily deaths across six distinct age groups. The incidence of death rises with advancing age. The 85-and-above age group recorded the highest daily deaths; conversely, the 0-24 age group exhibits notably lower daily deaths. Both the young group (0-24) and the adult group (25-44) experienced days without reported deaths. In contrast, the oldest age groups reported deaths daily, peaking at 150 deaths per day.
# ---- Monthly deaths by sex and age group ----

# Requires the tsibble package (run install.packages("tsibble") once if it is
# not installed).
# Add a year-month column derived from DATE to the main data set.
hosp_dat <- hosp_dat %>%
  mutate(month = tsibble::yearmonth(DATE))

# Total deaths per month, sex and age group. `.groups = "drop"` returns an
# ungrouped table, so the result no longer carries a leftover (month, SEX)
# grouping and the "summarise() has grouped output" message is not emitted.
monthly_deaths <- hosp_dat %>%
  group_by(month, SEX, AGEGROUP) %>%
  summarise(monthly_deaths = sum(DEATHS), .groups = "drop")

# Show the table and its first row.
# In the run recorded with this script the table had 388 rows and 4 columns
# (month <mth>, SEX <chr>, AGEGROUP <chr>, monthly_deaths <int>); its first
# row was: 2020 Mar, F, 0-24, 1.
print(monthly_deaths)
head(monthly_deaths, n = 1)

# Export the table to the working directory, without row names.
write.csv(monthly_deaths, "monthly_deaths1.csv", row.names = FALSE)