Reading in Dataset

#Download the CSV from the ECDC open-data URL into a data frame called "data".
#Empty fields are read as NA; "UTF-8-BOM" drops the byte-order mark so it does
#not end up in the first column name.
data <- read.csv(
  file = "https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/csv",
  na.strings = "",
  fileEncoding = "UTF-8-BOM"
)

#loading additional packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggthemes)
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(directlabels)

#Working copy of the download; the cleaning steps that follow modify covid_eu,
#so the object "data" keeps the values exactly as they were read in
covid_eu <- data

   

Data Cleaning and Manipulation

#Cleaning steps, all in one pipeline:
#  1. dmy() parses the dateRep text as day-month-year into a new Date column
#  2. the new date column is moved so it sits immediately before dateRep
#  3. rows are sorted chronologically
#  4. country and continent columns are converted to factor variables
covid_eu <- covid_eu %>%
  mutate(date = dmy(dateRep)) %>%
  relocate(date, .before = dateRep) %>%
  arrange(date) %>%
  mutate(
    countriesAndTerritories = factor(countriesAndTerritories),
    continentExp = factor(continentExp)
  )

#Creating daily cases per 100K population and their 7-day rolling average,
#by country.
#  Inspecting the output so far, the cases for the first reported dates
#  (2021-02-28 and 2021-03-01) look very high for each country, so those two
#  dates are omitted. A couple of rows have a negative case count; those are
#  omitted as well (rows with missing cases are dropped by the same filter).
covid_eu_1 <- covid_eu %>%
  filter(cases > -1, date > as.Date("2021-03-01")) %>%
  arrange(date) %>%        #rolling window below needs rows in date order
  group_by(countriesAndTerritories) %>%        #calculations below are per country
  mutate(
    cases_100k = (cases / popData2020) * 100000,   #cases per 100K population
    #Trailing 7-day mean: each date averages that day and the 6 days before it.
    #rollapplyr() is right-aligned by default; the previous align = "left"
    #override made every value depend on the 6 FOLLOWING days instead, which
    #shifted the curve earlier in time.
    #partial = TRUE averages whatever days are available at the start of the
    #series instead of returning NA for the first 6 dates.
    weekly.inc.rate = rollapplyr(cases_100k, width = 7, FUN = mean,
                                 partial = TRUE)
  ) %>%
  ungroup()        #drop grouping so later steps start from an ungrouped table

   

Graph 1 INCORRECT OUTPUT - Attempting Rolling Incidence Rate for France

#Plotting incidence rate over time for specific countries
#NOTE: this chunk is kept as the example of INCORRECT output named in its heading.
#The piped data is filtered to France, but geom_line() is given its own
#data = covid_eu_1 argument. A data argument supplied to a layer replaces the
#plot's data for that layer, so the France filter is ignored and the rows of
#every country are drawn as one single line.
covid_eu_1 %>%
  filter(countriesAndTerritories == 'France') %>%
  group_by(countriesAndTerritories) %>%
  ggplot() +
  geom_line(data = covid_eu_1, aes(x=date, y=weekly.inc.rate)) 

      #The line generated is not as expected...vertical line graph...
      #(several countries share each date, so the single line jumps up and down at every x value)

   

Graph 1 CORRECT OUTPUT - France 2021: COVID-19 7-Day Rolling Incidence Rate

#Line plot of the rolling incidence rate against date, restricted to France.
#The aesthetics are set in ggplot() so geom_line() inherits both the filtered
#data and the mapping.
covid_eu_1 %>%
  filter(countriesAndTerritories == 'France') %>%
  ggplot(mapping = aes(x = date, y = weekly.inc.rate)) +
  geom_line() +
  theme_classic() +   #base theme first; the theme() tweaks below build on it
  labs(
    title = "France 2021:
  COVID-19 Incidence Rate per 100K Population",
    x = "Date",
    y = "Incidence per 100,000"
  ) +
  theme(
    plot.title = element_text(
      hjust = 0.5,      #centre the title
      lineheight = 1,   #line spacing of the two-line title
      face = "bold",    #bold title font
      size = 16         #larger title text
    )
  )

   

Graph 2 - France & Spain 2021: COVID-19 7-Day Rolling Incidence Rate

#Now, let's try looking at two countries within the same line graph.
#%in% c(...) keeps the rows of every listed country.
covid_eu_1 %>%
  filter(countriesAndTerritories %in% c("France", "Spain")) %>%
  ggplot(
    aes(
      x = date,
      y = weekly.inc.rate,
      group = countriesAndTerritories,    #one line per country
      colour = countriesAndTerritories    #colour per country; also creates the legend
    )
  ) +
  geom_line(size = 1) +   #thicker lines
  scale_color_manual(values = c("Dark Blue", "Dark Red")) +   #line and legend colours
  scale_x_date(date_breaks = "1 month", date_labels = "%b") + #one break/label per month ("%B" would show the full month name)
  theme_classic() +       #classic theme goes before theme(), otherwise it would undo the tweaks below
  labs(
    title = "2021 COVID-19 Incidence Rate: France & Spain",   #graph title
    x = "Date",                                               #x-axis title
    y = "Cases per 100,000 Persons",                          #y-axis title
    colour = "Country"    #legend title; the legend belongs to the colour aesthetic
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),   #centred, bold, larger title
    legend.position = "bottom"    #legend below the graph instead of on the right
  )

     

Table 1 - 2021 COVID-19 Cases and Deaths in Europe

#########################################################################################################################################################################
                                                        # CREATE A TABLE
#########################################################################################################################################################################

 # Let's create a table of cumulative cases, deaths and case fatality rate per country
      #Will first need to create variables

#Per-country totals, attached to every row of that country:
#  tot_cases     - sum of daily cases
#  tot_deaths    - sum of daily deaths
#  fatality_rate - deaths per 1,000 cases
#Rows with a negative case or death count are filtered out first (the minimum
#of deaths in the summary is -3).
covid_eu_2 <- covid_eu %>%
  filter(cases > -1, deaths > -1) %>%
  group_by(countriesAndTerritories) %>%
  mutate(
    tot_cases = sum(cases),
    tot_deaths = sum(deaths),
    fatality_rate = (tot_deaths / tot_cases) * 1000
  )


#Total cases, total deaths and fatality rate are identical on every row that
#belongs to the same country, so one row per country is enough for the table.
#  distinct(..., .keep_all = TRUE) keeps the first row per country together
#    with all of its other columns
#  select() then keeps only country name, total cases, total deaths and
#    fatality rate
#  arrange() sorts by fatality rate, lowest to highest
cov_table <- covid_eu_2 %>%
  distinct(countriesAndTerritories, .keep_all = TRUE) %>%
  select(countriesAndTerritories, tot_cases, tot_deaths, fatality_rate) %>%
  arrange(fatality_rate)

     

library(knitr) #knitr and kableExtra packages to create and format simple tables
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
#Display cov_table as a formatted table.
#  col.names - display names for the four columns, in column order
#  align     - one letter per column: left, center, center, center
#  caption   - table title; wrapped in <b> ... </b> so it renders in bold.
#              The closing tag must be </b>: the earlier "<b> ... <b>" opened a
#              second bold element and never closed either of them.
#The stray trailing comma after caption was also removed, since it passed an
#extra empty argument to kable().
#kable_styling() comes from the kableExtra package and applies nicer formatting.
knitr::kable(
  cov_table,
  col.names = c("Country", "Total Cases", "Total Deaths",
                "Case Fatality per 1,000 Persons"),
  align = "lccc",
  caption = "<b> 2021 COVID Cases and Deaths in Europe </b>"
) %>%
  kable_styling()
2021 COVID Cases and Deaths in Europe
Country Total Cases Total Deaths Case Fatality per 1,000 Persons
Iceland 19861 36 1.812598
Norway 328713 1202 3.656685
Cyprus 142878 614 4.297373
Denmark 581276 3038 5.226433
Finland 209258 1446 6.910130
Netherlands 2920438 20255 6.935604
Estonia 230514 1869 8.107967
Ireland 635979 5838 9.179548
Luxembourg 95222 896 9.409590
Austria 1236181 12950 10.475812
Malta 40889 471 11.518990
Slovakia 1294019 15516 11.990550
Sweden 1242733 15221 12.248005
Liechtenstein 5503 68 12.356896
Slovenia 445228 5876 13.197732
Belgium 1980025 27729 14.004369
Lithuania 497643 7059 14.184867
France 8399846 120983 14.403002
Czechia 2378728 34923 14.681376
Portugal 1205993 18698 15.504236
Germany 6670407 107202 16.071283
Spain 5393268 88619 16.431410
Latvia 264031 4408 16.695009
Croatia 662397 11779 17.782387
Greece 1017445 19553 19.217746
Poland 3881349 89714 23.114129
Italy 5282076 135178 25.591832
Hungary 1213318 37376 30.804785
Romania 1795418 57935 32.268252
Bulgaria 720376 29847 41.432530