R Practice: 2021_EU_COVID

Reading in Dataset

# Download the ECDC daily national case/death CSV and load it into R.
# The raw data frame is stored as "data"; empty strings are read as NA and
# the byte-order mark at the start of the file is stripped by the encoding.
data <- read.csv(
  file = "https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/csv",
  na.strings = "",
  fileEncoding = "UTF-8-BOM"
)

#loading additional packages
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(ggthemes)
library(zoo)

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(directlabels)

# Working copy of the raw download; all cleaning below is applied to
# "covid_eu" so that "data" stays as it was read in.
covid_eu <- data

Data Cleaning and Manipulation

# Parse the reporting date, put it in front of the raw date column, sort the
# rows chronologically and turn the two geography columns into factors.
covid_eu <- covid_eu %>%
  # dateRep is parsed as day/month/year by dmy()
  mutate(date = dmy(dateRep)) %>%
  # place the parsed date immediately before the original dateRep column
  relocate(date, .before = dateRep) %>%
  # oldest observations first
  arrange(date) %>%
  # countriesAndTerritories and continentExp become factor variables
  mutate(across(c(countriesAndTerritories, continentExp), factor))

# Daily incidence per 100K population and its 7-day rolling average, by country.
#
# Inspecting the data shows that the first reported dates (2021-02-28 and
# 2021-03-01) carry very high case counts for every country, so those two
# dates are omitted. A few rows have a negative case count; they are dropped
# as well (rows with a missing case count are removed by the same filter).
#
# The rolling mean uses a trailing (right-aligned) window: the value on a given
# date averages that day and the six preceding days. With partial = TRUE the
# first six rows of each country average however many days are available.
# NOTE(review): the window is row-based, so it assumes rows within each country
# are in date order with one row per day -- covid_eu is arranged by date during
# cleaning; confirm no country has gaps.
covid_eu_1 <- covid_eu %>%
  filter(cases > -1, date > as.Date("2021-03-01")) %>%
  group_by(countriesAndTerritories) %>%
  mutate(
    # cases per 100K population, by country
    cases_100k = (cases / popData2020) * 100000,
    # rollapplyr() is right-aligned by default; no align override is passed so
    # that the average never looks ahead at future days
    weekly.inc.rate = rollapplyr(cases_100k, width = 7, FUN = mean, partial = TRUE)
  ) %>%
  # drop the grouping so later steps do not inherit it silently
  ungroup()

Graph 1 INCORRECT OUTPUT - Attempting Rolling Incidence Rate for France

# Plotting incidence rate over time for one country (France).
# This chunk is kept as an example of a mistake: the filtered data is piped
# into ggplot(), but geom_line() is handed data = covid_eu_1 again, and a
# layer-level data argument replaces the plot-level data for that layer.
covid_eu_1 %>%
  filter(countriesAndTerritories == "France") %>%
  group_by(countriesAndTerritories) %>%
  ggplot() +
  geom_line(
    mapping = aes(x = date, y = weekly.inc.rate),
    data = covid_eu_1
  )

# The line generated is not as expected: every country is drawn as a single
# path, which shows up as vertical strokes at each date.

Graph 1 CORRECT OUTPUT - France 2021: COVID-19 7-Day Rolling Incidence Rate

# Line chart of the rolling incidence rate for a single country, France.
# The aesthetics are declared once in ggplot() so geom_line() inherits both
# the filtered data and the mapping.
covid_eu_1 %>%
  filter(countriesAndTerritories == "France") %>%
  group_by(countriesAndTerritories) %>%
  ggplot(mapping = aes(x = date, y = weekly.inc.rate)) +
  geom_line() +
  # the complete theme goes first; the title tweaks below build on top of it
  theme_classic() +
  labs(
    title = "France 2021:
  COVID-19 Incidence Rate per 100K Population",
    x = "Date",
    y = "Incidence per 100,000"
  ) +
  # title styling: centred, single line spacing, bold, larger text
  theme(
    plot.title = element_text(
      hjust = 0.5,
      lineheight = 1,
      face = "bold",
      size = 16
    )
  )

Graph 2 - France & Spain 2021: COVID-19 7-Day Rolling Incidence Rate

# Two countries on the same line chart: France and Spain.
covid_eu_1 %>%
  # %in% c(...) keeps rows matching any of the listed countries
  filter(countriesAndTerritories %in% c("France", "Spain")) %>%
  group_by(countriesAndTerritories) %>%
  ggplot(
    mapping = aes(
      x = date,
      y = weekly.inc.rate,
      # "group" gives one line per country
      group = countriesAndTerritories,
      # "colour" distinguishes the lines and generates the legend
      colour = countriesAndTerritories
    )
  ) +
  # thicker lines than the default
  geom_line(size = 1) +
  # the complete theme is applied before theme(); added afterwards it would
  # undo the customisations made there
  theme_classic() +
  labs(
    title = "2021 COVID-19 Incidence Rate: France & Spain", # graph title
    x = "Date", # x-axis title
    y = "Cases per 100,000 Persons", # y-axis title
    # the lines are distinguished by colour, so the legend title is set here
    colour = "Country"
  ) +
  theme(
    # centred, bold, larger plot title
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    # legend moved from the right-hand side to below the plot
    legend.position = "bottom"
  ) +
  # line and legend colours
  scale_color_manual(values = c("Dark Blue", "Dark Red")) +
  # one break/label per month; "%b" is the abbreviated month name
  # ("%B" would show the full month name)
  scale_x_date(date_breaks = "1 month", date_labels = "%b")

Table 1 - 2021 COVID-19 Cases and Deaths in Europe

#########################################################################################################################################################################
                                                        # CREATE A TABLE
#########################################################################################################################################################################

 # Let's create a table of cumulative cases, deaths and case fatality rate per country
      #Will first need to create variables

# Per-country totals attached to every row: total cases, total deaths and
# deaths per 1,000 cases. Rows with a negative case or death count are
# removed first (the minimum of deaths in the summary is -3).
covid_eu_2 <- covid_eu %>%
  filter(cases > -1, deaths > -1) %>%
  group_by(countriesAndTerritories) %>%
  mutate(
    tot_cases = sum(cases),
    tot_deaths = sum(deaths),
    # later expressions in the same mutate() can use the totals defined above
    fatality_rate = (tot_deaths / tot_cases) * 1000
  )


#Sum of cases/deaths and overall fatality rate by country is similar for all observations belonging to same country
  #to create table with one observation per country, create table subset with "unique" identification per country name

# Collapse to one row per country and keep only the summary columns.
cov_table <- covid_eu_2 %>%
  # first row per country; .keep_all = TRUE retains the other columns
  distinct(countriesAndTerritories, .keep_all = TRUE) %>%
  # country name, total cases, total deaths and fatality rate only
  select(countriesAndTerritories, tot_cases, tot_deaths, fatality_rate) %>%
  # lowest fatality rate first
  arrange(fatality_rate)

library(knitr) #knitr and kableExtra packages to create and format simple tables
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

# Display the per-country summary as a formatted table.
knitr::kable(
  cov_table,
  # display names for the four columns, in order
  col.names = c("Country", "Total Cases", "Total Deaths", "Case Fatality per 1,000 Persons"),
  # left-align the country, centre the three numeric columns
  align = "lccc",
  # <b>...</b> bolds the caption; the closing tag must be </b>, otherwise the
  # bold element is left open in the generated HTML
  caption = "<b> 2021 COVID Cases and Deaths in Europe </b>"
) %>%
  # kable_styling() comes from kableExtra and applies nicer default formatting
  kable_styling()

**2021 COVID Cases and Deaths in Europe**
Country	Total Cases	Total Deaths	Case Fatality per 1,000 Persons
Iceland	19861	36	1.812598
Norway	328713	1202	3.656685
Cyprus	142878	614	4.297373
Denmark	581276	3038	5.226433
Finland	209258	1446	6.910130
Netherlands	2920438	20255	6.935604
Estonia	230514	1869	8.107967
Ireland	635979	5838	9.179548
Luxembourg	95222	896	9.409590
Austria	1236181	12950	10.475812
Malta	40889	471	11.518990
Slovakia	1294019	15516	11.990550
Sweden	1242733	15221	12.248005
Liechtenstein	5503	68	12.356896
Slovenia	445228	5876	13.197732
Belgium	1980025	27729	14.004369
Lithuania	497643	7059	14.184867
France	8399846	120983	14.403002
Czechia	2378728	34923	14.681376
Portugal	1205993	18698	15.504236
Germany	6670407	107202	16.071283
Spain	5393268	88619	16.431410
Latvia	264031	4408	16.695009
Croatia	662397	11779	17.782387
Greece	1017445	19553	19.217746
Poland	3881349	89714	23.114129
Italy	5282076	135178	25.591832
Hungary	1213318	37376	30.804785
Romania	1795418	57935	32.268252
Bulgaria	720376	29847	41.432530

R Practice: 2021_EU_COVID_Data

Dec 17, 2021

Reading in Dataset

Data Cleaning and Manipulation

Graph 1 INCORRECT OUTPUT - Attempting Rolling Incidence Rate for France

Graph 1 CORRECT OUTPUT - France 2021: COVID-19 7-Day Rolling Incidence Rate

Graph 2 - France & Spain 2021: COVID-19 7-Day Rolling Incidence Rate

Table 1 - 2021 COVID-19 Cases and Deaths in Europe