Load data

Data comes from Our World In data’s COVID dataset. I have prepared this a little bit for you.

Let’s get started with a dataset called cross.csv which you find as follows

cross=read.csv("https://raw.githubusercontent.com/mondpanther/datastorieshub/95d94862115819350247823f174a2633cde0236b/code/cross.csv")

names(cross) # show the names of the dataframe
##  [1] "X"                              "month"                         
##  [3] "iso_code"                       "date"                          
##  [5] "location"                       "total_cases"                   
##  [7] "total_deaths"                   "total_cases_per_million"       
##  [9] "total_deaths_per_million"       "total_tests"                   
## [11] "total_tests_per_thousand"       "total_vaccinations"            
## [13] "total_boosters"                 "total_vaccinations_per_hundred"
## [15] "total_boosters_per_hundred"     "population"                    
## [17] "population_density"             "continent"                     
## [19] "cs"                             "L1vax"                         
## [21] "vax"                            "lndeaths"                      
## [23] "deaths"                         "lnvax"                         
## [25] "period"
head(cross)  # show the first couple of lines
##   X   month iso_code       date    location total_cases total_deaths
## 1 1 2021-11      ABW 2021-11-27       Aruba           0            0
## 2 2 2021-02      AFG 2021-02-28 Afghanistan       55714         2443
## 3 3 2021-09      AFG 2021-09-30 Afghanistan      155174         7204
## 4 4 2021-02      AGO 2021-02-28      Angola       20807          508
## 5 5 2021-05      AGO 2021-05-31      Angola       34551          766
## 6 6 2021-02      AIA 2021-02-28    Anguilla           0            0
##   total_cases_per_million total_deaths_per_million total_tests
## 1                   0.000                    0.000           0
## 2                1398.604                   61.327           0
## 3                3895.377                  180.844           0
## 4                 613.168                   14.970           0
## 5                1018.194                   22.573           0
## 6                   0.000                    0.000           0
##   total_tests_per_thousand total_vaccinations total_boosters
## 1                        0             161611              0
## 2                        0               8200              0
## 3                        0            2369625              0
## 4                        0                  0              0
## 5                        0             909215              0
## 6                        0                  0              0
##   total_vaccinations_per_hundred total_boosters_per_hundred population
## 1                         150.76                          0     107195
## 2                           0.02                          0   39835428
## 3                           5.95                          0   39835428
## 4                           0.00                          0   33933611
## 5                           2.68                          0   33933611
## 6                           0.00                          0      15125
##   population_density     continent   cs L1vax    vax lndeaths  deaths
## 1            584.800 North America   NA    NA 150.76 0.000000   0.000
## 2             54.422          Asia   NA  0.00   0.02 4.132395  61.327
## 3             54.422          Asia   NA  4.97   5.95 5.203149 180.844
## 4             23.890        Africa 2375  0.00   0.00 2.770712  14.970
## 5             23.890        Africa 4274  0.00   2.68 3.160102  22.573
## 6                 NA North America   NA    NA   0.00 0.000000   0.000
##        lnvax period
## 1 5.02230033      1
## 2 0.01980263      1
## 3 1.93874166      2
## 4 0.00000000      1
## 5 1.30291275      2
## 6 0.00000000      1

Let’s only take the EU & North America part of the data…

…and only for the second period…

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
cross %>% group_by(continent) %>% summarize(n())
## # A tibble: 6 x 2
##   continent     `n()`
##   <chr>         <int>
## 1 Africa          109
## 2 Asia             94
## 3 Europe           98
## 4 North America    63
## 5 Oceania          32
## 6 South America    26
cross_euram=cross %>% filter(continent=="Europe" | continent=="North America" & period==2)

Start by plotting

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
cross_euram %>% ggplot(  aes(x=vax,y=deaths)  ) + # setup the aesthetic
                       
                       geom_point()            # use it to plot poines

# Note that above we broke the command over several lines which can be a good
# idea to make things more easily readable...however the following is exactly the same
# command:


cross_euram %>% ggplot(  aes(x=vax,y=deaths)  ) +  geom_point()     

Let’s do that a bit nicer:

cross_euram %>% ggplot(  aes(x=vax,y=deaths)  ) +  
  geom_point() +
  ylab("Total number of deaths per 1 million")+
  xlab("Total number of vaccinations per 100K")+
  theme_minimal()+
  geom_smooth(method="lm",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

Vaccines seem to be associated with less deaths. Before we discuss this more let’s work out the exact parameters of the trendline. We use the lm command for that:

reg1=lm(formula=deaths ~ vax,data=cross_euram)

reg1 %>% summary()  # The summary command provides a nice summary
## 
## Call:
## lm(formula = deaths ~ vax, data = cross_euram)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1267.06  -724.07   -87.49   535.36  2945.98 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1090.9359   115.7545   9.425 2.93e-16 ***
## vax            0.6132     1.2212   0.502    0.616    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 908.8 on 125 degrees of freedom
## Multiple R-squared:  0.002013,   Adjusted R-squared:  -0.005971 
## F-statistic: 0.2521 on 1 and 125 DF,  p-value: 0.6165

An increase of 1 additional vaccination per 100K leads to 0.61 more deaths per 1 million.

Let’s look at the relationship between estimated residuals \(\hat{\epsilon}\) and explanatory variable \(X\); i.e. vaccination rates in our case..

cross_euram=cross_euram %>% mutate(epsilonhat=reg1$residuals)

cross_euram %>% ggplot(  aes(x=vax,y=epsilonhat)  ) +  
  geom_point() +
  ylab("epsilon hat")+
  xlab("Total number of vaccinations per 100K")+
  theme_minimal()+
  geom_smooth(method="lm",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

cor(cross_euram%>% select(vax,epsilonhat,deaths))
##                      vax    epsilonhat     deaths
## vax         1.000000e+00 -6.865192e-17 0.04486396
## epsilonhat -6.865192e-17  1.000000e+00 0.99899311
## deaths      4.486396e-02  9.989931e-01 1.00000000
cross_eur %>% ggplot(  aes(y=deaths,x=epsilonhat)  ) +  
  geom_point() +
  xlab("epsilon hat")+
  ylab("Deaths")+
  theme_minimal()+
  geom_smooth(method="lm",se=FALSE)

Are vaccinations really causing deaths?

We can explore this by looking at the amount of death we had after the arrival of vaccines…

cross_euram2=cross %>% 
  filter(continent=="Europe" | continent =="North America") %>%  # this time we include the first period
                                    # which is from Februrary 2021
  arrange(iso_code,date) %>%        # i.e. sort
  group_by(iso_code) %>%            # put in country groups
  mutate(Ddeaths=deaths-dplyr::lag(deaths) ) # we compute the change in the death variable 
cross_euram2 %>% ggplot(  aes(x=vax,y=Ddeaths)  ) +  
  geom_point() +
  ylab("Additional number of deaths per 1 million sinc Feb 2021 (Delta Deaths")+
  xlab("Total number of vaccinations per 100K")+
  theme_minimal()+
  geom_smooth(method="lm",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 84 rows containing missing values (geom_point).

The same as regression:

reg2=lm(formula=Ddeaths ~ vax,data=cross_euram2)

reg2 %>% summary()  # The summary command provides a nice summary
## 
## Call:
## lm(formula = Ddeaths ~ vax, data = cross_euram2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -780.0 -384.2 -159.6  268.2 1926.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  779.965    131.365   5.937  8.4e-08 ***
## vax           -2.478      1.112  -2.228   0.0289 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 566.7 on 75 degrees of freedom
##   (84 observations deleted due to missingness)
## Multiple R-squared:  0.06206,    Adjusted R-squared:  0.04955 
## F-statistic: 4.962 on 1 and 75 DF,  p-value: 0.0289

i.e. an increase of 1 additional vaccination per 100K leads to -2.48 less deaths per 1 million.

More Visiions

Comparing continents

cross_euram2 %>% ggplot(  aes(x=vax,y=Ddeaths, color=continent)  ) +  
  geom_point() +
  ylab("Delta Deaths")+
  xlab("Total number of vaccinations per 100K")+
  theme_minimal()+
  geom_smooth(method="lm",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 84 rows containing missing values (geom_point).

Country labels?

cross_euram2 %>% ggplot(  aes(x=vax,y=Ddeaths, color=continent,label=location)  ) +  
  geom_point() +
  ylab("Delta Deaths")+
  xlab("Total number of vaccinations per 100K")+
  theme_minimal()+
  geom_smooth(method="lm",se=FALSE)+geom_text(size=3)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 84 rows containing missing values (geom_point).
## Warning: Removed 84 rows containing missing values (geom_text).

cross_euram2 =cross_euram2 %>% mutate(slabel=ifelse(population>20*10^6,location,""))


library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.1.2
cross_euram2 %>% ggplot(  aes(x=vax,y=Ddeaths, color=continent,label=slabel)  ) +  
  geom_point() +
  ylab("Delta Deaths")+
  xlab("Total number of vaccinations per 100K")+
  theme_minimal()+
  geom_smooth(method="lm",se=FALSE)+geom_text_repel(size=3)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 84 rows containing missing values (geom_point).
## Warning: Removed 84 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Across all continents…..

An example to exmplore ggplot2

cross=cross %>% 
 arrange(iso_code,date) %>%        # i.e. sort
  group_by(iso_code) %>%            # put in country groups
  mutate(Ddeaths=deaths-dplyr::lag(deaths) ) # we compute the change in the death variable 



cross %>% ggplot(  aes(x=vax,y=Ddeaths, color=continent)  ) +  
  geom_point() +
  ylab("Delta Deaths")+
  xlab("Total number of vaccinations per 100K")+
  theme_minimal()+
  geom_smooth(method="lm",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 222 rows containing non-finite values (stat_smooth).
## Warning: Removed 222 rows containing missing values (geom_point).

Time series

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
dfm=read.csv("dfm.csv") %>% 
  mutate( vax   =total_vaccinations_per_hundred,
          deaths=total_deaths_per_million)  


# Tell R about the date variable
dfm_us=dfm %>% filter(iso_code=="USA")%>% mutate(date=as_date(date))


p1=dfm_us %>% ggplot(aes(x=date,y=deaths)) + geom_line() 
p1

p1+theme_minimal()+xlab("Time")+ylab("Total Deaths")

Sometimes better to look at changes:

dfm_us=dfm_us %>% arrange(date) %>% mutate(Ddeaths=deaths-dplyr::lag(deaths))
dfm_us %>% ggplot(aes(x=date,y=Ddeaths)) + geom_line() +
  theme_minimal()+
  xlab("Time")+ylab("Total Deaths")
## Warning: Removed 1 row(s) containing missing values (geom_path).

Histogram’s & Density plots

Histogram

A histogram allows us to look at the distribution of a variable across a sample. For instance we can look at the daily hoax shares:

crossp2=cross %>% filter(period==2)

crossp2 %>% ggplot( aes(x=deaths)) + geom_histogram() + ylab("Number of countries")+xlab("Cases per 1 million")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


An alternative way of looking at histograms is in terms of density. If you multiply the density with the width of a histogram bin you get the share of observations (as opposed to the count) that fall into a particular category:

d1=crossp2 %>% ggplot( aes(x=deaths)) +  geom_histogram(aes(y=..density.. ,fill=..density..))  + 
  ylab("Number of countries")+xlab("Cases per 1 million")

d1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Density

An outright density plot is often preferable to a histogram. It’s kind of like a density histogram where let the bins become every smaller:

d1 + geom_density(color="red",size=2) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.