Data comes from Our World In data’s COVID dataset. I have prepared this a little bit for you.
Let’s get started with a dataset called cross.csv which you find as follows
cross=read.csv("https://raw.githubusercontent.com/mondpanther/datastorieshub/95d94862115819350247823f174a2633cde0236b/code/cross.csv")
names(cross) # show the names of the dataframe
## [1] "X" "month"
## [3] "iso_code" "date"
## [5] "location" "total_cases"
## [7] "total_deaths" "total_cases_per_million"
## [9] "total_deaths_per_million" "total_tests"
## [11] "total_tests_per_thousand" "total_vaccinations"
## [13] "total_boosters" "total_vaccinations_per_hundred"
## [15] "total_boosters_per_hundred" "population"
## [17] "population_density" "continent"
## [19] "cs" "L1vax"
## [21] "vax" "lndeaths"
## [23] "deaths" "lnvax"
## [25] "period"
head(cross) # show the first couple of lines
## X month iso_code date location total_cases total_deaths
## 1 1 2021-11 ABW 2021-11-27 Aruba 0 0
## 2 2 2021-02 AFG 2021-02-28 Afghanistan 55714 2443
## 3 3 2021-09 AFG 2021-09-30 Afghanistan 155174 7204
## 4 4 2021-02 AGO 2021-02-28 Angola 20807 508
## 5 5 2021-05 AGO 2021-05-31 Angola 34551 766
## 6 6 2021-02 AIA 2021-02-28 Anguilla 0 0
## total_cases_per_million total_deaths_per_million total_tests
## 1 0.000 0.000 0
## 2 1398.604 61.327 0
## 3 3895.377 180.844 0
## 4 613.168 14.970 0
## 5 1018.194 22.573 0
## 6 0.000 0.000 0
## total_tests_per_thousand total_vaccinations total_boosters
## 1 0 161611 0
## 2 0 8200 0
## 3 0 2369625 0
## 4 0 0 0
## 5 0 909215 0
## 6 0 0 0
## total_vaccinations_per_hundred total_boosters_per_hundred population
## 1 150.76 0 107195
## 2 0.02 0 39835428
## 3 5.95 0 39835428
## 4 0.00 0 33933611
## 5 2.68 0 33933611
## 6 0.00 0 15125
## population_density continent cs L1vax vax lndeaths deaths
## 1 584.800 North America NA NA 150.76 0.000000 0.000
## 2 54.422 Asia NA 0.00 0.02 4.132395 61.327
## 3 54.422 Asia NA 4.97 5.95 5.203149 180.844
## 4 23.890 Africa 2375 0.00 0.00 2.770712 14.970
## 5 23.890 Africa 4274 0.00 2.68 3.160102 22.573
## 6 NA North America NA NA 0.00 0.000000 0.000
## lnvax period
## 1 5.02230033 1
## 2 0.01980263 1
## 3 1.93874166 2
## 4 0.00000000 1
## 5 1.30291275 2
## 6 0.00000000 1
…and only for the second period…
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
cross %>% group_by(continent) %>% summarize(n())
## # A tibble: 6 x 2
## continent `n()`
## <chr> <int>
## 1 Africa 109
## 2 Asia 94
## 3 Europe 98
## 4 North America 63
## 5 Oceania 32
## 6 South America 26
cross_euram=cross %>% filter(continent=="Europe" | continent=="North America" & period==2)
Start by plotting
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
cross_euram %>% ggplot( aes(x=vax,y=deaths) ) + # setup the aesthetic
geom_point() # use it to plot poines
# Note that above we broke the command over several lines which can be a good
# idea to make things more easily readable...however the following is exactly the same
# command:
cross_euram %>% ggplot( aes(x=vax,y=deaths) ) + geom_point()
Let’s do that a bit nicer:
cross_euram %>% ggplot( aes(x=vax,y=deaths) ) +
geom_point() +
ylab("Total number of deaths per 1 million")+
xlab("Total number of vaccinations per 100K")+
theme_minimal()+
geom_smooth(method="lm",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
Vaccines seem to be associated with less deaths. Before we discuss this more let’s work out the exact parameters of the trendline. We use the lm command for that:
reg1=lm(formula=deaths ~ vax,data=cross_euram)
reg1 %>% summary() # The summary command provides a nice summary
##
## Call:
## lm(formula = deaths ~ vax, data = cross_euram)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1267.06 -724.07 -87.49 535.36 2945.98
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1090.9359 115.7545 9.425 2.93e-16 ***
## vax 0.6132 1.2212 0.502 0.616
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 908.8 on 125 degrees of freedom
## Multiple R-squared: 0.002013, Adjusted R-squared: -0.005971
## F-statistic: 0.2521 on 1 and 125 DF, p-value: 0.6165
An increase of 1 additional vaccination per 100K leads to 0.61 more deaths per 1 million.
Let’s look at the relationship between estimated residuals \(\hat{\epsilon}\) and explanatory variable \(X\); i.e. vaccination rates in our case..
cross_euram=cross_euram %>% mutate(epsilonhat=reg1$residuals)
cross_euram %>% ggplot( aes(x=vax,y=epsilonhat) ) +
geom_point() +
ylab("epsilon hat")+
xlab("Total number of vaccinations per 100K")+
theme_minimal()+
geom_smooth(method="lm",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
cor(cross_euram%>% select(vax,epsilonhat,deaths))
## vax epsilonhat deaths
## vax 1.000000e+00 -6.865192e-17 0.04486396
## epsilonhat -6.865192e-17 1.000000e+00 0.99899311
## deaths 4.486396e-02 9.989931e-01 1.00000000
cross_eur %>% ggplot( aes(y=deaths,x=epsilonhat) ) +
geom_point() +
xlab("epsilon hat")+
ylab("Deaths")+
theme_minimal()+
geom_smooth(method="lm",se=FALSE)
Are vaccinations really causing deaths?
We can explore this by looking at the amount of death we had after the arrival of vaccines…
cross_euram2=cross %>%
filter(continent=="Europe" | continent =="North America") %>% # this time we include the first period
# which is from Februrary 2021
arrange(iso_code,date) %>% # i.e. sort
group_by(iso_code) %>% # put in country groups
mutate(Ddeaths=deaths-dplyr::lag(deaths) ) # we compute the change in the death variable
cross_euram2 %>% ggplot( aes(x=vax,y=Ddeaths) ) +
geom_point() +
ylab("Additional number of deaths per 1 million sinc Feb 2021 (Delta Deaths")+
xlab("Total number of vaccinations per 100K")+
theme_minimal()+
geom_smooth(method="lm",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 84 rows containing missing values (geom_point).
The same as regression:
reg2=lm(formula=Ddeaths ~ vax,data=cross_euram2)
reg2 %>% summary() # The summary command provides a nice summary
##
## Call:
## lm(formula = Ddeaths ~ vax, data = cross_euram2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -780.0 -384.2 -159.6 268.2 1926.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 779.965 131.365 5.937 8.4e-08 ***
## vax -2.478 1.112 -2.228 0.0289 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 566.7 on 75 degrees of freedom
## (84 observations deleted due to missingness)
## Multiple R-squared: 0.06206, Adjusted R-squared: 0.04955
## F-statistic: 4.962 on 1 and 75 DF, p-value: 0.0289
i.e. an increase of 1 additional vaccination per 100K leads to -2.48 less deaths per 1 million.
cross_euram2 %>% ggplot( aes(x=vax,y=Ddeaths, color=continent) ) +
geom_point() +
ylab("Delta Deaths")+
xlab("Total number of vaccinations per 100K")+
theme_minimal()+
geom_smooth(method="lm",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 84 rows containing missing values (geom_point).
Country labels?
cross_euram2 %>% ggplot( aes(x=vax,y=Ddeaths, color=continent,label=location) ) +
geom_point() +
ylab("Delta Deaths")+
xlab("Total number of vaccinations per 100K")+
theme_minimal()+
geom_smooth(method="lm",se=FALSE)+geom_text(size=3)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 84 rows containing missing values (geom_point).
## Warning: Removed 84 rows containing missing values (geom_text).
cross_euram2 =cross_euram2 %>% mutate(slabel=ifelse(population>20*10^6,location,""))
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.1.2
cross_euram2 %>% ggplot( aes(x=vax,y=Ddeaths, color=continent,label=slabel) ) +
geom_point() +
ylab("Delta Deaths")+
xlab("Total number of vaccinations per 100K")+
theme_minimal()+
geom_smooth(method="lm",se=FALSE)+geom_text_repel(size=3)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 84 rows containing missing values (geom_point).
## Warning: Removed 84 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
An example to exmplore ggplot2
cross=cross %>%
arrange(iso_code,date) %>% # i.e. sort
group_by(iso_code) %>% # put in country groups
mutate(Ddeaths=deaths-dplyr::lag(deaths) ) # we compute the change in the death variable
cross %>% ggplot( aes(x=vax,y=Ddeaths, color=continent) ) +
geom_point() +
ylab("Delta Deaths")+
xlab("Total number of vaccinations per 100K")+
theme_minimal()+
geom_smooth(method="lm",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 222 rows containing non-finite values (stat_smooth).
## Warning: Removed 222 rows containing missing values (geom_point).
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
dfm=read.csv("dfm.csv") %>%
mutate( vax =total_vaccinations_per_hundred,
deaths=total_deaths_per_million)
# Tell R about the date variable
dfm_us=dfm %>% filter(iso_code=="USA")%>% mutate(date=as_date(date))
p1=dfm_us %>% ggplot(aes(x=date,y=deaths)) + geom_line()
p1
p1+theme_minimal()+xlab("Time")+ylab("Total Deaths")
Sometimes better to look at changes:
dfm_us=dfm_us %>% arrange(date) %>% mutate(Ddeaths=deaths-dplyr::lag(deaths))
dfm_us %>% ggplot(aes(x=date,y=Ddeaths)) + geom_line() +
theme_minimal()+
xlab("Time")+ylab("Total Deaths")
## Warning: Removed 1 row(s) containing missing values (geom_path).
A histogram allows us to look at the distribution of a variable across a sample. For instance we can look at the daily hoax shares:
crossp2=cross %>% filter(period==2)
crossp2 %>% ggplot( aes(x=deaths)) + geom_histogram() + ylab("Number of countries")+xlab("Cases per 1 million")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
An alternative way of looking at histograms is in terms of density. If you multiply the density with the width of a histogram bin you get the share of observations (as opposed to the count) that fall into a particular category:
d1=crossp2 %>% ggplot( aes(x=deaths)) + geom_histogram(aes(y=..density.. ,fill=..density..)) +
ylab("Number of countries")+xlab("Cases per 1 million")
d1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
An outright density plot is often preferable to a histogram. It’s kind of like a density histogram where let the bins become every smaller:
d1 + geom_density(color="red",size=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.