Data Preparation
# Load the combined COVID data set from disk, then convert its `date`
# column to class Date.
covid <- read.csv("covid_combined_groups.csv")
covid[["date"]] <- as.Date(covid[["date"]])
# Build the modelling table used throughout the report:
#   1. keep only the variables the report needs,
#   2. append "-01" to year_month and parse the result as a Date,
#   3. drop rows with a missing value in any of the listed key fields.
covid_model <- covid %>%
  select(
    # identifiers
    iso_code, continent, location, date,
    # case and death measures
    new_cases_smoothed_per_million,
    new_deaths_smoothed_per_million,
    total_cases_per_million,
    total_deaths_per_million,
    # policy stringency and transmission
    stringency_index,
    reproduction_rate,
    # vaccination measures
    total_vaccinations_per_hundred,
    people_vaccinated_per_hundred,
    people_fully_vaccinated_per_hundred,
    # country-level covariates
    hospital_beds_per_thousand,
    life_expectancy,
    cardiovasc_death_rate,
    diabetes_prevalence,
    gdp_per_capita,
    population_density,
    median_age,
    aged_65_older,
    human_development_index,
    population,
    # grouping, calendar, and remaining analysis fields
    country_group,
    year,
    month,
    year_month,
    case_fatality_rate,
    vax_coverage,
    days_since_start
  ) %>%
  mutate(
    year_month = as.Date(paste0(year_month, "-01"))
  ) %>%
  drop_na(
    continent,
    location,
    case_fatality_rate,
    vax_coverage,
    median_age,
    gdp_per_capita,
    stringency_index,
    reproduction_rate,
    population
  )
# Print a compact, column-by-column overview of the prepared data.
covid_model %>%
  glimpse()
## Rows: 39,579
## Columns: 30
## $ iso_code <chr> "AUT", "AUT", "AUT", "AUT", "AUT",…
## $ continent <chr> "Europe", "Europe", "Europe", "Eur…
## $ location <chr> "Austria", "Austria", "Austria", "…
## $ date <date> 2020-03-01, 2020-03-02, 2020-03-0…
## $ new_cases_smoothed_per_million <dbl> 0.11, 0.11, 0.11, 0.11, 0.11, 0.11…
## $ new_deaths_smoothed_per_million <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00…
## $ total_cases_per_million <dbl> 0.77, 0.77, 0.77, 0.77, 0.77, 0.77…
## $ total_deaths_per_million <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.00…
## $ stringency_index <dbl> 11.11, 11.11, 11.11, 11.11, 11.11,…
## $ reproduction_rate <dbl> 1.07, 1.07, 1.07, 1.07, 1.07, 1.07…
## $ total_vaccinations_per_hundred <dbl> 69.3, 69.3, 69.3, 69.3, 69.3, 69.3…
## $ people_vaccinated_per_hundred <dbl> 43.6, 43.6, 43.6, 43.6, 43.6, 43.6…
## $ people_fully_vaccinated_per_hundred <dbl> 30.58, 30.58, 30.58, 30.58, 30.58,…
## $ hospital_beds_per_thousand <dbl> 7.37, 7.37, 7.37, 7.37, 7.37, 7.37…
## $ life_expectancy <dbl> 81.54, 81.54, 81.54, 81.54, 81.54,…
## $ cardiovasc_death_rate <dbl> 145.18, 145.18, 145.18, 145.18, 14…
## $ diabetes_prevalence <dbl> 6.35, 6.35, 6.35, 6.35, 6.35, 6.35…
## $ gdp_per_capita <dbl> 45436.69, 45436.69, 45436.69, 4543…
## $ population_density <dbl> 106.75, 106.75, 106.75, 106.75, 10…
## $ median_age <dbl> 44.4, 44.4, 44.4, 44.4, 44.4, 44.4…
## $ aged_65_older <dbl> 19.2, 19.2, 19.2, 19.2, 19.2, 19.2…
## $ human_development_index <dbl> 0.92, 0.92, 0.92, 0.92, 0.92, 0.92…
## $ population <int> 8939617, 8939617, 8939617, 8939617…
## $ country_group <chr> "EU", "EU", "EU", "EU", "EU", "EU"…
## $ year <int> 2020, 2020, 2020, 2020, 2020, 2020…
## $ month <int> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
## $ year_month <date> 2020-03-01, 2020-03-01, 2020-03-0…
## $ case_fatality_rate <dbl> 0.000000000, 0.000000000, 0.000000…
## $ vax_coverage <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ days_since_start <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, …
# Print min / quartiles / median / mean / max for the key model variables.
covid_model %>%
  select(
    case_fatality_rate, vax_coverage, median_age,
    gdp_per_capita, stringency_index, reproduction_rate
  ) %>%
  summary()
## case_fatality_rate vax_coverage median_age gdp_per_capita
## Min. :0.000000 Min. : 0.00 Min. :18.10 Min. : 1730
## 1st Qu.:0.005885 1st Qu.: 0.00 1st Qu.:31.90 1st Qu.:17336
## Median :0.014787 Median : 0.00 Median :39.70 Median :29481
## Mean :0.027773 Mean :10.54 Mean :37.48 Mean :30159
## 3rd Qu.:0.030259 3rd Qu.: 5.79 3rd Qu.:43.20 3rd Qu.:42659
## Max. :2.281690 Max. :84.68 Max. :48.20 Max. :94278
## stringency_index reproduction_rate
## Min. : 0.00 Min. :0.110
## 1st Qu.: 45.65 1st Qu.:0.890
## Median : 58.04 Median :1.040
## Mean : 58.47 Mean :1.075
## 3rd Qu.: 71.76 3rd Qu.:1.220
## Max. :100.00 Max. :4.650
The data set contains repeated country-level observations over time,
so each row represents a country-date combination rather than a single
independent country snapshot.
That matters because the same country appears on many different dates,
so the observations are useful for trend analysis, but they are not
independent of one another in the way that rows in a one-row-per-country
data set would be.
For that reason, the report focuses on patterns, associations, and
model fit rather than claiming direct causation.