COVID-19 dataset is a collection of the COVID-19 data maintained by Our World in Data. The dataset contains variables pertaining to the following: Vaccinations, Tests & positivity, Hospital & ICU, Confirmed cases, Confirmed deaths, Reproduction rate, Policy responses, and Other variables of interest, to know more about the data set visit Our world in data.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
After downloading the dataset, reading of the dataset into a data frame was done using the read.csv() function and assigned to “covid_data”.
covid_data <- read.csv("owid-covid-data.csv")
To confirm that reading of the CSV file was sucessful, the head() function was use to view the first two rows
head(covid_data,2)
## iso_code continent location date total_cases new_cases
## 1 AFG Asia Afghanistan 2020-01-03 NA 0
## 2 AFG Asia Afghanistan 2020-01-04 NA 0
## new_cases_smoothed total_deaths new_deaths new_deaths_smoothed
## 1 NA NA 0 NA
## 2 NA NA 0 NA
## total_cases_per_million new_cases_per_million new_cases_smoothed_per_million
## 1 NA 0 NA
## 2 NA 0 NA
## total_deaths_per_million new_deaths_per_million
## 1 NA 0
## 2 NA 0
## new_deaths_smoothed_per_million reproduction_rate icu_patients
## 1 NA NA NA
## 2 NA NA NA
## icu_patients_per_million hosp_patients hosp_patients_per_million
## 1 NA NA NA
## 2 NA NA NA
## weekly_icu_admissions weekly_icu_admissions_per_million
## 1 NA NA
## 2 NA NA
## weekly_hosp_admissions weekly_hosp_admissions_per_million total_tests
## 1 NA NA NA
## 2 NA NA NA
## new_tests total_tests_per_thousand new_tests_per_thousand new_tests_smoothed
## 1 NA NA NA NA
## 2 NA NA NA NA
## new_tests_smoothed_per_thousand positive_rate tests_per_case tests_units
## 1 NA NA NA
## 2 NA NA NA
## total_vaccinations people_vaccinated people_fully_vaccinated total_boosters
## 1 NA NA NA NA
## 2 NA NA NA NA
## new_vaccinations new_vaccinations_smoothed total_vaccinations_per_hundred
## 1 NA NA NA
## 2 NA NA NA
## people_vaccinated_per_hundred people_fully_vaccinated_per_hundred
## 1 NA NA
## 2 NA NA
## total_boosters_per_hundred new_vaccinations_smoothed_per_million
## 1 NA NA
## 2 NA NA
## new_people_vaccinated_smoothed new_people_vaccinated_smoothed_per_hundred
## 1 NA NA
## 2 NA NA
## stringency_index population_density median_age aged_65_older aged_70_older
## 1 0 54.422 18.6 2.581 1.337
## 2 0 54.422 18.6 2.581 1.337
## gdp_per_capita extreme_poverty cardiovasc_death_rate diabetes_prevalence
## 1 1803.987 NA 597.029 9.59
## 2 1803.987 NA 597.029 9.59
## female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand
## 1 NA NA 37.746 0.5
## 2 NA NA 37.746 0.5
## life_expectancy human_development_index population
## 1 64.83 0.511 41128772
## 2 64.83 0.511 41128772
## excess_mortality_cumulative_absolute excess_mortality_cumulative
## 1 NA NA
## 2 NA NA
## excess_mortality excess_mortality_cumulative_per_million
## 1 NA NA
## 2 NA NA
# The View() function can also be used to view the whole data frame
view(covid_data)
view(tail(covid_data))
colnames(covid_data) # To view all the available columns
## [1] "iso_code"
## [2] "continent"
## [3] "location"
## [4] "date"
## [5] "total_cases"
## [6] "new_cases"
## [7] "new_cases_smoothed"
## [8] "total_deaths"
## [9] "new_deaths"
## [10] "new_deaths_smoothed"
## [11] "total_cases_per_million"
## [12] "new_cases_per_million"
## [13] "new_cases_smoothed_per_million"
## [14] "total_deaths_per_million"
## [15] "new_deaths_per_million"
## [16] "new_deaths_smoothed_per_million"
## [17] "reproduction_rate"
## [18] "icu_patients"
## [19] "icu_patients_per_million"
## [20] "hosp_patients"
## [21] "hosp_patients_per_million"
## [22] "weekly_icu_admissions"
## [23] "weekly_icu_admissions_per_million"
## [24] "weekly_hosp_admissions"
## [25] "weekly_hosp_admissions_per_million"
## [26] "total_tests"
## [27] "new_tests"
## [28] "total_tests_per_thousand"
## [29] "new_tests_per_thousand"
## [30] "new_tests_smoothed"
## [31] "new_tests_smoothed_per_thousand"
## [32] "positive_rate"
## [33] "tests_per_case"
## [34] "tests_units"
## [35] "total_vaccinations"
## [36] "people_vaccinated"
## [37] "people_fully_vaccinated"
## [38] "total_boosters"
## [39] "new_vaccinations"
## [40] "new_vaccinations_smoothed"
## [41] "total_vaccinations_per_hundred"
## [42] "people_vaccinated_per_hundred"
## [43] "people_fully_vaccinated_per_hundred"
## [44] "total_boosters_per_hundred"
## [45] "new_vaccinations_smoothed_per_million"
## [46] "new_people_vaccinated_smoothed"
## [47] "new_people_vaccinated_smoothed_per_hundred"
## [48] "stringency_index"
## [49] "population_density"
## [50] "median_age"
## [51] "aged_65_older"
## [52] "aged_70_older"
## [53] "gdp_per_capita"
## [54] "extreme_poverty"
## [55] "cardiovasc_death_rate"
## [56] "diabetes_prevalence"
## [57] "female_smokers"
## [58] "male_smokers"
## [59] "handwashing_facilities"
## [60] "hospital_beds_per_thousand"
## [61] "life_expectancy"
## [62] "human_development_index"
## [63] "population"
## [64] "excess_mortality_cumulative_absolute"
## [65] "excess_mortality_cumulative"
## [66] "excess_mortality"
## [67] "excess_mortality_cumulative_per_million"
covid_data1 <- select(covid_data,iso_code,continent,location,date,total_cases,new_cases,
total_deaths, total_vaccinations,population)
# Using the head() function to view the first 3 rows
head(covid_data1, 3)
## iso_code continent location date total_cases new_cases total_deaths
## 1 AFG Asia Afghanistan 2020-01-03 NA 0 NA
## 2 AFG Asia Afghanistan 2020-01-04 NA 0 NA
## 3 AFG Asia Afghanistan 2020-01-05 NA 0 NA
## total_vaccinations population
## 1 NA 41128772
## 2 NA 41128772
## 3 NA 41128772
# The unique() function was used to few the various continent, it was observed that there are rows with blank continent elements
unique(covid_data1$continent)
## [1] "Asia" "" "Europe" "Africa"
## [5] "Oceania" "North America" "South America"
# To filter rows where the continent is one of the specified continents, you might want to use the %in% operator instead of ==:(i.e without blank continent observation)
covid_data2 <- covid_data1 %>% filter(continent %in% c("Asia", "Europe", "Africa",
"Oceania", "North America", "South America"))
tail(covid_data2) # To confirm the new data frame was successfully created, by viewing the last few rows
## iso_code continent location date total_cases new_cases
## 338568 ZWE Africa Zimbabwe 2023-11-04 265848 0
## 338569 ZWE Africa Zimbabwe 2023-11-05 265848 0
## 338570 ZWE Africa Zimbabwe 2023-11-06 265848 0
## 338571 ZWE Africa Zimbabwe 2023-11-07 265848 0
## 338572 ZWE Africa Zimbabwe 2023-11-08 265848 0
## 338573 ZWE Africa Zimbabwe 2023-11-09 265848 0
## total_deaths total_vaccinations population
## 338568 5723 NA 16320539
## 338569 5723 NA 16320539
## 338570 5723 NA 16320539
## 338571 5723 NA 16320539
## 338572 5723 NA 16320539
## 338573 5723 NA 16320539
unique(covid_data2$continent) # The data frame does not include row with blank continents
## [1] "Asia" "Europe" "Africa" "Oceania"
## [5] "North America" "South America"
The function summary() is use to get a summary of the variaous attributes, from the summary it was observed that there were NA values in some of the attribute, also the date column is of the character data type rather than the supposed date data type. After further inspection into the data set it was observed that the NA values are best represented with zeros.
summary(covid_data2)
## iso_code continent location date
## Length:338573 Length:338573 Length:338573 Length:338573
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## total_cases new_cases total_deaths total_vaccinations
## Min. : 1 Min. : 0 Min. : 1 Min. :0.000e+00
## 1st Qu.: 7588 1st Qu.: 0 1st Qu.: 121 1st Qu.:1.274e+06
## Median : 60967 Median : 0 Median : 1016 Median :7.880e+06
## Mean : 1692960 Mean : 2346 Mean : 21975 Mean :8.703e+07
## 3rd Qu.: 553917 3rd Qu.: 153 3rd Qu.: 8713 3rd Qu.:3.879e+07
## Max. :103436829 Max. :6966046 Max. :1138309 Max. :3.491e+09
## NA's :37783 NA's :9602 NA's :59352 NA's :271237
## population
## Min. :4.700e+01
## 1st Qu.:3.958e+05
## Median :5.466e+06
## Mean :3.342e+07
## 3rd Qu.:2.213e+07
## Max. :1.426e+09
##
view(covid_data2)
To replace the Na values with zero the replace() function was used while assigning it to “covid_data3”
covid_data3 <- replace(covid_data2, is.na(covid_data2), 0)
covid_data3$date <- as.Date(covid_data3$date) # The date column was changed from character type to date data type
summary(covid_data3)
## iso_code continent location date
## Length:338573 Length:338573 Length:338573 Min. :2020-01-01
## Class :character Class :character Class :character 1st Qu.:2020-12-22
## Mode :character Mode :character Mode :character Median :2021-12-07
## Mean :2021-12-06
## 3rd Qu.:2022-11-21
## Max. :2023-11-12
## total_cases new_cases total_deaths total_vaccinations
## Min. : 0 Min. : 0 Min. : 0 Min. :0.000e+00
## 1st Qu.: 2166 1st Qu.: 0 1st Qu.: 15 1st Qu.:0.000e+00
## Median : 35202 Median : 0 Median : 386 Median :0.000e+00
## Mean : 1504034 Mean : 2280 Mean : 18123 Mean :1.731e+07
## 3rd Qu.: 420108 3rd Qu.: 136 3rd Qu.: 5719 3rd Qu.:0.000e+00
## Max. :103436829 Max. :6966046 Max. :1138309 Max. :3.491e+09
## population
## Min. :4.700e+01
## 1st Qu.:3.958e+05
## Median :5.466e+06
## Mean :3.342e+07
## 3rd Qu.:2.213e+07
## Max. :1.426e+09
str(covid_data3)
## 'data.frame': 338573 obs. of 9 variables:
## $ iso_code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ continent : chr "Asia" "Asia" "Asia" "Asia" ...
## $ location : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ date : Date, format: "2020-01-03" "2020-01-04" ...
## $ total_cases : num 0 0 0 0 0 0 0 0 0 0 ...
## $ new_cases : num 0 0 0 0 0 0 0 0 0 0 ...
## $ total_deaths : num 0 0 0 0 0 0 0 0 0 0 ...
## $ total_vaccinations: num 0 0 0 0 0 0 0 0 0 0 ...
## $ population : num 41128772 41128772 41128772 41128772 41128772 ...
Using the mutate() function to add new columns to the data set, which calculates the death rate% = (total deaths/total cases) * 100, the death rate by population% = (total deaths/population) * 100,lastly, the infected_rate_by population% = (total cases/population) * 100
covid_data4 <- covid_data3 %>% mutate(death_rate_by_percent = (total_deaths/total_cases)*100,
death_rate_by_percent_and_population = (total_deaths/population)*100,
infected_rate_by_percent_and_population = (total_cases/population)*100)
tail(covid_data4,3) # This confirms the addition of the new attributes
## iso_code continent location date total_cases new_cases
## 338571 ZWE Africa Zimbabwe 2023-11-07 265848 0
## 338572 ZWE Africa Zimbabwe 2023-11-08 265848 0
## 338573 ZWE Africa Zimbabwe 2023-11-09 265848 0
## total_deaths total_vaccinations population death_rate_by_percent
## 338571 5723 0 16320539 2.152734
## 338572 5723 0 16320539 2.152734
## 338573 5723 0 16320539 2.152734
## death_rate_by_percent_and_population
## 338571 0.03506624
## 338572 0.03506624
## 338573 0.03506624
## infected_rate_by_percent_and_population
## 338571 1.628917
## 338572 1.628917
## 338573 1.628917
most_infected <- covid_data4 %>% group_by(location) %>%
summarise(infected_rate_by_percent_and_population = max(infected_rate_by_percent_and_population))%>%
filter(infected_rate_by_percent_and_population == max(infected_rate_by_percent_and_population))
most_infected
## # A tibble: 1 × 2
## location infected_rate_by_percent_and_population
## <chr> <dbl>
## 1 Cyprus 73.8
most_deaths <- covid_data4 %>% group_by(location) %>%
summarise(death_rate_by_percent_and_population = max(death_rate_by_percent_and_population))%>%
filter(death_rate_by_percent_and_population == max(death_rate_by_percent_and_population))
most_deaths
## # A tibble: 1 × 2
## location death_rate_by_percent_and_population
## <chr> <dbl>
## 1 Peru 0.651
least_infected <- covid_data4 %>% group_by(location) %>% slice_max(total_cases) %>% slice_max(date) %>%
summarise(infected_rate_by_percent_and_population = max(infected_rate_by_percent_and_population)) %>%
filter(infected_rate_by_percent_and_population != 0) %>%
filter(infected_rate_by_percent_and_population == min(infected_rate_by_percent_and_population))
least_infected
## # A tibble: 1 × 2
## location infected_rate_by_percent_and_population
## <chr> <dbl>
## 1 Yemen 0.0354
least_deaths <- covid_data4 %>% group_by(location) %>% slice_max(total_cases) %>% slice_max(date) %>%
summarise(death_rate_by_percent_and_population = max(death_rate_by_percent_and_population)) %>%
filter(death_rate_by_percent_and_population != 0) %>%
filter(death_rate_by_percent_and_population == min(death_rate_by_percent_and_population))
least_deaths
## # A tibble: 1 × 2
## location death_rate_by_percent_and_population
## <chr> <dbl>
## 1 Burundi 0.000116
covid_data5 <- covid_data4 %>% group_by(continent) %>% summarise(Total_cases = sum(new_cases))
covid_data5
## # A tibble: 6 × 2
## continent Total_cases
## <chr> <dbl>
## 1 Africa 13116765
## 2 Asia 300797149
## 3 Europe 250114367
## 4 North America 124498875
## 5 Oceania 14526018
## 6 South America 68847969
ggplot(covid_data5, aes(x = reorder(continent, Total_cases), y = Total_cases, fill = continent)) +
geom_bar(stat = "identity") +
labs(title = "Total COVID-19 Cases by Continent",
x = "Continent",
y = "Total Cases") +
theme_minimal()
covid_data4 %>% filter(location=="Cyprus") %>% ggplot(aes(x = date, y = new_cases)) +
geom_line() +
labs(title = "Daily new confirmed COVID-19 cases",
x = "Date",
y = "New Cases") +
theme_minimal()
## conclusion