Data Analytics

ANALYSIS OF COVID-19 DATASET

COVID-19 dataset is a collection of the COVID-19 data maintained by Our World in Data. The dataset contains variables pertaining to the following: Vaccinations, Tests & positivity, Hospital & ICU, Confirmed cases, Confirmed deaths, Reproduction rate, Policy responses, and Other variables of interest, to know more about the data set visit Our world in data.

Importation of the packages to be used

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)

Reading the data into R

After downloading the dataset, reading of the dataset into a data frame was done using the read.csv() function and assigned to “covid_data”.

covid_data <- read.csv("owid-covid-data.csv")

To confirm that reading of the CSV file was sucessful, the head() function was use to view the first two rows

head(covid_data,2)

##   iso_code continent    location       date total_cases new_cases
## 1      AFG      Asia Afghanistan 2020-01-03          NA         0
## 2      AFG      Asia Afghanistan 2020-01-04          NA         0
##   new_cases_smoothed total_deaths new_deaths new_deaths_smoothed
## 1                 NA           NA          0                  NA
## 2                 NA           NA          0                  NA
##   total_cases_per_million new_cases_per_million new_cases_smoothed_per_million
## 1                      NA                     0                             NA
## 2                      NA                     0                             NA
##   total_deaths_per_million new_deaths_per_million
## 1                       NA                      0
## 2                       NA                      0
##   new_deaths_smoothed_per_million reproduction_rate icu_patients
## 1                              NA                NA           NA
## 2                              NA                NA           NA
##   icu_patients_per_million hosp_patients hosp_patients_per_million
## 1                       NA            NA                        NA
## 2                       NA            NA                        NA
##   weekly_icu_admissions weekly_icu_admissions_per_million
## 1                    NA                                NA
## 2                    NA                                NA
##   weekly_hosp_admissions weekly_hosp_admissions_per_million total_tests
## 1                     NA                                 NA          NA
## 2                     NA                                 NA          NA
##   new_tests total_tests_per_thousand new_tests_per_thousand new_tests_smoothed
## 1        NA                       NA                     NA                 NA
## 2        NA                       NA                     NA                 NA
##   new_tests_smoothed_per_thousand positive_rate tests_per_case tests_units
## 1                              NA            NA             NA            
## 2                              NA            NA             NA            
##   total_vaccinations people_vaccinated people_fully_vaccinated total_boosters
## 1                 NA                NA                      NA             NA
## 2                 NA                NA                      NA             NA
##   new_vaccinations new_vaccinations_smoothed total_vaccinations_per_hundred
## 1               NA                        NA                             NA
## 2               NA                        NA                             NA
##   people_vaccinated_per_hundred people_fully_vaccinated_per_hundred
## 1                            NA                                  NA
## 2                            NA                                  NA
##   total_boosters_per_hundred new_vaccinations_smoothed_per_million
## 1                         NA                                    NA
## 2                         NA                                    NA
##   new_people_vaccinated_smoothed new_people_vaccinated_smoothed_per_hundred
## 1                             NA                                         NA
## 2                             NA                                         NA
##   stringency_index population_density median_age aged_65_older aged_70_older
## 1                0             54.422       18.6         2.581         1.337
## 2                0             54.422       18.6         2.581         1.337
##   gdp_per_capita extreme_poverty cardiovasc_death_rate diabetes_prevalence
## 1       1803.987              NA               597.029                9.59
## 2       1803.987              NA               597.029                9.59
##   female_smokers male_smokers handwashing_facilities hospital_beds_per_thousand
## 1             NA           NA                 37.746                        0.5
## 2             NA           NA                 37.746                        0.5
##   life_expectancy human_development_index population
## 1           64.83                   0.511   41128772
## 2           64.83                   0.511   41128772
##   excess_mortality_cumulative_absolute excess_mortality_cumulative
## 1                                   NA                          NA
## 2                                   NA                          NA
##   excess_mortality excess_mortality_cumulative_per_million
## 1               NA                                      NA
## 2               NA                                      NA

# The View() function can also be used to view the whole data frame

view(covid_data)

view(tail(covid_data))

colnames(covid_data) # To view all the available columns

##  [1] "iso_code"                                  
##  [2] "continent"                                 
##  [3] "location"                                  
##  [4] "date"                                      
##  [5] "total_cases"                               
##  [6] "new_cases"                                 
##  [7] "new_cases_smoothed"                        
##  [8] "total_deaths"                              
##  [9] "new_deaths"                                
## [10] "new_deaths_smoothed"                       
## [11] "total_cases_per_million"                   
## [12] "new_cases_per_million"                     
## [13] "new_cases_smoothed_per_million"            
## [14] "total_deaths_per_million"                  
## [15] "new_deaths_per_million"                    
## [16] "new_deaths_smoothed_per_million"           
## [17] "reproduction_rate"                         
## [18] "icu_patients"                              
## [19] "icu_patients_per_million"                  
## [20] "hosp_patients"                             
## [21] "hosp_patients_per_million"                 
## [22] "weekly_icu_admissions"                     
## [23] "weekly_icu_admissions_per_million"         
## [24] "weekly_hosp_admissions"                    
## [25] "weekly_hosp_admissions_per_million"        
## [26] "total_tests"                               
## [27] "new_tests"                                 
## [28] "total_tests_per_thousand"                  
## [29] "new_tests_per_thousand"                    
## [30] "new_tests_smoothed"                        
## [31] "new_tests_smoothed_per_thousand"           
## [32] "positive_rate"                             
## [33] "tests_per_case"                            
## [34] "tests_units"                               
## [35] "total_vaccinations"                        
## [36] "people_vaccinated"                         
## [37] "people_fully_vaccinated"                   
## [38] "total_boosters"                            
## [39] "new_vaccinations"                          
## [40] "new_vaccinations_smoothed"                 
## [41] "total_vaccinations_per_hundred"            
## [42] "people_vaccinated_per_hundred"             
## [43] "people_fully_vaccinated_per_hundred"       
## [44] "total_boosters_per_hundred"                
## [45] "new_vaccinations_smoothed_per_million"     
## [46] "new_people_vaccinated_smoothed"            
## [47] "new_people_vaccinated_smoothed_per_hundred"
## [48] "stringency_index"                          
## [49] "population_density"                        
## [50] "median_age"                                
## [51] "aged_65_older"                             
## [52] "aged_70_older"                             
## [53] "gdp_per_capita"                            
## [54] "extreme_poverty"                           
## [55] "cardiovasc_death_rate"                     
## [56] "diabetes_prevalence"                       
## [57] "female_smokers"                            
## [58] "male_smokers"                              
## [59] "handwashing_facilities"                    
## [60] "hospital_beds_per_thousand"                
## [61] "life_expectancy"                           
## [62] "human_development_index"                   
## [63] "population"                                
## [64] "excess_mortality_cumulative_absolute"      
## [65] "excess_mortality_cumulative"               
## [66] "excess_mortality"                          
## [67] "excess_mortality_cumulative_per_million"

Problem questions

Where is the most infected location based on population?
Which location recorded the most death based on population?
Where is the least infected location based on population?
Which location recorded the least death based on population?
Visualize the continent with the most cases
Upon getting the location with the most infected based on population, visualize the trend of the daily new cases.

To answer the problem questions, not all the columns are needed, the use of select() function was used to select the needed attributes, while assigning it to “covid_data1”

covid_data1 <- select(covid_data,iso_code,continent,location,date,total_cases,new_cases,
                      total_deaths, total_vaccinations,population)

# Using the head() function to view the first 3 rows 

head(covid_data1, 3)

##   iso_code continent    location       date total_cases new_cases total_deaths
## 1      AFG      Asia Afghanistan 2020-01-03          NA         0           NA
## 2      AFG      Asia Afghanistan 2020-01-04          NA         0           NA
## 3      AFG      Asia Afghanistan 2020-01-05          NA         0           NA
##   total_vaccinations population
## 1                 NA   41128772
## 2                 NA   41128772
## 3                 NA   41128772

# The unique() function was used to few the various continent, it was observed that there are rows with blank continent elements

unique(covid_data1$continent)

## [1] "Asia"          ""              "Europe"        "Africa"       
## [5] "Oceania"       "North America" "South America"

# To filter rows where the continent is one of the specified continents, you might want to use the %in% operator instead of ==:(i.e without blank continent observation)

covid_data2 <- covid_data1 %>% filter(continent %in% c("Asia", "Europe", "Africa",
                                                       "Oceania", "North America", "South America"))
tail(covid_data2) # To confirm the new data frame was successfully created, by viewing the last few rows

##        iso_code continent location       date total_cases new_cases
## 338568      ZWE    Africa Zimbabwe 2023-11-04      265848         0
## 338569      ZWE    Africa Zimbabwe 2023-11-05      265848         0
## 338570      ZWE    Africa Zimbabwe 2023-11-06      265848         0
## 338571      ZWE    Africa Zimbabwe 2023-11-07      265848         0
## 338572      ZWE    Africa Zimbabwe 2023-11-08      265848         0
## 338573      ZWE    Africa Zimbabwe 2023-11-09      265848         0
##        total_deaths total_vaccinations population
## 338568         5723                 NA   16320539
## 338569         5723                 NA   16320539
## 338570         5723                 NA   16320539
## 338571         5723                 NA   16320539
## 338572         5723                 NA   16320539
## 338573         5723                 NA   16320539

unique(covid_data2$continent) # The data frame does not include row with blank continents

## [1] "Asia"          "Europe"        "Africa"        "Oceania"      
## [5] "North America" "South America"

The function summary() is use to get a summary of the variaous attributes, from the summary it was observed that there were NA values in some of the attribute, also the date column is of the character data type rather than the supposed date data type. After further inspection into the data set it was observed that the NA values are best represented with zeros.

summary(covid_data2)

##    iso_code          continent           location             date          
##  Length:338573      Length:338573      Length:338573      Length:338573     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   total_cases          new_cases        total_deaths     total_vaccinations 
##  Min.   :        1   Min.   :      0   Min.   :      1   Min.   :0.000e+00  
##  1st Qu.:     7588   1st Qu.:      0   1st Qu.:    121   1st Qu.:1.274e+06  
##  Median :    60967   Median :      0   Median :   1016   Median :7.880e+06  
##  Mean   :  1692960   Mean   :   2346   Mean   :  21975   Mean   :8.703e+07  
##  3rd Qu.:   553917   3rd Qu.:    153   3rd Qu.:   8713   3rd Qu.:3.879e+07  
##  Max.   :103436829   Max.   :6966046   Max.   :1138309   Max.   :3.491e+09  
##  NA's   :37783       NA's   :9602      NA's   :59352     NA's   :271237     
##    population       
##  Min.   :4.700e+01  
##  1st Qu.:3.958e+05  
##  Median :5.466e+06  
##  Mean   :3.342e+07  
##  3rd Qu.:2.213e+07  
##  Max.   :1.426e+09  
##

view(covid_data2)

To replace the Na values with zero the replace() function was used while assigning it to “covid_data3”

covid_data3 <- replace(covid_data2, is.na(covid_data2), 0)

covid_data3$date <- as.Date(covid_data3$date) # The date column was changed from character type to date data type

summary(covid_data3)

##    iso_code          continent           location              date           
##  Length:338573      Length:338573      Length:338573      Min.   :2020-01-01  
##  Class :character   Class :character   Class :character   1st Qu.:2020-12-22  
##  Mode  :character   Mode  :character   Mode  :character   Median :2021-12-07  
##                                                           Mean   :2021-12-06  
##                                                           3rd Qu.:2022-11-21  
##                                                           Max.   :2023-11-12  
##   total_cases          new_cases        total_deaths     total_vaccinations 
##  Min.   :        0   Min.   :      0   Min.   :      0   Min.   :0.000e+00  
##  1st Qu.:     2166   1st Qu.:      0   1st Qu.:     15   1st Qu.:0.000e+00  
##  Median :    35202   Median :      0   Median :    386   Median :0.000e+00  
##  Mean   :  1504034   Mean   :   2280   Mean   :  18123   Mean   :1.731e+07  
##  3rd Qu.:   420108   3rd Qu.:    136   3rd Qu.:   5719   3rd Qu.:0.000e+00  
##  Max.   :103436829   Max.   :6966046   Max.   :1138309   Max.   :3.491e+09  
##    population       
##  Min.   :4.700e+01  
##  1st Qu.:3.958e+05  
##  Median :5.466e+06  
##  Mean   :3.342e+07  
##  3rd Qu.:2.213e+07  
##  Max.   :1.426e+09

str(covid_data3)

## 'data.frame':    338573 obs. of  9 variables:
##  $ iso_code          : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ continent         : chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ location          : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ date              : Date, format: "2020-01-03" "2020-01-04" ...
##  $ total_cases       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_cases         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_deaths      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ total_vaccinations: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ population        : num  41128772 41128772 41128772 41128772 41128772 ...

Using the mutate() function to add new columns to the data set, which calculates the death rate% = (total deaths/total cases) * 100, the death rate by population% = (total deaths/population) * 100,lastly, the infected_rate_by population% = (total cases/population) * 100

covid_data4 <- covid_data3 %>% mutate(death_rate_by_percent = (total_deaths/total_cases)*100, 
                                      death_rate_by_percent_and_population = (total_deaths/population)*100,
                                      infected_rate_by_percent_and_population = (total_cases/population)*100)

tail(covid_data4,3) # This confirms the addition of the new attributes

##        iso_code continent location       date total_cases new_cases
## 338571      ZWE    Africa Zimbabwe 2023-11-07      265848         0
## 338572      ZWE    Africa Zimbabwe 2023-11-08      265848         0
## 338573      ZWE    Africa Zimbabwe 2023-11-09      265848         0
##        total_deaths total_vaccinations population death_rate_by_percent
## 338571         5723                  0   16320539              2.152734
## 338572         5723                  0   16320539              2.152734
## 338573         5723                  0   16320539              2.152734
##        death_rate_by_percent_and_population
## 338571                           0.03506624
## 338572                           0.03506624
## 338573                           0.03506624
##        infected_rate_by_percent_and_population
## 338571                                1.628917
## 338572                                1.628917
## 338573                                1.628917

To get the location with the most infected population wise

most_infected <- covid_data4 %>% group_by(location) %>% 
  summarise(infected_rate_by_percent_and_population = max(infected_rate_by_percent_and_population))%>% 
  filter(infected_rate_by_percent_and_population == max(infected_rate_by_percent_and_population))

most_infected

## # A tibble: 1 × 2
##   location infected_rate_by_percent_and_population
##   <chr>                                      <dbl>
## 1 Cyprus                                      73.8

To get the location with the most deaths population wise

most_deaths <- covid_data4 %>% group_by(location) %>% 
  summarise(death_rate_by_percent_and_population = max(death_rate_by_percent_and_population))%>% 
  filter(death_rate_by_percent_and_population == max(death_rate_by_percent_and_population))

most_deaths

## # A tibble: 1 × 2
##   location death_rate_by_percent_and_population
##   <chr>                                   <dbl>
## 1 Peru                                    0.651

To get the location with the least infected population wise

least_infected <- covid_data4 %>% group_by(location) %>% slice_max(total_cases) %>% slice_max(date) %>% 
  summarise(infected_rate_by_percent_and_population = max(infected_rate_by_percent_and_population)) %>% 
  filter(infected_rate_by_percent_and_population != 0) %>% 
  filter(infected_rate_by_percent_and_population == min(infected_rate_by_percent_and_population))

least_infected

## # A tibble: 1 × 2
##   location infected_rate_by_percent_and_population
##   <chr>                                      <dbl>
## 1 Yemen                                     0.0354

To get the location with the least deaths population wise

least_deaths <- covid_data4 %>% group_by(location) %>% slice_max(total_cases) %>% slice_max(date) %>% 
  summarise(death_rate_by_percent_and_population = max(death_rate_by_percent_and_population)) %>% 
  filter(death_rate_by_percent_and_population != 0) %>% 
  filter(death_rate_by_percent_and_population == min(death_rate_by_percent_and_population))

least_deaths

## # A tibble: 1 × 2
##   location death_rate_by_percent_and_population
##   <chr>                                   <dbl>
## 1 Burundi                              0.000116

TO get a data frame with the continents with the total number of cases

covid_data5 <- covid_data4 %>% group_by(continent) %>% summarise(Total_cases = sum(new_cases))

covid_data5

## # A tibble: 6 × 2
##   continent     Total_cases
##   <chr>               <dbl>
## 1 Africa           13116765
## 2 Asia            300797149
## 3 Europe          250114367
## 4 North America   124498875
## 5 Oceania          14526018
## 6 South America    68847969

Visualizing the continents and the total cases in form of a bar chart

ggplot(covid_data5, aes(x = reorder(continent, Total_cases), y = Total_cases, fill = continent)) +
  geom_bar(stat = "identity") +
  labs(title = "Total COVID-19 Cases by Continent",
       x = "Continent",
       y = "Total Cases") +
  theme_minimal()

knowing that cyprus is the location with the highest infection rate population wise, to visualize the trend of new cases with date.

covid_data4 %>% filter(location=="Cyprus") %>%  ggplot(aes(x = date, y = new_cases)) +
  geom_line() +
  labs(title = "Daily new confirmed COVID-19 cases",
       x = "Date",
       y = "New Cases") +
  theme_minimal()

## conclusion