📊 Introduction

This project performs an exploratory data analysis (EDA) of the global COVID-19 dataset published by Our World in Data, using R and the tidyverse.

📥 Load Libraries & Data

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tibble' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.3
## 
## Attaching package: 'reshape2'
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Read the OWID COVID-19 CSV straight from GitHub into a base data.frame.
# NOTE(review): blank fields in character columns are read as "" rather than
# NA (see tests_units below), so is.na() will not count them -- confirm whether
# na.strings should also include "".
covid_data <- read.csv(
  file = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
)

Level 1: Understanding Data

Question 1: What is the structure of the dataset?

# Compact overview: one line per column with its type and first few values.
utils::str(object = covid_data)
## 'data.frame':    429435 obs. of  67 variables:
##  $ iso_code                                  : chr  "AFG" "AFG" "AFG" "AFG" ...
##  $ continent                                 : chr  "Asia" "Asia" "Asia" "Asia" ...
##  $ location                                  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ date                                      : chr  "2020-01-05" "2020-01-06" "2020-01-07" "2020-01-08" ...
##  $ total_cases                               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_cases                                 : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_cases_smoothed                        : num  NA NA NA NA NA 0 0 0 0 0 ...
##  $ total_deaths                              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_deaths                                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_deaths_smoothed                       : num  NA NA NA NA NA 0 0 0 0 0 ...
##  $ total_cases_per_million                   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_cases_per_million                     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_cases_smoothed_per_million            : num  NA NA NA NA NA 0 0 0 0 0 ...
##  $ total_deaths_per_million                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_deaths_per_million                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ new_deaths_smoothed_per_million           : num  NA NA NA NA NA 0 0 0 0 0 ...
##  $ reproduction_rate                         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients                              : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients_per_million                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients                             : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients_per_million                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions_per_million         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions                    : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions_per_million        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests                               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests                                 : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests_per_thousand                  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_per_thousand                    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed                        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed_per_thousand           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ positive_rate                             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_per_case                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_units                               : chr  "" "" "" "" ...
##  $ total_vaccinations                        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated                         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_boosters                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations                          : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_vaccinations_per_hundred            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated_per_hundred             : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated_per_hundred       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ total_boosters_per_hundred                : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed_per_million     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_people_vaccinated_smoothed            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ new_people_vaccinated_smoothed_per_hundred: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ stringency_index                          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ population_density                        : num  54.4 54.4 54.4 54.4 54.4 ...
##  $ median_age                                : num  18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 ...
##  $ aged_65_older                             : num  2.58 2.58 2.58 2.58 2.58 2.58 2.58 2.58 2.58 2.58 ...
##  $ aged_70_older                             : num  1.34 1.34 1.34 1.34 1.34 1.34 1.34 1.34 1.34 1.34 ...
##  $ gdp_per_capita                            : num  1804 1804 1804 1804 1804 ...
##  $ extreme_poverty                           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ cardiovasc_death_rate                     : num  597 597 597 597 597 ...
##  $ diabetes_prevalence                       : num  9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 ...
##  $ female_smokers                            : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ male_smokers                              : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ handwashing_facilities                    : num  37.8 37.8 37.8 37.8 37.8 ...
##  $ hospital_beds_per_thousand                : num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ life_expectancy                           : num  64.8 64.8 64.8 64.8 64.8 ...
##  $ human_development_index                   : num  0.51 0.51 0.51 0.51 0.51 0.51 0.51 0.51 0.51 0.51 ...
##  $ population                                : num  41128772 41128772 41128772 41128772 41128772 ...
##  $ excess_mortality_cumulative_absolute      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality_cumulative               : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality                          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ excess_mortality_cumulative_per_million   : num  NA NA NA NA NA NA NA NA NA NA ...

Question 2: What are the dimensions of the dataset?

# Number of rows followed by number of columns.
c(nrow(covid_data), ncol(covid_data))
## [1] 429435     67

Question 3: What are the column names of the dataset?

# Column names of the data frame, in their original order.
names(covid_data)
##  [1] "iso_code"                                  
##  [2] "continent"                                 
##  [3] "location"                                  
##  [4] "date"                                      
##  [5] "total_cases"                               
##  [6] "new_cases"                                 
##  [7] "new_cases_smoothed"                        
##  [8] "total_deaths"                              
##  [9] "new_deaths"                                
## [10] "new_deaths_smoothed"                       
## [11] "total_cases_per_million"                   
## [12] "new_cases_per_million"                     
## [13] "new_cases_smoothed_per_million"            
## [14] "total_deaths_per_million"                  
## [15] "new_deaths_per_million"                    
## [16] "new_deaths_smoothed_per_million"           
## [17] "reproduction_rate"                         
## [18] "icu_patients"                              
## [19] "icu_patients_per_million"                  
## [20] "hosp_patients"                             
## [21] "hosp_patients_per_million"                 
## [22] "weekly_icu_admissions"                     
## [23] "weekly_icu_admissions_per_million"         
## [24] "weekly_hosp_admissions"                    
## [25] "weekly_hosp_admissions_per_million"        
## [26] "total_tests"                               
## [27] "new_tests"                                 
## [28] "total_tests_per_thousand"                  
## [29] "new_tests_per_thousand"                    
## [30] "new_tests_smoothed"                        
## [31] "new_tests_smoothed_per_thousand"           
## [32] "positive_rate"                             
## [33] "tests_per_case"                            
## [34] "tests_units"                               
## [35] "total_vaccinations"                        
## [36] "people_vaccinated"                         
## [37] "people_fully_vaccinated"                   
## [38] "total_boosters"                            
## [39] "new_vaccinations"                          
## [40] "new_vaccinations_smoothed"                 
## [41] "total_vaccinations_per_hundred"            
## [42] "people_vaccinated_per_hundred"             
## [43] "people_fully_vaccinated_per_hundred"       
## [44] "total_boosters_per_hundred"                
## [45] "new_vaccinations_smoothed_per_million"     
## [46] "new_people_vaccinated_smoothed"            
## [47] "new_people_vaccinated_smoothed_per_hundred"
## [48] "stringency_index"                          
## [49] "population_density"                        
## [50] "median_age"                                
## [51] "aged_65_older"                             
## [52] "aged_70_older"                             
## [53] "gdp_per_capita"                            
## [54] "extreme_poverty"                           
## [55] "cardiovasc_death_rate"                     
## [56] "diabetes_prevalence"                       
## [57] "female_smokers"                            
## [58] "male_smokers"                              
## [59] "handwashing_facilities"                    
## [60] "hospital_beds_per_thousand"                
## [61] "life_expectancy"                           
## [62] "human_development_index"                   
## [63] "population"                                
## [64] "excess_mortality_cumulative_absolute"      
## [65] "excess_mortality_cumulative"               
## [66] "excess_mortality"                          
## [67] "excess_mortality_cumulative_per_million"

Question 4: What is the summary of the dataset?

# Per-column summary: quantiles, mean and NA count for numeric columns,
# length/class/mode for character columns.
summary(object = covid_data)
##    iso_code          continent           location             date          
##  Length:429435      Length:429435      Length:429435      Length:429435     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   total_cases          new_cases        new_cases_smoothed   total_deaths    
##  Min.   :        0   Min.   :       0   Min.   :      0.0   Min.   :      0  
##  1st Qu.:     6281   1st Qu.:       0   1st Qu.:      0.0   1st Qu.:     43  
##  Median :    63653   Median :       0   Median :     12.0   Median :    799  
##  Mean   :  7365292   Mean   :    8017   Mean   :   8041.0   Mean   :  81260  
##  3rd Qu.:   758272   3rd Qu.:       0   3rd Qu.:    313.3   3rd Qu.:   9574  
##  Max.   :775866783   Max.   :44236227   Max.   :6319461.0   Max.   :7057132  
##  NA's   :17631       NA's   :19276      NA's   :20506       NA's   :17631    
##    new_deaths        new_deaths_smoothed total_cases_per_million
##  Min.   :     0.00   Min.   :    0.00    Min.   :     0         
##  1st Qu.:     0.00   1st Qu.:    0.00    1st Qu.:  1916         
##  Median :     0.00   Median :    0.00    Median : 29145         
##  Mean   :    71.85   Mean   :   72.06    Mean   :112096         
##  3rd Qu.:     0.00   3rd Qu.:    3.14    3rd Qu.:156770         
##  Max.   :103719.00   Max.   :14817.00    Max.   :763599         
##  NA's   :18827       NA's   :20057       NA's   :17631          
##  new_cases_per_million new_cases_smoothed_per_million total_deaths_per_million
##  Min.   :     0.0      Min.   :    0.00               Min.   :   0.00         
##  1st Qu.:     0.0      1st Qu.:    0.00               1st Qu.:  24.57         
##  Median :     0.0      Median :    2.79               Median : 295.09         
##  Mean   :   122.4      Mean   :  122.71               Mean   : 835.51         
##  3rd Qu.:     0.0      3rd Qu.:   56.25               3rd Qu.:1283.82         
##  Max.   :241758.2      Max.   :34536.89               Max.   :6601.11         
##  NA's   :19276         NA's   :20506                  NA's   :17631           
##  new_deaths_per_million new_deaths_smoothed_per_million reproduction_rate
##  Min.   :  0.0000       Min.   :  0.0000                Min.   :-0.070   
##  1st Qu.:  0.0000       1st Qu.:  0.0000                1st Qu.: 0.720   
##  Median :  0.0000       Median :  0.0000                Median : 0.950   
##  Mean   :  0.7623       Mean   :  0.7645                Mean   : 0.911   
##  3rd Qu.:  0.0000       3rd Qu.:  0.3600                3rd Qu.: 1.140   
##  Max.   :893.6600       Max.   :127.6600                Max.   : 5.870   
##  NA's   :18827          NA's   :20057                   NA's   :244618   
##   icu_patients    icu_patients_per_million hosp_patients   
##  Min.   :    0    Min.   :  0.00           Min.   :     0  
##  1st Qu.:   21    1st Qu.:  2.33           1st Qu.:   186  
##  Median :   90    Median :  6.43           Median :   776  
##  Mean   :  661    Mean   : 15.66           Mean   :  3912  
##  3rd Qu.:  413    3rd Qu.: 18.78           3rd Qu.:  3051  
##  Max.   :28891    Max.   :180.68           Max.   :154497  
##  NA's   :390319   NA's   :390319           NA's   :388779  
##  hosp_patients_per_million weekly_icu_admissions
##  Min.   :   0.00           Min.   :   0.0       
##  1st Qu.:  31.00           1st Qu.:  17.0       
##  Median :  74.24           Median :  92.0       
##  Mean   : 125.99           Mean   : 317.9       
##  3rd Qu.: 159.76           3rd Qu.: 353.0       
##  Max.   :1526.85           Max.   :4838.0       
##  NA's   :388779            NA's   :418442       
##  weekly_icu_admissions_per_million weekly_hosp_admissions
##  Min.   :  0.00                    Min.   :     0        
##  1st Qu.:  1.55                    1st Qu.:   223        
##  Median :  4.64                    Median :   864        
##  Mean   :  9.67                    Mean   :  4292        
##  3rd Qu.: 12.65                    3rd Qu.:  3893        
##  Max.   :224.98                    Max.   :153977        
##  NA's   :418442                    NA's   :404938        
##  weekly_hosp_admissions_per_million  total_tests          new_tests       
##  Min.   :  0.00                     Min.   :0.000e+00   Min.   :       1  
##  1st Qu.: 23.73                     1st Qu.:3.647e+05   1st Qu.:    2244  
##  Median : 56.28                     Median :2.067e+06   Median :    8783  
##  Mean   : 82.62                     Mean   :2.110e+07   Mean   :   67285  
##  3rd Qu.:110.00                     3rd Qu.:1.025e+07   3rd Qu.:   37229  
##  Max.   :717.08                     Max.   :9.214e+09   Max.   :35855632  
##  NA's   :404938                     NA's   :350048      NA's   :354032    
##  total_tests_per_thousand new_tests_per_thousand new_tests_smoothed
##  Min.   :    0.00         Min.   :  0.00         Min.   :       0  
##  1st Qu.:   43.58         1st Qu.:  0.29         1st Qu.:    1486  
##  Median :  234.14         Median :  0.97         Median :    6570  
##  Mean   :  924.25         Mean   :  3.27         Mean   :  142178  
##  3rd Qu.:  894.38         3rd Qu.:  2.91         3rd Qu.:   32205  
##  Max.   :32925.83         Max.   :531.06         Max.   :14769984  
##  NA's   :350048           NA's   :354032         NA's   :325470    
##  new_tests_smoothed_per_thousand positive_rate    tests_per_case     
##  Min.   :  0.00                  Min.   :0.00     Min.   :      1.0  
##  1st Qu.:  0.20                  1st Qu.:0.02     1st Qu.:      7.1  
##  Median :  0.85                  Median :0.06     Median :     17.5  
##  Mean   :  2.83                  Mean   :0.10     Mean   :   2403.6  
##  3rd Qu.:  2.58                  3rd Qu.:0.14     3rd Qu.:     54.6  
##  Max.   :147.60                  Max.   :1.00     Max.   :1023631.9  
##  NA's   :325470                  NA's   :333508   NA's   :335087     
##  tests_units        total_vaccinations  people_vaccinated  
##  Length:429435      Min.   :0.000e+00   Min.   :0.000e+00  
##  Class :character   1st Qu.:1.971e+06   1st Qu.:1.050e+06  
##  Mode  :character   Median :1.439e+07   Median :6.901e+06  
##                     Mean   :5.617e+08   Mean   :2.487e+08  
##                     3rd Qu.:1.162e+08   3rd Qu.:5.093e+07  
##                     Max.   :1.358e+10   Max.   :5.631e+09  
##                     NA's   :344018      NA's   :348303     
##  people_fully_vaccinated total_boosters      new_vaccinations  
##  Min.   :1.000e+00       Min.   :1.000e+00   Min.   :       0  
##  1st Qu.:9.644e+05       1st Qu.:6.023e+05   1st Qu.:    2010  
##  Median :6.191e+06       Median :5.765e+06   Median :   20531  
##  Mean   :2.287e+08       Mean   :1.506e+08   Mean   :  739864  
##  3rd Qu.:4.773e+07       3rd Qu.:4.019e+07   3rd Qu.:  173612  
##  Max.   :5.178e+09       Max.   :2.817e+09   Max.   :49673198  
##  NA's   :351374          NA's   :375835      NA's   :358464    
##  new_vaccinations_smoothed total_vaccinations_per_hundred
##  Min.   :       0          Min.   :  0.00                
##  1st Qu.:     279          1st Qu.: 44.77                
##  Median :    3871          Median :130.55                
##  Mean   :  283876          Mean   :124.28                
##  3rd Qu.:   31803          3rd Qu.:194.99                
##  Max.   :43691814          Max.   :410.23                
##  NA's   :234406            NA's   :344018                
##  people_vaccinated_per_hundred people_fully_vaccinated_per_hundred
##  Min.   :  0.00                Min.   :  0.00                     
##  1st Qu.: 27.88                1st Qu.: 21.22                     
##  Median : 64.30                Median : 57.92                     
##  Mean   : 53.50                Mean   : 48.68                     
##  3rd Qu.: 77.78                3rd Qu.: 73.61                     
##  Max.   :129.07                Max.   :126.89                     
##  NA's   :348303                NA's   :351374                     
##  total_boosters_per_hundred new_vaccinations_smoothed_per_million
##  Min.   :  0.00             Min.   :     0                       
##  1st Qu.:  5.92             1st Qu.:   106                       
##  Median : 35.91             Median :   605                       
##  Mean   : 36.30             Mean   :  1851                       
##  3rd Qu.: 57.62             3rd Qu.:  2402                       
##  Max.   :150.47             Max.   :117113                       
##  NA's   :375835             NA's   :234406                       
##  new_people_vaccinated_smoothed new_people_vaccinated_smoothed_per_hundred
##  Min.   :       0               Min.   : 0.000                            
##  1st Qu.:      43               1st Qu.: 0.000                            
##  Median :     771               Median : 0.010                            
##  Mean   :  106071               Mean   : 0.075                            
##  3rd Qu.:    9307               3rd Qu.: 0.070                            
##  Max.   :21071266               Max.   :11.710                            
##  NA's   :237258                 NA's   :237258                            
##  stringency_index population_density   median_age    aged_65_older   
##  Min.   :  0.00   Min.   :    0.14   Min.   :15.10   Min.   : 1.140  
##  1st Qu.: 22.22   1st Qu.:   37.73   1st Qu.:22.20   1st Qu.: 3.530  
##  Median : 42.85   Median :   88.12   Median :29.70   Median : 6.290  
##  Mean   : 42.88   Mean   :  394.07   Mean   :30.46   Mean   : 8.684  
##  3rd Qu.: 62.04   3rd Qu.:  222.87   3rd Qu.:38.70   3rd Qu.:13.930  
##  Max.   :100.00   Max.   :20546.77   Max.   :48.20   Max.   :27.050  
##  NA's   :233245   NA's   :68943      NA's   :94772   NA's   :106165  
##  aged_70_older    gdp_per_capita     extreme_poverty  cardiovasc_death_rate
##  Min.   : 0.530   Min.   :   661.2   Min.   : 0.10    Min.   : 79.37       
##  1st Qu.: 2.060   1st Qu.:  4227.6   1st Qu.: 0.60    1st Qu.:175.70       
##  Median : 3.870   Median : 12294.9   Median : 2.50    Median :245.46       
##  Mean   : 5.486   Mean   : 18904.2   Mean   :13.93    Mean   :264.64       
##  3rd Qu.: 8.640   3rd Qu.: 27216.4   3rd Qu.:21.40    3rd Qu.:333.44       
##  Max.   :18.490   Max.   :116935.6   Max.   :77.60    Max.   :724.42       
##  NA's   :98120    NA's   :101143     NA's   :217439   NA's   :100570       
##  diabetes_prevalence female_smokers    male_smokers    handwashing_facilities
##  Min.   : 0.990      Min.   : 0.10    Min.   : 7.7     Min.   :  1.19        
##  1st Qu.: 5.350      1st Qu.: 1.90    1st Qu.:22.6     1st Qu.: 20.86        
##  Median : 7.200      Median : 6.30    Median :33.1     Median : 49.54        
##  Mean   : 8.556      Mean   :10.77    Mean   :33.1     Mean   : 50.65        
##  3rd Qu.:10.790      3rd Qu.:19.30    3rd Qu.:41.5     3rd Qu.: 82.50        
##  Max.   :30.530      Max.   :44.00    Max.   :78.1     Max.   :100.00        
##  NA's   :83524       NA's   :182270   NA's   :185618   NA's   :267694        
##  hospital_beds_per_thousand life_expectancy human_development_index
##  Min.   : 0.100             Min.   :53.28   Min.   :0.390          
##  1st Qu.: 1.300             1st Qu.:69.50   1st Qu.:0.600          
##  Median : 2.500             Median :75.05   Median :0.740          
##  Mean   : 3.107             Mean   :73.70   Mean   :0.722          
##  3rd Qu.: 4.210             3rd Qu.:79.46   3rd Qu.:0.830          
##  Max.   :13.800             Max.   :86.75   Max.   :0.960          
##  NA's   :138746             NA's   :39136   NA's   :110308         
##    population        excess_mortality_cumulative_absolute
##  Min.   :4.700e+01   Min.   : -37726.1                   
##  1st Qu.:5.238e+05   1st Qu.:    176.5                   
##  Median :6.336e+06   Median :   6815.2                   
##  Mean   :1.520e+08   Mean   :  56047.7                   
##  3rd Qu.:3.297e+07   3rd Qu.:  39128.0                   
##  Max.   :7.975e+09   Max.   :1349776.4                   
##                      NA's   :416024                      
##  excess_mortality_cumulative excess_mortality
##  Min.   :-44.23              Min.   :-95.92  
##  1st Qu.:  2.06              1st Qu.: -1.50  
##  Median :  8.13              Median :  5.66  
##  Mean   :  9.77              Mean   : 10.93  
##  3rd Qu.: 15.16              3rd Qu.: 15.57  
##  Max.   : 78.08              Max.   :378.22  
##  NA's   :416024              NA's   :416024  
##  excess_mortality_cumulative_per_million
##  Min.   :-2936.4                        
##  1st Qu.:  116.9                        
##  Median : 1270.8                        
##  Mean   : 1772.7                        
##  3rd Qu.: 2883.0                        
##  Max.   :10293.5                        
##  NA's   :416024

Question 5: How many missing values are present in each column of the dataset?

# Count the NA entries in each column; the result is a named numeric vector
# with one element per column.
vapply(
  covid_data,
  function(column) sum(is.na(column)),
  FUN.VALUE = numeric(1)
)
##                                   iso_code 
##                                          0 
##                                  continent 
##                                          0 
##                                   location 
##                                          0 
##                                       date 
##                                          0 
##                                total_cases 
##                                      17631 
##                                  new_cases 
##                                      19276 
##                         new_cases_smoothed 
##                                      20506 
##                               total_deaths 
##                                      17631 
##                                 new_deaths 
##                                      18827 
##                        new_deaths_smoothed 
##                                      20057 
##                    total_cases_per_million 
##                                      17631 
##                      new_cases_per_million 
##                                      19276 
##             new_cases_smoothed_per_million 
##                                      20506 
##                   total_deaths_per_million 
##                                      17631 
##                     new_deaths_per_million 
##                                      18827 
##            new_deaths_smoothed_per_million 
##                                      20057 
##                          reproduction_rate 
##                                     244618 
##                               icu_patients 
##                                     390319 
##                   icu_patients_per_million 
##                                     390319 
##                              hosp_patients 
##                                     388779 
##                  hosp_patients_per_million 
##                                     388779 
##                      weekly_icu_admissions 
##                                     418442 
##          weekly_icu_admissions_per_million 
##                                     418442 
##                     weekly_hosp_admissions 
##                                     404938 
##         weekly_hosp_admissions_per_million 
##                                     404938 
##                                total_tests 
##                                     350048 
##                                  new_tests 
##                                     354032 
##                   total_tests_per_thousand 
##                                     350048 
##                     new_tests_per_thousand 
##                                     354032 
##                         new_tests_smoothed 
##                                     325470 
##            new_tests_smoothed_per_thousand 
##                                     325470 
##                              positive_rate 
##                                     333508 
##                             tests_per_case 
##                                     335087 
##                                tests_units 
##                                          0 
##                         total_vaccinations 
##                                     344018 
##                          people_vaccinated 
##                                     348303 
##                    people_fully_vaccinated 
##                                     351374 
##                             total_boosters 
##                                     375835 
##                           new_vaccinations 
##                                     358464 
##                  new_vaccinations_smoothed 
##                                     234406 
##             total_vaccinations_per_hundred 
##                                     344018 
##              people_vaccinated_per_hundred 
##                                     348303 
##        people_fully_vaccinated_per_hundred 
##                                     351374 
##                 total_boosters_per_hundred 
##                                     375835 
##      new_vaccinations_smoothed_per_million 
##                                     234406 
##             new_people_vaccinated_smoothed 
##                                     237258 
## new_people_vaccinated_smoothed_per_hundred 
##                                     237258 
##                           stringency_index 
##                                     233245 
##                         population_density 
##                                      68943 
##                                 median_age 
##                                      94772 
##                              aged_65_older 
##                                     106165 
##                              aged_70_older 
##                                      98120 
##                             gdp_per_capita 
##                                     101143 
##                            extreme_poverty 
##                                     217439 
##                      cardiovasc_death_rate 
##                                     100570 
##                        diabetes_prevalence 
##                                      83524 
##                             female_smokers 
##                                     182270 
##                               male_smokers 
##                                     185618 
##                     handwashing_facilities 
##                                     267694 
##                 hospital_beds_per_thousand 
##                                     138746 
##                            life_expectancy 
##                                      39136 
##                    human_development_index 
##                                     110308 
##                                 population 
##                                          0 
##       excess_mortality_cumulative_absolute 
##                                     416024 
##                excess_mortality_cumulative 
##                                     416024 
##                           excess_mortality 
##                                     416024 
##    excess_mortality_cumulative_per_million 
##                                     416024

Level 2: Data Cleaning

Question 6: Select important columns for analysis

# Keep only the location/continent/date identifiers and the four headline
# metrics used in the rest of the analysis.
covid_clean <- select(
  covid_data,
  location, continent, date,
  total_cases, new_cases, total_deaths, total_vaccinations
)

# Preview the first six rows of the reduced data frame.
head(covid_clean, n = 6L)
##      location continent       date total_cases new_cases total_deaths
## 1 Afghanistan      Asia 2020-01-05           0         0            0
## 2 Afghanistan      Asia 2020-01-06           0         0            0
## 3 Afghanistan      Asia 2020-01-07           0         0            0
## 4 Afghanistan      Asia 2020-01-08           0         0            0
## 5 Afghanistan      Asia 2020-01-09           0         0            0
## 6 Afghanistan      Asia 2020-01-10           0         0            0
##   total_vaccinations
## 1                 NA
## 2                 NA
## 3                 NA
## 4                 NA
## 5                 NA
## 6                 NA

Question 7: Convert the date column into proper date format

# Turn the "YYYY-MM-DD" strings into Date objects, then confirm the new type.
covid_clean[["date"]] <- as.Date(covid_clean[["date"]])
str(covid_clean[["date"]])
##  Date[1:429435], format: "2020-01-05" "2020-01-06" "2020-01-07" "2020-01-08" "2020-01-09" ...

Question 8: Remove missing values from the dataset

# Drop every row that has an NA in any of the selected columns, then
# summarise the rows that remain.
covid_clean <- covid_clean %>%
  na.omit()
summary(object = covid_clean)
##    location          continent              date             total_cases       
##  Length:73532       Length:73532       Min.   :2020-12-02   Min.   :        0  
##  Class :character   Class :character   1st Qu.:2021-07-29   1st Qu.:   175560  
##  Mode  :character   Mode  :character   Median :2022-01-22   Median :  1083189  
##                                        Mean   :2022-03-22   Mean   : 23038750  
##                                        3rd Qu.:2022-09-23   3rd Qu.:  5167020  
##                                        Max.   :2024-08-04   Max.   :775866783  
##    new_cases         total_deaths     total_vaccinations 
##  Min.   :       0   Min.   :      0   Min.   :0.000e+00  
##  1st Qu.:       0   1st Qu.:   2063   1st Qu.:1.608e+06  
##  Median :       0   Median :  12571   Median :1.191e+07  
##  Mean   :   27375   Mean   : 258739   Mean   :4.490e+08  
##  3rd Qu.:       0   3rd Qu.:  91869   3rd Qu.:8.627e+07  
##  Max.   :44236227   Max.   :7057132   Max.   :1.358e+10

Question 9: Preview the cleaned dataset

# First six rows of the cleaned data (original row names are retained).
head(covid_clean, n = 6)
##        location continent       date total_cases new_cases total_deaths
## 415 Afghanistan      Asia 2021-02-22       55604         0         2432
## 421 Afghanistan      Asia 2021-02-28       55714       110         2443
## 437 Afghanistan      Asia 2021-03-16       55985         0         2457
## 459 Afghanistan      Asia 2021-04-07       56676         0         2497
## 474 Afghanistan      Asia 2021-04-22       57793         0         2539
## 493 Afghanistan      Asia 2021-05-11       61842         0         2686
##     total_vaccinations
## 415                  0
## 421               8200
## 437              54000
## 459             120000
## 474             240000
## 493             504502

Question 10: How many rows are left after data cleaning?

# Number of rows remaining after cleaning.
dim(covid_clean)[1]
## [1] 73532

Level 3: Trend Analysis

Question 11: Global trend of total cases

# Build the global series from the dataset's own "World" aggregate rows.
# Summing `total_cases` over every location would count each case several
# times, because the data holds World and continent-level rows alongside the
# individual countries. `total_cases` is an integer column, so the sum is
# taken on doubles to rule out integer overflow (which yields NA).
global_trend <- covid_clean %>%
  filter(location == "World") %>%
  group_by(date) %>%
  summarise(total_cases = sum(as.numeric(total_cases)), .groups = "drop")

ggplot(global_trend, aes(date, total_cases)) +
  geom_line() +
  labs(x = "Date", y = "Total cases (World)")

Question 12: Trend of new cases

# Draw one line per location. Without `group`, geom_line() joins the rows of
# all locations into a single path that zigzags between them on every date.
ggplot(covid_clean, aes(date, new_cases, group = location)) +
  geom_line(alpha = 0.4) +
  labs(x = "Date", y = "New cases")

Question 13: Trend of total deaths

# Draw one line per location. Without `group`, geom_line() joins the rows of
# all locations into a single path that zigzags between them on every date.
ggplot(covid_clean, aes(date, total_deaths, group = location)) +
  geom_line(alpha = 0.4) +
  labs(x = "Date", y = "Total deaths")

Question 14: Log scale trend

# Plot the `global_trend` series on a base-10 logarithmic y-axis.
ggplot(
  data = global_trend,
  mapping = aes(x = date, y = total_cases)
) +
  geom_line() +
  scale_y_log10()

Question 15: Highest new cases day

# Return the row(s) whose `new_cases` equals the largest value in the data.
filter(covid_clean, new_cases == max(new_cases))
##   location continent       date total_cases new_cases total_deaths
## 1    World           2022-12-25   704630839  44236227      6695683
##   total_vaccinations
## 1        13156309935

Level 4: Country Analysis

Question 16: Top 10 countries by cases

# Rank countries by their highest cumulative case count and keep the top 10.
# Aggregate rows such as "World" carry an empty `continent`; dropping them
# keeps the ranking to actual countries instead of regional totals.
top_countries <- covid_clean %>%
  filter(!is.na(continent), continent != "") %>%
  group_by(location) %>%
  summarise(max_cases = max(total_cases), .groups = "drop") %>%
  arrange(desc(max_cases)) %>%
  slice_head(n = 10)

top_countries
## # A tibble: 10 × 2
##    location      max_cases
##    <chr>             <int>
##  1 World         775866783
##  2 Asia          301499099
##  3 Europe        252916868
##  4 North America 124490833
##  5 United States 103343569
##  6 China          98637553
##  7 South America  68670586
##  8 India          45041748
##  9 France         38997490
## 10 Germany        38356872

Question 17: Visualization of top countries

# Horizontal bar chart of `top_countries`, bars ordered by case count.
ggplot(
  data = top_countries,
  mapping = aes(x = reorder(location, max_cases), y = max_cases)
) +
  geom_col() +
  coord_flip()

Question 18: Compare India, USA, Brazil

# Keep only the rows for the three countries being compared.
selected <- filter(
  covid_clean,
  location %in% c("India", "United States", "Brazil")
)

# Peek at the first six rows of the subset.
head(selected, n = 6)
##   location     continent       date total_cases new_cases total_deaths
## 1   Brazil South America 2021-01-17     8393492    379784       208246
## 2   Brazil South America 2021-01-18     8393492         0       208246
## 3   Brazil South America 2021-01-19     8393492         0       208246
## 4   Brazil South America 2021-01-20     8393492         0       208246
## 5   Brazil South America 2021-01-21     8393492         0       208246
## 6   Brazil South America 2021-01-22     8393492         0       208246
##   total_vaccinations
## 1                112
## 2               1109
## 3              13527
## 4             105824
## 5             193699
## 6             462269

Question 19: Vaccination trend

# Cumulative vaccinations over time, one coloured line per selected country.
ggplot(
  data = selected,
  mapping = aes(x = date, y = total_vaccinations, colour = location)
) +
  geom_line()

Question 20: Death rate

# Death rate = cumulative deaths / cumulative cases.
# Rows with zero cases get NA instead of the NaN/Inf that dividing by zero
# would produce, so later averages are not distorted.
covid_clean <- covid_clean %>%
  mutate(
    death_rate = if_else(
      total_cases > 0,
      total_deaths / total_cases,
      NA_real_
    )
  )

head(covid_clean)
##        location continent       date total_cases new_cases total_deaths
## 415 Afghanistan      Asia 2021-02-22       55604         0         2432
## 421 Afghanistan      Asia 2021-02-28       55714       110         2443
## 437 Afghanistan      Asia 2021-03-16       55985         0         2457
## 459 Afghanistan      Asia 2021-04-07       56676         0         2497
## 474 Afghanistan      Asia 2021-04-22       57793         0         2539
## 493 Afghanistan      Asia 2021-05-11       61842         0         2686
##     total_vaccinations death_rate
## 415                  0 0.04373786
## 421               8200 0.04384894
## 437              54000 0.04388676
## 459             120000 0.04405745
## 474             240000 0.04393266
## 493             504502 0.04343327

Level 5: Advanced Analysis

Question 21: Correlation

# Pull out the four numeric measures to correlate.
cor_data <- select(
  covid_clean,
  total_cases, new_cases, total_deaths, total_vaccinations
)

# Correlation matrix (cor() defaults) over the rows with no missing values.
cor_matrix <- cor_data %>%
  na.omit() %>%
  cor()
cor_matrix
##                    total_cases new_cases total_deaths total_vaccinations
## total_cases          1.0000000 0.1226220    0.9568953          0.9148502
## new_cases            0.1226220 1.0000000    0.1508531          0.1367483
## total_deaths         0.9568953 0.1508531    1.0000000          0.8673348
## total_vaccinations   0.9148502 0.1367483    0.8673348          1.0000000

Question 22: Heatmap

# Reshape the correlation matrix to long form: one row per variable pair.
cor_melt <- reshape2::melt(cor_matrix)

# Tile plot of the pairwise correlations, filled by correlation value.
ggplot(
  data = cor_melt,
  mapping = aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  theme_minimal()

Question 23: Scatter plot

# Scatter of cumulative deaths against cumulative cases; points are
# semi-transparent so dense regions remain readable.
ggplot(
  data = covid_clean,
  mapping = aes(x = total_cases, y = total_deaths)
) +
  geom_point(alpha = 0.3)

Question 24: Scatter by country

# Same deaths-versus-cases scatter, restricted to `selected` and coloured
# by country.
ggplot(
  data = selected,
  mapping = aes(x = total_cases, y = total_deaths, colour = location)
) +
  geom_point()

Question 25: Summary insights

# Five-number summary plus mean of cumulative cases.
summary(covid_clean[["total_cases"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0    175560   1083189  23038750   5167020 775866783
# Five-number summary plus mean of cumulative deaths.
summary(covid_clean[["total_deaths"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    2063   12571  258739   91869 7057132

Question 26: Avg new cases

# Mean of `new_cases` per location, sorted from highest to lowest; only the
# ten largest averages are kept.
avg_cases <- covid_clean %>%
  group_by(location) %>%
  summarise(avg_new_cases = mean(new_cases, na.rm = TRUE)) %>%
  arrange(desc(avg_new_cases)) %>%
  slice_head(n = 10)

# Print the resulting table.
avg_cases
## # A tibble: 10 × 2
##    location      avg_new_cases
##    <chr>                 <dbl>
##  1 World               531202.
##  2 Asia                212699.
##  3 Europe              175843.
##  4 China               139987.
##  5 United States       101139.
##  6 North America        81374.
##  7 South America        49792.
##  8 Germany              44273.
##  9 South Korea          42265.
## 10 Japan                41958.

Question 27: Monthly trend

# Monthly global new cases, taken from the dataset's "World" aggregate rows.
# Summing `new_cases` over every location would count each case several
# times, because World and continent-level rows sit alongside the countries.
monthly_data <- covid_clean %>%
  filter(location == "World") %>%
  mutate(month = format(date, "%Y-%m")) %>%
  group_by(month) %>%
  summarise(
    monthly_cases = sum(as.numeric(new_cases), na.rm = TRUE),
    .groups = "drop"
  )

# `month` is a "YYYY-MM" string, so group = 1 is needed to join the points
# into one line; labels are rotated to keep them legible.
ggplot(monthly_data, aes(month, monthly_cases, group = 1)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Question 28: Continent analysis

# Total cases per continent.
# `total_cases` is cumulative, so adding it up across dates is meaningless
# (and overflows integer range). Take each location's maximum first, then
# sum those per continent as doubles. Aggregate rows such as "World" have an
# empty `continent` and are excluded.
continent_data <- covid_clean %>%
  filter(!is.na(continent), continent != "") %>%
  group_by(continent, location) %>%
  summarise(latest_cases = max(total_cases), .groups = "drop") %>%
  group_by(continent) %>%
  summarise(total_cases = sum(as.numeric(latest_cases)), .groups = "drop")

ggplot(continent_data, aes(continent, total_cases)) +
  geom_col()

Question 29: Death rate by continent

# Average death rate per continent.
# Aggregate rows such as "World" have an empty `continent` and would show up
# as an unlabeled bar, so they are removed. Non-finite rates (from rows with
# zero cases) are dropped so they cannot turn a continent's mean into Inf.
continent_death <- covid_clean %>%
  filter(!is.na(continent), continent != "", is.finite(death_rate)) %>%
  group_by(continent) %>%
  summarise(avg_death_rate = mean(death_rate, na.rm = TRUE), .groups = "drop")

ggplot(continent_death, aes(continent, avg_death_rate)) +
  geom_col()

Question 30: Conclusion

The analysis of global COVID-19 data reveals trends, relationships, and differences across countries and continents. It demonstrates how data analysis can provide meaningful real-world insights.