This project performs advanced exploratory data analysis of global COVID-19 data using R.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## Warning: package 'ggplot2' was built under R version 4.5.3
## Warning: package 'tibble' was built under R version 4.5.3
## Warning: package 'tidyr' was built under R version 4.5.3
## Warning: package 'readr' was built under R version 4.5.3
## Warning: package 'purrr' was built under R version 4.5.3
## Warning: package 'dplyr' was built under R version 4.5.3
## Warning: package 'forcats' was built under R version 4.5.3
## Warning: package 'lubridate' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.3
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
# Download the OWID COVID-19 CSV from GitHub into a base data frame, then
# inspect its structure (column names, types and the first few values).
owid_csv_url <- "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
covid_data <- read.csv(file = owid_csv_url)
str(object = covid_data)
## 'data.frame': 429435 obs. of 67 variables:
## $ iso_code : chr "AFG" "AFG" "AFG" "AFG" ...
## $ continent : chr "Asia" "Asia" "Asia" "Asia" ...
## $ location : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ date : chr "2020-01-05" "2020-01-06" "2020-01-07" "2020-01-08" ...
## $ total_cases : int 0 0 0 0 0 0 0 0 0 0 ...
## $ new_cases : int 0 0 0 0 0 0 0 0 0 0 ...
## $ new_cases_smoothed : num NA NA NA NA NA 0 0 0 0 0 ...
## $ total_deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ new_deaths : int 0 0 0 0 0 0 0 0 0 0 ...
## $ new_deaths_smoothed : num NA NA NA NA NA 0 0 0 0 0 ...
## $ total_cases_per_million : num 0 0 0 0 0 0 0 0 0 0 ...
## $ new_cases_per_million : num 0 0 0 0 0 0 0 0 0 0 ...
## $ new_cases_smoothed_per_million : num NA NA NA NA NA 0 0 0 0 0 ...
## $ total_deaths_per_million : num 0 0 0 0 0 0 0 0 0 0 ...
## $ new_deaths_per_million : num 0 0 0 0 0 0 0 0 0 0 ...
## $ new_deaths_smoothed_per_million : num NA NA NA NA NA 0 0 0 0 0 ...
## $ reproduction_rate : num NA NA NA NA NA NA NA NA NA NA ...
## $ icu_patients : int NA NA NA NA NA NA NA NA NA NA ...
## $ icu_patients_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ hosp_patients : int NA NA NA NA NA NA NA NA NA NA ...
## $ hosp_patients_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_icu_admissions : int NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_icu_admissions_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_hosp_admissions : int NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_hosp_admissions_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ total_tests : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests : int NA NA NA NA NA NA NA NA NA NA ...
## $ total_tests_per_thousand : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests_per_thousand : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests_smoothed : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests_smoothed_per_thousand : num NA NA NA NA NA NA NA NA NA NA ...
## $ positive_rate : num NA NA NA NA NA NA NA NA NA NA ...
## $ tests_per_case : num NA NA NA NA NA NA NA NA NA NA ...
## $ tests_units : chr "" "" "" "" ...
## $ total_vaccinations : num NA NA NA NA NA NA NA NA NA NA ...
## $ people_vaccinated : num NA NA NA NA NA NA NA NA NA NA ...
## $ people_fully_vaccinated : num NA NA NA NA NA NA NA NA NA NA ...
## $ total_boosters : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_vaccinations : int NA NA NA NA NA NA NA NA NA NA ...
## $ new_vaccinations_smoothed : num NA NA NA NA NA NA NA NA NA NA ...
## $ total_vaccinations_per_hundred : num NA NA NA NA NA NA NA NA NA NA ...
## $ people_vaccinated_per_hundred : num NA NA NA NA NA NA NA NA NA NA ...
## $ people_fully_vaccinated_per_hundred : num NA NA NA NA NA NA NA NA NA NA ...
## $ total_boosters_per_hundred : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_vaccinations_smoothed_per_million : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_people_vaccinated_smoothed : num NA NA NA NA NA NA NA NA NA NA ...
## $ new_people_vaccinated_smoothed_per_hundred: num NA NA NA NA NA NA NA NA NA NA ...
## $ stringency_index : num 0 0 0 0 0 0 0 0 0 0 ...
## $ population_density : num 54.4 54.4 54.4 54.4 54.4 ...
## $ median_age : num 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 ...
## $ aged_65_older : num 2.58 2.58 2.58 2.58 2.58 2.58 2.58 2.58 2.58 2.58 ...
## $ aged_70_older : num 1.34 1.34 1.34 1.34 1.34 1.34 1.34 1.34 1.34 1.34 ...
## $ gdp_per_capita : num 1804 1804 1804 1804 1804 ...
## $ extreme_poverty : num NA NA NA NA NA NA NA NA NA NA ...
## $ cardiovasc_death_rate : num 597 597 597 597 597 ...
## $ diabetes_prevalence : num 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 ...
## $ female_smokers : num NA NA NA NA NA NA NA NA NA NA ...
## $ male_smokers : num NA NA NA NA NA NA NA NA NA NA ...
## $ handwashing_facilities : num 37.8 37.8 37.8 37.8 37.8 ...
## $ hospital_beds_per_thousand : num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ life_expectancy : num 64.8 64.8 64.8 64.8 64.8 ...
## $ human_development_index : num 0.51 0.51 0.51 0.51 0.51 0.51 0.51 0.51 0.51 0.51 ...
## $ population : num 41128772 41128772 41128772 41128772 41128772 ...
## $ excess_mortality_cumulative_absolute : num NA NA NA NA NA NA NA NA NA NA ...
## $ excess_mortality_cumulative : num NA NA NA NA NA NA NA NA NA NA ...
## $ excess_mortality : num NA NA NA NA NA NA NA NA NA NA ...
## $ excess_mortality_cumulative_per_million : num NA NA NA NA NA NA NA NA NA NA ...
# Number of rows and columns in the raw data frame.
dim(covid_data)
## [1] 429435 67
# List every column name available in the raw data frame.
colnames(covid_data)
## [1] "iso_code"
## [2] "continent"
## [3] "location"
## [4] "date"
## [5] "total_cases"
## [6] "new_cases"
## [7] "new_cases_smoothed"
## [8] "total_deaths"
## [9] "new_deaths"
## [10] "new_deaths_smoothed"
## [11] "total_cases_per_million"
## [12] "new_cases_per_million"
## [13] "new_cases_smoothed_per_million"
## [14] "total_deaths_per_million"
## [15] "new_deaths_per_million"
## [16] "new_deaths_smoothed_per_million"
## [17] "reproduction_rate"
## [18] "icu_patients"
## [19] "icu_patients_per_million"
## [20] "hosp_patients"
## [21] "hosp_patients_per_million"
## [22] "weekly_icu_admissions"
## [23] "weekly_icu_admissions_per_million"
## [24] "weekly_hosp_admissions"
## [25] "weekly_hosp_admissions_per_million"
## [26] "total_tests"
## [27] "new_tests"
## [28] "total_tests_per_thousand"
## [29] "new_tests_per_thousand"
## [30] "new_tests_smoothed"
## [31] "new_tests_smoothed_per_thousand"
## [32] "positive_rate"
## [33] "tests_per_case"
## [34] "tests_units"
## [35] "total_vaccinations"
## [36] "people_vaccinated"
## [37] "people_fully_vaccinated"
## [38] "total_boosters"
## [39] "new_vaccinations"
## [40] "new_vaccinations_smoothed"
## [41] "total_vaccinations_per_hundred"
## [42] "people_vaccinated_per_hundred"
## [43] "people_fully_vaccinated_per_hundred"
## [44] "total_boosters_per_hundred"
## [45] "new_vaccinations_smoothed_per_million"
## [46] "new_people_vaccinated_smoothed"
## [47] "new_people_vaccinated_smoothed_per_hundred"
## [48] "stringency_index"
## [49] "population_density"
## [50] "median_age"
## [51] "aged_65_older"
## [52] "aged_70_older"
## [53] "gdp_per_capita"
## [54] "extreme_poverty"
## [55] "cardiovasc_death_rate"
## [56] "diabetes_prevalence"
## [57] "female_smokers"
## [58] "male_smokers"
## [59] "handwashing_facilities"
## [60] "hospital_beds_per_thousand"
## [61] "life_expectancy"
## [62] "human_development_index"
## [63] "population"
## [64] "excess_mortality_cumulative_absolute"
## [65] "excess_mortality_cumulative"
## [66] "excess_mortality"
## [67] "excess_mortality_cumulative_per_million"
# Per-column summary of the raw data: quartiles, mean and NA count for numeric
# columns, length/class for character columns.
summary(covid_data)
## iso_code continent location date
## Length:429435 Length:429435 Length:429435 Length:429435
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## total_cases new_cases new_cases_smoothed total_deaths
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 0
## 1st Qu.: 6281 1st Qu.: 0 1st Qu.: 0.0 1st Qu.: 43
## Median : 63653 Median : 0 Median : 12.0 Median : 799
## Mean : 7365292 Mean : 8017 Mean : 8041.0 Mean : 81260
## 3rd Qu.: 758272 3rd Qu.: 0 3rd Qu.: 313.3 3rd Qu.: 9574
## Max. :775866783 Max. :44236227 Max. :6319461.0 Max. :7057132
## NA's :17631 NA's :19276 NA's :20506 NA's :17631
## new_deaths new_deaths_smoothed total_cases_per_million
## Min. : 0.00 Min. : 0.00 Min. : 0
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 1916
## Median : 0.00 Median : 0.00 Median : 29145
## Mean : 71.85 Mean : 72.06 Mean :112096
## 3rd Qu.: 0.00 3rd Qu.: 3.14 3rd Qu.:156770
## Max. :103719.00 Max. :14817.00 Max. :763599
## NA's :18827 NA's :20057 NA's :17631
## new_cases_per_million new_cases_smoothed_per_million total_deaths_per_million
## Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 24.57
## Median : 0.0 Median : 2.79 Median : 295.09
## Mean : 122.4 Mean : 122.71 Mean : 835.51
## 3rd Qu.: 0.0 3rd Qu.: 56.25 3rd Qu.:1283.82
## Max. :241758.2 Max. :34536.89 Max. :6601.11
## NA's :19276 NA's :20506 NA's :17631
## new_deaths_per_million new_deaths_smoothed_per_million reproduction_rate
## Min. : 0.0000 Min. : 0.0000 Min. :-0.070
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.720
## Median : 0.0000 Median : 0.0000 Median : 0.950
## Mean : 0.7623 Mean : 0.7645 Mean : 0.911
## 3rd Qu.: 0.0000 3rd Qu.: 0.3600 3rd Qu.: 1.140
## Max. :893.6600 Max. :127.6600 Max. : 5.870
## NA's :18827 NA's :20057 NA's :244618
## icu_patients icu_patients_per_million hosp_patients
## Min. : 0 Min. : 0.00 Min. : 0
## 1st Qu.: 21 1st Qu.: 2.33 1st Qu.: 186
## Median : 90 Median : 6.43 Median : 776
## Mean : 661 Mean : 15.66 Mean : 3912
## 3rd Qu.: 413 3rd Qu.: 18.78 3rd Qu.: 3051
## Max. :28891 Max. :180.68 Max. :154497
## NA's :390319 NA's :390319 NA's :388779
## hosp_patients_per_million weekly_icu_admissions
## Min. : 0.00 Min. : 0.0
## 1st Qu.: 31.00 1st Qu.: 17.0
## Median : 74.24 Median : 92.0
## Mean : 125.99 Mean : 317.9
## 3rd Qu.: 159.76 3rd Qu.: 353.0
## Max. :1526.85 Max. :4838.0
## NA's :388779 NA's :418442
## weekly_icu_admissions_per_million weekly_hosp_admissions
## Min. : 0.00 Min. : 0
## 1st Qu.: 1.55 1st Qu.: 223
## Median : 4.64 Median : 864
## Mean : 9.67 Mean : 4292
## 3rd Qu.: 12.65 3rd Qu.: 3893
## Max. :224.98 Max. :153977
## NA's :418442 NA's :404938
## weekly_hosp_admissions_per_million total_tests new_tests
## Min. : 0.00 Min. :0.000e+00 Min. : 1
## 1st Qu.: 23.73 1st Qu.:3.647e+05 1st Qu.: 2244
## Median : 56.28 Median :2.067e+06 Median : 8783
## Mean : 82.62 Mean :2.110e+07 Mean : 67285
## 3rd Qu.:110.00 3rd Qu.:1.025e+07 3rd Qu.: 37229
## Max. :717.08 Max. :9.214e+09 Max. :35855632
## NA's :404938 NA's :350048 NA's :354032
## total_tests_per_thousand new_tests_per_thousand new_tests_smoothed
## Min. : 0.00 Min. : 0.00 Min. : 0
## 1st Qu.: 43.58 1st Qu.: 0.29 1st Qu.: 1486
## Median : 234.14 Median : 0.97 Median : 6570
## Mean : 924.25 Mean : 3.27 Mean : 142178
## 3rd Qu.: 894.38 3rd Qu.: 2.91 3rd Qu.: 32205
## Max. :32925.83 Max. :531.06 Max. :14769984
## NA's :350048 NA's :354032 NA's :325470
## new_tests_smoothed_per_thousand positive_rate tests_per_case
## Min. : 0.00 Min. :0.00 Min. : 1.0
## 1st Qu.: 0.20 1st Qu.:0.02 1st Qu.: 7.1
## Median : 0.85 Median :0.06 Median : 17.5
## Mean : 2.83 Mean :0.10 Mean : 2403.6
## 3rd Qu.: 2.58 3rd Qu.:0.14 3rd Qu.: 54.6
## Max. :147.60 Max. :1.00 Max. :1023631.9
## NA's :325470 NA's :333508 NA's :335087
## tests_units total_vaccinations people_vaccinated
## Length:429435 Min. :0.000e+00 Min. :0.000e+00
## Class :character 1st Qu.:1.971e+06 1st Qu.:1.050e+06
## Mode :character Median :1.439e+07 Median :6.901e+06
## Mean :5.617e+08 Mean :2.487e+08
## 3rd Qu.:1.162e+08 3rd Qu.:5.093e+07
## Max. :1.358e+10 Max. :5.631e+09
## NA's :344018 NA's :348303
## people_fully_vaccinated total_boosters new_vaccinations
## Min. :1.000e+00 Min. :1.000e+00 Min. : 0
## 1st Qu.:9.644e+05 1st Qu.:6.023e+05 1st Qu.: 2010
## Median :6.191e+06 Median :5.765e+06 Median : 20531
## Mean :2.287e+08 Mean :1.506e+08 Mean : 739864
## 3rd Qu.:4.773e+07 3rd Qu.:4.019e+07 3rd Qu.: 173612
## Max. :5.178e+09 Max. :2.817e+09 Max. :49673198
## NA's :351374 NA's :375835 NA's :358464
## new_vaccinations_smoothed total_vaccinations_per_hundred
## Min. : 0 Min. : 0.00
## 1st Qu.: 279 1st Qu.: 44.77
## Median : 3871 Median :130.55
## Mean : 283876 Mean :124.28
## 3rd Qu.: 31803 3rd Qu.:194.99
## Max. :43691814 Max. :410.23
## NA's :234406 NA's :344018
## people_vaccinated_per_hundred people_fully_vaccinated_per_hundred
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 27.88 1st Qu.: 21.22
## Median : 64.30 Median : 57.92
## Mean : 53.50 Mean : 48.68
## 3rd Qu.: 77.78 3rd Qu.: 73.61
## Max. :129.07 Max. :126.89
## NA's :348303 NA's :351374
## total_boosters_per_hundred new_vaccinations_smoothed_per_million
## Min. : 0.00 Min. : 0
## 1st Qu.: 5.92 1st Qu.: 106
## Median : 35.91 Median : 605
## Mean : 36.30 Mean : 1851
## 3rd Qu.: 57.62 3rd Qu.: 2402
## Max. :150.47 Max. :117113
## NA's :375835 NA's :234406
## new_people_vaccinated_smoothed new_people_vaccinated_smoothed_per_hundred
## Min. : 0 Min. : 0.000
## 1st Qu.: 43 1st Qu.: 0.000
## Median : 771 Median : 0.010
## Mean : 106071 Mean : 0.075
## 3rd Qu.: 9307 3rd Qu.: 0.070
## Max. :21071266 Max. :11.710
## NA's :237258 NA's :237258
## stringency_index population_density median_age aged_65_older
## Min. : 0.00 Min. : 0.14 Min. :15.10 Min. : 1.140
## 1st Qu.: 22.22 1st Qu.: 37.73 1st Qu.:22.20 1st Qu.: 3.530
## Median : 42.85 Median : 88.12 Median :29.70 Median : 6.290
## Mean : 42.88 Mean : 394.07 Mean :30.46 Mean : 8.684
## 3rd Qu.: 62.04 3rd Qu.: 222.87 3rd Qu.:38.70 3rd Qu.:13.930
## Max. :100.00 Max. :20546.77 Max. :48.20 Max. :27.050
## NA's :233245 NA's :68943 NA's :94772 NA's :106165
## aged_70_older gdp_per_capita extreme_poverty cardiovasc_death_rate
## Min. : 0.530 Min. : 661.2 Min. : 0.10 Min. : 79.37
## 1st Qu.: 2.060 1st Qu.: 4227.6 1st Qu.: 0.60 1st Qu.:175.70
## Median : 3.870 Median : 12294.9 Median : 2.50 Median :245.46
## Mean : 5.486 Mean : 18904.2 Mean :13.93 Mean :264.64
## 3rd Qu.: 8.640 3rd Qu.: 27216.4 3rd Qu.:21.40 3rd Qu.:333.44
## Max. :18.490 Max. :116935.6 Max. :77.60 Max. :724.42
## NA's :98120 NA's :101143 NA's :217439 NA's :100570
## diabetes_prevalence female_smokers male_smokers handwashing_facilities
## Min. : 0.990 Min. : 0.10 Min. : 7.7 Min. : 1.19
## 1st Qu.: 5.350 1st Qu.: 1.90 1st Qu.:22.6 1st Qu.: 20.86
## Median : 7.200 Median : 6.30 Median :33.1 Median : 49.54
## Mean : 8.556 Mean :10.77 Mean :33.1 Mean : 50.65
## 3rd Qu.:10.790 3rd Qu.:19.30 3rd Qu.:41.5 3rd Qu.: 82.50
## Max. :30.530 Max. :44.00 Max. :78.1 Max. :100.00
## NA's :83524 NA's :182270 NA's :185618 NA's :267694
## hospital_beds_per_thousand life_expectancy human_development_index
## Min. : 0.100 Min. :53.28 Min. :0.390
## 1st Qu.: 1.300 1st Qu.:69.50 1st Qu.:0.600
## Median : 2.500 Median :75.05 Median :0.740
## Mean : 3.107 Mean :73.70 Mean :0.722
## 3rd Qu.: 4.210 3rd Qu.:79.46 3rd Qu.:0.830
## Max. :13.800 Max. :86.75 Max. :0.960
## NA's :138746 NA's :39136 NA's :110308
## population excess_mortality_cumulative_absolute
## Min. :4.700e+01 Min. : -37726.1
## 1st Qu.:5.238e+05 1st Qu.: 176.5
## Median :6.336e+06 Median : 6815.2
## Mean :1.520e+08 Mean : 56047.7
## 3rd Qu.:3.297e+07 3rd Qu.: 39128.0
## Max. :7.975e+09 Max. :1349776.4
## NA's :416024
## excess_mortality_cumulative excess_mortality
## Min. :-44.23 Min. :-95.92
## 1st Qu.: 2.06 1st Qu.: -1.50
## Median : 8.13 Median : 5.66
## Mean : 9.77 Mean : 10.93
## 3rd Qu.: 15.16 3rd Qu.: 15.57
## Max. : 78.08 Max. :378.22
## NA's :416024 NA's :416024
## excess_mortality_cumulative_per_million
## Min. :-2936.4
## 1st Qu.: 116.9
## Median : 1270.8
## Mean : 1772.7
## 3rd Qu.: 2883.0
## Max. :10293.5
## NA's :416024
# Count the missing (NA) values in each column of the raw data.
colSums(is.na(covid_data))
## iso_code
## 0
## continent
## 0
## location
## 0
## date
## 0
## total_cases
## 17631
## new_cases
## 19276
## new_cases_smoothed
## 20506
## total_deaths
## 17631
## new_deaths
## 18827
## new_deaths_smoothed
## 20057
## total_cases_per_million
## 17631
## new_cases_per_million
## 19276
## new_cases_smoothed_per_million
## 20506
## total_deaths_per_million
## 17631
## new_deaths_per_million
## 18827
## new_deaths_smoothed_per_million
## 20057
## reproduction_rate
## 244618
## icu_patients
## 390319
## icu_patients_per_million
## 390319
## hosp_patients
## 388779
## hosp_patients_per_million
## 388779
## weekly_icu_admissions
## 418442
## weekly_icu_admissions_per_million
## 418442
## weekly_hosp_admissions
## 404938
## weekly_hosp_admissions_per_million
## 404938
## total_tests
## 350048
## new_tests
## 354032
## total_tests_per_thousand
## 350048
## new_tests_per_thousand
## 354032
## new_tests_smoothed
## 325470
## new_tests_smoothed_per_thousand
## 325470
## positive_rate
## 333508
## tests_per_case
## 335087
## tests_units
## 0
## total_vaccinations
## 344018
## people_vaccinated
## 348303
## people_fully_vaccinated
## 351374
## total_boosters
## 375835
## new_vaccinations
## 358464
## new_vaccinations_smoothed
## 234406
## total_vaccinations_per_hundred
## 344018
## people_vaccinated_per_hundred
## 348303
## people_fully_vaccinated_per_hundred
## 351374
## total_boosters_per_hundred
## 375835
## new_vaccinations_smoothed_per_million
## 234406
## new_people_vaccinated_smoothed
## 237258
## new_people_vaccinated_smoothed_per_hundred
## 237258
## stringency_index
## 233245
## population_density
## 68943
## median_age
## 94772
## aged_65_older
## 106165
## aged_70_older
## 98120
## gdp_per_capita
## 101143
## extreme_poverty
## 217439
## cardiovasc_death_rate
## 100570
## diabetes_prevalence
## 83524
## female_smokers
## 182270
## male_smokers
## 185618
## handwashing_facilities
## 267694
## hospital_beds_per_thousand
## 138746
## life_expectancy
## 39136
## human_development_index
## 110308
## population
## 0
## excess_mortality_cumulative_absolute
## 416024
## excess_mortality_cumulative
## 416024
## excess_mortality
## 416024
## excess_mortality_cumulative_per_million
## 416024
# Narrow the raw data to the identifier columns plus the case, death and
# vaccination measures used in the rest of the analysis, then preview it.
covid_clean <- dplyr::select(
  covid_data,
  location, continent, date,
  total_cases, new_cases, total_deaths, total_vaccinations
)
head(covid_clean)
## location continent date total_cases new_cases total_deaths
## 1 Afghanistan Asia 2020-01-05 0 0 0
## 2 Afghanistan Asia 2020-01-06 0 0 0
## 3 Afghanistan Asia 2020-01-07 0 0 0
## 4 Afghanistan Asia 2020-01-08 0 0 0
## 5 Afghanistan Asia 2020-01-09 0 0 0
## 6 Afghanistan Asia 2020-01-10 0 0 0
## total_vaccinations
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
# Convert the character date column to Date so it can be used on time axes,
# then confirm the resulting type.
covid_clean <- covid_clean %>%
  mutate(date = as.Date(date))
str(covid_clean$date)
## Date[1:429435], format: "2020-01-05" "2020-01-06" "2020-01-07" "2020-01-08" "2020-01-09" ...
# Keep only rows that have no NA in any of the selected columns, then
# summarise what is left.
# NOTE(review): total_vaccinations is NA in most rows, so this step shrinks the
# data from 429435 to 73532 rows and nothing before 2020-12-02 survives (see
# the summary output below). Every later step therefore works on this reduced
# subset. Consider dropping NAs only in the columns each analysis needs.
covid_clean <- na.omit(covid_clean)
summary(covid_clean)
## location continent date total_cases
## Length:73532 Length:73532 Min. :2020-12-02 Min. : 0
## Class :character Class :character 1st Qu.:2021-07-29 1st Qu.: 175560
## Mode :character Mode :character Median :2022-01-22 Median : 1083189
## Mean :2022-03-22 Mean : 23038750
## 3rd Qu.:2022-09-23 3rd Qu.: 5167020
## Max. :2024-08-04 Max. :775866783
## new_cases total_deaths total_vaccinations
## Min. : 0 Min. : 0 Min. :0.000e+00
## 1st Qu.: 0 1st Qu.: 2063 1st Qu.:1.608e+06
## Median : 0 Median : 12571 Median :1.191e+07
## Mean : 27375 Mean : 258739 Mean :4.490e+08
## 3rd Qu.: 0 3rd Qu.: 91869 3rd Qu.:8.627e+07
## Max. :44236227 Max. :7057132 Max. :1.358e+10
# Preview the first rows that remain after removing incomplete records.
head(covid_clean)
## location continent date total_cases new_cases total_deaths
## 415 Afghanistan Asia 2021-02-22 55604 0 2432
## 421 Afghanistan Asia 2021-02-28 55714 110 2443
## 437 Afghanistan Asia 2021-03-16 55985 0 2457
## 459 Afghanistan Asia 2021-04-07 56676 0 2497
## 474 Afghanistan Asia 2021-04-22 57793 0 2539
## 493 Afghanistan Asia 2021-05-11 61842 0 2686
## total_vaccinations
## 415 0
## 421 8200
## 437 54000
## 459 120000
## 474 240000
## 493 504502
# Number of rows left after removing incomplete records.
nrow(covid_clean)
## [1] 73532
# Global cumulative case count over time.
#
# The dataset already carries a "World" aggregate row per date, alongside
# other aggregate rows (e.g. continents) and the individual countries.
# Summing total_cases over every location on a date therefore counts the same
# cases several times, and because total_cases is an integer column the sum
# can exceed the integer range and turn into NA. Using the World row directly
# avoids both problems and keeps the same two columns (date, total_cases).
global_trend <- covid_clean %>%
  filter(location == "World") %>%
  select(date, total_cases) %>%
  arrange(date)

ggplot(global_trend, aes(date, total_cases)) +
  geom_line()
# Daily new cases over time, one line per country.
#
# Without a group aesthetic geom_line() joins every row into a single path
# ordered by date, so the line jumps between unrelated locations on each day.
# Grouping by location draws one series per location. Rows with an empty
# continent are aggregate entries (e.g. "World"), not countries, and are
# dropped so they do not dominate the y-axis.
covid_clean %>%
  filter(continent != "") %>%
  ggplot(aes(date, new_cases, group = location)) +
  geom_line(alpha = 0.4)
# Cumulative deaths over time, one line per country.
#
# Without a group aesthetic geom_line() joins every row into a single path
# ordered by date, so the line jumps between unrelated locations on each day.
# Grouping by location draws one series per location. Rows with an empty
# continent are aggregate entries (e.g. "World"), not countries, and are
# dropped so they do not dominate the y-axis.
covid_clean %>%
  filter(continent != "") %>%
  ggplot(aes(date, total_deaths, group = location)) +
  geom_line(alpha = 0.4)
# Same global trend as before, drawn on a log10 y-axis so that early growth
# remains visible next to the much larger later totals.
ggplot(
  data = global_trend,
  mapping = aes(x = date, y = total_cases)
) +
  geom_line() +
  scale_y_log10()
# Show the row(s) holding the single largest new_cases value in the data.
peak_new_cases <- max(covid_clean$new_cases)
covid_clean %>%
  filter(new_cases == peak_new_cases)
## location continent date total_cases new_cases total_deaths
## 1 World 2022-12-25 704630839 44236227 6695683
## total_vaccinations
## 1 13156309935
# Ten countries with the highest cumulative case count.
#
# Rows with an empty continent are aggregate entries (World, Asia, Europe,
# ...), not countries. They are removed before ranking so that the result
# really is a list of countries.
# NOTE: the printed table below was generated before this filter was added;
# re-run the script to refresh it.
top_countries <- covid_clean %>%
  filter(continent != "") %>%
  group_by(location) %>%
  summarise(max_cases = max(total_cases)) %>%
  arrange(desc(max_cases)) %>%
  head(10)
top_countries
## # A tibble: 10 × 2
## location max_cases
## <chr> <int>
## 1 World 775866783
## 2 Asia 301499099
## 3 Europe 252916868
## 4 North America 124490833
## 5 United States 103343569
## 6 China 98637553
## 7 South America 68670586
## 8 India 45041748
## 9 France 38997490
## 10 Germany 38356872
# Horizontal bar chart of the ranking, with bars ordered by case count.
ggplot(
  data = top_countries,
  mapping = aes(x = reorder(location, max_cases), y = max_cases)
) +
  geom_col() +
  coord_flip()
# Subset of the cleaned data for the three locations compared below.
focus_locations <- c("India", "United States", "Brazil")
selected <- filter(covid_clean, location %in% focus_locations)
head(selected)
## location continent date total_cases new_cases total_deaths
## 1 Brazil South America 2021-01-17 8393492 379784 208246
## 2 Brazil South America 2021-01-18 8393492 0 208246
## 3 Brazil South America 2021-01-19 8393492 0 208246
## 4 Brazil South America 2021-01-20 8393492 0 208246
## 5 Brazil South America 2021-01-21 8393492 0 208246
## 6 Brazil South America 2021-01-22 8393492 0 208246
## total_vaccinations
## 1 112
## 2 1109
## 3 13527
## 4 105824
## 5 193699
## 6 462269
# Cumulative vaccinations over time, one coloured line per selected location.
ggplot(
  data = selected,
  mapping = aes(x = date, y = total_vaccinations, colour = location)
) +
  geom_line()
# Add the ratio of cumulative deaths to cumulative cases for every row.
#
# total_cases can be 0 (its minimum in the summary is 0), and dividing by it
# would give NaN or Inf. Those rows get NA instead, so later averages can
# drop them with na.rm = TRUE.
covid_clean <- covid_clean %>%
  mutate(
    death_rate = if_else(
      total_cases > 0,
      total_deaths / total_cases,
      NA_real_
    )
  )
head(covid_clean)
## location continent date total_cases new_cases total_deaths
## 415 Afghanistan Asia 2021-02-22 55604 0 2432
## 421 Afghanistan Asia 2021-02-28 55714 110 2443
## 437 Afghanistan Asia 2021-03-16 55985 0 2457
## 459 Afghanistan Asia 2021-04-07 56676 0 2497
## 474 Afghanistan Asia 2021-04-22 57793 0 2539
## 493 Afghanistan Asia 2021-05-11 61842 0 2686
## total_vaccinations death_rate
## 415 0 0.04373786
## 421 8200 0.04384894
## 437 54000 0.04388676
## 459 120000 0.04405745
## 474 240000 0.04393266
## 493 504502 0.04343327
# Pairwise Pearson correlations between the four numeric measures, computed
# on rows that have no missing value in any of them.
cor_data <- select(
  covid_clean,
  total_cases, new_cases, total_deaths, total_vaccinations
)
cor_matrix <- cor_data %>%
  na.omit() %>%
  cor()
cor_matrix
## total_cases new_cases total_deaths total_vaccinations
## total_cases 1.0000000 0.1226220 0.9568953 0.9148502
## new_cases 0.1226220 1.0000000 0.1508531 0.1367483
## total_deaths 0.9568953 0.1508531 1.0000000 0.8673348
## total_vaccinations 0.9148502 0.1367483 0.8673348 1.0000000
# Reshape the correlation matrix to long form (Var1, Var2, value) and draw it
# as a heatmap, one tile per variable pair.
cor_melt <- reshape2::melt(data = cor_matrix)
ggplot(
  data = cor_melt,
  mapping = aes(x = Var1, y = Var2, fill = value)
) +
  geom_tile() +
  theme_minimal()
# Scatter plot of cumulative deaths against cumulative cases for every row;
# points are semi-transparent so dense regions remain readable.
ggplot(
  data = covid_clean,
  mapping = aes(x = total_cases, y = total_deaths)
) +
  geom_point(alpha = 0.3)
# Same cases-versus-deaths scatter, limited to the selected locations and
# coloured by location.
ggplot(
  data = selected,
  mapping = aes(x = total_cases, y = total_deaths, colour = location)
) +
  geom_point()
# Five-number summary and mean of cumulative cases in the cleaned data.
summary(covid_clean$total_cases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 175560 1083189 23038750 5167020 775866783
# Five-number summary and mean of cumulative deaths in the cleaned data.
summary(covid_clean$total_deaths)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 2063 12571 258739 91869 7057132
# Ten countries with the highest mean of new_cases across their rows.
#
# Rows with an empty continent are aggregate entries (World, Asia, Europe,
# ...), not countries. They are removed before ranking so that the result
# really is a list of countries.
# NOTE: the printed table below was generated before this filter was added;
# re-run the script to refresh it.
avg_cases <- covid_clean %>%
  filter(continent != "") %>%
  group_by(location) %>%
  summarise(avg_new_cases = mean(new_cases, na.rm = TRUE)) %>%
  arrange(desc(avg_new_cases)) %>%
  head(10)
avg_cases
## # A tibble: 10 × 2
## location avg_new_cases
## <chr> <dbl>
## 1 World 531202.
## 2 Asia 212699.
## 3 Europe 175843.
## 4 China 139987.
## 5 United States 101139.
## 6 North America 81374.
## 7 South America 49792.
## 8 Germany 44273.
## 9 South Korea 42265.
## 10 Japan 41958.
# Global new cases per calendar month.
#
# The data holds a "World" aggregate row next to the country rows (and other
# aggregates such as continents), so summing new_cases over every location
# counts the same cases more than once. Only the World row is used here.
# new_cases is an integer column; converting to double before summing avoids
# integer overflow.
monthly_data <- covid_clean %>%
  filter(location == "World") %>%
  mutate(month = format(date, "%Y-%m")) %>%
  group_by(month) %>%
  summarise(monthly_cases = sum(as.numeric(new_cases), na.rm = TRUE))

# month is a "YYYY-MM" string, so group = 1 is needed to join the points into
# one line; labels are rotated to keep them legible.
ggplot(monthly_data, aes(month, monthly_cases, group = 1)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Cumulative cases per continent.
#
# total_cases is already cumulative, so adding it up over every date counts
# each case once per day it appears (and can overflow the integer range).
# Instead, take each location's highest cumulative value and add those up
# within the continent. Rows with an empty continent are aggregate entries
# (e.g. "World"), not countries, and are excluded.
continent_data <- covid_clean %>%
  filter(continent != "") %>%
  group_by(continent, location) %>%
  summarise(
    latest_cases = max(as.numeric(total_cases), na.rm = TRUE),
    .groups = "drop"
  ) %>%
  group_by(continent) %>%
  summarise(total_cases = sum(latest_cases))

ggplot(continent_data, aes(continent, total_cases)) +
  geom_col()
# Mean death_rate (deaths / cases) per continent.
#
# Rows with an empty continent are aggregate entries (e.g. "World"), not
# countries, and would show up as an unnamed extra bar, so they are excluded.
# Non-finite rates (NA, NaN or Inf from rows with zero cases) are dropped
# before averaging so a single Inf cannot distort a continent's mean.
continent_death <- covid_clean %>%
  filter(continent != "", is.finite(death_rate)) %>%
  group_by(continent) %>%
  summarise(avg_death_rate = mean(death_rate))

ggplot(continent_death, aes(continent, avg_death_rate)) +
  geom_col()
The analysis of global COVID-19 data reveals trends, relationships, and differences across countries and continents. It demonstrates how data analysis can provide meaningful real-world insights.