# Clear every object from the current workspace, then run garbage collection
# (gc() also prints the memory-usage table).
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 412325 22.1 857312 45.8 NA 641517 34.3
## Vcells 786693 6.1 8388608 64.0 16384 1768602 13.5
## Set directory
##directory <- ("~/Users/Nazija/Desktop")
##setwd("~/Users/Nazija/Desktop")
##list.files()
library(zoo) ## not tidyverse, but useful package for handling irregular time series data
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(magrittr) ## pipe operator (%>%) creates readable code
library(lubridate) ## works with date & date-time data
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidyverse) ## ggplot2, dplyr, tidyr, readr, etc.
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.1
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date() masks base::date()
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
## x lubridate::setdiff() masks base::setdiff()
## x lubridate::union() masks base::union()
# Read the COVID-19 CSV into a tibble; the first row supplies the column names.
covid19 <- read_csv(
  "/Users/Nazija/Desktop/modified_covid_dataset_ourworldindata.csv",
  col_names = TRUE
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## iso_code = col_character(),
## continent = col_character(),
## location = col_character(),
## date = col_character(),
## tests_units = col_character()
## )
## See spec(...) for full column specifications.
# Peek at the first six rows.
head(covid19)
## # A tibble: 6 x 24
## iso_code continent location date total_cases new_cases total_deaths
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 AFG Asia Afghani… 12/3… 0 0 0
## 2 AFG Asia Afghani… 1/1/… 0 0 0
## 3 AFG Asia Afghani… 1/2/… 0 0 0
## 4 AFG Asia Afghani… 1/3/… 0 0 0
## 5 AFG Asia Afghani… 1/4/… 0 0 0
## 6 AFG Asia Afghani… 1/5/… 0 0 0
## # … with 17 more variables: new_deaths <dbl>, total_tests <dbl>,
## # tests_units <chr>, population <dbl>, population_density <dbl>,
## # median_age <dbl>, aged_65_older <dbl>, aged_70_older <dbl>,
## # gdp_per_capita <dbl>, extreme_poverty <dbl>, cvd_death_rate <dbl>,
## # diabetes_prevalence <dbl>, female_smokers <dbl>, male_smokers <dbl>,
## # handwashing_facilities <dbl>, hospital_beds_per_thousand <dbl>,
## # life_expectancy <dbl>
# Peek at the last six rows.
tail(covid19)
## # A tibble: 6 x 24
## iso_code continent location date total_cases new_cases total_deaths
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 <NA> <NA> Interna… 2/27… 705 14 4
## 2 <NA> <NA> Interna… 2/28… 705 0 4
## 3 <NA> <NA> Interna… 2/29… 705 0 6
## 4 <NA> <NA> Interna… 3/1/… 705 0 6
## 5 <NA> <NA> Interna… 3/2/… 705 0 6
## 6 <NA> <NA> Interna… 3/10… 696 -9 7
## # … with 17 more variables: new_deaths <dbl>, total_tests <dbl>,
## # tests_units <chr>, population <dbl>, population_density <dbl>,
## # median_age <dbl>, aged_65_older <dbl>, aged_70_older <dbl>,
## # gdp_per_capita <dbl>, extreme_poverty <dbl>, cvd_death_rate <dbl>,
## # diabetes_prevalence <dbl>, female_smokers <dbl>, male_smokers <dbl>,
## # handwashing_facilities <dbl>, hospital_beds_per_thousand <dbl>,
## # life_expectancy <dbl>
# Compact overview: row and column counts, plus each column's type and its
# leading values.
glimpse(covid19)
## Rows: 28,714
## Columns: 24
## $ iso_code <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", …
## $ continent <chr> "Asia", "Asia", "Asia", "Asia", "Asia", "A…
## $ location <chr> "Afghanistan", "Afghanistan", "Afghanistan…
## $ date <chr> "12/31/2019", "1/1/2020", "1/2/2020", "1/3…
## $ total_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ new_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ new_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_tests <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ tests_units <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ population <dbl> 38928341, 38928341, 38928341, 38928341, 38…
## $ population_density <dbl> 54.422, 54.422, 54.422, 54.422, 54.422, 54…
## $ median_age <dbl> 18.6, 18.6, 18.6, 18.6, 18.6, 18.6, 18.6, …
## $ aged_65_older <dbl> 2.581, 2.581, 2.581, 2.581, 2.581, 2.581, …
## $ aged_70_older <dbl> 1.337, 1.337, 1.337, 1.337, 1.337, 1.337, …
## $ gdp_per_capita <dbl> 1803.987, 1803.987, 1803.987, 1803.987, 18…
## $ extreme_poverty <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ cvd_death_rate <dbl> 597.029, 597.029, 597.029, 597.029, 597.02…
## $ diabetes_prevalence <dbl> 9.59, 9.59, 9.59, 9.59, 9.59, 9.59, 9.59, …
## $ female_smokers <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ male_smokers <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ handwashing_facilities <dbl> 37.746, 37.746, 37.746, 37.746, 37.746, 37…
## $ hospital_beds_per_thousand <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.…
## $ life_expectancy <dbl> 64.83, 64.83, 64.83, 64.83, 64.83, 64.83, …
# Per-column summary: range, quartiles, mean, and count of missing values.
summary(covid19)
## iso_code continent location date
## Length:28714 Length:28714 Length:28714 Length:28714
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## total_cases new_cases total_deaths new_deaths
## Min. : 0 Min. :-29726.0 Min. : 0 Min. :-1918.00
## 1st Qu.: 16 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0.00
## Median : 299 Median : 4.0 Median : 6 Median : 0.00
## Mean : 37246 Mean : 814.8 Mean : 2159 Mean : 37.81
## 3rd Qu.: 3275 3rd Qu.: 77.0 3rd Qu.: 74 3rd Qu.: 2.00
## Max. :11586205 Max. :206544.0 Max. :537701 Max. :10489.00
## NA's :274 NA's :274 NA's :274 NA's :274
## total_tests tests_units population population_density
## Min. : 1 Length:28714 Min. :8.090e+02 Min. : 0.137
## 1st Qu.: 20958 Class :character 1st Qu.:1.933e+06 1st Qu.: 39.497
## Median : 92123 Mode :character Median :9.006e+06 Median : 90.672
## Mean : 592450 Mean :9.794e+07 Mean : 369.293
## 3rd Qu.: 351285 3rd Qu.:3.287e+07 3rd Qu.: 222.873
## Max. :36225015 Max. :7.795e+09 Max. :19347.500
## NA's :20009 NA's :64 NA's :1264
## median_age aged_65_older aged_70_older gdp_per_capita
## Min. :15.10 Min. : 1.144 Min. : 0.526 Min. : 661.2
## 1st Qu.:25.00 1st Qu.: 3.655 1st Qu.: 2.171 1st Qu.: 6171.9
## Median :31.90 Median : 7.150 Median : 4.593 Median : 15308.7
## Mean :31.78 Mean : 9.534 Mean : 6.051 Mean : 21849.4
## 3rd Qu.:40.10 3rd Qu.:14.864 3rd Qu.: 9.732 3rd Qu.: 33132.3
## Max. :48.20 Max. :27.049 Max. :18.493 Max. :116935.6
## NA's :2815 NA's :3184 NA's :2948 NA's :3114
## extreme_poverty cvd_death_rate diabetes_prevalence female_smokers
## Min. : 0.10 Min. : 79.37 Min. : 0.990 Min. : 0.10
## 1st Qu.: 0.50 1st Qu.:152.78 1st Qu.: 5.310 1st Qu.: 1.90
## Median : 1.70 Median :235.95 Median : 7.110 Median : 6.90
## Mean :11.16 Mean :248.68 Mean : 8.035 Mean :11.06
## 3rd Qu.:14.80 3rd Qu.:317.84 3rd Qu.:10.080 3rd Qu.:19.60
## Max. :77.60 Max. :724.42 Max. :23.360 Max. :44.00
## NA's :11588 NA's :2785 NA's :1938 NA's :8000
## male_smokers handwashing_facilities hospital_beds_per_thousand
## Min. : 7.70 Min. : 1.188 Min. : 0.100
## 1st Qu.:21.40 1st Qu.:23.437 1st Qu.: 1.380
## Median :31.40 Median :59.550 Median : 2.540
## Mean :32.65 Mean :53.738 Mean : 3.165
## 3rd Qu.:40.90 3rd Qu.:83.741 3rd Qu.: 4.210
## Max. :78.10 Max. :98.999 Max. :13.800
## NA's :8242 NA's :16974 NA's :5052
## life_expectancy
## Min. :53.28
## 1st Qu.:70.60
## Median :75.88
## Mean :74.35
## 3rd Qu.:80.18
## Max. :86.75
## NA's :400
# Build the China / Japan / South Korea subset used for the plot below.
covid19CJK <- covid19 %>%
  mutate(
    # Convert the "m/d/Y" character dates to Date. The column is referenced by
    # its bare name so this step operates on the data flowing through the
    # pipe instead of reaching back to the global `covid19` object (which
    # would misalign or fail if rows were ever filtered before this step).
    date = as.Date(date, format = "%m/%d/%Y"),
    # Calendar month as an ordered factor (Jan < Feb < ... < Dec). `month.abb`
    # is base R's built-in vector of three-letter English month abbreviations.
    month = factor(
      month(date, label = FALSE), # month number, 1-12
      levels = 1:12,              # values it could take
      labels = month.abb,         # how they should appear
      ordered = TRUE
    )
  ) %>%
  # Drop the "International" (cruise ship) rows and the 2019-12-31 records;
  # comparing against an explicit Date avoids relying on string coercion.
  filter(location != "International", date != as.Date("2019-12-31")) %>%
  # Keep only China, Japan, and South Korea.
  filter(iso_code %in% c("CHN", "JPN", "KOR"))
# Line chart of daily new cases over time, one coloured line per country.
ggplot(
  covid19CJK,
  aes(x = date, y = new_cases, group = location, color = location)
) +
  geom_line() +
  theme_classic() +
  labs(
    title = "Confirmed COVID Cases per Country",
    x = "Date",
    y = "Confirmed Cases"
  )
The three countries experienced their peak number of new cases at different times and at different magnitudes. China peaked earliest, at about 15,000 new cases around February, and stayed at that level only briefly before the case count began to decrease rapidly. South Korea experienced its peak number of new cases soon after, but at a much lower count of about 1,000 new cases. It stayed around that case count for longer than China did and experienced a slower decline. Japan reached its peak new case count much later, in April, and had a slightly higher peak than South Korea. While the case count did not stay this high for long, Japan’s new case count remained near this number for some time, much like Korea’s. All three countries had near 0 new cases after May, though Japan seemed to be experiencing a rise in new cases by the end of July.
China experienced the most new cases in a single day. Even before reaching its peak, it had a little under 5,000 new cases while Japan’s and Korea’s new case counts were near 0. China’s new case count then dropped before rapidly rising again, only to drop once more. After falling around March, China stayed at a near 0 new case count, with one small spike in April/May. South Korea’s highest new case count was comparable to one of China’s lowest; it reached this point more slowly, with steady increases in confirmed cases, and lowered its case count over the course of a few weeks. Once lowered, the number of new cases slowly declined before settling between 0 and about 100 new cases. As in China, there was a spike in April, but it was not nearly as high, at about 250 new cases. Japan had its peak new case count in April, and while it slowly declined, the new case count per day was rarely as close to 0 as it was in South Korea, and it was rising rapidly by July.