Clear R Memory and Load Packages

# Start from a clean workspace: remove every object in the current environment,
# then trigger garbage collection (gc() also prints a memory-usage table).
rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 412325 22.1     857312 45.8         NA   641517 34.3
## Vcells 786693  6.1    8388608 64.0      16384  1768602 13.5
## Set directory
##directory <- ("~/Users/Nazija/Desktop")
##setwd("~/Users/Nazija/Desktop")
##list.files()

library(zoo) ## not tidyverse, but useful package for handling irregular time series data
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(magrittr) ## pipe operator (%>%) creates readable code
library(lubridate) ## works with date & date-time data
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidyverse) ## ggplot2, dplyr, tidyr, readr, etc.
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.1
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x lubridate::as.difftime() masks base::as.difftime()
## x lubridate::date()        masks base::date()
## x tidyr::extract()         masks magrittr::extract()
## x dplyr::filter()          masks stats::filter()
## x lubridate::intersect()   masks base::intersect()
## x dplyr::lag()             masks stats::lag()
## x purrr::set_names()       masks magrittr::set_names()
## x lubridate::setdiff()     masks base::setdiff()
## x lubridate::union()       masks base::union()

Load Data

# Read the COVID-19 CSV into a tibble; the first row holds the column names.
covid19 <- read_csv(
  file = "/Users/Nazija/Desktop/modified_covid_dataset_ourworldindata.csv",
  col_names = TRUE
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   iso_code = col_character(),
##   continent = col_character(),
##   location = col_character(),
##   date = col_character(),
##   tests_units = col_character()
## )
## See spec(...) for full column specifications.
head(covid19, n = 6L) # preview the first six rows
## # A tibble: 6 x 24
##   iso_code continent location date  total_cases new_cases total_deaths
##   <chr>    <chr>     <chr>    <chr>       <dbl>     <dbl>        <dbl>
## 1 AFG      Asia      Afghani… 12/3…           0         0            0
## 2 AFG      Asia      Afghani… 1/1/…           0         0            0
## 3 AFG      Asia      Afghani… 1/2/…           0         0            0
## 4 AFG      Asia      Afghani… 1/3/…           0         0            0
## 5 AFG      Asia      Afghani… 1/4/…           0         0            0
## 6 AFG      Asia      Afghani… 1/5/…           0         0            0
## # … with 17 more variables: new_deaths <dbl>, total_tests <dbl>,
## #   tests_units <chr>, population <dbl>, population_density <dbl>,
## #   median_age <dbl>, aged_65_older <dbl>, aged_70_older <dbl>,
## #   gdp_per_capita <dbl>, extreme_poverty <dbl>, cvd_death_rate <dbl>,
## #   diabetes_prevalence <dbl>, female_smokers <dbl>, male_smokers <dbl>,
## #   handwashing_facilities <dbl>, hospital_beds_per_thousand <dbl>,
## #   life_expectancy <dbl>
tail(covid19, n = 6L) # preview the last six rows
## # A tibble: 6 x 24
##   iso_code continent location date  total_cases new_cases total_deaths
##   <chr>    <chr>     <chr>    <chr>       <dbl>     <dbl>        <dbl>
## 1 <NA>     <NA>      Interna… 2/27…         705        14            4
## 2 <NA>     <NA>      Interna… 2/28…         705         0            4
## 3 <NA>     <NA>      Interna… 2/29…         705         0            6
## 4 <NA>     <NA>      Interna… 3/1/…         705         0            6
## 5 <NA>     <NA>      Interna… 3/2/…         705         0            6
## 6 <NA>     <NA>      Interna… 3/10…         696        -9            7
## # … with 17 more variables: new_deaths <dbl>, total_tests <dbl>,
## #   tests_units <chr>, population <dbl>, population_density <dbl>,
## #   median_age <dbl>, aged_65_older <dbl>, aged_70_older <dbl>,
## #   gdp_per_capita <dbl>, extreme_poverty <dbl>, cvd_death_rate <dbl>,
## #   diabetes_prevalence <dbl>, female_smokers <dbl>, male_smokers <dbl>,
## #   handwashing_facilities <dbl>, hospital_beds_per_thousand <dbl>,
## #   life_expectancy <dbl>
glimpse(covid19) # row and column counts, each column's type, and its first few values
## Rows: 28,714
## Columns: 24
## $ iso_code                   <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", …
## $ continent                  <chr> "Asia", "Asia", "Asia", "Asia", "Asia", "A…
## $ location                   <chr> "Afghanistan", "Afghanistan", "Afghanistan…
## $ date                       <chr> "12/31/2019", "1/1/2020", "1/2/2020", "1/3…
## $ total_cases                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ new_cases                  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_deaths               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ new_deaths                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ total_tests                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ tests_units                <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ population                 <dbl> 38928341, 38928341, 38928341, 38928341, 38…
## $ population_density         <dbl> 54.422, 54.422, 54.422, 54.422, 54.422, 54…
## $ median_age                 <dbl> 18.6, 18.6, 18.6, 18.6, 18.6, 18.6, 18.6, …
## $ aged_65_older              <dbl> 2.581, 2.581, 2.581, 2.581, 2.581, 2.581, …
## $ aged_70_older              <dbl> 1.337, 1.337, 1.337, 1.337, 1.337, 1.337, …
## $ gdp_per_capita             <dbl> 1803.987, 1803.987, 1803.987, 1803.987, 18…
## $ extreme_poverty            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ cvd_death_rate             <dbl> 597.029, 597.029, 597.029, 597.029, 597.02…
## $ diabetes_prevalence        <dbl> 9.59, 9.59, 9.59, 9.59, 9.59, 9.59, 9.59, …
## $ female_smokers             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ male_smokers               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ handwashing_facilities     <dbl> 37.746, 37.746, 37.746, 37.746, 37.746, 37…
## $ hospital_beds_per_thousand <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.…
## $ life_expectancy            <dbl> 64.83, 64.83, 64.83, 64.83, 64.83, 64.83, …
summary(covid19) # per-column min, quartiles, mean, max, and NA counts (length/class for character columns)
##    iso_code          continent           location             date          
##  Length:28714       Length:28714       Length:28714       Length:28714      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   total_cases         new_cases         total_deaths      new_deaths      
##  Min.   :       0   Min.   :-29726.0   Min.   :     0   Min.   :-1918.00  
##  1st Qu.:      16   1st Qu.:     0.0   1st Qu.:     0   1st Qu.:    0.00  
##  Median :     299   Median :     4.0   Median :     6   Median :    0.00  
##  Mean   :   37246   Mean   :   814.8   Mean   :  2159   Mean   :   37.81  
##  3rd Qu.:    3275   3rd Qu.:    77.0   3rd Qu.:    74   3rd Qu.:    2.00  
##  Max.   :11586205   Max.   :206544.0   Max.   :537701   Max.   :10489.00  
##  NA's   :274        NA's   :274        NA's   :274      NA's   :274       
##   total_tests       tests_units          population        population_density 
##  Min.   :       1   Length:28714       Min.   :8.090e+02   Min.   :    0.137  
##  1st Qu.:   20958   Class :character   1st Qu.:1.933e+06   1st Qu.:   39.497  
##  Median :   92123   Mode  :character   Median :9.006e+06   Median :   90.672  
##  Mean   :  592450                      Mean   :9.794e+07   Mean   :  369.293  
##  3rd Qu.:  351285                      3rd Qu.:3.287e+07   3rd Qu.:  222.873  
##  Max.   :36225015                      Max.   :7.795e+09   Max.   :19347.500  
##  NA's   :20009                         NA's   :64          NA's   :1264       
##    median_age    aged_65_older    aged_70_older    gdp_per_capita    
##  Min.   :15.10   Min.   : 1.144   Min.   : 0.526   Min.   :   661.2  
##  1st Qu.:25.00   1st Qu.: 3.655   1st Qu.: 2.171   1st Qu.:  6171.9  
##  Median :31.90   Median : 7.150   Median : 4.593   Median : 15308.7  
##  Mean   :31.78   Mean   : 9.534   Mean   : 6.051   Mean   : 21849.4  
##  3rd Qu.:40.10   3rd Qu.:14.864   3rd Qu.: 9.732   3rd Qu.: 33132.3  
##  Max.   :48.20   Max.   :27.049   Max.   :18.493   Max.   :116935.6  
##  NA's   :2815    NA's   :3184     NA's   :2948     NA's   :3114      
##  extreme_poverty cvd_death_rate   diabetes_prevalence female_smokers 
##  Min.   : 0.10   Min.   : 79.37   Min.   : 0.990      Min.   : 0.10  
##  1st Qu.: 0.50   1st Qu.:152.78   1st Qu.: 5.310      1st Qu.: 1.90  
##  Median : 1.70   Median :235.95   Median : 7.110      Median : 6.90  
##  Mean   :11.16   Mean   :248.68   Mean   : 8.035      Mean   :11.06  
##  3rd Qu.:14.80   3rd Qu.:317.84   3rd Qu.:10.080      3rd Qu.:19.60  
##  Max.   :77.60   Max.   :724.42   Max.   :23.360      Max.   :44.00  
##  NA's   :11588   NA's   :2785     NA's   :1938        NA's   :8000   
##   male_smokers   handwashing_facilities hospital_beds_per_thousand
##  Min.   : 7.70   Min.   : 1.188         Min.   : 0.100            
##  1st Qu.:21.40   1st Qu.:23.437         1st Qu.: 1.380            
##  Median :31.40   Median :59.550         Median : 2.540            
##  Mean   :32.65   Mean   :53.738         Mean   : 3.165            
##  3rd Qu.:40.90   3rd Qu.:83.741         3rd Qu.: 4.210            
##  Max.   :78.10   Max.   :98.999         Max.   :13.800            
##  NA's   :8242    NA's   :16974          NA's   :5052              
##  life_expectancy
##  Min.   :53.28  
##  1st Qu.:70.60  
##  Median :75.88  
##  Mean   :74.35  
##  3rd Qu.:80.18  
##  Max.   :86.75  
##  NA's   :400

Clean Data

# Build the China / Japan / Korea analysis subset from the raw `covid19` tibble:
# parse dates, derive an ordered month factor, drop unwanted rows, keep 3 countries.
covid19CJK <- covid19 %>%
  mutate(
    # Convert date from character ("m/d/Y") to Date. Use the piped column `date`
    # rather than `covid19$date`, so this step stays aligned with the rows
    # actually flowing through the pipeline if earlier steps are ever added.
    date = as.Date(date, format = "%m/%d/%Y"),

    # Ordered month factor (Jan < Feb < ... < Dec). `month.abb` is base R's
    # built-in vector of the twelve abbreviated English month names.
    month = factor(
      month(date, label = FALSE), # month number, 1-12
      levels = 1:12,              # values it could take
      labels = month.abb,         # how they should appear
      ordered = TRUE
    )
  ) %>%

  # Filter out cases on cruise ships ("International") and 12/31/2019.
  # Compare against a real Date instead of relying on string coercion.
  filter(location != "International", date != as.Date("2019-12-31")) %>%

  # Extract data for China, Japan, and Korea
  filter(iso_code %in% c("CHN", "JPN", "KOR"))