# clear environment
# it's always a good idea to start fresh with no variables loaded in the R environment
# Manish Gurung
rm(list = ls())
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(psych)
## Warning: package 'psych' was built under R version 4.3.3
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(dplyr)
# define the default directory
# it saves you the trouble of having to specify the full path every time you need to load or save a file
# (change the path below to where you have downloaded the covid dataset)
setwd("C:/Users/manis/Desktop/info332")
# load and inspect data
# important: we will use read_csv not read.csv
df_covid <- read_csv("owid-covid-data.csv")
## Rows: 76215 Columns: 59
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): iso_code, continent, location, tests_units
## dbl (54): total_cases, new_cases, new_cases_smoothed, total_deaths, new_dea...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(df_covid)
## spc_tbl_ [76,215 × 59] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ iso_code : chr [1:76215] "AFG" "AFG" "AFG" "AFG" ...
## $ continent : chr [1:76215] "Asia" "Asia" "Asia" "Asia" ...
## $ location : chr [1:76215] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ date : Date[1:76215], format: "2020-02-24" "2020-02-25" ...
## $ total_cases : num [1:76215] 1 1 1 1 1 1 1 1 2 4 ...
## $ new_cases : num [1:76215] 1 0 0 0 0 0 0 0 1 2 ...
## $ new_cases_smoothed : num [1:76215] NA NA NA NA NA 0.143 0.143 0 0.143 0.429 ...
## $ total_deaths : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ new_deaths : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ new_deaths_smoothed : num [1:76215] NA NA NA NA NA 0 0 0 0 0 ...
## $ total_cases_per_million : num [1:76215] 0.026 0.026 0.026 0.026 0.026 0.026 0.026 0.026 0.051 0.103 ...
## $ new_cases_per_million : num [1:76215] 0.026 0 0 0 0 0 0 0 0.026 0.051 ...
## $ new_cases_smoothed_per_million : num [1:76215] NA NA NA NA NA 0.004 0.004 0 0.004 0.011 ...
## $ total_deaths_per_million : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ new_deaths_per_million : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ new_deaths_smoothed_per_million : num [1:76215] NA NA NA NA NA 0 0 0 0 0 ...
## $ reproduction_rate : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ icu_patients : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ icu_patients_per_million : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ hosp_patients : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ hosp_patients_per_million : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_icu_admissions : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_icu_admissions_per_million : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_hosp_admissions : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ weekly_hosp_admissions_per_million : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ total_tests : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ total_tests_per_thousand : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests_per_thousand : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests_smoothed : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ new_tests_smoothed_per_thousand : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ positive_rate : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ tests_per_case : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ tests_units : chr [1:76215] NA NA NA NA ...
## $ total_vaccinations : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ people_vaccinated : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ people_fully_vaccinated : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ new_vaccinations : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ new_vaccinations_smoothed : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ total_vaccinations_per_hundred : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ people_vaccinated_per_hundred : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ people_fully_vaccinated_per_hundred : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ new_vaccinations_smoothed_per_million: num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ stringency_index : num [1:76215] 8.33 8.33 8.33 8.33 8.33 ...
## $ population : num [1:76215] 38928341 38928341 38928341 38928341 38928341 ...
## $ population_density : num [1:76215] 54.4 54.4 54.4 54.4 54.4 ...
## $ median_age : num [1:76215] 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 ...
## $ aged_65_older : num [1:76215] 2.58 2.58 2.58 2.58 2.58 ...
## $ aged_70_older : num [1:76215] 1.34 1.34 1.34 1.34 1.34 ...
## $ gdp_per_capita : num [1:76215] 1804 1804 1804 1804 1804 ...
## $ extreme_poverty : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ cardiovasc_death_rate : num [1:76215] 597 597 597 597 597 ...
## $ diabetes_prevalence : num [1:76215] 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 ...
## $ female_smokers : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ male_smokers : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
## $ handwashing_facilities : num [1:76215] 37.7 37.7 37.7 37.7 37.7 ...
## $ hospital_beds_per_thousand : num [1:76215] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
## $ life_expectancy : num [1:76215] 64.8 64.8 64.8 64.8 64.8 ...
## $ human_development_index : num [1:76215] 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 ...
## - attr(*, "spec")=
## .. cols(
## .. iso_code = col_character(),
## .. continent = col_character(),
## .. location = col_character(),
## .. date = col_date(format = ""),
## .. total_cases = col_double(),
## .. new_cases = col_double(),
## .. new_cases_smoothed = col_double(),
## .. total_deaths = col_double(),
## .. new_deaths = col_double(),
## .. new_deaths_smoothed = col_double(),
## .. total_cases_per_million = col_double(),
## .. new_cases_per_million = col_double(),
## .. new_cases_smoothed_per_million = col_double(),
## .. total_deaths_per_million = col_double(),
## .. new_deaths_per_million = col_double(),
## .. new_deaths_smoothed_per_million = col_double(),
## .. reproduction_rate = col_double(),
## .. icu_patients = col_double(),
## .. icu_patients_per_million = col_double(),
## .. hosp_patients = col_double(),
## .. hosp_patients_per_million = col_double(),
## .. weekly_icu_admissions = col_double(),
## .. weekly_icu_admissions_per_million = col_double(),
## .. weekly_hosp_admissions = col_double(),
## .. weekly_hosp_admissions_per_million = col_double(),
## .. new_tests = col_double(),
## .. total_tests = col_double(),
## .. total_tests_per_thousand = col_double(),
## .. new_tests_per_thousand = col_double(),
## .. new_tests_smoothed = col_double(),
## .. new_tests_smoothed_per_thousand = col_double(),
## .. positive_rate = col_double(),
## .. tests_per_case = col_double(),
## .. tests_units = col_character(),
## .. total_vaccinations = col_double(),
## .. people_vaccinated = col_double(),
## .. people_fully_vaccinated = col_double(),
## .. new_vaccinations = col_double(),
## .. new_vaccinations_smoothed = col_double(),
## .. total_vaccinations_per_hundred = col_double(),
## .. people_vaccinated_per_hundred = col_double(),
## .. people_fully_vaccinated_per_hundred = col_double(),
## .. new_vaccinations_smoothed_per_million = col_double(),
## .. stringency_index = col_double(),
## .. population = col_double(),
## .. population_density = col_double(),
## .. median_age = col_double(),
## .. aged_65_older = col_double(),
## .. aged_70_older = col_double(),
## .. gdp_per_capita = col_double(),
## .. extreme_poverty = col_double(),
## .. cardiovasc_death_rate = col_double(),
## .. diabetes_prevalence = col_double(),
## .. female_smokers = col_double(),
## .. male_smokers = col_double(),
## .. handwashing_facilities = col_double(),
## .. hospital_beds_per_thousand = col_double(),
## .. life_expectancy = col_double(),
## .. human_development_index = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Q1: Find the total number of COVID deaths in the USA in December 2020. Display the answer to console.
# provide solution here (use as many lines of code as you wish)
# total deaths in USA in december 2020
df_usa <- df_covid[df_covid$location=="United States",]
df_usa_dec <- df_usa[df_usa$date>="2020-12-01" & df_usa$date<="2020-12-31",]
sum(df_usa_dec$new_deaths)
## [1] 80998
# Q2: Generate a dataset that shows how total covid deaths compare on dec 31 2020 for Mexico, Canada, and USA. Display the answer to console.
# provide solution here (use as many lines of code as you wish)
df_covid %>%
filter(location %in% c("Mexico", "Canada", "United States")) %>%
filter(date == "2020-12-31") %>%
select(location, total_deaths)
## # A tibble: 3 × 2
## location total_deaths
## <chr> <dbl>
## 1 Canada 15762
## 2 Mexico 125807
## 3 United States 351861
# Q3: Generate a dataset that shows the top 5 countries by total covid cases as of dec 31 2020. Display the answer to console.
# provide solution here (use as many lines of code as you wish)
df_covid %>%
filter(date == "2020-12-31",
!location %in% c("European Union","Africa","Asia","Europe","Interantional","World","North America","South America")) %>%
arrange(desc(total_cases)) %>%
select(location, total_cases) %>%
head(5) #excluding all continents and any location which is not a country
## # A tibble: 5 × 2
## location total_cases
## <chr> <dbl>
## 1 United States 20098800
## 2 India 10266674
## 3 Brazil 7675973
## 4 Russia 3127347
## 5 France 2677666
# Q4: Generate a dataset that shows the top 5 countries by total covid deaths relative to the population as of dec 31 2020. Display the answer to console.
# provide solution here (use as many lines of code as you wish)
df_covid %>%
filter(date == "2020-12-31") %>%
arrange(desc(total_deaths_per_million)) %>%
select(location,total_deaths_per_million) %>%
head(5)
## # A tibble: 5 × 2
## location total_deaths_per_million
## <chr> <dbl>
## 1 San Marino 1738.
## 2 Belgium 1685.
## 3 Slovenia 1297.
## 4 Bosnia and Herzegovina 1234.
## 5 Italy 1227.
# Q5:Find the date and number of new cases on the day the USA had its highest number of new cases in a single day. Display the answer to console.
# provide solution here (use as many lines of code as you wish)
df_covid %>%
filter(location == "United States") %>%
filter(new_cases == max(new_cases, na.rm = TRUE)) %>%
select(date, new_cases)
## # A tibble: 1 × 2
## date new_cases
## <date> <dbl>
## 1 2021-01-02 300416
# Q6: Find the total number of fully vaccinated people in the USA as of the most recent date in the dataset. Display the answer to console.
# provide solution here (use as many lines of code as you wish)
df_covid %>%
filter(location == "United States") %>%
filter(date == max(date, na.rm = TRUE)) %>%
select(date, people_fully_vaccinated)
## # A tibble: 1 × 2
## date people_fully_vaccinated
## <date> <dbl>
## 1 2021-03-20 43036818
# Q7: Calculate the average number of vaccinations per day in the USA in the month of March 2021. Display the answer to console.
# provide solution here (use as many lines of code as you wish)
df_usa <- df_covid[df_covid$location=="United States",]
avg_Vac <- df_usa[df_usa$date>="2021-03-01" & df_usa$date<="2021-03-31",]
sum(avg_Vac$new_vaccinations)/length(unique(avg_Vac$date))
## [1] 2310275
# Q8: Plot the daily covid cases in the World as function of time. Hint: use the plot function which takes to arguments x and y. Optionally use the type argument to make it a line plot (as shown below).
# provide solution here (use as many lines of code as you wish)
df_world <- df_covid[df_covid$location == "World",]
plot(df_world$date, df_world$new_cases, type= 'l')

# Q9: Find the country with population greater than 50 million and lowest number of total covid cases as of March 1 2021. Display the country and total cases to date.
# provide solution here (use as many lines of code as you wish)
df_covid %>%
filter(population >= 50000000) %>%
filter(date == "2021-03-01") %>%
filter(total_cases == min(total_cases, na.rm=TRUE)) %>%
select(location, total_cases)
## # A tibble: 1 × 2
## location total_cases
## <chr> <dbl>
## 1 Tanzania 509
# Q10: Generate a dataset that shows the total covid cases as of dec 31 in all the countries that that their name end with “land”. Display the answer to console. Hint: modifying the search phrase to “land$” ensures that “land” falls at the end of the word.
# provide solution here (use as many lines of code as you wish)
#there is no data for Dec 2021 and Dec 2022.
new_ds <- df_covid %>%
filter(grepl("land$", location)) %>%
filter(date == "2020-12-31") %>%
select(location, date, total_cases)
new_ds
## # A tibble: 7 × 3
## location date total_cases
## <chr> <date> <dbl>
## 1 Finland 2020-12-31 36107
## 2 Iceland 2020-12-31 5754
## 3 Ireland 2020-12-31 91779
## 4 New Zealand 2020-12-31 2162
## 5 Poland 2020-12-31 1294878
## 6 Switzerland 2020-12-31 452296
## 7 Thailand 2020-12-31 7163