# clear environment
# it's always a good idea to start fresh with no variables loaded in the R environment
# Manish Gurung
rm(list = ls())
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(psych)
## Warning: package 'psych' was built under R version 4.3.3
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(dplyr)

# define the default directory
# it saves you the trouble of having to specify the full path every time you need to load or save a file
# (change the path below to where you have downloaded the covid dataset)
setwd("C:/Users/manis/Desktop/info332")


# load and inspect data
# important: we will use read_csv not read.csv
df_covid <- read_csv("owid-covid-data.csv")
## Rows: 76215 Columns: 59
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): iso_code, continent, location, tests_units
## dbl  (54): total_cases, new_cases, new_cases_smoothed, total_deaths, new_dea...
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(df_covid)
## spc_tbl_ [76,215 × 59] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ iso_code                             : chr [1:76215] "AFG" "AFG" "AFG" "AFG" ...
##  $ continent                            : chr [1:76215] "Asia" "Asia" "Asia" "Asia" ...
##  $ location                             : chr [1:76215] "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ date                                 : Date[1:76215], format: "2020-02-24" "2020-02-25" ...
##  $ total_cases                          : num [1:76215] 1 1 1 1 1 1 1 1 2 4 ...
##  $ new_cases                            : num [1:76215] 1 0 0 0 0 0 0 0 1 2 ...
##  $ new_cases_smoothed                   : num [1:76215] NA NA NA NA NA 0.143 0.143 0 0.143 0.429 ...
##  $ total_deaths                         : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths                           : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_smoothed                  : num [1:76215] NA NA NA NA NA 0 0 0 0 0 ...
##  $ total_cases_per_million              : num [1:76215] 0.026 0.026 0.026 0.026 0.026 0.026 0.026 0.026 0.051 0.103 ...
##  $ new_cases_per_million                : num [1:76215] 0.026 0 0 0 0 0 0 0 0.026 0.051 ...
##  $ new_cases_smoothed_per_million       : num [1:76215] NA NA NA NA NA 0.004 0.004 0 0.004 0.011 ...
##  $ total_deaths_per_million             : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_per_million               : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ new_deaths_smoothed_per_million      : num [1:76215] NA NA NA NA NA 0 0 0 0 0 ...
##  $ reproduction_rate                    : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients                         : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ icu_patients_per_million             : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients                        : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ hosp_patients_per_million            : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions                : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_icu_admissions_per_million    : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions               : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ weekly_hosp_admissions_per_million   : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests                            : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests                          : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ total_tests_per_thousand             : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_per_thousand               : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed                   : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ new_tests_smoothed_per_thousand      : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ positive_rate                        : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_per_case                       : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ tests_units                          : chr [1:76215] NA NA NA NA ...
##  $ total_vaccinations                   : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated                    : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated              : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations                     : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed            : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ total_vaccinations_per_hundred       : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ people_vaccinated_per_hundred        : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ people_fully_vaccinated_per_hundred  : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ new_vaccinations_smoothed_per_million: num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ stringency_index                     : num [1:76215] 8.33 8.33 8.33 8.33 8.33 ...
##  $ population                           : num [1:76215] 38928341 38928341 38928341 38928341 38928341 ...
##  $ population_density                   : num [1:76215] 54.4 54.4 54.4 54.4 54.4 ...
##  $ median_age                           : num [1:76215] 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 18.6 ...
##  $ aged_65_older                        : num [1:76215] 2.58 2.58 2.58 2.58 2.58 ...
##  $ aged_70_older                        : num [1:76215] 1.34 1.34 1.34 1.34 1.34 ...
##  $ gdp_per_capita                       : num [1:76215] 1804 1804 1804 1804 1804 ...
##  $ extreme_poverty                      : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ cardiovasc_death_rate                : num [1:76215] 597 597 597 597 597 ...
##  $ diabetes_prevalence                  : num [1:76215] 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 9.59 ...
##  $ female_smokers                       : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ male_smokers                         : num [1:76215] NA NA NA NA NA NA NA NA NA NA ...
##  $ handwashing_facilities               : num [1:76215] 37.7 37.7 37.7 37.7 37.7 ...
##  $ hospital_beds_per_thousand           : num [1:76215] 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
##  $ life_expectancy                      : num [1:76215] 64.8 64.8 64.8 64.8 64.8 ...
##  $ human_development_index              : num [1:76215] 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 0.511 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   iso_code = col_character(),
##   ..   continent = col_character(),
##   ..   location = col_character(),
##   ..   date = col_date(format = ""),
##   ..   total_cases = col_double(),
##   ..   new_cases = col_double(),
##   ..   new_cases_smoothed = col_double(),
##   ..   total_deaths = col_double(),
##   ..   new_deaths = col_double(),
##   ..   new_deaths_smoothed = col_double(),
##   ..   total_cases_per_million = col_double(),
##   ..   new_cases_per_million = col_double(),
##   ..   new_cases_smoothed_per_million = col_double(),
##   ..   total_deaths_per_million = col_double(),
##   ..   new_deaths_per_million = col_double(),
##   ..   new_deaths_smoothed_per_million = col_double(),
##   ..   reproduction_rate = col_double(),
##   ..   icu_patients = col_double(),
##   ..   icu_patients_per_million = col_double(),
##   ..   hosp_patients = col_double(),
##   ..   hosp_patients_per_million = col_double(),
##   ..   weekly_icu_admissions = col_double(),
##   ..   weekly_icu_admissions_per_million = col_double(),
##   ..   weekly_hosp_admissions = col_double(),
##   ..   weekly_hosp_admissions_per_million = col_double(),
##   ..   new_tests = col_double(),
##   ..   total_tests = col_double(),
##   ..   total_tests_per_thousand = col_double(),
##   ..   new_tests_per_thousand = col_double(),
##   ..   new_tests_smoothed = col_double(),
##   ..   new_tests_smoothed_per_thousand = col_double(),
##   ..   positive_rate = col_double(),
##   ..   tests_per_case = col_double(),
##   ..   tests_units = col_character(),
##   ..   total_vaccinations = col_double(),
##   ..   people_vaccinated = col_double(),
##   ..   people_fully_vaccinated = col_double(),
##   ..   new_vaccinations = col_double(),
##   ..   new_vaccinations_smoothed = col_double(),
##   ..   total_vaccinations_per_hundred = col_double(),
##   ..   people_vaccinated_per_hundred = col_double(),
##   ..   people_fully_vaccinated_per_hundred = col_double(),
##   ..   new_vaccinations_smoothed_per_million = col_double(),
##   ..   stringency_index = col_double(),
##   ..   population = col_double(),
##   ..   population_density = col_double(),
##   ..   median_age = col_double(),
##   ..   aged_65_older = col_double(),
##   ..   aged_70_older = col_double(),
##   ..   gdp_per_capita = col_double(),
##   ..   extreme_poverty = col_double(),
##   ..   cardiovasc_death_rate = col_double(),
##   ..   diabetes_prevalence = col_double(),
##   ..   female_smokers = col_double(),
##   ..   male_smokers = col_double(),
##   ..   handwashing_facilities = col_double(),
##   ..   hospital_beds_per_thousand = col_double(),
##   ..   life_expectancy = col_double(),
##   ..   human_development_index = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Q1: Find the total number of COVID deaths in the USA in December 2020. Display the answer to console.
# provide solution here (use as many lines of code as you wish)
# total deaths in USA in december 2020
df_usa <- df_covid[df_covid$location=="United States",]
df_usa_dec <- df_usa[df_usa$date>="2020-12-01" & df_usa$date<="2020-12-31",]
sum(df_usa_dec$new_deaths)
## [1] 80998
# Q2: Generate a dataset that shows how total covid deaths compare on dec 31 2020 for Mexico, Canada, and USA. Display the answer to console.
# provide solution here (use as many lines of code as you wish)


df_covid %>%
  filter(location %in% c("Mexico", "Canada", "United States")) %>%
  filter(date == "2020-12-31") %>%
  select(location, total_deaths)
## # A tibble: 3 × 2
##   location      total_deaths
##   <chr>                <dbl>
## 1 Canada               15762
## 2 Mexico              125807
## 3 United States       351861
# Q3: Generate a dataset that shows the top 5 countries by total covid cases as of dec 31 2020. Display the answer to console.
# provide solution here (use as many lines of code as you wish)


df_covid %>%
  filter(date == "2020-12-31",
         !location %in% c("European Union","Africa","Asia","Europe","Interantional","World","North America","South America")) %>%
  arrange(desc(total_cases)) %>%
  select(location, total_cases) %>%
  head(5) #excluding all continents and any location which is not a country 
## # A tibble: 5 × 2
##   location      total_cases
##   <chr>               <dbl>
## 1 United States    20098800
## 2 India            10266674
## 3 Brazil            7675973
## 4 Russia            3127347
## 5 France            2677666
# Q4: Generate a dataset that shows the top 5 countries by total covid deaths relative to the population as of dec 31 2020. Display the answer to console. 
# provide solution here (use as many lines of code as you wish)

df_covid %>%
  filter(date == "2020-12-31") %>%
  arrange(desc(total_deaths_per_million)) %>%
  select(location,total_deaths_per_million) %>%
  head(5)
## # A tibble: 5 × 2
##   location               total_deaths_per_million
##   <chr>                                     <dbl>
## 1 San Marino                                1738.
## 2 Belgium                                   1685.
## 3 Slovenia                                  1297.
## 4 Bosnia and Herzegovina                    1234.
## 5 Italy                                     1227.
# Q5:Find the date and number of new cases on the day the USA had its highest number of new cases in a single day. Display the answer to console.
# provide solution here (use as many lines of code as you wish)

df_covid %>%
  filter(location == "United States") %>%
  filter(new_cases == max(new_cases, na.rm = TRUE)) %>%
  select(date, new_cases)
## # A tibble: 1 × 2
##   date       new_cases
##   <date>         <dbl>
## 1 2021-01-02    300416
# Q6: Find the total number of fully vaccinated people in the USA as of the most recent date in the dataset. Display the answer to console.
# provide solution here (use as many lines of code as you wish)

df_covid %>%
  filter(location == "United States") %>%
  filter(date == max(date, na.rm = TRUE)) %>%
  select(date, people_fully_vaccinated)
## # A tibble: 1 × 2
##   date       people_fully_vaccinated
##   <date>                       <dbl>
## 1 2021-03-20                43036818
# Q7: Calculate the average number of vaccinations per day in the USA in the month of March 2021. Display the answer to console.
# provide solution here (use as many lines of code as you wish)

df_usa <- df_covid[df_covid$location=="United States",]

avg_Vac <- df_usa[df_usa$date>="2021-03-01" & df_usa$date<="2021-03-31",]
sum(avg_Vac$new_vaccinations)/length(unique(avg_Vac$date))
## [1] 2310275
# Q8: Plot the daily covid cases in the World as function of time. Hint: use the plot function which takes to arguments x and y. Optionally use the type argument to make it a line plot (as shown below).
# provide solution here (use as many lines of code as you wish)


df_world <- df_covid[df_covid$location == "World",]
plot(df_world$date, df_world$new_cases, type= 'l')

# Q9: Find the country with population greater than 50 million and lowest number of total covid cases as of March 1 2021. Display the country and total cases to date.
# provide solution here (use as many lines of code as you wish)


df_covid %>%
  filter(population >= 50000000) %>%
  filter(date == "2021-03-01") %>%
  filter(total_cases == min(total_cases, na.rm=TRUE)) %>%
  select(location, total_cases)
## # A tibble: 1 × 2
##   location total_cases
##   <chr>          <dbl>
## 1 Tanzania         509
# Q10: Generate a dataset that shows the total covid cases as of dec 31 in all the countries that that their name end with “land”. Display the answer to console. Hint: modifying the search phrase to “land$” ensures that “land” falls at the end of the word.
# provide solution here (use as many lines of code as you wish)

#there is no data for Dec 2021 and Dec 2022.

new_ds <- df_covid %>%
  filter(grepl("land$", location)) %>%
  filter(date == "2020-12-31") %>%
  select(location, date, total_cases)

new_ds
## # A tibble: 7 × 3
##   location    date       total_cases
##   <chr>       <date>           <dbl>
## 1 Finland     2020-12-31       36107
## 2 Iceland     2020-12-31        5754
## 3 Ireland     2020-12-31       91779
## 4 New Zealand 2020-12-31        2162
## 5 Poland      2020-12-31     1294878
## 6 Switzerland 2020-12-31      452296
## 7 Thailand    2020-12-31        7163