I’ll be using the SARS 2003 Outbreak Complete Dataset from Kaggle. A cleaned copy is read directly from its raw GitHub URL in the `read.csv()` call below.
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
library(knitr)
# Load the cleaned SARS 2003 dataset from its raw GitHub URL.
# stringsAsFactors = FALSE keeps text columns (Date, Country) as character
# vectors; spell out FALSE because the shorthand F can be reassigned.
df <- read.csv(
  "https://raw.githubusercontent.com/metis-macys-66898/data_607_sp2020/master/sars_2003_complete_dataset_clean.csv",
  stringsAsFactors = FALSE
)
# Preview the first rows and inspect the column types
head(df)
str(df)
## 'data.frame': 2538 obs. of 5 variables:
## $ Date : chr "2003-03-17" "2003-03-17" "2003-03-17" "2003-03-17" ...
## $ Country : chr "Germany" "Canada" "Singapore" "Hong Kong SAR, China" ...
## $ Cumulative.number.of.case.s.: int 1 8 20 95 2 1 40 2 8 0 ...
## $ Number.of.deaths : int 0 2 0 1 0 0 1 0 2 0 ...
## $ Number.recovered : int 0 0 0 0 0 0 0 0 0 0 ...
# Print df as a tibble. as_tibble() replaces the deprecated as.tibble(),
# so no deprecation warning is emitted.
as_tibble(df)
rename(.data, ...)
select(.data, ...)
# rename() keeps every column and only changes the name of the one listed
# (new name on the left, old name on the right).
df1 <- rename(
  df,
  Cumulative_number_of_cases = Cumulative.number.of.case.s.
)
colnames(df1)
## [1] "Date" "Country"
## [3] "Cumulative_number_of_cases" "Number.of.deaths"
## [5] "Number.recovered"
# select() can rename too, but it keeps only the columns that are listed.
head(
  select(df, Cumulative_number_of_cases = Cumulative.number.of.case.s.),
  5
)
top_frac(x, n, wt)
top_n(x, n, wt)
# count() returns the number of records per day in a column named n.
# Piping the result of group_by(x) into a verb applies that verb per group of x.
df1 %>%
  group_by(Date) %>%
  count()
# Per day, keep the top 30% of rows ranked by cumulative cases; store as df2
df2 <- df1 %>%
  group_by(Date) %>%
  top_n(0.3 * n(), Cumulative_number_of_cases)
df2
# top_frac() takes the fraction itself (.3) rather than a row count
# (.3 * n()). Store the result as df2a
df2a <- df1 %>%
  group_by(Date) %>%
  top_frac(0.3, Cumulative_number_of_cases)
df2a
# Confirm that both approaches select the same rows
all.equal(df2, df2a)
## [1] TRUE
tally(x, wt = NULL, sort = FALSE, name = "n")
count(x, ..., wt = NULL, sort = FALSE, name = "n", .drop = group_by_drop_default(x))
# tally() stores its result in an automatically created column named n.
df2 %>%
  group_by(Country) %>%
  tally()
# count(x) is shorthand for group_by(x) followed by tally().
# df2 is still grouped by Date (top_n() keeps the grouping) and count() adds
# Country to the existing groups, so ungroup first; otherwise the result is
# one row per Date/Country pair instead of the per-country totals above.
df2 %>%
  ungroup() %>%
  count(Country)
mutate(.data, ...)
# Use mutate() to add a new variable `Year` holding only the year component
# of the existing `Date` column (stored as a character string such as "2003").
# Refer to the bare column `Date`, not `df$Date`, so the expression is
# evaluated on the data flowing through the pipe. as.Date() is enough for a
# calendar date and avoids any time-zone handling.
df.new <- df %>%
  mutate(Year = format(as.Date(Date, format = "%Y-%m-%d"), "%Y"))
str(df.new)
## 'data.frame': 2538 obs. of 6 variables:
## $ Date : chr "2003-03-17" "2003-03-17" "2003-03-17" "2003-03-17" ...
## $ Country : chr "Germany" "Canada" "Singapore" "Hong Kong SAR, China" ...
## $ Cumulative.number.of.case.s.: int 1 8 20 95 2 1 40 2 8 0 ...
## $ Number.of.deaths : int 0 2 0 1 0 0 1 0 2 0 ...
## $ Number.recovered : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Year : chr "2003" "2003" "2003" "2003" ...
# Render the first 10 rows of df.new as a table
df.new %>%
  head(10) %>%
  kable()
Date | Country | Cumulative.number.of.case.s. | Number.of.deaths | Number.recovered | Year |
---|---|---|---|---|---|
2003-03-17 | Germany | 1 | 0 | 0 | 2003 |
2003-03-17 | Canada | 8 | 2 | 0 | 2003 |
2003-03-17 | Singapore | 20 | 0 | 0 | 2003 |
2003-03-17 | Hong Kong SAR, China | 95 | 1 | 0 | 2003 |
2003-03-17 | Switzerland | 2 | 0 | 0 | 2003 |
2003-03-17 | Thailand | 1 | 0 | 0 | 2003 |
2003-03-17 | Viet Nam | 40 | 1 | 0 | 2003 |
2003-03-18 | Germany | 2 | 0 | 0 | 2003 |
2003-03-18 | Canada | 8 | 2 | 0 | 2003 |
2003-03-18 | China | 0 | 0 | 0 | 2003 |
arrange(.data, ...)
# Sort by `Country` in descending order, then by `Date` in the default
# ascending order
arrange(df.new, desc(Country), Date)
slice(.data, ..., .preserve = FALSE)
# Pick the last row: inside slice(), n() evaluates to the number of rows
df.new %>%
  slice(n()) %>%
  kable()
Date | Country | Cumulative.number.of.case.s. | Number.of.deaths | Number.recovered | Year |
---|---|---|---|---|---|
2003-07-11 | Viet Nam | 63 | 5 | 58 | 2003 |
# Pick the first 7 rows by passing the row positions 1 through 7 to slice()
df.new %>%
  slice(1:7) %>%
  kable()
Date | Country | Cumulative.number.of.case.s. | Number.of.deaths | Number.recovered | Year |
---|---|---|---|---|---|
2003-03-17 | Germany | 1 | 0 | 0 | 2003 |
2003-03-17 | Canada | 8 | 2 | 0 | 2003 |
2003-03-17 | Singapore | 20 | 0 | 0 | 2003 |
2003-03-17 | Hong Kong SAR, China | 95 | 1 | 0 | 2003 |
2003-03-17 | Switzerland | 2 | 0 | 0 | 2003 |
2003-03-17 | Thailand | 1 | 0 | 0 | 2003 |
2003-03-17 | Viet Nam | 40 | 1 | 0 | 2003 |