I’ll be using the data from SARS 2003 Outbreak Complete Dataset from Kaggle. The raw github link is here
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
df=read.csv('https://raw.githubusercontent.com/metis-macys-66898/data_607_sp2020/master/sars_2003_complete_dataset_clean.csv', stringsAsFactors
= F)
head(df)
## Date Country Cumulative.number.of.case.s. Number.of.deaths
## 1 2003-03-17 Germany 1 0
## 2 2003-03-17 Canada 8 2
## 3 2003-03-17 Singapore 20 0
## 4 2003-03-17 Hong Kong SAR, China 95 1
## 5 2003-03-17 Switzerland 2 0
## 6 2003-03-17 Thailand 1 0
## Number.recovered
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
str(df)
## 'data.frame': 2538 obs. of 5 variables:
## $ Date : chr "2003-03-17" "2003-03-17" "2003-03-17" "2003-03-17" ...
## $ Country : chr "Germany" "Canada" "Singapore" "Hong Kong SAR, China" ...
## $ Cumulative.number.of.case.s.: int 1 8 20 95 2 1 40 2 8 0 ...
## $ Number.of.deaths : int 0 2 0 1 0 0 1 0 2 0 ...
## $ Number.recovered : int 0 0 0 0 0 0 0 0 0 0 ...
as.tibble(df)
## Warning: `as.tibble()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
## # A tibble: 2,538 x 5
## Date Country Cumulative.number.of… Number.of.deaths Number.recovered
## <chr> <chr> <int> <int> <int>
## 1 2003-03… Germany 1 0 0
## 2 2003-03… Canada 8 2 0
## 3 2003-03… Singapore 20 0 0
## 4 2003-03… Hong Kong S… 95 1 0
## 5 2003-03… Switzerland 2 0 0
## 6 2003-03… Thailand 1 0 0
## 7 2003-03… Viet Nam 40 1 0
## 8 2003-03… Germany 2 0 0
## 9 2003-03… Canada 8 2 0
## 10 2003-03… China 0 0 0
## # … with 2,528 more rows
rename(.data, …)
select(.data, …)
df1 <- df %>% rename("Cumulative_number_of_cases" = "Cumulative.number.of.case.s.")
names(df1)
## [1] "Date" "Country"
## [3] "Cumulative_number_of_cases" "Number.of.deaths"
## [5] "Number.recovered"
df %>% select("Cumulative_number_of_cases" = "Cumulative.number.of.case.s.") %>% head(5)
## Cumulative_number_of_cases
## 1 1
## 2 8
## 3 20
## 4 95
## 5 2
top_frac(x, n, wt)
top_n(x, n, wt)
# The funciton count will give us a count of records by day, denoted in n.
# group_by (x) followed by %>% will give you the result grouping by x
df1 %>% group_by (Date) %>% count
## # A tibble: 96 x 2
## # Groups: Date [96]
## Date n
## <chr> <int>
## 1 2003-03-17 7
## 2 2003-03-18 10
## 3 2003-03-19 12
## 4 2003-03-20 13
## 5 2003-03-21 15
## 6 2003-03-22 15
## 7 2003-03-24 15
## 8 2003-03-25 14
## 9 2003-03-26 14
## 10 2003-03-27 15
## # … with 86 more rows
# Assign the result to df2
df2 <- df1 %>% group_by(Date ) %>% top_n(.3 * n(), Cumulative_number_of_cases)
df2
## # A tibble: 774 x 5
## # Groups: Date [96]
## Date Country Cumulative_number_o… Number.of.deaths Number.recovered
## <chr> <chr> <int> <int> <int>
## 1 2003-03… Hong Kong SA… 95 1 0
## 2 2003-03… Viet Nam 40 1 0
## 3 2003-03… Singapore 23 0 0
## 4 2003-03… Hong Kong SA… 123 1 0
## 5 2003-03… Viet Nam 57 1 0
## 6 2003-03… Hong Kong SA… 150 5 0
## 7 2003-03… Singapore 31 0 0
## 8 2003-03… Viet Nam 56 2 0
## 9 2003-03… Hong Kong SA… 173 6 0
## 10 2003-03… Singapore 34 0 0
## # … with 764 more rows
# note that top_frac takes in .3 instead of .3 * n(). Assign the result to df2a
df2a <- df1 %>% group_by(Date ) %>% top_frac(.3, Cumulative_number_of_cases)
df2a
## # A tibble: 774 x 5
## # Groups: Date [96]
## Date Country Cumulative_number_o… Number.of.deaths Number.recovered
## <chr> <chr> <int> <int> <int>
## 1 2003-03… Hong Kong SA… 95 1 0
## 2 2003-03… Viet Nam 40 1 0
## 3 2003-03… Singapore 23 0 0
## 4 2003-03… Hong Kong SA… 123 1 0
## 5 2003-03… Viet Nam 57 1 0
## 6 2003-03… Hong Kong SA… 150 5 0
## 7 2003-03… Singapore 31 0 0
## 8 2003-03… Viet Nam 56 2 0
## 9 2003-03… Hong Kong SA… 173 6 0
## 10 2003-03… Singapore 34 0 0
## # … with 764 more rows
# checking if df2a is the same as df2
all.equal(df2, df2a)
## [1] TRUE
tally(x, wt = NULL, sort = FALSE, name = “n”)
count(x, …, wt = NULL, sort = FALSE, name = “n”, .drop = group_by_drop_default(x))
# an automatic variable n is created to hold the results of tally
df2 %>% group_by(Country) %>% tally
## # A tibble: 12 x 2
## Country n
## <chr> <int>
## 1 Canada 82
## 2 China 89
## 3 Germany 53
## 4 Hong Kong SAR, China 96
## 5 Italy 29
## 6 Mongolia 22
## 7 Philippines 52
## 8 Singapore 95
## 9 Taiwan, China 71
## 10 Thailand 9
## 11 United States 89
## 12 Viet Nam 87
# It can be further simplified to using just count. count (x) means counting the number of records grouping by x.
df2 %>% count (Country)
## # A tibble: 773 x 3
## # Groups: Date [96]
## Date Country n
## <chr> <chr> <int>
## 1 2003-03-17 Hong Kong SAR, China 1
## 2 2003-03-17 Viet Nam 1
## 3 2003-03-18 Hong Kong SAR, China 1
## 4 2003-03-18 Singapore 1
## 5 2003-03-18 Viet Nam 1
## 6 2003-03-19 Hong Kong SAR, China 1
## 7 2003-03-19 Singapore 1
## 8 2003-03-19 Viet Nam 1
## 9 2003-03-20 Hong Kong SAR, China 1
## 10 2003-03-20 Singapore 1
## # … with 763 more rows