- The student shared experience
- Slowing down the class
- Any issues running R at this point?
- RStudio vs. R; .Rmd files vs. .R files
October 15, 2020
# Load the tidyverse, which supplies read_csv(), the %>% pipe, and the dplyr
# verbs (select, mutate, filter, ...) used in the rest of the material.
library(tidyverse)
# Read the two CSV datasets used throughout. Both are fetched over the network
# through rb.gy links, so this step requires an internet connection.
covid <- read_csv('https://rb.gy/lzlylj')
crime <- read_csv('https://rb.gy/5zuayh')
select() keeps the named variables from the dataset-select()
everything()'``' for multi-word variables and variable names that begin with numberscovid %>% select(location, deaths, confirmed_infections, est_infections, date) %>% head(3) ## # A tibble: 3 x 5 ## location deaths confirmed_infections est_infections date ## <chr> <dbl> <dbl> <dbl> <date> ## 1 Global 103 4966 NA 2020-02-04 ## 2 Global 111 4749 NA 2020-02-05 ## 3 Global 110 3919 NA 2020-02-06
select() keeps the named variables from the dataset-select()
everything()'``' for multi-word variables and variable names that begin with numberscovid %>% select(location, deaths, confirmed_infections, est_infections, date) %>% select(-est_infections) %>% head(3) ## # A tibble: 3 x 4 ## location deaths confirmed_infections date ## <chr> <dbl> <dbl> <date> ## 1 Global 103 4966 2020-02-04 ## 2 Global 111 4749 2020-02-05 ## 3 Global 110 3919 2020-02-06
select() keeps the named variables from the dataset-select()
everything()'``' for multi-word variables and variable names that begin with numberscovid %>% select(location, deaths, confirmed_infections, est_infections, date) %>% select(location, date, everything()) %>% head(3) ## # A tibble: 3 x 5 ## location date deaths confirmed_infections est_infections ## <chr> <date> <dbl> <dbl> <dbl> ## 1 Global 2020-02-04 103 4966 NA ## 2 Global 2020-02-05 111 4749 NA ## 3 Global 2020-02-06 110 3919 NA
mutate() creates variablesmutate()
min(), max(), mean()as.character(), as.factor(), as.numeric()ifelse() within mutate()covid %>%
mutate(
cases_deaths_ratio = (deaths / confirmed_infections) %>% round(2)
, tests_over_mean = (total_tests / mean(total_tests, na.rm = TRUE)) %>% round(2)
)
mutate() creates variablesmutate()
min(), max(), mean()as.character(), as.factor(), as.numeric()ifelse() within mutate()x <- c(1, 2, 3) y <- c(4, 5, 1) ifelse(x > y, 'yes', 'no')
mutate() creates variablesmutate()
min(), max(), mean()as.character(), as.factor(), as.numeric()ifelse() within mutate()x <- c(1, 2, 3) y <- c(4, 5, 1) ifelse(x > y, 'yes', 'no') ## [1] "no" "no" "yes"
mutate() creates variablesmutate()
min(), max(), mean()as.character(), as.factor(), as.numeric()ifelse() within mutate()covid %>%
mutate(
cases_deaths_ratio = (deaths / confirmed_infections) %>% round(2)
, tests_over_mean = (total_tests / mean(total_tests, na.rm = TRUE)) %>% round(2)
, global_infections = ifelse(location %in% 'Global', confirmed_infections, NA)
, global_mobility = ifelse(location %in% 'Global', mobility_composite, NA)
)
# Add four derived columns to covid, keep them next to location/date,
# and preview the first five rows.
covid %>%
  mutate(
    # Despite the name, this is deaths divided by confirmed infections,
    # rounded to two decimals.
    cases_deaths_ratio = round(deaths / confirmed_infections, 2),
    # Each row's total_tests relative to the mean of total_tests
    # (missing values ignored when computing the mean).
    tests_over_mean = round(total_tests / mean(total_tests, na.rm = TRUE), 2),
    # Copy the value only on 'Global' rows; every other row becomes NA.
    global_infections = ifelse(location %in% 'Global', confirmed_infections, NA),
    global_mobility = ifelse(location %in% 'Global', mobility_composite, NA)
  ) %>%
  select(
    location, date, cases_deaths_ratio, tests_over_mean,
    global_infections, global_mobility
  ) %>%
  head(5)
## # A tibble: 5 x 6
## location date cases_deaths_ra… tests_over_mean global_infectio…
## <chr> <date> <dbl> <dbl> <dbl>
## 1 Global 2020-02-04 0.02 1.54 4966
## 2 Global 2020-02-05 0.02 1.68 4749
## 3 Global 2020-02-06 0.03 1.82 3919
## 4 Global 2020-02-07 0.03 1.97 4495.
## 5 Global 2020-02-08 0.04 2.11 3403.
## # … with 1 more variable: global_mobility <dbl>
Exercise - 5 minutes
- With the crime dataset, create a variable called larceny
- In larceny, flag with a 1 all records that have a ‘LARCENY-THEFT’ value in offense_parent_group. All other records should have a 0
- Set neighborhood to NA when the new variable larceny is equal to 0
- Only report the following variables in the output: neighborhood, offense_parent_group, larceny
# Exercise solution: flag larceny-theft records, blank out neighborhood for
# everything else, and show the last eight rows.
crime %>%
  mutate(
    # 1 when offense_parent_group is 'LARCENY-THEFT', 0 for every other value
    # (including NA, because %in% never returns NA).
    larceny = ifelse(offense_parent_group %in% 'LARCENY-THEFT', 1, 0),
    # mutate() evaluates its arguments in order, so larceny can be used here.
    neighborhood = ifelse(larceny %in% 1, neighborhood, NA)
  ) %>%
  select(offense_parent_group, larceny, neighborhood) %>%
  tail(8)
Exercise - 5 minutes
- With the crime dataset, create a variable called larceny
- In larceny, flag with a 1 all records that have a ‘LARCENY-THEFT’ value in offense_parent_group. All other records should have a 0
- Set neighborhood to NA when the new variable larceny is equal to 0
- Only report the following variables in the output: neighborhood, offense_parent_group, larceny
# Exercise solution: flag larceny-theft records, blank out neighborhood for
# everything else, and show the last eight rows.
crime %>%
  mutate(
    # 1 when offense_parent_group is 'LARCENY-THEFT', 0 for every other value
    # (including NA, because %in% never returns NA).
    larceny = ifelse(offense_parent_group %in% 'LARCENY-THEFT', 1, 0),
    # mutate() evaluates its arguments in order, so larceny can be used here.
    neighborhood = ifelse(larceny %in% 1, neighborhood, NA)
  ) %>%
  select(offense_parent_group, larceny, neighborhood) %>%
  tail(8)
## # A tibble: 8 x 3
## offense_parent_group larceny neighborhood
## <chr> <dbl> <chr>
## 1 LARCENY-THEFT 1 CAPITOL HILL
## 2 ASSAULT OFFENSES 0 <NA>
## 3 ASSAULT OFFENSES 0 <NA>
## 4 LARCENY-THEFT 1 ROOSEVELT/RAVENNA
## 5 ASSAULT OFFENSES 0 <NA>
## 6 MOTOR VEHICLE THEFT 0 <NA>
## 7 LARCENY-THEFT 1 MOUNT BAKER
## 8 ASSAULT OFFENSES 0 <NA>
transmute() = mutate() + select()covid %>%
mutate(
cases_deaths_ratio = (deaths / confirmed_infections) %>% round(2)
, tests_over_mean = (total_tests / mean(total_tests, na.rm = TRUE)) %>% round(2)
, global_infections = ifelse(location %in% 'Global', confirmed_infections, NA)
, global_mobility = ifelse(location %in% 'Global', mobility_composite, NA)
) %>%
select(cases_deaths_ratio, tests_over_mean, global_infections, global_mobility)
## # A tibble: 103,366 x 4
## cases_deaths_ratio tests_over_mean global_infections global_mobility
## <dbl> <dbl> <dbl> <dbl>
## 1 0.02 1.54 4966 NA
## 2 0.02 1.68 4749 NA
## 3 0.03 1.82 3919 NA
## 4 0.03 1.97 4495. NA
## 5 0.04 2.11 3403. -23.7
## 6 0.04 2.25 3954 -23.9
## 7 0.05 2.39 3294. -23.9
## 8 0.06 2.54 2591. -23.4
## 9 0.01 2.68 134 -23.5
## 10 0.09 2.83 1728 -23.2
## # … with 103,356 more rows
transmute() = mutate() + select()covid %>%
transmute(
cases_deaths_ratio = (deaths / confirmed_infections) %>% round(2)
, tests_over_mean = (total_tests / mean(total_tests, na.rm = TRUE)) %>% round(2)
, global_infections = ifelse(location %in% 'Global', confirmed_infections, NA)
, global_mobility = ifelse(location %in% 'Global', mobility_composite, NA)
)
## # A tibble: 103,366 x 4
## cases_deaths_ratio tests_over_mean global_infections global_mobility
## <dbl> <dbl> <dbl> <dbl>
## 1 0.02 1.54 4966 NA
## 2 0.02 1.68 4749 NA
## 3 0.03 1.82 3919 NA
## 4 0.03 1.97 4495. NA
## 5 0.04 2.11 3403. -23.7
## 6 0.04 2.25 3954 -23.9
## 7 0.05 2.39 3294. -23.9
## 8 0.06 2.54 2591. -23.4
## 9 0.01 2.68 134 -23.5
## 10 0.09 2.83 1728 -23.2
## # … with 103,356 more rows
# transmute() keeps only the columns it names: location and date are passed
# through unchanged, followed by the four derived columns.
covid %>%
  transmute(
    location,
    date,
    # Deaths divided by confirmed infections, rounded to two decimals.
    cases_deaths_ratio = round(deaths / confirmed_infections, 2),
    # total_tests relative to the mean of total_tests (NAs ignored in the mean).
    tests_over_mean = round(total_tests / mean(total_tests, na.rm = TRUE), 2),
    # Values are kept on 'Global' rows only; all other rows become NA.
    global_infections = ifelse(location %in% 'Global', confirmed_infections, NA),
    global_mobility = ifelse(location %in% 'Global', mobility_composite, NA)
  )
## # A tibble: 103,366 x 6
## location date cases_deaths_ra… tests_over_mean global_infectio…
## <chr> <date> <dbl> <dbl> <dbl>
## 1 Global 2020-02-04 0.02 1.54 4966
## 2 Global 2020-02-05 0.02 1.68 4749
## 3 Global 2020-02-06 0.03 1.82 3919
## 4 Global 2020-02-07 0.03 1.97 4495.
## 5 Global 2020-02-08 0.04 2.11 3403.
## 6 Global 2020-02-09 0.04 2.25 3954
## 7 Global 2020-02-10 0.05 2.39 3294.
## 8 Global 2020-02-11 0.06 2.54 2591.
## 9 Global 2020-02-12 0.01 2.68 134
## 10 Global 2020-02-13 0.09 2.83 1728
## # … with 103,356 more rows, and 1 more variable: global_mobility <dbl>
rename() assigns a new name to an existing variable
- You can rename variables with transmute() and mutate() as well
- rename(): renames selected variables, keeps all other variables
- transmute(): renames selected variables, drops all other variables
- mutate(): creates new variables (with new names) and keeps originals
Exercise - 3 minutes
- Rename confirmed_infections as cases, location as geography, and date as ymd with one of the methods
rename() assigns a new name to an existing variabletransmute() and mutate()as well
rename(): renames selected variables, keeps all other variablestransmute(): renames selected variables, drops all other variablesmutate(): creates new variables (with new names) and keeps originalscovid %>%
rename(
geography = location
, ymd = date
, cases = confirmed_infections
) %>%
head(1)
## # A tibble: 1 x 14
## geography ymd deaths mobility_data_t… mobility_compos… total_tests_dat…
## <chr> <date> <dbl> <chr> <dbl> <chr>
## 1 Global 2020-02-04 103 <NA> NA observed
## # … with 8 more variables: total_tests <dbl>, cases <dbl>,
## # est_infections <dbl>, population <dbl>, deaths_p100k <dbl>,
## # confirmed_infections_p100k <dbl>, est_infections_p100k <dbl>, month <dbl>
rename() assigns a new name to an existing variabletransmute() and mutate()as well
rename(): renames selected variables, keeps all other variablestransmute(): renames selected variables, drops all other variablesmutate(): creates new variables (with new names) and keeps originalscovid %>%
transmute(
geography = location
, ymd = date
, cases = confirmed_infections
) %>%
head(1)
## # A tibble: 1 x 3
## geography ymd cases
## <chr> <date> <dbl>
## 1 Global 2020-02-04 4966
rename() assigns a new name to an existing variabletransmute() and mutate()as well
rename(): renames selected variables, keeps all other variablestransmute(): renames selected variables, drops all other variablesmutate(): creates new variables (with new names) and keeps originalscovid %>%
mutate(
geography = location
, ymd = date
, cases = confirmed_infections
) %>%
select(location, geography, ymd, date, cases, confirmed_infections) %>%
head(1)
## # A tibble: 1 x 6
## location geography ymd date cases confirmed_infections
## <chr> <chr> <date> <date> <dbl> <dbl>
## 1 Global Global 2020-02-04 2020-02-04 4966 4966
With crime, create a data frame that meets the following conditions
report_number, crime_against_category, offense_locationreport_numbercrime_against_category, change NA values to UNKNOWNneighborhood as offense_locationcrime %>%
transmute(
report_number
, crime_against_category = ifelse(crime_against_category %in% NA
, 'UNKNOWN', crime_against_category)
, offense_location = neighborhood
)
# Build the three-column data frame from the exercise and preview it.
crime %>%
  transmute(
    report_number,
    # Replace missing categories with the literal 'UNKNOWN'; other values
    # are kept as they are.
    crime_against_category = ifelse(
      crime_against_category %in% NA, 'UNKNOWN', crime_against_category
    ),
    # neighborhood is carried over under its new name.
    offense_location = neighborhood
  ) %>%
  head()
## # A tibble: 6 x 3
## report_number crime_against_category offense_location
## <chr> <chr> <chr>
## 1 2015-323587 PROPERTY ROOSEVELT/RAVENNA
## 2 2015-267992 PROPERTY FIRST HILL
## 3 2013-900207 PROPERTY <NA>
## 4 2008-334626 PROPERTY ROOSEVELT/RAVENNA
## 5 2014-363634 PROPERTY CENTRAL AREA/SQUIRE PARK
## 6 2010-019552 UNKNOWN GREENWOOD
filter() to subset your datafilter(), you tell R to exclude/include data that meet certain conditions!, %in%, >, >=, <, <=, ==covid %>% filter(est_infections_p100k > 60) %>% select(location, date, est_infections_p100k) %>% head(2) ## # A tibble: 2 x 3 ## location date est_infections_p100k ## <chr> <date> <dbl> ## 1 Global 2020-12-15 61.2 ## 2 Global 2020-12-16 62.4
filter() to subset your datafilter(), you tell R to exclude/include data that meet certain conditions!, %in%, >, >=, <, <=, ==covid %>% filter(est_infections_p100k <= 60) %>% select(location, date, est_infections_p100k) %>% head(2) ## # A tibble: 2 x 3 ## location date est_infections_p100k ## <chr> <date> <dbl> ## 1 Global 2020-08-13 21.7 ## 2 Global 2020-08-14 21.6
filter() to subset your datafilter(), you tell R to exclude/include data that meet certain conditions!, %in%, >, >=, <, <=, ==covid %>% filter(location %in% 'Peru') %>% select(location, date, est_infections_p100k) %>% tail(2) ## # A tibble: 2 x 3 ## location date est_infections_p100k ## <chr> <date> <dbl> ## 1 Peru 2020-12-31 2.67 ## 2 Peru 2021-01-01 2.63
filter() to subset your datafilter(), you tell R to exclude/include data that meet certain conditions!, %in%, >, >=, <, <=, ==covid %>% filter(! location %in% 'Peru') %>% select(location, date, est_infections_p100k) %>% tail(2) ## # A tibble: 2 x 3 ## location date est_infections_p100k ## <chr> <date> <dbl> ## 1 Zimbabwe 2020-12-31 0.581 ## 2 Zimbabwe 2021-01-01 0.579
filter()
& for ‘and’ expressions| for ‘or’ expressions() or use more than one filter()c()covid %>% filter(est_infections_p100k > 60 & location %in% 'Malta') %>% select(location, est_infections_p100k, date) %>% head(2) ## # A tibble: 2 x 3 ## location est_infections_p100k date ## <chr> <dbl> <date> ## 1 Malta 85.6 2020-09-29 ## 2 Malta 92.3 2020-09-30
filter()
& for ‘and’ expressions| for ‘or’ expressions() or use more than one filter()c()covid %>% filter(est_infections_p100k > 434 | location %in% 'Malta') %>% select(location, est_infections_p100k, date) %>% head(2) ## # A tibble: 2 x 3 ## location est_infections_p100k date ## <chr> <dbl> <date> ## 1 Maharashtra 435. 2020-11-30 ## 2 Malta NA 2020-02-04
filter()
& for ‘and’ expressions| for ‘or’ expressions() or use more than one filter()c()covid %>%
filter(
(est_infections_p100k > 60 & location %in% 'Malta') |
(est_infections_p100k <= 2.5 & location %in% 'Italy')
) %>%
select(location, est_infections_p100k, date) %>%
head(5)
## # A tibble: 5 x 3
## location est_infections_p100k date
## <chr> <dbl> <date>
## 1 Italy 2.38 2020-09-29
## 2 Italy 2.43 2020-09-30
## 3 Italy 2.46 2020-10-01
## 4 Italy 2.50 2020-10-02
## 5 Malta 85.6 2020-09-29
filter()
& for ‘and’ expressions| for ‘or’ expressions() or use more than one filter()c()covid %>%
filter(location %in% c('Peru', 'Colombia') & est_infections_p100k > 50) %>%
select(location, date, est_infections_p100k) %>%
head(5)
## # A tibble: 5 x 3
## location date est_infections_p100k
## <chr> <date> <dbl>
## 1 Colombia 2020-09-29 101.
## 2 Colombia 2020-09-30 101.
## 3 Colombia 2020-10-01 101.
## 4 Colombia 2020-10-02 100.
## 5 Colombia 2020-10-03 100.
head()tail()sample_n()covid %>% sample_n(3) %>% select(location, date, est_infections_p100k) ## # A tibble: 3 x 3 ## location date est_infections_p100k ## <chr> <date> <dbl> ## 1 Romania 2020-10-29 42.4 ## 2 Oklahoma 2020-05-29 NA ## 3 Nagaland 2020-12-22 179.
covid %>% sample_n(3) %>% select(location, date, est_infections_p100k) ## # A tibble: 3 x 3 ## location date est_infections_p100k ## <chr> <date> <dbl> ## 1 Rhode Island 2020-06-29 NA ## 2 Belgium 2020-02-25 NA ## 3 Bulgaria 2020-11-13 9.71
- total_tests_data_type values not equal to NA
- location values equal to 'China' or 'Germany'
- confirmed_infections greater than 130
- Create a variable called high_cases. This variable should flag with a 1 observations that have confirmed_infections greater than 400
- Only report the following variables: location, high_cases, date, confirmed_infections, total_tests_data_type
Hint
covid %>% filter(date >= '2020-07-04')
# Exercise solution: subset covid on four conditions, then report five columns.
covid %>%
  filter(
    # Comma-separated conditions in filter() are combined with AND,
    # so a row must satisfy all four to be kept.
    ! total_tests_data_type %in% NA,
    location %in% c('China', 'Germany'),
    confirmed_infections > 130,
    date >= '2020-07-04'
  ) %>%
  transmute(
    location,
    # 1 when confirmed_infections exceeds 400; all other rows get NA (not 0).
    high_cases = ifelse(confirmed_infections > 400, 1, NA),
    date,
    confirmed_infections,
    total_tests_data_type
  )
## # A tibble: 58 x 5
## location high_cases date confirmed_infections total_tests_data_type
## <chr> <dbl> <date> <dbl> <chr>
## 1 China NA 2020-07-22 136 observed
## 2 China NA 2020-07-25 137 observed
## 3 China NA 2020-07-27 149 observed
## 4 China NA 2020-07-30 152 observed
## 5 China NA 2020-08-01 133. observed
## 6 Germany 1 2020-07-04 422. observed
## 7 Germany NA 2020-07-05 239. observed
## 8 Germany NA 2020-07-06 219. observed
## 9 Germany NA 2020-07-07 390. observed
## 10 Germany NA 2020-07-08 397. observed
## # … with 48 more rows
# Count the rows that survive the exercise's four filter conditions.
covid %>%
  filter(
    # Comma-separated conditions in filter() are combined with AND.
    ! total_tests_data_type %in% NA,
    location %in% c('China', 'Germany'),
    confirmed_infections > 130,
    date >= '2020-07-04'
  ) %>%
  nrow()
## [1] 58
- In covid, which month reports the most total_tests?
- In covid, which location reports the largest number of estimated infections in September, excluding 'Global' values?
- In crime, of all 'ASSAULT OFFENSES' values in offense_parent_group which offense value has the fewest incidences?
- In crime, how many crimes were reported (reported_year) one and two years after the year in which they occurred (occurred_year)?
- group_by(): list the variables by which you want to aggregate data
- summarise(): create a variable and define the variable with an aggregation function
- Use %>% to ‘link’ group_by() and summarise()
- Aggregation functions: n(), n_distinct(), sum(), mean(), max(), min(), etc.
- summarise() function
Example
In covid, which month reports the most total_tests?
covid %>% group_by(month) %>% summarise(total_tests = sum(total_tests, na.rm = TRUE))
Example
In covid, which month reports the most total_tests?
covid %>% group_by(month) %>% summarise(total_tests = sum(total_tests, na.rm = TRUE)) %>% arrange(desc(total_tests)) ## # A tibble: 12 x 2 ## month total_tests ## <dbl> <dbl> ## 1 8 1719495. ## 2 7 1689056. ## 3 5 1210343. ## 4 6 1182845. ## 5 9 1152910. ## 6 4 958562. ## 7 3 312733. ## 8 2 44480. ## 9 1 0 ## 10 10 0 ## 11 11 0 ## 12 12 0
Example
In covid, which location reports the largest number of estimated infections in September, excluding 'Global' values?
covid %>% group_by(month, location) %>% summarise(est_infections = sum(est_infections, na.rm = TRUE)) %>% filter(month %in% 9 & ! location %in% 'Global') %>% arrange(desc(est_infections)) %>% head() ## # A tibble: 6 x 3 ## # Groups: month [1] ## month location est_infections ## <dbl> <chr> <dbl> ## 1 9 Latin America and Caribbean 7757636. ## 2 9 Europe and Central Asia 3550230. ## 3 9 South Asia 2481264. ## 4 9 India 2330349. ## 5 9 East Asia and Pacific 1596947. ## 6 9 Maharashtra 812265.
How can you rewrite the code below and get the same output?
In covid, which location reports the largest number of estimated infections in September, excluding 'Global' values?
covid %>% group_by(month, location) %>% summarise(est_infections = sum(est_infections, na.rm = TRUE)) %>% filter(month %in% 9 & ! location %in% 'Global') %>% arrange(desc(est_infections)) %>% head()
How can you rewrite the code below and get the same output?
In covid, which location reports the largest number of estimated infections in September, excluding 'Global' values?
covid %>% filter(month %in% 9 & ! location %in% 'Global') %>% group_by(location) %>% summarise(est_infections = sum(est_infections, na.rm = TRUE)) %>% arrange(desc(est_infections)) %>% head()
Exercise #1 - 5 minutes
In crime, of all 'ASSAULT OFFENSES' values in offense_parent_group which offense value has the fewest incidences? (Exclude NAs from the output you produce.)
Exercise #2 - 7 minutes
In crime, how many crimes were reported (reported_year) one and two years after the year in which they occurred (occurred_year)?
In crime, of all 'ASSAULT OFFENSES' values in offense_parent_group which offense value has the fewest incidences? (Exclude NAs from the output you produce.)
crime %>% group_by(offense_parent_group, offense) %>%
In crime, of all 'ASSAULT OFFENSES' values in offense_parent_group which offense value has the fewest incidences? (Exclude NAs from the output you produce.)
crime %>% group_by(offense_parent_group, offense) %>% summarise(incidence = n())
In crime, of all 'ASSAULT OFFENSES' values in offense_parent_group which offense value has the fewest incidences? (Exclude NAs from the output you produce.)
crime %>% group_by(offense_parent_group, offense) %>% summarise(incidence = n()) %>% filter(offense_parent_group %in% 'ASSAULT OFFENSES' & ! offense %in% NA) ## # A tibble: 3 x 3 ## # Groups: offense_parent_group [1] ## offense_parent_group offense incidence ## <chr> <chr> <int> ## 1 ASSAULT OFFENSES Aggravated Assault 3754 ## 2 ASSAULT OFFENSES Intimidation 3963 ## 3 ASSAULT OFFENSES Simple Assault 9527
In crime, how many crimes were reported (reported_year) one and two years after the year in which they occurred (occurred_year)?
Hint You need to calculate the difference between the two year variables.
In crime, how many crimes were reported (reported_year) one and two years after the year in which they occurred (occurred_year)?
Hint You need to calculate the difference between the two year variables.
crime %>% mutate(dif = reported_year - occurred_year)
In crime, how many crimes were reported (reported_year) one and two years after the year in which they occurred (occurred_year)?
Hint You need to calculate the difference between the two year variables.
crime %>% mutate(dif = reported_year - occurred_year) %>% filter(dif %in% c(1, 2))
In crime, how many crimes were reported (reported_year) one and two years after the year in which they occurred (occurred_year)?
Hint You need to calculate the difference between the two year variables.
crime %>% mutate(dif = reported_year - occurred_year) %>% filter(dif %in% c(1, 2)) %>% group_by(dif)
In crime, how many crimes were reported (reported_year) one and two years after the year in which they occurred (occurred_year)?
Hint You need to calculate the difference between the two year variables.
crime %>% mutate(dif = reported_year - occurred_year) %>% filter(dif %in% c(1, 2)) %>% group_by(dif) %>% summarise(n = n()) ## # A tibble: 2 x 2 ## dif n ## <dbl> <int> ## 1 1 1808 ## 2 2 181
With the crime dataset, report the mean difference in reported_date and occurred_date values by neighborhood.
reported_date and occurred_date should be called date_difneighborhood only report rows where the number of neighborhood values in the dataset is greater than 1000 (n)transmute and filter as well as group_by and summarise to solve this problemneighborhood, date_dif, n [...] %>%
write_csv('tidyverse_exercise_output.csv')
- What is the date_dif value for the ‘HIGH POINT’ neighborhood?
- What is the mean difference between reported_date and occurred_date for the ‘BALLARD NORTH’ neighborhood?
With the crime dataset, report the mean difference in reported_date and occurred_date values by neighborhood.
reported_date and occurred_date should be called date_difneighborhood only report rows where the number of neighborhood values in the dataset is greater than 1000 (n)transmute and filter as well as group_by and summarise to solve this problemneighborhood, date_dif, ncrime %>%
transmute(
neighborhood
, date_dif = reported_date - occurred_date
) %>%
With the crime dataset, report the mean difference in reported_date and occurred_date values by neighborhood.
reported_date and occurred_date should be called date_difneighborhood only report rows where the number of neighborhood values in the dataset is greater than 1000 (n)transmute and filter as well as group_by and summarise to solve this problemneighborhood, date_dif, ncrime %>%
transmute(
neighborhood
, date_dif = reported_date - occurred_date
) %>%
group_by(neighborhood) %>%
summarise(
date_dif = mean(date_dif, na.rm = TRUE)
, n = n()
) %>%
With the crime dataset, report the mean difference in reported_date and occurred_date values by neighborhood.
reported_date and occurred_date should be called date_difneighborhood only report rows where the number of neighborhood values in the dataset is greater than 1000 (n)transmute and filter as well as group_by and summarise to solve this problemneighborhood, date_dif, ncrime %>%
transmute(
neighborhood
, date_dif = reported_date - occurred_date
) %>%
group_by(neighborhood) %>%
summarise(
date_dif = mean(date_dif, na.rm = TRUE)
, n = n()
) %>%
filter(n > 1000) %>%
write_csv('tidyverse_exercise_output.csv')
reported_date and occurred_date for ‘HIGH POINT’ neighborhood?reported_date and occurred_date for ‘BALLARD NORTH’ neighborhood?reported_date and occurred_date for ‘HIGH POINT’ neighborhood?reported_date and occurred_date for ‘BALLARD NORTH’ neighborhood?crime %>%
transmute(
neighborhood
, date_dif = reported_date - occurred_date
) %>%
group_by(neighborhood) %>%
summarise(
date_dif = mean(date_dif, na.rm = TRUE)
, n = n()
) %>%
filter(neighborhood %in% c('HIGH POINT', 'BALLARD NORTH'))
## # A tibble: 2 x 3
## neighborhood date_dif n
## <chr> <drtn> <int>
## 1 BALLARD NORTH 13.35251 days 2746
## 2 HIGH POINT 16.88184 days 1151
With the covid dataset, determine the location-month combination with the highest number of estimated infections per capita. Exclude dates prior to July and 'Global' values in location. To do this:
infections_per_capita that reports est_infections over populationinfections_per_capitacovid %>%
filter(
month >= 7 &
! location %in% 'Global'
)
With the covid dataset, determine the location-month combination with the highest number of estimated infections per capita. Exclude dates prior to July and 'Global' values in location. To do this:
infections_per_capita that reports est_infections over populationinfections_per_capitacovid %>%
filter(
month >= 7 &
! location %in% 'Global'
) %>%
mutate(infections_per_capita = est_infections / population)
With the covid dataset, determine the location-month combination with the highest number of estimated infections per capita. Exclude dates prior to July and 'Global' values in location. To do this:
infections_per_capita that reports est_infections over populationinfections_per_capitacovid %>%
filter(
month >= 7 &
! location %in% 'Global'
) %>%
mutate(infections_per_capita = est_infections / population) %>%
group_by(location, month) %>%
summarise(infections_per_capita = sum(infections_per_capita, na.rm = TRUE))
# Find the location-month combination with the highest summed
# infections-per-capita, looking only at July onward and excluding 'Global'.
covid %>%
  filter(
    month >= 7 &
      ! location %in% 'Global'
  ) %>%
  # Row-level ratio of estimated infections to population.
  mutate(infections_per_capita = est_infections / population) %>%
  group_by(location, month) %>%
  # Sum the row-level ratios within each location-month pair.
  summarise(infections_per_capita = sum(infections_per_capita, na.rm = TRUE)) %>%
  # Drop the remaining grouping so max() below is taken over every
  # location-month row rather than within each group. Written as ungroup()
  # with parentheses: the bare name only works with the magrittr pipe.
  ungroup() %>%
  # Keep the row(s) whose value equals the overall maximum.
  filter(infections_per_capita %in% max(infections_per_capita, na.rm = TRUE))
## # A tibble: 1 x 3
## location month infections_per_capita
## <chr> <dbl> <dbl>
## 1 Manipur 12 0.167