library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(moderndive)
library(skimr)
library(nycflights13)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
2022-11-15
df2 = tribble(~name, ~`eat date`, ~num, ~flavor,
"Aidan", "2022-11-15", 15, "chocolate",
"Bruce", "2022-11-11", 10, "vanilla",
"Kathy", "2021-09-17", 30, "kiwi",
"Sophie", "2022-06-14", 15, "vanilla"
)
df3 = tribble(~flavor, ~`manufacture date`, ~num,
"chocolate", "2022-11-15", 500,
"vanilla", "2022-01-04", 350,
"strawberry", "2021-10-07", 275
)
df3.anti_join(df2, df3, by="flavor")
eat date equal to the
manufacture date.semi_join(df2, df3, by=c("eat date"="manufacture date"))
name,
the eat date of the ice cream, and the product of the
num columns in both data frames, matched by flavor. Do not
include other info.left_join(df2, df3, by="flavor") %>%
mutate(numProduct = num.x*num.y) %>%
clean_names() %>%
select(name, eat_date, num_product)
superheroes <- tribble(
~name, ~alignment, ~gender, ~publisher,
"Magneto", "bad", "male", "Marvel",
"Storm", "good", "female", "Marvel",
"Mystique", "bad", "female", "Marvel",
"Batman", "good", "male", "DC",
"Joker", "bad", "male", "DC",
"Catwoman", "bad", "female", "DC",
"Hellboy", "good", "male", "Dark Horse Comics"
)
publishers <- tribble(
~publisher, ~yr_founded,
"DC", 1934L,
"Marvel", 1939L,
"Image", 1992L
)
inner_join(superheroes, publishers, by="publisher")
publishers database?anti_join(superheroes, publishers, by="publisher")
left_join(superheroes, publishers, by="publisher")
The annoying renaming problem.
In the MtF dataset, there are a bunch of causes of death that have really long names.
df <- read_csv('mtf.csv') %>% clean_names()
## Rows: 31108 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Cause of Death, State
## dbl (2): Year, Deaths
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df %>%
group_by(cause_of_death) %>%
count()
df2 <- read_csv('shortNames.csv', col_names=c("original_name", "abbreviation"))
## Rows: 10 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): original_name, abbreviation
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df2
inner_join(df, df2, by=c("cause_of_death" = "original_name")) %>%
relocate(abbreviation, .before = cause_of_death) %>%
group_by(abbreviation, cause_of_death) %>%
count()