library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(moderndive)
library(skimr)
library(nycflights13)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

E03 Join Practice

2022-11-15

Part I.A: Base

df2 = tribble(~name, ~`eat date`, ~num, ~flavor,
              "Aidan", "2022-11-15", 15, "chocolate",
              "Bruce", "2022-11-11", 10, "vanilla",
              "Kathy", "2021-09-17", 30, "kiwi",
              "Sophie", "2022-06-14", 15, "vanilla"
              )
df3 = tribble(~flavor, ~`manufacture date`, ~num,
              "chocolate", "2022-11-15", 500,
              "vanilla", "2022-01-04", 350,
              "strawberry", "2021-10-07", 275
              )
  1. List all of the people who eat unusual ice creams that are not listed in df3.
anti_join(df2, df3, by="flavor")
  1. Make a data frame that shows only rows with people who ate ice cream with an eat date equal to the manufacture date.
semi_join(df2, df3, by=c("eat date"="manufacture date"))
  1. Make a data frame with consisting of the person’s name, the eat date of the ice cream, and the product of the num columns in both data frames, matched by flavor. Do not include other info.
left_join(df2, df3, by="flavor") %>% 
  mutate(numProduct = num.x*num.y) %>%
  clean_names() %>%
  select(name, eat_date, num_product)

Part I.B

superheroes <- tribble(
       ~name, ~alignment,  ~gender,          ~publisher,
   "Magneto",      "bad",   "male",            "Marvel",
     "Storm",     "good", "female",            "Marvel",
  "Mystique",      "bad", "female",            "Marvel",
    "Batman",     "good",   "male",                "DC",
     "Joker",      "bad",   "male",                "DC",
  "Catwoman",      "bad", "female",                "DC",
   "Hellboy",     "good",   "male", "Dark Horse Comics"
  )

publishers <- tribble(
  ~publisher, ~yr_founded,
        "DC",       1934L,
    "Marvel",       1939L,
     "Image",       1992L
  )
  1. Add the year that the publisher was founded to every superhero’s data row. Omit superheros from publishers with unknown founding years.
inner_join(superheroes, publishers, by="publisher")
  1. Which superheros are from publishers not in the official publishers database?
anti_join(superheroes, publishers, by="publisher")
  1. Oh wait, I want to see publishers amd years founded listed for every superhero. Just put NA in the year founded column if you don’t know it.
left_join(superheroes, publishers, by="publisher")

Part II

The annoying renaming problem.

In the MtF dataset, there are a bunch of causes of death that have really long names.

  1. Produce a table with cause of death and number of fatalities (total) that it causes.
df <- read_csv('mtf.csv') %>% clean_names()
## Rows: 31108 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Cause of Death, State
## dbl (2): Year, Deaths
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df %>% 
  group_by(cause_of_death) %>%
  count()
  1. Get that table into Excel or Google Docs and add a column where you manually type at least 10 abbreviations for the long names.
df2 <- read_csv('shortNames.csv', col_names=c("original_name", "abbreviation"))
## Rows: 10 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): original_name, abbreviation
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df2
  1. Put that together in R with your original data and use it to find ... something in Massachusetts?
inner_join(df, df2, by=c("cause_of_death" = "original_name")) %>%
  relocate(abbreviation, .before = cause_of_death) %>% 
  group_by(abbreviation, cause_of_death) %>%
  count()