Import two related datasets from the TidyTuesday project.
# Download the TidyTuesday 2021-07-27 Olympics CSV from GitHub into a tibble.
olympics <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-07-27/olympics.csv"
)
## Rows: 271116 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): name, sex, team, noc, games, season, city, sport, event, medal
## dbl (5): id, age, height, weight, year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Download the TidyTuesday 2021-03-02 YouTube CSV from GitHub into a tibble.
youtube <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2021/2021-03-02/youtube.csv"
)
## Rows: 247 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): brand, superbowl_ads_dot_com_url, youtube_url, id, kind, etag, ti...
## dbl (7): year, view_count, like_count, dislike_count, favorite_count, comm...
## lgl (7): funny, show_product_quickly, patriotic, celebrity, danger, animal...
## dttm (1): published_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Describe the two datasets:
Data 1: olympics
Data 2: youtube
# Build two small, reproducible samples to experiment with joins.
#
# dplyr must be attached before the pipe and verbs below are used; attaching
# it here keeps this chunk runnable even when nothing earlier has loaded it.
library(dplyr)

# Fix the RNG seed so the same 10 rows are drawn on every run.
set.seed(1234)

# Keep three numeric columns from olympics and draw 10 random rows.
olympics_small <- olympics %>%
  select(age, height, weight) %>%
  sample_n(10)

# Keep three columns from youtube and draw 10 random rows.
youtube_small <- youtube %>%
  select(brand, year, funny) %>%
  sample_n(10)

# Show the olympics sample.
olympics_small
## # A tibble: 10 × 3
## age height weight
## <dbl> <dbl> <dbl>
## 1 37 187 110
## 2 30 178 67
## 3 20 161 52
## 4 27 170 63
## 5 21 NA NA
## 6 18 152 49
## 7 17 185 85
## 8 27 NA NA
## 9 29 184 85
## 10 28 179 NA
# Show the youtube sample (columns brand, year, funny) for inspection.
youtube_small
## # A tibble: 10 × 3
## brand year funny
## <chr> <dbl> <lgl>
## 1 Doritos 2008 TRUE
## 2 Pepsi 2018 FALSE
## 3 NFL 2018 TRUE
## 4 Bud Light 2020 TRUE
## 5 Toyota 2014 TRUE
## 6 Budweiser 2016 FALSE
## 7 Kia 2013 TRUE
## 8 Doritos 2013 TRUE
## 9 Kia 2015 TRUE
## 10 Budweiser 2012 FALSE
Describe the resulting data:
How is it different from the original two datasets?

* 1 row compared to 10
library(dplyr)

# The two samples share no natural key, so tag every row with its position
# and use that synthetic `id` as the join key.
olympics_small <- mutate(olympics_small, id = row_number())
youtube_small <- mutate(youtube_small, id = row_number())

# Inner join: keep only ids present in both tables, then drop the helper key.
joined_data <- olympics_small %>%
  inner_join(youtube_small, by = "id") %>%
  select(-id)
Describe the resulting data:
How is it different from the original two datasets?
# (Re)create the positional `id` key on both samples.
olympics_small <- mutate(olympics_small, id = row_number())
youtube_small <- mutate(youtube_small, id = row_number())

# Left join: keep every olympics_small row and attach youtube_small columns
# where the id matches; then drop the helper key.
left_joined_data <- olympics_small %>%
  left_join(youtube_small, by = "id") %>%
  select(-id)
Describe the resulting data:
How is it different from the original two datasets?
# (Re)create the positional `id` key on both samples.
olympics_small <- mutate(olympics_small, id = row_number())
youtube_small <- mutate(youtube_small, id = row_number())

# Right join: keep every youtube_small row and attach olympics_small columns
# where the id matches; then drop the helper key.
right_joined_data <- olympics_small %>%
  right_join(youtube_small, by = "id") %>%
  select(-id)
Describe the resulting data:
How is it different from the original two datasets?
# (Re)create the positional `id` key on both samples.
olympics_small <- mutate(olympics_small, id = row_number())
youtube_small <- mutate(youtube_small, id = row_number())

# Full join: keep rows from both tables, matched on id where possible;
# then drop the helper key.
full_joined_data <- olympics_small %>%
  full_join(youtube_small, by = "id") %>%
  select(-id)
Describe the resulting data:
How is it different from the original two datasets?
# (Re)create the positional `id` key on both samples.
olympics_small <- mutate(olympics_small, id = row_number())
youtube_small <- mutate(youtube_small, id = row_number())

# Semi join: filter olympics_small to rows whose id also appears in
# youtube_small (no youtube columns are added); then drop the helper key.
semi_joined_data <- olympics_small %>%
  semi_join(youtube_small, by = "id") %>%
  select(-id)
Describe the resulting data:
How is it different from the original two datasets?
# (Re)create the positional `id` key on both samples.
olympics_small <- mutate(olympics_small, id = row_number())
youtube_small <- mutate(youtube_small, id = row_number())

# Anti join: filter olympics_small to rows whose id does NOT appear in
# youtube_small; then drop the helper key.
anti_joined_data <- olympics_small %>%
  anti_join(youtube_small, by = "id") %>%
  select(-id)