Import two related datasets from the TidyTuesday project.
# Load the age_gaps dataset from the TidyTuesday repository
# (data/2023/2023-02-14).
age_gaps <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-02-14/age_gaps.csv"
)
## Rows: 1155 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): movie_name, director, actor_1_name, actor_2_name, character_1_gend...
## dbl (5): release_year, age_difference, couple_number, actor_1_age, actor_2_age
## date (2): actor_1_birthdate, actor_2_birthdate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the centenarians dataset from the TidyTuesday repository
# (data/2023/2023-05-30).
centenarians <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2023/2023-05-30/centenarians.csv"
)
## Rows: 200 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): name, place_of_death_or_residence, gender, still_alive
## dbl (2): rank, age
## date (2): birth_date, death_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Describe the two datasets:
Data 1
Data 2
# Compare the dimensions of the two imported datasets, one row per dataset.
tibble(
  dataset = c("age_gaps", "centenarians"),
  rows = vapply(list(age_gaps, centenarians), nrow, integer(1)),
  cols = vapply(list(age_gaps, centenarians), ncol, integer(1))
)
## # A tibble: 2 × 3
## dataset rows cols
## <chr> <int> <int>
## 1 age_gaps 1155 13
## 2 centenarians 200 8
# Build two small demo tables: four named columns and the first ten rows of
# each dataset, plus a shared `join_id` row index used as the join key.
# Columns are selected by name rather than by position so that a change in the
# upstream column order fails loudly instead of silently picking other columns.
age_small <- age_gaps %>%
  select(movie_name, release_year, director, age_difference) %>%
  slice_head(n = 10) %>%
  mutate(join_id = row_number())

cent_small <- centenarians %>%
  select(rank, name, birth_date, death_date) %>%
  slice_head(n = 10) %>%
  mutate(join_id = row_number())

# Show the age_gaps subset, including its join_id key.
age_small
## # A tibble: 10 × 5
## movie_name release_year director age_difference join_id
## <chr> <dbl> <chr> <dbl> <int>
## 1 Harold and Maude 1971 Hal Ashby 52 1
## 2 Venus 2006 Roger Michell 50 2
## 3 The Quiet American 2002 Phillip Noyce 49 3
## 4 The Big Lebowski 1998 Joel Coen 45 4
## 5 Beginners 2010 Mike Mills 43 5
## 6 Poison Ivy 1992 Katt Shea 42 6
## 7 Whatever Works 2009 Woody Allen 40 7
## 8 Entrapment 1999 Jon Amiel 39 8
## 9 Husbands and Wives 1992 Woody Allen 38 9
## 10 Magnolia 1999 Paul Thomas Anderson 38 10
# Show the ten-row centenarians subset, including its join_id key.
cent_small
## # A tibble: 10 × 5
## rank name birth_date death_date join_id
## <dbl> <chr> <date> <date> <int>
## 1 1 Jeanne Calment 1875-02-21 1997-08-04 1
## 2 2 Kane Tanaka 1903-01-02 2022-04-19 2
## 3 3 Sarah Knauss 1880-09-24 1999-12-30 3
## 4 4 Lucile Randon 1904-02-11 2023-01-17 4
## 5 5 Nabi Tajima 1900-08-04 2018-04-21 5
## 6 6 Marie-Louise Meilleur 1880-08-29 1998-04-16 6
## 7 7 Violet Brown 1900-03-10 2017-09-15 7
## 8 8 Emma Morano 1899-11-29 2017-04-15 8
## 9 9 Chiyo Miyako 1901-05-02 2018-07-22 9
## 10 10 Delphia Welford 1875-09-09 1992-11-14 10
Describe the resulting data:
How is it different from the original two datasets?
# Inner join: keep only rows whose join_id appears in both tables.
inner_result <- age_small %>%
  inner_join(cent_small, by = "join_id")
inner_result
## # A tibble: 10 × 9
## movie_name release_year director age_difference join_id rank name
## <chr> <dbl> <chr> <dbl> <int> <dbl> <chr>
## 1 Harold and Maude 1971 Hal Ashby 52 1 1 Jean…
## 2 Venus 2006 Roger Mic… 50 2 2 Kane…
## 3 The Quiet American 2002 Phillip N… 49 3 3 Sara…
## 4 The Big Lebowski 1998 Joel Coen 45 4 4 Luci…
## 5 Beginners 2010 Mike Mills 43 5 5 Nabi…
## 6 Poison Ivy 1992 Katt Shea 42 6 6 Mari…
## 7 Whatever Works 2009 Woody All… 40 7 7 Viol…
## 8 Entrapment 1999 Jon Amiel 39 8 8 Emma…
## 9 Husbands and Wives 1992 Woody All… 38 9 9 Chiy…
## 10 Magnolia 1999 Paul Thom… 38 10 10 Delp…
## # ℹ 2 more variables: birth_date <date>, death_date <date>
# Report the shape of the inner-join result.
tibble(columns = ncol(inner_result), rows = nrow(inner_result))
## # A tibble: 1 × 2
## columns rows
## <int> <int>
## 1 9 10
Describe the resulting data:
How is it different from the original two datasets?
# Left join: keep every row of age_small and attach the cent_small columns
# wherever join_id matches.
left_result <- age_small %>%
  left_join(cent_small, by = "join_id")
left_result
## # A tibble: 10 × 9
## movie_name release_year director age_difference join_id rank name
## <chr> <dbl> <chr> <dbl> <int> <dbl> <chr>
## 1 Harold and Maude 1971 Hal Ashby 52 1 1 Jean…
## 2 Venus 2006 Roger Mic… 50 2 2 Kane…
## 3 The Quiet American 2002 Phillip N… 49 3 3 Sara…
## 4 The Big Lebowski 1998 Joel Coen 45 4 4 Luci…
## 5 Beginners 2010 Mike Mills 43 5 5 Nabi…
## 6 Poison Ivy 1992 Katt Shea 42 6 6 Mari…
## 7 Whatever Works 2009 Woody All… 40 7 7 Viol…
## 8 Entrapment 1999 Jon Amiel 39 8 8 Emma…
## 9 Husbands and Wives 1992 Woody All… 38 9 9 Chiy…
## 10 Magnolia 1999 Paul Thom… 38 10 10 Delp…
## # ℹ 2 more variables: birth_date <date>, death_date <date>
# Report the shape of the left-join result.
tibble(columns = ncol(left_result), rows = nrow(left_result))
## # A tibble: 1 × 2
## columns rows
## <int> <int>
## 1 9 10
Describe the resulting data:
How is it different from the original two datasets?
# Right join: keep every row of cent_small and attach the age_small columns
# wherever join_id matches.
right_result <- age_small %>%
  right_join(cent_small, by = "join_id")
right_result
## # A tibble: 10 × 9
## movie_name release_year director age_difference join_id rank name
## <chr> <dbl> <chr> <dbl> <int> <dbl> <chr>
## 1 Harold and Maude 1971 Hal Ashby 52 1 1 Jean…
## 2 Venus 2006 Roger Mic… 50 2 2 Kane…
## 3 The Quiet American 2002 Phillip N… 49 3 3 Sara…
## 4 The Big Lebowski 1998 Joel Coen 45 4 4 Luci…
## 5 Beginners 2010 Mike Mills 43 5 5 Nabi…
## 6 Poison Ivy 1992 Katt Shea 42 6 6 Mari…
## 7 Whatever Works 2009 Woody All… 40 7 7 Viol…
## 8 Entrapment 1999 Jon Amiel 39 8 8 Emma…
## 9 Husbands and Wives 1992 Woody All… 38 9 9 Chiy…
## 10 Magnolia 1999 Paul Thom… 38 10 10 Delp…
## # ℹ 2 more variables: birth_date <date>, death_date <date>
# Report the shape of the right-join result.
tibble(columns = ncol(right_result), rows = nrow(right_result))
## # A tibble: 1 × 2
## columns rows
## <int> <int>
## 1 9 10
Describe the resulting data:
How is it different from the original two datasets?
# Full join: keep all rows from both tables, matched on join_id.
full_result <- age_small %>%
  full_join(cent_small, by = "join_id")
full_result
## # A tibble: 10 × 9
## movie_name release_year director age_difference join_id rank name
## <chr> <dbl> <chr> <dbl> <int> <dbl> <chr>
## 1 Harold and Maude 1971 Hal Ashby 52 1 1 Jean…
## 2 Venus 2006 Roger Mic… 50 2 2 Kane…
## 3 The Quiet American 2002 Phillip N… 49 3 3 Sara…
## 4 The Big Lebowski 1998 Joel Coen 45 4 4 Luci…
## 5 Beginners 2010 Mike Mills 43 5 5 Nabi…
## 6 Poison Ivy 1992 Katt Shea 42 6 6 Mari…
## 7 Whatever Works 2009 Woody All… 40 7 7 Viol…
## 8 Entrapment 1999 Jon Amiel 39 8 8 Emma…
## 9 Husbands and Wives 1992 Woody All… 38 9 9 Chiy…
## 10 Magnolia 1999 Paul Thom… 38 10 10 Delp…
## # ℹ 2 more variables: birth_date <date>, death_date <date>
# Report the shape of the full-join result.
tibble(columns = ncol(full_result), rows = nrow(full_result))
## # A tibble: 1 × 2
## columns rows
## <int> <int>
## 1 9 10
Describe the resulting data:
How is it different from the original two datasets?
# Semi join: keep the age_small rows that have a matching join_id in
# cent_small; no columns from cent_small are added.
semi_result <- age_small %>%
  semi_join(cent_small, by = "join_id")
semi_result
## # A tibble: 10 × 5
## movie_name release_year director age_difference join_id
## <chr> <dbl> <chr> <dbl> <int>
## 1 Harold and Maude 1971 Hal Ashby 52 1
## 2 Venus 2006 Roger Michell 50 2
## 3 The Quiet American 2002 Phillip Noyce 49 3
## 4 The Big Lebowski 1998 Joel Coen 45 4
## 5 Beginners 2010 Mike Mills 43 5
## 6 Poison Ivy 1992 Katt Shea 42 6
## 7 Whatever Works 2009 Woody Allen 40 7
## 8 Entrapment 1999 Jon Amiel 39 8
## 9 Husbands and Wives 1992 Woody Allen 38 9
## 10 Magnolia 1999 Paul Thomas Anderson 38 10
# Report the shape of the semi-join result.
tibble(columns = ncol(semi_result), rows = nrow(semi_result))
## # A tibble: 1 × 2
## columns rows
## <int> <int>
## 1 5 10
Describe the resulting data:
How is it different from the original two datasets?
# Anti join: keep the age_small rows whose join_id has no match in cent_small.
anti_result <- age_small %>%
  anti_join(cent_small, by = "join_id")
anti_result
## # A tibble: 0 × 5
## # ℹ 5 variables: movie_name <chr>, release_year <dbl>, director <chr>,
## # age_difference <dbl>, join_id <int>
# Report the shape of the anti-join result.
tibble(columns = ncol(anti_result), rows = nrow(anti_result))
## # A tibble: 1 × 2
## columns rows
## <int> <int>
## 1 5 0