# Load the TidyTuesday (2023-02-14) age-gap data set straight from GitHub.
age_gaps <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-14/age_gaps.csv"
)
## Rows: 1155 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): movie_name, director, actor_1_name, actor_2_name, character_1_gend...
## dbl (5): release_year, age_difference, couple_number, actor_1_age, actor_2_age
## date (2): actor_1_birthdate, actor_2_birthdate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Unordered Factor Levels
# Transform data: compute the mean age_difference for each movie_name.
# Missing age differences are ignored when averaging.
age_difference_by_movie_name <- age_gaps %>%
  group_by(movie_name) %>%
  summarise(
    avg_age_difference = mean(age_difference, na.rm = TRUE),
    .groups = "drop"
  )

# Print the summarised tibble (one row per movie).
age_difference_by_movie_name
## # A tibble: 830 × 2
## movie_name avg_age_difference
## <chr> <dbl>
## 1 10 Things I Hate About You 1
## 2 13 Going on 30 5
## 3 15 Minutes 24
## 4 27 Dresses 7.5
## 5 300 4
## 6 3000 Miles to Graceland 13
## 7 42 9
## 8 47 Ronin 17
## 9 50 First Dates 9
## 10 50/50 4
## # ℹ 820 more rows
# Plot: one point per movie. movie_name is a plain character column here,
# so no explicit level ordering is applied to the y axis.
ggplot(
  age_difference_by_movie_name,
  aes(x = avg_age_difference, y = movie_name)
) +
  geom_point()
Ordered Factor Levels
# Same scatter plot, but the y axis is ordered by the mean age difference:
# fct_reorder() sorts the movie_name levels by avg_age_difference.
ggplot(
  age_difference_by_movie_name,
  aes(
    x = avg_age_difference,
    y = fct_reorder(movie_name, avg_age_difference)
  )
) +
  geom_point() +
  # Labeling: drop the y-axis title and give the x axis a readable one.
  labs(x = "Mean Age Difference by Movie", y = NULL)
Examples of three forcats operations: recoding, collapsing, and lumping factor levels
# List every unique movie title once, in order of first appearance.
distinct(age_gaps, movie_name)
## # A tibble: 830 × 1
## movie_name
## <chr>
## 1 Harold and Maude
## 2 Venus
## 3 The Quiet American
## 4 The Big Lebowski
## 5 Beginners
## 6 Poison Ivy
## 7 Whatever Works
## 8 Entrapment
## 9 Husbands and Wives
## 10 Magnolia
## # ℹ 820 more rows
# Recode: rename a single level.
# fct_recode() takes "new name" = "old name" pairs; here the level "Venus"
# is renamed to "Title" (movie_name becomes a factor as a result).
age_gaps %>%
  mutate(movie_name = fct_recode(movie_name, "Title" = "Venus")) %>%
  select(couple_number, movie_name) %>%
  # couple_number is numeric, so compare it with a number rather than the
  # string "5" (which only worked through implicit type coercion).
  filter(couple_number == 5)
## # A tibble: 5 × 2
## couple_number movie_name
## <dbl> <fct>
## 1 5 Love Actually
## 2 5 A View to a Kill
## 3 5 The Family Stone
## 4 5 He's Just Not That Into You
## 5 5 Mona Lisa Smile
# Collapse multiple levels into one.
# The level names given to fct_collapse() must match the existing levels
# exactly, including capitalisation; names that do not match are reported as
# "Unknown levels" and nothing is collapsed for them. Each mapping must also
# be passed as a plain named argument (no surrounding parentheses), otherwise
# the new level name is not seen by fct_collapse().
age_gaps %>%
  mutate(
    movie_name = fct_collapse(
      movie_name,
      "Big Success" = c("Love Actually", "A View to a Kill"),
      "So So" = "He's Just Not That Into You"
    )
  ) %>%
  select(movie_name) %>%
  # "Venus" is not part of any collapsed group, so it is returned unchanged.
  filter(movie_name == "Venus")
## # A tibble: 1 × 1
## movie_name
## <fct>
## 1 Venus
# Lump small levels into other levels
# First look at how often directors appear: count rows per director and show
# a random sample of 10 of them. slice_sample() is the current replacement
# for the superseded sample_n().
age_gaps %>%
  count(director) %>%
  slice_sample(n = 10)
## # A tibble: 10 × 2
## director n
## <chr> <int>
## 1 Sylvain White 1
## 2 Frank Capra 2
## 3 David Frankel 2
## 4 Jason Moore 1
## 5 Steven Brill 2
## 6 Sam Levinson 1
## 7 Dan Kwan, Daniel Scheinert 1
## 8 King Vidor 2
## 9 Mel Gibson 2
## 10 Chris Weitz 1
# Lump the least frequent directors together.
# fct_lump() without `n` or `prop` is superseded; it simply dispatches to
# fct_lump_lowfreq(), so call that directly. Per the forcats documentation it
# lumps the least frequent levels into "Other" while keeping "Other" the
# smallest level.
age_gaps %>%
  mutate(director = fct_lump_lowfreq(director)) %>%
  distinct(director)
## # A tibble: 510 × 1
## director
## <fct>
## 1 Hal Ashby
## 2 Roger Michell
## 3 Phillip Noyce
## 4 Joel Coen
## 5 Mike Mills
## 6 Katt Shea
## 7 Woody Allen
## 8 Jon Amiel
## 9 Paul Thomas Anderson
## 10 Steven Spielberg
## # ℹ 500 more rows