Import your data

# Load the TidyTuesday (2023-02-14) age-gap data straight from GitHub
age_gaps <- readr::read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-02-14/age_gaps.csv"
)
## Rows: 1155 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): movie_name, director, actor_1_name, actor_2_name, character_1_gend...
## dbl  (5): release_year, age_difference, couple_number, actor_1_age, actor_2_age
## date (2): actor_1_birthdate, actor_2_birthdate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Chapter 15

Create a factor

Modify factor order

Unordered Factor Levels

# Transform data: one row per movie_name holding the mean age_difference
# (missing values are dropped before averaging)
age_difference_by_movie_name <- summarise(
    group_by(age_gaps, movie_name),
    avg_age_difference = mean(age_difference, na.rm = TRUE)
)

# Show the summarised table
age_difference_by_movie_name
## # A tibble: 830 × 2
##    movie_name                 avg_age_difference
##    <chr>                                   <dbl>
##  1 10 Things I Hate About You                1  
##  2 13 Going on 30                            5  
##  3 15 Minutes                               24  
##  4 27 Dresses                                7.5
##  5 300                                       4  
##  6 3000 Miles to Graceland                  13  
##  7 42                                        9  
##  8 47 Ronin                                 17  
##  9 50 First Dates                            9  
## 10 50/50                                     4  
## # ℹ 820 more rows
# Plot: mean age difference (x) against movie title (y); titles are not
# reordered here
ggplot(
    age_difference_by_movie_name,
    aes(x = avg_age_difference, y = movie_name)
) +
    geom_point()

Ordered Factor Levels

# Plot: same scatter, but with movies sorted by their mean age difference
ggplot(
    age_difference_by_movie_name,
    aes(
        x = avg_age_difference,
        y = fct_reorder(.f = movie_name, .x = avg_age_difference)
    )
) +
    geom_point() +
    # Labeling: describe the x-axis and drop the y-axis title
    labs(x = "Mean Age Difference by Movie", y = NULL)

Modify factor levels

Show examples of three functions for recoding, collapsing, and lumping factor levels:

# List the distinct movie titles available for the examples below
distinct(age_gaps, movie_name)
## # A tibble: 830 × 1
##    movie_name        
##    <chr>             
##  1 Harold and Maude  
##  2 Venus             
##  3 The Quiet American
##  4 The Big Lebowski  
##  5 Beginners         
##  6 Poison Ivy        
##  7 Whatever Works    
##  8 Entrapment        
##  9 Husbands and Wives
## 10 Magnolia          
## # ℹ 820 more rows
# Recode: rename the level "Venus" to "Title", then show the rows for
# couple number 5
age_gaps %>%

    # Rename levels (fct_recode takes "new name" = "old name" pairs)
    mutate(movie_name = fct_recode(movie_name, "Title" = "Venus")) %>%
    select(couple_number, movie_name) %>%
    # couple_number is a double column, so compare against a number rather
    # than the string "5" (which only worked through implicit coercion)
    filter(couple_number == 5)
## # A tibble: 5 × 2
##   couple_number movie_name                 
##           <dbl> <fct>                      
## 1             5 Love Actually              
## 2             5 A View to a Kill           
## 3             5 The Family Stone           
## 4             5 He's Just Not That Into You
## 5             5 Mona Lisa Smile
# Collapse multiple levels into one
age_gaps %>%

    # Level names must match the data exactly (including case), and every
    # group must be passed as a named argument: wrapping `"So So" = ...` in
    # parentheses would turn it into an unnamed assignment expression.
    mutate(movie_name = fct_collapse(
        movie_name,
        "Big Success" = c("Love Actually", "A View to a Kill"),
        "So So" = c("He's Just Not That Into You")
    )) %>%
    select(movie_name) %>%
    # "Venus" belongs to neither group, so its level is left unchanged
    filter(movie_name == "Venus")
## # A tibble: 1 × 1
##   movie_name
##   <fct>     
## 1 Venus
# Lump small levels into other levels
# First inspect how many rows a random handful of directors have
age_gaps %>%
    count(director) %>%
    # slice_sample() is the current replacement for the superseded sample_n()
    slice_sample(n = 10)
## # A tibble: 10 × 2
##    director                       n
##    <chr>                      <int>
##  1 Sylvain White                  1
##  2 Frank Capra                    2
##  3 David Frankel                  2
##  4 Jason Moore                    1
##  5 Steven Brill                   2
##  6 Sam Levinson                   1
##  7 Dan Kwan, Daniel Scheinert     1
##  8 King Vidor                     2
##  9 Mel Gibson                     2
## 10 Chris Weitz                    1
# fct_lump() is superseded; without `n` or `prop` it dispatches to
# fct_lump_lowfreq(), so call that directly to get the same result
age_gaps %>%
    mutate(director = fct_lump_lowfreq(director)) %>%
    distinct(director)
## # A tibble: 510 × 1
##    director            
##    <fct>               
##  1 Hal Ashby           
##  2 Roger Michell       
##  3 Phillip Noyce       
##  4 Joel Coen           
##  5 Mike Mills          
##  6 Katt Shea           
##  7 Woody Allen         
##  8 Jon Amiel           
##  9 Paul Thomas Anderson
## 10 Steven Spielberg    
## # ℹ 500 more rows