Import your data

# excel file
data <- read_excel("../00_data/MyData.xlsx")
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900

## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
data
## # A tibble: 1,155 × 13
##    movie_name    release_year director age_difference couple_number actor_1_name
##    <chr>                <dbl> <chr>             <dbl>         <dbl> <chr>       
##  1 Venus                 2006 Roger M…             50             1 Peter O'Too…
##  2 The Quiet Am…         2002 Phillip…             49             1 Michael Cai…
##  3 The Big Lebo…         1998 Joel Co…             45             1 David Huddl…
##  4 Poison Ivy            1992 Katt Sh…             42             1 Tom Skerritt
##  5 Whatever Wor…         2009 Woody A…             40             1 Larry David 
##  6 Entrapment            1999 Jon Ami…             39             1 Sean Connery
##  7 Husbands and…         1992 Woody A…             38             1 Woody Allen 
##  8 Magnolia              1999 Paul Th…             38             1 Jason Robar…
##  9 Indiana Jone…         1989 Steven …             36             1 Sean Connery
## 10 Mr. Peabody …         1948 Irving …             36             1 William Pow…
## # ℹ 1,145 more rows
## # ℹ 7 more variables: actor_2_name <chr>, character_1_gender <chr>,
## #   character_2_gender <chr>, actor_1_birthdate <dttm>,
## #   actor_2_birthdate <dttm>, actor_1_age <dbl>, actor_2_age <dbl>

Pivoting

data %>%
    pivot_wider(names_from = couple_number, values_from = age_difference)
## # A tibble: 1,155 × 18
##    movie_name release_year director actor_1_name actor_2_name character_1_gender
##    <chr>             <dbl> <chr>    <chr>        <chr>        <chr>             
##  1 Venus              2006 Roger M… Peter O'Too… Jodie Whitt… man               
##  2 The Quiet…         2002 Phillip… Michael Cai… Do Thi Hai … man               
##  3 The Big L…         1998 Joel Co… David Huddl… Tara Reid    man               
##  4 Poison Ivy         1992 Katt Sh… Tom Skerritt Drew Barrym… man               
##  5 Whatever …         2009 Woody A… Larry David  Evan Rachel… man               
##  6 Entrapment         1999 Jon Ami… Sean Connery Catherine Z… man               
##  7 Husbands …         1992 Woody A… Woody Allen  Juliette Le… man               
##  8 Magnolia           1999 Paul Th… Jason Robar… Julianne Mo… man               
##  9 Indiana J…         1989 Steven … Sean Connery Alison Doody man               
## 10 Mr. Peabo…         1948 Irving … William Pow… Ann Blyth    man               
## # ℹ 1,145 more rows
## # ℹ 12 more variables: character_2_gender <chr>, actor_1_birthdate <dttm>,
## #   actor_2_birthdate <dttm>, actor_1_age <dbl>, actor_2_age <dbl>, `1` <dbl>,
## #   `2` <dbl>, `3` <dbl>, `4` <dbl>, `5` <dbl>, `6` <dbl>, `7` <dbl>

long to wide form

wide to long form

Separating and Uniting

Separate a column

data %>%
    
    separate(col = couple_number, into = c("movie_name"))
## # A tibble: 1,155 × 12
##    release_year director     age_difference movie_name actor_1_name actor_2_name
##           <dbl> <chr>                 <dbl> <chr>      <chr>        <chr>       
##  1         2006 Roger Miche…             50 1          Peter O'Too… Jodie Whitt…
##  2         2002 Phillip Noy…             49 1          Michael Cai… Do Thi Hai …
##  3         1998 Joel Coen                45 1          David Huddl… Tara Reid   
##  4         1992 Katt Shea                42 1          Tom Skerritt Drew Barrym…
##  5         2009 Woody Allen              40 1          Larry David  Evan Rachel…
##  6         1999 Jon Amiel                39 1          Sean Connery Catherine Z…
##  7         1992 Woody Allen              38 1          Woody Allen  Juliette Le…
##  8         1999 Paul Thomas…             38 1          Jason Robar… Julianne Mo…
##  9         1989 Steven Spie…             36 1          Sean Connery Alison Doody
## 10         1948 Irving Pich…             36 1          William Pow… Ann Blyth   
## # ℹ 1,145 more rows
## # ℹ 6 more variables: character_1_gender <chr>, character_2_gender <chr>,
## #   actor_1_birthdate <dttm>, actor_2_birthdate <dttm>, actor_1_age <dbl>,
## #   actor_2_age <dbl>

Unite two columns

data %>%
    
        unite(col= "age_difference", c(actor_1_age:actor_2_age), sep= "/" )
## # A tibble: 1,155 × 11
##    movie_name      release_year director couple_number actor_1_name actor_2_name
##    <chr>                  <dbl> <chr>            <dbl> <chr>        <chr>       
##  1 Venus                   2006 Roger M…             1 Peter O'Too… Jodie Whitt…
##  2 The Quiet Amer…         2002 Phillip…             1 Michael Cai… Do Thi Hai …
##  3 The Big Lebows…         1998 Joel Co…             1 David Huddl… Tara Reid   
##  4 Poison Ivy              1992 Katt Sh…             1 Tom Skerritt Drew Barrym…
##  5 Whatever Works          2009 Woody A…             1 Larry David  Evan Rachel…
##  6 Entrapment              1999 Jon Ami…             1 Sean Connery Catherine Z…
##  7 Husbands and W…         1992 Woody A…             1 Woody Allen  Juliette Le…
##  8 Magnolia                1999 Paul Th…             1 Jason Robar… Julianne Mo…
##  9 Indiana Jones …         1989 Steven …             1 Sean Connery Alison Doody
## 10 Mr. Peabody an…         1948 Irving …             1 William Pow… Ann Blyth   
## # ℹ 1,145 more rows
## # ℹ 5 more variables: character_1_gender <chr>, character_2_gender <chr>,
## #   actor_1_birthdate <dttm>, actor_2_birthdate <dttm>, age_difference <chr>

Missing Values

data %>% 
  pivot_wider(names_from = release_year, values_from = director) %>% 
  pivot_longer(
    cols = c(`2000`, `2022`), 
    names_to = "year", 
    values_to = "return", 
    values_drop_na = TRUE
  )
## # A tibble: 39 × 93
##    movie_name           age_difference couple_number actor_1_name   actor_2_name
##    <chr>                         <dbl>         <dbl> <chr>          <chr>       
##  1 Autumn in New York               22             1 Richard Gere   Winona Ryder
##  2 The Bubble                       21             1 Pedro Pascal   Maria Bakal…
##  3 The Northman                     20             1 Alexander Ska… Anya Taylor…
##  4 The Whole Nine Yards             17             1 Bruce Willis   Amanda Peet 
##  5 Remember the Titans              16             1 Denzel Washin… Nicole Ari …
##  6 What Lies Beneath                16             1 Harrison Ford  Michelle Pf…
##  7 The Gift                         15             1 Greg Kinnear   Katie Holmes
##  8 Wonder Boys                      13             1 Michael Dougl… Frances McD…
##  9 Return to Me                     10             1 David Duchovny Minnie Driv…
## 10 Proof of Life                     8             1 David Morse    Meg Ryan    
## # ℹ 29 more rows
## # ℹ 88 more variables: character_1_gender <chr>, character_2_gender <chr>,
## #   actor_1_birthdate <dttm>, actor_2_birthdate <dttm>, actor_1_age <dbl>,
## #   actor_2_age <dbl>, `2006` <chr>, `2002` <chr>, `1998` <chr>, `1992` <chr>,
## #   `2009` <chr>, `1999` <chr>, `1989` <chr>, `1948` <chr>, `1995` <chr>,
## #   `2003` <chr>, `2004` <chr>, `2005` <chr>, `2010` <chr>, `1981` <chr>,
## #   `1955` <chr>, `1997` <chr>, `2013` <chr>, `2008` <chr>, `1985` <chr>, …