Import your data
# excel file
data <- read_excel("../00_data/MyData.xlsx")
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
## Warning in read_fun(path = path, sheet_i = sheet, limits = limits, shim = shim,
## : NA inserted for an unsupported date prior to 1900
data
## # A tibble: 1,155 × 13
## movie_name release_year director age_difference couple_number actor_1_name
## <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 Venus 2006 Roger M… 50 1 Peter O'Too…
## 2 The Quiet Am… 2002 Phillip… 49 1 Michael Cai…
## 3 The Big Lebo… 1998 Joel Co… 45 1 David Huddl…
## 4 Poison Ivy 1992 Katt Sh… 42 1 Tom Skerritt
## 5 Whatever Wor… 2009 Woody A… 40 1 Larry David
## 6 Entrapment 1999 Jon Ami… 39 1 Sean Connery
## 7 Husbands and… 1992 Woody A… 38 1 Woody Allen
## 8 Magnolia 1999 Paul Th… 38 1 Jason Robar…
## 9 Indiana Jone… 1989 Steven … 36 1 Sean Connery
## 10 Mr. Peabody … 1948 Irving … 36 1 William Pow…
## # ℹ 1,145 more rows
## # ℹ 7 more variables: actor_2_name <chr>, character_1_gender <chr>,
## # character_2_gender <chr>, actor_1_birthdate <dttm>,
## # actor_2_birthdate <dttm>, actor_1_age <dbl>, actor_2_age <dbl>
Pivoting
data %>%
pivot_wider(names_from = couple_number, values_from = age_difference)
## # A tibble: 1,155 × 18
## movie_name release_year director actor_1_name actor_2_name character_1_gender
## <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 Venus 2006 Roger M… Peter O'Too… Jodie Whitt… man
## 2 The Quiet… 2002 Phillip… Michael Cai… Do Thi Hai … man
## 3 The Big L… 1998 Joel Co… David Huddl… Tara Reid man
## 4 Poison Ivy 1992 Katt Sh… Tom Skerritt Drew Barrym… man
## 5 Whatever … 2009 Woody A… Larry David Evan Rachel… man
## 6 Entrapment 1999 Jon Ami… Sean Connery Catherine Z… man
## 7 Husbands … 1992 Woody A… Woody Allen Juliette Le… man
## 8 Magnolia 1999 Paul Th… Jason Robar… Julianne Mo… man
## 9 Indiana J… 1989 Steven … Sean Connery Alison Doody man
## 10 Mr. Peabo… 1948 Irving … William Pow… Ann Blyth man
## # ℹ 1,145 more rows
## # ℹ 12 more variables: character_2_gender <chr>, actor_1_birthdate <dttm>,
## # actor_2_birthdate <dttm>, actor_1_age <dbl>, actor_2_age <dbl>, `1` <dbl>,
## # `2` <dbl>, `3` <dbl>, `4` <dbl>, `5` <dbl>, `6` <dbl>, `7` <dbl>
long to wide form
wide to long form
Separating and Uniting
Separate a column
data %>%
separate(col = couple_number, into = c("movie_name"))
## # A tibble: 1,155 × 12
## release_year director age_difference movie_name actor_1_name actor_2_name
## <dbl> <chr> <dbl> <chr> <chr> <chr>
## 1 2006 Roger Miche… 50 1 Peter O'Too… Jodie Whitt…
## 2 2002 Phillip Noy… 49 1 Michael Cai… Do Thi Hai …
## 3 1998 Joel Coen 45 1 David Huddl… Tara Reid
## 4 1992 Katt Shea 42 1 Tom Skerritt Drew Barrym…
## 5 2009 Woody Allen 40 1 Larry David Evan Rachel…
## 6 1999 Jon Amiel 39 1 Sean Connery Catherine Z…
## 7 1992 Woody Allen 38 1 Woody Allen Juliette Le…
## 8 1999 Paul Thomas… 38 1 Jason Robar… Julianne Mo…
## 9 1989 Steven Spie… 36 1 Sean Connery Alison Doody
## 10 1948 Irving Pich… 36 1 William Pow… Ann Blyth
## # ℹ 1,145 more rows
## # ℹ 6 more variables: character_1_gender <chr>, character_2_gender <chr>,
## # actor_1_birthdate <dttm>, actor_2_birthdate <dttm>, actor_1_age <dbl>,
## # actor_2_age <dbl>
Unite two columns
data %>%
unite(col= "age_difference", c(actor_1_age:actor_2_age), sep= "/" )
## # A tibble: 1,155 × 11
## movie_name release_year director couple_number actor_1_name actor_2_name
## <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 Venus 2006 Roger M… 1 Peter O'Too… Jodie Whitt…
## 2 The Quiet Amer… 2002 Phillip… 1 Michael Cai… Do Thi Hai …
## 3 The Big Lebows… 1998 Joel Co… 1 David Huddl… Tara Reid
## 4 Poison Ivy 1992 Katt Sh… 1 Tom Skerritt Drew Barrym…
## 5 Whatever Works 2009 Woody A… 1 Larry David Evan Rachel…
## 6 Entrapment 1999 Jon Ami… 1 Sean Connery Catherine Z…
## 7 Husbands and W… 1992 Woody A… 1 Woody Allen Juliette Le…
## 8 Magnolia 1999 Paul Th… 1 Jason Robar… Julianne Mo…
## 9 Indiana Jones … 1989 Steven … 1 Sean Connery Alison Doody
## 10 Mr. Peabody an… 1948 Irving … 1 William Pow… Ann Blyth
## # ℹ 1,145 more rows
## # ℹ 5 more variables: character_1_gender <chr>, character_2_gender <chr>,
## # actor_1_birthdate <dttm>, actor_2_birthdate <dttm>, age_difference <chr>
Missing Values
data %>%
pivot_wider(names_from = release_year, values_from = director) %>%
pivot_longer(
cols = c(`2000`, `2022`),
names_to = "year",
values_to = "return",
values_drop_na = TRUE
)
## # A tibble: 39 × 93
## movie_name age_difference couple_number actor_1_name actor_2_name
## <chr> <dbl> <dbl> <chr> <chr>
## 1 Autumn in New York 22 1 Richard Gere Winona Ryder
## 2 The Bubble 21 1 Pedro Pascal Maria Bakal…
## 3 The Northman 20 1 Alexander Ska… Anya Taylor…
## 4 The Whole Nine Yards 17 1 Bruce Willis Amanda Peet
## 5 Remember the Titans 16 1 Denzel Washin… Nicole Ari …
## 6 What Lies Beneath 16 1 Harrison Ford Michelle Pf…
## 7 The Gift 15 1 Greg Kinnear Katie Holmes
## 8 Wonder Boys 13 1 Michael Dougl… Frances McD…
## 9 Return to Me 10 1 David Duchovny Minnie Driv…
## 10 Proof of Life 8 1 David Morse Meg Ryan
## # ℹ 29 more rows
## # ℹ 88 more variables: character_1_gender <chr>, character_2_gender <chr>,
## # actor_1_birthdate <dttm>, actor_2_birthdate <dttm>, actor_1_age <dbl>,
## # actor_2_age <dbl>, `2006` <chr>, `2002` <chr>, `1998` <chr>, `1992` <chr>,
## # `2009` <chr>, `1999` <chr>, `1989` <chr>, `1948` <chr>, `1995` <chr>,
## # `2003` <chr>, `2004` <chr>, `2005` <chr>, `2010` <chr>, `1981` <chr>,
## # `1955` <chr>, `1997` <chr>, `2013` <chr>, `2008` <chr>, `1985` <chr>, …