Import data
# Load the workbook "Apply_1.xlsx" from the working directory into `data`,
# then print it to preview the first rows and the column types.
data <- read_excel(path = "Apply_1.xlsx")
data
## # A tibble: 1,155 × 13
## movie_name release_year director age_difference couple_number actor_1_name
## <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 Harold and M… 1971 Hal Ash… 52 1 Ruth Gordon
## 2 Venus 2006 Roger M… 50 1 Peter O'Too…
## 3 The Quiet Am… 2002 Phillip… 49 1 Michael Cai…
## 4 The Big Lebo… 1998 Joel Co… 45 1 David Huddl…
## 5 Beginners 2010 Mike Mi… 43 1 Christopher…
## 6 Poison Ivy 1992 Katt Sh… 42 1 Tom Skerritt
## 7 Whatever Wor… 2009 Woody A… 40 1 Larry David
## 8 Entrapment 1999 Jon Ami… 39 1 Sean Connery
## 9 Husbands and… 1992 Woody A… 38 1 Woody Allen
## 10 Magnolia 1999 Paul Th… 38 1 Jason Robar…
## # ℹ 1,145 more rows
## # ℹ 7 more variables: actor_2_name <chr>, character_1_gender <chr>,
## # character_2_gender <chr>, actor_1_birthdate <chr>, actor_2_birthdate <chr>,
## # actor_1_age <dbl>, actor_2_age <dbl>
Apply the following dplyr verbs to your data
Filter rows
# Keep only the rows where actor 1 was 75 AND actor 2 was 23.
data %>%
  filter(actor_1_age == 75 & actor_2_age == 23)
## # A tibble: 1 × 13
## movie_name release_year director age_difference couple_number actor_1_name
## <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 Harold and Ma… 1971 Hal Ash… 52 1 Ruth Gordon
## # ℹ 7 more variables: actor_2_name <chr>, character_1_gender <chr>,
## # character_2_gender <chr>, actor_1_birthdate <chr>, actor_2_birthdate <chr>,
## # actor_1_age <dbl>, actor_2_age <dbl>
# Keep the rows where actor 1 was 75 OR actor 2 was 23 (either condition).
data %>%
  filter(actor_1_age == 75 | actor_2_age == 23)
## # A tibble: 58 × 13
## movie_name release_year director age_difference couple_number actor_1_name
## <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 Harold and M… 1971 Hal Ash… 52 1 Ruth Gordon
## 2 The Big Lebo… 1998 Joel Co… 45 1 David Huddl…
## 3 Indiana Jone… 1989 Steven … 36 1 Sean Connery
## 4 Fort Apache,… 1981 Daniel … 33 1 Paul Newman
## 5 The Private … 2009 Rebecca… 32 1 Alan Arkin
## 6 High Noon 1952 Fred Zi… 28 1 Gary Cooper
## 7 The Squid an… 2005 Noah Ba… 27 1 Jeff Daniels
## 8 Die Another … 2002 Lee Tam… 26 1 Pierce Bros…
## 9 Dark Passage 1947 Delmer … 25 1 Humphrey Bo…
## 10 State of the… 1948 Frank C… 25 1 Spencer Tra…
## # ℹ 48 more rows
## # ℹ 7 more variables: actor_2_name <chr>, character_1_gender <chr>,
## # character_2_gender <chr>, actor_1_birthdate <chr>, actor_2_birthdate <chr>,
## # actor_1_age <dbl>, actor_2_age <dbl>
Arrange rows
# Sort by actor 1's age, oldest first; ties are broken by actor 2's age,
# also oldest first.
data %>%
  arrange(
    desc(actor_1_age),
    desc(actor_2_age)
  )
## # A tibble: 1,155 × 13
## movie_name release_year director age_difference couple_number actor_1_name
## <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 Beginners 2010 Mike Mi… 43 1 Christopher…
## 2 A Walk in th… 2015 Ken Kwa… 23 1 Robert Redf…
## 3 Magnolia 1999 Paul Th… 38 1 Jason Robar…
## 4 The Private … 2009 Rebecca… 32 1 Alan Arkin
## 5 Harold and M… 1971 Hal Ash… 52 1 Ruth Gordon
## 6 Venus 2006 Roger M… 50 1 Peter O'Too…
## 7 The Royal Te… 2001 Wes And… 21 2 Gene Hackman
## 8 The Prom 2020 Ryan Mu… 22 1 Meryl Streep
## 9 And So It Go… 2014 Rob Rei… 2 1 Michael Dou…
## 10 Behind the C… 2013 Steven … 26 1 Michael Dou…
## # ℹ 1,145 more rows
## # ℹ 7 more variables: actor_2_name <chr>, character_1_gender <chr>,
## # character_2_gender <chr>, actor_1_birthdate <chr>, actor_2_birthdate <chr>,
## # actor_1_age <dbl>, actor_2_age <dbl>
# Sort so the couples with the largest age difference come first.
data %>%
  arrange(desc(age_difference))
## # A tibble: 1,155 × 13
## movie_name release_year director age_difference couple_number actor_1_name
## <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 Harold and M… 1971 Hal Ash… 52 1 Ruth Gordon
## 2 Venus 2006 Roger M… 50 1 Peter O'Too…
## 3 The Quiet Am… 2002 Phillip… 49 1 Michael Cai…
## 4 The Big Lebo… 1998 Joel Co… 45 1 David Huddl…
## 5 Beginners 2010 Mike Mi… 43 1 Christopher…
## 6 Poison Ivy 1992 Katt Sh… 42 1 Tom Skerritt
## 7 Whatever Wor… 2009 Woody A… 40 1 Larry David
## 8 Entrapment 1999 Jon Ami… 39 1 Sean Connery
## 9 Husbands and… 1992 Woody A… 38 1 Woody Allen
## 10 Magnolia 1999 Paul Th… 38 1 Jason Robar…
## # ℹ 1,145 more rows
## # ℹ 7 more variables: actor_2_name <chr>, character_1_gender <chr>,
## # character_2_gender <chr>, actor_1_birthdate <chr>, actor_2_birthdate <chr>,
## # actor_1_age <dbl>, actor_2_age <dbl>
Select columns
# Keep the contiguous run of columns from release_year through
# age_difference (positional range, so it follows the column order of `data`).
data %>%
  select(release_year:age_difference)
## # A tibble: 1,155 × 3
## release_year director age_difference
## <dbl> <chr> <dbl>
## 1 1971 Hal Ashby 52
## 2 2006 Roger Michell 50
## 3 2002 Phillip Noyce 49
## 4 1998 Joel Coen 45
## 5 2010 Mike Mills 43
## 6 1992 Katt Shea 42
## 7 2009 Woody Allen 40
## 8 1999 Jon Amiel 39
## 9 1992 Woody Allen 38
## 10 1999 Paul Thomas Anderson 38
## # ℹ 1,145 more rows
Add columns
# Derive agediff as actor 1's age minus actor 2's age, then keep only the
# two age columns and the new agediff column for comparison.
data %>%
  mutate(agediff = actor_1_age - actor_2_age) %>%
  # Select actor_1_age, actor_2_age, and agediff
  select(actor_1_age:actor_2_age, agediff)
## # A tibble: 1,155 × 3
## actor_1_age actor_2_age agediff
## <dbl> <dbl> <dbl>
## 1 75 23 52
## 2 74 24 50
## 3 69 20 49
## 4 68 23 45
## 5 81 38 43
## 6 59 17 42
## 7 62 22 40
## 8 69 30 39
## 9 57 19 38
## 10 77 39 38
## # ℹ 1,145 more rows
Summarize by groups
# Average age difference across all rows, ignoring missing values.
# NOTE(review): no grouping is applied here, so the result is a single
# overall mean (one row); add group_by()/.by if a per-group summary was
# intended.
data %>%
  summarise(avgagediff = mean(age_difference, na.rm = TRUE))
## # A tibble: 1 × 1
## avgagediff
## <dbl>
## 1 10.4