Import data
# Load the movie dataset from the Excel workbook in the shared data folder
data <- readxl::read_excel(path = "../00_data/MyData.xlsx")
## New names:
## • `` -> `...1`
# Auto-print the imported tibble to inspect its dimensions, column names and
# column types before applying any verbs.
data
## # A tibble: 3,401 × 9
## ...1 release_date movie production_budget domestic_gross worldwide_gross
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 6/22/2007 Evan Alm… 175000000 100289690 174131329
## 2 2 7/28/1995 Waterwor… 175000000 88246220 264246220
## 3 3 5/12/2017 King Art… 175000000 39175066 139950708
## 4 4 12/25/2013 47 Ronin 175000000 38362475 151716815
## 5 5 6/22/2018 Jurassic… 170000000 416769345 1304866322
## 6 6 8/1/2014 Guardian… 170000000 333172112 771051335
## 7 7 5/7/2010 Iron Man… 170000000 312433331 621156389
## 8 8 4/4/2014 Captain … 170000000 259746958 714401889
## 9 9 7/11/2014 Dawn of … 170000000 208545589 710644566
## 10 10 11/10/2004 The Pola… 170000000 186493587 310634169
## # ℹ 3,391 more rows
## # ℹ 3 more variables: distributor <chr>, mpaa_rating <chr>, genre <chr>
Apply the following dplyr verbs to your data
Filter rows
# Keep only the rows whose MPAA rating is exactly "G"
data %>%
  filter(mpaa_rating == "G")
## # A tibble: 85 × 9
## ...1 release_date movie production_budget domestic_gross worldwide_gross
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 10 11/10/2004 The Pola… 170000000 186493587 310634169
## 2 46 6/29/2007 Ratatoui… 150000000 206445654 626549695
## 3 74 6/16/1999 Tarzan 145000000 171091819 448191819
## 4 113 4/11/2014 Rio 2 130000000 131538435 492846291
## 5 151 11/2/2001 Monsters… 115000000 289423425 559757719
## 6 185 11/25/2009 The Prin… 105000000 104400899 270997378
## 7 208 6/21/1996 The Hunc… 100000000 100138851 325500000
## 8 209 12/15/2000 The Empe… 100000000 89296573 169296573
## 9 230 11/6/2015 The Pean… 99000000 130178411 250091610
## 10 244 5/30/2003 Finding … 94000000 380529370 936429370
## # ℹ 75 more rows
## # ℹ 3 more variables: distributor <chr>, mpaa_rating <chr>, genre <chr>
Arrange rows
# Sort chronologically. `release_date` is stored as character text in m/d/Y
# form, so arranging on it directly orders rows alphabetically ("1/1/1946",
# "1/1/1967", "1/1/1970", ...) instead of by date. Parse it to a Date for the
# sort key only; the column itself is left unchanged. Ties are broken by
# genre, then by MPAA rating.
# NOTE(review): any printed output captured from the old string-based ordering
# needs to be regenerated after this change.
arrange(
  data,
  as.Date(release_date, format = "%m/%d/%Y"),
  genre,
  mpaa_rating
)
## # A tibble: 3,401 × 9
## ...1 release_date movie production_budget domestic_gross worldwide_gross
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 3027 1/1/1946 It’s a W… 3180000 6600000 10768908
## 2 2999 1/1/1967 In Cold … 3500000 13000000 13007551
## 3 3270 1/1/1970 Beyond t… 1000000 9000000 9000000
## 4 1649 1/1/1970 Darling … 22000000 5000000 5000000
## 5 2364 1/1/1970 The Moll… 11000000 2200000 2200000
## 6 2349 1/1/1975 Barry Ly… 11000000 20000000 20169934
## 7 2984 1/1/1976 Network 3800000 23689877 23689877
## 8 2171 1/1/1978 Caravans 14000000 1000000 1000000
## 9 3034 1/1/1978 Coming H… 3000000 32653000 32653000
## 10 2041 1/1/1979 The Deer… 15000000 50000000 50009253
## # ℹ 3,391 more rows
## # ℹ 3 more variables: distributor <chr>, mpaa_rating <chr>, genre <chr>
Select columns
# Reduce the table to the movie title and its production budget
data %>%
  select(movie, production_budget)
## # A tibble: 3,401 × 2
## movie production_budget
## <chr> <dbl>
## 1 Evan Almighty 175000000
## 2 Waterworld 175000000
## 3 King Arthur: Legend of the Sword 175000000
## 4 47 Ronin 175000000
## 5 Jurassic World: Fallen Kingdom 170000000
## 6 Guardians of the Galaxy 170000000
## 7 Iron Man 2 170000000
## 8 Captain America: The Winter Soldier 170000000
## 9 Dawn of the Planet of the Apes 170000000
## 10 The Polar Express 170000000
## # ℹ 3,391 more rows
Add columns
# Add a `profit` column: total box-office takings minus production cost.
# `worldwide_gross` already includes the domestic takings (in the data it is
# never smaller than `domestic_gross`, and equals it for films with no
# international release), so adding `domestic_gross` again would double-count
# the domestic revenue.
mutate(data, profit = worldwide_gross - production_budget)
## # A tibble: 3,401 × 10
## ...1 release_date movie production_budget domestic_gross worldwide_gross
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 6/22/2007 Evan Alm… 175000000 100289690 174131329
## 2 2 7/28/1995 Waterwor… 175000000 88246220 264246220
## 3 3 5/12/2017 King Art… 175000000 39175066 139950708
## 4 4 12/25/2013 47 Ronin 175000000 38362475 151716815
## 5 5 6/22/2018 Jurassic… 170000000 416769345 1304866322
## 6 6 8/1/2014 Guardian… 170000000 333172112 771051335
## 7 7 5/7/2010 Iron Man… 170000000 312433331 621156389
## 8 8 4/4/2014 Captain … 170000000 259746958 714401889
## 9 9 7/11/2014 Dawn of … 170000000 208545589 710644566
## 10 10 11/10/2004 The Pola… 170000000 186493587 310634169
## # ℹ 3,391 more rows
## # ℹ 4 more variables: distributor <chr>, mpaa_rating <chr>, genre <chr>,
## # profit <dbl>
Summarize by groups
# Group the movies by genre, then compute the mean production budget within
# each genre (missing budgets are ignored).
# NOTE(review): despite its name, `by_day` holds the data grouped by genre;
# the name is kept here in case later code refers to it.
by_day <- data %>%
  group_by(genre)

by_day %>%
  summarise(
    Avg_Production_Budget = mean(production_budget, na.rm = TRUE)
  )
## # A tibble: 5 × 2
## genre Avg_Production_Budget
## <chr> <dbl>
## 1 Action 55870572.
## 2 Adventure 61111682.
## 3 Comedy 24289354.
## 4 Drama 21774116.
## 5 Horror 17224168.