# Import the movies dataset from the Excel workbook into a tibble.
Movies <- read_excel(path = "../00_data/MyData.xlsx")
## New names:
## • `` -> `...1`
# Print the imported tibble to inspect its column names, types and first rows.
Movies
## # A tibble: 3,401 × 9
## ...1 release_date movie production_budget domestic_gross worldwide_gross
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 6/22/2007 Evan Alm… 175000000 100289690 174131329
## 2 2 7/28/1995 Waterwor… 175000000 88246220 264246220
## 3 3 5/12/2017 King Art… 175000000 39175066 139950708
## 4 4 12/25/2013 47 Ronin 175000000 38362475 151716815
## 5 5 6/22/2018 Jurassic… 170000000 416769345 1304866322
## 6 6 8/1/2014 Guardian… 170000000 333172112 771051335
## 7 7 5/7/2010 Iron Man… 170000000 312433331 621156389
## 8 8 4/4/2014 Captain … 170000000 259746958 714401889
## 9 9 7/11/2014 Dawn of … 170000000 208545589 710644566
## 10 10 11/10/2004 The Pola… 170000000 186493587 310634169
## # ℹ 3,391 more rows
## # ℹ 3 more variables: distributor <chr>, mpaa_rating <chr>, genre <chr>
# Tally how many movies fall into each genre.
count(Movies, genre)
## # A tibble: 5 × 2
## genre n
## <chr> <int>
## 1 Action 573
## 2 Adventure 481
## 3 Comedy 813
## 4 Drama 1236
## 5 Horror 298
# Desired genre order, from the most to the least frequent genre
# according to the counts above.
genre_levels <- c("Drama", "Comedy", "Action", "Adventure", "Horror")

# Copy of the data with `genre` converted from character to a factor
# that uses the explicit level order.
Movies_rev <- Movies %>%
  mutate(genre = factor(genre, levels = genre_levels))
Make two bar charts here: one before ordering and another after.
# Mean domestic gross per genre; missing gross values are ignored.
Movies_summary <- Movies %>%
  group_by(genre) %>%
  summarise(domestic_gross = mean(domestic_gross, na.rm = TRUE))

# Show the one-row-per-genre summary.
Movies_summary
## # A tibble: 5 × 2
## genre domestic_gross
## <chr> <dbl>
## 1 Action 61650873.
## 2 Adventure 85269013.
## 3 Comedy 40359214.
## 4 Drama 28189735.
## 5 Horror 35183292.
# Bar chart of mean domestic gross per genre. `genre` is a character column
# here, so the bars appear in the default alphabetical order.
ggplot(Movies_summary, aes(domestic_gross, genre)) +
  geom_col()

# Same bar chart, with genres reordered by their mean domestic gross so the
# bars are sorted by length. The axis title is set explicitly because it
# would otherwise show the raw `fct_reorder(...)` expression.
ggplot(
  Movies_summary,
  aes(domestic_gross, fct_reorder(genre, domestic_gross))
) +
  geom_col() +
  labs(y = "genre")
Show examples of three functions: fct_recode(), fct_collapse(), and fct_lump().
# fct_recode(): rename each genre level to a short abbreviation
# (new name = old name), then count movies per abbreviated genre.
Movies %>%
  mutate(
    genre = fct_recode(
      genre,
      Ac = "Action",
      Ad = "Adventure",
      C = "Comedy",
      D = "Drama",
      H = "Horror"
    )
  ) %>%
  count(genre)
## # A tibble: 5 × 2
## genre n
## <fct> <int>
## 1 Ac 573
## 2 Ad 481
## 3 C 813
## 4 D 1236
## 5 H 298
# fct_collapse(): keep "Drama" as its own level and merge every other
# genre into a single "Other" level, then count movies per group.
Movies %>%
  mutate(
    genre = fct_collapse(
      genre,
      Drama = "Drama",
      Other = c("Action", "Adventure", "Horror", "Comedy")
    )
  ) %>%
  count(genre)
## # A tibble: 2 × 2
## genre n
## <fct> <int>
## 1 Other 2165
## 2 Drama 1236
# fct_lump() with neither `n` nor `prop`: the function decides on its own
# which rare levels to lump; the result is then counted per distributor.
Movies %>%
  mutate(distributor = distributor %>% fct_lump()) %>%
  count(distributor)
## # A tibble: 202 × 2
## distributor n
## <fct> <int>
## 1 20th Century Fox 282
## 2 8X Entertainment 1
## 3 A24 16
## 4 Abramorama Films 1
## 5 Access Motion Pictu… 1
## 6 After Dark 1
## 7 Alchemy 7
## 8 Alliance Films 1
## 9 American Internatio… 2
## 10 Analysis 1
## # ℹ 192 more rows
# fct_lump(n = 1): keep only the single most frequent distributor and lump
# all the others into "Other"; list the result by descending count.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
Movies %>%
  mutate(distributor = distributor %>% fct_lump(n = 1)) %>%
  count(distributor, sort = TRUE)
## # A tibble: 2 × 2
## distributor n
## <fct> <int>
## 1 Other 3027
## 2 Warner Bros. 374
# fct_lump(prop = 0.01): lump distributors that account for less than
# roughly 1% of the movies into "Other"; list the result by descending count.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
Movies %>%
  mutate(distributor = distributor %>% fct_lump(prop = 0.01)) %>%
  count(distributor, sort = TRUE)
## # A tibble: 17 × 2
## distributor n
## <fct> <int>
## 1 Other 729
## 2 Warner Bros. 374
## 3 Sony Pictures 339
## 4 Universal 307
## 5 20th Century Fox 282
## 6 Paramount Pictures 267
## 7 Walt Disney 240
## 8 Lionsgate 147
## 9 MGM 121
## 10 Miramax 103
## 11 New Line 100
## 12 Sony Pictures Classics 97
## 13 Fox Searchlight 81
## 14 Weinstein Co. 68
## 15 Focus Features 54
## 16 NA 48
## 17 Dreamworks SKG 44