Import your data

# Load the movie data from the Excel workbook
Movies <- read_excel(path = "../00_data/MyData.xlsx")
## New names:
## • `` -> `...1`
# Print the imported tibble to inspect its columns and first rows
Movies
## # A tibble: 3,401 × 9
##     ...1 release_date movie     production_budget domestic_gross worldwide_gross
##    <dbl> <chr>        <chr>                 <dbl>          <dbl>           <dbl>
##  1     1 6/22/2007    Evan Alm…         175000000      100289690       174131329
##  2     2 7/28/1995    Waterwor…         175000000       88246220       264246220
##  3     3 5/12/2017    King Art…         175000000       39175066       139950708
##  4     4 12/25/2013   47 Ronin          175000000       38362475       151716815
##  5     5 6/22/2018    Jurassic…         170000000      416769345      1304866322
##  6     6 8/1/2014     Guardian…         170000000      333172112       771051335
##  7     7 5/7/2010     Iron Man…         170000000      312433331       621156389
##  8     8 4/4/2014     Captain …         170000000      259746958       714401889
##  9     9 7/11/2014    Dawn of …         170000000      208545589       710644566
## 10    10 11/10/2004   The Pola…         170000000      186493587       310634169
## # ℹ 3,391 more rows
## # ℹ 3 more variables: distributor <chr>, mpaa_rating <chr>, genre <chr>

Chapter 15

Create a factor

# Tally how many movies fall into each genre
count(Movies, genre)
## # A tibble: 5 × 2
##   genre         n
##   <chr>     <int>
## 1 Action      573
## 2 Adventure   481
## 3 Comedy      813
## 4 Drama      1236
## 5 Horror      298
# Genre levels listed from the most to the least frequent genre
genre_levels <- c("Drama", "Comedy", "Action", "Adventure", "Horror")

# Store genre as a factor that follows the explicit level order above
Movies_rev <- mutate(Movies, genre = factor(genre, levels = genre_levels))

Modify factor order

Make two bar charts here: one before ordering and another after.

# Mean domestic gross per genre; missing values are ignored.
# group_by() is kept (rather than `.by`) so rows stay sorted by genre.
Movies_summary <- Movies %>%
  group_by(genre) %>%
  summarise(domestic_gross = mean(domestic_gross, na.rm = TRUE))

Movies_summary
## # A tibble: 5 × 2
##   genre     domestic_gross
##   <chr>              <dbl>
## 1 Action         61650873.
## 2 Adventure      85269013.
## 3 Comedy         40359214.
## 4 Drama          28189735.
## 5 Horror         35183292.
# Before ordering: genre is a character column, so the y axis is alphabetical
Movies_summary %>%
  ggplot(aes(x = domestic_gross, y = genre)) +
  geom_point()

# After ordering: genres are sorted by their mean domestic gross
Movies_summary %>%
  ggplot(aes(x = domestic_gross, y = fct_reorder(genre, domestic_gross))) +
  geom_point()

Modify factor levels

Show examples of three functions:

  • fct_recode
# Abbreviate every genre label; fct_recode() takes "new" = "old" pairs
Movies %>%
  mutate(
    genre = genre %>%
      fct_recode(
        "D" = "Drama",
        "C" = "Comedy",
        "Ac" = "Action",
        "Ad" = "Adventure",
        "H" = "Horror"
      )
  ) %>%
  count(genre)
## # A tibble: 5 × 2
##   genre     n
##   <fct> <int>
## 1 Ac      573
## 2 Ad      481
## 3 C       813
## 4 D      1236
## 5 H       298
  • fct_collapse
# Collapse the genres into two groups: Drama versus everything else
Movies %>%
  mutate(
    genre = genre %>%
      fct_collapse(
        Drama = "Drama",
        Other = c("Action", "Adventure", "Horror", "Comedy")
      )
  ) %>%
  count(genre)
## # A tibble: 2 × 2
##   genre     n
##   <fct> <int>
## 1 Other  2165
## 2 Drama  1236
  • fct_lump
# Default lumping: merge the least frequent distributors into "Other" only as
# long as "Other" stays the smallest level. fct_lump() is superseded, so the
# equivalent fct_lump_lowfreq() is called explicitly.
Movies %>%
  mutate(distributor = fct_lump_lowfreq(distributor)) %>%
  count(distributor)
## # A tibble: 202 × 2
##    distributor              n
##    <fct>                <int>
##  1 20th Century Fox       282
##  2 8X Entertainment         1
##  3 A24                     16
##  4 Abramorama Films         1
##  5 Access Motion Pictu…     1
##  6 After Dark               1
##  7 Alchemy                  7
##  8 Alliance Films           1
##  9 American Internatio…     2
## 10 Analysis                 1
## # ℹ 192 more rows
# Keep only the single most common distributor and lump the rest into "Other".
# fct_lump_n() replaces the superseded fct_lump(n = ); TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
Movies %>%
  mutate(distributor = fct_lump_n(distributor, n = 1)) %>%
  count(distributor, sort = TRUE)
## # A tibble: 2 × 2
##   distributor      n
##   <fct>        <int>
## 1 Other         3027
## 2 Warner Bros.   374
# Keep distributors that account for more than 1% of the rows and lump the
# rest into "Other". fct_lump_prop() replaces the superseded fct_lump(prop = );
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
Movies %>%
  mutate(distributor = fct_lump_prop(distributor, prop = 0.01)) %>%
  count(distributor, sort = TRUE)
## # A tibble: 17 × 2
##    distributor                n
##    <fct>                  <int>
##  1 Other                    729
##  2 Warner Bros.             374
##  3 Sony Pictures            339
##  4 Universal                307
##  5 20th Century Fox         282
##  6 Paramount Pictures       267
##  7 Walt Disney              240
##  8 Lionsgate                147
##  9 MGM                      121
## 10 Miramax                  103
## 11 New Line                 100
## 12 Sony Pictures Classics    97
## 13 Fox Searchlight           81
## 14 Weinstein Co.             68
## 15 Focus Features            54
## 16 NA                        48
## 17 Dreamworks SKG            44

Chapter 16