Load the movies dataset

# Read the movies CSV directly from the public gist into a tibble.
movies <- read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)
# Preview the first six rows to check the column names and types.
head(movies, n = 6)
## # A tibble: 6 × 8
##   Film                               Genre   `Lead Studio`        
##   <chr>                              <chr>   <chr>                
## 1 Zack and Miri Make a Porno         Romance The Weinstein Company
## 2 Youth in Revolt                    Comedy  The Weinstein Company
## 3 You Will Meet a Tall Dark Stranger Comedy  Independent          
## 4 When in Rome                       Comedy  Disney               
## 5 What Happens in Vegas              Comedy  Fox                  
## 6 Water For Elephants                Drama   20th Century Fox     
##   `Audience score %` Profitability `Rotten Tomatoes %` `Worldwide Gross`  Year
##                <dbl>         <dbl>               <dbl> <chr>             <dbl>
## 1                 70          1.75                  64 $41.94             2008
## 2                 52          1.09                  68 $19.62             2010
## 3                 35          1.21                  43 $26.66             2010
## 4                 44          0                     15 $43.04             2010
## 5                 72          6.27                  28 $219.37            2008
## 6                 72          3.08                  60 $117.09            2011

1) rename(): Rename Film -> movie_title, Year -> release_year

# Rename the title and year columns; all other columns are carried over as-is.
one <- rename(movies, movie_title = Film, release_year = Year)

# Preview the first six rows of the renamed tibble.
head(one, n = 6)
## # A tibble: 6 × 8
##   movie_title                        Genre   `Lead Studio`        
##   <chr>                              <chr>   <chr>                
## 1 Zack and Miri Make a Porno         Romance The Weinstein Company
## 2 Youth in Revolt                    Comedy  The Weinstein Company
## 3 You Will Meet a Tall Dark Stranger Comedy  Independent          
## 4 When in Rome                       Comedy  Disney               
## 5 What Happens in Vegas              Comedy  Fox                  
## 6 Water For Elephants                Drama   20th Century Fox     
##   `Audience score %` Profitability `Rotten Tomatoes %` `Worldwide Gross`
##                <dbl>         <dbl>               <dbl> <chr>            
## 1                 70          1.75                  64 $41.94           
## 2                 52          1.09                  68 $19.62           
## 3                 35          1.21                  43 $26.66           
## 4                 44          0                     15 $43.04           
## 5                 72          6.27                  28 $219.37          
## 6                 72          3.08                  60 $117.09          
##   release_year
##          <dbl>
## 1         2008
## 2         2010
## 3         2010
## 4         2010
## 5         2008
## 6         2011

2) select(): Keep only movie_title, release_year, Genre, Profitability

# Keep only the four columns of interest, in the order listed.
two <- select(one, movie_title, release_year, Genre, Profitability)

# Preview the first six rows of the narrowed tibble.
head(two, n = 6)
## # A tibble: 6 × 4
##   movie_title                        release_year Genre   Profitability
##   <chr>                                     <dbl> <chr>           <dbl>
## 1 Zack and Miri Make a Porno                 2008 Romance          1.75
## 2 Youth in Revolt                            2010 Comedy           1.09
## 3 You Will Meet a Tall Dark Stranger         2010 Comedy           1.21
## 4 When in Rome                               2010 Comedy           0   
## 5 What Happens in Vegas                      2008 Comedy           6.27
## 6 Water For Elephants                        2011 Drama            3.08

3) filter(): Movies after 2000 and Rotten Tomatoes % > 80

(Filter from ‘one’ because ‘two’ doesn’t contain Rotten Tomatoes %.)

# Keep rows that satisfy BOTH conditions: released after 2000 and a
# Rotten Tomatoes score above 80. Starts from `one`, which still has the
# `Rotten Tomatoes %` column.
three <- filter(one, release_year > 2000 & `Rotten Tomatoes %` > 80)

# Preview the first six matching rows.
head(three, n = 6)
## # A tibble: 6 × 8
##   movie_title            Genre     `Lead Studio`         `Audience score %`
##   <chr>                  <chr>     <chr>                              <dbl>
## 1 WALL-E                 Animation Disney                                89
## 2 Waitress               Romance   Independent                           67
## 3 Tangled                Animation Disney                                88
## 4 Rachel Getting Married Drama     Independent                           61
## 5 My Week with Marilyn   Drama     The Weinstein Company                 84
## 6 Midnight in Paris      Romence   Sony                                  84
##   Profitability `Rotten Tomatoes %` `Worldwide Gross` release_year
##           <dbl>               <dbl> <chr>                    <dbl>
## 1         2.90                   96 $521.28                   2008
## 2        11.1                    89 $22.18                    2007
## 3         1.37                   89 $355.01                   2010
## 4         1.38                   85 $16.61                    2008
## 5         0.826                  83 $8.26                     2011
## 6         8.74                   93 $148.66                   2011

4) mutate(): Add Profitability_millions (convert Profitability to millions)

# Add Profitability_millions as Profitability multiplied by one million;
# the existing columns are left untouched.
four <- mutate(three, Profitability_millions = Profitability * 1e6)

# Preview the first six rows, now with the extra column.
head(four, n = 6)
## # A tibble: 6 × 9
##   movie_title            Genre     `Lead Studio`         `Audience score %`
##   <chr>                  <chr>     <chr>                              <dbl>
## 1 WALL-E                 Animation Disney                                89
## 2 Waitress               Romance   Independent                           67
## 3 Tangled                Animation Disney                                88
## 4 Rachel Getting Married Drama     Independent                           61
## 5 My Week with Marilyn   Drama     The Weinstein Company                 84
## 6 Midnight in Paris      Romence   Sony                                  84
##   Profitability `Rotten Tomatoes %` `Worldwide Gross` release_year
##           <dbl>               <dbl> <chr>                    <dbl>
## 1         2.90                   96 $521.28                   2008
## 2        11.1                    89 $22.18                    2007
## 3         1.37                   89 $355.01                   2010
## 4         1.38                   85 $16.61                    2008
## 5         0.826                  83 $8.26                     2011
## 6         8.74                   93 $148.66                   2011
##   Profitability_millions
##                    <dbl>
## 1               2896019.
## 2              11089742.
## 3               1365692.
## 4               1384167.
## 5                825800 
## 6               8744706.

5) arrange(): Sort by Rotten Tomatoes % desc, then Profitability_millions desc

# Sort by Rotten Tomatoes score, highest first; ties are broken by
# Profitability_millions, also highest first.
five <- arrange(
  four,
  desc(`Rotten Tomatoes %`),
  desc(Profitability_millions)
)

# Preview the six top-ranked rows.
head(five, n = 6)
## # A tibble: 6 × 9
##   movie_title       Genre     `Lead Studio` `Audience score %` Profitability
##   <chr>             <chr>     <chr>                      <dbl>         <dbl>
## 1 WALL-E            Animation Disney                        89          2.90
## 2 Midnight in Paris Romence   Sony                          84          8.74
## 3 Enchanted         Comedy    Disney                        80          4.01
## 4 Knocked Up        Comedy    Universal                     83          6.64
## 5 Waitress          Romance   Independent                   67         11.1 
## 6 A Serious Man     Drama     Universal                     64          4.38
##   `Rotten Tomatoes %` `Worldwide Gross` release_year Profitability_millions
##                 <dbl> <chr>                    <dbl>                  <dbl>
## 1                  96 $521.28                   2008               2896019.
## 2                  93 $148.66                   2011               8744706.
## 3                  93 $340.49                   2007               4005737.
## 4                  91 $219                      2007               6636402.
## 5                  89 $22.18                    2007              11089742.
## 6                  89 $30.68                    2009               4382857.

6) Combining functions: One pipeline from the original dataset

# Steps 1-5 as a single pipeline, starting from the raw `movies` tibble.
six <- movies %>%
  rename(
    movie_title = Film,
    release_year = Year
  ) %>%
  # `Rotten Tomatoes %` is kept because the filter and sort below use it.
  select(
    movie_title, release_year, Genre, Profitability, `Rotten Tomatoes %`
  ) %>%
  filter(release_year > 2000 & `Rotten Tomatoes %` > 80) %>%
  mutate(Profitability_millions = Profitability * 1e6) %>%
  arrange(
    desc(`Rotten Tomatoes %`),
    desc(Profitability_millions)
  )

# Preview the six top-ranked rows.
head(six, n = 6)
## # A tibble: 6 × 6
##   movie_title       release_year Genre     Profitability `Rotten Tomatoes %`
##   <chr>                    <dbl> <chr>             <dbl>               <dbl>
## 1 WALL-E                    2008 Animation          2.90                  96
## 2 Midnight in Paris         2011 Romence            8.74                  93
## 3 Enchanted                 2007 Comedy             4.01                  93
## 4 Knocked Up                2007 Comedy             6.64                  91
## 5 Waitress                  2007 Romance           11.1                   89
## 6 A Serious Man             2009 Drama              4.38                  89
##   Profitability_millions
##                    <dbl>
## 1               2896019.
## 2               8744706.
## 3               4005737.
## 4               6636402.
## 5              11089742.
## 6               4382857.

7) Interpret question 6

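Not necessarily. Even among movies with very high Rotten Tomatoes scores, profitability varies a lot: some highly rated movies are very profitable, while others are not. For example, WALL-E has the highest rating in the table (96) but a profitability of only 2.90, whereas Waitress is rated lower (89) yet has a profitability of 11.1. Higher ratings (“best”) therefore do not always mean higher profitability (“most popular”).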

EXTRA CREDIT: Average rating and Profitability_millions by Genre

# Per-genre averages of the Rotten Tomatoes score and of Profitability
# scaled by one million. One output row per distinct Genre value.
# NOTE(review): Genre is grouped exactly as spelled in the data, so
# misspelled labels (e.g. "Comdy") end up as their own groups.
extra <- movies %>%
  rename(
    movie_title = Film,
    release_year = Year
  ) %>%
  group_by(Genre) %>%
  summarise(
    avg_rating = mean(`Rotten Tomatoes %`),
    avg_profitability_millions = mean(Profitability * 1e6)
  )

# Preview the first six genre rows.
head(extra, n = 6)
## # A tibble: 6 × 3
##   Genre     avg_rating avg_profitability_millions
##   <chr>          <dbl>                      <dbl>
## 1 Action          11                     1245333.
## 2 Animation       74.2                   3759414.
## 3 Comdy           13                     2649068.
## 4 Comedy          42.7                   3776946.
## 5 Drama           51.5                   8407218.
## 6 Fantasy         73                     1783944.