library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
movies <- read_csv("https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv")
## `curl` package not installed, falling back to using `url()`
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Question 1

q1  <- movies %>%  
  rename(movie_title = Film , release_year = Year)

head(q1)
## # A tibble: 6 × 8
##   movie_title               Genre `Lead Studio` `Audience score %` Profitability
##   <chr>                     <chr> <chr>                      <dbl>         <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei…                 70          1.75
## 2 Youth in Revolt           Come… The Weinstei…                 52          1.09
## 3 You Will Meet a Tall Dar… Come… Independent                   35          1.21
## 4 When in Rome              Come… Disney                        44          0   
## 5 What Happens in Vegas     Come… Fox                           72          6.27
## 6 Water For Elephants       Drama 20th Century…                 72          3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

2. select(): (4 points)

Create a new dataframe with only the columns: movie_title, release_year, Genre, Profitability,

q2 <- q1 %>% 
  select(movie_title, release_year, Genre, Profitability)
print(head(q2))
## # A tibble: 6 × 4
##   movie_title                        release_year Genre   Profitability
##   <chr>                                     <dbl> <chr>           <dbl>
## 1 Zack and Miri Make a Porno                 2008 Romance          1.75
## 2 Youth in Revolt                            2010 Comedy           1.09
## 3 You Will Meet a Tall Dark Stranger         2010 Comedy           1.21
## 4 When in Rome                               2010 Comedy           0   
## 5 What Happens in Vegas                      2008 Comedy           6.27
## 6 Water For Elephants                        2011 Drama            3.08

3. filter(): (4 points)

Filter the dataset to include only movies released after 2000 with a Rotten Tomatoes % higher than 80.

q3 <- q1 %>%  
  filter(release_year > 2000 & `Rotten Tomatoes %`> 80)
print(head(q3))
## # A tibble: 6 × 8
##   movie_title            Genre    `Lead Studio` `Audience score %` Profitability
##   <chr>                  <chr>    <chr>                      <dbl>         <dbl>
## 1 WALL-E                 Animati… Disney                        89         2.90 
## 2 Waitress               Romance  Independent                   67        11.1  
## 3 Tangled                Animati… Disney                        88         1.37 
## 4 Rachel Getting Married Drama    Independent                   61         1.38 
## 5 My Week with Marilyn   Drama    The Weinstei…                 84         0.826
## 6 Midnight in Paris      Romence  Sony                          84         8.74 
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

4. mutate(): (4 points)

Add a new column called “Profitability_millions” that converts the Profitability to millions of dollars.

q4 <- q1 %>% 
  mutate(Profitability_millions = Profitability * 1,000,000)
head(q4)
## # A tibble: 6 × 10
##   movie_title               Genre `Lead Studio` `Audience score %` Profitability
##   <chr>                     <chr> <chr>                      <dbl>         <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei…                 70          1.75
## 2 Youth in Revolt           Come… The Weinstei…                 52          1.09
## 3 You Will Meet a Tall Dar… Come… Independent                   35          1.21
## 4 When in Rome              Come… Disney                        44          0   
## 5 What Happens in Vegas     Come… Fox                           72          6.27
## 6 Water For Elephants       Drama 20th Century…                 72          3.08
## # ℹ 5 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>, Profitability_millions <dbl>, `0` <dbl>

5. arrange(): (3 points)

Sort the filtered dataset by Rotten Tomatoes % in descending order, and then by Profitability in descending order.

q5 <- q1 %>%  
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability))
head(q5)
## # A tibble: 6 × 8
##   movie_title       Genre     `Lead Studio` `Audience score %` Profitability
##   <chr>             <chr>     <chr>                      <dbl>         <dbl>
## 1 WALL-E            Animation Disney                        89          2.90
## 2 Midnight in Paris Romence   Sony                          84          8.74
## 3 Enchanted         Comedy    Disney                        80          4.01
## 4 Knocked Up        Comedy    Universal                     83          6.64
## 5 Waitress          Romance   Independent                   67         11.1 
## 6 A Serious Man     Drama     Universal                     64          4.38
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

6. Combining functions: (3 points)

Use the pipe operator (%>%) to chain these operations together, starting with the original dataset and ending with a final dataframe that incorporates all the above transformations.

q6 <- movies %>%
  rename(movie_title = Film , release_year = Year) %>%
  select(movie_title, release_year, Genre, Profitability , `Rotten Tomatoes %`) %>%
  filter(release_year > 2000 & `Rotten Tomatoes %` > 80) %>%
  mutate(Profitability_millions = Profitability * 1000000) %>%
  arrange(desc('Rotten Tomatoes %'), desc(Profitability))

head(q6)
## # A tibble: 6 × 6
##   movie_title          release_year Genre   Profitability `Rotten Tomatoes %`
##   <chr>                       <dbl> <chr>           <dbl>               <dbl>
## 1 Waitress                     2007 Romance         11.1                   89
## 2 Midnight in Paris            2011 Romence          8.74                  93
## 3 (500) Days of Summer         2009 comedy           8.10                  87
## 4 Knocked Up                   2007 Comedy           6.64                  91
## 5 Beginners                    2011 Comedy           4.47                  84
## 6 A Serious Man                2009 Drama            4.38                  89
## # ℹ 1 more variable: Profitability_millions <dbl>

7. Interpret question 6 (1 point)

From the resulting data, are the best movies the most popular? There is some correlation between the best movies and most popular but that is not always the case.

EXTRA CREDIT (4 points)

Create a summary dataframe that shows the average rating and Profitability_millions for movies by Genre. Hint: You’ll need to use group_by() and summarize().

summary_df <- movies %>%
  group_by(Genre) %>%
  summarize(
    Average_Rating = mean(`Rotten Tomatoes %`, na.rm = TRUE),
    Average_Profitability = mean(Profitability, na.rm = TRUE)
  )
print(summary_df)
## # A tibble: 10 × 3
##    Genre     Average_Rating Average_Profitability
##    <chr>              <dbl>                 <dbl>
##  1 Action              11                   1.25 
##  2 Animation           74.2                 3.76 
##  3 Comdy               13                   2.65 
##  4 Comedy              42.7                 3.78 
##  5 Drama               51.5                 8.41 
##  6 Fantasy             73                   1.78 
##  7 Romance             42.1                 3.98 
##  8 Romence             93                   8.74 
##  9 comedy              87                   8.10 
## 10 romance             54                   0.653