Assignment 3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)

# Load the movies dataset
movies <- read_csv("https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv")

## Rows: 77 Columns: 8

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1. rename(): (4 points)

Rename the “Film” column to “movie_title” and “Year” to “release_year”.

q1 <- movies %>% 
  rename(movie_title = Film,
         release_year = Year)

head(q1 , 6)

## # A tibble: 6 × 8
##   movie_title               Genre `Lead Studio` `Audience score %` Profitability
##   <chr>                     <chr> <chr>                      <dbl>         <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei…                 70          1.75
## 2 Youth in Revolt           Come… The Weinstei…                 52          1.09
## 3 You Will Meet a Tall Dar… Come… Independent                   35          1.21
## 4 When in Rome              Come… Disney                        44          0   
## 5 What Happens in Vegas     Come… Fox                           72          6.27
## 6 Water For Elephants       Drama 20th Century…                 72          3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

2. select(): (4 points)

Create a new dataframe with only the columns: movie_title, release_year, Genre, Profitability,

q2 <- q1 %>%
  select(movie_title, release_year, Genre, Profitability, `Rotten Tomatoes %`, `Audience score %`)

head(q2, 6)

## # A tibble: 6 × 6
##   movie_title               release_year Genre Profitability `Rotten Tomatoes %`
##   <chr>                            <dbl> <chr>         <dbl>               <dbl>
## 1 Zack and Miri Make a Por…         2008 Roma…          1.75                  64
## 2 Youth in Revolt                   2010 Come…          1.09                  68
## 3 You Will Meet a Tall Dar…         2010 Come…          1.21                  43
## 4 When in Rome                      2010 Come…          0                     15
## 5 What Happens in Vegas             2008 Come…          6.27                  28
## 6 Water For Elephants               2011 Drama          3.08                  60
## # ℹ 1 more variable: `Audience score %` <dbl>

I included Rotten Tomatoes % and Audience score % as well since they are needed for the filter in Q3 and for the popularity comparison in Q7.

3. filter(): (4 points)

Filter the dataset to include only movies released after 2000 with a Rotten Tomatoes % higher than 80.

q3 <- q2 %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80)

head(q3 , 6)

## # A tibble: 6 × 6
##   movie_title            release_year Genre    Profitability `Rotten Tomatoes %`
##   <chr>                         <dbl> <chr>            <dbl>               <dbl>
## 1 WALL-E                         2008 Animati…         2.90                   96
## 2 Waitress                       2007 Romance         11.1                    89
## 3 Tangled                        2010 Animati…         1.37                   89
## 4 Rachel Getting Married         2008 Drama            1.38                   85
## 5 My Week with Marilyn           2011 Drama            0.826                  83
## 6 Midnight in Paris              2011 Romence          8.74                   93
## # ℹ 1 more variable: `Audience score %` <dbl>

4. mutate(): (4 points)

Add a new column called “Profitability_millions” that converts the Profitability to millions of dollars.

q4 <- q3 %>%
  mutate(Profitability_millions = Profitability)

head(q4 , 6)

## # A tibble: 6 × 7
##   movie_title            release_year Genre    Profitability `Rotten Tomatoes %`
##   <chr>                         <dbl> <chr>            <dbl>               <dbl>
## 1 WALL-E                         2008 Animati…         2.90                   96
## 2 Waitress                       2007 Romance         11.1                    89
## 3 Tangled                        2010 Animati…         1.37                   89
## 4 Rachel Getting Married         2008 Drama            1.38                   85
## 5 My Week with Marilyn           2011 Drama            0.826                  83
## 6 Midnight in Paris              2011 Romence          8.74                   93
## # ℹ 2 more variables: `Audience score %` <dbl>, Profitability_millions <dbl>

5. arrange(): (3 points)

Sort the filtered dataset by Rotten Tomatoes % in descending order, and then by Profitability in descending order. five <- four %>% arrange(desc(Rotten Tomatoes %) , desc(Profitability_millions))

q5 <- q4 %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))

head(q5 , 6)

## # A tibble: 6 × 7
##   movie_title       release_year Genre     Profitability `Rotten Tomatoes %`
##   <chr>                    <dbl> <chr>             <dbl>               <dbl>
## 1 WALL-E                    2008 Animation          2.90                  96
## 2 Midnight in Paris         2011 Romence            8.74                  93
## 3 Enchanted                 2007 Comedy             4.01                  93
## 4 Knocked Up                2007 Comedy             6.64                  91
## 5 Waitress                  2007 Romance           11.1                   89
## 6 A Serious Man             2009 Drama              4.38                  89
## # ℹ 2 more variables: `Audience score %` <dbl>, Profitability_millions <dbl>

6. Combining functions: (3 points)

Use the pipe operator (%>%) to chain these operations together, starting with the original dataset and ending with a final dataframe that incorporates all the above transformations.

q6 <- movies %>%
  rename(movie_title = Film,
         release_year = Year) %>%
  select(movie_title, release_year, Genre, Profitability, `Rotten Tomatoes %`, `Audience score %`) %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80) %>%
  mutate(Profitability_millions = Profitability) %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))

head(q6 , 6)

## # A tibble: 6 × 7
##   movie_title       release_year Genre     Profitability `Rotten Tomatoes %`
##   <chr>                    <dbl> <chr>             <dbl>               <dbl>
## 1 WALL-E                    2008 Animation          2.90                  96
## 2 Midnight in Paris         2011 Romence            8.74                  93
## 3 Enchanted                 2007 Comedy             4.01                  93
## 4 Knocked Up                2007 Comedy             6.64                  91
## 5 Waitress                  2007 Romance           11.1                   89
## 6 A Serious Man             2009 Drama              4.38                  89
## # ℹ 2 more variables: `Audience score %` <dbl>, Profitability_millions <dbl>

7. Interpret question 6 (1 point)

From the resulting data, are the best movies the most popular?

From the resulting data, the best movies based on Rotten Tomatoes are not always the most popular. While some highly rated movies like WALL-E also have strong audience scores, others such as A Serious Man have high critic ratings but much lower audience scores. This shows that critical success does not automatically mean audience popularity.

EXTRA CREDIT (4 points)

Create a summary dataframe that shows the average rating and Profitability_millions for movies by Genre. Hint: You’ll need to use group_by() and summarize().

extra <- q6 %>%
  mutate(Genre = tolower(Genre),
         Genre = ifelse(Genre == "romence", "romance", Genre)) %>%
  group_by(Genre) %>%
  summarize(
    avg_audience_score = mean(`Audience score %`, na.rm = TRUE),
    avg_profitability_millions = mean(Profitability_millions, na.rm = TRUE),
    number_of_movies = n()
  ) %>%
  arrange(desc(avg_profitability_millions))

head(extra , 6)

## # A tibble: 4 × 4
##   Genre     avg_audience_score avg_profitability_millions number_of_movies
##   <chr>                  <dbl>                      <dbl>            <int>
## 1 romance                 76                         6.61                3
## 2 comedy                  81                         5.80                4
## 3 drama                   69.7                       2.20                3
## 4 animation               88.5                       2.13                2