library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)

# Read the movies CSV directly from the pinned gist revision into a tibble.
movies <- read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)

## Rows: 77 Columns: 8

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1. rename(): (4 points)

Rename the “Film” column to “movie_title” and “Year” to “release_year”.

# Q1: give the title and year columns snake_case names; every other column
# is carried over untouched.
q1 <- rename(movies, movie_title = Film, release_year = Year)

head(q1, n = 6)

## # A tibble: 6 × 8
##   movie_title               Genre `Lead Studio` `Audience score %` Profitability
##   <chr>                     <chr> <chr>                      <dbl>         <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei…                 70          1.75
## 2 Youth in Revolt           Come… The Weinstei…                 52          1.09
## 3 You Will Meet a Tall Dar… Come… Independent                   35          1.21
## 4 When in Rome              Come… Disney                        44          0   
## 5 What Happens in Vegas     Come… Fox                           72          6.27
## 6 Water For Elephants       Drama 20th Century…                 72          3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

2. select(): (4 points)

Create a new data frame with only the columns: movie_title, release_year, Genre, Profitability.

# Q2: reduce q1 to the four columns requested, in this order.
q2 <- select(q1, movie_title, release_year, Genre, Profitability)

head(q2, n = 6)

## # A tibble: 6 × 4
##   movie_title                        release_year Genre   Profitability
##   <chr>                                     <dbl> <chr>           <dbl>
## 1 Zack and Miri Make a Porno                 2008 Romance          1.75
## 2 Youth in Revolt                            2010 Comedy           1.09
## 3 You Will Meet a Tall Dark Stranger         2010 Comedy           1.21
## 4 When in Rome                               2010 Comedy           0   
## 5 What Happens in Vegas                      2008 Comedy           6.27
## 6 Water For Elephants                        2011 Drama            3.08

3. filter(): (4 points)

Filter the data set to include only movies released after 2000 with a Rotten Tomatoes % higher than 80.

# Q3: keep movies released after 2000 whose Rotten Tomatoes % exceeds 80.
#
# The rating column has a non-syntactic name, so it must be written in
# backticks. In single quotes it is a plain string literal, and the comparison
# 'Rotten Tomatoes %' > 80 is then TRUE for every row, so no rating filter is
# applied at all.
#
# q2 has already dropped `Rotten Tomatoes %`, so the filter runs on q1 (which
# still carries every column) and the result is then reduced to the same four
# columns as q2.
#
# NOTE: the printed output that follows in this document was produced before
# this fix; re-knit to refresh it.
q3 <- q1 %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80) %>%
  select(movie_title, release_year, Genre, Profitability)

head(q3)

## # A tibble: 6 × 4
##   movie_title                        release_year Genre   Profitability
##   <chr>                                     <dbl> <chr>           <dbl>
## 1 Zack and Miri Make a Porno                 2008 Romance          1.75
## 2 Youth in Revolt                            2010 Comedy           1.09
## 3 You Will Meet a Tall Dark Stranger         2010 Comedy           1.21
## 4 When in Rome                               2010 Comedy           0   
## 5 What Happens in Vegas                      2008 Comedy           6.27
## 6 Water For Elephants                        2011 Drama            3.08

4. mutate(): (4 points)

Add a new column called “Profitability_millions” that converts the Profitability to millions of dollars.

# Q4: append Profitability_millions, i.e. Profitability scaled down by one
# million.
q4 <- mutate(q3, Profitability_millions = Profitability / 1e6)

head(q4, n = 6)

## # A tibble: 6 × 5
##   movie_title            release_year Genre Profitability Profitability_millions
##   <chr>                         <dbl> <chr>         <dbl>                  <dbl>
## 1 Zack and Miri Make a …         2008 Roma…          1.75             0.00000175
## 2 Youth in Revolt                2010 Come…          1.09             0.00000109
## 3 You Will Meet a Tall …         2010 Come…          1.21             0.00000121
## 4 When in Rome                   2010 Come…          0                0         
## 5 What Happens in Vegas          2008 Come…          6.27             0.00000627
## 6 Water For Elephants            2011 Drama          3.08             0.00000308

5. arrange(): (3 points)

Sort the filtered data set by Rotten Tomatoes % in descending order, and then by Profitability in descending order. For example: ``five <- four %>% arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))``

# Q5: sort the filtered movies by Rotten Tomatoes % (descending), breaking
# ties by Profitability_millions (descending).
#
# The previous version sorted on desc('Audience score %'): a quoted string is
# a constant, not a column, so it contributed nothing to the ordering, and it
# named the wrong column anyway (the task asks for Rotten Tomatoes %).
#
# q4 no longer contains `Rotten Tomatoes %`, so the pipeline is rebuilt from
# q1: filter and sort while the rating column is still present, then reduce to
# the same five columns q4 exposes.
#
# NOTE: the printed output that follows in this document was produced before
# this fix; re-knit to refresh it.
q5 <- q1 %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80) %>%
  mutate(Profitability_millions = Profitability / 1e6) %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions)) %>%
  select(
    movie_title, release_year, Genre, Profitability, Profitability_millions
  )

head(q5)

## # A tibble: 6 × 5
##   movie_title            release_year Genre Profitability Profitability_millions
##   <chr>                         <dbl> <chr>         <dbl>                  <dbl>
## 1 Fireproof                      2008 Drama         66.9              0.0000669 
## 2 High School Musical 3…         2008 Come…         22.9              0.0000229 
## 3 The Twilight Saga: Ne…         2009 Drama         14.2              0.0000142 
## 4 Waitress                       2007 Roma…         11.1              0.0000111 
## 5 Twilight                       2008 Roma…         10.2              0.0000102 
## 6 Mamma Mia!                     2008 Come…          9.23             0.00000923

6. Combining functions: (3 points)

Use the pipe operator (%>%) to chain these operations together, starting with the original dataset and ending with a final dataframe that incorporates all the above transformations.

# Q6: the whole workflow as a single pipe, starting from the raw data.
#
# Fixes relative to the previous version:
#   * Column names with spaces are written in backticks. In single quotes they
#     are string literals, so the rating filter was TRUE for every row and the
#     first sort key was a constant.
#   * The filter and the sort use `Rotten Tomatoes %`, as the task specifies,
#     rather than the audience score.
#   * select() runs last, because the rating column is needed by filter() and
#     arrange() and would otherwise be gone by then.
#   * Profitability_millions is Profitability / 1e6, matching Q4 (this chain
#     previously multiplied by 1e6 instead).
#
# NOTE: the printed output that follows in this document was produced before
# this fix; re-knit to refresh it.
movies %>%
  rename(
    movie_title = Film,
    release_year = Year
  ) %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80) %>%
  mutate(Profitability_millions = Profitability / 1e6) %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions)) %>%
  select(
    movie_title, release_year, Genre, Profitability, Profitability_millions
  ) %>%
  head(5)

## # A tibble: 5 × 5
##   movie_title            release_year Genre Profitability Profitability_millions
##   <chr>                         <dbl> <chr>         <dbl>                  <dbl>
## 1 Fireproof                      2008 Drama          66.9              66934000 
## 2 High School Musical 3…         2008 Come…          22.9              22913136.
## 3 The Twilight Saga: Ne…         2009 Drama          14.2              14196400 
## 4 Waitress                       2007 Roma…          11.1              11089742.
## 5 Twilight                       2008 Roma…          10.2              10180027.

7. Interpret question 6 (1 point)

From the resulting data, are the best movies the most popular?

The best movies are not the most popular, as shown by Fireproof, which is the most profitable (and, by that measure, the most popular) movie in the table but doesn’t have the highest Rotten Tomatoes score.

EXTRA CREDIT (4 points)

Create a summary dataframe that shows the average rating and Profitability_millions for movies by Genre. Hint: You’ll need to use group_by() and summarize().

library(dplyr)

# Extra credit: average audience score and average Profitability_millions
# per Genre.

# Give the audience-score column a syntactic name. any_of() turns the rename
# into a no-op when the column has already been renamed, so this chunk can be
# re-run in the same session without erroring on a missing column.
movies <- movies %>%
  rename(any_of(c(Audience_score = "Audience score %")))

EX <- movies %>%
  mutate(
    # "Comdy" appears as its own group in the printed summary; fold it into
    # "Comedy" so the genre is not split across two rows.
    # NOTE(review): only the first six genres are visible in that summary;
    # check count(movies, Genre) for further spelling or case variants.
    Genre = if_else(Genre == "Comdy", "Comedy", Genre),
    Profitability_millions = Profitability / 1e6
  ) %>%
  group_by(Genre) %>%
  summarize(
    Avg_Rating = mean(Audience_score, na.rm = TRUE),
    Avg_Profitability_millions = mean(Profitability_millions, na.rm = TRUE)
  )

head(EX)

## # A tibble: 6 × 3
##   Genre     Avg_Rating Avg_Profitability_millions
##   <chr>          <dbl>                      <dbl>
## 1 Action          45                   0.00000125
## 2 Animation       70.2                 0.00000376
## 3 Comdy           61                   0.00000265
## 4 Comedy          61.0                 0.00000378
## 5 Drama           67.2                 0.00000841
## 6 Fantasy         81                   0.00000178

Assignment 3

2025-02-10

1. rename(): (4 points)

Rename the “Film” column to “movie_title” and “Year” to “release_year”.

2. select(): (4 points)

Create a new data frame with only the columns: movie_title, release_year, Genre, Profitability.

3. filter(): (4 points)

Filter the data set to include only movies released after 2000 with a Rotten Tomatoes % higher than 80.

4. mutate(): (4 points)

Add a new column called “Profitability_millions” that converts the Profitability to millions of dollars.

5. arrange(): (3 points)

Sort the filtered data set by Rotten Tomatoes % in descending order, and then by Profitability in descending order. For example: ``five <- four %>% arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))``

6. Combining functions: (3 points)

Use the pipe operator (%>%) to chain these operations together, starting with the original dataset and ending with a final dataframe that incorporates all the above transformations.

7. Interpret question 6 (1 point)

From the resulting data, are the best movies the most popular?

The best movies are not the most popular, as shown by Fireproof, which is the most profitable (and, by that measure, the most popular) movie in the table but doesn’t have the highest Rotten Tomatoes score.

EXTRA CREDIT (4 points)

Create a summary dataframe that shows the average rating and Profitability_millions for movies by Genre. Hint: You’ll need to use group_by() and summarize().