library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)


# Load the movies dataset (CSV hosted as a GitHub gist) into a tibble.
movies <- read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)

## Rows: 77 Columns: 8

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1. rename(): (4 points)

Rename the “Film” column to “movie_title” and “Year” to “release_year”.

# Q1: give the title and year columns descriptive snake_case names.
q1 <- rename(
  movies,
  movie_title = Film,
  release_year = Year
)

# Preview the first rows of the renamed data.
head(q1)

2. select(): (4 points)

Create a new dataframe with only the columns: movie_title, release_year, Genre, Profitability.

# Q2: keep only the title, year, genre and profitability columns of q1.
q2 <- select(q1, movie_title, release_year, Genre, Profitability)

# Preview the first rows of the reduced data.
head(q2)

3. filter(): (4 points)

Filter the dataset to include only movies released after 2000 with a Rotten Tomatoes % higher than 80.

# Q3: movies released after 2000 whose Rotten Tomatoes score is above 80.
#
# `Rotten Tomatoes %` is a non-syntactic column name (spaces and a percent
# sign), so it must be wrapped in backticks to refer to the column. Written in
# quotes it is just a character constant, and comparing that constant with 80
# yields the same single value for every row, so the rating is never tested.
q3 <- q1 %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80)

# Preview the first rows of the filtered data.
head(q3)

4. mutate(): (4 points)

Add a new column called “Profitability_millions” that converts the Profitability to millions of dollars.

# Q4: add Profitability_millions, i.e. Profitability scaled by 1e6.
q4 <- mutate(q3, Profitability_millions = Profitability * 1e6)

# Preview the first rows, including the new column.
head(q4)

5. arrange(): (3 points)

Sort the filtered dataset by Rotten Tomatoes % in descending order, and then by Profitability in descending order.

(Earlier draft of the answer: five <- four %>% arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions)))

# Q5: order by Rotten Tomatoes score (highest first), breaking ties by
# Profitability_millions (highest first).
#
# The rating column needs backticks because its name is non-syntactic. A
# quoted name is a character constant, and sorting by a constant does not
# order the rows by rating at all.
q5 <- q4 %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))

# Preview the top-ranked rows.
head(q5)

6. Combining functions: (3 points)

Use the pipe operator (%>%) to chain these operations together, starting with the original dataset and ending with a final dataframe that incorporates all the above transformations.

# Q6: the full pipeline from the raw data: rename -> select -> filter ->
# mutate -> arrange.
#
# `Rotten Tomatoes %` is written with backticks everywhere so that it refers
# to the column. In filter() and arrange() a quoted name is only a character
# constant, which would leave the rating unfiltered and the rows unsorted by
# rating.
q6 <- movies %>%
  rename(movie_title = Film, release_year = Year) %>%
  select(
    movie_title, release_year, Genre, Profitability, `Rotten Tomatoes %`
  ) %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80) %>%
  mutate(Profitability_millions = Profitability * 1e6) %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))

# Preview the top-ranked rows of the final data frame.
head(q6)

7. Interpret question 6 (1 point)

From the resulting data, are the best movies the most popular?

The most profitable movies are not necessarily the most popular, as seen from the data. When the data are arranged to show the most profitable movies, those movies do not always have the highest Rotten Tomatoes ratings. In conclusion, the movies with the highest gross are not the ones with the highest Rotten Tomatoes ratings.

# Q7 is a written interpretation of the Q6 result. No q7 object is assigned in
# this script, so head(q7) would fail with "object 'q7' not found"; preview q6,
# the data being interpreted, instead.
head(q6)

EXTRA CREDIT (4 points)

Create a summary dataframe that shows the average rating and Profitability_millions for movies by Genre. Hint: You’ll need to use group_by() and summarize().

# Extra credit: per-genre averages of the Rotten Tomatoes rating and of
# Profitability_millions.
summary_df <- movies %>%
  # `movies` only has the raw Profitability column, so derive
  # Profitability_millions here with the same scaling used in Q4. This keeps
  # the summary column name consistent with the values it averages.
  mutate(Profitability_millions = Profitability * 1e6) %>%
  group_by(Genre) %>%
  summarize(
    Avg_Rating = mean(`Rotten Tomatoes %`, na.rm = TRUE),
    Avg_Profitability_millions = mean(Profitability_millions, na.rm = TRUE)
  )

# Show the per-genre summary table.
print(summary_df)

Assignment 3

Jake Memoli

2025-02-10