library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)

# Load the movies dataset
movies <- read_csv ("https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv") 
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Question 1

q1 <- movies %>%
  rename( movie_title = Film , release_year = Year)

head(q1)
## # A tibble: 6 × 8
##   movie_title               Genre `Lead Studio` `Audience score %` Profitability
##   <chr>                     <chr> <chr>                      <dbl>         <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei…                 70          1.75
## 2 Youth in Revolt           Come… The Weinstei…                 52          1.09
## 3 You Will Meet a Tall Dar… Come… Independent                   35          1.21
## 4 When in Rome              Come… Disney                        44          0   
## 5 What Happens in Vegas     Come… Fox                           72          6.27
## 6 Water For Elephants       Drama 20th Century…                 72          3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

Question 2

q2 <- q1 %>% 
  select (movie_title, release_year, Genre, Profitability) 

head(q2)
## # A tibble: 6 × 4
##   movie_title                        release_year Genre   Profitability
##   <chr>                                     <dbl> <chr>           <dbl>
## 1 Zack and Miri Make a Porno                 2008 Romance          1.75
## 2 Youth in Revolt                            2010 Comedy           1.09
## 3 You Will Meet a Tall Dark Stranger         2010 Comedy           1.21
## 4 When in Rome                               2010 Comedy           0   
## 5 What Happens in Vegas                      2008 Comedy           6.27
## 6 Water For Elephants                        2011 Drama            3.08

Question 3

q3 <- q1 %>% 
  select (movie_title, release_year, Genre, Profitability , `Rotten Tomatoes %`)  %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80)

head(q3)
## # A tibble: 6 × 5
##   movie_title            release_year Genre    Profitability `Rotten Tomatoes %`
##   <chr>                         <dbl> <chr>            <dbl>               <dbl>
## 1 WALL-E                         2008 Animati…         2.90                   96
## 2 Waitress                       2007 Romance         11.1                    89
## 3 Tangled                        2010 Animati…         1.37                   89
## 4 Rachel Getting Married         2008 Drama            1.38                   85
## 5 My Week with Marilyn           2011 Drama            0.826                  83
## 6 Midnight in Paris              2011 Romence          8.74                   93

Question 4

q4 <- q3 %>%
  mutate(Profitability_millions = Profitability / 1e6)

head(q4)
## # A tibble: 6 × 6
##   movie_title            release_year Genre    Profitability `Rotten Tomatoes %`
##   <chr>                         <dbl> <chr>            <dbl>               <dbl>
## 1 WALL-E                         2008 Animati…         2.90                   96
## 2 Waitress                       2007 Romance         11.1                    89
## 3 Tangled                        2010 Animati…         1.37                   89
## 4 Rachel Getting Married         2008 Drama            1.38                   85
## 5 My Week with Marilyn           2011 Drama            0.826                  83
## 6 Midnight in Paris              2011 Romence          8.74                   93
## # ℹ 1 more variable: Profitability_millions <dbl>

Question 5

q5 <- q4 %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))

head(q5)
## # A tibble: 6 × 6
##   movie_title       release_year Genre     Profitability `Rotten Tomatoes %`
##   <chr>                    <dbl> <chr>             <dbl>               <dbl>
## 1 WALL-E                    2008 Animation          2.90                  96
## 2 Midnight in Paris         2011 Romence            8.74                  93
## 3 Enchanted                 2007 Comedy             4.01                  93
## 4 Knocked Up                2007 Comedy             6.64                  91
## 5 Waitress                  2007 Romance           11.1                   89
## 6 A Serious Man             2009 Drama              4.38                  89
## # ℹ 1 more variable: Profitability_millions <dbl>

Question 6

final <- movies %>% #then
  rename(movie_title = Film, release_year = Year) %>% 
  select (movie_title, release_year, Genre, Profitability , `Rotten Tomatoes %`) %>%
  filter(release_year > 2000, 'Rotten Tomatoes %' > 80) %>%
  mutate(profitability_millions = Profitability / 1e6) %>%
  arrange(desc(`Rotten Tomatoes %`), desc(profitability_millions))

head(final)
## # A tibble: 6 × 6
##   movie_title       release_year Genre     Profitability `Rotten Tomatoes %`
##   <chr>                    <dbl> <chr>             <dbl>               <dbl>
## 1 WALL-E                    2008 Animation          2.90                  96
## 2 Midnight in Paris         2011 Romence            8.74                  93
## 3 Enchanted                 2007 Comedy             4.01                  93
## 4 Knocked Up                2007 Comedy             6.64                  91
## 5 Waitress                  2007 Romance           11.1                   89
## 6 A Serious Man             2009 Drama              4.38                  89
## # ℹ 1 more variable: profitability_millions <dbl>

Question 7

From the data, the best movies are not necessarily the most popular because the ones that were most profitable do not have the best Rotten Tomato score.

Extra Credit

library(dplyr)
library(tidyr)
library(stringr)

# Use `final` if it exists; otherwise recreate a safe df from movies
if (exists("final")) {
  df <- final
} else {
  df <- movies %>%
    rename(movie_title = Film, release_year = Year) %>%
    select(movie_title, release_year, Genre, Profitability, `Rotten Tomatoes %`) %>%
    filter(release_year > 2000, `Rotten Tomatoes %` > 80)
}

# Remove exact duplicate rows (if any)
df <- df %>% distinct()

# Ensure Profitability_millions exists
if (!"Profitability_millions" %in% names(df)) {
  df <- df %>% mutate(Profitability_millions = Profitability / 1e6)
}

# Split multi-genre strings into separate rows, clean + standardize genres
df_clean <- df %>%
  mutate(Genre = str_trim(Genre)) %>%
  separate_rows(Genre, sep = ",\\s*") %>%
  filter(!is.na(Genre)) %>%
  mutate(
    # Normalize case
    Genre = str_to_title(Genre),
    # Collapse common comedy typos/variants into "Comedy"
    Genre = str_replace_all(Genre, "Comdey|Comdy|Comd", "Comedy")
  ) %>%
  # Remove Romance (case-insensitive)
  filter(!str_detect(Genre, regex("romance", ignore_case = TRUE))) %>%
  # Drop duplicates of the same movie-genre combo
  distinct(movie_title, Genre, .keep_all = TRUE)

# Group and summarize
summary_by_genre <- df_clean %>%
  group_by(Genre) %>%
  summarize(
    avg_rating = mean(`Rotten Tomatoes %`, na.rm = TRUE),
    avg_profitability_millions = mean(Profitability_millions, na.rm = TRUE),
    n_movies = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(avg_rating))

# Show first 6 rows
head(summary_by_genre)
## # A tibble: 6 × 4
##   Genre     avg_rating avg_profitability_millions n_movies
##   <chr>          <dbl>                      <dbl>    <int>
## 1 Romence         93                   0.00000874        1
## 2 Animation       80.3                 0.00000322        3
## 3 Fantasy         73                   0.00000178        1
## 4 Drama           51.5                 0.00000841       13
## 5 Comedy          42.8                 0.00000372       42
## 6 Action          11                   0.00000125        1