Setup

# Load tidyverse package
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load stringi package
library(stringi)
# Read the movies CSV from the public gist into a tibble named `movies`
movies <- read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Normalise misspelled or lower-case Genre labels. With vectorize_all = FALSE
# every pattern/replacement pair is applied, in order, to each Genre value.
movies[["Genre"]] <- stri_replace_all_regex(
  str = movies[["Genre"]],
  pattern = c("Comdy", "comedy", "Romence", "romance"),
  replacement = c("Comedy", "Comedy", "Romance", "Romance"),
  vectorize_all = FALSE
)

Question 1

# Give the Film and Year columns the names movie_title and release_year
renamed_movies <- rename(movies, movie_title = Film, release_year = Year)
# Show the first 6 rows
head(renamed_movies, n = 6)
## # A tibble: 6 × 8
##   movie_title               Genre `Lead Studio` `Audience score %` Profitability
##   <chr>                     <chr> <chr>                      <dbl>         <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei…                 70          1.75
## 2 Youth in Revolt           Come… The Weinstei…                 52          1.09
## 3 You Will Meet a Tall Dar… Come… Independent                   35          1.21
## 4 When in Rome              Come… Disney                        44          0   
## 5 What Happens in Vegas     Come… Fox                           72          6.27
## 6 Water For Elephants       Drama 20th Century…                 72          3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

Question 2

# Keep only the five columns used by the later questions: movie_title,
# release_year, Genre, Profitability, and Rotten Tomatoes %
selected_movies <- renamed_movies %>%
  select(movie_title, release_year, Genre, Profitability, `Rotten Tomatoes %`)
# Show the first 6 rows
head(selected_movies, n = 6)
## # A tibble: 6 × 5
##   movie_title               release_year Genre Profitability `Rotten Tomatoes %`
##   <chr>                            <dbl> <chr>         <dbl>               <dbl>
## 1 Zack and Miri Make a Por…         2008 Roma…          1.75                  64
## 2 Youth in Revolt                   2010 Come…          1.09                  68
## 3 You Will Meet a Tall Dar…         2010 Come…          1.21                  43
## 4 When in Rome                      2010 Come…          0                     15
## 5 What Happens in Vegas             2008 Come…          6.27                  28
## 6 Water For Elephants               2011 Drama          3.08                  60

Question 3

# Keep only movies released after 2000 whose Rotten Tomatoes % is above 80.
# These are the same conditions as the filter() step of the combined pipeline
# in Question 6. The column selection from Question 2 is not repeated here;
# selected_movies is reused as the input and is left unchanged.
# NOTE(review): this chunk previously duplicated the Question 2 select();
# a filter step is presumed to be what Question 3 asks for -- confirm
# against the assignment text.
filtered_movies <- selected_movies %>%
  filter(
    release_year > 2000,
    `Rotten Tomatoes %` > 80
  )
# Print first 6 rows of data
head(filtered_movies)
# NOTE(review): any rendered output shown after this chunk predates the
# filter() step and must be regenerated by re-running the document.
## # A tibble: 6 × 5
##   movie_title               release_year Genre Profitability `Rotten Tomatoes %`
##   <chr>                            <dbl> <chr>         <dbl>               <dbl>
## 1 Zack and Miri Make a Por…         2008 Roma…          1.75                  64
## 2 Youth in Revolt                   2010 Come…          1.09                  68
## 3 You Will Meet a Tall Dar…         2010 Come…          1.21                  43
## 4 When in Rome                      2010 Come…          0                     15
## 5 What Happens in Vegas             2008 Come…          6.27                  28
## 6 Water For Elephants               2011 Drama          3.08                  60

Question 4

# Append Profitability_millions, computed as Profitability multiplied by 1e6
mutated_movies <- mutate(
  selected_movies,
  Profitability_millions = Profitability * 1e6
)
# Show the first 6 rows
head(mutated_movies, n = 6)
## # A tibble: 6 × 6
##   movie_title               release_year Genre Profitability `Rotten Tomatoes %`
##   <chr>                            <dbl> <chr>         <dbl>               <dbl>
## 1 Zack and Miri Make a Por…         2008 Roma…          1.75                  64
## 2 Youth in Revolt                   2010 Come…          1.09                  68
## 3 You Will Meet a Tall Dar…         2010 Come…          1.21                  43
## 4 When in Rome                      2010 Come…          0                     15
## 5 What Happens in Vegas             2008 Come…          6.27                  28
## 6 Water For Elephants               2011 Drama          3.08                  60
## # ℹ 1 more variable: Profitability_millions <dbl>

Question 5

# Order rows from highest to lowest Rotten Tomatoes %, breaking ties by
# Profitability_millions (also highest first)
arranged_movies <- arrange(
  mutated_movies,
  desc(`Rotten Tomatoes %`),
  desc(Profitability_millions)
)
# Show the first 6 rows
head(arranged_movies, n = 6)
## # A tibble: 6 × 6
##   movie_title       release_year Genre     Profitability `Rotten Tomatoes %`
##   <chr>                    <dbl> <chr>             <dbl>               <dbl>
## 1 WALL-E                    2008 Animation          2.90                  96
## 2 Midnight in Paris         2011 Romance            8.74                  93
## 3 Enchanted                 2007 Comedy             4.01                  93
## 4 Knocked Up                2007 Comedy             6.64                  91
## 5 Waitress                  2007 Romance           11.1                   89
## 6 A Serious Man             2009 Drama              4.38                  89
## # ℹ 1 more variable: Profitability_millions <dbl>

Question 6

# Single pipeline: rename + select, filter, mutate, then arrange
combined_movies <- movies %>%
  # select() renames Film and Year while picking the five columns to keep
  select(
    movie_title = Film,
    release_year = Year,
    Genre,
    Profitability,
    `Rotten Tomatoes %`
  ) %>%
  # Keep movies released after 2000 with a Rotten Tomatoes % above 80
  filter(release_year > 2000 & `Rotten Tomatoes %` > 80) %>%
  mutate(Profitability_millions = Profitability * 1e6) %>%
  # Highest rating first; ties broken by highest Profitability_millions
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))
# Show the first 6 rows
head(combined_movies, n = 6)
## # A tibble: 6 × 6
##   movie_title       release_year Genre     Profitability `Rotten Tomatoes %`
##   <chr>                    <dbl> <chr>             <dbl>               <dbl>
## 1 WALL-E                    2008 Animation          2.90                  96
## 2 Midnight in Paris         2011 Romance            8.74                  93
## 3 Enchanted                 2007 Comedy             4.01                  93
## 4 Knocked Up                2007 Comedy             6.64                  91
## 5 Waitress                  2007 Romance           11.1                   89
## 6 A Serious Man             2009 Drama              4.38                  89
## # ℹ 1 more variable: Profitability_millions <dbl>

Question 7

# Rebuild selected_movies with the movie_title, release_year, Genre,
# Profitability, and Rotten Tomatoes % columns of renamed_movies
# NOTE(review): this repeats the Question 2 selection verbatim, and the
# summary below cites a p-value that no code in this section computes --
# confirm whether a statistical test was meant to be run here.
selected_movies <- select(
  renamed_movies,
  movie_title,
  release_year,
  Genre,
  Profitability,
  `Rotten Tomatoes %`
)
# Show the first 6 rows
head(selected_movies, n = 6)
## # A tibble: 6 × 5
##   movie_title               release_year Genre Profitability `Rotten Tomatoes %`
##   <chr>                            <dbl> <chr>         <dbl>               <dbl>
## 1 Zack and Miri Make a Por…         2008 Roma…          1.75                  64
## 2 Youth in Revolt                   2010 Come…          1.09                  68
## 3 You Will Meet a Tall Dar…         2010 Come…          1.21                  43
## 4 When in Rome                      2010 Come…          0                     15
## 5 What Happens in Vegas             2008 Come…          6.27                  28
## 6 Water For Elephants               2011 Drama          3.08                  60
# Summary: The highest-rated movies are not always the most financially successful. There is no statistically significant relationship between Profitability_millions and Rotten Tomatoes %, since the p-value of 0.2761 exceeds the standard significance threshold of 0.05.

Extra Credit

# Per-Genre summary: mean Rotten Tomatoes % (average_rating) and mean of
# Profitability scaled by 1e6 (average_Profitability_millions)
summary_movies <- movies %>%
  group_by(Genre) %>%
  summarise(
    average_rating = mean(`Rotten Tomatoes %`),
    average_Profitability_millions = mean(Profitability * 1e6)
  )
# Show the first 6 rows
head(summary_movies, n = 6)
## # A tibble: 6 × 3
##   Genre     average_rating average_Profitability_millions
##   <chr>              <dbl>                          <dbl>
## 1 Action              11                         1245333.
## 2 Animation           74.2                       3759414.
## 3 Comedy              43.0                       3851160.
## 4 Drama               51.5                       8407218.
## 5 Fantasy             73                         1783944.
## 6 Romance             46.3                       4079972.