#JACOB STOUGHTON AND JAKUB KEPA - Data 3210
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
# Read the movies dataset (a CSV hosted as a GitHub gist) into `movies`.
movies <- readr::read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Question 1. rename() (4 points)
# Rename the "Film" column to "movie_title" and "Year" to "release_year".
q1 <- rename(movies, movie_title = Film, release_year = Year)
# Print only the first six rows to confirm the new column names.
print(head(q1, n = 6L))
## # A tibble: 6 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei… 70 1.75
## 2 Youth in Revolt Come… The Weinstei… 52 1.09
## 3 You Will Meet a Tall Dar… Come… Independent 35 1.21
## 4 When in Rome Come… Disney 44 0
## 5 What Happens in Vegas Come… Fox 72 6.27
## 6 Water For Elephants Drama 20th Century… 72 3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
# Question 2. select() (4 points)
# Create a new data frame with only the columns movie_title, release_year,
# Genre and Profitability (built from the renamed data in q1).
q2 <- select(q1, movie_title, release_year, Genre, Profitability)
print(q2)
## # A tibble: 77 × 4
## movie_title release_year Genre Profitability
## <chr> <dbl> <chr> <dbl>
## 1 Zack and Miri Make a Porno 2008 Romance 1.75
## 2 Youth in Revolt 2010 Comedy 1.09
## 3 You Will Meet a Tall Dark Stranger 2010 Comedy 1.21
## 4 When in Rome 2010 Comedy 0
## 5 What Happens in Vegas 2008 Comedy 6.27
## 6 Water For Elephants 2011 Drama 3.08
## 7 WALL-E 2008 Animation 2.90
## 8 Waitress 2007 Romance 11.1
## 9 Waiting For Forever 2011 Romance 0.005
## 10 Valentine's Day 2010 Comedy 4.18
## # ℹ 67 more rows
# Question 3. filter() (4 points)
# Keep only movies released after 2000 whose Rotten Tomatoes % is above 80.
# Both conditions must hold for a row to be retained.
q3 <- filter(q1, release_year > 2000 & `Rotten Tomatoes %` > 80)
print(q3)
## # A tibble: 12 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 WALL-E Animat… Disney 89 2.90
## 2 Waitress Romance Independent 67 11.1
## 3 Tangled Animat… Disney 88 1.37
## 4 Rachel Getting Married Drama Independent 61 1.38
## 5 My Week with Marilyn Drama The Weinstei… 84 0.826
## 6 Midnight in Paris Romence Sony 84 8.74
## 7 Knocked Up Comedy Universal 83 6.64
## 8 Jane Eyre Romance Universal 77 0
## 9 Enchanted Comedy Disney 80 4.01
## 10 Beginners Comedy Independent 80 4.47
## 11 A Serious Man Drama Universal 64 4.38
## 12 (500) Days of Summer comedy Fox 81 8.10
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
# Question 4. mutate() (4 points)
# Add a "Profitability_millions" column that converts Profitability to
# millions of dollars (applied to the filtered movies in q3).
# NOTE(review): Profitability prints as small values (e.g. 2.90, 11.1), so
# dividing by 1e6 gives near-zero results -- confirm the column's units.
q4 <- mutate(q3, Profitability_millions = Profitability / 1e6)
print(q4)
## # A tibble: 12 × 9
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 WALL-E Animat… Disney 89 2.90
## 2 Waitress Romance Independent 67 11.1
## 3 Tangled Animat… Disney 88 1.37
## 4 Rachel Getting Married Drama Independent 61 1.38
## 5 My Week with Marilyn Drama The Weinstei… 84 0.826
## 6 Midnight in Paris Romence Sony 84 8.74
## 7 Knocked Up Comedy Universal 83 6.64
## 8 Jane Eyre Romance Universal 77 0
## 9 Enchanted Comedy Disney 80 4.01
## 10 Beginners Comedy Independent 80 4.47
## 11 A Serious Man Drama Universal 64 4.38
## 12 (500) Days of Summer comedy Fox 81 8.10
## # ℹ 4 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>, Profitability_millions <dbl>
# Question 5. arrange() (3 points)
# Sort the filtered dataset (q3) by Rotten Tomatoes % in descending order,
# then by Profitability in descending order to order rows with equal scores.
q5 <- arrange(q3, desc(`Rotten Tomatoes %`), desc(Profitability))
print(q5)
## # A tibble: 12 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 WALL-E Animat… Disney 89 2.90
## 2 Midnight in Paris Romence Sony 84 8.74
## 3 Enchanted Comedy Disney 80 4.01
## 4 Knocked Up Comedy Universal 83 6.64
## 5 Waitress Romance Independent 67 11.1
## 6 A Serious Man Drama Universal 64 4.38
## 7 Tangled Animat… Disney 88 1.37
## 8 (500) Days of Summer comedy Fox 81 8.10
## 9 Rachel Getting Married Drama Independent 61 1.38
## 10 Jane Eyre Romance Universal 77 0
## 11 Beginners Comedy Independent 80 4.47
## 12 My Week with Marilyn Drama The Weinstei… 84 0.826
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
# Question 6. Combining functions (3 points)
# Use the pipe operator (%>%) to chain the operations from Questions 1-5,
# starting with the original dataset and ending with one final data frame
# that incorporates all of those transformations.
# `Rotten Tomatoes %` is kept by select() because the later filter() and
# arrange() steps need it.
# NOTE: the printed output pasted below predates the mutate() step in this
# chain; re-knit the document to refresh it.
q6 <- movies %>%
  rename(movie_title = Film, release_year = Year) %>%
  select(
    movie_title, release_year, Genre, Profitability, `Rotten Tomatoes %`
  ) %>%
  filter(`Rotten Tomatoes %` > 80, release_year > 2000) %>%
  # Question 4's transformation, previously missing from the chain.
  mutate(Profitability_millions = Profitability / 1e6) %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability))
print(q6)
## # A tibble: 12 × 5
## movie_title release_year Genre Profitability `Rotten Tomatoes %`
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 WALL-E 2008 Animat… 2.90 96
## 2 Midnight in Paris 2011 Romence 8.74 93
## 3 Enchanted 2007 Comedy 4.01 93
## 4 Knocked Up 2007 Comedy 6.64 91
## 5 Waitress 2007 Romance 11.1 89
## 6 A Serious Man 2009 Drama 4.38 89
## 7 Tangled 2010 Animat… 1.37 89
## 8 (500) Days of Summer 2009 comedy 8.10 87
## 9 Rachel Getting Married 2008 Drama 1.38 85
## 10 Jane Eyre 2011 Romance 0 85
## 11 Beginners 2011 Comedy 4.47 84
## 12 My Week with Marilyn 2011 Drama 0.826 83
# Question 7. Interpret question 6 (1 point)
# From the resulting data, are the best movies the most popular?
# From the data above, we can conclude that the "best" movies aren't always
# the most popular. The movie with both the highest audience score and the
# highest worldwide gross is WALL-E. However, the data show that 'Enchanted',
# a movie with a lower audience score (80%) than 'Midnight in Paris' (84%),
# made almost double the worldwide gross. Additionally, these two movies have
# identical Rotten Tomatoes scores (93%), yet one made over twice the box
# office money. While a majority of the higher-grossing movies also have
# higher audience scores, we can see from the data that the best movies are
# not always the most popular, although there is a correlation.
#EXTRA CREDIT (4 points) #Create a summary dataframe that shows the average rating and Profitability_millions for movies by Genre. Hint: You’ll need to use group_by() and summarize().
library(dplyr)
# Average audience score and Profitability_millions per Genre, computed on
# the filtered movies in q4.
# Genre contains spelling variants of the same category ("comedy" vs
# "Comedy", "Romence" vs "Romance"); they are normalised first so that each
# genre produces a single summary row instead of being split across groups.
# NOTE: the printed output pasted below predates this normalisation; re-knit
# the document to refresh it.
summary_df <- q4 %>%
  mutate(
    Genre = case_when(
      Genre == "comedy" ~ "Comedy",
      Genre == "Romence" ~ "Romance",
      TRUE ~ Genre
    )
  ) %>%
  group_by(Genre) %>%
  summarize(
    average_rating = mean(`Audience score %`, na.rm = TRUE),
    average_profitability = mean(Profitability_millions, na.rm = TRUE)
  )
print(summary_df)
## # A tibble: 6 × 3
## Genre average_rating average_profitability
## <chr> <dbl> <dbl>
## 1 Animation 88.5 0.00000213
## 2 Comedy 81 0.00000504
## 3 Drama 69.7 0.00000220
## 4 Romance 72 0.00000554
## 5 Romence 84 0.00000874
## 6 comedy 81 0.00000810