library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
# Read the movies CSV from the gist URL into `movies`
movies <- read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rename the “Film” column to “movie_title” and “Year” to “release_year”.
# Give the Film and Year columns their new names; all other columns are kept
q1 <- rename(movies, movie_title = Film, release_year = Year)
head(q1)
## # A tibble: 6 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei… 70 1.75
## 2 Youth in Revolt Come… The Weinstei… 52 1.09
## 3 You Will Meet a Tall Dar… Come… Independent 35 1.21
## 4 When in Rome Come… Disney 44 0
## 5 What Happens in Vegas Come… Fox 72 6.27
## 6 Water For Elephants Drama 20th Century… 72 3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
Create a new dataframe with only the columns: movie_title, release_year, Genre, and Profitability.
# Keep only the four requested columns. Film and Year are renamed inside
# select() so the result carries the requested names movie_title and
# release_year instead of the raw Film/Year headers.
q2 <- movies %>%
  select(movie_title = Film, release_year = Year, Genre, Profitability)
head(q2)
## # A tibble: 6 × 4
## movie_title release_year Genre Profitability
## <chr> <dbl> <chr> <dbl>
## 1 Zack and Miri Make a Porno 2008 Romance 1.75
## 2 Youth in Revolt 2010 Comedy 1.09
## 3 You Will Meet a Tall Dark Stranger 2010 Comedy 1.21
## 4 When in Rome 2010 Comedy 0
## 5 What Happens in Vegas 2008 Comedy 6.27
## 6 Water For Elephants 2011 Drama 3.08
Filter the dataset to include only movies released after 2000 with a Rotten Tomatoes % higher than 80.
# Keep rows that satisfy both conditions: released after 2000 and a
# Rotten Tomatoes score above 80 (comma-separated filters are AND-ed).
q3 <- movies %>%
  filter(
    Year > 2000,
    `Rotten Tomatoes %` > 80
  )
head(q3)
## # A tibble: 6 × 8
## Film Genre `Lead Studio` `Audience score %` Profitability `Rotten Tomatoes %`
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 WALL… Anim… Disney 89 2.90 96
## 2 Wait… Roma… Independent 67 11.1 89
## 3 Tang… Anim… Disney 88 1.37 89
## 4 Rach… Drama Independent 61 1.38 85
## 5 My W… Drama The Weinstei… 84 0.826 83
## 6 Midn… Rome… Sony 84 8.74 93
## # ℹ 2 more variables: `Worldwide Gross` <chr>, Year <dbl>
Add a new column called “Profitability_millions” that converts the Profitability to millions of dollars.
# Append Profitability_millions: Profitability scaled by one million
q4 <- movies %>%
  mutate(Profitability_millions = 1e6 * Profitability)
head(q4)
## # A tibble: 6 × 9
## Film Genre `Lead Studio` `Audience score %` Profitability `Rotten Tomatoes %`
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Zack… Roma… The Weinstei… 70 1.75 64
## 2 Yout… Come… The Weinstei… 52 1.09 68
## 3 You … Come… Independent 35 1.21 43
## 4 When… Come… Disney 44 0 15
## 5 What… Come… Fox 72 6.27 28
## 6 Wate… Drama 20th Century… 72 3.08 60
## # ℹ 3 more variables: `Worldwide Gross` <chr>, Year <dbl>,
## # Profitability_millions <dbl>
Sort the filtered dataset by Rotten Tomatoes % in descending order, and then by Profitability in descending order, e.g. `five <- four %>% arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))`.
# Sort the filtered movies (q3: released after 2000, Rotten Tomatoes % > 80)
# by Rotten Tomatoes % descending, breaking ties by Profitability descending.
# Starts from q3 rather than the raw `movies` table because the task asks for
# the filtered dataset to be sorted.
q5 <- q3 %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability))
head(q5)
## # A tibble: 6 × 8
## Film Genre `Lead Studio` `Audience score %` Profitability `Rotten Tomatoes %`
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 WALL… Anim… Disney 89 2.90 96
## 2 Midn… Rome… Sony 84 8.74 93
## 3 Ench… Come… Disney 80 4.01 93
## 4 Knoc… Come… Universal 83 6.64 91
## 5 Wait… Roma… Independent 67 11.1 89
## 6 A Se… Drama Universal 64 4.38 89
## # ℹ 2 more variables: `Worldwide Gross` <chr>, Year <dbl>
Use the pipe operator (%>%) to chain these operations together, starting with the original dataset and ending with a final dataframe that incorporates all the above transformations.
# Full pipeline from the raw table, applying every step above in one chain:
# rename -> filter -> select -> mutate -> arrange. The result is stored as q6
# so the final dataframe can be reused; only the top 5 rows are printed.
q6 <- movies %>%
  rename(movie_title = Film, release_year = Year) %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80) %>%
  # `Rotten Tomatoes %` is kept alongside the four requested columns because
  # the arrange() step below sorts on it.
  select(
    movie_title, release_year, Genre, Profitability, `Rotten Tomatoes %`
  ) %>%
  mutate(Profitability_millions = Profitability * 1000000) %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability))
head(q6, 5)
## # A tibble: 5 × 6
## Film Year Genre Profitability `Rotten Tomatoes %` Profitability_millions
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 WALL-E 2008 Anim… 2.90 96 2896019.
## 2 Midnight… 2011 Rome… 8.74 93 8744706.
## 3 Enchanted 2007 Come… 4.01 93 4005737.
## 4 Knocked … 2007 Come… 6.64 91 6636402.
## 5 Waitress 2007 Roma… 11.1 89 11089742.
From the resulting data, are the best movies the most popular?
From the resulting data, the best movies are not the most popular movies, because not all movies with high Rotten Tomatoes scores also had high profitability. There is not a strong correlation between the best and the most popular movies. For example, WALL-E had the highest Rotten Tomatoes score, making it the best movie, but it only made about 2.9 million dollars. However, Fireproof made 66 million dollars but was only the 43rd best movie.