library(dplyr)
library(readr)
# Load the movies dataset from the hosted CSV; the result is a tibble.
movies <- read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)
# Q1: rename Film -> movie_title and Year -> release_year.
# All other columns are carried through unchanged.
q1 <- rename(movies, movie_title = Film, release_year = Year)
# Preview the first six rows.
print(head(q1, n = 6))
## # A tibble: 6 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei… 70 1.75
## 2 Youth in Revolt Come… The Weinstei… 52 1.09
## 3 You Will Meet a Tall Dar… Come… Independent 35 1.21
## 4 When in Rome Come… Disney 44 0
## 5 What Happens in Vegas Come… Fox 72 6.27
## 6 Water For Elephants Drama 20th Century… 72 3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
# Q2: narrow q1 down to title, release year, genre and profitability.
q2 <- select(q1, movie_title, release_year, Genre, Profitability)
# Preview the first six rows.
print(head(q2, n = 6))
## # A tibble: 6 × 4
## movie_title release_year Genre Profitability
## <chr> <dbl> <chr> <dbl>
## 1 Zack and Miri Make a Porno 2008 Romance 1.75
## 2 Youth in Revolt 2010 Comedy 1.09
## 3 You Will Meet a Tall Dark Stranger 2010 Comedy 1.21
## 4 When in Rome 2010 Comedy 0
## 5 What Happens in Vegas 2008 Comedy 6.27
## 6 Water For Elephants 2011 Drama 3.08
# Q3: keep films released after 2000 whose audience score is above 80.
# Comma-separated conditions in filter() are combined with AND.
q3 <- q1 %>%
  filter(
    release_year > 2000,
    `Audience score %` > 80
  )
# Preview the first six rows.
print(head(q3, n = 6))
## # A tibble: 6 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 WALL-E Anim… Disney 89 2.90
## 2 Twilight Roma… Summit 82 10.2
## 3 The Curious Case of Benj… Fant… Warner Bros. 81 1.78
## 4 Tangled Anim… Disney 88 1.37
## 5 Sex and the City Come… Warner Bros. 81 7.22
## 6 P.S. I Love You Roma… Independent 82 5.10
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
# Q4: add Profitability_in_Millions, i.e. Profitability multiplied by 1e6.
# NOTE(review): the column name says "in millions" but the value is scaled
# UP by one million -- confirm the intended unit of Profitability.
q4 <- mutate(q3, Profitability_in_Millions = Profitability * 1e6)
# Preview the first six rows.
print(head(q4, n = 6))
## # A tibble: 6 × 9
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 WALL-E Anim… Disney 89 2.90
## 2 Twilight Roma… Summit 82 10.2
## 3 The Curious Case of Benj… Fant… Warner Bros. 81 1.78
## 4 Tangled Anim… Disney 88 1.37
## 5 Sex and the City Come… Warner Bros. 81 7.22
## 6 P.S. I Love You Roma… Independent 82 5.10
## # ℹ 4 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>, Profitability_in_Millions <dbl>
# Q5: order by audience score (highest first); ties are broken by the
# scaled profitability, also highest first.
q5 <- arrange(
  q4,
  desc(`Audience score %`),
  desc(Profitability_in_Millions)
)
# Preview the first six rows.
print(head(q5, n = 6))
## # A tibble: 6 × 9
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 WALL-E Animation Disney 89 2.90
## 2 A Dangerous Method Drama Independent 89 0.449
## 3 Tangled Animation Disney 88 1.37
## 4 Midnight in Paris Romence Sony 84 8.74
## 5 My Week with Marilyn Drama The Weinstein… 84 0.826
## 6 Across the Universe romance Independent 84 0.653
## # ℹ 4 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>, Profitability_in_Millions <dbl>
# Q6: rename, select, filter, mutate and arrange as a single pipeline that
# starts from the raw data. select() renames and narrows the columns in one
# call, producing them in the order listed.
q6 <- movies %>%
  select(
    movie_title = Film,
    release_year = Year,
    Genre,
    Profitability,
    audience_score = `Audience score %`
  ) %>%
  # Released after 2000 AND audience score above 80.
  filter(release_year > 2000, audience_score > 80) %>%
  mutate(Profitability_in_Millions = Profitability * 1e6) %>%
  # Best-rated first; ties broken by scaled profitability, highest first.
  arrange(desc(audience_score), desc(Profitability_in_Millions))
# Preview the first six rows.
print(head(q6, n = 6))
## # A tibble: 6 × 6
## movie_title release_year Genre Profitability audience_score
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 WALL-E 2008 Animation 2.90 89
## 2 A Dangerous Method 2011 Drama 0.449 89
## 3 Tangled 2010 Animation 1.37 88
## 4 Midnight in Paris 2011 Romence 8.74 84
## 5 My Week with Marilyn 2011 Drama 0.826 84
## 6 Across the Universe 2007 romance 0.653 84
## # ℹ 1 more variable: Profitability_in_Millions <dbl>
From this data, the best movies (based on audience score) are not necessarily the most popular (based on profitability). For instance, Twilight, which has high profitability, does not have the highest audience score, while a movie like WALL-E, which has one of the highest audience scores, is less profitable.
# Extra credit: one row per Genre value in q6, holding the mean audience
# score and the mean scaled profitability of that genre's films.
# NOTE(review): Genre is grouped exactly as stored, so labels that differ
# only in case or spelling (e.g. "Romance", "Romence", "romance") end up in
# separate groups -- consider normalizing Genre before grouping.
extra_credit <- q6 %>%
  group_by(Genre) %>%
  summarise(
    avg_audience_score = mean(audience_score),
    avg_profitability_millions = mean(Profitability_in_Millions)
  )
print(extra_credit)
## # A tibble: 8 × 3
## Genre avg_audience_score avg_profitability_millions
## <chr> <dbl> <dbl>
## 1 Animation 88.5 2130856.
## 2 Comedy 82 6929099.
## 3 Drama 86.5 637222.
## 4 Fantasy 81 1783944.
## 5 Romance 82 7641572.
## 6 Romence 84 8744706.
## 7 comedy 81 8096000
## 8 romance 84 652603.