library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
# Read the movies CSV from the public gist into a tibble named `movies`.
movies <- read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Step 1: give the title and year columns snake_case names
# (Film -> movie_title, Year -> release_year); all other columns are kept.
q1 <- rename(movies, movie_title = Film, release_year = Year)
print(head(q1, n = 6))
## # A tibble: 6 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei… 70 1.75
## 2 Youth in Revolt Come… The Weinstei… 52 1.09
## 3 You Will Meet a Tall Dar… Come… Independent 35 1.21
## 4 When in Rome Come… Disney 44 0
## 5 What Happens in Vegas Come… Fox 72 6.27
## 6 Water For Elephants Drama 20th Century… 72 3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
# NOTE: the tibbles that used to be echoed after each print() in this section
# were produced by an earlier version of these steps; re-run the script to
# regenerate them.

# Step 2: create a new dataframe with only the columns movie_title,
# release_year, Genre, Profitability and `Rotten Tomatoes %`.
# The rating column must be carried along because the filter (step 3) and the
# sort (step 5) both use it.
q2 <- q1 %>%
  select(
    movie_title,
    release_year,
    Genre,
    Profitability,
    `Rotten Tomatoes %`
  )
print(head(q2))

# Step 3: keep movies released after 2000 whose Rotten Tomatoes rating is
# above 80.
# The column name needs backticks: in double quotes it is a string constant,
# so the comparison against 80 never looks at the column and filters nothing.
q3 <- q2 %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80)
print(head(q3))

# Step 4: add Profitability_millions = Profitability / 1e6.
# NOTE(review): Profitability values in this data are small (e.g. 1.75), so
# confirm the column is really in dollars before relying on this scaling.
q4 <- q3 %>%
  mutate(Profitability_millions = Profitability / 1e6)
print(head(q4))

# Step 5: sort by rating (highest first), breaking ties by profitability.
# Backticks again: desc() of a quoted string is a constant and sorts nothing.
q5 <- q4 %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))
print(head(q5))
## # A tibble: 6 × 5
## movie_title release_year Genre Profitability Profitability_millions
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 Fireproof 2008 Drama 66.9 0.0000669
## 2 High School Musical 3… 2008 Come… 22.9 0.0000229
## 3 The Twilight Saga: Ne… 2009 Drama 14.2 0.0000142
## 4 Waitress 2007 Roma… 11.1 0.0000111
## 5 Twilight 2008 Roma… 10.2 0.0000102
## 6 Mamma Mia! 2008 Come… 9.23 0.00000923
# Steps 1-5 combined into a single pipeline:
#   rename -> select -> filter -> mutate -> arrange.
# `Rotten Tomatoes %` is kept by select() because filter() and arrange() need
# it, and it is written with backticks so dplyr reads the column. In double
# quotes it is a string constant: the filter then keeps every row and the
# sort falls back to profitability alone.
final_data <- movies %>%
  rename(movie_title = Film, release_year = Year) %>%
  select(
    movie_title,
    release_year,
    Genre,
    Profitability,
    `Rotten Tomatoes %`
  ) %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80) %>%
  # NOTE(review): Profitability values are small (e.g. 1.75); confirm the
  # column is in dollars before relying on the division by one million.
  mutate(Profitability_millions = Profitability / 1e6) %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))
print(head(final_data))
## # A tibble: 6 × 5
## movie_title release_year Genre Profitability Profitability_millions
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 Fireproof 2008 Drama 66.9 0.0000669
## 2 High School Musical 3… 2008 Come… 22.9 0.0000229
## 3 The Twilight Saga: Ne… 2009 Drama 14.2 0.0000142
## 4 Waitress 2007 Roma… 11.1 0.0000111
## 5 Twilight 2008 Roma… 10.2 0.0000102
## 6 Mamma Mia! 2008 Come… 9.23 0.00000923
# No. The best movies (those with the highest Rotten Tomatoes ratings) are not
# always the most popular, if popularity is measured by profitability. Some
# films are both loved by critics and very profitable, but many highly rated
# films are not among the most profitable. Popularity does not always match
# quality.
# Average Rotten Tomatoes rating and average profitability (divided by one
# million) per genre.
# The raw Genre column has case variants and typos ("comedy", "romance",
# "Comdy", "Romence") that would otherwise form their own one-off groups, so
# the labels are normalised before grouping.
summary <- movies %>%
  mutate(
    # Lower-case and trim so "Comedy" and "comedy" compare equal.
    Genre = tolower(trimws(Genre)),
    # Map the known misspellings onto the intended genre; NA stays NA.
    Genre = case_when(
      Genre == "comdy" ~ "comedy",
      Genre == "romence" ~ "romance",
      TRUE ~ Genre
    ),
    # Restore the capitalised label style used by the clean rows.
    Genre = sub("^([a-z])", "\\U\\1", Genre, perl = TRUE)
  ) %>%
  group_by(Genre) %>%
  summarize(
    avg_rating = mean(`Rotten Tomatoes %`, na.rm = TRUE),
    avg_profitability_millions = mean(Profitability / 1e6, na.rm = TRUE)
  )
print(summary)
## # A tibble: 10 × 3
## Genre avg_rating avg_profitability_millions
## <chr> <dbl> <dbl>
## 1 Action 11 0.00000125
## 2 Animation 74.2 0.00000376
## 3 Comdy 13 0.00000265
## 4 Comedy 42.7 0.00000378
## 5 Drama 51.5 0.00000841
## 6 Fantasy 73 0.00000178
## 7 Romance 42.1 0.00000398
## 8 Romence 93 0.00000874
## 9 comedy 87 0.00000810
## 10 romance 54 0.000000653