library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
# Read the movies CSV from the pinned gist revision into a tibble.
movies <- readr::read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Copy of `movies` with the title and year columns renamed to snake_case.
movies_two <- rename(movies, movie_title = Film, release_year = Year)
print(head(movies_two, n = 6))
## # A tibble: 6 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei… 70 1.75
## 2 Youth in Revolt Come… The Weinstei… 52 1.09
## 3 You Will Meet a Tall Dar… Come… Independent 35 1.21
## 4 When in Rome Come… Disney 44 0
## 5 What Happens in Vegas Come… Fox 72 6.27
## 6 Water For Elephants Drama 20th Century… 72 3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
# Keep only the title, year, genre and profitability columns.
popular_movie <- select(
  movies_two,
  movie_title,
  release_year,
  Genre,
  Profitability
)
print(head(popular_movie, n = 6))
## # A tibble: 6 × 4
## movie_title release_year Genre Profitability
## <chr> <dbl> <chr> <dbl>
## 1 Zack and Miri Make a Porno 2008 Romance 1.75
## 2 Youth in Revolt 2010 Comedy 1.09
## 3 You Will Meet a Tall Dark Stranger 2010 Comedy 1.21
## 4 When in Rome 2010 Comedy 0
## 5 What Happens in Vegas 2008 Comedy 6.27
## 6 Water For Elephants 2011 Drama 3.08
# Rows released after 2000 whose Rotten Tomatoes score is above 80.
new_popular_movies <- filter(
  movies,
  Year > 2000 & `Rotten Tomatoes %` > 80
)
print(head(new_popular_movies, n = 6))
## # A tibble: 6 × 8
## Film Genre `Lead Studio` `Audience score %` Profitability `Rotten Tomatoes %`
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 WALL… Anim… Disney 89 2.90 96
## 2 Wait… Roma… Independent 67 11.1 89
## 3 Tang… Anim… Disney 88 1.37 89
## 4 Rach… Drama Independent 61 1.38 85
## 5 My W… Drama The Weinstei… 84 0.826 83
## 6 Midn… Rome… Sony 84 8.74 93
## # ℹ 2 more variables: `Worldwide Gross` <chr>, Year <dbl>
# Add a column holding Profitability multiplied by one million.
# Note: the resulting object shares its name with the new column.
Profitability_millions <- mutate(
  new_popular_movies,
  Profitability_millions = Profitability * 1e6
)
print(head(Profitability_millions, n = 6))
## # A tibble: 6 × 9
## Film Genre `Lead Studio` `Audience score %` Profitability `Rotten Tomatoes %`
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 WALL… Anim… Disney 89 2.90 96
## 2 Wait… Roma… Independent 67 11.1 89
## 3 Tang… Anim… Disney 88 1.37 89
## 4 Rach… Drama Independent 61 1.38 85
## 5 My W… Drama The Weinstei… 84 0.826 83
## 6 Midn… Rome… Sony 84 8.74 93
## # ℹ 3 more variables: `Worldwide Gross` <chr>, Year <dbl>,
## # Profitability_millions <dbl>
# Order by Rotten Tomatoes score, highest first; ties are broken by the
# scaled profitability, also highest first.
sorted_movies <- arrange(
  Profitability_millions,
  desc(`Rotten Tomatoes %`),
  desc(Profitability_millions)
)
head(sorted_movies, n = 6)
## # A tibble: 6 × 9
## Film Genre `Lead Studio` `Audience score %` Profitability `Rotten Tomatoes %`
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 WALL… Anim… Disney 89 2.90 96
## 2 Midn… Rome… Sony 84 8.74 93
## 3 Ench… Come… Disney 80 4.01 93
## 4 Knoc… Come… Universal 83 6.64 91
## 5 Wait… Roma… Independent 67 11.1 89
## 6 A Se… Drama Universal 64 4.38 89
## # ℹ 3 more variables: `Worldwide Gross` <chr>, Year <dbl>,
## # Profitability_millions <dbl>
# Auto-print the original `movies` tibble. The steps above all assigned their
# results to new objects, so `movies` itself still has its original columns.
movies
## # A tibble: 77 × 8
## Film Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Zack and Miri Make a Po… Roma… The Weinstei… 70 1.75
## 2 Youth in Revolt Come… The Weinstei… 52 1.09
## 3 You Will Meet a Tall Da… Come… Independent 35 1.21
## 4 When in Rome Come… Disney 44 0
## 5 What Happens in Vegas Come… Fox 72 6.27
## 6 Water For Elephants Drama 20th Century… 72 3.08
## 7 WALL-E Anim… Disney 89 2.90
## 8 Waitress Roma… Independent 67 11.1
## 9 Waiting For Forever Roma… Independent 53 0.005
## 10 Valentine's Day Come… Warner Bros. 54 4.18
## # ℹ 67 more rows
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # Year <dbl>
# Single pipeline combining the earlier steps: rename, select, filter, add the
# scaled profitability column, then sort by rating and profitability.
transformations_movies <- movies %>%
  rename(movie_title = Film, release_year = Year) %>%
  select(
    movie_title, release_year, Genre, Profitability, `Rotten Tomatoes %`
  ) %>%
  filter(release_year > 2000, `Rotten Tomatoes %` > 80) %>%
  mutate(Profitability_millions = Profitability * 1000000) %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))

# Print the first six rows as a separate statement. Previously the pipeline
# ended in `%>% print(head(6))`, which made the tibble the first argument of
# print() and passed head(6) (i.e. 6) positionally as the next argument, the
# print width. The table was squeezed to one word per line and head() was
# never applied to the data.
print(head(transformations_movies, n = 6))
## # A
## # tibble:
## # 12
## # ×
## # 6
## # ℹ 6
## # more
## # variables:
## # movie_title <chr>,
## # release_year <dbl>,
## # Genre <chr>,
## # Profitability <dbl>, …
“Based on the data, the best movies (those with the highest Rotten Tomatoes scores) are not the most profitable.”
# Per-genre summary: mean audience score and mean profitability scaled by one
# million.
# `movies` has no `Profitability_millions` column, so that name previously
# resolved to the global tibble of the same name; mean() of a data frame
# returns NA with a warning, which made the whole column NA. The scaled value
# is now computed directly from the `Profitability` column.
# NOTE(review): Genre contains inconsistent labels (e.g. "Comdy", "Romence",
# "comedy", "romance"), so some genres are split across several groups.
# Consider normalising Genre before grouping.
summary_df <- movies %>%
  group_by(Genre) %>%
  summarize(
    Average_Rating = mean(`Audience score %`, na.rm = TRUE),
    Profitability_Millions = mean(Profitability * 1000000, na.rm = TRUE)
  )
## Warning: There were 10 warnings in `summarize()`.
## The first warning was:
## ℹ In argument: `Profitability_Millions = mean(Profitability_millions, na.rm =
## TRUE)`.
## ℹ In group 1: `Genre = "Action"`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 9 remaining warnings.
# Print the per-genre summary table built above.
print(summary_df)
## # A tibble: 10 × 3
## Genre Average_Rating Profitability_Millions
## <chr> <dbl> <dbl>
## 1 Action 45 NA
## 2 Animation 70.2 NA
## 3 Comdy 61 NA
## 4 Comedy 61.0 NA
## 5 Drama 67.2 NA
## 6 Fantasy 81 NA
## 7 Romance 62.8 NA
## 8 Romence 84 NA
## 9 comedy 81 NA
## 10 romance 84 NA