library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
# Download the movies CSV from the gist URL (which includes a revision hash)
# and parse it with readr.
movies <- read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename Film -> movie_title and Year -> release_year; every other column is
# kept as-is.
q1 <- rename(movies, movie_title = Film, release_year = Year)
# Preview the first six rows of the renamed data.
print(head(q1, n = 6))
## # A tibble: 6 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei… 70 1.75
## 2 Youth in Revolt Come… The Weinstei… 52 1.09
## 3 You Will Meet a Tall Dar… Come… Independent 35 1.21
## 4 When in Rome Come… Disney 44 0
## 5 What Happens in Vegas Come… Fox 72 6.27
## 6 Water For Elephants Drama 20th Century… 72 3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
- Create a new dataframe with only the columns movie_title, release_year, Genre, and Profitability.
# Keep only the title, release year, genre and profitability columns.
q2 <- select(q1, movie_title, release_year, Genre, Profitability)
# Preview the first six rows of the narrowed data.
print(head(q2, n = 6))
## # A tibble: 6 × 4
## movie_title release_year Genre Profitability
## <chr> <dbl> <chr> <dbl>
## 1 Zack and Miri Make a Porno 2008 Romance 1.75
## 2 Youth in Revolt 2010 Comedy 1.09
## 3 You Will Meet a Tall Dark Stranger 2010 Comedy 1.21
## 4 When in Rome 2010 Comedy 0
## 5 What Happens in Vegas 2008 Comedy 6.27
## 6 Water For Elephants 2011 Drama 3.08
# Keep movies released after 2000 whose Rotten Tomatoes score is above 80.
# Comma-separated conditions in filter() must all hold for a row to be kept.
q3 <- filter(q1, release_year > 2000, `Rotten Tomatoes %` > 80)
print(q3)
## # A tibble: 12 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 WALL-E Animat… Disney 89 2.90
## 2 Waitress Romance Independent 67 11.1
## 3 Tangled Animat… Disney 88 1.37
## 4 Rachel Getting Married Drama Independent 61 1.38
## 5 My Week with Marilyn Drama The Weinstei… 84 0.826
## 6 Midnight in Paris Romence Sony 84 8.74
## 7 Knocked Up Comedy Universal 83 6.64
## 8 Jane Eyre Romance Universal 77 0
## 9 Enchanted Comedy Disney 80 4.01
## 10 Beginners Comedy Independent 80 4.47
## 11 A Serious Man Drama Universal 64 4.38
## 12 (500) Days of Summer comedy Fox 81 8.10
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
# Add Profitability_millions: the Profitability value scaled by one million.
q4 <- mutate(q1, Profitability_millions = Profitability * 1e6)
# Show the original and scaled values side by side.
print(q4 %>% select(movie_title, Profitability, Profitability_millions))
## # A tibble: 77 × 3
## movie_title Profitability Profitability_millions
## <chr> <dbl> <dbl>
## 1 Zack and Miri Make a Porno 1.75 1747542.
## 2 Youth in Revolt 1.09 1090000
## 3 You Will Meet a Tall Dark Stranger 1.21 1211818.
## 4 When in Rome 0 0
## 5 What Happens in Vegas 6.27 6267647.
## 6 Water For Elephants 3.08 3081421.
## 7 WALL-E 2.90 2896019.
## 8 Waitress 11.1 11089742.
## 9 Waiting For Forever 0.005 5000
## 10 Valentine's Day 4.18 4184038.
## # ℹ 67 more rows
# Sort by Rotten Tomatoes score, highest first; movies with the same score
# are ordered by Profitability, highest first.
q5 <- q1 %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability))
# Print explicitly, like the earlier previews, so the result is also shown
# when the script is run through source(), which does not auto-print.
print(head(q5))
## # A tibble: 6 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 WALL-E Animation Disney 89 2.90
## 2 Midnight in Paris Romence Sony 84 8.74
## 3 Enchanted Comedy Disney 80 4.01
## 4 Knocked Up Comedy Universal 83 6.64
## 5 Waitress Romance Independent 67 11.1
## 6 A Serious Man Drama Universal 64 4.38
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
# Single pipeline combining the previous steps on the raw data:
# rename -> filter (after 2000, score above 80) -> sort by score then
# profitability (both descending) -> keep five columns -> add the scaled
# profitability column.
q6 <- movies %>%
  rename(movie_title = Film, release_year = Year) %>%
  filter(release_year > 2000 & `Rotten Tomatoes %` > 80) %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability)) %>%
  select(
    movie_title, release_year, Genre, Profitability, `Rotten Tomatoes %`
  ) %>%
  mutate(Profitability_millions = Profitability * 1000000)
# Print explicitly, like the earlier previews, so the result is also shown
# when the script is run through source(), which does not auto-print.
print(head(q6))
## # A tibble: 6 × 6
## movie_title release_year Genre Profitability `Rotten Tomatoes %`
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 WALL-E 2008 Animation 2.90 96
## 2 Midnight in Paris 2011 Romence 8.74 93
## 3 Enchanted 2007 Comedy 4.01 93
## 4 Knocked Up 2007 Comedy 6.64 91
## 5 Waitress 2007 Romance 11.1 89
## 6 A Serious Man 2009 Drama 4.38 89
## # ℹ 1 more variable: Profitability_millions <dbl>
– Based on the resulting data, not all of the highest-rated movies bring in the most profit. Among the six movies shown, the fifth-ranked movie has the highest profitability, whereas the first-ranked movie has the lowest.
# Per-genre averages of the Rotten Tomatoes score and of
# Profitability_millions, with missing values ignored.
# NOTE(review): Genre is grouped exactly as spelled, so variants such as
# "Comdy", "comedy", "Romence" and "romance" end up in separate groups;
# consider normalizing Genre before grouping.
summary_df <- summarise(
  group_by(q4, Genre),
  Avg_Rating = mean(`Rotten Tomatoes %`, na.rm = TRUE),
  Avg_Profitability = mean(Profitability_millions, na.rm = TRUE)
)
print(summary_df)
## # A tibble: 10 × 3
## Genre Avg_Rating Avg_Profitability
## <chr> <dbl> <dbl>
## 1 Action 11 1245333.
## 2 Animation 74.2 3759414.
## 3 Comdy 13 2649068.
## 4 Comedy 42.7 3776946.
## 5 Drama 51.5 8407218.
## 6 Fantasy 73 1783944.
## 7 Romance 42.1 3984790.
## 8 Romence 93 8744706.
## 9 comedy 87 8096000
## 10 romance 54 652603.