library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
movies <- read_csv("https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv")
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rename the “Film” column to “movie_title” and “Year” to “release_year”
renamed_movies <- movies %>%
rename(movie_title = Film,
release_year = Year)
head(renamed_movies , 3)
## # A tibble: 3 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei… 70 1.75
## 2 Youth in Revolt Come… The Weinstei… 52 1.09
## 3 You Will Meet a Tall Dar… Come… Independent 35 1.21
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
Create a new dataframe with only the columns: movie_title, release_year, Genre, Profitability
selected_movies <- movies %>%
rename(movie_title = Film,
release_year = Year) %>%
select(movie_title , release_year , Genre , Profitability)
head(selected_movies , 3)
## # A tibble: 3 × 4
## movie_title release_year Genre Profitability
## <chr> <dbl> <chr> <dbl>
## 1 Zack and Miri Make a Porno 2008 Romance 1.75
## 2 Youth in Revolt 2010 Comedy 1.09
## 3 You Will Meet a Tall Dark Stranger 2010 Comedy 1.21
Filter the dataset to include only movies released after 2000 with a Rotten Tomatoes % higher than 80
filtered_movies <- movies %>% # then
rename(movie_title = Film,
release_year = Year) %>%
filter(release_year > 2000 & 'Rotten Tomatoes %' > 80)
head(filtered_movies ,3)
## # A tibble: 3 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei… 70 1.75
## 2 Youth in Revolt Come… The Weinstei… 52 1.09
## 3 You Will Meet a Tall Dar… Come… Independent 35 1.21
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
Add a new column called “Profitability_millions” that converts the Profitability to millions of dollars.
movie_profits <- movies %>%
mutate(Profitability_millions = Profitability*1e6) %>%
select(Profitability_millions)
head(movie_profits , 3)
## # A tibble: 3 × 1
## Profitability_millions
## <dbl>
## 1 1747542.
## 2 1090000
## 3 1211818.
Sort the filtered dataset by Rotten Tomatoes % in descending order, and then by Profitability in descending order. five <- four %>% arrange(desc(Rotten Tomatoes %) , desc(Profitability_millions))
sorted_movies <- movies %>%
arrange(desc('Rotten Tomatoes %') , desc(Profitability))
head(sorted_movies , 3)
## # A tibble: 3 × 8
## Film Genre `Lead Studio` `Audience score %` Profitability `Rotten Tomatoes %`
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Fire… Drama Independent 51 66.9 40
## 2 High… Come… Disney 76 22.9 65
## 3 The … Drama Summit 78 14.2 27
## # ℹ 2 more variables: `Worldwide Gross` <chr>, Year <dbl>
Use the pipe operator (%>%) to chain these operations together, starting with the original dataset and ending with a final dataframe that incorporates all the above transformations
combining <- movies %>% # then
rename(movie_title = Film , release_year = Year) %>%
select(movie_title , release_year , Genre , Profitability) %>%
filter(release_year > 2000 & 'Rotten Tomatoes %' > 80) %>%
mutate(Profitability_millions = Profitability/1,000,000) %>%
arrange(desc('Rotten Tomatoes %') , desc(Profitability))
head(combining)
## # A tibble: 6 × 6
## movie_title release_year Genre Profitability Profitability_millions `0`
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 Fireproof 2008 Drama 66.9 66.9 0
## 2 High School Mus… 2008 Come… 22.9 22.9 0
## 3 The Twilight Sa… 2009 Drama 14.2 14.2 0
## 4 Waitress 2007 Roma… 11.1 11.1 0
## 5 Twilight 2008 Roma… 10.2 10.2 0
## 6 Mamma Mia! 2008 Come… 9.23 9.23 0
From the resulting data, are the best movies the most popular?
# Based on the resulting data, it appears that the best movies are normally the most popular, but sometimes it may depend on audience score.
Create a summary dataframe that shows the average rating and Profitability_millions for movies by Genre. Hint: You’ll need to use group_by() and summarize().
extra_credit <- movies %>%
mutate(Profitability_millions = Profitability*1e6) %>%
select(Genre, 'Rotten Tomatoes %', Profitability_millions)
Genre_Summary <- extra_credit %>%
group_by(Genre) %>%
summarize(
avg_rotten_tomatoes = mean('Rotten Tomatoes %', na.rm = TRUE),
avg_profitability = mean(Profitability_millions, na.rm = TRUE))
## Warning: There were 10 warnings in `summarize()`.
## The first warning was:
## ℹ In argument: `avg_rotten_tomatoes = mean("Rotten Tomatoes %", na.rm = TRUE)`.
## ℹ In group 1: `Genre = "Action"`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 9 remaining warnings.
head(Genre_Summary)
## # A tibble: 6 × 3
## Genre avg_rotten_tomatoes avg_profitability
## <chr> <dbl> <dbl>
## 1 Action NA 1245333.
## 2 Animation NA 3759414.
## 3 Comdy NA 2649068.
## 4 Comedy NA 3776946.
## 5 Drama NA 8407218.
## 6 Fantasy NA 1783944.