library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
# Load the movies dataset
movies <- read_csv("https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv")
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
1. rename(): (4 points)
Rename the “Film” column to “movie_title” and “Year” to
“release_year”.
q1 <- movies %>%
rename(movie_title = Film , release_year = Year)
head(q1)
## # A tibble: 6 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei… 70 1.75
## 2 Youth in Revolt Come… The Weinstei… 52 1.09
## 3 You Will Meet a Tall Dar… Come… Independent 35 1.21
## 4 When in Rome Come… Disney 44 0
## 5 What Happens in Vegas Come… Fox 72 6.27
## 6 Water For Elephants Drama 20th Century… 72 3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
2. select(): (4 points)
Create a new dataframe with only the columns: movie_title,
release_year, Genre, Profitability,
q2 <- q1 %>%
select(movie_title , release_year , Genre , Profitability)
head(q2)
## # A tibble: 6 × 4
## movie_title release_year Genre Profitability
## <chr> <dbl> <chr> <dbl>
## 1 Zack and Miri Make a Porno 2008 Romance 1.75
## 2 Youth in Revolt 2010 Comedy 1.09
## 3 You Will Meet a Tall Dark Stranger 2010 Comedy 1.21
## 4 When in Rome 2010 Comedy 0
## 5 What Happens in Vegas 2008 Comedy 6.27
## 6 Water For Elephants 2011 Drama 3.08
3. filter(): (4 points)
Filter the dataset to include only movies released after 2000 with a
Rotten Tomatoes % higher than 80.
q3 <- q1 %>%
filter(release_year > 2000 & `Rotten Tomatoes %` > 80)
head(q3)
## # A tibble: 6 × 8
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 WALL-E Animati… Disney 89 2.90
## 2 Waitress Romance Independent 67 11.1
## 3 Tangled Animati… Disney 88 1.37
## 4 Rachel Getting Married Drama Independent 61 1.38
## 5 My Week with Marilyn Drama The Weinstei… 84 0.826
## 6 Midnight in Paris Romence Sony 84 8.74
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>
4. mutate(): (4 points)
Add a new column called “Profitability_millions” that converts the
Profitability to millions of dollars.
q4 <- q3 %>%
mutate(Profitability_millions = Profitability * 1000000)
head(q4)
## # A tibble: 6 × 9
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 WALL-E Animati… Disney 89 2.90
## 2 Waitress Romance Independent 67 11.1
## 3 Tangled Animati… Disney 88 1.37
## 4 Rachel Getting Married Drama Independent 61 1.38
## 5 My Week with Marilyn Drama The Weinstei… 84 0.826
## 6 Midnight in Paris Romence Sony 84 8.74
## # ℹ 4 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>, Profitability_millions <dbl>
5. arrange(): (3 points)
Sort the filtered dataset by Rotten Tomatoes % in descending order,
and then by Profitability in descending order.
q5 <- q4 %>%
arrange(desc(`Rotten Tomatoes %`) , Profitability_millions)
head(q5)
## # A tibble: 6 × 9
## movie_title Genre `Lead Studio` `Audience score %` Profitability
## <chr> <chr> <chr> <dbl> <dbl>
## 1 WALL-E Animation Disney 89 2.90
## 2 Enchanted Comedy Disney 80 4.01
## 3 Midnight in Paris Romence Sony 84 8.74
## 4 Knocked Up Comedy Universal 83 6.64
## 5 Tangled Animation Disney 88 1.37
## 6 A Serious Man Drama Universal 64 4.38
## # ℹ 4 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## # release_year <dbl>, Profitability_millions <dbl>
6. Combining functions: (3 points)
7. Interpret question 6 (1 point)
From the resulting data, are the best movies the most popular?
As seen in the table above, the most popular movies can be decided
by the profitability and the best movies can be decided by the Rotten
Tomatoes %. As you can see the movie WALL-E has a Rotten Tomatoes % of
96, making it the best movie, but it had profitability of 2.86 million,
where as some worse movies with a rating of 93, like Enchanted and
Midnight in Paris, have higher Profitabilities at 4 million and 8.74
million. This shows that the best movies are not the most popular.
Create a summary dataframe that shows the average rating and
Profitability_millions for movies by Genre. Hint: You’ll need to use
group_by() and summarize().
movies <- movies %>%
mutate(Genre = tolower(trimws(Genre))) %>% # Convert to lowercase and trim spaces
mutate(Genre = case_when(
Genre == "comdy" ~ "comedy",
Genre == "romence" ~ "romance",
TRUE ~ Genre # Keep other genres as they are
))
summary_df <- movies %>%
group_by(Genre) %>%
summarize(
avg_rating = mean(`Rotten Tomatoes %`, na.rm = TRUE),
avg_profitability = mean(Profitability, na.rm = TRUE)
)
print(summary_df)
## # A tibble: 6 × 3
## Genre avg_rating avg_profitability
## <chr> <dbl> <dbl>
## 1 action 11 1.25
## 2 animation 74.2 3.76
## 3 comedy 43.0 3.85
## 4 drama 51.5 8.41
## 5 fantasy 73 1.78
## 6 romance 46.3 4.08