Load the movies dataset

movies <- read_csv("https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv")
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1.rename(): (4 points)

Rename the “Film” column to “movie_title” and “Year” to “release_year”.

q1 <- movies %>%
  rename(movie_title = Film, 
         release_year = Year)

print(select(q1,movie_title,Genre,`Lead Studio`, `Audience score %`,release_year))
## # A tibble: 77 × 5
##    movie_title               Genre `Lead Studio` `Audience score %` release_year
##    <chr>                     <chr> <chr>                      <dbl>        <dbl>
##  1 Zack and Miri Make a Por… Roma… The Weinstei…                 70         2008
##  2 Youth in Revolt           Come… The Weinstei…                 52         2010
##  3 You Will Meet a Tall Dar… Come… Independent                   35         2010
##  4 When in Rome              Come… Disney                        44         2010
##  5 What Happens in Vegas     Come… Fox                           72         2008
##  6 Water For Elephants       Drama 20th Century…                 72         2011
##  7 WALL-E                    Anim… Disney                        89         2008
##  8 Waitress                  Roma… Independent                   67         2007
##  9 Waiting For Forever       Roma… Independent                   53         2011
## 10 Valentine's Day           Come… Warner Bros.                  54         2010
## # ℹ 67 more rows

2. select(): (4 points)

Create a new dataframe with only the columns: movie_title, release_year, Genre, Profitability,

q2 <- q1 %>% 
  select(movie_title, release_year, Genre, Profitability)

print(q2)
## # A tibble: 77 × 4
##    movie_title                        release_year Genre     Profitability
##    <chr>                                     <dbl> <chr>             <dbl>
##  1 Zack and Miri Make a Porno                 2008 Romance           1.75 
##  2 Youth in Revolt                            2010 Comedy            1.09 
##  3 You Will Meet a Tall Dark Stranger         2010 Comedy            1.21 
##  4 When in Rome                               2010 Comedy            0    
##  5 What Happens in Vegas                      2008 Comedy            6.27 
##  6 Water For Elephants                        2011 Drama             3.08 
##  7 WALL-E                                     2008 Animation         2.90 
##  8 Waitress                                   2007 Romance          11.1  
##  9 Waiting For Forever                        2011 Romance           0.005
## 10 Valentine's Day                            2010 Comedy            4.18 
## # ℹ 67 more rows

3. filter(): (4 points)

Filter the dataset to include only movies released after 2000 with a Rotten Tomatoes % higher than 80.

q3 <- q1 %>% 
  filter(release_year > 2000 & `Rotten Tomatoes %`> 80)

print(select(q3,movie_title,Genre,`Lead Studio`, `Rotten Tomatoes %`,release_year))
## # A tibble: 12 × 5
##    movie_title            Genre   `Lead Studio` `Rotten Tomatoes %` release_year
##    <chr>                  <chr>   <chr>                       <dbl>        <dbl>
##  1 WALL-E                 Animat… Disney                         96         2008
##  2 Waitress               Romance Independent                    89         2007
##  3 Tangled                Animat… Disney                         89         2010
##  4 Rachel Getting Married Drama   Independent                    85         2008
##  5 My Week with Marilyn   Drama   The Weinstei…                  83         2011
##  6 Midnight in Paris      Romence Sony                           93         2011
##  7 Knocked Up             Comedy  Universal                      91         2007
##  8 Jane Eyre              Romance Universal                      85         2011
##  9 Enchanted              Comedy  Disney                         93         2007
## 10 Beginners              Comedy  Independent                    84         2011
## 11 A Serious Man          Drama   Universal                      89         2009
## 12 (500) Days of Summer   comedy  Fox                            87         2009

4. mutate(): (4 points)

Add a new column called “Profitability_millions” that converts the Profitability to millions of dollars.

q4 <- q3 %>% 
  mutate(Profitability_millions = Profitability * 1000000)

print(select(q4,movie_title,Genre,Profitability,release_year,Profitability_millions)) 
## # A tibble: 12 × 5
##    movie_title           Genre Profitability release_year Profitability_millions
##    <chr>                 <chr>         <dbl>        <dbl>                  <dbl>
##  1 WALL-E                Anim…         2.90          2008               2896019.
##  2 Waitress              Roma…        11.1           2007              11089742.
##  3 Tangled               Anim…         1.37          2010               1365692.
##  4 Rachel Getting Marri… Drama         1.38          2008               1384167.
##  5 My Week with Marilyn  Drama         0.826         2011                825800 
##  6 Midnight in Paris     Rome…         8.74          2011               8744706.
##  7 Knocked Up            Come…         6.64          2007               6636402.
##  8 Jane Eyre             Roma…         0             2011                     0 
##  9 Enchanted             Come…         4.01          2007               4005737.
## 10 Beginners             Come…         4.47          2011               4471875 
## 11 A Serious Man         Drama         4.38          2009               4382857.
## 12 (500) Days of Summer  come…         8.10          2009               8096000

5. arrange(): (3 points)

Sort the filtered dataset by Rotten Tomatoes % in descending order, and then by Profitability in descending order. five <- four %>% arrange(desc(Rotten Tomatoes %) , desc(Profitability_millions))

q5 <- q4 %>%
  arrange(desc(`Rotten Tomatoes %`) , desc(Profitability_millions))

print(select(q5, movie_title, Genre, `Rotten Tomatoes %`, release_year, Profitability_millions))
## # A tibble: 12 × 5
##    movie_title     Genre `Rotten Tomatoes %` release_year Profitability_millions
##    <chr>           <chr>               <dbl>        <dbl>                  <dbl>
##  1 WALL-E          Anim…                  96         2008               2896019.
##  2 Midnight in Pa… Rome…                  93         2011               8744706.
##  3 Enchanted       Come…                  93         2007               4005737.
##  4 Knocked Up      Come…                  91         2007               6636402.
##  5 Waitress        Roma…                  89         2007              11089742.
##  6 A Serious Man   Drama                  89         2009               4382857.
##  7 Tangled         Anim…                  89         2010               1365692.
##  8 (500) Days of … come…                  87         2009               8096000 
##  9 Rachel Getting… Drama                  85         2008               1384167.
## 10 Jane Eyre       Roma…                  85         2011                     0 
## 11 Beginners       Come…                  84         2011               4471875 
## 12 My Week with M… Drama                  83         2011                825800

6. Combining functions: (3 points)

Use the pipe operator (%>%) to chain these operations together, starting with the original dataset and ending with a final dataframe that incorporates all the above transformations.

combining <-movies %>% 
  rename(movie_title = Film, 
         release_year = Year) %>% 
  select(movie_title , release_year , Genre , Profitability , `Rotten Tomatoes %`) %>% 
  filter(release_year > 2000 , `Rotten Tomatoes %` > 80) %>% 
  mutate(Profitability_millions = Profitability * 1000000) %>% 
  arrange(desc(`Rotten Tomatoes %`), 
          desc(Profitability_millions))

print(combining) 
## # A tibble: 12 × 6
##    movie_title            release_year Genre   Profitability `Rotten Tomatoes %`
##    <chr>                         <dbl> <chr>           <dbl>               <dbl>
##  1 WALL-E                         2008 Animat…         2.90                   96
##  2 Midnight in Paris              2011 Romence         8.74                   93
##  3 Enchanted                      2007 Comedy          4.01                   93
##  4 Knocked Up                     2007 Comedy          6.64                   91
##  5 Waitress                       2007 Romance        11.1                    89
##  6 A Serious Man                  2009 Drama           4.38                   89
##  7 Tangled                        2010 Animat…         1.37                   89
##  8 (500) Days of Summer           2009 comedy          8.10                   87
##  9 Rachel Getting Married         2008 Drama           1.38                   85
## 10 Jane Eyre                      2011 Romance         0                      85
## 11 Beginners                      2011 Comedy          4.47                   84
## 12 My Week with Marilyn           2011 Drama           0.826                  83
## # ℹ 1 more variable: Profitability_millions <dbl>

7. Interpret question 6 (1 point)

EXTRA CREDIT

Create a summary dataframe that shows the average rating and Profitability_millions for movies by Genre. Hint: You’ll need to use group_by() and summarize().

extracredit <- movies %>% 
  mutate(Profitability_millions = Profitability * 1000000) %>% 
  mutate(Genre = case_when(
    Genre == "Romence" ~ "Romance",   # Fixed the missing comma
    Genre == "comedy" ~ "Comedy",    # Fixed the missing comma
    Genre == "romance" ~ "Romance",  # Fixed the missing comma
    Genre == "Comdy" ~ "Comedy",     # Fixed the missing comma
    TRUE ~ Genre                     # Keep the genre as is if no match
  )) %>% 
  group_by(Genre) %>% 
  summarize(
    Avg_Rating = mean(`Rotten Tomatoes %`, na.rm = TRUE),  
    Avg_Profitability = mean(Profitability_millions, na.rm = TRUE)  
  )

print(extracredit)
## # A tibble: 6 × 3
##   Genre     Avg_Rating Avg_Profitability
##   <chr>          <dbl>             <dbl>
## 1 Action          11            1245333.
## 2 Animation       74.2          3759414.
## 3 Comedy          43.0          3851160.
## 4 Drama           51.5          8407218.
## 5 Fantasy         73            1783944.
## 6 Romance         46.3          4079972.