library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)

Load the movies dataset

# Read the movies CSV directly from the public gist into `movies`.
movies <- read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1. rename(): (4 points)

# Give the title and year columns snake_case names; all other columns are
# left untouched.
q1 <- rename(
  movies,
  movie_title = Film,
  release_year = Year
)

# Show the first rows to confirm the new column names.
q1 %>%
  head() %>%
  print()
## # A tibble: 6 × 8
##   movie_title               Genre `Lead Studio` `Audience score %` Profitability
##   <chr>                     <chr> <chr>                      <dbl>         <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei…                 70          1.75
## 2 Youth in Revolt           Come… The Weinstei…                 52          1.09
## 3 You Will Meet a Tall Dar… Come… Independent                   35          1.21
## 4 When in Rome              Come… Disney                        44          0   
## 5 What Happens in Vegas     Come… Fox                           72          6.27
## 6 Water For Elephants       Drama 20th Century…                 72          3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

2. select(): (4 points)

- Create a new dataframe with only the columns: movie_title, release_year, Genre, Profitability.

# Keep only the four columns of interest from the renamed data.
q2 <- select(
  q1,
  movie_title,
  release_year,
  Genre,
  Profitability
)

# Show the first rows of the reduced table.
q2 %>%
  head() %>%
  print()
## # A tibble: 6 × 4
##   movie_title                        release_year Genre   Profitability
##   <chr>                                     <dbl> <chr>           <dbl>
## 1 Zack and Miri Make a Porno                 2008 Romance          1.75
## 2 Youth in Revolt                            2010 Comedy           1.09
## 3 You Will Meet a Tall Dark Stranger         2010 Comedy           1.21
## 4 When in Rome                               2010 Comedy           0   
## 5 What Happens in Vegas                      2008 Comedy           6.27
## 6 Water For Elephants                        2011 Drama            3.08

3. filter(): (4 points)

# Keep movies released after 2000 whose Rotten Tomatoes score is above 80.
# Comma-separated conditions in filter() are combined with a logical AND.
q3 <- filter(
  q1,
  release_year > 2000,
  `Rotten Tomatoes %` > 80
)

q3 %>%
  print()
## # A tibble: 12 × 8
##    movie_title            Genre   `Lead Studio` `Audience score %` Profitability
##    <chr>                  <chr>   <chr>                      <dbl>         <dbl>
##  1 WALL-E                 Animat… Disney                        89         2.90 
##  2 Waitress               Romance Independent                   67        11.1  
##  3 Tangled                Animat… Disney                        88         1.37 
##  4 Rachel Getting Married Drama   Independent                   61         1.38 
##  5 My Week with Marilyn   Drama   The Weinstei…                 84         0.826
##  6 Midnight in Paris      Romence Sony                          84         8.74 
##  7 Knocked Up             Comedy  Universal                     83         6.64 
##  8 Jane Eyre              Romance Universal                     77         0    
##  9 Enchanted              Comedy  Disney                        80         4.01 
## 10 Beginners              Comedy  Independent                   80         4.47 
## 11 A Serious Man          Drama   Universal                     64         4.38 
## 12 (500) Days of Summer   comedy  Fox                           81         8.10 
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

4. mutate(): (4 points)

# Add a column holding Profitability scaled up by a factor of one million.
q4 <- mutate(
  q1,
  Profitability_millions = Profitability * 1e6
)

# Print the original and scaled values side by side for comparison.
q4 %>%
  select(movie_title, Profitability, Profitability_millions) %>%
  print()
## # A tibble: 77 × 3
##    movie_title                        Profitability Profitability_millions
##    <chr>                                      <dbl>                  <dbl>
##  1 Zack and Miri Make a Porno                 1.75                1747542.
##  2 Youth in Revolt                            1.09                1090000 
##  3 You Will Meet a Tall Dark Stranger         1.21                1211818.
##  4 When in Rome                               0                         0 
##  5 What Happens in Vegas                      6.27                6267647.
##  6 Water For Elephants                        3.08                3081421.
##  7 WALL-E                                     2.90                2896019.
##  8 Waitress                                  11.1                11089742.
##  9 Waiting For Forever                        0.005                  5000 
## 10 Valentine's Day                            4.18                4184038.
## # ℹ 67 more rows

5. arrange(): (3 points)

# Sort by Rotten Tomatoes score, highest first; ties are broken by
# Profitability, also highest first.
q5 <- arrange(
  q1,
  desc(`Rotten Tomatoes %`),
  desc(Profitability)
)

q5 %>%
  head()
## # A tibble: 6 × 8
##   movie_title       Genre     `Lead Studio` `Audience score %` Profitability
##   <chr>             <chr>     <chr>                      <dbl>         <dbl>
## 1 WALL-E            Animation Disney                        89          2.90
## 2 Midnight in Paris Romence   Sony                          84          8.74
## 3 Enchanted         Comedy    Disney                        80          4.01
## 4 Knocked Up        Comedy    Universal                     83          6.64
## 5 Waitress          Romance   Independent                   67         11.1 
## 6 A Serious Man     Drama     Universal                     64          4.38
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

6. Combining functions: (3 points)

# One pipeline that chains the earlier steps, starting from the raw data:
# rename -> filter -> arrange -> select -> mutate.
q6 <- movies %>%
  rename(
    movie_title = Film,
    release_year = Year
  ) %>%
  # Released after 2000 AND rated above 80 on Rotten Tomatoes.
  filter(
    release_year > 2000,
    `Rotten Tomatoes %` > 80
  ) %>%
  # Best-rated first; Profitability (descending) breaks ties.
  arrange(
    desc(`Rotten Tomatoes %`),
    desc(Profitability)
  ) %>%
  select(
    movie_title,
    release_year,
    Genre,
    Profitability,
    `Rotten Tomatoes %`
  ) %>%
  mutate(Profitability_millions = Profitability * 1e6)

q6 %>%
  head()
## # A tibble: 6 × 6
##   movie_title       release_year Genre     Profitability `Rotten Tomatoes %`
##   <chr>                    <dbl> <chr>             <dbl>               <dbl>
## 1 WALL-E                    2008 Animation          2.90                  96
## 2 Midnight in Paris         2011 Romence            8.74                  93
## 3 Enchanted                 2007 Comedy             4.01                  93
## 4 Knocked Up                2007 Comedy             6.64                  91
## 5 Waitress                  2007 Romance           11.1                   89
## 6 A Serious Man             2009 Drama              4.38                  89
## # ℹ 1 more variable: Profitability_millions <dbl>

7. Interpret question 6 (1 point)

– Based on the resulting data, not all of the highest-rated movies bring in the most profit. Among the six top-rated movies shown, the fifth-ranked movie (Waitress) has the highest Profitability (11.1), whereas the first-ranked movie (WALL-E) has the lowest (2.90).

EXTRA CREDIT (4 points)

# Per-genre averages of the Rotten Tomatoes score and the scaled
# profitability; missing values are ignored in both means.
# NOTE(review): Genre values are grouped exactly as stored, so spelling and
# case variants (e.g. "Comdy", "comedy", "Romence") end up as separate groups.
movies_by_genre <- group_by(q4, Genre)

summary_df <- summarise(
  movies_by_genre,
  Avg_Rating = mean(`Rotten Tomatoes %`, na.rm = TRUE),
  Avg_Profitability = mean(Profitability_millions, na.rm = TRUE)
)

summary_df %>%
  print()
## # A tibble: 10 × 3
##    Genre     Avg_Rating Avg_Profitability
##    <chr>          <dbl>             <dbl>
##  1 Action          11            1245333.
##  2 Animation       74.2          3759414.
##  3 Comdy           13            2649068.
##  4 Comedy          42.7          3776946.
##  5 Drama           51.5          8407218.
##  6 Fantasy         73            1783944.
##  7 Romance         42.1          3984790.
##  8 Romence         93            8744706.
##  9 comedy          87            8096000 
## 10 romance         54             652603.