#JACOB STOUGHTON AND JAKUB KEPA - Data 3210

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)

# Read the movies CSV directly from the public gist; the result is stored
# as `movies` and is the starting point for every question below.

movies <- readr::read_csv(
  file = "https://gist.githubusercontent.com/tiangechen/b68782efa49a16edaf07dc2cdaa855ea/raw/0c794a9717f18b094eabab2cd6a6b9a226903577/movies.csv"
)
## Rows: 77 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Film, Genre, Lead Studio, Worldwide Gross
## dbl (4): Audience score %, Profitability, Rotten Tomatoes %, Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Question 1. rename(): 4 points - Rename the “Film” column to “movie_title” and “Year” to “release_year”

# Rename Film -> movie_title and Year -> release_year.
q1 <- rename(
  movies,
  movie_title = Film,
  release_year = Year
)

# Print the first rows to confirm the new column names.
print(head(q1))
## # A tibble: 6 × 8
##   movie_title               Genre `Lead Studio` `Audience score %` Profitability
##   <chr>                     <chr> <chr>                      <dbl>         <dbl>
## 1 Zack and Miri Make a Por… Roma… The Weinstei…                 70          1.75
## 2 Youth in Revolt           Come… The Weinstei…                 52          1.09
## 3 You Will Meet a Tall Dar… Come… Independent                   35          1.21
## 4 When in Rome              Come… Disney                        44          0   
## 5 What Happens in Vegas     Come… Fox                           72          6.27
## 6 Water For Elephants       Drama 20th Century…                 72          3.08
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

#Question 2. select(): (4 points) # Create a new dataframe with only the columns: movie_title, release_year, Genre, Profitability,

# Keep only the four columns the question asks for, in the order listed.
q2 <- select(
  q1,
  movie_title,
  release_year,
  Genre,
  Profitability
)
print(q2)
## # A tibble: 77 × 4
##    movie_title                        release_year Genre     Profitability
##    <chr>                                     <dbl> <chr>             <dbl>
##  1 Zack and Miri Make a Porno                 2008 Romance           1.75 
##  2 Youth in Revolt                            2010 Comedy            1.09 
##  3 You Will Meet a Tall Dark Stranger         2010 Comedy            1.21 
##  4 When in Rome                               2010 Comedy            0    
##  5 What Happens in Vegas                      2008 Comedy            6.27 
##  6 Water For Elephants                        2011 Drama             3.08 
##  7 WALL-E                                     2008 Animation         2.90 
##  8 Waitress                                   2007 Romance          11.1  
##  9 Waiting For Forever                        2011 Romance           0.005
## 10 Valentine's Day                            2010 Comedy            4.18 
## # ℹ 67 more rows

#Question 3. filter(): (4 points) # Filter the dataset to include only movies released after 2000 with a Rotten Tomatoes % higher than 80.

# Keep rows that satisfy both conditions: released after 2000 and a
# Rotten Tomatoes score above 80. Backticks are required because the
# column name contains a space and a percent sign.
q3 <- filter(
  q1,
  release_year > 2000 & `Rotten Tomatoes %` > 80
)
print(q3)
## # A tibble: 12 × 8
##    movie_title            Genre   `Lead Studio` `Audience score %` Profitability
##    <chr>                  <chr>   <chr>                      <dbl>         <dbl>
##  1 WALL-E                 Animat… Disney                        89         2.90 
##  2 Waitress               Romance Independent                   67        11.1  
##  3 Tangled                Animat… Disney                        88         1.37 
##  4 Rachel Getting Married Drama   Independent                   61         1.38 
##  5 My Week with Marilyn   Drama   The Weinstei…                 84         0.826
##  6 Midnight in Paris      Romence Sony                          84         8.74 
##  7 Knocked Up             Comedy  Universal                     83         6.64 
##  8 Jane Eyre              Romance Universal                     77         0    
##  9 Enchanted              Comedy  Disney                        80         4.01 
## 10 Beginners              Comedy  Independent                   80         4.47 
## 11 A Serious Man          Drama   Universal                     64         4.38 
## 12 (500) Days of Summer   comedy  Fox                           81         8.10 
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

#Question 4. mutate(): (4 points) # Add a new column called “Profitability_millions” that converts the Profitability to millions of dollars.

# Append Profitability_millions (Profitability divided by one million) to
# the filtered movies from q3.
# NOTE(review): Profitability looks like a ratio rather than a dollar amount
# (the values printed for q3 range from 0 to about 11) -- confirm the
# intended units for this conversion.
q4 <- mutate(
  q3,
  Profitability_millions = Profitability / 1e6
)
print(q4)
## # A tibble: 12 × 9
##    movie_title            Genre   `Lead Studio` `Audience score %` Profitability
##    <chr>                  <chr>   <chr>                      <dbl>         <dbl>
##  1 WALL-E                 Animat… Disney                        89         2.90 
##  2 Waitress               Romance Independent                   67        11.1  
##  3 Tangled                Animat… Disney                        88         1.37 
##  4 Rachel Getting Married Drama   Independent                   61         1.38 
##  5 My Week with Marilyn   Drama   The Weinstei…                 84         0.826
##  6 Midnight in Paris      Romence Sony                          84         8.74 
##  7 Knocked Up             Comedy  Universal                     83         6.64 
##  8 Jane Eyre              Romance Universal                     77         0    
##  9 Enchanted              Comedy  Disney                        80         4.01 
## 10 Beginners              Comedy  Independent                   80         4.47 
## 11 A Serious Man          Drama   Universal                     64         4.38 
## 12 (500) Days of Summer   comedy  Fox                           81         8.10 
## # ℹ 4 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>, Profitability_millions <dbl>

#Question 5. arrange(): (3 points) # Sort the filtered dataset by Rotten Tomatoes % in descending order, and then by Profitability in descending order. # Earlier draft (superseded by the code below): five <- four %>% arrange(desc(`Rotten Tomatoes %`), desc(Profitability_millions))

# Order the filtered movies by Rotten Tomatoes score, highest first; ties
# are then ordered by Profitability, highest first.
q5 <- arrange(
  q3,
  desc(`Rotten Tomatoes %`),
  desc(Profitability)
)
print(q5)
## # A tibble: 12 × 8
##    movie_title            Genre   `Lead Studio` `Audience score %` Profitability
##    <chr>                  <chr>   <chr>                      <dbl>         <dbl>
##  1 WALL-E                 Animat… Disney                        89         2.90 
##  2 Midnight in Paris      Romence Sony                          84         8.74 
##  3 Enchanted              Comedy  Disney                        80         4.01 
##  4 Knocked Up             Comedy  Universal                     83         6.64 
##  5 Waitress               Romance Independent                   67        11.1  
##  6 A Serious Man          Drama   Universal                     64         4.38 
##  7 Tangled                Animat… Disney                        88         1.37 
##  8 (500) Days of Summer   comedy  Fox                           81         8.10 
##  9 Rachel Getting Married Drama   Independent                   61         1.38 
## 10 Jane Eyre              Romance Universal                     77         0    
## 11 Beginners              Comedy  Independent                   80         4.47 
## 12 My Week with Marilyn   Drama   The Weinstei…                 84         0.826
## # ℹ 3 more variables: `Rotten Tomatoes %` <dbl>, `Worldwide Gross` <chr>,
## #   release_year <dbl>

#Question 6. Combining functions: (3 points) # Use the pipe operator (%>%) to chain these operations together, starting with the original dataset and ending with a final dataframe that incorporates all the above transformations.

# Full pipeline from the raw `movies` tibble:
# rename -> select -> filter -> mutate -> arrange.
# `Rotten Tomatoes %` is carried through select() because the filter and
# sort steps further down the chain still need it.
# NOTE(review): Profitability looks like a ratio rather than a dollar amount
# -- confirm the intended units for the / 1e6 conversion.
q6 <- movies %>%
  rename(movie_title = Film, release_year = Year) %>%
  select(
    movie_title, release_year, Genre, Profitability, `Rotten Tomatoes %`
  ) %>%
  filter(`Rotten Tomatoes %` > 80, release_year > 2000) %>%
  # Question 4's transformation; it was previously missing from this chain,
  # so the result did not incorporate every step the question asks for.
  mutate(Profitability_millions = Profitability / 1e6) %>%
  arrange(desc(`Rotten Tomatoes %`), desc(Profitability))
# The pasted console output below was captured before Profitability_millions
# was added to the chain; re-run the script to refresh it.
print(q6)
## # A tibble: 12 × 5
##    movie_title            release_year Genre   Profitability `Rotten Tomatoes %`
##    <chr>                         <dbl> <chr>           <dbl>               <dbl>
##  1 WALL-E                         2008 Animat…         2.90                   96
##  2 Midnight in Paris              2011 Romence         8.74                   93
##  3 Enchanted                      2007 Comedy          4.01                   93
##  4 Knocked Up                     2007 Comedy          6.64                   91
##  5 Waitress                       2007 Romance        11.1                    89
##  6 A Serious Man                  2009 Drama           4.38                   89
##  7 Tangled                        2010 Animat…         1.37                   89
##  8 (500) Days of Summer           2009 comedy          8.10                   87
##  9 Rachel Getting Married         2008 Drama           1.38                   85
## 10 Jane Eyre                      2011 Romance         0                      85
## 11 Beginners                      2011 Comedy          4.47                   84
## 12 My Week with Marilyn           2011 Drama           0.826                  83

#Question 7. Interpret question 6 (1 point) # From the resulting data, are the best movies the most popular? # From the data above, we can conclude that the “best” movies aren’t always the most popular. The movie with both the highest audience score and the highest worldwide gross is WALL-E. However, the data show that ‘Enchanted’, a movie with a lower audience score (80%) than ‘Midnight in Paris’ (84%), made almost double the worldwide gross. These two movies also have identical Rotten Tomatoes scores (93%), yet one made over twice the box-office money. While most of the higher-grossing movies also have higher audience scores, the best movies are not always the most popular, although the two are correlated.

#EXTRA CREDIT (4 points) #Create a summary dataframe that shows the average rating and Profitability_millions for movies by Genre. Hint: You’ll need to use group_by() and summarize().

library(dplyr)
# Per-genre averages of audience score and Profitability_millions for the
# filtered movies in q4.
# The raw Genre column mixes spellings of the same genre ("comedy" vs
# "Comedy", "Romence" vs "Romance"). Grouping on it directly splits one genre
# into several rows, so the labels are normalised before grouping.
summary_df <- q4 %>%
  mutate(
    # Trim whitespace and upper-case the first letter: "comedy" -> "Comedy".
    Genre = sub("^(.)", "\\U\\1", trimws(Genre), perl = TRUE),
    # Known misspelling seen in the data.
    Genre = if_else(Genre == "Romence", "Romance", Genre)
  ) %>%
  group_by(Genre) %>%
  summarize(
    average_rating = mean(`Audience score %`, na.rm = TRUE),
    average_profitability = mean(Profitability_millions, na.rm = TRUE)
  )
# The pasted console output below was captured before the Genre labels were
# normalised; re-run the script to refresh it.
print(summary_df)
## # A tibble: 6 × 3
##   Genre     average_rating average_profitability
##   <chr>              <dbl>                 <dbl>
## 1 Animation           88.5            0.00000213
## 2 Comedy              81              0.00000504
## 3 Drama               69.7            0.00000220
## 4 Romance             72              0.00000554
## 5 Romence             84              0.00000874
## 6 comedy              81              0.00000810