##1 What are the column names and data types in the dataset
# Load the dataset
library(readr)
Top_Movies_Cleaned_Data_ <- read_csv("C:/Users/adity/Desktop/CA 3/Top Movies (Cleaned Data).csv")
## Rows: 6569 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): Movie Name, MPAA Rating, Franchise, Keywords, Source, Genre, Prod...
## dbl  (14): Production Budget (USD), Domestic Gross (USD), Worldwide Gross (U...
## num   (1): id
## date  (2): Release Date, Video Release
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Top_Movies_Cleaned_Data_)
movies=Top_Movies_Cleaned_Data_
summary(movies)
##        id        Movie Name         Release Date        Production Budget (USD)
##  Min.   :   1   Length:6569        Min.   :1969-04-01   Min.   :       86      
##  1st Qu.:1643   Class :character   1st Qu.:2001-11-21   1st Qu.:  5000000      
##  Median :3285   Mode  :character   Median :2009-02-06   Median : 17000000      
##  Mean   :3285                      Mean   :2008-06-26   Mean   : 33019121      
##  3rd Qu.:4927                      3rd Qu.:2015-05-30   3rd Qu.: 40000000      
##  Max.   :6569                      Max.   :2068-12-11   Max.   :533200000      
##                                    NA's   :149                                 
##  Domestic Gross (USD) Worldwide Gross (USD) Domestic Box Office (USD)
##  Min.   :        0    Min.   :0.000e+00     Min.   :      264        
##  1st Qu.:  1083350    1st Qu.:3.771e+06     1st Qu.:  4237802        
##  Median : 16204793    Median :2.734e+07     Median : 21567853        
##  Mean   : 42497640    Mean   :9.452e+07     Mean   : 47745339        
##  3rd Qu.: 51872378    3rd Qu.:9.763e+07     3rd Qu.: 58046314        
##  Max.   :936662225    Max.   :2.924e+09     Max.   :936662225        
##                                             NA's   :722              
##  International Box Office (USD) Worldwide Box Office (USD)
##  Min.   :1.700e+01              Min.   :1.700e+01         
##  1st Qu.:2.223e+06              1st Qu.:1.180e+07         
##  Median :1.881e+07              Median :4.629e+07         
##  Mean   :6.774e+07              Mean   :1.201e+08         
##  3rd Qu.:7.000e+07              3rd Qu.:1.371e+08         
##  Max.   :2.138e+09              Max.   :2.924e+09         
##  NA's   :1525                   NA's   :1525              
##  Est. Domestic DVD Sales (USD) Est. Domestic Blu-ray Sales (USD)
##  Min.   :     3060             Min.   :      724                
##  1st Qu.:  1920015             1st Qu.:   787427                
##  Median :  7271108             Median :  2379845                
##  Mean   : 16890142             Mean   :  7812575                
##  3rd Qu.: 19397418             3rd Qu.:  7657140                
##  Max.   :370026155             Max.   :222484647                
##  NA's   :4003                  NA's   :4413                     
##  Total Est. Domestic Video Sales (USD) Opening Weekend (USD)      Legs        
##  Min.   :      724                     Min.   :       24     Min.   :  1.000  
##  1st Qu.:  2431747                     1st Qu.:   229096     1st Qu.:  2.530  
##  Median :  9488031                     Median :  6256853     Median :  3.340  
##  Mean   : 21885097                     Mean   : 13553726     Mean   :  5.779  
##  3rd Qu.: 25161794                     3rd Qu.: 16338316     3rd Qu.:  5.260  
##  Max.   :432535014                     Max.   :357115007     Max.   :998.330  
##  NA's   :3819                          NA's   :1173          NA's   :1008     
##  Infl. Adj. Dom. BO (USD) Video Release        MPAA Rating       
##  Min.   :2.510e+02        Min.   :1980-03-04   Length:6569       
##  1st Qu.:6.529e+06        1st Qu.:2005-05-10   Class :character  
##  Median :3.630e+07        Median :2011-03-01   Mode  :character  
##  Mean   :8.005e+07        Mean   :2010-06-30                     
##  3rd Qu.:1.007e+08        3rd Qu.:2015-12-14                     
##  Max.   :1.771e+09        Max.   :2025-01-21                     
##  NA's   :931              NA's   :1231                           
##  Running Time (minutes)  Franchise           Keywords        
##  Min.   :  9.0          Length:6569        Length:6569       
##  1st Qu.: 95.0          Class :character   Class :character  
##  Median :105.0          Mode  :character   Mode  :character  
##  Mean   :108.8                                               
##  3rd Qu.:119.0                                               
##  Max.   :262.0                                               
##  NA's   :817                                                 
##     Source             Genre           Production Method  Creative Type     
##  Length:6569        Length:6569        Length:6569        Length:6569       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Production/Financing Companies Production Countries  Languages        
##  Length:6569                    Length:6569          Length:6569       
##  Class :character               Class :character     Class :character  
##  Mode  :character               Mode  :character     Mode  :character  
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##  Domestic Releases  International Releases Theater counts    
##  Length:6569        Length:6569            Length:6569       
##  Class :character   Class :character       Class :character  
##  Mode  :character   Mode  :character       Mode  :character  
##                                                              
##                                                              
##                                                              
##                                                              
##  Domestic Share Percentage  Movie URL        
##  Min.   :  0.00            Length:6569       
##  1st Qu.: 39.60            Class :character  
##  Median : 59.80            Mode  :character  
##  Mean   : 62.51                              
##  3rd Qu.: 97.60                              
##  Max.   :100.00                              
##  NA's   :722

INTERPRETATION The given R code calculates the average production budget, average domestic gross, and average worldwide gross for all movies in the dataset using the summarise() function from dplyr. The na.rm = TRUE argument ensures that missing values are excluded from the calculations. The result will show: 1. The average production budget represents the typical amount spent to make a movie. 2. The average domestic gross shows how much a movie earns, on average, in the U.S. market. 3. The average worldwide gross reflects a movie’s global earnings.

#4 What is the range (min and max) of movie running times
movies %>%
  summarise(
    min_runtime = min(`Running Time (minutes)`, na.rm = TRUE),
    max_runtime = max(`Running Time (minutes)`, na.rm = TRUE)
  )
## # A tibble: 1 × 2
##   min_runtime max_runtime
##         <dbl>       <dbl>
## 1           9         262

###INTERPRETATION The code calculates the minimum and maximum movie runtimes from the dataset. 1. min() gives the shortest movie duration, while max() gives the longest. 2. na.rm = TRUE ensures missing values don’t affect the results. 3. The range helps us understand how movie lengths vary across different films.

#level2
#Data Extraction & Filtering
#1 Which are the top 10 movies by worldwide gross?
movies %>%
  arrange(desc(`Worldwide Gross (USD)`)) %>%
  select(`Movie Name`, `Worldwide Gross (USD)`) %>%
  head(10)
## # A tibble: 10 × 2
##    `Movie Name`                         `Worldwide Gross (USD)`
##    <chr>                                                  <dbl>
##  1 Avatar                                            2923706026
##  2 Avengers: Endgame                                 2748242781
##  3 Avatar: The Way of Water                          2315589775
##  4 Titanic                                           2223048786
##  5 Star Wars Ep. VII: The Force Awakens              2056046835
##  6 Avengers: Infinity War                            2048359754
##  7 Spider-Man: No Way Home                           1921206586
##  8 Inside Out 2                                      1698863816
##  9 Jurassic World                                    1671063641
## 10 The Lion King                                     1661454403

###INTERPRETATION This analysis identifies the top 10 highest-earning movies globally based on their total box office revenue. It highlights blockbuster hits that achieved massive commercial success, often belonging to popular franchises such as Marvel, Star Wars, or Avatar. These movies typically have large production and marketing budgets along with wide international releases. Analyzing these top performers can provide valuable insights into the types of content that tend to succeed worldwide.

#2 Which movies have a production budget greater than $200 million?
movies %>%
  filter(`Production Budget (USD)` > 200000000) %>%
  select(`Movie Name`, `Production Budget (USD)`)
## # A tibble: 57 × 2
##    `Movie Name`                                `Production Budget (USD)`
##    <chr>                                                           <dbl>
##  1 Star Wars Ep. VII: The Force Awakens                        533200000
##  2 Avatar: The Way of Water                                    460000000
##  3 Indiana Jones and the Dial of Destiny                       402300000
##  4 Avengers: Endgame                                           400000000
##  5 Mission: Impossible—The Final Reckoning                     400000000
##  6 Pirates of the Caribbean: On Stranger Tides                 379000000
##  7 Avengers: Age of Ultron                                     365000000
##  8 Fast X                                                      340000000
##  9 Solo: A Star Wars Story                                     330400000
## 10 Avengers: Infinity War                                      300000000
## # ℹ 47 more rows

INTERPRETATION This query lists all movies with a production budget exceeding $200 million. Such high-budget films are typically large-scale blockbusters that often involve heavy use of visual effects, top-tier casts, and global marketing campaigns. They are usually produced by major studios and are often part of well-known franchises like Marvel, DC, or Star Wars. Identifying these movies helps in analyzing the relationship between budget size and box office performance, and also in understanding the level of financial risk taken by studios when producing such expensive films.

#3 Which R-rated movies grossed more than $500 million worldwide?
movies %>%
  filter(`MPAA Rating` == "R", `Worldwide Gross (USD)` > 500000000) %>%
  select(`Movie Name`, `MPAA Rating`, `Worldwide Gross (USD)`)
## # A tibble: 18 × 3
##    `Movie Name`                             `MPAA Rating` Worldwide Gross (USD…¹
##    <chr>                                    <chr>                          <dbl>
##  1 Deadpool & Wolverine                     R                         1338071348
##  2 The Matrix Reloaded                      R                          738576929
##  3 The Revenant                             R                          532938302
##  4 Logan                                    R                          614202315
##  5 Detective Chinatown 3 (唐人街探案3)      R                          625136211
##  6 Deadpool 2                               R                          786362370
##  7 Oppenheimer                              R                          976518442
##  8 Terminator 2: Judgment Day               R                          515344899
##  9 The Hangover Part II                     R                          586764305
## 10 Mei Ren Yu                               R                          525018479
## 11 Deadpool                                 R                          781947691
## 12 American Sniper                          R                          547626372
## 13 Joker                                    R                         1063574394
## 14 Ted                                      R                          556016627
## 15 Fifty Shades of Grey                     R                          570794950
## 16 It                                       R                          701012746
## 17 The Passion of the Christ                R                          622313635
## 18 Demon Slayer: Kimetsu no Yaiba—The Movi… R                          507119058
## # ℹ abbreviated name: ¹​`Worldwide Gross (USD)`

###INTERPRETATION This query identifies commercially successful R-rated films that performed exceptionally well at the global box office. Although R-rated films usually have a restricted audience, the movies on this list have broken conventional barriers and attracted wide attention despite their mature content. Notable examples often include high-profile films like Deadpool, Joker, or Logan, which combined strong storytelling, franchise appeal, and global marketing. Analyzing these results provides insights into the earning potential of mature content, especially when supported by strong branding or significant cultural impact.

#4 Which movies were released before the year 2000?
# Convert Release Date to date format first
movies %>%
  filter(`Release Date` < as.Date("2000-01-01")) %>%
  select(`Movie Name`, `Release Date`)
## # A tibble: 1,260 × 2
##    `Movie Name`            `Release Date`
##    <chr>                   <date>        
##  1 Titanic                 1997-12-18    
##  2 Wild Wild West          1999-06-30    
##  3 Waterworld              1995-07-28    
##  4 Tarzan                  1999-06-16    
##  5 Armageddon              1998-07-01    
##  6 Lethal Weapon 4         1998-07-10    
##  7 The World is Not Enough 1999-11-19    
##  8 Godzilla                1998-05-19    
##  9 Batman & Robin          1997-06-20    
## 10 The 13th Warrior        1999-08-27    
## # ℹ 1,250 more rows

INTERPRETATION The output reveals which genres are the most profitable on average at the global box office. Genres such as Action, Adventure, and Science Fiction often top the list due to the presence of high-budget franchises and their strong global appeal. In contrast, lower-grossing genres like Indie, Drama, or Documentaries usually cater to niche audiences or have limited international releases. This insight is valuable for producers, studios, and investors as it helps identify high-return genres. It also provides an understanding of how genre preferences vary across global markets and how a movie’s genre can significantly influence its financial performance.

#2 Which franchises have the highest average worldwide gross?
movies %>%
  filter(!is.na(Franchise)) %>%
  group_by(Franchise) %>%
  summarise(avg_worldwide_gross = mean(`Worldwide Gross (USD)`, na.rm = TRUE)) %>%
  arrange(desc(avg_worldwide_gross))
## # A tibble: 670 × 2
##    Franchise                                       avg_worldwide_gross
##    <chr>                                                         <dbl>
##  1 Avatar                                                  2619647900.
##  2 Marvel Cinematic UniverseAvengers                       1926754931.
##  3 Marvel Cinematic UniverseSpider-Man                     1921206586 
##  4 Barbie                                                  1447585476 
##  5 Frozen                                                  1361338308 
##  6 DeadpoolX-MenMarvel Cinematic UniverseWolverine         1338071348 
##  7 Inside Out                                              1274586426.
##  8 Marvel Cinematic UniverseBlack Panther                  1094071314 
##  9 The Lion King                                           1092613690 
## 10 Zootopia                                                1025485003 
## # ℹ 660 more rows

INTERPRETATION This analysis reveals how movie ratings impact domestic earnings. Ratings such as PG-13 and PG often dominate because they appeal to a broader audience, including families and teenagers. While R-rated films can achieve high earnings, they typically target a more restricted audience, which may limit their overall box office potential. G-rated films tend to perform well within the family and animation segments but usually have fewer high-grossing releases overall.

#3 What is the average domestic gross by MPAA rating
movies %>%
  group_by(`MPAA Rating`) %>%
  summarise(avg_domestic_gross = mean(`Domestic Gross (USD)`, na.rm = TRUE)) %>%
  arrange(desc(avg_domestic_gross))
## # A tibble: 28 × 2
##    `MPAA Rating`                                              avg_domestic_gross
##    <chr>                                                                   <dbl>
##  1 G                                                                   74594187.
##  2 PG                                                                  70291857.
##  3 PG-13                                                               66063527.
##  4 Not RatedApproved pca #20241                                        46300000 
##  5 NC-17Rated X                                                        45000000 
##  6 Not Ratedapproved                                                   34562222 
##  7 R                                                                   27905347.
##  8 M/PG                                                                23706667.
##  9 NC-17for nudity and erotic sexuality throughout, and for …          20350754 
## 10 NC-17for a scene of explicit sexual content.(Rating bulle…           9737892 
## # ℹ 18 more rows

INTERPREATION This analysis shows how movie ratings influence domestic earnings. PG-13 and PG-rated films often dominate the box office because they appeal to a broader audience, including families and teenagers. Although R-rated films can achieve high earnings, their more restricted audience can limit their overall box office potential. Meanwhile, G-rated films typically perform well in the family and animation markets but generally have fewer high-grossing releases compared to broader-rated films.

#level4

#Sorting & Ranking Data
#1 Which movies have the highest return on investment (ROI)?
# Create ROI column
movies <- movies %>%
  mutate(ROI = `Worldwide Gross (USD)` / `Production Budget (USD)`)
# Sort by highest ROI
movies %>%
  arrange(desc(ROI)) %>%
  select(`Movie Name`, `Production Budget (USD)`, `Worldwide Gross (USD)`, ROI) %>%
  head(10)
## # A tibble: 10 × 4
##    `Movie Name`             Production Budget (US…¹ Worldwide Gross (USD…²   ROI
##    <chr>                                      <dbl>                  <dbl> <dbl>
##  1 Deep Throat                                25000               45000000 1800 
##  2 Paranormal Activity                       450000              194183034  432.
##  3 The Gallows                               100000               42964410  430.
##  4 The Blair Witch Project                   600000              248300000  414.
##  5 Super Size Me                              65000               22233808  342.
##  6 Bambi                                     858000              268000000  312.
##  7 Night of the Living Dead                  114000               30087064  264.
##  8 Rocky                                    1000000              225000000  225 
##  9 Halloween                                 325000               70274000  216.
## 10 The Brothers McMullen                      50000               10426506  209.
## # ℹ abbreviated names: ¹​`Production Budget (USD)`, ²​`Worldwide Gross (USD)`

INTERPRETATIOM This query highlights the most financially efficient films, i.e., those that earned the most revenue per dollar spent. High-ROI films are often low-budget hits that performed unexpectedly well at the box office, such as Paranormal Activity or The Blair Witch Project. These movies demonstrate that huge profits don’t always require massive production budgets; instead, strong storytelling, innovative marketing, or viral popularity can sometimes be enough to achieve great financial success.

#2 What are the top 5 lowest-grossing movies with production budgets over $100 million?
movies %>%
  filter(`Production Budget (USD)` > 100000000) %>%
  arrange(`Worldwide Gross (USD)`) %>%
  select(`Movie Name`, `Production Budget (USD)`, `Worldwide Gross (USD)`) %>%
  head(5)
## # A tibble: 5 × 3
##   `Movie Name`                     Production Budget (U…¹ Worldwide Gross (USD…²
##   <chr>                                             <dbl>                  <dbl>
## 1 Mission: Impossible—The Final R…              400000000                      0
## 2 Beverly Hills Cop: Axel F                     150000000                      0
## 3 6 Underground                                 150000000                      0
## 4 Desert Warrior                                140000000                      0
## 5 Outlaw King                                   120000000                      0
## # ℹ abbreviated names: ¹​`Production Budget (USD)`, ²​`Worldwide Gross (USD)`

INTERPRETATION This query identifies big-budget movies that failed commercially at the global box office. These films likely suffered from poor audience reception, marketing failures, critical backlash, or unfavorable release timing. They represent high-risk investments that didn’t yield the expected returns, offering valuable lessons on the unpredictability of box office performance.

#3 Which movies have the highest "legs" (i.e., ratio of domestic gross to opening weekend gross)?
# Create legs column
movies <- movies %>%
  mutate(Legs = `Domestic Gross (USD)` / `Opening Weekend (USD)`)

movies %>%
  filter(!is.na(Legs), is.finite(Legs)) %>%
  arrange(desc(Legs)) %>%
  select(`Movie Name`, `Domestic Gross (USD)`, `Opening Weekend (USD)`, Legs) %>%
  head(10)
## # A tibble: 10 × 4
##    `Movie Name`              `Domestic Gross (USD)` Opening Weekend (USD…¹  Legs
##    <chr>                                      <dbl>                  <dbl> <dbl>
##  1 Mr. Holland’s Opus                      82582604                  14466 5709.
##  2 After Hours                            193298906                  45435 4254.
##  3 Play it to the Bone                      8427204                   3366 2504.
##  4 The Rocky Horror Picture…               40564000                  21245 1909.
##  5 Frozen                                 400953009                 243390 1647.
##  6 Paranormal Activity                    107918810                  77873 1386.
##  7 Lone Survivor                          125095601                  90872 1377.
##  8 Hostiles                                29819114                  22849 1305.
##  9 Beauty and the Beast                   206333165                 162146 1273.
## 10 A Man Called Otto                       64267657                  56257 1142.
## # ℹ abbreviated name: ¹​`Opening Weekend (USD)`

INTERPRETATION A high “legs” ratio indicates that a movie had strong staying power, continuing to earn well after its opening weekend, often driven by positive word-of-mouth, critical acclaim, or repeat viewership. Movies with high legs typically start with modest openings but gain momentum over time, signaling sustained audience engagement and interest. In contrast, movies with low legs tend to generate most of their earnings in the first weekend, with a sharp decline afterward. This metric is valuable for distributors and marketers, as it helps evaluate the long-term performance of films and the effectiveness of post-release buzz.

#4 Which 10 movies had the lowest ROI?
# Step 1: Create ROI column
movies <- movies %>%
  mutate(ROI = `Worldwide Gross (USD)` / `Production Budget (USD)`)

# Step 2: Now find 10 movies with the lowest ROI
movies %>%
  filter(!is.na(ROI), is.finite(ROI)) %>%
  arrange(ROI) %>%
  select(`Movie Name`, `Production Budget (USD)`, `Worldwide Gross (USD)`, ROI) %>%
  head(10)
## # A tibble: 10 × 4
##    `Movie Name`              Production Budget (U…¹ Worldwide Gross (USD…²   ROI
##    <chr>                                      <dbl>                  <dbl> <dbl>
##  1 Mission: Impossible—The …              400000000                      0     0
##  2 Beverly Hills Cop: Axel F              150000000                      0     0
##  3 6 Underground                          150000000                      0     0
##  4 Desert Warrior                         140000000                      0     0
##  5 Outlaw King                            120000000                      0     0
##  6 Emancipation                           120000000                      0     0
##  7 Triple Frontier                        115000000                      0     0
##  8 Artemis Fowl                           100000000                      0     0
##  9 Bright                                  90000000                      0     0
## 10 Army of the Dead                        90000000                      0     0
## # ℹ abbreviated names: ¹​`Production Budget (USD)`, ²​`Worldwide Gross (USD)`

###INTERPRETATION These are the least financially efficient movies, earning the lowest returns compared to their production budgets. Many of these films likely had high production costs but underperformed at the box office, possibly due to poor marketing, weak scripts, competition, or negative reviews. A low ROI signals significant financial risk and serves as an important red flag for investors and studios aiming to manage budgets effectively. Analyzing these films can provide insights into the factors that lead to box office underperformance, even with large investments.

#level5
# Simple Feature Engineering
#1 Can we create a new column that shows ROI (Return on Investment)?
movies <- movies %>%
  mutate(ROI = `Worldwide Gross (USD)` / `Production Budget (USD)`)
#2 Can we label each movie as “Hit” or “Flop” based on ROI?
movies <- movies %>%
  mutate(Status = ifelse(ROI > 2, "Hit", "Flop"))
#3 Can we make a new column that shows how long the movie is: short, medium, or long?
movies <- movies %>%
  mutate(Movie_Length = case_when(
    `Running Time (minutes)` < 90 ~ "Short",
    `Running Time (minutes)` <= 120 ~ "Medium",
    `Running Time (minutes)` > 120 ~ "Long",
    TRUE ~ NA_character_
  ))
print(movies)
## # A tibble: 6,569 × 35
##       id `Movie Name`                      `Release Date` Production Budget (U…¹
##    <dbl> <chr>                             <date>                          <dbl>
##  1     1 Star Wars Ep. VII: The Force Awa… 2015-12-16                  533200000
##  2     2 Avatar: The Way of Water          2022-12-09                  460000000
##  3     3 Indiana Jones and the Dial of De… 2023-06-28                  402300000
##  4     4 Avengers: Endgame                 2019-04-23                  400000000
##  5     5 Mission: Impossible—The Final Re… 2025-05-21                  400000000
##  6     6 Pirates of the Caribbean: On Str… 2011-05-20                  379000000
##  7     7 Avengers: Age of Ultron           2015-04-22                  365000000
##  8     8 Fast X                            2023-05-17                  340000000
##  9     9 Solo: A Star Wars Story           2018-05-23                  330400000
## 10    10 Avengers: Infinity War            2018-04-25                  300000000
## # ℹ 6,559 more rows
## # ℹ abbreviated name: ¹​`Production Budget (USD)`
## # ℹ 31 more variables: `Domestic Gross (USD)` <dbl>,
## #   `Worldwide Gross (USD)` <dbl>, `Domestic Box Office (USD)` <dbl>,
## #   `International Box Office (USD)` <dbl>, `Worldwide Box Office (USD)` <dbl>,
## #   `Est. Domestic DVD Sales (USD)` <dbl>,
## #   `Est. Domestic Blu-ray Sales (USD)` <dbl>, …
#4 Can we create a column that shows how many years ago the movie was released?
movies <- movies %>%
  mutate(Release_Year = year(as.Date(`Release Date`, format = "%Y-%m-%d")))

current_year <- year(Sys.Date())
movies <- movies %>%
  mutate(Years_Ago = current_year - Release_Year)
print(movies)
## # A tibble: 6,569 × 37
##       id `Movie Name`                      `Release Date` Production Budget (U…¹
##    <dbl> <chr>                             <date>                          <dbl>
##  1     1 Star Wars Ep. VII: The Force Awa… 2015-12-16                  533200000
##  2     2 Avatar: The Way of Water          2022-12-09                  460000000
##  3     3 Indiana Jones and the Dial of De… 2023-06-28                  402300000
##  4     4 Avengers: Endgame                 2019-04-23                  400000000
##  5     5 Mission: Impossible—The Final Re… 2025-05-21                  400000000
##  6     6 Pirates of the Caribbean: On Str… 2011-05-20                  379000000
##  7     7 Avengers: Age of Ultron           2015-04-22                  365000000
##  8     8 Fast X                            2023-05-17                  340000000
##  9     9 Solo: A Star Wars Story           2018-05-23                  330400000
## 10    10 Avengers: Infinity War            2018-04-25                  300000000
## # ℹ 6,559 more rows
## # ℹ abbreviated name: ¹​`Production Budget (USD)`
## # ℹ 33 more variables: `Domestic Gross (USD)` <dbl>,
## #   `Worldwide Gross (USD)` <dbl>, `Domestic Box Office (USD)` <dbl>,
## #   `International Box Office (USD)` <dbl>, `Worldwide Box Office (USD)` <dbl>,
## #   `Est. Domestic DVD Sales (USD)` <dbl>,
## #   `Est. Domestic Blu-ray Sales (USD)` <dbl>, …

###INTERPRETATION 1. ROI (Return on Investment): A new column has been created to calculate ROI, which is the ratio of worldwide gross to production budget. This provides a quick insight into a film’s financial performance, showing how much it earned compared to the cost of production.

###2. Movie Status (Hit/Flop): Based on ROI, movies are labeled as “Hit” if their ROI is greater than 2, meaning they earned at least double their investment. If the ROI is less than 2, they are labeled as “Flop”. This provides an easy-to-understand classification of movie performance.

###3. Movie Length Category: Movies are categorized into three groups based on their runtime: Short (<90 minutes), Medium (90–120 minutes), and Long (>120 minutes). This categorization can be useful in analyzing audience preferences, understanding performance based on runtime, and aiding in production planning and content strategy.

###4. Years Ago Released: A new column is added to show how many years ago each movie was released. This feature is useful for conducting time-based analysis, such as studying performance trends, comparing modern vs. older releases, and understanding the potential age-related impacts on earnings.

#visualization

#1 What is the correlation between Production Budget and Worldwide Gross?
ggplot(movies, aes(x = `Production Budget (USD)`, y = `Worldwide Gross (USD)`)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Budget vs Worldwide Gross", x = "Production Budget (USD)", y = "Worldwide Gross (USD)")
## `geom_smooth()` using formula = 'y ~ x'

Higher production budgets are generally associated with higher worldwide gross. However, the relationship is moderate, not extremely strong, because other factors (like marketing, star power, timing of release, genre) also significantly impact movie earnings.

#2 Create a bar plot showing average worldwide gross by Genre

movies %>%
  group_by(Genre) %>%
  summarise(avg_gross = mean(`Worldwide Gross (USD)`, na.rm = TRUE)) %>%
  arrange(desc(avg_gross)) %>%
  ggplot(aes(x = reorder(Genre, avg_gross), y = avg_gross)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  coord_flip() +
  labs(title = "Average Worldwide Gross by Genre", x = "Genre", y = "Average Worldwide Gross (USD)")

This bar plot illustrates the average worldwide gross earnings for movies across different genres. The bars are ordered from highest to lowest average gross, making it easy to identify which genres tend to perform better financially. The genre Adventure stands out with the highest average worldwide gross, followed closely by Action and Musical genres. These results suggest that movies categorized under Adventure and Action genres tend to attract larger audiences and generate more revenue globally, possibly due to their broad appeal, high entertainment value, and often significant investment in production and marketing. On the other hand, genres like Documentary, Multiple Genres, and Concert/Performance tend to have the lowest average worldwide gross, indicating a more niche audience or limited box office potential. The “NA” category at the top likely represents missing or unclassified genre data, which still shows some revenue but is not meaningful for genre-specific analysis. Overall, the plot highlights that genre plays a significant role in a movie’s average financial success worldwide.

#3 Plot a boxplot of Running Time across different Genres

ggplot(movies, aes(x = Genre, y = `Running Time (minutes)`)) +
  geom_boxplot(fill = "lightgreen") +
  coord_flip() +
  labs(title = "Running Time Distribution by Genre", x = "Genre", y = "Running Time (minutes)")
## Warning: Removed 817 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

This boxplot displays the distribution of running times (in minutes) across different movie genres. Each box represents the interquartile range (middle 50% of values) for that genre, with the horizontal line inside the box marking the median running time. From the plot, it is evident that genres like Action and Adventure tend to have longer running times, with their medians and upper ranges reaching close to or even above 120 minutes. In contrast, genres such as Reality, Documentary, and Concert/Performance show much shorter running times on average, often below 100 minutes. The plot also highlights a significant number of outliers across several genres, particularly noticeable for Adventure and Action genres, indicating that some movies within these genres are exceptionally long compared to others. The Musical and Multiple Genres categories also show a wide spread of running times. Overall, the plot suggests that movie length tends to vary notably by genre, with more action-oriented and adventure films generally being longer, possibly due to their complex storylines and elaborate production elements.

#4 Build a simple linear regression model predicting Worldwide Gross from Production Budget

# Simple Linear Regression
model_simple <- lm(`Worldwide Gross (USD)` ~ `Production Budget (USD)`, data = movies)
summary(model_simple)
## 
## Call:
## lm(formula = `Worldwide Gross (USD)` ~ `Production Budget (USD)`, 
##     data = movies)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.213e+09 -3.810e+07 -3.036e+06  1.315e+07  2.207e+09 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -6.157e+06  1.962e+06  -3.138  0.00171 ** 
## `Production Budget (USD)`  3.049e+00  3.512e-02  86.822  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 128300000 on 6567 degrees of freedom
## Multiple R-squared:  0.5344, Adjusted R-squared:  0.5343 
## F-statistic:  7538 on 1 and 6567 DF,  p-value: < 2.2e-16
#5 Create a multiple linear regression model using Production Budget, Running Time, and Opening Weekend

# Multiple Linear Regression
model_multiple <- lm(`Worldwide Gross (USD)` ~ `Production Budget (USD)` + `Running Time (minutes)` + `Opening Weekend (USD)`, data = movies)
summary(model_multiple)
## 
## Call:
## lm(formula = `Worldwide Gross (USD)` ~ `Production Budget (USD)` + 
##     `Running Time (minutes)` + `Opening Weekend (USD)`, data = movies)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -510031345  -37159559   -8263616   16225632 2209193662 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               -8.879e+07  8.397e+06 -10.574   <2e-16 ***
## `Production Budget (USD)`  8.585e-01  4.376e-02  19.619   <2e-16 ***
## `Running Time (minutes)`   7.896e+05  7.922e+04   9.966   <2e-16 ***
## `Opening Weekend (USD)`    6.127e+00  8.461e-02  72.418   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 98860000 on 4898 degrees of freedom
##   (1667 observations deleted due to missingness)
## Multiple R-squared:  0.7775, Adjusted R-squared:  0.7773 
## F-statistic:  5704 on 3 and 4898 DF,  p-value: < 2.2e-16
#6 Compare Domestic Share % across MPAA Ratings using a boxplot

ggplot(movies, aes(x = `MPAA Rating`, y = `Domestic Share Percentage`)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Domestic Share % by MPAA Rating", x = "MPAA Rating", y = "Domestic Share Percentage")
## Warning: Removed 722 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

The boxplot visually compares the distribution of “Domestic Share Percentage” across different “MPAA Ratings.” Each box represents the interquartile range (IQR) of the domestic share percentage for a specific MPAA rating, with the line inside the box indicating the median. The whiskers extend to 1.5 times the IQR, and any points beyond the whiskers are plotted as individual outliers.

Looking at the plot, we can observe that the median domestic share percentage appears to vary across the MPAA ratings. For instance, some ratings might exhibit a higher median and a tighter spread of domestic share percentages compared to others. The presence of outliers suggests movies with unusually high or low domestic share percentages within a particular rating category. To gain a deeper understanding, we would need to examine the specific median values, the spread (as indicated by the box size and whisker length), and the number and position of outliers for each MPAA rating. The warning message indicates that some data points were excluded from the plot because they fell outside the calculated range, which might be important to consider for a complete analysis.

#7 Find out which Genres have the highest median Running Time

movies %>%
  group_by(Genre) %>%
  summarise(median_runtime = median(`Running Time (minutes)`, na.rm = TRUE)) %>%
  arrange(desc(median_runtime))
## # A tibble: 15 × 2
##    Genre               median_runtime
##    <chr>                        <dbl>
##  1 Multiple Genres               136.
##  2 Western                       115 
##  3 Drama                         114 
##  4 Musical                       112 
##  5 <NA>                          112 
##  6 Action                        111 
##  7 Thriller/Suspense             108 
##  8 Romantic Comedy               103 
##  9 Adventure                      99 
## 10 Black Comedy                   99 
## 11 Comedy                         98 
## 12 Horror                         96 
## 13 Concert/Performance            94 
## 14 Documentary                    94 
## 15 Reality                        92
#8 Create a histogram of Worldwide Gross

ggplot(movies, aes(x = `Worldwide Gross (USD)`)) +
  geom_histogram(bins = 50, fill = "orange", color = "black") +
  labs(title = "Distribution of Worldwide Gross", x = "Worldwide Gross (USD)", y = "Count")

The histogram displays the distribution of “Worldwide Gross (USD)” for the movies in the dataset. The x-axis represents the worldwide gross in US dollars, while the y-axis shows the count or frequency of movies falling within each bin. The orange bars illustrate the number of movies within specific ranges of worldwide gross.

Observing the histogram, we can see that the distribution is heavily skewed to the right. A very large number of movies have a relatively low worldwide gross, as indicated by the tall bars on the left side of the plot near $0. As we move towards higher worldwide gross values, the height of the bars decreases rapidly, signifying that fewer and fewer movies achieve very high worldwide gross figures. This suggests that while many movies are produced, only a small fraction become major box office successes on a global scale. The long tail extending to the right indicates the presence of some movies with exceptionally high worldwide gross, stretching into the billions of dollars.

#9 Analyze the trend of average Worldwide Gross over years

movies %>%
  mutate(Release_Year = lubridate::year(as.Date(`Release Date`))) %>%
  group_by(Release_Year) %>%
  summarise(avg_gross = mean(`Worldwide Gross (USD)`, na.rm = TRUE)) %>%
  ggplot(aes(x = Release_Year, y = avg_gross)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  labs(title = "Average Worldwide Gross Over Years", x = "Release Year", y = "Average Worldwide Gross (USD)")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

The plot displays the “Average Worldwide Gross (USD)” of movies over different “Release Years.” The x-axis represents the release year, and the y-axis represents the average worldwide gross in US dollars. The blue line connects the average worldwide gross for each year, showing the trend over time, while the red points mark the average worldwide gross for each specific year.

Looking at the plot, we can observe the trend in the average worldwide gross of movies as the release year changes. There appear to be fluctuations and potential peaks and troughs in the average gross over the years. For example, we can see periods where the average worldwide gross was relatively high and other periods where it was lower. The sharp peaks indicate years with significantly higher average worldwide gross compared to the surrounding years. The warning messages suggest that some data points with missing values or values outside the plot’s scale were removed, which might influence the visual representation of the trend. To fully interpret the trend, we would need to examine the specific years corresponding to the peaks and troughs and consider any external factors that might have influenced the average worldwide gross during those periods.

#10 What is the relationship between Opening Weekend revenue and Total Domestic Gross

# Scatter plot with regression line
ggplot(movies, aes(x = `Opening Weekend (USD)`, y = `Domestic Gross (USD)`)) +
  geom_point(alpha = 0.6, color = "darkorange") +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Opening Weekend vs Total Domestic Gross", x = "Opening Weekend Revenue (USD)", y = "Total Domestic Gross (USD)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1173 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1173 rows containing missing values or values outside the scale range
## (`geom_point()`).

The scatter plot illustrates the relationship between “Opening Weekend Revenue (USD)” and “Total Domestic Gross (USD)” for the movies in the dataset. Each orange point represents a single movie, with its position on the x-axis indicating its opening weekend revenue and its position on the y-axis indicating its total domestic gross. The dark blue line represents a linear regression model fitted to the data, aiming to capture the general trend between these two variables.

Observing the plot, we can see a clear positive correlation between opening weekend revenue and total domestic gross. Generally, movies with higher opening weekend revenue tend to have a higher total domestic gross. The upward sloping regression line visually confirms this positive relationship. However, the scatter of the orange points around the blue line indicates that the relationship is not perfectly linear, meaning that opening weekend revenue is not the sole predictor of total domestic gross. Some movies with similar opening weekend revenues can have varying total domestic grosses. The warning messages indicate that a significant number of rows were removed due to missing or out-of-scale values, which might affect the representativeness of the plotted relationship. Despite this, the overall trend suggests that a strong opening weekend is often a good indicator of a movie’s potential for a higher total domestic gross.

#11 What is the trend of Production Budget over years

movies %>%
  mutate(Release_Year = lubridate::year(as.Date(`Release Date`))) %>%
  group_by(Release_Year) %>%
  summarise(avg_budget = mean(`Production Budget (USD)`, na.rm = TRUE)) %>%
  ggplot(aes(x = Release_Year, y = avg_budget)) +
  geom_line(color = "purple") +
  geom_point(color = "black") +
  labs(title = "Trend of Average Production Budget Over Years", x = "Release Year", y = "Average Production Budget (USD)")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

The plot illustrates the “Trend of Average Production Budget (USD) Over Years.” The x-axis represents the “Release Year,” and the y-axis represents the “Average Production Budget (USD).” The purple line connects the average production budget for each year, showing how it has changed over time, while the black points mark the average production budget for each specific year.

Looking at the plot, we can observe the evolution of the average movie production budget across different release years. The trend shows periods of relatively stable average budgets, as well as periods of increase and decrease. Notably, there appears to be a significant peak in the average production budget around the year 2010. This suggests that movies released around that time had considerably higher average production budgets compared to other periods in this dataset. Following this peak, the average production budget seems to have decreased. The warning messages indicate that some data points with missing or out-of-scale values were removed, which might slightly affect the visual representation of the trend. Overall, the plot provides a historical perspective on how the average financial investment in movie production has varied over the years.

#12 Which Production Companies have the highest average Worldwide Gross

movies %>%
  filter(!is.na(`Production/Financing Companies`)) %>%
  group_by(`Production/Financing Companies`) %>%
  summarise(avg_worldwide_gross = mean(`Worldwide Gross (USD)`, na.rm = TRUE)) %>%
  arrange(desc(avg_worldwide_gross)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(`Production/Financing Companies`, avg_worldwide_gross), y = avg_worldwide_gross)) +
  geom_col(fill = "darkgreen") +
  coord_flip() +
  labs(title = "Top 10 Production Companies by Average Worldwide Gross", x = "Production Company", y = "Average Worldwide Gross (USD)")