##1 What are the column names and data types in the dataset
# Load the dataset
library(readr)
Top_Movies_Cleaned_Data_ <- read_csv("C:/Users/adity/Desktop/CA 3/Top Movies (Cleaned Data).csv")
## Rows: 6569 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): Movie Name, MPAA Rating, Franchise, Keywords, Source, Genre, Prod...
## dbl (14): Production Budget (USD), Domestic Gross (USD), Worldwide Gross (U...
## num (1): id
## date (2): Release Date, Video Release
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Top_Movies_Cleaned_Data_)
movies=Top_Movies_Cleaned_Data_
summary(movies)
## id Movie Name Release Date Production Budget (USD)
## Min. : 1 Length:6569 Min. :1969-04-01 Min. : 86
## 1st Qu.:1643 Class :character 1st Qu.:2001-11-21 1st Qu.: 5000000
## Median :3285 Mode :character Median :2009-02-06 Median : 17000000
## Mean :3285 Mean :2008-06-26 Mean : 33019121
## 3rd Qu.:4927 3rd Qu.:2015-05-30 3rd Qu.: 40000000
## Max. :6569 Max. :2068-12-11 Max. :533200000
## NA's :149
## Domestic Gross (USD) Worldwide Gross (USD) Domestic Box Office (USD)
## Min. : 0 Min. :0.000e+00 Min. : 264
## 1st Qu.: 1083350 1st Qu.:3.771e+06 1st Qu.: 4237802
## Median : 16204793 Median :2.734e+07 Median : 21567853
## Mean : 42497640 Mean :9.452e+07 Mean : 47745339
## 3rd Qu.: 51872378 3rd Qu.:9.763e+07 3rd Qu.: 58046314
## Max. :936662225 Max. :2.924e+09 Max. :936662225
## NA's :722
## International Box Office (USD) Worldwide Box Office (USD)
## Min. :1.700e+01 Min. :1.700e+01
## 1st Qu.:2.223e+06 1st Qu.:1.180e+07
## Median :1.881e+07 Median :4.629e+07
## Mean :6.774e+07 Mean :1.201e+08
## 3rd Qu.:7.000e+07 3rd Qu.:1.371e+08
## Max. :2.138e+09 Max. :2.924e+09
## NA's :1525 NA's :1525
## Est. Domestic DVD Sales (USD) Est. Domestic Blu-ray Sales (USD)
## Min. : 3060 Min. : 724
## 1st Qu.: 1920015 1st Qu.: 787427
## Median : 7271108 Median : 2379845
## Mean : 16890142 Mean : 7812575
## 3rd Qu.: 19397418 3rd Qu.: 7657140
## Max. :370026155 Max. :222484647
## NA's :4003 NA's :4413
## Total Est. Domestic Video Sales (USD) Opening Weekend (USD) Legs
## Min. : 724 Min. : 24 Min. : 1.000
## 1st Qu.: 2431747 1st Qu.: 229096 1st Qu.: 2.530
## Median : 9488031 Median : 6256853 Median : 3.340
## Mean : 21885097 Mean : 13553726 Mean : 5.779
## 3rd Qu.: 25161794 3rd Qu.: 16338316 3rd Qu.: 5.260
## Max. :432535014 Max. :357115007 Max. :998.330
## NA's :3819 NA's :1173 NA's :1008
## Infl. Adj. Dom. BO (USD) Video Release MPAA Rating
## Min. :2.510e+02 Min. :1980-03-04 Length:6569
## 1st Qu.:6.529e+06 1st Qu.:2005-05-10 Class :character
## Median :3.630e+07 Median :2011-03-01 Mode :character
## Mean :8.005e+07 Mean :2010-06-30
## 3rd Qu.:1.007e+08 3rd Qu.:2015-12-14
## Max. :1.771e+09 Max. :2025-01-21
## NA's :931 NA's :1231
## Running Time (minutes) Franchise Keywords
## Min. : 9.0 Length:6569 Length:6569
## 1st Qu.: 95.0 Class :character Class :character
## Median :105.0 Mode :character Mode :character
## Mean :108.8
## 3rd Qu.:119.0
## Max. :262.0
## NA's :817
## Source Genre Production Method Creative Type
## Length:6569 Length:6569 Length:6569 Length:6569
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Production/Financing Companies Production Countries Languages
## Length:6569 Length:6569 Length:6569
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Domestic Releases International Releases Theater counts
## Length:6569 Length:6569 Length:6569
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Domestic Share Percentage Movie URL
## Min. : 0.00 Length:6569
## 1st Qu.: 39.60 Class :character
## Median : 59.80 Mode :character
## Mean : 62.51
## 3rd Qu.: 97.60
## Max. :100.00
## NA's :722
# Display column names and data types
glimpse(movies)
## Rows: 6,569
## Columns: 32
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,…
## $ `Movie Name` <chr> "Star Wars Ep. VII: The Force …
## $ `Release Date` <date> 2015-12-16, 2022-12-09, 2023-…
## $ `Production Budget (USD)` <dbl> 533200000, 460000000, 40230000…
## $ `Domestic Gross (USD)` <dbl> 936662225, 684075767, 17448046…
## $ `Worldwide Gross (USD)` <dbl> 2056046835, 2315589775, 383963…
## $ `Domestic Box Office (USD)` <dbl> 936662225, 684075767, 17448046…
## $ `International Box Office (USD)` <dbl> 1119384610, 1631514008, 209482…
## $ `Worldwide Box Office (USD)` <dbl> 2056046835, 2315589775, 383963…
## $ `Est. Domestic DVD Sales (USD)` <dbl> 39601100, 7771432, NA, 2396129…
## $ `Est. Domestic Blu-ray Sales (USD)` <dbl> 151559711, 11365092, NA, 83907…
## $ `Total Est. Domestic Video Sales (USD)` <dbl> 191160811, 19136524, NA, 10786…
## $ `Opening Weekend (USD)` <dbl> 247966675, 134100226, 60368101…
## $ Legs <dbl> 3.78, 5.10, 2.89, 2.40, NA, 2.…
## $ `Infl. Adj. Dom. BO (USD)` <dbl> 1191447782, 693596368, 1744804…
## $ `Video Release` <date> 2016-04-01, 2023-03-28, 2023-…
## $ `MPAA Rating` <chr> "PG-13", "PG-13", "PG-13", "PG…
## $ `Running Time (minutes)` <dbl> 136, 190, 142, 181, NA, 136, 1…
## $ Franchise <chr> "Star Wars", "Avatar", "Indian…
## $ Keywords <chr> "Space Opera,Good vs. Evil,Del…
## $ Source <chr> "Original Screenplay", "Origin…
## $ Genre <chr> "Adventure", "Action", "Advent…
## $ `Production Method` <chr> "Animation,Live Action", "Anim…
## $ `Creative Type` <chr> "Science Fiction", "Science Fi…
## $ `Production/Financing Companies` <chr> "Lucasfilm,Bad Robot", "Lights…
## $ `Production Countries` <chr> "United States", "United State…
## $ Languages <chr> "English", "English", "English…
## $ `Domestic Releases` <chr> "December 18th, 2015 (IMAX) by…
## $ `International Releases` <chr> "December 16th, 2015 (Wide) (B…
## $ `Theater counts` <chr> "4,134 opening theaters/4,134 …
## $ `Domestic Share Percentage` <dbl> 45.6, 29.5, 45.4, 31.2, NA, 23…
## $ `Movie URL` <chr> "https://www.the-numbers.com/m…
#2 Are there any missing values? If yes, in which columns
# Count missing values in each column
colSums(is.na(movies))
## id Movie Name
## 0 0
## Release Date Production Budget (USD)
## 149 0
## Domestic Gross (USD) Worldwide Gross (USD)
## 0 0
## Domestic Box Office (USD) International Box Office (USD)
## 722 1525
## Worldwide Box Office (USD) Est. Domestic DVD Sales (USD)
## 1525 4003
## Est. Domestic Blu-ray Sales (USD) Total Est. Domestic Video Sales (USD)
## 4413 3819
## Opening Weekend (USD) Legs
## 1173 1008
## Infl. Adj. Dom. BO (USD) Video Release
## 931 1231
## MPAA Rating Running Time (minutes)
## 390 817
## Franchise Keywords
## 5146 884
## Source Genre
## 268 195
## Production Method Creative Type
## 203 300
## Production/Financing Companies Production Countries
## 2369 550
## Languages Domestic Releases
## 966 463
## International Releases Theater counts
## 3338 1175
## Domestic Share Percentage Movie URL
## 722 0
#4 What is the range (min and max) of movie running times
movies %>%
summarise(
min_runtime = min(`Running Time (minutes)`, na.rm = TRUE),
max_runtime = max(`Running Time (minutes)`, na.rm = TRUE)
)
## # A tibble: 1 × 2
## min_runtime max_runtime
## <dbl> <dbl>
## 1 9 262
###INTERPRETATION The code calculates the minimum and maximum movie runtimes from the dataset. 1. min() gives the shortest movie duration, while max() gives the longest. 2. na.rm = TRUE ensures missing values don’t affect the results. 3. The range helps us understand how movie lengths vary across different films.
#level2
#Data Extraction & Filtering
#1 Which are the top 10 movies by worldwide gross?
movies %>%
arrange(desc(`Worldwide Gross (USD)`)) %>%
select(`Movie Name`, `Worldwide Gross (USD)`) %>%
head(10)
## # A tibble: 10 × 2
## `Movie Name` `Worldwide Gross (USD)`
## <chr> <dbl>
## 1 Avatar 2923706026
## 2 Avengers: Endgame 2748242781
## 3 Avatar: The Way of Water 2315589775
## 4 Titanic 2223048786
## 5 Star Wars Ep. VII: The Force Awakens 2056046835
## 6 Avengers: Infinity War 2048359754
## 7 Spider-Man: No Way Home 1921206586
## 8 Inside Out 2 1698863816
## 9 Jurassic World 1671063641
## 10 The Lion King 1661454403
###INTERPRETATION This analysis identifies the top 10 highest-earning movies globally based on their total box office revenue. It highlights blockbuster hits that achieved massive commercial success, often belonging to popular franchises such as Marvel, Star Wars, or Avatar. These movies typically have large production and marketing budgets along with wide international releases. Analyzing these top performers can provide valuable insights into the types of content that tend to succeed worldwide.
#2 Which movies have a production budget greater than $200 million?
movies %>%
filter(`Production Budget (USD)` > 200000000) %>%
select(`Movie Name`, `Production Budget (USD)`)
## # A tibble: 57 × 2
## `Movie Name` `Production Budget (USD)`
## <chr> <dbl>
## 1 Star Wars Ep. VII: The Force Awakens 533200000
## 2 Avatar: The Way of Water 460000000
## 3 Indiana Jones and the Dial of Destiny 402300000
## 4 Avengers: Endgame 400000000
## 5 Mission: Impossible—The Final Reckoning 400000000
## 6 Pirates of the Caribbean: On Stranger Tides 379000000
## 7 Avengers: Age of Ultron 365000000
## 8 Fast X 340000000
## 9 Solo: A Star Wars Story 330400000
## 10 Avengers: Infinity War 300000000
## # ℹ 47 more rows
#3 Which R-rated movies grossed more than $500 million worldwide?
movies %>%
filter(`MPAA Rating` == "R", `Worldwide Gross (USD)` > 500000000) %>%
select(`Movie Name`, `MPAA Rating`, `Worldwide Gross (USD)`)
## # A tibble: 18 × 3
## `Movie Name` `MPAA Rating` Worldwide Gross (USD…¹
## <chr> <chr> <dbl>
## 1 Deadpool & Wolverine R 1338071348
## 2 The Matrix Reloaded R 738576929
## 3 The Revenant R 532938302
## 4 Logan R 614202315
## 5 Detective Chinatown 3 (唐人街探案3) R 625136211
## 6 Deadpool 2 R 786362370
## 7 Oppenheimer R 976518442
## 8 Terminator 2: Judgment Day R 515344899
## 9 The Hangover Part II R 586764305
## 10 Mei Ren Yu R 525018479
## 11 Deadpool R 781947691
## 12 American Sniper R 547626372
## 13 Joker R 1063574394
## 14 Ted R 556016627
## 15 Fifty Shades of Grey R 570794950
## 16 It R 701012746
## 17 The Passion of the Christ R 622313635
## 18 Demon Slayer: Kimetsu no Yaiba—The Movi… R 507119058
## # ℹ abbreviated name: ¹`Worldwide Gross (USD)`
###INTERPRETATION This query identifies commercially successful R-rated films that performed exceptionally well at the global box office. Although R-rated films usually have a restricted audience, the movies on this list have broken conventional barriers and attracted wide attention despite their mature content. Notable examples often include high-profile films like Deadpool, Joker, or Logan, which combined strong storytelling, franchise appeal, and global marketing. Analyzing these results provides insights into the earning potential of mature content, especially when supported by strong branding or significant cultural impact.
#4 Which movies were released before the year 2000?
# Convert Release Date to date format first
movies %>%
filter(`Release Date` < as.Date("2000-01-01")) %>%
select(`Movie Name`, `Release Date`)
## # A tibble: 1,260 × 2
## `Movie Name` `Release Date`
## <chr> <date>
## 1 Titanic 1997-12-18
## 2 Wild Wild West 1999-06-30
## 3 Waterworld 1995-07-28
## 4 Tarzan 1999-06-16
## 5 Armageddon 1998-07-01
## 6 Lethal Weapon 4 1998-07-10
## 7 The World is Not Enough 1999-11-19
## 8 Godzilla 1998-05-19
## 9 Batman & Robin 1997-06-20
## 10 The 13th Warrior 1999-08-27
## # ℹ 1,250 more rows
#level3
#Grouping & Summarization
#1 What is the average worldwide gross revenue by genre
movies %>%
group_by(Genre) %>%
summarise(avg_worldwide_gross = mean(`Worldwide Gross (USD)`, na.rm = TRUE)) %>%
arrange(desc(avg_worldwide_gross))
## # A tibble: 15 × 2
## Genre avg_worldwide_gross
## <chr> <dbl>
## 1 Adventure 250675646.
## 2 Action 199171742.
## 3 Musical 144800165.
## 4 Reality 79783439.
## 5 Romantic Comedy 74922995.
## 6 Thriller/Suspense 63648835.
## 7 Comedy 62566086.
## 8 Horror 57704157.
## 9 Western 53111536.
## 10 Drama 46402884.
## 11 Black Comedy 36974083.
## 12 Concert/Performance 31915124.
## 13 Multiple Genres 25134267
## 14 Documentary 10332054.
## 15 <NA> 2828742.
#2 Which franchises have the highest average worldwide gross?
movies %>%
filter(!is.na(Franchise)) %>%
group_by(Franchise) %>%
summarise(avg_worldwide_gross = mean(`Worldwide Gross (USD)`, na.rm = TRUE)) %>%
arrange(desc(avg_worldwide_gross))
## # A tibble: 670 × 2
## Franchise avg_worldwide_gross
## <chr> <dbl>
## 1 Avatar 2619647900.
## 2 Marvel Cinematic UniverseAvengers 1926754931.
## 3 Marvel Cinematic UniverseSpider-Man 1921206586
## 4 Barbie 1447585476
## 5 Frozen 1361338308
## 6 DeadpoolX-MenMarvel Cinematic UniverseWolverine 1338071348
## 7 Inside Out 1274586426.
## 8 Marvel Cinematic UniverseBlack Panther 1094071314
## 9 The Lion King 1092613690
## 10 Zootopia 1025485003
## # ℹ 660 more rows
#3 What is the average domestic gross by MPAA rating
movies %>%
group_by(`MPAA Rating`) %>%
summarise(avg_domestic_gross = mean(`Domestic Gross (USD)`, na.rm = TRUE)) %>%
arrange(desc(avg_domestic_gross))
## # A tibble: 28 × 2
## `MPAA Rating` avg_domestic_gross
## <chr> <dbl>
## 1 G 74594187.
## 2 PG 70291857.
## 3 PG-13 66063527.
## 4 Not RatedApproved pca #20241 46300000
## 5 NC-17Rated X 45000000
## 6 Not Ratedapproved 34562222
## 7 R 27905347.
## 8 M/PG 23706667.
## 9 NC-17for nudity and erotic sexuality throughout, and for … 20350754
## 10 NC-17for a scene of explicit sexual content.(Rating bulle… 9737892
## # ℹ 18 more rows
#level4
#Sorting & Ranking Data
#1 Which movies have the highest return on investment (ROI)?
# Create ROI column
movies <- movies %>%
mutate(ROI = `Worldwide Gross (USD)` / `Production Budget (USD)`)
# Sort by highest ROI
movies %>%
arrange(desc(ROI)) %>%
select(`Movie Name`, `Production Budget (USD)`, `Worldwide Gross (USD)`, ROI) %>%
head(10)
## # A tibble: 10 × 4
## `Movie Name` Production Budget (US…¹ Worldwide Gross (USD…² ROI
## <chr> <dbl> <dbl> <dbl>
## 1 Deep Throat 25000 45000000 1800
## 2 Paranormal Activity 450000 194183034 432.
## 3 The Gallows 100000 42964410 430.
## 4 The Blair Witch Project 600000 248300000 414.
## 5 Super Size Me 65000 22233808 342.
## 6 Bambi 858000 268000000 312.
## 7 Night of the Living Dead 114000 30087064 264.
## 8 Rocky 1000000 225000000 225
## 9 Halloween 325000 70274000 216.
## 10 The Brothers McMullen 50000 10426506 209.
## # ℹ abbreviated names: ¹`Production Budget (USD)`, ²`Worldwide Gross (USD)`
#3 Which movies have the highest "legs" (i.e., ratio of domestic gross to opening weekend gross)?
# Create legs column
movies <- movies %>%
mutate(Legs = `Domestic Gross (USD)` / `Opening Weekend (USD)`)
movies %>%
filter(!is.na(Legs), is.finite(Legs)) %>%
arrange(desc(Legs)) %>%
select(`Movie Name`, `Domestic Gross (USD)`, `Opening Weekend (USD)`, Legs) %>%
head(10)
## # A tibble: 10 × 4
## `Movie Name` `Domestic Gross (USD)` Opening Weekend (USD…¹ Legs
## <chr> <dbl> <dbl> <dbl>
## 1 Mr. Holland’s Opus 82582604 14466 5709.
## 2 After Hours 193298906 45435 4254.
## 3 Play it to the Bone 8427204 3366 2504.
## 4 The Rocky Horror Picture… 40564000 21245 1909.
## 5 Frozen 400953009 243390 1647.
## 6 Paranormal Activity 107918810 77873 1386.
## 7 Lone Survivor 125095601 90872 1377.
## 8 Hostiles 29819114 22849 1305.
## 9 Beauty and the Beast 206333165 162146 1273.
## 10 A Man Called Otto 64267657 56257 1142.
## # ℹ abbreviated name: ¹`Opening Weekend (USD)`
#4 Which 10 movies had the lowest ROI?
# Step 1: Create ROI column
movies <- movies %>%
mutate(ROI = `Worldwide Gross (USD)` / `Production Budget (USD)`)
# Step 2: Now find 10 movies with the lowest ROI
movies %>%
filter(!is.na(ROI), is.finite(ROI)) %>%
arrange(ROI) %>%
select(`Movie Name`, `Production Budget (USD)`, `Worldwide Gross (USD)`, ROI) %>%
head(10)
## # A tibble: 10 × 4
## `Movie Name` Production Budget (U…¹ Worldwide Gross (USD…² ROI
## <chr> <dbl> <dbl> <dbl>
## 1 Mission: Impossible—The … 400000000 0 0
## 2 Beverly Hills Cop: Axel F 150000000 0 0
## 3 6 Underground 150000000 0 0
## 4 Desert Warrior 140000000 0 0
## 5 Outlaw King 120000000 0 0
## 6 Emancipation 120000000 0 0
## 7 Triple Frontier 115000000 0 0
## 8 Artemis Fowl 100000000 0 0
## 9 Bright 90000000 0 0
## 10 Army of the Dead 90000000 0 0
## # ℹ abbreviated names: ¹`Production Budget (USD)`, ²`Worldwide Gross (USD)`
###INTERPRETATION These are the least financially efficient movies, earning the lowest returns compared to their production budgets. Many of these films likely had high production costs but underperformed at the box office, possibly due to poor marketing, weak scripts, competition, or negative reviews. A low ROI signals significant financial risk and serves as an important red flag for investors and studios aiming to manage budgets effectively. Analyzing these films can provide insights into the factors that lead to box office underperformance, even with large investments.
#level5
# Simple Feature Engineering
#1 Can we create a new column that shows ROI (Return on Investment)?
movies <- movies %>%
mutate(ROI = `Worldwide Gross (USD)` / `Production Budget (USD)`)
#2 Can we label each movie as “Hit” or “Flop” based on ROI?
movies <- movies %>%
mutate(Status = ifelse(ROI > 2, "Hit", "Flop"))
#3 Can we make a new column that shows how long the movie is: short, medium, or long?
movies <- movies %>%
mutate(Movie_Length = case_when(
`Running Time (minutes)` < 90 ~ "Short",
`Running Time (minutes)` <= 120 ~ "Medium",
`Running Time (minutes)` > 120 ~ "Long",
TRUE ~ NA_character_
))
print(movies)
## # A tibble: 6,569 × 35
## id `Movie Name` `Release Date` Production Budget (U…¹
## <dbl> <chr> <date> <dbl>
## 1 1 Star Wars Ep. VII: The Force Awa… 2015-12-16 533200000
## 2 2 Avatar: The Way of Water 2022-12-09 460000000
## 3 3 Indiana Jones and the Dial of De… 2023-06-28 402300000
## 4 4 Avengers: Endgame 2019-04-23 400000000
## 5 5 Mission: Impossible—The Final Re… 2025-05-21 400000000
## 6 6 Pirates of the Caribbean: On Str… 2011-05-20 379000000
## 7 7 Avengers: Age of Ultron 2015-04-22 365000000
## 8 8 Fast X 2023-05-17 340000000
## 9 9 Solo: A Star Wars Story 2018-05-23 330400000
## 10 10 Avengers: Infinity War 2018-04-25 300000000
## # ℹ 6,559 more rows
## # ℹ abbreviated name: ¹`Production Budget (USD)`
## # ℹ 31 more variables: `Domestic Gross (USD)` <dbl>,
## # `Worldwide Gross (USD)` <dbl>, `Domestic Box Office (USD)` <dbl>,
## # `International Box Office (USD)` <dbl>, `Worldwide Box Office (USD)` <dbl>,
## # `Est. Domestic DVD Sales (USD)` <dbl>,
## # `Est. Domestic Blu-ray Sales (USD)` <dbl>, …
#4 Can we create a column that shows how many years ago the movie was released?
movies <- movies %>%
mutate(Release_Year = year(as.Date(`Release Date`, format = "%Y-%m-%d")))
current_year <- year(Sys.Date())
movies <- movies %>%
mutate(Years_Ago = current_year - Release_Year)
print(movies)
## # A tibble: 6,569 × 37
## id `Movie Name` `Release Date` Production Budget (U…¹
## <dbl> <chr> <date> <dbl>
## 1 1 Star Wars Ep. VII: The Force Awa… 2015-12-16 533200000
## 2 2 Avatar: The Way of Water 2022-12-09 460000000
## 3 3 Indiana Jones and the Dial of De… 2023-06-28 402300000
## 4 4 Avengers: Endgame 2019-04-23 400000000
## 5 5 Mission: Impossible—The Final Re… 2025-05-21 400000000
## 6 6 Pirates of the Caribbean: On Str… 2011-05-20 379000000
## 7 7 Avengers: Age of Ultron 2015-04-22 365000000
## 8 8 Fast X 2023-05-17 340000000
## 9 9 Solo: A Star Wars Story 2018-05-23 330400000
## 10 10 Avengers: Infinity War 2018-04-25 300000000
## # ℹ 6,559 more rows
## # ℹ abbreviated name: ¹`Production Budget (USD)`
## # ℹ 33 more variables: `Domestic Gross (USD)` <dbl>,
## # `Worldwide Gross (USD)` <dbl>, `Domestic Box Office (USD)` <dbl>,
## # `International Box Office (USD)` <dbl>, `Worldwide Box Office (USD)` <dbl>,
## # `Est. Domestic DVD Sales (USD)` <dbl>,
## # `Est. Domestic Blu-ray Sales (USD)` <dbl>, …
###INTERPRETATION 1. ROI (Return on Investment): A new column has been created to calculate ROI, which is the ratio of worldwide gross to production budget. This provides a quick insight into a film’s financial performance, showing how much it earned compared to the cost of production.
###2. Movie Status (Hit/Flop): Based on ROI, movies are labeled as “Hit” if their ROI is greater than 2, meaning they earned at least double their investment. If the ROI is less than 2, they are labeled as “Flop”. This provides an easy-to-understand classification of movie performance.
###3. Movie Length Category: Movies are categorized into three groups based on their runtime: Short (<90 minutes), Medium (90–120 minutes), and Long (>120 minutes). This categorization can be useful in analyzing audience preferences, understanding performance based on runtime, and aiding in production planning and content strategy.
###4. Years Ago Released: A new column is added to show how many years ago each movie was released. This feature is useful for conducting time-based analysis, such as studying performance trends, comparing modern vs. older releases, and understanding the potential age-related impacts on earnings.
#visualization
#1 What is the correlation between Production Budget and Worldwide Gross?
ggplot(movies, aes(x = `Production Budget (USD)`, y = `Worldwide Gross (USD)`)) +
geom_point(alpha = 0.6, color = "blue") +
geom_smooth(method = "lm", color = "red") +
labs(title = "Budget vs Worldwide Gross", x = "Production Budget (USD)", y = "Worldwide Gross (USD)")
## `geom_smooth()` using formula = 'y ~ x'
Higher production budgets are generally associated with higher
worldwide gross. However, the relationship is moderate, not
extremely strong, because other factors (like marketing, star power,
timing of release, genre) also significantly impact movie
earnings.
#2 Create a bar plot showing average worldwide gross by Genre
movies %>%
group_by(Genre) %>%
summarise(avg_gross = mean(`Worldwide Gross (USD)`, na.rm = TRUE)) %>%
arrange(desc(avg_gross)) %>%
ggplot(aes(x = reorder(Genre, avg_gross), y = avg_gross)) +
geom_bar(stat = "identity", fill = "skyblue") +
coord_flip() +
labs(title = "Average Worldwide Gross by Genre", x = "Genre", y = "Average Worldwide Gross (USD)")
This bar plot illustrates the average worldwide gross earnings for
movies across different genres. The bars are ordered from highest to
lowest average gross, making it easy to identify which genres tend to
perform better financially. The genre Adventure stands out with the
highest average worldwide gross, followed closely by Action and Musical
genres. These results suggest that movies categorized under Adventure
and Action genres tend to attract larger audiences and generate more
revenue globally, possibly due to their broad appeal, high entertainment
value, and often significant investment in production and marketing. On
the other hand, genres like Documentary, Multiple Genres, and
Concert/Performance tend to have the lowest average worldwide gross,
indicating a more niche audience or limited box office potential. The
“NA” category at the top likely represents missing or unclassified genre
data, which still shows some revenue but is not meaningful for
genre-specific analysis. Overall, the plot highlights that genre plays a
significant role in a movie’s average financial success
worldwide.
#3 Plot a boxplot of Running Time across different Genres
ggplot(movies, aes(x = Genre, y = `Running Time (minutes)`)) +
geom_boxplot(fill = "lightgreen") +
coord_flip() +
labs(title = "Running Time Distribution by Genre", x = "Genre", y = "Running Time (minutes)")
## Warning: Removed 817 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
This boxplot displays the distribution of running times (in minutes)
across different movie genres. Each box represents the interquartile
range (middle 50% of values) for that genre, with the horizontal line
inside the box marking the median running time. From the plot, it is
evident that genres like Action and Adventure tend to have longer
running times, with their medians and upper ranges reaching close to or
even above 120 minutes. In contrast, genres such as Reality,
Documentary, and Concert/Performance show much shorter running times on
average, often below 100 minutes. The plot also highlights a significant
number of outliers across several genres, particularly noticeable for
Adventure and Action genres, indicating that some movies within these
genres are exceptionally long compared to others. The Musical and
Multiple Genres categories also show a wide spread of running times.
Overall, the plot suggests that movie length tends to vary notably by
genre, with more action-oriented and adventure films generally being
longer, possibly due to their complex storylines and elaborate
production elements.
#4 Build a simple linear regression model predicting Worldwide Gross from Production Budget
# Simple Linear Regression
model_simple <- lm(`Worldwide Gross (USD)` ~ `Production Budget (USD)`, data = movies)
summary(model_simple)
##
## Call:
## lm(formula = `Worldwide Gross (USD)` ~ `Production Budget (USD)`,
## data = movies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.213e+09 -3.810e+07 -3.036e+06 1.315e+07 2.207e+09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.157e+06 1.962e+06 -3.138 0.00171 **
## `Production Budget (USD)` 3.049e+00 3.512e-02 86.822 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 128300000 on 6567 degrees of freedom
## Multiple R-squared: 0.5344, Adjusted R-squared: 0.5343
## F-statistic: 7538 on 1 and 6567 DF, p-value: < 2.2e-16
#5 Create a multiple linear regression model using Production Budget, Running Time, and Opening Weekend
# Multiple Linear Regression
model_multiple <- lm(`Worldwide Gross (USD)` ~ `Production Budget (USD)` + `Running Time (minutes)` + `Opening Weekend (USD)`, data = movies)
summary(model_multiple)
##
## Call:
## lm(formula = `Worldwide Gross (USD)` ~ `Production Budget (USD)` +
## `Running Time (minutes)` + `Opening Weekend (USD)`, data = movies)
##
## Residuals:
## Min 1Q Median 3Q Max
## -510031345 -37159559 -8263616 16225632 2209193662
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.879e+07 8.397e+06 -10.574 <2e-16 ***
## `Production Budget (USD)` 8.585e-01 4.376e-02 19.619 <2e-16 ***
## `Running Time (minutes)` 7.896e+05 7.922e+04 9.966 <2e-16 ***
## `Opening Weekend (USD)` 6.127e+00 8.461e-02 72.418 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 98860000 on 4898 degrees of freedom
## (1667 observations deleted due to missingness)
## Multiple R-squared: 0.7775, Adjusted R-squared: 0.7773
## F-statistic: 5704 on 3 and 4898 DF, p-value: < 2.2e-16
#6 Compare Domestic Share % across MPAA Ratings using a boxplot
ggplot(movies, aes(x = `MPAA Rating`, y = `Domestic Share Percentage`)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Domestic Share % by MPAA Rating", x = "MPAA Rating", y = "Domestic Share Percentage")
## Warning: Removed 722 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
The boxplot visually compares the distribution of “Domestic Share
Percentage” across different “MPAA Ratings.” Each box represents the
interquartile range (IQR) of the domestic share percentage for a
specific MPAA rating, with the line inside the box indicating the
median. The whiskers extend to 1.5 times the IQR, and any points beyond
the whiskers are plotted as individual outliers.
Looking at the plot, we can observe that the median domestic share percentage appears to vary across the MPAA ratings. For instance, some ratings might exhibit a higher median and a tighter spread of domestic share percentages compared to others. The presence of outliers suggests movies with unusually high or low domestic share percentages within a particular rating category. To gain a deeper understanding, we would need to examine the specific median values, the spread (as indicated by the box size and whisker length), and the number and position of outliers for each MPAA rating. The warning message indicates that some data points were excluded from the plot because they fell outside the calculated range, which might be important to consider for a complete analysis.
#7 Find out which Genres have the highest median Running Time
movies %>%
group_by(Genre) %>%
summarise(median_runtime = median(`Running Time (minutes)`, na.rm = TRUE)) %>%
arrange(desc(median_runtime))
## # A tibble: 15 × 2
## Genre median_runtime
## <chr> <dbl>
## 1 Multiple Genres 136.
## 2 Western 115
## 3 Drama 114
## 4 Musical 112
## 5 <NA> 112
## 6 Action 111
## 7 Thriller/Suspense 108
## 8 Romantic Comedy 103
## 9 Adventure 99
## 10 Black Comedy 99
## 11 Comedy 98
## 12 Horror 96
## 13 Concert/Performance 94
## 14 Documentary 94
## 15 Reality 92
#8 Create a histogram of Worldwide Gross
ggplot(movies, aes(x = `Worldwide Gross (USD)`)) +
geom_histogram(bins = 50, fill = "orange", color = "black") +
labs(title = "Distribution of Worldwide Gross", x = "Worldwide Gross (USD)", y = "Count")
The histogram displays the distribution of “Worldwide Gross (USD)”
for the movies in the dataset. The x-axis represents the worldwide gross
in US dollars, while the y-axis shows the count or frequency of movies
falling within each bin. The orange bars illustrate the number of movies
within specific ranges of worldwide gross.
Observing the histogram, we can see that the distribution is heavily skewed to the right. A very large number of movies have a relatively low worldwide gross, as indicated by the tall bars on the left side of the plot near $0. As we move towards higher worldwide gross values, the height of the bars decreases rapidly, signifying that fewer and fewer movies achieve very high worldwide gross figures. This suggests that while many movies are produced, only a small fraction become major box office successes on a global scale. The long tail extending to the right indicates the presence of some movies with exceptionally high worldwide gross, stretching into the billions of dollars.
#9 Analyze the trend of average Worldwide Gross over years
movies %>%
mutate(Release_Year = lubridate::year(as.Date(`Release Date`))) %>%
group_by(Release_Year) %>%
summarise(avg_gross = mean(`Worldwide Gross (USD)`, na.rm = TRUE)) %>%
ggplot(aes(x = Release_Year, y = avg_gross)) +
geom_line(color = "blue") +
geom_point(color = "red") +
labs(title = "Average Worldwide Gross Over Years", x = "Release Year", y = "Average Worldwide Gross (USD)")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
The plot displays the “Average Worldwide Gross (USD)” of movies over
different “Release Years.” The x-axis represents the release year, and
the y-axis represents the average worldwide gross in US dollars. The
blue line connects the average worldwide gross for each year, showing
the trend over time, while the red points mark the average worldwide
gross for each specific year.
Looking at the plot, we can observe the trend in the average worldwide gross of movies as the release year changes. There appear to be fluctuations and potential peaks and troughs in the average gross over the years. For example, we can see periods where the average worldwide gross was relatively high and other periods where it was lower. The sharp peaks indicate years with significantly higher average worldwide gross compared to the surrounding years. The warning messages suggest that some data points with missing values or values outside the plot’s scale were removed, which might influence the visual representation of the trend. To fully interpret the trend, we would need to examine the specific years corresponding to the peaks and troughs and consider any external factors that might have influenced the average worldwide gross during those periods.
#10 What is the relationship between Opening Weekend revenue and Total Domestic Gross
# Scatter plot with regression line
ggplot(movies, aes(x = `Opening Weekend (USD)`, y = `Domestic Gross (USD)`)) +
geom_point(alpha = 0.6, color = "darkorange") +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Opening Weekend vs Total Domestic Gross", x = "Opening Weekend Revenue (USD)", y = "Total Domestic Gross (USD)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1173 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1173 rows containing missing values or values outside the scale range
## (`geom_point()`).
The scatter plot illustrates the relationship between “Opening
Weekend Revenue (USD)” and “Total Domestic Gross (USD)” for the movies
in the dataset. Each orange point represents a single movie, with its
position on the x-axis indicating its opening weekend revenue and its
position on the y-axis indicating its total domestic gross. The dark
blue line represents a linear regression model fitted to the data,
aiming to capture the general trend between these two
variables.
Observing the plot, we can see a clear positive correlation between opening weekend revenue and total domestic gross. Generally, movies with higher opening weekend revenue tend to have a higher total domestic gross. The upward sloping regression line visually confirms this positive relationship. However, the scatter of the orange points around the blue line indicates that the relationship is not perfectly linear, meaning that opening weekend revenue is not the sole predictor of total domestic gross. Some movies with similar opening weekend revenues can have varying total domestic grosses. The warning messages indicate that a significant number of rows were removed due to missing or out-of-scale values, which might affect the representativeness of the plotted relationship. Despite this, the overall trend suggests that a strong opening weekend is often a good indicator of a movie’s potential for a higher total domestic gross.
#11 What is the trend of Production Budget over years
movies %>%
mutate(Release_Year = lubridate::year(as.Date(`Release Date`))) %>%
group_by(Release_Year) %>%
summarise(avg_budget = mean(`Production Budget (USD)`, na.rm = TRUE)) %>%
ggplot(aes(x = Release_Year, y = avg_budget)) +
geom_line(color = "purple") +
geom_point(color = "black") +
labs(title = "Trend of Average Production Budget Over Years", x = "Release Year", y = "Average Production Budget (USD)")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
The plot illustrates the “Trend of Average Production Budget (USD)
Over Years.” The x-axis represents the “Release Year,” and the y-axis
represents the “Average Production Budget (USD).” The purple line
connects the average production budget for each year, showing how it has
changed over time, while the black points mark the average production
budget for each specific year.
Looking at the plot, we can observe the evolution of the average movie production budget across different release years. The trend shows periods of relatively stable average budgets, as well as periods of increase and decrease. Notably, there appears to be a significant peak in the average production budget around the year 2010. This suggests that movies released around that time had considerably higher average production budgets compared to other periods in this dataset. Following this peak, the average production budget seems to have decreased. The warning messages indicate that some data points with missing or out-of-scale values were removed, which might slightly affect the visual representation of the trend. Overall, the plot provides a historical perspective on how the average financial investment in movie production has varied over the years.
#12 Which Production Companies have the highest average Worldwide Gross
movies %>%
filter(!is.na(`Production/Financing Companies`)) %>%
group_by(`Production/Financing Companies`) %>%
summarise(avg_worldwide_gross = mean(`Worldwide Gross (USD)`, na.rm = TRUE)) %>%
arrange(desc(avg_worldwide_gross)) %>%
head(10) %>%
ggplot(aes(x = reorder(`Production/Financing Companies`, avg_worldwide_gross), y = avg_worldwide_gross)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(title = "Top 10 Production Companies by Average Worldwide Gross", x = "Production Company", y = "Average Worldwide Gross (USD)")