Loading necessary R libraries for data manipulation, visualization and analysis.
library(tidyverse) # For data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2) # For creating plots
library(dplyr) # For data manipulation
library(skimr) # For data summarization
library(lubridate) # For date manipulation
library(jsonlite) # For handling JSON data
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(textdata) # For sentiment analysis
library(syuzhet) # For sentiment analysis
Reading the CSV file containing the movie data.
# Loading csv file
moviedb <- read.csv("D:\\Docs\\Project\\data analysis movie\\tmdb_5000_movies.csv")
View(moviedb)
Handling JSON data within the dataset.
# Apply from JSON to each element in the 'genres' column
moviedb$genres_parsed <- lapply(moviedb$genres, fromJSON)
# Extract genre names from the parsed JSON
moviedb$genre_names <- sapply(moviedb$genres_parsed, function(x) {
paste(x$name, collapse = ", ")
})
# Parse the JSON column
moviedb$parsed_keywords <- lapply(moviedb$keywords, fromJSON)
# Extract keywords from parsed JSON
moviedb$extracted_keywords <- sapply(moviedb$parsed_keywords, function(x) {
# Convert the list of keywords to a single string
paste(sapply(x$name, as.character), collapse = ", ")
})
# Parse the JSON column
moviedb$parsed_production_companies <- lapply(moviedb$production_companies, fromJSON)
# Extract production companies from parsed JSON
moviedb$extracted_production_companies <- sapply(moviedb$parsed_production_companies, function(x) {
# Convert the list of companies to a single string
paste(sapply(x$name, as.character), collapse = ", ")
})
# Parse the json column
moviedb$parsed_production_countries <- lapply(moviedb$production_countries,fromJSON)
# Extract production countries from parsed json
moviedb$extracted_production_countries <- sapply(moviedb$parsed_production_countries,function(x){
#convert the list of countries to single string
paste(sapply(x$name,as.character),collapse=",")
})
# Parse the json column
moviedb$parsed_spoken_languages <- lapply(moviedb$spoken_languages,fromJSON)
# Extract production countries from parsed json
moviedb$extracted_spoken_languages <- sapply(moviedb$parsed_spoken_languages,function(x){
#convert the list of countries to single string
paste(sapply(x$name,as.character),collapse=",")
})
Unnecessary columns are removed, and column names are updated. Missing runtime values are filled with the mean runtime. Duplicates are identified and removed to ensure data integrity.
# Removing unnecessary column
moviedb <- moviedb %>%
select(-c(genres,keywords,production_companies,production_countries,spoken_languages ,genres_parsed,
parsed_keywords,parsed_production_companies,parsed_production_countries,
parsed_spoken_languages))
# Renaming column name
moviedb <- moviedb %>%
rename(
genres= genre_names,
keywords= extracted_keywords,
production_companies= extracted_production_companies,
production_countries =extracted_production_countries,
spoken_languages =extracted_spoken_languages
)
# Checking for null values
sum(is.na(moviedb))
## [1] 2
colSums(is.na(moviedb))
## budget homepage id
## 0 0 0
## original_language original_title overview
## 0 0 0
## popularity release_date revenue
## 0 0 0
## runtime status tagline
## 2 0 0
## title vote_average vote_count
## 0 0 0
## genres keywords production_companies
## 0 0 0
## production_countries spoken_languages
## 0 0
#Handling Missing Values in Runtime
# Calculate mean runtime excluding NAs
mean_runtime <- mean(moviedb$runtime, na.rm = TRUE)
# Replace missing runtime values with the mean
moviedb$runtime[is.na(moviedb$runtime)] <- mean_runtime
# Confirming null values
colSums(is.na(moviedb))
## budget homepage id
## 0 0 0
## original_language original_title overview
## 0 0 0
## popularity release_date revenue
## 0 0 0
## runtime status tagline
## 0 0 0
## title vote_average vote_count
## 0 0 0
## genres keywords production_companies
## 0 0 0
## production_countries spoken_languages
## 0 0
# Checking for duplicate value
# Counting duplicate value
moviedb %>%
count(across(everything())) %>%
filter(n > 1)
## [1] budget homepage id
## [4] original_language original_title overview
## [7] popularity release_date revenue
## [10] runtime status tagline
## [13] title vote_average vote_count
## [16] genres keywords production_companies
## [19] production_countries spoken_languages n
## <0 rows> (or 0-length row.names)
# Removing duplicate
moviedb <- distinct(moviedb)
Inspecting the structure of the dataset, converting the release_date column to Date type, and confirming the updated structure.
# Checking structure of dataset
str(moviedb)
## 'data.frame': 4803 obs. of 20 variables:
## $ budget : int 237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
## $ homepage : chr "http://www.avatarmovie.com/" "http://disney.go.com/disneypictures/pirates/" "http://www.sonypictures.com/movies/spectre/" "http://www.thedarkknightrises.com/" ...
## $ id : int 19995 285 206647 49026 49529 559 38757 99861 767 209112 ...
## $ original_language : chr "en" "en" "en" "en" ...
## $ original_title : chr "Avatar" "Pirates of the Caribbean: At World's End" "Spectre" "The Dark Knight Rises" ...
## $ overview : chr "In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes tor"| __truncated__ "Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with W"| __truncated__ "A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles pol"| __truncated__ "Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protec"| __truncated__ ...
## $ popularity : num 150.4 139.1 107.4 112.3 43.9 ...
## $ release_date : chr "2009-12-10" "2007-05-19" "2015-10-26" "2012-07-16" ...
## $ revenue : num 2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
## $ runtime : num 162 169 148 165 132 139 100 141 153 151 ...
## $ status : chr "Released" "Released" "Released" "Released" ...
## $ tagline : chr "Enter the World of Pandora." "At the end of the world, the adventure begins." "A Plan No One Escapes" "The Legend Ends" ...
## $ title : chr "Avatar" "Pirates of the Caribbean: At World's End" "Spectre" "The Dark Knight Rises" ...
## $ vote_average : num 7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
## $ vote_count : int 11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
## $ genres : chr "Action, Adventure, Fantasy, Science Fiction" "Adventure, Fantasy, Action" "Action, Adventure, Crime" "Action, Crime, Drama, Thriller" ...
## $ keywords : chr "culture clash, future, space war, space colony, society, space travel, futuristic, romance, space, alien, tribe"| __truncated__ "ocean, drug abuse, exotic island, east india trading company, love of one's life, traitor, shipwreck, strong wo"| __truncated__ "spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom" "dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham city, vigilante"| __truncated__ ...
## $ production_companies: chr "Ingenious Film Partners, Twentieth Century Fox Film Corporation, Dune Entertainment, Lightstorm Entertainment" "Walt Disney Pictures, Jerry Bruckheimer Films, Second Mate Productions" "Columbia Pictures, Danjaq, B24" "Legendary Pictures, Warner Bros., DC Entertainment, Syncopy" ...
## $ production_countries: chr "United States of America,United Kingdom" "United States of America" "United Kingdom,United States of America" "United States of America" ...
## $ spoken_languages : chr "English,Español" "English" "Français,English,Español,Italiano,Deutsch" "English" ...
# Converting datatype of date as date
moviedb$release_date <- as.Date(moviedb$release_date)
# Confirming structure
str(moviedb)
## 'data.frame': 4803 obs. of 20 variables:
## $ budget : int 237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
## $ homepage : chr "http://www.avatarmovie.com/" "http://disney.go.com/disneypictures/pirates/" "http://www.sonypictures.com/movies/spectre/" "http://www.thedarkknightrises.com/" ...
## $ id : int 19995 285 206647 49026 49529 559 38757 99861 767 209112 ...
## $ original_language : chr "en" "en" "en" "en" ...
## $ original_title : chr "Avatar" "Pirates of the Caribbean: At World's End" "Spectre" "The Dark Knight Rises" ...
## $ overview : chr "In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes tor"| __truncated__ "Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with W"| __truncated__ "A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles pol"| __truncated__ "Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protec"| __truncated__ ...
## $ popularity : num 150.4 139.1 107.4 112.3 43.9 ...
## $ release_date : Date, format: "2009-12-10" "2007-05-19" ...
## $ revenue : num 2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
## $ runtime : num 162 169 148 165 132 139 100 141 153 151 ...
## $ status : chr "Released" "Released" "Released" "Released" ...
## $ tagline : chr "Enter the World of Pandora." "At the end of the world, the adventure begins." "A Plan No One Escapes" "The Legend Ends" ...
## $ title : chr "Avatar" "Pirates of the Caribbean: At World's End" "Spectre" "The Dark Knight Rises" ...
## $ vote_average : num 7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
## $ vote_count : int 11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
## $ genres : chr "Action, Adventure, Fantasy, Science Fiction" "Adventure, Fantasy, Action" "Action, Adventure, Crime" "Action, Crime, Drama, Thriller" ...
## $ keywords : chr "culture clash, future, space war, space colony, society, space travel, futuristic, romance, space, alien, tribe"| __truncated__ "ocean, drug abuse, exotic island, east india trading company, love of one's life, traitor, shipwreck, strong wo"| __truncated__ "spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom" "dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham city, vigilante"| __truncated__ ...
## $ production_companies: chr "Ingenious Film Partners, Twentieth Century Fox Film Corporation, Dune Entertainment, Lightstorm Entertainment" "Walt Disney Pictures, Jerry Bruckheimer Films, Second Mate Productions" "Columbia Pictures, Danjaq, B24" "Legendary Pictures, Warner Bros., DC Entertainment, Syncopy" ...
## $ production_countries: chr "United States of America,United Kingdom" "United States of America" "United Kingdom,United States of America" "United States of America" ...
## $ spoken_languages : chr "English,Español" "English" "Français,English,Español,Italiano,Deutsch" "English" ...
Reading the CSV file containing another movie data.
# Loading csv file
movie_cast <- read.csv("D:\\Docs\\Project\\data analysis movie\\tmdb_5000_credits.csv")
View(movie_cast)
The cast and crew columns in the movie_cast dataset are parsed from JSON format.
# Parse the json column
movie_cast$parsed_cast <- lapply(movie_cast$cast,fromJSON)
# Extract production countries from parsed json
movie_cast$extracted_cast <- sapply(movie_cast$parsed_cast,function(x){
#convert the list of countries to single string
paste(sapply(x$name,as.character),collapse=",")
})
# Parse the json column
movie_cast$parsed_crew <- lapply(movie_cast$crew,fromJSON)
# Extract production countries from parsed json
movie_cast$extracted_crew <- sapply(movie_cast$parsed_crew,function(x){
#convert the list of countries to single string
paste(sapply(x$name,as.character),collapse=",")
})
Unnecessary columns are removed from the movie_cast dataset, and the remaining columns are renamed for clarity. The dataset is then checked for null values and duplicates:
# Removing unnecessary column
movie_cast <- movie_cast %>%
select(-c(cast,crew,parsed_cast,parsed_crew))
# Renaming column name
movie_cast <- movie_cast %>%
rename(
id = movie_id,
cast = extracted_cast,
crew = extracted_crew
)
# Checking for null values
sum(is.na(movie_cast))
## [1] 0
colSums(is.na(movie_cast))
## id title cast crew
## 0 0 0 0
# Checking for duplicate value
movie_cast %>%
count(across(everything())) %>%
filter(n > 1)
## [1] id title cast crew n
## <0 rows> (or 0-length row.names)
# Removing duplicate value
movie_cast <- distinct(movie_cast)
# Checking structure of dataset
str(movie_cast)
## 'data.frame': 4803 obs. of 4 variables:
## $ id : int 19995 285 206647 49026 49529 559 38757 99861 767 209112 ...
## $ title: chr "Avatar" "Pirates of the Caribbean: At World's End" "Spectre" "The Dark Knight Rises" ...
## $ cast : chr "Sam Worthington,Zoe Saldana,Sigourney Weaver,Stephen Lang,Michelle Rodriguez,Giovanni Ribisi,Joel David Moore,C"| __truncated__ "Johnny Depp,Orlando Bloom,Keira Knightley,Stellan Skarsgård,Chow Yun-fat,Bill Nighy,Geoffrey Rush,Jack Davenpor"| __truncated__ "Daniel Craig,Christoph Waltz,Léa Seydoux,Ralph Fiennes,Monica Bellucci,Ben Whishaw,Naomie Harris,Dave Bautista,"| __truncated__ "Christian Bale,Michael Caine,Gary Oldman,Anne Hathaway,Tom Hardy,Marion Cotillard,Joseph Gordon-Levitt,Morgan F"| __truncated__ ...
## $ crew : chr "Stephen E. Rivkin,Rick Carter,Christopher Boyes,Christopher Boyes,Mali Finn,James Horner,James Cameron,James Ca"| __truncated__ "Dariusz Wolski,Gore Verbinski,Jerry Bruckheimer,Ted Elliott,Terry Rossio,Stephen E. Rivkin,Craig Wood,Hans Zimm"| __truncated__ "Thomas Newman,Sam Mendes,Anna Pinnock,John Logan,John Logan,Dennis Gassner,Ian Fleming,Lee Smith,Bill Bernstein"| __truncated__ "Hans Zimmer,Charles Roven,Christopher Nolan,Christopher Nolan,Christopher Nolan,Christopher Nolan,Jonathan Nola"| __truncated__ ...
The moviedb and movie_cast datasets are merged to create a comprehensive movie_analysis dataset and to provide a more complete view of the movies.
# Merging both dataset
movie_analysis <- merge(moviedb,movie_cast)
# View the merged dataset
View(movie_analysis)
# Checking the structure of the merged dataset
str(movie_analysis)
## 'data.frame': 4803 obs. of 22 variables:
## $ id : int 100 10003 100042 10008 10012 10013 10014 10016 10017 10022 ...
## $ title : chr "Lock, Stock and Two Smoking Barrels" "The Saint" "Dumb and Dumber To" "An American Haunting" ...
## $ budget : int 1350000 68000000 40000000 0 35000000 18000000 3000000 28000000 2700000 56000000 ...
## $ homepage : chr "http://www.universalstudiosentertainment.com/lock-stock-and-two-smoking-barrels/" "" "" "" ...
## $ original_language : chr "en" "en" "en" "en" ...
## $ original_title : chr "Lock, Stock and Two Smoking Barrels" "The Saint" "Dumb and Dumber To" "An American Haunting" ...
## $ overview : chr "A card sharp and his unwillingly-enlisted friends need to make a lot of cash quick after losing a sketchy poker"| __truncated__ "Ivan Tretiak, Russian Mafia boss who wants to create an oil crisis in Moscow and seize power as a result sends "| __truncated__ "20 years after the dimwits set out on their first adventure, they head out in search of one of their long lost "| __truncated__ "Based on the true events of the only case in US History where a spirit caused the death of a man." ...
## $ popularity : num 16.03 20.91 67.77 7.44 11.2 ...
## $ release_date : Date, format: "1998-03-05" "1997-04-03" ...
## $ revenue : num 3.90e+06 1.18e+08 1.70e+08 0.00 1.93e+07 ...
## $ runtime : num 105 116 110 83 97 103 87 98 93 95 ...
## $ status : chr "Released" "Released" "Released" "Released" ...
## $ tagline : chr "A Disgrace to Criminals Everywhere." "Never reveal your name. Never turn your back. Never surrender your heart." "The Second Parts Are More Fools" "Possession Knows No Bounds" ...
## $ vote_average : num 7.5 5.9 5.4 5.3 5.1 5.9 5.7 4.8 5.9 5.8 ...
## $ vote_count : int 1648 302 1127 142 167 136 361 296 108 842 ...
## $ genres : chr "Comedy, Crime" "Thriller, Action, Romance, Science Fiction, Adventure" "Comedy" "Horror, Thriller" ...
## $ keywords : chr "ambush, alcohol, shotgun, tea, joint, machismo, cocktail, rifle, marijuana, cockney accent, pot smoking, hatche"| __truncated__ "berlin, russia, gas, master thief, the saint" "friendship, sequel, road movie, buddy comedy" "witch, independent film, curse, suitor, family" ...
## $ production_companies: chr "Handmade Films Ltd., Summit Entertainment, PolyGram Filmed Entertainment, SKA Films, The Steve Tisch Company" "Paramount Pictures, Mace Neufeld Productions, Rysher Entertainment" "New Line Cinema, Universal Pictures, Conundrum Entertainment, Red Granite Pictures" "Media Pro Pictures, Allan Zeman Productions, After Dark Films, Remstar Productions, Sweetpea Entertainment, Red"| __truncated__ ...
## $ production_countries: chr "United Kingdom" "United States of America" "United States of America" "United Kingdom,United States of America" ...
## $ spoken_languages : chr "English" "English,Pусский" "English" "English" ...
## $ cast : chr "Jason Flemyng,Dexter Fletcher,Nick Moran,Jason Statham,Vinnie Jones,Sting,Steven Mackintosh,Nicholas Rowe,Lenny"| __truncated__ "Val Kilmer,Elisabeth Shue,Rade Serbedzija,Henry Goodman,Alun Armstrong,Michael Byrne,Yevgeni Lazarev,Irina Apek"| __truncated__ "Jim Carrey,Jeff Daniels,Rachel Melvin,Kathleen Turner,Brady Bluhm,Laurie Holden,Steve Tom,Rob Riggle,Don Lake,P"| __truncated__ "Donald Sutherland,Sissy Spacek,James D'Arcy,Matthew Marsh,Gaye Brown,Vernon Dobtcheff,Thom Fell,Shauna Shim,Zoe"| __truncated__ ...
## $ crew : chr "John Murphy,Guy Ritchie,Guy Ritchie,Guy Ritchie,Matthew Vaughn,David A. Hughes,Tim Maurice-Jones,Iain Andrews,E"| __truncated__ "Jonathan Hensleigh,Jonathan Hensleigh,Graeme Revell,Patsy Pollock,Elisabeth Leustig,Mace Neufeld,Terry Rawlings"| __truncated__ "Matthew F. Leonetti,Marc S. Fischer,Rick Montgomery,Bobby Farrelly,Bobby Farrelly,Bobby Farrelly,Peter Farrelly"| __truncated__ "Richard Comeau,Simon Franks,Zygi Kamasa,Adrian Biddle,Derek Marcil,David Worley,Curtis Lindersmith,Maxime Rémil"| __truncated__ ...
Checking missing values and addressing missing values through various imputation method to ensure the dataset is complete and ready for analysis.
# Checking for null value
sum(is.na(movie_analysis))
## [1] 1
# Counting the number of NA values in each column
colSums(is.na(movie_analysis))
## id title budget
## 0 0 0
## homepage original_language original_title
## 0 0 0
## overview popularity release_date
## 0 0 1
## revenue runtime status
## 0 0 0
## tagline vote_average vote_count
## 0 0 0
## genres keywords production_companies
## 0 0 0
## production_countries spoken_languages cast
## 0 0 0
## crew
## 0
# Imputation missing data
# Convert release_date to numeric (days since a reference date)
reference_date <- as.Date("2000-01-01")
numeric_dates <- as.numeric(movie_analysis$release_date - reference_date)
# Compute median date
median_date <- reference_date + median(numeric_dates, na.rm = TRUE)
# Replace missing dates with median date
movie_analysis$release_date[is.na(movie_analysis$release_date)] <- median_date
# Impute missing values in the 'budget' column with the median
movie_analysis$budget[is.na(movie_analysis$budget)] <- median(movie_analysis$budget, na.rm = TRUE)
# Impute missing values in the 'revenue' column with the median
movie_analysis$revenue[is.na(movie_analysis$revenue)] <- median(movie_analysis$revenue, na.rm = TRUE)
# Impute missing values in the 'popularity' column with the mean
movie_analysis$popularity[is.na(movie_analysis$popularity)] <- mean(movie_analysis$popularity, na.rm = TRUE)
A summary of the dataset and a check for remaining NA values are performed to ensure that all missing values have been addressed.
# Display summary of the dataset to confirm missing values are handled
summary(movie_analysis)
## id title budget homepage
## Min. : 5 Length:4803 Min. : 0 Length:4803
## 1st Qu.: 9014 Class :character 1st Qu.: 790000 Class :character
## Median : 14629 Mode :character Median : 15000000 Mode :character
## Mean : 57166 Mean : 29045040
## 3rd Qu.: 58611 3rd Qu.: 40000000
## Max. :459488 Max. :380000000
## original_language original_title overview popularity
## Length:4803 Length:4803 Length:4803 Min. : 0.000
## Class :character Class :character Class :character 1st Qu.: 4.668
## Mode :character Mode :character Mode :character Median : 12.922
## Mean : 21.492
## 3rd Qu.: 28.314
## Max. :875.581
## release_date revenue runtime status
## Min. :1916-09-04 Min. :0.000e+00 Min. : 0.0 Length:4803
## 1st Qu.:1999-07-14 1st Qu.:0.000e+00 1st Qu.: 94.0 Class :character
## Median :2005-10-03 Median :1.917e+07 Median :103.0 Mode :character
## Mean :2002-12-28 Mean :8.226e+07 Mean :106.9
## 3rd Qu.:2011-02-16 3rd Qu.:9.292e+07 3rd Qu.:117.5
## Max. :2017-02-03 Max. :2.788e+09 Max. :338.0
## tagline vote_average vote_count genres
## Length:4803 Min. : 0.000 Min. : 0.0 Length:4803
## Class :character 1st Qu.: 5.600 1st Qu.: 54.0 Class :character
## Mode :character Median : 6.200 Median : 235.0 Mode :character
## Mean : 6.092 Mean : 690.2
## 3rd Qu.: 6.800 3rd Qu.: 737.0
## Max. :10.000 Max. :13752.0
## keywords production_companies production_countries
## Length:4803 Length:4803 Length:4803
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## spoken_languages cast crew
## Length:4803 Length:4803 Length:4803
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
# Confirm that there are no remaining NA values
colSums(is.na(movie_analysis))
## id title budget
## 0 0 0
## homepage original_language original_title
## 0 0 0
## overview popularity release_date
## 0 0 0
## revenue runtime status
## 0 0 0
## tagline vote_average vote_count
## 0 0 0
## genres keywords production_companies
## 0 0 0
## production_countries spoken_languages cast
## 0 0 0
## crew
## 0
Descriptive Statistics: Calculated average, highest, and lowest budgets and revenues, and visualized the top and bottom 10 movies in terms of both budget and revenue.
# Analyze the distribution of movie budgets
movie_analysis %>% filter(budget>0) %>% summarise(avg_budget=mean(budget),highest_budget=max(budget),lowest_budget=min(budget),
sd_budget=sd(budget))
## avg_budget highest_budget lowest_budget sd_budget
## 1 37042838 380000000 1 42646514
# 10 highest budget movie
movie_analysis %>% group_by(title) %>% summarise(highest_budget=max(budget)) %>%
arrange(desc(highest_budget)) %>% slice_head(n=10)
## # A tibble: 10 × 2
## title highest_budget
## <chr> <int>
## 1 Pirates of the Caribbean: On Stranger Tides 380000000
## 2 Pirates of the Caribbean: At World's End 300000000
## 3 Avengers: Age of Ultron 280000000
## 4 Superman Returns 270000000
## 5 John Carter 260000000
## 6 Tangled 260000000
## 7 Spider-Man 3 258000000
## 8 The Lone Ranger 255000000
## 9 Batman v Superman: Dawn of Justice 250000000
## 10 Captain America: Civil War 250000000
# Plot 10 highest budget movie
movie_analysis %>% group_by(title) %>% summarise(highest_budget=max(budget)) %>%
arrange(desc(highest_budget)) %>% slice_head(n=10) %>%
ggplot(aes(x = reorder(title, highest_budget), y = highest_budget))+
geom_col(fill='#1f77b4')+coord_flip()+
labs(x="Movie Title",y="Budget",title="Top 10 Highest Budget Movies")+
theme_classic()
# 10 lowest budget movie
movie_analysis %>%filter(budget>0) %>% group_by(title) %>% summarise(lowest_budget=min(budget)) %>%
arrange(lowest_budget) %>% slice_head(n=10)
## # A tibble: 10 × 2
## title lowest_budget
## <chr> <int>
## 1 Fear Clinic 1
## 2 Fetching Cody 1
## 3 Incident at Loch Ness 1
## 4 Me You and Five Bucks 1
## 5 Modern Times 1
## 6 Slam 1
## 7 The Algerian 1
## 8 To Be Frank, Sinatra at 100 2
## 9 Four Single Fathers 3
## 10 A Farewell to Arms 4
# Plot 10 lowest budget movie
movie_analysis %>%filter(budget>0) %>% group_by(title) %>% summarise(lowest_budget=min(budget)) %>%
arrange(lowest_budget) %>% slice_head(n=10) %>%
ggplot(aes(x = reorder(title,lowest_budget),y=lowest_budget))+
geom_col(fill='#ff7f0e')+coord_flip()+
labs(x="Movie Title",y="Budget",title="Top 10 Lowest Budget Movies")+
theme_classic()
# Examine the distribution of revenues
movie_analysis %>% filter(revenue>0) %>%
summarise(highest_revenue=max(revenue),lowest_revenue=min(revenue),average_revenue=mean(revenue))
## highest_revenue lowest_revenue average_revenue
## 1 2787965087 5 117031353
# 10 highest revenue generated movie
movie_analysis %>% group_by(title) %>% summarise(highest_revenue=max(revenue)) %>%
arrange(desc(highest_revenue)) %>% slice_head(n=10)
## # A tibble: 10 × 2
## title highest_revenue
## <chr> <dbl>
## 1 Avatar 2787965087
## 2 Titanic 1845034188
## 3 The Avengers 1519557910
## 4 Jurassic World 1513528810
## 5 Furious 7 1506249360
## 6 Avengers: Age of Ultron 1405403694
## 7 Frozen 1274219009
## 8 Iron Man 3 1215439994
## 9 Minions 1156730962
## 10 Captain America: Civil War 1153304495
# Plot 10 highest revenue generated movie
movie_analysis %>% group_by(title) %>% summarise(highest_revenue=max(revenue)) %>%
arrange(desc(highest_revenue)) %>% slice_head(n=10) %>%
ggplot(aes(x=reorder(title,highest_revenue),y=highest_revenue))+
geom_col(fill='#2ca02c')+coord_flip()+
labs(x="Movie Title",y="Revenue",title="Top 10 Highest Reveneu Generated Movies")+
theme_classic()
# 10 lowest revenue generated movie
movie_analysis %>% filter(revenue>0) %>% group_by(title)%>% summarise(lowest_revenue=min(revenue)) %>%
arrange(lowest_revenue) %>% slice_head(n=10)
## # A tibble: 10 × 2
## title lowest_revenue
## <chr> <dbl>
## 1 Split Second 5
## 2 Bran Nue Dae 7
## 3 Dreaming of Joseph Lees 7
## 4 Bats 10
## 5 F.I.S.T. 11
## 6 Of Horses and Men 11
## 7 Chasing Liberty 12
## 8 The Cookout 12
## 9 Angela's Ashes 13
## 10 The 51st State 14
# Plot 10 lowest revenue generated movie
movie_analysis %>% filter(revenue>0) %>% group_by(title)%>% summarise(lowest_revenue=min(revenue)) %>%
arrange(lowest_revenue) %>% slice_head(n=10) %>%
ggplot(aes(x=reorder(title,lowest_revenue),y=lowest_revenue))+
geom_col(fill='#d62728')+coord_flip()+
labs(x="Movie Title",y="Revenue",title="10 Lowest Reveneu Generated Movies")+
theme_classic()
Analyzed the distribution and visualized the top and bottom 10 most and least popular movies.
# Analyze Popularity
movie_analysis %>%filter(popularity>0) %>%
summarise(max_popularity=max(popularity),min_popularity=min(popularity),avg_popularity=mean(popularity))
## max_popularity min_popularity avg_popularity
## 1 875.5813 0.000372 21.49678
# 10 highest popular movies
movie_analysis %>% group_by(title) %>% summarise(max_popularity=max(popularity)) %>%
arrange(desc(max_popularity)) %>% slice_head(n=10)
## # A tibble: 10 × 2
## title max_popularity
## <chr> <dbl>
## 1 Minions 876.
## 2 Interstellar 724.
## 3 Deadpool 515.
## 4 Guardians of the Galaxy 481.
## 5 Mad Max: Fury Road 434.
## 6 Jurassic World 419.
## 7 Pirates of the Caribbean: The Curse of the Black Pearl 272.
## 8 Dawn of the Planet of the Apes 244.
## 9 The Hunger Games: Mockingjay - Part 1 206.
## 10 Big Hero 6 204.
# Plot 10 highest popular movies
movie_analysis %>% group_by(title) %>% summarise(max_popularity=max(popularity)) %>%
arrange(desc(max_popularity)) %>% slice_head(n=10) %>%
ggplot(aes(x=reorder(title,max_popularity),y=max_popularity))+
geom_col(fill='#9467bd')+coord_flip()+
labs(x="Movie Title",y="Popularity",title="Top 10 Highest Popular Movies")+
theme_classic()
# 10 lowest popular movies
movie_analysis %>% filter(revenue>0) %>% group_by(title)%>% summarise(min_popularity=min(revenue)) %>%
arrange(min_popularity) %>% slice_head(n=10)
## # A tibble: 10 × 2
## title min_popularity
## <chr> <dbl>
## 1 Split Second 5
## 2 Bran Nue Dae 7
## 3 Dreaming of Joseph Lees 7
## 4 Bats 10
## 5 F.I.S.T. 11
## 6 Of Horses and Men 11
## 7 Chasing Liberty 12
## 8 The Cookout 12
## 9 Angela's Ashes 13
## 10 The 51st State 14
# Plot 10 lowest popular movies
movie_analysis %>% filter(revenue>0) %>% group_by(title)%>% summarise(min_popularity=min(revenue)) %>%
arrange(min_popularity) %>% slice_head(n=10) %>%
ggplot(aes(x=reorder(title,min_popularity),y=min_popularity))+
geom_col(fill='#8c564b')+coord_flip()+
labs(x="Movie Title",y="Popularity",title="10 Lowest Popular Movies")+
theme_classic()
Explored movie runtime statistics and visualized revenue and average revenue over time.Also,Analyzed popularity by genre over time.
# Run time (in minute)
movie_analysis %>% filter(runtime>0) %>%
summarise(max_runtime=max(runtime),min_runtime=min(runtime),avg_runtime=mean(runtime))
## max_runtime min_runtime avg_runtime
## 1 338 14 107.6604
# No of movie by year
movie_analysis$year <- as.numeric(format(movie_analysis$release_date, "%Y"))
# Summary of Movie Data by Year
movie_analysis %>%
group_by(year) %>%
summarise(
total_movie = n(),
avg_revenue = mean(revenue),
avg_popularity = mean(popularity)
)
## # A tibble: 90 × 4
## year total_movie avg_revenue avg_popularity
## <dbl> <int> <dbl> <dbl>
## 1 1916 1 8394751 3.23
## 2 1925 1 22000000 0.786
## 3 1927 1 650422 32.4
## 4 1929 2 2179000 1.40
## 5 1930 1 8000000 8.48
## 6 1932 1 25 1.20
## 7 1933 2 2240500 1.28
## 8 1934 1 4500000 11.9
## 9 1935 1 3202000 3.90
## 10 1936 2 5618000 15.6
## # ℹ 80 more rows
# Plot total revenue over time
movie_analysis %>%
group_by(year) %>%
summarize(total_revenue = sum(revenue), .groups = 'drop') %>%
ggplot(aes(x = year, y = total_revenue)) +
geom_line(color = "blue") +
geom_point(color = "red", size = 1) +
labs(x = "Year", y = "Total Revenue",title = "Total Revenue by Year") +
theme_minimal()
# Plot total budget over time
movie_analysis %>%
group_by(year) %>%
summarize(total_budget = sum(budget), .groups = 'drop') %>%
ggplot(aes(x = year, y = total_budget)) +
geom_line(color = "blue") +
geom_point(color = "red", size = 1) +
labs(x = "Year", y = "Total Budget",title = "Total Budget by Year") +
theme_minimal()
# Plot average revenue over time
movie_analysis %>%
group_by(year) %>%
summarize(avg_revenue = mean(revenue), .groups = 'drop') %>%
ggplot(aes(x = year, y = avg_revenue)) +
geom_line(color = "green",size=1) +
geom_point(color = "orange", size = 1.5) +
labs(title = "Average Revenue by Year",
x = "Year",
y = "Average Revenue") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Plot popularity by genre over time
movie_analysis %>%separate_rows(genres, sep = ",\\s*") %>%
mutate(keywords = trimws(genres)) %>% filter(genres!="") %>%
group_by(year, genres) %>%
summarize(avg_popularity = mean(popularity)) %>%
ggplot(aes(x = year, y = avg_popularity, fill = genres)) +
geom_col()+
labs(title = "Average Popularity by Genre and Year", x = "Year", y = "Average Popularity") +
theme_minimal() +
theme(legend.position = "bottom")
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
Aggregated and visualized total revenue and budget by genre and assessed popularity.
# Analyze genres by revenue
movie_analysis %>%
# This assumes each keyword entry is a comma-separated list
separate_rows(genres,sep = ",\\s*") %>%
# Remove any leading or trailing spaces from keywords
mutate(keywords = trimws(genres)) %>%
# Summarize total revenue by each individual keyword
filter(revenue>0) %>% filter(genres!="") %>% group_by(genres) %>%
summarise(total_revenue=sum(revenue)) %>% arrange(desc(total_revenue))
## # A tibble: 19 × 2
## genres total_revenue
## <chr> <dbl>
## 1 Adventure 164841561551
## 2 Action 162959914515
## 3 Comedy 122760517608
## 4 Drama 119710983984
## 5 Thriller 103250426269
## 6 Family 83283238689
## 7 Fantasy 81982199925
## 8 Science Fiction 81564235745
## 9 Romance 53642137545
## 10 Animation 52812167865
## 11 Crime 46040860686
## 12 Mystery 27248722761
## 13 Horror 22599894663
## 14 War 12118445911
## 15 History 11332141732
## 16 Music 8964351078
## 17 Western 3792169111
## 18 Documentary 1082277678
## 19 Foreign 12398151
# Plot genres by revenue
movie_analysis %>%
separate_rows(genres,sep = ",\\s*") %>%
mutate(keywords = trimws(genres)) %>%
filter(revenue>0) %>% filter(genres!="") %>% group_by(genres) %>%
summarise(total_revenue=sum(revenue)) %>% arrange(desc(total_revenue)) %>%
ggplot(aes(x=reorder(genres,total_revenue),y=total_revenue))+
geom_col(fill = "#2ca02c") +
labs(x="Genres",y="Total Reveneu",title = "Genres by Revenue")+
theme(axis.text.x = element_text(angle=50,hjust = 1))
# Analyze genres by budget
movie_analysis %>%
# This assumes each keyword entry is a comma-separated list
separate_rows(genres,sep = ",\\s*") %>%
# Remove any leading or trailing spaces from keywords
mutate(keywords = trimws(genres)) %>%
filter(budget>0) %>% filter(genres!="") %>% group_by(genres) %>%
summarise(total_budget=sum(budget)) %>% arrange(desc(total_budget))
## # A tibble: 20 × 2
## genres total_budget
## <chr> <dbl>
## 1 Action 59443406599
## 2 Adventure 52398220463
## 3 Drama 47498112157
## 4 Comedy 43589711809
## 5 Thriller 40727495424
## 6 Science Fiction 27748069865
## 7 Fantasy 26949696595
## 8 Family 26019109625
## 9 Crime 19383466474
## 10 Romance 18158357743
## 11 Animation 15553021056
## 12 Mystery 10699081450
## 13 Horror 7563921991
## 14 History 5890982931
## 15 War 5080673879
## 16 Music 2942970295
## 17 Western 2220453601
## 18 Documentary 291861725
## 19 Foreign 22375007
## 20 TV Movie 9200000
# Plot genres by budget
movie_analysis %>%
separate_rows(genres,sep = ",\\s*") %>%
mutate(keywords = trimws(genres)) %>%
filter(budget>0) %>% filter(genres!="") %>% group_by(genres) %>%
summarise(total_budget=sum(budget)) %>% arrange(desc(total_budget)) %>%
ggplot(aes(x=reorder(genres,total_budget),y=total_budget))+
geom_col(fill="#d62728")+
labs(x="Genres", y="Total Budget", title="Genres by Budget") +
theme(axis.text.x = element_text(angle = 50,hjust = 1))
# Analyze genres by popularity
movie_analysis %>%
# This assumes each keyword entry is a comma-separated list
separate_rows(genres,sep = ",\\s*") %>%
# Remove any leading or trailing spaces from keywords
mutate(keywords = trimws(genres)) %>%
filter(popularity>0) %>% filter(genres!="") %>% group_by(genres) %>%
summarise(avg_popularity=mean(popularity)) %>% arrange(desc(avg_popularity))
## # A tibble: 20 × 2
## genres avg_popularity
## <chr> <dbl>
## 1 Adventure 39.3
## 2 Animation 38.8
## 3 Science Fiction 36.5
## 4 Fantasy 36.4
## 5 Action 30.9
## 6 Family 27.8
## 7 Mystery 24.6
## 8 Thriller 24.5
## 9 War 23.8
## 10 Crime 22.9
## 11 Horror 18.3
## 12 Western 18.2
## 13 Comedy 18.2
## 14 Drama 17.8
## 15 History 17.4
## 16 Romance 16.0
## 17 Music 13.1
## 18 TV Movie 6.39
## 19 Documentary 3.95
## 20 Foreign 0.687
# Plot genres by popularity
movie_analysis %>%
separate_rows(genres,sep = ",\\s*") %>%
mutate(keywords = trimws(genres)) %>%
filter(popularity>0) %>% filter(genres!="") %>% group_by(genres) %>%
summarise(avg_popularity=mean(popularity)) %>% arrange(desc(avg_popularity)) %>%
ggplot(aes(x=reorder(genres,avg_popularity),y=avg_popularity)) +
geom_col(fill="#1f77b4")+
labs(x="Genres",y="Average Popularity",title="Genres by Popularity")+
theme(axis.text.x = element_text(angle = 50,hjust = 1))
Analyzed and visualized the impact of keywords on revenue, budget, and popularity.
# Analyze keyword by revenue
movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
mutate(keywords = trimws(keywords)) %>%
group_by(keywords) %>%
summarise(total_revenue = sum(revenue)) %>%
arrange(desc(total_revenue))
## # A tibble: 9,815 × 2
## keywords total_revenue
## <chr> <dbl>
## 1 duringcreditsstinger 57827617707
## 2 3d 38821575006
## 3 aftercreditsstinger 37735182966
## 4 based on novel 28330574606
## 5 sequel 26416350086
## 6 superhero 26398102240
## 7 dystopia 22093114379
## 8 based on comic book 21464756515
## 9 marvel comic 19447081252
## 10 woman director 16140425506
## # ℹ 9,805 more rows
# Plot keyword by revenue
movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
mutate(keywords = trimws(keywords)) %>%
group_by(keywords) %>%
summarise(total_revenue = sum(revenue)) %>%
arrange(desc(total_revenue)) %>%slice_head(n=10) %>%
ggplot(aes(x=total_revenue,y=reorder(keywords,total_revenue)))+
geom_col(fill="#1f77b4")+
labs(x="Total Revenue",y="Keywords",title="Top 10 Keyword by Revenue")+
theme_minimal()
# Analyze keyword by budget
movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
mutate(keywords = trimws(keywords)) %>%
group_by(keywords) %>%
summarise(total_budget = sum(budget)) %>%
arrange(desc(total_budget))
## # A tibble: 9,815 × 2
## keywords total_budget
## <chr> <dbl>
## 1 duringcreditsstinger 16032534128
## 2 3d 10579000003
## 3 aftercreditsstinger 10577914650
## 4 based on novel 8929115000
## 5 dystopia 8149277523
## 6 superhero 7591902660
## 7 sequel 6977805523
## 8 based on comic book 6385100000
## 9 woman director 5549967548
## 10 alien 5221650531
## # ℹ 9,805 more rows
# Plot keyword by budget
movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
mutate(keywords = trimws(keywords)) %>%
group_by(keywords) %>%
summarise(total_budget = sum(budget)) %>%
arrange(desc(total_budget)) %>%slice_head(n=10) %>%
ggplot(aes(x=total_budget,y=reorder(keywords,total_budget)))+
geom_col(fill="#1f77b4")+
labs(x="Total Budget",y="Keywords",title="Top 10 Keyword by Budget")+
theme_minimal()
# Analyze keyword by popularity
movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
mutate(keywords = trimws(keywords)) %>%
group_by(keywords) %>%
summarise(avg_popularity= mean(popularity)) %>%
arrange(desc(avg_popularity))
## # A tibble: 9,815 × 2
## keywords avg_popularity
## <chr> <dbl>
## 1 evil mastermind 876.
## 2 astrophysics 724.
## 3 relativity 724.
## 4 zero gravity 724.
## 5 self healing 515.
## 6 dark future 434.
## 7 peak oil 434.
## 8 capuchin monkey 272.
## 9 gold coin 272.
## 10 pirate ship 272.
## # ℹ 9,805 more rows
# Plot keyword by popularity
movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
mutate(keywords = trimws(keywords)) %>%
group_by(keywords) %>%
summarise(avg_popularity= mean(popularity)) %>%
arrange(desc(avg_popularity)) %>%slice_head(n=10) %>%
ggplot(aes(x=avg_popularity,y=reorder(keywords,avg_popularity)))+
geom_col(fill="#1f77b4")+
labs(x="Average Popularity",y="Keywords",title="Top 10 Keyword by Popularity")+
theme_minimal()
Analyzed revenue, budget, and average rating by spoken languages, with visualizations for the top languages.
# Analyze Language by Revenue
movie_analysis %>% separate_rows(spoken_languages, sep = ",\\s*") %>%
mutate(keywords = trimws(spoken_languages)) %>%
group_by(spoken_languages) %>%
summarise(total_revenue=sum(revenue)) %>% arrange(desc(total_revenue))
## # A tibble: 62 × 2
## spoken_languages total_revenue
## <chr> <dbl>
## 1 English 389058696018
## 2 Français 43293893830
## 3 Español 36149323251
## 4 Deutsch 25020728621
## 5 Italiano 21495420851
## 6 Pусский 20791626225
## 7 普通话 13886998166
## 8 日本語 11583181763
## 9 العربية 9064378961
## 10 Latin 7184674842
## # ℹ 52 more rows
# Plot Top Language by Revenue
movie_analysis %>% separate_rows(spoken_languages, sep = ",\\s*") %>%
mutate(keywords = trimws(spoken_languages)) %>%
group_by(spoken_languages) %>%
summarise(total_revenue=sum(revenue)) %>% arrange(desc(total_revenue)) %>% slice_head(n=10) %>%
ggplot(aes(x = reorder(spoken_languages, total_revenue), y = total_revenue)) +
geom_col(fill = "steelblue") +
labs(x="Language",y="Total Revenue",title = "Top 10 Language by Revenue")+ theme_classic()
# Analyze Language by Budget
movie_analysis %>% separate_rows(spoken_languages, sep = ",\\s*") %>%
mutate(keywords = trimws(spoken_languages)) %>%
group_by(spoken_languages) %>%
summarise(total_budget=sum(budget)) %>% arrange(desc(total_budget))
## # A tibble: 62 × 2
## spoken_languages total_budget
## <chr> <dbl>
## 1 English 137375134938
## 2 Français 14426725944
## 3 Español 12089195167
## 4 Deutsch 8896962289
## 5 Pусский 7596831148
## 6 Italiano 6812077701
## 7 普通话 5215892068
## 8 日本語 4238472699
## 9 العربية 2933167686
## 10 Português 2392068060
## # ℹ 52 more rows
# Plot Language by Budget
movie_analysis %>% separate_rows(spoken_languages, sep = ",\\s*") %>%
mutate(keywords = trimws(spoken_languages)) %>%
group_by(spoken_languages) %>%
summarise(total_budget=sum(budget)) %>% arrange(desc(total_budget)) %>% slice_head(n=10) %>%
ggplot(aes(x = reorder(spoken_languages, total_budget), y = total_budget)) +
geom_col(fill = "steelblue")+
labs(x="Language",y="Total Budget",title = "Top 10 Language by Budget")+ theme_classic()
# Analyze Language average by Rating
movie_analysis %>% separate_rows(spoken_languages,sep = ",\\s*") %>%
mutate(keywords = trimws(spoken_languages)) %>%
filter(vote_average>0)%>% filter(keywords != "No Language") %>% group_by(spoken_languages) %>%
summarise(avg_rating=mean(vote_average)) %>% arrange(desc(avg_rating))
## # A tibble: 60 × 2
## spoken_languages avg_rating
## <chr> <dbl>
## 1 తెలుగు 7.5
## 2 Bahasa indonesia 7.4
## 3 فارسی 7.22
## 4 Gaeilge 7.22
## 5 Galego 7.2
## 6 Somali 7.2
## 7 Català 7
## 8 Wolof 7
## 9 اردو 6.9
## 10 български език 6.87
## # ℹ 50 more rows
# Plot Language average by Rating
movie_analysis %>% separate_rows(spoken_languages,sep = ",\\s*") %>%
mutate(keywords = trimws(spoken_languages)) %>%
filter(vote_average>0)%>% filter(keywords != "No Language") %>% group_by(spoken_languages) %>%
summarise(avg_rating=mean(vote_average)) %>% arrange(desc(avg_rating))%>% slice_head(n=10) %>%
ggplot(aes(x = reorder(spoken_languages, avg_rating), y = avg_rating)) +
geom_col(fill = "steelblue")+
labs(x="Language",y="Average Rating",title = "Top 10 Language by Rating")+ theme_classic()+
theme(axis.text.x = element_text(angle = 45,hjust = 1))
Calculated profit and analyzed the top and bottom 10 profitable and non-profitable movies.
# Creating a profit column
movie_analysis$profit <- movie_analysis$revenue - movie_analysis$budget
# Summary of the profit column
summary(movie_analysis$profit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -165710090 -799538 2511317 53215599 55312860 2550965087
# Analyze Top 10 Most Profitable Movies
movie_analysis %>% filter(profit>0) %>% group_by(title) %>%
summarise(profit=max(profit)) %>% arrange(desc(profit)) %>%
slice_head(n=10)
## # A tibble: 10 × 2
## title profit
## <chr> <dbl>
## 1 Avatar 2550965087
## 2 Titanic 1645034188
## 3 Jurassic World 1363528810
## 4 Furious 7 1316249360
## 5 The Avengers 1299557910
## 6 Avengers: Age of Ultron 1125403694
## 7 Frozen 1124219009
## 8 Minions 1082730962
## 9 The Lord of the Rings: The Return of the King 1024888979
## 10 Iron Man 3 1015439994
# Plot Top 10 Most Profitable Movies
movie_analysis %>% filter(profit>0) %>% group_by(title) %>%
summarise(profit=max(profit)) %>% arrange(desc(profit)) %>%
slice_head(n=10) %>%
ggplot(aes(x = reorder(title, profit), y = profit)) +
geom_col(fill = "darkgreen") +coord_flip()+
labs(x = "Movie Title", y = "Profit",title = "Top 10 Most Profitable Movies") +
theme_classic()
# Analyze Top 10 Least Profitable Movies:
movie_analysis %>% filter(profit>0) %>% group_by(title) %>%
summarise(profit=max(profit)) %>% arrange(profit) %>%
slice_head(n=10)
## # A tibble: 10 × 2
## title profit
## <chr> <dbl>
## 1 Of Horses and Men 1
## 2 The Prophecy 8
## 3 Bats 10
## 4 Tae Guk Gi: The Brotherhood of War 15
## 5 A Farewell to Arms 21
## 6 Lake of Fire 25
## 7 My Baby's Daddy 51
## 8 Rugrats in Paris: The Movie 73
## 9 When Did You Last See Your Father? 92
## 10 Khiladi 786 126
# Plot Top 10 Least Profitable Movies
movie_analysis %>% filter(profit>0) %>% group_by(title) %>%
summarise(profit=max(profit)) %>% arrange(profit) %>%
slice_head(n=10) %>%
ggplot(aes(x = reorder(title, profit), y = profit)) +
geom_col(fill = "darkred") +coord_flip()+
labs(x = "Movie Title", y = "Profit",title = "Top 10 Least Profitable Movies") +
theme_classic()
# Analyze Top 10 Flop Movies (Least Profitable or Non-Profitable)
movie_analysis %>% filter(profit<=0) %>% group_by(title) %>%
summarise(profit=max(profit)) %>% arrange(profit) %>%
slice_head(n=10)
## # A tibble: 10 × 2
## title profit
## <chr> <dbl>
## 1 The Lone Ranger -165710090
## 2 The Wolfman -150000000
## 3 The Alamo -119180039
## 4 Mars Needs Moms -111007242
## 5 Dragonball Evolution -100000000
## 6 The 13th Warrior -98301101
## 7 The Adventures of Pluto Nash -92896027
## 8 Metropolis -91969578
## 9 Mighty Joe Young -90000000
## 10 Volcano -90000000
# Plot Top 10 Flop Movies (Least Profitable or Non-Profitable)
movie_analysis %>% filter(profit<=0) %>% group_by(title) %>%
summarise(profit=max(profit)) %>% arrange(profit) %>%
slice_head(n=10) %>%
ggplot(aes(x = reorder(title, profit), y = profit)) +
geom_col(fill = "darkred") +coord_flip()+
labs(x = "Movie Title", y = "Profit",title = "Top 10 Flop Movies") +
theme_classic()
Analyzed and visualized the impact of production companies and countries on budget,revenue and popularity.
# Analyze production companies by budget
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
mutate(keywords = trimws(production_companies)) %>%
group_by(production_companies) %>% summarise(total_budget=sum(budget)) %>%
arrange(desc(total_budget))
## # A tibble: 5,026 × 2
## production_companies total_budget
## <chr> <dbl>
## 1 Warner Bros. 18327993949
## 2 Universal Pictures 13707866273
## 3 Paramount Pictures 13309613007
## 4 Columbia Pictures 11109274736
## 5 Twentieth Century Fox Film Corporation 10827167800
## 6 Walt Disney Pictures 9583280000
## 7 New Line Cinema 6095485000
## 8 Village Roadshow Pictures 5526800003
## 9 Relativity Media 5367146652
## 10 Touchstone Pictures 4657484523
## # ℹ 5,016 more rows
# Plot Top 10 production companies by budget
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
mutate(keywords = trimws(production_companies)) %>%
group_by(production_companies) %>% summarise(total_budget=sum(budget)) %>%
arrange(desc(total_budget)) %>% slice_head(n=10) %>%
ggplot(aes(x=reorder(production_companies,total_budget),y=total_budget))+
geom_col(fill="steelblue")+
labs(x="Production Companies",y="Total Budget",
title="Top 10 Production Companies by Budget")+ theme_classic()+
theme(axis.text.x = element_text(angle = 45,hjust=1))
# Analyze production companies by revenue
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
mutate(keywords = trimws(production_companies)) %>%
group_by(production_companies) %>% summarise(total_revenue=sum(revenue)) %>%
arrange(desc(total_revenue))
## # A tibble: 5,026 × 2
## production_companies total_revenue
## <chr> <dbl>
## 1 Warner Bros. 49155747874
## 2 Universal Pictures 42588465942
## 3 Paramount Pictures 40878523165
## 4 Twentieth Century Fox Film Corporation 39357151309
## 5 Walt Disney Pictures 28683256048
## 6 Columbia Pictures 28599634775
## 7 New Line Cinema 19444865804
## 8 Amblin Entertainment 16090835147
## 9 DreamWorks SKG 14384533626
## 10 Dune Entertainment 13797504190
## # ℹ 5,016 more rows
# Plot Top 10 production companies by revenue
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
mutate(keywords = trimws(production_companies)) %>%
group_by(production_companies) %>% summarise(total_revenue=sum(revenue)) %>%
arrange(desc(total_revenue)) %>% slice_head(n=10) %>%
ggplot(aes(x=reorder(production_companies,total_revenue),y=total_revenue))+
geom_col(fill = "steelblue")+
labs(x="Production Companies",y="Total Revenue",
title="Top 10 Production Companies by Revenue")+theme_classic()+
theme(axis.text.x = element_text(angle = 45,hjust=1))
# Analyze production companies by popularity
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
mutate(keywords = trimws(production_companies)) %>%
group_by(production_companies) %>% summarise(avg_popularity=mean(popularity)) %>%
arrange(desc(avg_popularity))
## # A tibble: 5,026 × 2
## production_companies avg_popularity
## <chr> <dbl>
## 1 The Donners' Company 515.
## 2 Bulletproof Cupid 481.
## 3 Kinberg Genre 327.
## 4 Illumination Entertainment 235.
## 5 Deluxe Digital Studios 198.
## 6 Vita-Ray Dutch Productions (III) 198.
## 7 Syncopy 192.
## 8 Lynda Obst Productions 192.
## 9 Cruel & Unusual Films 156.
## 10 Atman Entertainment 147.
## # ℹ 5,016 more rows
# Plot Top 10 production companies by popularity
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
mutate(keywords = trimws(production_companies)) %>%
group_by(production_companies) %>% summarise(avg_popularity=mean(popularity)) %>%
arrange(desc(avg_popularity)) %>% slice_head(n=10) %>%
ggplot(aes(x=reorder(production_companies,avg_popularity),y=avg_popularity))+
geom_col(fill = "steelblue")+
labs(x="Production Companies",y="Average Popularity",
title="Top 10 Production Companies by Popularity")+
theme_classic()+
theme(axis.text.x = element_text(angle=45,hjust = 1))
# Analyze production companies by Profit
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
mutate(keywords = trimws(production_companies)) %>%
group_by(production_companies) %>% summarise(total_profit=sum(profit)) %>%
arrange(desc(total_profit))
## # A tibble: 5,026 × 2
## production_companies total_profit
## <chr> <dbl>
## 1 Warner Bros. 30827753925
## 2 Universal Pictures 28880599669
## 3 Twentieth Century Fox Film Corporation 28529983509
## 4 Paramount Pictures 27568910158
## 5 Walt Disney Pictures 19099976048
## 6 Columbia Pictures 17490360039
## 7 New Line Cinema 13349380804
## 8 Amblin Entertainment 12587035147
## 9 DreamWorks SKG 9966033626
## 10 Dune Entertainment 9776504187
## # ℹ 5,016 more rows
# Plot production companies by Profit
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
mutate(keywords = trimws(production_companies)) %>%
group_by(production_companies) %>% summarise(total_profit=sum(profit)) %>%
arrange(desc(total_profit)) %>% slice_head(n=10) %>%
ggplot(aes(x = reorder(production_companies, total_profit), y = total_profit )) +
geom_bar(stat = "identity", fill = "steelblue") + coord_flip() +
labs( x = "Production Companies",y = "Total Profit",
title = "Top 10 Production Companies by Profit Gained") +
theme_minimal()
# Analyze production countries by Budget
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
mutate(keywords = trimws(production_countries)) %>%
group_by(production_countries) %>% summarise(total_budget=sum(budget)) %>%
arrange(desc(total_budget))
## # A tibble: 89 × 2
## production_countries total_budget
## <chr> <dbl>
## 1 United States of America 132492940438
## 2 United Kingdom 20266758528
## 3 Germany 12192997188
## 4 France 6783139181
## 5 Canada 6329203663
## 6 Australia 4424322261
## 7 China 2432225115
## 8 Italy 2206951350
## 9 New Zealand 2082500000
## 10 Japan 1734234237
## # ℹ 79 more rows
# Plot production countries by Budget
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
mutate(keywords = trimws(production_countries)) %>%
group_by(production_countries) %>% summarise(total_budget=sum(budget)) %>%
arrange(desc(total_budget)) %>% slice_head(n=10) %>%
ggplot(aes(x = reorder(production_countries, total_budget), y = total_budget)) +
geom_bar(stat = "identity", fill = "steelblue") + coord_flip() +
labs( x = "Production Country",y = "Total Revenue",
title = "Top 10 Production Countries by Total Budget") +
theme_minimal()
# Analyze Production Countries by Revenue
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
mutate(keywords = trimws(production_countries)) %>%
group_by(production_countries) %>% summarise(total_revenue=sum(revenue)) %>%
arrange(desc(total_revenue))
## # A tibble: 89 × 2
## production_countries total_revenue
## <chr> <dbl>
## 1 United States of America 380026795469
## 2 United Kingdom 60084988968
## 3 Germany 25480164585
## 4 Canada 14990315869
## 5 France 14187924310
## 6 Australia 9649710497
## 7 New Zealand 8325644337
## 8 China 6825078486
## 9 Japan 5195183028
## 10 Italy 3871679328
## # ℹ 79 more rows
# Plot Top 10 Production Countries by Revenue
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
mutate(keywords = trimws(production_countries)) %>%
group_by(production_countries) %>% summarise(total_revenue=sum(revenue)) %>%
arrange(desc(total_revenue)) %>% slice_head(n=10) %>%
ggplot(aes(x = reorder(production_countries, total_revenue), y = total_revenue)) +
geom_bar(stat = "identity", fill = "steelblue") +coord_flip() +
labs( x = "Production Country",y = "Total Revenue",
title = "Top 10 Production Countries by Total Revenue") +
theme_minimal()
# Analyze production countries by Popularity
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
mutate(keywords = trimws(production_countries)) %>%
group_by(production_countries) %>% summarise(avg_popularity=mean(popularity)) %>%
arrange(desc(avg_popularity))
## # A tibble: 89 × 2
## production_countries avg_popularity
## <chr> <dbl>
## 1 Dominica 146.
## 2 Jamaica 97.4
## 3 Malta 59.1
## 4 Philippines 54.8
## 5 Bahamas 46.9
## 6 Taiwan 42.2
## 7 New Zealand 42.1
## 8 Singapore 41.8
## 9 Indonesia 40.3
## 10 Czech Republic 31.8
## # ℹ 79 more rows
# Plot production countries by Budget
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
mutate(keywords = trimws(production_countries)) %>%
group_by(production_countries) %>% summarise(avg_popularity=mean(popularity)) %>%
arrange(desc(avg_popularity)) %>% slice_head(n=10) %>%
ggplot(aes(x = reorder(production_countries, avg_popularity), y = avg_popularity )) +
geom_bar(stat = "identity", fill = "steelblue") + coord_flip() +
labs( x = "Production Country",y = "Total Revenue",
title = "Top 10 Production Countries by Average Popularity") +
theme_minimal()
# Analyze production countries by Profit
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
mutate(keywords = trimws(production_countries)) %>%
group_by(production_countries) %>% summarise(total_profit=sum(profit)) %>%
arrange(desc(total_profit))
## # A tibble: 89 × 2
## production_countries total_profit
## <chr> <dbl>
## 1 United States of America 247533855031
## 2 United Kingdom 39818230440
## 3 Germany 13287167397
## 4 Canada 8661112206
## 5 France 7404785129
## 6 New Zealand 6243144337
## 7 Australia 5225388236
## 8 China 4392853371
## 9 Japan 3460948791
## 10 Italy 1664727978
## # ℹ 79 more rows
# Plot production countries by Profit
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
mutate(keywords = trimws(production_countries)) %>%
group_by(production_countries) %>% summarise(total_profit=sum(profit)) %>%
arrange(desc(total_profit)) %>% slice_head(n=10) %>%
ggplot(aes(x = reorder(production_countries, total_profit), y = total_profit )) +
geom_bar(stat = "identity", fill = "steelblue") + coord_flip() +
labs( x = "Production Country",y = "Total Profit",
title = "Top 10 Production Countries by Profit Gained") +
theme_minimal()
# Analyze movie by voting
movie_analysis %>%
arrange(desc(vote_average)) %>%
select(title, vote_average) %>% as_tibble()
## # A tibble: 4,803 × 2
## title vote_average
## <chr> <dbl>
## 1 Me You and Five Bucks 10
## 2 Little Big Top 10
## 3 Dancer, Texas Pop. 81 10
## 4 Stiff Upper Lips 10
## 5 Sardaarji 9.5
## 6 One Man's Hero 9.3
## 7 The Shawshank Redemption 8.5
## 8 There Goes My Baby 8.5
## 9 The Godfather 8.4
## 10 The Prisoner of Zenda 8.4
## # ℹ 4,793 more rows
Sentiment of Overviews and Taglines: Analyzed and visualized sentiment scores from overviews and taglines.
# Clean the 'overview' and 'tagline' columns
df <- movie_analysis %>%
mutate(
overview_clean = str_to_lower(overview),
overview_clean = str_replace_all(overview_clean, "[^[:alnum:] ]", ""),
tagline_clean = str_to_lower(tagline),
tagline_clean = str_replace_all(tagline_clean, "[^[:alnum:] ]", "")
)
# Perform sentiment analysis using syuzhet
df <- df %>%
mutate( sentiment_overview = get_sentiment(overview_clean, method = "syuzhet"),
sentiment_tagline = get_sentiment(tagline_clean, method = "syuzhet"))
# Plot sentiment distribution for 'overview'
ggplot(df, aes(x = sentiment_overview)) +
geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black") +
labs(title = "Sentiment Distribution of Film Overviews", x = "Sentiment Score", y = "Frequency")
# Plot sentiment distribution for 'tagline'
ggplot(df, aes(x = sentiment_tagline)) +
geom_histogram(binwidth = 0.1, fill = "lightgreen", color = "black") +
labs(title = "Sentiment Distribution of Film Taglines", x = "Sentiment Score", y = "Frequency")
# correlation with revenue
cor(df$sentiment_overview, df$revenue, use = "complete.obs")
## [1] -0.0007436787
cor(df$sentiment_tagline, df$revenue, use = "complete.obs")
## [1] 0.002233445
Sentiment Overview and Revenue: The correlation is approximately -0.00074. This is very close to zero, suggesting there is almost no linear relationship between the sentiment overview and revenue.
Sentiment Tagline and Revenue: The correlation is approximately 0.00223. This is also very close to zero, indicating an almost nonexistent linear relationship between the sentiment tagline and revenue.
Both sentiment metrics (overview and tagline) have extremely weak correlations with revenue.
# Plot Sentiment of Overviews vs. Revenue
ggplot(df, aes(x = sentiment_overview, y = revenue)) +
geom_point() +
labs(title = "Sentiment of Overviews vs. Revenue", x = "Sentiment Score (Overview)", y = "Revenue")
# Plot Sentiment of Taglines vs. Revenue
ggplot(df, aes(x = sentiment_tagline, y = revenue)) +
geom_point() +
labs(title = "Sentiment of Taglines vs. Revenue", x = "Sentiment Score (Tagline)", y = "Revenue")
Budget vs Revenue: Analyzed the correlation and relationship between budget and revenue.
# Budget vs Revenue
cor(movie_analysis$budget, movie_analysis$revenue, method = "pearson")
## [1] 0.7308229
# Plot Relationship between budget and revenue
plot(movie_analysis$budget, movie_analysis$revenue,
main = "Relationship between Budget and Revenue",
xlab = "Budget",
ylab = "Revenue",
pch = 19, col = "dodgerblue")
# Add a regression line to visualize the trend
abline(lm(revenue ~ budget, data = movie_analysis), col = "red", lwd = 2)
Rating vs Popularity/Revenue: Examined correlations between ratings, popularity, and revenue.
# Rating vs Popularity
cor(movie_analysis$vote_average, movie_analysis$popularity, method = "pearson")
## [1] 0.2739518
# Rating vs Revenue
cor(movie_analysis$vote_average, movie_analysis$revenue, method = "pearson")
## [1] 0.1971497
Success Metrics by Cast and Crew: Analyzed average revenue and ratings for cast and crew members.
# Separate actor names into individual rows
actors_long <- movie_analysis %>%
separate_rows(cast, sep = ", ") %>%
select(id, cast)
# Separate crew names into individual rows
crew_long <- movie_analysis %>%
separate_rows(crew, sep = ", ") %>%
select(id, crew)
# Aggregate success metrics by cast
cast_success <- actors_long %>%
inner_join(movie_analysis %>% select(id, revenue, vote_average), by = "id") %>%
group_by(cast) %>%
summarise(
mean_revenue = mean(revenue),
mean_rating = mean(vote_average),
n_movies = n()
)
# Aggregate success metrics by crew
crew_success <- crew_long %>%
inner_join(movie_analysis %>% select(id, revenue, vote_average), by = "id") %>%
group_by(crew) %>%
summarise(
mean_revenue = mean(revenue),
mean_rating = mean(vote_average),
n_movies = n()
)
Correlations with Number of Movies: Examined relationships between success metrics and the number of movies.
# Correlation for cast and crew
cor(cast_success$mean_revenue, cast_success$n_movies, method = "pearson")
## [1] -0.007276891
cor(crew_success$mean_rating, crew_success$n_movies, method = "pearson")
## [1] -0.03731809
Budget and Revenue: Performed a t-test to compare revenue between high and low-budget movies.
# Define the Hypotheses:
# Null Hypothesis (H0): There is no significant difference in revenue between movies with high and low budgets.
# Alternative Hypothesis (H1): Movies with higher budgets have significantly higher revenues.
# Define high and low budget groups using the median
median_budget <- median(df$budget)
high_budget <- df %>% filter(budget > median_budget) %>% pull(revenue)
low_budget <- df %>% filter(budget <= median_budget) %>% pull(revenue)
# Perform t-test
t.test(high_budget, low_budget)
##
## Welch Two Sample t-test
##
## data: high_budget and low_budget
## t = 29.888, df = 2513.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 125222700 142807530
## sample estimates:
## mean of x mean of y
## 152574628 18559513
Overall, high-budget movies have significantly higher average values compared to low-budget movies.
Genres and Revenue/Ratings: Planned ANOVA to test differences in revenue and ratings across genres.
#Define the Hypotheses:
# Null Hypothesis (H0): There is no significant difference in average ratings or revenue between different genres.
# Alternative Hypothesis (H1): There are significant differences in average ratings or revenue between genres.
# We'll split it into separate rows for each genre
df_genres <- movie_analysis %>%
separate_rows(genres, sep = '\\|')
# ANOVA for revenue
anova_revenue <- aov(revenue ~ genres, data = df_genres)
summary(anova_revenue)
## Df Sum Sq Mean Sq F value Pr(>F)
## genres 1174 6.469e+19 5.510e+16 3.19 <2e-16 ***
## Residuals 3628 6.267e+19 1.727e+16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA for ratings
anova_ratings <- aov(vote_average ~ genres, data = df_genres)
summary(anova_ratings)
## Df Sum Sq Mean Sq F value Pr(>F)
## genres 1174 2321 1.977 1.583 <2e-16 ***
## Residuals 3628 4531 1.249
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Split the production companies and count the number of companies for each movie
movie_analysis$num_companies <- sapply(strsplit(movie_analysis$production_companies, ","), length)
# Now plot the number of production companies against popularity
plot(movie_analysis$num_companies, movie_analysis$popularity,
main = "Number of Production Companies vs. Popularity",
xlab = "Number of Production Companies",
ylab = "Popularity",
pch = 19, col = "orange")
# Fit a linear model using the numeric count of production companies
model <- lm(popularity ~ num_companies, data = movie_analysis)
# Add the regression line to the plot
abline(model, col = "red", lwd = 2)
# Correlation between number of production companies and popularity
cor(movie_analysis$num_companies, movie_analysis$popularity)
## [1] 0.1900217
Finally, we export the cleaned data frame for further analysis to CSV format.
# Load the package to export
library(writexl)
# Export directly CSV file to local computer
write.csv(movie_analysis,"movie_analysis.csv",row.names = FALSE)