Movie_Dataset_Analysis

Loading Required Libraries

Loading necessary R libraries for data manipulation, visualization and analysis.

library(tidyverse)   # For data manipulation and visualization

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)     # For creating plots
library(dplyr)       # For data manipulation
library(skimr)       # For data summarization
library(lubridate)   # For date manipulation
library(jsonlite)    # For handling JSON data

## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:purrr':
## 
##     flatten

library(textdata)    # For sentiment analysis
library(syuzhet)     # For sentiment analysis

Loading Movie Dataset

Reading the CSV file containing the movie data.

# Loading csv file
moviedb <- read.csv("D:\\Docs\\Project\\data analysis movie\\tmdb_5000_movies.csv")
View(moviedb)

Parse JSON columns in movie dataset

Handling JSON data within the dataset.

# Apply from JSON to each element in the 'genres' column
moviedb$genres_parsed <- lapply(moviedb$genres, fromJSON)
# Extract genre names from the parsed JSON
moviedb$genre_names <- sapply(moviedb$genres_parsed, function(x) {
  paste(x$name, collapse = ", ")
})

# Parse the JSON column
moviedb$parsed_keywords <- lapply(moviedb$keywords, fromJSON)

# Extract keywords from parsed JSON
moviedb$extracted_keywords <- sapply(moviedb$parsed_keywords, function(x) {
# Convert the list of keywords to a single string
  paste(sapply(x$name, as.character), collapse = ", ")
})

# Parse the JSON column
moviedb$parsed_production_companies <- lapply(moviedb$production_companies, fromJSON)

# Extract production companies from parsed JSON
moviedb$extracted_production_companies <- sapply(moviedb$parsed_production_companies, function(x) {
  # Convert the list of companies to a single string
  paste(sapply(x$name, as.character), collapse = ", ")
})

# Parse the json column
moviedb$parsed_production_countries <- lapply(moviedb$production_countries,fromJSON)

# Extract production countries from parsed json
moviedb$extracted_production_countries <- sapply(moviedb$parsed_production_countries,function(x){
  #convert the list of countries to single string
  paste(sapply(x$name,as.character),collapse=",")
})

# Parse the json column
moviedb$parsed_spoken_languages <- lapply(moviedb$spoken_languages,fromJSON)

# Extract production countries from parsed json
moviedb$extracted_spoken_languages <- sapply(moviedb$parsed_spoken_languages,function(x){
  #convert the list of countries to single string
  paste(sapply(x$name,as.character),collapse=",")
})

Data Cleaning

Unnecessary columns are removed, and column names are updated. Missing runtime values are filled with the mean runtime. Duplicates are identified and removed to ensure data integrity.

# Removing unnecessary column
moviedb <- moviedb %>%
  select(-c(genres,keywords,production_companies,production_countries,spoken_languages ,genres_parsed,
            parsed_keywords,parsed_production_companies,parsed_production_countries,
            parsed_spoken_languages))

# Renaming column name
moviedb <- moviedb %>%
  rename(
    genres= genre_names,
    keywords= extracted_keywords,
    production_companies= extracted_production_companies,
    production_countries =extracted_production_countries,
    spoken_languages =extracted_spoken_languages 
  ) 

# Checking for null values
sum(is.na(moviedb))

## [1] 2

colSums(is.na(moviedb))

##               budget             homepage                   id 
##                    0                    0                    0 
##    original_language       original_title             overview 
##                    0                    0                    0 
##           popularity         release_date              revenue 
##                    0                    0                    0 
##              runtime               status              tagline 
##                    2                    0                    0 
##                title         vote_average           vote_count 
##                    0                    0                    0 
##               genres             keywords production_companies 
##                    0                    0                    0 
## production_countries     spoken_languages 
##                    0                    0

#Handling Missing Values in Runtime
# Calculate mean runtime excluding NAs
mean_runtime <- mean(moviedb$runtime, na.rm = TRUE)

# Replace missing runtime values with the mean
moviedb$runtime[is.na(moviedb$runtime)] <- mean_runtime

# Confirming null values
colSums(is.na(moviedb))

##               budget             homepage                   id 
##                    0                    0                    0 
##    original_language       original_title             overview 
##                    0                    0                    0 
##           popularity         release_date              revenue 
##                    0                    0                    0 
##              runtime               status              tagline 
##                    0                    0                    0 
##                title         vote_average           vote_count 
##                    0                    0                    0 
##               genres             keywords production_companies 
##                    0                    0                    0 
## production_countries     spoken_languages 
##                    0                    0

# Checking for duplicate value
# Counting duplicate value 
moviedb %>%
  count(across(everything())) %>%
  filter(n > 1)

##  [1] budget               homepage             id                  
##  [4] original_language    original_title       overview            
##  [7] popularity           release_date         revenue             
## [10] runtime              status               tagline             
## [13] title                vote_average         vote_count          
## [16] genres               keywords             production_companies
## [19] production_countries spoken_languages     n                   
## <0 rows> (or 0-length row.names)

# Removing duplicate 
moviedb <- distinct(moviedb)

Checking and Converting Data Types

Inspecting the structure of the dataset, converting the release_date column to Date type, and confirming the updated structure.

# Checking structure of dataset
str(moviedb)

## 'data.frame':    4803 obs. of  20 variables:
##  $ budget              : int  237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
##  $ homepage            : chr  "http://www.avatarmovie.com/" "http://disney.go.com/disneypictures/pirates/" "http://www.sonypictures.com/movies/spectre/" "http://www.thedarkknightrises.com/" ...
##  $ id                  : int  19995 285 206647 49026 49529 559 38757 99861 767 209112 ...
##  $ original_language   : chr  "en" "en" "en" "en" ...
##  $ original_title      : chr  "Avatar" "Pirates of the Caribbean: At World's End" "Spectre" "The Dark Knight Rises" ...
##  $ overview            : chr  "In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes tor"| __truncated__ "Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with W"| __truncated__ "A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles pol"| __truncated__ "Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protec"| __truncated__ ...
##  $ popularity          : num  150.4 139.1 107.4 112.3 43.9 ...
##  $ release_date        : chr  "2009-12-10" "2007-05-19" "2015-10-26" "2012-07-16" ...
##  $ revenue             : num  2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
##  $ runtime             : num  162 169 148 165 132 139 100 141 153 151 ...
##  $ status              : chr  "Released" "Released" "Released" "Released" ...
##  $ tagline             : chr  "Enter the World of Pandora." "At the end of the world, the adventure begins." "A Plan No One Escapes" "The Legend Ends" ...
##  $ title               : chr  "Avatar" "Pirates of the Caribbean: At World's End" "Spectre" "The Dark Knight Rises" ...
##  $ vote_average        : num  7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
##  $ vote_count          : int  11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
##  $ genres              : chr  "Action, Adventure, Fantasy, Science Fiction" "Adventure, Fantasy, Action" "Action, Adventure, Crime" "Action, Crime, Drama, Thriller" ...
##  $ keywords            : chr  "culture clash, future, space war, space colony, society, space travel, futuristic, romance, space, alien, tribe"| __truncated__ "ocean, drug abuse, exotic island, east india trading company, love of one's life, traitor, shipwreck, strong wo"| __truncated__ "spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom" "dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham city, vigilante"| __truncated__ ...
##  $ production_companies: chr  "Ingenious Film Partners, Twentieth Century Fox Film Corporation, Dune Entertainment, Lightstorm Entertainment" "Walt Disney Pictures, Jerry Bruckheimer Films, Second Mate Productions" "Columbia Pictures, Danjaq, B24" "Legendary Pictures, Warner Bros., DC Entertainment, Syncopy" ...
##  $ production_countries: chr  "United States of America,United Kingdom" "United States of America" "United Kingdom,United States of America" "United States of America" ...
##  $ spoken_languages    : chr  "English,Español" "English" "Français,English,Español,Italiano,Deutsch" "English" ...

# Converting datatype of date as date
moviedb$release_date <- as.Date(moviedb$release_date)

# Confirming structure
str(moviedb)

## 'data.frame':    4803 obs. of  20 variables:
##  $ budget              : int  237000000 300000000 245000000 250000000 260000000 258000000 260000000 280000000 250000000 250000000 ...
##  $ homepage            : chr  "http://www.avatarmovie.com/" "http://disney.go.com/disneypictures/pirates/" "http://www.sonypictures.com/movies/spectre/" "http://www.thedarkknightrises.com/" ...
##  $ id                  : int  19995 285 206647 49026 49529 559 38757 99861 767 209112 ...
##  $ original_language   : chr  "en" "en" "en" "en" ...
##  $ original_title      : chr  "Avatar" "Pirates of the Caribbean: At World's End" "Spectre" "The Dark Knight Rises" ...
##  $ overview            : chr  "In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes tor"| __truncated__ "Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with W"| __truncated__ "A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. While M battles pol"| __truncated__ "Following the death of District Attorney Harvey Dent, Batman assumes responsibility for Dent's crimes to protec"| __truncated__ ...
##  $ popularity          : num  150.4 139.1 107.4 112.3 43.9 ...
##  $ release_date        : Date, format: "2009-12-10" "2007-05-19" ...
##  $ revenue             : num  2.79e+09 9.61e+08 8.81e+08 1.08e+09 2.84e+08 ...
##  $ runtime             : num  162 169 148 165 132 139 100 141 153 151 ...
##  $ status              : chr  "Released" "Released" "Released" "Released" ...
##  $ tagline             : chr  "Enter the World of Pandora." "At the end of the world, the adventure begins." "A Plan No One Escapes" "The Legend Ends" ...
##  $ title               : chr  "Avatar" "Pirates of the Caribbean: At World's End" "Spectre" "The Dark Knight Rises" ...
##  $ vote_average        : num  7.2 6.9 6.3 7.6 6.1 5.9 7.4 7.3 7.4 5.7 ...
##  $ vote_count          : int  11800 4500 4466 9106 2124 3576 3330 6767 5293 7004 ...
##  $ genres              : chr  "Action, Adventure, Fantasy, Science Fiction" "Adventure, Fantasy, Action" "Action, Adventure, Crime" "Action, Crime, Drama, Thriller" ...
##  $ keywords            : chr  "culture clash, future, space war, space colony, society, space travel, futuristic, romance, space, alien, tribe"| __truncated__ "ocean, drug abuse, exotic island, east india trading company, love of one's life, traitor, shipwreck, strong wo"| __truncated__ "spy, based on novel, secret agent, sequel, mi6, british secret service, united kingdom" "dc comics, crime fighter, terrorist, secret identity, burglar, hostage drama, time bomb, gotham city, vigilante"| __truncated__ ...
##  $ production_companies: chr  "Ingenious Film Partners, Twentieth Century Fox Film Corporation, Dune Entertainment, Lightstorm Entertainment" "Walt Disney Pictures, Jerry Bruckheimer Films, Second Mate Productions" "Columbia Pictures, Danjaq, B24" "Legendary Pictures, Warner Bros., DC Entertainment, Syncopy" ...
##  $ production_countries: chr  "United States of America,United Kingdom" "United States of America" "United Kingdom,United States of America" "United States of America" ...
##  $ spoken_languages    : chr  "English,Español" "English" "Français,English,Español,Italiano,Deutsch" "English" ...

Loading Another Movie Dataset

Reading the CSV file containing another movie data.

# Loading csv file 
movie_cast <- read.csv("D:\\Docs\\Project\\data analysis movie\\tmdb_5000_credits.csv")
View(movie_cast)

Parse JSON columns in movie dataset

The cast and crew columns in the movie_cast dataset are parsed from JSON format.

# Parse the json column
movie_cast$parsed_cast <- lapply(movie_cast$cast,fromJSON)

# Extract production countries from parsed json
movie_cast$extracted_cast <- sapply(movie_cast$parsed_cast,function(x){
  #convert the list of countries to single string
  paste(sapply(x$name,as.character),collapse=",")
})

# Parse the json column
movie_cast$parsed_crew <- lapply(movie_cast$crew,fromJSON)
# Extract production countries from parsed json
movie_cast$extracted_crew <- sapply(movie_cast$parsed_crew,function(x){
  #convert the list of countries to single string
  paste(sapply(x$name,as.character),collapse=",")
})

Data Cleaning

Unnecessary columns are removed from the movie_cast dataset, and the remaining columns are renamed for clarity. The dataset is then checked for null values and duplicates:

# Removing unnecessary column
movie_cast <- movie_cast %>%
  select(-c(cast,crew,parsed_cast,parsed_crew))

# Renaming column name
movie_cast <- movie_cast %>%
  rename(
    id = movie_id,
    cast = extracted_cast,
    crew = extracted_crew
  ) 

# Checking for null values
sum(is.na(movie_cast))

## [1] 0

colSums(is.na(movie_cast))

##    id title  cast  crew 
##     0     0     0     0

# Checking for duplicate value
movie_cast %>%
  count(across(everything())) %>%
  filter(n > 1)

## [1] id    title cast  crew  n    
## <0 rows> (or 0-length row.names)

# Removing duplicate value
movie_cast <- distinct(movie_cast)

# Checking structure of dataset 
str(movie_cast)

## 'data.frame':    4803 obs. of  4 variables:
##  $ id   : int  19995 285 206647 49026 49529 559 38757 99861 767 209112 ...
##  $ title: chr  "Avatar" "Pirates of the Caribbean: At World's End" "Spectre" "The Dark Knight Rises" ...
##  $ cast : chr  "Sam Worthington,Zoe Saldana,Sigourney Weaver,Stephen Lang,Michelle Rodriguez,Giovanni Ribisi,Joel David Moore,C"| __truncated__ "Johnny Depp,Orlando Bloom,Keira Knightley,Stellan Skarsgård,Chow Yun-fat,Bill Nighy,Geoffrey Rush,Jack Davenpor"| __truncated__ "Daniel Craig,Christoph Waltz,Léa Seydoux,Ralph Fiennes,Monica Bellucci,Ben Whishaw,Naomie Harris,Dave Bautista,"| __truncated__ "Christian Bale,Michael Caine,Gary Oldman,Anne Hathaway,Tom Hardy,Marion Cotillard,Joseph Gordon-Levitt,Morgan F"| __truncated__ ...
##  $ crew : chr  "Stephen E. Rivkin,Rick Carter,Christopher Boyes,Christopher Boyes,Mali Finn,James Horner,James Cameron,James Ca"| __truncated__ "Dariusz Wolski,Gore Verbinski,Jerry Bruckheimer,Ted Elliott,Terry Rossio,Stephen E. Rivkin,Craig Wood,Hans Zimm"| __truncated__ "Thomas Newman,Sam Mendes,Anna Pinnock,John Logan,John Logan,Dennis Gassner,Ian Fleming,Lee Smith,Bill Bernstein"| __truncated__ "Hans Zimmer,Charles Roven,Christopher Nolan,Christopher Nolan,Christopher Nolan,Christopher Nolan,Jonathan Nola"| __truncated__ ...

Merging Datasets

The moviedb and movie_cast datasets are merged to create a comprehensive movie_analysis dataset and to provide a more complete view of the movies.

# Merging both dataset
movie_analysis <- merge(moviedb,movie_cast)

# View the merged dataset
View(movie_analysis)

# Checking the structure of the merged dataset
str(movie_analysis)

## 'data.frame':    4803 obs. of  22 variables:
##  $ id                  : int  100 10003 100042 10008 10012 10013 10014 10016 10017 10022 ...
##  $ title               : chr  "Lock, Stock and Two Smoking Barrels" "The Saint" "Dumb and Dumber To" "An American Haunting" ...
##  $ budget              : int  1350000 68000000 40000000 0 35000000 18000000 3000000 28000000 2700000 56000000 ...
##  $ homepage            : chr  "http://www.universalstudiosentertainment.com/lock-stock-and-two-smoking-barrels/" "" "" "" ...
##  $ original_language   : chr  "en" "en" "en" "en" ...
##  $ original_title      : chr  "Lock, Stock and Two Smoking Barrels" "The Saint" "Dumb and Dumber To" "An American Haunting" ...
##  $ overview            : chr  "A card sharp and his unwillingly-enlisted friends need to make a lot of cash quick after losing a sketchy poker"| __truncated__ "Ivan Tretiak, Russian Mafia boss who wants to create an oil crisis in Moscow and seize power as a result sends "| __truncated__ "20 years after the dimwits set out on their first adventure, they head out in search of one of their long lost "| __truncated__ "Based on the true events of the only case in US History where a spirit caused the death of a man." ...
##  $ popularity          : num  16.03 20.91 67.77 7.44 11.2 ...
##  $ release_date        : Date, format: "1998-03-05" "1997-04-03" ...
##  $ revenue             : num  3.90e+06 1.18e+08 1.70e+08 0.00 1.93e+07 ...
##  $ runtime             : num  105 116 110 83 97 103 87 98 93 95 ...
##  $ status              : chr  "Released" "Released" "Released" "Released" ...
##  $ tagline             : chr  "A Disgrace to Criminals Everywhere." "Never reveal your name. Never turn your back. Never surrender your heart." "The Second Parts Are More Fools" "Possession Knows No Bounds" ...
##  $ vote_average        : num  7.5 5.9 5.4 5.3 5.1 5.9 5.7 4.8 5.9 5.8 ...
##  $ vote_count          : int  1648 302 1127 142 167 136 361 296 108 842 ...
##  $ genres              : chr  "Comedy, Crime" "Thriller, Action, Romance, Science Fiction, Adventure" "Comedy" "Horror, Thriller" ...
##  $ keywords            : chr  "ambush, alcohol, shotgun, tea, joint, machismo, cocktail, rifle, marijuana, cockney accent, pot smoking, hatche"| __truncated__ "berlin, russia, gas, master thief, the saint" "friendship, sequel, road movie, buddy comedy" "witch, independent film, curse, suitor, family" ...
##  $ production_companies: chr  "Handmade Films Ltd., Summit Entertainment, PolyGram Filmed Entertainment, SKA Films, The Steve Tisch Company" "Paramount Pictures, Mace Neufeld Productions, Rysher Entertainment" "New Line Cinema, Universal Pictures, Conundrum Entertainment, Red Granite Pictures" "Media Pro Pictures, Allan Zeman Productions, After Dark Films, Remstar Productions, Sweetpea Entertainment, Red"| __truncated__ ...
##  $ production_countries: chr  "United Kingdom" "United States of America" "United States of America" "United Kingdom,United States of America" ...
##  $ spoken_languages    : chr  "English" "English,Pусский" "English" "English" ...
##  $ cast                : chr  "Jason Flemyng,Dexter Fletcher,Nick Moran,Jason Statham,Vinnie Jones,Sting,Steven Mackintosh,Nicholas Rowe,Lenny"| __truncated__ "Val Kilmer,Elisabeth Shue,Rade Serbedzija,Henry Goodman,Alun Armstrong,Michael Byrne,Yevgeni Lazarev,Irina Apek"| __truncated__ "Jim Carrey,Jeff Daniels,Rachel Melvin,Kathleen Turner,Brady Bluhm,Laurie Holden,Steve Tom,Rob Riggle,Don Lake,P"| __truncated__ "Donald Sutherland,Sissy Spacek,James D'Arcy,Matthew Marsh,Gaye Brown,Vernon Dobtcheff,Thom Fell,Shauna Shim,Zoe"| __truncated__ ...
##  $ crew                : chr  "John Murphy,Guy Ritchie,Guy Ritchie,Guy Ritchie,Matthew Vaughn,David A. Hughes,Tim Maurice-Jones,Iain Andrews,E"| __truncated__ "Jonathan Hensleigh,Jonathan Hensleigh,Graeme Revell,Patsy Pollock,Elisabeth Leustig,Mace Neufeld,Terry Rawlings"| __truncated__ "Matthew F. Leonetti,Marc S. Fischer,Rick Montgomery,Bobby Farrelly,Bobby Farrelly,Bobby Farrelly,Peter Farrelly"| __truncated__ "Richard Comeau,Simon Franks,Zygi Kamasa,Adrian Biddle,Derek Marcil,David Worley,Curtis Lindersmith,Maxime Rémil"| __truncated__ ...

Handling Missing Values

Checking missing values and addressing missing values through various imputation method to ensure the dataset is complete and ready for analysis.

# Checking for null value
sum(is.na(movie_analysis))

## [1] 1

# Counting the number of NA values in each column
colSums(is.na(movie_analysis))

##                   id                title               budget 
##                    0                    0                    0 
##             homepage    original_language       original_title 
##                    0                    0                    0 
##             overview           popularity         release_date 
##                    0                    0                    1 
##              revenue              runtime               status 
##                    0                    0                    0 
##              tagline         vote_average           vote_count 
##                    0                    0                    0 
##               genres             keywords production_companies 
##                    0                    0                    0 
## production_countries     spoken_languages                 cast 
##                    0                    0                    0 
##                 crew 
##                    0

# Imputation missing data 
# Convert release_date to numeric (days since a reference date)
reference_date <- as.Date("2000-01-01")
numeric_dates <- as.numeric(movie_analysis$release_date - reference_date)

# Compute median date
median_date <- reference_date + median(numeric_dates, na.rm = TRUE)

# Replace missing dates with median date
movie_analysis$release_date[is.na(movie_analysis$release_date)] <- median_date

# Impute missing values in the 'budget' column with the median
movie_analysis$budget[is.na(movie_analysis$budget)] <- median(movie_analysis$budget, na.rm = TRUE)

# Impute missing values in the 'revenue' column with the median
movie_analysis$revenue[is.na(movie_analysis$revenue)] <- median(movie_analysis$revenue, na.rm = TRUE)

# Impute missing values in the 'popularity' column with the mean
movie_analysis$popularity[is.na(movie_analysis$popularity)] <- mean(movie_analysis$popularity, na.rm = TRUE)

Confirming Missing Value Handling

A summary of the dataset and a check for remaining NA values are performed to ensure that all missing values have been addressed.

# Display summary of the dataset to confirm missing values are handled
summary(movie_analysis)

##        id            title               budget            homepage        
##  Min.   :     5   Length:4803        Min.   :        0   Length:4803       
##  1st Qu.:  9014   Class :character   1st Qu.:   790000   Class :character  
##  Median : 14629   Mode  :character   Median : 15000000   Mode  :character  
##  Mean   : 57166                      Mean   : 29045040                     
##  3rd Qu.: 58611                      3rd Qu.: 40000000                     
##  Max.   :459488                      Max.   :380000000                     
##  original_language  original_title       overview           popularity     
##  Length:4803        Length:4803        Length:4803        Min.   :  0.000  
##  Class :character   Class :character   Class :character   1st Qu.:  4.668  
##  Mode  :character   Mode  :character   Mode  :character   Median : 12.922  
##                                                           Mean   : 21.492  
##                                                           3rd Qu.: 28.314  
##                                                           Max.   :875.581  
##   release_date           revenue             runtime         status         
##  Min.   :1916-09-04   Min.   :0.000e+00   Min.   :  0.0   Length:4803       
##  1st Qu.:1999-07-14   1st Qu.:0.000e+00   1st Qu.: 94.0   Class :character  
##  Median :2005-10-03   Median :1.917e+07   Median :103.0   Mode  :character  
##  Mean   :2002-12-28   Mean   :8.226e+07   Mean   :106.9                     
##  3rd Qu.:2011-02-16   3rd Qu.:9.292e+07   3rd Qu.:117.5                     
##  Max.   :2017-02-03   Max.   :2.788e+09   Max.   :338.0                     
##    tagline           vote_average      vote_count         genres         
##  Length:4803        Min.   : 0.000   Min.   :    0.0   Length:4803       
##  Class :character   1st Qu.: 5.600   1st Qu.:   54.0   Class :character  
##  Mode  :character   Median : 6.200   Median :  235.0   Mode  :character  
##                     Mean   : 6.092   Mean   :  690.2                     
##                     3rd Qu.: 6.800   3rd Qu.:  737.0                     
##                     Max.   :10.000   Max.   :13752.0                     
##    keywords         production_companies production_countries
##  Length:4803        Length:4803          Length:4803         
##  Class :character   Class :character     Class :character    
##  Mode  :character   Mode  :character     Mode  :character    
##                                                              
##                                                              
##                                                              
##  spoken_languages       cast               crew          
##  Length:4803        Length:4803        Length:4803       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##

# Confirm that there are no remaining NA values
colSums(is.na(movie_analysis))

##                   id                title               budget 
##                    0                    0                    0 
##             homepage    original_language       original_title 
##                    0                    0                    0 
##             overview           popularity         release_date 
##                    0                    0                    0 
##              revenue              runtime               status 
##                    0                    0                    0 
##              tagline         vote_average           vote_count 
##                    0                    0                    0 
##               genres             keywords production_companies 
##                    0                    0                    0 
## production_countries     spoken_languages                 cast 
##                    0                    0                    0 
##                 crew 
##                    0

Analysis

Descriptive Statistics: Calculated average, highest, and lowest budgets and revenues, and visualized the top and bottom 10 movies in terms of both budget and revenue.

# Analyze the distribution of movie budgets
movie_analysis %>% filter(budget>0) %>% summarise(avg_budget=mean(budget),highest_budget=max(budget),lowest_budget=min(budget),
                             sd_budget=sd(budget))

##   avg_budget highest_budget lowest_budget sd_budget
## 1   37042838      380000000             1  42646514

# 10 highest budget movie
movie_analysis %>% group_by(title) %>% summarise(highest_budget=max(budget)) %>% 
  arrange(desc(highest_budget)) %>% slice_head(n=10)

## # A tibble: 10 × 2
##    title                                       highest_budget
##    <chr>                                                <int>
##  1 Pirates of the Caribbean: On Stranger Tides      380000000
##  2 Pirates of the Caribbean: At World's End         300000000
##  3 Avengers: Age of Ultron                          280000000
##  4 Superman Returns                                 270000000
##  5 John Carter                                      260000000
##  6 Tangled                                          260000000
##  7 Spider-Man 3                                     258000000
##  8 The Lone Ranger                                  255000000
##  9 Batman v Superman: Dawn of Justice               250000000
## 10 Captain America: Civil War                       250000000

# Plot 10 highest budget movie
movie_analysis %>% group_by(title) %>% summarise(highest_budget=max(budget)) %>% 
  arrange(desc(highest_budget)) %>% slice_head(n=10) %>% 
  ggplot(aes(x = reorder(title, highest_budget), y = highest_budget))+
  geom_col(fill='#1f77b4')+coord_flip()+
  labs(x="Movie Title",y="Budget",title="Top 10 Highest Budget Movies")+
  theme_classic()

Finding from the plot:

Franchise Influence: Dominance of “Pirates of the Caribbean” and “Avengers” films.
Superhero Focus: Significant presence of Marvel and DC superhero movies.
High Production Costs: Reflects large investments in effects, stars, and marketing.
Recent Trends: Mainly films from the 2000s and early 2010s.

# 10 lowest budget movie
movie_analysis %>%filter(budget>0) %>%  group_by(title) %>%  summarise(lowest_budget=min(budget)) %>% 
  arrange(lowest_budget) %>% slice_head(n=10)

## # A tibble: 10 × 2
##    title                       lowest_budget
##    <chr>                               <int>
##  1 Fear Clinic                             1
##  2 Fetching Cody                           1
##  3 Incident at Loch Ness                   1
##  4 Me You and Five Bucks                   1
##  5 Modern Times                            1
##  6 Slam                                    1
##  7 The Algerian                            1
##  8 To Be Frank, Sinatra at 100             2
##  9 Four Single Fathers                     3
## 10 A Farewell to Arms                      4

# Plot 10 lowest budget movie
movie_analysis %>%filter(budget>0) %>%  group_by(title) %>%  summarise(lowest_budget=min(budget)) %>% 
  arrange(lowest_budget) %>% slice_head(n=10) %>% 
  ggplot(aes(x = reorder(title,lowest_budget),y=lowest_budget))+
  geom_col(fill='#ff7f0e')+coord_flip()+
  labs(x="Movie Title",y="Budget",title="Top 10 Lowest Budget Movies")+
  theme_classic()

Finding from the plot:

Extremely Low Budgets: Most have budgets of just $1.
Lowest Budget Film: “Fear Clinic” with a budget of $1.
Cost-Efficient Production: Produced with minimal resources.
Genre Variety: Includes different genres and types.
Low Financial Risk: Minimal investment and risk involved.

# Examine the distribution of revenues
movie_analysis %>% filter(revenue>0) %>% 
  summarise(highest_revenue=max(revenue),lowest_revenue=min(revenue),average_revenue=mean(revenue))

##   highest_revenue lowest_revenue average_revenue
## 1      2787965087              5       117031353

# 10 highest revenue generated movie
movie_analysis %>% group_by(title) %>% summarise(highest_revenue=max(revenue)) %>% 
  arrange(desc(highest_revenue)) %>% slice_head(n=10)

## # A tibble: 10 × 2
##    title                      highest_revenue
##    <chr>                                <dbl>
##  1 Avatar                          2787965087
##  2 Titanic                         1845034188
##  3 The Avengers                    1519557910
##  4 Jurassic World                  1513528810
##  5 Furious 7                       1506249360
##  6 Avengers: Age of Ultron         1405403694
##  7 Frozen                          1274219009
##  8 Iron Man 3                      1215439994
##  9 Minions                         1156730962
## 10 Captain America: Civil War      1153304495

# Plot 10 highest revenue generated movie
movie_analysis %>% group_by(title) %>% summarise(highest_revenue=max(revenue)) %>% 
  arrange(desc(highest_revenue)) %>% slice_head(n=10) %>% 
  ggplot(aes(x=reorder(title,highest_revenue),y=highest_revenue))+
  geom_col(fill='#2ca02c')+coord_flip()+
  labs(x="Movie Title",y="Revenue",title="Top 10 Highest Reveneu Generated Movies")+
  theme_classic()

Finding from the plot:

Blockbuster Hits: Major films with massive revenue.
Franchise Success: Includes top franchises like “Avatar” and “Avengers.”
High Earnings: All exceed $1 billion in revenue.
Recent Releases: Many from the last decade.
Genre Variety: Covers multiple genres, including sci-fi and animation.

# 10 lowest revenue generated movie
movie_analysis %>% filter(revenue>0) %>%  group_by(title)%>% summarise(lowest_revenue=min(revenue)) %>% 
  arrange(lowest_revenue) %>% slice_head(n=10)

## # A tibble: 10 × 2
##    title                   lowest_revenue
##    <chr>                            <dbl>
##  1 Split Second                         5
##  2 Bran Nue Dae                         7
##  3 Dreaming of Joseph Lees              7
##  4 Bats                                10
##  5 F.I.S.T.                            11
##  6 Of Horses and Men                   11
##  7 Chasing Liberty                     12
##  8 The Cookout                         12
##  9 Angela's Ashes                      13
## 10 The 51st State                      14

# Plot 10 lowest revenue generated movie
movie_analysis %>% filter(revenue>0) %>%  group_by(title)%>% summarise(lowest_revenue=min(revenue)) %>% 
  arrange(lowest_revenue) %>% slice_head(n=10) %>% 
  ggplot(aes(x=reorder(title,lowest_revenue),y=lowest_revenue))+
  geom_col(fill='#d62728')+coord_flip()+
  labs(x="Movie Title",y="Revenue",title="10 Lowest Reveneu Generated Movies")+
  theme_classic()

Finding from the plot:

Very Low Revenue: Earnings mostly below $15.
Indie and Niche: Films with limited commercial reach.
Lowest Revenue: Peaks at $14.
Genre Variety: Includes a range of film types.
Financial Struggles: Shows difficulties in generating significant revenue.

Popularity Analysis

Analyzed the distribution and visualized the top and bottom 10 most and least popular movies.

# Analyze Popularity 
movie_analysis %>%filter(popularity>0) %>% 
  summarise(max_popularity=max(popularity),min_popularity=min(popularity),avg_popularity=mean(popularity))

##   max_popularity min_popularity avg_popularity
## 1       875.5813       0.000372       21.49678

# 10 highest popular movies
movie_analysis %>% group_by(title) %>% summarise(max_popularity=max(popularity)) %>% 
  arrange(desc(max_popularity)) %>% slice_head(n=10)

## # A tibble: 10 × 2
##    title                                                  max_popularity
##    <chr>                                                           <dbl>
##  1 Minions                                                          876.
##  2 Interstellar                                                     724.
##  3 Deadpool                                                         515.
##  4 Guardians of the Galaxy                                          481.
##  5 Mad Max: Fury Road                                               434.
##  6 Jurassic World                                                   419.
##  7 Pirates of the Caribbean: The Curse of the Black Pearl           272.
##  8 Dawn of the Planet of the Apes                                   244.
##  9 The Hunger Games: Mockingjay - Part 1                            206.
## 10 Big Hero 6                                                       204.

# Plot 10 highest popular movies
movie_analysis %>% group_by(title) %>% summarise(max_popularity=max(popularity)) %>% 
  arrange(desc(max_popularity)) %>% slice_head(n=10) %>% 
  ggplot(aes(x=reorder(title,max_popularity),y=max_popularity))+
  geom_col(fill='#9467bd')+coord_flip()+
  labs(x="Movie Title",y="Popularity",title="Top 10 Highest Popular Movies")+
  theme_classic()

Finding from the plot:

Top Popularity: Movies with the highest popularity scores, led by “Minions” and “Interstellar.”
High Popularity Scores: Top films have scores ranging from 204 to 876.
Diverse Genres: Includes a mix of animation, sci-fi, action, and adventure.
Recent Hits: Many are recent blockbusters and popular franchises.
Strong Engagement: High audience engagement and cultural impact.

# 10 lowest popular movies
movie_analysis %>% filter(revenue>0) %>%  group_by(title)%>% summarise(min_popularity=min(revenue)) %>% 
  arrange(min_popularity) %>% slice_head(n=10)

## # A tibble: 10 × 2
##    title                   min_popularity
##    <chr>                            <dbl>
##  1 Split Second                         5
##  2 Bran Nue Dae                         7
##  3 Dreaming of Joseph Lees              7
##  4 Bats                                10
##  5 F.I.S.T.                            11
##  6 Of Horses and Men                   11
##  7 Chasing Liberty                     12
##  8 The Cookout                         12
##  9 Angela's Ashes                      13
## 10 The 51st State                      14

# Plot 10 lowest popular movies
movie_analysis %>% filter(revenue>0) %>%  group_by(title)%>% summarise(min_popularity=min(revenue)) %>% 
  arrange(min_popularity) %>% slice_head(n=10) %>% 
  ggplot(aes(x=reorder(title,min_popularity),y=min_popularity))+
  geom_col(fill='#8c564b')+coord_flip()+
  labs(x="Movie Title",y="Popularity",title="10 Lowest Popular Movies")+
  theme_classic()

Finding from the plot:

Extremely Low Popularity: All have very low popularity scores, starting at 5.
Limited Reach: Likely have minimal audience engagement and visibility.
Niche or Lesser-Known: Includes obscure or less mainstream films.
Low Cultural Impact: Shows limited cultural or commercial influence.

Runtime & Yearly Trends:

Explored movie runtime statistics and visualized revenue and average revenue over time.Also,Analyzed popularity by genre over time.

# Run time (in minute)
movie_analysis %>% filter(runtime>0) %>% 
  summarise(max_runtime=max(runtime),min_runtime=min(runtime),avg_runtime=mean(runtime))

##   max_runtime min_runtime avg_runtime
## 1         338          14    107.6604

# No of movie by year
movie_analysis$year <- as.numeric(format(movie_analysis$release_date, "%Y"))

# Summary of Movie Data by Year
movie_analysis %>%
  group_by(year) %>%
  summarise(
    total_movie = n(),
    avg_revenue = mean(revenue),
    avg_popularity = mean(popularity)
  )

## # A tibble: 90 × 4
##     year total_movie avg_revenue avg_popularity
##    <dbl>       <int>       <dbl>          <dbl>
##  1  1916           1     8394751          3.23 
##  2  1925           1    22000000          0.786
##  3  1927           1      650422         32.4  
##  4  1929           2     2179000          1.40 
##  5  1930           1     8000000          8.48 
##  6  1932           1          25          1.20 
##  7  1933           2     2240500          1.28 
##  8  1934           1     4500000         11.9  
##  9  1935           1     3202000          3.90 
## 10  1936           2     5618000         15.6  
## # ℹ 80 more rows

# Plot total revenue over time
movie_analysis %>%
  group_by(year) %>%
  summarize(total_revenue = sum(revenue), .groups = 'drop') %>%
  ggplot(aes(x = year, y = total_revenue)) +
  geom_line(color = "blue") + 
  geom_point(color = "red", size = 1) +   
  labs(x = "Year", y = "Total Revenue",title = "Total Revenue by Year") +
  theme_minimal()

Finding from the plot:

Exponential Growth:Exponential growth after the year 1990, indicating a rapid increase in revenue over time.
Early Stagnation: Prior to 1990, the revenue growth appears relatively flat, suggesting a period of slow growth or even decline.
Dramatic Increase Post-1990: The most striking feature of the plot is the dramatic increase in total revenue starting around 1990.
Potential Outlier in 2020: The data point for the year 2020 might be considered an outlier, as it shows a sharp decrease in revenue compared to the preceding year.

# Plot total budget over time
movie_analysis %>%
  group_by(year) %>%
  summarize(total_budget = sum(budget), .groups = 'drop') %>%
  ggplot(aes(x = year, y = total_budget)) +
  geom_line(color = "blue") + 
  geom_point(color = "red", size = 1) +   
  labs(x = "Year", y = "Total Budget",title = "Total Budget by Year") +
  theme_minimal()

Finding from the plot:

Fluctuating Trend: The plot demonstrates a fluctuating pattern in average revenue over time, with periods of growth, decline, and stability.
Early Growth: The revenue initially increases, reaching a peak around the 1930s.
Declines and Recoveries: Subsequently, the revenue fluctuates, with periods of decline followed by recoveries.
Increasing Volatility: The fluctuations become more pronounced in the later years, especially after the 1970s.
Overall Upward Trend: Despite the fluctuations, there seems to be a general upward trend in average revenue, particularly in the latter half of the 20th century.

# Plot average revenue over time
movie_analysis %>%
  group_by(year) %>%
  summarize(avg_revenue = mean(revenue), .groups = 'drop') %>%
  ggplot(aes(x = year, y = avg_revenue)) +
  geom_line(color = "green",size=1) +  
  geom_point(color = "orange", size = 1.5) +  
  labs(title = "Average Revenue by Year",
       x = "Year",
       y = "Average Revenue") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Finding from the plot:

Dynamic Genre Popularity: The plot reveals a dynamic landscape of genre popularity over time.
Rising and Declining Genres: Some genres gain popularity while others decline.
Emergence of New Genres: Newer genres have gained prominence.
Fluctuating Trends: Many genres experience fluctuations in popularity.
Interdependence of Genres: The popularity of certain genres may be influenced by others.

# Plot popularity by genre over time
movie_analysis %>%separate_rows(genres, sep = ",\\s*") %>%
  mutate(keywords = trimws(genres)) %>% filter(genres!="") %>% 
  group_by(year, genres) %>%
  summarize(avg_popularity = mean(popularity)) %>% 
  ggplot(aes(x = year, y = avg_popularity, fill = genres)) +
  geom_col()+
  labs(title = "Average Popularity by Genre and Year", x = "Year", y = "Average Popularity") +
  theme_minimal() +
  theme(legend.position = "bottom")

## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

Finding from the plot:

Trend Overview: The plot shows an increasing trend in average popularity across all genres over time, especially after 1970.
Genre Popularity Peaks: Significant peaks in popularity can be observed around 1930, 1970, and early 2000s.
Genre Diversity: A wide variety of genres contribute to popularity, with no single genre dominating over the entire period.
Recent Years: Post-2000, there is a noticeable rise in the popularity of multiple genres, indicating a broader audience interest in diverse genres.
Popular Genres: Genres like Drama, Action, and Comedy appear to have consistent representation across different time periods.

Revenue, Budget, and Popularity by Genre:

Aggregated and visualized total revenue and budget by genre and assessed popularity.

# Analyze genres by revenue
movie_analysis %>% 
  # This assumes each keyword entry is a comma-separated list
  separate_rows(genres,sep = ",\\s*") %>% 
  # Remove any leading or trailing spaces from keywords
  mutate(keywords = trimws(genres)) %>% 
  # Summarize total revenue by each individual keyword
  filter(revenue>0) %>% filter(genres!="") %>% group_by(genres) %>%
  summarise(total_revenue=sum(revenue)) %>% arrange(desc(total_revenue))

## # A tibble: 19 × 2
##    genres          total_revenue
##    <chr>                   <dbl>
##  1 Adventure        164841561551
##  2 Action           162959914515
##  3 Comedy           122760517608
##  4 Drama            119710983984
##  5 Thriller         103250426269
##  6 Family            83283238689
##  7 Fantasy           81982199925
##  8 Science Fiction   81564235745
##  9 Romance           53642137545
## 10 Animation         52812167865
## 11 Crime             46040860686
## 12 Mystery           27248722761
## 13 Horror            22599894663
## 14 War               12118445911
## 15 History           11332141732
## 16 Music              8964351078
## 17 Western            3792169111
## 18 Documentary        1082277678
## 19 Foreign              12398151

# Plot genres by revenue
movie_analysis %>% 
  separate_rows(genres,sep = ",\\s*") %>% 
  mutate(keywords = trimws(genres)) %>% 
  filter(revenue>0) %>% filter(genres!="") %>%  group_by(genres) %>%
  summarise(total_revenue=sum(revenue)) %>% arrange(desc(total_revenue)) %>% 
  ggplot(aes(x=reorder(genres,total_revenue),y=total_revenue))+ 
  geom_col(fill = "#2ca02c") +
  labs(x="Genres",y="Total Reveneu",title = "Genres by Revenue")+
  theme(axis.text.x = element_text(angle=50,hjust = 1))

Finding from the plot:

Top Earners: Adventure and Action lead with over $160 billion each.
High Revenue: Comedy, Drama, and Thriller exceed $100 billion.
Moderate: Family, Fantasy, and Science Fiction earn between $81 billion and $83 billion.
Lower Revenue: Romance and Animation generate over $50 billion.
Least Revenue: Foreign and Documentary are the lowest, with Foreign at $12.4 million and Documentary at $1.08 billion.

# Analyze genres by budget
movie_analysis %>% 
  # This assumes each keyword entry is a comma-separated list
  separate_rows(genres,sep = ",\\s*") %>% 
  # Remove any leading or trailing spaces from keywords
  mutate(keywords = trimws(genres)) %>% 
  filter(budget>0) %>% filter(genres!="") %>% group_by(genres) %>%
  summarise(total_budget=sum(budget)) %>% arrange(desc(total_budget))

## # A tibble: 20 × 2
##    genres          total_budget
##    <chr>                  <dbl>
##  1 Action           59443406599
##  2 Adventure        52398220463
##  3 Drama            47498112157
##  4 Comedy           43589711809
##  5 Thriller         40727495424
##  6 Science Fiction  27748069865
##  7 Fantasy          26949696595
##  8 Family           26019109625
##  9 Crime            19383466474
## 10 Romance          18158357743
## 11 Animation        15553021056
## 12 Mystery          10699081450
## 13 Horror            7563921991
## 14 History           5890982931
## 15 War               5080673879
## 16 Music             2942970295
## 17 Western           2220453601
## 18 Documentary        291861725
## 19 Foreign             22375007
## 20 TV Movie             9200000

# Plot genres by budget
movie_analysis %>% 
  separate_rows(genres,sep = ",\\s*") %>% 
  mutate(keywords = trimws(genres)) %>% 
  filter(budget>0) %>% filter(genres!="") %>% group_by(genres) %>%
  summarise(total_budget=sum(budget)) %>% arrange(desc(total_budget)) %>% 
  ggplot(aes(x=reorder(genres,total_budget),y=total_budget))+
  geom_col(fill="#d62728")+
  labs(x="Genres", y="Total Budget", title="Genres by Budget") +
  theme(axis.text.x = element_text(angle = 50,hjust = 1))

Finding from the plot:

Highest Budgets: Action and Adventure with over $50 billion each.
Significant: Drama and Comedy around $47 billion and $43 billion.
Moderate: Thriller, Science Fiction, Fantasy, and Family between $26 billion and $41 billion.
Lower: Crime and Romance with $18 billion to $19 billion.
Least: Documentary and Foreign have the smallest budgets, with Documentary at $292 million and Foreign at $22.4 million. TV Movies are the lowest at $9.2 million.

# Analyze genres by popularity
movie_analysis %>% 
  # This assumes each keyword entry is a comma-separated list
  separate_rows(genres,sep = ",\\s*") %>% 
  # Remove any leading or trailing spaces from keywords
  mutate(keywords = trimws(genres)) %>% 
  filter(popularity>0) %>% filter(genres!="") %>% group_by(genres) %>%
  summarise(avg_popularity=mean(popularity)) %>% arrange(desc(avg_popularity))

## # A tibble: 20 × 2
##    genres          avg_popularity
##    <chr>                    <dbl>
##  1 Adventure               39.3  
##  2 Animation               38.8  
##  3 Science Fiction         36.5  
##  4 Fantasy                 36.4  
##  5 Action                  30.9  
##  6 Family                  27.8  
##  7 Mystery                 24.6  
##  8 Thriller                24.5  
##  9 War                     23.8  
## 10 Crime                   22.9  
## 11 Horror                  18.3  
## 12 Western                 18.2  
## 13 Comedy                  18.2  
## 14 Drama                   17.8  
## 15 History                 17.4  
## 16 Romance                 16.0  
## 17 Music                   13.1  
## 18 TV Movie                 6.39 
## 19 Documentary              3.95 
## 20 Foreign                  0.687

# Plot genres by popularity
movie_analysis %>% 
  separate_rows(genres,sep = ",\\s*") %>% 
  mutate(keywords = trimws(genres)) %>% 
  filter(popularity>0) %>% filter(genres!="") %>%  group_by(genres) %>%
  summarise(avg_popularity=mean(popularity)) %>% arrange(desc(avg_popularity)) %>% 
  ggplot(aes(x=reorder(genres,avg_popularity),y=avg_popularity)) + 
  geom_col(fill="#1f77b4")+
  labs(x="Genres",y="Average Popularity",title="Genres by Popularity")+
  theme(axis.text.x = element_text(angle = 50,hjust = 1))

Finding from the plot:

Most Popular: Adventure and Animation with scores around 39.
High Popularity: Science Fiction and Fantasy near 37.
Moderate: Action and Family around 31 and 28.
Lower: Mystery, Thriller, and War between 24 and 23.
Least Popular: Comedy, Drama, and History between 17 and 18.
Lowest: Romance (16), Music (13), TV Movie (6.39), Documentary (3.95), and Foreign (0.687).

Revenue, Budget, and Popularity by Keyword

Analyzed and visualized the impact of keywords on revenue, budget, and popularity.

# Analyze keyword by revenue
  movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
  mutate(keywords = trimws(keywords)) %>%
  group_by(keywords) %>%
  summarise(total_revenue = sum(revenue)) %>%
  arrange(desc(total_revenue))

## # A tibble: 9,815 × 2
##    keywords             total_revenue
##    <chr>                        <dbl>
##  1 duringcreditsstinger   57827617707
##  2 3d                     38821575006
##  3 aftercreditsstinger    37735182966
##  4 based on novel         28330574606
##  5 sequel                 26416350086
##  6 superhero              26398102240
##  7 dystopia               22093114379
##  8 based on comic book    21464756515
##  9 marvel comic           19447081252
## 10 woman director         16140425506
## # ℹ 9,805 more rows

# Plot keyword by revenue
  movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
    mutate(keywords = trimws(keywords)) %>%
    group_by(keywords) %>%
    summarise(total_revenue = sum(revenue)) %>%
    arrange(desc(total_revenue))  %>%slice_head(n=10) %>% 
    ggplot(aes(x=total_revenue,y=reorder(keywords,total_revenue)))+
    geom_col(fill="#1f77b4")+
    labs(x="Total Revenue",y="Keywords",title="Top 10 Keyword by Revenue")+
    theme_minimal()

Finding from the plot:

Top Keywords: During Credits Stinger $57.8 billion and 3D $38.8 billion.
High Revenue: After Credits Stinger $37.7 billion and Based on Novel ($28.3 billion).
Significant: Sequel and Superhero both around $26.4 billion.
Popular: Dystopia $22.1 billion and Based on Comic Book $21.5 billion.
Notable: Marvel Comic $19.4 billion and Woman Director $16.1 billion.

# Analyze keyword by budget
  movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
    mutate(keywords = trimws(keywords)) %>%
    group_by(keywords) %>%
    summarise(total_budget = sum(budget)) %>%
    arrange(desc(total_budget))

## # A tibble: 9,815 × 2
##    keywords             total_budget
##    <chr>                       <dbl>
##  1 duringcreditsstinger  16032534128
##  2 3d                    10579000003
##  3 aftercreditsstinger   10577914650
##  4 based on novel         8929115000
##  5 dystopia               8149277523
##  6 superhero              7591902660
##  7 sequel                 6977805523
##  8 based on comic book    6385100000
##  9 woman director         5549967548
## 10 alien                  5221650531
## # ℹ 9,805 more rows

# Plot keyword by budget
  movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
    mutate(keywords = trimws(keywords)) %>%
    group_by(keywords) %>%
    summarise(total_budget = sum(budget)) %>%
    arrange(desc(total_budget)) %>%slice_head(n=10) %>% 
    ggplot(aes(x=total_budget,y=reorder(keywords,total_budget)))+
    geom_col(fill="#1f77b4")+
    labs(x="Total Budget",y="Keywords",title="Top 10 Keyword by Budget")+
    theme_minimal()

Finding from the plot:

Highest Budgets: During Credits Stinger $16.0 billion and 3D $10.6 billion.
Significant: After Credits Stinger $10.6 billion and Based on Novel $8.9 billion.
Moderate: Dystopia $8.1 billion and Superhero $7.6 billion.
Notable: Sequel $6.9 billion and Based on Comic Book $6.4 billion.
Additional: Woman Director $5.5 billion and Alien $5.2 billion.

# Analyze keyword by popularity
  movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
    mutate(keywords = trimws(keywords)) %>%
    group_by(keywords) %>%
    summarise(avg_popularity= mean(popularity)) %>%
    arrange(desc(avg_popularity))

## # A tibble: 9,815 × 2
##    keywords        avg_popularity
##    <chr>                    <dbl>
##  1 evil mastermind           876.
##  2 astrophysics              724.
##  3 relativity                724.
##  4 zero gravity              724.
##  5 self healing              515.
##  6 dark future               434.
##  7 peak oil                  434.
##  8 capuchin monkey           272.
##  9 gold coin                 272.
## 10 pirate ship               272.
## # ℹ 9,805 more rows

# Plot keyword by popularity
  movie_analysis %>% separate_rows(keywords, sep = ",\\s*") %>%
    mutate(keywords = trimws(keywords)) %>%
    group_by(keywords) %>%
    summarise(avg_popularity= mean(popularity)) %>%
    arrange(desc(avg_popularity)) %>%slice_head(n=10) %>% 
    ggplot(aes(x=avg_popularity,y=reorder(keywords,avg_popularity)))+
    geom_col(fill="#1f77b4")+
    labs(x="Average Popularity",y="Keywords",title="Top 10 Keyword by Popularity")+
    theme_minimal()

Finding from the plot:

Highest Popularity: Evil Mastermind (876) and Astrophysics, Relativity, Zero Gravity (724 each).
High: Self Healing (515).
Moderate: Dark Future, Peak Oil (434 each).
Lower: Capuchin Monkey, Gold Coin, Pirate Ship (272 each).

Language Analysis:

Analyzed revenue, budget, and average rating by spoken languages, with visualizations for the top languages.

# Analyze Language by Revenue
movie_analysis %>% separate_rows(spoken_languages, sep = ",\\s*") %>%
  mutate(keywords = trimws(spoken_languages)) %>%
  group_by(spoken_languages) %>% 
  summarise(total_revenue=sum(revenue)) %>% arrange(desc(total_revenue))

## # A tibble: 62 × 2
##    spoken_languages total_revenue
##    <chr>                    <dbl>
##  1 English           389058696018
##  2 Français           43293893830
##  3 Español            36149323251
##  4 Deutsch            25020728621
##  5 Italiano           21495420851
##  6 Pусский            20791626225
##  7 普通话             13886998166
##  8 日本語             11583181763
##  9 العربية             9064378961
## 10 Latin               7184674842
## # ℹ 52 more rows

# Plot Top Language by Revenue
movie_analysis %>% separate_rows(spoken_languages, sep = ",\\s*") %>%
  mutate(keywords = trimws(spoken_languages)) %>%
  group_by(spoken_languages) %>% 
  summarise(total_revenue=sum(revenue)) %>% arrange(desc(total_revenue)) %>%                 slice_head(n=10) %>% 
  ggplot(aes(x = reorder(spoken_languages, total_revenue), y = total_revenue)) +
  geom_col(fill = "steelblue") +
  labs(x="Language",y="Total Revenue",title = "Top 10 Language by Revenue")+                 theme_classic()

Finding from the plot:

Highest Revenue: English ($389 billion).
High: Français ($43.3 billion), Español ($36.1 billion), Deutsch ($25 billion).
Moderate: Italiano ($21.5 billion), Pусский ($20.8 billion), 普通话 (Mandarin) ($13.9 billion), 日本語 (Japanese) ($11.6 billion).
Lower: العربية (Arabic) ($9.1 billion), Latin ($7.2 billion).

# Analyze Language by Budget 
movie_analysis %>% separate_rows(spoken_languages, sep = ",\\s*") %>%
  mutate(keywords = trimws(spoken_languages)) %>%
  group_by(spoken_languages) %>% 
  summarise(total_budget=sum(budget)) %>% arrange(desc(total_budget))

## # A tibble: 62 × 2
##    spoken_languages total_budget
##    <chr>                   <dbl>
##  1 English          137375134938
##  2 Français          14426725944
##  3 Español           12089195167
##  4 Deutsch            8896962289
##  5 Pусский            7596831148
##  6 Italiano           6812077701
##  7 普通话             5215892068
##  8 日本語             4238472699
##  9 العربية            2933167686
## 10 Português          2392068060
## # ℹ 52 more rows

# Plot Language by Budget 
movie_analysis %>% separate_rows(spoken_languages, sep = ",\\s*") %>%
  mutate(keywords = trimws(spoken_languages)) %>%
  group_by(spoken_languages) %>% 
  summarise(total_budget=sum(budget)) %>% arrange(desc(total_budget)) %>%                    slice_head(n=10) %>% 
  ggplot(aes(x = reorder(spoken_languages, total_budget), y = total_budget)) +
  geom_col(fill = "steelblue")+
  labs(x="Language",y="Total Budget",title = "Top 10 Language by Budget")+ theme_classic()

Finding from the plot:

Highest Budget: English ($137.4 billion).
High: Français ($14.4 billion), Español ($12.1 billion).
Moderate: Deutsch ($8.9 billion), Pусский ($7.6 billion), Italiano ($6.8 billion).
Lower: 普通话 (Mandarin) ($5.2 billion), 日本語 (Japanese) ($4.2 billion), العربية (Arabic) ($2.9 billion), Português ($2.4 billion).

# Analyze Language average by Rating 
movie_analysis %>% separate_rows(spoken_languages,sep = ",\\s*") %>% 
  mutate(keywords = trimws(spoken_languages)) %>% 
  filter(vote_average>0)%>% filter(keywords != "No Language") %>% group_by(spoken_languages) %>% 
  summarise(avg_rating=mean(vote_average)) %>% arrange(desc(avg_rating))

## # A tibble: 60 × 2
##    spoken_languages avg_rating
##    <chr>                 <dbl>
##  1 తెలుగు                  7.5 
##  2 Bahasa indonesia       7.4 
##  3 فارسی                  7.22
##  4 Gaeilge                7.22
##  5 Galego                 7.2 
##  6 Somali                 7.2 
##  7 Català                 7   
##  8 Wolof                  7   
##  9 اردو                   6.9 
## 10 български език         6.87
## # ℹ 50 more rows

# Plot Language average by Rating 
movie_analysis %>% separate_rows(spoken_languages,sep = ",\\s*") %>% 
  mutate(keywords = trimws(spoken_languages)) %>% 
  filter(vote_average>0)%>% filter(keywords != "No Language") %>%                            group_by(spoken_languages) %>% 
  summarise(avg_rating=mean(vote_average)) %>% arrange(desc(avg_rating))%>%                  slice_head(n=10) %>% 
  ggplot(aes(x = reorder(spoken_languages, avg_rating), y = avg_rating)) +
  geom_col(fill = "steelblue")+
  labs(x="Language",y="Average Rating",title = "Top 10 Language by Rating")+                 theme_classic()+
  theme(axis.text.x = element_text(angle = 45,hjust = 1))

Finding from the plot:

Highest Rating: Telugu (తెలుగు) (7.5).
High: Bahasa Indonesia (7.4), Persian (فارسی ) and Irish (Gaeilge) (7.22 each).
Moderate: Galician and Somali (7.2), Catalan and Wolof (7.0).
Lower: اردو (Urdu) (6.9), Bulgarian (български език) (6.87).

Profit Analysis

Calculated profit and analyzed the top and bottom 10 profitable and non-profitable movies.

# Creating a profit column
movie_analysis$profit <- movie_analysis$revenue - movie_analysis$budget  

# Summary of the profit column 
summary(movie_analysis$profit)

##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -165710090    -799538    2511317   53215599   55312860 2550965087

# Analyze Top 10 Most Profitable Movies
movie_analysis %>% filter(profit>0) %>% group_by(title) %>% 
  summarise(profit=max(profit)) %>% arrange(desc(profit)) %>% 
  slice_head(n=10)

## # A tibble: 10 × 2
##    title                                             profit
##    <chr>                                              <dbl>
##  1 Avatar                                        2550965087
##  2 Titanic                                       1645034188
##  3 Jurassic World                                1363528810
##  4 Furious 7                                     1316249360
##  5 The Avengers                                  1299557910
##  6 Avengers: Age of Ultron                       1125403694
##  7 Frozen                                        1124219009
##  8 Minions                                       1082730962
##  9 The Lord of the Rings: The Return of the King 1024888979
## 10 Iron Man 3                                    1015439994

# Plot Top 10 Most Profitable Movies
movie_analysis %>% filter(profit>0) %>% group_by(title) %>% 
  summarise(profit=max(profit)) %>% arrange(desc(profit)) %>% 
  slice_head(n=10) %>% 
  ggplot(aes(x = reorder(title, profit), y = profit)) +
  geom_col(fill = "darkgreen") +coord_flip()+
  labs(x = "Movie Title", y = "Profit",title = "Top 10 Most Profitable Movies") +
  theme_classic()

Finding from the plot:

Top Profit: Avatar ($2.55 billion).
High: Titanic ($1.65 billion), Jurassic World ($1.36 billion), Furious 7 ($1.32 billion).
Significant: The Avengers ($1.30 billion), Avengers: Age of Ultron and Frozen (about $1.12 billion each), Minions ($1.08 billion).
Top Ten: The Lord of the Rings: The Return of the King and Iron Man 3 (around $1.02 billion each).

# Analyze Top 10 Least Profitable Movies: 
movie_analysis %>% filter(profit>0) %>% group_by(title) %>% 
  summarise(profit=max(profit)) %>% arrange(profit) %>% 
  slice_head(n=10)

## # A tibble: 10 × 2
##    title                              profit
##    <chr>                               <dbl>
##  1 Of Horses and Men                       1
##  2 The Prophecy                            8
##  3 Bats                                   10
##  4 Tae Guk Gi: The Brotherhood of War     15
##  5 A Farewell to Arms                     21
##  6 Lake of Fire                           25
##  7 My Baby's Daddy                        51
##  8 Rugrats in Paris: The Movie            73
##  9 When Did You Last See Your Father?     92
## 10 Khiladi 786                           126

# Plot Top 10 Least Profitable Movies
movie_analysis %>% filter(profit>0) %>% group_by(title) %>% 
  summarise(profit=max(profit)) %>% arrange(profit) %>% 
  slice_head(n=10) %>% 
  ggplot(aes(x = reorder(title, profit), y = profit)) +
  geom_col(fill = "darkred") +coord_flip()+
  labs(x = "Movie Title", y = "Profit",title = "Top 10 Least Profitable Movies") +
  theme_classic()

Finding from the plot:

Lowest Profit: Of Horses and Men ($1).
Very Low: The Prophecy ($8), Bats ($10).
Low: Tae Guk Gi ($15), A Farewell to Arms ($21).
Moderate: Lake of Fire ($25), My Baby’s Daddy ($51), Rugrats in Paris ($73).
Higher: When Did You Last See Your Father? ($92), Khiladi 786 ($126).

# Analyze Top 10 Flop Movies (Least Profitable or Non-Profitable)
movie_analysis %>% filter(profit<=0) %>% group_by(title) %>% 
  summarise(profit=max(profit)) %>% arrange(profit) %>% 
  slice_head(n=10)

## # A tibble: 10 × 2
##    title                            profit
##    <chr>                             <dbl>
##  1 The Lone Ranger              -165710090
##  2 The Wolfman                  -150000000
##  3 The Alamo                    -119180039
##  4 Mars Needs Moms              -111007242
##  5 Dragonball Evolution         -100000000
##  6 The 13th Warrior              -98301101
##  7 The Adventures of Pluto Nash  -92896027
##  8 Metropolis                    -91969578
##  9 Mighty Joe Young              -90000000
## 10 Volcano                       -90000000

# Plot Top 10 Flop Movies (Least Profitable or Non-Profitable)
movie_analysis %>% filter(profit<=0) %>% group_by(title) %>% 
  summarise(profit=max(profit)) %>% arrange(profit) %>% 
  slice_head(n=10) %>% 
  ggplot(aes(x = reorder(title, profit), y = profit)) +
  geom_col(fill = "darkred") +coord_flip()+
  labs(x = "Movie Title", y = "Profit",title = "Top 10 Flop Movies") +
  theme_classic()

Finding from the plot:

Biggest Losses: The Lone Ranger (-$165.7 million) and The Wolfman (-$150 million).
Significant: The Alamo (-$119.2 million) and Mars Needs Moms (-$111 million).
High Losses: Dragonball Evolution and The 13th Warrior (around -$100 million).
Moderate Losses: The Adventures of Pluto Nash (-$92.9 million) and Metropolis (-$91.9 million).
Least Profitable: Mighty Joe Young and Volcano (both -$90 million).

Revenue and Popularity by Production Companies and Countries

Analyzed and visualized the impact of production companies and countries on budget,revenue and popularity.

# Analyze production companies by budget
  movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
    mutate(keywords = trimws(production_companies)) %>%
    group_by(production_companies) %>% summarise(total_budget=sum(budget)) %>% 
    arrange(desc(total_budget))

## # A tibble: 5,026 × 2
##    production_companies                   total_budget
##    <chr>                                         <dbl>
##  1 Warner Bros.                            18327993949
##  2 Universal Pictures                      13707866273
##  3 Paramount Pictures                      13309613007
##  4 Columbia Pictures                       11109274736
##  5 Twentieth Century Fox Film Corporation  10827167800
##  6 Walt Disney Pictures                     9583280000
##  7 New Line Cinema                          6095485000
##  8 Village Roadshow Pictures                5526800003
##  9 Relativity Media                         5367146652
## 10 Touchstone Pictures                      4657484523
## # ℹ 5,016 more rows

# Plot Top 10 production companies by budget
  movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
    mutate(keywords = trimws(production_companies)) %>%
    group_by(production_companies) %>% summarise(total_budget=sum(budget)) %>% 
    arrange(desc(total_budget)) %>% slice_head(n=10) %>% 
    ggplot(aes(x=reorder(production_companies,total_budget),y=total_budget))+
    geom_col(fill="steelblue")+
    labs(x="Production Companies",y="Total Budget",
         title="Top 10 Production Companies by Budget")+ theme_classic()+
    theme(axis.text.x = element_text(angle = 45,hjust=1))

Finding from the plot:

Highest Budget: Warner Bros. ($18.3 billion).
High: Universal Pictures ($13.7 billion), Paramount Pictures ($13.3 billion), Columbia Pictures ($11.1 billion).
Significant: Twentieth Century Fox ($10.8 billion), Walt Disney Pictures ($9.6 billion).
Notable: New Line Cinema ($6.1 billion), Village Roadshow Pictures ($5.5 billion).
Top Ten: Relativity Media ($5.4 billion), Touchstone Pictures ($4.7 billion).

# Analyze production companies by revenue
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_companies)) %>%
  group_by(production_companies) %>% summarise(total_revenue=sum(revenue)) %>% 
  arrange(desc(total_revenue))

## # A tibble: 5,026 × 2
##    production_companies                   total_revenue
##    <chr>                                          <dbl>
##  1 Warner Bros.                             49155747874
##  2 Universal Pictures                       42588465942
##  3 Paramount Pictures                       40878523165
##  4 Twentieth Century Fox Film Corporation   39357151309
##  5 Walt Disney Pictures                     28683256048
##  6 Columbia Pictures                        28599634775
##  7 New Line Cinema                          19444865804
##  8 Amblin Entertainment                     16090835147
##  9 DreamWorks SKG                           14384533626
## 10 Dune Entertainment                       13797504190
## # ℹ 5,016 more rows

# Plot Top 10 production companies by revenue
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_companies)) %>%
  group_by(production_companies) %>% summarise(total_revenue=sum(revenue)) %>% 
  arrange(desc(total_revenue)) %>% slice_head(n=10) %>% 
  ggplot(aes(x=reorder(production_companies,total_revenue),y=total_revenue))+
  geom_col(fill = "steelblue")+
  labs(x="Production Companies",y="Total Revenue",
       title="Top 10 Production Companies by Revenue")+theme_classic()+
  theme(axis.text.x = element_text(angle = 45,hjust=1))

Finding from the plot:

Top Revenue: Warner Bros. ($49.2 billion) and Universal Pictures ($42.6 billion).
High: Paramount Pictures ($40.9 billion) and Twentieth Century Fox ($39.4 billion).
Significant: Walt Disney Pictures ($28.7 billion) and Columbia Pictures ($28.6 billion).
Notable: New Line Cinema ($19.4 billion), Amblin Entertainment ($16.1 billion), DreamWorks SKG ($14.4 billion).
Top Ten: Dune Entertainment ($13.8 billion).

# Analyze production companies by popularity
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_companies)) %>%
  group_by(production_companies) %>% summarise(avg_popularity=mean(popularity)) %>% 
  arrange(desc(avg_popularity))

## # A tibble: 5,026 × 2
##    production_companies             avg_popularity
##    <chr>                                     <dbl>
##  1 The Donners' Company                       515.
##  2 Bulletproof Cupid                          481.
##  3 Kinberg Genre                              327.
##  4 Illumination Entertainment                 235.
##  5 Deluxe Digital Studios                     198.
##  6 Vita-Ray Dutch Productions (III)           198.
##  7 Syncopy                                    192.
##  8 Lynda Obst Productions                     192.
##  9 Cruel & Unusual Films                      156.
## 10 Atman Entertainment                        147.
## # ℹ 5,016 more rows

# Plot Top 10 production companies by popularity
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_companies)) %>%
  group_by(production_companies) %>% summarise(avg_popularity=mean(popularity)) %>% 
  arrange(desc(avg_popularity)) %>% slice_head(n=10) %>% 
  ggplot(aes(x=reorder(production_companies,avg_popularity),y=avg_popularity))+
  geom_col(fill = "steelblue")+
  labs(x="Production Companies",y="Average Popularity",
       title="Top 10 Production Companies by Popularity")+
  theme_classic()+
  theme(axis.text.x = element_text(angle=45,hjust = 1))

Finding from the plot:

Highest Popularity: The Donners’ Company (514.6) and Bulletproof Cupid (481.1).
High: Kinberg Genre (326.9) and Illumination Entertainment (234.9).
Moderate: Deluxe Digital Studios and Vita-Ray Dutch Productions (III) (198.4 each).
Lower: Syncopy (192.4), Lynda Obst Productions (192.2), Cruel & Unusual Films (155.8), Atman Entertainment (146.8).

# Analyze production companies by Profit
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_companies)) %>%
  group_by(production_companies) %>% summarise(total_profit=sum(profit)) %>% 
  arrange(desc(total_profit))

## # A tibble: 5,026 × 2
##    production_companies                   total_profit
##    <chr>                                         <dbl>
##  1 Warner Bros.                            30827753925
##  2 Universal Pictures                      28880599669
##  3 Twentieth Century Fox Film Corporation  28529983509
##  4 Paramount Pictures                      27568910158
##  5 Walt Disney Pictures                    19099976048
##  6 Columbia Pictures                       17490360039
##  7 New Line Cinema                         13349380804
##  8 Amblin Entertainment                    12587035147
##  9 DreamWorks SKG                           9966033626
## 10 Dune Entertainment                       9776504187
## # ℹ 5,016 more rows

# Plot production companies by Profit
movie_analysis %>% separate_rows(production_companies, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_companies)) %>%
  group_by(production_companies) %>% summarise(total_profit=sum(profit)) %>% 
  arrange(desc(total_profit)) %>% slice_head(n=10) %>% 
  ggplot(aes(x = reorder(production_companies, total_profit), y = total_profit )) +
  geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + 
  labs( x = "Production Companies",y = "Total Profit",
        title = "Top 10 Production Companies by Profit Gained") +
  theme_minimal()

Findng from the plot:

Highest Profit: Warner Bros. $30.83 billion and Universal Pictures $28.88 billion.
High Profit: Twentieth Century Fox $28.53 billion and Paramount Pictures $27.57 billion.
Significant Profit: Walt Disney Pictures $19.10 billion and Columbia Pictures $17.49 billion.
Notable Profit: New Line Cinema $13.35 billion and Amblin Entertainment $12.59 billion.
Lower Profit: DreamWorks SKG $9.97 billion and Dune Entertainment $9.78 billion.

# Analyze production countries by Budget
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_countries)) %>%
  group_by(production_countries) %>% summarise(total_budget=sum(budget)) %>% 
  arrange(desc(total_budget))

## # A tibble: 89 × 2
##    production_countries     total_budget
##    <chr>                           <dbl>
##  1 United States of America 132492940438
##  2 United Kingdom            20266758528
##  3 Germany                   12192997188
##  4 France                     6783139181
##  5 Canada                     6329203663
##  6 Australia                  4424322261
##  7 China                      2432225115
##  8 Italy                      2206951350
##  9 New Zealand                2082500000
## 10 Japan                      1734234237
## # ℹ 79 more rows

# Plot production countries by Budget
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_countries)) %>%
  group_by(production_countries) %>% summarise(total_budget=sum(budget)) %>% 
  arrange(desc(total_budget)) %>% slice_head(n=10) %>% 
  ggplot(aes(x = reorder(production_countries, total_budget), y = total_budget)) +
  geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + 
  labs( x = "Production Country",y = "Total Revenue",
        title = "Top 10 Production Countries by Total Budget") +
  theme_minimal()

Finding from the plot:

Highest Budget: United States of America $132.49 billion.
High Budget: United Kingdom $20.27 billion and Germany $12.19 billion.
Significant Budget: France $6.78 billion) and Canada $6.33 billion.
Notable Budget: Australia $4.42 billion and China $2.43 billion.
Lower Budget: Italy $2.21 billion, New Zealand $2.08 billion, and Japan $1.73 billion.

# Analyze Production Countries by Revenue
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_countries)) %>%
  group_by(production_countries) %>% summarise(total_revenue=sum(revenue)) %>% 
  arrange(desc(total_revenue))

## # A tibble: 89 × 2
##    production_countries     total_revenue
##    <chr>                            <dbl>
##  1 United States of America  380026795469
##  2 United Kingdom             60084988968
##  3 Germany                    25480164585
##  4 Canada                     14990315869
##  5 France                     14187924310
##  6 Australia                   9649710497
##  7 New Zealand                 8325644337
##  8 China                       6825078486
##  9 Japan                       5195183028
## 10 Italy                       3871679328
## # ℹ 79 more rows

# Plot Top 10 Production Countries by Revenue
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_countries)) %>%
  group_by(production_countries) %>% summarise(total_revenue=sum(revenue)) %>% 
  arrange(desc(total_revenue)) %>% slice_head(n=10) %>% 
  ggplot(aes(x = reorder(production_countries, total_revenue), y = total_revenue)) +
  geom_bar(stat = "identity", fill = "steelblue") +coord_flip() + 
  labs( x = "Production Country",y = "Total Revenue",
       title = "Top 10 Production Countries by Total Revenue") +
  theme_minimal()

Finding from the plot:

Highest Revenue: United States of America $380.03 billion.
High Revenue: United Kingdom $60.08 billion and Germany $25.48 billion.
Significant Revenue: Canada $14.99 billion and France $14.19 billion.
Notable Revenue: Australia $9.65 billion and New Zealand $8.33 billion.
Lower Revenue: China $6.83 billion, Japan $5.20 billion, and Italy $3.87 billion.

# Analyze production countries by Popularity
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_countries)) %>%
  group_by(production_countries) %>% summarise(avg_popularity=mean(popularity)) %>% 
  arrange(desc(avg_popularity))

## # A tibble: 89 × 2
##    production_countries avg_popularity
##    <chr>                         <dbl>
##  1 Dominica                      146. 
##  2 Jamaica                        97.4
##  3 Malta                          59.1
##  4 Philippines                    54.8
##  5 Bahamas                        46.9
##  6 Taiwan                         42.2
##  7 New Zealand                    42.1
##  8 Singapore                      41.8
##  9 Indonesia                      40.3
## 10 Czech Republic                 31.8
## # ℹ 79 more rows

# Plot production countries by Budget
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_countries)) %>%
  group_by(production_countries) %>% summarise(avg_popularity=mean(popularity)) %>% 
  arrange(desc(avg_popularity)) %>% slice_head(n=10) %>% 
  ggplot(aes(x = reorder(production_countries, avg_popularity), y = avg_popularity )) +
  geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + 
  labs( x = "Production Country",y = "Total Revenue",
        title = "Top 10 Production Countries by Average Popularity") +
  theme_minimal()

Finding from the plot

Highest Popularity: Dominica (145.85).
High Popularity: Jamaica (97.37) and Malta (59.12).
Moderate Popularity: Philippines (54.81) and Bahamas (46.85).
Notable Popularity: Taiwan (42.16), New Zealand (42.13), and Singapore (41.83).
Lower Popularity: Indonesia (40.29) and Czech Republic (31.77).

# Analyze production countries by Profit
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_countries)) %>%
  group_by(production_countries) %>% summarise(total_profit=sum(profit)) %>% 
  arrange(desc(total_profit))

## # A tibble: 89 × 2
##    production_countries     total_profit
##    <chr>                           <dbl>
##  1 United States of America 247533855031
##  2 United Kingdom            39818230440
##  3 Germany                   13287167397
##  4 Canada                     8661112206
##  5 France                     7404785129
##  6 New Zealand                6243144337
##  7 Australia                  5225388236
##  8 China                      4392853371
##  9 Japan                      3460948791
## 10 Italy                      1664727978
## # ℹ 79 more rows

# Plot production countries by Profit
movie_analysis %>% separate_rows(production_countries, sep = ",\\s*") %>%
  mutate(keywords = trimws(production_countries)) %>%
  group_by(production_countries) %>% summarise(total_profit=sum(profit)) %>% 
  arrange(desc(total_profit)) %>% slice_head(n=10) %>% 
  ggplot(aes(x = reorder(production_countries, total_profit), y = total_profit )) +
  geom_bar(stat = "identity", fill = "steelblue") + coord_flip() + 
  labs( x = "Production Country",y = "Total Profit",
        title = "Top 10 Production Countries by Profit Gained") +
  theme_minimal()

Finding from the plot:

Highest Profit: United States of America $247.53 billion.
High Profit: United Kingdom $39.82 billion and Germany $13.29 billion.
Significant Profit: Canada $8.66 billion and France $7.40 billion.
Notable Profit: New Zealand $6.24 billion and Australia $5.23 billion.
Lower Profit: China $4.39 billion, Japan $3.46 billion, and Italy $1.66 billion.

# Analyze movie by voting
movie_analysis %>% 
  arrange(desc(vote_average)) %>% 
  select(title, vote_average) %>% as_tibble()

## # A tibble: 4,803 × 2
##    title                    vote_average
##    <chr>                           <dbl>
##  1 Me You and Five Bucks            10  
##  2 Little Big Top                   10  
##  3 Dancer, Texas Pop. 81            10  
##  4 Stiff Upper Lips                 10  
##  5 Sardaarji                         9.5
##  6 One Man's Hero                    9.3
##  7 The Shawshank Redemption          8.5
##  8 There Goes My Baby                8.5
##  9 The Godfather                     8.4
## 10 The Prisoner of Zenda             8.4
## # ℹ 4,793 more rows

Sentiment Analysis:

Sentiment of Overviews and Taglines: Analyzed and visualized sentiment scores from overviews and taglines.

# Clean the 'overview' and 'tagline' columns
df <- movie_analysis %>%
  mutate(
    overview_clean = str_to_lower(overview),
    overview_clean = str_replace_all(overview_clean, "[^[:alnum:] ]", ""),
    tagline_clean = str_to_lower(tagline),
    tagline_clean = str_replace_all(tagline_clean, "[^[:alnum:] ]", "")
  )
# Perform sentiment analysis using syuzhet
df <- df %>%
  mutate( sentiment_overview = get_sentiment(overview_clean, method = "syuzhet"),
    sentiment_tagline = get_sentiment(tagline_clean, method = "syuzhet"))

# Plot sentiment distribution for 'overview'
ggplot(df, aes(x = sentiment_overview)) +
  geom_histogram(binwidth = 0.1, fill = "skyblue", color = "black") +
  labs(title = "Sentiment Distribution of Film Overviews", x = "Sentiment Score", y = "Frequency")

Finding from the plot:

Most film overviews have neutral sentiment with scores close to zero.
Fewer overviews have highly positive or highly negative sentiment.
The histogram is bell-shaped, indicating that most film overviews have sentiment scores clustered around zero, which suggests that the majority of overviews are neutral in sentiment.
Neutral sentiment is common, but there’s a broad range of sentiments.

# Plot sentiment distribution for 'tagline'
ggplot(df, aes(x = sentiment_tagline)) +
  geom_histogram(binwidth = 0.1, fill = "lightgreen", color = "black") +
  labs(title = "Sentiment Distribution of Film Taglines", x = "Sentiment Score", y = "Frequency")

Finding from the plot:

Most film taglines have a sentiment score of zero, indicating neutrality.
Sentiment scores are concentrated around zero, with few taglines having slightly positive or negative sentiments.
Very few taglines have highly positive or negative sentiment scores.
Taglines are mostly neutral, with minimal sentiment variation.

# correlation with revenue
cor(df$sentiment_overview, df$revenue, use = "complete.obs")

## [1] -0.0007436787

cor(df$sentiment_tagline, df$revenue, use = "complete.obs")

## [1] 0.002233445

Finding from the correlation:

Sentiment Overview and Revenue: The correlation is approximately -0.00074. This is very close to zero, suggesting there is almost no linear relationship between the sentiment overview and revenue.
Sentiment Tagline and Revenue: The correlation is approximately 0.00223. This is also very close to zero, indicating an almost nonexistent linear relationship between the sentiment tagline and revenue.
Both sentiment metrics (overview and tagline) have extremely weak correlations with revenue.

# Plot Sentiment of Overviews vs. Revenue
ggplot(df, aes(x = sentiment_overview, y = revenue)) +
  geom_point() +
  labs(title = "Sentiment of Overviews vs. Revenue", x = "Sentiment Score (Overview)", y = "Revenue")

Finding from the plot:

No strong linear correlation between sentiment scores of movie overviews and revenue.
Most movies cluster around a neutral sentiment score with varying revenues.
Outliers exist with high revenues and near-neutral sentiment scores.
Sentiment of overviews does not strongly predict revenue.

# Plot Sentiment of Taglines vs. Revenue 
ggplot(df, aes(x = sentiment_tagline, y = revenue)) +
  geom_point() +
  labs(title = "Sentiment of Taglines vs. Revenue", x = "Sentiment Score (Tagline)", y = "Revenue")

Finding from the plot:

Lack of strong correlation between sentiment scores of taglines and revenue.
Narrower range of sentiment scores compared to overviews.
Revenue tends to be low at extreme sentiment scores.
Sentiment of taglines also does not strongly predict revenue.

Correlation Analysis:

Budget vs Revenue: Analyzed the correlation and relationship between budget and revenue.

# Budget vs Revenue 
cor(movie_analysis$budget, movie_analysis$revenue, method = "pearson")

## [1] 0.7308229

Finding from the correlation:

There is a strong positive correlation of 0.731 between budget and revenue, suggesting that higher budgets are generally associated with higher revenues for movies.

# Plot Relationship between budget and revenue
plot(movie_analysis$budget, movie_analysis$revenue,
     main = "Relationship between Budget and Revenue",
     xlab = "Budget",
     ylab = "Revenue",
     pch = 19, col = "dodgerblue")

# Add a regression line to visualize the trend
abline(lm(revenue ~ budget, data = movie_analysis), col = "red", lwd = 2)

Finding from the plot:

Positive correlation between budget and revenue.
High variability in revenue, especially for lower budget films.
Presence of outliers with exceptionally high revenues.
Higher budget generally leads to higher revenue, but the relationship is not strictly linear and other factors contribute to revenue.

Rating vs Popularity/Revenue: Examined correlations between ratings, popularity, and revenue.

# Rating vs Popularity
cor(movie_analysis$vote_average, movie_analysis$popularity, method = "pearson")

## [1] 0.2739518

# Rating vs Revenue
cor(movie_analysis$vote_average, movie_analysis$revenue, method = "pearson")

## [1] 0.1971497

Finding from the correlation:

Rating and Popularity: There is a moderate positive correlation 0.274, indicating that higher-rated movies tend to be somewhat more popular, but the relationship is not very strong.
Rating and Revenue: There is a weak positive correlation 0.197, suggesting that higher-rated movies have a slight tendency to generate more revenue, though this relationship is weak.

Cast and Crew Analysis:

Success Metrics by Cast and Crew: Analyzed average revenue and ratings for cast and crew members.

# Separate actor names into individual rows
actors_long <- movie_analysis %>%
  separate_rows(cast, sep = ", ") %>%
  select(id, cast) 

# Separate crew names into individual rows
crew_long <- movie_analysis %>%
  separate_rows(crew, sep = ", ") %>%
  select(id, crew) 

# Aggregate success metrics by cast
cast_success <- actors_long %>%
  inner_join(movie_analysis %>% select(id, revenue, vote_average), by = "id") %>%
  group_by(cast) %>%
  summarise(
    mean_revenue = mean(revenue),
    mean_rating = mean(vote_average),
    n_movies = n()
  )

# Aggregate success metrics by crew
crew_success <- crew_long %>%
  inner_join(movie_analysis %>% select(id, revenue, vote_average), by = "id") %>%
  group_by(crew) %>%
  summarise(
    mean_revenue = mean(revenue),
    mean_rating = mean(vote_average),
    n_movies = n()
  )

Correlations with Number of Movies: Examined relationships between success metrics and the number of movies.

# Correlation for cast and crew
cor(cast_success$mean_revenue, cast_success$n_movies, method = "pearson")

## [1] -0.007276891

cor(crew_success$mean_rating, crew_success$n_movies, method = "pearson")

## [1] -0.03731809

Finding from the correlation:

Cast Success: There is almost no relationship between the average revenue and the number of movies. This means that having more movies doesn’t seem to affect the average revenue much.
Crew Success: There is a very weak negative relationship between the average rating and the number of movies. This suggests that, very slightly, more movies might be associated with slightly lower average ratings, but the effect is minimal.

Hypothesis Testing:

Budget and Revenue: Performed a t-test to compare revenue between high and low-budget movies.

# Define the Hypotheses:
# Null Hypothesis (H0): There is no significant difference in revenue between movies with high and low budgets.
# Alternative Hypothesis (H1): Movies with higher budgets have significantly higher revenues.

# Define high and low budget groups using the median
median_budget <- median(df$budget)
high_budget <- df %>% filter(budget > median_budget) %>% pull(revenue)
low_budget <- df %>% filter(budget <= median_budget) %>% pull(revenue)

# Perform t-test
t.test(high_budget, low_budget)

## 
##  Welch Two Sample t-test
## 
## data:  high_budget and low_budget
## t = 29.888, df = 2513.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  125222700 142807530
## sample estimates:
## mean of x mean of y 
## 152574628  18559513

Finding from the T-Test:

Significant Difference: There is a highly significant difference between high-budget and low-budget groups, with a very small p-value < 2.2 X 10 ^ (-16).
Mean Values: High-budget movies have a much higher mean value $152,574,628 compared to low-budget movies $18,559,513.
Confidence Interval: The 95% confidence interval for the difference in means is large and does not include zero, confirming a substantial difference between the two groups.

Overall, high-budget movies have significantly higher average values compared to low-budget movies.

Genres and Revenue/Ratings: Planned ANOVA to test differences in revenue and ratings across genres.

#Define the Hypotheses:
# Null Hypothesis (H0): There is no significant difference in average ratings or revenue between different genres.
# Alternative Hypothesis (H1): There are significant differences in average ratings or revenue between genres.
# We'll split it into separate rows for each genre
df_genres <- movie_analysis %>%
  separate_rows(genres, sep = '\\|')

# ANOVA for revenue
anova_revenue <- aov(revenue ~ genres, data = df_genres)
summary(anova_revenue)

##               Df    Sum Sq   Mean Sq F value Pr(>F)    
## genres      1174 6.469e+19 5.510e+16    3.19 <2e-16 ***
## Residuals   3628 6.267e+19 1.727e+16                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# ANOVA for ratings
anova_ratings <- aov(vote_average ~ genres, data = df_genres)

Finding from the ANOVA:

Significance: There are statistically significant differences in average ratings among the various genres.
F-Statistic: 1.583
p-Value: < 2e-16 (indicating strong statistical significance)
Conclusion: The analysis shows that genre has a substantial impact on the average ratings.

summary(anova_ratings)

##               Df Sum Sq Mean Sq F value Pr(>F)    
## genres      1174   2321   1.977   1.583 <2e-16 ***
## Residuals   3628   4531   1.249                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Split the production companies and count the number of companies for each movie
movie_analysis$num_companies <- sapply(strsplit(movie_analysis$production_companies, ","), length)

# Now plot the number of production companies against popularity
plot(movie_analysis$num_companies, movie_analysis$popularity,
     main = "Number of Production Companies vs. Popularity",
     xlab = "Number of Production Companies",
     ylab = "Popularity",
     pch = 19, col = "orange")

# Fit a linear model using the numeric count of production companies
model <- lm(popularity ~ num_companies, data = movie_analysis)

# Add the regression line to the plot
abline(model, col = "red", lwd = 2)

Finding from the plot:

Weak Positive Correlation: The scatter plot shows a weak positive correlation between the number of production companies and movie popularity.
Large Variability: Other factors likely play a more significant role in determining popularity.
Regression Line: The regression line does not fit the data well, further emphasizing the weak correlation.
Outliers: A few outliers suggest exceptions to the general trend.

# Correlation between number of production companies and popularity
cor(movie_analysis$num_companies, movie_analysis$popularity)

## [1] 0.1900217

Finding from the correlation:

There is a weak positive correlation between the number of production companies and movie popularity. This suggests that a higher number of production companies is slightly associated with increased popularity, but the relationship is not strong.

Export Combined DataFrame

Finally, we export the cleaned data frame for further analysis to CSV format.

# Load the package to export
library(writexl)

# Export directly CSV file to local computer
write.csv(movie_analysis,"movie_analysis.csv",row.names = FALSE)

Movie_Dataset_Analysis

Sailesh Basnet

2024-08-31

Loading Required Libraries

Loading Movie Dataset

Parse JSON columns in movie dataset

Data Cleaning

Checking and Converting Data Types

Loading Another Movie Dataset

Parse JSON columns in movie dataset

Data Cleaning

Merging Datasets

Handling Missing Values

Confirming Missing Value Handling

Analysis

Finding from the plot:

Finding from the plot:

Finding from the plot:

Finding from the plot:

Popularity Analysis

Finding from the plot:

Finding from the plot:

Runtime & Yearly Trends:

Finding from the plot:

Finding from the plot:

Finding from the plot:

Finding from the plot:

Revenue, Budget, and Popularity by Genre:

Finding from the plot:

Finding from the plot:

Finding from the plot:

Revenue, Budget, and Popularity by Keyword

Finding from the plot:

Finding from the plot:

Finding from the plot:

Language Analysis:

Finding from the plot:

Finding from the plot:

Finding from the plot:

Profit Analysis

Finding from the plot:

Finding from the plot:

Finding from the plot:

Revenue and Popularity by Production Companies and Countries

Finding from the plot:

Finding from the plot:

Finding from the plot:

Findng from the plot:

Finding from the plot:

Finding from the plot:

Finding from the plot

Finding from the plot:

Sentiment Analysis:

Finding from the plot:

Finding from the plot:

Finding from the correlation:

Finding from the plot:

Finding from the plot:

Correlation Analysis:

Finding from the correlation:

Finding from the plot:

Finding from the correlation:

Cast and Crew Analysis:

Finding from the correlation:

Hypothesis Testing:

Finding from the T-Test:

Finding from the ANOVA:

Finding from the plot:

Finding from the correlation:

Export Combined DataFrame