1.0 INTRODUCTION
1.1 LOAD PACKAGES
1.2 LOAD DATASETS
1.3 BASIC STATISTICS OF DATASETS
2.0 DATA WRANGLING
2.1 JOIN THE DATASETS
2.2 EXTRACT WEEKDAY, MONTH, QUARTER AND YEAR FROM RELEASE DATE
2.3 REMOVE DUPLICATES
2.4 CLEANING GENRES, PRODUCTION COMPANIES AND PRODUCTION COUNTRIES
COLUMNS
2.5 REMOVE IRRELEVANT COLUMNS, CLEANING REVENUE AND BUDGET COLUMN
2.6 CLASSIFY THE BUDGET COLUMNS
2.6 CLASSIFY THE MOVIES
3.0 ANALYSIS
3.1 COORELATION AND SUMMARY STATISTICS
3.2 VARIABLES VALUES DISTRIBUTION
3.3 MOVIES TRENDS
3.4 TOP BUDGET MOVIES
3.5 MOST POPULAR MOVIES
3.6 TOP BUDGET AND REVENUE MOVIES
3.7 TOP REVENUE GENERATING AND RATED MOVIES
3.8 REVENUE AMD POPULAR MOVIES BY GENRE
3.9 REVENUE BY QUARTER
3.10 SUPERHIT AND BLOCKBUSTER MOVIES BY TOP PRODUCTION COMPANIES
3.11 TOP NON-ENGLISH ORIGINAL LANGUAGE WITH SUPERHIT AND BLOCKBUSTER
MOVIES
3.12 YEAR WITH HIGHEST NUMBER OF PRODUCED FLOP MOVIES
3.13 YEAR WITH HIGHEST NUMBER OF PRODUCED SUPERHIT MOVIES
3.14 SUPERHIT MOVIES GENRE DISTRIBUTION
4.0 CONCLUSION
The data sets are available in Kaggle, they consist of movies released on or before July 2017. This analysis will explore this data sets to gain some useful insights. Included in the data sets are information on the movies cast, crew, plot keywords, budget, revenue, posters, release dates, languages, production companies, countries, TMDB vote counts and vote averages.
library(tidyverse)
library(lubridate)
ratings<- read_csv("ratings_small.csv")
movies <- read_csv("movies_metadata.csv")
str(movies)
## spec_tbl_df [45,466 × 24] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ adult : logi [1:45466] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ belongs_to_collection: chr [1:45466] "{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path"| __truncated__ NA "{'id': 119050, 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg', 'backdro"| __truncated__ NA ...
## $ budget : num [1:45466] 3.0e+07 6.5e+07 0.0 1.6e+07 0.0 6.0e+07 5.8e+07 0.0 3.5e+07 5.8e+07 ...
## $ genres : chr [1:45466] "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]" "[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]" "[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]" "[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]" ...
## $ homepage : chr [1:45466] "http://toystory.disney.com/toy-story" NA NA NA ...
## $ id : num [1:45466] 862 8844 15602 31357 11862 ...
## $ imdb_id : chr [1:45466] "tt0114709" "tt0113497" "tt0113228" "tt0114885" ...
## $ original_language : chr [1:45466] "en" "en" "en" "en" ...
## $ original_title : chr [1:45466] "Toy Story" "Jumanji" "Grumpier Old Men" "Waiting to Exhale" ...
## $ overview : chr [1:45466] "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. "| __truncated__ "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwi"| __truncated__ "A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanw"| __truncated__ "Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive \"good man\""| __truncated__ ...
## $ popularity : num [1:45466] 21.95 17.02 11.71 3.86 8.39 ...
## $ poster_path : chr [1:45466] "/rhIRbceoE9lR4veEXuwCC2wARtG.jpg" "/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg" "/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg" "/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg" ...
## $ production_companies : chr [1:45466] "[{'name': 'Pixar Animation Studios', 'id': 3}]" "[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communicat"| __truncated__ "[{'name': 'Warner Bros.', 'id': 6194}, {'name': 'Lancaster Gate', 'id': 19464}]" "[{'name': 'Twentieth Century Fox Film Corporation', 'id': 306}]" ...
## $ production_countries : chr [1:45466] "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" ...
## $ release_date : Date[1:45466], format: "1995-10-30" "1995-12-15" ...
## $ revenue : num [1:45466] 3.74e+08 2.63e+08 0.00 8.15e+07 7.66e+07 ...
## $ runtime : num [1:45466] 81 104 101 127 106 170 127 97 106 130 ...
## $ spoken_languages : chr [1:45466] "[{'iso_639_1': 'en', 'name': 'English'}]" "[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]" "[{'iso_639_1': 'en', 'name': 'English'}]" "[{'iso_639_1': 'en', 'name': 'English'}]" ...
## $ status : chr [1:45466] "Released" "Released" "Released" "Released" ...
## $ tagline : chr [1:45466] NA "Roll the dice and unleash the excitement!" "Still Yelling. Still Fighting. Still Ready for Love." "Friends are the people who let you be yourself... and never let you forget it." ...
## $ title : chr [1:45466] "Toy Story" "Jumanji" "Grumpier Old Men" "Waiting to Exhale" ...
## $ video : logi [1:45466] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ vote_average : num [1:45466] 7.7 6.9 6.5 6.1 5.7 7.7 6.2 5.4 5.5 6.6 ...
## $ vote_count : num [1:45466] 5415 2413 92 34 173 ...
## - attr(*, "spec")=
## .. cols(
## .. adult = col_logical(),
## .. belongs_to_collection = col_character(),
## .. budget = col_double(),
## .. genres = col_character(),
## .. homepage = col_character(),
## .. id = col_double(),
## .. imdb_id = col_character(),
## .. original_language = col_character(),
## .. original_title = col_character(),
## .. overview = col_character(),
## .. popularity = col_double(),
## .. poster_path = col_character(),
## .. production_companies = col_character(),
## .. production_countries = col_character(),
## .. release_date = col_date(format = ""),
## .. revenue = col_double(),
## .. runtime = col_double(),
## .. spoken_languages = col_character(),
## .. status = col_character(),
## .. tagline = col_character(),
## .. title = col_character(),
## .. video = col_logical(),
## .. vote_average = col_double(),
## .. vote_count = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
dim(movies)
## [1] 45466 24
str(ratings)
## spec_tbl_df [100,004 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ userId : num [1:100004] 1 1 1 1 1 1 1 1 1 1 ...
## $ movieId : num [1:100004] 31 1029 1061 1129 1172 ...
## $ rating : num [1:100004] 2.5 3 3 2 4 2 2 2 3.5 2 ...
## $ timestamp: num [1:100004] 1.26e+09 1.26e+09 1.26e+09 1.26e+09 1.26e+09 ...
## - attr(*, "spec")=
## .. cols(
## .. userId = col_double(),
## .. movieId = col_double(),
## .. rating = col_double(),
## .. timestamp = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
str(ratings)
## spec_tbl_df [100,004 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ userId : num [1:100004] 1 1 1 1 1 1 1 1 1 1 ...
## $ movieId : num [1:100004] 31 1029 1061 1129 1172 ...
## $ rating : num [1:100004] 2.5 3 3 2 4 2 2 2 3.5 2 ...
## $ timestamp: num [1:100004] 1.26e+09 1.26e+09 1.26e+09 1.26e+09 1.26e+09 ...
## - attr(*, "spec")=
## .. cols(
## .. userId = col_double(),
## .. movieId = col_double(),
## .. rating = col_double(),
## .. timestamp = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
movies <- merge( movies, ratings, by.x = 'id', by.y = 'movieId')
movies <- movies %>%
mutate( weekday = wday(release_date),
month = month(release_date),
quarter = quarter(release_date),
year = year(release_date))
movies <- distinct( movies, original_title, .keep_all = TRUE)
movies <- movies %>%
separate(genres, into = c("a","b", "c","d","e","genres"), sep = "'") %>%
select(-a, -b, -c, -d, -e)
movies3 <- movies %>%
separate(production_companies, into = c("a","b", "c","production_companies"), sep = "'") %>%
select(-a, -b, -c)
movies5 <- movies3 %>%
separate(production_countries, into = c("a","b", "c","production_countries"), sep = "'") %>%
select(-a, -b, -c)
movies <- movies5 %>%
select(-adult,-belongs_to_collection, -homepage, -overview, -poster_path, -spoken_languages,-tagline, -video, -userId)
movies <- movies %>%
filter(budget != 0, revenue != 0)
movies <- movies %>%
mutate(budget_classification =
ifelse(budget > 100000000, "Super_Big_Budget",
ifelse( budget > 50000000, "Big_Budget",
ifelse( budget > 0000000, "Mediun_Budget","Small_Budget"))))
# classifying the movies
movies <- movies %>%
mutate(movie_classification =
ifelse(revenue > 4*budget, "Superhit",
ifelse(revenue > 2.5*budget, "Blockbuster",
ifelse(revenue > 1.5*budget, "Minor_Success","Flop"))))
movies_num <- movies %>%
select (rating, budget, popularity,revenue, runtime
,vote_average, vote_count)
movies_sum <- summary(movies_num)
movies_cor <- cor(movies_num)
movies_sum
## rating budget popularity revenue
## Min. :0.500 Min. : 1 Min. : 0.07518 Min. :5.000e+00
## 1st Qu.:3.000 1st Qu.: 6200000 1st Qu.: 7.70163 1st Qu.:1.688e+07
## Median :3.500 Median : 22000000 Median : 10.38818 Median :6.424e+07
## Mean :3.367 Mean : 36315230 Mean : 11.52431 Mean :1.284e+08
## 3rd Qu.:4.000 3rd Qu.: 50000000 3rd Qu.: 13.29984 3rd Qu.:1.638e+08
## Max. :5.000 Max. :380000000 Max. :140.95024 Max. :1.845e+09
## runtime vote_average vote_count
## Min. : 66.0 Min. :0.000 Min. : 0.0
## 1st Qu.:100.0 1st Qu.:6.100 1st Qu.: 251.5
## Median :113.0 Median :6.700 Median : 562.0
## Mean :115.5 Mean :6.682 Mean : 1085.9
## 3rd Qu.:126.0 3rd Qu.:7.300 3rd Qu.: 1311.5
## Max. :216.0 Max. :8.500 Max. :12269.0
movies_cor
## rating budget popularity revenue runtime
## rating 1.000000000 0.03277123 -0.002377592 0.01583497 -0.01150849
## budget 0.032771232 1.00000000 0.249938635 0.69160130 0.23196259
## popularity -0.002377592 0.24993863 1.000000000 0.40414496 0.23283892
## revenue 0.015834970 0.69160130 0.404144963 1.00000000 0.25798866
## runtime -0.011508490 0.23196259 0.232838924 0.25798866 1.00000000
## vote_average -0.007670521 -0.22630966 0.309431234 0.04632344 0.33385843
## vote_count -0.004739141 0.42120552 0.706050777 0.68594022 0.26652502
## vote_average vote_count
## rating -0.007670521 -0.004739141
## budget -0.226309659 0.421205515
## popularity 0.309431234 0.706050777
## revenue 0.046323444 0.685940216
## runtime 0.333858434 0.266525016
## vote_average 1.000000000 0.374767637
## vote_count 0.374767637 1.000000000
ggplot( data = movies, aes(x = rating)) +
geom_histogram( binwidth = 1 , fill = "orange", color = "black") +
ggtitle(" RATINGS DISTRIBUTION")
ggplot( data = movies, aes(x = rating)) +
geom_boxplot(fill = "orange", color = "black") +
ggtitle(" RATINGS DISTRIBUTION")
ggplot( data = movies, aes( x = budget )) +
geom_histogram(fill = "orange", color = "black") +
ggtitle(" BUDGET DISTRIBUTION")
ggplot( data = movies, aes( x = budget )) +
geom_boxplot(fill = "orange", color = "black") +
ggtitle(" BUDGET DISTRIBUTION")
ggplot( data = movies, aes( x = popularity)) +
geom_histogram(fill = "orange", color = "black") +
ggtitle(" POPULARITY DISTRIBUTION")
ggplot( data = movies, aes( x = popularity)) +
geom_boxplot(fill = "orange", color = "black") +
ggtitle(" POPULARITY DISTRIBUTION")
ggplot( data= movies, aes( x = release_date)) +
geom_boxplot(fill = "orange", color = "black") +
ggtitle(" RELEASE DATE DISTRIBUTION")
ggplot( data= movies, aes( x = release_date)) +
geom_histogram(fill = "orange", color = "black") +
ggtitle(" RELEASE DATE DISTRIBUTION")
ggplot( data= movies, aes( x = runtime)) +
geom_boxplot(fill = "orange", color = "black") +
ggtitle(" RUNTIME DISTRIBUTION")
ggplot( data= movies, aes( x = runtime)) +
geom_histogram(fill = "orange", color = "black") +
ggtitle(" RUNTIME DISTRIBUTION")
movie_genre <- movies %>%
group_by(genres) %>%
summarise(frequency = n())%>%
arrange(desc(frequency))%>%
filter(genres != "NA")
ggplot(data = movie_genre, aes(x = reorder(genres, frequency), y = frequency)) +
geom_bar( stat = "identity" ,fill = "orange", color = "black") +
coord_flip() +
xlab("genre") +
ggtitle(" GENRE DISTRIBUTION")
movie_lang <- movies %>%
group_by(original_language) %>%
summarise(frequency = n())%>%
arrange(desc(frequency))%>%
filter(frequency >10)
ggplot(data = movie_lang, aes(x = reorder(original_language, frequency), y = frequency)) +
geom_bar( stat = "identity" ,fill = "orange", color = "black") +
coord_flip() +
xlab( "original language") +
ggtitle(" ORIGINAL LANGUAGE DISTRIBUTION")
movie_companies <- movies %>%
group_by(production_companies) %>%
summarise(frequency = n())%>%
filter(frequency >10 & production_companies != "NA") %>%
arrange(desc(frequency))
ggplot(data = movie_companies, aes(x = reorder(production_companies, frequency), y = frequency)) +
geom_bar( stat = "identity" ,fill = "orange", color = "black") +
coord_flip() +
xlab( "production companies" ) +
ggtitle("PRODUCTION COMPANIES DISTRIBUTION")
top_countries <- movies %>%
group_by(production_countries) %>%
filter(production_countries != "NA")%>%
summarise( count = n()) %>%
arrange(desc(count)) %>%
head(10)
ggplot( data= top_countries, aes( x = reorder(production_countries, count), y = count)) +
geom_bar(stat = "identity", fill = "orange", color = "black") +
xlab( "production_countries" ) +
coord_flip() +
ggtitle("TOP PRODUCTION COUNTRIES DISTRIBUTION")
ggplot( data = movies, aes( x = year ,y = budget, color = genres)) +
geom_point() +
theme(legend.position = "right") +
ggtitle("BUDGET TREND")
revenue_trend <- movies %>%
select(year, revenue) %>%
group_by(year)%>%
summarise(revenue = sum(revenue))%>%
filter( year > 1990)
ggplot( data = revenue_trend, aes( x = year ,y = revenue)) +
geom_col(fill = "orange", color = "black") +
ggtitle("REVENUE TREND")
top_budget <- movies %>%
select( original_title, budget) %>%
arrange(desc(budget))
top_budget_movies <- head(top_budget, 10)
ggplot(data = top_budget_movies, aes(x = reorder(original_title, budget), y = budget)) +
geom_col(fill = "orange", color = "black") +
xlab( "original_title" ) +
coord_flip() +
ggtitle(" TOP BUDGET MOVIES")
most_popular <- movies %>%
select( original_title, popularity) %>%
arrange(desc(popularity))
most_popular_movies <- head(most_popular, 10)
ggplot(data = most_popular_movies, aes(x = reorder(original_title, popularity), y = popularity)) +
geom_col(fill = "orange", color = "black") +
xlab( "original_title" ) +
coord_flip()+
ggtitle(" MOST POPULAR MOVIES")
budget_revenue <- movies %>%
select( original_title, budget, revenue, genres, rating) %>%
arrange(desc(budget))
ggplot(data = budget_revenue, aes(x = budget, y = revenue, color = genres)) +
geom_point() +
theme(legend.position = "right") +
ggtitle(" TOP BUDGET AND REVENUE MOVIES BY GENRE")
top_revenue_rating <- movies %>%
select( original_title, budget, genres, revenue, rating) %>%
arrange(desc(revenue))
ggplot(data = top_revenue_rating, aes(y = revenue, x= rating, fill= genres ))+
geom_col( stat= "identity") +
ggtitle(" REVENUE AND RATING MOVIES BY GENRE")
top_popular_revenue <- movies %>%
select( original_title, revenue,genres,popularity) %>%
arrange(desc(popularity))
ggplot(data = top_popular_revenue, aes( x = popularity, y= revenue, fill= genres ))+
geom_point() +
ggtitle(" REVENUE AND POPULARITY MOVIES BY GENRE")
revenue_quarter <- movies %>%
select( quarter, revenue) %>%
group_by (quarter) %>%
summarise( revenue = sum(revenue))%>%
filter( quarter != "NA")
ggplot(data =revenue_quarter, aes( x= quarter, y = revenue))+
geom_col( fill= "orange", color = "black") +
ggtitle(" REVENUE BY QUARTER")
companies <- movies %>%
filter( movie_classification == "Superhit" |
movie_classification == "Blockbuster")%>%
group_by(production_companies) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
head(10)
ggplot( data = companies, aes( x= reorder(production_companies, count),
y= count)) +
geom_bar( stat = "identity", color = "black", fill = "orange") +
coord_flip() +
labs(x = "Production Companies") +
ggtitle(" Top Production Companies")
non_english <- movies %>%
filter(original_language != "en",
movie_classification == "Superhit" |
movie_classification == "Blockbuster") %>%
group_by(original_language) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
head(10)
ggplot( data = non_english, aes( x= reorder(original_language, count),
y= count)) +
geom_bar( stat = "identity", color = "black", fill = "orange") +
coord_flip() +
labs(x = "Original Languages")
ggtitle(" Top Original Language")
## $title
## [1] " Top Original Language"
##
## attr(,"class")
## [1] "labels"
flop <- movies %>%
filter( movie_classification == "Flop" ) %>%
group_by(year) %>%
summarize(count = n()) %>%
arrange(desc(count))
ggplot( data = flop, aes( x= year, y= count)) +
geom_line() +
ggtitle(" Flop Movies Trend")
superhit <- movies %>%
filter( movie_classification == "Superhit" ) %>%
group_by(year) %>%
summarize(count = n()) %>%
arrange(desc(count))
ggplot( data = superhit, aes( x= year, y= count)) +
geom_line() +
ggtitle(" Superhit Movies Trend")
genre <- movies %>%
filter( movie_classification == "Superhit", genres != "NA") %>%
group_by(genres) %>%
summarize(count = n()) %>%
arrange(desc(count))
ggplot( data = genre, aes( x= reorder(genres, count),
y= count)) +
geom_bar( stat = "identity", color = "black", fill = "orange") +
coord_flip() +
labs(x = "Genres")
ggtitle(" Superhit Movies Genre")
## $title
## [1] " Superhit Movies Genre"
##
## attr(,"class")
## [1] "labels"