TABLE OF CONTENTS

1.0 INTRODUCTION
1.1 LOAD PACKAGES
1.2 LOAD DATASETS
1.3 BASIC STATISTICS OF DATASETS

2.0 DATA WRANGLING
2.1 JOIN THE DATASETS
2.2 EXTRACT WEEKDAY, MONTH, QUARTER AND YEAR FROM RELEASE DATE
2.3 REMOVE DUPLICATES
2.4 CLEANING GENRES, PRODUCTION COMPANIES AND PRODUCTION COUNTRIES COLUMNS
2.5 REMOVE IRRELEVANT COLUMNS, CLEANING REVENUE AND BUDGET COLUMN
2.6 CLASSIFY THE BUDGET COLUMNS
2.6 CLASSIFY THE MOVIES

3.0 ANALYSIS
3.1 COORELATION AND SUMMARY STATISTICS
3.2 VARIABLES VALUES DISTRIBUTION
3.3 MOVIES TRENDS
3.4 TOP BUDGET MOVIES
3.5 MOST POPULAR MOVIES
3.6 TOP BUDGET AND REVENUE MOVIES
3.7 TOP REVENUE GENERATING AND RATED MOVIES
3.8 REVENUE AMD POPULAR MOVIES BY GENRE
3.9 REVENUE BY QUARTER
3.10 SUPERHIT AND BLOCKBUSTER MOVIES BY TOP PRODUCTION COMPANIES
3.11 TOP NON-ENGLISH ORIGINAL LANGUAGE WITH SUPERHIT AND BLOCKBUSTER MOVIES
3.12 YEAR WITH HIGHEST NUMBER OF PRODUCED FLOP MOVIES
3.13 YEAR WITH HIGHEST NUMBER OF PRODUCED SUPERHIT MOVIES
3.14 SUPERHIT MOVIES GENRE DISTRIBUTION

4.0 CONCLUSION

1.0 INTRODUCTION

The data sets are available in Kaggle, they consist of movies released on or before July 2017. This analysis will explore this data sets to gain some useful insights. Included in the data sets are information on the movies cast, crew, plot keywords, budget, revenue, posters, release dates, languages, production companies, countries, TMDB vote counts and vote averages.

1.1 LOAD PACKAGES

library(tidyverse)
library(lubridate)

1.2 LOAD DATASETS

ratings<- read_csv("ratings_small.csv")
movies <- read_csv("movies_metadata.csv")

1.3 BASIC STATISTICS OF DATASETS

str(movies)
## spec_tbl_df [45,466 × 24] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ adult                : logi [1:45466] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ belongs_to_collection: chr [1:45466] "{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path"| __truncated__ NA "{'id': 119050, 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg', 'backdro"| __truncated__ NA ...
##  $ budget               : num [1:45466] 3.0e+07 6.5e+07 0.0 1.6e+07 0.0 6.0e+07 5.8e+07 0.0 3.5e+07 5.8e+07 ...
##  $ genres               : chr [1:45466] "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]" "[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]" "[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]" "[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]" ...
##  $ homepage             : chr [1:45466] "http://toystory.disney.com/toy-story" NA NA NA ...
##  $ id                   : num [1:45466] 862 8844 15602 31357 11862 ...
##  $ imdb_id              : chr [1:45466] "tt0114709" "tt0113497" "tt0113228" "tt0114885" ...
##  $ original_language    : chr [1:45466] "en" "en" "en" "en" ...
##  $ original_title       : chr [1:45466] "Toy Story" "Jumanji" "Grumpier Old Men" "Waiting to Exhale" ...
##  $ overview             : chr [1:45466] "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. "| __truncated__ "When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwi"| __truncated__ "A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanw"| __truncated__ "Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive \"good man\""| __truncated__ ...
##  $ popularity           : num [1:45466] 21.95 17.02 11.71 3.86 8.39 ...
##  $ poster_path          : chr [1:45466] "/rhIRbceoE9lR4veEXuwCC2wARtG.jpg" "/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg" "/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg" "/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg" ...
##  $ production_companies : chr [1:45466] "[{'name': 'Pixar Animation Studios', 'id': 3}]" "[{'name': 'TriStar Pictures', 'id': 559}, {'name': 'Teitler Film', 'id': 2550}, {'name': 'Interscope Communicat"| __truncated__ "[{'name': 'Warner Bros.', 'id': 6194}, {'name': 'Lancaster Gate', 'id': 19464}]" "[{'name': 'Twentieth Century Fox Film Corporation', 'id': 306}]" ...
##  $ production_countries : chr [1:45466] "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" "[{'iso_3166_1': 'US', 'name': 'United States of America'}]" ...
##  $ release_date         : Date[1:45466], format: "1995-10-30" "1995-12-15" ...
##  $ revenue              : num [1:45466] 3.74e+08 2.63e+08 0.00 8.15e+07 7.66e+07 ...
##  $ runtime              : num [1:45466] 81 104 101 127 106 170 127 97 106 130 ...
##  $ spoken_languages     : chr [1:45466] "[{'iso_639_1': 'en', 'name': 'English'}]" "[{'iso_639_1': 'en', 'name': 'English'}, {'iso_639_1': 'fr', 'name': 'Français'}]" "[{'iso_639_1': 'en', 'name': 'English'}]" "[{'iso_639_1': 'en', 'name': 'English'}]" ...
##  $ status               : chr [1:45466] "Released" "Released" "Released" "Released" ...
##  $ tagline              : chr [1:45466] NA "Roll the dice and unleash the excitement!" "Still Yelling. Still Fighting. Still Ready for Love." "Friends are the people who let you be yourself... and never let you forget it." ...
##  $ title                : chr [1:45466] "Toy Story" "Jumanji" "Grumpier Old Men" "Waiting to Exhale" ...
##  $ video                : logi [1:45466] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ vote_average         : num [1:45466] 7.7 6.9 6.5 6.1 5.7 7.7 6.2 5.4 5.5 6.6 ...
##  $ vote_count           : num [1:45466] 5415 2413 92 34 173 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   adult = col_logical(),
##   ..   belongs_to_collection = col_character(),
##   ..   budget = col_double(),
##   ..   genres = col_character(),
##   ..   homepage = col_character(),
##   ..   id = col_double(),
##   ..   imdb_id = col_character(),
##   ..   original_language = col_character(),
##   ..   original_title = col_character(),
##   ..   overview = col_character(),
##   ..   popularity = col_double(),
##   ..   poster_path = col_character(),
##   ..   production_companies = col_character(),
##   ..   production_countries = col_character(),
##   ..   release_date = col_date(format = ""),
##   ..   revenue = col_double(),
##   ..   runtime = col_double(),
##   ..   spoken_languages = col_character(),
##   ..   status = col_character(),
##   ..   tagline = col_character(),
##   ..   title = col_character(),
##   ..   video = col_logical(),
##   ..   vote_average = col_double(),
##   ..   vote_count = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
dim(movies)
## [1] 45466    24
str(ratings)
## spec_tbl_df [100,004 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ userId   : num [1:100004] 1 1 1 1 1 1 1 1 1 1 ...
##  $ movieId  : num [1:100004] 31 1029 1061 1129 1172 ...
##  $ rating   : num [1:100004] 2.5 3 3 2 4 2 2 2 3.5 2 ...
##  $ timestamp: num [1:100004] 1.26e+09 1.26e+09 1.26e+09 1.26e+09 1.26e+09 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   userId = col_double(),
##   ..   movieId = col_double(),
##   ..   rating = col_double(),
##   ..   timestamp = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(ratings)
## spec_tbl_df [100,004 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ userId   : num [1:100004] 1 1 1 1 1 1 1 1 1 1 ...
##  $ movieId  : num [1:100004] 31 1029 1061 1129 1172 ...
##  $ rating   : num [1:100004] 2.5 3 3 2 4 2 2 2 3.5 2 ...
##  $ timestamp: num [1:100004] 1.26e+09 1.26e+09 1.26e+09 1.26e+09 1.26e+09 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   userId = col_double(),
##   ..   movieId = col_double(),
##   ..   rating = col_double(),
##   ..   timestamp = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

2.0 DATA WRANGLING

2.1 JOIN THE DATASETS

movies <- merge( movies, ratings, by.x = 'id', by.y = 'movieId')

2.2 EXTRACT WEEKDAY, MONTH, QUARTER AND YEAR FROM RELEASE DATE

movies <- movies %>%
  mutate( weekday = wday(release_date),
          month = month(release_date),
          quarter = quarter(release_date),
          year = year(release_date))

2.3 REMOVE DUPLICATES

movies <- distinct( movies, original_title, .keep_all = TRUE)

2.4 CLEANING GENRES, PRODUCTION COMPANIES AND PRODUCTION COUNTRIES COLUMNS

movies <- movies %>%
  separate(genres, into = c("a","b", "c","d","e","genres"), sep = "'") %>%
  select(-a, -b, -c, -d, -e)

movies3 <- movies %>%
  separate(production_companies, into = c("a","b", "c","production_companies"), sep = "'") %>%
  select(-a, -b, -c)

movies5 <- movies3 %>%
  separate(production_countries, into = c("a","b", "c","production_countries"), sep = "'") %>%
  select(-a, -b, -c)

2.5 REMOVING IRRELEVANT COLUMNS, CLEANING REVENUE AND BUDGET COLUMN

movies <- movies5 %>%
  select(-adult,-belongs_to_collection, -homepage, -overview, -poster_path, -spoken_languages,-tagline, -video, -userId)
movies <- movies %>%  
   filter(budget != 0, revenue != 0)

2.6 CLASSIFY THE BUDGET COLUMNS

  • Budget greater than $100m are classified as Super Big Budget.
  • Budget between $50m & $100m are classified as Big Budget.
  • Budget between $10m & $50m are classified as Medium Budget.
  • Budget less than $10m are classified as Small Budget.
movies <- movies %>%
  mutate(budget_classification = 
           ifelse(budget > 100000000, "Super_Big_Budget",
           ifelse( budget > 50000000, "Big_Budget",
           ifelse( budget > 0000000, "Mediun_Budget","Small_Budget"))))

2.6 CLASSIFY THE MOVIES

  • Movies whose Revenue is four times greater than its budget are classified as Superhit.
  • Movies whose Revenue is two and half times greater than its budget are classified as Blockbuster.
  • Movies whose Revenue is one and half times greater than its budget are classified as Minor Success.
  • Movies whose Revenue is same or less than its budget are classified as Flop.
# classifying the movies
movies <- movies %>%
  mutate(movie_classification = 
           ifelse(revenue > 4*budget, "Superhit",
           ifelse(revenue > 2.5*budget, "Blockbuster",
           ifelse(revenue > 1.5*budget, "Minor_Success","Flop"))))

3.0 ANALYSIS

3.1 COORELATION AND SUMMARY STATISTICS

movies_num <- movies %>%
  select (rating, budget, popularity,revenue, runtime
          ,vote_average, vote_count)
movies_sum <- summary(movies_num)
movies_cor <- cor(movies_num)
movies_sum
##      rating          budget            popularity           revenue         
##  Min.   :0.500   Min.   :        1   Min.   :  0.07518   Min.   :5.000e+00  
##  1st Qu.:3.000   1st Qu.:  6200000   1st Qu.:  7.70163   1st Qu.:1.688e+07  
##  Median :3.500   Median : 22000000   Median : 10.38818   Median :6.424e+07  
##  Mean   :3.367   Mean   : 36315230   Mean   : 11.52431   Mean   :1.284e+08  
##  3rd Qu.:4.000   3rd Qu.: 50000000   3rd Qu.: 13.29984   3rd Qu.:1.638e+08  
##  Max.   :5.000   Max.   :380000000   Max.   :140.95024   Max.   :1.845e+09  
##     runtime       vote_average     vote_count     
##  Min.   : 66.0   Min.   :0.000   Min.   :    0.0  
##  1st Qu.:100.0   1st Qu.:6.100   1st Qu.:  251.5  
##  Median :113.0   Median :6.700   Median :  562.0  
##  Mean   :115.5   Mean   :6.682   Mean   : 1085.9  
##  3rd Qu.:126.0   3rd Qu.:7.300   3rd Qu.: 1311.5  
##  Max.   :216.0   Max.   :8.500   Max.   :12269.0
movies_cor
##                    rating      budget   popularity    revenue     runtime
## rating        1.000000000  0.03277123 -0.002377592 0.01583497 -0.01150849
## budget        0.032771232  1.00000000  0.249938635 0.69160130  0.23196259
## popularity   -0.002377592  0.24993863  1.000000000 0.40414496  0.23283892
## revenue       0.015834970  0.69160130  0.404144963 1.00000000  0.25798866
## runtime      -0.011508490  0.23196259  0.232838924 0.25798866  1.00000000
## vote_average -0.007670521 -0.22630966  0.309431234 0.04632344  0.33385843
## vote_count   -0.004739141  0.42120552  0.706050777 0.68594022  0.26652502
##              vote_average   vote_count
## rating       -0.007670521 -0.004739141
## budget       -0.226309659  0.421205515
## popularity    0.309431234  0.706050777
## revenue       0.046323444  0.685940216
## runtime       0.333858434  0.266525016
## vote_average  1.000000000  0.374767637
## vote_count    0.374767637  1.000000000

3.2 VARIABLES VALUES DISTRIBUTION

  • Ratings distribution
  • Budget distribution
  • Popularity distribution
  • Release date distribution
  • Runtime distribution
  • Genre distribution
  • Original language distribution
  • Production Companies
  • Production Countries
ggplot( data = movies, aes(x = rating)) +
  geom_histogram( binwidth = 1  , fill = "orange", color = "black") +
  ggtitle(" RATINGS DISTRIBUTION")

ggplot( data = movies, aes(x = rating)) +
  geom_boxplot(fill = "orange", color = "black") +
  ggtitle(" RATINGS DISTRIBUTION")

ggplot( data = movies, aes( x = budget )) +
  geom_histogram(fill = "orange", color = "black") +
  ggtitle(" BUDGET DISTRIBUTION")

ggplot( data = movies, aes( x = budget )) +
  geom_boxplot(fill = "orange", color = "black") +
  ggtitle(" BUDGET DISTRIBUTION")

ggplot( data = movies, aes( x = popularity)) +
  geom_histogram(fill = "orange", color = "black") +
  ggtitle(" POPULARITY DISTRIBUTION")

ggplot( data = movies, aes( x = popularity)) +
  geom_boxplot(fill = "orange", color = "black") +
  ggtitle(" POPULARITY DISTRIBUTION")

ggplot( data= movies, aes( x = release_date)) +
  geom_boxplot(fill = "orange", color = "black") +
  ggtitle(" RELEASE DATE DISTRIBUTION")

ggplot( data= movies, aes( x = release_date)) +
  geom_histogram(fill = "orange", color = "black") +
  ggtitle(" RELEASE DATE DISTRIBUTION")

ggplot( data= movies, aes( x = runtime)) +
  geom_boxplot(fill = "orange", color = "black") +
  ggtitle(" RUNTIME DISTRIBUTION")

ggplot( data= movies, aes( x = runtime)) +
  geom_histogram(fill = "orange", color = "black") +
  ggtitle(" RUNTIME DISTRIBUTION")

movie_genre <- movies %>%
  group_by(genres) %>%
  summarise(frequency = n())%>%
  arrange(desc(frequency))%>%
  filter(genres != "NA")
ggplot(data = movie_genre, aes(x = reorder(genres, frequency), y = frequency)) +
  geom_bar( stat = "identity" ,fill = "orange", color = "black") +
  coord_flip() +
  xlab("genre") +
  ggtitle(" GENRE DISTRIBUTION")

movie_lang <- movies %>%
  group_by(original_language) %>%
  summarise(frequency = n())%>%
  arrange(desc(frequency))%>%
  filter(frequency >10)

ggplot(data = movie_lang, aes(x = reorder(original_language, frequency), y = frequency)) +
  geom_bar( stat = "identity" ,fill = "orange", color = "black") +
  coord_flip() +
  xlab(  "original language") +
  ggtitle(" ORIGINAL LANGUAGE DISTRIBUTION")

movie_companies <- movies %>%
  group_by(production_companies) %>%
  summarise(frequency = n())%>%
  filter(frequency >10 & production_companies != "NA") %>%
  arrange(desc(frequency))

ggplot(data = movie_companies, aes(x = reorder(production_companies, frequency), y = frequency)) +
  geom_bar( stat = "identity" ,fill = "orange", color = "black") +
  coord_flip() +
  xlab(  "production companies" ) +
  ggtitle("PRODUCTION COMPANIES DISTRIBUTION")

top_countries <- movies %>%
  group_by(production_countries) %>%
  filter(production_countries != "NA")%>%
  summarise( count = n()) %>%
  arrange(desc(count)) %>%
  head(10)

ggplot( data= top_countries, aes( x = reorder(production_countries, count), y = count)) +
  geom_bar(stat = "identity", fill = "orange", color = "black") +
    xlab(  "production_countries" ) +
  coord_flip() +
ggtitle("TOP PRODUCTION COUNTRIES DISTRIBUTION")

3.4 TOP BUDGET MOVIES

top_budget <- movies  %>%
  select( original_title, budget) %>%
  arrange(desc(budget))
top_budget_movies <- head(top_budget, 10)
ggplot(data = top_budget_movies, aes(x = reorder(original_title, budget), y = budget)) +
  geom_col(fill = "orange", color = "black") +
    xlab(  "original_title" ) +
  coord_flip() +
ggtitle(" TOP BUDGET MOVIES")

3.6 TOP BUDGET AND REVENUE MOVIES

budget_revenue <- movies  %>%
  select( original_title, budget, revenue, genres, rating) %>%
  arrange(desc(budget))
ggplot(data = budget_revenue, aes(x =  budget, y = revenue, color = genres)) +
  geom_point() +
  theme(legend.position = "right") +
  ggtitle(" TOP BUDGET AND REVENUE MOVIES BY GENRE")

3.7 TOP REVENUE GENERATING AND RATED MOVIES

top_revenue_rating <- movies  %>%
  select( original_title, budget, genres, revenue, rating) %>%
  arrange(desc(revenue))
ggplot(data = top_revenue_rating, aes(y =  revenue, x= rating, fill= genres ))+
  geom_col( stat= "identity") +
  ggtitle(" REVENUE AND RATING MOVIES BY GENRE")

3.9 REVENUE BY QUARTER

revenue_quarter <- movies %>%
  select( quarter, revenue) %>%
  group_by (quarter) %>%
  summarise( revenue = sum(revenue))%>%
  filter( quarter != "NA")
ggplot(data =revenue_quarter, aes( x= quarter, y =  revenue))+
  geom_col(  fill= "orange", color = "black") +
  ggtitle(" REVENUE BY QUARTER")

3.10 SUPERHIT AND BLOCKBUSTER MOVIES BY TOP PRODUCTION COMPANIES

companies <- movies %>%
  filter( movie_classification == "Superhit" |
            movie_classification == "Blockbuster")%>%
  group_by(production_companies) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)
ggplot( data = companies, aes( x= reorder(production_companies, count),
                               y= count)) +
  geom_bar( stat = "identity", color = "black", fill = "orange") +
  coord_flip() +
  labs(x = "Production Companies") +
  ggtitle(" Top Production Companies")

3.11 TOP NON-ENGLISH ORIGINAL LANGUAGE WITH SUPERHIT AND BLOCKBUSTER MOVIES

non_english <- movies %>%
  filter(original_language != "en",
         movie_classification == "Superhit" |
           movie_classification == "Blockbuster") %>%
  group_by(original_language) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)               
ggplot( data = non_english, aes( x= reorder(original_language, count),
                                 y= count)) +
  geom_bar( stat = "identity", color = "black", fill = "orange") +
  coord_flip() +
  labs(x = "Original Languages")

ggtitle(" Top Original Language")
## $title
## [1] " Top Original Language"
## 
## attr(,"class")
## [1] "labels"

3.12 YEAR WITH HIGHEST NUMBER OF PRODUCED FLOP MOVIES

flop <- movies %>%
  filter( movie_classification == "Flop" ) %>%
  group_by(year) %>%
  summarize(count = n()) %>%
  arrange(desc(count))
ggplot( data = flop, aes( x= year, y= count)) +
  geom_line() +
  ggtitle(" Flop Movies Trend")

3.13 YEAR WITH HIGHEST NUMBER OF PRODUCED SUPERHIT MOVIES

superhit <- movies %>%
  filter( movie_classification == "Superhit" ) %>%
  group_by(year) %>%
  summarize(count = n()) %>%
  arrange(desc(count))
ggplot( data = superhit, aes( x= year, y= count)) +
  geom_line() +
  ggtitle(" Superhit Movies Trend")

3.14 SUPERHIT MOVIES GENRE DISTRIBUTION

genre <- movies %>%
  filter( movie_classification == "Superhit", genres != "NA") %>%
  group_by(genres) %>%
  summarize(count = n()) %>%
  arrange(desc(count))
ggplot( data = genre, aes( x= reorder(genres, count),
                           y= count)) +
  geom_bar( stat = "identity", color = "black", fill = "orange") +
  coord_flip() +
  labs(x = "Genres")

ggtitle(" Superhit Movies Genre")
## $title
## [1] " Superhit Movies Genre"
## 
## attr(,"class")
## [1] "labels"

4.0 CONCLUSION