Explanation

Hello! This report presents an analysis of Marvel movies released from 1986 to 2021.

Data Explanatory

Data Input & Structure

# Load the cleaned Marvel box-office data set from the working directory.
marvel <- read.csv(file = "marvel_clean.csv")

Inspect and check the data structure

# Show each column's type and first values to see what needs converting.
str(marvel)
## 'data.frame':    64 obs. of  9 variables:
##  $ Title                     : chr  "Howard the Duck" "Blade" "X-Men" "Blade II" ...
##  $ Distributor               : chr  "Universal Pictures" "New Line Cinema" "20th Century Fox" "New Line Cinema" ...
##  $ ReleaseDateUS             : chr  "8/1/1986 0:00" "8/21/1998 0:00" "7/14/2000 0:00" "3/22/2002 0:00" ...
##  $ Budget                    : int  37000000 45000000 75000000 54000000 139000000 78000000 110000000 137000000 33000000 200000000 ...
##  $ OpeningWeekendNorthAmerica: int  5070136 17073856 54471475 32528016 114844116 40310419 85558731 62128420 13834527 88156227 ...
##  $ NorthAmerica              : int  16295774 70087718 157299717 82348319 403706375 102543518 214949694 132177234 33810189 373585825 ...
##  $ OtherTerritories          : int  21667000 61095812 139039810 72661713 418002176 76636200 192761855 113183246 20889916 415390628 ...
##  $ Worldwide                 : num  3.80e+07 1.31e+08 2.96e+08 1.55e+08 8.22e+08 ...
##  $ CinemaScoreReview         : chr  "B-" "A-" "A-" "B+" ...

Based on the structure above, we need to convert several character variables to factors and the release date to a date

# Turn the text columns into factors in a single step; ReleaseDateUS is
# parsed into a proper date further below.
marvel[c("Title", "Distributor", "CinemaScoreReview", "ReleaseDateUS")] <-
  lapply(
    marvel[c("Title", "Distributor", "CinemaScoreReview", "ReleaseDateUS")],
    as.factor
  )

# Confirm the new column types.
str(marvel)
## 'data.frame':    64 obs. of  9 variables:
##  $ Title                     : Factor w/ 63 levels "Ant-Man","Ant-Man and the Wasp",..: 29 9 57 10 38 16 63 30 50 39 ...
##  $ Distributor               : Factor w/ 9 levels "20th Century Fox",..: 8 5 1 5 7 1 1 8 4 7 ...
##  $ ReleaseDateUS             : Factor w/ 64 levels "1/14/2005 0:00",..: 58 60 48 18 34 14 28 44 21 46 ...
##  $ Budget                    : int  37000000 45000000 75000000 54000000 139000000 78000000 110000000 137000000 33000000 200000000 ...
##  $ OpeningWeekendNorthAmerica: int  5070136 17073856 54471475 32528016 114844116 40310419 85558731 62128420 13834527 88156227 ...
##  $ NorthAmerica              : int  16295774 70087718 157299717 82348319 403706375 102543518 214949694 132177234 33810189 373585825 ...
##  $ OtherTerritories          : int  21667000 61095812 139039810 72661713 418002176 76636200 192761855 113183246 20889916 415390628 ...
##  $ Worldwide                 : num  3.80e+07 1.31e+08 2.96e+08 1.55e+08 8.22e+08 ...
##  $ CinemaScoreReview         : Factor w/ 7 levels "A","A-","A+",..: 5 2 2 6 2 4 1 5 NA 2 ...
# Parse the month/day/year text into date-times; the printout below shows
# the values carrying a +07 zone offset.
marvel$ReleaseDateUS <- strptime(marvel$ReleaseDateUS, format="%m/%d/%Y")

# Print the parsed values to check that every release date was read.
marvel$ReleaseDateUS
##  [1] "1986-08-01 +07" "1998-08-21 +07" "2000-07-14 +07" "2002-03-22 +07"
##  [5] "2002-05-03 +07" "2003-02-14 +07" "2003-05-02 +07" "2003-06-20 +07"
##  [9] "2004-04-16 +07" "2004-06-30 +07" "2004-12-08 +07" "2005-01-14 +07"
## [13] "2005-07-08 +07" "2006-05-26 +07" "2007-02-16 +07" "2007-05-04 +07"
## [17] "2007-06-15 +07" "2008-05-02 +07" "2008-06-13 +07" "2008-12-05 +07"
## [21] "2009-05-01 +07" "2010-05-07 +07" "2011-05-06 +07" "2011-06-03 +07"
## [25] "2011-07-22 +07" "2012-02-17 +07" "2012-05-04 +07" "2012-07-03 +07"
## [29] "2013-05-03 +07" "2013-07-26 +07" "2013-11-08 +07" "2014-04-04 +07"
## [33] "2014-05-02 +07" "2014-05-23 +07" "2014-08-01 +07" "2014-11-07 +07"
## [37] "2015-05-01 +07" "2015-07-17 +07" "2015-08-07 +07" "2016-02-12 +07"
## [41] "2016-05-06 +07" "2016-05-27 +07" "2016-11-04 +07" "2017-03-03 +07"
## [45] "2017-05-05 +07" "2017-07-07 +07" "2017-09-01 +07" "2017-11-03 +07"
## [49] "2018-02-16 +07" "2018-04-27 +07" "2018-05-18 +07" "2018-07-06 +07"
## [53] "2018-10-05 +07" "2018-12-14 +07" "2019-03-08 +07" "2019-04-26 +07"
## [57] "2019-06-07 +07" "2019-07-02 +07" "2020-08-28 +07" "2021-07-09 +07"
## [61] "2021-09-03 +07" "2021-10-01 +07" "2021-11-05 +07" "2021-12-17 +07"
# Reduce the parsed date-times to plain calendar dates.
marvel$ReleaseDateUS <- as.Date(marvel$ReleaseDateUS,format="%Y-%m-%d")

# Verify that the column is now a Date vector.
str(marvel$ReleaseDateUS)
##  Date[1:64], format: "1986-08-01" "1998-08-21" "2000-07-14" "2002-03-22" "2002-05-03" ...

Missing Data

# Is there any missing value anywhere in the data frame?
anyNA(marvel)
## [1] TRUE
# Count the missing values column by column.
vapply(marvel, function(column) sum(is.na(column)), numeric(1))
##                      Title                Distributor 
##                          0                          0 
##              ReleaseDateUS                     Budget 
##                          0                          0 
## OpeningWeekendNorthAmerica               NorthAmerica 
##                          0                          0 
##           OtherTerritories                  Worldwide 
##                          0                          0 
##          CinemaScoreReview 
##                          5

Remove the rows with a missing CinemaScore rating

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Keep only the movies that have a CinemaScore rating.
marvel.clean <- marvel %>%
  drop_na(CinemaScoreReview)

# Confirm that no missing values remain.
anyNA(marvel.clean)
## [1] FALSE

Reshape the data to long format to make the later processing easier

# Stack the two regional revenue columns into RevenueArea/Revenue pairs,
# giving one row per movie and revenue area.
marvel.long <- marvel.clean %>%
  pivot_longer(
    cols = c(NorthAmerica, OtherTerritories),
    names_to = "RevenueArea",
    values_to = "Revenue"
  )
marvel.long
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Derive the release year (as a factor, appended as the last column) and
# fix the types of the reshaped revenue columns in one mutate() call.
marvel.long <- marvel.long %>%
  mutate(
    Year = as.factor(year(ReleaseDateUS)),
    RevenueArea = as.factor(RevenueArea),
    Revenue = as.numeric(Revenue)
  )
marvel.long

# Per-column overview of the prepared data.
summary(marvel.long)
##                      Title                                  Distributor
##  Ant-Man                :  2   Walt Disney Studios Motion Pictures:40  
##  Ant-Man and the Wasp   :  2   20th Century Fox                   :30  
##  Avengers: Age of Ultron:  2   Sony Pictures                      :26  
##  Avengers: Endgame      :  2   Paramount Pictures                 : 8  
##  Avengers: Infinity War :  2   New Line Cinema                    : 6  
##  Big Hero 6             :  2   Universal Pictures                 : 6  
##  (Other)                :106   (Other)                            : 2  
##  ReleaseDateUS            Budget          OpeningWeekendNorthAmerica
##  Min.   :1986-08-01   Min.   : 35000000   Min.   :  4271451         
##  1st Qu.:2007-09-03   1st Qu.:110000000   1st Qu.: 55614510         
##  Median :2014-04-04   Median :160000000   Median : 85058311         
##  Mean   :2012-06-25   Mean   :155288136   Mean   : 96023192         
##  3rd Qu.:2018-01-20   3rd Qu.:200000000   3rd Qu.:121315618         
##  Max.   :2021-12-17   Max.   :356000000   Max.   :357115007         
##                                                                     
##    Worldwide         CinemaScoreReview           RevenueArea
##  Min.   :1.010e+07   A :42             NorthAmerica    :59  
##  1st Qu.:3.579e+08   A-:28             OtherTerritories:59  
##  Median :6.168e+08   A+:10                                  
##  Mean   :6.721e+08   B :10                                  
##  3rd Qu.:8.459e+08   B-:10                                  
##  Max.   :2.798e+09   B+:16                                  
##                      C+: 2                                  
##     Revenue               Year   
##  Min.   :2.049e+06   2018   :12  
##  1st Qu.:1.662e+08   2014   :10  
##  Median :2.485e+08   2021   :10  
##  Mean   :3.360e+08   2016   : 8  
##  3rd Qu.:4.332e+08   2017   : 8  
##  Max.   :1.938e+09   2019   : 8  
##                      (Other):62

From the summary above, we can draw several conclusions:

  • The largest share of Marvel movies (about 34%) was distributed by Walt Disney Studios
  • The budget to produce one movie ranged from $35 million to $356 million, with an average budget of $155 million per movie
  • During the opening weekend in North America, the lowest revenue was $4 million, while the highest was $357 million
  • Worldwide revenue per movie ranged from about $10 million up to $2.8 billion, with an average of about $672 million per movie
  • The most common rating given by reviewers is A, yet there is 1 movie with a relatively lower rating than the others (C+)

Study Case

1. Which Marvel movies earned the highest and the lowest worldwide revenue?

# Worldwide revenue per title. marvel.long holds two rows per movie (one per
# revenue area) carrying the same Worldwide value, so the mean collapses them
# back to a single figure per title.
# The formula is passed positionally: naming it `formula =` fails on
# R >= 4.2, where the first argument of aggregate()'s formula method is `x`.
case1 <- aggregate(Worldwide ~ Title,
                   data = marvel.long,
                   FUN = mean)

# Horizontal bars ordered by revenue; the dashed red line marks the mean
# worldwide revenue across all movies.
ggplot(data = case1, mapping = aes(x = Worldwide, y = reorder(Title, Worldwide))) +
  geom_col(mapping = aes(fill = Worldwide)) +
  geom_vline(xintercept = mean(marvel.long$Worldwide), linetype = 2, col = "Red") +
  labs(y = "Title", x = "Revenue", title = "Revenue by Movie title") +
  scale_fill_gradient(low = "darkolivegreen1", high = "darkgreen") +
  theme_minimal()

Interpretation:

  • Avengers: Endgame recorded the highest revenue of all the Marvel movies
  • The other 3 Avengers movies also rank near the top (Avengers: Infinity War, Avengers: Age of Ultron, The Avengers)
  • Meanwhile, Punisher: War Zone, Howard the Duck, Elektra, Blade and Ghost Rider are the 5 movies with the lowest revenue
  • Superhero movies seem to be far more preferred by audiences
  • The red line shows the average worldwide revenue per movie

2. Show the production budget and revenue by distributor

# One row per title: keep the first row seen for each title in marvel.long.
# NOTE(review): str(marvel) reported 63 distinct titles across 64 rows, so two
# films share a title; confirm that title-based de-duplication keeps every film.
UnikWW <- marvel.long[!duplicated(marvel.long$Title), ]
UnikWW

# Spread of worldwide revenue per distributor. The x-axis text is hidden
# because the colour legend already names each distributor.
UnikWW %>%
  ggplot(aes(x = Distributor, y = Worldwide, color = Distributor)) +
  geom_boxplot() +
  labs(
    title = "Worldwide Revenue rate by Distributor",
    subtitle = "Marvel movies",
    x = "Distributor",
    y = "Revenue",
    color = "Distributor"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Same layout for the production budget.
UnikWW %>%
  ggplot(aes(x = Distributor, y = Budget, color = Distributor)) +
  geom_boxplot() +
  labs(
    title = "Production Budget rate by Distributor",
    subtitle = "Marvel movies",
    x = "Distributor",
    y = "Production Budget",
    color = "Distributor"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

Interpretation:

  • In terms of revenue, Walt Disney is the Marvel movie distributor with the highest revenue record, followed by Sony Pictures.
  • Meanwhile, the production budgets of Walt Disney and Sony Pictures are similar (both have a similar third quartile and upper range)
  • Universal Pictures' production cost seems much higher, yet its revenue is even lower than that of 20th Century Fox

3. Continuing the analysis above, is there any correlation between Production budget and Revenue earned?

# Scatter plot of worldwide revenue against production budget, one point per
# row of UnikWW, coloured by distributor. Budget is mapped to x and Worldwide
# to y so the plotted data match the axis labels (the mapping used to be
# swapped relative to the labels).
ggplot(data = UnikWW, mapping = aes(x = Budget, y = Worldwide)) +
  geom_jitter(mapping = aes(color = Distributor)) +
  labs(title = "Correlation Test Production budget VS Revenue earned",
       subtitle = "Marvel movies",
       x = "Production budget",
       y = "Revenue earned") +
  theme_gray()

# Correlation coefficient between budget and worldwide revenue.
cor(UnikWW$Budget, UnikWW$Worldwide)
## [1] 0.7478962

Interpretation:

Production budget and revenue earned have a fairly strong positive relationship (correlation coefficient ≈ 0.75). Movies with a higher production budget tend to earn higher revenue.

4.Which movies contribute the most revenue on each area?

Because there are so many titles, we focus only on the movies whose revenue is above average

# Keep the rows of movies whose worldwide revenue is at least the overall
# mean (both revenue-area rows of such a movie are retained).
TitleAboveAvg <- marvel.long %>%
  filter(Worldwide >= mean(Worldwide))
TitleAboveAvg

# Revenue per title, one panel per revenue area.
ggplot(data = TitleAboveAvg, mapping = aes(x = Revenue, y = Title)) +
  geom_col(fill = "navy") +
  facet_grid(cols = vars(RevenueArea), scales = "free_y") +
  labs(x = "Revenue", y = "Title")

Interpretation:

  • Avengers: Endgame set the record for the highest movie revenue in both areas
  • All movies in the Avengers series remain hits in both areas. However, demand in North America is relatively broader than in Other Territories, since Black Panther and Spider-Man are also in the top 5 there in terms of revenue

5. Please show the number of movies released, and revenue by year

# Number of movies released per year, counted from the one-row-per-title
# table UnikWW.
yeartable <- as.data.frame(table("Year" = UnikWW$Year))
yeartable

# Year is a factor, so group = 1 is needed to join the points with one line.
ggplot(data = yeartable, aes(x = Year, y = Freq, group = 1)) +
  geom_line() +
  geom_point() +
  labs(title = "Number of Movies Released by Year",
       subtitle = "Marvel movies 1986-2021",
       caption = "Source: Marvel movies",
       x = "Year",
       y = NULL)

# Mean worldwide revenue per movie for each release year.
# The formula is passed positionally: naming it `formula =` fails on
# R >= 4.2, where the first argument of aggregate()'s formula method is `x`.
YearRev <- aggregate(Worldwide ~ Year,
                     data = UnikWW,
                     FUN = mean)

ggplot(data = YearRev, aes(x = Year, y = Worldwide, group = 1)) +
  geom_line() +
  geom_point() +
  labs(title = "Movies Yearly Revenues",
       subtitle = "Marvel movies 1986-2021",
       caption = "Source: Marvel movies",
       x = "Year",
       y = NULL)

Interpretation:

  • The number of movies released has continued to increase in recent years
  • The pandemic (2020) did not significantly affect the number of movies released, yet it had a quite significant impact on the revenue earned. During the pandemic (2021), the average revenue per movie dropped to its lowest level in the past 6 years.

6. How many movies rate the highest and how many is the lowest? Is there any correlation to budget and or revenue and or distributor?

# Count the movies per CinemaScore rating (columns Var1 = rating, Freq = count).
Rating <- as.data.frame(table(UnikWW$CinemaScoreReview))
Rating

# Lollipop chart: a grey stem from zero to the count, topped by a red dot.
Rating %>%
  ggplot(aes(x = Freq, y = Var1)) +
  geom_segment(
    aes(x = 0, xend = Freq, yend = Var1),
    color = "grey"
  ) +
  geom_point(size = 3, color = "firebrick") +
  labs(title = "Number of Movies by Rating", x = NULL, y = NULL)

# Show the movie(s) holding the lowest rating, C+.
UnikWW %>%
  filter(CinemaScoreReview == "C+")

Interpretation:

  • 67% of the Marvel movies released since 1986 have a very good rating (A+ to A-). The majority of audiences are satisfied with Marvel movies.
  • The Ghost Rider movie (2012) was given the lowest rating among all the Marvel movies released
# Cross-tabulate distributor (Var1) against CinemaScore rating (Var2).
RatingDist <- as.data.frame(table(UnikWW$Distributor, UnikWW$CinemaScoreReview))
RatingDist

# One panel per rating, bars showing how many movies each distributor has
# with that rating. The y axis carries Var1, i.e. the distributor, so it is
# labelled "Distributor" (it was previously mislabelled "Title").
ggplot(RatingDist, aes(Freq, Var1)) +
  geom_col(fill = "firebrick") +
  facet_grid(cols = vars(Var2), scales = "free_y") +
  labs(x = NULL,
       y = "Distributor",
       title = "Number of Movie Rating earned by Distributor",
       subtitle = "Marvel movies 1986-2021")

Interpretation:

  • Walt Disney Studios has so far been successful at producing A-rated movies. The lowest rating received by a Walt Disney Studios movie was B.
  • 20th Century Fox also has a good reputation for producing A-rated movies, yet it has several B-rated movies as well.
  • Meanwhile, Sony Pictures has produced movies across a very wide range of ratings, from A+ to C+
# Budget, worldwide revenue and rating for each row of UnikWW.
Revbud <- UnikWW[, c("Budget", "Worldwide", "CinemaScoreReview")]
Revbud

# Movies rated in the A band (A+, A, A-).
RevbudA <- Revbud[Revbud$CinemaScoreReview %in% c("A+", "A", "A-"), ]

# Budget is mapped to x and Worldwide to y so the plotted data match the
# axis labels (the mapping used to be swapped relative to the labels).
ggplot(data = RevbudA, mapping = aes(x = Budget, y = Worldwide)) +
  geom_jitter(mapping = aes(color = CinemaScoreReview)) +
  labs(title = "Correlation Test Production budget VS Revenue earned for A Rating Movies",
       subtitle = "Marvel movies",
       x = "Production budget",
       y = "Revenue earned") +
  theme_gray()

# Movies rated in the B band (B+, B, B-), plotted the same way.
RevbudB <- Revbud[Revbud$CinemaScoreReview %in% c("B+", "B", "B-"), ]
ggplot(data = RevbudB, mapping = aes(x = Budget, y = Worldwide)) +
  geom_jitter(mapping = aes(color = CinemaScoreReview)) +
  labs(title = "Correlation Test Production budget VS Revenue earned for B Rating Movies",
       subtitle = "Marvel movies",
       x = "Production budget",
       y = "Revenue earned") +
  theme_gray()

# Budget/revenue correlation within each rating band (A first, then B).
cor(RevbudA$Worldwide, RevbudA$Budget)
## [1] 0.7576101
cor(RevbudB$Worldwide, RevbudB$Budget)
## [1] 0.7121279

Interpretation:

  • The correlation between budget and revenue is similar for both rating groups. However, the A group shows a slightly higher correlation (0.76 vs 0.71).