Hello! This report presents an analysis of Marvel movies released from 1986 to 2021.
# Load the cleaned Marvel movies data set (a CSV file expected in the working
# directory) into a data frame.
marvel <- read.csv("marvel_clean.csv")

# Inspect and check the data structure.
str(marvel)
## 'data.frame': 64 obs. of 9 variables:
## $ Title : chr "Howard the Duck" "Blade" "X-Men" "Blade II" ...
## $ Distributor : chr "Universal Pictures" "New Line Cinema" "20th Century Fox" "New Line Cinema" ...
## $ ReleaseDateUS : chr "8/1/1986 0:00" "8/21/1998 0:00" "7/14/2000 0:00" "3/22/2002 0:00" ...
## $ Budget : int 37000000 45000000 75000000 54000000 139000000 78000000 110000000 137000000 33000000 200000000 ...
## $ OpeningWeekendNorthAmerica: int 5070136 17073856 54471475 32528016 114844116 40310419 85558731 62128420 13834527 88156227 ...
## $ NorthAmerica : int 16295774 70087718 157299717 82348319 403706375 102543518 214949694 132177234 33810189 373585825 ...
## $ OtherTerritories : int 21667000 61095812 139039810 72661713 418002176 76636200 192761855 113183246 20889916 415390628 ...
## $ Worldwide : num 3.80e+07 1.31e+08 2.96e+08 1.55e+08 8.22e+08 ...
## $ CinemaScoreReview : chr "B-" "A-" "A-" "B+" ...
Based on the structure above, we need to convert several variables to factors and the release date to a date.
# Recode the character columns as factors. ReleaseDateUS is included in this
# step as well, although it is overwritten with a parsed date further down.
marvel[["Title"]] <- as.factor(marvel[["Title"]])
marvel[["Distributor"]] <- as.factor(marvel[["Distributor"]])
marvel[["CinemaScoreReview"]] <- as.factor(marvel[["CinemaScoreReview"]])
marvel[["ReleaseDateUS"]] <- as.factor(marvel[["ReleaseDateUS"]])

# Re-check the structure after the factor conversion.
str(object = marvel)
## 'data.frame': 64 obs. of 9 variables:
## $ Title : Factor w/ 63 levels "Ant-Man","Ant-Man and the Wasp",..: 29 9 57 10 38 16 63 30 50 39 ...
## $ Distributor : Factor w/ 9 levels "20th Century Fox",..: 8 5 1 5 7 1 1 8 4 7 ...
## $ ReleaseDateUS : Factor w/ 64 levels "1/14/2005 0:00",..: 58 60 48 18 34 14 28 44 21 46 ...
## $ Budget : int 37000000 45000000 75000000 54000000 139000000 78000000 110000000 137000000 33000000 200000000 ...
## $ OpeningWeekendNorthAmerica: int 5070136 17073856 54471475 32528016 114844116 40310419 85558731 62128420 13834527 88156227 ...
## $ NorthAmerica : int 16295774 70087718 157299717 82348319 403706375 102543518 214949694 132177234 33810189 373585825 ...
## $ OtherTerritories : int 21667000 61095812 139039810 72661713 418002176 76636200 192761855 113183246 20889916 415390628 ...
## $ Worldwide : num 3.80e+07 1.31e+08 2.96e+08 1.55e+08 8.22e+08 ...
## $ CinemaScoreReview : Factor w/ 7 levels "A","A-","A+",..: 5 2 2 6 2 4 1 5 NA 2 ...

# Parse the month/day/year part of the release date text into date-times.
marvel$ReleaseDateUS <- with(
  marvel,
  strptime(ReleaseDateUS, format = "%m/%d/%Y")
)
print(marvel$ReleaseDateUS)
## [1] "1986-08-01 +07" "1998-08-21 +07" "2000-07-14 +07" "2002-03-22 +07"
## [5] "2002-05-03 +07" "2003-02-14 +07" "2003-05-02 +07" "2003-06-20 +07"
## [9] "2004-04-16 +07" "2004-06-30 +07" "2004-12-08 +07" "2005-01-14 +07"
## [13] "2005-07-08 +07" "2006-05-26 +07" "2007-02-16 +07" "2007-05-04 +07"
## [17] "2007-06-15 +07" "2008-05-02 +07" "2008-06-13 +07" "2008-12-05 +07"
## [21] "2009-05-01 +07" "2010-05-07 +07" "2011-05-06 +07" "2011-06-03 +07"
## [25] "2011-07-22 +07" "2012-02-17 +07" "2012-05-04 +07" "2012-07-03 +07"
## [29] "2013-05-03 +07" "2013-07-26 +07" "2013-11-08 +07" "2014-04-04 +07"
## [33] "2014-05-02 +07" "2014-05-23 +07" "2014-08-01 +07" "2014-11-07 +07"
## [37] "2015-05-01 +07" "2015-07-17 +07" "2015-08-07 +07" "2016-02-12 +07"
## [41] "2016-05-06 +07" "2016-05-27 +07" "2016-11-04 +07" "2017-03-03 +07"
## [45] "2017-05-05 +07" "2017-07-07 +07" "2017-09-01 +07" "2017-11-03 +07"
## [49] "2018-02-16 +07" "2018-04-27 +07" "2018-05-18 +07" "2018-07-06 +07"
## [53] "2018-10-05 +07" "2018-12-14 +07" "2019-03-08 +07" "2019-04-26 +07"
## [57] "2019-06-07 +07" "2019-07-02 +07" "2020-08-28 +07" "2021-07-09 +07"
## [61] "2021-09-03 +07" "2021-10-01 +07" "2021-11-05 +07" "2021-12-17 +07"

# Reduce the date-times to plain calendar dates and confirm the column class.
marvel$ReleaseDateUS <- with(
  marvel,
  as.Date(ReleaseDateUS, format = "%Y-%m-%d")
)
str(object = marvel$ReleaseDateUS)
## Date[1:64], format: "1986-08-01" "1998-08-21" "2000-07-14" "2002-03-22" "2002-05-03" ...
# Is there a missing value anywhere in the data frame?
any(is.na(marvel))
## [1] TRUE

# Tally the missing values column by column.
colSums(x = is.na(marvel))
## Title Distributor
## 0 0
## ReleaseDateUS Budget
## 0 0
## OpeningWeekendNorthAmerica NorthAmerica
## 0 0
## OtherTerritories Worldwide
## 0 0
## CinemaScoreReview
## 5
Delete the rows that have no CinemaScore review (NA).
# Attach the tidyverse packages (tidyr, which is listed below, supplies
# drop_na()).
library("tidyverse")
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()

# Keep only the rows whose CinemaScoreReview is present.
marvel.clean <- drop_na(marvel, CinemaScoreReview)

# Confirm that no missing values remain.
any(is.na(marvel.clean))
## [1] FALSE
Convert the data to long format to make it easier to process.
# Reshape to long format: the NorthAmerica and OtherTerritories revenue columns
# are stacked into a RevenueArea label column and a Revenue value column, so
# every movie contributes one row per revenue area.
marvel.long <- pivot_longer(
  data = marvel.clean,
  cols = c(NorthAmerica, OtherTerritories),
  names_to = "RevenueArea",
  values_to = "Revenue"
)
marvel.long

# lubridate supplies year(), used below to extract the release year.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union

# Add the release year, then fix the column types: RevenueArea and Year become
# factors, Revenue becomes a double.
marvel.long$Year <- year(marvel.long$ReleaseDateUS)
marvel.long$RevenueArea <- as.factor(marvel.long$RevenueArea)
marvel.long$Revenue <- as.numeric(marvel.long$Revenue)
marvel.long$Year <- as.factor(marvel.long$Year)
marvel.long

# Per-column summary of the long table. Counts are doubled relative to the
# number of movies because each movie appears once per revenue area.
summary(marvel.long)
## Title Distributor
## Ant-Man : 2 Walt Disney Studios Motion Pictures:40
## Ant-Man and the Wasp : 2 20th Century Fox :30
## Avengers: Age of Ultron: 2 Sony Pictures :26
## Avengers: Endgame : 2 Paramount Pictures : 8
## Avengers: Infinity War : 2 New Line Cinema : 6
## Big Hero 6 : 2 Universal Pictures : 6
## (Other) :106 (Other) : 2
## ReleaseDateUS Budget OpeningWeekendNorthAmerica
## Min. :1986-08-01 Min. : 35000000 Min. : 4271451
## 1st Qu.:2007-09-03 1st Qu.:110000000 1st Qu.: 55614510
## Median :2014-04-04 Median :160000000 Median : 85058311
## Mean :2012-06-25 Mean :155288136 Mean : 96023192
## 3rd Qu.:2018-01-20 3rd Qu.:200000000 3rd Qu.:121315618
## Max. :2021-12-17 Max. :356000000 Max. :357115007
##
## Worldwide CinemaScoreReview RevenueArea
## Min. :1.010e+07 A :42 NorthAmerica :59
## 1st Qu.:3.579e+08 A-:28 OtherTerritories:59
## Median :6.168e+08 A+:10
## Mean :6.721e+08 B :10
## 3rd Qu.:8.459e+08 B-:10
## Max. :2.798e+09 B+:16
## C+: 2
## Revenue Year
## Min. :2.049e+06 2018 :12
## 1st Qu.:1.662e+08 2014 :10
## Median :2.485e+08 2021 :10
## Mean :3.360e+08 2016 : 8
## 3rd Qu.:4.332e+08 2017 : 8
## Max. :1.938e+09 2019 : 8
## (Other):62
From the summary above, we can draw several conclusions:
# Mean worldwide revenue per title. The formula is passed positionally because
# the name of aggregate()'s first argument differs between R versions
# (`formula` before R 4.2, `x` from R 4.2 on), so naming it is not portable.
case1 <- aggregate(Worldwide ~ Title, data = marvel.long, FUN = mean)

# Horizontal bar chart of revenue per title, ordered by revenue. The dashed red
# line marks the mean worldwide revenue across the rows of marvel.long.
ggplot(
  data = case1,
  mapping = aes(x = Worldwide, y = reorder(Title, Worldwide))
) +
  geom_col(mapping = aes(fill = Worldwide)) +
  geom_vline(
    xintercept = mean(marvel.long$Worldwide),
    linetype = 2,
    col = "Red"
  ) +
  labs(y = "Title", x = "Revenue", title = "Revenue by Movie title") +
  scale_fill_gradient(low = "darkolivegreen1", high = "darkgreen") +
  theme_minimal()

# Interpretation:
# One row per title: match() gives the first row at which each distinct Title
# occurs in the long table, so the per-area duplicates are dropped.
# NOTE(review): this keys on Title alone. The str() output of `marvel` reports
# 63 Title levels for 64 rows, so two rows share a title; confirm that no
# distinct film is discarded here.
UnikWW <- marvel.long[match(unique(marvel.long$Title), marvel.long$Title), ]
UnikWW

# Box plot of worldwide revenue per distributor. The x-axis text and ticks are
# blanked; the colour legend identifies the distributors instead.
ggplot(data = UnikWW, mapping = aes(x = Distributor, y = Worldwide)) +
  geom_boxplot(mapping = aes(color = Distributor)) +
  labs(
    title = "Worldwide Revenue rate by Distributor",
    subtitle = "Marvel movies",
    x = "Distributor",
    y = "Revenue",
    color = "Distributor"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Box plot of production budget per distributor, laid out the same way.
ggplot(data = UnikWW, mapping = aes(x = Distributor, y = Budget)) +
  geom_boxplot(mapping = aes(color = Distributor)) +
  labs(
    title = "Production Budget rate by Distributor",
    subtitle = "Marvel movies",
    x = "Distributor",
    y = "Production Budget",
    color = "Distributor"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Interpretation:
# Scatter plot of production budget (x) against worldwide revenue (y), coloured
# by distributor. The aesthetics are mapped so that they agree with the axis
# labels: Budget is the "Production budget" axis and Worldwide is the
# "Revenue earned" axis.
ggplot(data = UnikWW, mapping = aes(x = Budget, y = Worldwide)) +
  geom_jitter(mapping = aes(color = Distributor)) +
  labs(
    title = "Correlation Test Production budget VS Revenue earned",
    subtitle = "Marvel movies",
    x = "Production budget",
    y = "Revenue earned"
  ) +
  theme_gray()

# Correlation coefficient between budget and worldwide revenue (cor() uses
# Pearson's method by default).
cor(UnikWW$Budget, UnikWW$Worldwide)
## [1] 0.7478962
Interpretation:
Production budget and revenue earned have a fairly strong positive relationship (correlation coefficient = 0.75). Movies with a higher production budget tend to earn higher revenue.
Because there are so many titles, we take a deeper look at only the movies whose revenue is above average.
# Keep the rows whose worldwide revenue is at or above the mean worldwide
# revenue of the long table.
TitleAboveAvg <- marvel.long[
  marvel.long$Worldwide >= mean(marvel.long$Worldwide),
]
TitleAboveAvg

# Revenue per title for the above-average movies, one panel per revenue area.
ggplot(TitleAboveAvg, aes(Revenue, Title)) +
  geom_col(fill = "navy") +
  facet_grid(cols = vars(RevenueArea), scales = "free_y") +
  labs(x = "Revenue", y = "Title")

# Interpretation:
# Number of rows of UnikWW per release year, as a data frame with the columns
# Year and Freq.
yeartable <- as.data.frame(table("Year" = UnikWW$Year))
yeartable

# Line chart of the yearly counts. Year is a factor, so `group = 1` is needed
# to join the points into a single line.
ggplot(data = yeartable, aes(x = Year, y = Freq, group = 1)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Number of Movies Released by Year",
    subtitle = "Marvel movies 1986-2021",
    caption = "Source: Marvel movies",
    x = "Year",
    y = NULL,
    fill = NULL
  )

# Mean worldwide revenue per release year. The formula is passed positionally
# because the name of aggregate()'s first argument differs between R versions
# (`formula` before R 4.2, `x` from R 4.2 on).
YearRev <- aggregate(Worldwide ~ Year, data = UnikWW, FUN = mean)

# Line chart of the yearly mean revenue.
ggplot(data = YearRev, aes(x = Year, y = Worldwide, group = 1)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Movies Yearly Revenues",
    subtitle = "Marvel movies 1986-2021",
    caption = "Source: Marvel movies",
    x = "Year",
    y = NULL,
    fill = NULL
  )

# Interpretation:
# Number of rows of UnikWW per CinemaScore grade; as.data.frame() on the table
# yields the columns Var1 (grade) and Freq (count).
Rating <- as.data.frame(table(UnikWW$CinemaScoreReview))
Rating

# Lollipop chart: a grey segment from 0 to the count for each grade, capped
# with a point.
ggplot(Rating, aes(x = Freq, y = Var1)) +
  geom_segment(aes(x = 0, xend = Freq, yend = Var1), color = "grey") +
  geom_point(color = "firebrick", size = 3) +
  labs(y = NULL, x = NULL, title = "Number of Movies by Rating")

# List the rows graded "C+".
UnikWW[UnikWW$CinemaScoreReview == "C+", ]

# Interpretation:
# Cross-tabulate distributor against CinemaScore grade; as.data.frame() on the
# two-way table yields Var1 (distributor), Var2 (grade) and Freq (count).
RatingDist <- as.data.frame(
  table(UnikWW$Distributor, UnikWW$CinemaScoreReview)
)
RatingDist

# Count per distributor, one panel per grade. The y axis shows Var1, which is
# the distributor, so it is labelled "Distributor".
ggplot(RatingDist, aes(Freq, Var1)) +
  geom_col(fill = "firebrick") +
  facet_grid(cols = vars(Var2), scales = "free_y") +
  labs(
    x = NULL,
    y = "Distributor",
    title = "Number of Movie Rating earned by Distributor",
    subtitle = "Marvel movies 1986-2021"
  )

# Interpretation:
# Budget, worldwide revenue and CinemaScore grade for each row of UnikWW.
Revbud <- UnikWW[, c("Budget", "Worldwide", "CinemaScoreReview")]
Revbud

# Movies graded A+, A or A-.
RevbudA <- Revbud[Revbud$CinemaScoreReview %in% c("A+", "A", "A-"), ]

# Budget (x) against worldwide revenue (y) for the A-graded movies. The
# aesthetics are mapped so that they agree with the axis labels.
ggplot(data = RevbudA, mapping = aes(x = Budget, y = Worldwide)) +
  geom_jitter(mapping = aes(color = CinemaScoreReview)) +
  labs(
    title = "Correlation Test Production budget VS Revenue earned for A Rating Movies",
    subtitle = "Marvel movies",
    x = "Production budget",
    y = "Revenue earned"
  ) +
  theme_gray()

# Movies graded B+, B or B-.
RevbudB <- Revbud[Revbud$CinemaScoreReview %in% c("B+", "B", "B-"), ]

# Same plot for the B-graded movies.
ggplot(data = RevbudB, mapping = aes(x = Budget, y = Worldwide)) +
  geom_jitter(mapping = aes(color = CinemaScoreReview)) +
  labs(
    title = "Correlation Test Production budget VS Revenue earned for B Rating Movies",
    subtitle = "Marvel movies",
    x = "Production budget",
    y = "Revenue earned"
  ) +
  theme_gray()

# Budget/revenue correlation within each grade group.
cor(RevbudA$Worldwide, RevbudA$Budget)
## [1] 0.7576101
cor(RevbudB$Worldwide, RevbudB$Budget)
## [1] 0.7121279
Interpretation: