Required packages

library(rvest)
## Loading required package: xml2
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)
library(MVN)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## sROC 0.1-2 loaded
library(ggplot2)

Executive Summary

In this exercise, two sets of data containing information on movies in the Marvel Cinematic Universe were scraped from two different websites and then combined. Both dataset had its own issue that needed to be resolved in order to produce a tidy combined dataset.

The aim of the combined dataset seeks to provide information on the how well the MCU moves are doing financially and critically. The dataset will allow anyone to analyse the profitability and growth of the Marvel movie franchise.

In this report, the steps taken below sought to:

Data

Both datasets used in this report were scraped from the web.

The first dataset contains 23 observations of 10 variables. It was obtained from the Wikipedia page List of Marvel Cinematic Universe Films, https://en.wikipedia.org/wiki/List_of_Marvel_Cinematic_Universe_films

The dataset contain the following variables:

The second dataset contains 23 observations of 3 variables. It was obtained from IMDB, https://www.imdb.com/search/keyword/?keywords=blockbuster%2Cmarvel-cinematic-universe&ref_=kw_ref_key&mode=advanced&page=1&sort=moviemeter,asc.

The dataset contains the following variables:

# import MCU box office data
url <- "https://en.wikipedia.org/wiki/List_of_Marvel_Cinematic_Universe_films"

MCU <- url %>%
  read_html() %>%
  html_nodes(xpath='//*[@id="mw-content-text"]/div/table[8]') %>%
  html_table(fill=TRUE)

MCU_BoxOffice <- MCU[[1]]

# import MCU movie rating 
url2 <- "https://www.imdb.com/search/keyword/?keywords=blockbuster%2Cmarvel-cinematic-universe&ref_=kw_ref_key&mode=advanced&page=1&sort=moviemeter,asc"

MCU_IMDB_Webpage <- read_html(url2)

title_data_html <- html_nodes(MCU_IMDB_Webpage, ".lister-item-header a")
title_data  <- html_text(title_data_html)

imdb_data_html <- html_nodes(MCU_IMDB_Webpage, ".ratings-imdb-rating strong")
imdb_data  <- html_text(imdb_data_html)

metascore_data_html <- html_nodes(MCU_IMDB_Webpage, ".metascore")
metascore_data <- html_text(metascore_data_html)

MCU_Rating <- data.frame(title_data, imdb_data, metascore_data)

Tidy & Manipulate Data I

The MCU_BoxOffice dataset does not conform to the tidy data principle. The Phase variable is stored across multiple columns. We will need to create a new column named Phase and input the variables into that column.

The Ref(s) column was dropped as it does not contain any useful information.

# rename columns
names(MCU_BoxOffice) <- c("Film", "U.S. release date", "North America Box office gross", "World Wide Box office gross", "Combined Box Office gross",  "U.S. All-time ranking", "World Wide All-time ranking", "Budget (Millions)", "Ref(s)") 

MCU_BoxOffice <- MCU_BoxOffice[1:8] # dropping Ref(s) column as it does not contain useful info

MCU_BoxOffice <- MCU_BoxOffice %>% mutate(Phase = 1) # create a new column named Phase
MCU_BoxOffice[11:16,"Phase"] <- 2 # input the correct Phase number
MCU_BoxOffice[18:28,"Phase"] <- 3

# deleted blank rows
MCU_BoxOffice <- MCU_BoxOffice[c(4:9, 11:16, 18:28), ]

# rename movies names in MCU_Rating

MCU_Rating$title_data <- MCU_Rating$title_data %>% as.character()

MCU_Rating[12,1] <- MCU_BoxOffice[6,1]
MCU_Rating[6,1] <- MCU_BoxOffice[23,1]

# join 2 data frames
MCU_Complete <- left_join(MCU_BoxOffice, MCU_Rating, by=c("Film" = "title_data"))

MCU_Complete %>% head()

Understand

Once the two dataframes are merged, it resulted in a new dataframe is is named MCU_Complete

The following actions were performed to convert the columns into the desired data type:

MCU_Complete %>% glimpse()
## Observations: 23
## Variables: 11
## $ Film                             <chr> "Iron Man", "The Incredible Hulk",...
## $ `U.S. release date`              <chr> "May 2, 2008", "June 13, 2008", "M...
## $ `North America Box office gross` <chr> "$318,604,126", "$134,806,913", "$...
## $ `World Wide Box office gross`    <chr> "$266,762,121", "$129,964,083", "$...
## $ `Combined Box Office gross`      <chr> "$585,366,247", "$264,770,996", "$...
## $ `U.S. All-time ranking`          <chr> "74", "452", "79", "256", "272", "...
## $ `World Wide All-time ranking`    <chr> "170", "571", "151", "256", "346",...
## $ `Budget (Millions)`              <chr> "$140 million", "$150 million", "$...
## $ Phase                            <dbl> 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2...
## $ imdb_data                        <fct> 7.9, 6.7, 7.0, 7.0, 6.9, 8.0, 7.2,...
## $ metascore_data                   <fct> 79        , 61        , 57        ...
# clean up data "$" columns
MCU_Complete[,c(3:5)] <- MCU_Complete[,c(3:5)] %>%
  apply(2,function (x) str_replace_all(x,pattern = "[^[:digit:]]", replacement="")) %>%
  as.numeric()

MCU_Complete$`Budget (Millions)` <- MCU_Complete$`Budget (Millions)` %>%
  gsub(pattern = "[^0-9\\.\\–]", replacement ="")

# Format release date column
MCU_Complete$`U.S. release date` <- MCU_Complete$`U.S. release date` %>%
  mdy()

# Format ranking to numerics
MCU_Complete[,c(6,7)] <- MCU_Complete[,c(6,7)] %>%
  lapply(FUN= as.numeric)

# Formate rating to numerics
MCU_Complete[,10] <- as.numeric(levels(MCU_Complete[,10]))[MCU_Complete[,10]]
MCU_Complete[,11] <- as.numeric(levels(MCU_Complete[,11]))[MCU_Complete[,11]]

# Change Budget to numerics
MCU_Complete[19,8] <- (316+400) /2
MCU_Complete[21,8] <- (150+175) /2

MCU_Complete$`Budget (Millions)` <- MCU_Complete$`Budget (Millions)` %>% as.numeric

# Converting Phase to factors
MCU_BoxOffice$Phase <- MCU_BoxOffice$Phase %>%
  factor(levels = c(1, 2, 3), ordered=TRUE)

Tidy & Manipulate Data II

Create a column named North_America_Box_Office_share which shows the proportion of North America Box Office gross within Combined Box Office gross. This will be useful to see how much the franchise has grown beyond the North American territory. If the Combined Box Office numbers have grown, while proprotion of North America Box Office share has decreased, it means that the growth in other territories has outpaced the growth in North America.

# create percentage of North America Office proprotion
MCU_Complete <- MCU_Complete %>% mutate(North_America_Box_Office_share = `North America Box office gross`/`Combined Box Office gross`)

Transform

In order to align all measurement units, Budget (Millions) column is multiplied by 1 million, and renamed to Budget. This will bring to the the same measurement scale as Box Office figures and will make plotting and analysing easier.

The imdb_data column is multiplied by 10, so that the scores are out of 100, in line with Metacritic scores.

MCU_Complete <- MCU_Complete %>% mutate(`Budget (Millions)` = `Budget (Millions)` * 1000000)

colnames(MCU_Complete)[colnames(MCU_Complete) == 'Budget (Millions)'] <- 'Budget'

# Multiply IMDB score by 10 to bring it in line with Metascore
MCU_Complete <- MCU_Complete %>% mutate(imdb_data = imdb_data * 10)

Scan I

# Scan for missing and special values
MCU_Complete %>% anyNA()
## [1] FALSE
MCU_Complete %>% apply(2,is.infinite) %>% sum()
## [1] 0
# Visually checking min and max values as sanity check for impossible values
MCU_Complete %>% apply(2,min)
##                           Film              U.S. release date 
##                      "Ant-Man"                   "2008-05-02" 
## North America Box office gross    World Wide Box office gross 
##                    "134806913"                   " 129964083" 
##      Combined Box Office gross          U.S. All-time ranking 
##                   " 264770996"                          "  2" 
##    World Wide All-time ranking                         Budget 
##                          "  1"                    "109300000" 
##                          Phase                      imdb_data 
##                            "1"                           "67" 
##                 metascore_data North_America_Box_Office_share 
##                           "54"                    "0.3068028"
MCU_Complete %>% apply(2,max)
##                           Film              U.S. release date 
##         "Thor: The Dark World"                   "2019-07-02" 
## North America Box office gross    World Wide Box office gross 
##                    "858373000"                   "1939427564" 
##      Combined Box Office gross          U.S. All-time ranking 
##                   "2797800564"                          "452" 
##    World Wide All-time ranking                         Budget 
##                          "571"                    "365500000" 
##                          Phase                      imdb_data 
##                            "3"                           "85" 
##                 metascore_data North_America_Box_Office_share 
##                           "88"                    "0.5442817"
# Check if all 23 films are within film release date range
release_date_range <- c(interval(ymd("2008-05-02"), ymd("2019-07-02")))

MCU_Complete$`U.S. release date` %within% release_date_range %>% sum()
## [1] 23
# Scan for obvious errors
MCU_Complete %>% filter(Budget < 100000000 | Budget > 378500000) %>% print()
##  [1] Film                           U.S. release date             
##  [3] North America Box office gross World Wide Box office gross   
##  [5] Combined Box Office gross      U.S. All-time ranking         
##  [7] World Wide All-time ranking    Budget                        
##  [9] Phase                          imdb_data                     
## [11] metascore_data                 North_America_Box_Office_share
## <0 rows> (or 0-length row.names)
# Check North America box office + World wide box office = Combined box office
identical((MCU_Complete$`North America Box office gross`+ MCU_Complete$`World Wide Box office gross`), MCU_Complete$`Combined Box Office gross`)
## [1] TRUE
# Check user ratings are between 0-100
MCU_Complete %>% filter(imdb_data < 0 | imdb_data > 100 |
                          metascore_data < 0 | metascore_data > 100) %>% print()
##  [1] Film                           U.S. release date             
##  [3] North America Box office gross World Wide Box office gross   
##  [5] Combined Box Office gross      U.S. All-time ranking         
##  [7] World Wide All-time ranking    Budget                        
##  [9] Phase                          imdb_data                     
## [11] metascore_data                 North_America_Box_Office_share
## <0 rows> (or 0-length row.names)

Scan II

We can see that potenial outliers largely exists in Avengers: End Game, where it has an unusually large budget and box office gross.

The Chi-Square QQ-Plot also gives us the row number of where potential multivariate outliers are.

The potential outliers detected are verified against a second data source to verify that they are not data entry errors. They will not be removed or capped from this dataset as they are legitimate data and provide important information.

# produce descriptive statistics
MCU_Complete %>% summary()
##      Film           U.S. release date    North America Box office gross
##  Length:23          Min.   :2008-05-02   Min.   :134806913             
##  Class :character   1st Qu.:2012-11-02   1st Qu.:224645330             
##  Mode  :character   Median :2015-07-17   Median :333176600             
##                     Mean   :2015-02-18   Mean   :371542272             
##                     3rd Qu.:2017-12-25   3rd Qu.:417921916             
##                     Max.   :2019-07-02   Max.   :858373000             
##  World Wide Box office gross Combined Box Office gross U.S. All-time ranking
##  Min.   :1.300e+08           Min.   :2.648e+08         Min.   :  2.0        
##  1st Qu.:3.726e+08           1st Qu.:6.233e+08         1st Qu.: 26.5        
##  Median :4.739e+08           Median :8.540e+08         Median : 63.0        
##  Mean   :6.105e+08           Mean   :9.820e+08         Mean   :105.7        
##  3rd Qu.:7.433e+08           3rd Qu.:1.184e+09         3rd Qu.:164.5        
##  Max.   :1.939e+09           Max.   :2.798e+09         Max.   :452.0        
##  World Wide All-time ranking     Budget              Phase         imdb_data   
##  Min.   :  1.0               Min.   :109300000   Min.   :1.000   Min.   :67.0  
##  1st Qu.: 20.0               1st Qu.:156350000   1st Qu.:1.500   1st Qu.:70.5  
##  Median : 74.0               Median :177000000   Median :2.000   Median :74.0  
##  Mean   :115.5               Mean   :196839130   Mean   :2.217   Mean   :74.7  
##  3rd Qu.:150.5               3rd Qu.:200000000   3rd Qu.:3.000   3rd Qu.:78.5  
##  Max.   :571.0               Max.   :365500000   Max.   :3.000   Max.   :85.0  
##  metascore_data  North_America_Box_Office_share
##  Min.   :54.00   Min.   :0.3068                
##  1st Qu.:64.00   1st Qu.:0.3441                
##  Median :69.00   Median :0.3689                
##  Mean   :68.65   Mean   :0.3955                
##  3rd Qu.:73.50   3rd Qu.:0.4412                
##  Max.   :88.00   Max.   :0.5443
# produce boxplot to check for univariate outliers

MCU_BoxOffice_P <- MCU_Complete %>% gather(key = BoxOffice, value = USD, c(`North America Box office gross`, `World Wide Box office gross`, `Combined Box Office gross`))

ggplot(data=MCU_BoxOffice_P, aes(x=BoxOffice, y=USD)) +
  geom_boxplot()

ggplot(data=MCU_Complete, aes(y=Budget)) +
  geom_boxplot()

MCU_Rating_P <- MCU_Complete %>% gather(key = RatingSite, value = Rating, c(imdb_data, metascore_data)) 

ggplot(data=MCU_Rating_P, aes(x=RatingSite, y=Rating)) +
  geom_boxplot()

MCU_Ranking_P <- MCU_Complete %>% gather(key = Rank, value = Ranking, c(`World Wide All-time ranking`, `U.S. All-time ranking`)) 

ggplot(data=MCU_Ranking_P, aes(x=Rank, y=Ranking)) +
  geom_boxplot()

# scan for outliers: North America/World Wide Box Office

ggplot(data=MCU_Complete, aes(`North America Box office gross`, `World Wide Box office gross`)) +
  geom_point()

mvn(MCU_Complete[,c(3,4)], multivariateOutlierMethod = "quan", showOutliers = T)

## $multivariateNormality
##              Test        Statistic              p value Result
## 1 Mardia Skewness 29.4113528430744  6.4485567040088e-06     NO
## 2 Mardia Kurtosis 4.01946427542468 5.83306253141291e-05     NO
## 3             MVN             <NA>                 <NA>     NO
## 
## $univariateNormality
##           Test                       Variable Statistic   p value Normality
## 1 Shapiro-Wilk North America Box office gross    0.8952    0.0201    NO    
## 2 Shapiro-Wilk  World Wide Box office gross      0.8330    0.0014    NO    
## 
## $Descriptives
##                                 n      Mean   Std.Dev    Median       Min
## North America Box office gross 23 371542272 188202433 333176600 134806913
## World Wide Box office gross    23 610481879 404547277 473942950 129964083
##                                       Max      25th      75th      Skew
## North America Box office gross  858373000 224645330 417921917 0.9733487
## World Wide Box office gross    1939427564 372567601 743303928 1.6527548
##                                 Kurtosis
## North America Box office gross 0.1172456
## World Wide Box office gross    2.8468752
## 
## $multivariateOutliers
##    Observation Mahalanobis Distance Outlier
## 18          18               58.449    TRUE
## 6            6               12.994    TRUE
## 22          22               12.525    TRUE
## 1            1                9.205    TRUE
# Combined Box Office/Budget
ggplot(data=MCU_Complete, aes(`Combined Box Office gross`, Budget)) +
  geom_point()

ggplot(data=MCU_Complete, aes(`Combined Box Office gross`, Budget)) +
  geom_point() +
  geom_text(data=subset(MCU_Complete, Budget > 200000000 | Budget < 156350000), 
                        aes(`Combined Box Office gross`, Budget, label=Film),
            nudge_y = -5000000, nudge_x = -400000000, alpha = 0.9, size = 1.7) +
  facet_wrap(~Phase) # label Budget outside Q1 and Q3 with film name

mvn(MCU_Complete[,c(5,8)], multivariateOutlierMethod = "quan", showOutliers = T)

## $multivariateNormality
##              Test        Statistic              p value Result
## 1 Mardia Skewness 20.1088132831652 0.000475293760456016     NO
## 2 Mardia Kurtosis 3.07232492041949  0.00212398389902457     NO
## 3             MVN             <NA>                 <NA>     NO
## 
## $univariateNormality
##           Test                  Variable Statistic   p value Normality
## 1 Shapiro-Wilk Combined Box Office gross    0.8621    0.0045    NO    
## 2 Shapiro-Wilk          Budget              0.7673    0.0001    NO    
## 
## $Descriptives
##                            n      Mean   Std.Dev    Median       Min        Max
## Combined Box Office gross 23 982024151 576952326 853977126 264770996 2797800564
## Budget                    23 196839130  70109283 177000000 109300000  365500000
##                                25th       75th     Skew Kurtosis
## Combined Box Office gross 623303735 1184053773 1.448621 2.068870
## Budget                    156350000  200000000 1.456039 1.006464
## 
## $multivariateOutliers
##    Observation Mahalanobis Distance Outlier
## 11          11               36.441    TRUE
## 22          22               29.022    TRUE
## 19          19               26.488    TRUE
# North America/World Wide Box Office/Budget
mvn(MCU_Complete[,c(3,4,8)], multivariateOutlierMethod = "quan", showOutliers = T)

## $multivariateNormality
##              Test        Statistic              p value Result
## 1 Mardia Skewness 41.6246993744131 8.74379707939654e-06     NO
## 2 Mardia Kurtosis 3.56452319392107 0.000364518218795684     NO
## 3             MVN             <NA>                 <NA>     NO
## 
## $univariateNormality
##           Test                       Variable Statistic   p value Normality
## 1 Shapiro-Wilk North America Box office gross    0.8952    0.0201    NO    
## 2 Shapiro-Wilk  World Wide Box office gross      0.8330    0.0014    NO    
## 3 Shapiro-Wilk             Budget                0.7673    0.0001    NO    
## 
## $Descriptives
##                                 n      Mean   Std.Dev    Median       Min
## North America Box office gross 23 371542272 188202433 333176600 134806913
## World Wide Box office gross    23 610481879 404547277 473942950 129964083
## Budget                         23 196839130  70109283 177000000 109300000
##                                       Max      25th      75th      Skew
## North America Box office gross  858373000 224645330 417921917 0.9733487
## World Wide Box office gross    1939427564 372567601 743303928 1.6527548
## Budget                          365500000 156350000 200000000 1.4560389
##                                 Kurtosis
## North America Box office gross 0.1172456
## World Wide Box office gross    2.8468752
## Budget                         1.0064642
## 
## $multivariateOutliers
##    Observation Mahalanobis Distance Outlier
## 22          22               59.366    TRUE
## 11          11               57.314    TRUE
## 19          19               43.178    TRUE
## 18          18               27.298    TRUE
#US ranking/World Wide Ranking
ggplot(data=MCU_Complete, aes(`U.S. All-time ranking`, `World Wide All-time ranking`)) +
  geom_point()

mvn(MCU_Complete[,c(6,7)], multivariateOutlierMethod = "quan", showOutliers = T)

## $multivariateNormality
##              Test        Statistic              p value Result
## 1 Mardia Skewness 18.9558318336133 0.000801804199170797     NO
## 2 Mardia Kurtosis 2.01636784302799   0.0437615255033987     NO
## 3             MVN             <NA>                 <NA>     NO
## 
## $univariateNormality
##           Test                    Variable Statistic   p value Normality
## 1 Shapiro-Wilk    U.S. All-time ranking       0.8187     8e-04    NO    
## 2 Shapiro-Wilk World Wide All-time ranking    0.7796     2e-04    NO    
## 
## $Descriptives
##                              n     Mean  Std.Dev Median Min Max 25th  75th
## U.S. All-time ranking       23 105.6957 115.0227     63   2 452 26.5 164.5
## World Wide All-time ranking 23 115.4783 134.0349     74   1 571 20.0 150.5
##                                 Skew Kurtosis
## U.S. All-time ranking       1.346474 1.199206
## World Wide All-time ranking 1.815021 3.380828
## 
## $multivariateOutliers
##    Observation Mahalanobis Distance Outlier
## 2            2              251.511    TRUE
## 5            5               93.611    TRUE
## 1            1               82.616    TRUE
## 3            3               50.056    TRUE
## 10          10               13.676    TRUE
## 4            4               12.678    TRUE
## 8            8               12.425    TRUE
## 15          15               10.076    TRUE
# IMDB score/Metacritic score
ggplot(data=MCU_Complete, aes(metascore_data, imdb_data)) +
  geom_point()

mvn(MCU_Complete[,c(10,11)], multivariateOutlierMethod = "quan", showOutliers = T)

## $multivariateNormality
##              Test        Statistic            p value Result
## 1 Mardia Skewness 8.11562047197169 0.0874335147346808    YES
## 2 Mardia Kurtosis 1.31823240035508  0.187425862411238    YES
## 3             MVN             <NA>               <NA>    YES
## 
## $univariateNormality
##           Test       Variable Statistic   p value Normality
## 1 Shapiro-Wilk   imdb_data       0.9579    0.4225    YES   
## 2 Shapiro-Wilk metascore_data    0.9843    0.9653    YES   
## 
## $Descriptives
##                 n     Mean  Std.Dev Median Min Max 25th 75th      Skew
## imdb_data      23 74.69565 4.930746     74  67  85 70.5 78.5 0.4011367
## metascore_data 23 68.65217 7.906319     69  54  88 64.0 73.5 0.2464982
##                  Kurtosis
## imdb_data      -0.8602391
## metascore_data -0.1685804
## 
## $multivariateOutliers
##    Observation Mahalanobis Distance Outlier
## 18          18               21.528    TRUE
## 19          19               11.489    TRUE