Dataset

The website has information about the movies spread in 8 webpages.

Load packages

library(knitr)
library(XML)
library(kableExtra)# manipulate table styles
suppressMessages(library(stringr))
suppressMessages(library(dplyr))

## Warning: package 'dplyr' was built under R version 3.5.1

suppressMessages(library(RCurl))
suppressMessages(library(tidyverse))

Function that creates a vector with all links to be accessed to retrieve data

start_url <- "https://www.boxofficemojo.com/alltime/world/?pagenum="
end_url <- "&p=.htm"
# function
pages <- function(n){
        urls <- vector('character')
        for (i in 1:n){
                temp <- str_c(start_url,i,end_url, collapse = "")
                urls <- c(urls, temp)
        }
        return(urls)
}

urls <-pages(8)
urls

## [1] "https://www.boxofficemojo.com/alltime/world/?pagenum=1&p=.htm"
## [2] "https://www.boxofficemojo.com/alltime/world/?pagenum=2&p=.htm"
## [3] "https://www.boxofficemojo.com/alltime/world/?pagenum=3&p=.htm"
## [4] "https://www.boxofficemojo.com/alltime/world/?pagenum=4&p=.htm"
## [5] "https://www.boxofficemojo.com/alltime/world/?pagenum=5&p=.htm"
## [6] "https://www.boxofficemojo.com/alltime/world/?pagenum=6&p=.htm"
## [7] "https://www.boxofficemojo.com/alltime/world/?pagenum=7&p=.htm"
## [8] "https://www.boxofficemojo.com/alltime/world/?pagenum=8&p=.htm"

Parsing data from web site

We creating a dataframe with empty vectors to storage all the information that is going to be read from the website.

movie_df <- data.frame(V1 = numeric(0), V2 = character(0), V3 = character(0), V4 = numeric(0), V5 = numeric(0), V6 = numeric(0), V7 = numeric(0), V8 = numeric(0), V9=numeric(0), stringsAsFactors = FALSE)

sapply(movie_df, class)

##          V1          V2          V3          V4          V5          V6 
##   "numeric" "character" "character"   "numeric"   "numeric"   "numeric" 
##          V7          V8          V9 
##   "numeric"   "numeric"   "numeric"

Now we retrieve the information from each page, parse it and append to the above dataframe.

Tyding the data will also be performed in this step, since the numerical data contains the symbols $, % and ^.

for (i in 1:8){
        url <-urls[i]
        htmlData <- getURL(url)
        parsedData <-htmlParse(htmlData, encoding = "UTF-8")
        data <- readHTMLTable(parsedData, skip.rows =1 )
        table <- data[[2]]
        # Converting the type of columns 1 to character type:
        table[,1] <- suppressWarnings(as.numeric(table[,1])) 
        #Converting the type of columns 2 and 3 to character type:
        for(j in seq(2,3)){
                table[,j] <- as.character(table[,j])
        }
        # Removing the symbols $, % and ^ from the numerical columns and changing them to numeric type in R:
        for(j in seq(4,9)){
                table[,j] <- suppressWarnings(as.numeric(gsub("[\\$%\\^,]","", table[,j])))
        }
        movie_df <- rbind(movie_df, table, sort = F)

}
sapply(movie_df, class)

##          V1          V2          V3          V4          V5          V6 
##   "numeric" "character" "character"   "numeric"   "numeric"   "numeric" 
##          V7          V8          V9 
##   "numeric"   "numeric"   "numeric"

head(movie_df)

##   V1                           V2   V3     V4    V5   V6     V7   V8   V9
## 1  1                       Avatar  Fox 2788.0 760.5 27.3 2027.5 72.7 2009
## 2 13                      Titanic Par. 2187.5 659.4 30.1 1528.1 69.9 1997
## 3 24 Star Wars: The Force Awakens   BV 2068.2 936.7 45.3 1131.6 54.7 2015
## 4 35       Avengers: Infinity War   BV 2046.5 678.8 33.2 1367.7 66.8 2018
## 5 46               Jurassic World Uni. 1671.7 652.3 39.0 1019.4 61.0 2015
## 6 57        Marvel's The Avengers   BV 1518.8 623.4 41.0  895.5 59.0 2012

dim(movie_df)

## [1] 759   9

Because some of the gross values are in the thousands (they appear on the webpages with k), R replace it with NAs.

Rename the columns of the movie_df:

colnames(movie_df) <- c( "Rank", "Title" , "Studio", "Worldwide", "Domestic", "Domestic.Perc", "Overseas", "Overseas.Perc", "Year")

head(movie_df)

##   Rank                        Title Studio Worldwide Domestic
## 1    1                       Avatar    Fox    2788.0    760.5
## 2   13                      Titanic   Par.    2187.5    659.4
## 3   24 Star Wars: The Force Awakens     BV    2068.2    936.7
## 4   35       Avengers: Infinity War     BV    2046.5    678.8
## 5   46               Jurassic World   Uni.    1671.7    652.3
## 6   57        Marvel's The Avengers     BV    1518.8    623.4
##   Domestic.Perc Overseas Overseas.Perc Year
## 1          27.3   2027.5          72.7 2009
## 2          30.1   1528.1          69.9 1997
## 3          45.3   1131.6          54.7 2015
## 4          33.2   1367.7          66.8 2018
## 5          39.0   1019.4          61.0 2015
## 6          41.0    895.5          59.0 2012

How to order the dataframe??

#new_df <- movie_df[order(movie_df$Rank),]

The questions we need to find answers to in this dataset are:

1. Do certain movie studios have a higher percentage of domestic grosses?

Let’s calculate the ratio of domestic to international grosses and sort to see the top 10 studios with the highest domestic grosses are.

movie_df$DIRatio <- round(movie_df$Domestic.Perc/movie_df$Overseas.Perc,2)
head(movie_df)

##   Rank                        Title Studio Worldwide Domestic
## 1    1                       Avatar    Fox    2788.0    760.5
## 2   13                      Titanic   Par.    2187.5    659.4
## 3   24 Star Wars: The Force Awakens     BV    2068.2    936.7
## 4   35       Avengers: Infinity War     BV    2046.5    678.8
## 5   46               Jurassic World   Uni.    1671.7    652.3
## 6   57        Marvel's The Avengers     BV    1518.8    623.4
##   Domestic.Perc Overseas Overseas.Perc Year DIRatio
## 1          27.3   2027.5          72.7 2009    0.38
## 2          30.1   1528.1          69.9 1997    0.43
## 3          45.3   1131.6          54.7 2015    0.83
## 4          33.2   1367.7          66.8 2018    0.50
## 5          39.0   1019.4          61.0 2015    0.64
## 6          41.0    895.5          59.0 2012    0.69

Now we can find the top studios with the highest Ratio of Domestic to International percentages:

#Option 1 - averaging out ratio of domestic percentages
StudioData<-movie_df %>% group_by(Studio) %>%
  summarise(mean=round(mean(DIRatio),2))
StudioData[order(StudioData$mean,decreasing=T)[1:10],]

## # A tibble: 10 x 2
##    Studio  mean
##    <chr>  <dbl>
##  1 Dis.    2.13
##  2 IFC     1.9 
##  3 Col.    1.69
##  4 NM      1.54
##  5 SPC     1.5 
##  6 USA     1.49
##  7 NL      1.42
##  8 Art.    1.3 
##  9 SonR    1.21
## 10 Lions   1.16

#Option 2 - Averaging out domestic percentages 
StudioData2<-movie_df %>% group_by(Studio) %>%
  summarise(mean=round(mean(Domestic.Perc),2))
StudioData2[order(StudioData2$mean,decreasing=T)[1:10],]

## # A tibble: 10 x 2
##    Studio  mean
##    <chr>  <dbl>
##  1 Dis.    68  
##  2 IFC     65.5
##  3 NM      60.6
##  4 SPC     60  
##  5 USA     59.8
##  6 Art.    56.5
##  7 Col.    54.9
##  8 SonR    54.5
##  9 Lions   53.6
## 10 NL      52.2

2. Has the percentage of international grosses gone up recently (I’m defining “recent” this as the last decade or so)?

To answer that question, let’s calculate the average % of overseas grosses in the last decade and compare it with the % in earlier years.

#Average International gross percentages between 2008 and 2018
movie_df %>% filter(Year>=2008) %>%
  summarise(mean=mean(Overseas.Perc))

##      mean
## 1 63.8088

#Average International gross percentages before 2008
movie_df %>% filter(Year<2008) %>%
  summarise(mean=mean(Overseas.Perc))

##       mean
## 1 53.71486

We can conclude that it seems that International grosses have gone up in the last decade.

3. Have movies been getting more popular, grossing more money lately?

To answer that question, let’s calculate the average total gross dollars in the last decade and compare it with the dollars in earlier years.

#Average total gross dollars between 2008 and 2018
movie_df %>% filter(Year>=2008) %>%
  summarise(mean=mean(Worldwide))

##      mean
## 1 483.257

#Average total gross dollars before 2008
movie_df %>% filter(Year<2008) %>%
  summarise(mean=mean(Worldwide))

##       mean
## 1 366.4883

We can conclude that movies have been grossing more money in the past decade.

plot(movie_df$Year, movie_df$Worldwide, 
   xlim=c(1938,2018),
   main = "Worldwide Movie Grosses",
   xlab = "Year",
   ylab = "Worlwide Gross Dollars (Millions)"
)

The graph above illustrated that visually as well. There seems to be more top films in recent years.

Project 2 - Dataset 3

E. Azrilyan and B. Sosnovski

10/04/2018