The website has information about the movies spread in 8 webpages.
library(knitr)
library(XML)
library(kableExtra)# manipulate table styles
suppressMessages(library(stringr))
suppressMessages(library(dplyr))
## Warning: package 'dplyr' was built under R version 3.5.1
suppressMessages(library(RCurl))
suppressMessages(library(tidyverse))
start_url <- "https://www.boxofficemojo.com/alltime/world/?pagenum="
end_url <- "&p=.htm"
# function
pages <- function(n){
urls <- vector('character')
for (i in 1:n){
temp <- str_c(start_url,i,end_url, collapse = "")
urls <- c(urls, temp)
}
return(urls)
}
urls <-pages(8)
urls
## [1] "https://www.boxofficemojo.com/alltime/world/?pagenum=1&p=.htm"
## [2] "https://www.boxofficemojo.com/alltime/world/?pagenum=2&p=.htm"
## [3] "https://www.boxofficemojo.com/alltime/world/?pagenum=3&p=.htm"
## [4] "https://www.boxofficemojo.com/alltime/world/?pagenum=4&p=.htm"
## [5] "https://www.boxofficemojo.com/alltime/world/?pagenum=5&p=.htm"
## [6] "https://www.boxofficemojo.com/alltime/world/?pagenum=6&p=.htm"
## [7] "https://www.boxofficemojo.com/alltime/world/?pagenum=7&p=.htm"
## [8] "https://www.boxofficemojo.com/alltime/world/?pagenum=8&p=.htm"
We creating a dataframe with empty vectors to storage all the information that is going to be read from the website.
movie_df <- data.frame(V1 = numeric(0), V2 = character(0), V3 = character(0), V4 = numeric(0), V5 = numeric(0), V6 = numeric(0), V7 = numeric(0), V8 = numeric(0), V9=numeric(0), stringsAsFactors = FALSE)
sapply(movie_df, class)
## V1 V2 V3 V4 V5 V6
## "numeric" "character" "character" "numeric" "numeric" "numeric"
## V7 V8 V9
## "numeric" "numeric" "numeric"
Now we retrieve the information from each page, parse it and append to the above dataframe.
Tyding the data will also be performed in this step, since the numerical data contains the symbols $, % and ^.
for (i in 1:8){
url <-urls[i]
htmlData <- getURL(url)
parsedData <-htmlParse(htmlData, encoding = "UTF-8")
data <- readHTMLTable(parsedData, skip.rows =1 )
table <- data[[2]]
# Converting the type of columns 1 to character type:
table[,1] <- suppressWarnings(as.numeric(table[,1]))
#Converting the type of columns 2 and 3 to character type:
for(j in seq(2,3)){
table[,j] <- as.character(table[,j])
}
# Removing the symbols $, % and ^ from the numerical columns and changing them to numeric type in R:
for(j in seq(4,9)){
table[,j] <- suppressWarnings(as.numeric(gsub("[\\$%\\^,]","", table[,j])))
}
movie_df <- rbind(movie_df, table, sort = F)
}
sapply(movie_df, class)
## V1 V2 V3 V4 V5 V6
## "numeric" "character" "character" "numeric" "numeric" "numeric"
## V7 V8 V9
## "numeric" "numeric" "numeric"
head(movie_df)
## V1 V2 V3 V4 V5 V6 V7 V8 V9
## 1 1 Avatar Fox 2788.0 760.5 27.3 2027.5 72.7 2009
## 2 13 Titanic Par. 2187.5 659.4 30.1 1528.1 69.9 1997
## 3 24 Star Wars: The Force Awakens BV 2068.2 936.7 45.3 1131.6 54.7 2015
## 4 35 Avengers: Infinity War BV 2046.5 678.8 33.2 1367.7 66.8 2018
## 5 46 Jurassic World Uni. 1671.7 652.3 39.0 1019.4 61.0 2015
## 6 57 Marvel's The Avengers BV 1518.8 623.4 41.0 895.5 59.0 2012
dim(movie_df)
## [1] 759 9
Because some of the gross values are in the thousands (they appear on the webpages with k), R replace it with NAs.
Rename the columns of the movie_df:
colnames(movie_df) <- c( "Rank", "Title" , "Studio", "Worldwide", "Domestic", "Domestic.Perc", "Overseas", "Overseas.Perc", "Year")
head(movie_df)
## Rank Title Studio Worldwide Domestic
## 1 1 Avatar Fox 2788.0 760.5
## 2 13 Titanic Par. 2187.5 659.4
## 3 24 Star Wars: The Force Awakens BV 2068.2 936.7
## 4 35 Avengers: Infinity War BV 2046.5 678.8
## 5 46 Jurassic World Uni. 1671.7 652.3
## 6 57 Marvel's The Avengers BV 1518.8 623.4
## Domestic.Perc Overseas Overseas.Perc Year
## 1 27.3 2027.5 72.7 2009
## 2 30.1 1528.1 69.9 1997
## 3 45.3 1131.6 54.7 2015
## 4 33.2 1367.7 66.8 2018
## 5 39.0 1019.4 61.0 2015
## 6 41.0 895.5 59.0 2012
How to order the dataframe??
#new_df <- movie_df[order(movie_df$Rank),]
Let’s calculate the ratio of domestic to international grosses and sort to see the top 10 studios with the highest domestic grosses are.
movie_df$DIRatio <- round(movie_df$Domestic.Perc/movie_df$Overseas.Perc,2)
head(movie_df)
## Rank Title Studio Worldwide Domestic
## 1 1 Avatar Fox 2788.0 760.5
## 2 13 Titanic Par. 2187.5 659.4
## 3 24 Star Wars: The Force Awakens BV 2068.2 936.7
## 4 35 Avengers: Infinity War BV 2046.5 678.8
## 5 46 Jurassic World Uni. 1671.7 652.3
## 6 57 Marvel's The Avengers BV 1518.8 623.4
## Domestic.Perc Overseas Overseas.Perc Year DIRatio
## 1 27.3 2027.5 72.7 2009 0.38
## 2 30.1 1528.1 69.9 1997 0.43
## 3 45.3 1131.6 54.7 2015 0.83
## 4 33.2 1367.7 66.8 2018 0.50
## 5 39.0 1019.4 61.0 2015 0.64
## 6 41.0 895.5 59.0 2012 0.69
Now we can find the top studios with the highest Ratio of Domestic to International percentages:
#Option 1 - averaging out ratio of domestic percentages
StudioData<-movie_df %>% group_by(Studio) %>%
summarise(mean=round(mean(DIRatio),2))
StudioData[order(StudioData$mean,decreasing=T)[1:10],]
## # A tibble: 10 x 2
## Studio mean
## <chr> <dbl>
## 1 Dis. 2.13
## 2 IFC 1.9
## 3 Col. 1.69
## 4 NM 1.54
## 5 SPC 1.5
## 6 USA 1.49
## 7 NL 1.42
## 8 Art. 1.3
## 9 SonR 1.21
## 10 Lions 1.16
#Option 2 - Averaging out domestic percentages
StudioData2<-movie_df %>% group_by(Studio) %>%
summarise(mean=round(mean(Domestic.Perc),2))
StudioData2[order(StudioData2$mean,decreasing=T)[1:10],]
## # A tibble: 10 x 2
## Studio mean
## <chr> <dbl>
## 1 Dis. 68
## 2 IFC 65.5
## 3 NM 60.6
## 4 SPC 60
## 5 USA 59.8
## 6 Art. 56.5
## 7 Col. 54.9
## 8 SonR 54.5
## 9 Lions 53.6
## 10 NL 52.2
To answer that question, let’s calculate the average % of overseas grosses in the last decade and compare it with the % in earlier years.
#Average International gross percentages between 2008 and 2018
movie_df %>% filter(Year>=2008) %>%
summarise(mean=mean(Overseas.Perc))
## mean
## 1 63.8088
#Average International gross percentages before 2008
movie_df %>% filter(Year<2008) %>%
summarise(mean=mean(Overseas.Perc))
## mean
## 1 53.71486
We can conclude that it seems that International grosses have gone up in the last decade.
To answer that question, let’s calculate the average total gross dollars in the last decade and compare it with the dollars in earlier years.
#Average total gross dollars between 2008 and 2018
movie_df %>% filter(Year>=2008) %>%
summarise(mean=mean(Worldwide))
## mean
## 1 483.257
#Average total gross dollars before 2008
movie_df %>% filter(Year<2008) %>%
summarise(mean=mean(Worldwide))
## mean
## 1 366.4883
We can conclude that movies have been grossing more money in the past decade.
plot(movie_df$Year, movie_df$Worldwide,
xlim=c(1938,2018),
main = "Worldwide Movie Grosses",
xlab = "Year",
ylab = "Worlwide Gross Dollars (Millions)"
)
The graph above illustrated that visually as well. There seems to be more top films in recent years.