library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
The data shows films released in the UK and how much each film grossed at each theatre. We want to be able to predict the top theatres for a specific genre or comp (comparable) title. In the movie industry, much of the analysis is done against comparable titles, but first we need to clean the data to make sure we have everything we need.
master_file <- read_excel("C:/Users/munkhnaran.gankhuyag/OneDrive - insidemedia.net/Python/Rentrak Project/master_file_0509.xlsx",
col_types = c("numeric", "text", "numeric",
"text", "text", "numeric", "text",
"text", "text", "text", "date", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric","numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric"))
summary(master_file)
## Year Film TheatreNumber TheatreName
## Min. :2015 Length:57068 Min. : 7550 Length:57068
## 1st Qu.:2016 Class :character 1st Qu.: 8467 Class :character
## Median :2017 Mode :character Median : 14767 Mode :character
## Mean :2017 Mean : 229319
## 3rd Qu.:2018 3rd Qu.: 50273
## Max. :2019 Max. :1092965
##
## City City Population Circuit Region
## Length:57068 Min. : 1409 Length:57068 Length:57068
## Class :character 1st Qu.: 48516 Class :character Class :character
## Mode :character Median : 117763 Mode :character Mode :character
## Mean : 703944
## 3rd Qu.: 318014
## Max. :7556900
## NA's :1919
## Branch State DateofFirstEngagement
## Length:57068 Length:57068 Min. :2015-01-01 00:00:00
## Class :character Class :character 1st Qu.:2016-06-18 00:00:00
## Mode :character Mode :character Median :2017-09-08 00:00:00
## Mean :2017-08-03 12:47:23
## 3rd Qu.:2018-11-14 00:00:00
## Max. :2020-10-19 00:00:00
##
## Weeks Weeks\r\nInRange MarketShare TotalGross
## Min. : 1.000 Min. : 1.000 Min. :2.000e-08 Min. : 1
## 1st Qu.: 6.000 1st Qu.: 5.000 1st Qu.:9.240e-04 1st Qu.: 18990
## Median : 9.000 Median : 8.000 Median :1.890e-03 Median : 40989
## Mean : 9.411 Mean : 8.882 Mean :2.278e-03 Mean : 59970
## 3rd Qu.:12.000 3rd Qu.:12.000 3rd Qu.:3.137e-03 3rd Qu.: 77285
## Max. :33.000 Max. :33.000 Max. :3.651e-02 Max. :1885916
##
## OpeningWeekendGross Opening7DayGross Week0Gross Week1WeekendGross
## Min. : 0 Min. : 0 Min. : 0.000 Min. : 0
## 1st Qu.: 3667 1st Qu.: 6097 1st Qu.: 0.000 1st Qu.: 2650
## Median : 8615 Median : 14378 Median : 0.000 Median : 8341
## Mean : 14324 Mean : 23707 Mean : 8.365 Mean : 13893
## 3rd Qu.: 17730 3rd Qu.: 29256 3rd Qu.: 0.000 3rd Qu.: 17715
## Max. :301698 Max. :609211 Max. :13828.830 Max. :301698
##
## Week1Gross Week2WeekendGross Week2Gross Week3WeekendGross
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 4536 1st Qu.: 2671 1st Qu.: 4491 1st Qu.: 1629
## Median : 13494 Median : 5891 Median : 9729 Median : 3849
## Mean : 21723 Mean : 8738 Mean : 14333 Mean : 5596
## 3rd Qu.: 28083 3rd Qu.: 11270 3rd Qu.: 18360 3rd Qu.: 7423
## Max. :492339 Max. :158272 Max. :314517 Max. :153820
##
## Week3Gross Week4WeekendGross Week4Gross Week5WeekendGross
## Min. : 0 Min. : 0.0 Min. : 0 Min. : 0
## 1st Qu.: 2654 1st Qu.: 808.6 1st Qu.: 1285 1st Qu.: 239
## Median : 6127 Median : 2308.8 Median : 3582 Median : 1356
## Mean : 8890 Mean : 3408.2 Mean : 5466 Mean : 2171
## 3rd Qu.: 11739 3rd Qu.: 4645.8 3rd Qu.: 7242 3rd Qu.: 3008
## Max. :298679 Max. :126346.5 Max. :235305 Max. :101123
##
## Week5Gross Week6WeekendGross Week6Gross Week7WeekendGross
## Min. : 0.0 Min. : 0.0 Min. : 0 Min. : 0.0
## 1st Qu.: 395.2 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0.0
## Median : 2133.1 Median : 676.1 Median : 1024 Median : 223.1
## Mean : 3520.2 Mean : 1329.3 Mean : 2145 Mean : 869.0
## 3rd Qu.: 4882.1 3rd Qu.: 1896.0 3rd Qu.: 2923 3rd Qu.: 1195.9
## Max. :177792.0 Max. :72739.7 Max. :133714 Max. :64386.0
##
## Week7Gross Week8WeekendGross Week8Gross Week9WeekendGross
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 335.1 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 1355.4 Mean : 517.8 Mean : 801.7 Mean : 307.4
## 3rd Qu.: 1745.8 3rd Qu.: 610.5 3rd Qu.: 880.3 3rd Qu.: 285.4
## Max. :112019.8 Max. :60438.4 Max. :83665.4 Max. :32806.6
##
## Week9Gross Week10WeekendGross Week10Gross Adventure
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. :0.0000
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.:0.0000
## Median : 0.0 Median : 0.0 Median : 0.0 Median :1.0000
## Mean : 489.9 Mean : 208.5 Mean : 316.2 Mean :0.6703
## 3rd Qu.: 466.6 3rd Qu.: 115.2 3rd Qu.: 181.4 3rd Qu.:1.0000
## Max. :62609.3 Max. :21501.2 Max. :32906.2 Max. :1.0000
##
## Action Musical Drama History
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.4493 Mean :0.1635 Mean :0.2803 Mean :0.08411
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## Comedy Mystery Romance Thriller
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.3971 Mean :0.08658 Mean :0.1141 Mean :0.2056
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## Sci Fi War Family Horror
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.4815 Mean :0.03701 Mean :0.3236 Mean :0.02173
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.00000
##
## Superhero Novel Animation Franchise
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.1963 Mean :0.1097 Mean :0.2322 Mean :0.6068
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## Sequel
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4687
## 3rd Qu.:1.0000
## Max. :1.0000
##
When we observe the data, we see that the years range from 2015 to 2019. 2020 data was excluded, as relatively few films were released during that period. In addition to the original data, some time was spent gathering additional fields such as genres and city populations. Our primary focus will be total gross, which will be the dependent variable in our dataset.
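As a quick sanity check on these claims (a sketch; its output is omitted here), we can confirm the year range and look at a few of the added genre flags:
# confirm the release-year range and the added genre dummy variables
range(master_file$Year)                                       # expect 2015 to 2019
table(master_file$Year)                                       # rows per release year
colMeans(master_file[, c("Adventure", "Action", "Comedy")])   # share of rows flagged with each genre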
We need to look for outliers in our dataset. Since there are multiple levels (Film, Theatre), we need to look at the average gross at both the film level and the theatre level to make sure there aren't any outliers.
film_avg <- master_file %>%
  group_by(Film) %>%
  summarise(avg_gross = mean(TotalGross), avg_weeks = mean(Weeks)) %>%
  arrange(desc(avg_gross))
head(film_avg)
## # A tibble: 6 x 3
## Film avg_gross avg_weeks
## <chr> <dbl> <dbl>
## 1 Force Awakens 239999. 11.6
## 2 Spectre 157770. 10.1
## 3 AVENGERS ENDGAME 153506. 8.67
## 4 STAR WARS THE LAST JEDI 148581. 7.79
## 5 BEAUTY AND THE BEAST 139429. 12.3
## 6 AVENGERS INFINITY WAR 136245. 9.70
theatre_avg <- master_file %>%
  group_by(TheatreNumber) %>%
  summarise(avg_gross = mean(TotalGross), avg_weeks = mean(Weeks)) %>%
  arrange(desc(avg_gross))
head(theatre_avg)
## # A tibble: 6 x 3
## TheatreNumber avg_gross avg_weeks
## <dbl> <dbl> <dbl>
## 1 52140 276850. 13.6
## 2 14600 251588. 11.3
## 3 14568 246481. 4.47
## 4 14382 227757. 12.1
## 5 14289 218552. 13.2
## 6 997733 204193. 13.9
We immediately notice that Force Awakens has a significantly higher average gross per theatre than any other title.
Next, we summarise the average gross at the film and theatre level to see how the data splits into quartiles.
"Film Average Gross Sales Summary"
## [1] "Film Average Gross Sales Summary"
summary(film_avg$avg_gross)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22546 37162 47063 61637 78539 239999
print("")
## [1] ""
"Theatre Average Gross Sales Summary"
## [1] "Theatre Average Gross Sales Summary"
summary(theatre_avg$avg_gross)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 28 4122 16003 33052 50026 276850
Here are some charts to further investigate the data.
ggplot(film_avg, aes(x=avg_weeks, y=avg_gross)) +
geom_point()+labs(title="Average Film Sales vs Average of Weeks in Theatre",
x ="Average Number of Weeks", y = "Average Gross")
ggplot(film_avg) +
aes(x = "", y = avg_gross) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal() + labs(title="Average Film Sales",
x ="Films", y = "Average Gross")
ggplot(theatre_avg, aes(x=avg_weeks, y=avg_gross)) +
geom_point()+labs(title="Average Theatre Sales vs Average of Weeks in Theatre",
x ="Average Number of Weeks", y = "Average Gross")
ggplot(theatre_avg) +
aes(x = "", y = avg_gross) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal() + labs(title="Average Theatre Sales",
x ="Theatres", y = "Average Gross")
Histograms of the average gross by film and by theatre.
hist(film_avg$avg_gross, breaks = 50, main = "Histogram Avg. Gross Sales by Film", xlab = "Average Sales")
hist(theatre_avg$avg_gross, breaks = 50, main = "Histogram Avg. Gross Sales by Theatre", xlab = "Average Sales")
We will use the IQR method (values more than 1.5 × IQR beyond the first or third quartile) to remove the outliers in the data.
Q <- quantile(theatre_avg$avg_gross, probs= c(0.25, .75), na.rm = TRUE)
Q2 <- quantile(film_avg$avg_gross, probs= c(0.25, .75), na.rm = TRUE)
iqr <- IQR(theatre_avg$avg_gross)
iqr2 <- IQR(film_avg$avg_gross)
up <- Q[2]+1.5*iqr # Upper Range
low<- Q[1]-1.5*iqr # Lower Range
up2 <- Q2[2]+1.5*iqr2 # Upper Range
low2<- Q2[1]-1.5*iqr2 # Lower Range
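The same fence calculation can be wrapped in a small helper for reuse. This is only a sketch, and iqr_fences is our own name rather than part of the analysis; it simply reproduces the inline calculation above.
# sketch of a reusable helper returning the 1.5*IQR fences for any numeric vector
iqr_fences <- function(x, k = 1.5) {
  q <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
  r <- IQR(x, na.rm = TRUE)
  c(lower = unname(q[1] - k * r), upper = unname(q[2] + k * r))
}
iqr_fences(theatre_avg$avg_gross)   # should reproduce low and up
iqr_fences(film_avg$avg_gross)      # should reproduce low2 and up2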
We notice the upper cutoff is roughly 118,882 for theatre average gross and 140,606 for film average gross; both lower cutoffs are negative, so only the high end removes any observations.
'Theatre and Film cutoff'
## [1] "Theatre and Film cutoff"
"Theatre Gross"
## [1] "Theatre Gross"
up
## 75%
## 118882.2
low
## 25%
## -64733.78
"Film Gross"
## [1] "Film Gross"
up2
## 75%
## 140605.5
low2
## 25%
## -24904.13
Next, we eliminate the data outside the cutoff points and identify which rows were removed. We see that 43 theatres and 4 films were considered outliers and removed from our dataset. We also run Grubbs' test from the outliers package on the trimmed averages to confirm that no remaining value is a statistically significant outlier.
#eliminate outliers
eliminated <- subset(theatre_avg, avg_gross > (Q[1] - 1.5*iqr) & avg_gross < (Q[2] + 1.5*iqr))
eliminated2 <- subset(film_avg, avg_gross > (Q2[1] - 1.5*iqr2) & avg_gross < (Q2[2] + 1.5*iqr2))
#which outliers were removed
which(theatre_avg$avg_gross %in% c(boxplot.stats(theatre_avg$avg_gross)$out))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
which(film_avg$avg_gross %in% c(boxplot.stats(film_avg$avg_gross)$out))
## [1] 1 2 3 4
library(outliers)
highest <- grubbs.test(eliminated$avg_gross)
lowest <- grubbs.test(eliminated$avg_gross, opposite = TRUE )
highest2 <- grubbs.test(eliminated2$avg_gross)
lowest2 <- grubbs.test(eliminated2$avg_gross, opposite = TRUE )
highest
##
## Grubbs test for one outlier
##
## data: eliminated$avg_gross
## G = 3.05789, U = 0.98998, p-value = 1
## alternative hypothesis: highest value 118306.926076923 is an outlier
lowest
##
## Grubbs test for one outlier
##
## data: eliminated$avg_gross
## G = 0.90979, U = 0.99911, p-value = 1
## alternative hypothesis: lowest value 28.0000000030941 is an outlier
highest2
##
## Grubbs test for one outlier
##
## data: eliminated2$avg_gross
## G = 2.82005, U = 0.93318, p-value = 0.254
## alternative hypothesis: highest value 139428.994704322 is an outlier
lowest2
##
## Grubbs test for one outlier
##
## data: eliminated2$avg_gross
## G = 1.22238, U = 0.98744, p-value = 1
## alternative hypothesis: lowest value 22546.4670281431 is an outlier
After removing these outlier films and theatres, we have a clean dataset to work with for the rest of our analysis.
# film_avg and theatre_avg are sorted by avg_gross, so the outliers occupy the top rows
exclude_films <- film_avg[1:4, 1]
exclude_theatres <- theatre_avg[1:43, 1]
# drop the outlier films, then the outlier theatres, from the master data
x <- subset(master_file, !Film %in% exclude_films[[1]])
master_clean <- subset(x, !TheatreNumber %in% exclude_theatres[[1]])
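As a final check, and to revisit the goal stated in the introduction, we can confirm the outliers are gone and preview the kind of genre-level ranking we are working towards. This is only a sketch and its output is not shown.
# verify that the excluded films and theatres were dropped
nrow(master_file) - nrow(master_clean)                        # number of rows removed
any(master_clean$Film %in% exclude_films[[1]])                # expect FALSE
any(master_clean$TheatreNumber %in% exclude_theatres[[1]])    # expect FALSE
# example of the target question: top theatres by total gross for one genre
master_clean %>%
  filter(Adventure == 1) %>%
  group_by(TheatreName) %>%
  summarise(total_gross = sum(TotalGross)) %>%
  arrange(desc(total_gross)) %>%
  head(10)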