library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
The data shows films released in the UK and how much each film grossed at each theatre. We want to be able to predict the top theatres for a specific genre or comp (comparable) title. In the movie industry, much of the analysis is done against comparable titles, but first we need to clean the data to make sure we have everything we need.
master_file <- read_excel("C:/Users/munkhnaran.gankhuyag/OneDrive - insidemedia.net/Python/Rentrak Project/master_file_0509.xlsx",
col_types = c("numeric", "text", "numeric",
"text", "text", "numeric", "text",
"text", "text", "text", "date", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric","numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric",
"numeric", "numeric", "numeric"))
summary(master_file)
## Year Film TheatreNumber TheatreName
## Min. :2015 Length:57068 Min. : 7550 Length:57068
## 1st Qu.:2016 Class :character 1st Qu.: 8467 Class :character
## Median :2017 Mode :character Median : 14767 Mode :character
## Mean :2017 Mean : 229319
## 3rd Qu.:2018 3rd Qu.: 50273
## Max. :2019 Max. :1092965
##
## City City Population Circuit Region
## Length:57068 Min. : 1409 Length:57068 Length:57068
## Class :character 1st Qu.: 48516 Class :character Class :character
## Mode :character Median : 117763 Mode :character Mode :character
## Mean : 703944
## 3rd Qu.: 318014
## Max. :7556900
## NA's :1919
## Branch State DateofFirstEngagement
## Length:57068 Length:57068 Min. :2015-01-01 00:00:00
## Class :character Class :character 1st Qu.:2016-06-18 00:00:00
## Mode :character Mode :character Median :2017-09-08 00:00:00
## Mean :2017-08-03 12:47:23
## 3rd Qu.:2018-11-14 00:00:00
## Max. :2020-10-19 00:00:00
##
## Weeks Weeks\r\nInRange MarketShare TotalGross
## Min. : 1.000 Min. : 1.000 Min. :2.000e-08 Min. : 1
## 1st Qu.: 6.000 1st Qu.: 5.000 1st Qu.:9.240e-04 1st Qu.: 18990
## Median : 9.000 Median : 8.000 Median :1.890e-03 Median : 40989
## Mean : 9.411 Mean : 8.882 Mean :2.278e-03 Mean : 59970
## 3rd Qu.:12.000 3rd Qu.:12.000 3rd Qu.:3.137e-03 3rd Qu.: 77285
## Max. :33.000 Max. :33.000 Max. :3.651e-02 Max. :1885916
##
## OpeningWeekendGross Opening7DayGross Week0Gross Week1WeekendGross
## Min. : 0 Min. : 0 Min. : 0.000 Min. : 0
## 1st Qu.: 3667 1st Qu.: 6097 1st Qu.: 0.000 1st Qu.: 2650
## Median : 8615 Median : 14378 Median : 0.000 Median : 8341
## Mean : 14324 Mean : 23707 Mean : 8.365 Mean : 13893
## 3rd Qu.: 17730 3rd Qu.: 29256 3rd Qu.: 0.000 3rd Qu.: 17715
## Max. :301698 Max. :609211 Max. :13828.830 Max. :301698
##
## Week1Gross Week2WeekendGross Week2Gross Week3WeekendGross
## Min. : 0 Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 4536 1st Qu.: 2671 1st Qu.: 4491 1st Qu.: 1629
## Median : 13494 Median : 5891 Median : 9729 Median : 3849
## Mean : 21723 Mean : 8738 Mean : 14333 Mean : 5596
## 3rd Qu.: 28083 3rd Qu.: 11270 3rd Qu.: 18360 3rd Qu.: 7423
## Max. :492339 Max. :158272 Max. :314517 Max. :153820
##
## Week3Gross Week4WeekendGross Week4Gross Week5WeekendGross
## Min. : 0 Min. : 0.0 Min. : 0 Min. : 0
## 1st Qu.: 2654 1st Qu.: 808.6 1st Qu.: 1285 1st Qu.: 239
## Median : 6127 Median : 2308.8 Median : 3582 Median : 1356
## Mean : 8890 Mean : 3408.2 Mean : 5466 Mean : 2171
## 3rd Qu.: 11739 3rd Qu.: 4645.8 3rd Qu.: 7242 3rd Qu.: 3008
## Max. :298679 Max. :126346.5 Max. :235305 Max. :101123
##
## Week5Gross Week6WeekendGross Week6Gross Week7WeekendGross
## Min. : 0.0 Min. : 0.0 Min. : 0 Min. : 0.0
## 1st Qu.: 395.2 1st Qu.: 0.0 1st Qu.: 0 1st Qu.: 0.0
## Median : 2133.1 Median : 676.1 Median : 1024 Median : 223.1
## Mean : 3520.2 Mean : 1329.3 Mean : 2145 Mean : 869.0
## 3rd Qu.: 4882.1 3rd Qu.: 1896.0 3rd Qu.: 2923 3rd Qu.: 1195.9
## Max. :177792.0 Max. :72739.7 Max. :133714 Max. :64386.0
##
## Week7Gross Week8WeekendGross Week8Gross Week9WeekendGross
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0
## Median : 335.1 Median : 0.0 Median : 0.0 Median : 0.0
## Mean : 1355.4 Mean : 517.8 Mean : 801.7 Mean : 307.4
## 3rd Qu.: 1745.8 3rd Qu.: 610.5 3rd Qu.: 880.3 3rd Qu.: 285.4
## Max. :112019.8 Max. :60438.4 Max. :83665.4 Max. :32806.6
##
## Week9Gross Week10WeekendGross Week10Gross Adventure
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. :0.0000
## 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.: 0.0 1st Qu.:0.0000
## Median : 0.0 Median : 0.0 Median : 0.0 Median :1.0000
## Mean : 489.9 Mean : 208.5 Mean : 316.2 Mean :0.6703
## 3rd Qu.: 466.6 3rd Qu.: 115.2 3rd Qu.: 181.4 3rd Qu.:1.0000
## Max. :62609.3 Max. :21501.2 Max. :32906.2 Max. :1.0000
##
## Action Musical Drama History
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.4493 Mean :0.1635 Mean :0.2803 Mean :0.08411
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## Comedy Mystery Romance Thriller
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.3971 Mean :0.08658 Mean :0.1141 Mean :0.2056
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
##
## Sci Fi War Family Horror
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.4815 Mean :0.03701 Mean :0.3236 Mean :0.02173
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.00000
##
## Superhero Novel Animation Franchise
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.1963 Mean :0.1097 Mean :0.2322 Mean :0.6068
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## Sequel
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.4687
## 3rd Qu.:1.0000
## Max. :1.0000
##
When we observe the data, we see that the years range from 2015 to 2019. 2020 data was excluded, as relatively few films were released during that period. In addition to the original data, some time was spent gathering additional fields such as genres and city populations. Our primary focus will be total gross, which will be the dependent variable in our dataset.
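As a quick sanity check on these claims (a sketch; its output is omitted here), we can confirm the year range and look at a few of the added genre flags:
# confirm the release-year range and the added genre dummy variables
range(master_file$Year)                                       # expect 2015 to 2019
table(master_file$Year)                                       # rows per release year
colMeans(master_file[, c("Adventure", "Action", "Comedy")])   # share of rows flagged with each genre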
We need to look for outliers in our dataset. Since there are multiple levels (Film, Theatre), we need to look at the average gross at both the film level and the theatre level to make sure there aren't any outliers.
film_avg <- master_file %>%
  group_by(Film) %>%
  summarise(avg_gross = mean(TotalGross), avg_weeks = mean(Weeks)) %>%
  arrange(desc(avg_gross))
head(film_avg)
## # A tibble: 6 x 3
## Film avg_gross avg_weeks
## <chr> <dbl> <dbl>
## 1 Force Awakens 239999. 11.6
## 2 Spectre 157770. 10.1
## 3 AVENGERS ENDGAME 153506. 8.67
## 4 STAR WARS THE LAST JEDI 148581. 7.79
## 5 BEAUTY AND THE BEAST 139429. 12.3
## 6 AVENGERS INFINITY WAR 136245. 9.70
theatre_avg <- master_file %>%
  group_by(TheatreNumber) %>%
  summarise(avg_gross = mean(TotalGross), avg_weeks = mean(Weeks)) %>%
  arrange(desc(avg_gross))
head(theatre_avg)
## # A tibble: 6 x 3
## TheatreNumber avg_gross avg_weeks
## <dbl> <dbl> <dbl>
## 1 52140 276850. 13.6
## 2 14600 251588. 11.3
## 3 14568 246481. 4.47
## 4 14382 227757. 12.1
## 5 14289 218552. 13.2
## 6 997733 204193. 13.9
We immediately notice that Force Awakens has a significantly higher average gross per theatre than any other title.
Next, we summarise the average gross at the film and theatre level to see how the data splits into quartiles.
"Film Average Gross Sales Summary"
## [1] "Film Average Gross Sales Summary"
summary(film_avg$avg_gross)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22546 37162 47063 61637 78539 239999
print("")
## [1] ""
"Theatre Average Gross Sales Summary"
## [1] "Theatre Average Gross Sales Summary"
summary(theatre_avg$avg_gross)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 28 4122 16003 33052 50026 276850
Here are some charts to further investigate the data.
ggplot(film_avg, aes(x=avg_weeks, y=avg_gross)) +
geom_point()+labs(title="Average Film Sales vs Average of Weeks in Theatre",
x ="Average Number of Weeks", y = "Average Gross")
ggplot(film_avg) +
aes(x = "", y = avg_gross) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal() + labs(title="Average Film Sales",
x ="Films", y = "Average Gross")
ggplot(theatre_avg, aes(x=avg_weeks, y=avg_gross)) +
geom_point()+labs(title="Average Theatre Sales vs Average of Weeks in Theatre",
x ="Average Number of Weeks", y = "Average Gross")
ggplot(theatre_avg) +
aes(x = "", y = avg_gross) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal() + labs(title="Average Theatre Sales",
x ="Theatres", y = "Average Gross")
Histograms of the average gross by film and by theatre.
hist(film_avg$avg_gross, breaks = 50, main = "Histogram Avg. Gross Sales by Film", xlab = "Average Sales")
hist(theatre_avg$avg_gross, breaks = 50, main = "Histogram Avg. Gross Sales by Theatre", xlab = "Average Sales")
We will use the IQR method (values more than 1.5 × IQR beyond the first or third quartile) to remove the outliers in the data.
Q <- quantile(theatre_avg$avg_gross, probs= c(0.25, .75), na.rm = TRUE)
Q2 <- quantile(film_avg$avg_gross, probs= c(0.25, .75), na.rm = TRUE)
iqr <- IQR(theatre_avg$avg_gross)
iqr2 <- IQR(film_avg$avg_gross)
up <- Q[2]+1.5*iqr # Upper Range
low<- Q[1]-1.5*iqr # Lower Range
up2 <- Q2[2]+1.5*iqr2 # Upper Range
low2<- Q2[1]-1.5*iqr2 # Lower Range
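The same fence calculation can be wrapped in a small helper for reuse. This is only a sketch, and iqr_fences is our own name rather than part of the analysis; it simply reproduces the inline calculation above.
# sketch of a reusable helper returning the 1.5*IQR fences for any numeric vector
iqr_fences <- function(x, k = 1.5) {
  q <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE)
  r <- IQR(x, na.rm = TRUE)
  c(lower = unname(q[1] - k * r), upper = unname(q[2] + k * r))
}
iqr_fences(theatre_avg$avg_gross)   # should reproduce low and up
iqr_fences(film_avg$avg_gross)      # should reproduce low2 and up2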
We notice the upper cutoff is roughly 118,882 for theatre average gross and 140,606 for film average gross; both lower cutoffs are negative, so only the high end removes any observations.
'Theatre and Film cutoff'
## [1] "Theatre and Film cutoff"
"Theatre Gross"
## [1] "Theatre Gross"
up
## 75%
## 118882.2
low
## 25%
## -64733.78
"Film Gross"
## [1] "Film Gross"
up2
## 75%
## 140605.5
low2
## 25%
## -24904.13
Next, we eliminate the data outside the cutoff points and identify which rows were removed. We see that 43 theatres and 4 films were considered outliers and removed from our dataset. We also run Grubbs' test from the outliers package on the trimmed averages to confirm that no remaining value is a statistically significant outlier.
#eliminate outliers
eliminated <- subset(theatre_avg, avg_gross > (Q[1] - 1.5*iqr) & avg_gross < (Q[2] + 1.5*iqr))
eliminated2 <- subset(film_avg, avg_gross > (Q2[1] - 1.5*iqr2) & avg_gross < (Q2[2] + 1.5*iqr2))
#which outliers were removed
which(theatre_avg$avg_gross %in% c(boxplot.stats(theatre_avg$avg_gross)$out))
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
which(film_avg$avg_gross %in% c(boxplot.stats(film_avg$avg_gross)$out))
## [1] 1 2 3 4
library(outliers)
highest <- grubbs.test(eliminated$avg_gross)
lowest <- grubbs.test(eliminated$avg_gross, opposite = TRUE )
highest2 <- grubbs.test(eliminated2$avg_gross)
lowest2 <- grubbs.test(eliminated2$avg_gross, opposite = TRUE )
highest
##
## Grubbs test for one outlier
##
## data: eliminated$avg_gross
## G = 3.05789, U = 0.98998, p-value = 1
## alternative hypothesis: highest value 118306.926076923 is an outlier
lowest
##
## Grubbs test for one outlier
##
## data: eliminated$avg_gross
## G = 0.90979, U = 0.99911, p-value = 1
## alternative hypothesis: lowest value 28.0000000030941 is an outlier
highest2
##
## Grubbs test for one outlier
##
## data: eliminated2$avg_gross
## G = 2.82005, U = 0.93318, p-value = 0.254
## alternative hypothesis: highest value 139428.994704322 is an outlier
lowest2
##
## Grubbs test for one outlier
##
## data: eliminated2$avg_gross
## G = 1.22238, U = 0.98744, p-value = 1
## alternative hypothesis: lowest value 22546.4670281431 is an outlier
After removing these outlier films and theatres, we have a clean dataset to work with for the rest of our analysis.
# film_avg and theatre_avg are sorted by avg_gross, so the outliers occupy the top rows
exclude_films <- film_avg[1:4, 1]
exclude_theatres <- theatre_avg[1:43, 1]
# drop the outlier films, then the outlier theatres, from the master data
x <- subset(master_file, !Film %in% exclude_films[[1]])
master_clean <- subset(x, !TheatreNumber %in% exclude_theatres[[1]])
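As a final check, and to revisit the goal stated in the introduction, we can confirm the outliers are gone and preview the kind of genre-level ranking we are working towards. This is only a sketch and its output is not shown.
# verify that the excluded films and theatres were dropped
nrow(master_file) - nrow(master_clean)                        # number of rows removed
any(master_clean$Film %in% exclude_films[[1]])                # expect FALSE
any(master_clean$TheatreNumber %in% exclude_theatres[[1]])    # expect FALSE
# example of the target question: top theatres by total gross for one genre
master_clean %>%
  filter(Adventure == 1) %>%
  group_by(TheatreName) %>%
  summarise(total_gross = sum(TotalGross)) %>%
  arrange(desc(total_gross)) %>%
  head(10)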