** Please click all the tabs (in sequence) to get the entire set of information in these pages. **

** To download code, see the instructions in Session 2: https://rpubs.com/hkb/DAX-Session2 **

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
options(scipen=10000000)
options(digits=3)
# install.packages("knitr")
library(knitr)

library(dplyr)
library(tidyverse)
library(ggplot2)
library(gridExtra)
library(ggrepel)
library(boxoffice) # because the package is already installed

Session 4

Objectives

Movies Data Set

Lets load the data by making a call to boxofficemojo.com through the boxoffice() library. If, for some reason, you have not yet installed the package look through Session 2 notes and do it.

date.seq <- paste(2000:2019,"-12-31",sep="") 
# Fetch the data 
movies <- boxoffice(date = as.Date(date.seq), top_n = 50)

We’ll extend the data frame by adding - for each movie in the database - Year, and Rank within Year based on gross revenues.

movies <- movies %>% na.omit() %>% mutate(Year =  as.numeric(format(as.Date(date), "%Y"))) # na.omit() omits the rows with NA values; create new column Year. which extracts the Y (year) from the date

# Extract the Year, then Rank by Sales

movies <- movies %>% group_by(Year) %>% arrange(desc(total_gross)) %>%  mutate(rank=row_number())

Who’s making top movies each year?

But what if you want to know how many times a distributor is in the top 50 (or top.n for that year) one year at a time.

leading.distributors <- movies %>% select(distributor, Year) %>% group_by(distributor,Year) %>% mutate(count = n()) 
ggplot(leading.distributors %>% filter(Year > 2009), aes(x=Year, y=distributor, size=count)) + geom_point() + theme_classic()

count.movies <- data.frame(table(movies$Year))
names(count.movies) = c("Year", "Count")
ggplot(count.movies, aes(x=Year, y=Count)) + geom_point() + expand_limits(y=0) + theme_classic()

So, there’s more data in recent years, and fewer movies per year in past years.

sort(table((movies %>% filter(rank < 11, Year > 2009))$distributor), decreasing=TRUE)

 Warner Bros.   Walt Disney  20th Century 
           23            21            14 
Sony Pictures  Paramount Pi     Universal 
           13            10             7 
    Lionsgate  STX Entertai           MGM 
            6             2             1 
   Relativity  Summit Enter Weinstein Co. 
            1             1             1 
hits.per.year <- as.data.frame(table(movies$distributor, movies$Year)) 
names(hits.per.year) = c("distributor", "Year", "count")
hits.per.year <- hits.per.year %>% group_by(distributor) %>% mutate(total = sum(count)) %>% arrange(desc(total))
movies.top10 <- movies %>% filter(rank < 11)  # %>% group_by(Year) %>% arrange(rank)
kable(movies.top10 %>% filter(rank %in% c(1,10)) %>% select(movie, Year, rank, total_gross) %>% arrange(rank)) 
movie Year rank total_gross
Star Wars Ep. VII: The Fo 2015 1 651967269
The Dark Knight 2008 1 530924926
Star Wars Ep. VIII: The L 2017 1 517218368
Frozen II 2019 1 430144682
Rogue One: A Star Wars Story 2016 1 408235850
The Hunger Games: Catchin 2013 1 395526705
The Hunger Games: Mocking 2014 1 313282914
Inception 2010 1 292558188
Skyfall 2012 1 290904271
Harry Potter and the Sorc 2001 1 288493000
The Twilight Saga: New Moon 2009 1 284512392
The Twilight Saga: Breaki 2011 1 274841954
Harry Potter and the Gobl 2005 1 273281180
Dr. Seuss The Grinch 2018 1 266280410
How the Grinch Stole Chri 2000 1 251629105
The Lord of the Rings: Th 2003 1 249400000
Harry Potter and the Cham 2002 1 243855000
I am Legend 2007 1 199345154
The Polar Express 2006 1 176454984
The Polar Express 2004 1 151623383
Lincoln 2012 10 134189097
Creed II 2018 10 112448520
Paranormal Activity 2009 10 107792845
Jackass Presents: Bad Gra 2013 10 101337759
The Social Network 2010 10 92876282
Arrival 2016 10 91676099
Tower Heist 2011 10 76650420
Zombieland: Double Tap 2019 10 72930156
A Bad Moms Christmas 2017 10 71891988
Night at the Museum: Secr 2014 10 69354582
Daddys Home 2015 10 64684278
Yes Man 2008 10 60029690
Rocky Balboa 2006 10 47940632
Mr. Magorium’s Wonder Emp 2007 10 31049456
The Hot Chick 2002 10 24021000
Peter Pan 2003 10 22000000
The Ringer 2005 10 17265628
A Beautiful Mind 2001 10 15949000
All the Pretty Horses 2000 10 7640564
p1 <- ggplot(data=movies, aes(x=rank,y=total_gross)) + geom_line(aes(color=as.factor(Year))) + theme_classic()

p2 <- p1 + coord_trans(y = "log10") # convert y axis to log scale

grid.arrange(p1, p2, ncol=2) # arrange both plots side by side, in two columns

Let’s flip what we put on the horizontal axis vs color. So, let’s make Year the horizontal axis and display the n-th highest grossing movie each year, using color for Rank

p1 <- ggplot(data= movies.top10 %>% filter(rank < 41), aes(x=Year,y=total_gross)) + geom_line(aes(color=as.factor(rank))) + theme_classic()

p2 <- p1 + coord_trans(y = "log10") # convert y axis to log scale

grid.arrange(p1, p2, ncol=2) # arrange both plots side by side, in two columns

How well are the top movies doing each year?

But what if you want to know how many times a distributor is in the top 50 (or top.n for that year) one year at a time.

leading.movies <- movies.top10 %>% select(movie, Year, total_gross) 
ggplot(movies.top10, aes(x=Year, y=rank, size=total_gross)) + geom_point() + theme_classic()

movies.top10 <- movies.top10 %>% mutate(Year.new = Year-1999, rank.new=rev(rank))
lm <- lm(total_gross ~ rank.new * Year.new, data = movies.top10)
summary(lm)

Call:
lm(formula = total_gross ~ rank.new * Year.new, data = movies.top10)

Residuals:
      Min        1Q    Median        3Q       Max 
-98080938 -34321252  -2413607  25734341 338186276 

Coefficients:
                   Estimate Std. Error t value
(Intercept)       -44050046   17349403   -2.54
rank.new           20728716    2830252    7.32
Year.new            4205876    1444418    2.91
rank.new:Year.new    520312     234292    2.22
                         Pr(>|t|)    
(Intercept)                 0.012 *  
rank.new          0.0000000000063 ***
Year.new                    0.004 ** 
rank.new:Year.new           0.028 *  
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 54400000 on 194 degrees of freedom
Multiple R-squared:  0.72,  Adjusted R-squared:  0.715 
F-statistic:  166 on 3 and 194 DF,  p-value: <0.0000000000000002
LS0tCnRpdGxlOiAiU2Vzc2lvbiA0IgphdXRob3I6ICJIZW1hbnQgQmhhcmdhdmEiCmRhdGU6ICI3LzI3LzIwMjAiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCioqIFBsZWFzZSBjbGljayBhbGwgdGhlIHRhYnMgKGluIHNlcXVlbmNlKSB0byBnZXQgdGhlIGVudGlyZSBzZXQgb2YgaW5mb3JtYXRpb24gaW4gdGhlc2UgcGFnZXMuICoqCgoqKiBUbyBkb3dubG9hZCBjb2RlLCBzZWUgdGhlIGluc3RydWN0aW9ucyBpbiBTZXNzaW9uIDI6IGh0dHBzOi8vcnB1YnMuY29tL2hrYi9EQVgtU2Vzc2lvbjIgKioKCgpgYGB7ciBzZXR1cH0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFLCB3YXJuaW5nPUZBTFNFLCBtZXNzYWdlPUZBTFNFKQpvcHRpb25zKHNjaXBlbj0xMDAwMDAwMCkKb3B0aW9ucyhkaWdpdHM9MykKYGBgCgpgYGB7ciBwYWNrYWdlc30KIyBpbnN0YWxsLnBhY2thZ2VzKCJrbml0ciIpCmxpYnJhcnkoa25pdHIpCgpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KGdyaWRFeHRyYSkKbGlicmFyeShnZ3JlcGVsKQpsaWJyYXJ5KGJveG9mZmljZSkgIyBiZWNhdXNlIHRoZSBwYWNrYWdlIGlzIGFscmVhZHkgaW5zdGFsbGVkCmBgYAoKIyBTZXNzaW9uIDQKCiMjIE9iamVjdGl2ZXMKCiogTGlmZSBhcyBhIERhdGEgQW5hbHlzdCAtIFNvbmljIFByYWJodWRlc2FpCgoqIFN0b3J5dGVsbGluZyB3aXRoIERhdGEgLSBmcm9tIEZpbmRIb3RlbC5jb20sIGh0dHBzOi8vYmxvZy5maW5kaG90ZWwubmV0LzIwMjAvMDYvZGF0YS1zdG9yeS1vZi10aGUtdHJhdmVsLW1hcmtldC1yZWNvdmVyeS8gCgoqIFRoZSBNb3ZpZSBCdXNpbmVzczogQ3JlYXRpbmcgbmV3IFN0b3JpZXMgd2l0aCB0aGUgVG9vbHMgeW91J3ZlIExlYXJudAoKIyMgTW92aWVzIERhdGEgU2V0CgpMZXRzIGxvYWQgdGhlIGRhdGEgYnkgbWFraW5nIGEgY2FsbCB0byBib3hvZmZpY2Vtb2pvLmNvbSB0aHJvdWdoIHRoZSBib3hvZmZpY2UoKSBsaWJyYXJ5LiBJZiwgZm9yIHNvbWUgcmVhc29uLCB5b3UgaGF2ZSBub3QgeWV0IGluc3RhbGxlZCB0aGUgcGFja2FnZSBsb29rIHRocm91Z2ggU2Vzc2lvbiAyIG5vdGVzIGFuZCBkbyBpdC4gCgpgYGB7ciBtb3ZpZXMuZGF0YX0KZGF0ZS5zZXEgPC0gcGFzdGUoMjAwMDoyMDE5LCItMTItMzEiLHNlcD0iIikgCiMgRmV0Y2ggdGhlIGRhdGEgCm1vdmllcyA8LSBib3hvZmZpY2UoZGF0ZSA9IGFzLkRhdGUoZGF0ZS5zZXEpLCB0b3BfbiA9IDUwKQpgYGAKCldlJ2xsIGV4dGVuZCB0aGUgZGF0YSBmcmFtZSBieSBhZGRpbmcgLSBmb3IgZWFjaCBtb3ZpZSBpbiB0aGUgZGF0YWJhc2UgLSBZZWFyLCBhbmQgUmFuayB3aXRoaW4gWWVhciBiYXNlZCBvbiBncm9zcyByZXZlbnVlcy4gCgpgYGB7ciBtb3ZpZXMuZXh0ZW5kfSAKbW92aWVzIDwtIG1vdmllcyAlPiUgbmEub21pdCgpICU+JSBtdXRhdGUoWWVhciA9ICBhcy5udW1lcmljKGZvcm1hdChhcy5EYXRlKGRhdGUpLCAiJVkiKSkpICMgbmEub21pdCgpIG9taXRzIHRoZSByb3dzIHdpdGggTkEgdmFsdWVzOyBjcmVhdGUgbmV3IGNvbHVtbiBZZWFyLiB3aGljaCBleHRyYWN0cyB0aGUgWSAoeWVhcikgZnJvbSB0aGUgZGF0ZQoKIyBFeHRyYWN0IHRoZSBZZWFyLCB0aGVuIFJhbmsgYnkgU2FsZXMKCm1vdmllcyA8LSBtb3ZpZXMgJT4lIGdyb3VwX2J5KFllYXIpICU+JSBhcnJhbmdlKGRlc2ModG90YWxfZ3Jvc3MpKSAlPiUgIG11dGF0ZShyYW5rPXJvd19udW1iZXIoKSkKCmBgYAoKIyMgV2hvJ3MgbWFraW5nIHRvcCBtb3ZpZXMgZWFjaCB5ZWFyPwoKQnV0IHdoYXQgaWYgeW91IHdhbnQgdG8ga25vdyBob3cgbWFueSB0aW1lcyBhIGRpc3RyaWJ1dG9yIGlzIGluIHRoZSB0b3AgNTAgKG9yIHRvcC5uIGZvciB0aGF0IHllYXIpIG9uZSB5ZWFyIGF0IGEgdGltZS4gCgpgYGB7ciB0YWJsZS5kaXN0cmlidXRvci5ZZWFyLCBmaWcud2lkdGg9MTUsZmlnLmhlaWdodD0xMn0KbGVhZGluZy5kaXN0cmlidXRvcnMgPC0gbW92aWVzICU+JSBzZWxlY3QoZGlzdHJpYnV0b3IsIFllYXIpICU+JSBncm91cF9ieShkaXN0cmlidXRvcixZZWFyKSAlPiUgbXV0YXRlKGNvdW50ID0gbigpKSAKZ2dwbG90KGxlYWRpbmcuZGlzdHJpYnV0b3JzICU+JSBmaWx0ZXIoWWVhciA+IDIwMDkpLCBhZXMoeD1ZZWFyLCB5PWRpc3RyaWJ1dG9yLCBzaXplPWNvdW50KSkgKyBnZW9tX3BvaW50KCkgKyB0aGVtZV9jbGFzc2ljKCkKYGBgCgoKYGBge3IgdGFibGUuWWVhci5wbG90LCBmaWcud2lkdGg9OCxmaWcuaGVpZ2h0PTN9CmNvdW50Lm1vdmllcyA8LSBkYXRhLmZyYW1lKHRhYmxlKG1vdmllcyRZZWFyKSkKbmFtZXMoY291bnQubW92aWVzKSA9IGMoIlllYXIiLCAiQ291bnQiKQpnZ3Bsb3QoY291bnQubW92aWVzLCBhZXMoeD1ZZWFyLCB5PUNvdW50KSkgKyBnZW9tX3BvaW50KCkgKyBleHBhbmRfbGltaXRzKHk9MCkgKyB0aGVtZV9jbGFzc2ljKCkKYGBgCgpTbywgdGhlcmUncyBtb3JlIGRhdGEgaW4gcmVjZW50IHllYXJzLCBhbmQgZmV3ZXIgbW92aWVzIHBlciB5ZWFyIGluIHBhc3QgeWVhcnMuIAoKYGBge3IgdGFibGV9CnNvcnQodGFibGUoKG1vdmllcyAlPiUgZmlsdGVyKHJhbmsgPCAxMSwgWWVhciA+IDIwMDkpKSRkaXN0cmlidXRvciksIGRlY3JlYXNpbmc9VFJVRSkKaGl0cy5wZXIueWVhciA8LSBhcy5kYXRhLmZyYW1lKHRhYmxlKG1vdmllcyRkaXN0cmlidXRvciwgbW92aWVzJFllYXIpKSAKbmFtZXMoaGl0cy5wZXIueWVhcikgPSBjKCJkaXN0cmlidXRvciIsICJZZWFyIiwgImNvdW50IikKaGl0cy5wZXIueWVhciA8LSBoaXRzLnBlci55ZWFyICU+JSBncm91cF9ieShkaXN0cmlidXRvcikgJT4lIG11dGF0ZSh0b3RhbCA9IHN1bShjb3VudCkpICU+JSBhcnJhbmdlKGRlc2ModG90YWwpKQpgYGAKCmBgYHtyIHRvcDEwfQptb3ZpZXMudG9wMTAgPC0gbW92aWVzICU+JSBmaWx0ZXIocmFuayA8IDExKSAgIyAlPiUgZ3JvdXBfYnkoWWVhcikgJT4lIGFycmFuZ2UocmFuaykKa2FibGUobW92aWVzLnRvcDEwICU+JSBmaWx0ZXIocmFuayAlaW4lIGMoMSwxMCkpICU+JSBzZWxlY3QobW92aWUsIFllYXIsIHJhbmssIHRvdGFsX2dyb3NzKSAlPiUgYXJyYW5nZShyYW5rKSkgCmBgYAoKCmBgYHtyIG1vdmllcy5yYW5rLnBsb3QsZmlnLndpZHRoPTE0LGZpZy5oZWlnaHQ9M30KcDEgPC0gZ2dwbG90KGRhdGE9bW92aWVzLCBhZXMoeD1yYW5rLHk9dG90YWxfZ3Jvc3MpKSArIGdlb21fbGluZShhZXMoY29sb3I9YXMuZmFjdG9yKFllYXIpKSkgKyB0aGVtZV9jbGFzc2ljKCkKCnAyIDwtIHAxICsgY29vcmRfdHJhbnMoeSA9ICJsb2cxMCIpICMgY29udmVydCB5IGF4aXMgdG8gbG9nIHNjYWxlCgpncmlkLmFycmFuZ2UocDEsIHAyLCBuY29sPTIpICMgYXJyYW5nZSBib3RoIHBsb3RzIHNpZGUgYnkgc2lkZSwgaW4gdHdvIGNvbHVtbnMKYGBgCgpMZXQncyBmbGlwIHdoYXQgd2UgcHV0IG9uIHRoZSBob3Jpem9udGFsIGF4aXMgdnMgY29sb3IuIFNvLCBsZXQncyBtYWtlIFllYXIgdGhlIGhvcml6b250YWwgYXhpcyBhbmQgZGlzcGxheSB0aGUgbi10aCBoaWdoZXN0IGdyb3NzaW5nIG1vdmllIGVhY2ggeWVhciwgdXNpbmcgY29sb3IgZm9yIFJhbmsKYGBge3IgbW92aWVzLlllYXIucGxvdCxmaWcud2lkdGg9MTQsZmlnLmhlaWdodD0zfQpwMSA8LSBnZ3Bsb3QoZGF0YT0gbW92aWVzLnRvcDEwICU+JSBmaWx0ZXIocmFuayA8IDQxKSwgYWVzKHg9WWVhcix5PXRvdGFsX2dyb3NzKSkgKyBnZW9tX2xpbmUoYWVzKGNvbG9yPWFzLmZhY3RvcihyYW5rKSkpICsgdGhlbWVfY2xhc3NpYygpCgpwMiA8LSBwMSArIGNvb3JkX3RyYW5zKHkgPSAibG9nMTAiKSAjIGNvbnZlcnQgeSBheGlzIHRvIGxvZyBzY2FsZQoKZ3JpZC5hcnJhbmdlKHAxLCBwMiwgbmNvbD0yKSAjIGFycmFuZ2UgYm90aCBwbG90cyBzaWRlIGJ5IHNpZGUsIGluIHR3byBjb2x1bW5zCmBgYAoKCiMjIEhvdyB3ZWxsIGFyZSB0aGUgdG9wIG1vdmllcyBkb2luZyBlYWNoIHllYXI/CgpCdXQgd2hhdCBpZiB5b3Ugd2FudCB0byBrbm93IGhvdyBtYW55IHRpbWVzIGEgZGlzdHJpYnV0b3IgaXMgaW4gdGhlIHRvcCA1MCAob3IgdG9wLm4gZm9yIHRoYXQgeWVhcikgb25lIHllYXIgYXQgYSB0aW1lLiAKCmBgYHtyIG1vdmllLnllYXIuZ3Jvc3MsIGZpZy53aWR0aD0xNSxmaWcuaGVpZ2h0PTh9CmxlYWRpbmcubW92aWVzIDwtIG1vdmllcy50b3AxMCAlPiUgc2VsZWN0KG1vdmllLCBZZWFyLCB0b3RhbF9ncm9zcykgCmdncGxvdChtb3ZpZXMudG9wMTAsIGFlcyh4PVllYXIsIHk9cmFuaywgc2l6ZT10b3RhbF9ncm9zcykpICsgZ2VvbV9wb2ludCgpICsgdGhlbWVfY2xhc3NpYygpCmBgYAoKYGBge3IgbW92aWUueWVhci5ncm9zcy5yZWd9Cm1vdmllcy50b3AxMCA8LSBtb3ZpZXMudG9wMTAgJT4lIG11dGF0ZShZZWFyLm5ldyA9IFllYXItMTk5OSwgcmFuay5uZXc9cmV2KHJhbmspKQpsbSA8LSBsbSh0b3RhbF9ncm9zcyB+IHJhbmsubmV3ICogWWVhci5uZXcsIGRhdGEgPSBtb3ZpZXMudG9wMTApCnN1bW1hcnkobG0pCmBgYA==