In this initial analysis, a cleaned subset of the TMDB dataset from https://www.kaggle.com/tmdb/tmdb-movie-metadata is used.

We analyse the dependent and independent variables in the dataset in order to form hypotheses and find patterns in the data.

Visualising the length and breadth of the dataset.

# Overview plot of revenue against title across the whole dataset.
plot(
  revenue ~ title,
  data = tmdb,
  main = "Visualisation of the dataset",
  xlab = "title",
  ylab = "revenue"
)

Descriptive statistics.

# Per-column summary statistics for tmdb (n, mean, sd, median, trimmed mean,
# mad, min, max, range, skew, kurtosis, se).
# NOTE(review): rows starred in the output appear to be non-numeric columns
# summarised via integer codes - confirm against the describe() in use.
describe(tmdb)
##                    vars    n         mean           sd      median
## X                     1 3197      1599.00       923.04     1599.00
## budget                2 3197  40943447.66  44521230.99 26000000.00
## original_language*    3 3197         5.28         2.01        5.00
## popularity            4 3197        29.15        36.29       20.54
## release_date*         5 3197      1287.66       690.82     1311.00
## revenue               6 3197 121666487.05 186820671.23 55260558.00
## runtime               7 3197       110.76        21.01      107.00
## title*                8 3197      1598.79       922.75     1599.00
## vote_average          9 3197         6.31         0.88        6.30
## vote_count           10 3197       982.54      1418.77      478.00
##                        trimmed         mad   min          max        range
## X                      1599.00     1184.60  1.00 3.197000e+03 3.196000e+03
## budget             32519429.03 28169400.00  1.00 3.800000e+08 3.800000e+08
## original_language*        5.00        0.00  1.00 2.700000e+01 2.600000e+01
## popularity               23.76       17.96  0.02 8.755800e+02 8.755600e+02
## release_date*          1295.87      862.87  1.00 2.474000e+03 2.473000e+03
## revenue            80852502.15 68585903.29  5.00 2.787965e+09 2.787965e+09
## runtime                 108.50       17.79 41.00 3.380000e+02 2.970000e+02
## title*                 1598.86     1184.60  1.00 3.196000e+03 3.195000e+03
## vote_average              6.34        0.89  0.00 8.500000e+00 8.500000e+00
## vote_count              667.82      545.60  0.00 1.375200e+04 1.375200e+04
##                     skew kurtosis         se
## X                   0.00    -1.20      16.32
## budget              2.07     5.44  787400.79
## original_language*  8.39    76.44       0.04
## popularity          9.24   161.66       0.64
## release_date*      -0.07    -1.12      12.22
## revenue             3.81    24.78 3304103.25
## runtime             1.70     7.39       0.37
## title*              0.00    -1.20      16.32
## vote_average       -0.61     1.94       0.02
## vote_count          3.20    13.80      25.09

Creating a few categorical variables

# Derived categorical variables, appended to tmdb as new columns.

# is_profit: 1 when revenue exceeds budget (film made a net profit), else 0.
is_profit <- ifelse((tmdb$revenue - tmdb$budget) > 0, 1, 0)

# season_no: 1 = spring (Mar-May), 2 = summer (Jun-Aug), 3 = fall (Sep-Nov),
# 4 = winter (Dec-Feb), 0 = no season matched.
# Winter must be c(12, 1, 2): range(12, 1, 2) returns only the minimum and
# maximum (1 and 12), which left every February release in category 0.
release_month <- month(as.Date(tmdb$release_date))
season_no <- ifelse(
  release_month %in% 3:5, 1,
  ifelse(
    release_month %in% 6:8, 2,
    ifelse(
      release_month %in% 9:11, 3,
      ifelse(release_month %in% c(12, 1, 2), 4, 0)
    )
  )
)

# is_short: 1 when runtime is under 50 minutes, else 0.
# Short films are <50min as per Sundance Film Festival.
is_short <- ifelse(tmdb$runtime < 50, 1, 0)

tmdb <- cbind(tmdb, is_profit, season_no, is_short)

Contingency tables

# Cross-tabulate release season (rows) against the short-film flag (columns).
xtabs(~season_no+is_short,data=tmdb)
##          is_short
## season_no   0   1
##         0 223   1
##         1 697   1
##         2 829   0
##         3 924   0
##         4 522   0
# Cross-tabulate the profit flag (rows) against original language (columns).
xtabs(~is_profit+original_language,data=tmdb)
##          original_language
## is_profit   af   cn   da   de   en   es   fa   fr   he   hi   id   is   it
##         0    0    1    0    2  740    4    0   12    0    1    1    0    1
##         1    1    4    3    7 2334   11    1   12    1    6    1    1    5
##          original_language
## is_profit   ja   ko   nb   nl   no   pl   pt   ro   ru   te   th   vi   xx
##         0    4    2    0    1    0    0    0    0    3    0    0    1    0
##         1    9    3    1    1    1    1    2    1    3    1    1    0    1
##          original_language
## is_profit   zh
##         0    4
##         1    8

Boxplots of variables important to the study.

# Two boxplots side by side: revenue on the left, popularity on the right.
par(mfrow = c(1, 2))
boxplot(tmdb[["revenue"]], ylab = "revenue")
boxplot(tmdb[["popularity"]], ylab = "popularity")

Histograms of suitable data fields.

# Histograms of budget, runtime and vote_average on a 2x2 grid.
par(mfrow = c(2, 2))
for (field in c("budget", "runtime", "vote_average")) {
  # Reproduce hist()'s default labelling for an argument written as tmdb$<field>.
  axis_label <- paste0("tmdb$", field)
  hist(
    tmdb[[field]],
    main = paste("Histogram of", axis_label),
    xlab = axis_label
  )
}

Suitable plots.

# Scatter plots of selected variable pairs on a 2x3 grid.
par(mfrow = c(2, 3))

plot(tmdb$revenue, tmdb$budget)
plot(tmdb$vote_count, tmdb$vote_average)
plot(tmdb$revenue, tmdb$popularity)

# release_date is not numeric (describe() reports it as a coded column), so
# convert it to Date once; plotting the raw column does not give a time axis.
# NOTE(review): assumes dates are in a format as.Date() parses by default,
# as the season_no computation also does - confirm.
release_day <- as.Date(tmdb$release_date)
plot(release_day, tmdb$budget, xlab = "release_date", ylab = "budget")
plot(release_day, tmdb$revenue, xlab = "release_date", ylab = "revenue")
plot(release_day, tmdb$popularity, xlab = "release_date", ylab = "popularity")

Correlation matrix (rounded to 2 decimal places)

# Numeric columns used for the correlation matrix and the scatterplot matrix.
# Selected by name rather than by position so the subset stays correct if
# columns are added to or reordered in tmdb.
numeric_fields <- c(
  "budget", "popularity", "revenue",
  "runtime", "vote_average", "vote_count"
)
kek <- tmdb[, numeric_fields]

# Kendall rank correlations over rows with no missing values, rounded to 2 dp.
round(cor(kek, use = "complete.obs", method = "kendall"), 2)
##              budget popularity revenue runtime vote_average vote_count
## budget         1.00       0.33    0.50    0.17        -0.09       0.34
## popularity     0.33       1.00    0.52    0.14         0.23       0.81
## revenue        0.50       0.52    1.00    0.15         0.08       0.56
## runtime        0.17       0.14    0.15    1.00         0.27       0.14
## vote_average  -0.09       0.23    0.08    0.27         1.00       0.25
## vote_count     0.34       0.81    0.56    0.14         0.25       1.00

Corrgram

# Corrgram of tmdb: shaded cells below the diagonal, pie glyphs above it,
# variable names with min/max values on the diagonal.
corrgram(
  tmdb,
  main = "Corrgram",
  order = TRUE,
  diag.panel = panel.minmax,
  text.panel = panel.txt,
  upper.panel = panel.pie,
  lower.panel = panel.shade
)

Scatterplot matrix

# Scatterplot matrix of the columns in kek (the subset used for the
# correlation matrix).
pairs(kek)

Hypothesis testing:

H0: Short films make as much revenue as feature-length films. H1: Short films make less revenue than feature-length films.

# Mean revenue within each is_short group (0 and 1).
aggregate(x = tmdb$revenue, by = list(tmdb$is_short), FUN = mean)
##   Group.1         x
## 1       0 121739019
## 2       1   5796450
# Welch t-test of revenue between the is_short = 0 and is_short = 1 groups.
# The grouping flag belongs on the right-hand side of the formula: passing
# is_short and revenue as two samples compares the mean of the 0/1 flag with
# mean revenue, which does not test the stated hypothesis.
# alternative = "greater" matches H1: mean revenue of the first group
# (is_short = 0) exceeds that of the second group (is_short = 1).
t.test(revenue ~ is_short, data = tmdb, alternative = "greater")
## 
##  Welch Two Sample t-test
## 
## data:  tmdb$is_short and tmdb$revenue
## t = -36.823, df = 3196, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -128144864 -115188110
## sample estimates:
##    mean of x    mean of y 
## 6.255865e-04 1.216665e+08

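Because the p-value is very low, we reject the null hypothesis. Note, however, that only two films in the dataset are classified as short (see the contingency table above), so this conclusion rests on very little data.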