In this initial analysis, a cleaned subset of the TMDB dataset from https://www.kaggle.com/tmdb/tmdb-movie-metadata is used.
We examine the dependent and independent variables in the dataset in order to form hypotheses and find patterns in the data.
Visualising the length and breadth of the dataset.
# Overview plot of revenue against title for the whole dataset.
# Columns are resolved through `data = tmdb`; axis labels are set explicitly.
plot(
  revenue ~ title,
  data = tmdb,
  xlab = "title",
  ylab = "revenue",
  main = "Visualisation of the dataset"
)
Descriptive statistics.
# Per-column summary statistics (n, mean, sd, median, trimmed mean, mad,
# min, max, range, skew, kurtosis, se). Columns flagged with `*` in the
# output are non-numeric and were converted to numeric codes for the summary.
psych::describe(tmdb)
## vars n mean sd median
## X 1 3197 1599.00 923.04 1599.00
## budget 2 3197 40943447.66 44521230.99 26000000.00
## original_language* 3 3197 5.28 2.01 5.00
## popularity 4 3197 29.15 36.29 20.54
## release_date* 5 3197 1287.66 690.82 1311.00
## revenue 6 3197 121666487.05 186820671.23 55260558.00
## runtime 7 3197 110.76 21.01 107.00
## title* 8 3197 1598.79 922.75 1599.00
## vote_average 9 3197 6.31 0.88 6.30
## vote_count 10 3197 982.54 1418.77 478.00
## trimmed mad min max range
## X 1599.00 1184.60 1.00 3.197000e+03 3.196000e+03
## budget 32519429.03 28169400.00 1.00 3.800000e+08 3.800000e+08
## original_language* 5.00 0.00 1.00 2.700000e+01 2.600000e+01
## popularity 23.76 17.96 0.02 8.755800e+02 8.755600e+02
## release_date* 1295.87 862.87 1.00 2.474000e+03 2.473000e+03
## revenue 80852502.15 68585903.29 5.00 2.787965e+09 2.787965e+09
## runtime 108.50 17.79 41.00 3.380000e+02 2.970000e+02
## title* 1598.86 1184.60 1.00 3.196000e+03 3.195000e+03
## vote_average 6.34 0.89 0.00 8.500000e+00 8.500000e+00
## vote_count 667.82 545.60 0.00 1.375200e+04 1.375200e+04
## skew kurtosis se
## X 0.00 -1.20 16.32
## budget 2.07 5.44 787400.79
## original_language* 8.39 76.44 0.04
## popularity 9.24 161.66 0.64
## release_date* -0.07 -1.12 12.22
## revenue 3.81 24.78 3304103.25
## runtime 1.70 7.39 0.37
## title* 0.00 -1.20 16.32
## vote_average -0.61 1.94 0.02
## vote_count 3.20 13.80 25.09
Creating a few categorical variables
# is_profit: 1 = film resulted in a net profit (revenue exceeds budget), else 0.
is_profit <- ifelse(tmdb$revenue - tmdb$budget > 0, 1, 0)

# season_no: season of the release month.
#   1 = spring (Mar-May), 2 = summer (Jun-Aug), 3 = fall (Sep-Nov),
#   4 = winter (Dec-Feb), 0 = month matched none of the above
#   (e.g. the release date could not be parsed).
# Bug fix: the winter months used to be written as range(12, 1, 2), which
# returns only the minimum and maximum, c(1, 12), so every February release
# was coded 0 instead of 4.
# NOTE: any season_no table produced before this fix must be regenerated.
release_month <- month(as.Date(tmdb$release_date))  # parse the date once
season_no <- ifelse(
  release_month %in% 3:5, 1,
  ifelse(
    release_month %in% 6:8, 2,
    ifelse(
      release_month %in% 9:11, 3,
      ifelse(release_month %in% c(12, 1, 2), 4, 0)
    )
  )
)

# is_short: 1 = short film, 0 = feature length.
# short films are <50min as per Sundance Film Festival
is_short <- ifelse(tmdb$runtime < 50, 1, 0)

# Append the three derived columns to the data frame.
tmdb <- cbind(tmdb, is_profit, season_no, is_short)
Contingency tables
# Cross-tabulate release season (rows) against the short-film flag (columns).
xtabs(formula = ~ season_no + is_short, data = tmdb)
## is_short
## season_no 0 1
## 0 223 1
## 1 697 1
## 2 829 0
## 3 924 0
## 4 522 0
# Cross-tabulate the profit flag (rows) against original language (columns).
xtabs(formula = ~ is_profit + original_language, data = tmdb)
## original_language
## is_profit af cn da de en es fa fr he hi id is it
## 0 0 1 0 2 740 4 0 12 0 1 1 0 1
## 1 1 4 3 7 2334 11 1 12 1 6 1 1 5
## original_language
## is_profit ja ko nb nl no pl pt ro ru te th vi xx
## 0 4 2 0 1 0 0 0 0 3 0 0 1 0
## 1 9 3 1 1 1 1 2 1 3 1 1 0 1
## original_language
## is_profit zh
## 0 4
## 1 8
Boxplots of variables important to the study.
# Two boxplots side by side: revenue and popularity.
par(mfrow = c(1, 2))
for (field in c("revenue", "popularity")) {
  boxplot(tmdb[[field]], ylab = field)
}
Histograms of suitable data fields.
# Histograms of budget, runtime and vote_average on a 2 x 2 grid.
par(mfrow = c(2, 2))
for (field in c("budget", "runtime", "vote_average")) {
  # Reproduce the default title/axis text hist() derives from `tmdb$<field>`.
  label <- paste0("tmdb$", field)
  hist(tmdb[[field]], main = paste("Histogram of", label), xlab = label)
}
Suitable plots.
# Six exploratory plots on a 2 x 3 grid.
par(mfrow = c(2, 3))

# Pairwise relationships between the numeric fields.
plot(tmdb$revenue, tmdb$budget)
plot(tmdb$vote_count, tmdb$vote_average)
plot(tmdb$revenue, tmdb$popularity)

# Trends over time.
# Bug fix: release_date is not stored as a Date, so it was previously plotted
# without a time axis. Convert it once with as.Date() so the x-axis is a real
# date scale.
release_day <- as.Date(tmdb$release_date)
plot(release_day, tmdb$budget, xlab = "tmdb$release_date")
plot(release_day, tmdb$revenue, xlab = "tmdb$release_date")
plot(release_day, tmdb$popularity, xlab = "tmdb$release_date")
Kendall rank correlation matrix (rounded to 2 decimal places)
# Numeric columns used for the correlation analysis (columns 2, 4, 6, 7, 9
# and 10 of tmdb), selected by name.
numeric_fields <- c(
  "budget", "popularity", "revenue",
  "runtime", "vote_average", "vote_count"
)
kek <- tmdb[, numeric_fields]

# Kendall rank correlations on complete rows, rounded to 2 decimal places.
kendall_cor <- cor(kek, use = "complete.obs", method = "kendall")
round(kendall_cor, 2)
## budget popularity revenue runtime vote_average vote_count
## budget 1.00 0.33 0.50 0.17 -0.09 0.34
## popularity 0.33 1.00 0.52 0.14 0.23 0.81
## revenue 0.50 0.52 1.00 0.15 0.08 0.56
## runtime 0.17 0.14 0.15 1.00 0.27 0.14
## vote_average -0.09 0.23 0.08 0.27 1.00 0.25
## vote_count 0.34 0.81 0.56 0.14 0.25 1.00
Corrgram
# Corrgram of tmdb: shaded cells below the diagonal, pie glyphs above it,
# variable names with min/max values on the diagonal.
corrgram(
  tmdb,
  order = TRUE,
  main = "Corrgram",
  diag.panel = panel.minmax,
  text.panel = panel.txt,
  lower.panel = panel.shade,
  upper.panel = panel.pie
)
Scatterplot matrix
# Scatterplot matrix of every pair of columns in kek.
pairs(x = kek)
Hypothesis testing:
H0: Short films earn as much revenue as feature-length films. H1: Short films earn less revenue than feature-length films.
# Mean revenue for each value of is_short (0 = feature length, 1 = short).
aggregate(tmdb[["revenue"]], by = list(tmdb[["is_short"]]), FUN = mean)
## Group.1 x
## 1 0 121739019
## 2 1 5796450
# Welch two-sample t-test of revenue between feature-length (is_short = 0)
# and short (is_short = 1) films.
# Bug fix: the previous call, t.test(tmdb$is_short, tmdb$revenue), compared
# the mean of the 0/1 indicator itself with the mean of revenue, which does
# not test the stated hypothesis. The formula interface splits revenue by
# group instead.
# alternative = "greater" tests mean(group 0) - mean(group 1) > 0, i.e. H1:
# short films earn less than feature-length films.
# NOTE: output produced by the old call must be regenerated and the
# conclusion re-checked; the dataset contains only two short films, so this
# test has very little power.
t.test(revenue ~ is_short, data = tmdb, alternative = "greater")
##
## Welch Two Sample t-test
##
## data: tmdb$is_short and tmdb$revenue
## t = -36.823, df = 3196, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -128144864 -115188110
## sample estimates:
## mean of x mean of y
## 6.255865e-04 1.216665e+08
Because the p-value is very low, we reject the null hypothesis.