In this initial analysis, a cleaned subset of the TMDB dataset from https://www.kaggle.com/tmdb/tmdb-movie-metadata is used.

We analyse the dependent and independent variables in the dataset in order to form hypotheses and find patterns in the data.

Cleaning the dataset

# Load the raw TMDB movie export and reduce it to the rows and columns used by
# the rest of the analysis.
# NOTE(review): the path is an absolute location on one machine; adjust it (or
# read it from configuration) before running elsewhere.
tmdb_path <- 'C:\\Users\\ADI\\Downloads\\tmdb_5000_movies.csv'
tmdb <- read.csv(tmdb_path, header = TRUE, sep = ",", stringsAsFactors = FALSE)

# Keep only rows with no missing values and a non-zero budget and revenue.
# complete.cases() is FALSE for any row holding an NA, so the combined mask
# never evaluates to NA.
keep_row <- complete.cases(tmdb) & tmdb$budget != 0 & tmdb$revenue != 0
tmdb <- tmdb[keep_row, ]

# Keep the twelve columns used downstream, selected by position in the raw file.
tmdb <- tmdb[, c(1, 2, 4, 6, 9, 12, 13, 14, 15, 18, 19, 20)]

# Drop the row at position 3031 of the filtered data. A negative index is
# ignored when the frame is shorter than that, so no NA rows are introduced.
# NOTE(review): the reason this particular row is excluded is not visible
# here - confirm it is still the intended row if the input file changes.
tmdb <- tmdb[-3031, ]

Extracting genre data from json string

# Expand the JSON-encoded `genres` string of every film into one indicator
# column per genre name: the cell is set to 1 when the film lists that genre.
# Columns are created on first use, in the order the genres are encountered.
for (i in seq_len(nrow(tmdb))) {
  genres <- fromJSON(tmdb$genres[i])
  # A film with no genres does not parse to a data frame with rows; skip it
  # rather than indexing into an empty result.
  if (!is.data.frame(genres) || nrow(genres) == 0) {
    next
  }
  # The second column of the parsed frame holds the genre names.
  for (genre_name in genres[, 2]) {
    tmdb[i, genre_name] <- 1
  }
}

# Cells never set above are NA (film does not list that genre); code them as 0.
tmdb[is.na(tmdb)] <- 0

# Drop columns 2 and 9 by position.
# NOTE(review): presumably the raw `genres` JSON and one other unused text
# column - confirm against the column selection made when loading the data.
tmdb <- tmdb[-c(2, 9)]

Visualising the length and breadth of the dataset.

# Overview scatter plot: revenue (in millions) against release date.
plot(
  revenue / 10^6 ~ as.Date(release_date),
  data = tmdb,
  main = "Visualisation of the dataset",
  xlab = "Release date",
  ylab = "Revenue (Million USD)"
)

Descriptive statistics.

# Summary statistics for every column of tmdb, rendered as a table.
tmdb_stats <- describe(tmdb)
kable(tmdb_stats)
## Warning in describe(tmdb): NAs introduced by coercion

## Warning in describe(tmdb): NAs introduced by coercion

## Warning in describe(tmdb): NAs introduced by coercion
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning
## Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf

## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning
## -Inf
vars n mean sd median trimmed mad min max range skew kurtosis se
budget 1 3228 4.066642e+07 4.439840e+07 2.500000e+07 3.223486e+07 2.742810e+07 1.000000 3.800000e+08 3.800000e+08 2.0874924 5.5103834 7.814489e+05
id 2 3228 4.477959e+04 7.462097e+04 1.144650e+04 2.524563e+04 1.486603e+04 5.000000 4.178590e+05 4.178540e+05 2.3702084 4.9185428 1.313391e+03
original_language* 3 3228 NaN NA NA NaN NA Inf -Inf -Inf NA NA NA
popularity 4 3228 2.904267e+01 3.616773e+01 2.041296e+01 2.366746e+01 1.787573e+01 0.019984 8.755813e+02 8.755613e+02 9.2578808 162.4783998 6.365822e-01
release_date* 5 3228 NaN NA NA NaN NA Inf -Inf -Inf NA NA NA
revenue 6 3228 1.212800e+08 1.863198e+08 5.519150e+07 8.061051e+07 6.878759e+07 5.000000 2.787965e+09 2.787965e+09 3.8161677 24.8492115 3.279384e+06
runtime 7 3228 1.107215e+02 2.096831e+01 1.070000e+02 1.084652e+02 1.779120e+01 41.000000 3.380000e+02 2.970000e+02 1.6932466 7.3879771 3.690597e-01
title* 8 3228 9.744000e+02 9.566594e+02 8.540000e+02 9.611250e+02 1.243901e+03 9.000000 2.046000e+03 2.037000e+03 0.0560208 -2.1009033 1.683800e+01
vote_average 9 3228 6.309758e+00 8.737225e-01 6.300000e+00 6.341022e+00 8.895600e-01 0.000000 8.500000e+00 8.500000e+00 -0.6084960 1.9475113 1.537820e-02
vote_count 10 3228 9.775895e+02 1.414424e+03 4.710000e+02 6.639443e+02 5.396664e+02 0.000000 1.375200e+04 1.375200e+04 3.2042773 13.8899767 2.489505e+01
Action 11 3228 2.843866e-01 4.511917e-01 0.000000e+00 2.306502e-01 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 0.9554546 -1.0874431 7.941300e-03
Adventure 12 3228 2.047708e-01 4.035965e-01 0.000000e+00 1.311920e-01 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 1.4625388 0.1390631 7.103600e-03
Fantasy 13 3228 1.059480e-01 3.078187e-01 0.000000e+00 7.739900e-03 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 2.5594936 4.5524181 5.417900e-03
Science Fiction 14 3228 1.335192e-01 3.401877e-01 0.000000e+00 4.218270e-02 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 2.1539124 2.6401567 5.987600e-03
Crime 15 3228 1.614002e-01 3.679567e-01 0.000000e+00 7.701240e-02 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 1.8398623 1.3855229 6.476300e-03
Drama 16 3228 4.464064e-01 4.971965e-01 0.000000e+00 4.330495e-01 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 0.2155162 -1.9541578 8.751100e-03
Thriller 17 3228 2.896530e-01 4.536716e-01 0.000000e+00 2.372291e-01 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 0.9270221 -1.1409832 7.985000e-03
Animation 18 3228 5.824040e-02 2.342337e-01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 3.7707867 12.2226189 4.122700e-03
Family 19 3228 1.130731e-01 3.167312e-01 0.000000e+00 1.664090e-02 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 2.4424939 3.9670056 5.574700e-03
Western 20 3228 1.765800e-02 1.317253e-01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 7.3211823 51.6156999 2.318500e-03
Comedy 21 3228 3.438662e-01 4.750707e-01 0.000000e+00 3.049536e-01 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 0.6571042 -1.5686998 8.361600e-03
Romance 22 3228 1.778191e-01 3.824196e-01 0.000000e+00 9.752320e-02 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 1.6844382 0.8375919 6.730900e-03
Horror 23 3228 1.028501e-01 3.038100e-01 0.000000e+00 3.870000e-03 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 2.6136526 4.8326772 5.347300e-03
Mystery 24 3228 8.209420e-02 2.745507e-01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 3.0433461 7.2642060 4.832300e-03
History 25 3228 4.491950e-02 2.071593e-01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 4.3921705 17.2965199 3.646200e-03
War 26 3228 3.717470e-02 1.892191e-01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 4.8904365 21.9231610 3.330400e-03
Music 27 3228 3.438660e-02 1.822484e-01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 5.1080748 24.0998939 3.207700e-03
Documentary 28 3228 1.177200e-02 1.078750e-01 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 9.0489300 79.9078888 1.898700e-03
Foreign 29 3228 1.548900e-03 3.933230e-02 0.000000e+00 0.000000e+00 0.000000e+00 0.000000 1.000000e+00 1.000000e+00 25.3378082 640.2028513 6.923000e-04

Creating a few more categorical variables

# Derived indicator variables used by the tables, tests and models below.

# is_profit: 1 when revenue exceeds budget (the film made a net profit), else 0.
is_profit <- ifelse((tmdb$revenue - tmdb$budget) > 0, 1, 0)

# season_no: release season taken from the month of release_date.
# 1 = spring (Mar-May), 2 = summer (Jun-Aug), 3 = fall (Sep-Nov),
# 4 = winter (Dec-Feb), 0 = month could not be determined.
release_month <- month(as.Date(tmdb$release_date))
season_by_month <- c(4, 4, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4)
season_no <- season_by_month[release_month]
season_no[is.na(season_no)] <- 0

# is_niche: 0 when the film carries at least one of the genres below, else 1.
# The genres are selected by name rather than by column position, so the
# result does not depend on the order in which genre columns were created.
mainstream_genres <- c(
  "Action", "Adventure", "Fantasy", "Science Fiction", "Thriller",
  "Animation", "Family", "Comedy", "Romance", "Horror"
)
mainstream_present <- intersect(mainstream_genres, names(tmdb))
mainstream_count <- rowSums(tmdb[, mainstream_present, drop = FALSE])
tmdb$is_niche <- as.numeric(mainstream_count == 0)

tmdb <- cbind(tmdb, is_profit, season_no)

Contingency tables

One way tables:

# Number of films per release season, with a grand total.
season_counts <- table(tmdb$season_no)
addmargins(season_counts)
## 
##    1    2    3    4  Sum 
##  704  837  933  754 3228
# Number of films by profit flag (0 = no net profit, 1 = net profit), with total.
profit_counts <- table(tmdb$is_profit)
addmargins(profit_counts)
## 
##    0    1  Sum 
##  790 2438 3228
# Release season against the Action genre flag, with row and column totals.
season_by_action <- xtabs(~ season_no + Action, data = tmdb)
addmargins(season_by_action)
##          Action
## season_no    0    1  Sum
##       1    476  228  704
##       2    548  289  837
##       3    721  212  933
##       4    565  189  754
##       Sum 2310  918 3228
# Profit flag against the Fantasy genre flag, with row and column totals.
profit_by_fantasy <- xtabs(~ is_profit + Fantasy, data = tmdb)
addmargins(profit_by_fantasy)
##          Fantasy
## is_profit    0    1  Sum
##       0    723   67  790
##       1   2163  275 2438
##       Sum 2886  342 3228

Boxplots of variables important to the study.

# Two boxplots side by side: revenue first, then popularity.
par(mfrow = c(1, 2))
for (field in c("revenue", "popularity")) {
  boxplot(tmdb[[field]], ylab = field)
}

Histograms of suitable data fields.

# Two histograms stacked vertically: release season codes and average votes.
par(mfrow = c(2, 1))

# season_no holds whole-number season codes, so give it unit-width bins
# centred on each code; the default breaks split the range into many narrow,
# mostly empty bins.
season_breaks <- seq(
  min(tmdb$season_no) - 0.5,
  max(tmdb$season_no) + 0.5,
  by = 1
)
hist(tmdb$season_no, breaks = season_breaks)

hist(tmdb$vote_average)

Suitable plots.

# Six scatter plots on a 2 x 3 grid. The argument expressions are kept in full
# because the default axis labels are taken from them.
par(mfrow = c(2, 3))

# Revenue on the horizontal axis against budget, average vote and popularity.
plot(x = tmdb$revenue, y = tmdb$budget)
plot(x = tmdb$revenue, y = tmdb$vote_average)
plot(x = tmdb$revenue, y = tmdb$popularity)

# Revenue against release season code.
plot(x = tmdb$season_no, y = tmdb$revenue)

# Revenue and popularity over release date.
plot(x = as.Date(tmdb$release_date), y = tmdb$revenue)
plot(x = as.Date(tmdb$release_date), y = tmdb$popularity)

Correlation matrix (rounded to 2 decimal places)

# Select the columns for the correlation analysis by position (1, 4, 6, 7 and
# 9 through 32) and keep them in `kek`, which later code reuses.
corr_columns <- c(1, 4, 6, 7, 9:32)
kek <- tmdb[, corr_columns]

# Kendall rank correlations on complete observations, rounded to 2 decimals
# and rendered as a table.
kendall_corr <- cor(kek, method = "kendall", use = "complete.obs")
kable(round(kendall_corr, 2))
budget popularity revenue runtime vote_average vote_count Action Adventure Fantasy Science Fiction Crime Drama Thriller Animation Family Western Comedy Romance Horror Mystery History War Music Documentary Foreign is_niche is_profit season_no
budget 1.00 0.33 0.50 0.17 -0.09 0.34 0.24 0.28 0.19 0.13 -0.01 -0.16 0.05 0.17 0.18 -0.03 -0.02 -0.09 -0.15 0.02 0.00 0.01 -0.06 -0.11 -0.05 -0.16 0.04 -0.05
popularity 0.33 1.00 0.52 0.14 0.23 0.82 0.13 0.17 0.14 0.11 0.02 -0.12 0.07 0.11 0.06 -0.03 -0.09 -0.10 -0.01 0.04 -0.05 0.00 -0.07 -0.11 -0.05 -0.11 0.34 -0.04
revenue 0.50 0.52 1.00 0.16 0.09 0.56 0.15 0.23 0.15 0.08 -0.04 -0.17 0.01 0.15 0.17 -0.06 0.00 -0.05 -0.06 0.00 -0.02 -0.01 -0.03 -0.08 -0.05 -0.14 0.47 -0.05
runtime 0.17 0.14 0.16 1.00 0.27 0.14 0.08 0.07 -0.02 0.00 0.05 0.27 0.03 -0.22 -0.20 0.05 -0.27 0.04 -0.17 0.04 0.19 0.15 0.00 -0.05 -0.03 0.19 0.05 0.02
vote_average -0.09 0.23 0.09 0.27 1.00 0.25 -0.11 -0.04 -0.05 -0.05 0.05 0.26 -0.08 0.04 -0.04 0.05 -0.17 0.02 -0.14 0.01 0.11 0.09 0.04 0.06 0.00 0.22 0.19 0.03
vote_count 0.34 0.82 0.56 0.14 0.25 1.00 0.13 0.17 0.14 0.13 0.02 -0.13 0.06 0.09 0.06 -0.04 -0.07 -0.09 0.00 0.03 -0.05 -0.01 -0.08 -0.10 -0.05 -0.12 0.36 -0.04
Action 0.24 0.13 0.15 0.08 -0.11 0.13 1.00 0.36 0.07 0.25 0.14 -0.24 0.29 -0.09 -0.12 0.04 -0.19 -0.22 -0.08 -0.06 0.00 0.05 -0.09 -0.06 -0.02 -0.24 0.00 -0.08
Adventure 0.28 0.17 0.23 0.07 -0.04 0.17 0.36 1.00 0.24 0.23 -0.13 -0.26 -0.01 0.19 0.23 0.03 -0.07 -0.14 -0.13 -0.07 -0.03 0.01 -0.07 -0.06 -0.02 -0.20 0.05 -0.08
Fantasy 0.19 0.14 0.15 -0.02 -0.05 0.14 0.07 0.24 1.00 0.05 -0.13 -0.16 -0.11 0.12 0.27 -0.03 0.02 -0.03 0.01 -0.04 -0.07 -0.05 -0.02 -0.04 -0.01 -0.13 0.04 -0.04
Science Fiction 0.13 0.11 0.08 0.00 -0.05 0.13 0.25 0.23 0.05 1.00 -0.13 -0.22 0.10 -0.02 -0.01 -0.05 -0.14 -0.13 0.07 0.01 -0.09 -0.07 -0.07 -0.04 -0.02 -0.15 0.01 -0.09
Crime -0.01 0.02 -0.04 0.05 0.05 0.02 0.14 -0.13 -0.13 -0.13 1.00 0.07 0.30 -0.11 -0.15 -0.01 -0.09 -0.13 -0.08 0.13 -0.06 -0.08 -0.06 -0.05 -0.02 -0.01 -0.01 0.03
Drama -0.16 -0.12 -0.17 0.27 0.26 -0.13 -0.24 -0.26 -0.16 -0.22 0.07 1.00 -0.04 -0.19 -0.20 -0.01 -0.23 0.20 -0.19 0.03 0.20 0.14 0.05 -0.09 0.01 0.33 -0.10 0.11
Thriller 0.05 0.07 0.01 0.03 -0.08 0.06 0.29 -0.01 -0.11 0.10 0.30 -0.04 1.00 -0.15 -0.22 -0.06 -0.37 -0.22 0.21 0.25 -0.08 -0.05 -0.11 -0.07 -0.03 -0.25 -0.01 -0.01
Animation 0.17 0.11 0.15 -0.22 0.04 0.09 -0.09 0.19 0.12 -0.02 -0.11 -0.19 -0.15 1.00 0.60 -0.01 0.11 -0.09 -0.08 -0.07 -0.05 -0.04 0.03 -0.03 0.02 -0.10 0.03 -0.02
Family 0.18 0.06 0.17 -0.20 -0.04 0.06 -0.12 0.23 0.27 -0.01 -0.15 -0.20 -0.22 0.60 1.00 -0.03 0.19 -0.08 -0.12 -0.10 -0.08 -0.07 0.06 -0.01 0.04 -0.14 0.05 -0.04
Western -0.03 -0.03 -0.06 0.05 0.05 -0.04 0.04 0.03 -0.03 -0.05 -0.01 -0.01 -0.06 -0.01 -0.03 1.00 -0.05 -0.03 -0.05 -0.02 0.03 0.01 0.01 -0.01 -0.01 0.11 -0.02 -0.01
Comedy -0.02 -0.09 0.00 -0.27 -0.17 -0.07 -0.19 -0.07 0.02 -0.14 -0.09 -0.23 -0.37 0.11 0.19 -0.05 1.00 0.19 -0.16 -0.17 -0.14 -0.12 0.03 -0.05 -0.01 -0.28 0.01 -0.03
Romance -0.09 -0.10 -0.05 0.04 0.02 -0.09 -0.22 -0.14 -0.03 -0.13 -0.13 0.20 -0.22 -0.09 -0.08 -0.03 0.19 1.00 -0.13 -0.07 -0.02 0.00 0.12 -0.05 0.02 -0.18 0.01 0.02
Horror -0.15 -0.01 -0.06 -0.17 -0.14 0.00 -0.08 -0.13 0.01 0.07 -0.08 -0.19 0.21 -0.08 -0.12 -0.05 -0.16 -0.13 1.00 0.15 -0.07 -0.06 -0.06 -0.04 -0.01 -0.13 0.05 0.00
Mystery 0.02 0.04 0.00 0.04 0.01 0.03 -0.06 -0.07 -0.04 0.01 0.13 0.03 0.25 -0.07 -0.10 -0.02 -0.17 -0.07 0.15 1.00 -0.05 -0.04 -0.05 -0.03 -0.01 -0.05 -0.02 0.00
History 0.00 -0.05 -0.02 0.19 0.11 -0.05 0.00 -0.03 -0.07 -0.09 -0.06 0.20 -0.08 -0.05 -0.08 0.03 -0.14 -0.02 -0.07 -0.05 1.00 0.31 -0.01 0.02 -0.01 0.22 -0.03 0.07
War 0.01 0.00 -0.01 0.15 0.09 -0.01 0.05 0.01 -0.05 -0.07 -0.08 0.14 -0.05 -0.04 -0.07 0.01 -0.12 0.00 -0.06 -0.04 0.31 1.00 -0.03 -0.02 -0.01 0.10 -0.02 0.04
Music -0.06 -0.07 -0.03 0.00 0.04 -0.08 -0.09 -0.07 -0.02 -0.07 -0.06 0.05 -0.11 0.03 0.06 0.01 0.03 0.12 -0.06 -0.05 -0.01 -0.03 1.00 0.09 -0.01 0.06 0.00 0.02
Documentary -0.11 -0.11 -0.08 -0.05 0.06 -0.10 -0.06 -0.06 -0.04 -0.04 -0.05 -0.09 -0.07 -0.03 -0.01 -0.01 -0.05 -0.05 -0.04 -0.03 0.02 -0.02 0.09 1.00 0.07 0.22 -0.01 0.02
Foreign -0.05 -0.05 -0.05 -0.03 0.00 -0.05 -0.02 -0.02 -0.01 -0.02 -0.02 0.01 -0.03 0.02 0.04 -0.01 -0.01 0.02 -0.01 -0.01 -0.01 -0.01 -0.01 0.07 1.00 0.01 -0.07 0.01
is_niche -0.16 -0.11 -0.14 0.19 0.22 -0.12 -0.24 -0.20 -0.13 -0.15 -0.01 0.33 -0.25 -0.10 -0.14 0.11 -0.28 -0.18 -0.13 -0.05 0.22 0.10 0.06 0.22 0.01 1.00 -0.05 0.08
is_profit 0.04 0.34 0.47 0.05 0.19 0.36 0.00 0.05 0.04 0.01 -0.01 -0.10 -0.01 0.03 0.05 -0.02 0.01 0.01 0.05 -0.02 -0.03 -0.02 0.00 -0.01 -0.07 -0.05 1.00 -0.03
season_no -0.05 -0.04 -0.05 0.02 0.03 -0.04 -0.08 -0.08 -0.04 -0.09 0.03 0.11 -0.01 -0.02 -0.04 -0.01 -0.03 0.02 0.00 0.00 0.07 0.04 0.02 0.02 0.01 0.08 -0.03 1.00

Corrgram

# Corrgram of tmdb with reordered variables: shaded cells below the diagonal,
# pie glyphs above it, and min/max values on the diagonal.
corrgram(
  tmdb,
  main = "Corrgram",
  order = TRUE,
  upper.panel = panel.pie,
  lower.panel = panel.shade,
  diag.panel = panel.minmax,
  text.panel = panel.txt
)

Scatterplot matrix

# Scatterplot matrix of the first six columns of `kek` plus its 27th column.
pairs_columns <- c(seq_len(6), 27)
pairs(kek[, pairs_columns])

Hypothesis testing:

Hypothesis 1-

H0:There is no significant revenue difference across release seasons for films. H1:There is a significant revenue difference across release seasons for films.

# Hypothesis 1 asks whether revenue differs between release seasons, so
# revenue is the response and season_no (a four-level code) is the grouping
# factor. Welch's one-way test compares the group means without assuming
# equal variances. Passing season_no and revenue as two samples would instead
# compare the mean season code with the mean revenue.
# NOTE: the console output printed below in this document came from the
# earlier two-sample call; re-knit the document to refresh it.
oneway.test(revenue ~ factor(season_no), data = tmdb)
## 
##  Welch Two Sample t-test
## 
## data:  tmdb$season_no and tmdb$revenue
## t = -36.983, df = 3227, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -127709883 -114850109
## sample estimates:
##    mean of x    mean of y 
## 2.538104e+00 1.212800e+08

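Because the p-value is very low (p < 2.2e-16), we reject the null hypothesis. Caveat: the output shown above is from a two-sample t-test between the season_no codes and revenue (sample means 2.54 and 1.21e+08), which compares two different quantities rather than revenue across seasons, so this conclusion should be re-checked with a test of revenue grouped by season.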

Hypothesis 2-

H0: The average score of a movie does not depend on whether the movie is niche or mainstream. H1: The average score of a movie depends on whether the movie is niche or mainstream.

  # Hypothesis 2 compares average scores between niche (is_niche = 1) and
  # mainstream (is_niche = 0) films, so vote_average is split by is_niche.
  # Passing the two columns as separate samples would instead compare the mean
  # score with the mean of the 0/1 flag.
  # NOTE: the console output printed below in this document came from the
  # earlier two-sample call; re-knit the document to refresh it.
  t.test(vote_average ~ is_niche, data = tmdb)
## 
##  Welch Two Sample t-test
## 
## data:  tmdb$vote_average and tmdb$is_niche
## t = 375.27, df = 4156.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  6.148595 6.213177
## sample estimates:
## mean of x mean of y 
## 6.3097584 0.1288724

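Because the p-value is very low (p < 2.2e-16), we reject the null hypothesis. Caveat: the output shown above is from a two-sample t-test between vote_average and the 0/1 is_niche flag (sample means 6.31 and 0.129), which compares the mean score with the share of niche films rather than the scores of niche and mainstream films, so this conclusion should be re-checked with a test of vote_average grouped by is_niche.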

Regression analysis:

Model1- revenue=b0+b1(budget)+b2(season)+b3(average_votes)+b4(popularity)

# Model 1: linear regression of revenue on budget, release season code,
# average vote and popularity. The formula is written inline so the printed
# call shows it in full.
reg <- lm(
  revenue ~ budget + season_no + vote_average + popularity,
  data = tmdb
)
summary(reg)
## 
## Call:
## lm(formula = revenue ~ budget + season_no + vote_average + popularity, 
##     data = tmdb)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -942315694  -48463108   -8457926   28342093 1982127522 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.659e+08  1.618e+07 -10.253   <2e-16 ***
## budget        2.378e+00  5.167e-02  46.017   <2e-16 ***
## season_no    -4.251e+06  1.901e+06  -2.236   0.0254 *  
## vote_average  2.423e+07  2.464e+06   9.830   <2e-16 ***
## popularity    1.667e+06  6.594e+04  25.285   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115200000 on 3223 degrees of freedom
## Multiple R-squared:  0.6182, Adjusted R-squared:  0.6178 
## F-statistic:  1305 on 4 and 3223 DF,  p-value: < 2.2e-16

revenue=(-1.659e+08)+(2.378e+00)(budget)+(-4.251e+06)(season)+(2.423e+07)(average_votes)+(1.667e+06)(popularity)

Model2- In the previous model, season_no is the weakest predictor: it is significant only at the 5% level (p = 0.0254, a single *), whereas every other variable has p < 2e-16. We therefore fit a reduced model for revenue without season_no, which we take as the best-fit model.

# Model 2: the revenue regression refitted without season_no.
reg1 <- lm(
  revenue ~ budget + vote_average + popularity,
  data = tmdb
)
summary(reg1)
## 
## Call:
## lm(formula = revenue ~ budget + vote_average + popularity, data = tmdb)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -947047990  -47963423   -8101178   28799282 1973822993 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.759e+08  1.556e+07 -11.304   <2e-16 ***
## budget        2.387e+00  5.151e-02  46.348   <2e-16 ***
## vote_average  2.402e+07  2.464e+06   9.748   <2e-16 ***
## popularity    1.670e+06  6.597e+04  25.315   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115300000 on 3224 degrees of freedom
## Multiple R-squared:  0.6176, Adjusted R-squared:  0.6173 
## F-statistic:  1736 on 3 and 3224 DF,  p-value: < 2.2e-16
# Diagnostic plots for Model 2; panels 1, 2, 3 and 5 are the ones plot()
# draws for an lm fit when `which` is not given.
plot(reg1, which = c(1, 2, 3, 5))

revenue=(-1.759e+08)+( 2.387e+00)(budget)+(2.402e+07)(average_votes)+(1.670e+06)(popularity)

All variables in this model are statistically significant, and the multiple R-squared of 0.6176 indicates that it explains about 62% of the variance in revenue, so we treat it as the best-fit model.

Model3- average_votes=b0+b1(duration)+b2(budget)+b3(mainstream)

# Model 3: linear regression of average vote on budget, runtime and a
# mainstream flag. `!is_niche` turns the 0/1 niche code into a logical that is
# TRUE for films with is_niche equal to 0.
reg2 <- lm(
  vote_average ~ budget + runtime + !is_niche,
  data = tmdb
)
summary(reg2)
## 
## Call:
## lm(formula = vote_average ~ budget + runtime + (!is_niche), data = tmdb)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9184 -0.4643  0.0305  0.5145  2.3854 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.978e+00  9.456e-02  52.640  < 2e-16 ***
## budget        -1.865e-09  3.325e-10  -5.609 2.21e-08 ***
## runtime        1.543e-02  7.133e-04  21.637  < 2e-16 ***
## !is_nicheTRUE -3.454e-01  4.405e-02  -7.841 6.04e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7943 on 3224 degrees of freedom
## Multiple R-squared:  0.1742, Adjusted R-squared:  0.1735 
## F-statistic: 226.8 on 3 and 3224 DF,  p-value: < 2.2e-16
# Diagnostic plots for Model 3; panels 1, 2, 3 and 5 are the ones plot()
# draws for an lm fit when `which` is not given.
plot(reg2, which = c(1, 2, 3, 5))

average_votes=(4.978)+(1.543e-02)(runtime)+(-1.865e-09)(budget)+(-3.454e-01)(mainstream)

All variables in this model are statistically significant, but the multiple R-squared of 0.1742 shows that it explains only about 17% of the variance in average votes, so it is a weak fit rather than a good model.

Insights from regression analysis- 1) The revenue of a film depends significantly on its budget, popularity and average votes. 2) The average votes earned by a film depend significantly on its budget, its runtime, and whether its genre is mainstream or niche.