# Work on df1 so that the raw movies_metadata object is left as loaded.
df1 <- movies_metadata
# Make budget numeric; values that cannot be parsed become NA (with a warning).
df1[["budget"]] <- as.numeric(df1[["budget"]])
Warning: NAs introduced by coercion
# Make id numeric; values that cannot be parsed become NA (with a warning).
df1[["id"]] <- as.numeric(df1[["id"]])
Warning: NAs introduced by coercion
# Make popularity numeric; values that cannot be parsed become NA (with a
# warning).
df1[["popularity"]] <- as.numeric(df1[["popularity"]])
Warning: NAs introduced by coercion
# Parse release_date into Date values and make runtime numeric.
df1[["release_date"]] <- as.Date(df1[["release_date"]])
df1[["runtime"]] <- as.numeric(df1[["runtime"]])
library("dplyr")
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# Indicator column: 1 when original_language is "en", 0 otherwise; rows with a
# missing language stay NA.
df1$en <- as.numeric(df1$original_language == "en")
# Remove the columns listed below from df1 in one step; every other column is
# kept in its original order.
df1 <- df1[
  !(names(df1) %in% c(
    "original_language", "overview", "original_title", "imdb_id",
    "homepage", "poster_path", "production_companies",
    "production_countries", "title", "tagline", "belongs_to_collection",
    "genres", "spoken_languages", "adult"
  ))
]
library("visdat")
library("stargazer")
Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
budget id popularity release_date revenue runtime
3 3 6 90 6 263
status video vote_average vote_count en
0 0 6 6 0
# Baseline OLS model: vote_count regressed on every other column of df1.
# The stray trailing comma after the formula, which passed an empty extra
# argument to lm(), has been removed.
reg1 <- lm(formula = vote_count ~ ., data = df1)
# Print the regression table as plain text.
stargazer(reg1, type = "text")
===================================================
Dependent variable:
-----------------------------
vote_count
---------------------------------------------------
budget 0.00000***
(0.00000)
id 0.00005***
(0.00001)
popularity 15.128***
(0.253)
release_date 0.001***
(0.0002)
revenue 0.00000***
(0.00000)
runtime 0.067*
(0.035)
statusCanceled -26.493
(274.283)
statusIn Production -22.036
(73.150)
statusPlanned -45.353
(81.851)
statusPost Production -47.454
(41.847)
statusReleased -14.966
(31.331)
statusRumored -12.583
(36.078)
videoTrue -5.035
(29.109)
vote_average 9.525***
(0.703)
en 11.984***
(2.904)
Constant -63.452**
(31.658)
---------------------------------------------------
Observations 45,130
R2 0.695
Adjusted R2 0.695
Residual Std. Error 272.485 (df = 45114)
F Statistic 6,841.625*** (df = 15; 45114)
===================================================
Note: *p<0.1; **p<0.05; ***p<0.01
# Backward AIC-based elimination starting from reg1; the result is not stored,
# so the selected model is printed after the search trace.
stepAIC(object = reg1, direction = "backward")
Start: AIC=506156.6
vote_count ~ budget + id + popularity + release_date + revenue +
runtime + status + video + vote_average + en
Df Sum of Sq RSS AIC
- status 6 132499 3349769335 506146
- video 1 2222 3349639057 506155
<none> 3349636836 506157
- runtime 1 279979 3349916814 506158
- release_date 1 838575 3350475411 506166
- id 1 1059973 3350696808 506169
- en 1 1264385 3350901220 506172
- vote_average 1 13618308 3363255144 506338
- budget 1 41083493 3390720328 506705
- popularity 1 264561238 3614198074 509585
- revenue 1 1677667423 5027304259 524479
Step: AIC=506146.4
vote_count ~ budget + id + popularity + release_date + revenue +
runtime + video + vote_average + en
Df Sum of Sq RSS AIC
- video 1 2071 3349771406 506144
<none> 3349769335 506146
- runtime 1 278837 3350048173 506148
- release_date 1 830697 3350600032 506156
- id 1 1007894 3350777229 506158
- en 1 1260149 3351029485 506161
- vote_average 1 13597170 3363366505 506327
- budget 1 41091711 3390861046 506695
- popularity 1 264636317 3614405652 509576
- revenue 1 1678002842 5027772177 524471
Step: AIC=506144.4
vote_count ~ budget + id + popularity + release_date + revenue +
runtime + vote_average + en
Df Sum of Sq RSS AIC
<none> 3349771406 506144
- runtime 1 278898 3350050305 506146
- release_date 1 829509 3350600915 506154
- id 1 1006132 3350777539 506156
- en 1 1258811 3351030217 506159
- vote_average 1 13601981 3363373387 506325
- budget 1 41097331 3390868737 506693
- popularity 1 264691776 3614463182 509575
- revenue 1 1678020303 5027791710 524469
Call:
lm(formula = vote_count ~ budget + id + popularity + release_date +
revenue + runtime + vote_average + en, data = df1)
Coefficients:
(Intercept) budget id popularity release_date
-7.819e+01 2.753e-06 4.567e-05 1.513e+01 5.235e-04
revenue runtime vote_average en
4.877e-06 6.687e-02 9.514e+00 1.195e+01
# Fit vote_count on the predictors kept by the backward search (the full model
# without status and video) and print the regression table as plain text.
reg2 <- lm(
  formula = vote_count ~ budget + id + popularity + release_date +
    revenue + runtime + vote_average + en,
  data = df1
)
stargazer(reg2, type = "text")
=================================================
Dependent variable:
-----------------------------
vote_count
-------------------------------------------------
budget 0.00000***
(0.00000)
id 0.00005***
(0.00001)
popularity 15.125***
(0.253)
release_date 0.001***
(0.0002)
revenue 0.00000***
(0.00000)
runtime 0.067*
(0.035)
vote_average 9.514***
(0.703)
en 11.954***
(2.903)
Constant -78.188***
(5.918)
-------------------------------------------------
Observations 45,130
R2 0.695
Adjusted R2 0.695
Residual Std. Error 272.470 (df = 45121)
F Statistic 12,829.300*** (df = 8; 45121)
=================================================
Note: *p<0.1; **p<0.05; ***p<0.01
# Refit reg2 with the same specification as the `Call:` printed by stepAIC and
# print the regression table again.
reg2 <- lm(
  vote_count ~ budget + id + popularity + release_date + revenue +
    runtime + vote_average + en,
  data = df1
)
stargazer(reg2, type = "text")
=================================================
Dependent variable:
-----------------------------
vote_count
-------------------------------------------------
budget 0.00000***
(0.00000)
id 0.00005***
(0.00001)
popularity 15.125***
(0.253)
release_date 0.001***
(0.0002)
revenue 0.00000***
(0.00000)
runtime 0.067*
(0.035)
vote_average 9.514***
(0.703)
en 11.954***
(2.903)
Constant -78.188***
(5.918)
-------------------------------------------------
Observations 45,130
R2 0.695
Adjusted R2 0.695
Residual Std. Error 272.470 (df = 45121)
F Statistic 12,829.300*** (df = 8; 45121)
=================================================
Note: *p<0.1; **p<0.05; ***p<0.01
# Full OLS model: revenue regressed on every other column of df1.
# Fix: the argument was misspelled `formual`, so lm() disregarded it (with a
# warning) and the model was not fitted with revenue as the response. The
# stray trailing comma has been removed as well.
reg3 <- lm(formula = revenue ~ ., data = df1)
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument 'formual' will be disregarded
# Backward AIC-based elimination starting from reg3; the result is not stored,
# so the selected model is printed after the search trace.
# Fix: `discover = c("backwards")` is not a stepAIC() parameter. The search
# direction is set with `direction`, and the valid value is "backward".
stepAIC(reg3, direction = "backward")
Start: AIC=1462560
budget ~ id + popularity + release_date + revenue + runtime +
status + video + vote_average + vote_count + en
Df Sum of Sq RSS AIC
- status 6 1.0643e+15 5.3546e+18 1462557
- video 1 2.2828e+14 5.3538e+18 1462560
<none> 5.3536e+18 1462560
- vote_average 1 2.6794e+15 5.3562e+18 1462580
- popularity 1 1.2413e+16 5.3660e+18 1462662
- runtime 1 2.1555e+16 5.3751e+18 1462739
- en 1 2.9450e+16 5.3830e+18 1462805
- id 1 4.5648e+16 5.3992e+18 1462941
- vote_count 1 6.5662e+16 5.4192e+18 1463108
- release_date 1 6.9331e+16 5.4229e+18 1463138
- revenue 1 1.8412e+18 7.1948e+18 1475897
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument 'formual' will be disregarded
Step: AIC=1462557
budget ~ id + popularity + release_date + revenue + runtime +
video + vote_average + vote_count + en
Df Sum of Sq RSS AIC
- video 1 2.2310e+14 5.3549e+18 1462556
<none> 5.3546e+18 1462557
- vote_average 1 2.5988e+15 5.3572e+18 1462576
- popularity 1 1.2596e+16 5.3672e+18 1462661
- runtime 1 2.1888e+16 5.3765e+18 1462739
- en 1 2.9436e+16 5.3841e+18 1462802
- id 1 4.5726e+16 5.4004e+18 1462938
- vote_count 1 6.5685e+16 5.4203e+18 1463105
- release_date 1 6.8947e+16 5.4236e+18 1463132
- revenue 1 1.8411e+18 7.1957e+18 1475892
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument 'formual' will be disregarded
Step: AIC=1462556
budget ~ id + popularity + release_date + revenue + runtime +
vote_average + vote_count + en
Df Sum of Sq RSS AIC
<none> 5.3549e+18 1462556
- vote_average 1 2.5840e+15 5.3574e+18 1462576
- popularity 1 1.2637e+16 5.3675e+18 1462661
- runtime 1 2.1896e+16 5.3768e+18 1462739
- en 1 2.9362e+16 5.3842e+18 1462801
- id 1 4.5946e+16 5.4008e+18 1462940
- vote_count 1 6.5697e+16 5.4206e+18 1463105
- release_date 1 6.8835e+16 5.4237e+18 1463131
- revenue 1 1.8411e+18 7.1959e+18 1475891
Call:
lm(formula = budget ~ id + popularity + release_date + revenue +
runtime + vote_average + vote_count + en, data = df1, formual = revenue ~
.)
Coefficients:
(Intercept) id popularity release_date revenue
-9.797e+05 -9.720e+00 1.084e+05 1.499e+02 1.707e-01
runtime vote_average vote_count en
1.870e+04 -1.314e+05 4.402e+03 1.821e+06
# Fit budget on the listed predictors and store the model as reg4.
# The misspelled `formual = revenue ~ .` argument that came along with the
# copied call has been dropped: lm() disregards it and only emits a warning,
# so the fitted model is unchanged.
# NOTE(review): the response here is budget, while the dropped argument
# suggests revenue was the intended response - confirm before relying on it.
reg4 <- lm(
  formula = budget ~ id + popularity + release_date + revenue +
    runtime + vote_average + vote_count + en,
  data = df1
)
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
extra argument 'formual' will be disregarded
# Print reg4's regression table as plain text.
stargazer(reg4, type = "text")
================================================
Dependent variable:
----------------------------
budget
------------------------------------------------
id -9.720***
(0.494)
popularity 108,430.100***
(10,507.890)
release_date 149.853***
(6.222)
revenue 0.171***
(0.001)
runtime 18,698.900***
(1,376.642)
vote_average -131,363.200***
(28,152.360)
vote_count 4,401.672***
(187.081)
en 1,820,974.000***
(115,769.900)
Constant -979,653.700***
(237,012.300)
------------------------------------------------
Observations 45,130
R2 0.612
Adjusted R2 0.612
Residual Std. Error 10,893,930.000 (df = 45121)
F Statistic 8,891.270*** (df = 8; 45121)
================================================
Note: *p<0.1; **p<0.05; ***p<0.01