The_Movies_Dataset

Author

joshhong

# Load the raw CSV files of the Movies Dataset from the local archive folder.
# The former `rm(list = ls())` call was removed: it deletes every object in
# the caller's workspace as a side effect, yet does not provide a truly clean
# session (attached packages and options persist).
data_dir <- file.path("~", "Downloads", "archive")

movies_metadata <- read.csv(file.path(data_dir, "movies_metadata.csv"))
credits <- read.csv(file.path(data_dir, "credits.csv"))
keywords <- read.csv(file.path(data_dir, "keywords.csv"))
links_small <- read.csv(file.path(data_dir, "links_small.csv"))
links <- read.csv(file.path(data_dir, "links.csv"))
ratings_small <- read.csv(file.path(data_dir, "ratings_small.csv"))
ratings <- read.csv(file.path(data_dir, "ratings.csv"))
# Work on a copy so the raw import stays available unchanged.
df1 <- movies_metadata
# Coerce budget to numeric; entries that are not numbers become NA
# (hence the coercion warning).
df1[["budget"]] <- as.numeric(df1[["budget"]])
Warning: NAs introduced by coercion
# Coerce id to numeric; non-numeric entries become NA.
df1[["id"]] <- as.numeric(df1[["id"]])
Warning: NAs introduced by coercion
# Coerce popularity to numeric; non-numeric entries become NA.
df1[["popularity"]] <- as.numeric(df1[["popularity"]])
Warning: NAs introduced by coercion
# Parse the release date into a Date and make sure runtime is numeric.
df1[["release_date"]] <- as.Date(df1[["release_date"]])
df1[["runtime"]] <- as.numeric(df1[["runtime"]])

library("dplyr")

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# Indicator column: 1 when the original language is English, 0 otherwise
# (NA where the comparison itself is NA).
is_english <- df1$original_language == "en"
df1$en <- as.numeric(is_english)

# Remove the columns that are not used in the regressions below.
drop_cols <- c(
  "original_language", "overview", "original_title", "imdb_id",
  "homepage", "poster_path", "production_companies",
  "production_countries", "title", "tagline", "belongs_to_collection",
  "genres", "spoken_languages", "adult"
)
# Keep every column whose name is not listed; names that are absent are
# simply ignored, just as assigning NULL to a missing column is a no-op.
df1 <- df1[!(names(df1) %in% drop_cols)]
library("visdat")
library("stargazer")

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
library("MASS")

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
# Number of missing values in each remaining column.
vapply(df1, function(column) sum(is.na(column)), numeric(1))
      budget           id   popularity release_date      revenue      runtime 
           3            3            6           90            6          263 
      status        video vote_average   vote_count           en 
           0            0            6            6            0 
# Visualise column types and missingness for the first 10,000 rows.
# head() is used instead of df1[1:10000, ] so that a shorter data frame is
# not padded with all-NA rows.
vis_dat(head(df1, 10000))

# Full model: vote_count regressed on every other column of df1.
# The stray trailing comma (an empty extra argument) has been removed.
reg1 <- lm(formula = vote_count ~ ., data = df1)

stargazer(reg1, type = "text")

===================================================
                           Dependent variable:     
                      -----------------------------
                               vote_count          
---------------------------------------------------
budget                         0.00000***          
                                (0.00000)          
                                                   
id                             0.00005***          
                                (0.00001)          
                                                   
popularity                      15.128***          
                                 (0.253)           
                                                   
release_date                    0.001***           
                                (0.0002)           
                                                   
revenue                        0.00000***          
                                (0.00000)          
                                                   
runtime                          0.067*            
                                 (0.035)           
                                                   
statusCanceled                   -26.493           
                                (274.283)          
                                                   
statusIn Production              -22.036           
                                (73.150)           
                                                   
statusPlanned                    -45.353           
                                (81.851)           
                                                   
statusPost Production            -47.454           
                                (41.847)           
                                                   
statusReleased                   -14.966           
                                (31.331)           
                                                   
statusRumored                    -12.583           
                                (36.078)           
                                                   
videoTrue                        -5.035            
                                (29.109)           
                                                   
vote_average                    9.525***           
                                 (0.703)           
                                                   
en                              11.984***          
                                 (2.904)           
                                                   
Constant                        -63.452**          
                                (31.658)           
                                                   
---------------------------------------------------
Observations                     45,130            
R2                                0.695            
Adjusted R2                       0.695            
Residual Std. Error       272.485 (df = 45114)     
F Statistic           6,841.625*** (df = 15; 45114)
===================================================
Note:                   *p<0.1; **p<0.05; ***p<0.01
# Backward elimination by AIC, starting from the full vote_count model.
MASS::stepAIC(reg1, direction = "backward")
Start:  AIC=506156.6
vote_count ~ budget + id + popularity + release_date + revenue + 
    runtime + status + video + vote_average + en

               Df  Sum of Sq        RSS    AIC
- status        6     132499 3349769335 506146
- video         1       2222 3349639057 506155
<none>                       3349636836 506157
- runtime       1     279979 3349916814 506158
- release_date  1     838575 3350475411 506166
- id            1    1059973 3350696808 506169
- en            1    1264385 3350901220 506172
- vote_average  1   13618308 3363255144 506338
- budget        1   41083493 3390720328 506705
- popularity    1  264561238 3614198074 509585
- revenue       1 1677667423 5027304259 524479

Step:  AIC=506146.4
vote_count ~ budget + id + popularity + release_date + revenue + 
    runtime + video + vote_average + en

               Df  Sum of Sq        RSS    AIC
- video         1       2071 3349771406 506144
<none>                       3349769335 506146
- runtime       1     278837 3350048173 506148
- release_date  1     830697 3350600032 506156
- id            1    1007894 3350777229 506158
- en            1    1260149 3351029485 506161
- vote_average  1   13597170 3363366505 506327
- budget        1   41091711 3390861046 506695
- popularity    1  264636317 3614405652 509576
- revenue       1 1678002842 5027772177 524471

Step:  AIC=506144.4
vote_count ~ budget + id + popularity + release_date + revenue + 
    runtime + vote_average + en

               Df  Sum of Sq        RSS    AIC
<none>                       3349771406 506144
- runtime       1     278898 3350050305 506146
- release_date  1     829509 3350600915 506154
- id            1    1006132 3350777539 506156
- en            1    1258811 3351030217 506159
- vote_average  1   13601981 3363373387 506325
- budget        1   41097331 3390868737 506693
- popularity    1  264691776 3614463182 509575
- revenue       1 1678020303 5027791710 524469

Call:
lm(formula = vote_count ~ budget + id + popularity + release_date + 
    revenue + runtime + vote_average + en, data = df1)

Coefficients:
 (Intercept)        budget            id    popularity  release_date  
  -7.819e+01     2.753e-06     4.567e-05     1.513e+01     5.235e-04  
     revenue       runtime  vote_average            en  
   4.877e-06     6.687e-02     9.514e+00     1.195e+01  
# Reduced vote_count model: the full model without `status` and `video`,
# as retained by the backward AIC search.
reg2 <- lm(
  formula = vote_count ~ budget + id + popularity + release_date +
    revenue + runtime + vote_average + en,
  data = df1
)

stargazer(reg2, type = "text")

=================================================
                         Dependent variable:     
                    -----------------------------
                             vote_count          
-------------------------------------------------
budget                       0.00000***          
                              (0.00000)          
                                                 
id                           0.00005***          
                              (0.00001)          
                                                 
popularity                    15.125***          
                               (0.253)           
                                                 
release_date                  0.001***           
                              (0.0002)           
                                                 
revenue                      0.00000***          
                              (0.00000)          
                                                 
runtime                        0.067*            
                               (0.035)           
                                                 
vote_average                  9.514***           
                               (0.703)           
                                                 
en                            11.954***          
                               (2.903)           
                                                 
Constant                     -78.188***          
                               (5.918)           
                                                 
-------------------------------------------------
Observations                   45,130            
R2                              0.695            
Adjusted R2                     0.695            
Residual Std. Error     272.470 (df = 45121)     
F Statistic         12,829.300*** (df = 8; 45121)
=================================================
Note:                 *p<0.1; **p<0.05; ***p<0.01
# Same specification as the previous reg2 fit, written in the form printed
# by stepAIC; this overwrites reg2 with an identical model.
reg2 <- lm(
  formula = vote_count ~ budget + id + popularity + release_date +
    revenue + runtime + vote_average + en,
  data = df1
)

stargazer(reg2, type = "text")

=================================================
                         Dependent variable:     
                    -----------------------------
                             vote_count          
-------------------------------------------------
budget                       0.00000***          
                              (0.00000)          
                                                 
id                           0.00005***          
                              (0.00001)          
                                                 
popularity                    15.125***          
                               (0.253)           
                                                 
release_date                  0.001***           
                              (0.0002)           
                                                 
revenue                      0.00000***          
                              (0.00000)          
                                                 
runtime                        0.067*            
                               (0.035)           
                                                 
vote_average                  9.514***           
                               (0.703)           
                                                 
en                            11.954***          
                               (2.903)           
                                                 
Constant                     -78.188***          
                               (5.918)           
                                                 
-------------------------------------------------
Observations                   45,130            
R2                              0.695            
Adjusted R2                     0.695            
Residual Std. Error     272.470 (df = 45121)     
F Statistic         12,829.300*** (df = 8; 45121)
=================================================
Note:                 *p<0.1; **p<0.05; ***p<0.01
# Full model for revenue regressed on every other column of df1.
# The argument was previously misspelt `formual`, so lm() disregarded it
# (see the "extra argument" warning) and, with no formula supplied, took df1
# itself as the model specification -- i.e. it regressed budget, the first
# column, on the rest. The stray trailing comma has also been removed.
# NOTE(review): any output printed after this chunk predates the fix and
# still shows the budget model; re-run to refresh it.
reg3 <- lm(formula = revenue ~ ., data = df1)
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
 extra argument 'formual' will be disregarded
# Backward elimination by AIC starting from reg3.
# `discover` is not a stepAIC() argument and "backwards" is not a valid
# direction; the intended argument is direction = "backward".
stepAIC(reg3, direction = "backward")
Start:  AIC=1462560
budget ~ id + popularity + release_date + revenue + runtime + 
    status + video + vote_average + vote_count + en

               Df  Sum of Sq        RSS     AIC
- status        6 1.0643e+15 5.3546e+18 1462557
- video         1 2.2828e+14 5.3538e+18 1462560
<none>                       5.3536e+18 1462560
- vote_average  1 2.6794e+15 5.3562e+18 1462580
- popularity    1 1.2413e+16 5.3660e+18 1462662
- runtime       1 2.1555e+16 5.3751e+18 1462739
- en            1 2.9450e+16 5.3830e+18 1462805
- id            1 4.5648e+16 5.3992e+18 1462941
- vote_count    1 6.5662e+16 5.4192e+18 1463108
- release_date  1 6.9331e+16 5.4229e+18 1463138
- revenue       1 1.8412e+18 7.1948e+18 1475897
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
 extra argument 'formual' will be disregarded

Step:  AIC=1462557
budget ~ id + popularity + release_date + revenue + runtime + 
    video + vote_average + vote_count + en

               Df  Sum of Sq        RSS     AIC
- video         1 2.2310e+14 5.3549e+18 1462556
<none>                       5.3546e+18 1462557
- vote_average  1 2.5988e+15 5.3572e+18 1462576
- popularity    1 1.2596e+16 5.3672e+18 1462661
- runtime       1 2.1888e+16 5.3765e+18 1462739
- en            1 2.9436e+16 5.3841e+18 1462802
- id            1 4.5726e+16 5.4004e+18 1462938
- vote_count    1 6.5685e+16 5.4203e+18 1463105
- release_date  1 6.8947e+16 5.4236e+18 1463132
- revenue       1 1.8411e+18 7.1957e+18 1475892
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
 extra argument 'formual' will be disregarded

Step:  AIC=1462556
budget ~ id + popularity + release_date + revenue + runtime + 
    vote_average + vote_count + en

               Df  Sum of Sq        RSS     AIC
<none>                       5.3549e+18 1462556
- vote_average  1 2.5840e+15 5.3574e+18 1462576
- popularity    1 1.2637e+16 5.3675e+18 1462661
- runtime       1 2.1896e+16 5.3768e+18 1462739
- en            1 2.9362e+16 5.3842e+18 1462801
- id            1 4.5946e+16 5.4008e+18 1462940
- vote_count    1 6.5697e+16 5.4206e+18 1463105
- release_date  1 6.8835e+16 5.4237e+18 1463131
- revenue       1 1.8411e+18 7.1959e+18 1475891

Call:
lm(formula = budget ~ id + popularity + release_date + revenue + 
    runtime + vote_average + vote_count + en, data = df1, formual = revenue ~ 
    .)

Coefficients:
 (Intercept)            id    popularity  release_date       revenue  
  -9.797e+05    -9.720e+00     1.084e+05     1.499e+02     1.707e-01  
     runtime  vote_average    vote_count            en  
   1.870e+04    -1.314e+05     4.402e+03     1.821e+06  
# Refit the specification printed at the end of the stepwise search above.
# The stray `formual = revenue ~ .` argument, copied verbatim from the
# printed call, has been dropped: lm() disregarded it and only emitted an
# "extra argument" warning, so the fitted model is unchanged.
# NOTE(review): the response here is budget; the discarded argument suggests
# revenue may have been the intended target -- confirm.
reg4 <- lm(
  formula = budget ~ id + popularity + release_date + revenue + runtime +
    vote_average + vote_count + en,
  data = df1
)
Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
 extra argument 'formual' will be disregarded
# Print the reg4 regression summary as a plain-text table.
stargazer(reg4, type = "text")

================================================
                        Dependent variable:     
                    ----------------------------
                               budget           
------------------------------------------------
id                           -9.720***          
                              (0.494)           
                                                
popularity                 108,430.100***       
                            (10,507.890)        
                                                
release_date                 149.853***         
                              (6.222)           
                                                
revenue                       0.171***          
                              (0.001)           
                                                
runtime                    18,698.900***        
                            (1,376.642)         
                                                
vote_average              -131,363.200***       
                            (28,152.360)        
                                                
vote_count                  4,401.672***        
                             (187.081)          
                                                
en                        1,820,974.000***      
                           (115,769.900)        
                                                
Constant                  -979,653.700***       
                           (237,012.300)        
                                                
------------------------------------------------
Observations                   45,130           
R2                             0.612            
Adjusted R2                    0.612            
Residual Std. Error 10,893,930.000 (df = 45121) 
F Statistic         8,891.270*** (df = 8; 45121)
================================================
Note:                *p<0.1; **p<0.05; ***p<0.01