movies

Setup

# Packages for data inspection (visdat) and regression tables (stargazer).
# The workspace is deliberately not wiped with rm(list = ls()): that call only
# removes objects, leaves attached packages and options untouched, and so gives
# a false impression of a clean session. Restart R for a truly fresh start.
library(visdat)
library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.2     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.3.0
✔ purrr     1.1.0     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ MASS::select()  masks dplyr::select()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Import Data

# Load the movie metadata table that the cleaning and regression steps use.
movies_metadata <- read.csv("~/Desktop/BC Data Analysis/archive/movies_metadata.csv")

# Companion tables from the same archive; left disabled because nothing in the
# visible analysis reads them.
# credits <- read.csv("~/Desktop/BC Data Analysis/archive/credits.csv")
# keywords <- read.csv("~/Desktop/BC Data Analysis/archive/keywords.csv")
# links_small <- read.csv("~/Desktop/BC Data Analysis/archive/links_small.csv")
# links <- read.csv("~/Desktop/BC Data Analysis/archive/links.csv")
# ratings_small <- read.csv("~/Desktop/BC Data Analysis/archive/ratings_small.csv")
# ratings <- read.csv("~/Desktop/BC Data Analysis/archive/ratings.csv")

# Plot column types and missingness for at most the first 10,000 rows.
# head() never indexes past the end of the table, whereas [1:10000, ] would
# pad a shorter table with all-NA rows.
vis_dat(head(movies_metadata, 10000))

# Work on a copy so the raw import stays available for inspection.
df <- movies_metadata
# Coerce budget to numeric; entries that are not numbers become NA (with a
# coercion warning).
df[["budget"]] <- as.numeric(df[["budget"]])
Warning: NAs introduced by coercion
# Coerce id to numeric; non-numeric entries become NA (with a coercion warning).
df[["id"]] <- as.numeric(df[["id"]])
Warning: NAs introduced by coercion
# Coerce popularity to numeric; non-numeric entries become NA (with a coercion
# warning) and are filtered out later via is.na(df$popularity).
df[["popularity"]] <- as.numeric(df[["popularity"]])
Warning: NAs introduced by coercion
# Parse release_date as year-month-day into a Date; unparseable entries
# become NA and are reported in a warning.
df[["release_date"]] <- ymd(df[["release_date"]])
Warning: 3 failed to parse.
# Release year as a four-digit character string (NA where the date is NA).
# release_date has already been parsed into a Date by ymd(), so it is formatted
# directly; wrapping it in as.Date(..., format = "%d/%m/%Y") has no effect on
# Date input and only suggests a day/month/year layout the data does not have.
df$release_year <- format(df$release_date, "%Y")

#sort(x = table(df$original_language), decreasing = T)

# Indicator for English-language originals: 1 if "en", 0 otherwise
# (NA where original_language is NA).
df$original_language_english <- if_else(
  condition = df$original_language == "en",
  true = 1,
  false = 0
)

# Drop free-text, identifier and URL-like columns that are not used as
# regressors. original_language is dropped because the indicator above
# replaces it.
unused_cols <- c(
  "overview", "original_language", "title", "homepage",
  "belongs_to_collection", "poster_path", "tagline", "status",
  "imdb_id", "id", "original_title"
)
df <- df[, !(names(df) %in% unused_cols), drop = FALSE]

# Quick look at the missingness mask for video before filtering on it.
head(!is.na(x = df$video))
[1] TRUE TRUE TRUE TRUE TRUE TRUE
# Keep only rows with usable values in the fields the analysis relies on.
# is.na() and %in% never return NA, so every mask below is strictly
# TRUE/FALSE and no all-NA rows are introduced by the subsetting.
df <- df[!is.na(df$video), ]
df <- df[!is.na(df$popularity), ]
df <- df[df$adult %in% c("True", "False"), ]

# Remove rows whose list-like text fields are the empty list "[]".
# The column is spelled out as `genres`: `df$genre` only resolves through
# partial matching of `$`, which breaks silently if a tibble is used or another
# column starting with "genre" is ever added.
df <- df[!(df$genres %in% "[]"), ]
df <- df[!(df$production_companies %in% "[]"), ]
df <- df[!(df$production_countries %in% "[]"), ]
df <- df[!(df$spoken_languages %in% "[]"), ]

# Remove rows where budget or revenue is recorded as zero. The comparison uses
# a numeric 0 rather than the string "0". Rows where the value is NA are not
# matched by %in% and therefore stay in df.
df <- df[!(df$budget %in% 0), ]
df <- df[!(df$revenue %in% 0), ]

# NOTE(review): tagline was removed from df earlier in the cleaning step, so
# this call tabulates NULL and prints an empty table; drop it or run it before
# the column is deleted.
table(df$tagline)
< table of extent 0 >
# mean_val <- mean(df$budget[df$budget != 0], na.rm = TRUE)
# df$budget[df$budget == 0] <- mean_val
# mean_val <- mean(df$revenue[df$revenue != 0], na.rm = TRUE)
# df$revenue[df$revenue == 0] <- mean_val

Run Regression

# Wider specification, kept disabled for reference:
# reg1 <- lm(
#   formula = vote_average ~ budget + genres + popularity +
#     production_countries + release_date + revenue + runtime +
#     spoken_languages + vote_count + release_year +
#     original_language_english,
#   data = df
# )

# Linear model of the average vote on budget, popularity, release date,
# revenue, runtime, vote count and the English-language indicator.
reg2 <- lm(
  formula = vote_average ~ budget + popularity + release_date + revenue +
    runtime + vote_count + original_language_english,
  data = df
)

# Print the fitted model as a plain-text regression table.
stargazer(reg2, type = "text")

=====================================================
                              Dependent variable:    
                          ---------------------------
                                 vote_average        
-----------------------------------------------------
budget                             -0.000***         
                                    (0.000)          
                                                     
popularity                          0.002**          
                                    (0.001)          
                                                     
release_date                      -0.00003***        
                                   (0.00000)         
                                                     
revenue                             -0.000           
                                    (0.000)          
                                                     
runtime                            0.011***          
                                    (0.001)          
                                                     
vote_count                         0.0003***         
                                   (0.00001)         
                                                     
original_language_english          -0.355***         
                                    (0.036)          
                                                     
Constant                           5.642***          
                                    (0.075)          
                                                     
-----------------------------------------------------
Observations                         5,176           
R2                                   0.303           
Adjusted R2                          0.302           
Residual Std. Error            0.743 (df = 5168)     
F Statistic                320.686*** (df = 7; 5168) 
=====================================================
Note:                     *p<0.1; **p<0.05; ***p<0.01
# Backward elimination by AIC starting from the movie model; prints the search
# trace followed by the selected model.
stepAIC(reg2, direction = "backward")
Start:  AIC=-3070.23
vote_average ~ budget + popularity + release_date + revenue + 
    runtime + vote_count + original_language_english

                            Df Sum of Sq    RSS     AIC
<none>                                   2851.3 -3070.2
- revenue                    1      1.39 2852.7 -3069.7
- popularity                 1      2.94 2854.2 -3066.9
- original_language_english  1     52.22 2903.5 -2978.3
- budget                     1    121.50 2972.8 -2856.2
- release_date               1    127.77 2979.1 -2845.3
- runtime                    1    265.65 3116.9 -2611.2
- vote_count                 1    373.17 3224.5 -2435.6

Call:
lm(formula = vote_average ~ budget + popularity + release_date + 
    revenue + runtime + vote_count + original_language_english, 
    data = df)

Coefficients:
              (Intercept)                     budget  
                5.642e+00                 -5.808e-09  
               popularity               release_date  
                1.954e-03                 -2.938e-05  
                  revenue                    runtime  
               -1.831e-10                  1.104e-02  
               vote_count  original_language_english  
                3.450e-04                 -3.545e-01  

StepAIC

# Worked example on the built-in mtcars data: inspect column types and
# missingness first.
vis_dat(mtcars)

# Full model: mpg regressed on every other column of mtcars.
reg1 <- lm(mpg ~ ., data = mtcars)

# Plain-text regression table for the full model.
stargazer(reg1, type = "text")

===============================================
                        Dependent variable:    
                    ---------------------------
                                mpg            
-----------------------------------------------
cyl                           -0.111           
                              (1.045)          
                                               
disp                           0.013           
                              (0.018)          
                                               
hp                            -0.021           
                              (0.022)          
                                               
drat                           0.787           
                              (1.635)          
                                               
wt                            -3.715*          
                              (1.894)          
                                               
qsec                           0.821           
                              (0.731)          
                                               
vs                             0.318           
                              (2.105)          
                                               
am                             2.520           
                              (2.057)          
                                               
gear                           0.655           
                              (1.493)          
                                               
carb                          -0.199           
                              (0.829)          
                                               
Constant                      12.303           
                             (18.718)          
                                               
-----------------------------------------------
Observations                    32             
R2                             0.869           
Adjusted R2                    0.807           
Residual Std. Error       2.650 (df = 21)      
F Statistic           13.932*** (df = 10; 21)  
===============================================
Note:               *p<0.1; **p<0.05; ***p<0.01
# Backward elimination by AIC, starting from the full mtcars model (reg1).
# Prints one table per elimination step and then the selected model.
# The function is namespace-qualified, so no interactive help lookup or second
# library(MASS) call is needed here; run ?MASS::stepAIC at the console for the
# documentation.
MASS::stepAIC(reg1, direction = "backward")
Start:  AIC=70.9
mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb

       Df Sum of Sq    RSS    AIC
- cyl   1    0.0799 147.57 68.915
- vs    1    0.1601 147.66 68.932
- carb  1    0.4067 147.90 68.986
- gear  1    1.3531 148.85 69.190
- drat  1    1.6270 149.12 69.249
- disp  1    3.9167 151.41 69.736
- hp    1    6.8399 154.33 70.348
- qsec  1    8.8641 156.36 70.765
<none>              147.49 70.898
- am    1   10.5467 158.04 71.108
- wt    1   27.0144 174.51 74.280

Step:  AIC=68.92
mpg ~ disp + hp + drat + wt + qsec + vs + am + gear + carb

       Df Sum of Sq    RSS    AIC
- vs    1    0.2685 147.84 66.973
- carb  1    0.5201 148.09 67.028
- gear  1    1.8211 149.40 67.308
- drat  1    1.9826 149.56 67.342
- disp  1    3.9009 151.47 67.750
- hp    1    7.3632 154.94 68.473
<none>              147.57 68.915
- qsec  1   10.0933 157.67 69.032
- am    1   11.8359 159.41 69.384
- wt    1   27.0280 174.60 72.297

Step:  AIC=66.97
mpg ~ disp + hp + drat + wt + qsec + am + gear + carb

       Df Sum of Sq    RSS    AIC
- carb  1    0.6855 148.53 65.121
- gear  1    2.1437 149.99 65.434
- drat  1    2.2139 150.06 65.449
- disp  1    3.6467 151.49 65.753
- hp    1    7.1060 154.95 66.475
<none>              147.84 66.973
- am    1   11.5694 159.41 67.384
- qsec  1   15.6830 163.53 68.200
- wt    1   27.3799 175.22 70.410

Step:  AIC=65.12
mpg ~ disp + hp + drat + wt + qsec + am + gear

       Df Sum of Sq    RSS    AIC
- gear  1     1.565 150.09 63.457
- drat  1     1.932 150.46 63.535
<none>              148.53 65.121
- disp  1    10.110 158.64 65.229
- am    1    12.323 160.85 65.672
- hp    1    14.826 163.35 66.166
- qsec  1    26.408 174.94 68.358
- wt    1    69.127 217.66 75.350

Step:  AIC=63.46
mpg ~ disp + hp + drat + wt + qsec + am

       Df Sum of Sq    RSS    AIC
- drat  1     3.345 153.44 62.162
- disp  1     8.545 158.64 63.229
<none>              150.09 63.457
- hp    1    13.285 163.38 64.171
- am    1    20.036 170.13 65.466
- qsec  1    25.574 175.67 66.491
- wt    1    67.572 217.66 73.351

Step:  AIC=62.16
mpg ~ disp + hp + wt + qsec + am

       Df Sum of Sq    RSS    AIC
- disp  1     6.629 160.07 61.515
<none>              153.44 62.162
- hp    1    12.572 166.01 62.682
- qsec  1    26.470 179.91 65.255
- am    1    32.198 185.63 66.258
- wt    1    69.043 222.48 72.051

Step:  AIC=61.52
mpg ~ hp + wt + qsec + am

       Df Sum of Sq    RSS    AIC
- hp    1     9.219 169.29 61.307
<none>              160.07 61.515
- qsec  1    20.225 180.29 63.323
- am    1    25.993 186.06 64.331
- wt    1    78.494 238.56 72.284

Step:  AIC=61.31
mpg ~ wt + qsec + am

       Df Sum of Sq    RSS    AIC
<none>              169.29 61.307
- am    1    26.178 195.46 63.908
- qsec  1   109.034 278.32 75.217
- wt    1   183.347 352.63 82.790

Call:
lm(formula = mpg ~ wt + qsec + am, data = mtcars)

Coefficients:
(Intercept)           wt         qsec           am  
      9.618       -3.917        1.226        2.936  
# Refit the specification that the backward search settled on
# (mpg ~ wt + qsec + am). Note that this reuses the name reg2, replacing the
# movie model fitted earlier.
reg2 <- lm(mpg ~ wt + qsec + am, data = mtcars)

# Side-by-side text table: full model (1) versus selected model (2).
stargazer(reg1, reg2, type = "text")

==================================================================
                                 Dependent variable:              
                    ----------------------------------------------
                                         mpg                      
                              (1)                    (2)          
------------------------------------------------------------------
cyl                         -0.111                                
                            (1.045)                               
                                                                  
disp                         0.013                                
                            (0.018)                               
                                                                  
hp                          -0.021                                
                            (0.022)                               
                                                                  
drat                         0.787                                
                            (1.635)                               
                                                                  
wt                          -3.715*               -3.917***       
                            (1.894)                (0.711)        
                                                                  
qsec                         0.821                 1.226***       
                            (0.731)                (0.289)        
                                                                  
vs                           0.318                                
                            (2.105)                               
                                                                  
am                           2.520                 2.936**        
                            (2.057)                (1.411)        
                                                                  
gear                         0.655                                
                            (1.493)                               
                                                                  
carb                        -0.199                                
                            (0.829)                               
                                                                  
Constant                    12.303                  9.618         
                           (18.718)                (6.960)        
                                                                  
------------------------------------------------------------------
Observations                  32                      32          
R2                           0.869                  0.850         
Adjusted R2                  0.807                  0.834         
Residual Std. Error     2.650 (df = 21)        2.459 (df = 28)    
F Statistic         13.932*** (df = 10; 21) 52.750*** (df = 3; 28)
==================================================================
Note:                                  *p<0.1; **p<0.05; ***p<0.01