homework8_take2

Set Up

remove(list = ls())

library(visdat)
library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(MASS)

Attaching package: 'MASS'
The following object is masked from 'package:dplyr':

    select
library(tidyverse) 
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.2     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.3.0
✔ purrr     1.1.0     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ MASS::select()  masks dplyr::select()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Importing Data

metadata <- read.csv("~/Downloads/movies_dataset/movies_metadata.csv")
vis_dat(metadata[1:10000,])

Summary Stats of Original Data

stargazer(metadata,
          type = "text",
          digits = 1)

===============================================================
Statistic      N        Mean       St. Dev.   Min      Max     
---------------------------------------------------------------
revenue      45,460 11,209,348.0 64,332,247.0  0  2,787,965,087
runtime      45,203     94.1         38.4      0      1,256    
vote_average 45,460     5.6          1.9      0.0     10.0     
vote_count   45,460    109.9        491.3      0     14,075    
---------------------------------------------------------------
summary(metadata)
    adult           belongs_to_collection    budget             genres         
 Length:45466       Length:45466          Length:45466       Length:45466      
 Class :character   Class :character      Class :character   Class :character  
 Mode  :character   Mode  :character      Mode  :character   Mode  :character  
                                                                               
                                                                               
                                                                               
                                                                               
   homepage              id              imdb_id          original_language 
 Length:45466       Length:45466       Length:45466       Length:45466      
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
                                                                            
 original_title       overview          popularity        poster_path       
 Length:45466       Length:45466       Length:45466       Length:45466      
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
                                                                            
 production_companies production_countries release_date      
 Length:45466         Length:45466         Length:45466      
 Class :character     Class :character     Class :character  
 Mode  :character     Mode  :character     Mode  :character  
                                                             
                                                             
                                                             
                                                             
    revenue             runtime        spoken_languages      status         
 Min.   :0.000e+00   Min.   :   0.00   Length:45466       Length:45466      
 1st Qu.:0.000e+00   1st Qu.:  85.00   Class :character   Class :character  
 Median :0.000e+00   Median :  95.00   Mode  :character   Mode  :character  
 Mean   :1.121e+07   Mean   :  94.13                                        
 3rd Qu.:0.000e+00   3rd Qu.: 107.00                                        
 Max.   :2.788e+09   Max.   :1256.00                                        
 NA's   :6           NA's   :263                                            
   tagline             title              video            vote_average   
 Length:45466       Length:45466       Length:45466       Min.   : 0.000  
 Class :character   Class :character   Class :character   1st Qu.: 5.000  
 Mode  :character   Mode  :character   Mode  :character   Median : 6.000  
                                                          Mean   : 5.618  
                                                          3rd Qu.: 6.800  
                                                          Max.   :10.000  
                                                          NA's   :6       
   vote_count     
 Min.   :    0.0  
 1st Qu.:    3.0  
 Median :   10.0  
 Mean   :  109.9  
 3rd Qu.:   34.0  
 Max.   :14075.0  
 NA's   :6        

Attempting to Clean Data

df <- metadata

df = dplyr::select(df, -overview, -title, -tagline, -title, -homepage, -poster_path, -original_title,-spoken_languages, -production_companies, -production_countries, -belongs_to_collection, -genres, -imdb_id, -id)

df$release_date <- ymd(df$release_date)
Warning: 3 failed to parse.
df$release_year <- format(as.Date(df$release_date, format="%d/%m/%Y"),"%Y")

df <- df[df$budget != "0" & df$revenue !="0", ]

df$original_language_english <- if_else(condition = df$original_language == "en",
                                        true = 1,
                                        false = 0)

df$status_type<- if_else(condition = df$status == "Released",
                                        true = 1,
                                        false = 0)

df$video_status <- if_else(condition = df$video == "False",
                                        true = 1,
                                        false = 0)

df$adult_status <- if_else(condition = df$adult == "False",
                                        true = 1,
                                        false = 0)



df$original_language <- NULL
df$video <- NULL
df$adult <- NULL
df$status <- NULL
df$release_date <- NULL

df$budget <- as.numeric(df$budget)
df$vote_count <- as.numeric(df$vote_count)
df$popularity <- as.numeric(df$popularity)
df$release_year <- as.numeric(df$release_year)

df_clean <- na.omit(df)
vis_dat(df_clean)

glimpse(df_clean)
Rows: 5,380
Columns: 11
$ budget                    <dbl> 30000000, 65000000, 16000000, 60000000, 3500…
$ popularity                <dbl> 21.946943, 17.015539, 3.859495, 17.924927, 5…
$ revenue                   <dbl> 373554033, 262797249, 81452156, 187436818, 6…
$ runtime                   <dbl> 81, 104, 127, 170, 106, 130, 106, 192, 119, …
$ vote_average              <dbl> 7.7, 6.9, 6.1, 7.7, 5.5, 6.6, 6.5, 7.1, 5.7,…
$ vote_count                <dbl> 5415, 2413, 34, 1886, 174, 1194, 199, 72, 13…
$ release_year              <dbl> 1995, 1995, 1995, 1995, 1995, 1995, 1995, 19…
$ original_language_english <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ status_type               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ video_status              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ adult_status              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…

Summary of Clean Data

stargazer(df_clean,
          type = "text")

====================================================================================
Statistic                   N        Mean         St. Dev.       Min        Max     
------------------------------------------------------------------------------------
budget                    5,380 31,099,460.000 40,164,901.000     1     380,000,000 
popularity                5,380     9.776          13.819      0.00000    547.488   
revenue                   5,380 90,334,905.000 166,153,145.000    1    2,787,965,087
runtime                   5,380    109.828         22.041         0         338     
vote_average              5,380     6.274           0.925       0.000      9.100    
vote_count                5,380    732.445        1,251.597       0       14,075    
release_year              5,380   1,999.756        15.920       1,915      2,017    
original_language_english 5,380     0.892           0.310         0          1      
status_type               5,380     0.999           0.027         0          1      
video_status              5,380     1.000           0.000         1          1      
adult_status              5,380     1.000           0.000         1          1      
------------------------------------------------------------------------------------

Initial Regression (Kitchen Sink)

reg1 <- lm(formula = vote_average ~ .
   , data = df_clean)

stargazer(reg1, type = "text")

=====================================================
                              Dependent variable:    
                          ---------------------------
                                 vote_average        
-----------------------------------------------------
budget                             -0.000***         
                                    (0.000)          
                                                     
popularity                         0.002***          
                                    (0.001)          
                                                     
revenue                             -0.000           
                                    (0.000)          
                                                     
runtime                            0.011***          
                                    (0.001)          
                                                     
vote_count                         0.0003***         
                                   (0.00001)         
                                                     
release_year                       -0.010***         
                                    (0.001)          
                                                     
original_language_english          -0.303***         
                                    (0.037)          
                                                     
status_type                          0.156           
                                    (0.396)          
                                                     
video_status                                         
                                                     
                                                     
adult_status                                         
                                                     
                                                     
Constant                           25.895***         
                                    (1.537)          
                                                     
-----------------------------------------------------
Observations                         5,380           
R2                                   0.270           
Adjusted R2                          0.269           
Residual Std. Error            0.791 (df = 5371)     
F Statistic                247.973*** (df = 8; 5371) 
=====================================================
Note:                     *p<0.1; **p<0.05; ***p<0.01

Backwards Selection

stepAIC(object = reg1, 
        direction = c("backward")
      )
Start:  AIC=-2509.74
vote_average ~ budget + popularity + revenue + runtime + vote_count + 
    release_year + original_language_english + status_type + 
    video_status + adult_status


Step:  AIC=-2509.74
vote_average ~ budget + popularity + revenue + runtime + vote_count + 
    release_year + original_language_english + status_type + 
    video_status


Step:  AIC=-2509.74
vote_average ~ budget + popularity + revenue + runtime + vote_count + 
    release_year + original_language_english + status_type

                            Df Sum of Sq    RSS     AIC
- status_type                1      0.10 3363.1 -2511.6
<none>                                   3363.0 -2509.7
- revenue                    1      1.40 3364.5 -2509.5
- popularity                 1      4.62 3367.7 -2504.4
- original_language_english  1     41.32 3404.4 -2446.1
- budget                     1    123.02 3486.1 -2318.4
- release_year               1    125.70 3488.7 -2314.3
- runtime                    1    260.59 3623.6 -2110.2
- vote_count                 1    381.13 3744.2 -1934.2

Step:  AIC=-2511.59
vote_average ~ budget + popularity + revenue + runtime + vote_count + 
    release_year + original_language_english

                            Df Sum of Sq    RSS     AIC
<none>                                   3363.1 -2511.6
- revenue                    1      1.41 3364.6 -2511.3
- popularity                 1      4.63 3367.8 -2506.2
- original_language_english  1     41.29 3404.4 -2447.9
- budget                     1    122.95 3486.1 -2320.4
- release_year               1    125.85 3489.0 -2315.9
- runtime                    1    260.72 3623.9 -2111.9
- vote_count                 1    381.18 3744.3 -1936.0

Call:
lm(formula = vote_average ~ budget + popularity + revenue + runtime + 
    vote_count + release_year + original_language_english, data = df_clean)

Coefficients:
              (Intercept)                     budget  
                2.606e+01                 -5.803e-09  
               popularity                    revenue  
                2.446e-03                 -1.840e-10  
                  runtime                 vote_count  
                1.057e-02                  3.482e-04  
             release_year  original_language_english  
               -1.038e-02                 -3.032e-01  

Optimized Regression Using AIC

reg2 <- lm(formula = vote_average ~ budget + popularity + revenue + runtime + 
    vote_count + release_year + original_language_english, data = df_clean)

stargazer(reg2, type = "text")

=====================================================
                              Dependent variable:    
                          ---------------------------
                                 vote_average        
-----------------------------------------------------
budget                             -0.000***         
                                    (0.000)          
                                                     
popularity                         0.002***          
                                    (0.001)          
                                                     
revenue                             -0.000           
                                    (0.000)          
                                                     
runtime                            0.011***          
                                    (0.001)          
                                                     
vote_count                         0.0003***         
                                   (0.00001)         
                                                     
release_year                       -0.010***         
                                    (0.001)          
                                                     
original_language_english          -0.303***         
                                    (0.037)          
                                                     
Constant                           26.060***         
                                    (1.479)          
                                                     
-----------------------------------------------------
Observations                         5,380           
R2                                   0.270           
Adjusted R2                          0.269           
Residual Std. Error            0.791 (df = 5372)     
F Statistic                283.420*** (df = 7; 5372) 
=====================================================
Note:                     *p<0.1; **p<0.05; ***p<0.01