Setup
# Run this report from a fresh R session rather than calling rm(list = ls()):
# clearing the global environment does not detach packages or reset options,
# and it silently discards whatever the person running the script had loaded.
library(visdat)
library(stargazer)
Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
Attaching package: 'MASS'
The following object is masked from 'package:dplyr':
select
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats 1.0.0 ✔ readr 2.1.5
✔ ggplot2 3.5.2 ✔ stringr 1.5.1
✔ lubridate 1.9.4 ✔ tibble 3.3.0
✔ purrr 1.1.0 ✔ tidyr 1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ MASS::select() masks dplyr::select()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Import Data
# Read the raw movie metadata. All cleaning below happens on the copy `df`, so
# `movies_metadata` keeps the data exactly as imported.
movies_metadata <- read.csv("~/Desktop/BC Data Analysis/archive/movies_metadata.csv")

# Companion files from the same archive; not used by the analysis below.
# credits <- read.csv("~/Desktop/BC Data Analysis/archive/credits.csv")
# keywords <- read.csv("~/Desktop/BC Data Analysis/archive/keywords.csv")
# links_small <- read.csv("~/Desktop/BC Data Analysis/archive/links_small.csv")
# links <- read.csv("~/Desktop/BC Data Analysis/archive/links.csv")
# ratings_small <- read.csv("~/Desktop/BC Data Analysis/archive/ratings_small.csv")
# ratings <- read.csv("~/Desktop/BC Data Analysis/archive/ratings.csv")

# Plot column types and missingness for the first 10,000 rows. head() stops at
# the last available row, whereas indexing with 1:10000 would pad a shorter
# table with all-NA rows.
vis_dat(head(movies_metadata, 10000))

df <- movies_metadata

# Coerce budget to numeric; entries that are not numbers become NA, which is
# what triggers the coercion warning.
df$budget <- as.numeric(df$budget)
Warning: NAs introduced by coercion
# Coerce id to numeric; entries that are not numbers become NA (with a warning).
df[["id"]] <- as.numeric(df[["id"]])
Warning: NAs introduced by coercion
# Coerce popularity to numeric; entries that are not numbers become NA (with a
# warning).
df[["popularity"]] <- as.numeric(df[["popularity"]])
Warning: NAs introduced by coercion
# Parse release_date with lubridate's year-month-day parser; strings that do
# not parse become NA (with a warning).
df[["release_date"]] <- ymd(df[["release_date"]])
Warning: 3 failed to parse.
# release_date is already a Date at this point, so the year can be formatted
# directly. Wrapping it in as.Date(..., format = "%d/%m/%Y") had no effect on a
# Date and named a day/month/year layout that the data does not use.
df$release_year <- format(df$release_date, "%Y")

# sort(x = table(df$original_language), decreasing = TRUE)

# Indicator: 1 when the original language is English, 0 otherwise.
df$original_language_english <- if_else(df$original_language == "en", 1, 0)

# Remove columns that are not used in the analysis. Selecting by name leaves
# the order of the remaining columns unchanged and is a no-op for any name that
# is not present.
unused_columns <- c(
  "overview", "original_language", "title", "homepage",
  "belongs_to_collection", "poster_path", "tagline", "status",
  "imdb_id", "id", "original_title"
)
df <- df[!names(df) %in% unused_columns]

# Quick look at whether the first few `video` values are non-missing.
head(!is.na(df$video))
[1] TRUE TRUE TRUE TRUE TRUE TRUE
# Keep only the rows that are usable for the regression. Every condition is
# built from is.na() or %in%, so the mask never contains NA and the combined
# filter selects exactly the rows the one-at-a-time filters would.
keep_row <-
  !is.na(df$video) &
  !is.na(df$popularity) &
  df$adult %in% c("True", "False") &
  # The column is `genres`; `df$genre` only resolved through `$` partial
  # matching, which breaks silently on tibbles or if another column starting
  # with "genre" is ever added.
  !df$genres %in% "[]" &
  !df$production_companies %in% "[]" &
  !df$production_countries %in% "[]" &
  !df$spoken_languages %in% "[]" &
  # budget and revenue are numeric, so compare against the number 0 rather than
  # the string "0". Rows where they are NA are kept here.
  !df$budget %in% 0 &
  !df$revenue %in% 0
df <- df[keep_row, ]

# A table(df$tagline) call used to sit here, but tagline is removed from df
# before this point, so it could only ever return an empty table.

# Alternative that was tried: impute zero budget/revenue with the mean of the
# non-zero values instead of dropping those rows.
# mean_val <- mean(df$budget[df$budget != 0], na.rm = TRUE)
# df$budget[df$budget == 0] <- mean_val
# mean_val <- mean(df$revenue[df$revenue != 0], na.rm = TRUE)
# df$revenue[df$revenue == 0] <- mean_val
Run Regression
# Wider specification that also used the list-valued text columns; kept for
# reference only.
# reg1 <- lm(
#   vote_average ~ budget + genres + popularity + production_countries +
#     release_date + revenue + runtime + spoken_languages + vote_count +
#     release_year + original_language_english,
#   data = df
# )

# Linear model of the average vote on the numeric movie attributes, printed as
# a text table.
reg2 <- lm(
  vote_average ~ budget + popularity + release_date + revenue + runtime +
    vote_count + original_language_english,
  data = df
)
stargazer(reg2, type = "text")
=====================================================
Dependent variable:
---------------------------
vote_average
-----------------------------------------------------
budget -0.000***
(0.000)
popularity 0.002**
(0.001)
release_date -0.00003***
(0.00000)
revenue -0.000
(0.000)
runtime 0.011***
(0.001)
vote_count 0.0003***
(0.00001)
original_language_english -0.355***
(0.036)
Constant 5.642***
(0.075)
-----------------------------------------------------
Observations 5,176
R2 0.303
Adjusted R2 0.302
Residual Std. Error 0.743 (df = 5168)
F Statistic 320.686*** (df = 7; 5168)
=====================================================
Note: *p<0.1; **p<0.05; ***p<0.01
# Backward stepwise selection by AIC, starting from the movie model.
stepAIC(reg2, direction = "backward")
Start: AIC=-3070.23
vote_average ~ budget + popularity + release_date + revenue +
runtime + vote_count + original_language_english
Df Sum of Sq RSS AIC
<none> 2851.3 -3070.2
- revenue 1 1.39 2852.7 -3069.7
- popularity 1 2.94 2854.2 -3066.9
- original_language_english 1 52.22 2903.5 -2978.3
- budget 1 121.50 2972.8 -2856.2
- release_date 1 127.77 2979.1 -2845.3
- runtime 1 265.65 3116.9 -2611.2
- vote_count 1 373.17 3224.5 -2435.6
Call:
lm(formula = vote_average ~ budget + popularity + release_date +
revenue + runtime + vote_count + original_language_english,
data = df)
Coefficients:
(Intercept) budget
5.642e+00 -5.808e-09
popularity release_date
1.954e-03 -2.938e-05
revenue runtime
-1.831e-10 1.104e-02
vote_count original_language_english
3.450e-04 -3.545e-01
StepAIC
# Starting point for the stepwise demo: mpg regressed on every other column of
# mtcars.
reg1 <- lm(mpg ~ ., data = mtcars)
stargazer(reg1, type = "text")
===============================================
Dependent variable:
---------------------------
mpg
-----------------------------------------------
cyl -0.111
(1.045)
disp 0.013
(0.018)
hp -0.021
(0.022)
drat 0.787
(1.635)
wt -3.715*
(1.894)
qsec 0.821
(0.731)
vs 0.318
(2.105)
am 2.520
(2.057)
gear 0.655
(1.493)
carb -0.199
(0.829)
Constant 12.303
(18.718)
-----------------------------------------------
Observations 32
R2 0.869
Adjusted R2 0.807
Residual Std. Error 2.650 (df = 21)
F Statistic 13.932*** (df = 10; 21)
===============================================
Note: *p<0.1; **p<0.05; ***p<0.01
# Attach MASS first so the stepAIC help topic (and the function itself) can be
# found even when this section is run on its own.
library(MASS)
?stepAIC

# Backward stepwise selection by AIC, starting from the full mtcars model.
stepAIC(reg1, direction = "backward")
Start: AIC=70.9
mpg ~ cyl + disp + hp + drat + wt + qsec + vs + am + gear + carb
Df Sum of Sq RSS AIC
- cyl 1 0.0799 147.57 68.915
- vs 1 0.1601 147.66 68.932
- carb 1 0.4067 147.90 68.986
- gear 1 1.3531 148.85 69.190
- drat 1 1.6270 149.12 69.249
- disp 1 3.9167 151.41 69.736
- hp 1 6.8399 154.33 70.348
- qsec 1 8.8641 156.36 70.765
<none> 147.49 70.898
- am 1 10.5467 158.04 71.108
- wt 1 27.0144 174.51 74.280
Step: AIC=68.92
mpg ~ disp + hp + drat + wt + qsec + vs + am + gear + carb
Df Sum of Sq RSS AIC
- vs 1 0.2685 147.84 66.973
- carb 1 0.5201 148.09 67.028
- gear 1 1.8211 149.40 67.308
- drat 1 1.9826 149.56 67.342
- disp 1 3.9009 151.47 67.750
- hp 1 7.3632 154.94 68.473
<none> 147.57 68.915
- qsec 1 10.0933 157.67 69.032
- am 1 11.8359 159.41 69.384
- wt 1 27.0280 174.60 72.297
Step: AIC=66.97
mpg ~ disp + hp + drat + wt + qsec + am + gear + carb
Df Sum of Sq RSS AIC
- carb 1 0.6855 148.53 65.121
- gear 1 2.1437 149.99 65.434
- drat 1 2.2139 150.06 65.449
- disp 1 3.6467 151.49 65.753
- hp 1 7.1060 154.95 66.475
<none> 147.84 66.973
- am 1 11.5694 159.41 67.384
- qsec 1 15.6830 163.53 68.200
- wt 1 27.3799 175.22 70.410
Step: AIC=65.12
mpg ~ disp + hp + drat + wt + qsec + am + gear
Df Sum of Sq RSS AIC
- gear 1 1.565 150.09 63.457
- drat 1 1.932 150.46 63.535
<none> 148.53 65.121
- disp 1 10.110 158.64 65.229
- am 1 12.323 160.85 65.672
- hp 1 14.826 163.35 66.166
- qsec 1 26.408 174.94 68.358
- wt 1 69.127 217.66 75.350
Step: AIC=63.46
mpg ~ disp + hp + drat + wt + qsec + am
Df Sum of Sq RSS AIC
- drat 1 3.345 153.44 62.162
- disp 1 8.545 158.64 63.229
<none> 150.09 63.457
- hp 1 13.285 163.38 64.171
- am 1 20.036 170.13 65.466
- qsec 1 25.574 175.67 66.491
- wt 1 67.572 217.66 73.351
Step: AIC=62.16
mpg ~ disp + hp + wt + qsec + am
Df Sum of Sq RSS AIC
- disp 1 6.629 160.07 61.515
<none> 153.44 62.162
- hp 1 12.572 166.01 62.682
- qsec 1 26.470 179.91 65.255
- am 1 32.198 185.63 66.258
- wt 1 69.043 222.48 72.051
Step: AIC=61.52
mpg ~ hp + wt + qsec + am
Df Sum of Sq RSS AIC
- hp 1 9.219 169.29 61.307
<none> 160.07 61.515
- qsec 1 20.225 180.29 63.323
- am 1 25.993 186.06 64.331
- wt 1 78.494 238.56 72.284
Step: AIC=61.31
mpg ~ wt + qsec + am
Df Sum of Sq RSS AIC
<none> 169.29 61.307
- am 1 26.178 195.46 63.908
- qsec 1 109.034 278.32 75.217
- wt 1 183.347 352.63 82.790
Call:
lm(formula = mpg ~ wt + qsec + am, data = mtcars)
Coefficients:
(Intercept) wt qsec am
9.618 -3.917 1.226 2.936
# Refit the model that the backward search settled on (wt, qsec, am) and show
# it next to the full model. Note that this reuses the name `reg2`, replacing
# the movie regression stored under it earlier.
reg2 <- lm(mpg ~ wt + qsec + am, data = mtcars)
stargazer(reg1, reg2, type = "text")
==================================================================
Dependent variable:
----------------------------------------------
mpg
(1) (2)
------------------------------------------------------------------
cyl -0.111
(1.045)
disp 0.013
(0.018)
hp -0.021
(0.022)
drat 0.787
(1.635)
wt -3.715* -3.917***
(1.894) (0.711)
qsec 0.821 1.226***
(0.731) (0.289)
vs 0.318
(2.105)
am 2.520 2.936**
(2.057) (1.411)
gear 0.655
(1.493)
carb -0.199
(0.829)
Constant 12.303 9.618
(18.718) (6.960)
------------------------------------------------------------------
Observations 32 32
R2 0.869 0.850
Adjusted R2 0.807 0.834
Residual Std. Error 2.650 (df = 21) 2.459 (df = 28)
F Statistic 13.932*** (df = 10; 21) 52.750*** (df = 3; 28)
==================================================================
Note: *p<0.1; **p<0.05; ***p<0.01