## Loading required package: stringr
## Loading required package: ggplot2
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
summary(m4 <- lm(Audience..score ~Worldwide.Gross + Budget + Profitability+Budget:Profitability, Hollywood))
##
## Call:
## lm(formula = Audience..score ~ Worldwide.Gross + Budget + Profitability +
## Budget:Profitability, data = Hollywood)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.797 -11.070 -0.781 10.280 37.777
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.593e+01 1.245e+00 44.929 < 2e-16 ***
## Worldwide.Gross -1.599e-03 1.105e-02 -0.145 0.8850
## Budget -7.625e-02 3.353e-02 -2.274 0.0235 *
## Profitability -1.165e-03 1.810e-04 -6.434 3.65e-10 ***
## Budget:Profitability 5.824e-04 9.039e-05 6.443 3.45e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.37 on 390 degrees of freedom
## Multiple R-squared: 0.1375, Adjusted R-squared: 0.1287
## F-statistic: 15.55 on 4 and 390 DF, p-value: 8.22e-12
Модель объясняет всего 13% всех случаев. Добавим оценку кинокритиков.
summary(m2 <- lm(Audience..score ~Rotten.Tomatoes + Number.of.Theatres.in.US.Opening.Weekend + Box.Office.Average.per.US.Cinema..Opening.Weekend. + Worldwide.Gross + Budget + Profitability + Opening.Weekend, Hollywood))
##
## Call:
## lm(formula = Audience..score ~ Rotten.Tomatoes + Number.of.Theatres.in.US.Opening.Weekend +
## Box.Office.Average.per.US.Cinema..Opening.Weekend. + Worldwide.Gross +
## Budget + Profitability + Opening.Weekend, data = Hollywood)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.209 -7.002 0.608 7.081 30.670
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 3.395e+01 1.719e+00
## Rotten.Tomatoes 4.438e-01 2.195e-02
## Number.of.Theatres.in.US.Opening.Weekend 3.494e-05 6.096e-04
## Box.Office.Average.per.US.Cinema..Opening.Weekend. -1.503e-05 1.032e-04
## Worldwide.Gross 1.557e-02 7.716e-03
## Budget 1.554e-02 2.460e-02
## Profitability -1.206e-05 8.791e-06
## Opening.Weekend 1.677e-02 1.769e-02
## t value Pr(>|t|)
## (Intercept) 19.750 <2e-16 ***
## Rotten.Tomatoes 20.220 <2e-16 ***
## Number.of.Theatres.in.US.Opening.Weekend 0.057 0.9543
## Box.Office.Average.per.US.Cinema..Opening.Weekend. -0.146 0.8842
## Worldwide.Gross 2.018 0.0442 *
## Budget 0.632 0.5280
## Profitability -1.372 0.1709
## Opening.Weekend 0.948 0.3439
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.27 on 387 degrees of freedom
## Multiple R-squared: 0.5398, Adjusted R-squared: 0.5315
## F-statistic: 64.85 on 7 and 387 DF, p-value: < 2.2e-16
anova(m4,m2)
## Analysis of Variance Table
##
## Model 1: Audience..score ~ Worldwide.Gross + Budget + Profitability +
## Budget:Profitability
## Model 2: Audience..score ~ Rotten.Tomatoes + Number.of.Theatres.in.US.Opening.Weekend +
## Box.Office.Average.per.US.Cinema..Opening.Weekend. + Worldwide.Gross +
## Budget + Profitability + Opening.Weekend
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 390 92103
## 2 387 49145 3 42958 112.76 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Модель значительно улучшается, RSS уменьшается вдвое.
Hollywood <- Hollywood[-244,]
Hollywood <- Hollywood[-287,]
vif(m2)
## Rotten.Tomatoes
## 1.051541
## Number.of.Theatres.in.US.Opening.Weekend
## 2.122956
## Box.Office.Average.per.US.Cinema..Opening.Weekend.
## 1.377036
## Worldwide.Gross
## 1.996827
## Budget
## 1.802883
## Profitability
## 1.008242
## Opening.Weekend
## 1.325678
plot(m2)
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
Убераем незначимые переменные:
summary(m3 <- lm(Audience..score ~Rotten.Tomatoes + Worldwide.Gross + Budget + Profitability, Hollywood))
##
## Call:
## lm(formula = Audience..score ~ Rotten.Tomatoes + Worldwide.Gross +
## Budget + Profitability, data = Hollywood)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.020 -6.912 0.278 6.891 30.172
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.486e+01 1.382e+00 25.224 <2e-16 ***
## Rotten.Tomatoes 4.436e-01 2.174e-02 20.402 <2e-16 ***
## Worldwide.Gross 1.559e-02 7.000e-03 2.227 0.0265 *
## Budget 1.350e-02 2.331e-02 0.579 0.5628
## Profitability -1.226e-05 8.776e-06 -1.396 0.1634
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.25 on 388 degrees of freedom
## Multiple R-squared: 0.5395, Adjusted R-squared: 0.5347
## F-statistic: 113.6 on 4 and 388 DF, p-value: < 2.2e-16
Пытаемся улучшить модель добовлением интерактивного эффекта:
summary(m3 <- lm(Audience..score ~Rotten.Tomatoes + Worldwide.Gross + Budget + Profitability+Budget:Profitability, Hollywood))
##
## Call:
## lm(formula = Audience..score ~ Rotten.Tomatoes + Worldwide.Gross +
## Budget + Profitability + Budget:Profitability, data = Hollywood)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.428 -6.566 0.357 6.833 30.126
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.583e+01 1.353e+00 26.484 < 2e-16 ***
## Rotten.Tomatoes 4.222e-01 2.149e-02 19.650 < 2e-16 ***
## Worldwide.Gross -4.618e-03 7.853e-03 -0.588 0.557
## Budget -2.676e-02 2.393e-02 -1.118 0.264
## Profitability -6.792e-04 1.309e-04 -5.189 3.42e-07 ***
## Budget:Profitability 3.341e-04 6.543e-05 5.106 5.16e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.9 on 387 degrees of freedom
## Multiple R-squared: 0.5685, Adjusted R-squared: 0.5629
## F-statistic: 102 on 5 and 387 DF, p-value: < 2.2e-16
vif(m3)
## Rotten.Tomatoes Worldwide.Gross Budget
## 1.066018 2.206408 1.821598
## Profitability Budget:Profitability
## 238.724249 238.964351
AIC(m2, m3)
## Warning in AIC.default(m2, m3): models are not all fitted to the same
## number of observations
## df AIC
## m2 9 3044.300
## m3 7 3001.084
Не смотря на то, что интерактивный эффект значимый, значительно улучшить модель не удалось, RSS уменьшается на 3к, но забирает 2 степени свободы.
ggplot(hollywood, aes(Rotten.Tomatoes, Audience..score)) + stat_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
ggplot(filter(Hollywood, Profitability <2000), aes(Rotten.Tomatoes, Audience..score, color=Profitability)) + stat_smooth(method="lm") + geom_point(shape = 19)
Практически этот же график, только с доходностью фильма, показанной цветом + выбраны только фильмы с доходностью меньше 2к (минус 11 фильмов).
summary(m11<-lm(Worldwide.Gross~Budget+Rotten.Tomatoes, Hollywood))
##
## Call:
## lm(formula = Worldwide.Gross ~ Budget + Rotten.Tomatoes, data = Hollywood)
##
## Residuals:
## Min 1Q Median 3Q Max
## -281.46 -46.78 -2.24 33.84 478.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.6212 9.9979 -0.262 0.79332
## Budget 2.0526 0.1325 15.492 < 2e-16 ***
## Rotten.Tomatoes 0.4216 0.1555 2.711 0.00701 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 81.39 on 390 degrees of freedom
## Multiple R-squared: 0.3925, Adjusted R-squared: 0.3894
## F-statistic: 126 on 2 and 390 DF, p-value: < 2.2e-16
summary(m1<-lm(Worldwide.Gross~Domestic.Gross+Foreign.Gross+Budget+Rotten.Tomatoes, Hollywood))
##
## Call:
## lm(formula = Worldwide.Gross ~ Domestic.Gross + Foreign.Gross +
## Budget + Rotten.Tomatoes, data = Hollywood)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.63 -18.27 13.56 34.07 195.88
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -30.28648 7.73591 -3.915 0.000107 ***
## Domestic.Gross 0.27737 0.07676 3.614 0.000342 ***
## Foreign.Gross 0.85555 0.05451 15.697 < 2e-16 ***
## Budget 0.85907 0.11313 7.594 2.35e-13 ***
## Rotten.Tomatoes 0.20844 0.11184 1.864 0.063123 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 58.23 on 388 degrees of freedom
## Multiple R-squared: 0.6906, Adjusted R-squared: 0.6875
## F-statistic: 216.6 on 4 and 388 DF, p-value: < 2.2e-16
vif(m1)
## Domestic.Gross Foreign.Gross Budget Rotten.Tomatoes
## 1.414092 1.729206 1.427558 1.012668
plot(m1)