## Loading required package: stringr
## Loading required package: ggplot2
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Предсказываем оценку зрителей только по бюджету и кассовым сборам

summary(m4 <- lm(Audience..score ~Worldwide.Gross +  Budget + Profitability+Budget:Profitability, Hollywood))
## 
## Call:
## lm(formula = Audience..score ~ Worldwide.Gross + Budget + Profitability + 
##     Budget:Profitability, data = Hollywood)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.797 -11.070  -0.781  10.280  37.777 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.593e+01  1.245e+00  44.929  < 2e-16 ***
## Worldwide.Gross      -1.599e-03  1.105e-02  -0.145   0.8850    
## Budget               -7.625e-02  3.353e-02  -2.274   0.0235 *  
## Profitability        -1.165e-03  1.810e-04  -6.434 3.65e-10 ***
## Budget:Profitability  5.824e-04  9.039e-05   6.443 3.45e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.37 on 390 degrees of freedom
## Multiple R-squared:  0.1375, Adjusted R-squared:  0.1287 
## F-statistic: 15.55 on 4 and 390 DF,  p-value: 8.22e-12

Модель объясняет всего 13% всех случаев. Добавим оценку кинокритиков.

summary(m2 <- lm(Audience..score ~Rotten.Tomatoes  +  Number.of.Theatres.in.US.Opening.Weekend  + Box.Office.Average.per.US.Cinema..Opening.Weekend.    +   Worldwide.Gross +   Budget + Profitability + Opening.Weekend, Hollywood))
## 
## Call:
## lm(formula = Audience..score ~ Rotten.Tomatoes + Number.of.Theatres.in.US.Opening.Weekend + 
##     Box.Office.Average.per.US.Cinema..Opening.Weekend. + Worldwide.Gross + 
##     Budget + Profitability + Opening.Weekend, data = Hollywood)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.209  -7.002   0.608   7.081  30.670 
## 
## Coefficients:
##                                                      Estimate Std. Error
## (Intercept)                                         3.395e+01  1.719e+00
## Rotten.Tomatoes                                     4.438e-01  2.195e-02
## Number.of.Theatres.in.US.Opening.Weekend            3.494e-05  6.096e-04
## Box.Office.Average.per.US.Cinema..Opening.Weekend. -1.503e-05  1.032e-04
## Worldwide.Gross                                     1.557e-02  7.716e-03
## Budget                                              1.554e-02  2.460e-02
## Profitability                                      -1.206e-05  8.791e-06
## Opening.Weekend                                     1.677e-02  1.769e-02
##                                                    t value Pr(>|t|)    
## (Intercept)                                         19.750   <2e-16 ***
## Rotten.Tomatoes                                     20.220   <2e-16 ***
## Number.of.Theatres.in.US.Opening.Weekend             0.057   0.9543    
## Box.Office.Average.per.US.Cinema..Opening.Weekend.  -0.146   0.8842    
## Worldwide.Gross                                      2.018   0.0442 *  
## Budget                                               0.632   0.5280    
## Profitability                                       -1.372   0.1709    
## Opening.Weekend                                      0.948   0.3439    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.27 on 387 degrees of freedom
## Multiple R-squared:  0.5398, Adjusted R-squared:  0.5315 
## F-statistic: 64.85 on 7 and 387 DF,  p-value: < 2.2e-16
anova(m4,m2)
## Analysis of Variance Table
## 
## Model 1: Audience..score ~ Worldwide.Gross + Budget + Profitability + 
##     Budget:Profitability
## Model 2: Audience..score ~ Rotten.Tomatoes + Number.of.Theatres.in.US.Opening.Weekend + 
##     Box.Office.Average.per.US.Cinema..Opening.Weekend. + Worldwide.Gross + 
##     Budget + Profitability + Opening.Weekend
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1    390 92103                                  
## 2    387 49145  3     42958 112.76 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Модель значительно улучшается, RSS уменьшается вдвое.

Hollywood <- Hollywood[-244,]
Hollywood <- Hollywood[-287,]
vif(m2)
##                                    Rotten.Tomatoes 
##                                           1.051541 
##           Number.of.Theatres.in.US.Opening.Weekend 
##                                           2.122956 
## Box.Office.Average.per.US.Cinema..Opening.Weekend. 
##                                           1.377036 
##                                    Worldwide.Gross 
##                                           1.996827 
##                                             Budget 
##                                           1.802883 
##                                      Profitability 
##                                           1.008242 
##                                    Opening.Weekend 
##                                           1.325678
plot(m2)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Убераем незначимые переменные:

summary(m3 <- lm(Audience..score ~Rotten.Tomatoes   +   Worldwide.Gross +   Budget + Profitability, Hollywood))
## 
## Call:
## lm(formula = Audience..score ~ Rotten.Tomatoes + Worldwide.Gross + 
##     Budget + Profitability, data = Hollywood)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.020  -6.912   0.278   6.891  30.172 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3.486e+01  1.382e+00  25.224   <2e-16 ***
## Rotten.Tomatoes  4.436e-01  2.174e-02  20.402   <2e-16 ***
## Worldwide.Gross  1.559e-02  7.000e-03   2.227   0.0265 *  
## Budget           1.350e-02  2.331e-02   0.579   0.5628    
## Profitability   -1.226e-05  8.776e-06  -1.396   0.1634    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.25 on 388 degrees of freedom
## Multiple R-squared:  0.5395, Adjusted R-squared:  0.5347 
## F-statistic: 113.6 on 4 and 388 DF,  p-value: < 2.2e-16

Пытаемся улучшить модель добовлением интерактивного эффекта:

summary(m3 <- lm(Audience..score ~Rotten.Tomatoes  +    Worldwide.Gross +   Budget + Profitability+Budget:Profitability, Hollywood))
## 
## Call:
## lm(formula = Audience..score ~ Rotten.Tomatoes + Worldwide.Gross + 
##     Budget + Profitability + Budget:Profitability, data = Hollywood)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.428  -6.566   0.357   6.833  30.126 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.583e+01  1.353e+00  26.484  < 2e-16 ***
## Rotten.Tomatoes       4.222e-01  2.149e-02  19.650  < 2e-16 ***
## Worldwide.Gross      -4.618e-03  7.853e-03  -0.588    0.557    
## Budget               -2.676e-02  2.393e-02  -1.118    0.264    
## Profitability        -6.792e-04  1.309e-04  -5.189 3.42e-07 ***
## Budget:Profitability  3.341e-04  6.543e-05   5.106 5.16e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.9 on 387 degrees of freedom
## Multiple R-squared:  0.5685, Adjusted R-squared:  0.5629 
## F-statistic:   102 on 5 and 387 DF,  p-value: < 2.2e-16
vif(m3)
##      Rotten.Tomatoes      Worldwide.Gross               Budget 
##             1.066018             2.206408             1.821598 
##        Profitability Budget:Profitability 
##           238.724249           238.964351
AIC(m2, m3)
## Warning in AIC.default(m2, m3): models are not all fitted to the same
## number of observations
##    df      AIC
## m2  9 3044.300
## m3  7 3001.084

Не смотря на то, что интерактивный эффект значимый, значительно улучшить модель не удалось, RSS уменьшается на 3к, но забирает 2 степени свободы.

ggplot(hollywood, aes(Rotten.Tomatoes, Audience..score)) + stat_smooth()
## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.

ggplot(filter(Hollywood, Profitability <2000), aes(Rotten.Tomatoes, Audience..score, color=Profitability)) + stat_smooth(method="lm") + geom_point(shape = 19)

Практически этот же график, только с доходностью фильма, показанной цветом + выбраны только фильмы с доходностью меньше 2к (минус 11 фильмов).

Модель, определяющая оценки зрителей

summary(m11<-lm(Worldwide.Gross~Budget+Rotten.Tomatoes, Hollywood))
## 
## Call:
## lm(formula = Worldwide.Gross ~ Budget + Rotten.Tomatoes, data = Hollywood)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -281.46  -46.78   -2.24   33.84  478.04 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -2.6212     9.9979  -0.262  0.79332    
## Budget            2.0526     0.1325  15.492  < 2e-16 ***
## Rotten.Tomatoes   0.4216     0.1555   2.711  0.00701 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 81.39 on 390 degrees of freedom
## Multiple R-squared:  0.3925, Adjusted R-squared:  0.3894 
## F-statistic:   126 on 2 and 390 DF,  p-value: < 2.2e-16
summary(m1<-lm(Worldwide.Gross~Domestic.Gross+Foreign.Gross+Budget+Rotten.Tomatoes, Hollywood))
## 
## Call:
## lm(formula = Worldwide.Gross ~ Domestic.Gross + Foreign.Gross + 
##     Budget + Rotten.Tomatoes, data = Hollywood)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -168.63  -18.27   13.56   34.07  195.88 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -30.28648    7.73591  -3.915 0.000107 ***
## Domestic.Gross    0.27737    0.07676   3.614 0.000342 ***
## Foreign.Gross     0.85555    0.05451  15.697  < 2e-16 ***
## Budget            0.85907    0.11313   7.594 2.35e-13 ***
## Rotten.Tomatoes   0.20844    0.11184   1.864 0.063123 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 58.23 on 388 degrees of freedom
## Multiple R-squared:  0.6906, Adjusted R-squared:  0.6875 
## F-statistic: 216.6 on 4 and 388 DF,  p-value: < 2.2e-16
vif(m1)
##  Domestic.Gross   Foreign.Gross          Budget Rotten.Tomatoes 
##        1.414092        1.729206        1.427558        1.012668
plot(m1)