Load the Libraries + Functions
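
The analysis below uses lrm(), validate(), and vif() from the rms package and influencePlot() from the car package. A minimal setup sketch (assuming df_movie has already been imported; the data-loading code for the review data is not shown here):

library(rms)  # lrm(), validate(), vif()
library(car)  # influencePlot()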

Choosing Predictors
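
The predictors are word features from the reviews (character, plot, story, great, actor) and the outcome, Sentiment, is binary. The model output below reports 487 negative (0) and 490 positive (1) reviews, so the classes are nearly balanced; a quick sanity check (hypothetical, assuming the setup above):

table(df_movie$Sentiment)  # 487 negative (0) vs. 490 positive (1) reviews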

Running a Binary Logistic Regression

# Fit a binary logistic regression with rms::lrm()
blr_model = lrm(Sentiment ~ character + plot + story + great + actor, data = df_movie)
blr_model
## Logistic Regression Model
##  
##  lrm(formula = Sentiment ~ character + plot + story + great + 
##      actor, data = df_movie)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs           977    LR chi2      46.06    R2       0.061    C       0.562    
##   0            487    d.f.             5    g        0.318    Dxy     0.123    
##   1            490    Pr(> chi2) <0.0001    gr       1.375    gamma   0.418    
##  max |deriv| 1e-10                          gp       0.063    tau-a   0.062    
##                                             Brier    0.239                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept -0.0431 0.0695 -0.62  0.5348  
##  character  0.3753 0.2961  1.27  0.2050  
##  plot      -1.6941 0.5485 -3.09  0.0020  
##  story     -0.2720 0.3724 -0.73  0.4651  
##  great      2.1885 0.5325  4.11  <0.0001 
##  actor      0.3298 0.4189  0.79  0.4311  
## 

Coefficients
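
The coefficients are on the log-odds scale, so exponentiating them gives odds ratios. A quick sketch using the fit above (the values in the comment are just arithmetic on the coefficient table, not a separate run):

exp(coef(blr_model))
# e.g., exp(2.1885) ≈ 8.9: a one-unit increase in great multiplies the odds of a
# positive review by roughly 9, holding the other predictors constant;
# exp(-1.6941) ≈ 0.18 for plot works in the opposite direction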

Variable Selection

# Refit with glm() so that step() can perform backward elimination by AIC
glm_model = glm(Sentiment ~ character + plot + story + great + actor, data = df_movie, family = 'binomial')
glm_model_bw = step(glm_model, direction = 'backward')
## Start:  AIC=1320.34
## Sentiment ~ character + plot + story + great + actor
## 
##             Df Deviance    AIC
## - story      1   1308.9 1318.9
## - actor      1   1309.0 1319.0
## - character  1   1310.0 1320.0
## <none>           1308.3 1320.3
## - plot       1   1321.5 1331.5
## - great      1   1336.1 1346.1
## 
## Step:  AIC=1318.88
## Sentiment ~ character + plot + great + actor
## 
##             Df Deviance    AIC
## - actor      1   1309.5 1317.5
## - character  1   1310.3 1318.3
## <none>           1308.9 1318.9
## - plot       1   1321.8 1329.8
## - great      1   1336.5 1344.5
## 
## Step:  AIC=1317.5
## Sentiment ~ character + plot + great
## 
##             Df Deviance    AIC
## - character  1   1311.1 1317.1
## <none>           1309.5 1317.5
## - plot       1   1322.6 1328.6
## - great      1   1338.1 1344.1
## 
## Step:  AIC=1317.11
## Sentiment ~ plot + great
## 
##         Df Deviance    AIC
## <none>       1311.1 1317.1
## - plot   1   1324.1 1328.1
## - great  1   1340.2 1344.2
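
Backward elimination drops story, then actor, then character, leaving plot and great as the retained predictors. The reduced model returned by step() can be inspected directly (a quick follow-up, output omitted here):

summary(glm_model_bw)  # coefficients of the final Sentiment ~ plot + great model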

Outliers

influencePlot(glm_model)  # from car: flags high-leverage / high-influence observations

##        StudRes        Hat       CookD
## 134 -2.1777847 0.02629098 0.039491797
## 425 -0.6873315 0.05967553 0.002881715
## 465 -2.3476167 0.02472222 0.053876380
## 614 -1.2178683 0.07494388 0.014811833
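
None of the Cook's distances are alarming, but rows 134 and 465 have studentized residuals beyond ±2. If we wanted to look at those reviews' feature values before deciding whether to keep them, a quick (hypothetical) check:

df_movie[c(134, 465), ]  # inspect the two observations with the most extreme residuals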

Assumptions

rms::vif(blr_model)     # variance inflation factors for the full model
## character      plot     story     great     actor 
##  1.015098  1.001805  1.011054  1.002779  1.007283
rms::vif(glm_model_bw)  # VIFs for the reduced (post-selection) model
##     plot    great 
## 1.000228 1.000228
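
All variance inflation factors sit near 1, well below the usual cut-offs of 5 or 10, so multicollinearity is not a concern in either model. A simple programmatic version of that check (a sketch, not part of the original analysis):

any(rms::vif(glm_model_bw) > 5)  # FALSE: no predictor exceeds the threshold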

Test for Overfitting

# Refit the reduced model, storing the design matrix and response (x = TRUE, y = TRUE)
# so that validate() can run its bootstrap resampling
model = lrm(Sentiment ~ plot + great, data = df_movie, x = TRUE, y = TRUE)
validate(model)
##           index.orig training    test optimism index.corrected  n
## Dxy           0.0990   0.0976  0.0990  -0.0014          0.1004 40
## R2            0.0578   0.0606  0.0556   0.0050          0.0528 40
## Intercept     0.0000   0.0000 -0.0054   0.0054         -0.0054 40
## Slope         1.0000   1.0000  0.9228   0.0772          0.9228 40
## Emax          0.0000   0.0000  0.0188   0.0188          0.0188 40
## D             0.0433   0.0455  0.0415   0.0040          0.0393 40
## U            -0.0020  -0.0020  0.0020  -0.0041          0.0020 40
## Q             0.0453   0.0476  0.0395   0.0080          0.0373 40
## B             0.2402   0.2397  0.2407  -0.0010          0.2412 40
## g             0.2618   0.2917  0.2546   0.0371          0.2247 40
## gp            0.0496   0.0488  0.0477   0.0011          0.0484 40
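
Somers' Dxy from validate() converts to the concordance statistic via C = 0.5 + Dxy / 2, so the optimism-corrected Dxy of about 0.10 corresponds to a corrected C of roughly 0.55, essentially the same as the apparent value. The corrected slope (about 0.92) is also close to 1, so there is little sign of overfitting. The conversion is just arithmetic on the corrected index above:

0.5 + 0.1004 / 2  # ≈ 0.55, the optimism-corrected C-statistic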