Movies

remove(list =ls())

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MASS)

Attaching package: 'MASS'

The following object is masked from 'package:dplyr':

    select
library(rsconnect)
titanic <- read.csv("C:/Users/james/Downloads/titanic.csv")

summary(Titanic)
Number of cases in table: 2201 
Number of factors: 4 
Test for independence of all factors:
    Chisq = 1637.4, df = 25, p-value = 0
    Chi-squared approximation may be incorrect
# Select relevant variables and clean the data
clean_df <- titanic %>%
  dplyr::select(Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked) %>%
  drop_na() %>%
  mutate(
    Survived = as.factor(Survived),
    Pclass = as.factor(Pclass),
    Sex = as.factor(Sex),
    Embarked = as.factor(Embarked)
  )

# Verify the structure of the cleaned data frame
str(clean_df)
'data.frame':   714 obs. of  8 variables:
 $ Survived: Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
 $ Pclass  : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
 $ Sex     : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
 $ Age     : num  22 38 26 35 35 54 2 27 14 4 ...
 $ SibSp   : int  1 1 0 1 0 0 3 0 1 1 ...
 $ Parch   : int  0 0 0 0 0 0 1 2 0 1 ...
 $ Fare    : num  7.25 71.28 7.92 53.1 8.05 ...
 $ Embarked: Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
# Run the "kitchen sink" model with all potential predictors
kitchen_sink_model <- glm(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                          data = clean_df,
                          family = "binomial")

summary(kitchen_sink_model)

Call:
glm(formula = Survived ~ Pclass + Sex + Age + SibSp + Parch + 
    Fare + Embarked, family = "binomial", data = clean_df)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept)  16.691979 607.920015   0.027 0.978095    
Pclass2      -1.189637   0.329197  -3.614 0.000302 ***
Pclass3      -2.395220   0.343356  -6.976 3.04e-12 ***
Sexmale      -2.637859   0.223006 -11.829  < 2e-16 ***
Age          -0.043308   0.008322  -5.204 1.95e-07 ***
SibSp        -0.362925   0.129290  -2.807 0.005000 ** 
Parch        -0.060365   0.123944  -0.487 0.626233    
Fare          0.001451   0.002595   0.559 0.576143    
EmbarkedC   -12.259048 607.919885  -0.020 0.983911    
EmbarkedQ   -13.082427 607.920088  -0.022 0.982831    
EmbarkedS   -12.661895 607.919868  -0.021 0.983383    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 964.52  on 713  degrees of freedom
Residual deviance: 632.34  on 703  degrees of freedom
AIC: 654.34

Number of Fisher Scoring iterations: 13
# Perform backward selection using stepAIC
final_model <- stepAIC(kitchen_sink_model, direction = "backward")
Start:  AIC=654.34
Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked

           Df Deviance    AIC
- Embarked  3   635.78 651.78
- Parch     1   632.58 652.58
- Fare      1   632.67 652.67
<none>          632.34 654.34
- SibSp     1   640.85 660.85
- Age       1   662.15 682.15
- Pclass    2   686.64 704.64
- Sex       1   806.80 826.80

Step:  AIC=651.78
Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare

         Df Deviance    AIC
- Parch   1   636.03 650.03
- Fare    1   636.46 650.46
<none>        635.78 651.78
- SibSp   1   645.25 659.25
- Age     1   667.36 681.36
- Pclass  2   695.26 707.26
- Sex     1   814.49 828.49

Step:  AIC=650.03
Survived ~ Pclass + Sex + Age + SibSp + Fare

         Df Deviance    AIC
- Fare    1   636.56 648.56
<none>        636.03 650.03
- SibSp   1   647.23 659.23
- Age     1   667.61 679.61
- Pclass  2   699.21 709.21
- Sex     1   819.08 831.08

Step:  AIC=648.56
Survived ~ Pclass + Sex + Age + SibSp

         Df Deviance    AIC
<none>        636.56 648.56
- SibSp   1   647.28 657.28
- Age     1   669.40 679.40
- Pclass  2   742.29 750.29
- Sex     1   823.72 833.72
# View the summary of the final model
summary(final_model)

Call:
glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = "binomial", 
    data = clean_df)

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept)  4.334201   0.450700   9.617  < 2e-16 ***
Pclass2     -1.414360   0.284727  -4.967 6.78e-07 ***
Pclass3     -2.652618   0.285832  -9.280  < 2e-16 ***
Sexmale     -2.627679   0.214771 -12.235  < 2e-16 ***
Age         -0.044760   0.008225  -5.442 5.27e-08 ***
SibSp       -0.380190   0.121516  -3.129  0.00176 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 964.52  on 713  degrees of freedom
Residual deviance: 636.56  on 708  degrees of freedom
AIC: 648.56

Number of Fisher Scoring iterations: 5