library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(tibble)
library(readr)
library(purrr)
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
## 
##     lift
library(stringr)
library(forcats)
library(tidyr)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
R.version
##                _                                
## platform       x86_64-w64-mingw32               
## arch           x86_64                           
## os             mingw32                          
## crt            ucrt                             
## system         x86_64, mingw32                  
## status                                          
## major          4                                
## minor          2.1                              
## year           2022                             
## month          06                               
## day            23                               
## svn rev        82513                            
## language       R                                
## version.string R version 4.2.1 (2022-06-23 ucrt)
## nickname       Funny-Looking Kid
titanic <- read.csv("train.csv")
str(titanic)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
titanic %>% glimpse()
## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
## $ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
## $ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
## $ Sex         <chr> "male", "female", "female", "female", "male", "male", "mal…
## $ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
## $ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
## $ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
## $ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
## $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
## $ Cabin       <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C…
## $ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…

Information: - PassengerId: Passenger’s Identification Number - Survived: Passenger’s survival, 1 = Yes, 0 = No - Pclass: Socio Economy Status, 1st = Higher, 2nd = Middle, 3rd = Lower - Name: Passenger’s name - Sex: Passenger’s gender - Age: Passenger’s age - SibSp: sibsp: The dataset defines family relations in this way - Sibling = brother, sister, stepbrother, stepsister - Spouse = husband, wife (mistresses and fiancés were ignored) - Parch: parch: The dataset defines family relations in this way - Parent = mother, father - Child = daughter, son, stepdaughter, stepson - Some children travelled only with a nanny, therefore parch=0 for them - Ticket: Passenger’s ticket ID - Fare: The fare of the ticket - Cabin: Cabin number - Embarked: Port of emabarkation. C = Cherbourg, Q = Queenstown, S = Southampton

summary(titanic)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
titanic_fin <- titanic %>% 
  dplyr::select(-c(PassengerId, Name, Cabin, Ticket)) %>% 
  mutate(Survived = as.factor(Survived),
         Pclass = as.factor(Pclass)) %>% 
  mutate(Age = replace_na(Age, replace = mean(Age, na.rm = T)))
str(titanic_fin)
## 'data.frame':    891 obs. of  8 variables:
##  $ Survived: Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass  : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Sex     : chr  "male" "female" "female" "female" ...
##  $ Age     : num  22 38 26 35 35 ...
##  $ SibSp   : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch   : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Fare    : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Embarked: chr  "S" "C" "S" "S" ...
colSums(is.na(titanic_fin))
## Survived   Pclass      Sex      Age    SibSp    Parch     Fare Embarked 
##        0        0        0        0        0        0        0        0
summary(titanic)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
anyNA(titanic)
## [1] TRUE
prop.table(table(titanic_fin$Survived))
## 
##         0         1 
## 0.6161616 0.3838384
set.seed(777)
intrain <- sample(nrow(titanic_fin), nrow(titanic_fin)*0.8)
titanic_train <- titanic_fin[intrain,]
titanic_test <- titanic_fin[-intrain,]
table(titanic_train$Survived)
## 
##   0   1 
## 443 269
titanic_m_all <- glm(Survived~., titanic_train, family = "binomial")
summary(titanic_m_all)
## 
## Call:
## glm(formula = Survived ~ ., family = "binomial", data = titanic_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3771  -0.5866  -0.4158   0.6094   2.4470  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  14.931596 535.411332   0.028  0.97775    
## Pclass2      -1.067305   0.340786  -3.132  0.00174 ** 
## Pclass3      -2.181663   0.340487  -6.407 1.48e-10 ***
## Sexmale      -2.740152   0.225756 -12.138  < 2e-16 ***
## Age          -0.042590   0.009052  -4.705 2.54e-06 ***
## SibSp        -0.354001   0.122614  -2.887  0.00389 ** 
## Parch        -0.083305   0.132741  -0.628  0.53028    
## Fare          0.003161   0.002766   1.143  0.25313    
## EmbarkedC   -10.953789 535.411267  -0.020  0.98368    
## EmbarkedQ   -10.822691 535.411350  -0.020  0.98387    
## EmbarkedS   -11.061172 535.411257  -0.021  0.98352    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 944.09  on 711  degrees of freedom
## Residual deviance: 617.87  on 701  degrees of freedom
## AIC: 639.87
## 
## Number of Fisher Scoring iterations: 12
step(titanic_m_all, direction = "backward")
## Start:  AIC=639.87
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
## 
##            Df Deviance    AIC
## - Embarked  3   618.52 634.52
## - Parch     1   618.27 638.27
## - Fare      1   619.34 639.34
## <none>          617.87 639.87
## - SibSp     1   627.95 647.95
## - Age       1   642.11 662.11
## - Pclass    2   663.39 681.39
## - Sex       1   803.56 823.56
## 
## Step:  AIC=634.52
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare
## 
##          Df Deviance    AIC
## - Parch   1   619.05 633.05
## - Fare    1   620.21 634.21
## <none>        618.52 634.52
## - SibSp   1   629.32 643.32
## - Age     1   642.99 656.99
## - Pclass  2   665.53 677.53
## - Sex     1   812.87 826.87
## 
## Step:  AIC=633.05
## Survived ~ Pclass + Sex + Age + SibSp + Fare
## 
##          Df Deviance    AIC
## - Fare    1   620.43 632.43
## <none>        619.05 633.05
## - SibSp   1   632.89 644.89
## - Age     1   643.38 655.38
## - Pclass  2   668.62 678.62
## - Sex     1   815.52 827.52
## 
## Step:  AIC=632.43
## Survived ~ Pclass + Sex + Age + SibSp
## 
##          Df Deviance    AIC
## <none>        620.43 632.43
## - SibSp   1   633.16 643.16
## - Age     1   645.99 655.99
## - Pclass  2   708.40 716.40
## - Sex     1   821.16 831.16
## 
## Call:  glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = "binomial", 
##     data = titanic_train)
## 
## Coefficients:
## (Intercept)      Pclass2      Pclass3      Sexmale          Age        SibSp  
##     4.17317     -1.28645     -2.39060     -2.75570     -0.04331     -0.37014  
## 
## Degrees of Freedom: 711 Total (i.e. Null);  706 Residual
## Null Deviance:       944.1 
## Residual Deviance: 620.4     AIC: 632.4
titanic_m_back <- glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = "binomial", 
    data = titanic_train)
summary(titanic_m_back)
## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = "binomial", 
##     data = titanic_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3597  -0.5797  -0.4058   0.6045   2.4692  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.173168   0.459195   9.088  < 2e-16 ***
## Pclass2     -1.286446   0.296515  -4.339 1.43e-05 ***
## Pclass3     -2.390600   0.277284  -8.621  < 2e-16 ***
## Sexmale     -2.755700   0.218720 -12.599  < 2e-16 ***
## Age         -0.043313   0.008982  -4.822 1.42e-06 ***
## SibSp       -0.370135   0.116381  -3.180  0.00147 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 944.09  on 711  degrees of freedom
## Residual deviance: 620.43  on 706  degrees of freedom
## AIC: 632.43
## 
## Number of Fisher Scoring iterations: 5
titanic_m_back2 <- step(titanic_m_all, direction = "backward")
## Start:  AIC=639.87
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
## 
##            Df Deviance    AIC
## - Embarked  3   618.52 634.52
## - Parch     1   618.27 638.27
## - Fare      1   619.34 639.34
## <none>          617.87 639.87
## - SibSp     1   627.95 647.95
## - Age       1   642.11 662.11
## - Pclass    2   663.39 681.39
## - Sex       1   803.56 823.56
## 
## Step:  AIC=634.52
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare
## 
##          Df Deviance    AIC
## - Parch   1   619.05 633.05
## - Fare    1   620.21 634.21
## <none>        618.52 634.52
## - SibSp   1   629.32 643.32
## - Age     1   642.99 656.99
## - Pclass  2   665.53 677.53
## - Sex     1   812.87 826.87
## 
## Step:  AIC=633.05
## Survived ~ Pclass + Sex + Age + SibSp + Fare
## 
##          Df Deviance    AIC
## - Fare    1   620.43 632.43
## <none>        619.05 633.05
## - SibSp   1   632.89 644.89
## - Age     1   643.38 655.38
## - Pclass  2   668.62 678.62
## - Sex     1   815.52 827.52
## 
## Step:  AIC=632.43
## Survived ~ Pclass + Sex + Age + SibSp
## 
##          Df Deviance    AIC
## <none>        620.43 632.43
## - SibSp   1   633.16 643.16
## - Age     1   645.99 655.99
## - Pclass  2   708.40 716.40
## - Sex     1   821.16 831.16
titanic_predict <- predict(titanic_m_back, titanic_test)
class(titanic_predict)
## [1] "numeric"
titanic_test$titanic_prob <- predict(titanic_m_back, titanic_test, type = "response")
ggplot(titanic_test, aes(x=titanic_prob))+geom_density(lwd=0.5)+theme_minimal()

titanic_test$titanic_predict <- factor(ifelse(titanic_test$titanic_prob > 0.5, "1", "0"))
head(titanic_test[1:10, c("titanic_predict", "Survived")])
##    titanic_predict Survived
## 3                1        1
## 4                1        1
## 20               1        1
## 28               0        0
## 30               0        0
## 40               1        1
confusionMatrix(titanic_test$titanic_predict, titanic_test$Survived, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 85 21
##          1 21 52
##                                           
##                Accuracy : 0.7654          
##                  95% CI : (0.6964, 0.8254)
##     No Information Rate : 0.5922          
##     P-Value [Acc > NIR] : 8.154e-07       
##                                           
##                   Kappa : 0.5142          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.7123          
##             Specificity : 0.8019          
##          Pos Pred Value : 0.7123          
##          Neg Pred Value : 0.8019          
##              Prevalence : 0.4078          
##          Detection Rate : 0.2905          
##    Detection Prevalence : 0.4078          
##       Balanced Accuracy : 0.7571          
##                                           
##        'Positive' Class : 1               
##