library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.2.2
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(tibble)
library(readr)
library(purrr)
##
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
##
## lift
library(stringr)
library(forcats)
library(tidyr)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
R.version
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## crt ucrt
## system x86_64, mingw32
## status
## major 4
## minor 2.1
## year 2022
## month 06
## day 23
## svn rev 82513
## language R
## version.string R version 4.2.1 (2022-06-23 ucrt)
## nickname Funny-Looking Kid
titanic <- read.csv("train.csv")
str(titanic)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
titanic %>% glimpse()
## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
## $ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
## $ Sex <chr> "male", "female", "female", "female", "male", "male", "mal…
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
## $ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
## $ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
## $ Cabin <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C…
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…
Information: - PassengerId: Passenger’s Identification Number - Survived: Passenger’s survival, 1 = Yes, 0 = No - Pclass: Socio Economy Status, 1st = Higher, 2nd = Middle, 3rd = Lower - Name: Passenger’s name - Sex: Passenger’s gender - Age: Passenger’s age - SibSp: sibsp: The dataset defines family relations in this way - Sibling = brother, sister, stepbrother, stepsister - Spouse = husband, wife (mistresses and fiancés were ignored) - Parch: parch: The dataset defines family relations in this way - Parent = mother, father - Child = daughter, son, stepdaughter, stepson - Some children travelled only with a nanny, therefore parch=0 for them - Ticket: Passenger’s ticket ID - Fare: The fare of the ticket - Cabin: Cabin number - Embarked: Port of emabarkation. C = Cherbourg, Q = Queenstown, S = Southampton
summary(titanic)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
titanic_fin <- titanic %>%
dplyr::select(-c(PassengerId, Name, Cabin, Ticket)) %>%
mutate(Survived = as.factor(Survived),
Pclass = as.factor(Pclass)) %>%
mutate(Age = replace_na(Age, replace = mean(Age, na.rm = T)))
str(titanic_fin)
## 'data.frame': 891 obs. of 8 variables:
## $ Survived: Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked: chr "S" "C" "S" "S" ...
colSums(is.na(titanic_fin))
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 0 0 0 0 0
summary(titanic)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
anyNA(titanic)
## [1] TRUE
prop.table(table(titanic_fin$Survived))
##
## 0 1
## 0.6161616 0.3838384
set.seed(777)
intrain <- sample(nrow(titanic_fin), nrow(titanic_fin)*0.8)
titanic_train <- titanic_fin[intrain,]
titanic_test <- titanic_fin[-intrain,]
table(titanic_train$Survived)
##
## 0 1
## 443 269
titanic_m_all <- glm(Survived~., titanic_train, family = "binomial")
summary(titanic_m_all)
##
## Call:
## glm(formula = Survived ~ ., family = "binomial", data = titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3771 -0.5866 -0.4158 0.6094 2.4470
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 14.931596 535.411332 0.028 0.97775
## Pclass2 -1.067305 0.340786 -3.132 0.00174 **
## Pclass3 -2.181663 0.340487 -6.407 1.48e-10 ***
## Sexmale -2.740152 0.225756 -12.138 < 2e-16 ***
## Age -0.042590 0.009052 -4.705 2.54e-06 ***
## SibSp -0.354001 0.122614 -2.887 0.00389 **
## Parch -0.083305 0.132741 -0.628 0.53028
## Fare 0.003161 0.002766 1.143 0.25313
## EmbarkedC -10.953789 535.411267 -0.020 0.98368
## EmbarkedQ -10.822691 535.411350 -0.020 0.98387
## EmbarkedS -11.061172 535.411257 -0.021 0.98352
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 944.09 on 711 degrees of freedom
## Residual deviance: 617.87 on 701 degrees of freedom
## AIC: 639.87
##
## Number of Fisher Scoring iterations: 12
step(titanic_m_all, direction = "backward")
## Start: AIC=639.87
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
##
## Df Deviance AIC
## - Embarked 3 618.52 634.52
## - Parch 1 618.27 638.27
## - Fare 1 619.34 639.34
## <none> 617.87 639.87
## - SibSp 1 627.95 647.95
## - Age 1 642.11 662.11
## - Pclass 2 663.39 681.39
## - Sex 1 803.56 823.56
##
## Step: AIC=634.52
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare
##
## Df Deviance AIC
## - Parch 1 619.05 633.05
## - Fare 1 620.21 634.21
## <none> 618.52 634.52
## - SibSp 1 629.32 643.32
## - Age 1 642.99 656.99
## - Pclass 2 665.53 677.53
## - Sex 1 812.87 826.87
##
## Step: AIC=633.05
## Survived ~ Pclass + Sex + Age + SibSp + Fare
##
## Df Deviance AIC
## - Fare 1 620.43 632.43
## <none> 619.05 633.05
## - SibSp 1 632.89 644.89
## - Age 1 643.38 655.38
## - Pclass 2 668.62 678.62
## - Sex 1 815.52 827.52
##
## Step: AIC=632.43
## Survived ~ Pclass + Sex + Age + SibSp
##
## Df Deviance AIC
## <none> 620.43 632.43
## - SibSp 1 633.16 643.16
## - Age 1 645.99 655.99
## - Pclass 2 708.40 716.40
## - Sex 1 821.16 831.16
##
## Call: glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = "binomial",
## data = titanic_train)
##
## Coefficients:
## (Intercept) Pclass2 Pclass3 Sexmale Age SibSp
## 4.17317 -1.28645 -2.39060 -2.75570 -0.04331 -0.37014
##
## Degrees of Freedom: 711 Total (i.e. Null); 706 Residual
## Null Deviance: 944.1
## Residual Deviance: 620.4 AIC: 632.4
titanic_m_back <- glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = "binomial",
data = titanic_train)
summary(titanic_m_back)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = "binomial",
## data = titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3597 -0.5797 -0.4058 0.6045 2.4692
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.173168 0.459195 9.088 < 2e-16 ***
## Pclass2 -1.286446 0.296515 -4.339 1.43e-05 ***
## Pclass3 -2.390600 0.277284 -8.621 < 2e-16 ***
## Sexmale -2.755700 0.218720 -12.599 < 2e-16 ***
## Age -0.043313 0.008982 -4.822 1.42e-06 ***
## SibSp -0.370135 0.116381 -3.180 0.00147 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 944.09 on 711 degrees of freedom
## Residual deviance: 620.43 on 706 degrees of freedom
## AIC: 632.43
##
## Number of Fisher Scoring iterations: 5
titanic_m_back2 <- step(titanic_m_all, direction = "backward")
## Start: AIC=639.87
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
##
## Df Deviance AIC
## - Embarked 3 618.52 634.52
## - Parch 1 618.27 638.27
## - Fare 1 619.34 639.34
## <none> 617.87 639.87
## - SibSp 1 627.95 647.95
## - Age 1 642.11 662.11
## - Pclass 2 663.39 681.39
## - Sex 1 803.56 823.56
##
## Step: AIC=634.52
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare
##
## Df Deviance AIC
## - Parch 1 619.05 633.05
## - Fare 1 620.21 634.21
## <none> 618.52 634.52
## - SibSp 1 629.32 643.32
## - Age 1 642.99 656.99
## - Pclass 2 665.53 677.53
## - Sex 1 812.87 826.87
##
## Step: AIC=633.05
## Survived ~ Pclass + Sex + Age + SibSp + Fare
##
## Df Deviance AIC
## - Fare 1 620.43 632.43
## <none> 619.05 633.05
## - SibSp 1 632.89 644.89
## - Age 1 643.38 655.38
## - Pclass 2 668.62 678.62
## - Sex 1 815.52 827.52
##
## Step: AIC=632.43
## Survived ~ Pclass + Sex + Age + SibSp
##
## Df Deviance AIC
## <none> 620.43 632.43
## - SibSp 1 633.16 643.16
## - Age 1 645.99 655.99
## - Pclass 2 708.40 716.40
## - Sex 1 821.16 831.16
titanic_predict <- predict(titanic_m_back, titanic_test)
class(titanic_predict)
## [1] "numeric"
titanic_test$titanic_prob <- predict(titanic_m_back, titanic_test, type = "response")
ggplot(titanic_test, aes(x=titanic_prob))+geom_density(lwd=0.5)+theme_minimal()
titanic_test$titanic_predict <- factor(ifelse(titanic_test$titanic_prob > 0.5, "1", "0"))
head(titanic_test[1:10, c("titanic_predict", "Survived")])
## titanic_predict Survived
## 3 1 1
## 4 1 1
## 20 1 1
## 28 0 0
## 30 0 0
## 40 1 1
confusionMatrix(titanic_test$titanic_predict, titanic_test$Survived, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 85 21
## 1 21 52
##
## Accuracy : 0.7654
## 95% CI : (0.6964, 0.8254)
## No Information Rate : 0.5922
## P-Value [Acc > NIR] : 8.154e-07
##
## Kappa : 0.5142
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7123
## Specificity : 0.8019
## Pos Pred Value : 0.7123
## Neg Pred Value : 0.8019
## Prevalence : 0.4078
## Detection Rate : 0.2905
## Detection Prevalence : 0.4078
## Balanced Accuracy : 0.7571
##
## 'Positive' Class : 1
##