Get data

The data is from the kaggle … competititon (https://www.kaggle.com/c/titanic)

setwd("C:/Users/Owner/Desktop/MachineLearningR_sampleData/TitanicDataSet")
titanic_train <- read.csv("train.csv", stringsAsFactors = TRUE)

titanic_test <- read.csv("test.csv", stringsAsFactors = TRUE)
dim(titanic_train)
## [1] 891  12
head(titanic_train)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked
## 1     0        A/5 21171  7.2500              S
## 2     0         PC 17599 71.2833   C85        C
## 3     0 STON/O2. 3101282  7.9250              S
## 4     0           113803 53.1000  C123        S
## 5     0           373450  8.0500              S
## 6     0           330877  8.4583              Q
head(titanic_test)
##   PassengerId Pclass                                         Name    Sex
## 1         892      3                             Kelly, Mr. James   male
## 2         893      3             Wilkes, Mrs. James (Ellen Needs) female
## 3         894      2                    Myles, Mr. Thomas Francis   male
## 4         895      3                             Wirz, Mr. Albert   male
## 5         896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female
## 6         897      3                   Svensson, Mr. Johan Cervin   male
##    Age SibSp Parch  Ticket    Fare Cabin Embarked
## 1 34.5     0     0  330911  7.8292              Q
## 2 47.0     1     0  363272  7.0000              S
## 3 62.0     0     0  240276  9.6875              Q
## 4 27.0     0     0  315154  8.6625              S
## 5 22.0     1     1 3101298 12.2875              S
## 6 14.0     0     0    7538  9.2250              S

Determine if there are missing values in the training and testing datasets

library(Amelia)
## Warning: package 'Amelia' was built under R version 3.4.2
## Loading required package: Rcpp
## Warning: package 'Rcpp' was built under R version 3.4.2
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(titanic_train, main = "Missing values in train dataset")

missmap(titanic_test, main = "Missing values in test dataset")

sum(is.na(titanic_train$Age))
## [1] 177
sum(is.na(titanic_test$Age))
## [1] 86

There are missing values in Age column in both the training (177) and the testing (86) datasets. There are a few ways of handling missing values. I use the mice() from the mice package which draw missing values from a distribution specifically designed for each missing datapoint. Before imputing the Age missing values, I will check all the other features on the dataset since this key to drawing plausible Age values.

The passenge ID and ticket number are not informative features. I will exclude them from both the training and testing datasets. The passenger name is not very informative either, but their Titles (Mr. Miss, Mrs. …) might informative in predictive the age of the passengers and for building an accurate prediction model. I forked the following code from EasyD (https://github.com/EasyD/IntroToDataScience/blob/master/TitanicDataAnalysis_Video6.R) to extract the Titles of the passengers.

extract_Title <- function(name) {
  name <- as.character(name)
  
  if (length(grep("Miss.", name)) > 0) {
    return ("Miss.")
  } else if (length(grep("Master.", name)) > 0) {
    return ("Master.")
  } else if (length(grep("Mrs.", name)) > 0) {
    return ("Mrs.")
  } else if (length(grep("Mr.", name)) > 0) {
    return ("Mr.")
  } else {
    return ("Other")
  }
}

Titles <- NULL
for (i in 1:nrow(titanic_train)) {
  Titles <- c(Titles, extract_Title(titanic_train[i,"Name"]))
}
titanic_train$Title <- as.factor(Titles)

Titles <- NULL
for (i in 1:nrow(titanic_test)) {
  Titles <- c(Titles, extract_Title(titanic_test[i,"Name"]))
}
titanic_test$Title <- as.factor(Titles)
head(titanic_train)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked Title
## 1     0        A/5 21171  7.2500              S   Mr.
## 2     0         PC 17599 71.2833   C85        C  Mrs.
## 3     0 STON/O2. 3101282  7.9250              S Miss.
## 4     0           113803 53.1000  C123        S  Mrs.
## 5     0           373450  8.0500              S   Mr.
## 6     0           330877  8.4583              Q   Mr.
head(titanic_test)
##   PassengerId Pclass                                         Name    Sex
## 1         892      3                             Kelly, Mr. James   male
## 2         893      3             Wilkes, Mrs. James (Ellen Needs) female
## 3         894      2                    Myles, Mr. Thomas Francis   male
## 4         895      3                             Wirz, Mr. Albert   male
## 5         896      3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female
## 6         897      3                   Svensson, Mr. Johan Cervin   male
##    Age SibSp Parch  Ticket    Fare Cabin Embarked Title
## 1 34.5     0     0  330911  7.8292              Q   Mr.
## 2 47.0     1     0  363272  7.0000              S  Mrs.
## 3 62.0     0     0  240276  9.6875              Q   Mr.
## 4 27.0     0     0  315154  8.6625              S   Mr.
## 5 22.0     1     1 3101298 12.2875              S  Mrs.
## 6 14.0     0     0    7538  9.2250              S   Mr.

Exclude the Passenger name, Name and Ticket number from the dataset

titanic_train <- titanic_train[c(-1,-4,-9)]
titanic_test <- titanic_test[c(-1,-3,-8)]
sum(is.na(titanic_train))
## [1] 177
head(titanic_train)
##   Survived Pclass    Sex Age SibSp Parch    Fare Cabin Embarked Title
## 1        0      3   male  22     1     0  7.2500              S   Mr.
## 2        1      1 female  38     1     0 71.2833   C85        C  Mrs.
## 3        1      3 female  26     0     0  7.9250              S Miss.
## 4        1      1 female  35     1     0 53.1000  C123        S  Mrs.
## 5        0      3   male  35     0     0  8.0500              S   Mr.
## 6        0      3   male  NA     0     0  8.4583              Q   Mr.
head(titanic_test)
##   Pclass    Sex  Age SibSp Parch    Fare Cabin Embarked Title
## 1      3   male 34.5     0     0  7.8292              Q   Mr.
## 2      3 female 47.0     1     0  7.0000              S  Mrs.
## 3      2   male 62.0     0     0  9.6875              Q   Mr.
## 4      3   male 27.0     0     0  8.6625              S   Mr.
## 5      3 female 22.0     1     1 12.2875              S  Mrs.
## 6      3   male 14.0     0     0  9.2250              S   Mr.

Just like Title, Survived and PClass should also be factor variables

titanic_train$Survived <- factor(titanic_train$Survived, levels = c("0", "1"),
labels = c("No", "YES"))
titanic_train$Pclass<-as.factor(titanic_train$Pclass)
titanic_train$Pclass <- factor(titanic_train$Pclass, levels = c("1", "2","3"),
labels = c("first", "second", "third"))
titanic_test$Pclass<-as.factor(titanic_test$Pclass)
titanic_test$Pclass <- factor(titanic_test$Pclass, levels = c("1", "2","3"),
labels = c("first", "second", "third"))

Let’s examine the Cabin Variable

table(titanic_train$Cabin)
## 
##                             A10             A14             A16 
##             687               1               1               1 
##             A19             A20             A23             A24 
##               1               1               1               1 
##             A26             A31             A32             A34 
##               1               1               1               1 
##             A36              A5              A6              A7 
##               1               1               1               1 
##            B101            B102             B18             B19 
##               1               1               2               1 
##             B20             B22             B28              B3 
##               2               2               2               1 
##             B30             B35             B37             B38 
##               1               2               1               1 
##             B39              B4             B41             B42 
##               1               1               1               1 
##             B49              B5             B50     B51 B53 B55 
##               2               2               1               2 
## B57 B59 B63 B66         B58 B60             B69             B71 
##               2               2               1               1 
##             B73             B77             B78             B79 
##               1               2               1               1 
##             B80         B82 B84             B86             B94 
##               1               1               1               1 
##         B96 B98            C101            C103            C104 
##               4               1               1               1 
##            C106            C110            C111            C118 
##               1               1               1               1 
##            C123            C124            C125            C126 
##               2               2               2               2 
##            C128            C148              C2         C22 C26 
##               1               1               2               3 
##     C23 C25 C27             C30             C32             C45 
##               4               1               1               1 
##             C46             C47             C49             C50 
##               1               1               1               1 
##             C52             C54         C62 C64             C65 
##               2               1               1               2 
##             C68              C7             C70             C78 
##               2               1               1               2 
##             C82             C83             C85             C86 
##               1               2               1               1 
##             C87             C90             C91             C92 
##               1               1               1               2 
##             C93             C95             C99               D 
##               2               1               1               3 
##         D10 D12             D11             D15             D17 
##               1               1               1               2 
##             D19             D20             D21             D26 
##               1               2               1               2 
##             D28             D30             D33             D35 
##               1               1               2               2 
##             D36             D37             D45             D46 
##               2               1               1               1 
##             D47             D48             D49             D50 
##               1               1               1               1 
##             D56              D6              D7              D9 
##               1               1               1               1 
##             E10            E101             E12            E121 
##               1               3               1               2 
##             E17             E24             E25             E31 
##               1               2               2               1 
##             E33             E34             E36             E38 
##               2               1               1               1 
##             E40             E44             E46             E49 
##               1               2               1               1 
##             E50             E58             E63             E67 
##               1               1               1               2 
##             E68             E77              E8           F E69 
##               1               1               2               1 
##           F G63           F G73              F2             F33 
##               1               2               3               3 
##             F38              F4              G6               T 
##               1               2               4               1

There are 687 passengers without an assigned Cabin label in the training data set. The other passengers have Cabin labels from A(number) through G(number). I will reduce the labels to their respective letters excluding the numbers. Passengers without Cabin number will serve as a standalone category.

titanic_train$Cabin <- as.character(titanic_train$Cabin)
a<-transform(titanic_train, Cabin = replace(Cabin, startsWith(Cabin, "A"), "A"))
b<-transform(a, Cabin = replace(Cabin, startsWith(Cabin, "B"), "B"))
c<-transform(b, Cabin = replace(Cabin, startsWith(Cabin, "C"), "C"))
d<-transform(c, Cabin = replace(Cabin, startsWith(Cabin, "D"), "D"))
e<-transform(d, Cabin = replace(Cabin, startsWith(Cabin, "E"), "E"))
f<-transform(e, Cabin = replace(Cabin, startsWith(Cabin, "F"), "F"))
g<-transform(f, Cabin = replace(Cabin, startsWith(Cabin, "G"), "G"))
g$Cabin<-as.factor(g$Cabin)
train <- g

head(train)
##   Survived Pclass    Sex Age SibSp Parch    Fare Cabin Embarked Title
## 1       No  third   male  22     1     0  7.2500              S   Mr.
## 2      YES  first female  38     1     0 71.2833     C        C  Mrs.
## 3      YES  third female  26     0     0  7.9250              S Miss.
## 4      YES  first female  35     1     0 53.1000     C        S  Mrs.
## 5       No  third   male  35     0     0  8.0500              S   Mr.
## 6       No  third   male  NA     0     0  8.4583              Q   Mr.
table(train$Cabin)
## 
##       A   B   C   D   E   F   G   T 
## 687  15  47  59  33  32  13   4   1
titanic_test$Cabin <- as.character(titanic_test$Cabin)
a<-transform(titanic_test, Cabin = replace(Cabin, startsWith(Cabin, "A"), "A"))
b<-transform(a, Cabin = replace(Cabin, startsWith(Cabin, "B"), "B"))
c<-transform(b, Cabin = replace(Cabin, startsWith(Cabin, "C"), "C"))
d<-transform(c, Cabin = replace(Cabin, startsWith(Cabin, "D"), "D"))
e<-transform(d, Cabin = replace(Cabin, startsWith(Cabin, "E"), "E"))
f<-transform(e, Cabin = replace(Cabin, startsWith(Cabin, "F"), "F"))
g<-transform(f, Cabin = replace(Cabin, startsWith(Cabin, "G"), "G"))
g$Cabin<-as.factor(g$Cabin)
test<-g

head(test)
##   Pclass    Sex  Age SibSp Parch    Fare Cabin Embarked Title
## 1  third   male 34.5     0     0  7.8292              Q   Mr.
## 2  third female 47.0     1     0  7.0000              S  Mrs.
## 3 second   male 62.0     0     0  9.6875              Q   Mr.
## 4  third   male 27.0     0     0  8.6625              S   Mr.
## 5  third female 22.0     1     1 12.2875              S  Mrs.
## 6  third   male 14.0     0     0  9.2250              S   Mr.

Predicting missing age variable

Now that we have catered for all the other variables, let’s fix the missing values in the Age column. I will use the mice() from the mice package.
I forked these junks of codes from Megan L. Risdal (https://rstudio-pubs-static.s3.amazonaws.com/202517_d1c1e3e9101d49b1a0135a422a9b3748.html) that does a good job in predicting the missing age values.

library(mice)
## Warning: package 'mice' was built under R version 3.4.2
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.2
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
factor_vars <- c('Pclass','Sex','Embarked', 'Title')

train[factor_vars] <- lapply(train[factor_vars], function(x) as.factor(x))
test[factor_vars] <- lapply(test[factor_vars], function(x) as.factor(x))

# Perform mice imputation, excluding certain less-than-useful variables:
mice_mod <- mice(train[, !names(train) %in% c('Ticket','Cabin','Survived')], method='rf')
## 
##  iter imp variable
##   1   1  Age
##   1   2  Age
##   1   3  Age
##   1   4  Age
##   1   5  Age
##   2   1  Age
##   2   2  Age
##   2   3  Age
##   2   4  Age
##   2   5  Age
##   3   1  Age
##   3   2  Age
##   3   3  Age
##   3   4  Age
##   3   5  Age
##   4   1  Age
##   4   2  Age
##   4   3  Age
##   4   4  Age
##   4   5  Age
##   5   1  Age
##   5   2  Age
##   5   3  Age
##   5   4  Age
##   5   5  Age
mice_mod1 <- mice(test[, !names(test) %in% c('Ticket','Cabin','Survived')], method='rf')
## 
##  iter imp variable
##   1   1  Age  Fare
##   1   2  Age  Fare
##   1   3  Age  Fare
##   1   4  Age  Fare
##   1   5  Age  Fare
##   2   1  Age  Fare
##   2   2  Age  Fare
##   2   3  Age  Fare
##   2   4  Age  Fare
##   2   5  Age  Fare
##   3   1  Age  Fare
##   3   2  Age  Fare
##   3   3  Age  Fare
##   3   4  Age  Fare
##   3   5  Age  Fare
##   4   1  Age  Fare
##   4   2  Age  Fare
##   4   3  Age  Fare
##   4   4  Age  Fare
##   4   5  Age  Fare
##   5   1  Age  Fare
##   5   2  Age  Fare
##   5   3  Age  Fare
##   5   4  Age  Fare
##   5   5  Age  Fare
mice_output <- complete(mice_mod)
mice_output1 <- complete(mice_mod1)

par(mfrow=c(1,2))
hist(train$Age, freq=F, main='Age: Original Data', 
  col='darkgreen', ylim=c(0,0.04))
hist(mice_output$Age, freq=F, main='Age: MICE Output', 
  col='lightgreen', ylim=c(0,0.04))

# Replace Age variable from the mice model.
train$Age <- mice_output$Age
test$Age <- mice_output1$Age

# Show new number of missing Age values
sum(is.na(train$Age))
## [1] 0

Data exploration

Relative survival in the training dataset

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
ggplot(train, aes(x = Survived)) + 
  theme_bw() +
  geom_bar() +
  labs(y = "Passenger Count",
       Title = "Titanic Survival Rate")

table(train$Survived)
## 
##  No YES 
## 549 342

549 people died while 342 survived

Effect of passenger gender on their survival

ggplot(train, aes(x = Sex, fill = Survived)) + 
  theme_bw() +
  geom_bar() +
  labs(y = "Passenger Count",
       Title = "Titanic Survival Rates by Sex")

Relatively more female passengers survived.

Effect of age on survival

ggplot(train, aes(x = Age, fill = Survived)) +
  theme_bw() +
  geom_histogram(binwidth = 5) +
  labs(y = "Passenger Count",
       x = "Age (binwidth = 5)",
       Title = "Titanic Survival Rates by Age")

Majority of the passengers that died where between the ages of 20 and 40.

The relationship between passenger title and their survival

ggplot(train, aes(x = Title, fill = Survived)) + 
  theme_bw() +
  geom_bar() +
  labs(y = "Passenger Count",
       Title = "Titanic Survival Rates by Passenger Title")

Majority of the passengers that died had a Mr. title. Only about 1 in every 5 passenger in this group survived.

The relationship between Cabin occupied and survival

ggplot(train, aes(x = Cabin, fill = Survived)) + 
  theme_bw() +
  geom_bar() +
  labs(y = "Passenger Count",
       Title = "Titanic Survival Rates by Cabin Occupied")

The Cabin of the majority of passengers is not mentioned. This group of passengers disportionately had a lower survival rate compared to those with known Cabin.

The relationship between where passenger embarked and their survival

ggplot(train, aes(x = Embarked, fill = Survived)) + 
  theme_bw() +
  geom_bar() +
  labs(y = "Passenger Count",
       Title = "Titanic Survival Rates by where the passengers Embarked")

Model training (logistic regression)

Kaggle provided two datasets, train and test (with unkown survival) data set. I will split the training data into 2 (one for building the model and the other for training the model)

shuffled_train <- train[sample(891), ]
train_indices <- 1:round(0.85 * 891)
train_train <- shuffled_train[train_indices, ]
test_indices <- (round(0.85 * 891) + 1):891
test_train <- shuffled_train[test_indices, ]
model <- glm (Survived ~ . , data = train_train, family = binomial(link = "logit"), na.action = na.pass)
summary(model)
## 
## Call:
## glm(formula = Survived ~ ., family = binomial(link = "logit"), 
##     data = train_train, na.action = na.pass)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5446  -0.5405  -0.3731   0.5240   2.5421  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   3.249e+01  1.654e+03   0.020 0.984330    
## Pclasssecond -5.362e-01  4.913e-01  -1.091 0.275112    
## Pclassthird  -1.441e+00  4.906e-01  -2.937 0.003314 ** 
## Sexmale      -1.613e+01  7.866e+02  -0.021 0.983636    
## Age          -2.908e-02  9.525e-03  -3.053 0.002268 ** 
## SibSp        -5.816e-01  1.444e-01  -4.029 5.61e-05 ***
## Parch        -4.074e-01  1.497e-01  -2.721 0.006503 ** 
## Fare          4.433e-03  3.057e-03   1.450 0.147016    
## CabinA        4.905e-01  7.465e-01   0.657 0.511178    
## CabinB        6.557e-01  6.443e-01   1.018 0.308796    
## CabinC        2.584e-01  5.702e-01   0.453 0.650442    
## CabinD        1.578e+00  6.490e-01   2.432 0.015021 *  
## CabinE        1.728e+00  6.663e-01   2.593 0.009509 ** 
## CabinF        8.336e-01  9.316e-01   0.895 0.370924    
## CabinG       -5.485e-01  1.039e+00  -0.528 0.597423    
## CabinT       -1.425e+01  1.455e+03  -0.010 0.992186    
## EmbarkedC    -1.258e+01  1.455e+03  -0.009 0.993106    
## EmbarkedQ    -1.282e+01  1.455e+03  -0.009 0.992972    
## EmbarkedS    -1.296e+01  1.455e+03  -0.009 0.992895    
## TitleMiss.   -1.683e+01  7.866e+02  -0.021 0.982927    
## TitleMr.     -3.561e+00  6.202e-01  -5.742 9.37e-09 ***
## TitleMrs.    -1.592e+01  7.866e+02  -0.020 0.983857    
## TitleOther   -3.602e+00  9.403e-01  -3.830 0.000128 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1001.73  on 756  degrees of freedom
## Residual deviance:  605.75  on 734  degrees of freedom
## AIC: 651.75
## 
## Number of Fisher Scoring iterations: 14

Variable importance

anova(model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Survived
## 
## Terms added sequentially (first to last)
## 
## 
##          Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                       756    1001.73              
## Pclass    2   78.419       754     923.32 < 2.2e-16 ***
## Sex       1  212.532       753     710.78 < 2.2e-16 ***
## Age       1   19.010       752     691.77 1.301e-05 ***
## SibSp     1   15.292       751     676.48 9.210e-05 ***
## Parch     1    0.652       750     675.83   0.41948    
## Fare      1    1.602       749     674.23   0.20563    
## Cabin     8   17.840       741     656.39   0.02246 *  
## Embarked  3    2.650       738     653.74   0.44878    
## Title     4   47.987       734     605.75 9.496e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Gender > PClass > Title > Age > Sibling present had the most effect on determining weather or not a pessanger on board the Titanic survived.

Model prediction and accuracy

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.2
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.4.2
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predict <- predict(model, test_train, type="response") 
table(test_train$Survived, predict > 0.5)
##      
##       FALSE TRUE
##   No     66   10
##   YES    12   46
pr <- prediction(predict, test_train$Survived)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8920145

The accuracy of the model is > 85%

summary(table(predict, predict > 0.5))
## Number of cases in table: 134 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 134, df = 130, p-value = 0.3871
##  Chi-squared approximation may be incorrect

Predicting the survival of passengers in the test dataset

titanic_test <- read.csv("test.csv", stringsAsFactors = TRUE)
prediction <- predict(model, test)

# Create a dataframe with the ID of the passengers and whether or not they survived.

test_survival_predict <- data.frame(PassengerID = titanic_test$PassengerId, Survived = prediction)
# Change Survived to a binary factor variable

test_survival_predict$Survived <- ifelse(test_survival_predict$Survived >= 0.5, 1, 0)
test_survival_predict$Survived<- factor(test_survival_predict$Survived, levels = c("0", "1"),
labels = c("no", "yes"))

head(test_survival_predict, n = 10)
##    PassengerID Survived
## 1          892       no
## 2          893       no
## 3          894       no
## 4          895       no
## 5          896      yes
## 6          897       no
## 7          898      yes
## 8          899       no
## 9          900      yes
## 10         901       no
table(test_survival_predict$Survived)
## 
##  no yes 
## 277 140