Overview

This machine learning analysis used the R package tidymodels to predict whether a passenger survived the titanic ship wreck or not. Tidymodels is a collection of R packages for machine learning analysis.


Load Packages

The following R packages were used for this analysis:

  1. tidymodels
  2. tidyverse
  3. plotly
  4. skimr
  5. forcats

Load data files

# training file
train <- read_csv("train.csv", col_types = cols(),  na = "NA")

# test file
test <- read_csv("test.csv", col_types = cols(), na = "NA")

# Dimensions and structure of data
dim(train); str(train)
[1] 891  12
spec_tbl_df [891 x 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ PassengerId: num [1:891] 1 2 3 4 5 6 7 8 9 10 ...
 $ Survived   : num [1:891] 0 1 1 1 0 0 0 0 1 1 ...
 $ Pclass     : num [1:891] 3 1 3 1 3 3 1 3 3 2 ...
 $ Name       : chr [1:891] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
 $ Sex        : chr [1:891] "male" "female" "female" "female" ...
 $ Age        : num [1:891] 22 38 26 35 35 NA 54 2 27 14 ...
 $ SibSp      : num [1:891] 1 1 0 1 0 0 0 3 0 1 ...
 $ Parch      : num [1:891] 0 0 0 0 0 0 0 1 2 0 ...
 $ Ticket     : chr [1:891] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
 $ Fare       : num [1:891] 7.25 71.28 7.92 53.1 8.05 ...
 $ Cabin      : chr [1:891] "" "C85" "" "C123" ...
 $ Embarked   : chr [1:891] "S" "C" "S" "S" ...
 - attr(*, "spec")=
  .. cols(
  ..   PassengerId = col_double(),
  ..   Survived = col_double(),
  ..   Pclass = col_double(),
  ..   Name = col_character(),
  ..   Sex = col_character(),
  ..   Age = col_double(),
  ..   SibSp = col_double(),
  ..   Parch = col_double(),
  ..   Ticket = col_character(),
  ..   Fare = col_double(),
  ..   Cabin = col_character(),
  ..   Embarked = col_character()
  .. )
dim(test); str(test)
[1] 418  11
spec_tbl_df [418 x 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ PassengerId: num [1:418] 892 893 894 895 896 897 898 899 900 901 ...
 $ Pclass     : num [1:418] 3 3 2 3 3 3 3 2 3 3 ...
 $ Name       : chr [1:418] "Kelly, Mr. James" "Wilkes, Mrs. James (Ellen Needs)" "Myles, Mr. Thomas Francis" "Wirz, Mr. Albert" ...
 $ Sex        : chr [1:418] "male" "female" "male" "male" ...
 $ Age        : num [1:418] 34.5 47 62 27 22 14 30 26 18 21 ...
 $ SibSp      : num [1:418] 0 1 0 0 1 0 0 1 0 2 ...
 $ Parch      : num [1:418] 0 0 0 0 1 0 0 1 0 0 ...
 $ Ticket     : chr [1:418] "330911" "363272" "240276" "315154" ...
 $ Fare       : num [1:418] 7.83 7 9.69 8.66 12.29 ...
 $ Cabin      : chr [1:418] "" "" "" "" ...
 $ Embarked   : chr [1:418] "Q" "S" "Q" "S" ...
 - attr(*, "spec")=
  .. cols(
  ..   PassengerId = col_double(),
  ..   Pclass = col_double(),
  ..   Name = col_character(),
  ..   Sex = col_character(),
  ..   Age = col_double(),
  ..   SibSp = col_double(),
  ..   Parch = col_double(),
  ..   Ticket = col_character(),
  ..   Fare = col_double(),
  ..   Cabin = col_character(),
  ..   Embarked = col_character()
  .. )

Data wrangling

Combine the train and test data so as to clean the 2 data sets together. Create a new column - Survived in the test data. The 2 data sets must have the same columns to combine successfully. Create another column as a label in each data set to differentiate between the train and the test data. The label will be used later to split the train from the test data.

# Survived column in test data
test <- test %>% mutate(Survived = NA)

# Observe the column
head(select(test, PassengerId, Survived, everything()))
# A tibble: 6 x 12
  PassengerId Survived Pclass Name    Sex     Age SibSp Parch Ticket  Fare Cabin
        <dbl> <lgl>     <dbl> <chr>   <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
1         892 NA            3 Kelly,~ male   34.5     0     0 330911  7.83 ""   
2         893 NA            3 Wilkes~ fema~  47       1     0 363272  7    ""   
3         894 NA            2 Myles,~ male   62       0     0 240276  9.69 ""   
4         895 NA            3 Wirz, ~ male   27       0     0 315154  8.66 ""   
5         896 NA            3 Hirvon~ fema~  22       1     1 31012~ 12.3  ""   
6         897 NA            3 Svenss~ male   14       0     0 7538    9.22 ""   
# ... with 1 more variable: Embarked <chr>
# Label for the data
train <- mutate(train, Label = TRUE)
test <- mutate(test, Label = FALSE)

# Combine the train and test data
merged <- bind_rows(train, test)
dim(merged)
[1] 1309   13
# Train data has 891 rows while test has 418 rows
sum(891, 418)
[1] 1309
# Observe the tail of the combined data
tail(select(merged,  PassengerId, Survived, Label, everything()))
# A tibble: 6 x 13
  PassengerId Survived Label Pclass Name   Sex     Age SibSp Parch Ticket   Fare
        <dbl>    <dbl> <lgl>  <dbl> <chr>  <chr> <dbl> <dbl> <dbl> <chr>   <dbl>
1        1304       NA FALSE      3 Henri~ fema~  28       0     0 347086   7.78
2        1305       NA FALSE      3 Spect~ male   NA       0     0 A.5. ~   8.05
3        1306       NA FALSE      1 Oliva~ fema~  39       0     0 PC 17~ 109.  
4        1307       NA FALSE      3 Saeth~ male   38.5     0     0 SOTON~   7.25
5        1308       NA FALSE      3 Ware,~ male   NA       0     0 359309   8.05
6        1309       NA FALSE      3 Peter~ male   NA       1     1 2668    22.4 
# ... with 2 more variables: Cabin <chr>, Embarked <chr>
# Detailed data summary
skimr::skim(merged)
Data summary
Name merged
Number of rows 1309
Number of columns 13
_______________________
Column type frequency:
character 5
logical 1
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Name 0 1 12 82 0 1307 0
Sex 0 1 4 6 0 2 0
Ticket 0 1 3 18 0 929 0
Cabin 0 1 0 15 1014 187 0
Embarked 0 1 0 1 2 4 0

Variable type: logical

skim_variable n_missing complete_rate mean count
Label 0 1 0.68 TRU: 891, FAL: 418

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PassengerId 0 1.00 655.00 378.02 1.00 328.0 655.00 982.00 1309.00 ▇▇▇▇▇
Survived 418 0.68 0.38 0.49 0.00 0.0 0.00 1.00 1.00 ▇▁▁▁▅
Pclass 0 1.00 2.29 0.84 1.00 2.0 3.00 3.00 3.00 ▃▁▃▁▇
Age 263 0.80 29.88 14.41 0.17 21.0 28.00 39.00 80.00 ▂▇▅▂▁
SibSp 0 1.00 0.50 1.04 0.00 0.0 0.00 1.00 8.00 ▇▁▁▁▁
Parch 0 1.00 0.39 0.87 0.00 0.0 0.00 0.00 9.00 ▇▁▁▁▁
Fare 1 1.00 33.30 51.76 0.00 7.9 14.45 31.27 512.33 ▇▁▁▁▁

Tidy data

Check for and remove missing values from the combined data, merged. Have a plan on how to replace the missing values. Variables Age, and Fare have NA as missing values apart from the imputed NAs for the column test$Survived. Columns Cabin and Embarked have blank cells as missing values.

# Columns with blanks
merged %>% distinct(Embarked)
# A tibble: 4 x 1
  Embarked
  <chr>   
1 "S"     
2 "C"     
3 "Q"     
4 ""      
merged %>% group_by(Embarked) %>% count(Embarked)
# A tibble: 4 x 2
# Groups:   Embarked [4]
  Embarked     n
  <chr>    <int>
1 ""           2
2 "C"        270
3 "Q"        123
4 "S"        914
merged %>% group_by(Cabin) %>% count(Cabin)
# A tibble: 187 x 2
# Groups:   Cabin [187]
   Cabin     n
   <chr> <int>
 1 ""     1014
 2 "A10"     1
 3 "A11"     1
 4 "A14"     1
 5 "A16"     1
 6 "A18"     1
 7 "A19"     1
 8 "A20"     1
 9 "A21"     1
10 "A23"     1
# ... with 177 more rows
# Total blank values
sum(merged == "", na.rm = TRUE); sum( 1014, 2)
[1] 1016
[1] 1016
# Total NA values
sum(is.na(merged))
[1] 682
# Cells in Embarked column with blanks
which(merged$Embarked == "")
[1]  62 830
# NA count in column Age
merged %>% filter(is.na(Age)) %>% count(Age)
# A tibble: 1 x 2
    Age     n
  <dbl> <int>
1    NA   263
# NA count in column Fare
merged %>% filter(is.na(Fare)) %>% count(Fare)
# A tibble: 1 x 2
   Fare     n
  <dbl> <int>
1    NA     1
# A scrutiny of Name and the Cabin
merged %>% group_by(Name) %>% distinct(Cabin) %>% 
  count(Cabin) %>% arrange(desc(Cabin))
# A tibble: 1,307 x 3
# Groups:   Name [1,307]
   Name                                                Cabin     n
   <chr>                                               <chr> <int>
 1 Blackwell, Mr. Stephen Weart                        T         1
 2 Sandstrom, Miss. Beatrice Irene                     G6        1
 3 Sandstrom, Miss. Marguerite Rut                     G6        1
 4 Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson) G6        1
 5 Strom, Miss. Telma Matilda                          G6        1
 6 Strom, Mrs. Wilhelm (Elna Matilda Persson)          G6        1
 7 Becker, Master. Richard F                           F4        1
 8 Becker, Miss. Marion Louise                         F4        1
 9 Becker, Miss. Ruth Elizabeth                        F4        1
10 Becker, Mrs. Allen Oliver (Nellie E Baumgardner)    F4        1
# ... with 1,297 more rows
merged %>% group_by(Cabin) %>% distinct(Name) %>% 
  count(Cabin) %>% arrange(desc(Cabin))
# A tibble: 187 x 2
# Groups:   Cabin [187]
   Cabin     n
   <chr> <int>
 1 T         1
 2 G6        5
 3 F4        4
 4 F38       1
 5 F33       4
 6 F2        4
 7 F G73     2
 8 F G63     2
 9 F E69     1
10 F E57     1
# ... with 177 more rows


Missingness

The missing values must be replaced. In the Embarked column 2 values are missing which will be replaced by the modal value. A cell has NA value in Fare and it will be replaced by the median of the column. Age has 263 missing values and a model will be used to fill up these values. Cabin has 1014 missing values. Cabin will not be used for the prediction but a scrutiny of Cabin and Name illustrates some interesting details which could be useful for feature engineering. For example, it was observed that there were folks with different last names staying in the same cabin.

# Replace Embarked == "" with the modal value "S"
merged$Embarked[which(merged$Embarked == "")] <- "S"

# Replace Fare == NA with median value
which(is.na(merged$Fare))
[1] 1044
median_fare <- median(merged$Fare, na.rm = TRUE)
merged$Fare[which(is.na(merged$Fare))] <- median_fare

Imputation

Ordinary least squares regression (OLS) or rpart package can be used as models to replace the missing values in the Age variable. OLS can be affected by extreme values i.e outliers. As shown by the summary and the plot there is no value after the lower whisker (no lower outliers but upper outliers are seen).

# Age summary
summary(merged$Age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.17   21.00   28.00   29.88   39.00   80.00     263 
# Visuals
age_spread <- ggplot(merged) + aes(x = "", y = Age, na.rm = TRUE) + geom_boxplot(fill = "lawngreen")
ggplotly(age_spread)
Warning: Removed 263 rows containing non-finite values (stat_boxplot).
# Get the boundaries
ggplot_build(age_spread)$data[[1]][1]
Warning: Removed 263 rows containing non-finite values (stat_boxplot).
  ymin
1 0.17
ggplot_build(age_spread)$data[[1]][5]
Warning: Removed 263 rows containing non-finite values (stat_boxplot).
  ymax
1   66
upper_limit <- boxplot.stats(merged$Age)$stats[5]

# Filter out the outliers
outlier_rm <- merged$Age <= upper_limit
merged %>% filter(outlier_rm)
# A tibble: 1,037 x 13
   PassengerId Survived Pclass Name   Sex     Age SibSp Parch Ticket  Fare Cabin
         <dbl>    <dbl>  <dbl> <chr>  <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
 1           1        0      3 Braun~ male     22     1     0 A/5 2~  7.25 ""   
 2           2        1      1 Cumin~ fema~    38     1     0 PC 17~ 71.3  "C85"
 3           3        1      3 Heikk~ fema~    26     0     0 STON/~  7.92 ""   
 4           4        1      1 Futre~ fema~    35     1     0 113803 53.1  "C12~
 5           5        0      3 Allen~ male     35     0     0 373450  8.05 ""   
 6           7        0      1 McCar~ male     54     0     0 17463  51.9  "E46"
 7           8        0      3 Palss~ male      2     3     1 349909 21.1  ""   
 8           9        1      3 Johns~ fema~    27     0     2 347742 11.1  ""   
 9          10        1      2 Nasse~ fema~    14     1     0 237736 30.1  ""   
10          11        1      3 Sands~ fema~     4     1     1 PP 95~ 16.7  "G6" 
# ... with 1,027 more rows, and 2 more variables: Embarked <chr>, Label <lgl>
# Arrange in order to see the upper limit
merged %>% filter(outlier_rm) %>% arrange(desc(Age))
# A tibble: 1,037 x 13
   PassengerId Survived Pclass Name  Sex     Age SibSp Parch Ticket   Fare Cabin
         <dbl>    <dbl>  <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr>   <dbl> <chr>
 1          34        0      2 Whea~ male     66     0     0 C.A. ~  10.5  ""   
 2          55        0      1 Ostb~ male     65     0     1 113509  62.0  "B30"
 3         281        0      3 Duan~ male     65     0     0 336439   7.75 ""   
 4         457        0      1 Mill~ male     65     0     0 13509   26.6  "E38"
 5         439        0      1 Fort~ male     64     1     4 19950  263    "C23~
 6         546        0      1 Nich~ male     64     0     0 693     26    ""   
 7        1071       NA      1 Comp~ fema~    64     0     2 PC 17~  83.2  "E45"
 8        1128       NA      1 Warr~ male     64     1     0 110813  75.2  "D37"
 9        1197       NA      1 Cros~ fema~    64     1     1 112901  26.6  "B26"
10         276        1      1 Andr~ fema~    63     1     0 13502   78.0  "D7" 
# ... with 1,027 more rows, and 2 more variables: Embarked <chr>, Label <lgl>

Linear model

Tidymodels' linear_reg() is used to make a model to replace the missing values in Age.

# Assign filtered data to a variable
age_filter <- merged %>% 
  filter(outlier_rm) %>% 
  arrange(desc(Age))

# Pick a model
age_model <- linear_reg() %>% 
  set_engine(engine = "lm") %>% 
  set_mode(mode = "regression")

# Fit the model
age_fit <- age_model %>% fit(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked, data = age_filter)

# Filter rows for Age with missing values as the new data for prediction
age_na <- merged %>% 
  filter(is.na(Age)) %>% 
  select(Age, Pclass, Sex, SibSp, Parch, Fare, Embarked)

# Observe
head(age_na)
# A tibble: 6 x 7
    Age Pclass Sex    SibSp Parch  Fare Embarked
  <dbl>  <dbl> <chr>  <dbl> <dbl> <dbl> <chr>   
1    NA      3 male       0     0  8.46 Q       
2    NA      2 male       0     0 13    S       
3    NA      3 female     0     0  7.22 C       
4    NA      3 male       0     0  7.22 C       
5    NA      3 female     0     0  7.88 Q       
6    NA      3 male       0     0  7.90 S       
# Prediction
age_predict <- age_fit %>% 
  predict(new_data=age_na)

# Replace missing values in Age with predicted
merged[is.na(merged$Age), "Age"] <- age_predict

# Verify if missing values are completely replaced
sum(is.na(merged$Age))
[1] 0
# Observe data again to see if all changes are corrected
skimr::skim(merged)
Data summary
Name merged
Number of rows 1309
Number of columns 13
_______________________
Column type frequency:
character 5
logical 1
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Name 0 1 12 82 0 1307 0
Sex 0 1 4 6 0 2 0
Ticket 0 1 3 18 0 929 0
Cabin 0 1 0 15 1014 187 0
Embarked 0 1 1 1 0 3 0

Variable type: logical

skim_variable n_missing complete_rate mean count
Label 0 1 0.68 TRU: 891, FAL: 418

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PassengerId 0 1.00 655.00 378.02 1.00 328.0 655.00 982.00 1309.00 ▇▇▇▇▇
Survived 418 0.68 0.38 0.49 0.00 0.0 0.00 1.00 1.00 ▇▁▁▁▅
Pclass 0 1.00 2.29 0.84 1.00 2.0 3.00 3.00 3.00 ▃▁▃▁▇
Age 0 1.00 29.37 13.37 -2.02 22.0 27.00 36.00 80.00 ▂▇▅▁▁
SibSp 0 1.00 0.50 1.04 0.00 0.0 0.00 1.00 8.00 ▇▁▁▁▁
Parch 0 1.00 0.39 0.87 0.00 0.0 0.00 0.00 9.00 ▇▁▁▁▁
Fare 0 1.00 33.28 51.74 0.00 7.9 14.45 31.27 512.33 ▇▁▁▁▁

Categorical casting

Variables Survived, Pclass, Sex, Embarked must be changed to categorical(factor) variables. The casting of Survived is done after the merged is split into train and test data again.

# Factors
merged$Pclass <- factor(merged$Pclass)
merged$Sex <- factor(merged$Sex)
merged$Embarked <- factor(merged$Embarked)

split the merged data into train and test again

# Train set
train2 <- merged %>% filter(Label == TRUE)
dim(train); dim(train2)
[1] 891  13
[1] 891  13
# Test set
test2 <- merged %>% filter(Label == FALSE)
dim(test); dim(test2)
[1] 418  13
[1] 418  13

Cast Survived into categorical variable

# Factor
train2$Survived <- factor(train2$Survived)

Pre-Process

# Split the train data
train_split <- train2 %>% initial_split(prop = 3/4)
train_split
<Analysis/Assess/Total>
<668/223/891>
# Subset data for training and testing
train_set <- train_split %>% training() %>% glimpse()
Rows: 668
Columns: 13
$ PassengerId <dbl> 19, 301, 509, 813, 761, 864, 473, 544, 452, 678, 144, 450,~
$ Survived    <fct> 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1~
$ Pclass      <fct> 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1~
$ Name        <chr> "Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",~
$ Sex         <fct> female, female, male, male, male, female, female, male, ma~
$ Age         <dbl> 31.000000, 26.780351, 28.000000, 35.000000, 26.985769, -2.~
$ SibSp       <dbl> 1, 0, 0, 0, 0, 8, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1~
$ Parch       <dbl> 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0~
$ Ticket      <chr> "345763", "9234", "C 4001", "28206", "358585", "CA. 2343",~
$ Fare        <dbl> 18.0000, 7.7500, 22.5250, 10.5000, 14.5000, 69.5500, 27.75~
$ Cabin       <chr> "", "", "", "", "", "", "", "", "", "", "", "C104", "", ""~
$ Embarked    <fct> S, Q, S, S, S, S, S, S, S, S, Q, S, S, S, S, S, S, S, S, S~
$ Label       <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE~
test_set <- train_split %>% testing() %>% glimpse()
Rows: 223
Columns: 13
$ PassengerId <dbl> 1, 5, 6, 7, 8, 12, 23, 24, 26, 31, 34, 47, 53, 54, 58, 62,~
$ Survived    <fct> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0~
$ Pclass      <fct> 3, 3, 3, 1, 3, 1, 3, 1, 3, 1, 2, 3, 1, 2, 3, 1, 3, 2, 3, 3~
$ Name        <chr> "Braund, Mr. Owen Harris", "Allen, Mr. William Henry", "Mo~
$ Sex         <fct> male, male, male, male, male, female, female, male, female~
$ Age         <dbl> 22.00000, 35.00000, 29.24580, 54.00000, 2.00000, 58.00000,~
$ SibSp       <dbl> 1, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0~
$ Parch       <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
$ Ticket      <chr> "A/5 21171", "373450", "330877", "17463", "349909", "11378~
$ Fare        <dbl> 7.2500, 8.0500, 8.4583, 51.8625, 21.0750, 26.5500, 8.0292,~
$ Cabin       <chr> "", "", "", "E46", "", "C103", "", "A6", "", "", "", "", "~
$ Embarked    <fct> S, S, Q, S, S, S, Q, S, S, C, S, Q, C, S, C, S, S, S, S, S~
$ Label       <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE~

Prep the Recipe

# Recipes
train_recipe <- train_set %>% recipe(Survived ~ Pclass + Age + Sex + SibSp + Parch + Fare + Embarked) %>% 
  step_corr(all_numeric_predictors()) %>% 
  step_center(all_numeric_predictors(), -all_outcomes()) %>% 
  step_scale(all_numeric_predictors(), -all_outcomes()) %>%
  prep()

# View
train_recipe
Recipe

Inputs:

      role #variables
   outcome          1
 predictor          7

Training data contained 668 data points and no missing data.

Operations:

Correlation filter on <none> [trained]
Centering for Age, SibSp, Parch, Fare [trained]
Scaling for Age, SibSp, Parch, Fare [trained]

Bake and Juice

# Bake test set with the recipe
test_set_bake <- train_recipe %>% bake(test_set) %>% glimpse()
Rows: 223
Columns: 8
$ Pclass   <fct> 3, 3, 3, 1, 3, 1, 3, 1, 3, 1, 2, 3, 1, 2, 3, 1, 3, 2, 3, 3, 3~
$ Age      <dbl> -0.534584312, 0.450683427, 0.014573507, 1.890690123, -2.05038~
$ Sex      <fct> male, male, male, male, male, female, female, male, female, m~
$ SibSp    <dbl> 0.3825483, -0.4692593, -0.4692593, -0.4692593, 2.0861635, -0.~
$ Parch    <dbl> -0.4736619, -0.4736619, -0.4736619, -0.4736619, 0.7203236, -0~
$ Fare     <dbl> -0.50371140, -0.48816046, -0.48022365, 0.36349665, -0.2349716~
$ Embarked <fct> S, S, Q, S, S, S, Q, S, S, C, S, Q, C, S, C, S, S, S, S, S, S~
$ Survived <fct> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1~
# Juice the recipe
train_juice <- train_recipe %>% juice() %>% glimpse()
Rows: 668
Columns: 8
$ Pclass   <fct> 3, 3, 3, 2, 3, 3, 2, 2, 3, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1, 3~
$ Age      <dbl> 0.14752412, -0.17228235, -0.07984536, 0.45068343, -0.15671374~
$ Sex      <fct> female, female, male, male, male, female, female, male, male,~
$ SibSp    <dbl> 0.3825483, -0.4692593, -0.4692593, -0.4692593, -0.4692593, 6.~
$ Parch    <dbl> -0.4736619, -0.4736619, -0.4736619, -0.4736619, -0.4736619, 1~
$ Fare     <dbl> -0.29474561, -0.49399207, -0.20678559, -0.44053570, -0.362780~
$ Embarked <fct> S, Q, S, S, S, S, S, S, S, S, Q, S, S, S, S, S, S, S, S, S, C~
$ Survived <fct> 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0~

Train model

# Train the model
train_model <- rand_forest(trees = 100) %>% 
  set_engine(engine = "randomForest") %>% 
  set_mode(mode = "classification") %>% 
  fit(Survived ~ ., data = train_juice)

Predictions

# Predict the split set
train_model %>% predict(test_set_bake)
# A tibble: 223 x 1
   .pred_class
   <fct>      
 1 0          
 2 0          
 3 0          
 4 0          
 5 0          
 6 1          
 7 1          
 8 0          
 9 0          
10 0          
# ... with 213 more rows
# Remove Survived column from the test data 
test2 <- select(test2, -Survived)

# Prediction
train_model %>% predict(test2)
# A tibble: 418 x 1
   .pred_class
   <fct>      
 1 0          
 2 1          
 3 0          
 4 0          
 5 0          
 6 0          
 7 1          
 8 0          
 9 1          
10 0          
# ... with 408 more rows

csv file

# Combine the variables for the file
my_submit <- bind_cols(PassengerId = test$PassengerId, Survived = train_model %>% predict(test2))

names(my_submit)
[1] "PassengerId" ".pred_class"
# Change .pred_class to Survived
my_submit <- my_submit %>% rename(Survived = .pred_class)
names(my_submit)
[1] "PassengerId" "Survived"   
# Write the file
write_csv(my_submit, "my_submit.csv")

Model validation

# Metrics
train_model %>% 
  predict(test_set_bake) %>% 
  bind_cols(test_set_bake) %>% 
  metrics(truth = Survived, estimate = .pred_class)
# A tibble: 2 x 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy binary         0.843
2 kap      binary         0.657

Conclusion

The accuracy of the model is 83.86% and the kappa score is 63.37%. The model scored over 77% in kaggle