Titanic project overview

The training set should be used to build your machine learning models. For the training set, we provide the outcome (also known as the “ground truth”) for each passenger. Your model will be based on “features” like passengers’ gender and class. You can also use feature engineering to create new features.

The test set should be used to see how well your model performs on unseen data. For the test set, we do not provide the ground truth for each passenger. It is your job to predict these outcomes. For each passenger in the test set, use the model you trained to predict whether or not they survived the sinking of the Titanic.

We also include gender_submission.csv, a set of predictions that assume all and only female passengers survive, as an example of what a submission file should look like.

Data Dictionary

Variable | Definition (Key)

survival | Survival (0 = No, 1 = Yes)

pclass | Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)

sex | Sex

Age | Age in years

sibsp | # of siblings / spouses aboard the Titanic

parch | # of parents / children aboard the Titanic

ticket | Ticket number

fare | Passenger fare

cabin | Cabin number

embarked | Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton

Variable Notes

pclass: A proxy for socio-economic status (SES) 1st = Upper 2nd = Middle 3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

sibsp: The dataset defines family relations in this way

Sibling = brother, sister, stepbrother, stepsister

Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way

Parent = mother, father

Child = daughter, son, stepdaughter, stepson

Some children traveled only with a nanny, therefore parch=0 for them.

Libraries

library(readr)
library(skimr)
library(ggplot2)
library(DataExplorer)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble  3.1.8     ✔ stringr 1.4.0
## ✔ tidyr   1.2.0     ✔ forcats 0.5.1
## ✔ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.0     ✔ rsample      1.1.0
## ✔ dials        1.0.0     ✔ tune         1.0.0
## ✔ infer        1.0.3     ✔ workflows    1.0.0
## ✔ modeldata    1.0.0     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.1     ✔ yardstick    1.0.0
## ✔ recipes      1.0.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(baguette)

Import and overview data

# Load the labelled training data (includes the Survived outcome column).
titanic_train<-read_csv('train.csv')
## Rows: 891 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the test data; it has no Survived column, which is what gets predicted.
titanic_test<-read_csv('test.csv')
## Rows: 418 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (6): PassengerId, Pclass, Age, SibSp, Parch, Fare
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the type and first few values of every column.
str(titanic_train)
## spec_tbl_df [891 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ PassengerId: num [1:891] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : num [1:891] 0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : num [1:891] 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr [1:891] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr [1:891] "male" "female" "female" "female" ...
##  $ Age        : num [1:891] 22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : num [1:891] 1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : num [1:891] 0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr [1:891] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num [1:891] 7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr [1:891] NA "C85" NA "C123" ...
##  $ Embarked   : chr [1:891] "S" "C" "S" "S" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   PassengerId = col_double(),
##   ..   Survived = col_double(),
##   ..   Pclass = col_double(),
##   ..   Name = col_character(),
##   ..   Sex = col_character(),
##   ..   Age = col_double(),
##   ..   SibSp = col_double(),
##   ..   Parch = col_double(),
##   ..   Ticket = col_character(),
##   ..   Fare = col_double(),
##   ..   Cabin = col_character(),
##   ..   Embarked = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Preview the first rows of the training data.
head(titanic_train)
## # A tibble: 6 × 12
##   PassengerId Survived Pclass Name    Sex     Age SibSp Parch Ticket  Fare Cabin
##         <dbl>    <dbl>  <dbl> <chr>   <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
## 1           1        0      3 Braund… male     22     1     0 A/5 2…  7.25 <NA> 
## 2           2        1      1 Cuming… fema…    38     1     0 PC 17… 71.3  C85  
## 3           3        1      3 Heikki… fema…    26     0     0 STON/…  7.92 <NA> 
## 4           4        1      1 Futrel… fema…    35     1     0 113803 53.1  C123 
## 5           5        0      3 Allen,… male     35     0     0 373450  8.05 <NA> 
## 6           6        0      3 Moran,… male     NA     0     0 330877  8.46 <NA> 
## # … with 1 more variable: Embarked <chr>
# Per-column summary statistics; numeric columns also report their NA count.
summary(titanic_train)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 
# skimr overview: missing counts, completeness and distribution per column.
skim(titanic_train)
Data summary
Name titanic_train
Number of rows 891
Number of columns 12
_______________________
Column type frequency:
character 5
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Name 0 1.00 12 82 0 891 0
Sex 0 1.00 4 6 0 2 0
Ticket 0 1.00 3 18 0 681 0
Cabin 687 0.23 1 15 0 147 0
Embarked 2 1.00 1 1 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
PassengerId 0 1.0 446.00 257.35 1.00 223.50 446.00 668.5 891.00 ▇▇▇▇▇
Survived 0 1.0 0.38 0.49 0.00 0.00 0.00 1.0 1.00 ▇▁▁▁▅
Pclass 0 1.0 2.31 0.84 1.00 2.00 3.00 3.0 3.00 ▃▁▃▁▇
Age 177 0.8 29.70 14.53 0.42 20.12 28.00 38.0 80.00 ▂▇▅▂▁
SibSp 0 1.0 0.52 1.10 0.00 0.00 0.00 1.0 8.00 ▇▁▁▁▁
Parch 0 1.0 0.38 0.81 0.00 0.00 0.00 0.0 6.00 ▇▁▁▁▁
Fare 0 1.0 32.20 49.69 0.00 7.91 14.45 31.0 512.33 ▇▁▁▁▁
# Plot how much of each column is missing, first for the training data ...
plot_missing(titanic_train)

# ... then for the test data.
plot_missing(titanic_test)

An overview of the Titanic training data shows that it contains 891 observations and 12 variables. Variables that are not needed for modeling will be excluded: passenger ID, name, ticket number, and cabin number. Also, summary() and str() show that Pclass is numeric, so it needs to be converted to a categorical variable, replacing 1, 2, 3 with upper, middle, low. plot_missing() shows that Embarked and Age contain NA values (as does Cabin, which is dropped). The NA values of Age will be replaced by the mean, and the NA values of Embarked will be replaced by the mode.

Cleaning data

# Remove the identifier-like columns that are not used as predictors, and
# store the 0/1 outcome as a factor.
titanic_train <- titanic_train %>%
  select(-c(PassengerId, Name, Ticket, Cabin)) %>%
  mutate(Survived = factor(Survived))

# Relabel the numeric ticket class (1 / 2 / 3) as a factor with descriptive
# labels, for both the training and the test data.
#
# Each code is mapped explicitly, replacing the chain of ifelse() calls that
# only worked because Pclass was silently coerced to character between steps.
# Levels are listed as low, middle, upper so the level order is the same
# alphabetical order factor() produced before. Any code other than 1, 2 or 3
# becomes NA.
label_pclass <- function(pclass) {
  factor(pclass, levels = c(3, 2, 1), labels = c("low", "middle", "upper"))
}

titanic_train$Pclass <- label_pclass(titanic_train$Pclass)
titanic_test$Pclass <- label_pclass(titanic_test$Pclass)

# Fill missing ages with the mean of the observed ages.
titanic_train$Age[is.na(titanic_train$Age)] <-
  mean(titanic_train$Age, na.rm = TRUE)

# Fill missing ports with the most frequent port. base::mode() returns the
# storage mode of its argument (the string "character" here), not the
# statistical mode, so the most common value is taken from a frequency table
# instead. table() leaves NA out of the counts by default.
embarked_counts <- table(titanic_train$Embarked)
most_common_port <- names(embarked_counts)[which.max(embarked_counts)]
titanic_train$Embarked[is.na(titanic_train$Embarked)] <- most_common_port

# Re-plot the missing-value profile to confirm no gaps remain.
plot_missing(titanic_train)

# Fill the gaps in the test set's Age and Fare with that column's own mean,
# then re-plot the missing-value profile.
titanic_test$Age <- replace(
  titanic_test$Age,
  is.na(titanic_test$Age),
  mean(titanic_test$Age, na.rm = TRUE)
)
titanic_test$Fare <- replace(
  titanic_test$Fare,
  is.na(titanic_test$Fare),
  mean(titanic_test$Fare, na.rm = TRUE)
)
plot_missing(titanic_test)

The unnecessary columns have been dropped, and the NAs have been replaced by either mean or mode values. The Pclass values have also been successfully replaced by ‘low’, ‘middle’, and ‘upper’. These are crucial steps for the further analysis and modeling.

Exploratory data analysis

# Column-wise proportions: share of non-survivors / survivors within each sex.
table(titanic_train$Survived, titanic_train$Sex) %>%
  prop.table(margin = 2)
##    
##        female      male
##   0 0.2579618 0.8110919
##   1 0.7420382 0.1889081

The table above shows that only about 19 percent of males survived, while about 74 percent of females survived. The survival rate of men is very low compared to that of women. Let’s visualize and compare the survival rates.

# Survival counts by sex (stacked bars)
ggplot(titanic_train, aes(x = Sex, fill = Survived)) +
  geom_bar() +
  labs(title = "female vs male in survival")

# Same comparison as within-sex proportions
ggplot(titanic_train, aes(x = Sex, fill = Survived)) +
  geom_bar(position = "fill") +
  labs(
    title = "The proportion of female vs male in survival",
    y = "survival rate"
  )

As the plots above show, females have a much higher survival rate than males. So, do age and Pclass also affect the survival rate? Let’s make some plots.

# Histogram of passenger age (default binning)
ggplot(titanic_train, aes(x = Age)) +
  geom_histogram() +
  labs(title = "distribution of age")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Smoothed age distribution
ggplot(titanic_train, aes(x = Age)) +
  geom_density() +
  labs(title = "density of age")

# Age by survival outcome, with the axes flipped
ggplot(titanic_train, aes(x = Age, y = Survived)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "distribution survival by age")

# Passengers aged 20 to 40 (inclusive) and their share of all passengers
age2040 <- filter(titanic_train, Age >= 20, Age <= 40)
nrow(age2040) / nrow(titanic_train)
## [1] 0.647587
# Survivors aged 20 to 40 (inclusive)
survivedAge2040 <- filter(titanic_train, Age >= 20, Age <= 40, Survived == 1)

# All survivors, and the share of them who were aged 20 to 40
survivedAll <- filter(titanic_train, Survived == 1)
nrow(survivedAge2040) / nrow(survivedAll)
## [1] 0.6081871

The histogram and boxplot above show that passengers aged 20 to 40 make up about 65% of all passengers, and about 61% of the survivors are aged 20 to 40. Therefore, it cannot be confirmed whether the large number of survivors aged 20 to 40 is because they were young or simply because of the high proportion of passengers aged 20 to 40. Let me explore Pclass further.

# Within-class survival proportions
ggplot(titanic_train, aes(x = Pclass, fill = Survived)) +
  geom_bar(position = "fill") +
  labs(title = "proportion of survival rate by Pclass", y = "survival rate")

As the plot above shows, the survival rate of the upper class is over 63%, the survival rate of the middle class is slightly below 50%, and the survival rate of the low class is about 25%. This means that Pclass, which is a proxy for socio-economic status, affects the survival rate. To sum up, the higher the class, the higher the survival rate.

# Fare distribution with 50 bins
ggplot(titanic_train, aes(x = Fare)) +
  geom_histogram(bins = 50)

# Five-number summary plus mean of Fare
titanic_train$Fare %>% summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    7.91   14.45   32.20   31.00  512.33

The histogram shows that the fare distribution is right-skewed; the maximum fare is 512.33 and the minimum fare is 0.

Modeling

For this classification task, two different tree-based models will be applied: a classification tree and a random forest.

Classification Tree

# Model specification: a decision tree for classification on the rpart engine
class_spec <- set_mode(
  set_engine(decision_tree(), "rpart"),
  "classification"
)

# Fit on all remaining training columns and print the resulting tree
class_tree <- fit(class_spec, Survived ~ ., data = titanic_train)
print(class_tree)
## parsnip model object
## 
## n= 891 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 891 342 0 (0.61616162 0.38383838)  
##    2) Sex=male 577 109 0 (0.81109185 0.18890815)  
##      4) Age>=6.5 553  93 0 (0.83182640 0.16817360) *
##      5) Age< 6.5 24   8 1 (0.33333333 0.66666667)  
##       10) SibSp>=2.5 9   1 0 (0.88888889 0.11111111) *
##       11) SibSp< 2.5 15   0 1 (0.00000000 1.00000000) *
##    3) Sex=female 314  81 1 (0.25796178 0.74203822)  
##      6) Pclass=low 144  72 0 (0.50000000 0.50000000)  
##       12) Fare>=23.35 27   3 0 (0.88888889 0.11111111) *
##       13) Fare< 23.35 117  48 1 (0.41025641 0.58974359)  
##         26) Age>=36.5 7   1 0 (0.85714286 0.14285714) *
##         27) Age< 36.5 110  42 1 (0.38181818 0.61818182) *
##      7) Pclass=middle,upper 170   9 1 (0.05294118 0.94705882) *
# Predict the training rows the tree was fitted on
class_tree_pred <- class_tree %>%
  predict(new_data = titanic_train)

# Put the observed outcome next to the predicted class
class_tree_pred_combined <- mutate(
  class_tree_pred,
  true_class = titanic_train$Survived
)

# Confusion matrix of predicted vs. observed class
class_tree_pred_combined %>%
  conf_mat(truth = true_class, estimate = .pred_class)
##           Truth
## Prediction   0   1
##          0 498  98
##          1  51 244
# Share of training rows the tree classifies correctly
class_tree_pred_combined %>%
  accuracy(truth = true_class, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.833
# accuracy of 83.27% on training set.

The accuracy of the classification tree on the training set is 83.27%.

Random forest

# Seed the RNG so the fitted forest is reproducible.
set.seed(99)

# Random forest specification with 100 trees on the ranger engine. The
# argument is spelled out as `trees`; the previous `tree = 100` only worked
# through partial argument matching.
spec <- rand_forest(trees = 100) %>%
  set_mode("classification") %>%
  set_engine("ranger")

# Fit on all remaining training columns.
rf <- spec %>%
  fit(Survived ~ ., data = titanic_train)

# Predicted class for every passenger in the test set.
predict(rf, new_data = titanic_test)
## # A tibble: 418 × 1
##    .pred_class
##    <fct>      
##  1 0          
##  2 0          
##  3 0          
##  4 0          
##  5 0          
##  6 0          
##  7 1          
##  8 0          
##  9 1          
## 10 0          
## # … with 408 more rows
# Predict the training rows the forest was fitted on
rf_pred <- rf %>%
  predict(new_data = titanic_train)

# Put the observed outcome next to the predicted class
rf_combined <- mutate(rf_pred, true_class = titanic_train$Survived)

# Confusion matrix of predicted vs. observed class
rf_combined %>%
  conf_mat(truth = true_class, estimate = .pred_class)
##           Truth
## Prediction   0   1
##          0 528  69
##          1  21 273
# Share of training rows the forest classifies correctly
rf_combined %>%
  accuracy(truth = true_class, estimate = .pred_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.899
# Variable importance.
# ranger only records importance when asked to, so a forest is refitted with
# impurity importance switched on. It is seeded and uses 100 trees so the plot
# is reproducible and is based on a forest of the same size as the one
# evaluated above, rather than an unseeded forest with the default tree count.
set.seed(99)
rand_forest(mode = "classification", trees = 100) %>%
  set_engine("ranger", importance = "impurity") %>%
  fit(Survived ~ ., data = titanic_train) %>%
  vip::vip()

The accuracy of the random forest on the training set is 89.89%, which is better than the classification tree's 83.27%. According to the random forest importance plot, sex plays the key role among the factors for survival, which confirms the exploratory analysis above showing that females had a much higher chance of survival than males. Sex is followed by Fare, Age, and Pclass, so I can assume that the higher the ticket class, the higher the chance of survival.

Fit model to predict test set

# Class predictions of the random forest for the test passengers
forest_predict <- rf %>%
  predict(new_data = titanic_test, type = "class")
forest_predict
## # A tibble: 418 × 1
##    .pred_class
##    <fct>      
##  1 0          
##  2 0          
##  3 0          
##  4 0          
##  5 0          
##  6 0          
##  7 1          
##  8 0          
##  9 1          
## 10 0          
## # … with 408 more rows
# Class predictions of the classification tree for the test passengers
tree_predict <- class_tree %>%
  predict(new_data = titanic_test, type = "class")
tree_predict
## # A tibble: 418 × 1
##    .pred_class
##    <fct>      
##  1 0          
##  2 0          
##  3 0          
##  4 0          
##  5 1          
##  6 0          
##  7 1          
##  8 0          
##  9 1          
## 10 0          
## # … with 408 more rows

Submission

# Build the submission table: one row per test passenger with the random
# forest's predicted class. The factor prediction is converted to integer 0/1
# so write.csv() writes it as a plain number instead of a quoted string.
submission <- data.frame(
  PassengerId = titanic_test$PassengerId,
  Survived = as.integer(as.character(forest_predict$.pred_class))
)

# write.csv() always writes a header row and a comma separator; it ignores
# `col.names` and `sep` with a warning, so they are not passed.
write.csv(submission, file = "final_test.csv", row.names = FALSE)
# Read the written file back to check its columns and first rows.
read_csv('final_test.csv')
## Rows: 418 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): PassengerId, Survived
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 418 × 2
##    PassengerId Survived
##          <dbl>    <dbl>
##  1         892        0
##  2         893        0
##  3         894        0
##  4         895        0
##  5         896        0
##  6         897        0
##  7         898        1
##  8         899        0
##  9         900        1
## 10         901        0
## # … with 408 more rows