The training set should be used to build your machine learning models. For the training set, we provide the outcome (also known as the “ground truth”) for each passenger. Your model will be based on “features” like passengers’ gender and class. You can also use feature engineering to create new features.
The test set should be used to see how well your model performs on unseen data. For the test set, we do not provide the ground truth for each passenger. It is your job to predict these outcomes. For each passenger in the test set, use the model you trained to predict whether or not they survived the sinking of the Titanic.
We also include gender_submission.csv, a set of predictions that assume all and only female passengers survive, as an example of what a submission file should look like.
Variable | Definition (Key)
survival | Survival (0 = No, 1 = Yes)
pclass | Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
sex | Sex
Age | Age in years
sibsp | # of siblings / spouses aboard the Titanic
parch | # of parents / children aboard the Titanic
ticket | Ticket number
fare | Passenger fare
cabin | Cabin number
embarked | Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton
Variable Notes
pclass: A proxy for socio-economic status (SES) 1st = Upper 2nd = Middle 3rd = Lower
age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5
sibsp: The dataset defines family relations in this way
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)
parch: The dataset defines family relations in this way
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children traveled only with a nanny, therefore parch=0 for them.
library(readr)
library(skimr)
library(ggplot2)
library(DataExplorer)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ tibble 3.1.8 ✔ stringr 1.4.0
## ✔ tidyr 1.2.0 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.0 ✔ rsample 1.1.0
## ✔ dials 1.0.0 ✔ tune 1.0.0
## ✔ infer 1.0.3 ✔ workflows 1.0.0
## ✔ modeldata 1.0.0 ✔ workflowsets 1.0.0
## ✔ parsnip 1.0.1 ✔ yardstick 1.0.0
## ✔ recipes 1.0.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
library(baguette)
# Load the labelled training data; the path is relative to the working directory.
titanic_train<-read_csv('train.csv')
## Rows: 891 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Load the test data; the path is relative to the working directory.
titanic_test<-read_csv('test.csv')
## Rows: 418 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (6): PassengerId, Pclass, Age, SibSp, Parch, Fare
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the structure (column names, types, first values) of the training data.
str(titanic_train)
## spec_tbl_df [891 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ PassengerId: num [1:891] 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : num [1:891] 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : num [1:891] 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr [1:891] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr [1:891] "male" "female" "female" "female" ...
## $ Age : num [1:891] 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : num [1:891] 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : num [1:891] 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr [1:891] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num [1:891] 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr [1:891] NA "C85" NA "C123" ...
## $ Embarked : chr [1:891] "S" "C" "S" "S" ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_double(),
## .. Survived = col_double(),
## .. Pclass = col_double(),
## .. Name = col_character(),
## .. Sex = col_character(),
## .. Age = col_double(),
## .. SibSp = col_double(),
## .. Parch = col_double(),
## .. Ticket = col_character(),
## .. Fare = col_double(),
## .. Cabin = col_character(),
## .. Embarked = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Preview the first rows of the training data.
head(titanic_train)
## # A tibble: 6 × 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 0 3 Braund… male 22 1 0 A/5 2… 7.25 <NA>
## 2 2 1 1 Cuming… fema… 38 1 0 PC 17… 71.3 C85
## 3 3 1 3 Heikki… fema… 26 0 0 STON/… 7.92 <NA>
## 4 4 1 1 Futrel… fema… 35 1 0 113803 53.1 C123
## 5 5 0 3 Allen,… male 35 0 0 373450 8.05 <NA>
## 6 6 0 3 Moran,… male NA 0 0 330877 8.46 <NA>
## # … with 1 more variable: Embarked <chr>
# Per-column summary statistics, including NA counts for numeric columns.
summary(titanic_train)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Per-column overview grouped by variable type (missing counts, quantiles).
skim(titanic_train)
| Name | titanic_train |
| Number of rows | 891 |
| Number of columns | 12 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Name | 0 | 1.00 | 12 | 82 | 0 | 891 | 0 |
| Sex | 0 | 1.00 | 4 | 6 | 0 | 2 | 0 |
| Ticket | 0 | 1.00 | 3 | 18 | 0 | 681 | 0 |
| Cabin | 687 | 0.23 | 1 | 15 | 0 | 147 | 0 |
| Embarked | 2 | 1.00 | 1 | 1 | 0 | 3 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| PassengerId | 0 | 1.0 | 446.00 | 257.35 | 1.00 | 223.50 | 446.00 | 668.5 | 891.00 | ▇▇▇▇▇ |
| Survived | 0 | 1.0 | 0.38 | 0.49 | 0.00 | 0.00 | 0.00 | 1.0 | 1.00 | ▇▁▁▁▅ |
| Pclass | 0 | 1.0 | 2.31 | 0.84 | 1.00 | 2.00 | 3.00 | 3.0 | 3.00 | ▃▁▃▁▇ |
| Age | 177 | 0.8 | 29.70 | 14.53 | 0.42 | 20.12 | 28.00 | 38.0 | 80.00 | ▂▇▅▂▁ |
| SibSp | 0 | 1.0 | 0.52 | 1.10 | 0.00 | 0.00 | 0.00 | 1.0 | 8.00 | ▇▁▁▁▁ |
| Parch | 0 | 1.0 | 0.38 | 0.81 | 0.00 | 0.00 | 0.00 | 0.0 | 6.00 | ▇▁▁▁▁ |
| Fare | 0 | 1.0 | 32.20 | 49.69 | 0.00 | 7.91 | 14.45 | 31.0 | 512.33 | ▇▁▁▁▁ |
# Visualize missing values per column in the raw training and test data.
plot_missing(titanic_train)
plot_missing(titanic_test)
An overview of the Titanic training data shows that it has 891 observations and 12 variables. The variables that are not needed for modeling will be excluded: passenger ID, name, ticket number, and cabin number. Also, summary() and str() show that Pclass is numeric, so it needs to be converted to a categorical variable, with 1, 2, 3 replaced by upper, middle, low. The plot_missing() output shows that Embarked and Age contain NA values (Cabin does too, but it is dropped). The NAs in Age will be replaced by the mean value, and the NAs in Embarked will be replaced by the mode.
# Drop identifier-like columns that are not used as predictors.
titanic_train <- subset(
  titanic_train,
  select = -c(PassengerId, Name, Ticket, Cabin)
)

# The outcome has to be a factor for classification models.
titanic_train$Survived <- factor(titanic_train$Survived)

# Recode ticket class 1/2/3 as "upper"/"middle"/"low".
#
# pclass: vector of ticket classes coded 1, 2 or 3.
# Returns a factor with levels "low", "middle", "upper" (the same level order
# the data previously got from alphabetical sorting). Any value other than
# 1, 2 or 3 becomes NA. The levels are fixed explicitly so that the training
# and test sets always share the same factor levels, even if one of them
# happens to lack a class.
recode_pclass <- function(pclass) {
  class_labels <- c("upper", "middle", "low")
  factor(
    class_labels[match(pclass, 1:3)],
    levels = c("low", "middle", "upper")
  )
}

titanic_train$Pclass <- recode_pclass(titanic_train$Pclass)
titanic_test$Pclass <- recode_pclass(titanic_test$Pclass)
# Impute missing values: numeric columns with their mean, Embarked with its
# most frequent value.
titanic_train$Age[is.na(titanic_train$Age)] <-
  mean(titanic_train$Age, na.rm = TRUE)

# base::mode() returns the storage mode of an object (here the string
# "character"), not the statistical mode, so the most frequent port is taken
# from a frequency table instead (table() ignores NA by default).
embarked_mode <- names(which.max(table(titanic_train$Embarked)))
titanic_train$Embarked[is.na(titanic_train$Embarked)] <- embarked_mode
plot_missing(titanic_train)

# NOTE(review): the test set is imputed with its own means rather than the
# training means; confirm that this is intended.
titanic_test$Age[is.na(titanic_test$Age)] <-
  mean(titanic_test$Age, na.rm = TRUE)
titanic_test$Fare[is.na(titanic_test$Fare)] <-
  mean(titanic_test$Fare, na.rm = TRUE)
plot_missing(titanic_test)
The unnecessary columns have been dropped, and the NAs have been replaced by either the mean or the mode. The Pclass values have also been replaced by ‘low’, ‘middle’, and ‘upper’. These are crucial steps for the further analysis and modeling.
# Survival proportions within each sex (margin 2 = each column sums to 1).
prop.table(table(titanic_train$Survived,titanic_train$Sex),2)
##
## female male
## 0 0.2579618 0.8110919
## 1 0.7420382 0.1889081
The table above shows that only about 19 percent of males survived, while about 74 percent of females survived. The survival rate of men is very low compared to that of women. Let’s visualize and compare the survival rates.
# Survival by sex: absolute counts first, then within-sex proportions.
sex_survival_plot <- ggplot(titanic_train, aes(x = Sex, fill = Survived))

sex_survival_plot +
  geom_bar() +
  labs(title = "female vs male in survival")

sex_survival_plot +
  geom_bar(position = "fill") +
  labs(
    title = "The proportion of female vs male in survival",
    y = "survival rate"
  )
As the plots above show, females have a much higher survival rate than males. So, do age and Pclass also affect the survival rate? Let’s make some plots.
# Histogram of passenger age using the default binning.
age_axis <- aes(x = Age)
ggplot(titanic_train, age_axis) +
  geom_histogram() +
  labs(title = "distribution of age")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density estimate of passenger age.
ggplot(titanic_train, aes(x = Age)) +
  geom_density() +
  labs(title = "density of age")

# Age distribution per survival outcome, drawn with flipped coordinates.
ggplot(titanic_train, aes(x = Age, y = Survived)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "distribution survival by age")
# Passengers aged 20 to 40 (both bounds inclusive) and their share of all
# passengers.
age2040 <- filter(titanic_train, Age >= 20, Age <= 40)
nrow(age2040) / nrow(titanic_train)
## [1] 0.647587
# Survivors aged 20 to 40 (both bounds inclusive).
survivedAge2040 <- filter(titanic_train, Age >= 20, Age <= 40, Survived == 1)
# All survivors, regardless of age.
survivedAll <- filter(titanic_train, Survived == 1)
# Share of survivors who were aged 20 to 40.
nrow(survivedAge2040) / nrow(survivedAll)
## [1] 0.6081871
The histogram and the calculations above show that passengers aged 20 to 40 make up about 65% of all passengers and about 61% of the survivors. Therefore, it cannot be confirmed whether the large number of survivors aged 20 to 40 is because they were young or simply because of the high proportion of passengers in that age range. Note also that the 177 missing ages were imputed with the mean (about 29.7), which falls inside this range and inflates both shares. Let me explore Pclass further.
# Within-class survival proportions for each ticket class.
ggplot(titanic_train, aes(x = Pclass, fill = Survived)) +
  geom_bar(position = "fill") +
  labs(
    title = "proportion of survival rate by Pclass",
    y = "survival rate"
  )
As the plot above shows, the survival rate of the upper class is about 63%, the survival rate of the middle class is slightly below 50%, and the survival rate of the low class is about 25%. This means that Pclass, which is a proxy for socio-economic status, affects the survival rate. To sum up, the higher the class, the higher the survival rate.
# Fare distribution: 50-bin histogram followed by the five-number summary.
ggplot(titanic_train, aes(x = Fare)) +
  geom_histogram(bins = 50)
summary(titanic_train[["Fare"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 7.91 14.45 32.20 31.00 512.33
The histogram shows that the fare distribution is right-skewed; the maximum fare is 512.33 and the minimum fare is 0.
In this classification modeling, two different tree-based models will be applied: a classification tree and a random forest.
# Classification tree (rpart engine) using every remaining column as predictor.
class_spec <- decision_tree(mode = "classification") %>%
  set_engine("rpart")
class_tree <- fit(class_spec, Survived ~ ., data = titanic_train)
print(class_tree)
## parsnip model object
##
## n= 891
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 891 342 0 (0.61616162 0.38383838)
## 2) Sex=male 577 109 0 (0.81109185 0.18890815)
## 4) Age>=6.5 553 93 0 (0.83182640 0.16817360) *
## 5) Age< 6.5 24 8 1 (0.33333333 0.66666667)
## 10) SibSp>=2.5 9 1 0 (0.88888889 0.11111111) *
## 11) SibSp< 2.5 15 0 1 (0.00000000 1.00000000) *
## 3) Sex=female 314 81 1 (0.25796178 0.74203822)
## 6) Pclass=low 144 72 0 (0.50000000 0.50000000)
## 12) Fare>=23.35 27 3 0 (0.88888889 0.11111111) *
## 13) Fare< 23.35 117 48 1 (0.41025641 0.58974359)
## 26) Age>=36.5 7 1 0 (0.85714286 0.14285714) *
## 27) Age< 36.5 110 42 1 (0.38181818 0.61818182) *
## 7) Pclass=middle,upper 170 9 1 (0.05294118 0.94705882) *
# Predict on the training data and attach the observed outcome.
class_tree_pred <- predict(class_tree, new_data = titanic_train)
class_tree_pred_combined <- mutate(
  class_tree_pred,
  true_class = titanic_train$Survived
)
# Confusion matrix of predicted vs. observed survival (training data).
conf_mat(
  class_tree_pred_combined,
  truth = true_class,
  estimate = .pred_class
)
## Truth
## Prediction 0 1
## 0 498 98
## 1 51 244
# Training-set accuracy of the classification tree.
accuracy(
  class_tree_pred_combined,
  truth = true_class,
  estimate = .pred_class
)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.833
# accuracy of 83.27% on training set.
The accuracy of the classification tree is 83.27%.
# Random forest (ranger engine) with 100 trees, using every remaining column
# as predictor. The seed is set right before fitting so the forest is
# reproducible.
set.seed(99)
# `trees` is spelled out in full; the previous `tree = 100` only worked
# through partial argument matching.
spec <- rand_forest(trees = 100) %>%
  set_mode("classification") %>%
  set_engine("ranger")
rf <- spec %>%
  fit(Survived ~ ., data = titanic_train)
# Predicted classes for the test set.
predict(rf, new_data = titanic_test)
## # A tibble: 418 × 1
## .pred_class
## <fct>
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 1
## 8 0
## 9 1
## 10 0
## # … with 408 more rows
# Predict on the training data and attach the observed outcome.
rf_pred <- predict(rf, new_data = titanic_train)
rf_combined <- mutate(rf_pred, true_class = titanic_train$Survived)
# Confusion matrix of predicted vs. observed survival (training data).
conf_mat(rf_combined, truth = true_class, estimate = .pred_class)
## Truth
## Prediction 0 1
## 0 528 69
## 1 21 273
# Training-set accuracy of the random forest.
accuracy(rf_combined, truth = true_class, estimate = .pred_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.899
# Variable importance (impurity-based) from a random forest.
# NOTE(review): this fits a separate forest with no `trees` value and no seed
# set here, so it is not the `rf` model evaluated above - confirm intended.
importance_spec <- rand_forest(mode = "classification") %>%
  set_engine("ranger", importance = "impurity")
importance_fit <- fit(importance_spec, Survived ~ ., data = titanic_train)
vip::vip(importance_fit)
The accuracy of the random forest on the training set is 89.89%, which is better than the 83.27% of the classification tree. Note that both accuracies are measured on the training data, so they are optimistic estimates of performance on unseen data. According to the random forest importance plot, sex plays the key role among the factors for survival, which confirms the exploratory analysis above showing that females had a much higher chance of survival than males. Sex is followed by Fare, Age, and Pclass, so I can assume that the higher the ticket class, the higher the chance of survival.
# Class predictions of the random forest for the test set.
forest_predict <- rf %>%
  predict(new_data = titanic_test, type = "class")
forest_predict
## # A tibble: 418 × 1
## .pred_class
## <fct>
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 1
## 8 0
## 9 1
## 10 0
## # … with 408 more rows
# Class predictions of the classification tree for the test set.
tree_predict <- class_tree %>%
  predict(new_data = titanic_test, type = "class")
tree_predict
## # A tibble: 418 × 1
## .pred_class
## <fct>
## 1 0
## 2 0
## 3 0
## 4 0
## 5 1
## 6 0
## 7 1
## 8 0
## 9 1
## 10 0
## # … with 408 more rows
# Build the submission table: one row per test passenger with the random
# forest's predicted class. The prediction column is extracted directly, so no
# rename of an auto-derived `.pred_class` column is needed.
submission <- data.frame(
  PassengerId = titanic_test$PassengerId,
  Survived = forest_predict$.pred_class
)
# write.csv() always writes a comma-separated file with a header row; it
# ignores `col.names` and `sep` with a warning, so they are not passed.
write.csv(submission, file = "final_test.csv", row.names = FALSE)
# Read the written file back to check its columns and first rows.
read_csv('final_test.csv')
## Rows: 418 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): PassengerId, Survived
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 418 × 2
## PassengerId Survived
## <dbl> <dbl>
## 1 892 0
## 2 893 0
## 3 894 0
## 4 895 0
## 5 896 0
## 6 897 0
## 7 898 1
## 8 899 0
## 9 900 1
## 10 901 0
## # … with 408 more rows