Importing the Data: Full, Train, and Test Sets

library(tidymodels)

titanic3 <- read.csv("titanic3.csv") %>%
  rename(Pclass = pclass, Survived = survived, Name = name, Sex = sex,
         Age = age, SibSp = sibsp, Parch = parch, Ticket = ticket, Fare = fare,
         Cabin = cabin, Embarked = embarked, Boat = boat, Body = body,
         Home.dest = home.dest)
titanic3$Survived <- as.factor(titanic3$Survived)
titanic3$Pclass <- as.factor(titanic3$Pclass)
titanic3$Sex <- as.factor(titanic3$Sex)
titanic3$Embarked <- as.factor(titanic3$Embarked)
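
The repeated as.factor() calls here (and for the train and test sets below) could equivalently be collapsed with dplyr's across(); a minimal sketch, assuming the same column names:

# Convert all the categorical columns to factors in one step
titanic3 <- titanic3 %>%
  mutate(across(c(Survived, Pclass, Sex, Embarked), as.factor))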

summary(titanic3)
##  Pclass  Survived     Name               Sex           Age       
##  1:323   0:809    Length:1309        female:466   Min.   : 0.17  
##  2:277   1:500    Class :character   male  :843   1st Qu.:21.00  
##  3:709            Mode  :character                Median :28.00  
##                                                   Mean   :29.88  
##                                                   3rd Qu.:39.00  
##                                                   Max.   :80.00  
##                                                   NA's   :263    
##      SibSp            Parch          Ticket               Fare        
##  Min.   :0.0000   Min.   :0.000   Length:1309        Min.   :  0.000  
##  1st Qu.:0.0000   1st Qu.:0.000   Class :character   1st Qu.:  7.896  
##  Median :0.0000   Median :0.000   Mode  :character   Median : 14.454  
##  Mean   :0.4989   Mean   :0.385                      Mean   : 33.295  
##  3rd Qu.:1.0000   3rd Qu.:0.000                      3rd Qu.: 31.275  
##  Max.   :8.0000   Max.   :9.000                      Max.   :512.329  
##                                                      NA's   :1        
##     Cabin           Embarked     Boat                Body      
##  Length:1309         :  2    Length:1309        Min.   :  1.0  
##  Class :character   C:270    Class :character   1st Qu.: 72.0  
##  Mode  :character   Q:123    Mode  :character   Median :155.0  
##                     S:914                       Mean   :160.8  
##                                                 3rd Qu.:256.0  
##                                                 Max.   :328.0  
##                                                 NA's   :1188   
##   Home.dest        
##  Length:1309       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
titanic_train <- read.csv("titanic_train.csv")
titanic_train$Survived <- as.factor(titanic_train$Survived)
titanic_train$Pclass <- as.factor(titanic_train$Pclass)
titanic_train$Sex <- as.factor(titanic_train$Sex)
titanic_train$Embarked <- as.factor(titanic_train$Embarked)
summary(titanic_train)
##   PassengerId    Survived Pclass      Name               Sex     
##  Min.   :  1.0   0:549    1:216   Length:891         female:314  
##  1st Qu.:223.5   1:342    2:184   Class :character   male  :577  
##  Median :446.0            3:491   Mode  :character               
##  Mean   :446.0                                                   
##  3rd Qu.:668.5                                                   
##  Max.   :891.0                                                   
##                                                                  
##       Age            SibSp           Parch           Ticket         
##  Min.   : 0.42   Min.   :0.000   Min.   :0.0000   Length:891        
##  1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000   Class :character  
##  Median :28.00   Median :0.000   Median :0.0000   Mode  :character  
##  Mean   :29.70   Mean   :0.523   Mean   :0.3816                     
##  3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000                     
##  Max.   :80.00   Max.   :8.000   Max.   :6.0000                     
##  NA's   :177                                                        
##       Fare           Cabin           Embarked
##  Min.   :  0.00   Length:891          :  2   
##  1st Qu.:  7.91   Class :character   C:168   
##  Median : 14.45   Mode  :character   Q: 77   
##  Mean   : 32.20                      S:644   
##  3rd Qu.: 31.00                              
##  Max.   :512.33                              
## 
titanic_test <- read.csv("titanic_test.csv")
titanic_test$Pclass <- as.factor(titanic_test$Pclass)
titanic_test$Sex <- as.factor(titanic_test$Sex)
titanic_test$Embarked <- as.factor(titanic_test$Embarked)
summary(titanic_test)
##   PassengerId     Pclass      Name               Sex           Age       
##  Min.   : 892.0   1:107   Length:418         female:152   Min.   : 0.17  
##  1st Qu.: 996.2   2: 93   Class :character   male  :266   1st Qu.:21.00  
##  Median :1100.5   3:218   Mode  :character                Median :27.00  
##  Mean   :1100.5                                           Mean   :30.27  
##  3rd Qu.:1204.8                                           3rd Qu.:39.00  
##  Max.   :1309.0                                           Max.   :76.00  
##                                                           NA's   :86     
##      SibSp            Parch           Ticket               Fare        
##  Min.   :0.0000   Min.   :0.0000   Length:418         Min.   :  0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   Class :character   1st Qu.:  7.896  
##  Median :0.0000   Median :0.0000   Mode  :character   Median : 14.454  
##  Mean   :0.4474   Mean   :0.3923                      Mean   : 35.627  
##  3rd Qu.:1.0000   3rd Qu.:0.0000                      3rd Qu.: 31.500  
##  Max.   :8.0000   Max.   :9.0000                      Max.   :512.329  
##                                                       NA's   :1        
##     Cabin           Embarked
##  Length:418         C:102   
##  Class :character   Q: 46   
##  Mode  :character   S:270   
##                             
##                             
##                             
## 

Univariate Analysis

Survived

We observe that most passengers did not survive: 809 perished compared to 500 survivors.

library(ggplot2)
ggplot(titanic3, aes(Survived)) +
  geom_bar(color = "darkgreen", fill = "lightgreen") +
  theme_classic()

summary(titanic3$Survived)
##   0   1 
## 809 500
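
To make the imbalance concrete, the counts can be turned into proportions (a quick base-R sketch):

# About 62% perished and 38% survived
prop.table(table(titanic3$Survived))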

Sex

ggplot(titanic3, aes(Sex)) +
  geom_bar(color = "darkgreen", fill = "lightgreen") +
  theme_classic()

summary(titanic3$Sex)
## female   male 
##    466    843

Age

ggplot(titanic3, aes(Age)) +
  geom_histogram(color = "darkgreen", fill = "lightgreen", binwidth = 10) +
  theme_classic()
## Warning: Removed 263 rows containing non-finite values (stat_bin).

summary(titanic3$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.17   21.00   28.00   29.88   39.00   80.00     263

Fare

ggplot(titanic3, aes(Fare)) +
  geom_histogram(color = "darkgreen", fill = "lightgreen", binwidth = 30) +
  theme_classic()
## Warning: Removed 1 rows containing non-finite values (stat_bin).

summary(titanic3$Fare)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   7.896  14.454  33.295  31.275 512.329       1

Pclass

ggplot(titanic3, aes(Pclass)) +
  geom_bar(color = "darkgreen", fill = "lightgreen") +
  theme_classic()

summary(titanic3$Pclass)
##   1   2   3 
## 323 277 709

Embarked

ggplot(titanic3, aes(Embarked)) +
  geom_bar(color = "darkgreen", fill = "lightgreen") +
  theme_classic()

summary(titanic3$Embarked)
##       C   Q   S 
##   2 270 123 914

SibSp

ggplot(titanic3, aes(SibSp)) +
  geom_histogram(color = "darkgreen", fill = "lightgreen") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(titanic3$SibSp)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4989  1.0000  8.0000

Parch

ggplot(titanic3, aes(Parch)) +
  geom_histogram(color = "darkgreen", fill = "lightgreen") +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(titanic3$Parch)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.385   0.000   9.000

Bivariate Analysis

Survived vs. Age

glm_mdl <- glm(Survived ~ Age, data = titanic3, family = binomial)
summary(glm_mdl)
## 
## Call:
## glm(formula = Survived ~ Age, family = binomial, data = titanic3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1189  -1.0361  -0.9768   1.3187   1.5162  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -0.136534   0.144715  -0.943   0.3454  
## Age         -0.007899   0.004407  -1.792   0.0731 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1414.6  on 1045  degrees of freedom
## Residual deviance: 1411.4  on 1044  degrees of freedom
##   (263 observations deleted due to missingness)
## AIC: 1415.4
## 
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
##  (Intercept)          Age 
## -0.136533978 -0.007898504
ggplot(titanic3, aes(Age, as.numeric(Survived) - 1)) +
  geom_point() +
  geom_smooth(method="glm", color="blue", se=FALSE,
                method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 263 rows containing non-finite values (stat_smooth).
## Warning: Removed 263 rows containing missing values (geom_point).
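
Because the coefficients are on the log-odds scale, exponentiating them yields odds ratios, which are easier to interpret (a sketch; the same transform applies to the other models in this section):

# exp(-0.0079) is about 0.992: each additional year of age multiplies the
# odds of survival by roughly 0.992, i.e. a ~0.8% decrease per year
exp(coefficients(glm_mdl))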

Survived vs. Sex

glm_mdl <- glm(Survived ~ Sex, data = titanic3, family = binomial)
summary(glm_mdl)
## 
## Call:
## glm(formula = Survived ~ Sex, family = binomial, data = titanic3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6124  -0.6511  -0.6511   0.7977   1.8196  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.9818     0.1040   9.437   <2e-16 ***
## Sexmale      -2.4254     0.1360 -17.832   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1741.0  on 1308  degrees of freedom
## Residual deviance: 1368.1  on 1307  degrees of freedom
## AIC: 1372.1
## 
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept)     Sexmale 
##    0.981813   -2.425438
ggplot(titanic3, aes(as.numeric(Sex), as.numeric(Survived) - 1)) +
  geom_point() +
  geom_smooth(method="glm", color="blue", se=FALSE,
                method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'

Survived vs. Fare

glm_mdl <- glm(Survived ~ Fare, data = titanic3, family = binomial)
summary(glm_mdl)
## 
## Call:
## glm(formula = Survived ~ Fare, family = binomial, data = titanic3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2271  -0.8971  -0.8666   1.3526   1.5676  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.882427   0.075522 -11.684  < 2e-16 ***
## Fare         0.012453   0.001604   7.762 8.35e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1740.1  on 1307  degrees of freedom
## Residual deviance: 1654.0  on 1306  degrees of freedom
##   (1 observation deleted due to missingness)
## AIC: 1658
## 
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept)        Fare 
## -0.88242654  0.01245273
ggplot(titanic3, aes(Fare, as.numeric(Survived) - 1)) +
  geom_point() +
  geom_smooth(method="glm", color="blue", se=FALSE,
                method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
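
The fitted coefficients can also be turned into a predicted survival probability at a given fare via the logistic CDF (a worked sketch using the estimates above):

# P(survive | Fare = 100) = logistic(-0.882427 + 0.012453 * 100), about 0.59
plogis(-0.882427 + 0.012453 * 100)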

Survived vs. Pclass

glm_mdl <- glm(Survived ~ Pclass, data = titanic3, family = binomial)
summary(glm_mdl)
## 
## Call:
## glm(formula = Survived ~ Pclass, family = binomial, data = titanic3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3896  -0.7678  -0.7678   0.9791   1.6525  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.4861     0.1146   4.242 2.21e-05 ***
## Pclass2      -0.7696     0.1669  -4.611 4.02e-06 ***
## Pclass3      -1.5567     0.1433 -10.860  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1741.0  on 1308  degrees of freedom
## Residual deviance: 1613.3  on 1306  degrees of freedom
## AIC: 1619.3
## 
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept)     Pclass2     Pclass3 
##   0.4861330  -0.7696046  -1.5567323
ggplot(titanic3, aes(as.numeric(Pclass), as.numeric(Survived) - 1)) +
  geom_point() +
  geom_smooth(method="glm", color="blue", se=FALSE,
                method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'

Survived vs. Embarked

glm_mdl <- glm(Survived ~ Embarked, data = titanic3, family = binomial)
summary(glm_mdl)
## 
## Call:
## glm(formula = Survived ~ Embarked, family = binomial, data = titanic3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2735  -0.8993  -0.8993   1.4339   1.4838  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)    13.57     378.59   0.036    0.971
## EmbarkedC     -13.34     378.59  -0.035    0.972
## EmbarkedQ     -14.15     378.59  -0.037    0.970
## EmbarkedS     -14.26     378.59  -0.038    0.970
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1741  on 1308  degrees of freedom
## Residual deviance: 1694  on 1305  degrees of freedom
## AIC: 1702
## 
## Number of Fisher Scoring iterations: 12
coefficients(glm_mdl)
## (Intercept)   EmbarkedC   EmbarkedQ   EmbarkedS 
##    13.56607   -13.34292   -14.15132   -14.26250
ggplot(titanic3, aes(as.numeric(Embarked), as.numeric(Survived) - 1)) +
  geom_point() +
  geom_smooth(method="glm", color="blue", se=FALSE,
                method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'
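
The enormous standard errors (378.59) are a symptom of quasi-complete separation: the two passengers with a blank Embarked value both survived, so the baseline log-odds are effectively unbounded. A sketch of recoding the blanks to NA before modeling (working on a copy so the summaries above stay reproducible):

# Treat empty-string embarkation as missing and drop the unused level
titanic3_clean <- titanic3
titanic3_clean$Embarked[titanic3_clean$Embarked == ""] <- NA
titanic3_clean$Embarked <- droplevels(titanic3_clean$Embarked)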

Survived vs. SibSp

glm_mdl <- glm(Survived ~ SibSp, data = titanic3, family = binomial)
summary(glm_mdl)
## 
## Call:
## glm(formula = Survived ~ SibSp, family = binomial, data = titanic3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9919  -0.9919  -0.9698   1.3750   1.4766  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.45331    0.06310  -7.184 6.79e-13 ***
## SibSp       -0.05681    0.05660  -1.004    0.315    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1741  on 1308  degrees of freedom
## Residual deviance: 1740  on 1307  degrees of freedom
## AIC: 1744
## 
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept)       SibSp 
## -0.45331284 -0.05681221
ggplot(titanic3, aes(SibSp, as.numeric(Survived) - 1)) +
  geom_point() +
  geom_smooth(method="glm", color="blue", se=FALSE,
                method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'

Survived vs. Parch

glm_mdl <- glm(Survived ~ Parch, data = titanic3, family = binomial)
summary(glm_mdl)
## 
## Call:
## glm(formula = Survived ~ Parch, family = binomial, data = titanic3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7022  -0.9516  -0.9516   1.4215   1.4215  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.55753    0.06289  -8.865  < 2e-16 ***
## Parch        0.19318    0.06625   2.916  0.00355 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1741.0  on 1308  degrees of freedom
## Residual deviance: 1732.3  on 1307  degrees of freedom
## AIC: 1736.3
## 
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept)       Parch 
##  -0.5575286   0.1931776
ggplot(titanic3, aes(Parch, as.numeric(Survived) - 1)) +
  geom_point() +
  geom_smooth(method="glm", color="blue", se=FALSE,
                method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'

Tree-based Model

I start by building a tree-based model, setting the engine to "rpart" and the mode to "classification".

I fit the tree to predict survival from passengers' Age and Sex. Based on the model, only about 19% of males survived, compared to about 73% of females, so females were far more likely to survive. Among males, age mattered a great deal: roughly 17% of those older than 9.5 years survived, versus roughly 58% of those aged 9.5 or younger.

Finally, we observe that the accuracy of our tree-based model on the held-out split is about 80%, meaning it is correct roughly four times out of five; its error rate is accordingly about 20%.

tree_spec <- decision_tree() %>% 
  set_engine("rpart") %>% 
  set_mode("classification")
tree_model_age_sex <- tree_spec %>%
  fit(formula = Survived ~ Age + Sex, data = titanic3)

tree_model_age_sex
## parsnip model object
## 
## n= 1309 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 1309 500 0 (0.6180290 0.3819710)  
##   2) Sex=male 843 161 0 (0.8090154 0.1909846)  
##     4) Age>=9.5 800 136 0 (0.8300000 0.1700000) *
##     5) Age< 9.5 43  18 1 (0.4186047 0.5813953) *
##   3) Sex=female 466 127 1 (0.2725322 0.7274678) *
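
The printed splits are easier to read as a diagram. A sketch, assuming the rpart.plot package is installed; extract_fit_engine() pulls the underlying rpart object out of the parsnip fit:

library(rpart.plot)
# Draw the fitted tree with per-node class probabilities
rpart.plot(extract_fit_engine(tree_model_age_sex))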

Splitting the data

I performed a train/test split of titanic3 because the Survived variable is not available in the separate Kaggle test set. Note, though, that the tree above was fit on all of titanic3 before splitting, so titanic_tree_test overlaps the training data and the accuracy estimate below is optimistic.

titanic_split <- initial_split(titanic3, prop = 0.75)
titanic_tree_train <- training(titanic_split)
titanic_tree_test <- testing(titanic_split)
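
Because initial_split() samples at random, the exact confusion matrix below will change from run to run. A sketch of a reproducible, class-stratified alternative (the seed value is arbitrary):

set.seed(42)  # fix the RNG so the split is reproducible
titanic_split <- initial_split(titanic3, prop = 0.75, strata = Survived)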

Predictions based on Tree-Based Model

tree_predictions <- predict(tree_model_age_sex, new_data = titanic_tree_test, type = "class")

#Combining predictions and truth value
pred_combined <- tree_predictions %>% 
  mutate(true_class = titanic_tree_test$Survived)

head(pred_combined)
## # A tibble: 6 × 2
##   .pred_class true_class
##   <fct>       <fct>     
## 1 1           0         
## 2 0           0         
## 3 1           1         
## 4 0           0         
## 5 0           0         
## 6 1           1
#Confusion matrix
conf_mat(data = pred_combined, estimate = .pred_class, truth = true_class)
##           Truth
## Prediction   0   1
##          0 176  35
##          1  30  87

Accuracy of Tree-based Model

We observe that the accuracy of our tree-based model is about 80% (0.802), meaning that it is correct about four times out of five, which is quite good.

accuracy(pred_combined, estimate = .pred_class, truth = true_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.802

Error Rate of Tree-based Model

The error rate is the number of incorrect predictions (FP + FN) divided by the total number of observations (P + N); equivalently, one minus the accuracy. From the confusion matrix above, the error rate of our model is about 20%.

error_rate <- (35 + 30) / (176 + 35 + 30 + 87)
error_rate
## [1] 0.1981707
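
The same number falls out as the complement of the yardstick accuracy (a quick check using pred_combined from above):

# Error rate = 1 - accuracy
1 - accuracy(pred_combined, estimate = .pred_class, truth = true_class)$.estimate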

Tuning our Tree-Based Model

By tuning our tree, we find that the best combination is a tree depth of 8 with a minimum node size (min_n) of 40 and an essentially zero cost-complexity penalty.

set.seed(275)

spec_untuned <- decision_tree(min_n = tune(),
                              tree_depth = tune(),
                              cost_complexity = tune()) %>%
  set_engine("rpart") %>%
  set_mode("classification")

tree_grid <- grid_regular(hardhat::extract_parameter_set_dials(spec_untuned),
                          levels = 3)
tune_results <- tune_grid(spec_untuned,
                          Survived ~ Age + Sex,
                          resamples = vfold_cv(titanic3, v = 3),
                          grid = tree_grid,
                          metrics = metric_set(accuracy))
autoplot(tune_results)

final_params <- select_best(tune_results, metric = "accuracy")
final_params
## # A tibble: 1 × 4
##   cost_complexity tree_depth min_n .config              
##             <dbl>      <int> <int> <chr>                
## 1    0.0000000001          8    40 Preprocessor1_Model22
best_spec <- finalize_model(spec_untuned, final_params)
best_spec
## Decision Tree Model Specification (classification)
## 
## Main Arguments:
##   cost_complexity = 1e-10
##   tree_depth = 8
##   min_n = 40
## 
## Computational engine: rpart
final_model <- fit(best_spec, Survived ~ Age + Sex, data = titanic3)
final_model
## parsnip model object
## 
## n= 1309 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 1309 500 0 (0.6180290 0.3819710)  
##    2) Sex=male 843 161 0 (0.8090154 0.1909846)  
##      4) Age>=9.5 800 136 0 (0.8300000 0.1700000) *
##      5) Age< 9.5 43  18 1 (0.4186047 0.5813953)  
##       10) Age>=3.5 21  10 0 (0.5238095 0.4761905) *
##       11) Age< 3.5 22   7 1 (0.3181818 0.6818182) *
##    3) Sex=female 466 127 1 (0.2725322 0.7274678)  
##      6) Age< 30.75 314 101 1 (0.3216561 0.6783439)  
##       12) Age< 11.5 42  17 1 (0.4047619 0.5952381)  
##         24) Age>=5.5 14   5 0 (0.6428571 0.3571429) *
##         25) Age< 5.5 28   8 1 (0.2857143 0.7142857) *
##       13) Age>=11.5 272  84 1 (0.3088235 0.6911765) *
##      7) Age>=30.75 152  26 1 (0.1710526 0.8289474) *
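
The tuned tree is never scored above, so here is a sketch evaluating it on the held-out split (object names as defined earlier; the same train/test overlap caveat applies, since the final fit used all of titanic3):

tuned_preds <- predict(final_model, new_data = titanic_tree_test, type = "class") %>%
  mutate(true_class = titanic_tree_test$Survived)
accuracy(tuned_preds, estimate = .pred_class, truth = true_class)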

ROC Curve & AUC

Instead of hugging the top left of the plot, the ROC curve of our tree-based model bends toward the bottom right, and the AUC of 0.22 is well below the 0.5 of a random classifier. An AUC this far below 0.5 usually signals flipped class probabilities rather than a genuinely terrible model; see the corrected calls after the output below.

tree_predictions2 <- predict(tree_model_age_sex, new_data = titanic_tree_test, type = "prob") %>%
  bind_cols(titanic_tree_test)
tree_predictions2
## # A tibble: 328 × 16
##    .pred_0 .pred_1 Pclass Survived Name     Sex     Age SibSp Parch Ticket  Fare
##      <dbl>   <dbl> <fct>  <fct>    <chr>    <fct> <dbl> <int> <int> <chr>  <dbl>
##  1   0.273   0.727 1      0        "Alliso… fema…     2     1     2 113781 152. 
##  2   0.83    0.17  1      0        "Alliso… male     30     1     2 113781 152. 
##  3   0.273   0.727 1      1        "Andrew… fema…    63     1     0 13502   78.0
##  4   0.83    0.17  1      0        "Andrew… male     39     0     0 112050   0  
##  5   0.83    0.17  1      0        "Artaga… male     71     0     0 PC 17…  49.5
##  6   0.273   0.727 1      1        "Barber… fema…    26     0     0 19877   78.8
##  7   0.83    0.17  1      0        "Beatti… male     36     0     0 13050   75.2
##  8   0.83    0.17  1      1        "Behr, … male     26     0     0 111369  30  
##  9   0.273   0.727 1      1        "Bonnel… fema…    30     0     0 36928  165. 
## 10   0.83    0.17  1      0        "Brady,… male     41     0     0 113054  30.5
## # … with 318 more rows, and 5 more variables: Cabin <chr>, Embarked <fct>,
## #   Boat <chr>, Body <int>, Home.dest <chr>
roc <- roc_curve(tree_predictions2, estimate = .pred_1, truth = Survived)
autoplot(roc)

roc_auc <- roc_auc(tree_predictions2, estimate = .pred_1, truth = Survived)
roc_auc
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.216
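
yardstick treats the first factor level, here "0", as the event by default, so pairing truth = Survived with .pred_1 inverts the curve. Two equivalent fixes (a sketch; either should report an AUC of roughly 1 - 0.22 = 0.78):

# Declare the second level ("1" = survived) to be the event...
roc_auc(tree_predictions2, truth = Survived, .pred_1, event_level = "second")
# ...or keep the default event level and score the probability of "0"
roc_auc(tree_predictions2, truth = Survived, .pred_0)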

Random Forest

spec <- rand_forest(mtry = 4,
            trees = 500,
            min_n = 10) %>%
  set_mode("classification") %>%
  set_engine("ranger")

Training the Forest

spec2 <- spec %>%
  fit(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked, data = titanic_tree_train)
## Warning: Dropped unused factor level(s) in dependent variable: 1.
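
That warning is a red flag: it says the Survived column passed to ranger contained only one class, and the prediction tibble further down has just 30 rows even though the test split should have several hundred. A couple of sanity checks along these lines (a sketch using the objects defined above) would catch such a data problem before any accuracy is trusted:

# Both classes should be present in the training split
table(titanic_tree_train$Survived)
# This row count should match the prediction tibble produced below
nrow(titanic_tree_test)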

Variable Importance

We observe that the most important variable is "Sex", followed by "Ticket", "Fare", and "Age". (Ticket's high ranking should be viewed with suspicion: it is a near-unique identifier, and impurity-based importance is inflated for high-cardinality predictors.)

rand_forest(mode = "classification") %>%
  set_engine("ranger", importance = "impurity") %>%
  fit(Survived ~ ., data = titanic_train) %>%
  vip::vip()

Predictions based on Random Forest Model

forest_predictions <- predict(spec2, new_data = titanic_tree_test)
forest_predictions
## # A tibble: 30 × 1
##    .pred_class
##    <fct>      
##  1 0          
##  2 0          
##  3 0          
##  4 0          
##  5 0          
##  6 0          
##  7 0          
##  8 0          
##  9 0          
## 10 0          
## # … with 20 more rows
#Combining predictions and truth value
pred_combined <- forest_predictions %>% 
  mutate(true_class = titanic_tree_test$Survived)

head(pred_combined)
## # A tibble: 6 × 2
##   .pred_class true_class
##   <fct>       <fct>     
## 1 0           0         
## 2 0           0         
## 3 0           0         
## 4 0           0         
## 5 0           0         
## 6 0           0
#Confusion matrix
conf_mat(data = pred_combined, estimate = .pred_class, truth = true_class)
##           Truth
## Prediction  0  1
##          0 30  0
##          1  0  0

Accuracy of Random Forest Model

accuracy(pred_combined,
         estimate = .pred_class,
         truth = true_class)
## # A tibble: 1 × 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary             1

Boosted Trees with xgboost

library("xgboost")
## Warning: package 'xgboost' was built under R version 4.2.1
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
boost_spec <- boost_tree() %>%
  set_mode("classification") %>%
  set_engine("xgboost")
boost_spec
## Boosted Tree Model Specification (classification)
## 
## Computational engine: xgboost
boost_model <- fit(boost_spec, formula = Survived ~ ., data = titanic_train)
boost_model
## parsnip model object
## 
## ##### xgb.Booster
## raw: 41 Kb 
## call:
##   xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0, 
##     colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1, 
##     subsample = 1, objective = "binary:logistic"), data = x$data, 
##     nrounds = 15, watchlist = x$watchlist, verbose = 0, nthread = 1)
## params (as set within xgb.train):
##   eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", objective = "binary:logistic", nthread = "1", validate_parameters = "TRUE"
## xgb.attributes:
##   niter
## callbacks:
##   cb.evaluation.log()
## # of features: 1405 
## niter: 15
## nfeatures : 1405 
## evaluation_log:
##     iter training_logloss
##        1        0.5533275
##        2        0.4747563
## ---                      
##       14        0.2384150
##       15        0.2252098
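
The "# of features: 1405" line arises because Survived ~ . one-hot encodes the near-unique character columns (Name, Ticket, Cabin). A sketch restricting the formula to the informative predictors, as the resampling below already does (boost_model_small is just an illustrative name):

boost_model_small <- fit(boost_spec,
                         Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                         data = titanic_train)
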
set.seed(99)

folds <- vfold_cv(titanic_train, v = 5)

cv_results <- fit_resamples(boost_spec,
                            Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                            resamples = folds,
                            metrics = metric_set(yardstick::roc_auc))
collect_metrics(cv_results)
## # A tibble: 1 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 roc_auc binary     0.871     5  0.0116 Preprocessor1_Model1
set.seed(100)

predictions <- boost_tree() %>%
  set_mode("classification") %>%
  set_engine("xgboost") %>% 
  fit(Survived ~ ., data = titanic_train) %>%
  predict(new_data = titanic_train, type = "prob") %>% 
  bind_cols(titanic_train)

# Calculate AUC
roc_auc(predictions, 
        truth = Survived, 
        estimate = .pred_1)
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary        0.0189
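
An in-sample AUC of 0.019 is the flipped-event-level artifact again: 1 - 0.019 = 0.98, the near-perfect score one would expect when evaluating on the training data itself. A sketch of the corrected call:

# Score with "1" (survived) as the event instead of the default first level
roc_auc(predictions, truth = Survived, .pred_1, event_level = "second")
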
boost_spec <- boost_tree(
                trees = 500,
                learn_rate = tune(),
                tree_depth = tune(),
                sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

# Create the tuning grid
tunegrid_boost <- grid_regular(hardhat::extract_parameter_set_dials(boost_spec),
                               levels = 3)
tunegrid_boost
## # A tibble: 27 × 3
##    tree_depth learn_rate sample_size
##         <int>      <dbl>       <dbl>
##  1          1     0.001         0.1 
##  2          8     0.001         0.1 
##  3         15     0.001         0.1 
##  4          1     0.0178        0.1 
##  5          8     0.0178        0.1 
##  6         15     0.0178        0.1 
##  7          1     0.316         0.1 
##  8          8     0.316         0.1 
##  9         15     0.316         0.1 
## 10          1     0.001         0.55
## # … with 17 more rows
# Create CV folds of training data
folds <- vfold_cv(titanic_train, v = 6)

# Tune along the grid
tune_results <- tune_grid(boost_spec,
                    Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                    resamples = folds,
                    grid = tunegrid_boost,
                    metrics = metric_set(yardstick::roc_auc))

# Plot the results
autoplot(tune_results)
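
The tuning plot is not followed up above, so here is a sketch completing the workflow the same way as for the decision tree: select the best parameters, finalize the spec, and refit (the object names are illustrative):

best_boost_params <- select_best(tune_results, metric = "roc_auc")
final_boost_spec <- finalize_model(boost_spec, best_boost_params)
final_boost_model <- fit(final_boost_spec,
                         Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                         data = titanic_train)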

Conclusion

- Our tree-based model's reported AUC of 0.22 was an artifact of yardstick's default event level; with the event corrected it comes out near 0.78, in line with the model's roughly 80% accuracy.

- Our Random Forest's 100% accuracy should not be trusted: the single-class warning at fit time and the 30-row prediction tibble point to a broken evaluation pipeline rather than simple overfitting.

- Our boosted tree model gave the most reliable result, with a cross-validated roc_auc of 87.1%.