library(tidymodels)
titanic3 <- read.csv("titanic3.csv") %>%
rename(Pclass = pclass, Survived = survived, Name = name, Sex = sex,
Age = age, SibSp = sibsp, Parch = parch, Ticket = ticket, Fare = fare,
Cabin = cabin, Embarked = embarked, Boat = boat, Body = body,
Home.dest = home.dest)
titanic3$Survived <- as.factor(titanic3$Survived)
titanic3$Pclass <- as.factor(titanic3$Pclass)
titanic3$Sex <- as.factor(titanic3$Sex)
titanic3$Embarked <- as.factor(titanic3$Embarked)
summary(titanic3)
## Pclass Survived Name Sex Age
## 1:323 0:809 Length:1309 female:466 Min. : 0.17
## 2:277 1:500 Class :character male :843 1st Qu.:21.00
## 3:709 Mode :character Median :28.00
## Mean :29.88
## 3rd Qu.:39.00
## Max. :80.00
## NA's :263
## SibSp Parch Ticket Fare
## Min. :0.0000 Min. :0.000 Length:1309 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.000 Class :character 1st Qu.: 7.896
## Median :0.0000 Median :0.000 Mode :character Median : 14.454
## Mean :0.4989 Mean :0.385 Mean : 33.295
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.: 31.275
## Max. :8.0000 Max. :9.000 Max. :512.329
## NA's :1
## Cabin Embarked Boat Body
## Length:1309 : 2 Length:1309 Min. : 1.0
## Class :character C:270 Class :character 1st Qu.: 72.0
## Mode :character Q:123 Mode :character Median :155.0
## S:914 Mean :160.8
## 3rd Qu.:256.0
## Max. :328.0
## NA's :1188
## Home.dest
## Length:1309
## Class :character
## Mode :character
##
##
##
##
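The four as.factor() conversions above (repeated for the train and test sets below) could be written more compactly with dplyr::across(); an equivalent sketch:
# Convert the categorical columns to factors in a single mutate() call
titanic3 <- titanic3 %>%
  mutate(across(c(Survived, Pclass, Sex, Embarked), as.factor))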
titanic_train <- read.csv("titanic_train.csv")
titanic_train$Survived <- as.factor(titanic_train$Survived)
titanic_train$Pclass <- as.factor(titanic_train$Pclass)
titanic_train$Sex <- as.factor(titanic_train$Sex)
titanic_train$Embarked <- as.factor(titanic_train$Embarked)
summary(titanic_train)
## PassengerId Survived Pclass Name Sex
## Min. : 1.0 0:549 1:216 Length:891 female:314
## 1st Qu.:223.5 1:342 2:184 Class :character male :577
## Median :446.0 3:491 Mode :character
## Mean :446.0
## 3rd Qu.:668.5
## Max. :891.0
##
## Age SibSp Parch Ticket
## Min. : 0.42 Min. :0.000 Min. :0.0000 Length:891
## 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000 Class :character
## Median :28.00 Median :0.000 Median :0.0000 Mode :character
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Fare Cabin Embarked
## Min. : 0.00 Length:891 : 2
## 1st Qu.: 7.91 Class :character C:168
## Median : 14.45 Mode :character Q: 77
## Mean : 32.20 S:644
## 3rd Qu.: 31.00
## Max. :512.33
##
titanic_test <- read.csv("titanic_test.csv")
titanic_test$Pclass <- as.factor(titanic_test$Pclass)
titanic_test$Sex <- as.factor(titanic_test$Sex)
titanic_test$Embarked <- as.factor(titanic_test$Embarked)
summary(titanic_test)
## PassengerId Pclass Name Sex Age
## Min. : 892.0 1:107 Length:418 female:152 Min. : 0.17
## 1st Qu.: 996.2 2: 93 Class :character male :266 1st Qu.:21.00
## Median :1100.5 3:218 Mode :character Median :27.00
## Mean :1100.5 Mean :30.27
## 3rd Qu.:1204.8 3rd Qu.:39.00
## Max. :1309.0 Max. :76.00
## NA's :86
## SibSp Parch Ticket Fare
## Min. :0.0000 Min. :0.0000 Length:418 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 Class :character 1st Qu.: 7.896
## Median :0.0000 Median :0.0000 Mode :character Median : 14.454
## Mean :0.4474 Mean :0.3923 Mean : 35.627
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.: 31.500
## Max. :8.0000 Max. :9.0000 Max. :512.329
## NA's :1
## Cabin Embarked
## Length:418 C:102
## Class :character Q: 46
## Mode :character S:270
##
##
##
##
We observe that most passengers did not survive (809 deaths compared to 500 survivors).
library(ggplot2)
ggplot(titanic3, aes(Survived)) +
geom_bar(color = "darkgreen", fill = "lightgreen") +
theme_classic()
summary(titanic3$Survived)
## 0 1
## 809 500
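The same counts expressed as proportions, via a quick base-R sketch:
# Roughly 61.8% of passengers perished and 38.2% survived
prop.table(table(titanic3$Survived))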
ggplot(titanic3, aes(Sex)) +
geom_bar(color = "darkgreen", fill = "lightgreen") +
theme_classic()
summary(titanic3$Sex)
## female male
## 466 843
ggplot(titanic3, aes(Age)) +
geom_histogram(color = "darkgreen", fill = "lightgreen", binwidth = 10) +
theme_classic()
## Warning: Removed 263 rows containing non-finite values (stat_bin).
summary(titanic3$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.17 21.00 28.00 29.88 39.00 80.00 263
ggplot(titanic3, aes(Fare)) +
geom_histogram(color = "darkgreen", fill = "lightgreen", binwidth = 30) +
theme_classic()
## Warning: Removed 1 rows containing non-finite values (stat_bin).
summary(titanic3$Fare)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 7.896 14.454 33.295 31.275 512.329 1
ggplot(titanic3, aes(Pclass)) +
geom_bar(color = "darkgreen", fill = "lightgreen") +
theme_classic()
summary(titanic3$Pclass)
## 1 2 3
## 323 277 709
ggplot(titanic3, aes(Embarked)) +
geom_bar(color = "darkgreen", fill = "lightgreen") +
theme_classic()
summary(titanic3$Embarked)
## C Q S
## 2 270 123 914
ggplot(titanic3, aes(SibSp)) +
geom_histogram(color = "darkgreen", fill = "lightgreen") +
theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(titanic3$SibSp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4989 1.0000 8.0000
ggplot(titanic3, aes(Parch)) +
geom_histogram(color = "darkgreen", fill = "lightgreen") +
theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(titanic3$Parch)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.385 0.000 9.000
glm_mdl <- glm(Survived ~ Age, data = titanic3, family = binomial)
summary(glm_mdl)
##
## Call:
## glm(formula = Survived ~ Age, family = binomial, data = titanic3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1189 -1.0361 -0.9768 1.3187 1.5162
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.136534 0.144715 -0.943 0.3454
## Age -0.007899 0.004407 -1.792 0.0731 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1414.6 on 1045 degrees of freedom
## Residual deviance: 1411.4 on 1044 degrees of freedom
## (263 observations deleted due to missingness)
## AIC: 1415.4
##
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept) Age
## -0.136533978 -0.007898504
ggplot(titanic3, aes(Age, as.numeric(Survived) - 1)) +
geom_point() +
geom_smooth(method="glm", color="blue", se=FALSE,
method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 263 rows containing non-finite values (stat_smooth).
## Warning: Removed 263 rows containing missing values (geom_point).
glm_mdl <- glm(Survived ~ Sex, data = titanic3, family = binomial)
summary(glm_mdl)
##
## Call:
## glm(formula = Survived ~ Sex, family = binomial, data = titanic3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6124 -0.6511 -0.6511 0.7977 1.8196
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.9818 0.1040 9.437 <2e-16 ***
## Sexmale -2.4254 0.1360 -17.832 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1741.0 on 1308 degrees of freedom
## Residual deviance: 1368.1 on 1307 degrees of freedom
## AIC: 1372.1
##
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept) Sexmale
## 0.981813 -2.425438
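These coefficients are on the log-odds scale; exponentiating them yields odds ratios, as in this sketch:
# exp(-2.425) is about 0.088: a male passenger's odds of survival are
# roughly 9% of a female passenger's odds
exp(coefficients(glm_mdl))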
ggplot(titanic3, aes(as.numeric(Sex), as.numeric(Survived) - 1)) +
geom_point() +
geom_smooth(method="glm", color="blue", se=FALSE,
method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'
glm_mdl <- glm(Survived ~ Fare, data = titanic3, family = binomial)
summary(glm_mdl)
##
## Call:
## glm(formula = Survived ~ Fare, family = binomial, data = titanic3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2271 -0.8971 -0.8666 1.3526 1.5676
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.882427 0.075522 -11.684 < 2e-16 ***
## Fare 0.012453 0.001604 7.762 8.35e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1740.1 on 1307 degrees of freedom
## Residual deviance: 1654.0 on 1306 degrees of freedom
## (1 observation deleted due to missingness)
## AIC: 1658
##
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept) Fare
## -0.88242654 0.01245273
ggplot(titanic3, aes(Fare, as.numeric(Survived) - 1)) +
geom_point() +
geom_smooth(method="glm", color="blue", se=FALSE,
method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
glm_mdl <- glm(Survived ~ Pclass, data = titanic3, family = binomial)
summary(glm_mdl)
##
## Call:
## glm(formula = Survived ~ Pclass, family = binomial, data = titanic3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3896 -0.7678 -0.7678 0.9791 1.6525
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4861 0.1146 4.242 2.21e-05 ***
## Pclass2 -0.7696 0.1669 -4.611 4.02e-06 ***
## Pclass3 -1.5567 0.1433 -10.860 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1741.0 on 1308 degrees of freedom
## Residual deviance: 1613.3 on 1306 degrees of freedom
## AIC: 1619.3
##
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept) Pclass2 Pclass3
## 0.4861330 -0.7696046 -1.5567323
ggplot(titanic3, aes(as.numeric(Pclass), as.numeric(Survived) - 1)) +
geom_point() +
geom_smooth(method="glm", color="blue", se=FALSE,
method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'
glm_mdl <- glm(Survived ~ Embarked, data = titanic3, family = binomial)
summary(glm_mdl)
##
## Call:
## glm(formula = Survived ~ Embarked, family = binomial, data = titanic3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2735 -0.8993 -0.8993 1.4339 1.4838
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13.57 378.59 0.036 0.971
## EmbarkedC -13.34 378.59 -0.035 0.972
## EmbarkedQ -14.15 378.59 -0.037 0.970
## EmbarkedS -14.26 378.59 -0.038 0.970
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1741 on 1308 degrees of freedom
## Residual deviance: 1694 on 1305 degrees of freedom
## AIC: 1702
##
## Number of Fisher Scoring iterations: 12
coefficients(glm_mdl)
## (Intercept) EmbarkedC EmbarkedQ EmbarkedS
## 13.56607 -13.34292 -14.15132 -14.26250
ggplot(titanic3, aes(as.numeric(Embarked), as.numeric(Survived) - 1)) +
geom_point() +
geom_smooth(method="glm", color="blue", se=FALSE,
method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'
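The enormous standard errors (378.59) in this model are a symptom of quasi-separation: the baseline Embarked level is the empty string "", carried by only two passengers, and the intercept of 13.57 on the logit scale implies both of them survived. A sketch of one remedy, recoding the empty strings as missing on a copy of the data so the rest of the analysis is unaffected:
# Treat empty Embarked strings as NA and drop the unused level,
# making "C" the baseline; the inflated standard errors disappear
titanic_emb <- titanic3
titanic_emb$Embarked[titanic_emb$Embarked == ""] <- NA
titanic_emb$Embarked <- droplevels(titanic_emb$Embarked)
summary(glm(Survived ~ Embarked, data = titanic_emb, family = binomial))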
glm_mdl <- glm(Survived ~ SibSp, data = titanic3, family = binomial)
summary(glm_mdl)
##
## Call:
## glm(formula = Survived ~ SibSp, family = binomial, data = titanic3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9919 -0.9919 -0.9698 1.3750 1.4766
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.45331 0.06310 -7.184 6.79e-13 ***
## SibSp -0.05681 0.05660 -1.004 0.315
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1741 on 1308 degrees of freedom
## Residual deviance: 1740 on 1307 degrees of freedom
## AIC: 1744
##
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept) SibSp
## -0.45331284 -0.05681221
ggplot(titanic3, aes(SibSp, as.numeric(Survived) - 1)) +
geom_point() +
geom_smooth(method="glm", color="blue", se=FALSE,
method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'
glm_mdl <- glm(Survived ~ Parch, data = titanic3, family = binomial)
summary(glm_mdl)
##
## Call:
## glm(formula = Survived ~ Parch, family = binomial, data = titanic3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7022 -0.9516 -0.9516 1.4215 1.4215
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.55753 0.06289 -8.865 < 2e-16 ***
## Parch 0.19318 0.06625 2.916 0.00355 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1741.0 on 1308 degrees of freedom
## Residual deviance: 1732.3 on 1307 degrees of freedom
## AIC: 1736.3
##
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept) Parch
## -0.5575286 0.1931776
ggplot(titanic3, aes(Parch, as.numeric(Survived) - 1)) +
geom_point() +
geom_smooth(method="glm", color="blue", se=FALSE,
method.args = list(family='binomial'))
## `geom_smooth()` using formula 'y ~ x'
I first build a tree-based model, setting the engine to "rpart" and the mode to "classification". Because the Survived variable is not available in the supplied test dataset, I later split titanic3 into my own training and test sets for evaluation.
Reading the fitted tree below, which models Survived as a function of passengers' Age and Sex: only about 19% of males survived, compared with about 73% of females, so far more females survived overall. Among males, age mattered greatly: roughly 17% of those older than 9.5 years survived, versus about 58% of those aged 9.5 or younger.
On the held-out split, the tree classifies correctly about 80% of the time, for an error rate of about 20%.
tree_spec <- decision_tree() %>%
set_engine("rpart") %>%
set_mode("classification")
tree_model_age_sex <- tree_spec %>%
fit(formula = Survived ~ Age + Sex, data = titanic3)
tree_model_age_sex
## parsnip model object
##
## n= 1309
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1309 500 0 (0.6180290 0.3819710)
## 2) Sex=male 843 161 0 (0.8090154 0.1909846)
## 4) Age>=9.5 800 136 0 (0.8300000 0.1700000) *
## 5) Age< 9.5 43 18 1 (0.4186047 0.5813953) *
## 3) Sex=female 466 127 1 (0.2725322 0.7274678) *
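The node printout is easier to read as a diagram; a sketch, assuming the rpart.plot package is installed:
# Pull the underlying rpart fit out of the parsnip wrapper and plot it
library(rpart.plot)
rpart.plot(extract_fit_engine(tree_model_age_sex))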
I now split titanic3 and evaluate the tree on the held-out portion. (Since the tree above was fit on all of titanic3, the held-out rows were already seen during fitting, so the accuracy below is somewhat optimistic.)
titanic_split <- initial_split(titanic3, prop = 0.75)
titanic_tree_train <- training(titanic_split)
titanic_tree_test <- testing(titanic_split)
tree_predictions <- predict(tree_model_age_sex, new_data = titanic_tree_test, type = "class")
#Combining predictions and truth value
pred_combined <- tree_predictions %>%
mutate(true_class = titanic_tree_test$Survived)
head(pred_combined)
## # A tibble: 6 × 2
## .pred_class true_class
## <fct> <fct>
## 1 1 0
## 2 0 0
## 3 1 1
## 4 0 0
## 5 0 0
## 6 1 1
#Confusion matrix
conf_mat(data = pred_combined, estimate = .pred_class, truth = true_class)
## Truth
## Prediction 0 1
## 0 176 35
## 1 30 87
We observe that the accuracy of our tree-based model is about 80%, meaning that it classifies the held-out passengers correctly about 80% of the time, which is quite good.
accuracy(pred_combined, estimate = .pred_class, truth = true_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.802
The error rate is the number of incorrect predictions (FN + FP) divided by the total number of cases (P + N). Using the confusion matrix above, the error rate of our model is about 20%.
error_rate <- (30 + 35) / (176 + 35 + 30 + 87)
error_rate
## [1] 0.1981707
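Rather than hard-coding the cell counts, the error rate can also be computed as the complement of the accuracy metric; a sketch:
# Error rate = 1 - accuracy
1 - accuracy(pred_combined, estimate = .pred_class, truth = true_class)$.estimate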
Tuning the tree over a regular grid, we find that the optimal tree depth is 8 with a minimum node size (min_n) of 40.
set.seed(275)
spec_untuned <- decision_tree(min_n = tune(),
tree_depth = tune(),
cost_complexity = tune()) %>%
set_engine("rpart") %>%
set_mode("classification")
tree_grid <- grid_regular(hardhat::extract_parameter_set_dials(spec_untuned), levels = 3)
tune_results <- tune_grid(spec_untuned,
Survived ~ Age + Sex,
resamples = vfold_cv(titanic3, v = 3),
grid = tree_grid,
metrics = metric_set(accuracy))
autoplot(tune_results)
final_params <- select_best(tune_results)
final_params
## # A tibble: 1 × 4
## cost_complexity tree_depth min_n .config
## <dbl> <int> <int> <chr>
## 1 0.0000000001 8 40 Preprocessor1_Model22
best_spec <- finalize_model(spec_untuned, final_params)
best_spec
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = 1e-10
## tree_depth = 8
## min_n = 40
##
## Computational engine: rpart
final_model <- fit(best_spec, Survived ~ Age + Sex, data = titanic3)
final_model
## parsnip model object
##
## n= 1309
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1309 500 0 (0.6180290 0.3819710)
## 2) Sex=male 843 161 0 (0.8090154 0.1909846)
## 4) Age>=9.5 800 136 0 (0.8300000 0.1700000) *
## 5) Age< 9.5 43 18 1 (0.4186047 0.5813953)
## 10) Age>=3.5 21 10 0 (0.5238095 0.4761905) *
## 11) Age< 3.5 22 7 1 (0.3181818 0.6818182) *
## 3) Sex=female 466 127 1 (0.2725322 0.7274678)
## 6) Age< 30.75 314 101 1 (0.3216561 0.6783439)
## 12) Age< 11.5 42 17 1 (0.4047619 0.5952381)
## 24) Age>=5.5 14 5 0 (0.6428571 0.3571429) *
## 25) Age< 5.5 28 8 1 (0.2857143 0.7142857) *
## 13) Age>=11.5 272 84 1 (0.3088235 0.6911765) *
## 7) Age>=30.75 152 26 1 (0.1710526 0.8289474) *
The ROC curve of our tree-based model bends toward the bottom right of the plot instead of hugging the top left, and the AUC comes out to about 0.22, well below 0.5. This does not mean the model is worthless: an AUC below 0.5 indicates the predicted probabilities are inversely related to the truth. Here the cause is a convention: yardstick treats the first factor level ("0") as the event by default, so passing .pred_1 as the estimate flips the curve; declaring the event level as "1" recovers the complementary AUC of about 0.78.
tree_predictions2 <- predict(tree_model_age_sex, new_data = titanic_tree_test, type = "prob") %>%
bind_cols(titanic_tree_test)
tree_predictions2
## # A tibble: 328 × 16
## .pred_0 .pred_1 Pclass Survived Name Sex Age SibSp Parch Ticket Fare
## <dbl> <dbl> <fct> <fct> <chr> <fct> <dbl> <int> <int> <chr> <dbl>
## 1 0.273 0.727 1 0 "Alliso… fema… 2 1 2 113781 152.
## 2 0.83 0.17 1 0 "Alliso… male 30 1 2 113781 152.
## 3 0.273 0.727 1 1 "Andrew… fema… 63 1 0 13502 78.0
## 4 0.83 0.17 1 0 "Andrew… male 39 0 0 112050 0
## 5 0.83 0.17 1 0 "Artaga… male 71 0 0 PC 17… 49.5
## 6 0.273 0.727 1 1 "Barber… fema… 26 0 0 19877 78.8
## 7 0.83 0.17 1 0 "Beatti… male 36 0 0 13050 75.2
## 8 0.83 0.17 1 1 "Behr, … male 26 0 0 111369 30
## 9 0.273 0.727 1 1 "Bonnel… fema… 30 0 0 36928 165.
## 10 0.83 0.17 1 0 "Brady,… male 41 0 0 113054 30.5
## # … with 318 more rows, and 5 more variables: Cabin <chr>, Embarked <fct>,
## # Boat <chr>, Body <int>, Home.dest <chr>
roc <- roc_curve(tree_predictions2, estimate = .pred_1, truth = Survived)
autoplot(roc)
roc_auc <- roc_auc(tree_predictions2, estimate = .pred_1, truth = Survived)
roc_auc
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.216
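A sketch of the corrected call, assuming Survived keeps its levels in the order c("0", "1") shown in the summaries above:
# Declare the second factor level ("1" = survived) as the event;
# this should report the complementary AUC of roughly 1 - 0.216 = 0.784
roc_auc(tree_predictions2,
        estimate = .pred_1,
        truth = Survived,
        event_level = "second")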
spec <- rand_forest(mtry = 4,
trees = 500,
min_n = 10) %>%
set_mode("classification") %>%
set_engine("ranger")
spec2 <- spec %>%
fit(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked, data = titanic_tree_train)
## Warning: Dropped unused factor level(s) in dependent variable: 1.
Fitting a forest on titanic_train with impurity-based importance, we observe that the most important variable is "Sex", followed by "Ticket", "Fare", and "Age".
rand_forest(mode = "classification") %>%
set_engine("ranger", importance = "impurity") %>%
fit(Survived ~ ., data = titanic_train) %>%
vip::vip()
forest_predictions <- predict(spec2, new_data = titanic_tree_test)
forest_predictions
## # A tibble: 30 × 1
## .pred_class
## <fct>
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## # … with 20 more rows
#Combining predictions and truth value
pred_combined <- forest_predictions %>%
mutate(true_class = titanic_tree_test$Survived)
head(pred_combined)
## # A tibble: 6 × 2
## .pred_class true_class
## <fct> <fct>
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
#Confusion matrix
conf_mat(data = pred_combined, estimate = .pred_class, truth = true_class)
## Truth
## Prediction 0 1
## 0 30 0
## 1 0 0
accuracy(pred_combined,
estimate = .pred_class,
truth = true_class)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 1
This perfect accuracy should not be taken at face value: only 30 of the 328 test rows received predictions, all of class 0, and the warning during fitting suggests the forest never saw a surviving passenger.
library("xgboost")
## Warning: package 'xgboost' was built under R version 4.2.1
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
boost_spec <- boost_tree() %>%
set_mode("classification") %>%
set_engine("xgboost")
boost_spec
## Boosted Tree Model Specification (classification)
##
## Computational engine: xgboost
boost_model <- fit(boost_spec, formula = Survived ~ ., data = titanic_train)
boost_model
## parsnip model object
##
## ##### xgb.Booster
## raw: 41 Kb
## call:
## xgboost::xgb.train(params = list(eta = 0.3, max_depth = 6, gamma = 0,
## colsample_bytree = 1, colsample_bynode = 1, min_child_weight = 1,
## subsample = 1, objective = "binary:logistic"), data = x$data,
## nrounds = 15, watchlist = x$watchlist, verbose = 0, nthread = 1)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0", colsample_bytree = "1", colsample_bynode = "1", min_child_weight = "1", subsample = "1", objective = "binary:logistic", nthread = "1", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## callbacks:
## cb.evaluation.log()
## # of features: 1405
## niter: 15
## nfeatures : 1405
## evaluation_log:
## iter training_logloss
## 1 0.5533275
## 2 0.4747563
## ---
## 14 0.2384150
## 15 0.2252098
set.seed(99)
folds <- vfold_cv(titanic_train, v = 5)
cv_results <- fit_resamples(boost_spec,
Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
resamples = folds,
metrics = metric_set(yardstick::roc_auc))
collect_metrics(cv_results)
## # A tibble: 1 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 roc_auc binary 0.871 5 0.0116 Preprocessor1_Model1
set.seed(100)
predictions <- boost_tree() %>%
set_mode("classification") %>%
set_engine("xgboost") %>%
fit(Survived ~ ., data = titanic_train) %>%
predict(new_data = titanic_train, type = "prob") %>%
bind_cols(titanic_train)
# Calculate AUC
roc_auc(predictions,
truth = Survived,
estimate = .pred_1)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.0189
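As with the tree model's ROC above, this near-zero training-set AUC is the complement of the true value, because yardstick again treats the "0" level as the event. A sketch of the corrected call, which should report roughly 1 - 0.019 = 0.981:
# Same call, with the "1" (survived) level declared as the event
roc_auc(predictions,
        truth = Survived,
        estimate = .pred_1,
        event_level = "second")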
boost_spec <- boost_tree(
trees = 500,
learn_rate = tune(),
tree_depth = tune(),
sample_size = tune()) %>%
set_mode("classification") %>%
set_engine("xgboost")
# Create the tuning grid
tunegrid_boost <- grid_regular(hardhat::extract_parameter_set_dials(boost_spec),
                               levels = 3)
tunegrid_boost
## # A tibble: 27 × 3
## tree_depth learn_rate sample_size
## <int> <dbl> <dbl>
## 1 1 0.001 0.1
## 2 8 0.001 0.1
## 3 15 0.001 0.1
## 4 1 0.0178 0.1
## 5 8 0.0178 0.1
## 6 15 0.0178 0.1
## 7 1 0.316 0.1
## 8 8 0.316 0.1
## 9 15 0.316 0.1
## 10 1 0.001 0.55
## # … with 17 more rows
# Create CV folds of training data
folds <- vfold_cv(titanic_train, v = 6)
# Tune along the grid
tune_results <- tune_grid(boost_spec,
Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
resamples = folds,
grid = tunegrid_boost,
metrics = metric_set(yardstick::roc_auc))
# Plot the results
autoplot(tune_results)
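To finish the workflow, the best configuration can be selected and the model refit, mirroring the decision-tree tuning above; a sketch:
# Pick the best hyperparameters from the grid-search results
best_boost_params <- select_best(tune_results)
# Finalize the specification and refit on the full training data
final_boost_spec <- finalize_model(boost_spec, best_boost_params)
final_boost_model <- fit(final_boost_spec,
                         Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
                         data = titanic_train)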