# Load the raw Titanic passenger records from the CSV file
library(readr)
T3 <- read_csv(file = "titanic3.csv")
## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (7): pclass, survived, age, sibsp, parch, fare, body
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Build 'titanic' from the columns used in the analysis:
# survived, embarked, age, sex, sibsp, parch, fare
model_columns <- c(
  "survived", "embarked", "age", "sex", "sibsp", "parch", "fare"
)
titanic <- T3[, model_columns]
# Preview the first rows of the reduced data
head(titanic)
## # A tibble: 6 × 7
## survived embarked age sex sibsp parch fare
## <dbl> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 1 S 29 female 0 0 211.
## 2 1 S 0.92 male 1 2 152.
## 3 0 S 2 female 1 2 152.
## 4 0 S 30 male 1 2 152.
## 5 0 S 25 female 1 2 152.
## 6 1 S 48 male 0 0 26.6
# Proportion of passengers whose survived value equals 1
survived_flag <- titanic$survived == 1
prop_survived <- mean(survived_flag)
print(prop_survived)
## [1] 0.381971
# Report whether any value is missing, then drop every row containing an NA
anyNA(titanic)
## [1] TRUE
titanic <- na.omit(titanic)
# Convert the categorical columns (survived, embarked, sex) to factors
for (factor_col in c("survived", "embarked", "sex")) {
  titanic[[factor_col]] <- as.factor(titanic[[factor_col]])
}
# Show the resulting column types
str(titanic)
## tibble [1,043 × 7] (S3: tbl_df/tbl/data.frame)
## $ survived: Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 2 1 ...
## $ embarked: Factor w/ 3 levels "C","Q","S": 3 3 3 3 3 3 3 3 3 1 ...
## $ age : num [1:1043] 29 0.92 2 30 25 48 63 39 53 71 ...
## $ sex : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
## $ sibsp : num [1:1043] 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : num [1:1043] 0 2 2 2 2 0 0 0 0 0 ...
## $ fare : num [1:1043] 211 152 152 152 152 ...
## - attr(*, "na.action")= 'omit' Named int [1:266] 16 38 41 47 60 70 71 75 81 107 ...
## ..- attr(*, "names")= chr [1:266] "16" "38" "41" "47" ...
#Find correlation
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.3
# Calculate correlation matrix
# model.matrix() with `~ 0 + .` builds a numeric design matrix from every
# column of titanic without an intercept, so factor columns enter as
# indicator columns that cor() can handle.
# cor() is called directly instead of through `%>%`: the script never
# attaches a package that provides the magrittr pipe, so the piped form fails
# with "could not find function" in a fresh session.
design_matrix <- model.matrix(~ 0 + ., data = titanic)
correlation_matrix <- cor(design_matrix, use = "pairwise.complete.obs")
# Plot correlation matrix (lower triangle only, coefficients printed as labels)
ggcorrplot(
  correlation_matrix,
  show.diag = FALSE,
  type = "lower",
  lab = TRUE,
  lab_size = 2
)

# Set seed and split dataset into training (80%) and test (20%) rows
# Fixing the seed makes the random partition below reproducible.
set.seed(1000)
# Derive the row indices from nrow(titanic) instead of hard-coding row
# numbers: indexing past the last row of the NA-cleaned data returns all-NA
# rows, which would leave the test set without any real observations.
n_obs <- nrow(titanic)
train_rows <- sample(seq_len(n_obs), size = floor(0.8 * n_obs))
# Every row not drawn for training goes to the test set
test_rows <- setdiff(seq_len(n_obs), train_rows)
titanic_train <- titanic[train_rows, ]
titanic_test <- titanic[test_rows, ]
#Train learner on training dataset
library(rpart)
## Warning: package 'rpart' was built under R version 4.3.3
# Fit a classification tree (method = "class") predicting survived from the
# remaining columns, using the training rows only
fit <- rpart(
  formula = survived ~ sex + age + sibsp + parch + fare + embarked,
  data = titanic_train,
  method = "class"
)
# Draw the tree with the default rpart plot method
plot(fit)

#Make plot readable
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# fit is built with method = "class", i.e. a classification tree, so the
# title says so rather than calling it a regression tree
fancyRpartPlot(fit, main = "Classification tree for Titanic")

# Predict the survival class for each test row and store it next to the
# passenger's sex
titanic_prediction <- predict(fit, newdata = titanic_test, type = "class")
Results <- data.frame(
  PassengerSex = titanic_test[["sex"]],
  Survived = titanic_prediction
)
# Preview the first predictions
head(Results)
## PassengerSex Survived
## 1 <NA> 0
## 2 <NA> 0
## 3 <NA> 0
## 4 <NA> 0
## 5 <NA> 0
## 6 <NA> 0
# Write the prediction table to a .csv file, without a row-name column
write.csv(x = Results, file = "Titanicdtree.csv", row.names = FALSE)