titanic

# Load the raw passenger records from the CSV file in the working directory
library(readr)
T3 <- read_csv(file = "titanic3.csv")

## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (7): pclass, survived, age, sibsp, parch, fare, body
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Build the working 'titanic' data from the columns used in the analysis:
# survived, embarked, age, sex, sibsp, parch, fare
titanic <- T3[
  ,
  c("survived", "embarked", "age", "sex", "sibsp", "parch", "fare")
]
head(x = titanic)

## # A tibble: 6 × 7
##   survived embarked   age sex    sibsp parch  fare
##      <dbl> <chr>    <dbl> <chr>  <dbl> <dbl> <dbl>
## 1        1 S        29    female     0     0 211. 
## 2        1 S         0.92 male       1     2 152. 
## 3        0 S         2    female     1     2 152. 
## 4        0 S        30    male       1     2 152. 
## 5        0 S        25    female     1     2 152. 
## 6        1 S        48    male       0     0  26.6

# Share of passengers who survived (survived is coded 1 for survivors)
prop_survived <- mean(titanic[["survived"]] == 1)
print(prop_survived)

## [1] 0.381971

# Check whether the data contains any missing values
anyNA(titanic)

## [1] TRUE

# Drop every row that has at least one missing value
titanic <- stats::na.omit(titanic)

# Treat survived, embarked, and sex as categorical variables
titanic[["survived"]] <- as.factor(titanic[["survived"]])
titanic[["embarked"]] <- as.factor(titanic[["embarked"]])
titanic[["sex"]] <- as.factor(titanic[["sex"]])
str(titanic)

## tibble [1,043 × 7] (S3: tbl_df/tbl/data.frame)
##  $ survived: Factor w/ 2 levels "0","1": 2 2 1 1 1 2 2 1 2 1 ...
##  $ embarked: Factor w/ 3 levels "C","Q","S": 3 3 3 3 3 3 3 3 3 1 ...
##  $ age     : num [1:1043] 29 0.92 2 30 25 48 63 39 53 71 ...
##  $ sex     : Factor w/ 2 levels "female","male": 1 2 1 2 1 2 1 2 1 2 ...
##  $ sibsp   : num [1:1043] 0 1 1 1 1 0 1 0 2 0 ...
##  $ parch   : num [1:1043] 0 2 2 2 2 0 0 0 0 0 ...
##  $ fare    : num [1:1043] 211 152 152 152 152 ...
##  - attr(*, "na.action")= 'omit' Named int [1:266] 16 38 41 47 60 70 71 75 81 107 ...
##   ..- attr(*, "names")= chr [1:266] "16" "38" "41" "47" ...

#Find correlation
library(ggcorrplot)

## Warning: package 'ggcorrplot' was built under R version 4.3.3

# Calculate correlation matrix
# model.matrix() turns the factor columns into numeric indicator columns so
# that cor() can be applied to every variable at once.
# The native pipe (R >= 4.1) is used instead of `%>%`: as far as this script
# shows, no attached package provides the magrittr pipe at this point.
correlation_matrix <- model.matrix(~ 0 + ., data = titanic) |>
  cor(use = "pairwise.complete.obs")

# Plot correlation matrix (lower triangle only, coefficients printed as labels)
ggcorrplot(
  correlation_matrix,
  show.diag = FALSE,
  type = "lower",
  lab = TRUE,
  lab_size = 2
)

# Set seed and split dataset
# The split is derived from the actual number of rows left after removing
# NAs. Hard-coded row ranges taken from the raw data run past the end of the
# cleaned data and yield rows made up entirely of NA.
# Rows are sampled at random, so the seed makes the split reproducible.
set.seed(1000)
n_obs <- nrow(titanic)
train_idx <- sample(seq_len(n_obs), size = floor(0.8 * n_obs))
# setdiff() keeps the complement correct even if train_idx is empty
test_idx <- setdiff(seq_len(n_obs), train_idx)
titanic_train <- titanic[train_idx, ]  # 80% of the rows for fitting
titanic_test <- titanic[test_idx, ]    # remaining 20% held out for prediction

#Train learner on training dataset
library(rpart)

## Warning: package 'rpart' was built under R version 4.3.3

# Fit a classification tree for survival using all remaining predictors
fit <- rpart(
  formula = survived ~ sex + age + sibsp + parch + fare + embarked,
  data = titanic_train,
  method = "class"
)
# Quick base-graphics view of the fitted tree
plot(fit)

#Make plot readable
library(rattle)

## Warning: package 'rattle' was built under R version 4.3.3

## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

# The model is fitted with method = "class", so the title describes it as a
# classification tree rather than a regression tree.
fancyRpartPlot(fit, main = "Classification tree for Titanic")

# Predict the survival class for the held-out passengers and store the
# result of the prediction next to each passenger's sex
titanic_prediction <- predict(
  object = fit,
  newdata = titanic_test,
  type = "class"
)
Results <- data.frame(
  PassengerSex = titanic_test[["sex"]],
  Survived = titanic_prediction
)
head(x = Results)

##   PassengerSex Survived
## 1         <NA>        0
## 2         <NA>        0
## 3         <NA>        0
## 4         <NA>        0
## 5         <NA>        0
## 6         <NA>        0

# Write the prediction table to a .csv file, without the row-name column
write.csv(
  x = Results,
  file = "Titanicdtree.csv",
  row.names = FALSE
)

titanic

Thu Vu

2024-04-14