library(readr)
## Warning: package 'readr' was built under R version 4.3.3
# Read the raw passenger records from the local CSV file into T3
T3 <- read_csv(
  file = "C:/Users/scott6522/Downloads/titanic3.csv"
)
## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): pclass, name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (6): survived, age, sibsp, parch, fare, body
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the raw table in the data viewer only in interactive sessions, so
# non-interactive runs (e.g. Rscript) do not try to launch a viewer window
if (interactive()) {
  View(T3)
}
# Build a new dataset, titanic, keeping only the outcome column (survived)
# and the features used later for plotting and modelling
model_columns <- c(
  "survived", "embarked", "age", "sex", "sibsp", "parch", "fare"
)
titanic <- T3[, model_columns]
if (interactive()) {
  View(titanic)
}
# Print per-column summary statistics for the selected features
summary(object = titanic)
## survived embarked age sex
## Min. :0.000 Length:1309 Min. : 0.17 Length:1309
## 1st Qu.:0.000 Class :character 1st Qu.:21.00 Class :character
## Median :0.000 Mode :character Median :28.00 Mode :character
## Mean :0.382 Mean :29.88
## 3rd Qu.:1.000 3rd Qu.:39.00
## Max. :1.000 Max. :80.00
## NA's :263
## sibsp parch fare
## Min. :0.0000 Min. :0.000 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.: 7.896
## Median :0.0000 Median :0.000 Median : 14.454
## Mean :0.4989 Mean :0.385 Mean : 33.295
## 3rd Qu.:1.0000 3rd Qu.:0.000 3rd Qu.: 31.275
## Max. :8.0000 Max. :9.000 Max. :512.329
## NA's :1
# Share of passengers falling in each value of the survived column
prop.table(x = table(titanic[["survived"]]), margin = NULL)
##
## 0 1
## 0.618029 0.381971
# Convert survived, embarked, and sex to factors in a single pass
factor_columns <- c("survived", "embarked", "sex")
titanic[factor_columns] <- lapply(titanic[factor_columns], as.factor)
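# Find the correlation matrix between survival and the other features.
# The cor() call below stays commented out: survived, embarked, and sex are
# factors at this point and cor() only accepts numeric input. Consider using
# non-parametric correlation methods for the categorical variables instead.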
# correlation_matrix <- cor(titanic[, c("survived", "embarked", "sex", "sibsp", "parch", "fare")])
# Plot survival against sex to see if any association exists.
# Referring to the columns by name and passing the table through `data =`
# labels the axes "sex" and "survived" instead of the "titanic$..." expressions.
plot(survived ~ sex, data = titanic)

# Set a seed and split the dataset into 80% training, 20% testing
set.seed(1000)
n_rows <- nrow(titanic)
# floor() makes the training-set size an explicit whole number instead of
# relying on sample() to coerce a fractional size; seq_len() yields an empty
# sequence for a zero-row table, where 1:0 would give c(1, 0)
train_index <- sample(seq_len(n_rows), floor(0.8 * n_rows))
train <- titanic[train_index, ]
# Pick the held-out rows by positive index: negating an empty train_index
# would select no rows at all rather than every row
test <- titanic[setdiff(seq_len(n_rows), train_index), ]
# Load rpart package for decision tree modeling
library(rpart)
# Train a classification tree that predicts survived from the six features
fit <- rpart(
  formula = survived ~ sex + age + sibsp + parch + fare + embarked,
  data = train,
  method = "class"
)
# Visualize the decision tree
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 4.3.3
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
# Draw the fitted tree with rattle's styled rpart plot
fancyRpartPlot(model = fit)

# Predict the survival class for each row of the test set
Prediction <- predict(object = fit, newdata = test, type = "class")
# Pair each test passenger's sex with the predicted class
Results <- data.frame(
  PassengerSex = test[["sex"]],
  Survived = Prediction
)
# Save the predictions as a CSV file without a row-name column
write.csv(x = Results, file = "Titanicdtree.csv", row.names = FALSE)