library(readr)
## Warning: package 'readr' was built under R version 4.3.3
# Load the Titanic passenger data (titanic3.csv) into T3.
# The default location is specific to one machine; set the TITANIC_CSV
# environment variable to read a copy stored elsewhere without editing
# this script.
titanic_csv <- Sys.getenv(
  "TITANIC_CSV",
  unset = "C:/Users/scott6522/Downloads/titanic3.csv"
)
# Fail early with a readable message rather than a low-level read error.
if (!file.exists(titanic_csv)) {
  stop("Titanic CSV not found: ", titanic_csv, call. = FALSE)
}
T3 <- read_csv(titanic_csv)
## Rows: 1309 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): pclass, name, sex, ticket, cabin, embarked, boat, home.dest
## dbl (6): survived, age, sibsp, parch, fare, body
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a GUI data viewer, so only call it when a person is driving
# the session; batch runs skip it.
if (interactive()) {
  View(T3)
}

# Build the working dataset, titanic, keeping only the outcome (survived)
# and the six passenger attributes analysed in the rest of the script.
titanic <- T3[, c(
  "survived", "embarked", "age", "sex", "sibsp", "parch", "fare"
)]
# View() opens a GUI data viewer, so only call it when a person is driving
# the session; batch runs skip it.
if (interactive()) {
  View(titanic)
}

# Descriptive statistics for every column of the titanic dataset
summary(object = titanic)
##     survived       embarked              age            sex           
##  Min.   :0.000   Length:1309        Min.   : 0.17   Length:1309       
##  1st Qu.:0.000   Class :character   1st Qu.:21.00   Class :character  
##  Median :0.000   Mode  :character   Median :28.00   Mode  :character  
##  Mean   :0.382                      Mean   :29.88                     
##  3rd Qu.:1.000                      3rd Qu.:39.00                     
##  Max.   :1.000                      Max.   :80.00                     
##                                     NA's   :263                       
##      sibsp            parch            fare        
##  Min.   :0.0000   Min.   :0.000   Min.   :  0.000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:  7.896  
##  Median :0.0000   Median :0.000   Median : 14.454  
##  Mean   :0.4989   Mean   :0.385   Mean   : 33.295  
##  3rd Qu.:1.0000   3rd Qu.:0.000   3rd Qu.: 31.275  
##  Max.   :8.0000   Max.   :9.000   Max.   :512.329  
##                                   NA's   :1
# Share of passengers in each survival class (0 and 1)
proportions(table(titanic[["survived"]]))
## 
##        0        1 
## 0.618029 0.381971
# Convert the categorical columns (survived, embarked, sex) to factors
factor_columns <- c("survived", "embarked", "sex")
titanic[factor_columns] <- lapply(titanic[factor_columns], as.factor)

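# Correlation matrix between survival and the other features.
# NOTE: the cor() call below is left disabled because cor() only accepts
# numeric input, and survived, embarked, and sex are factors at this point.
# Use a measure of association suited to categorical variables instead.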
# correlation_matrix <- cor(titanic[, c("survived", "embarked", "sex", "sibsp", "parch", "fare")])

# Plot survival against sex to see whether the two look associated.
# Writing the formula with bare column names and `data = titanic` labels the
# axes "sex" and "survived" instead of "titanic$sex" and "titanic$survived".
plot(survived ~ sex, data = titanic)

# Reproducible random split of titanic: 80% of rows for training, the
# remaining 20% for testing.
set.seed(1000)
n_rows <- nrow(titanic)
# floor() makes the training size an explicit whole number; the fractional
# 0.8 * n was previously truncated implicitly by sample().
n_train <- floor(0.8 * n_rows)
# sample.int(n, size) draws from 1..n and, unlike 1:nrow(), stays correct
# when the dataset has zero rows.
train_index <- sample.int(n_rows, n_train)
train <- titanic[train_index, ]
# Select the complement by positive index: titanic[-train_index, ] would
# return zero rows (instead of every row) if train_index were empty.
test <- titanic[setdiff(seq_len(n_rows), train_index), ]

# Load rpart package for decision tree modeling
library(rpart)

# Train a classification tree (method = "class") on the training rows,
# predicting survived from the six passenger features
fit <- rpart(
  formula = survived ~ sex + age + sibsp + parch + fare + embarked,
  data = train,
  method = "class"
)

# Visualize the decision tree
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 4.3.3
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(RColorBrewer)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
# Draw the fitted tree using rattle's styled rpart plot
rattle::fancyRpartPlot(fit)

# Predict a survival class for every row of the held-out test set
Prediction <- predict(object = fit, newdata = test, type = "class")

# Pair each predicted class with the corresponding passenger's sex
Results <- data.frame(
  PassengerSex = test[["sex"]],
  Survived = Prediction
)

# Save the predictions to Titanicdtree.csv without a row-name column
write.csv(x = Results, file = "Titanicdtree.csv", row.names = FALSE)