# Load necessary packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
## Warning: package 'tibble' was built under R version 4.2.3
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
# Import the training and test CSV files, wrapping each data frame as a tibble
train <- tibble::tibble(read.csv(file = "train.csv"))
test <- tibble::tibble(read.csv(file = "test.csv"))
# Explore the data ========================================================
# List the variables available in the training set
names(train)
## [1] "PassengerId" "HomePlanet" "CryoSleep" "Cabin" "Destination"
## [6] "Age" "VIP" "RoomService" "FoodCourt" "ShoppingMall"
## [11] "Spa" "VRDeck" "Name" "Transported"
# Cross-tabulate HomePlanet (one row per value) against Transported
# (one column per value); each cell is a passenger count
train %>%
  dplyr::count(Transported, HomePlanet) %>%
  tidyr::pivot_wider(
    names_from = "Transported",
    values_from = "n",
    id_cols = "HomePlanet"
  )
## # A tibble: 4 × 3
## HomePlanet False True
## <chr> <int> <int>
## 1 "" 98 103
## 2 "Earth" 2651 1951
## 3 "Europa" 727 1404
## 4 "Mars" 839 920
# Cross-tabulate CryoSleep (one row per value) against Transported
# (one column per value); each cell is a passenger count
train %>%
  dplyr::count(Transported, CryoSleep) %>%
  tidyr::pivot_wider(
    names_from = "Transported",
    values_from = "n",
    id_cols = "CryoSleep"
  )
## # A tibble: 3 × 3
## CryoSleep False True
## <chr> <int> <int>
## 1 "" 111 106
## 2 "False" 3650 1789
## 3 "True" 554 2483
# Same CryoSleep vs. Transported cross-tabulation, restricted to passengers
# whose HomePlanet is "Europa"
train %>%
  dplyr::filter(HomePlanet == "Europa") %>%
  dplyr::count(Transported, CryoSleep) %>%
  tidyr::pivot_wider(
    names_from = "Transported",
    values_from = "n",
    id_cols = "CryoSleep"
  )
## # A tibble: 3 × 3
## CryoSleep False True
## <chr> <int> <int>
## 1 "" 20 38
## 2 "False" 697 465
## 3 "True" 10 901
# Cross-tabulate VRDeck spend against Transported, largest spend first.
# pivot_wider() leaves NA where a spend value occurs for only one outcome,
# so na.omit() keeps just the spend values seen for both outcomes (and
# drops the row for missing VRDeck).
train %>%
  dplyr::count(Transported, VRDeck) %>%
  tidyr::pivot_wider(
    names_from = "Transported",
    values_from = "n",
    id_cols = "VRDeck"
  ) %>%
  dplyr::arrange(dplyr::desc(VRDeck)) %>%
  na.omit()
## # A tibble: 296 × 3
## VRDeck False True
## <dbl> <int> <int>
## 1 2577 1 1
## 2 2376 1 1
## 3 2260 1 1
## 4 1804 1 1
## 5 1674 2 1
## 6 1539 1 1
## 7 1514 1 1
## 8 1479 1 1
## 9 1460 1 1
## 10 1384 1 1
## # ℹ 286 more rows
# Cross-tabulate VIP (one row per value) against Transported
# (one column per value); each cell is a passenger count
train %>%
  dplyr::count(Transported, VIP) %>%
  tidyr::pivot_wider(
    names_from = "Transported",
    values_from = "n",
    id_cols = "VIP"
  )
## # A tibble: 3 × 3
## VIP False True
## <chr> <int> <int>
## 1 "" 99 104
## 2 "False" 4093 4198
## 3 "True" 123 76
# Decision tree ========================================================
# Build the modelling table from the raw training data:
#   Transported - recoded as a factor whose first level is "True"
#   deck        - Cabin text before the first "/"
#   num         - Cabin with the trailing "/<char>" and everything up to the
#                 remaining "/" removed, coerced to numeric
#   side        - Cabin text after the last "/"
#   luxury      - sum of the five spending columns; NA if any of them is NA
# NOTE(review): the Cabin parsing presumes values shaped like "deck/num/side"
# - confirm against the data.
train_ok <- dplyr::mutate(
  train,
  Transported = factor(Transported, levels = c("True", "False")),
  deck = stringr::str_remove(Cabin, "\\/.*"),
  num = as.numeric(
    stringr::str_remove(stringr::str_remove(Cabin, "\\/.$"), ".*\\/")
  ),
  side = stringr::str_remove(Cabin, ".*/\\.*"),
  luxury = RoomService + FoodCourt + ShoppingMall + Spa + VRDeck
)
# Columns handed to rpart: the response followed by the two predictors
vars <- c("Transported", "CryoSleep", "Age")
# Fit a tree for Transported using every other selected column as a predictor
m <- rpart::rpart(Transported ~ ., data = train_ok[vars])
# Draw the fitted tree
rpart.plot::rpart.plot(m)
# Predict classes for the same rows the tree was fitted on, then tabulate
# predicted vs. actual Transported with "True" as the positive class.
# These are training-set (in-sample) figures, not a held-out estimate.
p <- predict(object = m, newdata = train_ok, type = "class")
caret::confusionMatrix(
  data = p,
  reference = train_ok$Transported,
  positive = "True"
)

## Confusion Matrix and Statistics
##
## Reference
## Prediction True False
## True 2774 692
## False 1604 3623
##
## Accuracy : 0.7359
## 95% CI : (0.7265, 0.7451)
## No Information Rate : 0.5036
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4725
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6336
## Specificity : 0.8396
## Pos Pred Value : 0.8003
## Neg Pred Value : 0.6931
## Prevalence : 0.5036
## Detection Rate : 0.3191
## Detection Prevalence : 0.3987
## Balanced Accuracy : 0.7366
##
## 'Positive' Class : True
##
# Score the test set: take the per-class probabilities from the tree and
# label a passenger "True" only when P(True) is strictly greater than
# P(False). The resulting PassengerId/Transported table is printed, not
# stored or written to disk.
predict(m, newdata = test, type = "prob") %>%
  as.data.frame() %>%
  tibble::as_tibble() %>%
  dplyr::transmute(
    PassengerId = test$PassengerId,
    Transported = ifelse(True > False, "True", "False")
  )
## # A tibble: 4,277 × 2
## PassengerId Transported
## <chr> <chr>
## 1 0013_01 True
## 2 0018_01 False
## 3 0019_01 True
## 4 0021_01 False
## 5 0023_01 False
## 6 0027_01 False
## 7 0029_01 True
## 8 0032_01 True
## 9 0032_02 True
## 10 0033_01 False
## # ℹ 4,267 more rows
# Embed the image file "4444.jpeg" in the rendered document
knitr::include_graphics(path = "4444.jpeg")
