# Load necessary packages
library(dplyr)      
## Warning: package 'dplyr' was built under R version 4.2.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)     
## Warning: package 'tibble' was built under R version 4.2.3
library(rpart)       
library(rpart.plot) 
## Warning: package 'rpart.plot' was built under R version 4.2.3
# Load the training and test sets from the working directory as tibbles.
# read.csv() keeps blank character fields as "" rather than NA, which is why
# the exploration tables below show a "" category.
train <- tibble::tibble(read.csv("train.csv"))
test <- tibble::tibble(read.csv("test.csv"))

# Explore the data ========================================================
# List the columns available in the training set
names(train)
##  [1] "PassengerId"  "HomePlanet"   "CryoSleep"    "Cabin"        "Destination" 
##  [6] "Age"          "VIP"          "RoomService"  "FoodCourt"    "ShoppingMall"
## [11] "Spa"          "VRDeck"       "Name"         "Transported"
# Cross-tabulate HomePlanet (rows) against Transported (columns):
# count every (Transported, HomePlanet) pair, then spread the counts so each
# Transported value becomes its own column.
train %>%
  dplyr::count(Transported, HomePlanet) %>%
  tidyr::pivot_wider(
    id_cols = HomePlanet,
    names_from = Transported,
    values_from = n
  )
## # A tibble: 4 × 3
##   HomePlanet False  True
##   <chr>      <int> <int>
## 1 ""            98   103
## 2 "Earth"     2651  1951
## 3 "Europa"     727  1404
## 4 "Mars"       839   920
# Cross-tabulate CryoSleep (rows) against Transported (columns)
train %>%
  dplyr::count(Transported, CryoSleep) %>%
  tidyr::pivot_wider(
    id_cols = CryoSleep,
    names_from = Transported,
    values_from = n
  )
## # A tibble: 3 × 3
##   CryoSleep False  True
##   <chr>     <int> <int>
## 1 ""          111   106
## 2 "False"    3650  1789
## 3 "True"      554  2483
# Cross-tabulate CryoSleep (rows) against Transported (columns), restricted to
# passengers whose HomePlanet is "Europa"
train %>%
  dplyr::filter(HomePlanet == "Europa") %>%
  dplyr::count(Transported, CryoSleep) %>%
  tidyr::pivot_wider(
    id_cols = CryoSleep,
    names_from = Transported,
    values_from = n
  )
## # A tibble: 3 × 3
##   CryoSleep False  True
##   <chr>     <int> <int>
## 1 ""           20    38
## 2 "False"     697   465
## 3 "True"       10   901
# Cross-tabulate VRDeck spend (rows) against Transported (columns), largest
# spend first.
# na.omit() removes every row containing an NA: a VRDeck value seen for only
# one Transported outcome (pivot_wider leaves the other count NA) and a
# missing VRDeck value, if present. Only spends observed in both groups remain.
train %>%
  dplyr::count(Transported, VRDeck) %>%
  tidyr::pivot_wider(
    id_cols = VRDeck,
    names_from = Transported,
    values_from = n
  ) %>%
  dplyr::arrange(dplyr::desc(VRDeck)) %>%
  stats::na.omit()
## # A tibble: 296 × 3
##    VRDeck False  True
##     <dbl> <int> <int>
##  1   2577     1     1
##  2   2376     1     1
##  3   2260     1     1
##  4   1804     1     1
##  5   1674     2     1
##  6   1539     1     1
##  7   1514     1     1
##  8   1479     1     1
##  9   1460     1     1
## 10   1384     1     1
## # ℹ 286 more rows
# Cross-tabulate VIP status (rows) against Transported (columns)
train %>%
  dplyr::count(Transported, VIP) %>%
  tidyr::pivot_wider(
    id_cols = VIP,
    names_from = Transported,
    values_from = n
  )
## # A tibble: 3 × 3
##   VIP     False  True
##   <chr>   <int> <int>
## 1 ""         99   104
## 2 "False"  4093  4198
## 3 "True"    123    76
# Decision tree ========================================================
# Prepare the data for the decision tree.
# Adds to `train`:
#   Transported - recoded as a factor with "True" as the first level
#   deck        - Cabin text before the first "/"
#   num         - Cabin text between the first and last "/", as numeric
#   side        - Cabin text after the last "/"
#   luxury      - sum of the five spending columns
# NOTE(review): assumes Cabin looks like "deck/num/side" (e.g. "B/0/P") --
# confirm against the data. An empty Cabin gives "" for deck/side and NA for
# num.
train_ok <- train %>%
  dplyr::mutate(
    Transported = factor(Transported, levels = c("True", "False")),
    # Drop everything from the first "/" onward
    deck = stringr::str_remove(Cabin, "/.*"),
    # Strip the trailing "/side" piece (of any length), then the leading
    # "deck/" piece, leaving only the middle component
    num = Cabin %>%
      stringr::str_remove("/[^/]*$") %>%
      stringr::str_remove("^.*/") %>%
      as.numeric(),
    # Greedy match up to and including the last "/"
    side = stringr::str_remove(Cabin, "^.*/"),
    # Plain `+` propagates NA: a passenger with any missing spending value
    # gets luxury = NA
    luxury = RoomService + FoodCourt + ShoppingMall + Spa + VRDeck
  )

# Columns handed to rpart: the response followed by its predictors
vars <- c("Transported", "CryoSleep", "Age")

# Fit a tree of Transported on every other column listed in `vars`
m <- rpart::rpart(Transported ~ ., data = train_ok[, vars])

# Draw the fitted tree
rpart.plot::rpart.plot(m)

# Confusion matrix of the tree's class predictions against the true labels.
# The predictions are made on the same data the tree was fitted on, so the
# reported accuracy is a training (resubstitution) figure.
p <- predict(m, newdata = train_ok, type = "class")
caret::confusionMatrix(
  data = p,
  reference = train_ok$Transported,
  positive = "True"
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction True False
##      True  2774   692
##      False 1604  3623
##                                           
##                Accuracy : 0.7359          
##                  95% CI : (0.7265, 0.7451)
##     No Information Rate : 0.5036          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4725          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6336          
##             Specificity : 0.8396          
##          Pos Pred Value : 0.8003          
##          Neg Pred Value : 0.6931          
##              Prevalence : 0.5036          
##          Detection Rate : 0.3191          
##    Detection Prevalence : 0.3987          
##       Balanced Accuracy : 0.7366          
##                                           
##        'Positive' Class : True            
## 
# Score the test set: take the per-class probabilities from the tree, attach
# each passenger's id, and label a passenger "True" only when the "True"
# probability is strictly larger than the "False" one (ties become "False").
# The result is printed, not stored or written to disk.
predict(m, newdata = test, type = "prob") %>%
  as.data.frame() %>%
  tibble::as_tibble() %>%
  dplyr::mutate(
    PassengerId = test$PassengerId,
    Transported = ifelse(True > False, "True", "False")
  ) %>%
  dplyr::select(PassengerId, Transported)
## # A tibble: 4,277 × 2
##    PassengerId Transported
##    <chr>       <chr>      
##  1 0013_01     True       
##  2 0018_01     False      
##  3 0019_01     True       
##  4 0021_01     False      
##  5 0023_01     False      
##  6 0027_01     False      
##  7 0029_01     True       
##  8 0032_01     True       
##  9 0032_02     True       
## 10 0033_01     False      
## # ℹ 4,267 more rows
# Embed the image file "4444.jpeg" in the rendered document
knitr::include_graphics(path = "4444.jpeg")