ASSIGNMENT ANSWER KEY

Q1: A New, Fine-tuned Random Forest

Write an RF model with different hyperparameters and submit the results to Kaggle. What’s your accuracy?

To optimize the Random Forest model, we’ll perform hyperparameter tuning with cross-validation. We’ll use the caret package with the ranger method, which lets us tune mtry, splitrule, and min.node.size:

# Create a trainControl object for cross-validation
train_control <- trainControl(
  method = "repeatedcv", # Repeated k-fold cross-validation
  number = 5, # Number of folds
  repeats = 2, # Number of repeats
  search = "grid"
)

# Set up the tuning grid for hyperparameters
tune_grid <- expand.grid(
  mtry = c(2, 3, 4, 5), # Number of variables tried at each split
  splitrule = c("gini", "extratrees"), # Splitting rules
  min.node.size = c(1, 5, 10) # Minimum node sizes
)

Now let’s train the Random Forest with hyperparameter tuning via cross-validation:

set.seed(123)
tictoc::tic()
rf_model_tuned <- train(
  Survived ~ Sex + Pclass + Age + SibSp + Fare + Embarked,
  data = titanic_train,
  method = "ranger",
  trControl = train_control,
  tuneGrid = tune_grid,
  num.trees = 1000, # Number of trees
  importance = "impurity"
)
tictoc::toc()
#;-) 31.988 sec elapsed

View the results:

# print(rf_model_tuned)
print("Best Hyperparameters:")
#;-) [1] "Best Hyperparameters:"
print(rf_model_tuned$bestTune)
#;-)   mtry splitrule min.node.size
#;-) 7    3      gini             1
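
The full resampling profile can also be inspected if we want to see how accuracy varied across the grid; $results and the plot() method are standard parts of a caret train object:

# Optional: cross-validated accuracy for every grid combination
print(rf_model_tuned$results)

# Optional: plot accuracy against the tuning parameters
plot(rf_model_tuned)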

Now let’s predict on test data using the best model and submit the results to Kaggle:

rf_preds_tuned <- predict(rf_model_tuned, newdata = titanic_test)

# Prepare submission
rf_submission_tuned <- data.frame(
  PassengerId = titanic_test$PassengerId,
  Survived = rf_preds_tuned
)

write.csv(rf_submission_tuned, "rf_submission_tuned.csv", row.names = FALSE)

Upon submission, we got an accuracy of 0.78, slightly better than the non-tuned model.

Q2: Logistic Regression

2.0 - Build a traditional logistic regression model using the same data.

# Convert Pclass to unordered factor for logistic regression
titanic_train$Pclass <- factor(titanic_train$Pclass)
titanic_test$Pclass <- factor(titanic_test$Pclass)

# Build logistic regression model
logit_model <- glm(Survived ~ Sex + Pclass + Age + SibSp + Fare + Embarked,
  data = titanic_train,
  family = binomial(link = "logit")
)

# Predict on training set
train_preds_logit <- predict(logit_model, newdata = titanic_train, type = "response")
train_preds_logit_class <- ifelse(train_preds_logit > 0.5, 1, 0)

# Confusion matrix
confusionMatrix(factor(train_preds_logit_class), titanic_train$Survived)
#;-) Confusion Matrix and Statistics
#;-) 
#;-)           Reference
#;-) Prediction   0   1
#;-)          0 477  99
#;-)          1  72 243
#;-)                                           
#;-)                Accuracy : 0.8081          
#;-)                  95% CI : (0.7807, 0.8334)
#;-)     No Information Rate : 0.6162          
#;-)     P-Value [Acc > NIR] : < 2e-16         
#;-)                                           
#;-)                   Kappa : 0.5881          
#;-)                                           
#;-)  Mcnemar's Test P-Value : 0.04678         
#;-)                                           
#;-)             Sensitivity : 0.8689          
#;-)             Specificity : 0.7105          
#;-)          Pos Pred Value : 0.8281          
#;-)          Neg Pred Value : 0.7714          
#;-)              Prevalence : 0.6162          
#;-)          Detection Rate : 0.5354          
#;-)    Detection Prevalence : 0.6465          
#;-)       Balanced Accuracy : 0.7897          
#;-)                                           
#;-)        'Positive' Class : 0               
#;-) 

Roughly 0.81 accuracy on the training set.

2.1 - What’s your accuracy - is it better or worse than the RF model?

# Predict on test set
test_preds_logit <- predict(logit_model, newdata = titanic_test, type = "response")
test_preds_logit_class <- ifelse(test_preds_logit > 0.5, 1, 0)
# here, we basically predict the probability of survival for each passenger,
## and then classify them based on a threshold of 0.5.

# Prepare submission
logit_prediction_sub <- data.frame(
  PassengerId = titanic_test$PassengerId,
  Survived = test_preds_logit_class
)

# Write submission file
write.csv(logit_prediction_sub, "logit_submission.csv", row.names = FALSE)

Upon submission, we got an accuracy of 0.76, slightly worse than the RF model.

Q3: Data Leakage

Check out this project, especially their case study: https://reproducible.cs.princeton.edu/

3.1. What do you see in the case study?

Their case study is about civil war prediction, where machine learning models are used to predict the onset of civil wars. They show the existence of incorrect imputation and data leakage in selected publications. What’s even worse is that these articles were all published in PA, the top political methodology journal.

To be fair, though, some of the articles fall under this category simply because they used a dataset that already had an imputation problem.

3.2. What’s the data leakage they are referring to?

"Data leakage is a spurious relationship between the independent variables and the target variable that arises as an artifact of the data collection, sampling, or pre-processing strategy."

This definition comes from page 2 of the paper accompanying the project.

3.3. What mistake could we have made in our Titanic model that could be considered data leakage? Answer with code.

We don’t need to look far: there was data leakage in our Titanic model already. We used KNN imputation to fill missing values in the test set based on the training data. This practice can inadvertently introduce information from the training set into the test set, leading to data leakage.

This was the problematic step:
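
A minimal sketch of what that looked like, assuming caret’s preProcess() with method = "knnImpute" (the object and column names below are illustrative, not necessarily those used in the original chunk):

# Fit a KNN imputation model on numeric training columns only
impute_model <- preProcess(
  titanic_train[, c("Age", "Fare")],
  method = "knnImpute"
)

# Apply the training-based imputer to the test set: the imputed test values
# are computed from the nearest neighbors among training passengers
titanic_test_imputed <- predict(
  impute_model,
  titanic_test[, c("Age", "Fare")]
)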

KNN imputation fills in missing values based on the similarity (distance) between observations. When we apply the imputation model fitted on the training set to the test set, the algorithm uses the nearest neighbors from the training data to impute missing values in the test data. This process effectively transfers information from the training set to the test set, as the imputed values in the test set are influenced by the training data’s distribution and relationships.

The correct approach would be to impute missing values in the training and test sets separately:
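
Again as a sketch, with the same illustrative column names:

# Impute the training set using only training observations
train_imputer <- preProcess(
  titanic_train[, c("Age", "Fare")],
  method = "knnImpute"
)
titanic_train_imputed <- predict(
  train_imputer,
  titanic_train[, c("Age", "Fare")]
)

# Impute the test set using only test observations, so no information
# from the training data flows into the test features
test_imputer <- preProcess(
  titanic_test[, c("Age", "Fare")],
  method = "knnImpute"
)
titanic_test_imputed <- predict(
  test_imputer,
  titanic_test[, c("Age", "Fare")]
)

Each set’s imputed values are now derived only from that set’s own observations.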

Session info

sessionInfo()
#;-) R version 4.4.1 (2024-06-14)
#;-) Platform: aarch64-apple-darwin20
#;-) Running under: macOS Sonoma 14.1.1
#;-) 
#;-) Matrix products: default
#;-) BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
#;-) LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
#;-) 
#;-) locale:
#;-) [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#;-) 
#;-) time zone: Europe/Berlin
#;-) tzcode source: internal
#;-) 
#;-) attached base packages:
#;-) [1] stats     graphics  grDevices utils     datasets  methods   base     
#;-) 
#;-) other attached packages:
#;-) [1] ranger_0.17.0        randomForest_4.7-1.2 rpart.plot_3.1.2    
#;-) [4] rpart_4.1.23         caret_6.0-94         lattice_0.22-6      
#;-) [7] ggplot2_3.5.1       
#;-) 
#;-) loaded via a namespace (and not attached):
#;-)  [1] gtable_0.3.5         xfun_0.47            recipes_1.1.0       
#;-)  [4] vctrs_0.6.5          tools_4.4.1          generics_0.1.3      
#;-)  [7] stats4_4.4.1         parallel_4.4.1       proxy_0.4-27        
#;-) [10] tibble_3.2.1         fansi_1.0.6          ModelMetrics_1.2.2.2
#;-) [13] pkgconfig_2.0.3      R.oo_1.26.0          Matrix_1.7-0        
#;-) [16] data.table_1.16.0    lifecycle_1.0.4      R.cache_0.16.0      
#;-) [19] compiler_4.4.1       stringr_1.5.1        tictoc_1.2.1        
#;-) [22] munsell_0.5.1        codetools_0.2-20     htmltools_0.5.8.1   
#;-) [25] class_7.3-22         yaml_2.3.10          prodlim_2024.06.25  
#;-) [28] pillar_1.9.0         MASS_7.3-61          R.utils_2.12.3      
#;-) [31] gower_1.0.1          iterators_1.0.14     foreach_1.5.2       
#;-) [34] nlme_3.1-164         parallelly_1.38.0    lava_1.8.0          
#;-) [37] styler_1.10.3        tidyselect_1.2.1     digest_0.6.37       
#;-) [40] stringi_1.8.4        future_1.34.0        dplyr_1.1.4         
#;-) [43] reshape2_1.4.4       purrr_1.0.2          listenv_0.9.1       
#;-) [46] splines_4.4.1        fastmap_1.2.0        grid_4.4.1          
#;-) [49] colorspace_2.1-1     cli_3.6.3            magrittr_2.0.3      
#;-) [52] survival_3.6-4       utf8_1.2.4           e1071_1.7-16        
#;-) [55] future.apply_1.11.2  withr_3.0.1          scales_1.3.0        
#;-) [58] lubridate_1.9.3      timechange_0.3.0     rmarkdown_2.28      
#;-) [61] globals_0.16.3       nnet_7.3-19          timeDate_4041.110   
#;-) [64] RANN_2.6.2           R.methodsS3_1.8.2    evaluate_0.24.0     
#;-) [67] knitr_1.48           hardhat_1.4.0        rlang_1.1.4         
#;-) [70] Rcpp_1.0.13          glue_1.7.0           pROC_1.18.5         
#;-) [73] ipred_0.9-15         reprex_2.1.1         rstudioapi_0.16.0   
#;-) [76] R6_2.5.1             plyr_1.8.9           fs_1.6.4