# Importing the dataset ----

# NOTE: the former `remove(list = ls())` call was dropped on purpose. It only
# deletes user objects (attached packages and options are left as they were),
# so it does not produce a clean session, and it wipes the workspace of anyone
# who sources this script. Restart R to get a genuinely fresh session instead.

# Raw data set, read into a base data.frame named `train`.
train <- read.csv("~/Desktop/BCE Coding/train.csv")

# The dplyr verbs used later in the script (select, sample_n, anti_join) come
# from the tidyverse.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Drop the non-numeric columns so that the remaining variables can be passed
# to cor(). This statement used to appear twice, with a second, redundant
# library(tidyverse) call in between; the tidyverse is already attached
# earlier in the script, so a single copy is kept.
train_numeric <- select(
  .data = train,
  -Name, -Sex, -Ticket, -Cabin, -Embarked
)


# Correlation matrix of the numeric variables. With
# use = "pairwise.complete.obs" each pairwise correlation is computed from the
# rows where both variables are present, so columns containing NA values still
# get a coefficient. (See ?cor for the other `use` options; the help call
# itself is not run here because it would open the help viewer on every run.)
train_corr_matrix <- cor(train_numeric, use = "pairwise.complete.obs")

# install.packages("ggcorrplot")  # run once if the package is not installed
library(ggcorrplot)

# Plot the correlation matrix.
ggcorrplot(corr = train_corr_matrix)

# stargazer formats summary-statistics and regression tables; attaching it
# prints the citation notice reproduced below.
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Plain-text summary table of `train` (N, mean, standard deviation, min and
# max per variable, as shown in the output that follows).
stargazer(train, type = "text")
## 
## ==============================================
## Statistic    N   Mean   St. Dev.  Min    Max  
## ----------------------------------------------
## PassengerId 891 446.000 257.354    1     891  
## Survived    891  0.384   0.487     0      1   
## Pclass      891  2.309   0.836     1      3   
## Age         714 29.699   14.526  0.420 80.000 
## SibSp       891  0.523   1.103     0      8   
## Parch       891  0.382   0.806     0      6   
## Fare        891 32.204   49.693  0.000 512.329
## ----------------------------------------------
# Fix the RNG state so that the random split below is reproducible.
set.seed(150)

# Hold-out split: 600 randomly drawn rows are used for fitting; every row of
# `train` whose PassengerId is not among them is kept for evaluation.
train_subset <- sample_n(train, 600)
test_subset <- dplyr::anti_join(train, train_subset, by = "PassengerId")

# Ordinary least squares on the 0/1 Survived outcome (a linear probability
# model), with passenger class entered as a categorical predictor.
Train1 <- lm(
  formula = Survived ~ as.factor(Pclass) + Age + Fare + Sex,
  data = train_subset
)

# Fitted values for the held-out rows. Because the model is linear they are
# not confined to [0, 1], and the summary below also reports NA predictions.
test_subset[["predictions"]] <- predict(Train1, newdata = test_subset)

summary(test_subset[["predictions"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -0.03006  0.13856  0.37472  0.43557  0.67612  1.07295       63
# Turn the fitted value into a 0/1 class using a 0.5 cut-off; an NA prediction
# stays NA.
test_subset[["predicted_Survived"]] <- ifelse(test_subset[["predictions"]] > 0.5, 1, 0)

# Confusion table: rows are the observed outcome, columns the predicted class.
# table() leaves NA values out by default, so rows without a prediction are
# not counted here.
table(test_subset[["Survived"]], test_subset[["predicted_Survived"]])
##    
##       0   1
##   0 103  21
##   1  29  75