Importing Dataset:
# Load the Titanic training data into `train`.
# The former `remove(list = ls())` call was dropped: wiping the global
# environment destroys whatever the caller had in their session and adds
# nothing when the script runs in a fresh session.
# NOTE(review): the path below is specific to the author's machine; consider a
# project-relative path so the script runs elsewhere.
train <- read.csv("~/Desktop/BCE Coding/train.csv")

# Attach the tidyverse (dplyr verbs such as select() and sample_n() are used
# further down).
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Drop the descriptive columns by name so that only the columns passed on to
# cor() remain (cor() rejects non-numeric input).
train_numeric <- select(train, -c(Name, Sex, Ticket, Cabin, Embarked))

# Correlation matrix of the remaining columns. With
# use = "pairwise.complete.obs" each coefficient is computed from all rows in
# which both variables of that pair are observed, so a missing value in one
# column does not discard the row for every other pair.
train_corr_matrix <- cor(train_numeric, use = "pairwise.complete.obs")
# Draw the correlation matrix as a heat map.
# One-time setup if the package is not installed yet:
#   install.packages("ggcorrplot")
library(ggcorrplot)
ggcorrplot(train_corr_matrix)

# Summary statistics for the raw data set, rendered with stargazer.
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# type = "text" prints the table as plain text to the console. In the
# recorded output below, Age has N = 714 while every other variable has
# N = 891, i.e. Age contains missing values.
stargazer(train, type = "text")
##
## ==============================================
## Statistic N Mean St. Dev. Min Max
## ----------------------------------------------
## PassengerId 891 446.000 257.354 1 891
## Survived 891 0.384 0.487 0 1
## Pclass 891 2.309 0.836 1 3
## Age 714 29.699 14.526 0.420 80.000
## SibSp 891 0.523 1.103 0 8
## Parch 891 0.382 0.806 0 6
## Fare 891 32.204 49.693 0.000 512.329
## ----------------------------------------------
# Seed the RNG so the random split below is reproducible.
set.seed(150)

# Hold-out split: 600 randomly drawn rows are used for fitting; every row of
# `train` whose PassengerId is not in that sample is kept for evaluation.
train_subset <- sample_n(train, 600)
test_subset <- dplyr::anti_join(train, train_subset, by = "PassengerId")

# Ordinary least squares on the Survived indicator (a linear probability
# model), with Pclass entered as a categorical predictor.
Train1 <- lm(
  Survived ~ as.factor(Pclass) + Age + Fare + Sex,
  data = train_subset
)

# Score the held-out rows and inspect the distribution of fitted values.
# Being a linear fit, the values are not confined to [0, 1], and the recorded
# summary reports 63 NA predictions (presumably rows with a missing Age).
test_subset[["predictions"]] <- predict(Train1, newdata = test_subset)
summary(test_subset[["predictions"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -0.03006 0.13856 0.37472 0.43557 0.67612 1.07295 63
# Turn the fitted values into a 0/1 class using a 0.5 cut-off; rows whose
# prediction is NA stay NA.
test_subset[["predicted_Survived"]] <- ifelse(
  test_subset[["predictions"]] > 0.5, 1, 0
)

# Confusion matrix: rows are the observed Survived value, columns the
# predicted class. table() leaves out NA values by default, so held-out rows
# without a prediction are not counted here.
table(test_subset[["Survived"]], test_subset[["predicted_Survived"]])
##
## 0 1
## 0 103 21
## 1 29 75