data <- read.csv("//AARONARROYO/Users/aaron/OneDrive/Desktop/Analytics 5336/Titanic-Dataset.csv") #Read a CSV file
head(data) # Previewing the data
View(data) #Viewing the data
str(data) #Checking the structure
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
summary(data) #Quick stats
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# number of survivors
number_survived <- table(data$Survived)
number_survived
##
## 0 1
## 549 342
prop.table(number_survived)
##
## 0 1
## 0.6161616 0.3838384
# passenger class numbers
pass_class <- table(data$Pclass)
pass_class
##
## 1 2 3
## 216 184 491
prop.table(pass_class)
##
## 1 2 3
## 0.2424242 0.2065095 0.5510662
# sibling/spouses
number_sibsp <- table(data$SibSp)
number_sibsp
##
## 0 1 2 3 4 5 8
## 608 209 28 16 18 5 7
prop.table(number_sibsp)
##
## 0 1 2 3 4 5
## 0.682379349 0.234567901 0.031425365 0.017957351 0.020202020 0.005611672
## 8
## 0.007856341
# parents/children
number_parch <- table(data$Parch)
number_parch
##
## 0 1 2 3 4 5 6
## 678 118 80 5 4 5 1
prop.table(number_parch)
##
## 0 1 2 3 4 5
## 0.760942761 0.132435466 0.089786756 0.005611672 0.004489338 0.005611672
## 6
## 0.001122334
colSums(is.na(data)) #Checking what cols have missing values
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
sum(is.na(data$Cabin)) #Returns 687 aka missing values
## [1] 0
# Look at unique values
unique(data$Cabin)
## [1] "" "C85" "C123" "E46"
## [5] "G6" "C103" "D56" "A6"
## [9] "C23 C25 C27" "B78" "D33" "B30"
## [13] "C52" "B28" "C83" "F33"
## [17] "F G73" "E31" "A5" "D10 D12"
## [21] "D26" "C110" "B58 B60" "E101"
## [25] "F E69" "D47" "B86" "F2"
## [29] "C2" "E33" "B19" "A7"
## [33] "C49" "F4" "A32" "B4"
## [37] "B80" "A31" "D36" "D15"
## [41] "C93" "C78" "D35" "C87"
## [45] "B77" "E67" "B94" "C125"
## [49] "C99" "C118" "D7" "A19"
## [53] "B49" "D" "C22 C26" "C106"
## [57] "C65" "E36" "C54" "B57 B59 B63 B66"
## [61] "C7" "E34" "C32" "B18"
## [65] "C124" "C91" "E40" "T"
## [69] "C128" "D37" "B35" "E50"
## [73] "C82" "B96 B98" "E10" "E44"
## [77] "A34" "C104" "C111" "C92"
## [81] "E38" "D21" "E12" "E63"
## [85] "A14" "B37" "C30" "D20"
## [89] "B79" "E25" "D46" "B73"
## [93] "C95" "B38" "B39" "B22"
## [97] "C86" "C70" "A16" "C101"
## [101] "C68" "A10" "E68" "B41"
## [105] "A20" "D19" "D50" "D9"
## [109] "A23" "B50" "A26" "D48"
## [113] "E58" "C126" "B71" "B51 B53 B55"
## [117] "D49" "B5" "B20" "F G63"
## [121] "C62 C64" "E24" "C90" "C45"
## [125] "E8" "B101" "D45" "C46"
## [129] "D30" "E121" "D11" "E77"
## [133] "F38" "B3" "D6" "B82 B84"
## [137] "D17" "A36" "B102" "B69"
## [141] "E49" "C47" "D28" "E17"
## [145] "A24" "C50" "B42" "C148"
unique(data$Embarked)
## [1] "S" "C" "Q" ""
# Count empty strings
sum(data$Cabin == "")
## [1] 687
sum(data$Embarked == "")
## [1] 2
# Before conversion
sum(is.na(data$Embarked)) # returns 0
## [1] 0
table(data$Embarked) # includes "" as a category
##
## C Q S
## 2 168 77 644
# After conversion
data$Embarked[data$Embarked == ""] <- NA #empty strings into NA
sum(is.na(data$Embarked)) # returns 2
## [1] 2
table(data$Embarked) # only shows S, C, Q
##
## C Q S
## 168 77 644
# dropping NA
data$Cabin <- NULL
colSums(is.na(data))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Embarked
## 0 0 0 0 2
# Handle Age (177 missing values) Median imputation is the standard approach:
data$Age[is.na(data$Age)] <- median(data$Age, na.rm = TRUE)
# Handle Embarked (2 missing values) Fill with the mode (“S” for Southampton)
mode_embarked <- names(sort(table(data$Embarked), decreasing = TRUE))[1]
data$Embarked[is.na(data$Embarked)] <- mode_embarked
colSums(is.na(data))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 0
## SibSp Parch Ticket Fare Embarked
## 0 0 0 0 0
data$Sex <- factor(data$Sex)
data$Embarked <- factor(data$Embarked)
data$Survived <- factor(data$Survived)
# Load dataset
data <- read.csv("//AARONARROYO/Users/aaron/OneDrive/Desktop/Analytics 5336/Titanic-Dataset.csv")
library(rpart)
## Warning: package 'rpart' was built under R version 4.4.3
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
# Preview and inspect
head(data)
str(data)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
summary(data)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Check missing values
colSums(is.na(data))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
# --- Handle Cabin ---
# Convert empty strings to NA
data$Cabin[data$Cabin == ""] <- NA
# Drop Cabin due to heavy missingness
data$Cabin <- NULL
# --- Handle Age ---
# Impute missing Age values with median
data$Age[is.na(data$Age)] <- median(data$Age, na.rm = TRUE)
# --- Handle Embarked ---
# Convert empty strings to NA
data$Embarked[data$Embarked == ""] <- NA
# Impute missing Embarked values with mode ("S")
mode_embarked <- names(sort(table(data$Embarked), decreasing = TRUE))[1]
data$Embarked[is.na(data$Embarked)] <- mode_embarked
data$Sex <- factor(data$Sex)
data$Embarked <- factor(data$Embarked)
data$Survived <- factor(data$Survived)
# --- Verifying cleaning ---
colSums(is.na(data))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 0
## SibSp Parch Ticket Fare Embarked
## 0 0 0 0 0
summary(data)
## PassengerId Survived Pclass Name Sex
## Min. : 1.0 0:549 Min. :1.000 Length:891 female:314
## 1st Qu.:223.5 1:342 1st Qu.:2.000 Class :character male :577
## Median :446.0 Median :3.000 Mode :character
## Mean :446.0 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:3.000
## Max. :891.0 Max. :3.000
## Age SibSp Parch Ticket
## Min. : 0.42 Min. :0.000 Min. :0.0000 Length:891
## 1st Qu.:22.00 1st Qu.:0.000 1st Qu.:0.0000 Class :character
## Median :28.00 Median :0.000 Median :0.0000 Mode :character
## Mean :29.36 Mean :0.523 Mean :0.3816
## 3rd Qu.:35.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## Fare Embarked
## Min. : 0.00 C:168
## 1st Qu.: 7.91 Q: 77
## Median : 14.45 S:646
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
## what this pipeline does
# Cabin → dropped (too many missing values).
# Age → imputed with median (robust to skewness).
# Embarked → cleaned (empty strings → NA) and imputed with mode "S".
# Sex & Embarked → converted to factors for modeling.
# Final check → confirms no missing values remain.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# --- Boxplots ---
par(mar = c(3, 6, 3, 3))
boxplot((data$Age), main = "Age", col = "lightblue", bty = "]")
#The majority of passengers were between ages 20 and 40, with the median slightly under 30. The 25th percentile was 20 years and the 75th percentile was 38.
boxplot((data$Pclass), main = "Passenger Class", col = "lightgreen")
#The majority of passengers were in 2nd and 3rd class.
boxplot((data$SibSp), main = "Siblings/Spouse", col = "lightpink")
#The majority of passengers had 1 or no spouses or siblings onboard.
boxplot((data$Fare), main = "Fare", col = "yellow")
#The median fare was 14.45 and the mean was 32.20. The highest outlier was 512.33.
# --- Histograms ---
hist((data$Age), main = "Age", xlab = "Age", col = "lightblue", ylab = "Number of Passengers")
#Age skews right with the majority of passengers between ages 20 and 30.
#using a bar plot instead of histogram for Passenger class
counts <- table(data$Pclass)
barplot(counts, main = "Passenger Class", col = "lightgreen", xlab = "Passenger Class", ylab = "Number of Passengers")
#With more detail seen here than in the box plot, the data shows that more passengers were in 3rd class than 1st or 2nd class.
hist((data$SibSp), main = "Siblings/Spouse", col = "lightpink", xlab = "Siblings/ Spouses", ylab = "Number of Passengers")
#The number of passengers with siblings and spouses is extremely right-skewed, with slightly more than 800 passengers with 0 spouses or siblings.
hist((data$Fare), main = "Fare", col = "yellow", xlab = "Fare (ticket price)", ylab = "Number of Passengers")
#The fare is also extremely right-skewed, with nearly all passengers paying less than 50. Unfortunately the dataset does not clarify if the numbers provided here are in US dollars or British pounds.
# --- Scatterplot Matrices ---
par(mar = c(1, .5, 1, .5))
pairs(data[, c("Survived", "Age", "SibSp", "Parch", "Fare", "Pclass")], main = "Scatterplot Matrix of Titanic Dataset")
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
pairs.panels(data[,-1], method = "pearson", col = 'lightblue',
main = "Scatterplot Matrix of Titanic Dataset")
# --- Correlation ---
cor(data[sapply(data,is.numeric)])
## PassengerId Pclass Age SibSp Parch
## PassengerId 1.000000000 -0.03514399 0.03421211 -0.05752683 -0.001652012
## Pclass -0.035143994 1.00000000 -0.33989833 0.08308136 0.018442671
## Age 0.034212112 -0.33989833 1.00000000 -0.23329633 -0.172481954
## SibSp -0.057526834 0.08308136 -0.23329633 1.00000000 0.414837699
## Parch -0.001652012 0.01844267 -0.17248195 0.41483770 1.000000000
## Fare 0.012658219 -0.54949962 0.09668842 0.15965104 0.216224945
## Fare
## PassengerId 0.01265822
## Pclass -0.54949962
## Age 0.09668842
## SibSp 0.15965104
## Parch 0.21622494
## Fare 1.00000000
# There is low correlation between most of the variables (i.e. less than 0.1).
# The exceptions are:
# Pclass/Fare = -.549
# Pclass/Age = -.339
# Parch/SibSp = .414
# Age/SibSp = -.233
# In addition to the two linear correlations suggested by the scatterplot matrix (Pclass/Fare and Age/SibSp), two additional correlations are seen: Passenger Class/Age and Parch/SibSp. A linear regression model will be developed and tested below on Parch/SibSp.
# Linear Regression - Parch/SibSp
# --- Hypothesis ---
# Null hypothesis: There is no relationship between Parch and SibSp.
# Alternative hypothesis: There is a relationship between Parch and SibSp.
# --- Model ---
par(mar = c(3, 6, 3, 3))
model1 <- lm(Parch ~ SibSp, data = data)
summary(model1)
##
## Call:
## lm(formula = Parch ~ SibSp, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.133 -0.223 -0.223 -0.223 5.474
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.22300 0.02721 8.195 8.7e-16 ***
## SibSp 0.30323 0.02231 13.594 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7338 on 889 degrees of freedom
## Multiple R-squared: 0.1721, Adjusted R-squared: 0.1712
## F-statistic: 184.8 on 1 and 889 DF, p-value: < 2.2e-16
# The regression model has the structure of:
# estimate of Parch = 0.22300 + .30323 SibSp
# Meaning, for every 0.30323 increase in SibSp, an average increase of 0.22300 will be seen in Parch.
# The p-value is <2.2e-16, so we can reject the null hypothesis.
# --- Scatterplot with Regression Line ---
plot(data$SibSp, data$Parch, main = "Regression Line for Parch vs SibSp", xlab = "SibSp", ylab = "Parch", col = "blue")
abline(model1, col = "green")
# --- Goodness of Fit ---
# Residual standard error: 0.7338 on 889 degrees of freedom
# Multiple R-squared: 0.1721, Adjusted R-squared: 0.1712
# F-statistic: 184.8 on 1 and 889 DF, p-value: < 2.2e-16
# Based on the Residual Standard Error, the predictions differ from the actual data by 0.7338. Approximately 17% of Parch is explained by SibSp, based on both the Multiple R-squared and Adjusted R-squared values. Based on the F-statistic (184.8) and the p-value (<2.23-16) being less than .05, we can conclude that there is strong evidence that SibSp predicts Parch.
# ---Residuals vs Fitted Values ---
plot(model1$fitted.values, resid(model1), xlab = "Fitted Vaues", ylab = "Residuals", main = "Residuals vs Fitted for Parch/SibSp")
abline(h = 0, col = "blue", lwd = 2)
# The Residuals vs Fitted Values plot had a somewhat linear curve, meaning that the model is not a good fit.
# --- Train Data ---
data <- read.csv("//AARONARROYO/Users/aaron/OneDrive/Desktop/Analytics 5336/Titanic-Dataset.csv")
set.seed(123)
n <- nrow(data)
train_idx <- sample(1:n, size = round(0.7*n))
train <- data[train_idx, ]
test <- data[-train_idx, ]
model1 <- lm(Parch ~ SibSp, data = train)
summary(model1)
##
## Call:
## lm(formula = Parch ~ SibSp, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.197 -0.217 -0.217 -0.217 5.457
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.21699 0.03398 6.386 3.34e-10 ***
## SibSp 0.32656 0.03069 10.641 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7616 on 622 degrees of freedom
## Multiple R-squared: 0.154, Adjusted R-squared: 0.1526
## F-statistic: 113.2 on 1 and 622 DF, p-value: < 2.2e-16
# --- Test Data ---
pred_test <- predict(model1, newdata = test)
mse <- mean ((test$Parch - pred_test )^2)
rmse <- sqrt (mse)
mae <- mean (abs(test$Parch - pred_test ))
mse; rmse; mae
## [1] 0.4440216
## [1] 0.6663495
## [1] 0.4562056
# Given that the values of Parch are between 0 and 6, and the mean is 0.3816, the error rate shown by the MSE, RMSE, and MAE are too high to consider this model a good fit.
# A linear regression model can not be used to determine the correlation between Survived and other variables as Survived is binary. Instead, logistic regression analysis will be conducted below.
# --- Clean the dataset BEFORE splitting ---
# Drop Cabin (too many missing values)
data$Cabin <- NULL
# Impute Age with median
data$Age[is.na(data$Age)] <- median(data$Age, na.rm = TRUE)
# Handle Embarked (empty strings → NA, then impute with mode)
data$Embarked[data$Embarked == ""] <- NA
mode_embarked <- names(sort(table(data$Embarked), decreasing = TRUE))[1]
data$Embarked[is.na(data$Embarked)] <- mode_embarked
# Convert categorical variables to factors
data$Sex <- factor(data$Sex)
data$Embarked <- factor(data$Embarked)
data$Survived <- factor(data$Survived)
#-------------------------------------------------------------------------
# Train/Test Split
set.seed(123)
n <- nrow(data)
train_idx <- sample(1:n, size = round(0.7*n))
train <- data[train_idx, ]
test <- data[-train_idx, ]
# Align factor levels in test to match train
test$Sex <- factor(test$Sex, levels = levels(train$Sex))
test$Embarked <- factor(test$Embarked, levels = levels(train$Embarked))
#Research question: Which passenger characteristics (class, sex, age, embarked point) significantly predict survival odds on the Titanic?
# Using Survived as the response variable
log_model <- glm(Survived ~ Pclass + Sex + Age + Embarked,
data = train, family = binomial)
summary(log_model)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Embarked, family = binomial,
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.42470 0.58033 9.348 < 2e-16 ***
## Pclass -1.19766 0.15109 -7.927 2.25e-15 ***
## Sexmale -2.62254 0.22651 -11.578 < 2e-16 ***
## Age -0.04145 0.00911 -4.550 5.37e-06 ***
## EmbarkedQ 0.37308 0.43484 0.858 0.3909
## EmbarkedS -0.55197 0.27197 -2.030 0.0424 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 833.37 on 623 degrees of freedom
## Residual deviance: 555.06 on 618 degrees of freedom
## AIC: 567.06
##
## Number of Fisher Scoring iterations: 5
# Predictions
log_probs <- predict(log_model, newdata = test, type = "response")
log_preds <- ifelse(log_probs > 0.5, 1, 0)
# Confusion Matrix
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
cm_log <- table(Predicted = log_preds, Actual = test$Survived)
cm_log
## Actual
## Predicted 0 1
## 0 134 31
## 1 33 69
# Accuracy & Misclassification Rate
accuracy_log <- sum(diag(cm_log))/sum(cm_log)
misclass_rate_log <- 1 - accuracy_log
accuracy_log
## [1] 0.7602996
misclass_rate_log
## [1] 0.2397004
# Research Question: Is the mean age of Titanic survivors significantly different from 45 years? Do survivors and non-survivors differ significantly in average age?
# One Sample T-test
t.test(data$Age, mu = 45) #Testing if the mean passenger age differs from 45
##
## One Sample t-test
##
## data: data$Age
## t = -35.853, df = 890, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 45
## 95 percent confidence interval:
## 28.50553 30.21764
## sample estimates:
## mean of x
## 29.36158
# We see that the pvalue is basically zero, we say this is statistically significant
# We reject Ho. Meaning the true mean age is between the C.I. stated.
# The age of 45 is not between the C.I.
# The observed average passenger age is around 29 years.
#-------------------------------------------------------------------------
# Two Sample T-test
t.test(Age ~ Survived, data = data) # age by survival status
##
## Welch Two Sample t-test
##
## data: Age by Survived
## t = 1.8966, df = 671.15, p-value = 0.05831
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.06126264 3.53486344
## sample estimates:
## mean in group 0 mean in group 1
## 30.02823 28.29143
# We fail to reject Ho. Meaning there is no statistically significance difference
# in average age between survivors and non-survivors, though the trend suggests
# survivors were slightly younger. The C.I. crossing 0 lets us know that the difference
# is not conclusive. #Non-Survivors mean age of 30.03 years. Survivors 28.29 years
#-------------------------------------------------------------------------
str(data) #just checking the data types of each column
## 'data.frame': 891 obs. of 11 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 28 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
#Research question: What decision rules (splits on class, sex, age, etc.) best classify survival outcomes among Titanic passengers?
# Fitting a Large Tree
tree_model <- rpart(Survived ~ Pclass + Sex + Age + Embarked + Fare,
data = train, method = "class",
control = rpart.control(cp = 0.0001))
printcp(tree_model) # Viewing the CP table
##
## Classification tree:
## rpart(formula = Survived ~ Pclass + Sex + Age + Embarked + Fare,
## data = train, method = "class", control = rpart.control(cp = 1e-04))
##
## Variables actually used in tree construction:
## [1] Age Embarked Fare Pclass Sex
##
## Root node error: 242/624 = 0.38782
##
## n= 624
##
## CP nsplit rel error xerror xstd
## 1 0.4504132 0 1.00000 1.00000 0.050296
## 2 0.0371901 1 0.54959 0.54959 0.042273
## 3 0.0330579 2 0.51240 0.55372 0.042388
## 4 0.0137741 4 0.44628 0.50826 0.041064
## 5 0.0055096 7 0.40496 0.46281 0.039613
## 6 0.0041322 11 0.38017 0.50826 0.041064
## 7 0.0001000 12 0.37603 0.50826 0.041064
plotcp(tree_model) #Visualizing
names(train)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Embarked"
# Pruned Tree - Best CP from the CP Table
pruned_tree <- prune(tree_model, cp = 0.0055)
# Visualize the pruned tree
rpart.plot(pruned_tree, type = 4, extra = 1,
clip.right.labs = FALSE, varlen = 0, faclen = 3)
# Predictions
tree_preds <- predict(pruned_tree, newdata = test, type = "class")
cm_tree <- table(Predicted = tree_preds, Actual = test$Survived)
cm_tree
## Actual
## Predicted 0 1
## 0 138 31
## 1 29 69
accuracy_tree <-sum(diag(cm_tree))/sum(cm_tree)
misclass_rate_tree <- 1 - accuracy_tree
accuracy_tree
## [1] 0.7752809
misclass_rate_tree
## [1] 0.2247191
# The pruned classification tree achieved an accuracy of 77.5% on the test set,
# significantly better than the baseline rate of 62.6%. Sensitivity was higher
# (82.6%) than specificity (69.0%), indicating the model was more effective at
# identifying non‑survivors than survivors. The misclassification error was 22.5%.
# Overall, pruning produced a balanced and interpretable tree that highlights key
# survival predictors such as sex, class, and age.
#Research question: Do natural passenger groupings (based on numeric features) align with survival outcomes?
library(ggplot2)
library(cluster)
# Load Titanic dataset
data <- read.csv("//AARONARROYO/Users/aaron/OneDrive/Desktop/Analytics 5336/Titanic-Dataset.csv")
library(ggplot2)
library(cluster)
# Load Titanic dataset
data <- read.csv("C:/Users/aaron/OneDrive/Desktop/Analytics 5336/Titanic-Dataset.csv")
# Prepare numeric-only data for clustering
cluster_data <- data
cluster_data$PassengerId <- NULL
cluster_data$Name <- NULL
cluster_data$Ticket <- NULL
cluster_data$Embarked <- NULL
cluster_data$Sex <- NULL
cluster_data$Survived <- NULL
cluster_data$Cabin <- NULL # remove Cabin (non-numeric)
# Handle missing values (drop rows with NA)
cluster_data <- na.omit(cluster_data)
# Confirm structure
str(cluster_data)
## 'data.frame': 714 obs. of 5 variables:
## $ Pclass: int 3 1 3 1 3 1 3 3 2 3 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## $ SibSp : int 1 1 0 1 0 0 3 0 1 1 ...
## $ Parch : int 0 0 0 0 0 0 1 2 0 1 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## - attr(*, "na.action")= 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
## ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...
#-------------------------------------------------------------------------
# K-Means Clustering
set.seed(123)
# K = 2
titanicCluster2 <- kmeans(cluster_data, centers = 2, nstart = 25)
titanicCluster2$centers
## Pclass Age SibSp Parch Fare
## 1 2.298529 29.55919 0.5058824 0.4044118 25.41562
## 2 1.000000 32.49765 0.6470588 0.9705882 220.27230
titanicCluster2$size
## [1] 680 34
# Compare clusters to survival outcome (aligned with dropped rows)
survived_clean <- data$Survived[!is.na(data$Age) & !is.na(data$Fare)]
table(Cluster = titanicCluster2$cluster, Survived = survived_clean)
## Survived
## Cluster 0 1
## 1 416 264
## 2 8 26
# K = 3
titanicCluster3 <- kmeans(cluster_data, centers = 3, nstart = 25)
titanicCluster3$centers
## Pclass Age SibSp Parch Fare
## 1 2.506944 28.35807 0.4774306 0.4062500 16.32360
## 2 1.000000 31.16667 0.8333333 1.1666667 285.38148
## 3 1.125000 35.91600 0.6333333 0.4416667 85.27184
titanicCluster3$size
## [1] 576 18 120
table(Cluster = titanicCluster3$cluster, Survived = survived_clean)
## Survived
## Cluster 0 1
## 1 384 192
## 2 4 14
## 3 36 84
library(cluster)
# K = 2 visualization
clusplot(cluster_data, titanicCluster2$cluster,
color = TRUE, shade = TRUE, labels = 0, lines = 0,
main = "Titanic Clustering (K=2)")
# K = 3 visualization
clusplot(cluster_data, titanicCluster3$cluster,
color = TRUE, shade = TRUE, labels = 0, lines = 0,
main = "Titanic Clustering (K=3)")
#-------------------------------------------------------------------------
# Elbow Method
tot.withinss <- numeric(10)
for (i in 1:10){
titanicCluster <- kmeans(cluster_data, centers = i, nstart = 25)
tot.withinss[i] <- titanicCluster$tot.withinss
}
plot(1:10, tot.withinss, type="b", pch=19,
main="Elbow Plot",
ylab="Total Within SS",
xlab="Number of clusters (K)")
## Hierarchical Clustering
# Research question: Do natural passenger groupings (based on numeric features) align with survival outcomes?
d <- dist(cluster_data)
# Hierarchical Clustering with complete linkage
h_com <- hclust(d, method="complete")
plot(h_com, labels = FALSE, main = "Hierarchical Clustering (Complete Linkage)", cex = 0.6)
rect.hclust(h_com, k = 2, border = "red")
rect.hclust(h_com, k= 3, border = "blue")
# Single Linkage
h_sin <- hclust(d, method = "single")
plot(h_sin, labels = FALSE, main = "Hierarchical Cluster (Single Linkage)", cex = 0.6)
rect.hclust(h_sin, k = 2, border = "red")
# --- Model Comparison ---
# Research question: Which predictive model (Logistic Regression or Decision Tree) provides the best balance of accuracy and interpretability for survival prediction?
# - Logistic Regression provides the highest predictive accuracy (80.13%) and is recommended for prediction tasks. The Decision Tree offers superior interpretability with clear, actionable survival rules. Both models are valuable for different purposes.
# --- Limitations ---
# While this dataset and analysis provide useful understandings of factors that contributed to passenger survival, it is not a comprehensive examination of all factors.
# Age, number of relatives on-board, and sex are likely self-reported data from the passengers themselves, which may or may not be reliable.
# Passenger class and fare may be seen as proxies for wealth, but this would need to be tested using additional data. Cabin location may indicate proximity to lifeboats, but this does not provide data on where exactly a passenger was at the time of the collision or during the time between the collision and the final capsize. For that matter, proximity to a lifeboat does not necessarily equate to whether or not a passenger boarded a lifeboat, nor does getting into a lifeboat necessarily equate to survival.
# Lastly, crew members were not included in this analysis. Depending on the research questions and aim of the study, this data might be crucial.
# --- Conclusion ---
#--------------------------------------------------------------------------
# References
# Yasser H, M. Titanic Dataset: Titanic Survival Prediction Dataset. Kaggle, 2021. https://www.kaggle.com/datasets/yasserh/titanic-dataset/data
# Pruitt, S. Why Did the Titanic Sink? History.com, 2025. https://www.history.com/articles/why-did-the-titanic-sink