File Set-up and Initial View

data <- read.csv("//AARONARROYO/Users/aaron/OneDrive/Desktop/Analytics 5336/Titanic-Dataset.csv") #Read a CSV file
head(data) # Previewing the data

View(data) #Viewing the data
str(data) #Checking the structure

## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

summary(data) #Quick stats

##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
##

Frequencies

# number of survivors
number_survived <- table(data$Survived)
number_survived

## 
##   0   1 
## 549 342

prop.table(number_survived)

## 
##         0         1 
## 0.6161616 0.3838384

# passenger class numbers
pass_class <- table(data$Pclass)
pass_class

## 
##   1   2   3 
## 216 184 491

prop.table(pass_class)

## 
##         1         2         3 
## 0.2424242 0.2065095 0.5510662

# sibling/spouses
number_sibsp <- table(data$SibSp)
number_sibsp

## 
##   0   1   2   3   4   5   8 
## 608 209  28  16  18   5   7

prop.table(number_sibsp)

## 
##           0           1           2           3           4           5 
## 0.682379349 0.234567901 0.031425365 0.017957351 0.020202020 0.005611672 
##           8 
## 0.007856341

# parents/children
number_parch <- table(data$Parch)
number_parch

## 
##   0   1   2   3   4   5   6 
## 678 118  80   5   4   5   1

prop.table(number_parch)

## 
##           0           1           2           3           4           5 
## 0.760942761 0.132435466 0.089786756 0.005611672 0.004489338 0.005611672 
##           6 
## 0.001122334

Investigating Missing Data

colSums(is.na(data)) #Checking what cols have missing values

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0           0           0

sum(is.na(data$Cabin)) #Returns 687 aka missing values

## [1] 0

# Look at unique values
unique(data$Cabin)

##   [1] ""                "C85"             "C123"            "E46"            
##   [5] "G6"              "C103"            "D56"             "A6"             
##   [9] "C23 C25 C27"     "B78"             "D33"             "B30"            
##  [13] "C52"             "B28"             "C83"             "F33"            
##  [17] "F G73"           "E31"             "A5"              "D10 D12"        
##  [21] "D26"             "C110"            "B58 B60"         "E101"           
##  [25] "F E69"           "D47"             "B86"             "F2"             
##  [29] "C2"              "E33"             "B19"             "A7"             
##  [33] "C49"             "F4"              "A32"             "B4"             
##  [37] "B80"             "A31"             "D36"             "D15"            
##  [41] "C93"             "C78"             "D35"             "C87"            
##  [45] "B77"             "E67"             "B94"             "C125"           
##  [49] "C99"             "C118"            "D7"              "A19"            
##  [53] "B49"             "D"               "C22 C26"         "C106"           
##  [57] "C65"             "E36"             "C54"             "B57 B59 B63 B66"
##  [61] "C7"              "E34"             "C32"             "B18"            
##  [65] "C124"            "C91"             "E40"             "T"              
##  [69] "C128"            "D37"             "B35"             "E50"            
##  [73] "C82"             "B96 B98"         "E10"             "E44"            
##  [77] "A34"             "C104"            "C111"            "C92"            
##  [81] "E38"             "D21"             "E12"             "E63"            
##  [85] "A14"             "B37"             "C30"             "D20"            
##  [89] "B79"             "E25"             "D46"             "B73"            
##  [93] "C95"             "B38"             "B39"             "B22"            
##  [97] "C86"             "C70"             "A16"             "C101"           
## [101] "C68"             "A10"             "E68"             "B41"            
## [105] "A20"             "D19"             "D50"             "D9"             
## [109] "A23"             "B50"             "A26"             "D48"            
## [113] "E58"             "C126"            "B71"             "B51 B53 B55"    
## [117] "D49"             "B5"              "B20"             "F G63"          
## [121] "C62 C64"         "E24"             "C90"             "C45"            
## [125] "E8"              "B101"            "D45"             "C46"            
## [129] "D30"             "E121"            "D11"             "E77"            
## [133] "F38"             "B3"              "D6"              "B82 B84"        
## [137] "D17"             "A36"             "B102"            "B69"            
## [141] "E49"             "C47"             "D28"             "E17"            
## [145] "A24"             "C50"             "B42"             "C148"

unique(data$Embarked)

## [1] "S" "C" "Q" ""

# Count empty strings
sum(data$Cabin == "")

## [1] 687

sum(data$Embarked == "")

## [1] 2

# Before conversion
sum(is.na(data$Embarked))   # returns 0

## [1] 0

table(data$Embarked)        # includes "" as a category

## 
##       C   Q   S 
##   2 168  77 644

# After conversion
data$Embarked[data$Embarked == ""] <- NA #empty strings into NA
sum(is.na(data$Embarked))   # returns 2

## [1] 2

table(data$Embarked)        # only shows S, C, Q

## 
##   C   Q   S 
## 168  77 644

# dropping NA
data$Cabin <- NULL
colSums(is.na(data))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare    Embarked 
##           0           0           0           0           2

# Handle Age (177 missing values) Median imputation is the standard approach:
data$Age[is.na(data$Age)] <- median(data$Age, na.rm = TRUE)

# Handle Embarked (2 missing values) Fill with the mode (“S” for Southampton)
mode_embarked <- names(sort(table(data$Embarked), decreasing = TRUE))[1]
data$Embarked[is.na(data$Embarked)] <- mode_embarked

colSums(is.na(data))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0           0 
##       SibSp       Parch      Ticket        Fare    Embarked 
##           0           0           0           0           0

## Data Above Is Now Cleaned

Converting Categorical Values into Factors so that we can use them for modeling

data$Sex <- factor(data$Sex)
data$Embarked <- factor(data$Embarked)
data$Survived <- factor(data$Survived)

Revised version of the data above

Load Dataset & Preview

# Load dataset
data <- read.csv("//AARONARROYO/Users/aaron/OneDrive/Desktop/Analytics 5336/Titanic-Dataset.csv") 
library(rpart)

## Warning: package 'rpart' was built under R version 4.4.3

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.4.3

# Preview and inspect
head(data)

str(data)

## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

summary(data)

##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
##

Checking for missing values

# Check missing values
colSums(is.na(data))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0           0           0

# --- Handle Cabin ---
# Convert empty strings to NA
data$Cabin[data$Cabin == ""] <- NA

# Drop Cabin due to heavy missingness
data$Cabin <- NULL

# --- Handle Age ---
# Impute missing Age values with median
data$Age[is.na(data$Age)] <- median(data$Age, na.rm = TRUE)

# --- Handle Embarked ---
# Convert empty strings to NA
data$Embarked[data$Embarked == ""] <- NA

# Impute missing Embarked values with mode ("S")
mode_embarked <- names(sort(table(data$Embarked), decreasing = TRUE))[1]
data$Embarked[is.na(data$Embarked)] <- mode_embarked

Converting categorical variables to factors

data$Sex <- factor(data$Sex)
data$Embarked <- factor(data$Embarked)
data$Survived <- factor(data$Survived)
# --- Verifying cleaning ---
colSums(is.na(data))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0           0 
##       SibSp       Parch      Ticket        Fare    Embarked 
##           0           0           0           0           0

summary(data)

##   PassengerId    Survived     Pclass          Name               Sex     
##  Min.   :  1.0   0:549    Min.   :1.000   Length:891         female:314  
##  1st Qu.:223.5   1:342    1st Qu.:2.000   Class :character   male  :577  
##  Median :446.0            Median :3.000   Mode  :character               
##  Mean   :446.0            Mean   :2.309                                  
##  3rd Qu.:668.5            3rd Qu.:3.000                                  
##  Max.   :891.0            Max.   :3.000                                  
##       Age            SibSp           Parch           Ticket         
##  Min.   : 0.42   Min.   :0.000   Min.   :0.0000   Length:891        
##  1st Qu.:22.00   1st Qu.:0.000   1st Qu.:0.0000   Class :character  
##  Median :28.00   Median :0.000   Median :0.0000   Mode  :character  
##  Mean   :29.36   Mean   :0.523   Mean   :0.3816                     
##  3rd Qu.:35.00   3rd Qu.:1.000   3rd Qu.:0.0000                     
##  Max.   :80.00   Max.   :8.000   Max.   :6.0000                     
##       Fare        Embarked
##  Min.   :  0.00   C:168   
##  1st Qu.:  7.91   Q: 77   
##  Median : 14.45   S:646   
##  Mean   : 32.20           
##  3rd Qu.: 31.00           
##  Max.   :512.33

## what this pipeline does
# Cabin → dropped (too many missing values).
# Age → imputed with median (robust to skewness).
# Embarked → cleaned (empty strings → NA) and imputed with mode "S".
# Sex & Embarked → converted to factors for modeling.
# Final check → confirms no missing values remain.

Descriptive Statistics and Visualizations

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

# --- Boxplots ---
par(mar = c(3, 6, 3, 3))

boxplot((data$Age), main = "Age", col = "lightblue", bty = "]")

#The majority of passengers were between ages 20 and 40, with the median slightly under 30.  The 25th percentile was 20 years and the 75th percentile was 38.


boxplot((data$Pclass), main = "Passenger Class", col = "lightgreen")

#The majority of passengers were in 2nd and 3rd class.


boxplot((data$SibSp), main = "Siblings/Spouse", col = "lightpink")

#The majority of passengers had 1 or no spouses or siblings onboard.


boxplot((data$Fare), main = "Fare", col = "yellow")

#The median fare was 14.45 and the mean was 32.20.  The highest outlier was 512.33.


# --- Histograms ---
hist((data$Age), main = "Age", xlab = "Age", col = "lightblue", ylab = "Number of Passengers")

#Age skews right with the majority of passengers between ages 20 and 30.




#using a bar plot instead of histogram for Passenger class
counts <- table(data$Pclass)
barplot(counts, main = "Passenger Class", col = "lightgreen", xlab = "Passenger Class", ylab = "Number of Passengers")

#With more detail seen here than in the box plot, the data shows that more passengers were in 3rd class than 1st or 2nd class.
      

hist((data$SibSp), main = "Siblings/Spouse", col = "lightpink", xlab = "Siblings/ Spouses", ylab = "Number of Passengers")

#The number of passengers with siblings and spouses is extremely right-skewed, with slightly more than  800 passengers with 0 spouses or siblings.




hist((data$Fare), main = "Fare", col = "yellow", xlab = "Fare (ticket price)", ylab = "Number of Passengers")

#The fare is also extremely right-skewed, with nearly all passengers paying less than 50.  Unfortunately the dataset does not clarify if the numbers provided here are in US dollars or British pounds.



# --- Scatterplot Matrices ---
par(mar = c(1, .5, 1, .5))

pairs(data[, c("Survived", "Age", "SibSp", "Parch", "Fare", "Pclass")], main = "Scatterplot Matrix of Titanic Dataset")

library(psych)

## Warning: package 'psych' was built under R version 4.4.3

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

pairs.panels(data[,-1], method = "pearson", col = 'lightblue', 
             main = "Scatterplot Matrix of Titanic Dataset")

# --- Correlation ---
cor(data[sapply(data,is.numeric)])

##              PassengerId      Pclass         Age       SibSp        Parch
## PassengerId  1.000000000 -0.03514399  0.03421211 -0.05752683 -0.001652012
## Pclass      -0.035143994  1.00000000 -0.33989833  0.08308136  0.018442671
## Age          0.034212112 -0.33989833  1.00000000 -0.23329633 -0.172481954
## SibSp       -0.057526834  0.08308136 -0.23329633  1.00000000  0.414837699
## Parch       -0.001652012  0.01844267 -0.17248195  0.41483770  1.000000000
## Fare         0.012658219 -0.54949962  0.09668842  0.15965104  0.216224945
##                    Fare
## PassengerId  0.01265822
## Pclass      -0.54949962
## Age          0.09668842
## SibSp        0.15965104
## Parch        0.21622494
## Fare         1.00000000

# There is low correlation between most of the variables (i.e. less than 0.1).  
# The exceptions are:
#   Pclass/Fare = -.549
#   Pclass/Age = -.339
#   Parch/SibSp = .414
#   Age/SibSp = -.233
# In addition to the two linear correlations suggested by the scatterplot matrix (Pclass/Fare and Age/SibSp), two additional correlations are seen: Passenger Class/Age and Parch/SibSp.  A linear regression model will be developed and tested below on Parch/SibSp.

# Linear Regression - Parch/SibSp

# --- Hypothesis ---  
# Null hypothesis:  There is no relationship between Parch and SibSp.
# Alternative hypothesis:  There is a relationship between Parch and SibSp.

# --- Model ---
par(mar = c(3, 6, 3, 3))

model1 <- lm(Parch ~ SibSp, data = data)
summary(model1)

## 
## Call:
## lm(formula = Parch ~ SibSp, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.133 -0.223 -0.223 -0.223  5.474 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.22300    0.02721   8.195  8.7e-16 ***
## SibSp        0.30323    0.02231  13.594  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7338 on 889 degrees of freedom
## Multiple R-squared:  0.1721, Adjusted R-squared:  0.1712 
## F-statistic: 184.8 on 1 and 889 DF,  p-value: < 2.2e-16

# The regression model has the structure of:
#     estimate of Parch = 0.22300 + .30323 SibSp

# Meaning, for every 0.30323 increase in SibSp, an average increase of 0.22300 will be seen in Parch.

# The p-value is <2.2e-16, so we can reject the null hypothesis.

# --- Scatterplot with Regression Line ---
plot(data$SibSp, data$Parch, main = "Regression Line for Parch vs SibSp", xlab = "SibSp", ylab = "Parch", col = "blue")
abline(model1, col = "green")

# --- Goodness of Fit ---
# Residual standard error: 0.7338 on 889 degrees of freedom
# Multiple R-squared:  0.1721,  Adjusted R-squared:  0.1712 
# F-statistic: 184.8 on 1 and 889 DF,  p-value: < 2.2e-16

# Based on the Residual Standard Error, the predictions differ from the actual data by 0.7338.  Approximately 17% of Parch is explained by SibSp, based on both the Multiple R-squared and Adjusted R-squared values. Based on the F-statistic (184.8) and the p-value (<2.23-16) being less than .05, we can conclude that there is strong evidence that SibSp predicts Parch.

# ---Residuals vs Fitted Values ---
plot(model1$fitted.values, resid(model1), xlab = "Fitted Vaues", ylab = "Residuals", main = "Residuals vs Fitted for Parch/SibSp")
abline(h = 0, col = "blue", lwd = 2)

# The Residuals vs Fitted Values plot had a somewhat linear curve, meaning that the model is not a good fit.

# --- Train Data ---
data <- read.csv("//AARONARROYO/Users/aaron/OneDrive/Desktop/Analytics 5336/Titanic-Dataset.csv")  

set.seed(123)
n <- nrow(data)
train_idx <- sample(1:n, size = round(0.7*n))
 
train <- data[train_idx, ]
test <- data[-train_idx, ]

model1 <- lm(Parch ~ SibSp, data = train)
 
summary(model1)

## 
## Call:
## lm(formula = Parch ~ SibSp, data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.197 -0.217 -0.217 -0.217  5.457 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.21699    0.03398   6.386 3.34e-10 ***
## SibSp        0.32656    0.03069  10.641  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7616 on 622 degrees of freedom
## Multiple R-squared:  0.154,  Adjusted R-squared:  0.1526 
## F-statistic: 113.2 on 1 and 622 DF,  p-value: < 2.2e-16

# --- Test Data ---
pred_test <- predict(model1, newdata = test)

mse <- mean ((test$Parch - pred_test )^2)
rmse <- sqrt (mse)
mae <- mean (abs(test$Parch - pred_test ))

mse; rmse; mae

## [1] 0.4440216

## [1] 0.6663495

## [1] 0.4562056

# Given that the values of Parch are between 0 and 6, and the mean is 0.3816, the error rate shown by the MSE, RMSE, and MAE are too high to consider this model a good fit.

Linear vs Logistic Regression for Survived Predictions

# A linear regression model can not be used to determine the correlation between Survived and other variables as Survived is binary.  Instead, logistic regression analysis will be conducted below.

# --- Clean the dataset BEFORE splitting ---
# Drop Cabin (too many missing values)
data$Cabin <- NULL

# Impute Age with median
data$Age[is.na(data$Age)] <- median(data$Age, na.rm = TRUE)

# Handle Embarked (empty strings → NA, then impute with mode)
data$Embarked[data$Embarked == ""] <- NA
mode_embarked <- names(sort(table(data$Embarked), decreasing = TRUE))[1]
data$Embarked[is.na(data$Embarked)] <- mode_embarked

# Convert categorical variables to factors
data$Sex <- factor(data$Sex)
data$Embarked <- factor(data$Embarked)
data$Survived <- factor(data$Survived)
#-------------------------------------------------------------------------
# Train/Test Split
set.seed(123)
n <- nrow(data)
train_idx <- sample(1:n, size = round(0.7*n))
train <- data[train_idx, ]
test <- data[-train_idx, ]



# Align factor levels in test to match train
test$Sex <- factor(test$Sex, levels = levels(train$Sex))
test$Embarked <- factor(test$Embarked, levels = levels(train$Embarked))

Logistic Regression

#Research question: Which passenger characteristics (class, sex, age, embarked point) significantly predict survival odds on the Titanic?

# Using Survived as the response variable
log_model <- glm(Survived ~ Pclass + Sex + Age + Embarked,
                 data = train, family = binomial)

summary(log_model)

## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Embarked, family = binomial, 
##     data = train)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  5.42470    0.58033   9.348  < 2e-16 ***
## Pclass      -1.19766    0.15109  -7.927 2.25e-15 ***
## Sexmale     -2.62254    0.22651 -11.578  < 2e-16 ***
## Age         -0.04145    0.00911  -4.550 5.37e-06 ***
## EmbarkedQ    0.37308    0.43484   0.858   0.3909    
## EmbarkedS   -0.55197    0.27197  -2.030   0.0424 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 833.37  on 623  degrees of freedom
## Residual deviance: 555.06  on 618  degrees of freedom
## AIC: 567.06
## 
## Number of Fisher Scoring iterations: 5

# Predictions
log_probs <- predict(log_model, newdata = test, type = "response")
log_preds <- ifelse(log_probs > 0.5, 1, 0)

# Confusion Matrix
library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: lattice

cm_log <- table(Predicted = log_preds, Actual = test$Survived)
cm_log

##          Actual
## Predicted   0   1
##         0 134  31
##         1  33  69

# Accuracy & Misclassification Rate
accuracy_log <- sum(diag(cm_log))/sum(cm_log)
misclass_rate_log <- 1 - accuracy_log
accuracy_log

## [1] 0.7602996

misclass_rate_log

## [1] 0.2397004

One Sample & Two Sample T-test

# Research Question: Is the mean age of Titanic survivors significantly different from 45 years? Do survivors and non-survivors differ significantly in average age?


# One Sample T-test
t.test(data$Age, mu = 45) #Testing if the mean passenger age differs from 45

## 
##  One Sample t-test
## 
## data:  data$Age
## t = -35.853, df = 890, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 45
## 95 percent confidence interval:
##  28.50553 30.21764
## sample estimates:
## mean of x 
##  29.36158

# We see that the pvalue is basically zero, we say this is statistically significant
# We reject Ho. Meaning the true mean age is between the C.I. stated.
# The age of 45 is not between the C.I. 
# The observed average passenger age is around 29 years.
#-------------------------------------------------------------------------
# Two Sample T-test
t.test(Age ~ Survived, data = data) # age by survival status

## 
##  Welch Two Sample t-test
## 
## data:  Age by Survived
## t = 1.8966, df = 671.15, p-value = 0.05831
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.06126264  3.53486344
## sample estimates:
## mean in group 0 mean in group 1 
##        30.02823        28.29143

# We fail to reject Ho. Meaning there is no statistically significance difference
# in average age between survivors and non-survivors, though the trend suggests
# survivors were slightly younger. The C.I. crossing 0 lets us know that the difference
# is not conclusive. #Non-Survivors mean age of 30.03 years. Survivors 28.29 years
#-------------------------------------------------------------------------
str(data) #just checking the data types of each column

## 'data.frame':    891 obs. of  11 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num  22 38 26 35 35 28 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...

Classification Tree - Pruning the Titanic

#Research question: What decision rules (splits on class, sex, age, etc.) best classify survival outcomes among Titanic passengers?


# Fitting a Large Tree
tree_model <- rpart(Survived ~ Pclass + Sex + Age + Embarked + Fare,
                    data = train, method = "class", 
                    control = rpart.control(cp = 0.0001))
printcp(tree_model) # Viewing the CP table

## 
## Classification tree:
## rpart(formula = Survived ~ Pclass + Sex + Age + Embarked + Fare, 
##     data = train, method = "class", control = rpart.control(cp = 1e-04))
## 
## Variables actually used in tree construction:
## [1] Age      Embarked Fare     Pclass   Sex     
## 
## Root node error: 242/624 = 0.38782
## 
## n= 624 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.4504132      0   1.00000 1.00000 0.050296
## 2 0.0371901      1   0.54959 0.54959 0.042273
## 3 0.0330579      2   0.51240 0.55372 0.042388
## 4 0.0137741      4   0.44628 0.50826 0.041064
## 5 0.0055096      7   0.40496 0.46281 0.039613
## 6 0.0041322     11   0.38017 0.50826 0.041064
## 7 0.0001000     12   0.37603 0.50826 0.041064

plotcp(tree_model) #Visualizing

names(train)

##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Embarked"

# Pruned Tree - Best CP from the CP Table
pruned_tree <- prune(tree_model, cp = 0.0055)

# Visualize the pruned tree
rpart.plot(pruned_tree, type = 4, extra = 1,
           clip.right.labs = FALSE, varlen = 0, faclen = 3)

# Predictions
tree_preds <- predict(pruned_tree, newdata = test, type = "class")
cm_tree <- table(Predicted = tree_preds, Actual = test$Survived)
cm_tree

##          Actual
## Predicted   0   1
##         0 138  31
##         1  29  69

accuracy_tree <-sum(diag(cm_tree))/sum(cm_tree)
misclass_rate_tree <- 1 - accuracy_tree
accuracy_tree

## [1] 0.7752809

misclass_rate_tree

## [1] 0.2247191

# The pruned classification tree achieved an accuracy of 77.5% on the test set, 
# significantly better than the baseline rate of 62.6%. Sensitivity was higher 
# (82.6%) than specificity (69.0%), indicating the model was more effective at 
# identifying non‑survivors than survivors. The misclassification error was 22.5%. 
# Overall, pruning produced a balanced and interpretable tree that highlights key 
# survival predictors such as sex, class, and age.

Clustering the Titanic Dataset (Unsupervised Learning)

#Research question: Do natural passenger groupings (based on numeric features) align with survival outcomes?


library(ggplot2)
library(cluster)

# Load Titanic dataset
data <- read.csv("//AARONARROYO/Users/aaron/OneDrive/Desktop/Analytics 5336/Titanic-Dataset.csv")

library(ggplot2)
library(cluster)

# Load Titanic dataset
data <- read.csv("C:/Users/aaron/OneDrive/Desktop/Analytics 5336/Titanic-Dataset.csv")

# Prepare numeric-only data for clustering
cluster_data <- data
cluster_data$PassengerId <- NULL
cluster_data$Name <- NULL
cluster_data$Ticket <- NULL
cluster_data$Embarked <- NULL
cluster_data$Sex <- NULL
cluster_data$Survived <- NULL
cluster_data$Cabin <- NULL   # remove Cabin (non-numeric)

# Handle missing values (drop rows with NA)
cluster_data <- na.omit(cluster_data)

# Confirm structure
str(cluster_data)

## 'data.frame':    714 obs. of  5 variables:
##  $ Pclass: int  3 1 3 1 3 1 3 3 2 3 ...
##  $ Age   : num  22 38 26 35 35 54 2 27 14 4 ...
##  $ SibSp : int  1 1 0 1 0 0 3 0 1 1 ...
##  $ Parch : int  0 0 0 0 0 0 1 2 0 1 ...
##  $ Fare  : num  7.25 71.28 7.92 53.1 8.05 ...
##  - attr(*, "na.action")= 'omit' Named int [1:177] 6 18 20 27 29 30 32 33 37 43 ...
##   ..- attr(*, "names")= chr [1:177] "6" "18" "20" "27" ...

#-------------------------------------------------------------------------
# K-Means Clustering 
set.seed(123)

# K = 2
titanicCluster2 <- kmeans(cluster_data, centers = 2, nstart = 25)
titanicCluster2$centers

##     Pclass      Age     SibSp     Parch      Fare
## 1 2.298529 29.55919 0.5058824 0.4044118  25.41562
## 2 1.000000 32.49765 0.6470588 0.9705882 220.27230

titanicCluster2$size

## [1] 680  34

# Compare clusters to survival outcome (aligned with dropped rows)
survived_clean <- data$Survived[!is.na(data$Age) & !is.na(data$Fare)]
table(Cluster = titanicCluster2$cluster, Survived = survived_clean)

##        Survived
## Cluster   0   1
##       1 416 264
##       2   8  26

# K = 3
titanicCluster3 <- kmeans(cluster_data, centers = 3, nstart = 25)
titanicCluster3$centers

##     Pclass      Age     SibSp     Parch      Fare
## 1 2.506944 28.35807 0.4774306 0.4062500  16.32360
## 2 1.000000 31.16667 0.8333333 1.1666667 285.38148
## 3 1.125000 35.91600 0.6333333 0.4416667  85.27184

titanicCluster3$size

## [1] 576  18 120

table(Cluster = titanicCluster3$cluster, Survived = survived_clean)

##        Survived
## Cluster   0   1
##       1 384 192
##       2   4  14
##       3  36  84

library(cluster)

# K = 2 visualization
clusplot(cluster_data, titanicCluster2$cluster,
         color = TRUE, shade = TRUE, labels = 0, lines = 0,
         main = "Titanic Clustering (K=2)")

# K = 3 visualization
clusplot(cluster_data, titanicCluster3$cluster,
         color = TRUE, shade = TRUE, labels = 0, lines = 0,
         main = "Titanic Clustering (K=3)")

#-------------------------------------------------------------------------
# Elbow Method
tot.withinss <- numeric(10)
for (i in 1:10){
  titanicCluster <- kmeans(cluster_data, centers = i, nstart = 25)
  tot.withinss[i] <- titanicCluster$tot.withinss
}

plot(1:10, tot.withinss, type="b", pch=19,
     main="Elbow Plot",
     ylab="Total Within SS",
     xlab="Number of clusters (K)")

## Hierarchical Clustering

# Research question: Do natural passenger groupings (based on numeric features) align with survival outcomes?

d <- dist(cluster_data)

# Hierarchical Clustering with complete linkage
h_com <- hclust(d, method="complete")
plot(h_com, labels = FALSE, main = "Hierarchical Clustering (Complete Linkage)", cex = 0.6)
rect.hclust(h_com, k = 2, border = "red")
rect.hclust(h_com, k= 3, border = "blue")

# Single Linkage
h_sin <- hclust(d, method = "single")
plot(h_sin, labels = FALSE, main = "Hierarchical Cluster (Single Linkage)", cex = 0.6)
rect.hclust(h_sin, k = 2, border = "red")

Results and Interpretation

# --- Model Comparison ---
# Research question: Which predictive model (Logistic Regression or Decision Tree) provides the best balance of accuracy and interpretability for survival prediction?
 
# - Logistic Regression provides the highest predictive accuracy (80.13%) and is recommended for prediction tasks. The Decision Tree offers superior interpretability with clear, actionable survival rules. Both models are valuable for different purposes.



# --- Limitations ---
# While this dataset and analysis provide useful understandings of factors that contributed to passenger survival, it is not a comprehensive examination of all factors.  

# Age, number of relatives on-board, and sex are likely self-reported data from the passengers themselves, which may or may not be reliable. 

# Passenger class and fare may be seen as proxies for wealth, but this would need to be tested using additional data.  Cabin location may indicate proximity to lifeboats, but this does not provide data on where exactly a passenger was at the time of the collision or during the time between the collision and the final capsize.  For that matter, proximity to a lifeboat does not necessarily equate to whether or not a passenger boarded a lifeboat, nor does getting into a lifeboat necessarily equate to survival.

# Lastly, crew members were not included in this analysis. Depending on the research questions and aim of the study, this data might be crucial.


# --- Conclusion ---


#--------------------------------------------------------------------------

# References
# Yasser H, M.  Titanic Dataset: Titanic Survival Prediction Dataset.  Kaggle, 2021.  https://www.kaggle.com/datasets/yasserh/titanic-dataset/data

# Pruitt, S. Why Did the Titanic Sink?  History.com, 2025. https://www.history.com/articles/why-did-the-titanic-sink

Titanic Dataset

Arroyo, Aaron & Carter, Linda

2025-12-03

File Set-up and Initial View

Frequencies

Investigating Missing Data

## Data Above Is Now Cleaned

Converting Categorical Values into Factors so that we can use them for modeling

Revised version of the data above

Load Dataset & Preview

Checking for missing values

Converting categorical variables to factors

Descriptive Statistics and Visualizations

Linear vs Logistic Regression for Survived Predictions

Logistic Regression

One Sample & Two Sample T-test

Classification Tree - Pruning the Titanic

Clustering the Titanic Dataset (Unsupervised Learning)

Results and Interpretation