Introduction

This assignment explores the Titanic dataset to understand factors associated with passenger survival.

# Look up the directory this R session is working in and display it
current_wd <- getwd()
print(current_wd)
## [1] "C:/Users/natha_4r2oswp/Downloads"
# Exercise 1.2: Load the Titanic Dataset
# Read the passenger data straight from the hosted CSV file
train <- read.csv(
  file = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)

# Preview the top six rows of the data frame
head(train, n = 6L)
# Preview the bottom six rows of the data frame
tail(train, n = 6L)
# Exercise 1.3: Basic Dataset Information
# Show every column's type together with its first few values
str(object = train)
## 'data.frame':    891 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...
# Number of rows followed by number of columns
c(nrow(train), ncol(train))
## [1] 891  12
# Names of the columns
colnames(train)
##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Cabin"       "Embarked"
# Per-column summary statistics
summary(object = train)
##   PassengerId       Survived          Pclass          Name          
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000   Length:891        
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000   Class :character  
##  Median :446.0   Median :0.0000   Median :3.000   Mode  :character  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309                     
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000                     
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000                     
##                                                                     
##      Sex                 Age            SibSp           Parch       
##  Length:891         Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  Class :character   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##  Mode  :character   Median :28.00   Median :0.000   Median :0.0000  
##                     Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##                     3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##                     Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##                     NA's   :177                                     
##     Ticket               Fare           Cabin             Embarked        
##  Length:891         Min.   :  0.00   Length:891         Length:891        
##  Class :character   1st Qu.:  7.91   Class :character   Class :character  
##  Mode  :character   Median : 14.45   Mode  :character   Mode  :character  
##                     Mean   : 32.20                                        
##                     3rd Qu.: 31.00                                        
##                     Max.   :512.33                                        
## 

Part 2: Understanding Variables

# Count the passengers travelling in each class
table(train[["Pclass"]])
## 
##   1   2   3 
## 216 184 491
# Express the class counts as shares of all passengers
prop.table(x = table(train[["Pclass"]]))
## 
##         1         2         3 
## 0.2424242 0.2065095 0.5510662
# Exercise 2.2: Survival Analysis
# Tabulate the Survived flag (0 = No, 1 = Yes)
survival_table <- table(train[["Survived"]])
print(survival_table)
## 
##   0   1 
## 549 342
# Number of passengers who survived
num_survived <- sum(train[["Survived"]] == 1)
print(num_survived)
## [1] 342
# Number of passengers who did not survive
num_died <- sum(train[["Survived"]] == 0)
print(num_died)
## [1] 549
# Overall survival rate as a proportion of all passengers
survival_rate <- num_survived / nrow(train)
print(survival_rate)
## [1] 0.3838384
# The same rate expressed as a percentage
survival_percentage <- 100 * survival_rate
print(survival_percentage)
## [1] 38.38384
# Exercise 2.3: Gender Analysis
# Count passengers by sex
table(train[["Sex"]])
## 
## female   male 
##    314    577
# Cross-tabulate sex (rows) against survival (columns)
table(train[["Sex"]], train[["Survived"]])
##         
##            0   1
##   female  81 233
##   male   468 109

Part 3: Exploring Numeric Variables

# Exercise 3.1: Age Variable
# Average age, skipping passengers whose age was not recorded
mean_age <- mean(train[["Age"]], na.rm = TRUE)
print(mean_age)
## [1] 29.69912
# Median age among the recorded values
median_age <- median(train[["Age"]], na.rm = TRUE)
print(median_age)
## [1] 28
# Age of the youngest passenger
min_age <- min(train[["Age"]], na.rm = TRUE)
print(min_age)
## [1] 0.42
# Age of the oldest passenger
max_age <- max(train[["Age"]], na.rm = TRUE)
print(max_age)
## [1] 80
# Number of passengers with no recorded age
num_missing_age <- sum(is.na(train[["Age"]]))
print(num_missing_age)
## [1] 177
# Exercise 3.2: Fare Variable
# Central tendency and extremes of the ticket fare
with(train, mean(Fare))
## [1] 32.20421
with(train, median(Fare))
## [1] 14.4542
with(train, min(Fare))
## [1] 0
with(train, max(Fare))
## [1] 512.3292
# Smallest and largest fare reported together
with(train, range(Fare))
## [1]   0.0000 512.3292
# Spread of the fares (standard deviation)
with(train, sd(Fare))
## [1] 49.69343
# Exercise 3.3: Family Size
# Count passengers by number of siblings/spouses aboard (SibSp)
table(train$SibSp)
## 
##   0   1   2   3   4   5   8 
## 608 209  28  16  18   5   7
# Count passengers by number of parents/children aboard (Parch)
table(train$Parch)
## 
##   0   1   2   3   4   5   6 
## 678 118  80   5   4   5   1
# Create a new variable for total family size (SibSp + Parch + 1 for the passenger)
train$FamilySize <- train$SibSp + train$Parch + 1

# Display the first few rows to check
head(train[, c("SibSp", "Parch", "FamilySize")])
# Create a table of family sizes.
# Column names are case-sensitive: `train$Familysize` does not exist, and `$`
# silently returns NULL for an unknown column, so the earlier run printed
# "< table of extent 0 >" instead of the counts. Use the exact name created above.
table(train$FamilySize)
# (Printed output omitted: re-run this chunk to regenerate the family-size counts.)

Part 4: Subsetting and Filtering

# Exercise 4.1: Filter by Survival
# Keep only the rows for passengers who survived
survivors <- train[with(train, Survived == 1), ]

# Number of survivors
nrow(survivors)
## [1] 342
# Average age of survivors, ignoring unrecorded ages
mean(survivors[["Age"]], na.rm = TRUE)
## [1] 28.34369
# Exercise 4.2: Filter by Gender
# Rows for female passengers only
females <- train[with(train, Sex == "female"), ]

# Number of female passengers
nrow(females)
## [1] 314
# Share of female passengers who survived
sum(females[["Survived"]] == 1) / nrow(females)
## [1] 0.7420382
# Rows for male passengers only
males <- train[with(train, Sex == "male"), ]

# Share of male passengers who survived
sum(males[["Survived"]] == 1) / nrow(males)
## [1] 0.1889081
# Exercise 4.3: Filter by Passenger Class
# Rows for first class passengers
first_class <- train[with(train, Pclass == 1), ]

# Average fare paid in first class
mean(first_class[["Fare"]])
## [1] 84.15469
# Rows for third class passengers
third_class <- train[with(train, Pclass == 3), ]

# Average fare paid in third class
mean(third_class[["Fare"]])
## [1] 13.67555
# Exercise 4.4: Multiple Conditions
# Rows that satisfy all three conditions: survived, female, first class
first_class_female_survivors <- train[
  with(train, Survived == 1 & Sex == "female" & Pclass == 1),
]

# Number of passengers in that group
nrow(first_class_female_survivors)
## [1] 91

Part 5: Creating New Variables

# Exercise 5.1: Age Categories
# Label each passenger "Adult" (18 or older) or "Child" (under 18);
# passengers with no recorded age stay NA
train[["AgeGroup"]] <- ifelse(train[["Age"]] >= 18, "Adult", "Child")

# Check the new column next to the ages it was derived from
head(train[c("Age", "AgeGroup")])
# Count each age group, with a separate cell for the missing ages
table(train[["AgeGroup"]], useNA = "ifany")
## 
## Adult Child  <NA> 
##   601   113   177
# Exercise 5.2: Fare Categories
# Bucket fares: below 10 is "Low", 10 through 30 is "Medium", above 30 is "High"
train[["FareCategory"]] <- ifelse(
  train[["Fare"]] >= 10,
  ifelse(train[["Fare"]] > 30, "High", "Medium"),
  "Low"
)

# Count the passengers in each fare bucket
table(train[["FareCategory"]])
## 
##   High    Low Medium 
##    234    336    321
# Exercise 5.3: Title Extraction (Challenge!)
# Names embed a title such as Mr., Mrs. or Miss.
# Print a handful of names to see the format
head(train[["Name"]])
## [1] "Braund, Mr. Owen Harris"                            
## [2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
## [3] "Heikkinen, Miss. Laina"                             
## [4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"       
## [5] "Allen, Mr. William Henry"                           
## [6] "Moran, Mr. James"
# Flag passengers whose name contains the literal text "Mr."
# (the escaped dot keeps "Mrs." from matching)
train[["IsMr"]] <- grepl(pattern = "Mr\\.", x = train[["Name"]])

# Number of passengers flagged as "Mr."
sum(train[["IsMr"]])
## [1] 517

Part 6: Basic Visualizations

# Exercise 6.1: Survival Bar Plot
# One bar per outcome, with height equal to the number of passengers
survival_counts <- table(train[["Survived"]])
barplot(
  height = survival_counts,
  names.arg = c("Did Not Survive", "Survived"),
  col = c("red", "green"),
  main = "Titanic Survival",
  xlab = "Survival Status",
  ylab = "Number of Passengers"
)

# Exercise 6.2: Age Distribution
# Create a histogram of ages.
# The x-axis label replaces the unfilled "..." template placeholder so the
# plot says what is being measured.
hist(train$Age,
     main = "Distribution of Passenger Ages",
     xlab = "Age (years)",
     ylab = "Frequency",
     col = "lightblue",
     breaks = 20)

# Exercise 6.3: Fare Distribution
# Create a histogram of fares.
# The y-axis label replaces the unfilled "..." template placeholder; a
# histogram's bar height is the number of observations in each bin.
hist(train$Fare,
     main = "Distribution of Passenger Fares",
     xlab = "Fare (British Pounds)",
     ylab = "Frequency",
     col = "lightgreen",
     breaks = 30)

# Exercise 6.4: Survival by Gender
# Create a barplot showing survival by gender.
# Rows of the table are the survival outcomes (0, 1) and columns are the
# sexes, so each sex gets a pair of side-by-side bars.
survival_by_sex <- table(train$Survived, train$Sex)
barplot(survival_by_sex,
        main = "Survival by Gender",
        xlab = "Gender",
        ylab = "Number of Passengers",
        col = c("red", "green"),
        # Spell out barplot()'s legend.text argument instead of relying on
        # partial matching of `legend`.
        legend.text = c("Did Not Survive", "Survived"),
        beside = TRUE)

# Exercise 6.5: Survival by Class
# Create a barplot showing survival by passenger class.
# Rows of the table are the survival outcomes (0, 1) and columns are the
# passenger classes, so each class gets a pair of side-by-side bars.
survival_by_class <- table(train$Survived, train$Pclass)
barplot(survival_by_class,
        main = "Survival by Passenger Class",
        xlab = "Passenger Class",
        ylab = "Number of Passengers",
        col = c("red", "green"),
        # Spell out barplot()'s legend.text argument instead of relying on
        # partial matching of `legend`.
        legend.text = c("Did Not Survive", "Survived"),
        beside = TRUE)

# Exercise 6.6: Age by Survival
# Draw one box of ages per survival outcome, side by side
boxplot(
  Age ~ Survived,
  data = train,
  col = c("red", "green"),
  main = "Age Distribution by Survival Status",
  xlab = "Survived (0 = No, 1 = Yes)",
  ylab = "Age"
)

Part 7: Analysis Questions

# Question 1: What was the survival rate for children (under 18)?
# which() keeps only the rows where the comparison is TRUE. Plain logical
# indexing would also return one all-NA row for every passenger with a missing
# age, so the subset would hold rows that are not children at all.
children <- train[which(train$Age < 18), ]
child_survival_rate <- sum(children$Survived == 1) / nrow(children)
child_survival_rate
## [1] 0.539823
# Question 2: What was the survival rate for adults?
# Same approach: passengers with a missing age are left out of the subset.
adults <- train[which(train$Age >= 18), ]
adult_survival_rate <- sum(adults$Survived == 1) / nrow(adults)
adult_survival_rate
## [1] 0.3810316
# Question 3: What was the average fare paid by survivors vs non-survivors?
# Survivors come from the existing `survivors` subset; non-survivors are
# selected directly from the Fare column.
avg_fare_survivors <- mean(survivors[["Fare"]])
avg_fare_died <- mean(train[["Fare"]][train[["Survived"]] == 0])
print(avg_fare_survivors)
## [1] 48.39541
print(avg_fare_died)
## [1] 22.11789
# Question 4: What percentage of first class passengers survived?
first_class_survival <- 100 *
  (sum(first_class[["Survived"]] == 1) / nrow(first_class))
print(first_class_survival)
## [1] 62.96296
# Question 5: What percentage of third class passengers survived?
third_class_survival <- 100 *
  (sum(third_class[["Survived"]] == 1) / nrow(third_class))
print(third_class_survival)
## [1] 24.23625
# Question 6: What was the average family size of survivors vs non-survivors?
# Survivors come from the existing `survivors` subset; non-survivors are
# selected directly from the FamilySize column.
avg_family_survivors <- mean(survivors[["FamilySize"]])
avg_family_died <- mean(train[["FamilySize"]][train[["Survived"]] == 0])
print(avg_family_survivors)
## [1] 1.938596
print(avg_family_died)
## [1] 1.883424