# This assignment explores the Titanic dataset to understand factors associated with passenger survival.
# Record the directory R is currently working in and echo it to the console
(current_wd <- getwd())
## [1] "C:/Users/natha_4r2oswp/Downloads"
# Exercise 1.2: Load the Titanic Dataset
# Read the training data directly from the hosted CSV file
train <- read.csv(
  file = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)
# Show the top six rows
head(train, n = 6)
# Show the bottom six rows
tail(train, n = 6)
# Exercise 1.3: Basic Dataset Information
# Inspect column types and a preview of each column's values
str(object = train)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# Number of rows followed by number of columns
c(nrow(train), ncol(train))
## [1] 891 12
# List the column names
colnames(train)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
# Per-column summary statistics
summary(object = train)
## PassengerId Survived Pclass Name
## Min. : 1.0 Min. :0.0000 Min. :1.000 Length:891
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Class :character
## Median :446.0 Median :0.0000 Median :3.000 Mode :character
## Mean :446.0 Mean :0.3838 Mean :2.309
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :891.0 Max. :1.0000 Max. :3.000
##
## Sex Age SibSp Parch
## Length:891 Min. : 0.42 Min. :0.000 Min. :0.0000
## Class :character 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Mode :character Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Ticket Fare Cabin Embarked
## Length:891 Min. : 0.00 Length:891 Length:891
## Class :character 1st Qu.: 7.91 Class :character Class :character
## Mode :character Median : 14.45 Mode :character Mode :character
## Mean : 32.20
## 3rd Qu.: 31.00
## Max. :512.33
##
# Part 2: Understanding Variables ----
# Count the passengers travelling in each class
table(train[["Pclass"]])
##
## 1 2 3
## 216 184 491
# Express the class counts as shares of all passengers
prop.table(x = table(train[["Pclass"]]))
##
## 1 2 3
## 0.2424242 0.2065095 0.5510662
# Exercise 2.2: Survival Analysis
# Tabulate the Survived flag (0 = No, 1 = Yes)
survival_table <- table(train[["Survived"]])
survival_table
##
## 0 1
## 549 342
# Number of passengers who survived
num_survived <- sum(train[["Survived"]] == 1)
num_survived
## [1] 342
# Number of passengers who did not survive
num_died <- sum(train[["Survived"]] == 0)
num_died
## [1] 549
# Overall survival rate as a proportion of all passengers
survival_rate <- num_survived / nrow(train)
survival_rate
## [1] 0.3838384
# The same rate expressed as a percentage
survival_percentage <- 100 * survival_rate
survival_percentage
## [1] 38.38384
# Exercise 2.3: Gender Analysis
# Count passengers of each sex
table(train[["Sex"]])
##
## female male
## 314 577
# Cross-tabulate sex (rows) against survival (columns)
table(train[["Sex"]], train[["Survived"]])
##
## 0 1
## female 81 233
## male 468 109
# Part 3: Exploring Numeric Variables ----
# Exercise 3.1: Age Variable
# Average age; na.rm = TRUE skips passengers whose age is missing
mean_age <- mean(train[["Age"]], na.rm = TRUE)
mean_age
## [1] 29.69912
# Middle value of the recorded ages
median_age <- median(train[["Age"]], na.rm = TRUE)
median_age
## [1] 28
# Age of the youngest passenger
min_age <- min(train[["Age"]], na.rm = TRUE)
min_age
## [1] 0.42
# Age of the oldest passenger
max_age <- max(train[["Age"]], na.rm = TRUE)
max_age
## [1] 80
# Number of passengers with no recorded age
num_missing_age <- sum(is.na(train[["Age"]]))
num_missing_age
## [1] 177
# Exercise 3.2: Fare Variable
# Centre and extremes of the fares paid
mean(train[["Fare"]])
## [1] 32.20421
median(train[["Fare"]])
## [1] 14.4542
min(train[["Fare"]])
## [1] 0
max(train[["Fare"]])
## [1] 512.3292
# Lowest and highest fare together
range(train[["Fare"]])
## [1] 0.0000 512.3292
# Spread of the fares (standard deviation)
sd(train[["Fare"]])
## [1] 49.69343
# Exercise 3.3: Family Size
# Passenger counts by number of siblings/spouses aboard (SibSp)
table(train[["SibSp"]])
##
## 0 1 2 3 4 5 8
## 608 209 28 16 18 5 7
# Passenger counts by number of parents/children aboard (Parch)
table(train[["Parch"]])
##
## 0 1 2 3 4 5 6
## 678 118 80 5 4 5 1
# Create a new variable for total family size (SibSp + Parch + 1 for the passenger)
train$FamilySize <- train$SibSp + train$Parch + 1
# Display the first few rows to check
head(train[, c("SibSp", "Parch", "FamilySize")])
# Create a table of family sizes
# The column name must match exactly: `$` lookup is case-sensitive, and a
# misspelled name such as `Familysize` yields NULL, i.e. an empty table.
table(train$FamilySize)
##
## 1 2 3 4 5 6 7 8 11
## 537 161 102 29 15 22 12 6 7
# Part 4: Subsetting and Filtering ----
# Exercise 4.1: Filter by Survival
# Keep only the rows for passengers who survived
survivors <- train[train[["Survived"]] == 1, ]
# Number of survivors
nrow(survivors)
## [1] 342
# Average age among survivors, skipping missing ages
mean(survivors[["Age"]], na.rm = TRUE)
## [1] 28.34369
# Exercise 4.2: Filter by Gender
# Keep only the rows for female passengers
females <- train[train[["Sex"]] == "female", ]
# Number of female passengers
nrow(females)
## [1] 314
# Share of female passengers who survived
sum(females[["Survived"]] == 1) / nrow(females)
## [1] 0.7420382
# Keep only the rows for male passengers
males <- train[train[["Sex"]] == "male", ]
# Share of male passengers who survived
sum(males[["Survived"]] == 1) / nrow(males)
## [1] 0.1889081
# Exercise 4.3: Filter by Passenger Class
# Keep only first class passengers
first_class <- train[train[["Pclass"]] == 1, ]
# Average fare paid in first class
mean(first_class[["Fare"]])
## [1] 84.15469
# Keep only third class passengers
third_class <- train[train[["Pclass"]] == 3, ]
# Average fare paid in third class
mean(third_class[["Fare"]])
## [1] 13.67555
# Exercise 4.4: Multiple Conditions
# Rows must satisfy all three conditions: first class, female, and survived
first_class_female_survivors <- train[
  train[["Pclass"]] == 1 &
    train[["Sex"]] == "female" &
    train[["Survived"]] == 1,
]
# Number of matching passengers
nrow(first_class_female_survivors)
## [1] 91
# Part 5: Creating New Variables ----
# Exercise 5.1: Age Categories
# Label each passenger "Adult" (18 or older) or "Child" (under 18);
# a missing age produces a missing label
train[["AgeGroup"]] <- ifelse(train[["Age"]] >= 18, "Adult", "Child")
# Check the new column against Age on the first few rows
head(train[c("Age", "AgeGroup")])
# Count each age group, listing missing labels as their own category
table(train[["AgeGroup"]], useNA = "ifany")
##
## Adult Child <NA>
## 601 113 177
# Exercise 5.2: Fare Categories
# Bucket fares: High (> 30), Medium (10-30 inclusive), Low (< 10)
train[["FareCategory"]] <- ifelse(
  train[["Fare"]] > 30,
  "High",
  ifelse(train[["Fare"]] >= 10, "Medium", "Low")
)
# Count the passengers in each fare bucket
table(train[["FareCategory"]])
##
## High Low Medium
## 234 336 321
# Exercise 5.3: Title Extraction (Challenge!)
# Names embed a title (Mr., Mrs., Miss., etc.) after the surname
# Preview a handful of names
head(train[["Name"]])
## [1] "Braund, Mr. Owen Harris"
## [2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
## [3] "Heikkinen, Miss. Laina"
## [4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"
## [5] "Allen, Mr. William Henry"
## [6] "Moran, Mr. James"
# Flag names containing the literal text "Mr." (the escaped dot keeps "Mrs." from matching)
train[["IsMr"]] <- grepl(pattern = "Mr\\.", x = train[["Name"]])
# Number of passengers flagged as "Mr."
sum(train[["IsMr"]])
## [1] 517
# Part 6: Basic Visualizations ----
# Exercise 6.1: Survival Bar Plot
# One bar per survival outcome, with bar height equal to the passenger count
survival_counts <- table(train[["Survived"]])
barplot(
  height = survival_counts,
  names.arg = c("Did Not Survive", "Survived"),
  col = c("red", "green"),
  main = "Titanic Survival",
  xlab = "Survival Status",
  ylab = "Number of Passengers"
)
# Exercise 6.2: Age Distribution
# Create a histogram of ages
# The x-axis label names the plotted variable (it was previously left as a
# "..." placeholder).
hist(train$Age,
     main = "Distribution of Passenger Ages",
     xlab = "Age (years)",
     ylab = "Frequency",
     col = "lightblue",
     breaks = 20)
# Exercise 6.3: Fare Distribution
# Create a histogram of fares
# The y-axis label states what the bar heights count (it was previously left
# as a "..." placeholder); it matches the label used on the age histogram.
hist(train$Fare,
     main = "Distribution of Passenger Fares",
     xlab = "Fare (British Pounds)",
     ylab = "Frequency",
     col = "lightgreen",
     breaks = 30)
# Exercise 6.4: Survival by Gender
# Create a barplot showing survival by gender
# Rows of the table are survival outcomes and columns are sexes, so with
# beside = TRUE each sex gets a pair of bars (did not survive / survived).
survival_by_sex <- table(train$Survived, train$Sex)
# legend.text is spelled out in full rather than relying on partial
# matching of the abbreviated name `legend`.
barplot(survival_by_sex,
        main = "Survival by Gender",
        xlab = "Gender",
        ylab = "Number of Passengers",
        col = c("red", "green"),
        legend.text = c("Did Not Survive", "Survived"),
        beside = TRUE)
# Exercise 6.5: Survival by Class
# Create a barplot showing survival by passenger class
# Rows of the table are survival outcomes and columns are passenger classes,
# so with beside = TRUE each class gets a pair of bars.
survival_by_class <- table(train$Survived, train$Pclass)
# legend.text is spelled out in full rather than relying on partial
# matching of the abbreviated name `legend`.
barplot(survival_by_class,
        main = "Survival by Passenger Class",
        xlab = "Passenger Class",
        ylab = "Number of Passengers",
        col = c("red", "green"),
        legend.text = c("Did Not Survive", "Survived"),
        beside = TRUE)
# Exercise 6.6: Age by Survival
# Side-by-side boxplots of age, one box per survival outcome
boxplot(
  Age ~ Survived,
  data = train,
  col = c("red", "green"),
  main = "Age Distribution by Survival Status",
  ylab = "Age",
  xlab = "Survived (0 = No, 1 = Yes)"
)
# Part 7: Analysis Questions ----
# Question 1: What was the survival rate for children (under 18)?
# which() keeps only rows where the comparison is TRUE. Plain logical
# indexing would add one all-NA row to the subset for every passenger whose
# Age is missing, which inflates nrow() and forces NA workarounds.
children <- train[which(train$Age < 18), ]
child_survival_rate <- sum(children$Survived == 1) / nrow(children)
child_survival_rate
## [1] 0.539823
# Question 2: What was the survival rate for adults?
# Passengers with a missing Age are in neither group.
adults <- train[which(train$Age >= 18), ]
adult_survival_rate <- sum(adults$Survived == 1) / nrow(adults)
adult_survival_rate
## [1] 0.3810316
# Question 3: What was the average fare paid by survivors vs non-survivors?
# Survivors come from the `survivors` subset; non-survivors are selected from `train` directly
avg_fare_survivors <- mean(survivors[["Fare"]])
avg_fare_died <- mean(train[["Fare"]][train[["Survived"]] == 0])
avg_fare_survivors
## [1] 48.39541
avg_fare_died
## [1] 22.11789
# Question 4: What percentage of first class passengers survived?
# Survivor share within the `first_class` subset, scaled to a percentage
first_class_survival <- 100 * (sum(first_class[["Survived"]] == 1) / nrow(first_class))
first_class_survival
## [1] 62.96296
# Question 5: What percentage of third class passengers survived?
# Survivor share within the `third_class` subset, scaled to a percentage
third_class_survival <- 100 * (sum(third_class[["Survived"]] == 1) / nrow(third_class))
third_class_survival
## [1] 24.23625
# Question 6: What was the average family size of survivors vs non-survivors?
# Uses the FamilySize column (SibSp + Parch + 1) added to `train` before `survivors` was created
avg_family_survivors <- mean(survivors[["FamilySize"]])
avg_family_died <- mean(train[["FamilySize"]][train[["Survived"]] == 0])
avg_family_survivors
## [1] 1.938596
avg_family_died
## [1] 1.883424