Titanic Data Analysis

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load the cleaned Titanic records from the CSV file into a data frame
titanic_data <- read.csv(file = "cleaned_titanic_data.csv")

# Install tidyverse (which includes dplyr) only when it is not yet available.
# An unconditional install.packages() call re-installs the package on every
# run and can fail while knitting if no CRAN mirror has been configured.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

# Install plotly only when it is not yet available, so that re-running or
# knitting the document does not trigger a fresh download each time.
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

This is where I make the column names lowercase for consistency and convention.

# Show the column names as they were read from the CSV file
colnames(titanic_data)

##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Embarked"

# Replace every column name with its lowercase form
titanic_data <- setNames(titanic_data, tolower(names(titanic_data)))

# Bar chart: number of rows for each value of `survived`
ggplot(data = titanic_data) +
  geom_bar(mapping = aes(x = as.factor(survived))) +
  labs(
    x = "Survived",
    y = "Count",
    title = "Count of Survived Passengers on the Titanic"
  )

# Histogram of `age` using bins of width 5
ggplot(data = titanic_data) +
  geom_histogram(
    mapping = aes(x = age),
    binwidth = 5,
    fill = "blue",
    colour = "black"
  ) +
  labs(x = "Age", y = "Count", title = "Age Distribution of Passengers")

# Box plot comparing the distribution of `age` across survival groups
ggplot(
  data = titanic_data,
  mapping = aes(x = as.factor(survived), y = age)
) +
  geom_boxplot() +
  labs(
    x = "Survived",
    y = "Age",
    title = "Age Distribution by Survival Status"
  )

# Violin plot: density of `age` within each survival group
ggplot(data = titanic_data) +
  geom_violin(mapping = aes(x = as.factor(survived), y = age)) +
  labs(
    x = "Survived",
    y = "Age",
    title = "Age Distribution and Density by Survival Status"
  )

# Bar chart: number of rows for each passenger class
ggplot(data = titanic_data, mapping = aes(x = as.factor(pclass))) +
  geom_bar(fill = "green") +
  labs(
    x = "Passenger Class",
    y = "Count",
    title = "Count of Passengers by Class"
  )

# Bar chart: number of rows for each value of `embarked`
ggplot(data = titanic_data) +
  geom_bar(mapping = aes(x = embarked), fill = "purple") +
  labs(
    x = "Embarkation Point",
    y = "Count",
    title = "Count of Passengers by Embarkation Point"
  )

# Scatter plot of `fare` against `age`, drawn with red points
ggplot(data = titanic_data) +
  geom_point(mapping = aes(x = age, y = fare), colour = "red") +
  labs(x = "Age", y = "Fare", title = "Scatter Plot of Age vs. Fare")

# Age vs. fare scatter plot, one panel (column) per value of `survived`
ggplot(data = titanic_data, mapping = aes(x = age, y = fare)) +
  geom_point() +
  facet_grid(cols = vars(survived)) +
  labs(x = "Age", y = "Fare", title = "Age vs. Fare by Survival Status")

# Age vs. fare scatter plot, one panel (column) per passenger class
ggplot(data = titanic_data, mapping = aes(x = age, y = fare)) +
  geom_point(colour = "brown") +
  facet_grid(cols = vars(pclass)) +
  labs(x = "Age", y = "Fare", title = "Age vs. Fare by Passenger Class")

# Stacked Bar Plot of Survival by Passenger Class
# position = "fill" scales every bar to the same height, so the fill segments
# show proportions rather than counts. pclass is converted to a factor so the
# x axis is discrete (one bar per class) instead of a continuous scale.
ggplot(
  titanic_data,
  aes(x = as.factor(pclass), fill = as.factor(survived))
) +
  geom_bar(position = "fill") +
  xlab("Passenger Class") +
  ylab("Proportion") +
  labs(fill = "Survived") +
  ggtitle("Survival Proportions by Passenger Class")

# Count rows for every (pclass, survived) pair, then express each count as a
# percentage of its passenger class and attach a readable survival label.
titanic_summary <- titanic_data %>%
  group_by(pclass, survived) %>%
  # Keep the pclass grouping explicitly: sum(count) below must be taken within
  # each class. Stating .groups also avoids summarise()'s grouping message.
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(
    percentage = count / sum(count) * 100,
    survival_status = ifelse(survived == 1, "Survived", "Did Not Survive")
  ) %>%
  # Drop the leftover pclass grouping so later steps see an ungrouped table
  ungroup()

## `summarise()` has grouped output by 'pclass'. You can override using the
## `.groups` argument.

# Convert the summary to a plain data frame before handing it to plotly
titanic_summary <- as.data.frame(titanic_summary)

# Create the interactive stacked bar plot
# One trace is drawn per value of `survived`; hovering a bar segment shows the
# survival label and the percentage rounded to two decimals.
# An explicit two-colour vector is passed via `colors` because the default
# palette lookup warns ("minimal value for n is 3") when the colour variable
# has only two levels.
# NOTE(review): the object is still called `plot`, which shadows the name of
# base R's plot(); it is kept in case other code refers to it.
plot <- plot_ly(
  titanic_summary,
  x = ~pclass,
  y = ~percentage,
  type = "bar",
  color = ~as.factor(survived),
  colors = c("#66C2A5", "#8DA0CB"),
  text = ~paste(
    "Status:", survival_status,
    "<br>Percentage:", round(percentage, 2), "%"
  ),
  hoverinfo = "text",
  textposition = "auto"
) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Passenger Class"),
    yaxis = list(title = "Percentage"),
    title = "Survival Proportions by Passenger Class",
    legend = list(title = list(text = "Survival Status"))
  )

# Display the interactive plot
plot

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

# Age vs. fare scatter plot with points coloured by `embarked`; the plot is
# stored in ggplot_scatter. Aesthetics stay at plot level so that any layer
# added to the object later inherits them.
ggplot_scatter <- ggplot(
  data = titanic_data,
  mapping = aes(x = age, y = fare, colour = embarked)
) +
  geom_point() +
  labs(
    x = "Age",
    y = "Fare",
    title = "Scatter Plot of Age vs. Fare by Embarkation"
  ) +
  theme_minimal()

# Render the stored plot
ggplot_scatter

# Keep only the rows whose `embarked` value is "S"
southampton_data <- filter(titanic_data, embarked == "S")

# Scatter Plot of Age vs Fare for the Southampton subset, with a fitted
# linear-model line overlaid in blue
ggplot(southampton_data, aes(x = age, y = fare)) +
  geom_point(colour = "red") +
  # The formula is stated explicitly so geom_smooth() does not print its
  # "using formula = 'y ~ x'" message; y ~ x is the formula it reported using.
  geom_smooth(method = "lm", formula = y ~ x, colour = "blue") +
  xlab("Age") +
  ylab("Fare") +
  ggtitle("Scatter Plot of Age vs. Fare for Southampton")

## `geom_smooth()` using formula = 'y ~ x'

Titanic Data Analysis

2024-08-08

R Markdown