This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Read the cleaned Titanic passenger records from CSV into titanic_data
titanic_data <- read.csv(file = "cleaned_titanic_data.csv")
# Install the necessary packages, but only those that are not yet available.
# An unconditional install.packages() re-installs the packages on every knit
# and errors in non-interactive sessions where no CRAN mirror is configured.
# Note that dplyr is included in tidyverse
missing_packages <- Filter(
  function(pkg) !requireNamespace(pkg, quietly = TRUE),
  c("tidyverse", "plotly")
)
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
This is where I make the column names lowercase for consistency and convention.
# Show the column names exactly as they were read from the CSV
colnames(titanic_data)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Embarked"
# Lowercase every column name; the plots below refer to the lowercase names
colnames(titanic_data) <- tolower(colnames(titanic_data))
# Bar chart: number of passengers for each value of `survived`
ggplot(data = titanic_data, mapping = aes(x = as.factor(survived))) +
  geom_bar() +
  labs(
    x = "Survived",
    y = "Count",
    title = "Count of Survived Passengers on the Titanic"
  )
# Histogram of passenger ages using bins of width 5
ggplot(data = titanic_data, mapping = aes(x = age)) +
  geom_histogram(binwidth = 5, colour = "black", fill = "blue") +
  labs(
    x = "Age",
    y = "Count",
    title = "Age Distribution of Passengers"
  )
# Box plot comparing the age distribution across the values of `survived`
ggplot(
  data = titanic_data,
  mapping = aes(x = as.factor(survived), y = age)
) +
  geom_boxplot() +
  labs(
    x = "Survived",
    y = "Age",
    title = "Age Distribution by Survival Status"
  )
# Violin plot: same comparison as a box plot, but showing the density of ages
ggplot(
  data = titanic_data,
  mapping = aes(x = as.factor(survived), y = age)
) +
  geom_violin() +
  labs(
    x = "Survived",
    y = "Age",
    title = "Age Distribution and Density by Survival Status"
  )
# Bar chart: number of passengers in each passenger class
ggplot(data = titanic_data, mapping = aes(x = as.factor(pclass))) +
  geom_bar(fill = "green") +
  labs(
    x = "Passenger Class",
    y = "Count",
    title = "Count of Passengers by Class"
  )
# Bar chart: number of passengers for each value of `embarked`
ggplot(data = titanic_data, mapping = aes(x = embarked)) +
  geom_bar(fill = "purple") +
  labs(
    x = "Embarkation Point",
    y = "Count",
    title = "Count of Passengers by Embarkation Point"
  )
# Scatter plot of fare against age for all passengers
ggplot(data = titanic_data, mapping = aes(x = age, y = fare)) +
  geom_point(colour = "red") +
  labs(
    x = "Age",
    y = "Fare",
    title = "Scatter Plot of Age vs. Fare"
  )
# Fare against age, drawn as one side-by-side panel per value of `survived`
ggplot(data = titanic_data, mapping = aes(x = age, y = fare)) +
  geom_point() +
  facet_grid(cols = vars(survived)) +
  labs(
    x = "Age",
    y = "Fare",
    title = "Age vs. Fare by Survival Status"
  )
# Fare against age, drawn as one side-by-side panel per passenger class
ggplot(data = titanic_data, mapping = aes(x = age, y = fare)) +
  geom_point(colour = "brown") +
  facet_grid(cols = vars(pclass)) +
  labs(
    x = "Age",
    y = "Fare",
    title = "Age vs. Fare by Passenger Class"
  )
# Stacked Bar Plot of Survival by Passenger Class
# The chunk label had been fused onto the code line, which left the statement
# unparseable; it is kept here as a comment instead.
# position = "fill" rescales every bar to the same height, so the y axis shows
# the proportion of each survival value within a class rather than a count.
# pclass is converted to a factor, as in the passenger-class count chart, so
# the x axis is discrete.
ggplot(
  titanic_data,
  aes(x = as.factor(pclass), fill = as.factor(survived))
) +
  geom_bar(position = "fill") +
  xlab("Passenger Class") +
  ylab("Proportion") +
  labs(fill = "Survived") +
  ggtitle("Survival Proportions by Passenger Class")
# Summarise survival within each passenger class.
# For every (pclass, survived) pair, count the passengers, then express that
# count as a percentage of all passengers in the same class and attach a
# readable survival label.
titanic_summary <- titanic_data %>%
  group_by(pclass, survived) %>%
  # Keep the pclass grouping explicitly: the percentage below must be computed
  # within each class. Stating .groups also avoids the summarise() message.
  summarise(count = n(), .groups = "drop_last") %>%
  mutate(
    percentage = count / sum(count) * 100,
    survival_status = ifelse(survived == 1, "Survived", "Did Not Survive")
  ) %>%
  # Grouping is no longer needed once the percentages are computed
  ungroup()

# Convert the tibble to a plain data frame
titanic_summary <- as.data.frame(titanic_summary)
# Create the interactive stacked bar plot
# One bar per passenger class, stacked by survival status, with the percentage
# of the class shown in the bar text and in the hover label.
# Colour is mapped to survival_status so the legend shows readable labels
# under its 'Survival Status' title instead of 0/1.
# `colors` is supplied explicitly: with only two levels, the default palette
# lookup raised the RColorBrewer "minimal value for n is 3" warning.
plot <- plot_ly(
  titanic_summary,
  x = ~pclass,
  y = ~percentage,
  type = "bar",
  color = ~survival_status,
  colors = c("#FC8D62", "#66C2A5"),
  text = ~paste('Status:', survival_status, '<br>Percentage:', round(percentage, 2), '%'),
  hoverinfo = "text",
  textposition = "auto"
) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Passenger Class"),
    yaxis = list(title = "Percentage"),
    title = "Survival Proportions by Passenger Class",
    legend = list(title = list(text = "Survival Status"))
  )

# Display the interactive plot
plot
# Fare against age, with points coloured by the `embarked` column
ggplot_scatter <- ggplot(
  data = titanic_data,
  mapping = aes(x = age, y = fare, colour = embarked)
) +
  geom_point() +
  labs(
    x = "Age",
    y = "Fare",
    title = "Scatter Plot of Age vs. Fare by Embarkation"
  ) +
  theme_minimal()

# Render the stored plot object
ggplot_scatter
# Keep only the passengers whose `embarked` value is "S"
southampton_data <- dplyr::filter(titanic_data, embarked == "S")
# Scatter Plot of Age vs Fare
# Drawn for the Southampton subset only, with a fitted linear trend line
# overlaid on the points.
ggplot(southampton_data, aes(x = age, y = fare)) +
  geom_point(color = "red") +
  # State the formula explicitly so geom_smooth() does not emit its
  # "using formula = 'y ~ x'" message; the fitted line is the same.
  geom_smooth(method = "lm", formula = y ~ x, color = "blue") +
  xlab("Age") +
  ylab("Fare") +
  ggtitle("Scatter Plot of Age vs. Fare for Southampton")