setwd("/Users/whinton/src/rstudio/tim8501")
titanic <- read.csv("titanic.csv", header = TRUE, sep= ",",stringsAsFactors = TRUE)
df <- titanic ## make copy of original dataset to data frame df
Examine missing values.
for (i in 1:length(df)) {
# Loop through rows
for (j in 1:nrow(df)) {
# Check for empty strings or NA values
if (df[j, i] == "" | is.na(df[j, i])) {
# Replace with actual NA value (not a string "NA")
df[j, i] <- NA
}
}
}
cols_with_nas <- sum(colSums(is.na(df)) > 0)
Show missing values.
mdf <- data.frame(purrr::map_df(df, ~mean(is.na(.))))
vals <- c(mdf$Age[1],mdf$Cabin[1], mdf$Embarked[1])
labs <- c("Age", "Cabin", "Embarked")
piepercent <- round(100 * vals / sum(vals), 1)
pie(vals, labels = piepercent, main="Missing Values by Column Variable", col = rainbow(3))
legend("topright",labs, cex = 0.8, fill = rainbow(3))
colSums(is.na(df))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
missmap(df)
Perform Imputation.
# Remove Unnecessary Columns
df <- select(df, -PassengerId)
df <- select(df, -Name)
df <- select(df, -Ticket)
df <- select(df, -Cabin)
# Impute Remaining. if numeric use mean. if factor use numeric
for (col in names(df)) {
if (is.numeric(df[[col]]) || is.integer(df[[col]])) {
if (sum(!is.na(df[[col]])) > 10) {
# If more than 10 non-NA values, use mean
df[[col]][is.na(df[[col]])] <- mean(df[[col]], na.rm = TRUE)
} else {
# Otherwise, use linear interpolation for imputation
df[[col]][is.na(df[[col]])] <- approx(seq_along(df[[col]]), df[[col]], n = length(df[[col]]))[["y"]][is.na(df[[col]])]
}
} else if (is.factor(df[[col]])) {
mode_val <- names(sort(-table(df[[col]])))[1]
df[[col]][is.na(df[[col]])] <- mode_val
} else if (is.character(df[[col]])) {
df[[col]][is.na(df[[col]])] <- "NA"
}
}
df$Survived <- cut(df$Survived, breaks=c(-1,0,1), labels=c("NO","YES"))
df$Pclass <- cut(df$Pclass, breaks=c(0,1,2,3), labels=c("First","Second","Third"))
# Right-size the levels (number of possible values) on categorical variables.
df$Survived <- as.factor(as.character(df$Survived))
df$Pclass <- as.factor(as.character(df$Pclass))
df$Sex <- as.factor(as.character(df$Sex))
df$Embarked <- as.factor(as.character(df$Embarked))
# Show the dataset is now clean, w/only pertinent variables
###########################################################
missmap(df)
colSums(is.na(df))
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 0 0 0 0 0
Some Descriptive Statistics
#summary(df)
#psych::describe(df)
ageIQR <- paste("IQR for Age:",round(IQR(df$Age),2))
fareIQR <- paste("IQR for Fare:",round(IQR(df$Fare),2))
ageBPstats <- c(count(boxplot.stats(df$Age)$out))
fareBPstats <- c(count(boxplot.stats(df$Fare)$out))
ageOutliers <- paste("Number of Outliers for Age:",sum(ageBPstats$freq))
fareOutliers <- paste("Number of Outliers for Fare:",sum(fareBPstats$freq))
Plot of a single categorical/factor variable Survived (YES=1,NO=0) .
# Create a bargraph for numeric or a histogram for categorical variable.
# Set col to the desired column name
################################################################################
col = "Survived"
if (is.factor(df[[col]])) { # if the col is categorical, then the code will
# create two graphs the Bar plot
# Highlight and run until the line that start with `# Boxplot for numeric variables
#
# If the col is numeric, then it will create the histogram
# Bar graph for factors
p1 <- ggplot(df, aes(x = .data[[col]], fill = .data[[col]])) +
geom_bar() +
labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
theme_minimal() +
theme(legend.position = "right")
} else if (is.numeric(df[[col]]) || is.integer(df[[col]])) {
# Create a
# Histogram for numeric variables
# note the the "Binwidth" cannot be set up in the same way to work with
# Age or Fare that has a small range and one that the range is in thousands
# Change this appropriately
###b_width <- mean(c(df[[col]]))
b_width <- 10
p1 <- ggplot(df, aes(x = .data[[col]])) +
geom_histogram(binwidth = b_width, fill="transparent", color="blue") +
labs(title = paste("Histogram for", col), x = col, y = "Count") +
theme_minimal()
}
## Normal Probability Plot of {col} using QQ
s_parm = c(as.numeric(df[[col]]))
parms <- df %>% summarize(mean = mean(s_parm), sd = sd(s_parm))
p2 <- df %>% ggplot(aes(sample = .data[[col]])) + geom_qq(dparams = parms) + geom_abline() + ggtitle(paste("Probability for", col))
## Dot plot of Passenger {col}
y_axis <- as.factor(nrow(df))
p3 <- ggplot(df, aes(x=.data[[col]], y=y_axis)) + geom_dotplot(binwidth=.05) + labs(title = paste("Dot Plot for", col), x = col, y = "Count") + theme(plot.title = element_text(hjust=0.5))
{
## Boxplot for numeric variables. Explain the findings of your Boxplot.
## Are there any outliers? what is the IQR?
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
geom_boxplot() +
labs(title = paste("Box Plot for", col), x = col, y = "Value") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
}
multiplot(p1,p2,p3,p4,cols = 2)
Plots of a single categorical/factor variable Pclass (First=1,Second=2,Third=2) .
# Create a bargraph for numeric or a histogram for categorical variable.
# Set col to the desired column name
################################################################################
col = "Pclass"
if (is.factor(df[[col]])) { # if the col is categorical, then the code will
# create two graphs the Bar plot
# Highlight and run until the line that start with `# Boxplot for numeric variables
#
# If the col is numeric, then it will create the histogram
# Bar graph for factors
p1 <- ggplot(df, aes(x = .data[[col]], fill = .data[[col]])) +
geom_bar() +
labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
theme_minimal() +
theme(legend.position = "right")
}
## Normal Probability Plot of {col} using QQ
s_parm = c(as.numeric(df[[col]]))
parms <- df %>% summarize(mean = mean(s_parm), sd = sd(s_parm))
p2 <- df %>% ggplot(aes(sample = .data[[col]])) + geom_qq(dparams = parms) + geom_abline() + ggtitle(paste("Probability for", col))
## Dot plot of Passenger {col}
y_axis <- as.factor(nrow(df))
p3 <- ggplot(df, aes(x=.data[[col]], y=y_axis)) + geom_dotplot(binwidth=.05) + labs(title = paste("Dot Plot for", col), x = col, y = "Count") + theme(plot.title = element_text(hjust=0.5))
{
## Boxplot for numeric variables. Explain the findings of your Boxplot.
## Are there any outliers? what is the IQR?
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
geom_boxplot() +
labs(title = paste("Box Plot for", col), x = col, y = "Value") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
}
multiplot(p1,p2,p3,p4,cols = 2)
Plot of a single quantitative/numeric variable Age(decimal) .
# Create a bargraph for numeric or a histogram for categorical variable.
# Set col to the desired column name
################################################################################
col = "Age"
if (is.factor(df[[col]])) { # if the col is categorical, then the code will
# create two graphs the Bar plot
# Highlight and run until the line that start with `# Boxplot for numeric variables
#
# If the col is numeric, then it will create the histogram
# Bar graph for factors
p1 <- ggplot(df, aes(x = .data[[col]], fill = .data[[col]])) +
geom_bar() +
labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
theme_minimal() +
theme(legend.position = "right")
} else if (is.numeric(df[[col]]) || is.integer(df[[col]])) {
# Create a
# Histogram for numeric variables
# note the the "Binwidth" cannot be set up in the same way to work with
# Age or Fare that has a small range and one that the range is in thousands
# Change this appropriately
###b_width <- mean(c(df[[col]]))
b_width <- 10 ## a good binwidth for age ~ 10
p1 <- ggplot(df, aes(x = .data[[col]])) +
geom_histogram(binwidth = b_width, fill="transparent", color="blue") +
labs(title = paste("Histogram for", col), x = col, y = "Count") +
theme_minimal()
}
## Normal Probability Plot of {col} using QQ
s_parm = c(as.numeric(df[[col]]))
parms <- df %>% summarize(mean = mean(s_parm), sd = sd(s_parm))
p2 <- df %>% ggplot(aes(sample = .data[[col]])) + geom_qq(dparams = parms, color="blue") + geom_abline(color="red") + ggtitle(paste("Probability for", col))
## Dot plot of Passenger {col}
y_axis <- as.factor(nrow(df))
p3 <- ggplot(df, aes(x=.data[[col]], y=y_axis)) + geom_dotplot(binwidth=.5) + labs(title = paste("Dot Plot for", col), x = col, y = "Count") + theme(plot.title = element_text(hjust=0.5))
{
## Boxplot for numeric variables. Explain the findings of your Boxplot.
## Are there any outliers? what is the IQR?
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
geom_boxplot(color="blue") +
labs(title = paste("Box Plot for", col), x = col, y = "Value") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
}
multiplot(p1,p2,p3,p4,cols = 2)
Plot of a single quantitative/numeric variable Fare(decimal).
# Create a bargraph for numeric or a histogram for categorical variable.
# Set col to the desired column name
################################################################################
col = "Fare"
if (is.factor(df[[col]])) { # if the col is categorical, then the code will
# create two graphs the Bar plot
# Highlight and run until the line that start with `# Boxplot for numeric variables
#
# If the col is numeric, then it will create the histogram
# Bar graph for factors
p1 <- ggplot(df, aes(x = .data[[col]], fill = .data[[col]])) +
geom_bar() +
labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
theme_minimal() +
theme(legend.position = "right")
} else if (is.numeric(df[[col]]) || is.integer(df[[col]])) {
# Histogram for numeric variables
# note the the "Binwidth" cannot be set up in the same way to work with
# Age or Fare that has a small range and one that the range is in thousands
###b_width <- mean(c(df[[col]]))
b_width <- 30 ## a good binwidth for fare ~ 30
p1 <- ggplot(df, aes(x = .data[[col]])) +
geom_histogram(binwidth = b_width, fill="transparent", color="blue") +
labs(title = paste("Histogram for", col), x = col, y = "Count") +
theme_minimal()
}
## Normal Probability Plot of {col} using QQ
s_parm = c(as.numeric(df[[col]]))
parms <- df %>% summarize(mean = mean(s_parm), sd = sd(s_parm))
p2 <- df %>% ggplot(aes(sample = .data[[col]])) + geom_qq(dparams = parms, color="blue") + geom_abline(color="red") + ggtitle(paste("Probability for", col))
## Dot plot of Passenger {col}
y_axis <- as.factor(nrow(df))
p3 <- ggplot(df, aes(x=.data[[col]], y=y_axis)) + geom_dotplot(binwidth=3) + labs(title = paste("Dot Plot for", col), x = col, y = "Count") + theme(plot.title = element_text(hjust=0.5))
{
## Boxplot for numeric variables.
## Are there any outliers? what is the IQR?
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
geom_boxplot(color="blue") +
labs(title = paste("Box Plot for", col), x = col, y = "Value") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
}
multiplot(p1,p2,p3,p4,cols = 2)
Bivariate analysis examines relationships between pairs of variables to understand how they interact.
Bar plot for Categorical vs Categorical. (vs Continuous) Survival by Pclass , Survival by Embarked , Survival by Sex , Survival by Age
p1 <- ggplot(data = df, aes(x = Pclass, fill = factor(Survived))) +
geom_bar(position = "fill") +
labs(title = "Survival by Passenger Class", x = "Passenger Class", y = "Proportion Survived")
p2 <- ggplot(data = df, aes(x = Embarked, fill = factor(Survived))) +
geom_bar(position = "fill") +
labs(title = "Survival by Embarkment", x = "Embarkment Port", y = "Proportion Survived")
p3 <- ggplot(data = df, aes(x = Sex, fill = factor(Survived))) +
geom_bar(position = "fill") +
labs(title = "Survival by Gender", x = "Gender", y = "Proportion Survived")
multiplot(p1,p2,p3, cols = 2)
Box Plot for Categorical vs Continuous Fare by Pclass , Fare by Embarked , Age by Survived
p1 <- ggplot(data = df, aes(x = factor(Pclass), y = Fare)) +
geom_boxplot(color="blue") +
labs(title = "Fare Distribution by Class", x = "Passenger Class", y = "Fare")
p2 <- ggplot(data = df, aes(x = factor(Embarked), y = Fare)) +
geom_boxplot(color="blue") +
labs(title = "Fare Distribution by Embarkment", x = "Embarkment Port", y = "Fare")
p3 <- ggplot(data = df, aes(x = factor(Survived), y = Age)) +
geom_boxplot(color="blue") +
labs(title = "Age Distribution by Survival", x = "Survived", y = "Age")
multiplot(p1,p2,p3, cols = 2)
Scatter Plot for Continuous vs. Continuous Fare by Age
ggplot(data = df, aes(x = Age, y = Fare)) +
geom_point(alpha = 0.5, color="blue") + geom_abline(color="red") +
labs(title = "Fare by Age", x = "Age", y = "Fare")
Facet Grid. Fare vs. Age by Class and Gender
ggplot(data = df, aes(x = Age, y = Fare, color = factor(Survived))) +
geom_point() +
facet_grid(Sex ~ Pclass) +
labs(title = "Fare vs. Age by Class and Gender", x = "Age", y = "Fare")
This plot shows the relationship between age, fare, and survival across gender and class categories.
Bubble Plots. Fare vs. Age by Survival Status and Family Size
ggplot(data = df, aes(x = Age, y = Fare, color = factor(Survived), size = SibSp)) +
geom_point(alpha = 0.6) +
labs(title = "Fare vs. Age by Survival Status and Family Size", x = "Age", y = "Fare")
Here, family size (SibSp) is represented by bubble size, allowing us to see if larger family sizes impacted survival in relation to age and fare.
Heatmap. Survival Rate by Class and Embarkation Port
ggplot(data = df, aes(x = Pclass, y = Embarked, fill = Survived)) +
geom_tile() +
labs(title = "Survival Rate by Class and Embarkation Port", x = "Class", y = "Embarked")
This plot allows us to see survival rates within each embarkation point and class, highlighting differences in survival probability.
In the Titanic dataset, visualizing individual variables helps reveal key patterns, such as passenger demographics and survival rates, and highlights data issues that may affect further analysis. Starting with loading and preparing the data for univariate analysis, we move from univariate visual analysis to bivariate and multivariate visual analysis.
Data Preparation
Data Preparation is largely about handling missing data then filtering and cleaning the data for visualization and further analysis. Missing values were present in variables like Age and Cabin, while Embarked had minor missing values.
Handling methods.
- Deletion: Simple and effective for variables with minimal missing
values (e.g., Embarked), but led to data reduction and potential bias
when applied to variables like Age.
- Mean Imputation: This method is used for Age to retain dataset size
and ensure numerical consistency. It reduces data variability but
introduces potential bias.
- Mode Imputation: Applied to categorical variables (e.g., Embarked) to
preserve data distribution.
Filtering and Cleaning.
Irrelevant columns were identified as not adding analytical value.
(e.g. PassengerID, Name, Ticket, Cabin).
Some numerical variables were reclassified as categorical factor
variables for proper analysis (e.g. Survived-> 0=No, 1=Yes, Pclass
-> 1=First, 2=Second, 3=Third).
Univariate Analysis
For effective univariate graphical analysis, each relevant column variable is properly clean and classified. This includes identify the original raw data type, the base type for analysis such as categorical or numerical, the classification in terms of qualitative or quantitative, the scale of measurement in terms of nominal, ordinal, interval or ratio, and if quantitative then the variable must be determined as continuous or discrete.
A paper exercise was previously conducted to assess each of the remaining post-cleaned variables for plotting on one or more graph types including: histogram, bar plot, probability plot, dot plot and/or box plot. For example, a categorical variable was plotted as a bar plot, but a discrete numeric variable was plotted as a histogram.
Share of Distribution: Univariate visualizations such as histograms and box plots help understand the overall shape of a variable’s distribution (e.g., normal, skewed, or multimodal). Also box plots highlight potential outliers, which are values that deviate significantly from the rest of the data.
Bivariate (variable pairs) Analysis
For bivariate analysis, I examined the relationships between pairs of variables to understand how they interact. These variable pairs include the following.
Key Insights derived from this analysis include the following.
• Socio-Economic Status and Survival: Higher-class passengers, who
paid more for their tickets, had a higher chance of survival. This trend
may reflect access to lifeboats and the prioritization of upper-class
passengers.
• Gender and Survival: Females had a higher survival rate than males,
supporting a strong positive association between gender and
survival.
• Family Size: Passengers with smaller families (as measured by SibSp
and Parch) tended to survive at slightly higher rates, possibly due to
evacuation logistics favoring smaller groups.
• Age and Survival: Although younger passengers, especially children,
were prioritized, age did not exhibit a strong correlation with
survival.
• Embarkation Point and Survival: Differences in survival rates by
embarkation port were minor, suggesting embarkation point alone had
little impact on survival but may indicate socio-economic factors
indirectly.
Bivariate analysis provides critical insights into relationships between passenger demographics, socio-economic factors, and survival outcomes in the Titanic dataset. By understanding these correlations and patterns, we gain a clearer picture of which factors contributed most significantly to survival, laying the foundation for deeper, multivariate analysis.
Multivariate Analysis
Multivariate data visualization is essential in Exploratory Data Analysis (EDA), especially for data ets like Titanic.csv, which contain numerous variables with potential interactions and dependencies. While univariate and bivariate analyses provide insights into individual and paired variables, multivariate visualization enables a comprehensive understanding of how multiple variables (3 or more) relate, allowing us to uncover complex patterns that may not be apparent when examining variables in isolation or pairs.
In the Titanic dataset, multivariate visualization helps reveal how
variables like Pclass (socio-economic status), Sex (gender), Age (age),
and Fare interact in relation to Survived (survival outcome). For
instance, it allows us to observe:
• Combined Effects: For instance, survival rates can vary significantly
across different combinations of Pclass, Sex, and Age. Multivariate
visualization can show that females in the upper class had higher
survival rates, while males in the lower class had lower survival
rates.
• Interactions and Dependencies: Variables like Fare and Pclass are
related (higher fares correspond to upper classes), and this
relationship affects survival chances. Visualizing these variables
together highlights the dependency of survival on socio-economic
factors.
• Patterns by Groups: Multivariate visualizations can reveal patterns
within certain groups, such as how age affects survival differently
across sexes or how family size (using SibSp and Parch) impacts survival
when combined with socio-economic class.
Multivariate analysis can reveal non-linear relationships, such as instances where survival is highly probable for younger females across all classes but only for older passengers in the upper classes. Outliers or unique cases, like exceptionally high fares among lower classes or extreme ages associated with high survival rates, may also emerge more clearly when multiple variables are plotted together.
Key Benefits of Multivariate Visualization.
Summary
In exploratory data analysis (EDA), bivariate analysis and multivariate analysis allow analysts to examine relationships between two or more variables, respectively, helping uncover patterns, correlations, or interactions that might be significant for insights or predictive modeling. In a dataset like titanic.csv, these approaches can reveal survival factors, demographic trends, and socio-economic impacts on outcomes.
• Bivariate Analysis: Focuses on relationships between two variables, ideal for uncovering simple correlations or distributions (e.g., survival rate by gender). • Multivariate Analysis: Examines complex interactions among multiple variables, offering insights into combined effects on outcomes (e.g., survival as influenced by age, gender, and class).
Each type of analysis provides unique perspectives in EDA. Bivariate analysis simplifies relationships, while multivariate analysis uncovers deeper insights, which are especially valuable for complex datasets like Titanic, where multiple interconnected factors influence survival.
References
Hinton, W. (2024). A Univariate Visualization and Analysis of the Titanic Data Set. Available at Rpubs. https://www.rpubs.com/whinton/.
Prabhakaran, S. (2023). The complete ggplot2 tutorial. R-statistics.co. Available online _{link}(https://r-statistics.co/Complete-Ggplot2-Tutorial-Part1-With-R-Code.html)_ Datar, R., & Garg, H. (2019). Hands-on exploratory data analysis with R: Become an expert in exploratory data analysis using R packages. O’Reilly Media, Inc.
Smeaton, A. (2003). NIST/SEMATECH Engineering Statistics Handbook. _{link}(https://www.itl.nist.gov/div898/handbook/)_. R Programming for Statistics and Data Science (Media from Packt Publishing available freely through O’Reilly Media Inc.). (2018).
Xie, Y., Allaire, J. J., & Grolemund, G. (2018). R markdown: The definitive guide. CRC Press. _[Link]https://bookdown.org/yihui/rmarkdown/)_
Kabacoff, Robert (2022). R in Action, Third Edition. O’Reilly Online Learning. _{Link}(https://learning.oreilly.com/library/view/r-in-action/9781617296055/)_
EPA. (2024). Exploratory Data Analysis. “United States Environmental Protection Agency”. Link
Wickham, H. (2016). gplot2: Elegant Graphics for Data Analysis. Retrieved from _[Link}(https://ggplot2.tidyverse.org)_.
This study conducted and performed by Will Hinton