From Univariate to Bivariate and Multivariate Analysis


1. Prepare the Data (Load and Clean titantic.csv)


setwd("/Users/whinton/src/rstudio/tim8501")
titanic <- read.csv("titanic.csv", header = TRUE, sep= ",",stringsAsFactors = TRUE)
df <- titanic ## make copy of original dataset to data frame df

Examine missing values.

for (i in 1:length(df)) {
  # Loop through rows
  for (j in 1:nrow(df)) {
    # Check for empty strings or NA values
    if (df[j, i] == "" | is.na(df[j, i])) {
      # Replace with actual NA value (not a string "NA")
      df[j, i] <- NA
    }
  } 
}

cols_with_nas <- sum(colSums(is.na(df)) > 0)

Show missing values.

mdf <- data.frame(purrr::map_df(df, ~mean(is.na(.))))
vals <- c(mdf$Age[1],mdf$Cabin[1], mdf$Embarked[1])
labs <- c("Age", "Cabin", "Embarked")
piepercent <- round(100 * vals / sum(vals), 1)

pie(vals, labels = piepercent, main="Missing Values by Column Variable", col = rainbow(3))
legend("topright",labs, cex = 0.8, fill = rainbow(3))

colSums(is.na(df)) 
## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2
missmap(df)   

Perform Imputation.

# Remove Unnecessary Columns
df <- select(df, -PassengerId)
df <- select(df, -Name)
df <- select(df, -Ticket)
df <- select(df, -Cabin)

# Impute Remaining. if numeric use mean. if factor use numeric
for (col in names(df)) {
  if (is.numeric(df[[col]]) || is.integer(df[[col]])) {
    if (sum(!is.na(df[[col]])) > 10) {
      # If more than 10 non-NA values, use mean
      df[[col]][is.na(df[[col]])] <- mean(df[[col]], na.rm = TRUE)
    } else {
      # Otherwise, use linear interpolation for imputation
      df[[col]][is.na(df[[col]])] <- approx(seq_along(df[[col]]), df[[col]], n = length(df[[col]]))[["y"]][is.na(df[[col]])]
    }
  } else if (is.factor(df[[col]])) {
    mode_val <- names(sort(-table(df[[col]])))[1]
    df[[col]][is.na(df[[col]])] <- mode_val
  } else if (is.character(df[[col]])) {
    df[[col]][is.na(df[[col]])] <- "NA"
  }
}


df$Survived <- cut(df$Survived, breaks=c(-1,0,1), labels=c("NO","YES"))
df$Pclass <- cut(df$Pclass, breaks=c(0,1,2,3), labels=c("First","Second","Third"))

# Right-size the levels (number of possible values) on categorical variables.

df$Survived <- as.factor(as.character(df$Survived))
df$Pclass <- as.factor(as.character(df$Pclass))
df$Sex <- as.factor(as.character(df$Sex))
df$Embarked <- as.factor(as.character(df$Embarked))

# Show the dataset is now clean, w/only pertinent variables
###########################################################
missmap(df)

colSums(is.na(df))
## Survived   Pclass      Sex      Age    SibSp    Parch     Fare Embarked 
##        0        0        0        0        0        0        0        0

2. Univariate Analysis


Some Descriptive Statistics

#summary(df)
#psych::describe(df)

ageIQR <- paste("IQR for Age:",round(IQR(df$Age),2))
fareIQR <- paste("IQR for Fare:",round(IQR(df$Fare),2))
ageBPstats <- c(count(boxplot.stats(df$Age)$out))
fareBPstats <- c(count(boxplot.stats(df$Fare)$out))
ageOutliers <- paste("Number of Outliers for Age:",sum(ageBPstats$freq))
fareOutliers <- paste("Number of Outliers for Fare:",sum(fareBPstats$freq))

Plot of a single categorical/factor variable Survived (YES=1,NO=0) .

# Create a bargraph for numeric or a histogram for categorical variable.  
# Set col to the desired column name
################################################################################

col = "Survived"
if (is.factor(df[[col]])) { # if the col is categorical, then the code will
  # create two graphs the Bar plot 
  # Highlight and run until the line that start with `# Boxplot for numeric variables
  #
  # If the col is numeric, then it will create the histogram
  # Bar graph for factors
  p1 <- ggplot(df, aes(x = .data[[col]], fill = .data[[col]])) +
    geom_bar() +
    labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
    theme_minimal() +
    theme(legend.position = "right")
} else if (is.numeric(df[[col]]) || is.integer(df[[col]])) {
  
  # Create a 
  # Histogram for numeric variables
  # note the the "Binwidth" cannot be set up in the same way to work with 
  # Age or Fare that has a small range and one that the range is in thousands
  # Change this appropriately
  ###b_width <- mean(c(df[[col]]))
  b_width <- 10
  p1 <- ggplot(df, aes(x = .data[[col]])) +
    geom_histogram(binwidth = b_width, fill="transparent", color="blue") + 
    labs(title = paste("Histogram for", col), x = col, y = "Count") +
    theme_minimal()
}
 
## Normal Probability Plot of {col} using QQ
s_parm = c(as.numeric(df[[col]]))
parms <- df %>% summarize(mean = mean(s_parm), sd = sd(s_parm))
p2 <- df %>% ggplot(aes(sample = .data[[col]])) + geom_qq(dparams = parms) + geom_abline() + ggtitle(paste("Probability for", col))

## Dot plot of Passenger {col}
y_axis <- as.factor(nrow(df))
p3 <- ggplot(df, aes(x=.data[[col]], y=y_axis)) + geom_dotplot(binwidth=.05) + labs(title = paste("Dot Plot for", col), x = col, y = "Count") + theme(plot.title = element_text(hjust=0.5))

{
## Boxplot for numeric variables. Explain the findings of your Boxplot.
## Are there any outliers? what is the IQR?
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
    geom_boxplot() +
    labs(title = paste("Box Plot for", col), x = col, y = "Value") +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
  
}

multiplot(p1,p2,p3,p4,cols = 2)

  • Survived
    • Histogram: Not ideal, as it is categorical with only two outcomes (0 or 1).
    • Bar Plot: This is very useful, as it clearly shows the number of survivors (1) compared to non-survivors (0).
    • Probability Plot: Not useful for binary categorical data.
    • Dot Plot: This can be used but provides limited additional value over a bar plot for binary variables.
    • Box Plot: Not applicable here because Survived is binary, so distribution does not vary.
    • Overall, the bar plot is the most useful here, clearly visualizing survival counts.

Plots of a single categorical/factor variable Pclass (First=1,Second=2,Third=2) .

# Create a bargraph for numeric or a histogram for categorical variable.  
# Set col to the desired column name
################################################################################

col = "Pclass"
if (is.factor(df[[col]])) { # if the col is categorical, then the code will
  # create two graphs the Bar plot 
  # Highlight and run until the line that start with `# Boxplot for numeric variables
  #
  # If the col is numeric, then it will create the histogram
  # Bar graph for factors
  p1 <- ggplot(df, aes(x = .data[[col]], fill = .data[[col]])) +
    geom_bar() +
    labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
    theme_minimal() +
    theme(legend.position = "right")
}

## Normal Probability Plot of {col} using QQ
s_parm = c(as.numeric(df[[col]]))
parms <- df %>% summarize(mean = mean(s_parm), sd = sd(s_parm))
p2 <- df %>% ggplot(aes(sample = .data[[col]])) + geom_qq(dparams = parms) + geom_abline() + ggtitle(paste("Probability for", col))

## Dot plot of Passenger {col}
y_axis <- as.factor(nrow(df))
p3 <- ggplot(df, aes(x=.data[[col]], y=y_axis)) + geom_dotplot(binwidth=.05) + labs(title = paste("Dot Plot for", col), x = col, y = "Count") + theme(plot.title = element_text(hjust=0.5))

{
## Boxplot for numeric variables. Explain the findings of your Boxplot.
## Are there any outliers? what is the IQR?
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
    geom_boxplot() +
    labs(title = paste("Box Plot for", col), x = col, y = "Value") +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
  
}

multiplot(p1,p2,p3,p4,cols = 2)

  • Pclass
    • Histogram: Not ideal because Pclass is categorical.
    • Bar Plot: Highly useful for showing class distribution (1, 2, 3).
    • Probability Plot: Not applicable, as Pclass is categorical and ordinal.
    • Dot Plot: Useful as an alternative to the bar plot for comparing class frequencies but offers less clarity.
    • Box Plot: Not applicable as Pclass is categorical.
    • Overall: Bar plot is the most effective way to visualize passenger class distribution.

Plot of a single quantitative/numeric variable Age(decimal) .

# Create a bargraph for numeric or a histogram for categorical variable.  
# Set col to the desired column name
################################################################################

col = "Age"
if (is.factor(df[[col]])) { # if the col is categorical, then the code will
  # create two graphs the Bar plot 
  # Highlight and run until the line that start with `# Boxplot for numeric variables
  #
  # If the col is numeric, then it will create the histogram
  # Bar graph for factors
  p1 <- ggplot(df, aes(x = .data[[col]], fill = .data[[col]])) +
    geom_bar() +
    labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
    theme_minimal() +
    theme(legend.position = "right")
} else if (is.numeric(df[[col]]) || is.integer(df[[col]])) {
  
  # Create a 
  # Histogram for numeric variables
  # note the the "Binwidth" cannot be set up in the same way to work with 
  # Age or Fare that has a small range and one that the range is in thousands
  # Change this appropriately
  ###b_width <- mean(c(df[[col]]))
  b_width <- 10  ## a good binwidth for age ~ 10
  p1 <- ggplot(df, aes(x = .data[[col]])) +
    geom_histogram(binwidth = b_width, fill="transparent", color="blue") + 
    labs(title = paste("Histogram for", col), x = col, y = "Count") +
    theme_minimal()
}

## Normal Probability Plot of {col} using QQ
s_parm = c(as.numeric(df[[col]]))
parms <- df %>% summarize(mean = mean(s_parm), sd = sd(s_parm))
p2 <- df %>% ggplot(aes(sample = .data[[col]])) + geom_qq(dparams = parms, color="blue") + geom_abline(color="red") + ggtitle(paste("Probability for", col))

## Dot plot of Passenger {col}
y_axis <- as.factor(nrow(df))
p3 <- ggplot(df, aes(x=.data[[col]], y=y_axis)) + geom_dotplot(binwidth=.5) + labs(title = paste("Dot Plot for", col), x = col, y = "Count") + theme(plot.title = element_text(hjust=0.5))

{
## Boxplot for numeric variables. Explain the findings of your Boxplot.
## Are there any outliers? what is the IQR?
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
    geom_boxplot(color="blue") +
    labs(title = paste("Box Plot for", col), x = col, y = "Value") +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
  
}

multiplot(p1,p2,p3,p4,cols = 2)

  • Age
    • Histogram: Highly useful for viewing the age distribution and identifying modes or skewness.
    • Bar Plot: Not suitable for continuous data like Age.
    • Probability Plot: Useful for assessing whether the age distribution follows a normal pattern.
    • Dot Plot: Useful to show individual ages but may become cluttered with a large dataset.
    • Box Plot: Useful for visualizing the age spread and identifying outliers.
    • Overall: Histogram and Box plot are valuable for visualizing Age, while the Probability plot can check for normality.

Plot of a single quantitative/numeric variable Fare(decimal).

# Create a bargraph for numeric or a histogram for categorical variable.  
# Set col to the desired column name
################################################################################

col = "Fare"
if (is.factor(df[[col]])) { # if the col is categorical, then the code will
  # create two graphs the Bar plot 
  # Highlight and run until the line that start with `# Boxplot for numeric variables
  #
  # If the col is numeric, then it will create the histogram
  # Bar graph for factors
  p1 <- ggplot(df, aes(x = .data[[col]], fill = .data[[col]])) +
    geom_bar() +
    labs(title = paste("Bar Graph for", col), x = col, y = "Count") +
    theme_minimal() +
    theme(legend.position = "right")
} else if (is.numeric(df[[col]]) || is.integer(df[[col]])) {
  
  # Histogram for numeric variables
  # note the the "Binwidth" cannot be set up in the same way to work with 
  # Age or Fare that has a small range and one that the range is in thousands
  ###b_width <- mean(c(df[[col]]))
  b_width <- 30  ## a good binwidth for fare ~ 30
  p1 <- ggplot(df, aes(x = .data[[col]])) +
    geom_histogram(binwidth = b_width, fill="transparent", color="blue") + 
    labs(title = paste("Histogram for", col), x = col, y = "Count") +
    theme_minimal()
}

## Normal Probability Plot of {col} using QQ
s_parm = c(as.numeric(df[[col]]))
parms <- df %>% summarize(mean = mean(s_parm), sd = sd(s_parm))
p2 <- df %>% ggplot(aes(sample = .data[[col]])) + geom_qq(dparams = parms, color="blue") + geom_abline(color="red") + ggtitle(paste("Probability for", col))

## Dot plot of Passenger {col}
y_axis <- as.factor(nrow(df))
p3 <- ggplot(df, aes(x=.data[[col]], y=y_axis)) + geom_dotplot(binwidth=3) + labs(title = paste("Dot Plot for", col), x = col, y = "Count") + theme(plot.title = element_text(hjust=0.5))

{
## Boxplot for numeric variables.
## Are there any outliers? what is the IQR?
p4 <- ggplot(df, aes(x = factor(1), y = .data[[col]])) +
    geom_boxplot(color="blue") +
    labs(title = paste("Box Plot for", col), x = col, y = "Value") +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
  
}

multiplot(p1,p2,p3,p4,cols = 2)

  • Fare
    • Histogram: Useful to show the distribution of fares and identify skewness or multimodality..
    • Bar Plot: Not suitable, as Fare is continuous data.
    • Probability Plot: Useful for assessing normality or skewness in fare distribution.
    • Dot Plot: Can be informative but may become cluttered due to many unique values.
    • Box Plot: Useful for viewing fare spread and outliers, especially since Fare tends to be skewed.
    • Overall: Histogram, Box plot, and Probability plot are effective for Fare analysis.

3. Bivariate Analysis


Bivariate analysis examines relationships between pairs of variables to understand how they interact.

Bar plot for Categorical vs Categorical. (vs Continuous) Survival by Pclass , Survival by Embarked , Survival by Sex , Survival by Age

p1 <- ggplot(data = df, aes(x = Pclass, fill = factor(Survived))) +
  geom_bar(position = "fill") + 
  labs(title = "Survival by Passenger Class", x = "Passenger Class", y = "Proportion Survived")
p2 <- ggplot(data = df, aes(x = Embarked, fill = factor(Survived))) +
  geom_bar(position = "fill") + 
  labs(title = "Survival by Embarkment", x = "Embarkment Port", y = "Proportion Survived")
p3 <- ggplot(data = df, aes(x = Sex, fill = factor(Survived))) +
  geom_bar(position = "fill") + 
  labs(title = "Survival by Gender", x = "Gender", y = "Proportion Survived")
multiplot(p1,p2,p3, cols = 2)

Box Plot for Categorical vs Continuous Fare by Pclass , Fare by Embarked , Age by Survived

p1 <- ggplot(data = df, aes(x = factor(Pclass), y = Fare)) +
  geom_boxplot(color="blue") + 
  labs(title = "Fare Distribution by Class", x = "Passenger Class", y = "Fare")
p2 <- ggplot(data = df, aes(x = factor(Embarked), y = Fare)) +
  geom_boxplot(color="blue") + 
  labs(title = "Fare Distribution by Embarkment", x = "Embarkment Port", y = "Fare")
p3 <- ggplot(data = df, aes(x = factor(Survived), y = Age)) +
  geom_boxplot(color="blue") + 
  labs(title = "Age Distribution by Survival", x = "Survived", y = "Age")
multiplot(p1,p2,p3, cols = 2)

Scatter Plot for Continuous vs. Continuous Fare by Age

ggplot(data = df, aes(x = Age, y = Fare)) +
  geom_point(alpha = 0.5, color="blue") + geom_abline(color="red") +
  labs(title = "Fare by Age", x = "Age", y = "Fare")


4. Multivariate Analysis


Facet Grid. Fare vs. Age by Class and Gender

ggplot(data = df, aes(x = Age, y = Fare, color = factor(Survived))) +
  geom_point() +
  facet_grid(Sex ~ Pclass) +
  labs(title = "Fare vs. Age by Class and Gender", x = "Age", y = "Fare")

This plot shows the relationship between age, fare, and survival across gender and class categories.

Bubble Plots. Fare vs. Age by Survival Status and Family Size

ggplot(data = df, aes(x = Age, y = Fare, color = factor(Survived), size = SibSp)) +
  geom_point(alpha = 0.6) +
  labs(title = "Fare vs. Age by Survival Status and Family Size", x = "Age", y = "Fare")

Here, family size (SibSp) is represented by bubble size, allowing us to see if larger family sizes impacted survival in relation to age and fare.

Heatmap. Survival Rate by Class and Embarkation Port

ggplot(data = df, aes(x = Pclass, y = Embarked, fill = Survived)) +
  geom_tile() +
  labs(title = "Survival Rate by Class and Embarkation Port", x = "Class", y = "Embarked")

This plot allows us to see survival rates within each embarkation point and class, highlighting differences in survival probability.


5. Interpretation and Findings.


In the Titanic dataset, visualizing individual variables helps reveal key patterns, such as passenger demographics and survival rates, and highlights data issues that may affect further analysis. Starting with loading and preparing the data for univariate analysis, we move from univariate visual analysis to bivariate and multivariate visual analysis.

Data Preparation

Data Preparation is largely about handling missing data then filtering and cleaning the data for visualization and further analysis. Missing values were present in variables like Age and Cabin, while Embarked had minor missing values.

Handling methods.
- Deletion: Simple and effective for variables with minimal missing values (e.g., Embarked), but led to data reduction and potential bias when applied to variables like Age.
- Mean Imputation: This method is used for Age to retain dataset size and ensure numerical consistency. It reduces data variability but introduces potential bias.
- Mode Imputation: Applied to categorical variables (e.g., Embarked) to preserve data distribution.

Filtering and Cleaning.
Irrelevant columns were identified as not adding analytical value. (e.g. PassengerID, Name, Ticket, Cabin).
Some numerical variables were reclassified as categorical factor variables for proper analysis (e.g. Survived-> 0=No, 1=Yes, Pclass -> 1=First, 2=Second, 3=Third).

Univariate Analysis

For effective univariate graphical analysis, each relevant column variable is properly clean and classified. This includes identify the original raw data type, the base type for analysis such as categorical or numerical, the classification in terms of qualitative or quantitative, the scale of measurement in terms of nominal, ordinal, interval or ratio, and if quantitative then the variable must be determined as continuous or discrete.

A paper exercise was previously conducted to assess each of the remaining post-cleaned variables for plotting on one or more graph types including: histogram, bar plot, probability plot, dot plot and/or box plot. For example, a categorical variable was plotted as a bar plot, but a discrete numeric variable was plotted as a histogram.

Share of Distribution: Univariate visualizations such as histograms and box plots help understand the overall shape of a variable’s distribution (e.g., normal, skewed, or multimodal). Also box plots highlight potential outliers, which are values that deviate significantly from the rest of the data.

Bivariate (variable pairs) Analysis

For bivariate analysis, I examined the relationships between pairs of variables to understand how they interact. These variable pairs include the following.

  1. Survived vs. Pclass
  2. Survived vs. Sex
  3. Survived vs. Age
  4. Survived vs. SibSp and Parch
  5. Survived vs. Fare
  6. Pclass vs. Fare
  7. Pclass vs. Age
  8. Age vs. Fare
  9. Survived vs. Embarked

Key Insights derived from this analysis include the following.

• Socio-Economic Status and Survival: Higher-class passengers, who paid more for their tickets, had a higher chance of survival. This trend may reflect access to lifeboats and the prioritization of upper-class passengers.
• Gender and Survival: Females had a higher survival rate than males, supporting a strong positive association between gender and survival.
• Family Size: Passengers with smaller families (as measured by SibSp and Parch) tended to survive at slightly higher rates, possibly due to evacuation logistics favoring smaller groups.
• Age and Survival: Although younger passengers, especially children, were prioritized, age did not exhibit a strong correlation with survival.
• Embarkation Point and Survival: Differences in survival rates by embarkation port were minor, suggesting embarkation point alone had little impact on survival but may indicate socio-economic factors indirectly.

Bivariate analysis provides critical insights into relationships between passenger demographics, socio-economic factors, and survival outcomes in the Titanic dataset. By understanding these correlations and patterns, we gain a clearer picture of which factors contributed most significantly to survival, laying the foundation for deeper, multivariate analysis.

Multivariate Analysis

Multivariate data visualization is essential in Exploratory Data Analysis (EDA), especially for data ets like Titanic.csv, which contain numerous variables with potential interactions and dependencies. While univariate and bivariate analyses provide insights into individual and paired variables, multivariate visualization enables a comprehensive understanding of how multiple variables (3 or more) relate, allowing us to uncover complex patterns that may not be apparent when examining variables in isolation or pairs.

In the Titanic dataset, multivariate visualization helps reveal how variables like Pclass (socio-economic status), Sex (gender), Age (age), and Fare interact in relation to Survived (survival outcome). For instance, it allows us to observe:
• Combined Effects: For instance, survival rates can vary significantly across different combinations of Pclass, Sex, and Age. Multivariate visualization can show that females in the upper class had higher survival rates, while males in the lower class had lower survival rates.
• Interactions and Dependencies: Variables like Fare and Pclass are related (higher fares correspond to upper classes), and this relationship affects survival chances. Visualizing these variables together highlights the dependency of survival on socio-economic factors.
• Patterns by Groups: Multivariate visualizations can reveal patterns within certain groups, such as how age affects survival differently across sexes or how family size (using SibSp and Parch) impacts survival when combined with socio-economic class.

Multivariate analysis can reveal non-linear relationships, such as instances where survival is highly probable for younger females across all classes but only for older passengers in the upper classes. Outliers or unique cases, like exceptionally high fares among lower classes or extreme ages associated with high survival rates, may also emerge more clearly when multiple variables are plotted together.

Key Benefits of Multivariate Visualization.

  1. Holistic View of Data Relationships: Viewing multiple variables at once provides a more comprehensive understanding of the factors contributing to survival in complex datasets.
  2. Identification of Multivariate Patterns and Anomalies: Recognizing interactions between multiple variables enables better detection of multivariate outliers and unique patterns in specific subsets of passengers.
  3. Enhanced Feature Selection for Modeling: Understanding complex interactions guides feature selection for predictive modeling, ensuring that meaningful relationships are preserved in model training.

Summary

In exploratory data analysis (EDA), bivariate analysis and multivariate analysis allow analysts to examine relationships between two or more variables, respectively, helping uncover patterns, correlations, or interactions that might be significant for insights or predictive modeling. In a dataset like titanic.csv, these approaches can reveal survival factors, demographic trends, and socio-economic impacts on outcomes.

• Bivariate Analysis: Focuses on relationships between two variables, ideal for uncovering simple correlations or distributions (e.g., survival rate by gender). • Multivariate Analysis: Examines complex interactions among multiple variables, offering insights into combined effects on outcomes (e.g., survival as influenced by age, gender, and class).

Each type of analysis provides unique perspectives in EDA. Bivariate analysis simplifies relationships, while multivariate analysis uncovers deeper insights, which are especially valuable for complex datasets like Titanic, where multiple interconnected factors influence survival.

References

Hinton, W. (2024). A Univariate Visualization and Analysis of the Titanic Data Set. Available at Rpubs. https://www.rpubs.com/whinton/.

Prabhakaran, S. (2023). The complete ggplot2 tutorial. R-statistics.co. Available online _{link}(https://r-statistics.co/Complete-Ggplot2-Tutorial-Part1-With-R-Code.html)_ Datar, R., & Garg, H. (2019). Hands-on exploratory data analysis with R: Become an expert in exploratory data analysis using R packages. O’Reilly Media, Inc. 

Smeaton, A. (2003). NIST/SEMATECH Engineering Statistics Handbook. _{link}(https://www.itl.nist.gov/div898/handbook/)_. R Programming for Statistics and Data Science (Media from Packt Publishing available freely through O’Reilly Media Inc.). (2018).

Xie, Y., Allaire, J. J., & Grolemund, G. (2018). R markdown: The definitive guide. CRC Press. _[Link]https://bookdown.org/yihui/rmarkdown/)_

Kabacoff, Robert (2022). R in Action, Third Edition. O’Reilly Online Learning. _{Link}(https://learning.oreilly.com/library/view/r-in-action/9781617296055/)_

EPA. (2024). Exploratory Data Analysis. “United States Environmental Protection Agency”. Link

Wickham, H. (2016). gplot2: Elegant Graphics for Data Analysis. Retrieved from _[Link}(https://ggplot2.tidyverse.org)_.


This study conducted and performed by Will Hinton