Data Analysis on Stress Responses in Caribbean Reef Sharks

# Clear working space ensuring no leftover variables 
rm(list=ls())

#Set the working directory to the folder where the files are stored
setwd("/Users/weronikastaniak/Desktop/UNI/Masters/RMDA/summative") # This ensures that R knows where to look for in the files

#List all files in the directory to confirm the dataset is present
list.files("/Users/weronikastaniak/Desktop/UNI/Masters/RMDA/summative") # Quickly verifies the data

[1] "Copy of sharks.xlsx"   "Copy of sharksub.xlsx" "dataset1.csv"         
[4] "dataset2.csv"

#Load the data set into R 
#This step reads the CSV files into the data frames for analysis
Dataset1 <- read.csv("/Users/weronikastaniak/Desktop/UNI/Masters/RMDA/summative/Dataset1.csv")
Dataset2 <- read.csv("/Users/weronikastaniak/Desktop/UNI/Masters/RMDA/summative/Dataset2.csv")

Question 1: Is There a Correlation between Air and Water?

Exploratory Data Analysis

# Load necessary libraries
library(ggplot2) #Load ggplot2 for visualizations 
library(readr) #Load readr to read the data
library(moments)# used to calculate skewness and kurtosis which are measures of data distribution

# Load the data sets
Dataset1 <- read_csv("/Users/weronikastaniak/Desktop/UNI/Masters/RMDA/summative/Dataset1.csv")

Rows: 500 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): ID, sex
dbl (8): blotch, BPM, weight, length, air, water, meta, depth

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Dataset2 <- read_csv("/Users/weronikastaniak/Desktop/UNI/Masters/RMDA/summative/Dataset2.csv")

Rows: 50 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): ID, sex
dbl (2): blotch1, blotch2

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Question 1: Is there a correlation between air and water?

# Visualize the distribution of air and water temperature 
#Purpose: To understand the underlying data and check for any patterns or anomalies 

# Histogram for Air Temperature 
ggplot(Dataset1, aes(x = Dataset1$air)) +
  geom_histogram(binwidth = 1, fill= "blue", alpha = 0.6) + # Adjust bin size for clarity 
  labs(title = "Histogram of Air Temperature", 
       x = "Air Temperature", 
       y = "Frequency") +
  theme_minimal()

# Histogram for Water Temperature 
ggplot(Dataset1, aes(x= Dataset1$water)) +
  geom_histogram(binwidth = 1, fill= "green", alpha= 0.6) +
  labs(title = "Histogram of Water Temperature", 
       x = "Water Temperature",
       y = "Frequency") +
  theme_minimal()

# Create Density Plot for Air and Water Temperature
# Density plots show the smoothed distribution of water and air temps which are useful for visualizing normality and identifying skewness 
ggplot(Dataset1, aes(x = Dataset1$air)) +
  geom_density(fill= "blue", alpha = 0.4) +
  labs(title = "Density Plot of Air Temperature", x = "Air Temperature", y = "Density") +
  theme_minimal()

ggplot(Dataset1, aes(x = Dataset1$water)) +
  geom_density(fill = "green", alpha = 0.4) +
  labs(title = "Density of Water Temperature", x= "Water Temperature", y = "Density") +
  theme_minimal()

Assessing Normality

# Check for normality of air and water temperature data 
# Purpose: to determine whether the parametric or non-parametric statistical tests are appropritate 


# Step 1: Generate a QQ- Plots
# QQ plot visually compares the sample data to a theoretical normal distribution
par(mfrow = c(1, 2)) #Set plotting layout for two QQ-plots
qqnorm(Dataset1$air, main= "QQ Plot: Air Temperature") # Plot for air temperature
qqline(Dataset1$air, col= "red") #Add a reference line for normality
qqnorm(Dataset1$water, main="QQ Plot: Water Temperature")
qqline(Dataset1$water, col="red")

# Step 2: Perform Shapiro-Wilk test for normality 
# If P-value > 0.05, the data is considered to be normally distribution 
shapiro_air <- shapiro.test(Dataset1$air)
shapiro_water <- shapiro.test(Dataset1$water)

# Print the Shapiro-Wilk test results
# Display the test statistical and p-value for both air and water temperatures. 
cat("Shapiro-Wilk Test for Air Temperature: W =", shapiro_air$statistic, ", p-value =", shapiro_air$p.value, "\n")

Shapiro-Wilk Test for Air Temperature: W = 0.9588536 , p-value = 1.337993e-10

cat("Shapiro-Wilk Test for Water Temperature: W =", shapiro_water$statistic, ", p-value =", shapiro_water$p.value, "\n")

Shapiro-Wilk Test for Water Temperature: W = 0.9603501 , p-value = 2.370881e-10

# Step 3: Calculate Skewness and Kurtosis
# Skewness indicates asymmetry of the data distribution 
# Kurtosis measures the 'tailedness' of the data distribution 
skew_air <- skewness(Dataset1$air)
kurt_air <- kurtosis(Dataset1$air)
skew_water <- skewness(Dataset1$water)
kurt_water <- kurtosis(Dataset1$water)

# Print Skewness and Kurtosis 
# These metrics provide additional insights into the shape of the data distribution 
cat("Air Temperature: Skewness =", skew_air, ", Kurtosis =", kurt_air, "\n")

Air Temperature: Skewness = 0.03811551 , Kurtosis = 1.864071

cat("Water Temperature: Skewness =", skew_water, ", Kurtosis =", kurt_water, "\n")

Water Temperature: Skewness = -0.009127381 , Kurtosis = 1.857337

Results:

Shapiro- Wilk Test :

- p- values are extremely small therefore suggests that data does not follow a normal distribution

Skewness and Kurtosis:

- Skewness values are close to zero, indicating symmetry in the distribution

Kurtosis values are below 3, suggesting that the distributions are platykurtic (flatter than a normal distribution) Therefore best test to do is Spearman’s rank correlation coefficient

# Perform the Spearman correlation test
# Spearman is used because the data is not normally distributed
spearman_corr <- cor.test(Dataset1$air, Dataset1$water, method = "spearman")

# Display the results of the Spearman correlation test
cat("Spearman Correlation Test Results:\n")

Spearman Correlation Test Results:

print(spearman_corr)


    Spearman's rank correlation rho

data:  Dataset1$air and Dataset1$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.05637344

# Prepare the results for the table
# Extract relevant statistics from the test
correlation_coefficient <- spearman_corr$estimate
p_value <- spearman_corr$p.value

# Create a summary table
# The table includes the test statistic, correlation coefficient, and p-value
results_table <- data.frame(
  Test = "Spearman Correlation",
  `Correlation Coefficient` = round(correlation_coefficient, 3),
  `p-value` = signif(p_value, 3),
  `Interpretation` = ifelse(p_value < 0.05, "Significant", "Not Significant")
)

# Print the results table
cat("\nResults Table:\n")


Results Table:

print(results_table)

                    Test Correlation.Coefficient p.value  Interpretation
rho Spearman Correlation                  -0.056   0.208 Not Significant

# Scatter-plot with a smoothed trend line to visualize the relationship between air and water temperatures
ggplot(Dataset1, aes(x = air, y = water)) +
  geom_point(color = "blue", alpha = 0.6) +  # Scatter points
  geom_smooth(method = "loess", color = "red", se = FALSE) +  # Red smooth line (LOESS)
  labs(title = "Scatterplot of Air vs Water Temperature",
       x = "Air Temperature",
       y = "Water Temperature") +
  theme_minimal()  # Clean theme

`geom_smooth()` using formula = 'y ~ x'

Results: Question: Is there a correlation between air and water temperatures?

Answer: Based on the Spearman’s correlation test (p= 0.056, p= 0.208, there is no significant correlation between air and water temperatures.

Results

# Create a results table for Spearman Correlation
library(knitr) # For table rendering
results_table <- data.frame(
  Test = "Spearman Correlation",
  Correlation_Coefficient = round(correlation_coefficient, 3),
  p_value = signif(p_value, 3),
  Interpretation = ifelse(p_value < 0.05, "Significant", "Not Significant")
)

# Print the table as a clean markdown-ready table
kable(results_table, caption = "Results of Spearman Correlation Test for Air and Water Temperatures")

Results of Spearman Correlation Test for Air and Water Temperatures
	Test	Correlation_Coefficient	p_value	Interpretation
rho	Spearman Correlation	-0.056	0.208	Not Significant

# Create scatterplot with LOESS smoothing
ggplot(Dataset1, aes(x = air, y = water)) +
  geom_point(color = "blue", alpha = 0.6) +  # Add scatter points
  geom_smooth(method = "loess", color = "red", se = FALSE) +  # Add smoothed LOESS line
  labs(
    title = "Scatterplot of Air vs Water Temperature",
    x = "Air Temperature",
    y = "Water Temperature"
  ) +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

Question 2: Does Multiple Capture Have an Effect on Blotching Time?

# Load the necessary package 
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ stringr   1.5.1
✔ forcats   1.0.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#Inspect the data
head(Dataset2)

# A tibble: 6 × 4
  ID    sex    blotch1 blotch2
  <chr> <chr>    <dbl>   <dbl>
1 SH269 Female    36.1    37.2
2 SH163 Female    33.4    34.4
3 SH008 Female    36.3    36.5
4 SH239 Female    35.0    36.0
5 SH332 Female    35.7    36.8
6 SH328 Female    34.9    35.9

#Check structure and summary of data
str(Dataset2)

spc_tbl_ [50 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ ID     : chr [1:50] "SH269" "SH163" "SH008" "SH239" ...
 $ sex    : chr [1:50] "Female" "Female" "Female" "Female" ...
 $ blotch1: num [1:50] 36.1 33.4 36.3 35 35.7 ...
 $ blotch2: num [1:50] 37.2 34.4 36.5 36 36.8 ...
 - attr(*, "spec")=
  .. cols(
  ..   ID = col_character(),
  ..   sex = col_character(),
  ..   blotch1 = col_double(),
  ..   blotch2 = col_double()
  .. )
 - attr(*, "problems")=<externalptr>

summary(Dataset2)

      ID                sex               blotch1         blotch2     
 Length:50          Length:50          Min.   :32.49   Min.   :33.47  
 Class :character   Class :character   1st Qu.:34.38   1st Qu.:35.31  
 Mode  :character   Mode  :character   Median :34.94   Median :35.94  
                                       Mean   :35.03   Mean   :35.96  
                                       3rd Qu.:35.90   3rd Qu.:36.78  
                                       Max.   :37.07   Max.   :38.18

#Check for missing values
sum(is.na(Dataset2)) #Should return 0 if data is clean

[1] 0

Exploring the data and answering the question:

# Exploratory Data Analysis (EDA)
## Summary stats
### Summary statistics provide an overview of the central tendency (mean) and variability (SD) for blotching time.
Dataset2 %>%
  summarise(
    mean_blotch1 = mean(blotch1, na.rm=TRUE), # Calculate the mean blotching time for captures
    mean_blotch2 = mean(blotch2, na.rm=TRUE),
    SD_blotch1 = sd(blotch1, na.rm=TRUE), # Calculate the standard deviation for the captures
    SD_blotch2 = sd(blotch2, na.rm=TRUE)
  )

# A tibble: 1 × 4
  mean_blotch1 mean_blotch2 SD_blotch1 SD_blotch2
         <dbl>        <dbl>      <dbl>      <dbl>
1         35.0         36.0       1.10       1.16

# Visualizations 
## Blox-plot for blotching times
### Box plots are ideal for comparing distribution between two groups and highlights the differences in medians, spread and potential outlines. 
Dataset2 %>%
  pivot_longer(cols = starts_with("Blotch"), names_to = "Capture", values_to = "Time") %>%
  ggplot(aes(x = Capture, y = Time, fill = Capture)) +
  geom_boxplot() + # Create a boxplot to visualise the distribution of blotching times
  labs(title = "Blotching Times by Capture Event", x = "Capture Event", y = "Blotching Time (seconds)") +
  theme_minimal() # Apply clean and minimal theme

## Paired scatter-plot
## Shows the relationship between blotching times for the two captures
### Diagonal line helps to identify whether blotch time for capture 2 differ consistently from capture 1. 
ggplot(Dataset2, aes(x = blotch1, y = blotch2)) +
  geom_point(alpha = 0.7) + # Plot points for paired blotching times
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") + # Adding a diagonal reference line
  labs(title = "Paired Blotching Times", x = "Blotch Time (Capture 1)", y = "Blotch Time (Capture 2)") +
  theme_minimal()

# Statistical Analysis
## Paired t-test: Evaluates whether there is a significant mean difference in blotching time between two captures. 
t_test_results <- t.test(Dataset2$blotch1, Dataset2$blotch2, paired = TRUE)

# Output t-test results
print(t_test_results) # Displays the results


    Paired t-test

data:  Dataset2$blotch1 and Dataset2$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384

# Interpretation of Results
## Save the t-test results for reporting
capture_effect <- ifelse(t_test_results$p.value < 0.05, 
                         "significant", 
                         "not significant")

# This steps interprets the t-test results by determining statistical significance and presenting them in a clear, re-portable format. 
cat("The effect of multiple captures on blotching time is", capture_effect, 
    "with a p-value of", round(t_test_results$p.value, 4), ".\n")

The effect of multiple captures on blotching time is significant with a p-value of 0 .

# Prepare data for visualization
data_long <- Dataset2 %>%
  pivot_longer(cols = starts_with("blotch"), 
               names_to = "Capture", 
               values_to = "BlotchTime") %>%
  mutate(Capture = recode(Capture, 
                          blotch1 = "Capture 1", 
                          blotch2 = "Capture 2"))

# Boxplot to show blotching time distribution for each capture
ggplot(data_long, aes(x = Capture, y = BlotchTime, fill = Capture)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) + # Create boxplots with semi-transparency and hide outliners 
  stat_summary(fun = "mean", geom = "point", shape = 20, size = 4, color = "red") + # Add red points for mean values
  labs(title = "Blotching Time by Capture Event",
       x = "Capture Event",
       y = "Blotching Time (seconds)") +
  theme_minimal() +
  theme(legend.position = "none") # Remove legends for simplicity

# Bar chart with error bars
## Compute summary statistics
data_summary <- data_long %>%
  group_by(Capture) %>%
  summarise(Mean = mean(BlotchTime, na.rm = TRUE), # Calculate mean blotching time
            SD = sd(BlotchTime, na.rm = TRUE), # Calculate standard deviation
            N = n(), # Count number of observations 
            SE = SD / sqrt(N)) # Calculate standard error

## Plot the summary as a bar chart
ggplot(data_summary, aes(x = Capture, y = Mean, fill = Capture)) +
  geom_bar(stat = "identity", alpha = 0.7, color = "black") + # Create bar chart with transparency 
  geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.2) + # Add error bars
  labs(title = "Mean Blotching Time with Standard Errors",
       x = "Capture Event",
       y = "Blotching Time (seconds)") +
  theme_minimal() +
  theme(legend.position = "none")

Result Discussion: The results of the paired t-test indicate that there is a statistically significant effect on multiple captures on blotching time.

Question 3: Is It Possible to Predict Blotching Time?

# Step 1: Install and load the required packages
# Install required packages if not already installed
# install.packages(c("dplyr", "ggplot2", "caret", "corrplot", "tidyr"))

library(dplyr)   # For data manipulation
library(ggplot2) # For visualization
library(caret)   # For regression modeling and performance evaluation

Loading required package: lattice


Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

library(corrplot) # For correlation matrix visualization

corrplot 0.95 loaded

library(tidyr)   # For reshaping data

# Step 2: Load and preview the datasets
# Assuming Dataset1 and Dataset2 are already loaded in your R environment
head(Dataset1)

# A tibble: 6 × 10
  ID    sex    blotch   BPM weight length   air water  meta depth
  <chr> <chr>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH001 Female   37.2   148   74.7   187.  37.7  23.4  64.1  53.2
2 SH002 Female   34.5   158   73.4   189.  35.7  21.4  73.7  49.6
3 SH003 Female   36.3   125   71.8   284.  34.8  20.1  54.4  49.4
4 SH004 Male     35.3   161  105.    171.  36.2  21.6  86.3  50.3
5 SH005 Female   37.4   138   67.1   264.  33.6  21.8 108.   49.0
6 SH006 Male     33.5   126  110.    270.  36.4  20.9 109.   46.8

head(Dataset2)

# A tibble: 6 × 4
  ID    sex    blotch1 blotch2
  <chr> <chr>    <dbl>   <dbl>
1 SH269 Female    36.1    37.2
2 SH163 Female    33.4    34.4
3 SH008 Female    36.3    36.5
4 SH239 Female    35.0    36.0
5 SH332 Female    35.7    36.8
6 SH328 Female    34.9    35.9

# Step 3: Explore the data (EDA)
## Check the structure and summary statistics of Dataset1
str(Dataset1)

spc_tbl_ [500 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ ID    : chr [1:500] "SH001" "SH002" "SH003" "SH004" ...
 $ sex   : chr [1:500] "Female" "Female" "Female" "Male" ...
 $ blotch: num [1:500] 37.2 34.5 36.3 35.3 37.4 ...
 $ BPM   : num [1:500] 148 158 125 161 138 126 166 135 132 127 ...
 $ weight: num [1:500] 74.7 73.4 71.8 104.6 67.1 ...
 $ length: num [1:500] 187 189 284 171 264 ...
 $ air   : num [1:500] 37.7 35.7 34.8 36.2 33.6 ...
 $ water : num [1:500] 23.4 21.4 20.1 21.6 21.8 ...
 $ meta  : num [1:500] 64.1 73.7 54.4 86.3 108 ...
 $ depth : num [1:500] 53.2 49.6 49.4 50.3 49 ...
 - attr(*, "spec")=
  .. cols(
  ..   ID = col_character(),
  ..   sex = col_character(),
  ..   blotch = col_double(),
  ..   BPM = col_double(),
  ..   weight = col_double(),
  ..   length = col_double(),
  ..   air = col_double(),
  ..   water = col_double(),
  ..   meta = col_double(),
  ..   depth = col_double()
  .. )
 - attr(*, "problems")=<externalptr>

summary(Dataset1)

      ID                sex                blotch           BPM       
 Length:500         Length:500         Min.   :30.78   Min.   :119.0  
 Class :character   Class :character   1st Qu.:34.16   1st Qu.:129.0  
 Mode  :character   Mode  :character   Median :35.05   Median :142.0  
                                       Mean   :35.13   Mean   :141.8  
                                       3rd Qu.:36.05   3rd Qu.:153.2  
                                       Max.   :40.08   Max.   :166.0  
     weight           length           air            water      
 Min.   : 65.10   Min.   :128.3   Min.   :33.00   Min.   :20.01  
 1st Qu.: 75.68   1st Qu.:172.0   1st Qu.:34.42   1st Qu.:21.55  
 Median : 87.82   Median :211.1   Median :35.43   Median :23.11  
 Mean   : 87.94   Mean   :211.0   Mean   :35.54   Mean   :23.02  
 3rd Qu.:100.40   3rd Qu.:251.8   3rd Qu.:36.71   3rd Qu.:24.37  
 Max.   :110.94   Max.   :291.0   Max.   :38.00   Max.   :25.99  
      meta            depth      
 Min.   : 50.03   Min.   :44.64  
 1st Qu.: 67.39   1st Qu.:48.90  
 Median : 82.45   Median :50.14  
 Mean   : 82.04   Mean   :50.14  
 3rd Qu.: 95.97   3rd Qu.:51.35  
 Max.   :112.45   Max.   :56.83

## Check for missing values
colSums(is.na(Dataset1))

    ID    sex blotch    BPM weight length    air  water   meta  depth 
     0      0      0      0      0      0      0      0      0      0

## Visualize the distribution of Blotch (dependent variable)
ggplot(Dataset1, aes(x = blotch)) +
  geom_histogram(binwidth = 2, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Blotch Time", x = "Blotch Time (seconds)", y = "Frequency") +
  theme_minimal()

# Step 4: Check correlations among continuous variables
## Exclude non-numeric columns (e.g., ID and Sex) for correlation matrix
numeric_data <- Dataset1 %>% select(-ID, -sex)

## Compute the correlation matrix
cor_matrix <- cor(numeric_data)

## Visualize the correlation matrix
corrplot(cor_matrix, method = "circle", type = "upper", tl.col = "black", tl.srt = 45,
         title = "Correlation Matrix of Continuous Variables")

# Step 5: Build a multiple linear regression model to predict Blotch
## Fit the model
model <- lm(blotch ~ BPM + weight + length + air + water + meta + depth + sex, data = Dataset1)

## Display the model summary
summary(model)


Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta + 
    depth + sex, data = Dataset1)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.97715 -0.66193 -0.00841  0.64123  2.90395 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.1179728  1.8749828   5.930 5.73e-09 ***
BPM         -0.0020791  0.0031540  -0.659  0.51009    
weight       0.0017281  0.0033143   0.521  0.60231    
length       0.0013042  0.0009606   1.358  0.17517    
air         -0.0310068  0.0315302  -0.983  0.32590    
water       -0.0143878  0.0268112  -0.537  0.59176    
meta        -0.0011610  0.0025671  -0.452  0.65127    
depth        0.5034077  0.0220870  22.792  < 2e-16 ***
sexMale      0.3088617  0.0890602   3.468  0.00057 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9912 on 491 degrees of freedom
Multiple R-squared:  0.5256,    Adjusted R-squared:  0.5178 
F-statistic: 67.99 on 8 and 491 DF,  p-value: < 2.2e-16

# Step 6: Model Diagnostics
## Plot residual diagnostics
par(mfrow = c(2, 2))
plot(model)

# Step 7: Visualize relationships for key variables
## Blotch vs Depth (significant predictor)
ggplot(Dataset1, aes(x = depth, y = blotch)) +
  geom_point(color = "purple", alpha = 0.6) +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Blotch vs. Depth", x = "Depth (m)", y = "Blotch Time (seconds)") +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

## Blotch vs Sex (categorical variable)
ggplot(Dataset1, aes(x = sex, y = blotch, fill = sex)) +
  geom_boxplot() +
  labs(title = "Blotch vs. Sex", x = "Sex", y = "Blotch Time (seconds)") +
  theme_minimal() +
  scale_fill_manual(values = c("pink", "lightblue"))

## Blotch vs Meta (stress hormone cortisol)
ggplot(Dataset1, aes(x = meta, y = blotch)) +
  geom_point(color = "orange", alpha = 0.6) +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Blotch vs. Meta (Cortisol Levels)", x = "Cortisol Levels (ng/mL)", y = "Blotch Time (seconds)") +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

## Blotch vs Weight
ggplot(Dataset1, aes(x = weight, y = blotch)) +
  geom_point(color = "green", alpha = 0.6) +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Blotch vs. Weight", x = "Weight (kg)", y = "Blotch Time (seconds)") +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

# Step 8: Model Performance Metrics
## R-squared value
r_squared <- summary(model)$r.squared
cat("R-squared Value:", r_squared, "\n")

R-squared Value: 0.5255797

## Check the significance of each predictor variable
cat("Significant Predictors (p < 0.05):\n")

Significant Predictors (p < 0.05):

summary(model)$coefficients[summary(model)$coefficients[, 4] < 0.05, ]

              Estimate Std. Error   t value     Pr(>|t|)
(Intercept) 11.1179728 1.87498284  5.929640 5.729619e-09
depth        0.5034077 0.02208705 22.791988 5.614863e-79
sexMale      0.3088617 0.08906018  3.468011 5.704265e-04