Assessing Shark Welfare

Author

Megara Welaratne

Published

January 24, 2025

Introduction

This work book looks into the relationship between stress indicators in sharks. Through analyzing the data based on blotching, gathered when capturing sharks, will allow for evaluation of any correlation between stress and capture.

Preparing for Data input

# Import the libraries that are needed 
library(readxl)      # For reading Excel files
library(tidyverse)   # For data manipulation and visualization (ggplot2, dplyr, etc.)
library(ggplot2)     # For visualization (part of tidyverse)
library(dplyr)       # For data manipulation (part of tidyverse)
library(tidyr)       # For data reshaping (part of tidyverse)

# For statistical analysis 
library(stats)       # For basic statistical functions (built-in to R)
library(car)         # For additional statistical tools

# Data cleaning 
library(janitor)     # For cleaning data
library(lubridate)   # For date handling (if needed)

# Interactive plots 
library(plotly)      # For interactive plots

# Reporting tools 
library(rmarkdown)   # For rendering reports (Quarto uses this in the background)
library(knitr)       # For chunk processing in Quarto

Uploading and Summary of Data set 1 “Sharks”

# providing the correct path to the file
sharks <- read_excel("/Users/megarawelaratne/Desktop/sharks.xlsx")


head(sharks)  # Shows a preview (the first 6 rows) 
# A tibble: 6 × 10
  ID    sex    blotch   BPM weight length   air water  meta depth
  <chr> <chr>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH001 Female   37.2   148   74.7   187.  37.7  23.4  64.1  53.2
2 SH002 Female   34.5   158   73.4   189.  35.7  21.4  73.7  49.6
3 SH003 Female   36.3   125   71.8   284.  34.8  20.1  54.4  49.4
4 SH004 Male     35.3   161  105.    171.  36.2  21.6  86.3  50.3
5 SH005 Female   37.4   138   67.1   264.  33.6  21.8 108.   49.0
6 SH006 Male     33.5   126  110.    270.  36.4  20.9 109.   46.8
summary(sharks) # Show a summary of the data set
      ID                sex                blotch           BPM       
 Length:500         Length:500         Min.   :30.78   Min.   :119.0  
 Class :character   Class :character   1st Qu.:34.16   1st Qu.:129.0  
 Mode  :character   Mode  :character   Median :35.05   Median :142.0  
                                       Mean   :35.13   Mean   :141.8  
                                       3rd Qu.:36.05   3rd Qu.:153.2  
                                       Max.   :40.08   Max.   :166.0  
     weight           length           air            water      
 Min.   : 65.10   Min.   :128.3   Min.   :33.00   Min.   :20.01  
 1st Qu.: 75.68   1st Qu.:172.0   1st Qu.:34.42   1st Qu.:21.55  
 Median : 87.82   Median :211.1   Median :35.43   Median :23.11  
 Mean   : 87.94   Mean   :211.0   Mean   :35.54   Mean   :23.02  
 3rd Qu.:100.40   3rd Qu.:251.8   3rd Qu.:36.71   3rd Qu.:24.37  
 Max.   :110.94   Max.   :291.0   Max.   :38.00   Max.   :25.99  
      meta            depth      
 Min.   : 50.03   Min.   :44.64  
 1st Qu.: 67.39   1st Qu.:48.90  
 Median : 82.45   Median :50.14  
 Mean   : 82.04   Mean   :50.14  
 3rd Qu.: 95.97   3rd Qu.:51.35  
 Max.   :112.45   Max.   :56.83  

Uploading and Symmary of Data set 2 “Sharkssub”

# providing the correct path to the file
sharksub <- read_excel("/Users/megarawelaratne/Desktop/sharksub.xlsx")

head(sharksub)  # Shows a preview (first 6 rows)
# A tibble: 6 × 4
  ID    sex    blotch1 blotch2
  <chr> <chr>    <dbl>   <dbl>
1 SH269 Female    36.1    37.2
2 SH163 Female    33.4    34.4
3 SH008 Female    36.3    36.5
4 SH239 Female    35.0    36.0
5 SH332 Female    35.7    36.8
6 SH328 Female    34.9    35.9
summary(sharksub) # Shows a summary of the data set 
      ID                sex               blotch1         blotch2     
 Length:50          Length:50          Min.   :32.49   Min.   :33.47  
 Class :character   Class :character   1st Qu.:34.38   1st Qu.:35.31  
 Mode  :character   Mode  :character   Median :34.94   Median :35.94  
                                       Mean   :35.03   Mean   :35.96  
                                       3rd Qu.:35.90   3rd Qu.:36.78  
                                       Max.   :37.07   Max.   :38.18  

Statistical and Visual Analysis for each Question

Q1: Correlation Between Air and Water (Dataset 1)

#  Creating a Scatter plot of air vs water
ggplot(sharks, aes(x = air, y = water)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red") +
  theme_minimal() +
  labs(title = "Scatter Plot of Air vs Water", x = "Air (time)", y = "Water (time)")

# Pearson correlation test
cor_test <- cor.test(sharks$air, sharks$water, method = "pearson")
cor_test

    Pearson's product-moment correlation

data:  sharks$air and sharks$water
t = -1.2346, df = 498, p-value = 0.2176
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.14224207  0.03260803
sample estimates:
        cor 
-0.05524051 

Q2: Does Multiple Capture Affect Blotching Time (Dataset 2)

# Checking the structure of blotch1 and blotch2
str(sharksub)
tibble [50 × 4] (S3: tbl_df/tbl/data.frame)
 $ ID     : chr [1:50] "SH269" "SH163" "SH008" "SH239" ...
 $ sex    : chr [1:50] "Female" "Female" "Female" "Female" ...
 $ blotch1: num [1:50] 36.1 33.4 36.3 35 35.7 ...
 $ blotch2: num [1:50] 37.2 34.4 36.5 36 36.8 ...
# The Paired t-test
t_test_paired <- t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
t_test_paired

    Paired t-test

data:  sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384 
# Reshaping the data to long format for visualization
sharksub_long <- sharksub %>%
  pivot_longer(cols = c(blotch1, blotch2), 
               names_to = "Capture", 
               values_to = "Blotch_Time")

# Creating a box plot
ggplot(sharksub_long, aes(x = Capture, y = Blotch_Time, fill = Capture)) +
  geom_boxplot(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Blotching Time Across Captures",
       x = "Capture Event",
       y = "Blotching Time")

Q3: Is it possible to predict Blotching Time (Dataset 1 & 2)

Only using Data set 1

# Fit a linear regression model to predict blotch
lm_model <- lm(blotch ~ air + water + depth, data = sharks)

# Summarize the model
summary(lm_model)

Call:
lm(formula = blotch ~ air + water + depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.89045 -0.65406  0.00093  0.60311  2.76805 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.42834    1.74348   6.555  1.4e-10 ***
air         -0.03048    0.03143  -0.970    0.333    
water       -0.02066    0.02687  -0.769    0.442    
depth        0.50372    0.02219  22.695  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.001 on 496 degrees of freedom
Multiple R-squared:  0.5115,    Adjusted R-squared:  0.5086 
F-statistic: 173.2 on 3 and 496 DF,  p-value: < 2.2e-16

Only using data set 1 showed that using only water and air as factors in predicting blotching time was not enough but depth added a significant difference. However, merging both data sets will allow for more variables to contribute to the model such as blotch2.

# Merge datasets to see if more predications are available
sharks_combined <- sharks %>%
  left_join(sharksub, by = "ID")

# Fit a regression model using variables from both datasets
lm_model_combined <- lm(blotch ~ air + water + depth + blotch1 + blotch2, 
                        data = sharks_combined)
summary(lm_model_combined)

Call:
lm(formula = blotch ~ air + water + depth + blotch1 + blotch2, 
    data = sharks_combined)

Residuals:
       Min         1Q     Median         3Q        Max 
-4.460e-15 -1.307e-16  8.440e-17  3.276e-16  1.208e-15 

Coefficients:
              Estimate Std. Error    t value Pr(>|t|)    
(Intercept)  1.680e-14  5.464e-15  3.075e+00  0.00361 ** 
air         -1.435e-17  7.568e-17 -1.900e-01  0.85043    
water       -1.017e-16  7.102e-17 -1.432e+00  0.15922    
depth        2.654e-16  8.731e-17  3.040e+00  0.00398 ** 
blotch1      1.000e+00  3.431e-16  2.915e+15  < 2e-16 ***
blotch2     -2.725e-17  3.166e-16 -8.600e-02  0.93182    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.192e-16 on 44 degrees of freedom
  (450 observations deleted due to missingness)
Multiple R-squared:      1, Adjusted R-squared:      1 
F-statistic: 1.754e+31 on 5 and 44 DF,  p-value: < 2.2e-16
# Check correlations between blotch, blotch1, and blotch2
cor(sharks_combined %>% select(blotch, blotch1, blotch2), use = "complete.obs")
           blotch   blotch1   blotch2
blotch  1.0000000 1.0000000 0.9456842
blotch1 1.0000000 1.0000000 0.9456842
blotch2 0.9456842 0.9456842 1.0000000

blotch1 is redundant because it is identical to blotch. Including it in the regression model will lead to over fitting and meaningless results.Blotch2 is highly correlating with blotch but not identical, meaning it can remain in the model.

# Remove blotch1 from the dataset
sharks_combined_clean <- sharks_combined %>%
  select(-blotch1)

# Fit the refined regression model
lm_model_refined <- lm(blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)
summary(lm_model_refined)

Call:
lm(formula = blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.30521 -0.20611 -0.10466  0.05488  1.12037 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.24706    2.32427   1.397   0.1693    
air          0.01211    0.03283   0.369   0.7139    
water       -0.05133    0.02989  -1.717   0.0928 .  
depth        0.03305    0.03762   0.879   0.3842    
blotch2      0.85877    0.05038  17.044   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.356 on 45 degrees of freedom
  (450 observations deleted due to missingness)
Multiple R-squared:  0.9031,    Adjusted R-squared:  0.8945 
F-statistic: 104.9 on 4 and 45 DF,  p-value: < 2.2e-16

The refined regression model using air, water, depth and blotch2 provides much more interpretable results compared to the previous versions.

Creating Diagnostic plots for Visualization of the model

# Create diagnostic plots for the regression model
par(mfrow = c(2, 2))  # Arrange plots in a 2x2 grid
plot(lm_model_refined)

Creating scatter plots will help to visualize the relationships between:

  1. blotch (dependent variable) and blotch2 (the strongest predictor).
  2. blotch and air (weaker predictor)
  3. blotch and water (weaker predictor)
#  1: Scatter plot: blotch vs blotch2
ggplot(sharks_combined_clean, aes(x = blotch2, y = blotch)) +
  geom_point(color = "blue", alpha = 0.7) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  theme_minimal() +
  labs(title = "Relationship Between Blotch and Blotch2",
       x = "Blotch2",
       y = "Blotch")

# 2: Scatter plot: blotch vs air
ggplot(sharks_combined_clean, aes(x = air, y = blotch)) +
  geom_point(color = "green", alpha = 0.7) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  theme_minimal() +
  labs(title = "Relationship Between Blotch and Air",
       x = "Air Exposure Time",
       y = "Blotch")

# 3: Scatter plot: blotch vs water
ggplot(sharks_combined_clean, aes(x = water, y = blotch)) +
  geom_point(color = "orange", alpha = 0.7) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  theme_minimal() +
  labs(title = "Relationship Between Blotch and Water",
       x = "Water Exposure Time",
       y = "Blotch")

In order to asses how well this model predicts blotching in Sharks, additional editing to help refine the model must be made:

# Check the summary of the model
summary(lm_model_refined)

Call:
lm(formula = blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.30521 -0.20611 -0.10466  0.05488  1.12037 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.24706    2.32427   1.397   0.1693    
air          0.01211    0.03283   0.369   0.7139    
water       -0.05133    0.02989  -1.717   0.0928 .  
depth        0.03305    0.03762   0.879   0.3842    
blotch2      0.85877    0.05038  17.044   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.356 on 45 degrees of freedom
  (450 observations deleted due to missingness)
Multiple R-squared:  0.9031,    Adjusted R-squared:  0.8945 
F-statistic: 104.9 on 4 and 45 DF,  p-value: < 2.2e-16
# Refit the model to ensure alignment
lm_model_refined <- lm(blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)

# Test predictions
head(predict(lm_model_refined, newdata = sharks_combined_clean))
       1        2        3        4        5        6 
      NA       NA       NA 34.81715       NA       NA 
# Test residuals
head(residuals(lm_model_refined))
          4           8          29          49          59          70 
 0.52166195  0.71222237 -0.11611701  0.07949033  0.03902610 -0.09545302 

Through double checking where and what the missing values were are found:

# Check for missing values in predictors
colSums(is.na(sharks_combined_clean[, c("air", "water", "depth","blotch2")]))
    air   water   depth blotch2 
      0       0       0     450 
# Remove rows with missing blotch2
sharks_combined_clean <- sharks_combined_clean %>%
  drop_na(blotch2)

# Refit the regression model
lm_model_refined <- lm(blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)

# Check number of rows remaining
nrow(sharks_combined_clean)
[1] 50

There were 450 missing values in blotch2, reducing the amount of blotch2 to the amount available reduced it to 50. As this is not enough data to be able to make accurate predictions, we can use the mean value to fill in the missing values:

# Impute missing values in blotch2 with the mean
sharks_combined_clean$blotch2[is.na(sharks_combined_clean$blotch2)] <- 
  mean(sharks_combined_clean$blotch2, na.rm = TRUE)

# Check for missing values again
colSums(is.na(sharks_combined_clean))
     ID   sex.x  blotch     BPM  weight  length     air   water    meta   depth 
      0       0       0       0       0       0       0       0       0       0 
  sex.y blotch2 
      0       0 

Now all the blotch2 rows have valid values we can refit the model.

# Refit the regression model
lm_model_refined <- lm(blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)

# Summarize the updated model
summary(lm_model_refined)

Call:
lm(formula = blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.30521 -0.20611 -0.10466  0.05488  1.12037 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.24706    2.32427   1.397   0.1693    
air          0.01211    0.03283   0.369   0.7139    
water       -0.05133    0.02989  -1.717   0.0928 .  
depth        0.03305    0.03762   0.879   0.3842    
blotch2      0.85877    0.05038  17.044   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.356 on 45 degrees of freedom
Multiple R-squared:  0.9031,    Adjusted R-squared:  0.8945 
F-statistic: 104.9 on 4 and 45 DF,  p-value: < 2.2e-16

After fitting, we re-run the predict() function

# Generate predicted values
sharks_combined_clean <- sharks_combined_clean %>%
  mutate(
    predicted_blotch = predict(lm_model_refined, newdata = sharks_combined_clean),
    residuals = residuals(lm_model_refined)
  )

# Check the first few predictions
head(sharks_combined_clean$predicted_blotch)
       1        2        3        4        5        6 
34.81715 35.58275 35.29387 35.46906 36.46098 32.95503 

And now finally allowing us to generate a Residual vs Predict plot

# Residuals vs Predicted Plot
ggplot(sharks_combined_clean, aes(x = predicted_blotch, y = residuals)) +
  geom_point(alpha = 0.7, color = "purple") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  theme_minimal() +
  labs(title = "Residuals vs Predicted Values",
       x = "Predicted Blotch",
       y = "Residuals")

Additional analysis of data - Male vs Female blotching

# Boxplot for blotching by sex
ggplot(sharks_combined_clean, aes(x = sex.x, y = blotch, fill = sex.x)) +
  geom_boxplot(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Blotching by Sex",
       x = "Sex (Male/Female)",
       y = "Blotch")

# Perform t-test for blotching between sexes
t_test <- t.test(blotch ~ sex.x, data = sharks_combined_clean)
t_test

    Welch Two Sample t-test

data:  blotch by sex.x
t = -1.4629, df = 47.997, p-value = 0.15
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
 -1.0644425  0.1678337
sample estimates:
mean in group Female   mean in group Male 
            34.80627             35.25457