# Import the libraries that are needed
library(readxl) # For reading Excel files
library(tidyverse) # For data manipulation and visualization (ggplot2, dplyr, etc.)
library(ggplot2) # For visualization (part of tidyverse)
library(dplyr) # For data manipulation (part of tidyverse)
library(tidyr) # For data reshaping (part of tidyverse)
# For statistical analysis
library(stats) # For basic statistical functions (built-in to R)
library(car) # For additional statistical tools
# Data cleaning
library(janitor) # For cleaning data
library(lubridate) # For date handling (if needed)
# Interactive plots
library(plotly) # For interactive plots
# Reporting tools
library(rmarkdown) # For rendering reports (Quarto uses this in the background)
library(knitr) # For chunk processing in QuartoAssessing Shark Welfare
Introduction
This work book looks into the relationship between stress indicators in sharks. Through analyzing the data based on blotching, gathered when capturing sharks, will allow for evaluation of any correlation between stress and capture.
Preparing for Data input
Uploading and Summary of Data set 1 “Sharks”
# providing the correct path to the file
sharks <- read_excel("/Users/megarawelaratne/Desktop/sharks.xlsx")
head(sharks) # Shows a preview (the first 6 rows) # A tibble: 6 × 10
ID sex blotch BPM weight length air water meta depth
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH001 Female 37.2 148 74.7 187. 37.7 23.4 64.1 53.2
2 SH002 Female 34.5 158 73.4 189. 35.7 21.4 73.7 49.6
3 SH003 Female 36.3 125 71.8 284. 34.8 20.1 54.4 49.4
4 SH004 Male 35.3 161 105. 171. 36.2 21.6 86.3 50.3
5 SH005 Female 37.4 138 67.1 264. 33.6 21.8 108. 49.0
6 SH006 Male 33.5 126 110. 270. 36.4 20.9 109. 46.8
summary(sharks) # Show a summary of the data set ID sex blotch BPM
Length:500 Length:500 Min. :30.78 Min. :119.0
Class :character Class :character 1st Qu.:34.16 1st Qu.:129.0
Mode :character Mode :character Median :35.05 Median :142.0
Mean :35.13 Mean :141.8
3rd Qu.:36.05 3rd Qu.:153.2
Max. :40.08 Max. :166.0
weight length air water
Min. : 65.10 Min. :128.3 Min. :33.00 Min. :20.01
1st Qu.: 75.68 1st Qu.:172.0 1st Qu.:34.42 1st Qu.:21.55
Median : 87.82 Median :211.1 Median :35.43 Median :23.11
Mean : 87.94 Mean :211.0 Mean :35.54 Mean :23.02
3rd Qu.:100.40 3rd Qu.:251.8 3rd Qu.:36.71 3rd Qu.:24.37
Max. :110.94 Max. :291.0 Max. :38.00 Max. :25.99
meta depth
Min. : 50.03 Min. :44.64
1st Qu.: 67.39 1st Qu.:48.90
Median : 82.45 Median :50.14
Mean : 82.04 Mean :50.14
3rd Qu.: 95.97 3rd Qu.:51.35
Max. :112.45 Max. :56.83
Uploading and Symmary of Data set 2 “Sharkssub”
# providing the correct path to the file
sharksub <- read_excel("/Users/megarawelaratne/Desktop/sharksub.xlsx")
head(sharksub) # Shows a preview (first 6 rows)# A tibble: 6 × 4
ID sex blotch1 blotch2
<chr> <chr> <dbl> <dbl>
1 SH269 Female 36.1 37.2
2 SH163 Female 33.4 34.4
3 SH008 Female 36.3 36.5
4 SH239 Female 35.0 36.0
5 SH332 Female 35.7 36.8
6 SH328 Female 34.9 35.9
summary(sharksub) # Shows a summary of the data set ID sex blotch1 blotch2
Length:50 Length:50 Min. :32.49 Min. :33.47
Class :character Class :character 1st Qu.:34.38 1st Qu.:35.31
Mode :character Mode :character Median :34.94 Median :35.94
Mean :35.03 Mean :35.96
3rd Qu.:35.90 3rd Qu.:36.78
Max. :37.07 Max. :38.18
Statistical and Visual Analysis for each Question
Q1: Correlation Between Air and Water (Dataset 1)
# Creating a Scatter plot of air vs water
ggplot(sharks, aes(x = air, y = water)) +
geom_point() +
geom_smooth(method = "lm", color = "red") +
theme_minimal() +
labs(title = "Scatter Plot of Air vs Water", x = "Air (time)", y = "Water (time)")# Pearson correlation test
cor_test <- cor.test(sharks$air, sharks$water, method = "pearson")
cor_test
Pearson's product-moment correlation
data: sharks$air and sharks$water
t = -1.2346, df = 498, p-value = 0.2176
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.14224207 0.03260803
sample estimates:
cor
-0.05524051
Q2: Does Multiple Capture Affect Blotching Time (Dataset 2)
# Checking the structure of blotch1 and blotch2
str(sharksub)tibble [50 × 4] (S3: tbl_df/tbl/data.frame)
$ ID : chr [1:50] "SH269" "SH163" "SH008" "SH239" ...
$ sex : chr [1:50] "Female" "Female" "Female" "Female" ...
$ blotch1: num [1:50] 36.1 33.4 36.3 35 35.7 ...
$ blotch2: num [1:50] 37.2 34.4 36.5 36 36.8 ...
# The Paired t-test
t_test_paired <- t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
t_test_paired
Paired t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.037176 -0.822301
sample estimates:
mean difference
-0.9297384
# Reshaping the data to long format for visualization
sharksub_long <- sharksub %>%
pivot_longer(cols = c(blotch1, blotch2),
names_to = "Capture",
values_to = "Blotch_Time")
# Creating a box plot
ggplot(sharksub_long, aes(x = Capture, y = Blotch_Time, fill = Capture)) +
geom_boxplot(alpha = 0.7) +
theme_minimal() +
labs(title = "Blotching Time Across Captures",
x = "Capture Event",
y = "Blotching Time")Q3: Is it possible to predict Blotching Time (Dataset 1 & 2)
Only using Data set 1
# Fit a linear regression model to predict blotch
lm_model <- lm(blotch ~ air + water + depth, data = sharks)
# Summarize the model
summary(lm_model)
Call:
lm(formula = blotch ~ air + water + depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.89045 -0.65406 0.00093 0.60311 2.76805
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.42834 1.74348 6.555 1.4e-10 ***
air -0.03048 0.03143 -0.970 0.333
water -0.02066 0.02687 -0.769 0.442
depth 0.50372 0.02219 22.695 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.001 on 496 degrees of freedom
Multiple R-squared: 0.5115, Adjusted R-squared: 0.5086
F-statistic: 173.2 on 3 and 496 DF, p-value: < 2.2e-16
Only using data set 1 showed that using only water and air as factors in predicting blotching time was not enough but depth added a significant difference. However, merging both data sets will allow for more variables to contribute to the model such as blotch2.
# Merge datasets to see if more predications are available
sharks_combined <- sharks %>%
left_join(sharksub, by = "ID")
# Fit a regression model using variables from both datasets
lm_model_combined <- lm(blotch ~ air + water + depth + blotch1 + blotch2,
data = sharks_combined)
summary(lm_model_combined)
Call:
lm(formula = blotch ~ air + water + depth + blotch1 + blotch2,
data = sharks_combined)
Residuals:
Min 1Q Median 3Q Max
-4.460e-15 -1.307e-16 8.440e-17 3.276e-16 1.208e-15
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.680e-14 5.464e-15 3.075e+00 0.00361 **
air -1.435e-17 7.568e-17 -1.900e-01 0.85043
water -1.017e-16 7.102e-17 -1.432e+00 0.15922
depth 2.654e-16 8.731e-17 3.040e+00 0.00398 **
blotch1 1.000e+00 3.431e-16 2.915e+15 < 2e-16 ***
blotch2 -2.725e-17 3.166e-16 -8.600e-02 0.93182
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.192e-16 on 44 degrees of freedom
(450 observations deleted due to missingness)
Multiple R-squared: 1, Adjusted R-squared: 1
F-statistic: 1.754e+31 on 5 and 44 DF, p-value: < 2.2e-16
# Check correlations between blotch, blotch1, and blotch2
cor(sharks_combined %>% select(blotch, blotch1, blotch2), use = "complete.obs") blotch blotch1 blotch2
blotch 1.0000000 1.0000000 0.9456842
blotch1 1.0000000 1.0000000 0.9456842
blotch2 0.9456842 0.9456842 1.0000000
blotch1 is redundant because it is identical to blotch. Including it in the regression model will lead to over fitting and meaningless results.Blotch2 is highly correlating with blotch but not identical, meaning it can remain in the model.
# Remove blotch1 from the dataset
sharks_combined_clean <- sharks_combined %>%
select(-blotch1)
# Fit the refined regression model
lm_model_refined <- lm(blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)
summary(lm_model_refined)
Call:
lm(formula = blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)
Residuals:
Min 1Q Median 3Q Max
-0.30521 -0.20611 -0.10466 0.05488 1.12037
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.24706 2.32427 1.397 0.1693
air 0.01211 0.03283 0.369 0.7139
water -0.05133 0.02989 -1.717 0.0928 .
depth 0.03305 0.03762 0.879 0.3842
blotch2 0.85877 0.05038 17.044 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.356 on 45 degrees of freedom
(450 observations deleted due to missingness)
Multiple R-squared: 0.9031, Adjusted R-squared: 0.8945
F-statistic: 104.9 on 4 and 45 DF, p-value: < 2.2e-16
The refined regression model using air, water, depth and blotch2 provides much more interpretable results compared to the previous versions.
Creating Diagnostic plots for Visualization of the model
# Create diagnostic plots for the regression model
par(mfrow = c(2, 2)) # Arrange plots in a 2x2 grid
plot(lm_model_refined)Creating scatter plots will help to visualize the relationships between:
- blotch (dependent variable) and blotch2 (the strongest predictor).
- blotch and air (weaker predictor)
- blotch and water (weaker predictor)
# 1: Scatter plot: blotch vs blotch2
ggplot(sharks_combined_clean, aes(x = blotch2, y = blotch)) +
geom_point(color = "blue", alpha = 0.7) +
geom_smooth(method = "lm", color = "red", se = TRUE) +
theme_minimal() +
labs(title = "Relationship Between Blotch and Blotch2",
x = "Blotch2",
y = "Blotch")# 2: Scatter plot: blotch vs air
ggplot(sharks_combined_clean, aes(x = air, y = blotch)) +
geom_point(color = "green", alpha = 0.7) +
geom_smooth(method = "lm", color = "red", se = TRUE) +
theme_minimal() +
labs(title = "Relationship Between Blotch and Air",
x = "Air Exposure Time",
y = "Blotch")# 3: Scatter plot: blotch vs water
ggplot(sharks_combined_clean, aes(x = water, y = blotch)) +
geom_point(color = "orange", alpha = 0.7) +
geom_smooth(method = "lm", color = "red", se = TRUE) +
theme_minimal() +
labs(title = "Relationship Between Blotch and Water",
x = "Water Exposure Time",
y = "Blotch")In order to asses how well this model predicts blotching in Sharks, additional editing to help refine the model must be made:
# Check the summary of the model
summary(lm_model_refined)
Call:
lm(formula = blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)
Residuals:
Min 1Q Median 3Q Max
-0.30521 -0.20611 -0.10466 0.05488 1.12037
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.24706 2.32427 1.397 0.1693
air 0.01211 0.03283 0.369 0.7139
water -0.05133 0.02989 -1.717 0.0928 .
depth 0.03305 0.03762 0.879 0.3842
blotch2 0.85877 0.05038 17.044 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.356 on 45 degrees of freedom
(450 observations deleted due to missingness)
Multiple R-squared: 0.9031, Adjusted R-squared: 0.8945
F-statistic: 104.9 on 4 and 45 DF, p-value: < 2.2e-16
# Refit the model to ensure alignment
lm_model_refined <- lm(blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)
# Test predictions
head(predict(lm_model_refined, newdata = sharks_combined_clean)) 1 2 3 4 5 6
NA NA NA 34.81715 NA NA
# Test residuals
head(residuals(lm_model_refined)) 4 8 29 49 59 70
0.52166195 0.71222237 -0.11611701 0.07949033 0.03902610 -0.09545302
Through double checking where and what the missing values were are found:
# Check for missing values in predictors
colSums(is.na(sharks_combined_clean[, c("air", "water", "depth","blotch2")])) air water depth blotch2
0 0 0 450
# Remove rows with missing blotch2
sharks_combined_clean <- sharks_combined_clean %>%
drop_na(blotch2)
# Refit the regression model
lm_model_refined <- lm(blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)
# Check number of rows remaining
nrow(sharks_combined_clean)[1] 50
There were 450 missing values in blotch2, reducing the amount of blotch2 to the amount available reduced it to 50. As this is not enough data to be able to make accurate predictions, we can use the mean value to fill in the missing values:
# Impute missing values in blotch2 with the mean
sharks_combined_clean$blotch2[is.na(sharks_combined_clean$blotch2)] <-
mean(sharks_combined_clean$blotch2, na.rm = TRUE)
# Check for missing values again
colSums(is.na(sharks_combined_clean)) ID sex.x blotch BPM weight length air water meta depth
0 0 0 0 0 0 0 0 0 0
sex.y blotch2
0 0
Now all the blotch2 rows have valid values we can refit the model.
# Refit the regression model
lm_model_refined <- lm(blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)
# Summarize the updated model
summary(lm_model_refined)
Call:
lm(formula = blotch ~ air + water + depth + blotch2, data = sharks_combined_clean)
Residuals:
Min 1Q Median 3Q Max
-0.30521 -0.20611 -0.10466 0.05488 1.12037
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.24706 2.32427 1.397 0.1693
air 0.01211 0.03283 0.369 0.7139
water -0.05133 0.02989 -1.717 0.0928 .
depth 0.03305 0.03762 0.879 0.3842
blotch2 0.85877 0.05038 17.044 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.356 on 45 degrees of freedom
Multiple R-squared: 0.9031, Adjusted R-squared: 0.8945
F-statistic: 104.9 on 4 and 45 DF, p-value: < 2.2e-16
After fitting, we re-run the predict() function
# Generate predicted values
sharks_combined_clean <- sharks_combined_clean %>%
mutate(
predicted_blotch = predict(lm_model_refined, newdata = sharks_combined_clean),
residuals = residuals(lm_model_refined)
)
# Check the first few predictions
head(sharks_combined_clean$predicted_blotch) 1 2 3 4 5 6
34.81715 35.58275 35.29387 35.46906 36.46098 32.95503
And now finally allowing us to generate a Residual vs Predict plot
# Residuals vs Predicted Plot
ggplot(sharks_combined_clean, aes(x = predicted_blotch, y = residuals)) +
geom_point(alpha = 0.7, color = "purple") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
theme_minimal() +
labs(title = "Residuals vs Predicted Values",
x = "Predicted Blotch",
y = "Residuals")Additional analysis of data - Male vs Female blotching
# Boxplot for blotching by sex
ggplot(sharks_combined_clean, aes(x = sex.x, y = blotch, fill = sex.x)) +
geom_boxplot(alpha = 0.7) +
theme_minimal() +
labs(title = "Blotching by Sex",
x = "Sex (Male/Female)",
y = "Blotch")# Perform t-test for blotching between sexes
t_test <- t.test(blotch ~ sex.x, data = sharks_combined_clean)
t_test
Welch Two Sample t-test
data: blotch by sex.x
t = -1.4629, df = 47.997, p-value = 0.15
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
-1.0644425 0.1678337
sample estimates:
mean in group Female mean in group Male
34.80627 35.25457