sharks project

Sharks data

Loading in the data

rmarkdown::pandoc_version()
[1] '3.2'
tempdir <- tempdir()
knitr::opts_chunk$set(output_dir = tempdir)


 
options(repos = c(CRAN = "https://cloud.r-project.org"))

warning = FALSE
errors = FALSE

# loading some packages

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(openxlsx)
Warning: package 'openxlsx' was built under R version 4.4.2
library(readxl)
Warning: package 'readxl' was built under R version 4.4.2
# getting the data into R studio


 sharks <- read_excel("C:/Users/rosie/OneDrive/Documents/sharks.xlsx")
View(sharks)

# Running a random mean to check data has been read in correctly

mean(sharks$length)
[1] 211.0442
# checking for outliers in blotching time


# Identify outliers based on IQR
Q1 <- quantile(sharks$blotch, 0.25)  # First quartile (25th percentile)
Q3 <- quantile(sharks$blotch, 0.75)  # Third quartile (75th percentile)
IQR_value <- Q3 - Q1                 # Interquartile range (IQR)

# Outlier threshold
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Identify outliers
outliers <- which(sharks$blotch < lower_bound | sharks$blotch > upper_bound)

# Print the outliers
cat("Outliers in blotching time:\n")
Outliers in blotching time:
print(sharks$blotch[outliers])
[1] 39.83638 39.13972 30.77585 40.08356

Air v water correlation:

Plotting a scatter graph

# Step 1: Create the scatter plot
plot(sharks$air, sharks$water, 
     main = "Air vs Water Temperature",
     xlab = "Air Temperature (°C)", 
     ylab = "Water Temperature (°C)", 
     pch = 16,   # Points type, solid circle
     col = "grey")  # Color of the points

# Step 2: Fit a linear model (line of best fit)
model <- lm(sharks$water ~ sharks$air)

# Step 3: Add the line of best fit to the scatter plot
abline(model, col = "black", lwd = 2)  
# Black line with line width of 2

# Add grid lines
grid(col = "gray", lty = "dotted", lwd = 1)

# Step 4: Calculate residuals and Cook's distance
residuals <- resid(model) 
# Residuals
cooks_dist <- cooks.distance(model) 
# Cook's distance

# Step 5: Identify potential outliers
outliers <- which(cooks_dist > (4 / length(cooks_dist)))  
# Cook's distance threshold

# Step 6: Highlight outliers on the plot
points(sharks$air[outliers], sharks$water[outliers], 
       pch = 16, col = "red")  

       # Red points for outliers


# Step 7: Print the identified outliers
cat("Potential outliers based on Cook's distance:\n")
Potential outliers based on Cook's distance:
print(outliers)
 12  69  70 171 191 211 270 326 391 419 421 
 12  69  70 171 191 211 270 326 391 419 421 

Pearson’s correlation coefficient for air and water

# Identify and remove outliers using the IQR method
remove_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)  # First quartile
  Q3 <- quantile(x, 0.75, na.rm = TRUE)  # Third quartile
  IQR <- Q3 - Q1                         # Interquartile range
  lower_bound <- Q1 - 1.5 * IQR          # Lower bound
  upper_bound <- Q3 + 1.5 * IQR          # Upper bound
  x[x >= lower_bound & x <= upper_bound] # Return non-outliers
}

# Clean the data by removing outliers
sharks_clean <- sharks %>%
  dplyr::filter(
    air %in% remove_outliers(air) &
    water %in% remove_outliers(water)
  )

# Perform correlation analysis using complete observations
correlation <- cor(sharks_clean$air, sharks_clean$water, method = "pearson", use = "complete.obs")

# Print correlation value
print(correlation)
[1] -0.05524051
# Perform the hypothesis test for Pearson correlation
cor_test <- cor.test(sharks_clean$air, sharks_clean$water, method = "pearson")

# Print the result of the test
print(cor_test)

    Pearson's product-moment correlation

data:  sharks_clean$air and sharks_clean$water
t = -1.2346, df = 498, p-value = 0.2176
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.14224207  0.03260803
sample estimates:
        cor 
-0.05524051 

Effects of recapture on blotching time

# downoading subshark data



sharksub <- read_excel("C:/Users/rosie/Downloads/sharksub.xlsx")

View(sharksub)

mean(sharksub$blotch1)
[1] 35.03042
mean(sharksub$blotch2)
[1] 35.96016

Performing a paired t-test as the data is related

# performing a t-test

t_test_result <- t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)


# View results
print(t_test_result)

    Paired t-test

data:  sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384 

Visually analysing difference in blotch 1 and blotch 2

# Boxplot for time of blotching in blotch 1

1
[1] 1
boxplot(sharksub$blotch1,
        main = "Blotch 1",
        ylab = "Blotch Time (seconds)",
        col = "lightblue",
        border = "darkblue")

# Boxplot of blotching time in blotch 2
boxplot(sharksub$blotch2,
        main = "Blotch 2",
        ylab = "Blotch Time (seconds)",
        col = "lightblue",
        border = "darkblue")

# Create a data frame with blotch times for grouping
blotch_data <- data.frame(
  Time = c(sharksub$blotch1, sharksub$blotch2),
  Capture = rep(c("First Capture", "Second Capture"), each = length(sharksub$blotch1))
)

# Create the box plot
boxplot(Time ~ Capture, data = blotch_data,
        main = "Blotching Times - Blotch 1 and 2",
        ylab = "Blotch Time (Seconds)",
        col = c("blue", "green"))

Linear regression models

options(repos = c(CRAN = "https://cloud.r-project.org"))


# Depth vs blotching time
  


model <- lm(blotch ~ depth, data = sharks)


summary(model)

Call:
lm(formula = blotch ~ depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.81869 -0.65427 -0.01035  0.58825  2.83116 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.82178    1.11207   8.832   <2e-16 ***
depth        0.50467    0.02216  22.772   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared:  0.5101,    Adjusted R-squared:  0.5091 
F-statistic: 518.6 on 1 and 498 DF,  p-value: < 2.2e-16
# visualising this

# Scatter plot
plot(sharks$depth, sharks$blotch, 
     main = "Depth vs Blotching Time", 
     xlab = "Depth (m)", 
     ylab = "Blotching Time (s)", 
     pch = 16, col = "blue")

# Add the regression line
abline(model, col = "red", lwd = 2)

# Generate predictions with a confidence interval
predictions <- predict(model, interval = "confidence", level = 0.95)

# Add the buffer (confidence interval) around the line
lines(sharks$depth, predictions[,2], col = "green", lty = 2) # Lower bound
lines(sharks$depth, predictions[,3], col = "green", lty = 2) # Upper bound

# Diagnostic plots
par(mfrow = c(2, 2)) 
# Arrange plots in a 2x2 grid
plot(model)

Weight vs blotching time

model <- lm(blotch ~ weight, data = sharks)


summary(model)

Call:
lm(formula = blotch ~ weight, data = sharks)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3702 -0.9600 -0.0687  0.9328  4.9357 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 3.504e+01  4.227e-01  82.887   <2e-16 ***
weight      9.795e-04  4.752e-03   0.206    0.837    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.429 on 498 degrees of freedom
Multiple R-squared:  8.531e-05, Adjusted R-squared:  -0.001923 
F-statistic: 0.04249 on 1 and 498 DF,  p-value: 0.8368
# visualising this

# Scatter plot
plot(sharks$weight, sharks$blotch, 
     main = "Weight vs Blotching Time", 
     xlab = " Weight (Kg)", 
     ylab = "Blotching time (S)", 
     pch = 16, col = "blue")

# Add the regression line
abline(model, col = "red", lwd = 2)

# Generate predictions with a confidence interval
predictions <- predict(model, interval = "confidence", level = 0.95)

# Add the buffer (confidence interval) around the line
lines(sharks$weight, predictions[,2], col = "green", lty = 2) # Lower bound
lines(sharks$weight, predictions[,3], col = "green", lty = 2) # Upper bound

Length and blotting time

model <- lm(blotch ~ length, data = sharks)


summary(model)

Call:
lm(formula = blotch ~ length, data = sharks)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3280 -0.9663 -0.0678  0.9117  4.9545 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.2312817  0.2965354 118.810   <2e-16 ***
length      -0.0005017  0.0013721  -0.366    0.715    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.429 on 498 degrees of freedom
Multiple R-squared:  0.0002684, Adjusted R-squared:  -0.001739 
F-statistic: 0.1337 on 1 and 498 DF,  p-value: 0.7148
# visualising this

# Scatter plot
plot(sharks$length, sharks$blotch, 
     main = "Length vs Blotching Time", 
     xlab = " Length (cm)", 
     ylab = "Blotching time (s)", 
     pch = 16, col = "blue")

# Add the regression line
abline(model, col = "red", lwd = 2)

# Generate predictions with a confidence interval
predictions <- predict(model, interval = "confidence", level = 0.95)

# Add the buffer (confidence interval) around the line
lines(sharks$length, predictions[,2], col = "green", lty = 2) # Lower bound
lines(sharks$length, predictions[,3], col = "green", lty = 2) # Upper bound

BPM and blotching time

model <- lm(blotch ~ BPM, data = sharks)


summary(model)

Call:
lm(formula = blotch ~ BPM, data = sharks)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3282 -0.9492 -0.0876  0.8940  4.9559 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.544599   0.644079  55.187   <2e-16 ***
BPM         -0.002957   0.004521  -0.654    0.513    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.428 on 498 degrees of freedom
Multiple R-squared:  0.0008583, Adjusted R-squared:  -0.001148 
F-statistic: 0.4278 on 1 and 498 DF,  p-value: 0.5134
# visualising this

# Scatter plot
plot(sharks$length, sharks$blotch, 
     main = "BPM vs Blotching Time", 
     xlab = "BPM", 
     ylab = "Blotching time(s)", 
     pch = 16, col = "blue")

# Add the regression line
abline(model, col = "red", lwd = 2)

# Generate predictions with a confidence interval
predictions <- predict(model, interval = "confidence", level = 0.95)

# Add the buffer (confidence interval) around the line
lines(sharks$BPM, predictions[,2], col = "green", lty = 2) # Lower bound
lines(sharks$BPM, predictions[,3], col = "green", lty = 2) # Upper bound

Meta and blotching time

model <- lm(blotch ~ meta , data = sharks)


summary(model)

Call:
lm(formula = blotch ~ meta, data = sharks)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3635 -0.9639 -0.0793  0.9275  4.9411 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.1892930  0.3075967 114.401   <2e-16 ***
meta        -0.0007787  0.0036674  -0.212    0.832    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.429 on 498 degrees of freedom
Multiple R-squared:  9.051e-05, Adjusted R-squared:  -0.001917 
F-statistic: 0.04508 on 1 and 498 DF,  p-value: 0.8319
# visualising this

# Scatter plot
plot(sharks$length, sharks$blotch, 
     main = "Meta vs Blotching Time", 
     xlab = "Meta (mcg/dl)", 
     ylab = "Blotching time(s)", 
     pch = 16, col = "blue")

# Add the regression line
abline(model, col = "red", lwd = 2)

# Generate predictions with a confidence interval
predictions <- predict(model, interval = "confidence", level = 0.95)

# Add the buffer (confidence interval) around the line
lines(sharks$meta, predictions[,2], col = "green", lty = 2) # Lower bound
lines(sharks$meta, predictions[,3], col = "green", lty = 2) # Upper bound

Water and blotching time

model <- lm(blotch ~ water , data = sharks)


summary(model)

Call:
lm(formula = blotch ~ water, data = sharks)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3051 -0.9629 -0.0495  0.9471  5.0216 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 36.14125    0.88240  40.958   <2e-16 ***
water       -0.04413    0.03823  -1.154    0.249    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.427 on 498 degrees of freedom
Multiple R-squared:  0.002668,  Adjusted R-squared:  0.0006654 
F-statistic: 1.332 on 1 and 498 DF,  p-value: 0.249
# visualising this 

# Scatter plot
plot(sharks$water, sharks$blotch, 
     main = "Water temperature vs Blotching Time", 
     xlab = "Water temperature(C)", 
     ylab = "Blotching time(s)", 
     pch = 16, col = "blue")

# Add the regression line
abline(model, col = "red", lwd = 2)

# Generate predictions with a confidence interval
predictions <- predict(model, interval = "confidence", level = 0.95)

# Add the buffer (confidence interval) around the line
lines(sharks$water, predictions[,2], col = "green", lty = 2) # Lower bound
lines(sharks$water, predictions[,3], col = "green", lty = 2) # Upper bound

Air and blotching time

model <- lm(blotch ~ air , data = sharks)


summary(model)

Call:
lm(formula = blotch ~ air, data = sharks)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3900 -0.9459 -0.0506  0.9423  4.9134 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 36.46184    1.59219   22.90   <2e-16 ***
air         -0.03761    0.04477   -0.84    0.401    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.428 on 498 degrees of freedom
Multiple R-squared:  0.001415,  Adjusted R-squared:  -0.0005902 
F-statistic: 0.7057 on 1 and 498 DF,  p-value: 0.4013
# visualising this

# Scatter plot
plot(sharks$air, sharks$blotch, 
     main = "Air temperature vs Blotching Time", 
     xlab = "Air temperature(C)", 
     ylab = "Blotching time(s)", 
     pch = 16, col = "blue")

# Add the regression line
abline(model, col = "red", lwd = 2)

# Generate predictions with a confidence interval
predictions <- predict(model, interval = "confidence", level = 0.95)

# Add the buffer (confidence interval) around the line
lines(sharks$air, predictions[,2], col = "green", lty = 2) # Lower bound
lines(sharks$air, predictions[,3], col = "green", lty = 2) # Upper bound

Sex and blotching time

# checking values in sex column

table(sharks$sex)

Female   Male 
   236    264 
# convert sex to a factor

sharks$sex <- as.factor(sharks$sex)

# fit the linear model 

model_sex <- lm(blotch ~ sex, data = sharks)
summary(model_sex)

Call:
lm(formula = blotch ~ sex, data = sharks)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.1471 -0.9653 -0.0222  0.9889  4.7772 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 34.92294    0.09217 378.885  < 2e-16 ***
sexMale      0.38347    0.12685   3.023  0.00263 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.416 on 498 degrees of freedom
Multiple R-squared:  0.01802,   Adjusted R-squared:  0.01605 
F-statistic: 9.139 on 1 and 498 DF,  p-value: 0.002632
# Load the necessary library
library(ggplot2)

# Create a data frame with the regression results
sex_effect <- data.frame(
  Sex = c("Female", "Male"),
  Blotching = c(34.92, 34.92 + 0.38347) # Intercept and intercept + male coefficient
)

# Plot the bar chart
ggplot(sex_effect, aes(x = Sex, y = Blotching, fill = Sex)) +
  geom_bar(stat = "identity", color = "black", width = 0.6) + 
  geom_text(aes(label = round(Blotching, 2)), vjust = -0.5, size = 5) +
  labs(
    x = "Sex",
    y = "Mean Blotching Level"
  ) +
  scale_fill_manual(values = c("Female" = "pink", "Male" = "blue")) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, size = 16),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  ) +
  # Adjust y-axis limits and add padding
  ylim(0, max(sex_effect$Blotching) + 1) +
  coord_cartesian(clip = "off") # Ensure elements outside the plotting area are shown