Sharks Data Analysis

Author

Ellelouise Bates

Published

January 24, 2025

library(tidyverse)
library(ggplot2)
library(graphics)
library(readxl)
library(broom)

Statistical Analyses for Research Methods & Data Analysis Summative

Question 1: Is there a correlation between the variables air and water?

# Load the main shark dataset (path is relative to the working directory).
sharks <- read_xlsx("sharks.xlsx") 


# Shapiro-Wilk test on the 'air' column. The null hypothesis is that the
# values are normally distributed, so p < 0.05 indicates non-normality.
shapiro.test(sharks$air)


    Shapiro-Wilk normality test

data:  sharks$air
W = 0.95885, p-value = 1.338e-10

# Shapiro-Wilk test on the 'water' column (p < 0.05 indicates non-normality).
shapiro.test(sharks$water)


    Shapiro-Wilk normality test

data:  sharks$water
W = 0.96035, p-value = 2.371e-10

# Both variables are continuous (quantitative) and neither is normally
# distributed (both Shapiro-Wilk p-values are < 0.05), so Spearman's rank
# correlation is the appropriate test.

cor.test(sharks$air, sharks$water, method = "spearman")


    Spearman's rank correlation rho

data:  sharks$air and sharks$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.05637344

# Scatter plot of surface water temperature against ambient air temperature.
# The straight line is a least-squares fit added as a visual guide only; the
# association itself is tested with Spearman's rank correlation.
ggplot(sharks, aes(x = air, y = water)) +
  geom_point() +
  geom_smooth(method = "lm") + # line of best fit
  labs(
    title = "Correlation Between Ambient Air Temperature and Surface Water Temperature at Time of Processing",
    x = "Air Temperature (°C)",
    y = "Water Temperature (°C)"
  ) +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

Question 2: Does multiple capture have an effect on blotching time? (How does blotch time differ between the first and second capture?)

# Load the recapture subset (path is relative to the working directory).
sharksub <- read_xlsx("sharksub.xlsx") 


# Shapiro-Wilk test on the 'blotch1' column (p > 0.05 means normality is
# not rejected).
shapiro.test(sharksub$blotch1)


    Shapiro-Wilk normality test

data:  sharksub$blotch1
W = 0.97958, p-value = 0.5345

# Shapiro-Wilk test on the 'blotch2' column (p > 0.05 means normality is
# not rejected).
shapiro.test(sharksub$blotch2)


    Shapiro-Wilk normality test

data:  sharksub$blotch2
W = 0.97936, p-value = 0.5255

# Extracting the blotch times for the two captures (blotch1 and blotch2)
group1 <- sharksub$blotch1
group2 <- sharksub$blotch2

# F-test for equal variances between the two captures.
# NOTE(review): a paired t-test works on the within-pair differences and does
# not itself require equal variances - confirm whether this check is needed.
var.test(group1, group2)


    F test to compare two variances

data:  group1 and group2
F = 0.88829, num df = 49, denom df = 49, p-value = 0.6801
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.5040845 1.5653379
sample estimates:
ratio of variances 
          0.888292

# Both variables are normally distributed (both Shapiro-Wilk p-values are
# > 0.05) and have equal variance (F-test p-value = 0.6801). The predictor
# variable (capture) is categorical with two groups (blotch1 and blotch2) and
# the outcome variable (time taken for blotching to cover 30% of the ventral
# surface) is quantitative, so a paired t-test is the appropriate test for
# comparing blotch time between the 1st and 2nd captures.
# NOTE(review): the paired t-test assumes the within-pair differences are
# normally distributed - consider testing blotch1 - blotch2 directly.

t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)


    Paired t-test

data:  sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384

# Stack the blotch1/blotch2 columns into long format for plotting: the source
# column name goes to `capture` and its value to `blotch_time`.
sharksub_long <- pivot_longer(
  sharksub,
  cols = c(blotch1, blotch2),
  names_to = "capture",
  values_to = "blotch_time"
)

# Boxplot of blotch time for each capture, faceted by sex, with the raw
# observations overlaid.
# - outlier.shape = NA: geom_jitter() already draws every observation, so the
#   boxplot's own outlier markers would plot those points a second time.
# - height = 0: jitter horizontally only. The default also adds vertical
#   noise, which would move points away from their measured blotch times.
ggplot(sharksub_long, aes(x = capture, y = blotch_time, fill = capture)) +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) +
  geom_jitter(height = 0) +
  facet_grid(~ sex) +
  labs(
    title = "Time Taken for Blotching to Cover 30% of Ventral Surface Depending on Capture",
    x = "Capture",
    y = "Blotch Time (s)"
  ) +
  theme_bw() +
  scale_fill_manual(values = c("blotch1" = "#9ee5ff", "blotch2" = "#97fae9"))

Question 3: Is it possible to predict blotching time?

# Testing normality of each numeric predictor with the Shapiro-Wilk test
# (null hypothesis: the values are normally distributed):

# Testing normality of the 'BPM' column
shapiro.test(sharks$BPM)


    Shapiro-Wilk normality test

data:  sharks$BPM
W = 0.947, p-value = 2.178e-12

# Testing normality of the 'weight' column (Shapiro-Wilk)
shapiro.test(sharks$weight)


    Shapiro-Wilk normality test

data:  sharks$weight
W = 0.94662, p-value = 1.929e-12

# Testing normality of the 'length' column (Shapiro-Wilk)
shapiro.test(sharks$length)


    Shapiro-Wilk normality test

data:  sharks$length
W = 0.95668, p-value = 5.963e-11

# Testing normality of the 'meta' column (Shapiro-Wilk)
shapiro.test(sharks$meta)


    Shapiro-Wilk normality test

data:  sharks$meta
W = 0.96374, p-value = 9.141e-10

# Testing normality of the 'depth' column (Shapiro-Wilk)
shapiro.test(sharks$depth)


    Shapiro-Wilk normality test

data:  sharks$depth
W = 0.99746, p-value = 0.6485

# Store 'sex' as a factor (categorical variable) before modelling
sharks[["sex"]] <- as.factor(sharks[["sex"]])


# There are multiple predictor variables (sex, BPM, weight, length, air
# temperature, water temperature, meta and depth), so multiple linear
# regression is used to predict the continuous outcome variable, blotch time.


# Linear regression model to predict blotch. Most predictors are not normally
# distributed, but the sample size is large.
# NOTE(review): linear regression assumes normally distributed residuals, not
# normally distributed predictors - check this with the diagnostic plots.
blotch.model <- lm(formula = blotch ~ sex + BPM + weight + length + air + water + meta + depth, data = sharks)

# Summarizing the model (coefficients, standard errors, R-squared, F-test)
summary(blotch.model)


Call:
lm(formula = blotch ~ sex + BPM + weight + length + air + water + 
    meta + depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.97715 -0.66193 -0.00841  0.64123  2.90395 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.1179728  1.8749828   5.930 5.73e-09 ***
sexMale      0.3088617  0.0890602   3.468  0.00057 ***
BPM         -0.0020791  0.0031540  -0.659  0.51009    
weight       0.0017281  0.0033143   0.521  0.60231    
length       0.0013042  0.0009606   1.358  0.17517    
air         -0.0310068  0.0315302  -0.983  0.32590    
water       -0.0143878  0.0268112  -0.537  0.59176    
meta        -0.0011610  0.0025671  -0.452  0.65127    
depth        0.5034077  0.0220870  22.792  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9912 on 491 degrees of freedom
Multiple R-squared:  0.5256,    Adjusted R-squared:  0.5178 
F-statistic: 67.99 on 8 and 491 DF,  p-value: < 2.2e-16

# Diagnostic plots for the regression model, arranged in a 2 x 2 grid
# (residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage),
# used to check the residual assumptions of the linear model.
# par() returns the previous settings; they are restored afterwards so the
# grid layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(2, 2))
plot(blotch.model)
par(old_par)

The analysis reveals two statistically significant predictors of blotching time: sex (p < 0.001) and capture depth (p < 0.001).

# Visual representation of the sex and depth variables

# Blotch time against capture depth, with a least-squares fitted line
ggplot(sharks, aes(depth, blotch)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("Correlation Between Depth at Which the sharks were Hooked and Time Taken for Blotching to Cover 30% of Ventral Surface") +
  xlab("Capture Depth (m)") +
  ylab("Blotch Time (s)")

`geom_smooth()` using formula = 'y ~ x'

# Boxplot of blotch time for each sex, with the raw observations overlaid.
# - outlier.shape = NA: geom_jitter() already draws every observation, so the
#   boxplot's own outlier markers would plot those points a second time.
# - height = 0: jitter horizontally only. The default also adds vertical
#   noise, which would move points away from their measured blotch times.
ggplot(sharks, aes(x = sex, y = blotch, fill = sex)) +
  geom_boxplot(alpha = 0.5, outlier.shape = NA) +
  geom_jitter(height = 0) +
  labs(
    title = "Time Taken for Blotching to Cover 30% of Ventral Surface for Each Sex",
    x = "Sex",
    y = "Blotch Time (s)"
  ) +
  scale_fill_manual(values = c("#ffd5f4", "#bbc6ff"))