Shark assignment

Summative data analysis

Loading tidyverse and viewing the sharks dataset

# Attach readr, then import the main shark dataset from the working directory.
library(readr)
sharks <- read_csv(file = "sharks.csv")
Rows: 500 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): ID, sex
dbl (8): blotch, BPM, weight, length, air, water, meta, depth

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() invokes a data viewer, which is only meaningful in an interactive
# session; skip it when the document is rendered or run as a script.
if (interactive()) {
  View(sharks)
}
library(readr)
# Import the subset file holding the blotch1 and blotch2 measurements.
sharksub <- read_csv(file = "sharksub.csv")
Rows: 50 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): ID, sex
dbl (2): blotch1, blotch2

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the data viewer on the subset only when running interactively;
# View() is skipped during a non-interactive render.
if (interactive()) {
  View(sharksub)
}
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ purrr     1.0.2
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(tidyr)
library(knitr)
library(kableExtra)

Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows
library(broom)

Blotch histogram made for question 2

library(gridExtra)
Warning: package 'gridExtra' was built under R version 4.4.2

Attaching package: 'gridExtra'
The following object is masked from 'package:dplyr':

    combine
# Stacked comparison of the blotching-time distributions for the two captures.
# Both panels read the wide `sharksub` columns directly, so no long-format
# reshape is computed in this step.

# Histogram of blotching time at first capture
blotch1_hist <- ggplot(sharksub, aes(x = blotch1)) +
  geom_histogram(bins = 30, colour = "black", fill = "gold") +
  labs(
    title = "Distribution of Blotching Time After First Capture",
    x = "Time taken for blotching to cover 30% of ventral surface (s)",
    y = "Frequency"
  ) +
  theme_bw()

# Histogram of blotching time at second capture
blotch2_hist <- ggplot(sharksub, aes(x = blotch2)) +
  geom_histogram(bins = 30, colour = "black", fill = "navy") +
  labs(
    title = "Distribution of Blotching Time After Second Capture",
    x = "Time taken for blotching to cover 30% of ventral surface (s)",
    y = "Frequency"
  ) +
  theme_bw()

# One column, two rows: first capture on top, second capture below
grid.arrange(blotch1_hist, blotch2_hist, ncol = 1)

Q1. Is there a correlation between the variables air and water?

Histogram to check the normality of air temperature

#| label: Air Distribution Check

# Histogram of ambient air temperature, used as a visual normality check.
# No colour aesthetic is mapped in this plot, so no colour scale is attached.
ggplot(data = sharks, aes(x = air)) +
  geom_histogram(bins = 30, col = "navy", fill = "gold") +
  theme_dark() +
  theme(
    plot.background = element_rect(fill = "black"),   # area outside the panel
    panel.background = element_rect(fill = "black"),  # area inside the panel
    panel.grid.major = element_line(color = "gray"),  # major grid lines
    panel.grid.minor = element_line(color = "gray30"),  # minor grid lines
    axis.text = element_text(color = "white"),        # tick labels
    axis.title = element_text(color = "white"),       # axis titles
    plot.title = element_text(color = "white", size = 14, face = "bold")
  ) +
  labs(
    x = "Air (Celsius)",
    y = "Frequency",
    title = "Distribution of Ambient Air Temperatures"
  )

Histogram for water temperature

#| label: Water Distribution Check

# Histogram of surface water temperature, used as a visual normality check.
# No colour aesthetic is mapped in this plot, so no colour scale is attached.
ggplot(data = sharks, aes(x = water)) +
  geom_histogram(bins = 30, col = "gold", fill = "navy") +
  theme_dark() +
  theme(
    plot.background = element_rect(fill = "black"),   # area outside the panel
    panel.background = element_rect(fill = "black"),  # area inside the panel
    panel.grid.major = element_line(color = "gray"),
    panel.grid.minor = element_line(color = "gray30"),
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white"),
    plot.title = element_text(color = "white", size = 14, face = "bold")
  ) +
  labs(
    x = "Water temperature at surface at time of processing (Celsius)",
    y = "Frequency",
    title = "Data Distribution of Surface Water Temperature"
  )

Creating QQ plots for air and water

QQ plot for air temperature

#| label: QQplot Air

# Black canvas with white axis text, labels and title for the base-graphics plot
par(bg = "black", col.axis = "white", col.lab = "white", col.main = "white")

# Normal QQ plot of air temperature. `frame.plot` is written out in full
# instead of relying on partial matching of the abbreviated `frame`.
qqnorm(sharks$air, pch = 1, col = "white", frame.plot = FALSE,
       main = "QQ Plot of air Data",
       xlab = "Theoretical Quantiles",
       ylab = "Sample Quantiles")
# Reference line for comparison against a normal distribution
qqline(sharks$air, col = "gold", lwd = 2)

QQ plot for water temperature

# Black canvas with white axis text, labels and title for the base-graphics plot
par(bg = "black", col.axis = "white", col.lab = "white", col.main = "white")

# Normal QQ plot of water temperature. `frame.plot` is written out in full
# instead of relying on partial matching of the abbreviated `frame`.
qqnorm(sharks$water, pch = 1, col = "white", frame.plot = FALSE,
       main = "QQ Plot of Water Data",
       xlab = "Theoretical Quantiles",
       ylab = "Sample Quantiles")
# Reference line for comparison against a normal distribution
qqline(sharks$water, col = "lightblue", lwd = 2)

# Shapiro-Wilk normality test for air temperature
shapiro.test(x = sharks$air)

    Shapiro-Wilk normality test

data:  sharks$air
W = 0.95885, p-value = 1.338e-10

# Shapiro-Wilk normality test for water temperature
shapiro.test(x = sharks$water)

    Shapiro-Wilk normality test

data:  sharks$water
W = 0.96035, p-value = 2.371e-10
# Rank-based (Spearman) correlation test between air and water temperature
cor.test(
  x = sharks$air,
  y = sharks$water,
  method = "spearman"
)

    Spearman's rank correlation rho

data:  sharks$air and sharks$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.05637344 

#Value of Spearman’s correlation coefficient, rho = -0.056, p = 0.21

#Null hypothesis H0: There is no correlation between air and water temperatures.
#Alternative hypothesis H1: There is a correlation between air and water temperatures.

 # Product-moment (Pearson) correlation test between air and water temperature
 cor.test(
   x = sharks$air,
   y = sharks$water,
   method = "pearson"
 )

    Pearson's product-moment correlation

data:  sharks$air and sharks$water
t = -1.2346, df = 498, p-value = 0.2176
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.14224207  0.03260803
sample estimates:
        cor 
-0.05524051 

#Value of Pearson’s correlation coefficient, r = -0.055, p = 0.22

Q2. Does multiple capture have an effect on blotching time?

# Inspect the blotch1/blotch2 subset before analysing it
view(x = sharksub)

Blotch1 histogram

#| label: Blotch1 Distribution Check

# Histogram of blotching time at first capture, as a visual normality check.
# No colour aesthetic is mapped in this plot, so no colour scale is attached.
ggplot(data = sharksub, aes(x = blotch1)) +
  geom_histogram(bins = 30, col = "navy", fill = "gold") +
  theme_dark() +
  theme(
    plot.background = element_rect(fill = "black"),   # area outside the panel
    panel.background = element_rect(fill = "black"),  # area inside the panel
    panel.grid.major = element_line(color = "gray"),
    panel.grid.minor = element_line(color = "gray30"),
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white"),
    plot.title = element_text(color = "white", size = 14, face = "bold")
  ) +
  labs(
    x = "Time for blotching to cover 30% of ventral surface (s)",
    y = "Frequency",
    title = "Data Distribution of blotching time for first capture"
  )

QQplot blotch1

#| label: QQplot Blotch1

# Black canvas with white axis text, labels and title for the base-graphics plot
par(bg = "black", col.axis = "white", col.lab = "white", col.main = "white")

# Normal QQ plot of first-capture blotching time. `frame.plot` is written out
# in full instead of relying on partial matching of the abbreviated `frame`.
qqnorm(sharksub$blotch1, pch = 1, col = "white", frame.plot = FALSE,
       main = "QQ Plot of Blotch1 Data",
       xlab = "Theoretical Quantiles",
       ylab = "Sample Quantiles")
# Reference line for comparison against a normal distribution
qqline(sharksub$blotch1, col = "gold", lwd = 2)

Shapiro-Wilk test for blotch1

#| label: ShapiroWilk-blotch1_normality_check
#| warning: false
#| echo: true
#| output: true

# Shapiro-Wilk normality test on the first-capture blotching times
shapiro.test(x = sharksub$blotch1)

    Shapiro-Wilk normality test

data:  sharksub$blotch1
W = 0.97958, p-value = 0.5345

Histogram for blotch2

#| label: Blotch2 Distribution Check

# Histogram of blotching time at second capture, as a visual normality check.
# No colour aesthetic is mapped in this plot, so no colour scale is attached.
ggplot(data = sharksub, aes(x = blotch2)) +
  geom_histogram(bins = 30, col = "gold", fill = "navy") +
  theme_dark() +
  theme(
    plot.background = element_rect(fill = "black"),   # area outside the panel
    panel.background = element_rect(fill = "black"),  # area inside the panel
    panel.grid.major = element_line(color = "gray"),
    panel.grid.minor = element_line(color = "gray30"),
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white"),
    plot.title = element_text(color = "white", size = 14, face = "bold")
  ) +
  labs(
    x = "Time for blotching to cover 30% of ventral surface (s)",
    y = "Frequency",
    title = "Data Distribution of Blotching Time for Second Capture"
  )

#| label: QQplot Blotch2

# Black canvas with white axis text, labels and title for the base-graphics plot
par(bg = "black", col.axis = "white", col.lab = "white", col.main = "white")

# Normal QQ plot of second-capture blotching time. `frame.plot` is written out
# in full instead of relying on partial matching of the abbreviated `frame`.
qqnorm(sharksub$blotch2, pch = 1, col = "white", frame.plot = FALSE,
       main = "QQ Plot of Blotch2 Data",
       xlab = "Theoretical Quantiles",
       ylab = "Sample Quantiles")
# Reference line for comparison against a normal distribution.
# NOTE(review): a navy line on the black background set above has low
# contrast; consider a lighter colour.
qqline(sharksub$blotch2, col = "Navy", lwd = 2)

#| label: Shapiro-Wilk Blotch2

# Shapiro-Wilk normality test on the second-capture blotching times
shapiro.test(x = sharksub$blotch2)

    Shapiro-Wilk normality test

data:  sharksub$blotch2
W = 0.97936, p-value = 0.5255

The histograms show a roughly normal distribution for both blotch1 and blotch2.

#For both blotch1 and blotch2, the Shapiro-Wilk test produces p-values > 0.05.

#Null hypothesis H0: There is no difference in mean blotching time between blotch1 and blotch2.
#Alternative hypothesis H1: There is a difference in mean blotching time between blotch1 and blotch2.

# Pull the two blotching-time columns out of sharksub as plain vectors
capture1 <- sharksub[["blotch1"]]  # first capture
capture2 <- sharksub[["blotch2"]]  # second capture

paired t-test for blotch1 and blotch2

# Paired t-test comparing capture1 with capture2 (same sharks, two captures).
# NOTE(review): the result name `diff` shadows base::diff(); kept unchanged
# because later code may refer to it.
diff <- t.test(x = capture1, y = capture2, paired = TRUE)
print(diff)

    Paired t-test

data:  capture1 and capture2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384 
# Stack blotch1 and blotch2 into a single BlotchTime column, with the source
# column name kept in Capture so it can drive the fill colour.
sharksub_long <- pivot_longer(
  sharksub,
  cols = c(blotch1, blotch2),
  names_to = "Capture",
  values_to = "BlotchTime"
)

# Overlaid histograms of both captures on a dark canvas
combined_hist <- ggplot(sharksub_long, aes(x = BlotchTime, fill = Capture)) +
  geom_histogram(
    bins = 30,
    position = "identity",  # overlay the two groups instead of stacking them
    alpha = 0.85,
    color = "black"
  ) +
  scale_fill_manual(values = c("blotch1" = "navy", "blotch2" = "gold")) +
  labs(
    title = "Distribution of Blotching Time After Capture",
    x = "Time for blotching to cover 30% of ventral surface (s)",
    y = "Frequency"
  ) +
  theme_dark() +
  theme(
    plot.background = element_rect(fill = "black"),
    panel.background = element_rect(fill = "black"),
    panel.grid.major = element_line(color = "gray"),
    panel.grid.minor = element_line(color = "gray30"),
    axis.text = element_text(color = "white"),
    axis.title = element_text(color = "white"),
    plot.title = element_text(color = "white", size = 14, face = "bold"),
    legend.title = element_blank()  # legend is shown without a title
  )

# Render the figure
print(combined_hist)

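#The output of the paired t-test gives a p-value < 2.2e-16, which is considerably less than the significance level alpha = 0.05, so H0 is rejected: blotching time differs between captures, taking on average about 0.93 s longer at the second capture (mean difference blotch1 - blotch2 = -0.93).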

Q3. Is it possible to predict blotching time?

multiple regression for blotching time prediction

# Full linear model: blotching time regressed on every numeric predictor
mult_reg_model <- lm(
  formula = blotch ~ BPM + weight + length + air + water + meta + depth,
  data = sharks
)
# Print the coefficient table and overall fit statistics
summary(mult_reg_model)

Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta + 
    depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.83745 -0.66117 -0.00702  0.60110  2.74108 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.1405851  1.8958668   5.876 7.74e-09 ***
BPM         -0.0019723  0.0031890  -0.618    0.537    
weight       0.0016283  0.0033511   0.486    0.627    
length       0.0012295  0.0009710   1.266    0.206    
air         -0.0281474  0.0318707  -0.883    0.378    
water       -0.0188934  0.0270782  -0.698    0.486    
meta        -0.0009712  0.0025951  -0.374    0.708    
depth        0.5061285  0.0223191  22.677  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.002 on 492 degrees of freedom
Multiple R-squared:  0.514, Adjusted R-squared:  0.507 
F-statistic: 74.32 on 7 and 492 DF,  p-value: < 2.2e-16
# Coefficient matrix of the multiple regression at full printed precision.
# The element is named in full (`coefficients`) rather than relying on `$`
# partial matching of the abbreviated `coefficient`.
summary(mult_reg_model)$coefficients
                 Estimate   Std. Error    t value     Pr(>|t|)
(Intercept) 11.1405850738 1.8958667617  5.8762490 7.742498e-09
BPM         -0.0019723462 0.0031890181 -0.6184807 5.365447e-01
weight       0.0016283130 0.0033510746  0.4859077 6.272489e-01
length       0.0012295257 0.0009710195  1.2662214 2.060330e-01
air         -0.0281474117 0.0318706960 -0.8831753 3.775729e-01
water       -0.0188934313 0.0270781617 -0.6977369 4.856714e-01
meta        -0.0009712356 0.0025951073 -0.3742564 7.083748e-01
depth        0.5061284856 0.0223190976 22.6769242 1.816298e-78
# Reduced model: blotching time regressed on depth only
lin_reg_model <- lm(
  formula = blotch ~ depth,
  data = sharks
)
# Print the coefficient table and overall fit statistics
summary(lin_reg_model)

Call:
lm(formula = blotch ~ depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.81869 -0.65427 -0.01035  0.58825  2.83116 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.82178    1.11207   8.832   <2e-16 ***
depth        0.50467    0.02216  22.772   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared:  0.5101,    Adjusted R-squared:  0.5091 
F-statistic: 518.6 on 1 and 498 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the intercept and the depth slope
confint(object = lin_reg_model, level = 0.95)
                2.5 %     97.5 %
(Intercept) 7.6368581 12.0067017
depth       0.4611309  0.5482156
# Store the residuals of the multiple regression model (note: despite its
# name, `mult_model` holds residuals, not a model) and plot their histogram.
mult_model <- mult_reg_model[["residuals"]]
hist(x = mult_model)

QQ Plot of residuals

# Normal QQ plot of the regression residuals, with a reference line
qqnorm(y = mult_model)
qqline(y = mult_model)