Summative Code

Author

N1344796

Summative Code

-> Loading packages

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readxl)
# Load the two assessment workbooks into tibbles.
# NOTE(review): both paths are absolute and specific to the author's machine;
# they must be adjusted before this script can run anywhere else.
sharks <- read_excel(
  path = "C:/Users/kathr/OneDrive/Documents/_NTU_/Research Methods and Data Analysis/ASSESSMENT/sharks.xlsx"
)
sharksub <- read_excel(
  path = "C:/Users/kathr/OneDrive/Documents/_NTU_/Research Methods and Data Analysis/ASSESSMENT/sharksub.xlsx"
)

1) Is there a correlation between air temperature and water temperature?

-> Visualisation of data (Scatterplot)

# Scatterplot of air temperature (y) against water temperature (x), with a
# straight-line least-squares fit drawn in blue and no confidence band.
ggplot(data = sharks, mapping = aes(x = water, y = air)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  ggtitle("Scatterplot of Air vs. Water Temperature") +
  xlab("Water Temperature (°C)") +
  ylab("Air Temperature (°C)")
`geom_smooth()` using formula = 'y ~ x'

-> Pearson’s Correlation Coefficient

# Pearson correlation coefficient between water and air temperature.
# cor() returns the coefficient only; it does not report a p-value.
with(sharks, cor(water, air, method = "pearson"))
[1] -0.05524051

-> New column creation (Difference in temperature)

# Copy of `sharks` with an extra column `tempdifference` holding the air
# temperature minus the water temperature for each row.
shark.new <- mutate(sharks, tempdifference = air - water)

-> Visualisation (Scatter Plot)

# Scatterplot of blotch time (y) against the air-minus-water temperature
# difference (x), with a straight-line least-squares fit and no confidence band.
ggplot(data = shark.new, mapping = aes(x = tempdifference, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  xlab("Temperature Difference (°C)") +
  ylab("Blotch Time (Seconds)")
`geom_smooth()` using formula = 'y ~ x'

-> Pearson’s Correlation Coefficient

# Pearson correlation coefficient between the temperature difference and
# blotch time. cor() returns the coefficient only, with no p-value.
with(shark.new, cor(tempdifference, blotch, method = "pearson"))
[1] 0.01444459

2) Does multiple capture have an effect on blotching time?

-> Visualisation of blotch times

# Recapture blotch time (y) against first-capture blotch time (x).
# Colour is mapped to sex on the point layer only, so the fitted line is a
# single straight-line fit over all rows rather than one line per sex.
ggplot(data = sharksub, mapping = aes(x = blotch1, y = blotch2)) +
  geom_point(mapping = aes(colour = sex)) +
  geom_smooth(method = "lm", se = FALSE) +
  ggtitle("Blotching Time: First Capture vs. Recapture") +
  xlab("First Capture Blotch Time (s)") +
  ylab("Recapture Blotch Time (s)") +
  labs(colour = "Sex")
`geom_smooth()` using formula = 'y ~ x'

-> Normal Q-Q Plots

# Normal Q-Q plots of the blotch times at first capture and at recapture,
# each with a red reference line.
qqnorm(sharksub$blotch1)
qqline(sharksub$blotch1, col = "red")

qqnorm(sharksub$blotch2)
qqline(sharksub$blotch2, col = "red")

# A paired t-test is a one-sample t-test on the within-pair differences, so
# the normality assumption that matters is on blotch2 - blotch1 rather than
# on each column separately. Plot those differences as well.
blotch_diff <- sharksub$blotch2 - sharksub$blotch1
qqnorm(blotch_diff, main = "Normal Q-Q Plot: blotch2 - blotch1")
qqline(blotch_diff, col = "red")

-> Shapiro-Wilk Test

# Shapiro-Wilk normality test on the first-capture blotch times.
# NOTE(review): the paired t-test run later assumes the paired differences
# (blotch2 - blotch1) are normal; consider testing those as well.
shapiro.test(sharksub$blotch1)

    Shapiro-Wilk normality test

data:  sharksub$blotch1
W = 0.97958, p-value = 0.5345
# Shapiro-Wilk normality test on the recapture blotch times.
# NOTE(review): the paired t-test run later assumes the paired differences
# (blotch2 - blotch1) are normal; consider testing those as well.
shapiro.test(sharksub$blotch2)

    Shapiro-Wilk normality test

data:  sharksub$blotch2
W = 0.97936, p-value = 0.5255

-> Paired t-test (assumptions passed)

# Paired t-test of recapture (blotch2) against first-capture (blotch1) blotch
# times, treating each row as one matched pair.
# blotch2 is passed first, so the reported mean difference is blotch2 - blotch1.
t.test(sharksub$blotch2, sharksub$blotch1, paired = TRUE)

    Paired t-test

data:  sharksub$blotch2 and sharksub$blotch1
t = 17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 0.822301 1.037176
sample estimates:
mean difference 
      0.9297384 

-> Visualisation (box plot comparing the two blotch times; boxes show medians and quartiles)

# Reshape to long format: each original row becomes two rows, with the source
# column name (blotch1 or blotch2) in `Capture` and its value in
# `Blotching_Time`.
sharksub_long <- pivot_longer(
  sharksub,
  cols = c(blotch1, blotch2),
  names_to = "Capture",
  values_to = "Blotching_Time"
)

# Side-by-side box plots of blotching time, one box per capture column.
ggplot(
  data = sharksub_long,
  mapping = aes(x = Capture, y = Blotching_Time, fill = Capture)
) +
  geom_boxplot() +
  ggtitle("Comparison of Blotching Times for First and Second Captures") +
  xlab("Capture Event") +
  ylab("Blotching Time (seconds)")

3) Can blotching time be predicted?

-> Multiple linear regression

# Full linear model: blotch time regressed on all six candidate predictors.
# The term order is kept as written because the stepwise trace prints the
# formula in this order.
model <- lm(
  formula = blotch ~ BPM + weight + length + air + water + depth,
  data = sharks
)

-> Stepwise Regression

# AIC-based stepwise selection starting from the full model; at each step
# terms may be dropped or re-added (direction = "both"). The trace is printed.
model_stepwise <- step(object = model, direction = "both")
Start:  AIC=8.33
blotch ~ BPM + weight + length + air + water + depth

         Df Sum of Sq     RSS    AIC
- weight  1      0.23  494.59   6.56
- BPM     1      0.39  494.74   6.72
- water   1      0.50  494.86   6.84
- air     1      0.89  495.24   7.22
- length  1      1.60  495.96   7.94
<none>                 494.36   8.33
- depth   1    516.43 1010.79 363.94

Step:  AIC=6.56
blotch ~ BPM + length + air + water + depth

         Df Sum of Sq     RSS    AIC
- BPM     1      0.38  494.97   4.94
- water   1      0.45  495.04   5.02
- air     1      0.93  495.52   5.50
- length  1      1.58  496.17   6.15
<none>                 494.59   6.56
+ weight  1      0.23  494.36   8.33
- depth   1    516.34 1010.93 362.01

Step:  AIC=4.94
blotch ~ length + air + water + depth

         Df Sum of Sq     RSS    AIC
- water   1      0.47  495.43   3.41
- air     1      0.85  495.82   3.80
- length  1      1.70  496.67   4.66
<none>                 494.97   4.94
+ BPM     1      0.38  494.59   6.56
+ weight  1      0.22  494.74   6.72
- depth   1    517.03 1012.00 360.54

Step:  AIC=3.41
blotch ~ length + air + depth

         Df Sum of Sq     RSS    AIC
- air     1      0.79  496.22   2.20
- length  1      1.83  497.26   3.25
<none>                 495.43   3.41
+ water   1      0.47  494.97   4.94
+ BPM     1      0.39  495.04   5.02
+ weight  1      0.17  495.26   5.24
- depth   1    519.64 1015.07 360.05

Step:  AIC=2.2
blotch ~ length + depth

         Df Sum of Sq     RSS    AIC
- length  1      1.91  498.12   2.12
<none>                 496.22   2.20
+ air     1      0.79  495.43   3.41
+ water   1      0.40  495.82   3.80
+ BPM     1      0.32  495.90   3.89
+ weight  1      0.21  496.01   3.99
- depth   1    520.33 1016.55 358.78

Step:  AIC=2.12
blotch ~ depth

         Df Sum of Sq     RSS    AIC
<none>                 498.12   2.12
+ length  1      1.91  496.22   2.20
+ air     1      0.86  497.26   3.25
+ water   1      0.51  497.61   3.60
+ BPM     1      0.43  497.69   3.69
+ weight  1      0.19  497.94   3.93
- depth   1    518.70 1016.82 356.91
# Print the coefficient table, residual summary and fit statistics for the
# model retained by the stepwise search.
summary(model_stepwise)

Call:
lm(formula = blotch ~ depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.81869 -0.65427 -0.01035  0.58825  2.83116 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.82178    1.11207   8.832   <2e-16 ***
depth        0.50467    0.02216  22.772   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared:  0.5101,    Adjusted R-squared:  0.5091 
F-statistic: 518.6 on 1 and 498 DF,  p-value: < 2.2e-16

-> Checking assumptions

# Residuals-versus-fitted plot for the stepwise-selected model, with a dashed
# red reference line at zero.
# Both vectors come from the accessor functions so they always have the same
# length: residuals() pads for dropped rows under na.action = na.exclude,
# whereas the raw `$fitted.values` component does not, which would make
# plot() fail with mismatched lengths.
plot(fitted(model_stepwise), residuals(model_stepwise),
     main = "Residual Plot",
     xlab = "Fitted Values",
     ylab = "Residuals")
abline(h = 0, col = "red", lty = 2)

# Normal Q-Q plot of the stepwise model's residuals, with a red reference
# line, to check the normality assumption.
model_resid <- residuals(model_stepwise)
qqnorm(model_resid)
qqline(model_resid, col = "red")

-> Significant predictors

# Keep the rows of the coefficient table whose p-value is below 0.05.
# The intercept row is part of that table, so it is kept too when significant.
coef_table <- as.data.frame(coef(summary(model_stepwise)))
# which() drops any NA comparison, matching filter()'s handling of NA.
keep_rows <- which(coef_table[["Pr(>|t|)"]] < 0.05)
significant_predictors <- coef_table[keep_rows, , drop = FALSE]
print(significant_predictors)
             Estimate Std. Error   t value     Pr(>|t|)
(Intercept) 9.8217799  1.1120672  8.832002 1.761936e-17
depth       0.5046733  0.0221619 22.772112 3.399843e-79

-> Correlation Matrix

# Pairwise correlation matrix of the six candidate predictors
# (cor() default method), used to look for collinearity between them.
predictor_cols <- c("BPM", "weight", "length", "air", "water", "depth")
cor_matrix <- cor(sharks[, predictor_cols])
print(cor_matrix)
               BPM       weight      length         air       water
BPM     1.00000000  0.017036558 -0.06856053 -0.06841209  0.02451337
weight  0.01703656  1.000000000 -0.01959676 -0.05264537  0.08633875
length -0.06856053 -0.019596758  1.00000000 -0.03027426 -0.05940708
air    -0.06841209 -0.052645366 -0.03027426  1.00000000 -0.05524051
water   0.02451337  0.086338753 -0.05940708 -0.05524051  1.00000000
depth  -0.01217352 -0.006057435 -0.08334774 -0.01188199 -0.04088851
              depth
BPM    -0.012173520
weight -0.006057435
length -0.083347736
air    -0.011881989
water  -0.040888511
depth   1.000000000