Research Methods Summative

1 Step 1: Load the data into the R session

shark <- read.csv("sharks.csv")

shark_sub <- read.csv("sharksub.csv")

2 Step 2 Install/load the required packages

#installs packages 1 by 1 removing the # if these packages are not already present in your personal library
#install.packages("nortest")
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("vegan")
#install.packages("corrplot")
#install.packages("ggplot2")
#install.packages("mgcv")
#install.packages("car")
#install.packages(boot)


#loads them 1 by 1
library("nortest")
library("dplyr")

Warning: package 'dplyr' was built under R version 4.4.2


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library("tidyr")
library("vegan")

Warning: package 'vegan' was built under R version 4.4.2

Loading required package: permute

Warning: package 'permute' was built under R version 4.4.2

Loading required package: lattice

This is vegan 2.6-8

library("corrplot")

Warning: package 'corrplot' was built under R version 4.4.2

corrplot 0.95 loaded

library("ggplot2")
library("mgcv")

Warning: package 'mgcv' was built under R version 4.4.2

Loading required package: nlme


Attaching package: 'nlme'

The following object is masked from 'package:dplyr':

    collapse

This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.

library("car")

Warning: package 'car' was built under R version 4.4.2

Loading required package: carData


Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

library("boot")


Attaching package: 'boot'

The following object is masked from 'package:car':

    logit

The following object is masked from 'package:lattice':

    melanoma

3 Step 3: Conduct normality test

It is important to test for normality, to ensure which test is best fit for the data to avoid statistical bias and false significance. An Anderson Darling test is one of many tests can can be utalised for this purpose.

#firstly for the shark data set

weight.norm <- ad.test(shark$weight)
print(weight.norm)   # non normal


    Anderson-Darling normality test

data:  shark$weight
A = 7.4333, p-value < 2.2e-16

bpm.norm <- ad.test(shark$BPM)
print(bpm.norm)    # non normal


    Anderson-Darling normality test

data:  shark$BPM
A = 6.8186, p-value < 2.2e-16

blotch.norm <- ad.test(shark$blotch)
print(blotch.norm)    # normal


    Anderson-Darling normality test

data:  shark$blotch
A = 0.38716, p-value = 0.3872

length.norm <- ad.test(shark$length)
print(length.norm)    # non normal


    Anderson-Darling normality test

data:  shark$length
A = 5.422, p-value = 2.17e-13

air.norm <- ad.test(shark$air)
print(air.norm)    # non normal


    Anderson-Darling normality test

data:  shark$air
A = 4.6957, p-value = 1.203e-11

water.norm <- ad.test(shark$water)
print(water.norm)    # non normal


    Anderson-Darling normality test

data:  shark$water
A = 4.8383, p-value = 5.463e-12

meta.norm <- ad.test(shark$meta)
print(meta.norm)    # non normal


    Anderson-Darling normality test

data:  shark$meta
A = 4.0909, p-value = 3.462e-10

depth.norm <- ad.test(shark$depth)
print(depth.norm)   # normal


    Anderson-Darling normality test

data:  shark$depth
A = 0.39155, p-value = 0.3781

#secondly for the sharksub dataset

blotch1.norm <- ad.test(shark_sub$blotch1)
print(blotch1.norm)    # normal


    Anderson-Darling normality test

data:  shark_sub$blotch1
A = 0.2969, p-value = 0.578

blotch2.norm <- ad.test(shark_sub$blotch2)
print(blotch2.norm)    # normal


    Anderson-Darling normality test

data:  shark_sub$blotch2
A = 0.29461, p-value = 0.5846

4 Step 4: Firstly lets see if there is a correlation between water and air temperature and indicators of stress blotch time/Corticosterone levels (meta)

as these 3 columns of data do not follow a normal distribution the non parametric test will be used.

# to understand if the difference in temperature is influencing the stress response creating a new column within the data set called temp_diff using a simple calculation of subtracting the water column from each corresponding value in the air column.
#this gives us the difference in water temperature 
shark$temp_diff <- shark$air - shark$water
head(shark)

     ID    sex   blotch BPM    weight   length      air    water      meta
1 SH001 Female 37.17081 148  74.69050 186.6703 37.73957 23.37377  64.11183
2 SH002 Female 34.54973 158  73.41627 189.3189 35.68413 21.42088  73.68676
3 SH003 Female 36.32861 125  71.80837 283.6332 34.79854 20.05114  54.43466
4 SH004   Male 35.33881 161 104.62985 171.0986 36.15973 21.64319  86.32615
5 SH005 Female 37.39799 138  67.13098 264.3160 33.61477 21.76143 107.97796
6 SH006   Male 33.54668 126 110.49396 269.9829 36.38343 20.85200 108.86475
     depth temp_diff
1 53.22635  14.36580
2 49.63903  14.26326
3 49.44057  14.74740
4 50.29711  14.51655
5 49.03183  11.85333
6 46.84148  15.53143

# to understand the correlation of these variables testing them individually may shed light onto whether or not these can be used as predictors of stress. 
# first lets test if the correlation between water temperature and corticosterone (meta) and water temperature and blotching shows a significant correlation.

# Spearman's Rank correlation test
water_meta_cor <- cor.test(shark$meta, shark$water, method = "spearman")
blotch_water_cor <- cor.test(shark$blotch, shark$water, method = "spearman")

meta_air_cor <- cor.test(shark$meta, shark$air, method = "spearman")
blotch_air_cor <- cor.test(shark$blotch, shark$air, method = "spearman")

meta_diff_cor <- cor.test(shark$meta, shark$temp_diff, method = "spearman")
blotch_diff_cor <- cor.test(shark$blotch, shark$temp_diff, method = "spearman")

water_meta_cor


    Spearman's rank correlation rho

data:  shark$meta and shark$water
S = 20360456, p-value = 0.6126
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.0226942

# water and meta showing a small positive correlation of rho = 0.02 with a p value of 0.6126 indicating a insignificant positive correlation. 
blotch_water_cor


    Spearman's rank correlation rho

data:  shark$blotch and shark$water
S = 21758692, p-value = 0.3214
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.04442139

# water and blotching time shows a small negative correlation of rho = 0.04 with a p value of 0.3214 indicating a insignificant negative correlation 
meta_air_cor


    Spearman's rank correlation rho

data:  shark$meta and shark$air
S = 18172688, p-value = 0.004258
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.1277075

# the air and meta shows a significant positive correlation (rho = 0.13) with a p value of 0.004258, indicating that there could be some sort of relationship, warrenting further testing.
blotch_air_cor


    Spearman's rank correlation rho

data:  shark$blotch and shark$air
S = 21469324, p-value = 0.4956
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.03053167

# air and blotching time shows a small negative correlation of rho -0.03 with a p value of 0.4956, indicating a insignificant negative correlation
meta_diff_cor


    Spearman's rank correlation rho

data:  shark$meta and shark$temp_diff
S = 19486516, p-value = 0.1489
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
0.06464349

# meta and temp difference shows a small positive correlation of rho = 0.07, with a p value of 0.1489, indication a insignificant positive correlation
blotch_diff_cor


    Spearman's rank correlation rho

data:  shark$blotch and shark$temp_diff
S = 20660142, p-value = 0.8529
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
0.008309217

# meta and temp difference shows a small positive correlation of rho = 0.008, with a p value of 0.8529, indication a insignificant positive correlation

#there is one crux with this methodology however this test assumes linearity and only identifies patterns not relationships however this test still provides useful insights into the interactions between these two columns of data.

after conducting the correlation tests we got back one significant correlation as this correlation test only checks if an association exists testing to see if there is a positive relationship could reveal a result with further significance. running a relationship test on all of the variables could also reveal useful findings.

# Run Generlaised additive curve. 
model1 <- gam(meta ~ s(air), data = shark)

# View summary
summary(model1)


Family: gaussian 
Link function: identity 

Formula:
meta ~ s(air)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  82.0427     0.7711   106.4   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
         edf Ref.df     F p-value  
s(air) 2.918  3.621 3.392  0.0107 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.0226   Deviance explained = 2.83%
GCV = 299.68  Scale est. = 297.33    n = 500

# p value = 0.01 indicating a significant relationship, however only  2.26 percent of the variation seen in the meta data can be explained by air temperature suggesting an extremely weak relationship, meaning it is biologically unfeasible to draw conclusions from this relationship.

When plotting the previous GAM it revealed a non linear relationship. therefore testing the relationship between these variables could be useful especially as the correlation tests only for linear relationships so running this model could indicate further relationships.

# model 2 represent the time to blotch and air temperature
model2 <- gam(blotch ~ s(air), data = shark)
summary(model2)


Family: gaussian 
Link function: identity 

Formula:
blotch ~ s(air)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.12541    0.06386   550.1   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
       edf Ref.df     F p-value
s(air)   1      1 0.706   0.401

R-sq.(adj) =  -0.00059   Deviance explained = 0.142%
GCV = 2.0471  Scale est. = 2.0389    n = 500

# p value = 0.401 indicating an insignificant relationship, in addition to this only 0.142 percent of the variation seen in the blotch time can be explained by air temperature suggesting a extremely weak relationship.

model3 <- gam(meta ~ s(water), data = shark)
summary(model3)


Family: gaussian 
Link function: identity 

Formula:
meta ~ s(water)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  82.0427     0.7782   105.4   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
           edf Ref.df     F p-value
s(water) 2.673  3.333 1.099   0.387

R-sq.(adj) =  0.00449   Deviance explained = 0.983%
GCV = 305.07  Scale est. = 302.83    n = 500

# p value = 0.387 indicating an insignificant relationship, in addition to this only 0.983 percent of the variation seen in the corticosterone can be explained by water temperature suggesting a extremely weak relationship.

model4 <- gam(blotch ~ s(water), data = shark)
summary(model4)


Family: gaussian 
Link function: identity 

Formula:
blotch ~ s(water)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.12541    0.06382   550.4   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
         edf Ref.df     F p-value
s(water)   1      1 1.332   0.249

R-sq.(adj) =  0.000665   Deviance explained = 0.267%
GCV = 2.0445  Scale est. = 2.0364    n = 500

# p value = 0.249 indicating an insignificant relationship, in addition to this only 0.267 percent of the variation seen in the time to blotch can be explained by water temperature suggesting a extremely weak relationship.

model5 <- gam(meta ~ s(temp_diff), data = shark)
summary(model5)


Family: gaussian 
Link function: identity 

Formula:
meta ~ s(temp_diff)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  82.0427     0.7784   105.4   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
               edf Ref.df     F p-value
s(temp_diff) 1.565  1.948 1.675   0.238

R-sq.(adj) =  0.004   Deviance explained = 0.712%
GCV = 304.54  Scale est. = 302.98    n = 500

# p value = 0.238 indicating an insignificant relationship, in addition to this only 0.712 percent of the variation seen in the time to blotch can be explained by the temperature difference between air and water suggesting a extremely weak relationship.

model6 <- gam(blotch ~ s(temp_diff), data = shark)
summary(model6)


Family: gaussian 
Link function: identity 

Formula:
blotch ~ s(temp_diff)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.12541    0.06384   550.2   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
               edf Ref.df     F p-value
s(temp_diff) 1.574  1.961 0.486   0.639

R-sq.(adj) =  0.000113   Deviance explained = 0.327%
GCV =  2.048  Scale est. = 2.0375    n = 500

# p value = 0.639 indicating an insignificant relationship, in addition to this only 0.327 percent of the variation seen in the time to blotch can be explained by the temperature difference between air and water suggesting a extremely weak relationship.

In conclusion water and air temperature and the difference in temperature does not have any relationship between meta (corticosterone) and blotching time.

however they could still inform further models in predicting the blotching time.

5 Step 5: Does blotching time differ between sexes?

Often physiological stress responses are driven by hormonal changes, this can vary between males and females in most species. Testosterone often increasing the level of stress males experience overall.

male_data <- subset(shark, sex == "Male")
female_data <- subset(shark, sex == "Female")

# perform an unpaired t-test
t_test_result <- t.test(blotch ~ sex, data = shark)
print(t_test_result)


    Welch Two Sample t-test

data:  blotch by sex
t = -3.0282, df = 494.67, p-value = 0.002589
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
 -0.6322714 -0.1346620
sample estimates:
mean in group Female   mean in group Male 
            34.92294             35.30641

# results showed significant difference between males and females, with a p value of 0.003 which tells us there is a statistically significant difference in the time taken to blotch.

# create a boxplot
ggplot(shark, aes(x = sex, y = blotch, fill = sex)) +
  geom_boxplot()  +
  labs(title = "Comparison of Time Taken to Blotch by Sex",
       x = "Sex",
       y = "Time Taken to Blotch", fill = "sex") +
  theme_minimal()

#although the t test shows a significant result looking at the boxplot the error bars overlap heavily so do the interquartile ranges which shows there likely wouldnt be a noticable difference in the observed data.

6 Step 6: Does multiple capture have an effect on blotching time.

now i want to test if the time taken to blotch after the second capture increases or reduces.

because the columns blotch 1 and blotch 2 (which will be the focus of this test) showed normal distributions we can use a Paired t-test.

# Paired t-test
t.test(shark_sub$blotch1, shark_sub$blotch2, paired = TRUE)


    Paired t-test

data:  shark_sub$blotch1 and shark_sub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384

# p value = 2.2e-16 this test revealed a highly significant difference between blotch 1 and blotch 2 indicating a significant change in the time to blotch after the first capture.

# boxplot of blotching times
boxplot(shark_sub$blotch1, shark_sub$blotch2, names = c("First Capture", "Second Capture"), ylab = "Time to Blotch", main = "Comparison of Blotching Times")

#seeming as though this test showed a significant difference understanding if sex has any role in this could aid in understanding the difference further.
#creating a column which represents the difference in time taken to blotch would be a good start especially as i want to use a t test again to ensure the results are comparible.

# Calculate the difference in blotching time between second and first capture
shark_sub$blotch_diff <- shark_sub$blotch2 - shark_sub$blotch1

# run Paired t-test to compare the change in blotching time between males and females
t.test(blotch_diff ~ sex, data = shark_sub)


    Welch Two Sample t-test

data:  blotch_diff by sex
t = 0.40707, df = 44.541, p-value = 0.6859
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
 -0.1733770  0.2611792
sample estimates:
mean in group Female   mean in group Male 
           0.9516890            0.9077879

#the blotching time revealed no significant result when comparing males and females however testing them individually might also reveal more results understanding weather the males or the females significantly influence time to blotch after the first capture.

# create unique dataframes one for male and one for females
females <- shark_sub[shark_sub$sex == "Female", ]
males <- shark_sub[shark_sub$sex == "Male", ]

t.test(females$blotch1, females$blotch2, paired = TRUE)


    Paired t-test

data:  females$blotch1 and females$blotch2
t = -14.694, df = 24, p-value = 1.7e-13
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.0853637 -0.8180143
sample estimates:
mean difference 
      -0.951689

t.test(males$blotch1, males$blotch2, paired = TRUE)


    Paired t-test

data:  males$blotch1 and males$blotch2
t = -10.527, df = 24, p-value = 1.786e-10
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.0857650 -0.7298108
sample estimates:
mean difference 
     -0.9077879

# it seems both males and females seem to show significant differences in the time taken to blotch. males p value = 1.786e-10 indicating a strong significance and females p value = 1.7e-13 indicating a strong significance 

# reshape the data specifically selecting the columns named sex blotch 1 & 2 and formatting them into a stacked data (all data in two columns)
shark_sub_long <- shark_sub %>%
  select(sex, blotch1, blotch2) %>%
  pivot_longer(cols = c(blotch1, blotch2), 
               names_to = "capture_type", 
               values_to = "blotch_time")

# Create the boxplot
ggplot(shark_sub_long, aes(x = interaction(sex, capture_type), y = blotch_time, fill = sex)) + geom_boxplot() + labs (title = "Comparison of Blotching Times by Sex and Capture", x = "Sex and Capture Type", y = "Blotching Time", fill = "Sex" ) + scale_x_discrete(labels = c("Blotch1", "Blotch1", "Blotch2", "Blotch2")) + theme_minimal()

#looking at the boxplot we can see overall the male takes longer to blotch however both sexes show that they significantly take longer to blotch the second time they are caught indicating it may be a less stressful experience for them the second time round. 
# further data collection on their corticosterone levels during their second capture could reveal more tangible evidence of this reduction in stress response.

7 Step 7: can blotching time be predicted with the given variables

the next analysis to be conducted is to see if the time taken to blotch can be predicted. this can be done by running multivariate tests seeing which variables best predict the time taken to blotch.

# create the generalised additive model using all of the variables as predictors for blotch.
model_gam <- gam(blotch ~ s(BPM) + s(weight) + s(length) + s(air) + s(water) + s(meta) + s(depth), 
                 data = shark)

# this produces a summary of the model giving information like what variables best inform the model. 
summary(model_gam)


Family: gaussian 
Link function: identity 

Formula:
blotch ~ s(BPM) + s(weight) + s(length) + s(air) + s(water) + 
    s(meta) + s(depth)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.12541    0.04461   787.3   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
            edf Ref.df       F p-value    
s(BPM)    1.000  1.000   0.737   0.391    
s(weight) 1.000  1.000   0.269   0.604    
s(length) 1.000  1.000   1.494   0.222    
s(air)    1.000  1.000   0.767   0.382    
s(water)  3.834  4.743   0.628   0.636    
s(meta)   1.000  1.000   0.130   0.719    
s(depth)  3.213  4.075 127.899  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.512   Deviance explained = 52.3%
GCV = 1.0218  Scale est. = 0.99513   n = 500

# the summary of the model informed that only 52.3% of the deviance and fluctations seen in the blotch data can be explained by all of the variables conbined, which isnt alot meaning realistically the predictive power of this model is low.
# in addition to this the most significant predictor seen was depth with a o value of <2e-16

# plots each individual varaible, flat lines or chaotic lines indicate that they do not significantly contribute to the models predictive power however lines with obvious trends do.
par(mfrow = c(3, 3))  
plot(model_gam, se = TRUE, rug = TRUE)
# depth is the only one that seems to show any sort of trend. further supporting the results from the summary command that depth is the only significant varaible used as a predictor for blotching time.


# this function helps generate the predictions using the previously made model named model_gam and indicates that we will be using bootstrapping
predict_model <- function(data, indices) {boot_sample <- data[indices, ]
  
  # fit the model to the bootstrapping sample
  model <- gam(blotch ~ s(BPM) + s(weight) + s(length) + s(air) + s(water) + s(meta) + s(depth), data = boot_sample)
  
  return(predict(model, newdata = data))}

# this applys the bootstrapping in this case was 1000 interations to the model
n_bootstrap <- 1000  # Number of bootstrap samples

# this line now carries out the bootstrapping running the model 1000 times getting predictied values 1000 times
bootstrap_results <- boot(data = shark, statistic = predict_model, R = n_bootstrap)

# this generates a dataframe with each column coresponding to one individual prediction that took place in the bootstrap
predictions_matrix <- bootstrap_results$t  

# calculate the mean and confidence intervals for each prediction which then can be plotted onto the graph
mean_predictions <- apply(predictions_matrix, 2, mean)
lower_ci <- apply(predictions_matrix, 2, function(x) quantile(x, 0.025))  # 2.5% quantile (lower CI)
upper_ci <- apply(predictions_matrix, 2, function(x) quantile(x, 0.975))  # 97.5% quantile (upper CI)

# add the results from the calcuating the mean to the original data set so we can compare the predicted blotch time over the observed blotch time.
shark$predicted_blotch <- mean_predictions
shark$lower_ci <- lower_ci
shark$upper_ci <- upper_ci

# plot the predictions generated in the bootstrapping onto a graph which contains the original data to compare the predicted with the actual data observed. using depth as the main predictor in the graph as it contributed to the prediction the most.

ggplot(shark, aes(x = depth, y = predicted_blotch)) +
  geom_line(color = "red", size = 1) +
  geom_ribbon(aes(ymin = lower_ci, ymax = upper_ci), fill = "red", alpha = 0.2) +
  geom_point(aes(x = depth, y = blotch), color = "blue", alpha = 0.5) +
  labs(title = "Bootstrapped Effect of Depth on Blotching Time",
       x = "Depth", y = "Predicted Blotch Time") +
  theme_minimal()

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

# given that the previous model was informed from depth with a p value of <2e-16 and all other variables having large p values indicates it was the only significant variable influencing the predictive power for blotching time.
#this can be further seen in the plots that it is the only one with a positive trend and all the others are just flat.
#creating a model with just depth is then further created to see its true predictive power

as depth increases so does the time taken for blotching to cover 30% of the ventral surface

# as the multivraite test previously didnt reveal any significant results. simplifying the model to see if it would help the predictive power is the next option.
#creating a univariate statistical test is next using the same model so the results can be compared. using just depth which was the most significant predictor of blotching in the previous model. 
#creating a model with just depth can help simplify the model which makes it more accurate at measuring.
model_depth <- gam(blotch ~ s(depth), data = shark)

summary(model_depth)


Family: gaussian 
Link function: identity 

Formula:
blotch ~ s(depth)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.12541    0.04462   787.3   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
           edf Ref.df     F p-value    
s(depth) 3.057  3.884 134.8  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.512   Deviance explained = 51.5%
GCV = 1.0034  Scale est. = 0.99528   n = 500

# this model now has a 0.8% worse explanation for the deviance indicating that all of the other variables had no real influence on the previous model solidifying the reasoning behind just using depth.

#now using the same code as before just refined for the depth model, produce 1000 bootstraps
predict_model_depth <- function(data, indices) {boot_sample <- data[indices, ]
  model <- gam(blotch ~ s(depth), data = boot_sample)
  return(predict(model, newdata = data, se.fit = TRUE)$fit)}

n_bootstrap <- 1000 

bootstrap_results <- boot(data = shark, statistic = predict_model_depth, R = n_bootstrap)

predictions_matrix <- bootstrap_results$t 


mean_predictions <- apply(predictions_matrix, 2, mean)
lower_ci <- apply(predictions_matrix, 2, function(x) quantile(x, 0.025)) 
upper_ci <- apply(predictions_matrix, 2, function(x) quantile(x, 0.975))


shark$predicted_blotch <- mean_predictions
shark$lower_ci <- lower_ci
shark$upper_ci <- upper_ci


ggplot(shark, aes(x = depth, y = predicted_blotch)) +
  geom_line(color = "red", size = 1) + 
  geom_ribbon(aes(ymin = lower_ci, ymax = upper_ci), fill = "red", alpha = 0.2) +
  geom_point(aes(y = blotch), color = "blue", alpha = 0.5) +  
  labs(title = "Bootstrapped Effect of Depth on Blotching Time", 
       x = "Depth", 
       y = "Predicted Blotch Time") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

#you can see if the graph produced as there is less noise added to the calculation with the added variables it produces a smoother curve indicating that depth on its own is a better and more accurate predictor of blotching that all of the variables combined.

8 Step 8: can we predict corticosterone levels

to answer this question a simple correlation test can be conducted and also plotting two of the variables in the data set against each other, time to blotch and the corticosterone levels. given that this hormone is the primary stress hormone for elasmobranchs and up until now we have been assuming blotching is an indicator of stress. it may be useful to plot these two variables and run the previously ran multivariate statistics on the meta column as well.

blotch_meta_cor <- cor.test(shark$blotch, shark$meta, method = "spearman")
blotch_meta_cor


    Spearman's rank correlation rho

data:  shark$blotch and shark$meta
S = 21411160, p-value = 0.5359
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.02773979

# the correlation test showed no significant result indicating no correlation. however plotting the two variables against each other may prove useful in visualisation.

ggplot(data = shark, aes(x = meta, y = blotch)) +
  geom_point(color = "blue", alpha = 0.7) +  
  labs(title = "Scatterplot of Meta vs. Blotch",
       x = "Meta (Stress Hormone Levels)",
       y = "Blotch (Time to Blotch)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

# further understanding the correlation of the other variables and corticosterone could help understand the influence of the other variables on this stress hormone.

model_meta <- gam(meta ~ s(depth) + s(blotch) + s(weight) + s(length) + s(air) + s(water), data = shark)


summary(model_meta)


Family: gaussian 
Link function: identity 

Formula:
meta ~ s(depth) + s(blotch) + s(weight) + s(length) + s(air) + 
    s(water)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  82.0427     0.7705   106.5   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
            edf Ref.df     F p-value   
s(depth)  1.000  1.000 0.175 0.67583   
s(blotch) 1.000  1.000 0.104 0.74696   
s(weight) 1.000  1.000 0.175 0.67630   
s(length) 1.698  2.116 0.870 0.44742   
s(air)    2.807  3.483 3.760 0.00711 **
s(water)  2.489  3.105 1.195 0.32740   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.0241   Deviance explained = 4.36%
GCV = 303.55  Scale est. = 296.87    n = 500

par(mfrow = c(3, 3))  
plot(model_meta, se = TRUE, rug = TRUE, main = "Effect of Variables on Meta (Stress Hormone)")


predict_meta_model <- function(data, indices) {
 
  boot_sample <- data[indices, ] 
  
  
  model <- gam(meta ~ s(depth) + s(blotch) + s(weight) + s(length) + s(air) + s(water), data = shark)

  
  
  return(predict(model, newdata = data))
}


n_bootstrap <- 1000
bootstrap_results_meta <- boot(data = shark, 
                               statistic = predict_meta_model, 
                               R = n_bootstrap)


predictions_matrix_meta <- bootstrap_results_meta$t  


mean_predictions_meta <- apply(predictions_matrix_meta, 2, mean)
lower_ci_meta <- apply(predictions_matrix_meta, 2, function(x) quantile(x, 0.025))  
upper_ci_meta <- apply(predictions_matrix_meta, 2, function(x) quantile(x, 0.975))  


shark$predicted_meta <- mean_predictions_meta
shark$lower_ci_meta <- lower_ci_meta
shark$upper_ci_meta <- upper_ci_meta

#producing a graph for the most significant predictor in the model which was air due to its extremely low P value

ggplot(shark, aes(x = air, y = predicted_meta)) +
  geom_line(color = "red", size = 1) +
  geom_ribbon(aes(ymin = lower_ci_meta, ymax = upper_ci_meta), fill = "red", alpha = 0.2) +
  geom_point(aes(x = air, y = meta), color = "blue", alpha = 0.5) +
  labs(title = "Bootstrapped Effect of Air Temperature on Meta (Stress Hormone)",
       x = "Air Temperature", y = "Predicted Meta") +
  theme_minimal()


#plotting the temperature of water as even though it doesnt have a high p value after running the plot function to see which variable has an effect on the predicted value. water reveal an interesting trend.
ggplot(shark, aes(x = water, y = predicted_meta)) +
  geom_line(color = "red", size = 1) +
  geom_ribbon(aes(ymin = lower_ci_meta, ymax = upper_ci_meta), fill = "red", alpha = 0.2) +
  geom_point(aes(x = water, y = meta), color = "blue", alpha = 0.5) +
  labs(title = "Bootstrapped Effect of Water Temperature on Meta (Stress Hormone)",
       x = "Water Temperature", y = "Predicted Meta") +
  theme_minimal()

model_meta <- gam(meta ~ s(air), data = shark)


summary(model_meta)


Family: gaussian 
Link function: identity 

Formula:
meta ~ s(air)

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  82.0427     0.7711   106.4   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
         edf Ref.df     F p-value  
s(air) 2.918  3.621 3.392  0.0107 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.0226   Deviance explained = 2.83%
GCV = 299.68  Scale est. = 297.33    n = 500

par(mfrow = c(3, 3))  
plot(model_meta, se = TRUE, rug = TRUE, main = "Effect of Variables on Meta (Stress Hormone)")


predict_meta_model <- function(data, indices) {
 
  boot_sample <- data[indices, ] 
  
  
  model <- gam(meta ~ s(air), data = shark)

  
  
  return(predict(model, newdata = data))
}


n_bootstrap <- 1000
bootstrap_results_meta <- boot(data = shark, 
                               statistic = predict_meta_model, 
                               R = n_bootstrap)


predictions_matrix_meta <- bootstrap_results_meta$t  


mean_predictions_meta <- apply(predictions_matrix_meta, 2, mean)
lower_ci_meta <- apply(predictions_matrix_meta, 2, function(x) quantile(x, 0.025))  
upper_ci_meta <- apply(predictions_matrix_meta, 2, function(x) quantile(x, 0.975))  


shark$predicted_meta <- mean_predictions_meta
shark$lower_ci_meta <- lower_ci_meta
shark$upper_ci_meta <- upper_ci_meta


ggplot(shark, aes(x = air, y = predicted_meta)) +
  geom_line(color = "red", size = 1) +
  geom_ribbon(aes(ymin = lower_ci_meta, ymax = upper_ci_meta), fill = "red", alpha = 0.2) +
  geom_point(aes(x = air, y = meta), color = "blue", alpha = 0.5) +
  labs(title = "Bootstrapped Effect of Air Temperature on Meta (Stress Hormone)",
       x = "Air Temperature", y = "Predicted Meta") +
  theme_minimal()

#even after reducing the noise generated from other variables and attempting to simplify the model the predicted values are just too erratic indicating that none of the variables in this data set can be used to accurately predict the corticosterone levels i.e. meta.