Sharks Summative

R Code

Analysis displayed without figures and results

Click to expand Analysis
# Packages Required

library(tidyverse)
library(dplyr)
library(ggplot2)
library(corrplot)
library(psych)
library(GGally)
library(patchwork)


# Importing Data

sharks <- data.frame(read_csv("sharks.csv"))
sharksub <- data.frame(read.csv("sharksub.csv"))

# Data Summary

describe(sharks) # Exploring the dataset to understand range of data

summary(sharks$blotch) # Exploring key variable blotching
hist(sharks$blotch)

# Visualising all correlations
sharks$sex <- factor(sharks$sex, levels = c("Male", "Female"))
ggpairs(sharks, columns = 2:10, cardinality_threshold = 500, 
        columnLabels = c("Sex", "Blotch Time", "BPM", "Weight", "Length", "Air Temp", "Water Temp", "Meta", "Depth"), 
        proportions = "auto",
        lower = list(combo = "dot"),
        upper = list(combo = "box"))
# Indication of relationship between blotch ~ depth and meta ~ air.

# Stress Variables - CORRELATION

cor.test(sharks$blotch, sharks$BPM)
cor.test(sharks$blotch, sharks$meta)

bpmplot <- ggplot(sharks, aes(BPM, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Heart Rate (BPM)", y = "Blotching Time (Seconds)")
metaplot <- ggplot(sharks, aes(meta, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Meta (mcg/dl)", y = "Blotching Time (Seconds)")

stressplot <- bpmplot + metaplot # Creating combined plot
print(stressplot)

# Environmental Variable - CORRELATION

sharks1 <- sharks %>%
  mutate(temp_diff = air - water) # create new variable investigating temperature difference

cor.test(sharks$blotch, sharks$air)
cor.test(sharks$blotch, sharks$water)
cor.test(sharks$blotch, sharks$depth)
cor.test(sharks1$blotch, sharks1$temp_diff)

airplot <- ggplot(sharks, aes(air, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Air Temperature (Celcius)", y = "Blotching Time (Seconds)")
waterplot <- ggplot(sharks, aes(water, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Water Temperature (Celcius)", y = "Blotching Time (Seconds)")
depthplot <- ggplot(sharks, aes(depth, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Capture Depth (Metres)", y = "Blotching Time (Seconds)")
tempplot <- ggplot(sharks1, aes(temp_diff, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Temperature Difference (Celcius)", y = "Blotching Time (Seconds)")

envplot <- airplot + waterplot + tempplot + depthplot # Creating combined plot
print(envplot)


# Air and Water Temp Effects

summary(sharks$air)
summary(sharks$water)

# Relationship between air and water temperature
tempcor <- cor.test(sharks$air, sharks$water)
print(tempcor)

airplot1 <- ggplot(sharks, aes(water, air))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Water Temperature (Celcius)", y = "Air Temperature (Celcius)")


cor.test(sharks$meta, sharks$air)
airplot2 <- ggplot(sharks, aes(meta, air))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Meta (mcg/dl)", y = "Air Temperature (Celcius)")

airplot <- airplot1 + airplot2
print(airplot)




# Physiological Variable - CORRELATION + T TEST

cor.test(sharks$blotch, sharks$weight)
cor.test(sharks$blotch, sharks$length)

weightplot <- ggplot(sharks, aes(weight, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Weight (Kilograms)", y = "Blotching Time (Seconds)")
lengthplot <- ggplot(sharks, aes(length, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Length (Centimetres)", y = "Blotching Time (Seconds)")

physplot <- weightplot + lengthplot # Creating combined plot
print(physplot)

# Separating variables by sex

sharksmale <- sharks %>%
  filter(sex == "Male") %>%
  na.omit()
sharksfemale <- sharks %>%
  filter(sex == "Female") %>%
  na.omit()

# Comparing variables between sexes

t.test(sharksmale$blotch, sharksfemale$blotch,
       alternative = "two.sided",
       mu = 0, paired = FALSE)

sexplot <- ggplot(sharks, aes(sex, blotch))+
  geom_boxplot()+
  labs(x = "Sex", y = "Blotching Time (Seconds)")

# Investigating life stage

sharkslife <- sharks %>% # Assigning adult or juvenile based on length
  mutate(life_stage = case_when(
    sex == "Female" & length > 180 ~ "adult",
    sex == "Female" & length <= 180 ~ "juvenile",
    sex == "Male" & length > 170 ~ "adult",
    sex == "Male" & length <= 170 ~ "juvenile",
    TRUE ~ NA_character_))

table(sharkslife$life_stage) # count how many adult vs juvenile

t.test(blotch ~ life_stage, data = sharkslife)

lifeplot <- ggplot(sharkslife, aes(life_stage, blotch))+
  geom_boxplot()+
  labs(x = "Life Stage", y = "Blotching Time (Seconds)")
  
physplot2 <- sexplot + lifeplot # Creating combined plot
print(physplot2)




# Difference between first and second capture

summary(sharksub) # Exploring data

shapiro.test(sharksub$blotch1)
shapiro.test(sharksub$blotch2) # both normally distributed

t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)

summary(sharksub$blotch1)
summary(sharksub$blotch2)


sharks_long <- sharksub %>%
  pivot_longer(cols = c(blotch1, blotch2), 
               names_to = "capture", 
               values_to = "blotch",
               names_prefix = "blotch")

captureplot1 <- ggplot(sharks_long, aes(x = capture, y = blotch))+
  geom_boxplot()+
  labs(x= "Capture", y = "Blotching time (Seconds)")

# Are male and female sharks affected differently by a second capture?

capture_sex <- sharksub %>% 
  group_by(sex) %>% 
  summarise(mean_blotch1 = mean(blotch1), 
            mean_blotch2 = mean(blotch2), 
            mean_diff = mean(blotch2 - blotch1))
print(capture_sex)

timediff <- sharksub %>%
  mutate(timediff = blotch2 - blotch1)
t.test(timediff ~ sex, data = timediff)


captureplot2 <- ggplot(sharks_long, aes(x = capture, y = blotch, fill = sex))+
  geom_boxplot()+
  labs(x= "Capture", y = "Blotching time (Seconds)")+
  scale_fill_manual(values = c("Male" = "darkgrey", "Female" = "azure2"))

captureplot <- captureplot1 + captureplot2 # Creating combined plot
print(captureplot)


# Predicting Blotching Time

blotchprediction <- lm (blotch ~ depth, data = sharks) # Initial linear model based on previous results

summary(blotchprediction)

ggplot(sharks, aes(blotchprediction$residuals))+ # shows normal distribution of residuals
  geom_histogram()

# Can the model be improved?

backwardmodel <- lm(blotch ~ sex + BPM + meta + weight + length + air + water + depth + sharks1$temp_diff, sharks) 

backwardmodel <- step(backwardmodel, direction = "backward") # Stepwise backward linear regression to find variables that are best for model

# Best fit = length, sex and depth

blotchprediction2 <- lm(blotch ~ length + sex + depth, data = sharks)

summary(blotchprediction2) # slightly improved model

# Length is not contrubuting much to the model - does removing it improve the model?

blotchprediction3 <- lm(blotch ~ sex + depth, data = sharks)

summary(blotchprediction3)

# this is the best model
# Plotting the model

plot <- ggplot(sharks,aes(x=depth,y=blotch, colour = sex))+ 
  geom_point()+ 
  labs(x = "Depth (Metres)", y = "Blotching Time (Seconds)")+
  theme(legend.position = "none")

plot2 <- ggplot(sharks,aes(x=depth,y=blotch,shape=sex, colour = sex))+ 
  geom_smooth(method=lm,se=TRUE,fullrange=TRUE, 
              aes(color=sex))+
  labs(x = "Depth (Metres)", y = "Blotching Time (Seconds)")

lmplot <- plot + plot2
print(lmplot)

summary(sharks$depth) # check range of data

depth <- seq(40,60, by=0.5) # creating new data to test predictions
sex <- rep(c("Male", "Female"), length.out = length(depth))
sex <- as.factor(sex)

pred_grid <- expand.grid(depth = depth, sex = sex) # make new values to predict

pred_grid$blotch <-predict(blotchprediction3, new = pred_grid) # predict blotching time

# Check model

predict(blotchprediction3, data.frame(depth = 53.22635, sex = "Female"))

# predicted = 36.51418
# actual = 37.17081

predict(blotchprediction3, data.frame(depth = 50.95442, sex = "Male"))

# predicted = 35.6781
# actual = 35.66700

# Check ends of variables

predict(blotchprediction3, data.frame(depth = 44.64474, sex = "Male"))

# predicted = 32.51239
# actual = 32.53531

# Measure accuracy of the model
MAPE <- mean(abs(blotchprediction3$residuals / sharks$blotch *100))
accuracy <- 100 - MAPE
print(accuracy)


# Plot the predicted model
ggplot(pred_grid, aes(x = depth, y = blotch, colour = sex))+
  geom_smooth(method = "lm")+
  labs(x = "Depth (m)", y = "Blotching Time (seconds)", colour = "Sex")+
  scale_x_continuous(breaks = seq(40, 60, by = 2)) + 
  scale_y_continuous(breaks = seq(29, 41, by = 1))

Full analysis displayed with figures and results

Click to expand full analysis
# Packages Required

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(corrplot)
Warning: package 'corrplot' was built under R version 4.3.3
corrplot 0.95 loaded
library(psych)
Warning: package 'psych' was built under R version 4.3.3

Attaching package: 'psych'

The following objects are masked from 'package:ggplot2':

    %+%, alpha
library(GGally)
Warning: package 'GGally' was built under R version 4.3.3
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
library(patchwork)
Warning: package 'patchwork' was built under R version 4.3.3
# Importing Data

sharks <- data.frame(read_csv("sharks.csv"))
Rows: 500 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): ID, sex
dbl (8): blotch, BPM, weight, length, air, water, meta, depth

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sharksub <- data.frame(read.csv("sharksub.csv"))

# Data Summary

describe(sharks) # Exploring the dataset to understand range of data
       vars   n   mean     sd median trimmed    mad    min    max  range  skew
ID*       1 500 250.50 144.48 250.50  250.50 185.32   1.00 500.00 499.00  0.00
sex*      2 500   1.53   0.50   2.00    1.53   0.00   1.00   2.00   1.00 -0.11
blotch    3 500  35.13   1.43  35.05   35.11   1.38  30.78  40.08   9.31  0.15
BPM       4 500 141.76  14.14 142.00  141.54  19.27 119.00 166.00  47.00  0.08
weight    5 500  87.94  13.46  87.82   88.03  18.36  65.10 110.94  45.84 -0.02
length    6 500 211.04  46.61 211.07  211.24  59.51 128.25 290.95 162.70 -0.02
air       7 500  35.54   1.43  35.43   35.53   1.71  33.00  38.00   5.00  0.04
water     8 500  23.02   1.67  23.11   23.02   2.08  20.01  25.99   5.98 -0.01
meta      9 500  82.04  17.44  82.45   82.14  20.70  50.03 112.45  62.42 -0.05
depth    10 500  50.14   2.02  50.14   50.13   1.82  44.64  56.83  12.18  0.03
       kurtosis   se
ID*       -1.21 6.46
sex*      -1.99 0.02
blotch     0.20 0.06
BPM       -1.24 0.63
weight    -1.28 0.60
length    -1.19 2.08
air       -1.14 0.06
water     -1.15 0.07
meta      -1.10 0.78
depth     -0.06 0.09
summary(sharks$blotch) # Exploring key variable blotching
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  30.78   34.16   35.05   35.13   36.05   40.08 
hist(sharks$blotch)

# Visualising all correlations
sharks$sex <- factor(sharks$sex, levels = c("Male", "Female"))
ggpairs(sharks, columns = 2:10, cardinality_threshold = 500, 
        columnLabels = c("Sex", "Blotch Time", "BPM", "Weight", "Length", "Air Temp", "Water Temp", "Meta", "Depth"), 
        proportions = "auto",
        lower = list(combo = "dot"),
        upper = list(combo = "box"))

# Indication of relationship between blotch ~ depth and meta ~ air.

# Stress Variables - CORRELATION

cor.test(sharks$blotch, sharks$BPM)

    Pearson's product-moment correlation

data:  sharks$blotch and sharks$BPM
t = -0.65406, df = 498, p-value = 0.5134
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.11668743  0.05854438
sample estimates:
        cor 
-0.02929661 
cor.test(sharks$blotch, sharks$meta)

    Pearson's product-moment correlation

data:  sharks$blotch and sharks$meta
t = -0.21232, df = 498, p-value = 0.8319
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.09712341  0.07824201
sample estimates:
         cor 
-0.009513855 
bpmplot <- ggplot(sharks, aes(BPM, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Heart Rate (BPM)", y = "Blotching Time (Seconds)")
metaplot <- ggplot(sharks, aes(meta, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Meta (mcg/dl)", y = "Blotching Time (Seconds)")

stressplot <- bpmplot + metaplot # Creating combined plot
print(stressplot)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

# Environmental Variable - CORRELATION

sharks1 <- sharks %>%
  mutate(temp_diff = air - water) # create new variable investigating temperature difference

cor.test(sharks$blotch, sharks$air)

    Pearson's product-moment correlation

data:  sharks$blotch and sharks$air
t = -0.84005, df = 498, p-value = 0.4013
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.12489535  0.05023956
sample estimates:
        cor 
-0.03761675 
cor.test(sharks$blotch, sharks$water)

    Pearson's product-moment correlation

data:  sharks$blotch and sharks$water
t = -1.1542, df = 498, p-value = 0.249
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.13871605  0.03620077
sample estimates:
        cor 
-0.05165379 
cor.test(sharks$blotch, sharks$depth)

    Pearson's product-moment correlation

data:  sharks$blotch and sharks$depth
t = 22.772, df = 498, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6683963 0.7546509
sample estimates:
      cor 
0.7142247 
cor.test(sharks1$blotch, sharks1$temp_diff)

    Pearson's product-moment correlation

data:  sharks1$blotch and sharks1$temp_diff
t = 0.32238, df = 498, p-value = 0.7473
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.07333889  0.10200597
sample estimates:
       cor 
0.01444459 
airplot <- ggplot(sharks, aes(air, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Air Temperature (Celcius)", y = "Blotching Time (Seconds)")
waterplot <- ggplot(sharks, aes(water, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Water Temperature (Celcius)", y = "Blotching Time (Seconds)")
depthplot <- ggplot(sharks, aes(depth, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Capture Depth (Metres)", y = "Blotching Time (Seconds)")
tempplot <- ggplot(sharks1, aes(temp_diff, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Temperature Difference (Celcius)", y = "Blotching Time (Seconds)")

envplot <- airplot + waterplot + tempplot + depthplot # Creating combined plot
print(envplot)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

# Air and Water Temp Effects

summary(sharks$air)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  33.00   34.42   35.43   35.54   36.71   38.00 
summary(sharks$water)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  20.01   21.55   23.11   23.02   24.37   25.99 
# Relationship between air and water temperature
tempcor <- cor.test(sharks$air, sharks$water)
print(tempcor)

    Pearson's product-moment correlation

data:  sharks$air and sharks$water
t = -1.2346, df = 498, p-value = 0.2176
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.14224207  0.03260803
sample estimates:
        cor 
-0.05524051 
airplot1 <- ggplot(sharks, aes(water, air))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Water Temperature (Celcius)", y = "Air Temperature (Celcius)")


cor.test(sharks$meta, sharks$air)

    Pearson's product-moment correlation

data:  sharks$meta and sharks$air
t = 2.8188, df = 498, p-value = 0.005012
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.03804551 0.21069324
sample estimates:
     cor 
0.125318 
airplot2 <- ggplot(sharks, aes(meta, air))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Meta (mcg/dl)", y = "Air Temperature (Celcius)")

airplot <- airplot1 + airplot2
print(airplot)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

# Physiological Variable - CORRELATION + T TEST

cor.test(sharks$blotch, sharks$weight)

    Pearson's product-moment correlation

data:  sharks$blotch and sharks$weight
t = 0.20613, df = 498, p-value = 0.8368
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.07851766  0.09684867
sample estimates:
        cor 
0.009236525 
cor.test(sharks$blotch, sharks$length)

    Pearson's product-moment correlation

data:  sharks$blotch and sharks$length
t = -0.36562, df = 498, p-value = 0.7148
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.1039230  0.0714115
sample estimates:
        cor 
-0.01638167 
weightplot <- ggplot(sharks, aes(weight, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Weight (Kilograms)", y = "Blotching Time (Seconds)")
lengthplot <- ggplot(sharks, aes(length, blotch))+
  geom_point()+
  geom_smooth(method = "lm")+
  labs(x = "Length (Centimetres)", y = "Blotching Time (Seconds)")

physplot <- weightplot + lengthplot # Creating combined plot
print(physplot)
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

# Separating variables by sex

sharksmale <- sharks %>%
  filter(sex == "Male") %>%
  na.omit()
sharksfemale <- sharks %>%
  filter(sex == "Female") %>%
  na.omit()

# Comparing variables between sexes

t.test(sharksmale$blotch, sharksfemale$blotch,
       alternative = "two.sided",
       mu = 0, paired = FALSE)

    Welch Two Sample t-test

data:  sharksmale$blotch and sharksfemale$blotch
t = 3.0282, df = 494.67, p-value = 0.002589
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.1346620 0.6322714
sample estimates:
mean of x mean of y 
 35.30641  34.92294 
sexplot <- ggplot(sharks, aes(sex, blotch))+
  geom_boxplot()+
  labs(x = "Sex", y = "Blotching Time (Seconds)")

# Investigating life stage

sharkslife <- sharks %>% # Assigning adult or juvenile based on length
  mutate(life_stage = case_when(
    sex == "Female" & length > 180 ~ "adult",
    sex == "Female" & length <= 180 ~ "juvenile",
    sex == "Male" & length > 170 ~ "adult",
    sex == "Male" & length <= 170 ~ "juvenile",
    TRUE ~ NA_character_))

table(sharkslife$life_stage) # count how many adult vs juvenile

   adult juvenile 
     366      134 
t.test(blotch ~ life_stage, data = sharkslife)

    Welch Two Sample t-test

data:  blotch by life_stage
t = 0.9934, df = 242.26, p-value = 0.3215
alternative hypothesis: true difference in means between group adult and group juvenile is not equal to 0
95 percent confidence interval:
 -0.1390418  0.4219664
sample estimates:
   mean in group adult mean in group juvenile 
              35.16332               35.02186 
lifeplot <- ggplot(sharkslife, aes(life_stage, blotch))+
  geom_boxplot()+
  labs(x = "Life Stage", y = "Blotching Time (Seconds)")
  
physplot2 <- sexplot + lifeplot # Creating combined plot
print(physplot2)

# Difference between first and second capture

summary(sharksub) # Exploring data
      ID                sex               blotch1         blotch2     
 Length:50          Length:50          Min.   :32.49   Min.   :33.47  
 Class :character   Class :character   1st Qu.:34.38   1st Qu.:35.31  
 Mode  :character   Mode  :character   Median :34.94   Median :35.94  
                                       Mean   :35.03   Mean   :35.96  
                                       3rd Qu.:35.90   3rd Qu.:36.78  
                                       Max.   :37.07   Max.   :38.18  
shapiro.test(sharksub$blotch1)

    Shapiro-Wilk normality test

data:  sharksub$blotch1
W = 0.97958, p-value = 0.5345
shapiro.test(sharksub$blotch2) # both normally distributed

    Shapiro-Wilk normality test

data:  sharksub$blotch2
W = 0.97936, p-value = 0.5255
t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)

    Paired t-test

data:  sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384 
summary(sharksub$blotch1)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  32.49   34.38   34.94   35.03   35.90   37.07 
summary(sharksub$blotch2)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  33.47   35.31   35.94   35.96   36.78   38.18 
sharks_long <- sharksub %>%
  pivot_longer(cols = c(blotch1, blotch2), 
               names_to = "capture", 
               values_to = "blotch",
               names_prefix = "blotch")

captureplot1 <- ggplot(sharks_long, aes(x = capture, y = blotch))+
  geom_boxplot()+
  labs(x= "Capture", y = "Blotching time (Seconds)")

# Are male and female sharks affected differently by a second capture?

capture_sex <- sharksub %>% 
  group_by(sex) %>% 
  summarise(mean_blotch1 = mean(blotch1), 
            mean_blotch2 = mean(blotch2), 
            mean_diff = mean(blotch2 - blotch1))
print(capture_sex)
# A tibble: 2 × 4
  sex    mean_blotch1 mean_blotch2 mean_diff
  <chr>         <dbl>        <dbl>     <dbl>
1 Female         34.8         35.8     0.952
2 Male           35.3         36.2     0.908
timediff <- sharksub %>%
  mutate(timediff = blotch2 - blotch1)
t.test(timediff ~ sex, data = timediff)

    Welch Two Sample t-test

data:  timediff by sex
t = 0.40707, df = 44.541, p-value = 0.6859
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
 -0.1733770  0.2611792
sample estimates:
mean in group Female   mean in group Male 
           0.9516890            0.9077879 
captureplot2 <- ggplot(sharks_long, aes(x = capture, y = blotch, fill = sex))+
  geom_boxplot()+
  labs(x= "Capture", y = "Blotching time (Seconds)")+
  scale_fill_manual(values = c("Male" = "darkgrey", "Female" = "azure2"))

captureplot <- captureplot1 + captureplot2 # Creating combined plot
print(captureplot)

# Predicting Blotching Time

blotchprediction <- lm (blotch ~ depth, data = sharks) # Initial linear model based on previous results

summary(blotchprediction)

Call:
lm(formula = blotch ~ depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.81869 -0.65427 -0.01035  0.58825  2.83116 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.82178    1.11207   8.832   <2e-16 ***
depth        0.50467    0.02216  22.772   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared:  0.5101,    Adjusted R-squared:  0.5091 
F-statistic: 518.6 on 1 and 498 DF,  p-value: < 2.2e-16
ggplot(sharks, aes(blotchprediction$residuals))+ # shows normal distribution of residuals
  geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Can the model be improved?

backwardmodel <- lm(blotch ~ sex + BPM + meta + weight + length + air + water + depth + sharks1$temp_diff, sharks) 

backwardmodel <- step(backwardmodel, direction = "backward") # Stepwise backward linear regression to find variables that are best for model
Start:  AIC=0.08
blotch ~ sex + BPM + meta + weight + length + air + water + depth + 
    sharks1$temp_diff


Step:  AIC=0.08
blotch ~ sex + BPM + meta + weight + length + air + water + depth

         Df Sum of Sq    RSS    AIC
- meta    1      0.20 482.60  -1.71
- weight  1      0.27 482.67  -1.64
- water   1      0.28 482.68  -1.62
- BPM     1      0.43 482.83  -1.47
- air     1      0.95 483.35  -0.93
- length  1      1.81 484.21  -0.04
<none>                482.40   0.08
- sex     1     11.82 494.22  10.18
- depth   1    510.38 992.78 358.95

Step:  AIC=-1.71
blotch ~ sex + BPM + weight + length + air + water + depth

         Df Sum of Sq    RSS    AIC
- weight  1      0.26 482.86  -3.44
- water   1      0.30 482.90  -3.40
- BPM     1      0.43 483.03  -3.26
- air     1      1.08 483.68  -2.59
- length  1      1.80 484.40  -1.85
<none>                482.60  -1.71
- sex     1     11.76 494.36   8.33
- depth   1    510.21 992.82 356.97

Step:  AIC=-3.44
blotch ~ sex + BPM + length + air + water + depth

         Df Sum of Sq    RSS    AIC
- water   1      0.26 483.11  -5.18
- BPM     1      0.42 483.28  -5.01
- air     1      1.14 483.99  -4.27
- length  1      1.78 484.64  -3.61
<none>                482.86  -3.44
- sex     1     11.73 494.59   6.56
- depth   1    510.13 992.99 355.05

Step:  AIC=-5.18
blotch ~ sex + BPM + length + air + depth

         Df Sum of Sq    RSS    AIC
- BPM     1      0.43 483.55  -6.73
- air     1      1.08 484.19  -6.06
- length  1      1.87 484.99  -5.24
<none>                483.11  -5.18
- sex     1     11.93 495.04   5.02
- depth   1    512.18 995.30 354.22

Step:  AIC=-6.73
blotch ~ sex + length + air + depth

         Df Sum of Sq    RSS    AIC
- air     1      0.99 484.54  -7.71
<none>                483.55  -6.73
- length  1      2.02 485.56  -6.65
- sex     1     11.89 495.43   3.41
- depth   1    512.95 996.50 352.82

Step:  AIC=-7.71
blotch ~ sex + length + depth

         Df Sum of Sq    RSS    AIC
<none>                484.54  -7.71
- length  1      2.11 486.64  -7.54
- sex     1     11.68 496.22   2.20
- depth   1    513.79 998.32 351.73
# Best fit = length, sex and depth

blotchprediction2 <- lm(blotch ~ length + sex + depth, data = sharks)

summary(blotchprediction2) # slightly improved model

Call:
lm(formula = blotch ~ length + sex + depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.96351 -0.64800 -0.02128  0.62242  2.91815 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.6857036  1.1400938   8.496 2.31e-16 ***
length       0.0013987  0.0009527   1.468 0.142713    
sexFemale   -0.3064815  0.0886293  -3.458 0.000591 ***
depth        0.5043852  0.0219935  22.933  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9884 on 496 degrees of freedom
Multiple R-squared:  0.5235,    Adjusted R-squared:  0.5206 
F-statistic: 181.6 on 3 and 496 DF,  p-value: < 2.2e-16
# Length is not contrubuting much to the model - does removing it improve the model?

blotchprediction3 <- lm(blotch ~ sex + depth, data = sharks)

summary(blotchprediction3)

Call:
lm(formula = blotch ~ sex + depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.96224 -0.64833 -0.01317  0.58882  2.98829 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.11317    1.10357   9.164  < 2e-16 ***
sexFemale   -0.30379    0.08871  -3.424 0.000667 ***
depth        0.50172    0.02194  22.864  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9895 on 497 degrees of freedom
Multiple R-squared:  0.5214,    Adjusted R-squared:  0.5195 
F-statistic: 270.7 on 2 and 497 DF,  p-value: < 2.2e-16
# this is the best model
# Plotting the model

plot <- ggplot(sharks,aes(x=depth,y=blotch, colour = sex))+ 
  geom_point()+ 
  labs(x = "Depth (Metres)", y = "Blotching Time (Seconds)")+
  theme(legend.position = "none")

plot2 <- ggplot(sharks,aes(x=depth,y=blotch,shape=sex, colour = sex))+ 
  geom_smooth(method=lm,se=TRUE,fullrange=TRUE, 
              aes(color=sex))+
  labs(x = "Depth (Metres)", y = "Blotching Time (Seconds)")

lmplot <- plot + plot2
print(lmplot)
`geom_smooth()` using formula = 'y ~ x'

summary(sharks$depth) # check range of data
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  44.64   48.90   50.14   50.14   51.35   56.83 
depth <- seq(40,60, by=0.5) # creating new data to test predictions
sex <- rep(c("Male", "Female"), length.out = length(depth))
sex <- as.factor(sex)

pred_grid <- expand.grid(depth = depth, sex = sex) # make new values to predict

pred_grid$blotch <-predict(blotchprediction3, new = pred_grid) # predict blotching time

# Check model

predict(blotchprediction3, data.frame(depth = 53.22635, sex = "Female"))
       1 
36.51418 
# predicted = 36.51418
# actual = 37.17081

predict(blotchprediction3, data.frame(depth = 50.95442, sex = "Male"))
      1 
35.6781 
# predicted = 35.6781
# actual = 35.66700

# Check ends of variables

predict(blotchprediction3, data.frame(depth = 44.64474, sex = "Male"))
       1 
32.51239 
# predicted = 32.51239
# actual = 32.53531

# Measure accuracy of the model
MAPE <- mean(abs(blotchprediction3$residuals / sharks$blotch *100))
accuracy <- 100 - MAPE
print(accuracy)
[1] 97.76933
# Plot the predicted model
ggplot(pred_grid, aes(x = depth, y = blotch, colour = sex))+
  geom_smooth(method = "lm")+
  labs(x = "Depth (m)", y = "Blotching Time (seconds)", colour = "Sex")+
  scale_x_continuous(breaks = seq(40, 60, by = 2)) + 
  scale_y_continuous(breaks = seq(29, 41, by = 1))
`geom_smooth()` using formula = 'y ~ x'