Shana’s Summative

Preliminary checks

First I will load all libraries that I will use

library (tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(modeldata)
library(ggplot2)
library(dplyr)
library(ggpubr)
library(PairedData)

Loading required package: MASS

Attaching package: 'MASS'

The following object is masked from 'package:dplyr':

    select

Loading required package: gld
Loading required package: mvtnorm
Loading required package: lattice

Attaching package: 'PairedData'

The following object is masked from 'package:base':

    summary

Load and view structure of all data

library(readxl)
sharks <- read_excel("C:/Users/shana/OneDrive/Desktop/Research methods/Summative/sharks.xlsx")
View(sharks)
str(sharks)

tibble [500 × 10] (S3: tbl_df/tbl/data.frame)
 $ ID    : chr [1:500] "SH001" "SH002" "SH003" "SH004" ...
 $ sex   : chr [1:500] "Female" "Female" "Female" "Male" ...
 $ blotch: num [1:500] 37.2 34.5 36.3 35.3 37.4 ...
 $ BPM   : num [1:500] 148 158 125 161 138 126 166 135 132 127 ...
 $ weight: num [1:500] 74.7 73.4 71.8 104.6 67.1 ...
 $ length: num [1:500] 187 189 284 171 264 ...
 $ air   : num [1:500] 37.7 35.7 34.8 36.2 33.6 ...
 $ water : num [1:500] 23.4 21.4 20.1 21.6 21.8 ...
 $ meta  : num [1:500] 64.1 73.7 54.4 86.3 108 ...
 $ depth : num [1:500] 53.2 49.6 49.4 50.3 49 ...

sharksub <- read_excel("C:/Users/shana/OneDrive/Desktop/Research methods/Summative/sharksub.xlsx")
View(sharksub)
str(sharksub)

tibble [50 × 4] (S3: tbl_df/tbl/data.frame)
 $ ID     : chr [1:50] "SH269" "SH163" "SH008" "SH239" ...
 $ sex    : chr [1:50] "Female" "Female" "Female" "Female" ...
 $ blotch1: num [1:50] 36.1 33.4 36.3 35 35.7 ...
 $ blotch2: num [1:50] 37.2 34.4 36.5 36 36.8 ...

View head and tail, dimensions, columns and summary statistics of data

head(sharks)

# A tibble: 6 × 10
  ID    sex    blotch   BPM weight length   air water  meta depth
  <chr> <chr>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH001 Female   37.2   148   74.7   187.  37.7  23.4  64.1  53.2
2 SH002 Female   34.5   158   73.4   189.  35.7  21.4  73.7  49.6
3 SH003 Female   36.3   125   71.8   284.  34.8  20.1  54.4  49.4
4 SH004 Male     35.3   161  105.    171.  36.2  21.6  86.3  50.3
5 SH005 Female   37.4   138   67.1   264.  33.6  21.8 108.   49.0
6 SH006 Male     33.5   126  110.    270.  36.4  20.9 109.   46.8

tail(sharks)

# A tibble: 6 × 10
  ID    sex    blotch   BPM weight length   air water  meta depth
  <chr> <chr>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH495 Female   34.9   165  104.    149.  33.4  21.1  59.2  47.3
2 SH496 Male     34.6   150  109.    257.  34.9  23.7  60.4  48.5
3 SH497 Male     32.1   127   97.3   264.  33.4  24.9  54.1  45.4
4 SH498 Female   37.1   151   88.9   142.  35.9  22.0  82.4  53.1
5 SH499 Male     35.3   155   78.1   148.  33.3  22.1  99.2  52.1
6 SH500 Female   35.3   143  105.    159.  36.1  24.6  87.1  49.3

dim(sharks)

[1] 500  10

nrow(sharks)

[1] 500

ncol(sharks)

[1] 10

colnames(sharks)

 [1] "ID"     "sex"    "blotch" "BPM"    "weight" "length" "air"    "water" 
 [9] "meta"   "depth"

summary(sharks)

      ID                sex                blotch           BPM       
 Length:500         Length:500         Min.   :30.78   Min.   :119.0  
 Class :character   Class :character   1st Qu.:34.16   1st Qu.:129.0  
 Mode  :character   Mode  :character   Median :35.05   Median :142.0  
                                       Mean   :35.13   Mean   :141.8  
                                       3rd Qu.:36.05   3rd Qu.:153.2  
                                       Max.   :40.08   Max.   :166.0  
     weight           length           air            water      
 Min.   : 65.10   Min.   :128.3   Min.   :33.00   Min.   :20.01  
 1st Qu.: 75.68   1st Qu.:172.0   1st Qu.:34.42   1st Qu.:21.55  
 Median : 87.82   Median :211.1   Median :35.43   Median :23.11  
 Mean   : 87.94   Mean   :211.0   Mean   :35.54   Mean   :23.02  
 3rd Qu.:100.40   3rd Qu.:251.8   3rd Qu.:36.71   3rd Qu.:24.37  
 Max.   :110.94   Max.   :291.0   Max.   :38.00   Max.   :25.99  
      meta            depth      
 Min.   : 50.03   Min.   :44.64  
 1st Qu.: 67.39   1st Qu.:48.90  
 Median : 82.45   Median :50.14  
 Mean   : 82.04   Mean   :50.14  
 3rd Qu.: 95.97   3rd Qu.:51.35  
 Max.   :112.45   Max.   :56.83

head(sharksub)

# A tibble: 6 × 4
  ID    sex    blotch1 blotch2
  <chr> <chr>    <dbl>   <dbl>
1 SH269 Female    36.1    37.2
2 SH163 Female    33.4    34.4
3 SH008 Female    36.3    36.5
4 SH239 Female    35.0    36.0
5 SH332 Female    35.7    36.8
6 SH328 Female    34.9    35.9

tail(sharksub)

# A tibble: 6 × 4
  ID    sex   blotch1 blotch2
  <chr> <chr>   <dbl>   <dbl>
1 SH260 Male     36.4    37.5
2 SH357 Male     33.5    34.5
3 SH199 Male     37.1    38.2
4 SH413 Male     34.3    35.3
5 SH004 Male     35.3    35.6
6 SH307 Male     34.5    34.1

dim(sharksub)

[1] 50  4

nrow(sharksub)

[1] 50

ncol(sharksub)

[1] 4

colnames(sharksub)

[1] "ID"      "sex"     "blotch1" "blotch2"

summary(sharksub)

      ID                sex               blotch1         blotch2     
 Length:50          Length:50          Min.   :32.49   Min.   :33.47  
 Class :character   Class :character   1st Qu.:34.38   1st Qu.:35.31  
 Mode  :character   Mode  :character   Median :34.94   Median :35.94  
                                       Mean   :35.03   Mean   :35.96  
                                       3rd Qu.:35.90   3rd Qu.:36.78  
                                       Max.   :37.07   Max.   :38.18

Check for missing data

any(is.na(sharks))

[1] FALSE

any(is.na(sharksub))

[1] FALSE

Both are false so no need to handle missing values
I will rename the column name in the first dataset to match the second: turn blotch into blotch1

colnames(sharks)[colnames(sharks) == "blotch"] <- "blotch1"
colnames(sharks)

 [1] "ID"      "sex"     "blotch1" "BPM"     "weight"  "length"  "air"    
 [8] "water"   "meta"    "depth"

View(sharks)

This merging is in preparation for the second question:
I will merge the two sheets by the relevant columns (ID, blotch1, and sex) and then print

common_data <- merge(sharks, sharksub, by = c("ID", "blotch1", "sex"))
print(common_data)

      ID  blotch1    sex BPM    weight   length      air    water      meta
1  SH004 35.33881   Male 161 104.62985 171.0986 36.15973 21.64319  86.32615
2  SH008 36.29497 Female 135 101.20073 128.4664 36.77278 21.30411  96.33309
3  SH029 35.17775   Male 122  68.97648 218.6683 37.60242 23.31552 105.39974
4  SH049 35.54855 Female 166  66.42762 250.3885 36.53131 25.36878  83.54757
5  SH059 36.50001   Male 119 106.96948 217.8249 34.65448 24.72011  76.39855
6  SH070 32.85958   Male 158  83.81398 261.6298 33.45248 25.94927  56.49589
7  SH087 33.98649 Female 129  87.20821 241.8463 35.29138 24.02701  72.45724
8  SH101 36.42572   Male 134  69.26562 196.4749 37.99978 23.93187  68.47177
9  SH108 36.68703   Male 135  73.29044 179.1079 35.37759 24.89065  91.28896
10 SH110 34.65632   Male 134  78.89234 191.1178 36.70432 21.09131  54.94158
11 SH115 34.93594   Male 127  73.88824 199.1637 33.51968 21.84163  91.43666
12 SH124 35.92057   Male 145  92.32507 160.4630 36.10210 21.00988  64.82984
13 SH150 33.11113 Female 131  65.47062 145.7330 37.10396 24.76164 105.00771
14 SH152 36.48702   Male 146  73.59540 254.4253 36.94934 21.49606  84.87480
15 SH163 33.38396 Female 148  96.73251 189.9405 33.91255 25.18574  69.99987
16 SH178 34.45879 Female 155  76.42407 267.4729 34.28698 22.10824  90.42851
17 SH199 37.07165   Male 159  74.37317 185.9547 33.68490 20.50748  55.51194
18 SH211 33.26947 Female 164  81.88777 215.2635 33.81956 20.12551  96.40223
19 SH220 34.47858 Female 131  94.67604 273.5521 34.11300 23.57607  51.02802
20 SH221 35.99021 Female 129 108.59903 195.2602 33.51585 24.20466  68.93962
21 SH226 35.02983   Male 143  91.89068 176.1361 34.92279 25.98523  78.28148
22 SH229 35.81950 Female 125  74.82072 265.5075 37.44526 21.34394  51.62363
23 SH239 34.98931 Female 141  89.64871 231.7945 37.84878 22.19589  89.19871
24 SH245 34.88511 Female 137  85.28268 279.2720 34.57891 21.22774  91.39811
25 SH254 34.73333   Male 143  93.27573 259.4982 37.73990 20.80868  97.52831
26 SH258 32.49322 Female 166 105.96149 202.6149 34.70281 25.11431  89.97645
27 SH260 36.42890   Male 150  93.34433 241.1359 34.45962 21.29792 112.17231
28 SH269 36.07201 Female 151  97.14867 138.5105 36.13022 20.94955  78.88339
29 SH274 34.57425   Male 133  91.31657 278.6965 37.57242 23.19948  76.05037
30 SH276 35.34228   Male 152  94.25187 230.4080 33.37476 25.52029  71.59681
31 SH285 34.24796   Male 161 106.74323 283.3528 37.21471 21.59466  77.15406
32 SH296 33.60120   Male 162 105.00764 169.0551 36.47813 23.64483  50.81773
33 SH298 34.79932 Female 130  90.76888 264.8987 34.76677 24.96128  70.07991
34 SH307 34.52245   Male 119  73.22300 213.4676 34.55394 21.67472  73.22178
35 SH315 34.27528 Female 165  92.35764 139.8416 36.54493 22.17074  91.75451
36 SH327 36.83684 Female 151  75.76946 271.5658 34.19946 22.47481  91.61082
37 SH328 34.90283 Female 161 105.19883 198.1164 37.51192 20.65386 104.47734
38 SH332 35.70572 Female 155  83.20046 254.7629 34.42881 25.58226  76.14968
39 SH338 36.11878 Female 143 101.53496 135.8858 37.43137 25.46580  81.58295
40 SH357 33.53000   Male 122  75.67982 277.3364 37.70574 22.32384  82.82668
41 SH365 35.28866   Male 135  76.84522 197.9511 37.96769 24.35740  97.18871
42 SH374 34.74200 Female 149 105.15749 266.7394 37.49456 23.97601  90.81482
43 SH393 34.93960 Female 161 104.99774 202.7821 33.63340 23.27929  94.20890
44 SH405 35.70079   Male 161  83.27224 179.4364 36.16117 23.61640 110.41629
45 SH410 34.36868 Female 121  72.33262 144.0637 33.37066 22.60196 105.54567
46 SH413 34.29493   Male 161  65.40377 251.0183 35.66119 22.64759  95.98636
47 SH424 35.63334   Male 158  72.86885 163.4943 37.85090 22.38551  65.26609
48 SH430 36.37596   Male 156  99.27437 290.7846 35.09654 20.84968  93.20290
49 SH450 34.41429 Female 153 107.48861 222.3305 33.15764 24.94298 100.52145
50 SH487 34.27203 Female 129 105.41863 193.2823 36.88931 25.69646  71.03947
      depth  blotch2
1  50.29711 35.60995
2  50.59650 36.46102
3  50.59854 36.23308
4  49.55720 36.61501
5  53.78593 37.59501
6  47.48592 33.84537
7  51.84685 35.00608
8  51.31445 37.51849
9  49.80048 37.78764
10 49.89161 35.69601
11 51.96076 35.98402
12 51.01759 35.52544
13 50.64477 34.10446
14 50.48023 37.58163
15 48.43633 34.38548
16 49.16008 35.49255
17 50.46984 38.18380
18 46.08500 34.26755
19 50.53704 35.51294
20 49.06487 37.06992
21 50.49601 36.08072
22 49.25833 36.89408
23 51.13299 36.03899
24 46.95435 35.93166
25 50.45905 35.77533
26 48.11986 33.46802
27 49.40703 37.52177
28 52.43260 37.15417
29 48.96359 35.61148
30 48.87889 36.40255
31 48.65615 35.27540
32 49.53097 34.60924
33 47.52810 35.84330
34 49.02755 34.07366
35 47.94731 35.30354
36 49.70779 37.94195
37 51.20013 35.94991
38 51.79984 36.77689
39 51.30954 37.20234
40 48.78801 34.53590
41 50.46341 36.34732
42 49.21944 34.39458
43 50.95881 35.98779
44 50.37530 36.77181
45 49.88061 35.39974
46 50.23290 35.32378
47 52.22695 36.70234
48 52.85327 37.46724
49 52.37204 35.44672
50 48.89706 35.30019

Question 1

Q1. Is there a correlation between the variables air and water?

Visualize the correlation between air and water using a scatterplot

ggplot(sharks, aes(x = air,
                     y = water)) +
  geom_point(color = "navy") + 
  geom_smooth(method="lm",se=TRUE, aes(group =1), colour="maroon") +
  labs(x = "Air Temperature (°C)",
       y = "Water Temperature (°C)", 
       title = "Relationship between Air and Water Temperature") +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

There is no visually recognizable correlation between the two variables

There is no correlation between the variables air and water

Question 2

Q2. Does multiple capture have an effect on blotching time?

View merged data: this dataset only contains sharks that had multiple capture events and include both botching times

View(common_data)
print(common_data)

      ID  blotch1    sex BPM    weight   length      air    water      meta
1  SH004 35.33881   Male 161 104.62985 171.0986 36.15973 21.64319  86.32615
2  SH008 36.29497 Female 135 101.20073 128.4664 36.77278 21.30411  96.33309
3  SH029 35.17775   Male 122  68.97648 218.6683 37.60242 23.31552 105.39974
4  SH049 35.54855 Female 166  66.42762 250.3885 36.53131 25.36878  83.54757
5  SH059 36.50001   Male 119 106.96948 217.8249 34.65448 24.72011  76.39855
6  SH070 32.85958   Male 158  83.81398 261.6298 33.45248 25.94927  56.49589
7  SH087 33.98649 Female 129  87.20821 241.8463 35.29138 24.02701  72.45724
8  SH101 36.42572   Male 134  69.26562 196.4749 37.99978 23.93187  68.47177
9  SH108 36.68703   Male 135  73.29044 179.1079 35.37759 24.89065  91.28896
10 SH110 34.65632   Male 134  78.89234 191.1178 36.70432 21.09131  54.94158
11 SH115 34.93594   Male 127  73.88824 199.1637 33.51968 21.84163  91.43666
12 SH124 35.92057   Male 145  92.32507 160.4630 36.10210 21.00988  64.82984
13 SH150 33.11113 Female 131  65.47062 145.7330 37.10396 24.76164 105.00771
14 SH152 36.48702   Male 146  73.59540 254.4253 36.94934 21.49606  84.87480
15 SH163 33.38396 Female 148  96.73251 189.9405 33.91255 25.18574  69.99987
16 SH178 34.45879 Female 155  76.42407 267.4729 34.28698 22.10824  90.42851
17 SH199 37.07165   Male 159  74.37317 185.9547 33.68490 20.50748  55.51194
18 SH211 33.26947 Female 164  81.88777 215.2635 33.81956 20.12551  96.40223
19 SH220 34.47858 Female 131  94.67604 273.5521 34.11300 23.57607  51.02802
20 SH221 35.99021 Female 129 108.59903 195.2602 33.51585 24.20466  68.93962
21 SH226 35.02983   Male 143  91.89068 176.1361 34.92279 25.98523  78.28148
22 SH229 35.81950 Female 125  74.82072 265.5075 37.44526 21.34394  51.62363
23 SH239 34.98931 Female 141  89.64871 231.7945 37.84878 22.19589  89.19871
24 SH245 34.88511 Female 137  85.28268 279.2720 34.57891 21.22774  91.39811
25 SH254 34.73333   Male 143  93.27573 259.4982 37.73990 20.80868  97.52831
26 SH258 32.49322 Female 166 105.96149 202.6149 34.70281 25.11431  89.97645
27 SH260 36.42890   Male 150  93.34433 241.1359 34.45962 21.29792 112.17231
28 SH269 36.07201 Female 151  97.14867 138.5105 36.13022 20.94955  78.88339
29 SH274 34.57425   Male 133  91.31657 278.6965 37.57242 23.19948  76.05037
30 SH276 35.34228   Male 152  94.25187 230.4080 33.37476 25.52029  71.59681
31 SH285 34.24796   Male 161 106.74323 283.3528 37.21471 21.59466  77.15406
32 SH296 33.60120   Male 162 105.00764 169.0551 36.47813 23.64483  50.81773
33 SH298 34.79932 Female 130  90.76888 264.8987 34.76677 24.96128  70.07991
34 SH307 34.52245   Male 119  73.22300 213.4676 34.55394 21.67472  73.22178
35 SH315 34.27528 Female 165  92.35764 139.8416 36.54493 22.17074  91.75451
36 SH327 36.83684 Female 151  75.76946 271.5658 34.19946 22.47481  91.61082
37 SH328 34.90283 Female 161 105.19883 198.1164 37.51192 20.65386 104.47734
38 SH332 35.70572 Female 155  83.20046 254.7629 34.42881 25.58226  76.14968
39 SH338 36.11878 Female 143 101.53496 135.8858 37.43137 25.46580  81.58295
40 SH357 33.53000   Male 122  75.67982 277.3364 37.70574 22.32384  82.82668
41 SH365 35.28866   Male 135  76.84522 197.9511 37.96769 24.35740  97.18871
42 SH374 34.74200 Female 149 105.15749 266.7394 37.49456 23.97601  90.81482
43 SH393 34.93960 Female 161 104.99774 202.7821 33.63340 23.27929  94.20890
44 SH405 35.70079   Male 161  83.27224 179.4364 36.16117 23.61640 110.41629
45 SH410 34.36868 Female 121  72.33262 144.0637 33.37066 22.60196 105.54567
46 SH413 34.29493   Male 161  65.40377 251.0183 35.66119 22.64759  95.98636
47 SH424 35.63334   Male 158  72.86885 163.4943 37.85090 22.38551  65.26609
48 SH430 36.37596   Male 156  99.27437 290.7846 35.09654 20.84968  93.20290
49 SH450 34.41429 Female 153 107.48861 222.3305 33.15764 24.94298 100.52145
50 SH487 34.27203 Female 129 105.41863 193.2823 36.88931 25.69646  71.03947
      depth  blotch2
1  50.29711 35.60995
2  50.59650 36.46102
3  50.59854 36.23308
4  49.55720 36.61501
5  53.78593 37.59501
6  47.48592 33.84537
7  51.84685 35.00608
8  51.31445 37.51849
9  49.80048 37.78764
10 49.89161 35.69601
11 51.96076 35.98402
12 51.01759 35.52544
13 50.64477 34.10446
14 50.48023 37.58163
15 48.43633 34.38548
16 49.16008 35.49255
17 50.46984 38.18380
18 46.08500 34.26755
19 50.53704 35.51294
20 49.06487 37.06992
21 50.49601 36.08072
22 49.25833 36.89408
23 51.13299 36.03899
24 46.95435 35.93166
25 50.45905 35.77533
26 48.11986 33.46802
27 49.40703 37.52177
28 52.43260 37.15417
29 48.96359 35.61148
30 48.87889 36.40255
31 48.65615 35.27540
32 49.53097 34.60924
33 47.52810 35.84330
34 49.02755 34.07366
35 47.94731 35.30354
36 49.70779 37.94195
37 51.20013 35.94991
38 51.79984 36.77689
39 51.30954 37.20234
40 48.78801 34.53590
41 50.46341 36.34732
42 49.21944 34.39458
43 50.95881 35.98779
44 50.37530 36.77181
45 49.88061 35.39974
46 50.23290 35.32378
47 52.22695 36.70234
48 52.85327 37.46724
49 52.37204 35.44672
50 48.89706 35.30019

Convert wide data to a long data format so that using ggplot2 is easier: instead of a column each for blotch1 and blotch2, there is a column for capture and another for blotch time

my_data_long <- pivot_longer(common_data, 
                             cols = c(blotch1, blotch2), 
                             names_to = "capture", 
                             values_to = "blotch_time")
head(my_data_long)

# A tibble: 6 × 11
  ID    sex      BPM weight length   air water  meta depth capture blotch_time
  <chr> <chr>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <chr>         <dbl>
1 SH004 Male     161  105.    171.  36.2  21.6  86.3  50.3 blotch1        35.3
2 SH004 Male     161  105.    171.  36.2  21.6  86.3  50.3 blotch2        35.6
3 SH008 Female   135  101.    128.  36.8  21.3  96.3  50.6 blotch1        36.3
4 SH008 Female   135  101.    128.  36.8  21.3  96.3  50.6 blotch2        36.5
5 SH029 Male     122   69.0   219.  37.6  23.3 105.   50.6 blotch1        35.2
6 SH029 Male     122   69.0   219.  37.6  23.3 105.   50.6 blotch2        36.2

View as a boxlplot

ggboxplot(my_data_long, x = "capture", y = "blotch_time", 
          color = "capture", palette = c("#DC143C", "#87CEEB"),
          ylab = "Blotch Time (Seconds)") +
  labs(color = "Capture Event") +  
  scale_color_manual(values = c("#DC143C", "#87CEEB"), 
                     labels = c("First Capture", "Second Capture")) +  
  theme(axis.title.x = element_blank(), 
        axis.text.x = element_blank())

Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

As I have already created a data set with sharks that have all been captured twice, there is no need to subset my data
Now I plot the paired data

ggplot(my_data_long, aes(x = factor(ID), y = blotch_time, color = capture)) +
  geom_line(aes(group = ID), color = "gray") +  # Lines to show pairing
  geom_point() +
  labs(x = "Shark ID", y = "Blotch Time (Seconds)") +
  theme_bw()

Now, I reshape the data back to wide format to calculate the difference between blotch1 and blotch2

my_data_wide <- pivot_wider(my_data_long, 
                            names_from = "capture", 
                            values_from = "blotch_time")

Calculate the difference between blotch1 and blotch2 for each shark

my_data_wide$difference <- my_data_wide$blotch2 - my_data_wide$blotch1

View the data with the difference

View(my_data_wide)
print(my_data_wide)

# A tibble: 50 × 12
   ID    sex      BPM weight length   air water  meta depth blotch1 blotch2
   <chr> <chr>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>
 1 SH004 Male     161  105.    171.  36.2  21.6  86.3  50.3    35.3    35.6
 2 SH008 Female   135  101.    128.  36.8  21.3  96.3  50.6    36.3    36.5
 3 SH029 Male     122   69.0   219.  37.6  23.3 105.   50.6    35.2    36.2
 4 SH049 Female   166   66.4   250.  36.5  25.4  83.5  49.6    35.5    36.6
 5 SH059 Male     119  107.    218.  34.7  24.7  76.4  53.8    36.5    37.6
 6 SH070 Male     158   83.8   262.  33.5  25.9  56.5  47.5    32.9    33.8
 7 SH087 Female   129   87.2   242.  35.3  24.0  72.5  51.8    34.0    35.0
 8 SH101 Male     134   69.3   196.  38.0  23.9  68.5  51.3    36.4    37.5
 9 SH108 Male     135   73.3   179.  35.4  24.9  91.3  49.8    36.7    37.8
10 SH110 Male     134   78.9   191.  36.7  21.1  54.9  49.9    34.7    35.7
# ℹ 40 more rows
# ℹ 1 more variable: difference <dbl>

Visualize the difference

ggboxplot(my_data_wide, y = "difference", 
          ylab = "Difference in Blotch Time (Seconds)", 
          xlab = "Paired Data")

There are outliers seen when visualizing the difference,
Use IQR to detect outliers:

IQR_value <- IQR(my_data_wide$difference)
Q1 <- quantile(my_data_wide$difference, 0.25)
Q3 <- quantile(my_data_wide$difference, 0.75)
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
outliers <- my_data_wide$difference[my_data_wide$difference < lower_bound | my_data_wide$difference > upper_bound]
print(outliers)

[1]  0.2711420  0.1660540 -0.3951263 -0.4487918 -0.3474200

Remove outliers:

my_data_no_outliers <- my_data_wide[!(my_data_wide$difference < lower_bound | my_data_wide$difference > upper_bound), ]

Visualize again to see if outliers are removed

ggboxplot(my_data_no_outliers, y = "difference", 
          ylab = "Difference in Blotch Time (Seconds)", 
          xlab = "Paired Data")

No outliers are seen now

Check the paired t-test assumptions

Assumption 1:Are the two samples paired? Yes

Assumption 2:The observations must be independent of each other: Yes

Assumption 3:The differences between the paired values should be approximately normally distributed: therefore we need to check whether the differences of the pairs follow a normal distribution

Check for normal distribution among the dependent variable (Visualization)

qqnorm(my_data_no_outliers$difference)
qqline(my_data_no_outliers$difference, col = "red")

All points fall on a straight line
Conduct statistical normal distribution test

shapiro_test_no_outliers <- shapiro.test(my_data_no_outliers$difference)
print(shapiro_test_no_outliers)


    Shapiro-Wilk normality test

data:  my_data_no_outliers$difference
W = 0.97934, p-value = 0.5934

The p value is >0.05 (0.5934) therefore the assumption of normal distribution is met and I can proceed with t test
Paired two-sample t test

t_test_result <- t.test(my_data_no_outliers$blotch1, 
                        my_data_no_outliers$blotch2, 
                        paired = TRUE)
print(t_test_result)


    Paired t-test

data:  my_data_no_outliers$blotch1 and my_data_no_outliers$blotch2
t = -208.14, df = 44, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.059966 -1.039637
sample estimates:
mean difference 
      -1.049801

The p value is <0.5 (2.2e - 16) therefore, we reject the null hypothesis.
We accept the alternative hypothesis: There is a significant difference in blotching time between the first and second capture

Therefore, multiple capture has an effect on blotching time.

Question 3

Q3. Is it possible to predict blotching time?

sharks <- read_excel("C:/Users/shana/OneDrive/Desktop/Research methods/Summative/sharks.xlsx")
View(sharks)

Boxplots to visualize outliers

library(ggplot2)
ggplot(sharks, aes(y = blotch)) +
  geom_boxplot(fill = "lightblue", color = "black") +
  labs(title = "Boxplot of Blotch Time", y = "Blotch Time (Seconds)") +
  theme_minimal()

I will have another try as that boxplot was stretched

boxplot(sharks$blotch,
        main = "Boxplot of Blotch Time",
        ylab = "Blotch Time (Seconds)",
        col = "#E7B800",
        border = "black")

Outliers are seen in the boxplot, therefore identify them using IQR & Define the lower and upper bounds for outliers

Q1 <- quantile(sharks$blotch, 0.25)
Q3 <- quantile(sharks$blotch, 0.75)
IQR_value <- IQR(sharks$blotch)

lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

Filter out rows where blotch_time is outside the bounds

sharks_no_outliers <- sharks[sharks$blotch >= lower_bound & sharks$blotch <= upper_bound, ]

Visualize the data after removing outliers

boxplot(sharks_no_outliers$blotch,
        main = "Boxplot of Blotch Time",
        ylab = "Blotch Time (Seconds)",
        col = "#E7B800",
        border = "black")

Now the outliers for blotch time in seconds are removed
To conduct linear regression models (lm) we need to make sure the dependent variable (blotch) has a normal distribution:

shapiro.test(sharks_no_outliers$blotch)


    Shapiro-Wilk normality test

data:  sharks_no_outliers$blotch
W = 0.99671, p-value = 0.4113

The p-value >0.05 (0.4113) therefore we accept null hypothesis that blotch is normally distributed
Therefore, we can go ahead with an lm
I will then change sex into a factor as it is not a continuous variable

sharks_no_outliers$sex <- factor(sharks_no_outliers$sex)

Visualization and Linear regression models

BPM and blotch

ggplot(sharks_no_outliers, aes(x = BPM, y = blotch)) +
  geom_point() + 
  geom_smooth(method = "lm", color = "blue") +
  labs(x = "Heart Rate (BPM)", y = "Blotch Time (Seconds)", title = "Heart Rate vs. Blotch Time")

`geom_smooth()` using formula = 'y ~ x'

heart_rate_blotch_lm <- lm(blotch ~ BPM, data = sharks_no_outliers)
summary(heart_rate_blotch_lm)


Call:
lm(formula = blotch ~ BPM, data = sharks_no_outliers)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.7593 -0.9311 -0.0689  0.8915  3.5609 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.467281   0.623104  56.920   <2e-16 ***
BPM         -0.002545   0.004374  -0.582    0.561    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.375 on 494 degrees of freedom
Multiple R-squared:  0.0006846, Adjusted R-squared:  -0.001338 
F-statistic: 0.3384 on 1 and 494 DF,  p-value: 0.561

The p value is >0.05 (0.561)therefore no correlation

Weight and blotch

ggplot(sharks_no_outliers, aes(x = weight, y = blotch)) +
  geom_point() + 
  geom_smooth(method = "lm", color = "blue") +
  labs(x = "Weight (kg)", y = "Blotch Time (Seconds)", title = "Weight vs. Blotch Time")

`geom_smooth()` using formula = 'y ~ x'

weight_blotch_lm <- lm(blotch ~ weight, data = sharks_no_outliers)
summary(weight_blotch_lm)


Call:
lm(formula = blotch ~ weight, data = sharks_no_outliers)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.7474 -0.9430 -0.0662  0.9314  3.5558 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.511e+01  4.091e-01  85.827   <2e-16 ***
weight      -9.491e-06  4.604e-03  -0.002    0.998    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.376 on 494 degrees of freedom
Multiple R-squared:  8.6e-09,   Adjusted R-squared:  -0.002024 
F-statistic: 4.249e-06 on 1 and 494 DF,  p-value: 0.9984

The p value is >0.05 (0.9984) therefore no correlation

Length and blotch

ggplot(sharks_no_outliers, aes(x = length, y = blotch)) +
  geom_point() + 
  geom_smooth(method = "lm", color = "blue") +
  labs(x = "Length (cm)", y = "Blotch Time (Seconds)", title = "Length vs. Blotch Time")

`geom_smooth()` using formula = 'y ~ x'

length_blotch_lm <- lm(blotch ~ length, data = sharks_no_outliers)
summary(length_blotch_lm)


Call:
lm(formula = blotch ~ length, data = sharks_no_outliers)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.7425 -0.9497 -0.0604  0.9262  3.5421 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.1854644  0.2858843 123.076   <2e-16 ***
length      -0.0003739  0.0013234  -0.283    0.778    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.376 on 494 degrees of freedom
Multiple R-squared:  0.0001616, Adjusted R-squared:  -0.001862 
F-statistic: 0.07984 on 1 and 494 DF,  p-value: 0.7776

The p value is >0.05 (0.7776) therefore no correlation

Air and blotch

ggplot(sharks_no_outliers, aes(x = air, y = blotch)) +
  geom_point() + 
  geom_smooth(method = "lm", color = "blue") +
  labs(x = "Air Temperature (°C)", y = "Blotch Time (Seconds)", title = "Air Temperature vs. Blotch Time")

`geom_smooth()` using formula = 'y ~ x'

air_blotch_lm <- lm(blotch ~ air, data = sharks_no_outliers)
summary(air_blotch_lm)


Call:
lm(formula = blotch ~ air, data = sharks_no_outliers)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.7737 -0.9320 -0.0353  0.9392  3.5534 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 36.24234    1.53621   23.59   <2e-16 ***
air         -0.03196    0.04319   -0.74     0.46    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.375 on 494 degrees of freedom
Multiple R-squared:  0.001107,  Adjusted R-squared:  -0.000915 
F-statistic: 0.5475 on 1 and 494 DF,  p-value: 0.4597

The p value is >0.05 (0.4597) therefore no correlation

Water and blotch

ggplot(sharks_no_outliers, aes(x = water, y = blotch)) +
  geom_point() + 
  geom_smooth(method = "lm", color = "blue") +
  labs(x = "Water Temperature (°C)", y = "Blotch Time (Seconds)", title = "Water Temperature vs. Blotch Time")

`geom_smooth()` using formula = 'y ~ x'

water_blotch_lm <- lm(blotch ~ water, data = sharks_no_outliers)
summary(water_blotch_lm)


Call:
lm(formula = blotch ~ water, data = sharks_no_outliers)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.7887 -0.9439 -0.0413  0.9546  3.5189 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 36.12179    0.85072  42.460   <2e-16 ***
water       -0.04411    0.03686  -1.196    0.232    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.374 on 494 degrees of freedom
Multiple R-squared:  0.00289,   Adjusted R-squared:  0.0008712 
F-statistic: 1.432 on 1 and 494 DF,  p-value: 0.2321

The p value is >0.05 (0.2321) therefore no correlation

Visualization of meta and blotch

ggplot(sharks_no_outliers, aes(x = meta, y = blotch)) +
  geom_point() + 
  geom_smooth(method = "lm", color = "blue") +
  labs(x = "Meta", y = "Blotch Time (Seconds)", title = "Meta vs. Blotch Time")

`geom_smooth()` using formula = 'y ~ x'

meta_blotch_lm <- lm(blotch ~ meta, data = sharks_no_outliers)
summary(meta_blotch_lm)


Call:
lm(formula = blotch ~ meta, data = sharks_no_outliers)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.7404 -0.9355 -0.0638  0.9421  3.6073 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35.246148   0.297592 118.438   <2e-16 ***
meta        -0.001701   0.003548  -0.479    0.632    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.375 on 494 degrees of freedom
Multiple R-squared:  0.000465,  Adjusted R-squared:  -0.001558 
F-statistic: 0.2298 on 1 and 494 DF,  p-value: 0.6319

The p value is >0.05 (0.6319) therefore no correlation

Depth and blotch

ggplot(sharks_no_outliers, aes(x = depth, y = blotch)) +
  geom_point() + 
  geom_smooth(method = "lm", color = "blue") +
  labs(x = "Depth (m)", y = "Blotch Time (Seconds)", title = "Depth vs. Blotch Time")

`geom_smooth()` using formula = 'y ~ x'

depth_blotch_lm <- lm(blotch ~ depth, data = sharks_no_outliers)
summary(depth_blotch_lm)


Call:
lm(formula = blotch ~ depth, data = sharks_no_outliers)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.80824 -0.64783 -0.00811  0.60695  2.82044 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.81902    1.11064   9.741   <2e-16 ***
depth        0.48455    0.02214  21.885   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9802 on 494 degrees of freedom
Multiple R-squared:  0.4923,    Adjusted R-squared:  0.4912 
F-statistic:   479 on 1 and 494 DF,  p-value: < 2.2e-16

The p value is <0.05 (2.2e - 16) therefore there is a correlation

Sex and blotch

ggplot(sharks_no_outliers, aes(x = sex, y = blotch, fill = sex)) +
  geom_boxplot() +
  labs(x = "Sex", y = "Blotch Time (Seconds)", title = "Sex vs. Blotch Time")

sex_blotch_lm <- lm(blotch ~ sex, data = sharks_no_outliers)
summary(sex_blotch_lm)


Call:
lm(formula = blotch ~ sex, data = sharks_no_outliers)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.5812 -0.9630 -0.0208  0.9739  3.7218 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 34.94059    0.08914 391.956   <2e-16 ***
sexMale      0.31547    0.12289   2.567   0.0105 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.367 on 494 degrees of freedom
Multiple R-squared:  0.01316,   Adjusted R-squared:  0.01117 
F-statistic:  6.59 on 1 and 494 DF,  p-value: 0.01055

The p value is <0.05 (0.01055) therefore there is a correlation

Now I will take the significant linear regression models (blotch vs sex, and blotch vs depth) and fit a multiple linear regression model

mlr_model <- lm(blotch ~ sex + depth, data = sharks_no_outliers)
summary(mlr_model)


Call:
lm(formula = blotch ~ sex + depth, data = sharks_no_outliers)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.93661 -0.63242 -0.00156  0.58855  2.96121 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.75572    1.10129   9.766  < 2e-16 ***
sexMale      0.27080    0.08741   3.098  0.00206 ** 
depth        0.48297    0.02196  21.997  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9718 on 493 degrees of freedom
Multiple R-squared:  0.502, Adjusted R-squared:  0.4999 
F-statistic: 248.4 on 2 and 493 DF,  p-value: < 2.2e-16

I would like to see if a model with both independent variables fits the data better and answers the question better
I will test a full model with both significant variables and a model with just depth and sex

full_model <- lm(blotch ~ sex + depth, data = sharks_no_outliers)
summary(full_model)


Call:
lm(formula = blotch ~ sex + depth, data = sharks_no_outliers)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.93661 -0.63242 -0.00156  0.58855  2.96121 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.75572    1.10129   9.766  < 2e-16 ***
sexMale      0.27080    0.08741   3.098  0.00206 ** 
depth        0.48297    0.02196  21.997  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9718 on 493 degrees of freedom
Multiple R-squared:  0.502, Adjusted R-squared:  0.4999 
F-statistic: 248.4 on 2 and 493 DF,  p-value: < 2.2e-16

Reduced model removing ‘sex’

reduced_model <- lm(blotch ~ depth, data = sharks_no_outliers)
summary(reduced_model)


Call:
lm(formula = blotch ~ depth, data = sharks_no_outliers)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.80824 -0.64783 -0.00811  0.60695  2.82044 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.81902    1.11064   9.741   <2e-16 ***
depth        0.48455    0.02214  21.885   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9802 on 494 degrees of freedom
Multiple R-squared:  0.4923,    Adjusted R-squared:  0.4912 
F-statistic:   479 on 1 and 494 DF,  p-value: < 2.2e-16

Reduced model removing ‘depth’

reduced_model2 <- lm(blotch ~ sex, data = sharks_no_outliers)
summary(reduced_model2)


Call:
lm(formula = blotch ~ sex, data = sharks_no_outliers)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.5812 -0.9630 -0.0208  0.9739  3.7218 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 34.94059    0.08914 391.956   <2e-16 ***
sexMale      0.31547    0.12289   2.567   0.0105 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.367 on 494 degrees of freedom
Multiple R-squared:  0.01316,   Adjusted R-squared:  0.01117 
F-statistic:  6.59 on 1 and 494 DF,  p-value: 0.01055

Then i will conduct an ANOVA comparing the results of the models
First an ANOVA between full model and model without sex

anova(reduced_model, full_model)

Analysis of Variance Table

Model 1: blotch ~ depth
Model 2: blotch ~ sex + depth
  Res.Df    RSS Df Sum of Sq      F  Pr(>F)   
1    494 474.64                               
2    493 465.58  1    9.0636 9.5974 0.00206 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value <0.05 (0.00206), therefore removing sex significantly worsens the model
Second an ANOVA between full model and model without depth

anova(reduced_model2, full_model)

Analysis of Variance Table

Model 1: blotch ~ sex
Model 2: blotch ~ sex + depth
  Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
1    494 922.53                                  
2    493 465.58  1    456.95 483.86 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value <0.05 (2.2e-16), therefore removing depth significantly worsens the model Therefore I will include both sex and depth in the model (full model)
Earlier, I had turned sex into a factor, I will now check the levels of the factor to see if Male comes first or if Female comes first

levels(sharks_no_outliers$sex)

[1] "Female" "Male"

Female came first which means Female is coded as 0 and Male is coded as 1
I will now validate the model by conducting train-testing splitting
First I will create a training set and a test set

library(caret)

Warning: package 'caret' was built under R version 4.4.2


Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

set.seed(123)
training.samples <- sharks_no_outliers$blotch %>%
  createDataPartition(p =0.8, list = FALSE)  
train.data <- sharks_no_outliers[training.samples,]
test.data <- sharks_no_outliers[-training.samples,]

I will then fit the model to the train data set

linear_model <- lm(blotch ~ sex + depth, data = train.data)
summary(linear_model)


Call:
lm(formula = blotch ~ sex + depth, data = train.data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.93526 -0.66288 -0.00617  0.57743  2.93339 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.21346    1.24256   9.024   <2e-16 ***
sexMale      0.25085    0.09892   2.536   0.0116 *  
depth        0.47420    0.02480  19.122   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9875 on 397 degrees of freedom
Multiple R-squared:  0.4854,    Adjusted R-squared:  0.4828 
F-statistic: 187.2 on 2 and 397 DF,  p-value: < 2.2e-16

I will now plot the residuals to evaluate them to see visually if the model meets the assumptions of regression

par(mfrow = c(2, 2)) 
plot(linear_model)

According to the plots, it looks like the model meets the assumptions of normality of residuals and homoscedacity
I then predict on the test data set

predictions <- predict(linear_model, newdata = test.data)

I then check the mean squared error to see how close the predictions and the real values are

mse <- mean((test.data$blotch - predictions)^2)
cat("Mean Squared Error:", mse, "\n")

Mean Squared Error: 0.8194926

I will also visualize this on a graph (The red line is where actual=predicted)

comparison_data <- data.frame(Actual = test.data$blotch, Predicted = predictions)
ggplot(comparison_data, aes(x = Actual, y = Predicted)) +
  geom_point(alpha = 0.6) + 
  geom_abline(slope = 1, intercept = 0, color = "red") + 
  labs(
    title = "Actual vs Predicted Blotch Time",
    x = "Actual Blotch Time (s)",
    y = "Predicted Blotch Time (s)"
  ) +
  theme_minimal()