Sharks

Author

N1339756

Question 1

# Select a CRAN mirror non-interactively so that install.packages() further
# down can run without prompting.
chooseCRANmirror(graphics = FALSE, ind = 1)

# The workspace is deliberately not wiped with rm(list = ls()): that silently
# destroys the caller's objects and does not reset loaded packages or options.

# readxl provides read_excel(); it only needs to be attached once.
library(readxl)

# Both workbooks live in the same folder; keep that location in one place so
# it is easy to change when the analysis runs on another machine.
data_dir <- "/Volumes/UNI WORK"
sharks <- read_excel(file.path(data_dir, "sharks.xlsx"))
sharksub <- read_excel(file.path(data_dir, "sharksub.xlsx"))

# View() opens a data viewer window, which is only useful in an interactive
# session.
if (interactive()) {
  View(sharks)
  View(sharksub)
}
install.packages("conflicted")
Installing package into '/Users/eloise/Library/R/arm64/4.4/library'
(as 'lib' is unspecified)

The downloaded binary packages are in
    /var/folders/s2/5m_j2hbx2qqd8264j2_ljrnh0000gn/T//RtmpKmVtaj/downloaded_packages
library(conflicted)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
library(dplyr)
library(lattice)


# Screen all continuous variables for obvious outliers with lattice dot plots.
Names <- c("BPM", "weight", "length", "air", "water", "meta", "depth")
continuous_values <- as.matrix(sharks[, Names])
dotplot(
  continuous_values,
  groups = FALSE, # keep the data ungrouped
  strip = strip.custom(
    bg = "white",
    par.strip.text = list(cex = 1.2)
  ),
  scales = list(
    x = list(relation = "free", draw = TRUE),
    y = list(relation = "free", draw = FALSE)
  ),
  col = 1, cex = 1, pch = 16,
  xlab = list(label = "Value of the variable", cex = 1.2),
  ylab = list(label = "Order of the data", cex = 1.2)
)

#depth has some measurements that show sharks caught at deeper/shallower water 


# Air temperature against water temperature, with a fitted straight line
# (no confidence band) drawn over the points.
ggplot(data = sharks, mapping = aes(x = air, y = water)) +
  geom_point(color = "blue", size = 3) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  theme_minimal() +
  labs(
    title = "Scatter Plot of Air Temperature Compared to Water Temperature",
    x = "Air Temperature (°C)",
    y = "Water Temperature (°C)"
  )
`geom_smooth()` using formula = 'y ~ x'

# Regress water temperature on air temperature.
model <- lm(water ~ air, data = sharks)

# Store the residuals next to the data so ggplot can map them by name.
sharks[["residuals"]] <- resid(model)

# Distribution of the residuals as a histogram.
ggplot(data = sharks, mapping = aes(x = residuals)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "white") +
  labs(title = "Histogram of Residuals")

# Normal Q-Q plot of the same residuals, with the reference line.
ggplot(data = sharks, mapping = aes(sample = residuals)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "Q-Q Plot of Residuals")

# Neither the raw data nor the model residuals were judged to be normally
# distributed, so the association between air and water temperature is tested
# with Spearman's rank correlation.
spearman_rank <- cor.test(
  x = sharks$air,
  y = sharks$water,
  method = "spearman"
)
spearman_rank

    Spearman's rank correlation rho

data:  sharks$air and sharks$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.05637344 
#data is not significantly correlated

Question 2

# Histogram of blotch time 1 (first capture)
ggplot(sharksub, aes(x = blotch1)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "white", alpha = 0.7) +
  labs(
    title = "Histogram of Time to Blotch First",
    x = "Time to Blotch First",
    y = "Frequency"
  ) +
  theme_minimal()

# Histogram of blotch time 2 (second capture). The title and x-axis label
# must name the second measurement; they previously repeated the "First"
# wording of the plot above, which made the two figures indistinguishable.
ggplot(sharksub, aes(x = blotch2)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "white", alpha = 0.7) +
  labs(
    title = "Histogram of Time to Blotch Second",
    x = "Time to Blotch Second",
    y = "Frequency"
  ) +
  theme_minimal()

# Normal Q-Q plot of blotch1 to check for normality.
ggplot(data = sharksub, mapping = aes(sample = blotch1)) +
  geom_qq() +
  geom_qq_line() +
  theme_minimal() +
  labs(title = "Q-Q Plot for Time to Blotch First Caught")

# Normal Q-Q plot of blotch2 to check for normality.
ggplot(data = sharksub, mapping = aes(sample = blotch2)) +
  geom_qq() +
  geom_qq_line() +
  theme_minimal() +
  labs(title = "Q-Q Plot for Time to Blotch for Second Capture")

# The Q-Q plots show both blotch times to be approximately normally distributed

# Shapiro-Wilk normality test on blotch1; keep the result and show it.
shapiro_test_blotch1 <- shapiro.test(sharksub$blotch1)
shapiro_test_blotch1

    Shapiro-Wilk normality test

data:  sharksub$blotch1
W = 0.97958, p-value = 0.5345
# p = 0.5345 > 0.05, so we fail to reject the null hypothesis that blotch1 is normally distributed

# Shapiro-Wilk normality test on blotch2; keep the result and show it.
shapiro_test_blotch2 <- shapiro.test(sharksub$blotch2)
shapiro_test_blotch2

    Shapiro-Wilk normality test

data:  sharksub$blotch2
W = 0.97936, p-value = 0.5255
# p = 0.5255 > 0.05, so there is no evidence against normality for blotch2 and a standard t-test can be used

# Paired t-test of blotch1 against blotch2. var.equal is left out because
# t.test() does not use it when paired = TRUE.
t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)

    Paired t-test

data:  sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384 
# Blotching time differs significantly between the two captures (p < 0.05): the mean time to blotch is 0.930 seconds (3 s.f.) longer at the second capture

Question 3

# Check whether any of the continuous variables has a linear relationship
# with blotching time.

# Scatter plot of one continuous column of `sharks` against blotching time.
#   x_var   - name of the column to put on the x-axis (character string)
#   x_label - readable name used in both the title and the x-axis label
# Returns the ggplot object; called at top level it is printed automatically.
plot_vs_blotch <- function(x_var, x_label) {
  ggplot(sharks, aes(x = .data[[x_var]], y = blotch)) +
    geom_point() +
    labs(
      title = paste("Scatter plot of", x_label, "vs Blotching Time"),
      x = x_label,
      y = "Blotching Time"
    ) +
    theme_minimal()
}

# Scatter plot for BPM vs Blotching Time
plot_vs_blotch("BPM", "BPM")

# Scatter plot for Air Temperature vs Blotching Time
plot_vs_blotch("air", "Air Temperature")

# Scatter plot for Water Temperature vs Blotching Time
plot_vs_blotch("water", "Water Temperature")

# Scatter plot for Weight vs Blotching Time
plot_vs_blotch("weight", "Weight")

# Scatter plot for Length vs Blotching Time
plot_vs_blotch("length", "Length")

# Scatter plot for Depth vs Blotching Time
plot_vs_blotch("depth", "Depth")

# Scatter plot for Meta vs Blotching Time. The x-axis is labelled "Meta";
# it was previously mislabelled "Depth" although the column plotted is meta.
plot_vs_blotch("meta", "Meta")

#Depth has a linear relationship with blotching

# Box plot of blotching time for each sex, filled with a fixed colour per sex.
sex_colours <- c("Male" = "blue", "Female" = "red")
ggplot(data = sharks, mapping = aes(x = sex, y = blotch, fill = sex)) +
  geom_boxplot() +
  scale_fill_manual(values = sex_colours) +
  theme_minimal() +
  labs(
    title = "Blotching Time by Sex",
    x = "Sex",
    y = "Blotch Time (S)"
  )

# sex is categorical; recode it as a numeric indicator (1 for "Male", 2 for
# any other value) so it enters the model as a single numeric column.
sharks.new <- mutate(sharks, sex = if_else(sex == "Male", 1, 2))

# Multiple linear regression (lm) of blotching time on every predictor, used
# to see whether depth is statistically significant.
model <- lm(
  blotch ~ BPM + weight + length + air + water + meta + depth + sex,
  data = sharks.new
)
summary(model)

Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta + 
    depth + sex, data = sharks.new)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.97715 -0.66193 -0.00841  0.64123  2.90395 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.7356961  1.8828077   6.233 9.86e-10 ***
BPM         -0.0020791  0.0031540  -0.659  0.51009    
weight       0.0017281  0.0033143   0.521  0.60231    
length       0.0013042  0.0009606   1.358  0.17517    
air         -0.0310068  0.0315302  -0.983  0.32590    
water       -0.0143878  0.0268112  -0.537  0.59176    
meta        -0.0011610  0.0025671  -0.452  0.65127    
depth        0.5034077  0.0220870  22.792  < 2e-16 ***
sex         -0.3088617  0.0890602  -3.468  0.00057 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9912 on 491 degrees of freedom
Multiple R-squared:  0.5256,    Adjusted R-squared:  0.5178 
F-statistic: 67.99 on 8 and 491 DF,  p-value: < 2.2e-16
#biggest indicators of blotching time are depth caught and the sex of the shark 


# Pairwise correlations between blotching time and the continuous variables,
# to look for any other indicators of blotching time.
sharks %>%
  select(blotch, BPM, weight, length, air, water, meta, depth) %>%
  cor()
             blotch          BPM       weight      length         air
blotch  1.000000000 -0.029296612  0.009236525 -0.01638167 -0.03761675
BPM    -0.029296612  1.000000000  0.017036558 -0.06856053 -0.06841209
weight  0.009236525  0.017036558  1.000000000 -0.01959676 -0.05264537
length -0.016381675 -0.068560532 -0.019596758  1.00000000 -0.03027426
air    -0.037616747 -0.068412093 -0.052645366 -0.03027426  1.00000000
water  -0.051653787  0.024513368  0.086338753 -0.05940708 -0.05524051
meta   -0.009513855 -0.006016429  0.019601470  0.00302851  0.12531801
depth   0.714224701 -0.012173520 -0.006057435 -0.08334774 -0.01188199
             water         meta        depth
blotch -0.05165379 -0.009513855  0.714224701
BPM     0.02451337 -0.006016429 -0.012173520
weight  0.08633875  0.019601470 -0.006057435
length -0.05940708  0.003028510 -0.083347736
air    -0.05524051  0.125318005 -0.011881989
water   1.00000000  0.022494605 -0.040888511
meta    0.02249461  1.000000000  0.008150764
depth  -0.04088851  0.008150764  1.000000000
#no other indicators

# Depth against blotching time with a fitted straight line and its band.
ggplot(data = sharks, mapping = aes(x = depth, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red") +
  labs(
    title = "Depth vs Blotching Time",
    x = "Depth",
    y = "Blotching Time"
  )
`geom_smooth()` using formula = 'y ~ x'

#blotching time increases with increased depth


# Q-Q plot for males.
# Add a column holding blotching time for male rows and NA for all others.
sharks.male <- mutate(
  sharks,
  blotch_male = if_else(sex == "Male", blotch, NA_real_)
)

ggplot(data = sharks.male, mapping = aes(sample = blotch_male)) +
  geom_qq() +
  geom_qq_line() +
  theme_minimal() +
  labs(title = "Q-Q Plot for Male Blotching Time")
Warning: Removed 236 rows containing non-finite outside the scale range
(`stat_qq()`).
Warning: Removed 236 rows containing non-finite outside the scale range
(`stat_qq_line()`).

# NOTE(review): the result of na.omit() is only printed here, not assigned, so
# sharks.male$blotch_male itself still holds the NAs for the non-male rows.
na.omit(sharks.male$blotch_male) # drop the NAs in the female rows (printed only)
  [1] 35.33881 33.54668 36.68291 35.40344 35.68104 39.83638 34.75400 33.49141
  [9] 35.09741 35.75184 34.98489 39.13972 35.79007 35.17775 34.34153 33.60036
 [17] 35.64141 33.19158 38.27560 33.38074 37.18545 35.68198 34.11847 37.70230
 [25] 32.09225 35.30483 34.12747 34.20984 35.76808 32.37115 36.50001 37.29134
 [33] 36.43598 34.73515 35.46505 36.56332 32.85958 34.47937 34.20434 35.13952
 [41] 35.35378 34.42745 37.16959 34.05025 35.60692 34.16414 36.90737 35.90758
 [49] 33.96733 34.44525 36.42572 34.26732 37.22830 32.53531 35.90348 36.39784
 [57] 36.68703 34.65632 37.17692 36.76577 34.93594 35.44028 35.37963 37.59267
 [65] 35.73577 33.72274 35.95868 35.92057 35.46943 35.07222 34.74639 38.65662
 [73] 32.88800 38.06194 35.99556 33.82318 37.22728 35.39418 35.85999 35.36557
 [81] 36.48702 35.92896 36.56256 36.30869 33.75869 34.02139 36.07216 38.51132
 [89] 36.53646 35.12783 34.48057 37.43851 35.50834 36.36155 36.02138 35.53009
 [97] 36.13323 33.49157 37.21730 36.26017 32.75580 35.31524 33.85064 34.86074
[105] 36.58108 36.67552 37.62275 37.07165 34.60328 36.04940 35.86671 38.64013
[113] 37.04875 35.03548 35.48251 32.27892 36.87935 37.67336 34.66563 33.56095
[121] 34.96694 33.82718 33.50852 33.33818 37.00103 35.02983 33.70580 34.99563
[129] 33.14350 34.10182 35.66434 34.07018 35.66700 33.86044 34.73333 37.65034
[137] 36.42890 35.87160 34.26328 37.55416 34.92134 35.50131 33.84733 34.57425
[145] 34.87784 35.34228 33.46396 35.03768 33.84433 35.80479 34.77044 33.95021
[153] 34.24796 33.65124 34.15069 35.25384 35.85005 33.97565 34.45961 34.01414
[161] 33.60120 36.11724 34.24048 34.57331 32.05344 36.68645 34.52245 36.77137
[169] 37.64657 35.29631 34.46606 34.43116 35.47244 34.16990 33.98736 38.56100
[177] 35.40291 35.03834 34.57363 35.81721 36.56161 36.39924 33.88859 37.70040
[185] 31.88414 33.85814 35.68289 34.77303 33.74675 34.74759 33.53000 36.48670
[193] 35.11494 35.96067 35.28866 34.00363 35.27814 35.23208 33.72488 34.02853
[201] 35.98068 36.48882 35.83944 37.07808 36.92076 34.72575 40.08356 37.31096
[209] 37.58439 36.79061 32.69830 35.70079 34.97168 34.12583 35.70131 35.52811
[217] 34.29493 34.06868 36.52201 35.65700 33.63755 33.16705 35.63334 35.26394
[225] 34.45643 32.74182 35.91940 36.37596 36.72031 33.19147 35.87670 34.98279
[233] 35.17529 36.27341 34.71510 35.40709 34.44828 34.42770 35.17856 37.59347
[241] 35.94039 35.43154 36.21244 33.54295 36.34480 35.46805 35.10372 35.29969
[249] 34.25084 36.61085 34.96632 35.18084 37.04830 35.56695 33.55750 35.85124
[257] 35.60726 33.68436 34.72661 35.09664 36.99740 34.60810 32.06841 35.31898
attr(,"na.action")
  [1]   1   2   3   5   8  10  11  12  15  17  18  22  23  25  26  31  37  39
 [19]  42  45  47  48  49  51  54  56  57  58  61  63  64  68  69  71  72  73
 [37]  74  75  77  80  82  83  84  87  89  90  94  95  97  98 103 109 111 114
 [55] 117 123 125 126 127 129 132 134 135 136 139 141 142 146 147 149 150 154
 [73] 155 156 158 161 163 164 165 168 170 171 175 176 178 179 184 190 192 195
 [91] 198 204 206 207 209 211 220 221 224 225 229 230 232 233 234 236 237 238
[109] 239 240 242 244 245 246 247 248 249 250 252 255 257 258 259 262 264 265
[127] 269 270 272 273 278 284 286 289 292 298 300 301 303 305 314 315 318 319
[145] 323 324 326 327 328 330 331 332 333 334 336 338 339 340 341 342 345 348
[163] 349 350 353 355 358 359 360 361 366 367 368 369 370 371 373 374 375 379
[181] 380 381 384 389 390 393 394 395 397 400 402 403 404 407 409 410 414 415
[199] 417 418 423 429 431 434 437 443 445 446 448 450 459 460 463 464 468 469
[217] 471 472 473 475 476 477 478 479 481 483 484 485 486 487 490 493 494 495
[235] 498 500
attr(,"class")
[1] "omit"
# Shapiro-Wilk normality test on the male blotching times. It is run on the
# column that still contains NA for the non-male rows.
shapiro.test(sharks.male$blotch_male)

    Shapiro-Wilk normality test

data:  sharks.male$blotch_male
W = 0.99209, p-value = 0.1701
#Data normally distributed

# Q-Q plot for females.
# Add a column holding blotching time for female rows and NA for all others.
sharks.female <- mutate(
  sharks,
  blotch_female = if_else(sex == "Female", blotch, NA_real_)
)
view(sharks.female)

ggplot(data = sharks.female, mapping = aes(sample = blotch_female)) +
  geom_qq() +
  geom_qq_line() +
  theme_minimal() +
  labs(title = "Q-Q Plot for Female Blotching Time")
Warning: Removed 264 rows containing non-finite outside the scale range
(`stat_qq()`).
Warning: Removed 264 rows containing non-finite outside the scale range
(`stat_qq_line()`).

# NOTE(review): the result of na.omit() is only printed here, not assigned, so
# sharks.female$blotch_female itself still holds the NAs for the non-female
# rows.
na.omit(sharks.female$blotch_female)
  [1] 37.17081 34.54973 36.32861 37.39799 36.29497 36.02478 31.77830 36.23767
  [9] 37.04890 34.17118 36.36105 34.41702 36.33441 35.11890 34.25971 35.25938
 [17] 36.70323 33.86559 36.23841 35.01777 35.73423 34.50264 35.54855 35.51145
 [25] 33.81010 34.12868 34.38152 32.78833 33.14264 34.35993 34.31053 33.72053
 [33] 35.39024 36.87037 34.58863 36.24140 34.94177 33.80528 36.27716 36.52400
 [41] 35.86922 33.76378 34.99762 33.98649 34.83144 34.93018 34.65726 32.60900
 [49] 35.27513 33.07815 32.27762 35.72632 33.76447 35.54950 36.48768 35.56566
 [57] 34.04251 33.77159 33.86392 33.80302 33.86019 32.31056 37.23568 36.85702
 [65] 33.59581 33.79549 33.10335 36.39183 36.84328 33.56897 33.11113 36.35739
 [73] 33.88382 34.56139 36.92970 30.77585 33.38396 35.98082 34.49482 35.24772
 [81] 32.67439 34.89859 34.54851 35.54189 34.45879 36.07762 33.09675 38.66238
 [89] 36.27604 34.41919 35.69544 34.78846 32.20917 34.56033 34.14408 33.26947
 [97] 34.47858 35.99021 35.28434 34.77076 35.81950 34.84883 36.92090 35.48229
[105] 33.89644 36.06121 34.07307 34.79541 34.98931 34.12705 34.64274 32.74815
[113] 34.88511 35.69862 34.10099 33.56870 37.29475 37.90292 38.27531 34.36893
[121] 37.97618 32.49322 35.03094 34.17285 36.50998 35.48137 36.07201 33.61226
[129] 34.26067 36.04394 34.26182 34.45899 36.11875 35.18800 36.74514 34.79932
[137] 34.57090 34.45457 32.22041 33.95618 35.05340 34.27528 34.97589 32.84784
[145] 35.88209 35.04288 34.65921 36.83684 34.90283 37.41077 35.54236 35.70572
[153] 36.86162 31.35935 33.72887 36.11878 35.90743 36.60670 36.91332 36.71040
[161] 33.52062 33.19538 36.76501 35.09021 36.80253 34.97680 33.60686 35.26304
[169] 35.70735 33.97202 35.27511 35.47827 35.33056 32.94522 34.31492 34.49605
[177] 32.78553 34.74200 34.24688 34.81528 33.27486 34.55222 36.99602 34.18094
[185] 35.69824 34.93960 34.18335 33.49879 31.98199 34.92301 38.33805 33.48776
[193] 36.88131 35.00973 34.27767 34.36868 34.92444 34.16216 34.60461 36.25016
[201] 34.14676 36.33284 34.90484 34.51068 35.39109 32.66327 35.93561 33.86191
[209] 35.64318 34.41429 38.13854 35.76835 35.11572 34.54163 35.34490 33.94983
[217] 35.86883 34.70174 34.36352 36.03602 36.05150 36.20120 34.49677 34.48738
[225] 36.24746 36.14412 34.46625 34.39736 32.59701 34.27203 35.65420 31.79747
[233] 31.84995 34.88500 37.09394 35.26502
attr(,"na.action")
  [1]   4   6   7   9  13  14  16  19  20  21  24  27  28  29  30  32  33  34
 [19]  35  36  38  40  41  43  44  46  50  52  53  55  59  60  62  65  66  67
 [37]  70  76  78  79  81  85  86  88  91  92  93  96  99 100 101 102 104 105
 [55] 106 107 108 110 112 113 115 116 118 119 120 121 122 124 128 130 131 133
 [73] 137 138 140 143 144 145 148 151 152 153 157 159 160 162 166 167 169 172
 [91] 173 174 177 180 181 182 183 185 186 187 188 189 191 193 194 196 197 199
[109] 200 201 202 203 205 208 210 212 213 214 215 216 217 218 219 222 223 226
[127] 227 228 231 235 241 243 251 253 254 256 260 261 263 266 267 268 271 274
[145] 275 276 277 279 280 281 282 283 285 287 288 290 291 293 294 295 296 297
[163] 299 302 304 306 307 308 309 310 311 312 313 316 317 320 321 322 325 329
[181] 335 337 343 344 346 347 351 352 354 356 357 362 363 364 365 372 376 377
[199] 378 382 383 385 386 387 388 391 392 396 398 399 401 405 406 408 411 412
[217] 413 416 419 420 421 422 424 425 426 427 428 430 432 433 435 436 438 439
[235] 440 441 442 444 447 449 451 452 453 454 455 456 457 458 461 462 465 466
[253] 467 470 474 480 482 488 489 491 492 496 497 499
attr(,"class")
[1] "omit"
# Shapiro-Wilk normality test on the female blotching times. It is run on the
# column that still contains NA for the non-female rows.
shapiro.test(sharks.female$blotch_female)

    Shapiro-Wilk normality test

data:  sharks.female$blotch_female
W = 0.99527, p-value = 0.682
#data normally distributed

# Turn sex into a factor with Male as the first level, so results grouped by
# sex come out in a known order and it is clear which sex blotches faster.
sharks <- mutate(sharks, sex = factor(sex, levels = c("Male", "Female")))

# F test comparing the variance of blotching time between the two sexes.
f_test_result_sex <- var.test(blotch ~ sex, data = sharks)
print(f_test_result_sex)

    F test to compare two variances

data:  blotch by sex
F = 1.0626, num df = 263, denom df = 235, p-value = 0.6347
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.8273317 1.3623562
sample estimates:
ratio of variances 
            1.0626 
# p = 0.6347 > 0.05: no evidence that the variances differ, so an independent t-test with equal variances can be used

# Independent two-sample t-test (equal variances) of female against male
# blotching time; paired = FALSE is the default and is therefore omitted.
t.test(
  x = sharks.female$blotch_female,
  y = sharks.male$blotch_male,
  var.equal = TRUE
)

    Two Sample t-test

data:  sharks.female$blotch_female and sharks.male$blotch_male
t = -3.023, df = 498, p-value = 0.002632
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.6326914 -0.1342420
sample estimates:
mean of x mean of y 
 34.92294  35.30641 
# Females blotch significantly quicker than males (mean 34.92 vs 35.31, p = 0.0026)