RMDA Summative Code- Tink Swash (N0919220)

##Preparation

library(tidyverse) #loads tidyverse package

Warning: package 'tidyverse' was built under R version 4.3.3

Warning: package 'ggplot2' was built under R version 4.3.3

Warning: package 'tibble' was built under R version 4.3.1

Warning: package 'tidyr' was built under R version 4.3.3

Warning: package 'readr' was built under R version 4.3.3

Warning: package 'purrr' was built under R version 4.3.3

Warning: package 'dplyr' was built under R version 4.3.3

Warning: package 'stringr' was built under R version 4.3.3

Warning: package 'forcats' was built under R version 4.3.3

Warning: package 'lubridate' was built under R version 4.3.3

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2) #loads ggplot2 package for graph making

sharks <- read.csv("sharks.csv", header = TRUE)

sharksub <- read.csv("sharksub.csv", header = TRUE)

view(sharks) #opens the sharks data set in a new tab (after importing)

view(sharksub) #opens the sharksub data set in a new tab (after importing)

##Normality: The shapiro wilk test

normblotch1 <- shapiro.test(sharksub$blotch1) #normal distribution
print(normblotch1) #this print command shows the result of the test


    Shapiro-Wilk normality test

data:  sharksub$blotch1
W = 0.97958, p-value = 0.5345

normblotch2 <- shapiro.test(sharksub$blotch2) #normal distribution
print(normblotch2)


    Shapiro-Wilk normality test

data:  sharksub$blotch2
W = 0.97936, p-value = 0.5255

normblotch <- shapiro.test(sharks$blotch) #normal distribution
print(normblotch)


    Shapiro-Wilk normality test

data:  sharks$blotch
W = 0.99695, p-value = 0.4769

normbpm <- shapiro.test(sharks$BPM) #non-normal distribution
print(normbpm)


    Shapiro-Wilk normality test

data:  sharks$BPM
W = 0.947, p-value = 2.178e-12

normweight <- shapiro.test(sharks$weight) #non-normal distribution
print(normweight)


    Shapiro-Wilk normality test

data:  sharks$weight
W = 0.94662, p-value = 1.929e-12

normlength <- shapiro.test(sharks$length) #non-normal distribution
print(normlength)


    Shapiro-Wilk normality test

data:  sharks$length
W = 0.95668, p-value = 5.963e-11

normair <- shapiro.test(sharks$air) #non-normal distribution
print(normair)


    Shapiro-Wilk normality test

data:  sharks$air
W = 0.95885, p-value = 1.338e-10

normwater <- shapiro.test(sharks$water) #non-normal distribution
print(normwater)


    Shapiro-Wilk normality test

data:  sharks$water
W = 0.96035, p-value = 2.371e-10

normmeta <- shapiro.test(sharks$meta) #non-normal distribution
print(normmeta)


    Shapiro-Wilk normality test

data:  sharks$meta
W = 0.96374, p-value = 9.141e-10

normdepth <- shapiro.test(sharks$depth) #normal distribution
print(normdepth)


    Shapiro-Wilk normality test

data:  sharks$depth
W = 0.99746, p-value = 0.6485

##Multiple capture

ttestrecapture <- t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE, alternative = "two.sided") #paired t test between blotch 1 and blotch 2
print(ttestrecapture)


    Paired t-test

data:  sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384

#Comparing this between males and females

#Creating subsets within the sharksub dataset by sex
malesharksub <- subset(sharksub, sex == "Male")
femalesharksub <- subset(sharksub, sex == "Female")

#Paired t-test for males
ttestmalesharksub <- t.test(malesharksub$blotch1, malesharksub$blotch2, paired = TRUE)

#Paired t-test for females
ttestfemalesharksub <- t.test(femalesharksub$blotch1, femalesharksub$blotch2, paired = TRUE)

#To discover the results
print("Paired t-test for males: Recapture")

[1] "Paired t-test for males: Recapture"

print(ttestmalesharksub)


    Paired t-test

data:  malesharksub$blotch1 and malesharksub$blotch2
t = -10.527, df = 24, p-value = 1.786e-10
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.0857650 -0.7298108
sample estimates:
mean difference 
     -0.9077879

print("Paired t-test for females: Recapture")

[1] "Paired t-test for females: Recapture"

print(ttestfemalesharksub)


    Paired t-test

data:  femalesharksub$blotch1 and femalesharksub$blotch2
t = -14.694, df = 24, p-value = 1.7e-13
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.0853637 -0.8180143
sample estimates:
mean difference 
      -0.951689

#both females and males significantly different between blotch1 and blotch2, both when tested separately and together

#Making a box plot for this
#Now I must create a new dataset to arrange the data for graphs
sharksubgraphmaking <- reshape2::melt(sharksub, id.vars = "sex", measure.vars = c("blotch1", "blotch2"))

#Boxplotting and making it look nice
ggplot(sharksubgraphmaking, aes(x = variable, y = value, fill = sex)) +
  geom_boxplot() +
  labs(title = "Blotch1 vs Blotch2 for Males and Females", 
       x = "First Capture                    Recapture", 
       y = "Blotch") +
  scale_fill_manual(values = c("pink", "blue")) +
  theme_minimal()

##Air and Water Correlation

corairwater <- cor.test(sharks$air, sharks$water, method = "spearman") #using spearman's rank test, negative correlation seen but not statistically significant
print(corairwater)


    Spearman's rank correlation rho

data:  sharks$air and sharks$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.05637344

##Correlations with Blotch

corblotchweight <- cor.test(sharks$blotch, sharks$weight, method = "spearman") #positive correlation between blotching and weight but not significant
print(corblotchweight)


    Spearman's rank correlation rho

data:  sharks$blotch and sharks$weight
S = 20832130, p-value = 0.999
alternative hypothesis: true rho is not equal to 0
sample estimates:
         rho 
5.376022e-05

corblotchlength <- cor.test(sharks$blotch, sharks$length, method = "spearman") #negative correlation but not significant
print(corblotchlength)


    Spearman's rank correlation rho

data:  sharks$blotch and sharks$length
S = 21076764, p-value = 0.7942
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.01168872

corblotchair <- cor.test(sharks$blotch, sharks$air, method = "spearman") #negative correlation but not significant
print(corblotchair)


    Spearman's rank correlation rho

data:  sharks$blotch and sharks$air
S = 21469324, p-value = 0.4956
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.03053167

corblotchwater <- cor.test(sharks$blotch, sharks$water, method = "spearman") #negative correlation but not significant
print(corblotchwater)


    Spearman's rank correlation rho

data:  sharks$blotch and sharks$water
S = 21758692, p-value = 0.3214
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.04442139

corblotchmeta <- cor.test(sharks$blotch, sharks$meta, method = "spearman") #negative correlation but not significant
print(corblotchmeta)


    Spearman's rank correlation rho

data:  sharks$blotch and sharks$meta
S = 21411160, p-value = 0.5359
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.02773979

corblotchdepth <- cor.test(sharks$blotch, sharks$depth, method = "spearman") #significant positive correlation!
print(corblotchdepth)


    Spearman's rank correlation rho

data:  sharks$blotch and sharks$depth
S = 6797520, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.6737177

#Creating subsets within the shark dataset by sex
maleshark <- subset(sharks, sex == "Male")
femaleshark <- subset(sharks, sex == "Female")

#Unpaired t-test for males and females
ttestmalefemalesharks <- t.test(maleshark$blotch, femaleshark$blotch, paired = FALSE)

#To discover the results
print("Unpaired t-test for males and females")

[1] "Unpaired t-test for males and females"

print(ttestmalefemalesharks)


    Welch Two Sample t-test

data:  maleshark$blotch and femaleshark$blotch
t = 3.0282, df = 494.67, p-value = 0.002589
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.1346620 0.6322714
sample estimates:
mean of x mean of y 
 35.30641  34.92294

#there is a significant difference between blotching in males and females. Next I will make a boxplot to show this

ggplot(sharks, aes(x = sex, y = blotch, fill = sex)) +
  geom_boxplot() +  
  labs(title = "Blotch: Males vs.Females",
       x = "Sex",
       y = "Blotch") +
  scale_fill_manual(values = c("pink", "blue")) +  
  theme_minimal()

##Correlations with meta

cormetaweight <- cor.test(sharks$meta, sharks$weight, method = "spearman") #positive correlation but not significant
print(cormetaweight)


    Spearman's rank correlation rho

data:  sharks$meta and sharks$weight
S = 20413216, p-value = 0.6528
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
0.02016171

cormetalength <- cor.test(sharks$meta, sharks$length, method = "spearman") #positive correlation but not significant
print(cormetalength)


    Spearman's rank correlation rho

data:  sharks$meta and sharks$length
S = 20737928, p-value = 0.9187
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
0.004575474

cormetaair <- cor.test(sharks$meta, sharks$air, method = "spearman") #significant positive correlation! 
print(cormetaair)


    Spearman's rank correlation rho

data:  sharks$meta and sharks$air
S = 18172688, p-value = 0.004258
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.1277075

cormetawater <- cor.test(sharks$meta, sharks$water, method = "spearman") #positive correlation but not significant
print(cormetawater)


    Spearman's rank correlation rho

data:  sharks$meta and sharks$water
S = 20360456, p-value = 0.6126
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.0226942

cormetadepth <- cor.test(sharks$meta, sharks$depth, method = "spearman") #positive correlation but not significant
print(cormetadepth)


    Spearman's rank correlation rho

data:  sharks$meta and sharks$depth
S = 20614282, p-value = 0.8146
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
0.01051051

##Linear models to predict blotching

#Creating a model with all of the numerical variables in sharks
lm_model <- lm(blotch ~ BPM + weight + length + air + water + meta + depth, data = sharks)
summary(lm_model)


Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta + 
    depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.83745 -0.66117 -0.00702  0.60110  2.74108 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.1405851  1.8958668   5.876 7.74e-09 ***
BPM         -0.0019723  0.0031890  -0.618    0.537    
weight       0.0016283  0.0033511   0.486    0.627    
length       0.0012295  0.0009710   1.266    0.206    
air         -0.0281474  0.0318707  -0.883    0.378    
water       -0.0188934  0.0270782  -0.698    0.486    
meta        -0.0009712  0.0025951  -0.374    0.708    
depth        0.5061285  0.0223191  22.677  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.002 on 492 degrees of freedom
Multiple R-squared:  0.514, Adjusted R-squared:  0.507 
F-statistic: 74.32 on 7 and 492 DF,  p-value: < 2.2e-16

#Adjusting the model to reduce the interference from the non-significant variables and multicollinearity to only include the one significant variable seen in the previous model and the correlation tests.
lm_depth_only <- lm(blotch ~ depth, data = sharks)
summary(lm_depth_only)


Call:
lm(formula = blotch ~ depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.81869 -0.65427 -0.01035  0.58825  2.83116 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.82178    1.11207   8.832   <2e-16 ***
depth        0.50467    0.02216  22.772   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared:  0.5101,    Adjusted R-squared:  0.5091 
F-statistic: 518.6 on 1 and 498 DF,  p-value: < 2.2e-16

#This model accounts for 51% of variation in the data, which is a mere 0.04% less than in the previous model.This is remedied by adjusting for R-squared, as when this is done the second model that focuses on depth increases its score to 50.91%.

#It is time to create predictions using the depth model
sharks$predicted_blotch <- predict(lm_depth_only, newdata = sharks)

#Creates predictions with the inclusion of confidence intervals which is helpful to assess the validity of the result
predictions <- predict(lm_depth_only, newdata = sharks, interval = "confidence", level = 0.95)

#To make graph creation easier later, this will move the predicted values from the model into the sharks data set
sharks$predicted_blotch <- predictions[, 1]  
sharks$lower_ci <- predictions[, 2]  
sharks$upper_ci <- predictions[, 3]  


#This creates a colourful scatter graph to visualise the relationship between depth and blotch, with the predicted blotching time represented as a red line
ggplot(sharks, aes(x = depth, y = blotch)) +
  geom_point(aes(color = "Observed"), alpha = 0.6) +  
  geom_line(aes(x = depth, y = predicted_blotch, color = "Predicted"), size = 1) + 
  labs(title = "Blotch vs Depth",
       x = "Depth",
       y = "Blotch") +
  scale_color_manual(values = c("Observed" = "blue", "Predicted" = "red")) +
  theme_minimal()

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

#Same as previous graph but with confidence intervals this time. 
ggplot(sharks, aes(x = depth)) +
  #Plots the observed values as blue points
  geom_point(aes(y = blotch, color = "Observed"), alpha = 0.6) +
  
  #Plots the predicted values as a red line across the graph
  geom_line(aes(y = predicted_blotch, color = "Predicted"), size = 1) +
  
  #Add confidence intervals that look like a semi-transparent red ribbon
  geom_ribbon(aes(ymin = lower_ci, ymax = upper_ci), fill = "red", alpha = 0.2) +
  
  labs(title = "Blotch vs Depth: with Confidence Intervals",
       x = "Depth",
       y = "Blotch") +
  scale_color_manual(values = c("Observed" = "blue", "Predicted" = "red")) +
  theme_minimal()