RMDA Summative Assessment R Script

Author

N0907785

# Packages
library(readxl)
library(ggplot2)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Question 1. Is there a correlation between the variables air and water?

sharks <- read_excel("sharks.xlsx")   # Read the sharks.xlsx file
head(sharks)                          # Display the data

# A tibble: 6 × 10
  ID    sex    blotch   BPM weight length   air water  meta depth
  <chr> <chr>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH001 Female   37.2   148   74.7   187.  37.7  23.4  64.1  53.2
2 SH002 Female   34.5   158   73.4   189.  35.7  21.4  73.7  49.6
3 SH003 Female   36.3   125   71.8   284.  34.8  20.1  54.4  49.4
4 SH004 Male     35.3   161  105.    171.  36.2  21.6  86.3  50.3
5 SH005 Female   37.4   138   67.1   264.  33.6  21.8 108.   49.0
6 SH006 Male     33.5   126  110.    270.  36.4  20.9 109.   46.8

# Shapiro-Wilk test for air and water variables
shapiro.test(sharks$air)


    Shapiro-Wilk normality test

data:  sharks$air
W = 0.95885, p-value = 1.338e-10

shapiro.test(sharks$water)


    Shapiro-Wilk normality test

data:  sharks$water
W = 0.96035, p-value = 2.371e-10

# Spearman correlation test between air and water
cor.test(sharks$air, sharks$water, method = "spearman")


    Spearman's rank correlation rho

data:  sharks$air and sharks$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.05637344

# Summary of data
summary(sharks$air)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  33.00   34.42   35.43   35.54   36.71   38.00

summary(sharks$water)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  20.01   21.55   23.11   23.02   24.37   25.99

# Sample size 
length(sample(sharks$air))        # length() provides number of elements whilst sample () prints list of data sample

[1] 500

length(sample(sharks$water))

[1] 500

Graphing Correlation between Air and Water

ggplot(sharks, aes(x = water, y = air)) +      # Graph plot function (ggplot) using 'sharks' data and assigning 'air' and 'water' variables to x and y
  geom_point(alpha = 0.7) +       # Scatter plot (geom_point) with slight transparency for points (alpha)
  geom_smooth(method = "lm", color = "blue", se = TRUE, fill = "lightblue") +         # Line plot (geom_smooth) using linear model (lm) method in blue colour with standard errors (se) in light blue
  labs(x = "Water Temperature (ºC)", y = "Air Temperature (ºC)") +         # Labelling the x and y axis
  theme_minimal()         # Changing theme for clearer visualisation

`geom_smooth()` using formula = 'y ~ x'

Question 2. Does multiple capture have an effect on blotching time?

sharksub <- read_excel("sharksub.xlsx")   # Read the sharks.xlsx file
head(sharksub)                            # Display the data

# A tibble: 6 × 4
  ID    sex    blotch1 blotch2
  <chr> <chr>    <dbl>   <dbl>
1 SH269 Female    36.1    37.2
2 SH163 Female    33.4    34.4
3 SH008 Female    36.3    36.5
4 SH239 Female    35.0    36.0
5 SH332 Female    35.7    36.8
6 SH328 Female    34.9    35.9

summary(sharksub)             # Display data range, median and mean

      ID                sex               blotch1         blotch2     
 Length:50          Length:50          Min.   :32.49   Min.   :33.47  
 Class :character   Class :character   1st Qu.:34.38   1st Qu.:35.31  
 Mode  :character   Mode  :character   Median :34.94   Median :35.94  
                                       Mean   :35.03   Mean   :35.96  
                                       3rd Qu.:35.90   3rd Qu.:36.78  
                                       Max.   :37.07   Max.   :38.18

# Shapiro-Wilk test for blotching times in both captures
shapiro.test(sharksub$blotch1)


    Shapiro-Wilk normality test

data:  sharksub$blotch1
W = 0.97958, p-value = 0.5345

shapiro.test(sharksub$blotch2)


    Shapiro-Wilk normality test

data:  sharksub$blotch2
W = 0.97936, p-value = 0.5255

# Perform a paired t-test for blotch1 and blotch2
paired_ttest <- t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
paired_ttest                  # Output results


    Paired t-test

data:  sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384

paired_ttest$p.value          # Print p-value

[1] 1.326634e-22

sd(sharksub$blotch1)          # Print standard deviation of blotch1

[1] 1.095959

sd(sharksub$blotch2)          # Print standard deviation of blotch2

[1] 1.16283

length(sample(sharksub$blotch1))        # Print sample size of blotch1

[1] 50

length(sample(sharksub$blotch2))        # Print sample size of blotch2

[1] 50

Graphing Effect of Multiple Capture on Blotching Time

sharksub %>%          # Using the `sharksub` dataset and pipe operator `%>%` to chain operations
  pivot_longer(cols = c(blotch1, blotch2),          # Transform the wide-format data into a long format (pivot_longer) by assigning column names 'blotch1' and 'blotch2'
             names_to = "CaptureEvent",           # into a column called 'CaptureEvent'
             values_to = "BlotchTime") %>%          # and their corresponding values into a column called 'BlotchTime'
  ggplot(aes(x = CaptureEvent, y = BlotchTime, fill = CaptureEvent)) +            # Plot the tranformed data, assigning 'CaptureEvent' to x axis and 'BlotchTime' to y axis, using different fill colours for each Capture Event
  stat_boxplot(geom = "errorbar", width = 0.1) +           # Add error bars (geom specifies the type of geometric object) and set width to 0.1
  geom_boxplot() +            # Add boxplot layer on top of error bars
  labs(x = "Capture Event",           # Add labels to the graph
       y = "Blotching Time (seconds)") +
  scale_fill_manual(values = c("orange", "pink")) +           # Manually set fill colours for the boxplots to orange and pink
  scale_y_continuous(breaks = seq(32, 40, by = 1)) +           # Customise the y axis to show tick marks at intervals of 1 and range from 32 to 40
  scale_x_discrete(labels = c("blotch1" = "First Capture",            # Rename the x axis labels
                              "blotch2" = "Second Capture")) +
  theme_minimal() +             # Apply minimal theme for clearer visuals
  theme(panel.grid.major.x = element_blank(),           # Remove the legend and x axis grid lines
        legend.position = "none")

Independent Question. Does sex have an effect on blotching time?

# Shapiro-Wilk test where [data$variable == "value"] creates a logical condition using 'sex' values 'Female' and 'Male' to filter the rows of 'blotch'
shapiro.test(sharks$blotch[sharks$sex == "Female"])       # Normality test for blotching times in female sharks


    Shapiro-Wilk normality test

data:  sharks$blotch[sharks$sex == "Female"]
W = 0.99527, p-value = 0.682

shapiro.test(sharks$blotch[sharks$sex == "Male"])         # Normality test for blotching times in male sharks


    Shapiro-Wilk normality test

data:  sharks$blotch[sharks$sex == "Male"]
W = 0.99209, p-value = 0.1701

# var.test() checks if the variances between groups are equal
var.test(sharks$blotch[sharks$sex == "Female"], sharks$blotch[sharks$sex == "Male"])


    F test to compare two variances

data:  sharks$blotch[sharks$sex == "Female"] and sharks$blotch[sharks$sex == "Male"]
F = 0.94109, num df = 235, denom df = 263, p-value = 0.6347
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.7340224 1.2087050
sample estimates:
ratio of variances 
         0.9410879

# Perform a t-test with equal variance (var.equal = TRUE) using the formula 'blotch ~ sex' to specify that 'blotch' is the dependent variable and 'sex' is the independent grouping variable
t.test(blotch ~ sex, data = sharks, var.equal = TRUE)


    Two Sample t-test

data:  blotch by sex
t = -3.023, df = 498, p-value = 0.002632
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
 -0.6326914 -0.1342420
sample estimates:
mean in group Female   mean in group Male 
            34.92294             35.30641

# Summary of data
summary(sharks$blotch[sharks$sex == "Female"])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  30.78   34.12   34.87   34.92   35.95   38.66

summary(sharks$blotch[sharks$sex == "Male"])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  31.88   34.25   35.30   35.31   36.26   40.08

sd(sharks$blotch[sharks$sex == "Female"])         # Print standard deviation of 'Female' filtered 'blotch'

[1] 1.393145

sd(sharks$blotch[sharks$sex == "Male"])           # Print standard deviation of 'Male' filtered 'blotch'

[1] 1.436089

length(sample(sharks$blotch[sharks$sex == "Female"]))        # Print sample size of 'Female' filtered 'blotch'

[1] 236

length(sample(sharks$blotch[sharks$sex == "Male"]))          # Print sample size of 'Male' filtered 'blotch'

[1] 264

Graphing Effect of Sex on Blotching Time

# Plot 'sharks' data assigning 'sex' and 'blotch' variables to x and y respectively, using different fill colours for each 'sex'
ggplot(sharks, aes(x = sex, y = blotch, fill = sex)) +
  stat_boxplot(geom = "errorbar", width = 0.2) +
  geom_boxplot() +
  labs(x = "Sex", y = "Blotching Time (seconds)") +
  scale_fill_manual(values = c("pink", "lightblue")) +
  scale_y_continuous(breaks = seq(20, 50, by = 2)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        legend.position = "none")

Question 3. Is it possible to predict blotching time?

# Shapiro-Wilk test for blotch in the sharks dataset
shapiro.test(sharks$blotch)


    Shapiro-Wilk normality test

data:  sharks$blotch
W = 0.99695, p-value = 0.4769

# Summarise the dataset
summary(sharks)

      ID                sex                blotch           BPM       
 Length:500         Length:500         Min.   :30.78   Min.   :119.0  
 Class :character   Class :character   1st Qu.:34.16   1st Qu.:129.0  
 Mode  :character   Mode  :character   Median :35.05   Median :142.0  
                                       Mean   :35.13   Mean   :141.8  
                                       3rd Qu.:36.05   3rd Qu.:153.2  
                                       Max.   :40.08   Max.   :166.0  
     weight           length           air            water      
 Min.   : 65.10   Min.   :128.3   Min.   :33.00   Min.   :20.01  
 1st Qu.: 75.68   1st Qu.:172.0   1st Qu.:34.42   1st Qu.:21.55  
 Median : 87.82   Median :211.1   Median :35.43   Median :23.11  
 Mean   : 87.94   Mean   :211.0   Mean   :35.54   Mean   :23.02  
 3rd Qu.:100.40   3rd Qu.:251.8   3rd Qu.:36.71   3rd Qu.:24.37  
 Max.   :110.94   Max.   :291.0   Max.   :38.00   Max.   :25.99  
      meta            depth      
 Min.   : 50.03   Min.   :44.64  
 1st Qu.: 67.39   1st Qu.:48.90  
 Median : 82.45   Median :50.14  
 Mean   : 82.04   Mean   :50.14  
 3rd Qu.: 95.97   3rd Qu.:51.35  
 Max.   :112.45   Max.   :56.83

# Multiple linear regression model for blotching time
model <- lm(blotch ~ air + water + BPM + weight + length + meta + depth, data = sharks)
summary(model)          # Output model summary, showing coefficients, standard deviation, t value and p value


Call:
lm(formula = blotch ~ air + water + BPM + weight + length + meta + 
    depth, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.83745 -0.66117 -0.00702  0.60110  2.74108 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.1405851  1.8958668   5.876 7.74e-09 ***
air         -0.0281474  0.0318707  -0.883    0.378    
water       -0.0188934  0.0270782  -0.698    0.486    
BPM         -0.0019723  0.0031890  -0.618    0.537    
weight       0.0016283  0.0033511   0.486    0.627    
length       0.0012295  0.0009710   1.266    0.206    
meta        -0.0009712  0.0025951  -0.374    0.708    
depth        0.5061285  0.0223191  22.677  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.002 on 492 degrees of freedom
Multiple R-squared:  0.514, Adjusted R-squared:  0.507 
F-statistic: 74.32 on 7 and 492 DF,  p-value: < 2.2e-16

Graphing Prediction Model for Blotching Time

# Check the rest of the data for non-linear trends
shark_long <- pivot_longer(sharks,         # Convert the data into a long format for facet wrapping
                           cols = -c(blotch, sex, ID, depth),
                           names_to = "Predictor", 
                           values_to = "Value")

# Custom labels for the facet grid
facet_labels <- c(
  air = "Air (ºC)",
  water = "Water (ºC)",
  meta = "Cortisol (mcg/dl)",
  BPM = "Heart Rate (BPM)",
  length = "Length (cm)",
  weight = "Weight (kg)")

# Scatter plots with custom facet titles
ggplot(shark_long, aes(x = Value, y = blotch)) +
  geom_point(alpha = 0.7, color = "blue") +       # Scatter points with transparency
  facet_wrap(~ Predictor, scales = "free_x", labeller = as_labeller(facet_labels)) +       # Custom facet titles
  labs(x = "Predictor Value",
       y = "Blotching Time (seconds)") +
  scale_y_continuous(breaks = seq(20, 50, by = 2)) +
  theme_minimal() +
  theme(strip.text = element_text(size = 11, face = "bold"))

# Scatter plot for depth vs blotch
ggplot(sharks, aes(x = depth, y = blotch)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", color = "blue", se = TRUE) +
  labs(x = "Depth Hooked (metres)", y = "Blotching Time (seconds)") +
  scale_y_continuous(breaks = seq(20, 50, by = 2)) +
  scale_x_continuous(breaks = seq(40, 60, by = 2)) +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

# Scatter plot for sex difference for depth vs blotch
ggplot(sharks, aes(x = depth, y = blotch, color = sex)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(x = "Depth Hooked (metres)",
       y = "Blotching Time (seconds)",
       color = "Sex") +
  scale_y_continuous(breaks = seq(20, 50, by = 2)) +
  scale_x_continuous(breaks = seq(40, 60, by = 2)) +
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

# Generate predictions using significant predictors and add to the data set
predictionmodel <- lm(blotch ~ depth + sex, data = sharks)
sharks$predictedblotch <- predict(predictionmodel, newdata = sharks)        # predict() calculaates the predicted blotching times for each observation, using depth and sex as predictors as defined in the previous model

# Plot observed vs predicted blotching times
ggplot(sharks, aes(x = predictedblotch, y = blotch)) +
  geom_point(alpha = 0.7, color = "darkgreen") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +         # Ideal line where predictions match observations
  labs(x = "Predicted Blotch", y = "Observed Blotch") +
  scale_y_continuous(breaks = seq(20, 50, by = 2)) +
  theme_minimal()

summary(predictionmodel)


Call:
lm(formula = blotch ~ depth + sex, data = sharks)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.96224 -0.64833 -0.01317  0.58882  2.98829 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.80938    1.10029   8.915  < 2e-16 ***
depth        0.50172    0.02194  22.864  < 2e-16 ***
sexMale      0.30379    0.08871   3.424 0.000667 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9895 on 497 degrees of freedom
Multiple R-squared:  0.5214,    Adjusted R-squared:  0.5195 
F-statistic: 270.7 on 2 and 497 DF,  p-value: < 2.2e-16

summary(sharks$blotch)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  30.78   34.16   35.05   35.13   36.05   40.08

summary(sharks$predictedblotch)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  32.32   34.47   35.12   35.13   35.77   38.32

# Correlation test between observed and predicted values
cor.test(sharks$blotch, sharks$predictedblotch)


    Pearson's product-moment correlation

data:  sharks$blotch and sharks$predictedblotch
t = 23.293, df = 498, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6772818 0.7615555
sample estimates:
      cor 
0.7220867

# Calculate residuals
sharks$predictedresiduals <- residuals(predictionmodel)
summary(sharks$predictedresiduals)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-2.96224 -0.64833 -0.01317  0.00000  0.58882  2.98829

# Plot residuals vs fitted values
ggplot(sharks, aes(x = predictedblotch, y = predictedresiduals)) +
  geom_point(alpha = 0.7, color = "blue") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Fitted Values", y = "Residuals") +
  theme_minimal()

# QQ plot to assess normality
ggplot(sharks, aes(sample = predictedresiduals)) +
  stat_qq() + 
  stat_qq_line(color = "red") +
  labs(x = "Theoretical Quantiles",
       y = "Sample Quantiles") +
  theme_minimal()