# Loading sharks excel data set saved as CSV file on my computer
sharks <- read.csv("C:/Users/44738/OneDrive/Documents/NTU- Masters/Research Methods & data analysis/Summative Assessment/sharks.csv")Ethical Dilemmas in Shark Research: Assessing Stress-Induced Blotching in Caribbean Reef Sharks (Carcharinus perezi) using multiple linear regression.
Loading the datasets
Investigating the correlation between air and water variables
Creating a scatterplot for air and water variables
# Creating a scatterplot for air and water variables
plot(sharks$air, sharks$water, xlab="Air Temperature (°C)", ylab="Water Temperature (°C)", main="Relationship between Air and Water Temperature")
abline(lm(sharks$water ~ sharks$air), col= "black") # adding regression lineTesting the correlation between air and water
Checking assumptions for Pearson’s correlation
# Checking normality with Shapiro-Wilk tests
shapiro.test(sharks$air)
Shapiro-Wilk normality test
data: sharks$air
W = 0.95885, p-value = 1.338e-10
shapiro.test(sharks$water)
Shapiro-Wilk normality test
data: sharks$water
W = 0.96035, p-value = 2.371e-10
Both p values < 0.05 so don’t meet Pearson’s assumption or normality
Carrying out Spearman’s rank correlation
# Calculating Spearman's correlation coefficient
cor.test(sharks$air, sharks$water, method = "spearman")
Spearman's rank correlation rho
data: sharks$air and sharks$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.05637344
Impact of multiple capture on blotching time
Creating a paired box plot for blotching time and capture event
# Installing required package
suppressMessages(suppressWarnings(
install.packages("PairedData", repos = "https://cran.rstudio.com/")
))package 'PairedData' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\44738\AppData\Local\Temp\RtmpUf6QGk\downloaded_packages
# Loading required package
suppressMessages(suppressWarnings(library(PairedData)))# Creating paired box plot for blotching time and capture event
First <- sharksub$blotch1 # creating vectors for easier usage vs calling column data
Second <- sharksub$blotch2
paired_data <- paired(First, Second) # creating a paired data object for later use
plot(paired_data, type = "profile") + # specifying paired observations
theme_bw() + # minimal theme
labs(title = "Comparison of Blotching Time Between First and Second Capture",
x = "Capture Event",
y = "Blotching Time (s)") +
theme(panel.grid.major = element_blank(), # remove major grid lines
panel.grid.minor = element_blank()) # remove minor grid linesGetting median values for both capture events for analysis
# Calculating medians for both capture events
median_blotch1 <- median(sharksub$blotch1, na.rm = TRUE)
median_blotch2 <- median(sharksub$blotch2, na.rm = TRUE) # Showing median values
cat("Median blotching time for first capture:", median_blotch1, "seconds\n") # naming and adding unitsMedian blotching time for first capture: 34.93777 seconds
cat("Median blotching time for second capture:", median_blotch2, "seconds\n")Median blotching time for second capture: 35.94079 seconds
Testing differences between blotching time against capture event
Checking paired t-test assumptions
# Testing normality of both capture events using Shapiro-wilk test
shapiro.test(sharksub$blotch1)
Shapiro-Wilk normality test
data: sharksub$blotch1
W = 0.97958, p-value = 0.5345
shapiro.test(sharksub$blotch2)
Shapiro-Wilk normality test
data: sharksub$blotch2
W = 0.97936, p-value = 0.5255
Conducting paired t-test
# Paired t test
t_test_result <- t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE) # running test
print(t_test_result) # show result
Paired t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.037176 -0.822301
sample estimates:
mean difference
-0.9297384
Predicting blotching time
Checking for potential predictors of blotching time out of all the continuous variables by testing for correlations
Checking if the continuous variables meet Pearson’s correlation assumptions
Weight
# Checking for outliers
boxplot(sharks$weight, main= "Boxplot of Weight")# Checking for normality
shapiro.test(sharks$weight)
Shapiro-Wilk normality test
data: sharks$weight
W = 0.94662, p-value = 1.929e-12
Length
# Checking for outliers
boxplot(sharks$length, main= "Boxplot of Length")# Checking for normality
shapiro.test(sharks$weight)
Shapiro-Wilk normality test
data: sharks$weight
W = 0.94662, p-value = 1.929e-12
BPM
# Checking for outliers
boxplot(sharks$BPM, main= "Boxplot of BPM")# Checking for normality
shapiro.test(sharks$BPM)
Shapiro-Wilk normality test
data: sharks$BPM
W = 0.947, p-value = 2.178e-12
Air
# Checking for outliers
boxplot(sharks$air, main= "Boxplot of Air")# Checking for normality
shapiro.test(sharks$air)
Shapiro-Wilk normality test
data: sharks$air
W = 0.95885, p-value = 1.338e-10
Water
# Checking for outliers
boxplot(sharks$water, main= "Boxplot of Water")# Checking for normality
shapiro.test(sharks$water)
Shapiro-Wilk normality test
data: sharks$water
W = 0.96035, p-value = 2.371e-10
Cortisol
# Checking for outliers
boxplot(sharks$meta, main= "Boxplot of Shark Cortisol")# Checking for normality
shapiro.test((sharks$meta))
Shapiro-Wilk normality test
data: (sharks$meta)
W = 0.96374, p-value = 9.141e-10
Depth
# Checking for outliers
boxplot(sharks$depth, main= "Boxplot of Shark Capture Depth")# Checking for normality
shapiro.test(sharks$depth)
Shapiro-Wilk normality test
data: sharks$depth
W = 0.99746, p-value = 0.6485
All continuous variables did not meet Pearson’s assumptions of normality/no outliers
Testing correlation using Spearman’s rank
# Testing continuous variables correlation to blotching time using Spearman's rank
continuous_vars <- sharks[, c("blotch", "BPM", "weight", "length", "air", "water", "meta", "depth")] # subset continuous variables
correlation_matrix_continuous_vars <- cor(continuous_vars, use = "complete.obs", method = "spearman") # conduct correlation matrix testing correlations of all continuous variables against each other using Spearman's rank
blotch_correlations <- correlation_matrix_continuous_vars["blotch", -1] # extract only correlations between blotch and the other continuous variables
blotch_correlations_df <- data.frame(Variable = names(blotch_correlations), Correlation = blotch_correlations) # create data frame to visualize test output
print(blotch_correlations_df) # show output Variable Correlation
BPM BPM -3.669241e-02
weight weight 5.376022e-05
length length -1.168872e-02
air air -3.053167e-02
water water -4.442139e-02
meta meta -2.773979e-02
depth depth 6.737177e-01
# Creating bar chart to visualize Spearman's output
variable_labels <- c("BPM", "Weight", "Length", "Air", "Water", "Cortisol", "Depth") # changing variable names
barplot_heights <- barplot(
blotch_correlations,
main = "Correlation of Blotch with Other Continuous Variables",
ylab = "Spearman's rho",
xlab = "Variables",
col = "gray", # using minimalistic colour
las = 2, # rotate labels on x-axis for better readability
cex.names = 0.7, # size of axis labels
ylim = c(-1, 1), # set y-axis limits from -1 to 1
names.arg = variable_labels # use the new variable names
)
text(
x = barplot_heights,
y = blotch_correlations,
label = round(blotch_correlations, 2), # adding rho value labels to 2 decimal places
pos = 3, # position labels above the bars
cex = 0.8 # adjust text size
)Visualising depth relationship
# Creating scatterplot for depth against blotching time
plot(sharks$depth, sharks$blotch, xlab= "Capture Depth (m)", ylab= "Blotching Time (s)", main= "Relationship between Capture Depth and Blotching Time")
abline(lm(sharks$blotch ~ sharks$depth), col= "black") # adding linear regression lineCarrying out individual Spearman’s test for depth for analysis figures
# Spearman's rank test for depth against blotching time
cor.test(sharks$blotch, sharks$depth, method = "spearman")
Spearman's rank correlation rho
data: sharks$blotch and sharks$depth
S = 6797520, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.6737177
Checking if sex could be potential predictor for blotching time
Seeing if significant differnces in blotching time between sex
# Testing Normal Distribution with Shaprio Wilk
with(sharks, shapiro.test(blotch[sex == "Male"]))
Shapiro-Wilk normality test
data: blotch[sex == "Male"]
W = 0.99209, p-value = 0.1701
with(sharks, shapiro.test(blotch[sex == "Female"]))
Shapiro-Wilk normality test
data: blotch[sex == "Female"]
W = 0.99527, p-value = 0.682
# Testing equal variances
var.test(blotch ~ sex, data = sharks)
F test to compare two variances
data: blotch by sex
F = 0.94109, num df = 235, denom df = 263, p-value = 0.6347
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
0.7340224 1.2087050
sample estimates:
ratio of variances
0.9410879
Meet assumptions so do T-test
# Conducting T-test
Female_blotch <- sharks$blotch[sharks$sex == "Female"]
Male_blotch <- sharks$blotch[sharks$sex == "Male"]
t.test(Female_blotch, Male_blotch, var.equal = TRUE)
Two Sample t-test
data: Female_blotch and Male_blotch
t = -3.023, df = 498, p-value = 0.002632
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.6326914 -0.1342420
sample estimates:
mean of x mean of y
34.92294 35.30641
Creating boxplot for sex and blotching time to further visualize the relationship
# Install necessary package
install.packages("ggplot2", repos = "https://cran.rstudio.com/")Warning: package 'ggplot2' is in use and will not be installed
# Loading required packages
library(ggplot2) # Creating boxplot for sex against blotching time
ggplot(sharks, aes(x = sex, y = blotch)) +
geom_boxplot() +
theme_minimal() +
theme(
panel.grid.major = element_blank(), # remove major grid lines
panel.grid.minor = element_blank(), # remove minor grid lines
panel.border = element_blank(),
axis.line = element_line(color = "black") # keep the axis lines
) +
labs(
title = "Boxplot of Blotch vs Sex",
x = "Sex",
y = "Blotching Time (s)"
)Calculating medians for analysis
# Calculating median blotching time for each sex
median_blotch_by_sex <- tapply(sharks$blotch, sharks$sex, median)
median_blotch_by_sex # display the results Female Male
34.86691 35.30226
Multiple Linear Regression to test prediction ability of sex and depth
Checking assumptions of multiple linear regression
Linearity
- Earlier scatterplot of depth and blotching time shows linear relationship (not relevant for sex but can include as t-test showed significant differences)
No Multicollinearity
# Installing required package
install.packages("car", repos = "https://cran.rstudio.com/")Installing package into 'C:/Users/44738/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)
package 'car' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\44738\AppData\Local\Temp\RtmpUf6QGk\downloaded_packages
library(car) # loading packageWarning: package 'car' was built under R version 4.4.2
Loading required package: carData
Warning: package 'carData' was built under R version 4.4.2
# Testing for multicollinearity using VIF
model <- lm(blotch ~ depth + sex, data = sharks)
vif(model) depth sex
1.001545 1.001545
- No multicollinearity (VIF < 5)
Homoscedasticity
# Testing homoscedasticity
plot(model$fitted.values, model$residuals)
abline(h = 0, col = "red")- Yes as no clear pattern
Multivariate normality
# Doing Q-Q plot for residuals to check multivariate normality
# Q-Q plot for residuals
qqnorm(residuals(model))
qqline(residuals(model), col = "red")- Normal
Carrying out multiple linear regression
# Getting output of the multiple linear regression model
summary(model)
Call:
lm(formula = blotch ~ depth + sex, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.96224 -0.64833 -0.01317 0.58882 2.98829
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.80938 1.10029 8.915 < 2e-16 ***
depth 0.50172 0.02194 22.864 < 2e-16 ***
sexMale 0.30379 0.08871 3.424 0.000667 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9895 on 497 degrees of freedom
Multiple R-squared: 0.5214, Adjusted R-squared: 0.5195
F-statistic: 270.7 on 2 and 497 DF, p-value: < 2.2e-16