Question 1: Is there a correlation between the variables air and
water?
Correlation Analysis
# Correlation coefficient between air and water temperature.
# use = "complete.obs" restricts the calculation to rows where both
# 'air' and 'water' are present.
with(sharks, cor(air, water, use = "complete.obs"))
## [1] -0.05524051
Relationship Visualization
# Scatter plot of water temperature against air temperature, overlaid with
# a linear-model smoother (method = "lm") drawn in red.
ggplot(data = sharks, mapping = aes(x = air, y = water)) +
  geom_point(size = 2, color = "blue") +
  geom_smooth(color = "red", method = "lm") +
  labs(
    x = "Air Temperature (°C)",
    y = "Water Temperature (°C)",
    title = "Scatter Plot of Air Temperature vs Water Temperature"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Test Significance
# Pearson correlation test between the 'air' and 'water' columns of 'sharks'.
# cor.test() has no `use` argument: the previous use = "complete.obs" was
# swallowed by `...` and ignored. Incomplete pairs are dropped by cor.test()
# itself, so the argument is omitted and the method is stated explicitly.
cor.test(sharks$air, sharks$water, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: sharks$air and sharks$water
## t = -1.2346, df = 498, p-value = 0.2176
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.14224207 0.03260803
## sample estimates:
## cor
## -0.05524051
Question 2: Does multiple capture have an effect on blotching
time?
Calculate the Differences
# Per-row change in blotch value between captures (second minus first),
# stored as the new column 'blotch_diff', followed by its five-number
# summary and mean.
sharksub[["blotch_diff"]] <- with(sharksub, blotch2 - blotch1)
summary(sharksub[["blotch_diff"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.4488 1.0216 1.0453 0.9297 1.0711 1.1121
Paired t-test
# Paired t-test comparing the first and second blotch values in 'sharksub'.
# With paired = TRUE the tested quantity is the mean of blotch1 - blotch2,
# which has the opposite sign to 'blotch_diff' (blotch2 - blotch1); a negative
# mean difference here therefore means blotch2 is larger on average.
t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
##
## Paired t-test
##
## data: sharksub$blotch1 and sharksub$blotch2
## t = -17.39, df = 49, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -1.037176 -0.822301
## sample estimates:
## mean difference
## -0.9297384
Visualize the Differences
# Reshape 'sharksub' to long format: the blotch1/blotch2 columns are stacked
# so that the source column name lands in Capture_Event and its value in
# Blotch_Time.
sharksub_long <- pivot_longer(
  sharksub,
  cols = c(blotch1, blotch2),
  names_to = "Capture_Event",
  values_to = "Blotch_Time"
)

# One boxplot per capture event, filled by capture event.
ggplot(
  data = sharksub_long,
  mapping = aes(x = Capture_Event, y = Blotch_Time, fill = Capture_Event)
) +
  geom_boxplot() +
  labs(
    x = "Capture Event",
    y = "Blotch Time (seconds)",
    title = "Comparison of Blotch Time Between Captures"
  ) +
  theme_minimal()

Question 3: Is it possible to predict blotching time?
Fit a Linear Regression Model
# Multiple linear regression of 'blotch' on six predictors from 'sharks'.
# The formula is written inline (not stored in a variable) so that the
# printed Call shows the full formula.
model <- lm(
  formula = blotch ~ BPM + weight + length + air + water + meta,
  data = sharks
)
summary(model)
##
## Call:
## lm(formula = blotch ~ BPM + weight + length + air + water + meta,
## data = sharks)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3251 -0.9574 -0.0401 0.9160 4.9332
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.2559539 2.1018996 18.201 <2e-16 ***
## BPM -0.0032950 0.0045553 -0.723 0.470
## weight 0.0012683 0.0047875 0.265 0.791
## length -0.0007032 0.0013819 -0.509 0.611
## air -0.0425615 0.0455232 -0.935 0.350
## water -0.0474373 0.0386436 -1.228 0.220
## meta -0.0002694 0.0037073 -0.073 0.942
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.432 on 493 degrees of freedom
## Multiple R-squared: 0.005944, Adjusted R-squared: -0.006154
## F-statistic: 0.4913 on 6 and 493 DF, p-value: 0.815
Feature Importance
# Stepwise model selection by AIC, starting from the full model.
# With no `scope` supplied, step() only removes terms (backward
# elimination); the direction is spelled out here for clarity. Terms are
# dropped while doing so lowers the AIC -- this is an information-criterion
# search, not a significance test on individual predictors.
step(model, direction = "backward")
## Start: AIC=365.93
## blotch ~ BPM + weight + length + air + water + meta
##
## Df Sum of Sq RSS AIC
## - meta 1 0.01082 1010.8 363.94
## - weight 1 0.14390 1010.9 364.01
## - length 1 0.53091 1011.3 364.20
## - BPM 1 1.07276 1011.9 364.46
## - air 1 1.79216 1012.6 364.82
## - water 1 3.08954 1013.9 365.46
## <none> 1010.8 365.93
##
## Step: AIC=363.94
## blotch ~ BPM + weight + length + air + water
##
## Df Sum of Sq RSS AIC
## - weight 1 0.14208 1010.9 362.01
## - length 1 0.53236 1011.3 362.20
## - BPM 1 1.07327 1011.9 362.47
## - air 1 1.85837 1012.6 362.86
## - water 1 3.10225 1013.9 363.47
## <none> 1010.8 363.94
##
## Step: AIC=362.01
## blotch ~ BPM + length + air + water
##
## Df Sum of Sq RSS AIC
## - length 1 0.5410 1011.5 360.28
## - BPM 1 1.0651 1012.0 360.54
## - air 1 1.9122 1012.8 360.95
## - water 1 3.0143 1014.0 361.50
## <none> 1010.9 362.01
##
## Step: AIC=360.28
## blotch ~ BPM + air + water
##
## Df Sum of Sq RSS AIC
## - BPM 1 0.96657 1012.4 358.75
## - air 1 1.83765 1013.3 359.18
## - water 1 2.87358 1014.4 359.70
## <none> 1011.5 360.28
##
## Step: AIC=358.75
## blotch ~ air + water
##
## Df Sum of Sq RSS AIC
## - air 1 1.6705 1014.1 357.58
## - water 1 2.9447 1015.4 358.21
## <none> 1012.4 358.75
##
## Step: AIC=357.58
## blotch ~ water
##
## Df Sum of Sq RSS AIC
## - water 1 2.713 1016.8 356.91
## <none> 1014.1 357.58
##
## Step: AIC=356.91
## blotch ~ 1
##
## Call:
## lm(formula = blotch ~ 1, data = sharks)
##
## Coefficients:
## (Intercept)
## 35.13
Visualize Predictions
# Predicted blotch time for every row of 'sharks'.
# predict(model) alone returns fitted values only for the rows lm() kept, so
# the assignment would fail with a length mismatch if any row had been dropped
# for missing values. Passing newdata = sharks always yields one value per
# row, with NA where a predictor is missing.
sharks$predicted_blotch <- predict(model, newdata = sharks)

# Observed vs. predicted blotch time; the red line is y = x, where points
# would fall if predictions matched observations exactly.
ggplot(data = sharks, mapping = aes(x = blotch, y = predicted_blotch)) +
  geom_point(color = "blue") +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  labs(
    title = "Observed vs Predicted Blotch Time",
    x = "Observed Blotch Time",
    y = "Predicted Blotch Time"
  ) +
  theme_minimal()
