Question 1: Is there a correlation between the variables air and water?

Correlation Analysis

# Pearson correlation coefficient between air temperature and water
# temperature, computed only on rows where both columns are observed

with(sharks, cor(air, water, use = "complete.obs"))
## [1] -0.05524051

Relationship Visualization

# Scatter plot of water temperature against air temperature, overlaid with a
# straight-line (linear model) fit drawn in red

ggplot(data = sharks, mapping = aes(x = air, y = water)) +
  geom_point(colour = "blue", size = 2) +
  geom_smooth(method = "lm", colour = "red") +
  ggtitle("Scatter Plot of Air Temperature vs Water Temperature") +
  xlab("Air Temperature (°C)") +
  ylab("Water Temperature (°C)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Test Significance

# Test whether the Pearson correlation between the 'air' and 'water' columns
# of 'sharks' differs from zero.
# Unlike cor(), cor.test() takes no `use` argument (anything extra is absorbed
# by `...` and ignored); it drops incomplete pairs on its own, so only the
# method needs to be stated.

cor.test(sharks$air, sharks$water, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  sharks$air and sharks$water
## t = -1.2346, df = 498, p-value = 0.2176
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.14224207  0.03260803
## sample estimates:
##         cor 
## -0.05524051

Question 2: Does multiple capture have an effect on blotching time?

Calculate the Differences

# Per-shark change in blotching time (second capture minus first capture),
# stored as the new column 'blotch_diff' and then summarised

sharksub[["blotch_diff"]] <- with(sharksub, blotch2 - blotch1)
summary(sharksub[["blotch_diff"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.4488  1.0216  1.0453  0.9297  1.0711  1.1121

Paired t-test

# Two-sided paired t-test comparing the first and second blotch values in
# 'sharksub'. The reported mean difference is blotch1 - blotch2, i.e. the
# opposite sign to the 'blotch_diff' column (blotch2 - blotch1).

t.test(
  x = sharksub$blotch1,
  y = sharksub$blotch2,
  alternative = "two.sided",
  paired = TRUE
)
## 
##  Paired t-test
## 
## data:  sharksub$blotch1 and sharksub$blotch2
## t = -17.39, df = 49, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -1.037176 -0.822301
## sample estimates:
## mean difference 
##      -0.9297384

Visualize the Differences

# Reshape 'sharksub' so that blotch1 and blotch2 are stacked in one value
# column ('Blotch_Time') labelled by capture ('Capture_Event'), then draw one
# box per capture event

sharksub_long <- pivot_longer(
  sharksub,
  cols = c(blotch1, blotch2),
  names_to = "Capture_Event",
  values_to = "Blotch_Time"
)

ggplot(
  data = sharksub_long,
  mapping = aes(x = Capture_Event, y = Blotch_Time, fill = Capture_Event)
) +
  geom_boxplot() +
  ggtitle("Comparison of Blotch Time Between Captures") +
  xlab("Capture Event") +
  ylab("Blotch Time (seconds)") +
  theme_minimal()

Question 3: Is it possible to predict blotching time?

Fit a Linear Regression Model

# Fit a multiple linear regression of 'blotch' on all six candidate
# predictors in the 'sharks' dataset, then print the coefficient table and
# overall fit statistics

model <- lm(
  formula = blotch ~ BPM + weight + length + air + water + meta,
  data = sharks
)
summary(object = model)
## 
## Call:
## lm(formula = blotch ~ BPM + weight + length + air + water + meta, 
##     data = sharks)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3251 -0.9574 -0.0401  0.9160  4.9332 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 38.2559539  2.1018996  18.201   <2e-16 ***
## BPM         -0.0032950  0.0045553  -0.723    0.470    
## weight       0.0012683  0.0047875   0.265    0.791    
## length      -0.0007032  0.0013819  -0.509    0.611    
## air         -0.0425615  0.0455232  -0.935    0.350    
## water       -0.0474373  0.0386436  -1.228    0.220    
## meta        -0.0002694  0.0037073  -0.073    0.942    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.432 on 493 degrees of freedom
## Multiple R-squared:  0.005944,   Adjusted R-squared:  -0.006154 
## F-statistic: 0.4913 on 6 and 493 DF,  p-value: 0.815

Check Model Fit

# Draw the four standard lm diagnostic plots to inspect the residuals:
# Residuals vs Fitted (1), Normal Q-Q (2), Scale-Location (3) and
# Residuals vs Leverage (5)

plot(model, which = c(1, 2, 3, 5))

Feature Importance

# Backward stepwise selection starting from the full model: at each step the
# term whose removal lowers AIC the most is dropped, until no removal helps.
# Selection is driven by AIC, not by p-values.

step(object = model, direction = "backward")
## Start:  AIC=365.93
## blotch ~ BPM + weight + length + air + water + meta
## 
##          Df Sum of Sq    RSS    AIC
## - meta    1   0.01082 1010.8 363.94
## - weight  1   0.14390 1010.9 364.01
## - length  1   0.53091 1011.3 364.20
## - BPM     1   1.07276 1011.9 364.46
## - air     1   1.79216 1012.6 364.82
## - water   1   3.08954 1013.9 365.46
## <none>                1010.8 365.93
## 
## Step:  AIC=363.94
## blotch ~ BPM + weight + length + air + water
## 
##          Df Sum of Sq    RSS    AIC
## - weight  1   0.14208 1010.9 362.01
## - length  1   0.53236 1011.3 362.20
## - BPM     1   1.07327 1011.9 362.47
## - air     1   1.85837 1012.6 362.86
## - water   1   3.10225 1013.9 363.47
## <none>                1010.8 363.94
## 
## Step:  AIC=362.01
## blotch ~ BPM + length + air + water
## 
##          Df Sum of Sq    RSS    AIC
## - length  1    0.5410 1011.5 360.28
## - BPM     1    1.0651 1012.0 360.54
## - air     1    1.9122 1012.8 360.95
## - water   1    3.0143 1014.0 361.50
## <none>                1010.9 362.01
## 
## Step:  AIC=360.28
## blotch ~ BPM + air + water
## 
##         Df Sum of Sq    RSS    AIC
## - BPM    1   0.96657 1012.4 358.75
## - air    1   1.83765 1013.3 359.18
## - water  1   2.87358 1014.4 359.70
## <none>               1011.5 360.28
## 
## Step:  AIC=358.75
## blotch ~ air + water
## 
##         Df Sum of Sq    RSS    AIC
## - air    1    1.6705 1014.1 357.58
## - water  1    2.9447 1015.4 358.21
## <none>               1012.4 358.75
## 
## Step:  AIC=357.58
## blotch ~ water
## 
##         Df Sum of Sq    RSS    AIC
## - water  1     2.713 1016.8 356.91
## <none>               1014.1 357.58
## 
## Step:  AIC=356.91
## blotch ~ 1
## 
## Call:
## lm(formula = blotch ~ 1, data = sharks)
## 
## Coefficients:
## (Intercept)  
##       35.13

Visualize Predictions

# Predict blotch time for every row of 'sharks' from the full model and plot
# observed against predicted values.
# Passing newdata keeps the prediction vector the same length as the data
# frame even if lm() dropped incomplete rows (those rows get NA), so the
# column assignment cannot fail on a length mismatch.
# The red identity line (slope 1, intercept 0) marks perfect prediction.

sharks$predicted_blotch <- predict(model, newdata = sharks)
ggplot(sharks, aes(x = blotch, y = predicted_blotch)) +
  geom_point(color = "blue") +
  geom_abline(slope = 1, intercept = 0, color = "red") +
  labs(title = "Observed vs Predicted Blotch Time",
       x = "Observed Blotch Time", y = "Predicted Blotch Time") +
  theme_minimal()