Sharks

Author

Jal Vashi (N0990629)

For Objective 1: Is there a correlation between air temperature and sea surface temperature?

Step 1:Load the necessary packages

# readxl is used to read the excel file stored in the local computer drive. 
# ggplot2 is used to visualise the data set. 

library(readxl)
library(ggplot2)

Step 2:Import the data from the Excel file

shark <- read_excel("/Users/jal4510/Desktop/MRes/Research Methods/Final Shark/sharks.xlsx", sheet = "Sheet1")

Step 3:View the first few rows of the data to confirm it’s loaded correctly

# head is used to view the first 5-10 data points to ensure the data is loaded correctly
# Summery is used to summarise the variables and provide class, mean, and other basic statistical analyses. 

head(shark)

# A tibble: 6 × 10
  ID    sex    blotch   BPM weight length   air water  meta depth
  <chr> <chr>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH001 Female   37.2   148   74.7   187.  37.7  23.4  64.1  53.2
2 SH002 Female   34.5   158   73.4   189.  35.7  21.4  73.7  49.6
3 SH003 Female   36.3   125   71.8   284.  34.8  20.1  54.4  49.4
4 SH004 Male     35.3   161  105.    171.  36.2  21.6  86.3  50.3
5 SH005 Female   37.4   138   67.1   264.  33.6  21.8 108.   49.0
6 SH006 Male     33.5   126  110.    270.  36.4  20.9 109.   46.8

summary(shark)

      ID                sex                blotch           BPM       
 Length:500         Length:500         Min.   :30.78   Min.   :119.0  
 Class :character   Class :character   1st Qu.:34.16   1st Qu.:129.0  
 Mode  :character   Mode  :character   Median :35.05   Median :142.0  
                                       Mean   :35.13   Mean   :141.8  
                                       3rd Qu.:36.05   3rd Qu.:153.2  
                                       Max.   :40.08   Max.   :166.0  
     weight           length           air            water      
 Min.   : 65.10   Min.   :128.3   Min.   :33.00   Min.   :20.01  
 1st Qu.: 75.68   1st Qu.:172.0   1st Qu.:34.42   1st Qu.:21.55  
 Median : 87.82   Median :211.1   Median :35.43   Median :23.11  
 Mean   : 87.94   Mean   :211.0   Mean   :35.54   Mean   :23.02  
 3rd Qu.:100.40   3rd Qu.:251.8   3rd Qu.:36.71   3rd Qu.:24.37  
 Max.   :110.94   Max.   :291.0   Max.   :38.00   Max.   :25.99  
      meta            depth      
 Min.   : 50.03   Min.   :44.64  
 1st Qu.: 67.39   1st Qu.:48.90  
 Median : 82.45   Median :50.14  
 Mean   : 82.04   Mean   :50.14  
 3rd Qu.: 95.97   3rd Qu.:51.35  
 Max.   :112.45   Max.   :56.83

Step 4:Finding the data type

# sapply is used to identify the data type of the column in a data-set. 
sapply (shark, class)

         ID         sex      blotch         BPM      weight      length 
"character" "character"   "numeric"   "numeric"   "numeric"   "numeric" 
        air       water        meta       depth 
  "numeric"   "numeric"   "numeric"   "numeric"

Step 5:Finding the structure of the data

# str is to understand the kind of data, including rows and columns provided in the data set. 
str (shark)

tibble [500 × 10] (S3: tbl_df/tbl/data.frame)
 $ ID    : chr [1:500] "SH001" "SH002" "SH003" "SH004" ...
 $ sex   : chr [1:500] "Female" "Female" "Female" "Male" ...
 $ blotch: num [1:500] 37.2 34.5 36.3 35.3 37.4 ...
 $ BPM   : num [1:500] 148 158 125 161 138 126 166 135 132 127 ...
 $ weight: num [1:500] 74.7 73.4 71.8 104.6 67.1 ...
 $ length: num [1:500] 187 189 284 171 264 ...
 $ air   : num [1:500] 37.7 35.7 34.8 36.2 33.6 ...
 $ water : num [1:500] 23.4 21.4 20.1 21.6 21.8 ...
 $ meta  : num [1:500] 64.1 73.7 54.4 86.3 108 ...
 $ depth : num [1:500] 53.2 49.6 49.4 50.3 49 ...

Step 6:Select columns X and Y for the correlation test

Check for normality.

library(nortest)

# Perform the Anderson-Darling test for normality for different variables
blotch.norm <- ad.test(shark$blotch)
print(blotch.norm)


    Anderson-Darling normality test

data:  shark$blotch
A = 0.38716, p-value = 0.3872

BPM.norm <- ad.test(shark$BPM)
print(BPM.norm)


    Anderson-Darling normality test

data:  shark$BPM
A = 6.8186, p-value < 2.2e-16

weight.norm <- ad.test(shark$weight)
print(weight.norm)


    Anderson-Darling normality test

data:  shark$weight
A = 7.4333, p-value < 2.2e-16

length.norm <- ad.test(shark$length)
print(length.norm)


    Anderson-Darling normality test

data:  shark$length
A = 5.422, p-value = 2.17e-13

air.norm <- ad.test(shark$air)
print(air.norm)


    Anderson-Darling normality test

data:  shark$air
A = 4.6957, p-value = 1.203e-11

water.norm <- ad.test(shark$water)
print(water.norm)


    Anderson-Darling normality test

data:  shark$water
A = 4.8383, p-value = 5.463e-12

meta.norm <- ad.test(shark$meta)
print(meta.norm)


    Anderson-Darling normality test

data:  shark$meta
A = 4.0909, p-value = 3.462e-10

Perform Pearson correlation test

# As we are only interested in the relation between air and water temperatures, a correlation test is performed. 
# Select columns for the correlation test
x <- shark$water
y <- shark$air

# Perform Pearson correlation tests for each pair of variables
# Correlation between air and water
cor_test_air_water <- cor.test(x, y)  

# Print the correlation test results
print(cor_test_air_water)


    Pearson's product-moment correlation

data:  x and y
t = -1.2346, df = 498, p-value = 0.2176
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.14224207  0.03260803
sample estimates:
        cor 
-0.05524051

Create a scatter plot with a linear regression line

# dplyr helps to manipulate the data set
# vtable is used to automatically output the data while continuing the work on the data.  

library(readxl)
library(ggplot2)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(vtable)

Loading required package: kableExtra


Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows

# Create a scatter plot with a linear regression line, encoding FW with color
ggplot(shark, aes(x = water, y = air)) +
  geom_point() +  
  geom_smooth(method = "lm", se = FALSE, color = "navy") +
  labs(
    title = paste("Scatter Plot of air vs. water
                  \n Correlation = ", round(cor(x, y), 2),
                  "\n P-value = ", round(cor_test_air_water$p.value, 4)),
    x = "Air temp (in ºC)",
    y = "Water temp (in ºC)"
  ) +
  theme_minimal() +
  scale_color_gradient(low = "blue", high = "red")

`geom_smooth()` using formula = 'y ~ x'

For Objective 2: Is there an effect on blotching time in respect to multiple catchings?

Step 1:Load the necessary packages

library(readxl)
library(ggplot2)

Step 2:Import the data from the Excel file

sharksub <- read_excel("/Users/jal4510/Desktop/MRes/Research Methods/Final Shark/sharksub.xlsx", sheet = "Sheet1")

Step 3:View the first few rows of the data to confirm it’s loaded correctly

head(sharksub)

# A tibble: 6 × 4
  ID    sex    blotch1 blotch2
  <chr> <chr>    <dbl>   <dbl>
1 SH269 Female    36.1    37.2
2 SH163 Female    33.4    34.4
3 SH008 Female    36.3    36.5
4 SH239 Female    35.0    36.0
5 SH332 Female    35.7    36.8
6 SH328 Female    34.9    35.9

summary(sharksub)

      ID                sex               blotch1         blotch2     
 Length:50          Length:50          Min.   :32.49   Min.   :33.47  
 Class :character   Class :character   1st Qu.:34.38   1st Qu.:35.31  
 Mode  :character   Mode  :character   Median :34.94   Median :35.94  
                                       Mean   :35.03   Mean   :35.96  
                                       3rd Qu.:35.90   3rd Qu.:36.78  
                                       Max.   :37.07   Max.   :38.18

Step 4:Finding the data type

sapply (sharksub, class)

         ID         sex     blotch1     blotch2 
"character" "character"   "numeric"   "numeric"

Step 5:Finding the structure of the data

str (sharksub)

tibble [50 × 4] (S3: tbl_df/tbl/data.frame)
 $ ID     : chr [1:50] "SH269" "SH163" "SH008" "SH239" ...
 $ sex    : chr [1:50] "Female" "Female" "Female" "Female" ...
 $ blotch1: num [1:50] 36.1 33.4 36.3 35 35.7 ...
 $ blotch2: num [1:50] 37.2 34.4 36.5 36 36.8 ...

Step 6:Perform Pearson correlation test

Check for normality.

library(nortest)

# Perform the Anderson-Darling test for normality for different variables
blotch1.norm <- ad.test(sharksub$blotch1)
print(blotch1.norm)


    Anderson-Darling normality test

data:  sharksub$blotch1
A = 0.2969, p-value = 0.578

blotch2.norm <- ad.test(sharksub$blotch2)
print(blotch2.norm)


    Anderson-Darling normality test

data:  sharksub$blotch2
A = 0.29461, p-value = 0.5846

Check for correlation test.

# Select columns for the correlation test
x <- sharksub$blotch1
y <- sharksub$blotch2

# Perform Pearson correlation tests for each pair of variables
# Correlation between blotch1 and blotch2
cor_test_blotch1_blotch2 <- cor.test(x, y)  

# Print the correlation test results
print(cor_test_blotch1_blotch2)


    Pearson's product-moment correlation

data:  x and y
t = 20.154, df = 48, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9057584 0.9689707
sample estimates:
      cor 
0.9456842

Create a scatter plot with a linear regression line

# Since a correlation is found between the blotaching time, a graph is plotted.  

library(readxl)
library(ggplot2)
library(dplyr)
library(vtable)

# Create a scatter plot with a linear regression line, encoding FW with color

ggplot(sharksub, aes(x = blotch1, y = blotch2)) +
  geom_point() +  
  geom_smooth(method = "lm", se = FALSE, color = "navy") +  
  labs(
    title = paste("Scatter Plot of blotch1 vs. blotch2 
                  \n Correlation = ", round(cor(x, y), 2),
                  "\n P-value = ", round(cor_test_blotch1_blotch2$p.value, 4)),
    x = "blotch 1 (in seconds) ",
    y = "blotch2 (in seconds)"
  ) +
  theme_minimal() +
  scale_color_gradient(low = "blue", high = "red")

`geom_smooth()` using formula = 'y ~ x'

A t-test is performed to find the difference between the blotching time when captured first time in compared to second time.

# t-test

t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)


    Paired t-test

data:  sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384

# plot for t-test results 

boxplot(sharksub$blotch1, sharksub$blotch2, names = c("First Capture", "Second Capture"), ylab = "Time taken to Blotch (in seconds)", main = "Comparison of Blotching Times between multiple catchings")

For Objective 3: Is it possible to predict the blotching time?

This objective can be achieved by two different methods. the first method includes the use of just blotching time and using the regression model to predict the blotching time when caught for the third time, while the other method uses another parameter like air, blotcing, water, and meta to predict blotching.

For the first method, a prediction model is used by using just the blotching data.

Step 1:Create a prediction model

# Load necessary library
library(readxl)
library(ggplot2)
library(dplyr)
library(vtable)

# Import the dataset (replace with the correct path)
sharksub_new <- read_excel("/Users/jal4510/Desktop/MRes/Research Methods/Final Shark/sharksub.xlsx", sheet = "Sheet1")

# Check the column names to verify the exact names of 'blotch1' and 'blotch2'
colnames(sharksub_new)

[1] "ID"      "sex"     "blotch1" "blotch2"

# Fit a linear regression model
model <- lm(blotch2 ~ blotch1, data = sharksub_new)

# Print model summary
summary(model)


Call:
lm(formula = blotch2 ~ blotch1, data = sharksub_new)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.37681  0.09513  0.11617  0.13911  0.17550 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.81111    1.74483   0.465    0.644    
blotch1      1.00339    0.04979  20.154   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3819 on 48 degrees of freedom
Multiple R-squared:  0.8943,    Adjusted R-squared:  0.8921 
F-statistic: 406.2 on 1 and 48 DF,  p-value: < 2.2e-16

# If necessary, you can also plot different attributes like residuals and leverages produced from the model. 
plot(model)

# Make predictions on the training data (Actual vs Predicted)
predictions <- predict(model, newdata = sharksub_new)

# Visualize Actual vs Predicted
plot(sharksub_new$blotch1, sharksub_new$blotch2, main = "Actual vs Predicted", xlab = "Blotch1 (in seconds)", ylab = "Blotch2 (in seconds)", pch = 16)
points(sharksub_new$blotch1, predictions, col = "red", pch = 19, cex = 0.7)
abline(model, col = "blue", lwd = 2)  # Adding the regression line

# Calculate RMSE (Root Mean Squared Error)
residuals <- sharksub_new$blotch2 - predictions
rmse <- sqrt(mean(residuals^2))
print(paste("RMSE: ", rmse))

[1] "RMSE:  0.37422142482184"

# Calculatinf the R-squared value for model evaluation
r_squared <- summary(model)$r.squared
print(paste("R-squared: ", r_squared))

[1] "R-squared:  0.894318609776957"

Step 2: The use of trained model to predict the blotching time if the sharks are caught for third time.

# Define new data for prediction (must match the structure of the training data)
new_data <- data.frame(blotch1 = sharksub_new$blotch1,
                       blotch2 = sharksub_new$blotch2)  # Example new values

# Make predictions using the trained model on new data
new_predictions <- predict(model, newdata = new_data)

# Combine new data and predictions into a new data frame
predicted_data <- cbind(new_data, predicted_blotch3 = new_predictions)

# View the predictions
print(predicted_data)

    blotch1  blotch2 predicted_blotch3
1  36.07201 37.15417          37.00528
2  33.38396 34.38548          34.30812
3  36.29497 36.46102          37.22899
4  34.98931 36.03899          35.91891
5  35.70572 36.77689          36.63775
6  34.90283 35.94991          35.83214
7  33.11113 34.10446          34.03437
8  32.49322 33.46802          33.41437
9  34.27203 35.30019          35.19920
10 35.54855 36.61501          36.48004
11 35.99021 37.06992          36.92320
12 34.88511 35.93166          35.81436
13 36.83684 37.94195          37.77270
14 34.79932 35.84330          35.72828
15 34.41429 35.44672          35.34194
16 34.74200 34.39458          35.67076
17 34.36868 35.39974          35.29618
18 34.27528 35.30354          35.20246
19 36.11878 37.20234          37.05220
20 33.98649 35.00608          34.91269
21 34.45879 35.49255          35.38659
22 34.93960 35.98779          35.86903
23 35.81950 36.89408          36.75191
24 34.47858 35.51294          35.40645
25 33.26947 34.26755          34.19324
26 32.85958 33.84537          33.78197
27 34.65632 35.69601          35.58479
28 36.48702 37.58163          37.42169
29 34.93594 35.98402          35.86536
30 36.42572 37.51849          37.36018
31 36.50001 37.59501          37.43473
32 36.37596 37.46724          37.31026
33 35.17775 36.23308          36.10799
34 34.57425 35.61148          35.50244
35 36.68703 37.78764          37.62238
36 35.28866 36.34732          36.21927
37 35.92057 35.52544          36.85332
38 34.73333 35.77533          35.66206
39 33.60120 34.60924          34.52610
40 35.70079 36.77181          36.63280
41 35.02983 36.08072          35.95957
42 35.63334 36.70234          36.56512
43 34.24796 35.27540          35.17505
44 35.34228 36.40255          36.27307
45 36.42890 37.52177          37.36337
46 33.53000 34.53590          34.45466
47 37.07165 38.18380          38.00830
48 34.29493 35.32378          35.22218
49 35.33881 35.60995          36.26959
50 34.52245 34.07366          35.45047

plot(sharksub_new$blotch1, new_data$predicted_blotch3, 
     main = "Actual vs Predicted", 
     xlab = "Blotch2 (in seconds)", 
     ylab = "predicted blotch3 (in seconds)", 
     pch = 16)

points(predicted_data, col = "red", pch = 19, cex = 0.7)
abline(new_data, col = "blue", lwd = 2)

# Add predicted values in red (use the correct data frame and variables)
points(sharksub_new$blotch1, predicted_data$predicted_blotch3, 
       col = "red", pch = 19, cex = 0.7)

# Fit a linear model and add the regression line
model <- lm(predicted_blotch3 ~ blotch1, data = predicted_data)
abline(model, col = "blue", lwd = 2)  # Add regression line

For second method, we will use multiple attributes like air, water, weigth to predict the blotching time. As seen earlier, there is relation between the selected attributes.

# Load necessary library
library(readxl)
library(ggplot2)
library(dplyr)
library(vtable)

# Import the dataset (replace with the correct path)
shark_pre <- read_excel("/Users/jal4510/Desktop/MRes/Research Methods/Final Shark/sharks.xlsx", sheet = "Sheet1")

# Check the column names to verify the exact names of 'blotch1' and 'blotch2'
colnames(shark_pre)

 [1] "ID"     "sex"    "blotch" "BPM"    "weight" "length" "air"    "water" 
 [9] "meta"   "depth"

# Fit a linear regression model
model <- lm(blotch ~ BPM + weight + length + air + water + meta, data = shark_pre)

# Print model summary
summary(model)


Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta, 
    data = shark_pre)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3251 -0.9574 -0.0401  0.9160  4.9332 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 38.2559539  2.1018996  18.201   <2e-16 ***
BPM         -0.0032950  0.0045553  -0.723    0.470    
weight       0.0012683  0.0047875   0.265    0.791    
length      -0.0007032  0.0013819  -0.509    0.611    
air         -0.0425615  0.0455232  -0.935    0.350    
water       -0.0474373  0.0386436  -1.228    0.220    
meta        -0.0002694  0.0037073  -0.073    0.942    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.432 on 493 degrees of freedom
Multiple R-squared:  0.005944,  Adjusted R-squared:  -0.006154 
F-statistic: 0.4913 on 6 and 493 DF,  p-value: 0.815

# If necessary, you can also plot different attributes like residuals and leverages produced from the model. 
plot(model)

# Make predictions on the training data (Actual vs Predicted)
predictions <- predict(model, newdata = shark_pre)

# Calculate RMSE (Root Mean Squared Error)
residuals <- shark_pre$blotch - predictions
rmse <- sqrt(mean(residuals^2))
print(paste("RMSE: ", rmse))

[1] "RMSE:  1.42181452249635"

# Calculating the R-squared value for model evaluation
r_squared <- summary(model)$r.squared
print(paste("R-squared: ", r_squared))

[1] "R-squared:  0.00594396009422749"

# Visualize Actual vs Predicted
shark_pre$predicted_blotch <- predict(model)
ggplot(shark_pre, aes(x = blotch, y = predicted_blotch)) +
  geom_point(color = "black") +
  geom_abline(slope = 1, intercept = 0, color = "blue") +
  labs(title = "Observed vs Predicted Blotch Time",
       x = "Observed Blotch Time (in seconds)", y = "Predicted Blotch Time (in seconds)") +
  theme_minimal()