# readxl is used to read the excel file stored in the local computer drive.
# ggplot2 is used to visualise the data set.
library(readxl)
library(ggplot2)Sharks
For Objective 1: Is there a correlation between air temperature and sea surface temperature?
Step 1:Load the necessary packages
Step 2:Import the data from the Excel file
shark <- read_excel("/Users/jal4510/Desktop/MRes/Research Methods/Final Shark/sharks.xlsx", sheet = "Sheet1")Step 3:View the first few rows of the data to confirm it’s loaded correctly
# head is used to view the first 5-10 data points to ensure the data is loaded correctly
# Summery is used to summarise the variables and provide class, mean, and other basic statistical analyses.
head(shark)# A tibble: 6 × 10
ID sex blotch BPM weight length air water meta depth
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH001 Female 37.2 148 74.7 187. 37.7 23.4 64.1 53.2
2 SH002 Female 34.5 158 73.4 189. 35.7 21.4 73.7 49.6
3 SH003 Female 36.3 125 71.8 284. 34.8 20.1 54.4 49.4
4 SH004 Male 35.3 161 105. 171. 36.2 21.6 86.3 50.3
5 SH005 Female 37.4 138 67.1 264. 33.6 21.8 108. 49.0
6 SH006 Male 33.5 126 110. 270. 36.4 20.9 109. 46.8
summary(shark) ID sex blotch BPM
Length:500 Length:500 Min. :30.78 Min. :119.0
Class :character Class :character 1st Qu.:34.16 1st Qu.:129.0
Mode :character Mode :character Median :35.05 Median :142.0
Mean :35.13 Mean :141.8
3rd Qu.:36.05 3rd Qu.:153.2
Max. :40.08 Max. :166.0
weight length air water
Min. : 65.10 Min. :128.3 Min. :33.00 Min. :20.01
1st Qu.: 75.68 1st Qu.:172.0 1st Qu.:34.42 1st Qu.:21.55
Median : 87.82 Median :211.1 Median :35.43 Median :23.11
Mean : 87.94 Mean :211.0 Mean :35.54 Mean :23.02
3rd Qu.:100.40 3rd Qu.:251.8 3rd Qu.:36.71 3rd Qu.:24.37
Max. :110.94 Max. :291.0 Max. :38.00 Max. :25.99
meta depth
Min. : 50.03 Min. :44.64
1st Qu.: 67.39 1st Qu.:48.90
Median : 82.45 Median :50.14
Mean : 82.04 Mean :50.14
3rd Qu.: 95.97 3rd Qu.:51.35
Max. :112.45 Max. :56.83
Step 4:Finding the data type
# sapply is used to identify the data type of the column in a data-set.
sapply (shark, class) ID sex blotch BPM weight length
"character" "character" "numeric" "numeric" "numeric" "numeric"
air water meta depth
"numeric" "numeric" "numeric" "numeric"
Step 5:Finding the structure of the data
# str is to understand the kind of data, including rows and columns provided in the data set.
str (shark)tibble [500 × 10] (S3: tbl_df/tbl/data.frame)
$ ID : chr [1:500] "SH001" "SH002" "SH003" "SH004" ...
$ sex : chr [1:500] "Female" "Female" "Female" "Male" ...
$ blotch: num [1:500] 37.2 34.5 36.3 35.3 37.4 ...
$ BPM : num [1:500] 148 158 125 161 138 126 166 135 132 127 ...
$ weight: num [1:500] 74.7 73.4 71.8 104.6 67.1 ...
$ length: num [1:500] 187 189 284 171 264 ...
$ air : num [1:500] 37.7 35.7 34.8 36.2 33.6 ...
$ water : num [1:500] 23.4 21.4 20.1 21.6 21.8 ...
$ meta : num [1:500] 64.1 73.7 54.4 86.3 108 ...
$ depth : num [1:500] 53.2 49.6 49.4 50.3 49 ...
Step 6:Select columns X and Y for the correlation test
Check for normality.
library(nortest)
# Perform the Anderson-Darling test for normality for different variables
blotch.norm <- ad.test(shark$blotch)
print(blotch.norm)
Anderson-Darling normality test
data: shark$blotch
A = 0.38716, p-value = 0.3872
BPM.norm <- ad.test(shark$BPM)
print(BPM.norm)
Anderson-Darling normality test
data: shark$BPM
A = 6.8186, p-value < 2.2e-16
weight.norm <- ad.test(shark$weight)
print(weight.norm)
Anderson-Darling normality test
data: shark$weight
A = 7.4333, p-value < 2.2e-16
length.norm <- ad.test(shark$length)
print(length.norm)
Anderson-Darling normality test
data: shark$length
A = 5.422, p-value = 2.17e-13
air.norm <- ad.test(shark$air)
print(air.norm)
Anderson-Darling normality test
data: shark$air
A = 4.6957, p-value = 1.203e-11
water.norm <- ad.test(shark$water)
print(water.norm)
Anderson-Darling normality test
data: shark$water
A = 4.8383, p-value = 5.463e-12
meta.norm <- ad.test(shark$meta)
print(meta.norm)
Anderson-Darling normality test
data: shark$meta
A = 4.0909, p-value = 3.462e-10
Perform Pearson correlation test
# As we are only interested in the relation between air and water temperatures, a correlation test is performed.
# Select columns for the correlation test
x <- shark$water
y <- shark$air
# Perform Pearson correlation tests for each pair of variables
# Correlation between air and water
cor_test_air_water <- cor.test(x, y)
# Print the correlation test results
print(cor_test_air_water)
Pearson's product-moment correlation
data: x and y
t = -1.2346, df = 498, p-value = 0.2176
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.14224207 0.03260803
sample estimates:
cor
-0.05524051
Create a scatter plot with a linear regression line
# dplyr helps to manipulate the data set
# vtable is used to automatically output the data while continuing the work on the data.
library(readxl)
library(ggplot2)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(vtable)Loading required package: kableExtra
Attaching package: 'kableExtra'
The following object is masked from 'package:dplyr':
group_rows
# Create a scatter plot with a linear regression line, encoding FW with color
ggplot(shark, aes(x = water, y = air)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "navy") +
labs(
title = paste("Scatter Plot of air vs. water
\n Correlation = ", round(cor(x, y), 2),
"\n P-value = ", round(cor_test_air_water$p.value, 4)),
x = "Air temp (in ºC)",
y = "Water temp (in ºC)"
) +
theme_minimal() +
scale_color_gradient(low = "blue", high = "red")`geom_smooth()` using formula = 'y ~ x'
For Objective 2: Is there an effect on blotching time in respect to multiple catchings?
Step 1:Load the necessary packages
library(readxl)
library(ggplot2)Step 2:Import the data from the Excel file
sharksub <- read_excel("/Users/jal4510/Desktop/MRes/Research Methods/Final Shark/sharksub.xlsx", sheet = "Sheet1")Step 3:View the first few rows of the data to confirm it’s loaded correctly
head(sharksub)# A tibble: 6 × 4
ID sex blotch1 blotch2
<chr> <chr> <dbl> <dbl>
1 SH269 Female 36.1 37.2
2 SH163 Female 33.4 34.4
3 SH008 Female 36.3 36.5
4 SH239 Female 35.0 36.0
5 SH332 Female 35.7 36.8
6 SH328 Female 34.9 35.9
summary(sharksub) ID sex blotch1 blotch2
Length:50 Length:50 Min. :32.49 Min. :33.47
Class :character Class :character 1st Qu.:34.38 1st Qu.:35.31
Mode :character Mode :character Median :34.94 Median :35.94
Mean :35.03 Mean :35.96
3rd Qu.:35.90 3rd Qu.:36.78
Max. :37.07 Max. :38.18
Step 4:Finding the data type
sapply (sharksub, class) ID sex blotch1 blotch2
"character" "character" "numeric" "numeric"
Step 5:Finding the structure of the data
str (sharksub)tibble [50 × 4] (S3: tbl_df/tbl/data.frame)
$ ID : chr [1:50] "SH269" "SH163" "SH008" "SH239" ...
$ sex : chr [1:50] "Female" "Female" "Female" "Female" ...
$ blotch1: num [1:50] 36.1 33.4 36.3 35 35.7 ...
$ blotch2: num [1:50] 37.2 34.4 36.5 36 36.8 ...
Step 6:Perform Pearson correlation test
Check for normality.
library(nortest)
# Perform the Anderson-Darling test for normality for different variables
blotch1.norm <- ad.test(sharksub$blotch1)
print(blotch1.norm)
Anderson-Darling normality test
data: sharksub$blotch1
A = 0.2969, p-value = 0.578
blotch2.norm <- ad.test(sharksub$blotch2)
print(blotch2.norm)
Anderson-Darling normality test
data: sharksub$blotch2
A = 0.29461, p-value = 0.5846
Check for correlation test.
# Select columns for the correlation test
x <- sharksub$blotch1
y <- sharksub$blotch2
# Perform Pearson correlation tests for each pair of variables
# Correlation between blotch1 and blotch2
cor_test_blotch1_blotch2 <- cor.test(x, y)
# Print the correlation test results
print(cor_test_blotch1_blotch2)
Pearson's product-moment correlation
data: x and y
t = 20.154, df = 48, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9057584 0.9689707
sample estimates:
cor
0.9456842
Create a scatter plot with a linear regression line
# Since a correlation is found between the blotaching time, a graph is plotted.
library(readxl)
library(ggplot2)
library(dplyr)
library(vtable)
# Create a scatter plot with a linear regression line, encoding FW with color
ggplot(sharksub, aes(x = blotch1, y = blotch2)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "navy") +
labs(
title = paste("Scatter Plot of blotch1 vs. blotch2
\n Correlation = ", round(cor(x, y), 2),
"\n P-value = ", round(cor_test_blotch1_blotch2$p.value, 4)),
x = "blotch 1 (in seconds) ",
y = "blotch2 (in seconds)"
) +
theme_minimal() +
scale_color_gradient(low = "blue", high = "red")`geom_smooth()` using formula = 'y ~ x'
A t-test is performed to find the difference between the blotching time when captured first time in compared to second time.
# t-test
t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
Paired t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.037176 -0.822301
sample estimates:
mean difference
-0.9297384
# plot for t-test results
boxplot(sharksub$blotch1, sharksub$blotch2, names = c("First Capture", "Second Capture"), ylab = "Time taken to Blotch (in seconds)", main = "Comparison of Blotching Times between multiple catchings")For Objective 3: Is it possible to predict the blotching time?
This objective can be achieved by two different methods. the first method includes the use of just blotching time and using the regression model to predict the blotching time when caught for the third time, while the other method uses another parameter like air, blotcing, water, and meta to predict blotching.
For the first method, a prediction model is used by using just the blotching data.
Step 1:Create a prediction model
# Load necessary library
library(readxl)
library(ggplot2)
library(dplyr)
library(vtable)
# Import the dataset (replace with the correct path)
sharksub_new <- read_excel("/Users/jal4510/Desktop/MRes/Research Methods/Final Shark/sharksub.xlsx", sheet = "Sheet1")
# Check the column names to verify the exact names of 'blotch1' and 'blotch2'
colnames(sharksub_new)[1] "ID" "sex" "blotch1" "blotch2"
# Fit a linear regression model
model <- lm(blotch2 ~ blotch1, data = sharksub_new)
# Print model summary
summary(model)
Call:
lm(formula = blotch2 ~ blotch1, data = sharksub_new)
Residuals:
Min 1Q Median 3Q Max
-1.37681 0.09513 0.11617 0.13911 0.17550
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.81111 1.74483 0.465 0.644
blotch1 1.00339 0.04979 20.154 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.3819 on 48 degrees of freedom
Multiple R-squared: 0.8943, Adjusted R-squared: 0.8921
F-statistic: 406.2 on 1 and 48 DF, p-value: < 2.2e-16
# If necessary, you can also plot different attributes like residuals and leverages produced from the model.
plot(model)# Make predictions on the training data (Actual vs Predicted)
predictions <- predict(model, newdata = sharksub_new)
# Visualize Actual vs Predicted
plot(sharksub_new$blotch1, sharksub_new$blotch2, main = "Actual vs Predicted", xlab = "Blotch1 (in seconds)", ylab = "Blotch2 (in seconds)", pch = 16)
points(sharksub_new$blotch1, predictions, col = "red", pch = 19, cex = 0.7)
abline(model, col = "blue", lwd = 2) # Adding the regression line# Calculate RMSE (Root Mean Squared Error)
residuals <- sharksub_new$blotch2 - predictions
rmse <- sqrt(mean(residuals^2))
print(paste("RMSE: ", rmse))[1] "RMSE: 0.37422142482184"
# Calculatinf the R-squared value for model evaluation
r_squared <- summary(model)$r.squared
print(paste("R-squared: ", r_squared))[1] "R-squared: 0.894318609776957"
Step 2: The use of trained model to predict the blotching time if the sharks are caught for third time.
# Define new data for prediction (must match the structure of the training data)
new_data <- data.frame(blotch1 = sharksub_new$blotch1,
blotch2 = sharksub_new$blotch2) # Example new values
# Make predictions using the trained model on new data
new_predictions <- predict(model, newdata = new_data)
# Combine new data and predictions into a new data frame
predicted_data <- cbind(new_data, predicted_blotch3 = new_predictions)
# View the predictions
print(predicted_data) blotch1 blotch2 predicted_blotch3
1 36.07201 37.15417 37.00528
2 33.38396 34.38548 34.30812
3 36.29497 36.46102 37.22899
4 34.98931 36.03899 35.91891
5 35.70572 36.77689 36.63775
6 34.90283 35.94991 35.83214
7 33.11113 34.10446 34.03437
8 32.49322 33.46802 33.41437
9 34.27203 35.30019 35.19920
10 35.54855 36.61501 36.48004
11 35.99021 37.06992 36.92320
12 34.88511 35.93166 35.81436
13 36.83684 37.94195 37.77270
14 34.79932 35.84330 35.72828
15 34.41429 35.44672 35.34194
16 34.74200 34.39458 35.67076
17 34.36868 35.39974 35.29618
18 34.27528 35.30354 35.20246
19 36.11878 37.20234 37.05220
20 33.98649 35.00608 34.91269
21 34.45879 35.49255 35.38659
22 34.93960 35.98779 35.86903
23 35.81950 36.89408 36.75191
24 34.47858 35.51294 35.40645
25 33.26947 34.26755 34.19324
26 32.85958 33.84537 33.78197
27 34.65632 35.69601 35.58479
28 36.48702 37.58163 37.42169
29 34.93594 35.98402 35.86536
30 36.42572 37.51849 37.36018
31 36.50001 37.59501 37.43473
32 36.37596 37.46724 37.31026
33 35.17775 36.23308 36.10799
34 34.57425 35.61148 35.50244
35 36.68703 37.78764 37.62238
36 35.28866 36.34732 36.21927
37 35.92057 35.52544 36.85332
38 34.73333 35.77533 35.66206
39 33.60120 34.60924 34.52610
40 35.70079 36.77181 36.63280
41 35.02983 36.08072 35.95957
42 35.63334 36.70234 36.56512
43 34.24796 35.27540 35.17505
44 35.34228 36.40255 36.27307
45 36.42890 37.52177 37.36337
46 33.53000 34.53590 34.45466
47 37.07165 38.18380 38.00830
48 34.29493 35.32378 35.22218
49 35.33881 35.60995 36.26959
50 34.52245 34.07366 35.45047
plot(sharksub_new$blotch1, new_data$predicted_blotch3,
main = "Actual vs Predicted",
xlab = "Blotch2 (in seconds)",
ylab = "predicted blotch3 (in seconds)",
pch = 16)
points(predicted_data, col = "red", pch = 19, cex = 0.7)
abline(new_data, col = "blue", lwd = 2)
# Add predicted values in red (use the correct data frame and variables)
points(sharksub_new$blotch1, predicted_data$predicted_blotch3,
col = "red", pch = 19, cex = 0.7)
# Fit a linear model and add the regression line
model <- lm(predicted_blotch3 ~ blotch1, data = predicted_data)
abline(model, col = "blue", lwd = 2) # Add regression lineFor second method, we will use multiple attributes like air, water, weigth to predict the blotching time. As seen earlier, there is relation between the selected attributes.
# Load necessary library
library(readxl)
library(ggplot2)
library(dplyr)
library(vtable)
# Import the dataset (replace with the correct path)
shark_pre <- read_excel("/Users/jal4510/Desktop/MRes/Research Methods/Final Shark/sharks.xlsx", sheet = "Sheet1")
# Check the column names to verify the exact names of 'blotch1' and 'blotch2'
colnames(shark_pre) [1] "ID" "sex" "blotch" "BPM" "weight" "length" "air" "water"
[9] "meta" "depth"
# Fit a linear regression model
model <- lm(blotch ~ BPM + weight + length + air + water + meta, data = shark_pre)
# Print model summary
summary(model)
Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta,
data = shark_pre)
Residuals:
Min 1Q Median 3Q Max
-4.3251 -0.9574 -0.0401 0.9160 4.9332
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 38.2559539 2.1018996 18.201 <2e-16 ***
BPM -0.0032950 0.0045553 -0.723 0.470
weight 0.0012683 0.0047875 0.265 0.791
length -0.0007032 0.0013819 -0.509 0.611
air -0.0425615 0.0455232 -0.935 0.350
water -0.0474373 0.0386436 -1.228 0.220
meta -0.0002694 0.0037073 -0.073 0.942
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.432 on 493 degrees of freedom
Multiple R-squared: 0.005944, Adjusted R-squared: -0.006154
F-statistic: 0.4913 on 6 and 493 DF, p-value: 0.815
# If necessary, you can also plot different attributes like residuals and leverages produced from the model.
plot(model)# Make predictions on the training data (Actual vs Predicted)
predictions <- predict(model, newdata = shark_pre)
# Calculate RMSE (Root Mean Squared Error)
residuals <- shark_pre$blotch - predictions
rmse <- sqrt(mean(residuals^2))
print(paste("RMSE: ", rmse))[1] "RMSE: 1.42181452249635"
# Calculating the R-squared value for model evaluation
r_squared <- summary(model)$r.squared
print(paste("R-squared: ", r_squared))[1] "R-squared: 0.00594396009422749"
# Visualize Actual vs Predicted
shark_pre$predicted_blotch <- predict(model)
ggplot(shark_pre, aes(x = blotch, y = predicted_blotch)) +
geom_point(color = "black") +
geom_abline(slope = 1, intercept = 0, color = "blue") +
labs(title = "Observed vs Predicted Blotch Time",
x = "Observed Blotch Time (in seconds)", y = "Predicted Blotch Time (in seconds)") +
theme_minimal()