── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)library(openxlsx)
Warning: package 'openxlsx' was built under R version 4.4.2
library(readxl)
Warning: package 'readxl' was built under R version 4.4.2
# getting the data into R studio sharks <-read_excel("C:/Users/rosie/OneDrive/Documents/sharks.xlsx")View(sharks)# Running a random mean to check data has been read in correctlymean(sharks$length)
[1] 211.0442
# checking for outliers in blotching time# Identify outliers based on IQRQ1 <-quantile(sharks$blotch, 0.25) # First quartile (25th percentile)Q3 <-quantile(sharks$blotch, 0.75) # Third quartile (75th percentile)IQR_value <- Q3 - Q1 # Interquartile range (IQR)# Outlier thresholdlower_bound <- Q1 -1.5* IQR_valueupper_bound <- Q3 +1.5* IQR_value# Identify outliersoutliers <-which(sharks$blotch < lower_bound | sharks$blotch > upper_bound)# Print the outlierscat("Outliers in blotching time:\n")
Outliers in blotching time:
print(sharks$blotch[outliers])
[1] 39.83638 39.13972 30.77585 40.08356
Air v water correlation:
Plotting a scatter graph
# Step 1: Create the scatter plotplot(sharks$air, sharks$water, main ="Air vs Water Temperature",xlab ="Air Temperature (°C)", ylab ="Water Temperature (°C)", pch =16, # Points type, solid circlecol ="grey") # Color of the points# Step 2: Fit a linear model (line of best fit)model <-lm(sharks$water ~ sharks$air)# Step 3: Add the line of best fit to the scatter plotabline(model, col ="black", lwd =2) # Black line with line width of 2# Add grid linesgrid(col ="gray", lty ="dotted", lwd =1)# Step 4: Calculate residuals and Cook's distanceresiduals <-resid(model) # Residualscooks_dist <-cooks.distance(model) # Cook's distance# Step 5: Identify potential outliersoutliers <-which(cooks_dist > (4/length(cooks_dist))) # Cook's distance threshold# Step 6: Highlight outliers on the plotpoints(sharks$air[outliers], sharks$water[outliers], pch =16, col ="red")
# Red points for outliers# Step 7: Print the identified outlierscat("Potential outliers based on Cook's distance:\n")
Pearson’s correlation coefficient for air and water
# Identify and remove outliers using the IQR methodremove_outliers <-function(x) { Q1 <-quantile(x, 0.25, na.rm =TRUE) # First quartile Q3 <-quantile(x, 0.75, na.rm =TRUE) # Third quartile IQR <- Q3 - Q1 # Interquartile range lower_bound <- Q1 -1.5* IQR # Lower bound upper_bound <- Q3 +1.5* IQR # Upper bound x[x >= lower_bound & x <= upper_bound] # Return non-outliers}# Clean the data by removing outlierssharks_clean <- sharks %>% dplyr::filter( air %in%remove_outliers(air) & water %in%remove_outliers(water) )# Perform correlation analysis using complete observationscorrelation <-cor(sharks_clean$air, sharks_clean$water, method ="pearson", use ="complete.obs")# Print correlation valueprint(correlation)
[1] -0.05524051
# Perform the hypothesis test for Pearson correlationcor_test <-cor.test(sharks_clean$air, sharks_clean$water, method ="pearson")# Print the result of the testprint(cor_test)
Pearson's product-moment correlation
data: sharks_clean$air and sharks_clean$water
t = -1.2346, df = 498, p-value = 0.2176
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.14224207 0.03260803
sample estimates:
cor
-0.05524051
# performing a t-testt_test_result <-t.test(sharksub$blotch1, sharksub$blotch2, paired =TRUE)# View resultsprint(t_test_result)
Paired t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.037176 -0.822301
sample estimates:
mean difference
-0.9297384
Visually analysing difference in blotch 1 and blotch 2
# Boxplot for time of blotching in blotch 11
[1] 1
boxplot(sharksub$blotch1,main ="Blotch 1",ylab ="Blotch Time (seconds)",col ="lightblue",border ="darkblue")
# Boxplot of blotching time in blotch 2boxplot(sharksub$blotch2,main ="Blotch 2",ylab ="Blotch Time (seconds)",col ="lightblue",border ="darkblue")
# Create a data frame with blotch times for groupingblotch_data <-data.frame(Time =c(sharksub$blotch1, sharksub$blotch2),Capture =rep(c("First Capture", "Second Capture"), each =length(sharksub$blotch1)))# Create the box plotboxplot(Time ~ Capture, data = blotch_data,main ="Blotching Times - Blotch 1 and 2",ylab ="Blotch Time (Seconds)",col =c("blue", "green"))
Linear regression models
options(repos =c(CRAN ="https://cloud.r-project.org"))# Depth vs blotching timemodel <-lm(blotch ~ depth, data = sharks)summary(model)
Call:
lm(formula = blotch ~ depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.81869 -0.65427 -0.01035 0.58825 2.83116
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.82178 1.11207 8.832 <2e-16 ***
depth 0.50467 0.02216 22.772 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared: 0.5101, Adjusted R-squared: 0.5091
F-statistic: 518.6 on 1 and 498 DF, p-value: < 2.2e-16
# visualising this# Scatter plotplot(sharks$depth, sharks$blotch, main ="Depth vs Blotching Time", xlab ="Depth (m)", ylab ="Blotching Time (s)", pch =16, col ="blue")# Add the regression lineabline(model, col ="red", lwd =2)# Generate predictions with a confidence intervalpredictions <-predict(model, interval ="confidence", level =0.95)# Add the buffer (confidence interval) around the linelines(sharks$depth, predictions[,2], col ="green", lty =2) # Lower boundlines(sharks$depth, predictions[,3], col ="green", lty =2) # Upper bound
# Diagnostic plotspar(mfrow =c(2, 2)) # Arrange plots in a 2x2 gridplot(model)
Weight vs blotching time
model <-lm(blotch ~ weight, data = sharks)summary(model)
Call:
lm(formula = blotch ~ weight, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-4.3702 -0.9600 -0.0687 0.9328 4.9357
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.504e+01 4.227e-01 82.887 <2e-16 ***
weight 9.795e-04 4.752e-03 0.206 0.837
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.429 on 498 degrees of freedom
Multiple R-squared: 8.531e-05, Adjusted R-squared: -0.001923
F-statistic: 0.04249 on 1 and 498 DF, p-value: 0.8368
# visualising this# Scatter plotplot(sharks$weight, sharks$blotch, main ="Weight vs Blotching Time", xlab =" Weight (Kg)", ylab ="Blotching time (S)", pch =16, col ="blue")# Add the regression lineabline(model, col ="red", lwd =2)# Generate predictions with a confidence intervalpredictions <-predict(model, interval ="confidence", level =0.95)# Add the buffer (confidence interval) around the linelines(sharks$weight, predictions[,2], col ="green", lty =2) # Lower boundlines(sharks$weight, predictions[,3], col ="green", lty =2) # Upper bound
Length and blotting time
model <-lm(blotch ~ length, data = sharks)summary(model)
Call:
lm(formula = blotch ~ length, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-4.3280 -0.9663 -0.0678 0.9117 4.9545
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 35.2312817 0.2965354 118.810 <2e-16 ***
length -0.0005017 0.0013721 -0.366 0.715
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.429 on 498 degrees of freedom
Multiple R-squared: 0.0002684, Adjusted R-squared: -0.001739
F-statistic: 0.1337 on 1 and 498 DF, p-value: 0.7148
# visualising this# Scatter plotplot(sharks$length, sharks$blotch, main ="Length vs Blotching Time", xlab =" Length (cm)", ylab ="Blotching time (s)", pch =16, col ="blue")# Add the regression lineabline(model, col ="red", lwd =2)# Generate predictions with a confidence intervalpredictions <-predict(model, interval ="confidence", level =0.95)# Add the buffer (confidence interval) around the linelines(sharks$length, predictions[,2], col ="green", lty =2) # Lower boundlines(sharks$length, predictions[,3], col ="green", lty =2) # Upper bound
BPM and blotching time
model <-lm(blotch ~ BPM, data = sharks)summary(model)
Call:
lm(formula = blotch ~ BPM, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-4.3282 -0.9492 -0.0876 0.8940 4.9559
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 35.544599 0.644079 55.187 <2e-16 ***
BPM -0.002957 0.004521 -0.654 0.513
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.428 on 498 degrees of freedom
Multiple R-squared: 0.0008583, Adjusted R-squared: -0.001148
F-statistic: 0.4278 on 1 and 498 DF, p-value: 0.5134
# visualising this# Scatter plotplot(sharks$length, sharks$blotch, main ="BPM vs Blotching Time", xlab ="BPM", ylab ="Blotching time(s)", pch =16, col ="blue")# Add the regression lineabline(model, col ="red", lwd =2)# Generate predictions with a confidence intervalpredictions <-predict(model, interval ="confidence", level =0.95)# Add the buffer (confidence interval) around the linelines(sharks$BPM, predictions[,2], col ="green", lty =2) # Lower boundlines(sharks$BPM, predictions[,3], col ="green", lty =2) # Upper bound
Meta and blotching time
model <-lm(blotch ~ meta , data = sharks)summary(model)
Call:
lm(formula = blotch ~ meta, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-4.3635 -0.9639 -0.0793 0.9275 4.9411
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 35.1892930 0.3075967 114.401 <2e-16 ***
meta -0.0007787 0.0036674 -0.212 0.832
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.429 on 498 degrees of freedom
Multiple R-squared: 9.051e-05, Adjusted R-squared: -0.001917
F-statistic: 0.04508 on 1 and 498 DF, p-value: 0.8319
# visualising this# Scatter plotplot(sharks$length, sharks$blotch, main ="Meta vs Blotching Time", xlab ="Meta (mcg/dl)", ylab ="Blotching time(s)", pch =16, col ="blue")# Add the regression lineabline(model, col ="red", lwd =2)# Generate predictions with a confidence intervalpredictions <-predict(model, interval ="confidence", level =0.95)# Add the buffer (confidence interval) around the linelines(sharks$meta, predictions[,2], col ="green", lty =2) # Lower boundlines(sharks$meta, predictions[,3], col ="green", lty =2) # Upper bound
Water and blotching time
model <-lm(blotch ~ water , data = sharks)summary(model)
Call:
lm(formula = blotch ~ water, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-4.3051 -0.9629 -0.0495 0.9471 5.0216
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 36.14125 0.88240 40.958 <2e-16 ***
water -0.04413 0.03823 -1.154 0.249
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.427 on 498 degrees of freedom
Multiple R-squared: 0.002668, Adjusted R-squared: 0.0006654
F-statistic: 1.332 on 1 and 498 DF, p-value: 0.249
# visualising this # Scatter plotplot(sharks$water, sharks$blotch, main ="Water temperature vs Blotching Time", xlab ="Water temperature(C)", ylab ="Blotching time(s)", pch =16, col ="blue")# Add the regression lineabline(model, col ="red", lwd =2)# Generate predictions with a confidence intervalpredictions <-predict(model, interval ="confidence", level =0.95)# Add the buffer (confidence interval) around the linelines(sharks$water, predictions[,2], col ="green", lty =2) # Lower boundlines(sharks$water, predictions[,3], col ="green", lty =2) # Upper bound
Air and blotching time
model <-lm(blotch ~ air , data = sharks)summary(model)
Call:
lm(formula = blotch ~ air, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-4.3900 -0.9459 -0.0506 0.9423 4.9134
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 36.46184 1.59219 22.90 <2e-16 ***
air -0.03761 0.04477 -0.84 0.401
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.428 on 498 degrees of freedom
Multiple R-squared: 0.001415, Adjusted R-squared: -0.0005902
F-statistic: 0.7057 on 1 and 498 DF, p-value: 0.4013
# visualising this# Scatter plotplot(sharks$air, sharks$blotch, main ="Air temperature vs Blotching Time", xlab ="Air temperature(C)", ylab ="Blotching time(s)", pch =16, col ="blue")# Add the regression lineabline(model, col ="red", lwd =2)# Generate predictions with a confidence intervalpredictions <-predict(model, interval ="confidence", level =0.95)# Add the buffer (confidence interval) around the linelines(sharks$air, predictions[,2], col ="green", lty =2) # Lower boundlines(sharks$air, predictions[,3], col ="green", lty =2) # Upper bound
Sex and blotching time
# checking values in sex columntable(sharks$sex)
Female Male
236 264
# convert sex to a factorsharks$sex <-as.factor(sharks$sex)# fit the linear model model_sex <-lm(blotch ~ sex, data = sharks)summary(model_sex)
Call:
lm(formula = blotch ~ sex, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-4.1471 -0.9653 -0.0222 0.9889 4.7772
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 34.92294 0.09217 378.885 < 2e-16 ***
sexMale 0.38347 0.12685 3.023 0.00263 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.416 on 498 degrees of freedom
Multiple R-squared: 0.01802, Adjusted R-squared: 0.01605
F-statistic: 9.139 on 1 and 498 DF, p-value: 0.002632
# Load the necessary librarylibrary(ggplot2)# Create a data frame with the regression resultssex_effect <-data.frame(Sex =c("Female", "Male"),Blotching =c(34.92, 34.92+0.38347) # Intercept and intercept + male coefficient)# Plot the bar chartggplot(sex_effect, aes(x = Sex, y = Blotching, fill = Sex)) +geom_bar(stat ="identity", color ="black", width =0.6) +geom_text(aes(label =round(Blotching, 2)), vjust =-0.5, size =5) +labs(x ="Sex",y ="Mean Blotching Level" ) +scale_fill_manual(values =c("Female"="pink", "Male"="blue")) +theme_minimal() +theme(legend.position ="none",plot.title =element_text(hjust =0.5, size =16),axis.title =element_text(size =14),axis.text =element_text(size =12) ) +# Adjust y-axis limits and add paddingylim(0, max(sex_effect$Blotching) +1) +coord_cartesian(clip ="off") # Ensure elements outside the plotting area are shown