Question 1: Is there a correlation between the variables air and water?
# Set CRAN mirror for document rendering
options(repos = c(CRAN = "https://cran.rstudio.com/"))

# Install the tidyverse only when it is missing, so that rendering the
# document does not re-download the package on every run.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(readxl)

# Directory that holds both workbooks; kept in one place so the two reads
# below cannot drift apart.
data_dir <- "~/Library/CloudStorage/OneDrive-NottinghamTrentUniversity/Data Analysis/Summative"

# Read the datasets
sharks <- read_excel(file.path(data_dir, "sharks.xlsx"))
sharksub <- read_excel(file.path(data_dir, "sharksub.xlsx"))

# Summarise every column of the main dataset
sharks %>%
  summary()
ID sex blotch BPM
Length:500 Length:500 Min. :30.78 Min. :119.0
Class :character Class :character 1st Qu.:34.16 1st Qu.:129.0
Mode :character Mode :character Median :35.05 Median :142.0
Mean :35.13 Mean :141.8
3rd Qu.:36.05 3rd Qu.:153.2
Max. :40.08 Max. :166.0
weight length air water
Min. : 65.10 Min. :128.3 Min. :33.00 Min. :20.01
1st Qu.: 75.68 1st Qu.:172.0 1st Qu.:34.42 1st Qu.:21.55
Median : 87.82 Median :211.1 Median :35.43 Median :23.11
Mean : 87.94 Mean :211.0 Mean :35.54 Mean :23.02
3rd Qu.:100.40 3rd Qu.:251.8 3rd Qu.:36.71 3rd Qu.:24.37
Max. :110.94 Max. :291.0 Max. :38.00 Max. :25.99
meta depth
Min. : 50.03 Min. :44.64
1st Qu.: 67.39 1st Qu.:48.90
Median : 82.45 Median :50.14
Mean : 82.04 Mean :50.14
3rd Qu.: 95.97 3rd Qu.:51.35
Max. :112.45 Max. :56.83
# Preview the first rows of the dataset
head(sharks)

# Pearson correlation test between the air and water columns
cor.test(sharks$air, sharks$water)
Pearson's product-moment correlation
data: sharks$air and sharks$water
t = -1.2346, df = 498, p-value = 0.2176
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.14224207 0.03260803
sample estimates:
cor
-0.05524051
# Scatter plot of water against air with a dashed linear trend line
# (no confidence band).
ggplot(sharks, aes(x = air, y = water)) +
  geom_point(color = "blue", size = 3, shape = 16, alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "dashed") +
  labs(
    title = "Relationship Between Air and Water Temperatures",
    x = "Air Temperature (°C)",
    y = "Water Temperature (°C)"
  )
`geom_smooth()` using formula = 'y ~ x'
Question 2: Does multiple capture have an effect on blotching time?
# Welch's two-sample t-test comparing blotch1 with blotch2.
# NOTE(review): blotch1 and blotch2 sit in the same rows of sharksub, so they
# look like repeated measurements on the same animals — confirm whether a
# paired test (paired = TRUE) is the appropriate analysis here.
t.test(sharksub$blotch1, sharksub$blotch2)
Welch Two Sample t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -4.1143, df = 97.658, p-value = 8.113e-05
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-1.3782038 -0.4812731
sample estimates:
mean of x mean of y
35.03042 35.96016
# Load necessary library
library(tidyr) # For reshaping the data into long format

# Convert data from wide to long format for 'ggplot2' compatibility.
# pivot_longer() replaces the superseded gather(); the long table is built
# once and shared by both plots below.
sharksub_long <- sharksub %>%
  pivot_longer(
    cols = c(blotch1, blotch2),
    names_to = "blotch_type",
    values_to = "time"
  )

# Boxplot showing the differences between blotch1 and blotch2
ggplot(sharksub_long, aes(x = blotch_type, y = time, fill = blotch_type)) +
  geom_boxplot() +
  labs(
    title = "Comparison of Blotching Times: Blotch1 vs Blotch2",
    x = "Blotch Time",
    y = "Time (seconds)"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("lightblue", "lightgreen"))

# Density plot (alternative way to visualise the same data)
ggplot(sharksub_long, aes(x = time, fill = blotch_type, color = blotch_type)) +
  # alpha sets transparency; linewidth sets the outline thickness
  # (size is deprecated for lines since ggplot2 3.4.0)
  geom_density(alpha = 0.4, linewidth = 1.2) +
  labs(
    title = "Density Plot of Blotch Times",
    x = "Time (seconds)",
    y = "Density"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    legend.position = "top" # Position the legend at the top
  ) +
  # Fill and outline palettes
  scale_fill_manual(values = c("lightblue", "lightgreen")) +
  scale_color_manual(values = c("darkblue", "darkgreen"))
Question 3: Is it possible to predict blotching time?
# Load necessary libraries
library(readxl)
library(ggplot2)

# Load the data from an Excel file. The path was previously assigned but
# never read; loading here makes this section runnable on its own.
file_path <- "~/Library/CloudStorage/OneDrive-NottinghamTrentUniversity/Data Analysis/Summative/sharks.xlsx"
sharks <- read_excel(file_path)

# Compute the Pearson correlation coefficient between 'blotch' and 'depth'
cor.test(sharks$blotch, sharks$depth)
Pearson's product-moment correlation
data: sharks$blotch and sharks$depth
t = 22.772, df = 498, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6683963 0.7546509
sample estimates:
cor
0.7142247
# Simple linear regression of depth on blotch.
# NOTE(review): this section asks whether blotching time can be predicted,
# yet blotch is the predictor here and depth the response — confirm whether
# blotch ~ depth was intended.
model <- lm(depth ~ blotch, data = sharks)

# Print coefficients, residual summary and fit statistics
summary(model)
Call:
lm(formula = depth ~ blotch, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-4.3570 -0.9453 -0.0124 0.9863 4.7997
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.63435 1.56040 9.379 <2e-16 ***
blotch 1.01079 0.04439 22.772 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.415 on 498 degrees of freedom
Multiple R-squared: 0.5101, Adjusted R-squared: 0.5091
F-statistic: 518.6 on 1 and 498 DF, p-value: < 2.2e-16
# Scatter plot of depth against blotch with a fitted regression line
ggplot(sharks, aes(x = blotch, y = depth)) +
  # Marker size is set with `size`; geom_point() has no `linewidth` parameter
  geom_point(color = "#1f78b4", size = 3, alpha = 0.7) +
  # Regression line with confidence interval
  geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) +
  labs(
    title = "Relationship between blotching and depth",
    x = "Blotch (seconds)",
    y = "Depth (metres)"
  ) +
  theme_minimal() + # Minimal theme for cleaner look
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    # element_line() takes `linewidth`; `size` is deprecated since ggplot2 3.4.0
    panel.grid.major = element_line(
      color = "grey", linetype = "dashed", linewidth = 0.5
    ),
    panel.grid.minor = element_blank(),
    legend.position = "none" # Remove legend (not needed for this plot)
  )
`geom_smooth()` using formula = 'y ~ x'
# Pearson correlation test between the 'blotch' and 'weight' columns
cor.test(sharks$blotch, sharks$weight)
Pearson's product-moment correlation
data: sharks$blotch and sharks$weight
t = 0.20613, df = 498, p-value = 0.8368
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.07851766 0.09684867
sample estimates:
cor
0.009236525
# Simple linear regression of weight on blotch.
# NOTE(review): blotch is the predictor here, not the response — confirm
# whether blotch ~ weight was intended for predicting blotching time.
model <- lm(weight ~ blotch, data = sharks)

# Print coefficients, residual summary and fit statistics
summary(model)
Call:
lm(formula = weight ~ blotch, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-22.9687 -12.2943 -0.1632 12.3893 22.9622
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 84.8823 14.8545 5.714 1.9e-08 ***
blotch 0.0871 0.4225 0.206 0.837
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 13.47 on 498 degrees of freedom
Multiple R-squared: 8.531e-05, Adjusted R-squared: -0.001923
F-statistic: 0.04249 on 1 and 498 DF, p-value: 0.8368
# Scatter plot of weight against blotch with a fitted regression line
ggplot(sharks, aes(x = blotch, y = weight)) +
  # Marker size is set with `size`; geom_point() has no `linewidth` parameter
  geom_point(color = "#1f78b4", size = 3, alpha = 0.7) +
  # Regression line with confidence interval
  geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) +
  labs(
    title = "Relationship between blotching and weight",
    x = "Blotch (seconds)",
    y = "Weight (kg)"
  ) +
  theme_minimal() + # Minimal theme for cleaner look
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    # element_line() takes `linewidth`; `size` is deprecated since ggplot2 3.4.0
    panel.grid.major = element_line(
      color = "grey", linetype = "dashed", linewidth = 0.5
    ),
    panel.grid.minor = element_blank(),
    legend.position = "none" # Remove legend (not needed for this plot)
  )

# Compute the Pearson correlation coefficient between 'blotch' and 'air'
cor.test(sharks$blotch, sharks$air)
Pearson's product-moment correlation
data: sharks$blotch and sharks$air
t = -0.84005, df = 498, p-value = 0.4013
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.12489535 0.05023956
sample estimates:
cor
-0.03761675
# Simple linear regression of air on blotch.
# NOTE(review): blotch is the predictor here, not the response — confirm
# whether blotch ~ air was intended for predicting blotching time.
model <- lm(air ~ blotch, data = sharks)

# Print coefficients, residual summary and fit statistics
summary(model)
Call:
lm(formula = air ~ blotch, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.54412 -1.10096 -0.09279 1.17905 2.51344
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 36.85684 1.57452 23.41 <2e-16 ***
blotch -0.03762 0.04479 -0.84 0.401
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.428 on 498 degrees of freedom
Multiple R-squared: 0.001415, Adjusted R-squared: -0.0005902
F-statistic: 0.7057 on 1 and 498 DF, p-value: 0.4013
# Scatter plot of air against blotch with a fitted regression line
ggplot(sharks, aes(x = blotch, y = air)) +
  # Marker size is set with `size`; geom_point() has no `linewidth` parameter
  geom_point(color = "#1f78b4", size = 3, alpha = 0.7) +
  # Regression line with confidence interval
  geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) +
  labs(
    title = "Relationship between blotching and air",
    x = "Blotch (seconds)",
    y = "Air (°C)"
  ) +
  theme_minimal() + # Minimal theme for cleaner look
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    # element_line() takes `linewidth`; `size` is deprecated since ggplot2 3.4.0
    panel.grid.major = element_line(
      color = "grey", linetype = "dashed", linewidth = 0.5
    ),
    panel.grid.minor = element_blank(),
    legend.position = "none" # Remove legend (not needed for this plot)
  )

# Compute the Pearson correlation coefficient between 'blotch' and the
# 'meta' column.
# NOTE(review): the original comment called this variable 'cortisol' —
# confirm what 'meta' measures.
cor.test(sharks$blotch, sharks$meta)
Pearson's product-moment correlation
data: sharks$blotch and sharks$meta
t = -0.21232, df = 498, p-value = 0.8319
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.09712341 0.07824201
sample estimates:
cor
-0.009513855
# Simple linear regression of meta on blotch.
# NOTE(review): blotch is the predictor here, not the response — confirm
# whether blotch ~ meta was intended for predicting blotching time.
model <- lm(meta ~ blotch, data = sharks)

# Print coefficients, residual summary and fit statistics
summary(model)
Call:
lm(formula = meta ~ blotch, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-31.856 -14.556 0.426 13.857 30.687
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 86.1257 19.2463 4.475 9.48e-06 ***
blotch -0.1162 0.5475 -0.212 0.832
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 17.46 on 498 degrees of freedom
Multiple R-squared: 9.051e-05, Adjusted R-squared: -0.001917
F-statistic: 0.04508 on 1 and 498 DF, p-value: 0.8319
# Scatter plot of meta against blotch with a fitted regression line.
# NOTE(review): the title says "cortisol" while the axis label says
# "Corticosterone" — confirm which name is correct for the 'meta' column.
ggplot(sharks, aes(x = blotch, y = meta)) +
  # Marker size is set with `size`; geom_point() has no `linewidth` parameter
  geom_point(color = "#1f78b4", size = 3, alpha = 0.7) +
  # Regression line with confidence interval
  geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) +
  labs(
    title = "Relationship between blotching and cortisol",
    x = "Blotch (seconds)",
    y = "Corticosterone (mcg/dl)"
  ) +
  theme_minimal() + # Minimal theme for cleaner look
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    # element_line() takes `linewidth`; `size` is deprecated since ggplot2 3.4.0
    panel.grid.major = element_line(
      color = "grey", linetype = "dashed", linewidth = 0.5
    ),
    panel.grid.minor = element_blank(),
    legend.position = "none" # Remove legend (not needed for this plot)
  )

# Compute the Pearson correlation coefficient between 'blotch' and 'length'
cor.test(sharks$blotch, sharks$length)
Pearson's product-moment correlation
data: sharks$blotch and sharks$length
t = -0.36562, df = 498, p-value = 0.7148
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.1039230 0.0714115
sample estimates:
cor
-0.01638167
# Simple linear regression of length on blotch.
# NOTE(review): blotch is the predictor here, not the response — confirm
# whether blotch ~ length was intended for predicting blotching time.
model <- lm(length ~ blotch, data = sharks)

# Print coefficients, residual summary and fit statistics
summary(model)
Call:
lm(formula = length ~ blotch, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-83.303 -38.860 -0.192 40.659 80.596
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 229.8342 51.4343 4.468 9.76e-06 ***
blotch -0.5349 1.4631 -0.366 0.715
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 46.65 on 498 degrees of freedom
Multiple R-squared: 0.0002684, Adjusted R-squared: -0.001739
F-statistic: 0.1337 on 1 and 498 DF, p-value: 0.7148
# Scatter plot of length against blotch with a fitted regression line.
# NOTE(review): the axis label gives the unit as metres, but the summary of
# 'length' ranges from roughly 128 to 291 — confirm the unit.
ggplot(sharks, aes(x = blotch, y = length)) +
  # Marker size is set with `size`; geom_point() has no `linewidth` parameter
  geom_point(color = "#1f78b4", size = 3, alpha = 0.7) +
  # Regression line with confidence interval
  geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) +
  labs(
    title = "Relationship between blotching and length",
    x = "Blotch (seconds)",
    y = "length (m)"
  ) +
  theme_minimal() + # Minimal theme for neater look
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    # element_line() takes `linewidth`; `size` is deprecated since ggplot2 3.4.0
    panel.grid.major = element_line(
      color = "grey", linetype = "dashed", linewidth = 0.5
    ),
    panel.grid.minor = element_blank(),
    legend.position = "none"
  )

# Compute the Pearson correlation coefficient between 'blotch' and 'water'
cor.test(sharks$blotch, sharks$water)
Pearson's product-moment correlation
data: sharks$blotch and sharks$water
t = -1.1542, df = 498, p-value = 0.249
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.13871605 0.03620077
sample estimates:
cor
-0.05165379
# Simple linear regression of water on blotch.
# NOTE(review): blotch is the predictor here, not the response — confirm
# whether blotch ~ water was intended for predicting blotching time.
model <- lm(water ~ blotch, data = sharks)

# Print coefficients, residual summary and fit statistics
summary(model)
Call:
lm(formula = water ~ blotch, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-3.07227 -1.43903 0.07593 1.34741 3.04345
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 25.14433 1.84152 13.654 <2e-16 ***
blotch -0.06046 0.05238 -1.154 0.249
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.67 on 498 degrees of freedom
Multiple R-squared: 0.002668, Adjusted R-squared: 0.0006654
F-statistic: 1.332 on 1 and 498 DF, p-value: 0.249
# Scatter plot of water against blotch with a fitted regression line
ggplot(sharks, aes(x = blotch, y = water)) +
  # Marker size is set with `size`; geom_point() has no `linewidth` parameter
  geom_point(color = "#1f78b4", size = 3, alpha = 0.7) +
  # Regression line with confidence interval
  geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) +
  labs(
    title = "Relationship between blotching and water",
    x = "Blotch (seconds)",
    y = "Water (°C)"
  ) +
  theme_minimal() + # Minimal theme for neater look
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    # element_line() takes `linewidth`; `size` is deprecated since ggplot2 3.4.0
    panel.grid.major = element_line(
      color = "grey", linetype = "dashed", linewidth = 0.5
    ),
    panel.grid.minor = element_blank(),
    legend.position = "none"
  )

# Compute the Pearson correlation coefficient between 'blotch' and 'BPM'
cor.test(sharks$blotch, sharks$BPM)
Pearson's product-moment correlation
data: sharks$blotch and sharks$BPM
t = -0.65406, df = 498, p-value = 0.5134
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.11668743 0.05854438
sample estimates:
cor
-0.02929661
# Simple linear regression of BPM on blotch.
# NOTE(review): blotch is the predictor here, not the response — confirm
# whether blotch ~ BPM was intended for predicting blotching time.
model <- lm(BPM ~ blotch, data = sharks)

# Print coefficients, residual summary and fit statistics
summary(model)
Call:
lm(formula = BPM ~ blotch, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-23.029 -13.030 0.441 11.674 24.796
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 151.9574 15.6007 9.740 <2e-16 ***
blotch -0.2903 0.4438 -0.654 0.513
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 14.15 on 498 degrees of freedom
Multiple R-squared: 0.0008583, Adjusted R-squared: -0.001148
F-statistic: 0.4278 on 1 and 498 DF, p-value: 0.5134
# Scatter plot of BPM against blotch with a fitted regression line
ggplot(sharks, aes(x = blotch, y = BPM)) +
  # Marker size is set with `size`; geom_point() has no `linewidth` parameter
  geom_point(color = "#1f78b4", size = 3, alpha = 0.7) +
  # Regression line with confidence interval
  geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) +
  labs(
    title = "Relationship between blotching and heart rate",
    x = "Blotch (seconds)",
    y = "Heart rate (BPM)"
  ) +
  theme_minimal() + # Minimal theme for cleaner look
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    # element_line() takes `linewidth`; `size` is deprecated since ggplot2 3.4.0
    panel.grid.major = element_line(
      color = "grey", linetype = "dashed", linewidth = 0.5
    ),
    panel.grid.minor = element_blank(),
    legend.position = "none" # Remove legend (not needed for this plot)
  )

# Perform an independent-samples (Welch) t-test of blotch by sex; sex is
# categorical. The stray `sharks = df` argument was dropped: it is not a
# t.test() parameter and only passed the stats::df function through `...`.
t_test_result <- t.test(sharks$blotch ~ sharks$sex)
print(t_test_result)
Welch Two Sample t-test
data: sharks$blotch by sharks$sex
t = -3.0282, df = 494.67, p-value = 0.002589
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
-0.6322714 -0.1346620
sample estimates:
mean in group Female mean in group Male
34.92294 35.30641
# Box plot of blotch by sex, with boxes filled per sex
ggplot(sharks, aes(x = sex, y = blotch)) +
  geom_boxplot(aes(fill = sex), alpha = 0.6) +
  labs(
    title = "Box Plot of Blotch Time by Sex",
    x = "Sex",
    y = "Blotch Time (seconds)"
  ) +
  theme_minimal(base_size = 14) + # Change the base font size
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12)
  )