# Shapiro-Wilk normality tests for the air and water variables
shapiro.test(sharks$air)
Shapiro-Wilk normality test
data: sharks$air
W = 0.95885, p-value = 1.338e-10
# Shapiro-Wilk normality test for the water variable
shapiro.test(sharks$water)
Shapiro-Wilk normality test
data: sharks$water
W = 0.96035, p-value = 2.371e-10
# Spearman rank correlation between air and water. A rank-based method is
# used here; both Shapiro-Wilk tests on these variables gave p < 0.001.
cor.test(sharks$air, sharks$water, method = "spearman")
Spearman's rank correlation rho
data: sharks$air and sharks$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.05637344
# Descriptive summary (min, quartiles, median, mean, max) of air
summary(sharks$air)
Min. 1st Qu. Median Mean 3rd Qu. Max.
33.00 34.42 35.43 35.54 36.71 38.00
# Descriptive summary (min, quartiles, median, mean, max) of water
summary(sharks[["water"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
20.01 21.55 23.11 23.02 24.37 25.99
# Sample size of air: length() returns the number of elements in the vector
length(sharks$air)
[1] 500
# Sample size of water: length() returns the number of elements in the vector
length(sharks$water)
[1] 500
Graphing Correlation between Air and Water
# Scatter plot of the 'sharks' data with water on the x axis and air on the
# y axis, overlaid with a linear fit
ggplot(sharks, aes(x = water, y = air)) +
  # Slightly transparent points (alpha) so overlapping observations show
  geom_point(alpha = 0.7) +
  # Linear-model (lm) fit line in blue; se = TRUE draws the confidence band
  # around the fitted line, filled in light blue
  geom_smooth(
    method = "lm",
    color = "blue",
    se = TRUE,
    fill = "lightblue"
  ) +
  # Axis labels
  labs(x = "Water Temperature (ºC)", y = "Air Temperature (ºC)") +
  # Minimal theme for clearer visualisation
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
Question 2. Does multiple capture have an effect on blotching time?
# Read the sharksub.xlsx workbook into 'sharksub'
# (read_excel() is assumed to be available from a package loaded earlier
# in the file -- TODO confirm the library() call is present)
sharksub <- read_excel("sharksub.xlsx")
head(sharksub)    # Display the first rows of the data
summary(sharksub) # Display data range, quartiles, median and mean
ID sex blotch1 blotch2
Length:50 Length:50 Min. :32.49 Min. :33.47
Class :character Class :character 1st Qu.:34.38 1st Qu.:35.31
Mode :character Mode :character Median :34.94 Median :35.94
Mean :35.03 Mean :35.96
3rd Qu.:35.90 3rd Qu.:36.78
Max. :37.07 Max. :38.18
# Shapiro-Wilk normality tests for blotching times in both captures
shapiro.test(sharksub$blotch1)
Shapiro-Wilk normality test
data: sharksub$blotch1
W = 0.97958, p-value = 0.5345
# Shapiro-Wilk normality test for blotch2
shapiro.test(sharksub$blotch2)
Shapiro-Wilk normality test
data: sharksub$blotch2
W = 0.97936, p-value = 0.5255
# Paired t-test for blotch1 and blotch2: paired = TRUE treats the two
# columns as matched measurements, row by row
paired_ttest <- t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
paired_ttest # Output results
Paired t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.037176 -0.822301
sample estimates:
mean difference
-0.9297384
# Print the exact p-value stored in the test result
paired_ttest[["p.value"]]
[1] 1.326634e-22
# Standard deviation of blotch1
sd(sharksub[["blotch1"]])
[1] 1.095959
# Standard deviation of blotch2
sd(sharksub[["blotch2"]])
[1] 1.16283
length(sharksub$blotch1) # Print sample size of blotch1
[1] 50
length(sharksub$blotch2) # Print sample size of blotch2
[1] 50
Graphing Effect of Multiple Capture on Blotching Time
# Reshape blotch1/blotch2 from wide to long format, then draw one boxplot
# per capture event
sharksub %>%
  pivot_longer(
    cols = c(blotch1, blotch2),
    names_to = "CaptureEvent", # column holding the original column names
    values_to = "BlotchTime"   # column holding the corresponding values
  ) %>%
  # 'CaptureEvent' on the x axis, 'BlotchTime' on the y axis, with a
  # different fill colour for each capture event
  ggplot(aes(x = CaptureEvent, y = BlotchTime, fill = CaptureEvent)) +
  # Error bars are added first so the boxplot layer is drawn on top of them
  stat_boxplot(geom = "errorbar", width = 0.1) +
  geom_boxplot() +
  labs(x = "Capture Event", y = "Blotching Time (seconds)") +
  # Manually set the fill colours for the two boxplots
  scale_fill_manual(values = c("orange", "pink")) +
  # Tick marks at every integer from 32 to 40 (breaks place the ticks only;
  # they do not set the axis limits)
  scale_y_continuous(breaks = seq(32, 40, by = 1)) +
  # Rename the x axis labels
  scale_x_discrete(
    labels = c("blotch1" = "First Capture", "blotch2" = "Second Capture")
  ) +
  theme_minimal() +
  # Remove the x axis grid lines and the legend
  theme(panel.grid.major.x = element_blank(), legend.position = "none")
Independent Question. Does sex have an effect on blotching time?
# Shapiro-Wilk test by sex: data$variable == "value" builds a logical
# condition on 'sex' that filters the elements of 'blotch'
# Normality test for blotching times in female sharks
shapiro.test(sharks$blotch[sharks$sex == "Female"])
Shapiro-Wilk normality test
data: sharks$blotch[sharks$sex == "Female"]
W = 0.99527, p-value = 0.682
# Normality test for blotching times in male sharks
shapiro.test(sharks$blotch[sharks$sex == "Male"])
Shapiro-Wilk normality test
data: sharks$blotch[sharks$sex == "Male"]
W = 0.99209, p-value = 0.1701
# var.test() checks whether the variances of the two groups are equal
var.test(
  sharks$blotch[sharks$sex == "Female"],
  sharks$blotch[sharks$sex == "Male"]
)
F test to compare two variances
data: sharks$blotch[sharks$sex == "Female"] and sharks$blotch[sharks$sex == "Male"]
F = 0.94109, num df = 235, denom df = 263, p-value = 0.6347
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
0.7340224 1.2087050
sample estimates:
ratio of variances
0.9410879
# Two-sample t-test assuming equal variances (var.equal = TRUE). The formula
# blotch ~ sex makes 'blotch' the dependent variable and 'sex' the
# independent grouping variable.
t.test(blotch ~ sex, data = sharks, var.equal = TRUE)
Two Sample t-test
data: blotch by sex
t = -3.023, df = 498, p-value = 0.002632
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
-0.6326914 -0.1342420
sample estimates:
mean in group Female mean in group Male
34.92294 35.30641
# Descriptive summary of blotch for female sharks
summary(sharks$blotch[sharks$sex == "Female"])
Min. 1st Qu. Median Mean 3rd Qu. Max.
30.78 34.12 34.87 34.92 35.95 38.66
# Descriptive summary of blotch for male sharks
summary(sharks$blotch[sharks$sex == "Male"])
Min. 1st Qu. Median Mean 3rd Qu. Max.
31.88 34.25 35.30 35.31 36.26 40.08
# Standard deviation of 'Female' filtered 'blotch'
sd(sharks$blotch[sharks$sex == "Female"])
[1] 1.393145
# Standard deviation of 'Male' filtered 'blotch'
sd(sharks$blotch[sharks$sex == "Male"])
[1] 1.436089
# Sample size of 'Female' filtered 'blotch'
length(sharks$blotch[sharks$sex == "Female"])
[1] 236
# Sample size of 'Male' filtered 'blotch'
length(sharks$blotch[sharks$sex == "Male"])
[1] 264
Graphing Effect of Sex on Blotching Time
# Boxplot of the 'sharks' data with 'sex' on the x axis and 'blotch' on the
# y axis, using a different fill colour for each 'sex'
ggplot(sharks, aes(x = sex, y = blotch, fill = sex)) +
  # Error bars are added first so the boxplot layer is drawn on top of them
  stat_boxplot(geom = "errorbar", width = 0.2) +
  geom_boxplot() +
  labs(x = "Sex", y = "Blotching Time (seconds)") +
  scale_fill_manual(values = c("pink", "lightblue")) +
  # Tick marks every 2 units between 20 and 50
  scale_y_continuous(breaks = seq(20, 50, by = 2)) +
  theme_minimal() +
  # Remove the x axis grid lines and the legend
  theme(panel.grid.major.x = element_blank(), legend.position = "none")
Question 3. Is it possible to predict blotching time?
# Shapiro-Wilk normality test for blotch in the sharks dataset
shapiro.test(sharks$blotch)
Shapiro-Wilk normality test
data: sharks$blotch
W = 0.99695, p-value = 0.4769
# Summarise every column of the dataset
summary(sharks)
ID sex blotch BPM
Length:500 Length:500 Min. :30.78 Min. :119.0
Class :character Class :character 1st Qu.:34.16 1st Qu.:129.0
Mode :character Mode :character Median :35.05 Median :142.0
Mean :35.13 Mean :141.8
3rd Qu.:36.05 3rd Qu.:153.2
Max. :40.08 Max. :166.0
weight length air water
Min. : 65.10 Min. :128.3 Min. :33.00 Min. :20.01
1st Qu.: 75.68 1st Qu.:172.0 1st Qu.:34.42 1st Qu.:21.55
Median : 87.82 Median :211.1 Median :35.43 Median :23.11
Mean : 87.94 Mean :211.0 Mean :35.54 Mean :23.02
3rd Qu.:100.40 3rd Qu.:251.8 3rd Qu.:36.71 3rd Qu.:24.37
Max. :110.94 Max. :291.0 Max. :38.00 Max. :25.99
meta depth
Min. : 50.03 Min. :44.64
1st Qu.: 67.39 1st Qu.:48.90
Median : 82.45 Median :50.14
Mean : 82.04 Mean :50.14
3rd Qu.: 95.97 3rd Qu.:51.35
Max. :112.45 Max. :56.83
# Multiple linear regression model for blotching time using air, water, BPM,
# weight, length, meta and depth as predictors
model <- lm(
  blotch ~ air + water + BPM + weight + length + meta + depth,
  data = sharks
)
# Output model summary, showing coefficient estimates, standard errors,
# t values and p values
summary(model)
Call:
lm(formula = blotch ~ air + water + BPM + weight + length + meta +
depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.83745 -0.66117 -0.00702 0.60110 2.74108
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.1405851 1.8958668 5.876 7.74e-09 ***
air -0.0281474 0.0318707 -0.883 0.378
water -0.0188934 0.0270782 -0.698 0.486
BPM -0.0019723 0.0031890 -0.618 0.537
weight 0.0016283 0.0033511 0.486 0.627
length 0.0012295 0.0009710 1.266 0.206
meta -0.0009712 0.0025951 -0.374 0.708
depth 0.5061285 0.0223191 22.677 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.002 on 492 degrees of freedom
Multiple R-squared: 0.514, Adjusted R-squared: 0.507
F-statistic: 74.32 on 7 and 492 DF, p-value: < 2.2e-16
Graphing Prediction Model for Blotching Time
# Check the rest of the data for non-linear trends.
# Convert the data into a long format for facet wrapping: every column
# except blotch, sex, ID and depth is stacked into Predictor/Value pairs.
shark_long <- pivot_longer(
  sharks,
  cols = -c(blotch, sex, ID, depth),
  names_to = "Predictor",
  values_to = "Value"
)

# Custom labels for the facet strips, keyed by predictor column name
facet_labels <- c(
  air = "Air (ºC)",
  water = "Water (ºC)",
  meta = "Cortisol (mcg/dl)",
  BPM = "Heart Rate (BPM)",
  length = "Length (cm)",
  weight = "Weight (kg)"
)

# Scatter plots of blotch against each predictor, one facet per predictor
ggplot(shark_long, aes(x = Value, y = blotch)) +
  # Scatter points with transparency
  geom_point(alpha = 0.7, color = "blue") +
  # Free x scales because each predictor has its own range; custom titles
  facet_wrap(
    ~ Predictor,
    scales = "free_x",
    labeller = as_labeller(facet_labels)
  ) +
  labs(x = "Predictor Value", y = "Blotching Time (seconds)") +
  scale_y_continuous(breaks = seq(20, 50, by = 2)) +
  theme_minimal() +
  theme(strip.text = element_text(size = 11, face = "bold"))
# Scatter plot for depth vs blotch with a linear-model fit line and its
# confidence band (se = TRUE)
ggplot(sharks, aes(x = depth, y = blotch)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", color = "blue", se = TRUE) +
  labs(x = "Depth Hooked (metres)", y = "Blotching Time (seconds)") +
  scale_y_continuous(breaks = seq(20, 50, by = 2)) +
  scale_x_continuous(breaks = seq(40, 60, by = 2)) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
# Scatter plot for sex difference for depth vs blotch: mapping colour to
# 'sex' colours the points and the linear fits by sex
ggplot(sharks, aes(x = depth, y = blotch, color = sex)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    x = "Depth Hooked (metres)",
    y = "Blotching Time (seconds)",
    color = "Sex"
  ) +
  scale_y_continuous(breaks = seq(20, 50, by = 2)) +
  scale_x_continuous(breaks = seq(40, 60, by = 2)) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
# Fit the prediction model with depth and sex as predictors
predictionmodel <- lm(blotch ~ depth + sex, data = sharks)

# predict() calculates the predicted blotching time for each observation
# from the model above; the result is stored as a new column of 'sharks'
sharks$predictedblotch <- predict(predictionmodel, newdata = sharks)

# Plot observed vs predicted blotching times
ggplot(sharks, aes(x = predictedblotch, y = blotch)) +
  geom_point(alpha = 0.7, color = "darkgreen") +
  # Ideal line where predictions match observations
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
  labs(x = "Predicted Blotch", y = "Observed Blotch") +
  scale_y_continuous(breaks = seq(20, 50, by = 2)) +
  theme_minimal()

# Coefficients and fit statistics of the prediction model
summary(predictionmodel)
Call:
lm(formula = blotch ~ depth + sex, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.96224 -0.64833 -0.01317 0.58882 2.98829
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.80938 1.10029 8.915 < 2e-16 ***
depth 0.50172 0.02194 22.864 < 2e-16 ***
sexMale 0.30379 0.08871 3.424 0.000667 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9895 on 497 degrees of freedom
Multiple R-squared: 0.5214, Adjusted R-squared: 0.5195
F-statistic: 270.7 on 2 and 497 DF, p-value: < 2.2e-16
# Descriptive summary of the observed blotching times
summary(sharks[["blotch"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
30.78 34.16 35.05 35.13 36.05 40.08
# Descriptive summary of the predicted blotching times
summary(sharks[["predictedblotch"]])
Min. 1st Qu. Median Mean 3rd Qu. Max.
32.32 34.47 35.12 35.13 35.77 38.32
# Correlation test between observed and predicted values
# (cor.test() defaults to Pearson's product-moment correlation)
cor.test(sharks$blotch, sharks$predictedblotch)
Pearson's product-moment correlation
data: sharks$blotch and sharks$predictedblotch
t = 23.293, df = 498, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6772818 0.7615555
sample estimates:
cor
0.7220867