library(readr)
library(vtable)
library(tidyverse)
library(ggcorrplot)
library(patchwork)
library(car)
library(lmerTest)
library(lme4)
library(geomtextpath)Research Methods & Data Analysis - Summative Workbook
Further through this page I made a mistake on the test I needed to use. I fixed my issue but I decided to keep the failed test in just so I remember my mistake and can learn from it!
Packages used throughout
Might have to use install.packages("insert package name here") if packages are not already installed!
Taking a look at the data
Importing the datasets
Data summary
Table of variables!
The three questions
With this data three questions are hoping to be answered through analysis.
Question one
Is there a correlation between the variables air and water temperature?
Question two
Does multiple capture have an effect on blotching time?
Question three
Is it possible to predict blotching time?
Descriptive Statistics
Question one
Click here for the code!
# ggplot package is within tidyverse
# creating the plot
s1 <- ggplot(sharks, aes(x = water, y = air)) + # data and x + y values used
geom_point() + # scatter plot
geom_smooth(method = "loess") + # adds a trend line
labs(x = "Water Temperature (°C)", y = "Air Temperature (°C)") + # renaming axes
theme_minimal() + # decorations
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5))
# printing the plot explicitly
print(s1)No apparent correlation between both temperatures.
Question two
Reshaping the data to be in long format
Click here for the code!
long_sharksub <- sharksub %>%
pivot_longer(cols = starts_with("blotch"), # select the blotch columns
names_to = "blotch_type", # new column for blotch type
values_to = "blotch_time") # new column for values
# view the reshaped data
head(long_sharksub)# A tibble: 6 × 4
ID sex blotch_type blotch_time
<chr> <chr> <chr> <dbl>
1 SH269 Female blotch1 36.1
2 SH269 Female blotch2 37.2
3 SH163 Female blotch1 33.4
4 SH163 Female blotch2 34.4
5 SH008 Female blotch1 36.3
6 SH008 Female blotch2 36.5
This makes it so the two catch occurrences can be compared in the time it takes to blotch.
Creating box plot
Click here for the code!
# creating the plot
s2 <- ggplot(long_sharksub, aes(x = blotch_type, y = blotch_time, fill = blotch_type)) + # data and x + y values used
stat_boxplot(geom = "errorbar", width = .4) + # adds ends to the whiskers
geom_boxplot() + # box plot
scale_x_discrete(labels = c("blotch1" = "First Catch", "blotch2" = "Second Catch")) + # renaming tick marks
labs(x = "Catch Occurance", y = "Time Taken to Blotch (secs)") + # renaming axes
coord_flip() +
ylim(32, NA) + # changes y limits to include all blotching values (added to have a starting value at the beginning)
theme_minimal() + # decorations
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5), legend.position = "none") +
scale_fill_brewer(palette = "Set2")
# printing the plot explicitly
print(s2)Increase in range, interquartile range, and median on second catch.
Creating another box plot that is grouped by sex
Click here for the code!
# creating the plot
s3 <- ggplot(long_sharksub, aes(x = blotch_type, y = blotch_time, fill = sex)) + # data, x + y values used and colour fill of boxes
stat_boxplot(geom = "errorbar", width = .4) + # adds ends to the whiskers
geom_boxplot() + # box plot
facet_wrap(~sex) + # grouping box plots per sex
scale_x_discrete(labels = c("blotch1" = "First Catch", "blotch2" = "Second Catch")) + # renaming tick marks
labs(x = "Catch Occurance", y = "Time Taken to Blotch (secs)") + # renaming axes
coord_flip() +
ylim(32, NA) + # changes y limits to include all blotching values (added to have a starting value at the beginning)
theme_minimal() + # decorations
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5), legend.position = "none") +
scale_fill_brewer(palette = "Accent")
# printing the plot explicitly
print(s3)Relatively similar between the two sexes, however in females the median of the second catch is greater than the upper quartile of the first catch. Important to know the both catches are for the same 50 sharks that were caught twice and so excludes the other 450 that were caught once.
Question three
As there are many factors that might be affecting blotching time there are several graphs that can be made.
Blotching VS numerical values
BPM, weight, length, air, water, meta, and depth are all numerical values.
Correlation heatmaps
Click here for the code!
# creating new column that is the difference in air and water temperature
sharks <- sharks %>%
mutate(temp_diff = air - water)
# selecting the numeric columns and computing a correlation matrix
correlation_matrix <- cor(sharks %>% select(where(is.numeric)))
# creating correlation heat map
ggcorrplot(correlation_matrix, method = "square", lab = TRUE)From this it can be seen that depth has a strong positive correlation with blotching, whereas all other values have no or a weak correlation with blotching.
Scatter plots
Lets first look at all of the no or a weak correlation values of the plot before as they might have hidden non-linear trends the plot didn’t pick up on.
Click here for the code!
# scatter plot between blotch and BPM
plot1 <- ggplot(sharks, aes(x = blotch, y = BPM)) +
geom_point() +
geom_smooth() +
labs(title = "Blotch vs BPM", x = "Time Taken to Blotch (secs)", y = "Beats per Minute") +
xlim(30, NA) + # changes x limits to include all blotching values (added to have a starting value at the beginning)
theme_minimal() +
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5))
# scatter plot between blotch and weight
plot2 <- ggplot(sharks, aes(x = blotch, y = weight)) +
geom_point() +
geom_smooth() +
labs(title = "Blotch vs Weight", x = "Time Taken to Blotch (secs)", y = "Weight (kg)") +
xlim(30, NA) + # changes x limits to include all blotching values (added to have a starting value at the beginning)
theme_minimal() +
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5))
# scatter plot between blotch and length
plot3 <- ggplot(sharks, aes(x = blotch, y = length)) +
geom_point() +
geom_smooth() +
labs(title = "Blotch vs Length", x = "Time Taken to Blotch (secs)", y = "Length (cm)") +
xlim(30, NA) + # changes x limits to include all blotching values (added to have a starting value at the beginning)
theme_minimal() +
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5))
# scatter plot between blotch and meta
plot4 <- ggplot(sharks, aes(x = blotch, y = meta)) +
geom_point() +
geom_smooth() +
labs(title = "Blotch vs Meta", x = "Time Taken to Blotch (secs)", y = "Meta (mcg/dl)") +
xlim(30, NA) + # changes x limits to include all blotching values (added to have a starting value at the beginning)
theme_minimal() +
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5))
# combining the plots into one large plot
combined_plot1 <- plot1 + plot2 + plot3 + plot4 +
plot_layout(ncol = 2) + # arranging them in 2x2 grid
plot_annotation(tag_levels = "A") + # add tags (A, B, C...)
theme(plot.margin = margin(10, 10, 10, 10)) # adds a margin around each plot
# print the plots together
print(combined_plot1)Nothing much happening matches the heat map. Would not say there is hidden non-linear trends.
Click here for the code!
# scatter plot between blotch and water temp
plot5 <- ggplot(sharks, aes(x = blotch, y = water)) +
geom_point() +
geom_smooth() +
labs(title = "Blotch vs Water Temperature", x = "Time Taken to Blotch (secs)", y = "Water Temperature (°C)") +
xlim(30, NA) + # changes x limits to include all blotching values (added to have a starting value at the beginning)
theme_minimal() +
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5))
# scatter plot between blotch and air temp
plot6 <- ggplot(sharks, aes(x = blotch, y = air)) +
geom_point() +
geom_smooth() +
labs(title = "Blotch vs Air Temperature", x = "Time Taken to Blotch (secs)", y = "Air Temperature (°C)") +
xlim(30, NA) + # changes x limits to include all blotching values (added to have a starting value at the beginning)
theme_minimal() +
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5))
# scatter plot between blotch and depth
plot7 <- ggplot(sharks, aes(x = blotch, y = depth)) +
geom_point() +
geom_smooth(method = lm, se = TRUE) + # adds a linear trend + confidence interval
labs(title = "Blotch vs Depth", x = "Time Taken to Blotch (secs)", y = "Depth (m)") +
xlim(30, NA) + # changes x limits to include all blotching values (added to have a starting value at the beginning)
theme_minimal() +
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5))
# scatter plot between blotch and temp diff
plot8 <- ggplot(sharks, aes(x = blotch, y = temp_diff)) +
geom_point() + # Scatter plot
geom_smooth() + # adds a linear trend + confidence interval
labs(title = "Blotch vs Temperature Difference",
x = "Blotching Time (secs)",
y = "Temperature Difference (°C)") +
xlim(30, NA) + # changes x limits to include all blotching values (added to have a starting value at the beginning)
theme_minimal() +
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5))
# combining the plots into one large plot
combined_plot2 <- plot5 + plot6 + plot7 + plot8 +
plot_layout(ncol = 2) + # arranging them in 2x2 grid
plot_annotation(tag_levels = "A") + # add tags (A, B, C...)
theme(plot.margin = margin(10, 10, 10, 10)) # adds a margin around each plot
# print the plots together
print(combined_plot2)Still not much happening with both. C, however, clearly shows the positive correlation with time taken to blotch and depth. The greater depth that a shark is caught at the longer it takes to develop blotches.
Blotching VS categorical values
Sex is the only categorical value!
Click here for the code!
# creating the plot
s5 <- ggplot(sharks, aes(x = sex, y = blotch, fill = sex)) + # data, x + y values used and colour fill of boxes
stat_boxplot(geom = "errorbar", width = .4) + # adds ends to the whiskers
geom_boxplot() + # box plot
labs(x = "Sex", y = "Time Taken to Blotch (secs)") + # renaming axes
coord_flip() +
ylim(30, NA) + # changes y limits to include all blotching values (added to have a starting value at the beginning)
theme_minimal() + # decorations
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5), legend.position = "none") +
scale_fill_brewer(palette = "Accent")
# printing the plot explicitly
print(s5)Few anomalies but both sexes are very similar.
Data Analysis
Question one
Based on the scatter plot there’s a high likelihood there is not any statistical significance, but nevertheless lets delve into it!
However, to determine what test is needed a few things need to be identified first.
First we know the data is continuous an we want to have a test of association.
Now we need to know whether the data meets the parametric assumptions, which are:
Is the data continuous?
- Yes!
Is the sample size greater than 10?
- Yes!
Is the data normally distributed?
- This is something we have to work out using a Shapiro-Wilks test
Does the data have equal variance?
- This has another test to check it, the Levenes test
Shapiro-Wilks
It is important to note the Shapiro Wilks test is sensitive to sample size and as the sample becomes large (over 100) it increases the chance of a statistically significant result. So if there’s a significant result it might have to be further investigated.
The test statistic W is calculated - small values suggest the data IS NOT normally distributed and larger values suggest that the data IS normally distributed.
The test statistic is supported with a p value to show if the test results are significant.
If p > 0.05 the null hypothesis can be accepted, so it can be concluded that the data IS normally distributed.
If p ≤ 0.05 the null hypothesis can be rejected, so it can be concluded that the data IS NOT normally distributed
shapiro.test(sharks$air) # shapiro for air temp from shark data set
Shapiro-Wilk normality test
data: sharks$air
W = 0.95885, p-value = 1.338e-10
Not normally distributed! - p is less than 0.05 and W value is small
shapiro.test(sharks$water) # shapiro for water temp from shark data set
Shapiro-Wilk normality test
data: sharks$water
W = 0.96035, p-value = 2.371e-10
Not normally distributed! - p is less than 0.05 and W value is small
Levenes
As the data is not normally distributed it does not fit all of the parametric assumptions even if it has an equal variance but lets have a look anyway :)
# air and water temperature needs to be into long format first
pivoted_temp <- sharks %>%
pivot_longer(cols = c(air, water), names_to = "variable", values_to = "value")
# levene test with new long data that has variables (either air or water) and values (the temp number)
leveneTest(value ~ variable, data = pivoted_temp)Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 1 16.416 5.481e-05 ***
998
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
p-value > 0.05: Fail to reject the null hypothesis, meaning the variances are likely equal
p-value < 0.05: Reject the null hypothesis, meaning the variances are likely unequal
Here the p value is very small again (less than 0.05) meaning the variances are not equal! Further meaning the data does not meet parametric assumptions.
Spearman’s Rank Correlation
As said before the data does not meet the parametric assumptions so a Spearman’s Rank Correlation test will be used to determine if there’s any correlation between air and water temperature.
# spearman rank test for shark data for air and water temp
cor.test(sharks$air, sharks$water, method = "spearman")
Spearman's rank correlation rho
data: sharks$air and sharks$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.05637344
The Spearman’s Rank Correlation Coefficient (rho or ρ) ranges from -1 (perfect negative) to +1 (perfect positive). So the value here (-0.05637…) being close to 0 shows a very weak negative relationship between air and water temperature - basically negligible.
The test also gives a p-value. If this is < 0.05 the relationship is statistically significant. Here the value (0.2082) is greater than 0.05 (failing to reject null hypothesis), meaning there is no statistically significant correlation between air and water temperature.
Question two
Continuous data once again (time taken to blotch). The catch occurrence is categorical tho!
We know to answer this question a test of difference is needed, as we want to know if there is a difference between blotching time between first and second capture in sharks.
We also know we have one dependent variable (time taken to blotch) compared across two conditions (first capture and second capture). This sharksub data set is dependent as well, due to each condition having the same observed individuals.
However, we once again we need to know whether the data meets the parametric assumptions!
Is the data continuous?
- Yes!
Is the sample size greater than 10?
- Yes!
Is the data normally distributed?
- Shapiro-Wilks test needed
Does the data have equal variance?
- Levenes test needed
Shapiro-Wilks
First capture:
shapiro.test(sharksub$blotch1) # shapiro for first capture blotch time from sharksub data set
Shapiro-Wilk normality test
data: sharksub$blotch1
W = 0.97958, p-value = 0.5345
Normally distributed! - p is greater than 0.05
Second capture:
shapiro.test(sharksub$blotch2) # shapiro for second capture blotch time from sharksub data set
Shapiro-Wilk normality test
data: sharksub$blotch2
W = 0.97936, p-value = 0.5255
Normally distributed! - p is greater than 0.05
Click here for the code!
# need long shark data
long_sharksub <- sharksub %>%
pivot_longer(cols = starts_with("blotch"), # select the blotch columns
names_to = "blotch_type", # new column for blotch type
values_to = "blotch_time") # new column for values
# creating histogram with normal curves
histsub <- ggplot(long_sharksub, aes(x = blotch_time, fill = blotch_type)) +
geom_histogram(aes(y = ..density..), bins = 30, alpha = .4, position = "identity", color = "black") + # creates histogram
stat_function(data = subset(long_sharksub, blotch_type == "blotch1"),
fun = dnorm,
args = list(mean = mean(long_sharksub$blotch_time[long_sharksub$blotch_type == "blotch1"]),
sd = sd(long_sharksub$blotch_time[long_sharksub$blotch_type == "blotch1"])),
color = "aquamarine4", size = 1.5) +
stat_function(data = subset(long_sharksub, blotch_type == "blotch2"),
fun = dnorm,
args = list(mean = mean(long_sharksub$blotch_time[long_sharksub$blotch_type == "blotch2"]),
sd = sd(long_sharksub$blotch_time[long_sharksub$blotch_type == "blotch2"])),
color = "darkorange4", size = 1.5) + # this makes the lines
scale_fill_brewer(palette = "Set2", labels = c("First Catch", "Second Catch")) + # different colours for the histograms and changes legend names
labs(x = "Time Taken to Blotch (secs)",
y = "Density",
fill = "Catch Occurrence") + # changing names
theme_minimal() + # decorations
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5))
# print plot explicitly
print(histsub)This histogram helps to show this!
Levenes
# need long shark data again
long_sharksub <- sharksub %>%
pivot_longer(cols = starts_with("blotch"), # select the blotch columns
names_to = "blotch_type", # new column for blotch type
values_to = "blotch_time") # new column for values
# levene test with new long data that has variables (either first or second catch) and values (the time taken)
leveneTest(blotch_time ~ blotch_type, data = long_sharksub)Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 1 0.1522 0.6973
98
The p-value (0.6973) is greater than 0.05 so the variances are most likely equal!
This means this data set fits the parametric assumptions, so a Paired t-test should be used to test if multiple capture creates a difference in blotching times.
Paired t-test
# t-test time - paired is because each condition (first and second) used same individuals
t_test_result <- t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
t_test_result
Paired t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.037176 -0.822301
sample estimates:
mean difference
-0.9297384
The t-statistic (t) is a measure of the difference between the sample means, relative to the variation in the data. Here t = -17.39 which is quite large in magnitude, indicating a very strong difference between the time taken to blotch in the first and second capture.
df = 49 just shows sample size is 50 as df = sample size - 1
The p-value is extremely small (2.2e-16) meaning the null hypothesis is rejected meaning the mean difference between taken taken to blotch the first time and the second time is statistically significantly different from zero.
The confidence interval for the mean difference is between -1.037 and -0.822. This means there’s a 95% confidence that the true mean difference between the time for blotching at each catch occurrence lies between -1.037 and -0.822. Since the interval does not contain 0, this provides additional evidence that the mean difference is statistically significant.
The mean difference in time taken to blotch between the first and second capture is -0.9297… Meaning on average it takes 0.93 seconds less to blotch on the second catch.
But wait the histogram and box plot shows the second capture to have a greater time????
As the data consisted of repeated measurements from the same sharks, a paired t-test was conducted to account for the within-individual differences between blotch1 and blotch2. This test focuses on the differences between the two conditions for each individual, rather than comparing overall averages.
The paired t-test revealed a highly significant result (t = -17.39, p < 0.001), with a mean difference of -0.93. This indicates that, on average, blotch times of the same individual shark for the second capture were significantly shorter than for the first capture.
Click here for the code!
# create the dot and line plot
sharkdotline <- ggplot(sharksub, aes(x = factor(1), group = 1)) +
# add lines for each individual shark
geom_segment(aes(x = 1, xend = 2, y = blotch1, yend = blotch2),
color = "darkgray", alpha = 0.6) +
# add dots for first catch
geom_point(aes(x = 1, y = blotch1), color = "aquamarine3", size = 3) +
# add dots for second catch
geom_point(aes(x = 2, y = blotch2), color = "darkorange3", size = 3) +
# customise axes and labels
scale_x_continuous(breaks = c(1, 2), labels = c("First Catch", "Second Catch")) +
labs(
x = "Catch Occurrence",
y = "Time Taken to Blotch (secs)"
) +
theme_minimal() + # decorations
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5), legend.position = "none") +
scale_fill_brewer(palette = "Set2")
# printing the plot explicitly
print(sharkdotline)This dot and line plot also seems to have more increase lines?
Lets look at a histogram of the differences between the two captures!
Click here for the code!
# a new data set with difference between blotch 1 and 2 column
sharksubdiff <- sharksub %>%
mutate(diff = blotch1 - blotch2)
ggplot(sharksubdiff, aes(x = diff)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
labs(x = "Difference in Time Taken to Blotch",
y = "Count") + # labeling of axis
theme_minimal() + # decorations
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5), legend.position = "none") +
scale_fill_brewer(palette = "Set2")This shows whilst there are increases (positive numbers) in the time taken to blotch between the first and the second catch there are way more cases of decreased (negative numbers) times between the first and second catch. This further goes against t-test results.
# data set with difference between blotch 1 and 2 column
sharksubdifference <- sharksub$blotch1 - sharksub$blotch2
shapiro.test(sharksubdifference)
Shapiro-Wilk normality test
data: sharksubdifference
W = 0.43157, p-value = 1.212e-12
Based on the fact the difference in time is not normally distributed I am going to try a Wilcoxon Rank Sums test instead for the data (as parametric assumptions not met).
Wilcoxon Rank Sums - might be the right option
wilcox.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
Wilcoxon signed rank test with continuity correction
data: sharksub$blotch1 and sharksub$blotch2
V = 12, p-value = 1.606e-09
alternative hypothesis: true location shift is not equal to 0
The very small p-value (p < 0.001) strongly suggesting there is a statistically significant difference between time taken to blotch on the first and second catch. Slower on second catch!
My mistake was using a t-test as although the times on there own were normally distributed the difference between them wasn’t! When the difference between paired observations is not normally distributed, the t-test might overestimate or underestimate the true effect, giving unreliable results (like the negative mean difference I got).
OOOPS! you live and you learn :)
Sex differences
Now lets also look at whether the time to blotch changes from the first to the second capture within each sex. Important to note there was 25 males and 25 females observed.
# first split sexes
sharksub_male <- subset(sharksub, sex == "Male")
sharksub_female <- subset(sharksub, sex == "Female")
# checking normality
shapiro.test(sharksub_male$blotch1 - sharksub_male$blotch2)
Shapiro-Wilk normality test
data: sharksub_male$blotch1 - sharksub_male$blotch2
W = 0.46081, p-value = 1.569e-08
shapiro.test(sharksub_female$blotch1 - sharksub_female$blotch2)
Shapiro-Wilk normality test
data: sharksub_female$blotch1 - sharksub_female$blotch2
W = 0.40466, p-value = 4.846e-09
NOT normally distributed! Therefore, a Wilcoxon Rank Sums test will be used to test whether the time taken to blotch changes between first and second capture, per sex.
# Wilcoxon Signed-Rank Test for males
wilcox.test(sharksub_male$blotch1, sharksub_male$blotch2, paired = TRUE)
Wilcoxon signed rank exact test
data: sharksub_male$blotch1 and sharksub_male$blotch2
V = 5, p-value = 5.96e-07
alternative hypothesis: true location shift is not equal to 0
# Wilcoxon Signed-Rank Test for females
wilcox.test(sharksub_female$blotch1, sharksub_female$blotch2, paired = TRUE)
Wilcoxon signed rank exact test
data: sharksub_female$blotch1 and sharksub_female$blotch2
V = 2, p-value = 1.788e-07
alternative hypothesis: true location shift is not equal to 0
Both tests are statistically significant (p < 0.001), meaning blotch time changes significantly between first and second capture in both sexes.
Based on the box plot the second catch has the longer blotching time compared to the first catch for both males and females!
Question three
For this question we want to build a predictive model for blotching time based on the measured variables (9 in total)!
However, I am first going to start with a singular linear regression model just for depth as it has such as strong visual correlation to blotching time.
Singular linear regression model - depth
Testing normality
Click here for the code!
shapiro.test(sharks$depth)
Shapiro-Wilk normality test
data: sharks$depth
W = 0.99746, p-value = 0.6485
Click here for the code!
shapiro.test(sharks$blotch)
Shapiro-Wilk normality test
data: sharks$blotch
W = 0.99695, p-value = 0.4769
Normally distributed!
# simple linear regression (slr) with depth
slr_depth_model <- lm(blotch ~ depth, data = sharks)
# viewing model summary
summary(slr_depth_model)
Call:
lm(formula = blotch ~ depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.81869 -0.65427 -0.01035 0.58825 2.83116
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.82178 1.11207 8.832 <2e-16 ***
depth 0.50467 0.02216 22.772 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared: 0.5101, Adjusted R-squared: 0.5091
F-statistic: 518.6 on 1 and 498 DF, p-value: < 2.2e-16
# checking Homoscedasticity - equal varience
plot(slr_depth_model, which = 1) The plot shows the residuals (the differences between observed and predicted values) have equal variance meaning the regression model is valid.
Looking at the model:
The estimate intercept of 9.8 means that sharks caught at 0m take 9.8 seconds to blotch (not really applicable in practise).
The estimate gradient (slope) for depth of 0.50467 means that for each unit increase in depth, blotch time increases by 0.50467 units. Positive means as one increases the other does too!
Basically: y = mx + c y = 0.5x + 9.8
The p-value is extremely small (< 0.001), indicating that the relationship between depth and blotch time is statistically significant. Depth is a significant predictor of blotch time.
The R-squared value indicates that 51% of the variability in blotch time can be explained by depth alone. Suggests that depth has decent explanatory power, but other factors likely also play a role in explaining the variability in blotching time - this is where multiple linear regression comes in!
The F-statistic (518.6) tests whether the model as a whole is significant. Since the p-value is less than 0.05, we can conclude that the model explains a significant portion of the variability in blotch time.
If we want to predict blotching with just one variable it would be wise to choose blotch, but lets look at other variables too with a multiple linear regression model.
Multiple linear regression model
# multiple linear regression (mlr)
mlr_model <- lm(blotch ~ sex + BPM + weight + length + air + water + meta + depth, data = sharks)
# viewing model summary
summary(mlr_model)
Call:
lm(formula = blotch ~ sex + BPM + weight + length + air + water +
meta + depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.97715 -0.66193 -0.00841 0.64123 2.90395
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.1179728 1.8749828 5.930 5.73e-09 ***
sexMale 0.3088617 0.0890602 3.468 0.00057 ***
BPM -0.0020791 0.0031540 -0.659 0.51009
weight 0.0017281 0.0033143 0.521 0.60231
length 0.0013042 0.0009606 1.358 0.17517
air -0.0310068 0.0315302 -0.983 0.32590
water -0.0143878 0.0268112 -0.537 0.59176
meta -0.0011610 0.0025671 -0.452 0.65127
depth 0.5034077 0.0220870 22.792 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9912 on 491 degrees of freedom
Multiple R-squared: 0.5256, Adjusted R-squared: 0.5178
F-statistic: 67.99 on 8 and 491 DF, p-value: < 2.2e-16
This model shows sex and depth are important factors to blotching!
The other predictors (BPM, weight, length, air, water, and meta) do not have statistically significant relationships (p-values > 0.05) with blotch time (p-values are greater than 0.05), so they don’t significantly contribute to predicting the blotch time.
Since I already did depth slr model for depth (which shows similar results to the mlr model) I thought I should do one for the other important varible: sex
Singular linear regression model - sex
# simple linear regression (slr) with sex
slr_depth_model_sex <- lm(blotch ~ sex, data = sharks)
# viewing model summary
summary(slr_depth_model_sex)
Call:
lm(formula = blotch ~ sex, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-4.1471 -0.9653 -0.0222 0.9889 4.7772
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 34.92294 0.09217 378.885 < 2e-16 ***
sexMale 0.38347 0.12685 3.023 0.00263 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.416 on 498 degrees of freedom
Multiple R-squared: 0.01802, Adjusted R-squared: 0.01605
F-statistic: 9.139 on 1 and 498 DF, p-value: 0.002632
The Intercept (for Female, since sexMale is the contrast) is 34.92.
The estimate gradient for sexMale is 0.38347, meaning that males have an average blotching time 0.38 seconds longer than females (on average)
p-value for sexMale is 0.00263, which is statistically significant (since it’s < 0.05), suggesting that sex does have a significant effect on blotching time.
R-squared is 0.01802, which means only about 1.8% of the variability in blotching time can be explained by sex.
This is a lower R-squared compared to depth, indicating that sex alone is not a strong predictor of blotching time.
A combination of the two is probably the way to go!
Multiple linear regression model - depth + sex
# mutltiple linear regression (mlr) with depth + sex
mlr_model_depth_sex <- lm(blotch ~ depth + sex, data = sharks)
# viewing model summary
summary(mlr_model_depth_sex)
Call:
lm(formula = blotch ~ depth + sex, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.96224 -0.64833 -0.01317 0.58882 2.98829
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.80938 1.10029 8.915 < 2e-16 ***
depth 0.50172 0.02194 22.864 < 2e-16 ***
sexMale 0.30379 0.08871 3.424 0.000667 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9895 on 497 degrees of freedom
Multiple R-squared: 0.5214, Adjusted R-squared: 0.5195
F-statistic: 270.7 on 2 and 497 DF, p-value: < 2.2e-16
Depth is a strong predictor with a highly significant effect (p-value < 2e-16), and its positive coefficient (0.50172) means that as depth increases, blotching time increases as well (by 0.50172 secs).
Sex also has a statistically significant effect (p-value = 0.000667), with males having slightly longer blotching times (0.30379 secs).
The model explains 52.14% of the variability in blotching time (R-squared value).
Before jumping to conclusions we need to check if model meets assumptions.
Check Normality of Residuals
Click here for the code!
# plotting the residuals
plot(slr_depth_model_sex, which = 2) # Q-Q plotClick here for the code!
# Shapiro-Wilk test for normality of residuals
shapiro.test(residuals(slr_depth_model_sex))
Shapiro-Wilk normality test
data: residuals(slr_depth_model_sex)
W = 0.99717, p-value = 0.547
The Q-Q plot looks linear - residuals are approximately normal.
p-value > 0.05 in the Shapiro-Wilk test - the residuals are normally distributed.
Check Homoscedasticity (Equal Variance of Residuals)
Click here for the code!
# plotting residuals v fitted values
plot(slr_depth_model_sex, which = 1)The plot shows a similar random scatter of points around zero in each line of points (male and female) - indicates equal variance.
Check for Influential Points (Outliers)
Using Cook’s distance! Points with a Cook’s distance greater than 1 or 4/n (500) may be influential.
Click here for the code!
# plotting Cook's distance
plot(slr_depth_model_sex, which = 4)There are a few points greater than 4/500, but as the residuals are reasonably distributed and there are no other major issues with heteroscedasticity or non-normality, the model is likely still valid.
Based on these results depth and sex can be used to predict blotching time!
The model has a strong significance, and both variables contribute to explaining blotching time.
Lets make a plot to sort of show this!
Click here for the code!
# creating plot
predict <- ggplot(sharks, aes(x = blotch, y = depth, color = sex)) +
geom_point() +
geom_labelsmooth(aes(label = sex), fill = "white",
method = "lm", formula = y ~ x,
size = 3, linewidth = 1, boxlinewidth = 0.4) + # adds a linear trend with labels
labs(x = "Time Taken to Blotch (secs)", y = "Depth (m)") +
xlim(30, NA) + # changes x limits to include all blotching values (added to have a starting value at the beginning)
scale_color_brewer(palette = "Accent") + # add color scale for sex with the Accent palette
theme_minimal() +
theme(plot.background = element_rect(color = "darkslategrey",
fill = NA, size = .5)) +
guides(color = 'none')
# print plot explicitly
print(predict)You can see the slight steepness females have on males here!
Interaction Model
# interaction effect between depth and sex
interaction_model <- lm(blotch ~ depth * sex, data = sharks)
# view model summary
summary(interaction_model)
Call:
lm(formula = blotch ~ depth * sex, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.9614 -0.6388 -0.0179 0.5981 2.9815
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.14057 1.58486 6.398 3.64e-10 ***
depth 0.49510 0.03164 15.650 < 2e-16 ***
sexMale -0.33657 2.20540 -0.153 0.879
depth:sexMale 0.01277 0.04396 0.291 0.771
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9904 on 496 degrees of freedom
Multiple R-squared: 0.5215, Adjusted R-squared: 0.5186
F-statistic: 180.2 on 3 and 496 DF, p-value: < 2.2e-16
Need to check assumptions!
Click here for the code!
# residual normality
qqnorm(residuals(interaction_model))
qqline(residuals(interaction_model))Click here for the code!
# homoscedasticity (equal variance)
plot(fitted(interaction_model), residuals(interaction_model))
abline(h = 0, col = "red")Click here for the code!
# multicollinearity
vif(interaction_model) depth sex depth:sex
2.077837 617.828396 620.927378
Click here for the code!
# residual independence
plot(residuals(interaction_model) ~ seq_along(residuals(interaction_model)))
abline(h = 0, col = "red")Third one down- the Variance Inflation Factor (VIF) values for sex (617.83) and depth:sex (620.93) are extremely high. Typically, a VIF above 10 indicates severe multicollinearity, meaning that these predictors are highly correlated and could be distorting the model.
This aligns with previous results, where sex was significant in a simpler model but became non-significant when included in an interaction model. The high VIF suggests that sex and depth × sex might be redundant in this context.