Warning: package 'tidyverse' was built under R version 4.3.3
Warning: package 'ggplot2' was built under R version 4.3.3
Warning: package 'tibble' was built under R version 4.3.1
Warning: package 'tidyr' was built under R version 4.3.3
Warning: package 'readr' was built under R version 4.3.3
Warning: package 'purrr' was built under R version 4.3.3
Warning: package 'dplyr' was built under R version 4.3.3
Warning: package 'stringr' was built under R version 4.3.3
Warning: package 'forcats' was built under R version 4.3.3
Warning: package 'lubridate' was built under R version 4.3.3
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2) #loads ggplot2 package for graph making
sharks <-read.csv("sharks.csv", header =TRUE)sharksub <-read.csv("sharksub.csv", header =TRUE)view(sharks) #opens the sharks data set in a new tab (after importing)view(sharksub) #opens the sharksub data set in a new tab (after importing)
##Normality: The shapiro wilk test
normblotch1 <-shapiro.test(sharksub$blotch1) #normal distributionprint(normblotch1) #this print command shows the result of the test
Shapiro-Wilk normality test
data: sharksub$blotch1
W = 0.97958, p-value = 0.5345
Shapiro-Wilk normality test
data: sharks$depth
W = 0.99746, p-value = 0.6485
##Multiple capture
ttestrecapture <-t.test(sharksub$blotch1, sharksub$blotch2, paired =TRUE, alternative ="two.sided") #paired t test between blotch 1 and blotch 2print(ttestrecapture)
Paired t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.037176 -0.822301
sample estimates:
mean difference
-0.9297384
#Comparing this between males and females#Creating subsets within the sharksub dataset by sexmalesharksub <-subset(sharksub, sex =="Male")femalesharksub <-subset(sharksub, sex =="Female")#Paired t-test for malesttestmalesharksub <-t.test(malesharksub$blotch1, malesharksub$blotch2, paired =TRUE)#Paired t-test for femalesttestfemalesharksub <-t.test(femalesharksub$blotch1, femalesharksub$blotch2, paired =TRUE)#To discover the resultsprint("Paired t-test for males: Recapture")
[1] "Paired t-test for males: Recapture"
print(ttestmalesharksub)
Paired t-test
data: malesharksub$blotch1 and malesharksub$blotch2
t = -10.527, df = 24, p-value = 1.786e-10
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.0857650 -0.7298108
sample estimates:
mean difference
-0.9077879
print("Paired t-test for females: Recapture")
[1] "Paired t-test for females: Recapture"
print(ttestfemalesharksub)
Paired t-test
data: femalesharksub$blotch1 and femalesharksub$blotch2
t = -14.694, df = 24, p-value = 1.7e-13
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.0853637 -0.8180143
sample estimates:
mean difference
-0.951689
#both females and males significantly different between blotch1 and blotch2, both when tested separately and together#Making a box plot for this#Now I must create a new dataset to arrange the data for graphssharksubgraphmaking <- reshape2::melt(sharksub, id.vars ="sex", measure.vars =c("blotch1", "blotch2"))#Boxplotting and making it look niceggplot(sharksubgraphmaking, aes(x = variable, y = value, fill = sex)) +geom_boxplot() +labs(title ="Blotch1 vs Blotch2 for Males and Females", x ="First Capture Recapture", y ="Blotch") +scale_fill_manual(values =c("pink", "blue")) +theme_minimal()
##Air and Water Correlation
corairwater <-cor.test(sharks$air, sharks$water, method ="spearman") #using spearman's rank test, negative correlation seen but not statistically significantprint(corairwater)
Spearman's rank correlation rho
data: sharks$air and sharks$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.05637344
##Correlations with Blotch
corblotchweight <-cor.test(sharks$blotch, sharks$weight, method ="spearman") #positive correlation between blotching and weight but not significantprint(corblotchweight)
Spearman's rank correlation rho
data: sharks$blotch and sharks$weight
S = 20832130, p-value = 0.999
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
5.376022e-05
corblotchlength <-cor.test(sharks$blotch, sharks$length, method ="spearman") #negative correlation but not significantprint(corblotchlength)
Spearman's rank correlation rho
data: sharks$blotch and sharks$length
S = 21076764, p-value = 0.7942
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.01168872
corblotchair <-cor.test(sharks$blotch, sharks$air, method ="spearman") #negative correlation but not significantprint(corblotchair)
Spearman's rank correlation rho
data: sharks$blotch and sharks$air
S = 21469324, p-value = 0.4956
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.03053167
corblotchwater <-cor.test(sharks$blotch, sharks$water, method ="spearman") #negative correlation but not significantprint(corblotchwater)
Spearman's rank correlation rho
data: sharks$blotch and sharks$water
S = 21758692, p-value = 0.3214
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.04442139
corblotchmeta <-cor.test(sharks$blotch, sharks$meta, method ="spearman") #negative correlation but not significantprint(corblotchmeta)
Spearman's rank correlation rho
data: sharks$blotch and sharks$meta
S = 21411160, p-value = 0.5359
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.02773979
Spearman's rank correlation rho
data: sharks$blotch and sharks$depth
S = 6797520, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.6737177
#Creating subsets within the shark dataset by sexmaleshark <-subset(sharks, sex =="Male")femaleshark <-subset(sharks, sex =="Female")#Unpaired t-test for males and femalesttestmalefemalesharks <-t.test(maleshark$blotch, femaleshark$blotch, paired =FALSE)#To discover the resultsprint("Unpaired t-test for males and females")
[1] "Unpaired t-test for males and females"
print(ttestmalefemalesharks)
Welch Two Sample t-test
data: maleshark$blotch and femaleshark$blotch
t = 3.0282, df = 494.67, p-value = 0.002589
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.1346620 0.6322714
sample estimates:
mean of x mean of y
35.30641 34.92294
#there is a significant difference between blotching in males and females. Next I will make a boxplot to show thisggplot(sharks, aes(x = sex, y = blotch, fill = sex)) +geom_boxplot() +labs(title ="Blotch: Males vs.Females",x ="Sex",y ="Blotch") +scale_fill_manual(values =c("pink", "blue")) +theme_minimal()
##Correlations with meta
cormetaweight <-cor.test(sharks$meta, sharks$weight, method ="spearman") #positive correlation but not significantprint(cormetaweight)
Spearman's rank correlation rho
data: sharks$meta and sharks$weight
S = 20413216, p-value = 0.6528
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.02016171
cormetalength <-cor.test(sharks$meta, sharks$length, method ="spearman") #positive correlation but not significantprint(cormetalength)
Spearman's rank correlation rho
data: sharks$meta and sharks$length
S = 20737928, p-value = 0.9187
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.004575474
Spearman's rank correlation rho
data: sharks$meta and sharks$air
S = 18172688, p-value = 0.004258
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.1277075
cormetawater <-cor.test(sharks$meta, sharks$water, method ="spearman") #positive correlation but not significantprint(cormetawater)
Spearman's rank correlation rho
data: sharks$meta and sharks$water
S = 20360456, p-value = 0.6126
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.0226942
cormetadepth <-cor.test(sharks$meta, sharks$depth, method ="spearman") #positive correlation but not significantprint(cormetadepth)
Spearman's rank correlation rho
data: sharks$meta and sharks$depth
S = 20614282, p-value = 0.8146
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.01051051
##Linear models to predict blotching
#Creating a model with all of the numerical variables in sharkslm_model <-lm(blotch ~ BPM + weight + length + air + water + meta + depth, data = sharks)summary(lm_model)
Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta +
depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.83745 -0.66117 -0.00702 0.60110 2.74108
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.1405851 1.8958668 5.876 7.74e-09 ***
BPM -0.0019723 0.0031890 -0.618 0.537
weight 0.0016283 0.0033511 0.486 0.627
length 0.0012295 0.0009710 1.266 0.206
air -0.0281474 0.0318707 -0.883 0.378
water -0.0188934 0.0270782 -0.698 0.486
meta -0.0009712 0.0025951 -0.374 0.708
depth 0.5061285 0.0223191 22.677 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.002 on 492 degrees of freedom
Multiple R-squared: 0.514, Adjusted R-squared: 0.507
F-statistic: 74.32 on 7 and 492 DF, p-value: < 2.2e-16
#Adjusting the model to reduce the interference from the non-significant variables and multicollinearity to only include the one significant variable seen in the previous model and the correlation tests.lm_depth_only <-lm(blotch ~ depth, data = sharks)summary(lm_depth_only)
Call:
lm(formula = blotch ~ depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.81869 -0.65427 -0.01035 0.58825 2.83116
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.82178 1.11207 8.832 <2e-16 ***
depth 0.50467 0.02216 22.772 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared: 0.5101, Adjusted R-squared: 0.5091
F-statistic: 518.6 on 1 and 498 DF, p-value: < 2.2e-16
#This model accounts for 51% of variation in the data, which is a mere 0.04% less than in the previous model.This is remedied by adjusting for R-squared, as when this is done the second model that focuses on depth increases its score to 50.91%.#It is time to create predictions using the depth modelsharks$predicted_blotch <-predict(lm_depth_only, newdata = sharks)#Creates predictions with the inclusion of confidence intervals which is helpful to assess the validity of the resultpredictions <-predict(lm_depth_only, newdata = sharks, interval ="confidence", level =0.95)#To make graph creation easier later, this will move the predicted values from the model into the sharks data setsharks$predicted_blotch <- predictions[, 1] sharks$lower_ci <- predictions[, 2] sharks$upper_ci <- predictions[, 3] #This creates a colourful scatter graph to visualise the relationship between depth and blotch, with the predicted blotching time represented as a red lineggplot(sharks, aes(x = depth, y = blotch)) +geom_point(aes(color ="Observed"), alpha =0.6) +geom_line(aes(x = depth, y = predicted_blotch, color ="Predicted"), size =1) +labs(title ="Blotch vs Depth",x ="Depth",y ="Blotch") +scale_color_manual(values =c("Observed"="blue", "Predicted"="red")) +theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
#Same as previous graph but with confidence intervals this time. ggplot(sharks, aes(x = depth)) +#Plots the observed values as blue pointsgeom_point(aes(y = blotch, color ="Observed"), alpha =0.6) +#Plots the predicted values as a red line across the graphgeom_line(aes(y = predicted_blotch, color ="Predicted"), size =1) +#Add confidence intervals that look like a semi-transparent red ribbongeom_ribbon(aes(ymin = lower_ci, ymax = upper_ci), fill ="red", alpha =0.2) +labs(title ="Blotch vs Depth: with Confidence Intervals",x ="Depth",y ="Blotch") +scale_color_manual(values =c("Observed"="blue", "Predicted"="red")) +theme_minimal()