# Check that the data file exists in the working directory.
file.exists("sharksub.csv")
[1] TRUE
Code
# Load the recapture subset. The file is read relative to the current
# working directory rather than after a setwd() to a user-specific absolute
# path, which would fail on any other machine.
sharksub <- read.csv("sharksub.csv")
Now we will view the “sharks” and “sharksub” data; each data set opens in a separate tab.
Code
# Open each data set in the data viewer for a visual check.
view(sharks)
view(sharksub)
Data Visualisation
Sharks (By Sex)
Now that we have confirmed that both the shark and sharksub datasets are working properly, let’s begin to visualise the data. First we will do a ggplot of male and female sharks plotted by weight and length, organized by color and shape.
Code
# Scatter plot of weight against length; sex is encoded by both point
# colour and point shape.
ggplot(sharks, aes(x = length, y = weight)) +
  geom_point(aes(color = sex, shape = sex)) +
  labs(
    title = "Shark Comparison: Sex by Weight",
    x = "Length (cm)",
    y = "Weight (kg)",
    color = " ",
    shape = " ",
    fill = " "
  ) +
  scale_color_manual(values = c("purple", "cyan4")) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5)
  )
Air & Water Correlation
The plot above shows no apparent correlation between length and weight for either females or males. Let’s repeat the exercise for air and water temperature.
Code
# Scatter plot of air temperature (y) against water temperature (x),
# with sex mapped to shape, colour and fill.
ggplot(
  sharks,
  aes(x = water, y = air, shape = sex, color = sex, fill = sex)
) +
  geom_point() +
  labs(
    title = "Correlation: Air & Water",
    x = "Water (Celsius)",
    y = "Air (Celsius)",
    color = " ",
    shape = " ",
    fill = " "
  ) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5)
  )
There does not appear to be a correlation here. Let’s try to change the x and y axes.
Code
# Same scatter plot with the axes exchanged: air temperature on x and water
# temperature on y. The aesthetics are swapped together with the labels so
# that each axis label describes the variable actually drawn on that axis.
sharks %>%
  ggplot(aes(x = air, y = water, shape = sex, color = sex, fill = sex)) +
  geom_point() +
  labs(
    title = "Correlation: Air & Water",
    x = "Air (Celsius)",
    y = "Water (Celsius)",
    color = " ",
    shape = " ",
    fill = " "
  ) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5)
  )
Air Temperature & Catch Number
Still nothing. This may not be the correct type of graph, so let’s try something different by way of a histogram.
Code
# `sharks` is a data frame already present in the session, not a data set
# shipped with a package, so data("sharks") only raises a
# "data set 'sharks' not found" warning. Confirm the object is available
# instead.
stopifnot(is.data.frame(sharks))
Warning in data("sharks"): data set 'sharks' not found
Code
# Histogram of air temperature, coloured and filled by sex.
# - group_by() is dropped: grouping a data frame has no effect on how
#   ggplot() draws it.
# - bins is set explicitly to the value stat_bin() would otherwise pick,
#   which keeps the plot unchanged and silences the "Pick better value"
#   message.
ggplot(sharks, aes(x = air, color = sex, fill = sex)) +
  geom_histogram(bins = 30) +
  labs(
    title = "Air Temperature by Sex",
    x = "Air Temperature (°C)", # X-axis label
    y = "Count",                # Y-axis label
    color = "Sex",              # Legend title for color
    fill = "Sex"                # Legend title for fill
  ) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5)
  )
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
At this point, I believe we can safely say there does not appear to be a relevant correlation between air temperature and number of catches.
Blotching Correlation
Let’s move on to blotching. First, let’s see whether blotching time differs between male and female sharks.
Code
# Box plot of blotching time for each sex.
# theme_minimal() is a complete theme and replaces every theme setting added
# before it, so it must come first; the theme() tweaks that follow are then
# applied on top of it instead of being discarded.
ggplot(sharks, aes(x = sex, y = blotch, fill = sex)) +
  geom_boxplot() +
  labs(
    title = "Comparison of Blotching Time by Sex",
    x = "Shark Sex",
    y = "Blotching Time (seconds)"
  ) +
  scale_fill_manual(values = c("purple", "cyan4")) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 10),
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5)
  )
Review of the data shows that males begin blotching slightly after females, but the difference is not substantial. Let’s look at the same data for males and females caught twice.
Code
# First attempt at comparing the two blotching times of recaptured sharks.
# NOTE(review): x is mapped to blotch1 and y to blotch2, while the x-axis
# label reads "Sex"; the mapping is kept as written.
ggplot(sharksub, aes(x = blotch1, y = blotch2, fill = sex)) +
  geom_boxplot() +
  scale_fill_manual(values = c("purple", "cyan4")) +
  labs(
    title = "Comparison of Blotching Time by Sex",
    x = "Sex",
    y = "Blotching Time (Seconds)"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
This chart doesn’t look quite right, so let’s try a different variation. Here, we will reshape the first and second blotch times into a single column so that they can be compared side by side.
Code
library(tidyr)

# Reshape to long format: the blotch1 and blotch2 columns are stacked into
# one `blotching_time` column, with `catch_number` recording which of the two
# source columns each value came from.
sharks_long <- sharksub %>%
  pivot_longer(
    cols = c(blotch1, blotch2),
    names_to = "catch_number",
    values_to = "blotching_time"
  )

# Box plot of blotching time per catch, split by sex.
ggplot(sharks_long, aes(x = catch_number, y = blotching_time, fill = sex)) +
  geom_boxplot() +
  labs(
    title = "Blotch Time by Catch Number and Sex",
    x = "Catch Number",
    y = "Blotch Time (seconds)"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("purple", "cyan4")) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
Review of this data shows that for both males and females, blotch time increases by about one second with the second catch.
Let’s see if we can find any other data correlations before we continue much further.
Code
# Select only numeric columns for the correlation test.
# vapply() always returns a logical vector (sapply()'s return type depends on
# its input), and drop = FALSE keeps the result a data frame even if only a
# single column is numeric.
shark_numeric <- sharks[, vapply(sharks, is.numeric, logical(1)), drop = FALSE]
Code
# Compute the Pearson correlation matrix of the numeric columns.
cor_matrix <- cor(shark_numeric, method = "pearson")
Code
# Visualize the correlation matrix as circles.
corrplot(
  cor_matrix,
  method = "circle",
  title = "Correlation Matrix",
  mar = c(2, 2, 2, 2)
)
Blotch Time & Depth Correlation
It appears from the aforementioned graph that there is a correlation between depth and blotch time. We will now create a visual image of this to confirm, using a scatterplot comparing the depth to the blotch time and differentiating sex:
Code
# Scatter plot of depth (y) against blotch time (x), points coloured by sex,
# with a linear fit drawn in red and no confidence band.
ggplot(sharks, aes(x = blotch, y = depth, color = sex)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    x = "Blotch Time (Seconds)", # X-axis label with units
    y = "Depth (Metres)",        # Y-axis label with units
    color = "Sex",
    title = "Linear Regression"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
`geom_smooth()` using formula = 'y ~ x'
Blotch Predictors
Because we can clearly see a linear relationship between depth and blotch time, we should use a linear model to predict blotch time.
Let’s try a simple linear model:
Code
# Simple linear regression: blotch time as a function of capture depth.
model_simple <- lm(blotch ~ depth, data = sharks)
summary(model_simple)
Call:
lm(formula = blotch ~ depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.81869 -0.65427 -0.01035 0.58825 2.83116
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.82178 1.11207 8.832 <2e-16 ***
depth 0.50467 0.02216 22.772 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared: 0.5101, Adjusted R-squared: 0.5091
F-statistic: 518.6 on 1 and 498 DF, p-value: < 2.2e-16
Review of the information tells us the following: 1. The depth coefficient tells us that for every 1 metre deeper a shark is caught, blotching time increases by ~0.50467 seconds. In other words, the deeper a shark is caught from, the longer it takes to blotch.
Because the p-value is extremely small (< 2e-16), far below the 0.05 significance threshold, it is very unlikely that the relationship between blotch time and depth is due to chance, which agrees with what we saw in the scatterplot above.
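The residuals tell us that all predictions are within about 2.83 seconds of the observed values, and half of them are within roughly 0.65 seconds (the range between the first and third quartiles).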
Based on this model, depth explains approximately 51% of the variance in blotch time (R-squared = 0.5101), which is high for ecological and biological studies. However, because 49% of the variance is still unexplained, it is possible that depth combined with other factors such as stress, water/air temperature, or sex may improve the ability to predict blotch time.
As such, let’s add in additional variables to determine if we can further predict blotch time:
Code
# Multiple linear regression: blotch time modelled on depth plus the
# remaining candidate predictors.
model_multiple <- lm(
  blotch ~ depth + meta + air + water + weight + length + sex,
  data = sharks
)
summary(model_multiple)
Call:
lm(formula = blotch ~ depth + meta + air + water + weight + length +
sex, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-3.01869 -0.65130 -0.00238 0.62722 2.91144
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.7578141 1.7925782 6.001 3.8e-09 ***
depth 0.5036788 0.0220705 22.821 < 2e-16 ***
meta -0.0011650 0.0025656 -0.454 0.649972
air -0.0295655 0.0314362 -0.940 0.347427
water -0.0146552 0.0267927 -0.547 0.584638
weight 0.0017049 0.0033122 0.515 0.606964
length 0.0013489 0.0009576 1.409 0.159577
sexMale 0.3082889 0.0890047 3.464 0.000579 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9906 on 492 degrees of freedom
Multiple R-squared: 0.5252, Adjusted R-squared: 0.5184
F-statistic: 77.73 on 7 and 492 DF, p-value: < 2.2e-16
These results give us a bit more information, as follows:
On average, male sharks have a blotch time about 0.31 seconds longer than female sharks when the other predictors are held constant (this was also seen in the box plots comparing male and female blotch time).
Apart from depth and sex, none of the other predictors (meta, air, water, weight, and length) are statistically significant, and therefore it can be assumed that there are no other correlating factors in predicting blotch time.
Let’s do one last scatter plot to show these relationships (depth, blotch time, and sex):
Code
# Blotch time against depth, coloured by sex, with linear fits and no
# confidence band.
ggplot(sharks, aes(x = depth, y = blotch, color = sex)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Blotch Time vs Depth by Sex",
    x = "Depth (Metres)",
    y = "Blotch Time (Seconds)",
    color = "Sex"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
We can now say the most significant findings in this review are the following:
1. There is no correlation between weight, length, and sex.
2. There is no correlation between air and water.
3. There is no correlation between air temperature and number of catches.
4. There are no other correlations aside from depth and blotch.
5. Males begin blotching slightly later than females.
6. Multiple capture does have an effect on blotching time, which increases by about one second with the second catch.
7. It is possible to predict blotching time.