RMDA Summative Data Analysis

Install and load packages:

if (!require("tidyverse")) install.packages("tidyverse",  repos = "https://cran.rstudio.com/")
if (!require("knitr")) install.packages("knitr",  repos = "https://cran.rstudio.com/")
if (!require("gmodels")) install.packages("gmodels",  repos = "https://cran.rstudio.com/")
if (!require("ggplot2")) install.packages("ggplot2",  repos = "https://cran.rstudio.com/")
if (!require("ggpubr")) install.packages("ggpubr",  repos = "https://cran.rstudio.com/")
if (!require("xlsx")) install.packages("xlsx",  repos = "https://cran.rstudio.com/") ## For importing xlsx files
if (!require("corrplot")) install.packages("corrplot",  repos = "https://cran.rstudio.com/") ## For creating correlation plot visualisations 
if (!require("factoextra")) install.packages("factoextra",  repos = "https://cran.rstudio.com/") ## For creating PCA visualisations
if (!require("GGally")) install.packages("GGally",  repos = "https://cran.rstudio.com/")

## Checks if the required packages are already installed and if they aren't it runs the code to install them

library(tidyverse)
library(knitr)
library(gmodels)
library(ggplot2)
library(ggpubr)
library(xlsx)
library(corrplot)
library(factoextra)
library(GGally)

Load Data:

## Saves the data in the file as a tibble

sharks <- as_tibble(read.xlsx("sharks.xlsx", 1, TRUE)) 

sharksub <- as_tibble(read.xlsx("sharksub.xlsx", 1, TRUE))

Looking at the data:

summary(sharks)

      ID                sex                blotch           BPM       
 Length:500         Length:500         Min.   :30.78   Min.   :119.0  
 Class :character   Class :character   1st Qu.:34.16   1st Qu.:129.0  
 Mode  :character   Mode  :character   Median :35.05   Median :142.0  
                                       Mean   :35.13   Mean   :141.8  
                                       3rd Qu.:36.05   3rd Qu.:153.2  
                                       Max.   :40.08   Max.   :166.0  
     weight           length           air            water      
 Min.   : 65.10   Min.   :128.3   Min.   :33.00   Min.   :20.01  
 1st Qu.: 75.68   1st Qu.:172.0   1st Qu.:34.42   1st Qu.:21.55  
 Median : 87.82   Median :211.1   Median :35.43   Median :23.11  
 Mean   : 87.94   Mean   :211.0   Mean   :35.54   Mean   :23.02  
 3rd Qu.:100.40   3rd Qu.:251.8   3rd Qu.:36.71   3rd Qu.:24.37  
 Max.   :110.94   Max.   :291.0   Max.   :38.00   Max.   :25.99  
      meta            depth      
 Min.   : 50.03   Min.   :44.64  
 1st Qu.: 67.39   1st Qu.:48.90  
 Median : 82.45   Median :50.14  
 Mean   : 82.04   Mean   :50.14  
 3rd Qu.: 95.97   3rd Qu.:51.35  
 Max.   :112.45   Max.   :56.83

## It looks like all the values are reasonable with no apparent outliers

summary(sharksub)

      ID                sex               blotch1         blotch2     
 Length:50          Length:50          Min.   :32.49   Min.   :33.47  
 Class :character   Class :character   1st Qu.:34.38   1st Qu.:35.31  
 Mode  :character   Mode  :character   Median :34.94   Median :35.94  
                                       Mean   :35.03   Mean   :35.96  
                                       3rd Qu.:35.90   3rd Qu.:36.78  
                                       Max.   :37.07   Max.   :38.18

## As above

Looking at data structure:

str(sharks)

tibble [500 × 10] (S3: tbl_df/tbl/data.frame)
 $ ID    : chr [1:500] "SH001" "SH002" "SH003" "SH004" ...
 $ sex   : chr [1:500] "Female" "Female" "Female" "Male" ...
 $ blotch: num [1:500] 37.2 34.5 36.3 35.3 37.4 ...
 $ BPM   : num [1:500] 148 158 125 161 138 126 166 135 132 127 ...
 $ weight: num [1:500] 74.7 73.4 71.8 104.6 67.1 ...
 $ length: num [1:500] 187 189 284 171 264 ...
 $ air   : num [1:500] 37.7 35.7 34.8 36.2 33.6 ...
 $ water : num [1:500] 23.4 21.4 20.1 21.6 21.8 ...
 $ meta  : num [1:500] 64.1 73.7 54.4 86.3 108 ...
 $ depth : num [1:500] 53.2 49.6 49.4 50.3 49 ...

str(sharksub)

tibble [50 × 4] (S3: tbl_df/tbl/data.frame)
 $ ID     : chr [1:50] "SH269" "SH163" "SH008" "SH239" ...
 $ sex    : chr [1:50] "Female" "Female" "Female" "Female" ...
 $ blotch1: num [1:50] 36.1 33.4 36.3 35 35.7 ...
 $ blotch2: num [1:50] 37.2 34.4 36.5 36 36.8 ...

## The data types (num/chr) are correct for the data to be read and analysed.

Making sure that there aren’t any incorrect values within the “character” variables:

sharks %>%
  group_by(ID) %>% ## group the dataset into the unique values of ID
  count(ID) %>%    ## count the unique values of ID
  ungroup()        ## ungroup to ensure the next commands behave correctly

# A tibble: 500 × 2
   ID        n
   <chr> <int>
 1 SH001     1
 2 SH002     1
 3 SH003     1
 4 SH004     1
 5 SH005     1
 6 SH006     1
 7 SH007     1
 8 SH008     1
 9 SH009     1
10 SH010     1
# ℹ 490 more rows

                   ## repeat for both datasets and for sex
sharksub %>% 
  group_by(ID) %>% 
  count(ID) %>% 
  ungroup()

# A tibble: 50 × 2
   ID        n
   <chr> <int>
 1 SH004     1
 2 SH008     1
 3 SH029     1
 4 SH049     1
 5 SH059     1
 6 SH070     1
 7 SH087     1
 8 SH101     1
 9 SH108     1
10 SH110     1
# ℹ 40 more rows

sharks %>% 
  group_by(sex) %>% 
  count(sex) %>% 
  ungroup()

# A tibble: 2 × 2
  sex        n
  <chr>  <int>
1 Female   236
2 Male     264

sharksub %>% 
  group_by(sex) %>% 
  count(sex) %>% 
  ungroup()

# A tibble: 2 × 2
  sex        n
  <chr>  <int>
1 Female    25
2 Male      25

So there aren’t any duplicate IDs and there aren’t any variations for sex beyond “Female” and “Male” for both datasets.

Therefore, it looks like all the data has been loaded in in the correct format, without any outlying or incorrect values.

Visualising the data

Sharks

Sex

## Plot a bar chart showing distribution of sexes

p1 <- ggplot(sharks, aes(x = sex, fill = sex)) + ## save it as a variable to be displayed later
  geom_bar(show.legend = FALSE) +
  theme_minimal() +
  labs(x = NULL, y = "Count") + ## Define custom axes labels
  scale_y_continuous(breaks = seq(0,250, by = 50), limits = c(0, 275)) ## Change the scale of the y axis

Blotching times

p2 <- ggplot(sharks, aes(x = "", y = blotch, fill = "#F8766D")) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE, fill = "white", alpha = 0.8) + ## Overlay a boxplot
  theme_minimal() +
  labs(x = NULL, y = "Blotching time / s") +
  scale_y_continuous(breaks = seq(30,46, by = 2), limits = c(30, 42))## Plot a violin plot (with a boxplot) showing distribution of blotch times

BPM

p3 <- ggplot(sharks, aes(x = "", y = BPM, fill = "#F8766D")) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE, fill = "white", alpha = 0.8) +
  theme_minimal() +
  labs(x = NULL, y = "BPM")

Weight

p4 <- ggplot(sharks, aes(x = "", y = weight, fill = "#F8766D")) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE, fill = "white", alpha = 0.8) +
  theme_minimal() +
  labs(x = NULL, y = "Weight / kg")

Length

p5 <- ggplot(sharks, aes(x = "", y = length, fill = "#F8766D")) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE, fill = "white", alpha = 0.8) +
  theme_minimal() +
  labs(x = NULL, y = "Length / cm") +
  scale_y_continuous(breaks = seq(175,300, by = 25), limits = c(175, 300))

Air Temperature

p6 <- ggplot(sharks, aes(x = "", y = air, fill = "#F8766D")) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE, fill = "white", alpha = 0.8) +
  theme_minimal() +
  labs(x = NULL, y = "Air Temperature / degC")

Water Temperatures

p7 <- ggplot(sharks, aes(x = "", y = water, fill = "#F8766D")) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE, fill = "white", alpha = 0.8) +
  theme_minimal() +
  labs(x = NULL, y = "Water Temp. / degC")

Meta (corticosterone levels)

p8 <- ggplot(sharks, aes(x = "", y = meta, fill = "#F8766D")) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE, fill = "white", alpha = 0.8) +
  theme_minimal() +
  labs(x = NULL, y = "Meta / mcg/dl")

Depth caught at

p9 <- ggplot(sharks, aes(x = "", y = depth, fill = "#F8766D")) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE, fill = "white", alpha = 0.8) +
  theme_minimal() +
  labs(x = NULL, y = "Depth Caught / m")

Multiplot

## "multiplot" function definition taken from: http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/ [Accessed: 20/01/2025].

multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                    ncol = cols, nrow = ceiling(numPlots/cols))
  }

 if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

multiplot(p1, p2, p3, p4, p5, p6, p7, p8, p9, cols = 3) ## Plot all prev. graphs in one image

Warning: Removed 129 rows containing non-finite outside the scale range
(`stat_ydensity()`).

Warning: Removed 129 rows containing non-finite outside the scale range
(`stat_boxplot()`).

Visualising the variables in the ‘sharks’ dataset.

Sharksub

Sex

p11 <- ggplot(sharksub, aes(x = sex, fill = sex)) +
  geom_bar(show.legend = FALSE) +
  theme_minimal() +
  labs(x = "Sex", y = "Count") +
  coord_fixed(ratio = 1.333/25) ## Set aspect ratio of graph plot (in terms of axes)

Blotching times

## To plot both blotch1 and blotch2 on the same plot, it's easier to create a new dataset using pivot_longer.

sharksblotch <- sharksub %>% 
  pivot_longer(cols = c(blotch1, blotch2),
               names_to = "capture",
               values_to = "time")

sharksblotch <- sharksblotch %>% 
  mutate(capture = recode(capture, "blotch1" = "Capture_1", "blotch2" = "Capture_2")) ## Rename values to make the legend look better

sharksblotch ## Look at the tibble to ensure the code has done what's intended

# A tibble: 100 × 4
   ID    sex    capture    time
   <chr> <chr>  <chr>     <dbl>
 1 SH269 Female Capture_1  36.1
 2 SH269 Female Capture_2  37.2
 3 SH163 Female Capture_1  33.4
 4 SH163 Female Capture_2  34.4
 5 SH008 Female Capture_1  36.3
 6 SH008 Female Capture_2  36.5
 7 SH239 Female Capture_1  35.0
 8 SH239 Female Capture_2  36.0
 9 SH332 Female Capture_1  35.7
10 SH332 Female Capture_2  36.8
# ℹ 90 more rows

p12 <- ggplot(sharksblotch, aes(x = time, fill = capture)) +
  geom_density(alpha = 0.5) +
  theme_minimal() +
  labs(x = "Blotching time / s", y = "Density") +
  theme(legend.title=element_blank(), 
        legend.position = "inside",
        legend.position.inside = c(0.85, 0.85)) + ## Position legend within graph, in a sensible place, without a title
  coord_fixed(ratio = 8/0.7)

multiplot(p11, p12, cols = 2) ## Plot prev. graphs together

Visualising the variables in the ‘sharksub’ dataset.

These show the distributions of each variable within sharks and sharksub. Immediately, only blotch and depth look like they may have normal data, with the others showing a much more even range of values between their maximum and minimum.

The bimodal response of the weight variable may be due to a difference between male and female weights, so let’s look into this.

Data Analysis

Weight of each sex

Calculate mean weight of each sex

For this, we will need to use group_by(sex) and then summarise()

sharks %>% 
  group_by(sex) %>% ## Group data by the unique variables in sex (male or female)
  summarise("meanweight" = mean(weight)) %>% ## define a new variable as the mean weight to be displayed
  ungroup()

# A tibble: 2 × 2
  sex    meanweight
  <chr>       <dbl>
1 Female       88.1
2 Male         87.8

Plot weights for each sex

I wonder if a violin/box plot would show any difference:

ggplot(sharks, aes(x = sex, y = weight, fill = sex)) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE, fill = "white", alpha = 0.8) +
  theme_minimal() +
  labs(x = "Sex", y = "Weight/kg")

While both distributions have similar ranges, it can be seen that there is an increase in females at the higher weight and an increase in the males at the lower weight values. This may explain the bimodal distribution of weight. However, the distributions are nonetheless very similar.

Let’s check using a statistical test to see if this difference in weights is significant.

Normality test

First, we need to see if the data is normal by using the Shapiro-Wilks test for normality:

malesharks <- sharks %>% 
  filter(sex=="Male") ## Create new variable containing just the data of male sharks

femalesharks <- sharks %>% 
  filter(sex=="Female") ## Same as above for female sharks

## Shapiro for the male sharks

shapiromale <- shapiro.test(malesharks$weight)
print(shapiromale) ## Conduct Shapiro-Wilks test of normality on the male data, save it in a variable and then show the results.


    Shapiro-Wilk normality test

data:  malesharks$weight
W = 0.9483, p-value = 4.822e-08

## Shapiro for the female sharks

shapirofemale <- shapiro.test(femalesharks$weight)
print(shapirofemale) ## Same as above for the female data


    Shapiro-Wilk normality test

data:  femalesharks$weight
W = 0.94261, p-value = 5.264e-08

Both have p-values below 0.05, therefore we can say both sets are not normal.

Statistical test of medians

Now let’s compare the weight of males with the weight of females to see if there is a statistically significant difference.

## Data must be input as vectors into the wilcox.test() function.

malesharkweight <- pull(malesharks %>% 
  select(weight)) ## Removes all other columns except the malesharks column; pull "pulls" a vector from the tibble

femalesharkweight <- pull(femalesharks %>% 
  select(weight)) ## As above

wilcox.test(malesharkweight, femalesharkweight)


    Wilcoxon rank sum test with continuity correction

data:  malesharkweight and femalesharkweight
W = 30662, p-value = 0.7615
alternative hypothesis: true location shift is not equal to 0

So we have a p-value of 0.7615, therefore there isn’t a significant difference in the means of the weight of the males and females.

BPM of each sex

Calculate mean bpm of each sex

sharks %>% 
  group_by(sex) %>% ## Group data by the unique variables in sex (male or female)
  summarise("meanbpm" = mean(BPM)) %>% ## define a new variable as the mean bpm to be displayed
  ungroup()

# A tibble: 2 × 2
  sex    meanbpm
  <chr>    <dbl>
1 Female    142.
2 Male      142.

Plot BPMs for each sex

ggplot(sharks, aes(x = sex, y = BPM, fill = sex)) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE, fill = "white", alpha = 0.8) +
  theme_minimal() +
  labs(x = "Sex", y = "BPM")

There is no discernible difference between the sexes for BPM.

Is there a correlation between the variables air and water?

This is looking at two continuous variables so we need to use a linear model.

The air variable is the ambient air temperature
The water variable is the surface water temperature

Both recorded in degrees Celsius.

It is expected that there would be a slight correlation, since the water would be heated by the air. However, the temperature of the water is buffered by the mass and heat capacity of it, so whether this effect would be visible or measurable in this data is questionable.

Plotting air temperature vs water temperature for each capture

ggplot(sharks, aes(air, water)) + 
  geom_point() +
  geom_smooth(method = "lm",    # Adds trendline; lm makes it a linear model
              se = TRUE) +     # Turns on standard error shading
  theme_minimal() +
  labs(x = "Air / Degrees Celsius", y = "Water / Degrees Celsius")

`geom_smooth()` using formula = 'y ~ x'

So from this it looks like there is no discernible correlation.

Lets run a statistical test. Since we are looking for a correlation, we should use a correlation test. To find out which one, I’ll conduct a Shapiro-Wilks test first to determine normality.

Normality tests

#|label: testing normality for air and water data

shapiroair <- shapiro.test(sharks$air)
print(shapiroair) ## Testing normality for the air data within sharks


    Shapiro-Wilk normality test

data:  sharks$air
W = 0.95885, p-value = 1.338e-10

shapirowater <- shapiro.test(sharks$water)
print(shapirowater) ## As above but for water


    Shapiro-Wilk normality test

data:  sharks$water
W = 0.96035, p-value = 2.371e-10

These results show that there is a statistically significant likelihood that the data for both air and water within sharks is not normal (at the 0.05 significance level).

Spearman Rank Correlation test for correlation of non-normal data:

cor.test(sharks$water, sharks$air, method = "spearman") ## Conduct a Spearman test of correlation on the air and water variables within sharks.


    Spearman's rank correlation rho

data:  sharks$water and sharks$air
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.05637344

So we have a p-value of 0.2082 (>0.05) so this isn’t statistically significant. This shows that there is a 21% probability that a random dataset would produce the same distribution.

The corr variable (Spearmans’ r) = -0.0564, showing a VERY weak relationship between the two variables.

So we conclude that there is no corrlation between air and water.

Does multiple capture have an effect on blotching time?

What is this asking?

When a shark is captured twice, is there a significant change in the time it takes for blotching to occur? Sounds simple enough! So I need to see if the difference between blotch1 and blotch2 (in sharksub) is significant.

Plotting blotching times

sharksub <- sharksub %>% 
  mutate(blotchdiff = (blotch2 - blotch1)) ## Define a new variable as the difference between the blotching times

str(sharksub) ## Check structure of sharksub to ensure above code worked as intended

tibble [50 × 5] (S3: tbl_df/tbl/data.frame)
 $ ID        : chr [1:50] "SH269" "SH163" "SH008" "SH239" ...
 $ sex       : chr [1:50] "Female" "Female" "Female" "Female" ...
 $ blotch1   : num [1:50] 36.1 33.4 36.3 35 35.7 ...
 $ blotch2   : num [1:50] 37.2 34.4 36.5 36 36.8 ...
 $ blotchdiff: num [1:50] 1.082 1.002 0.166 1.05 1.071 ...

All looks good!

ggplot(sharksub, aes(blotch1, blotch2)) + 
  geom_point() +
  geom_smooth(method = "lm",    # Adds trendline; lm makes it a linear model
              se = TRUE) +     # Turns on standard error shading
  theme_minimal() +
  labs(x = "Blotching time on first capture / s", y = "Blotching time on second capture / s") ## Plot a scatter plot showing blotching times against each other, with the time for the first capture on the x axis and the time for the second on the y axis.

`geom_smooth()` using formula = 'y ~ x'

So it seems that there is a very clear relationship, with the second capture sharks taking approximately 1 second longer to blotch.

There are also 5 outliers which have a smaller difference.

ggplot(sharksub, aes(blotchdiff)) + 
  geom_histogram(binwidth = 0.125) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(-1, 1.5, by = 0.25), label = seq(-1, 1.5, by = 0.25)) + ## Assigns custom line breaks and custom x axis labels on graph
  labs(x = "Difference in Blotching Time Between First and Second Capture / s", y = "Count") ## Plot a histogram of the difference in blotching times.

Here we can see the above data in a different form, with the majority of the differences centered around 1 second, but with some that are less. Interestingly, some have a faster time to blotch on second capture.

Normality test

shapiroblotch1 <- shapiro.test(sharksub$blotch1)
print(shapiroblotch1) ## Shapiro-Wilks test of normality on blotch1 data


    Shapiro-Wilk normality test

data:  sharksub$blotch1
W = 0.97958, p-value = 0.5345

shapiroblotch2 <- shapiro.test(sharksub$blotch2)
print(shapiroblotch2) ## As above for blotch2


    Shapiro-Wilk normality test

data:  sharksub$blotch2
W = 0.97936, p-value = 0.5255

shapiroblotchdiff <- shapiro.test(sharksub$blotchdiff) 
print(shapiroblotchdiff) ## Shapiro test for difference in blotching times


    Shapiro-Wilk normality test

data:  sharksub$blotchdiff
W = 0.43157, p-value = 1.212e-12

The blotching times themselves are both normally distributed.

The Shapiro-Wilk test of the differences between the blotching times gives a p-value <0.0001 and thus is very likely to be not normal.

Statistical test

We want to see if the differences between the times for blotching after the first and second captures are statistically different to 0.

A Wilcoxon signed rank test can do this for non-normal data.

We will use a theoretical median = 0 to represent no difference between the two blotching times.

theor_median <- 0 ## Define the theoretical median as 0

sharksub %>% 
  select(blotchdiff) %>% 
  pull() %>% ## "pulls" a vector from the tibble since wilcox.test() req. a vector
  wilcox.test(mu = theor_median) ## Use dplyr/magrittr packages to input difference in blotching time data into t.test, using a theoretical mean of 0.


    Wilcoxon signed rank test with continuity correction

data:  .
V = 1263, p-value = 1.606e-09
alternative hypothesis: true location is not equal to 0

This shows that the p-value is 1.606e-9, and thus we can say that the median is statistically significantly different to 0.

Is it possible to predict blotching time?

This is asking whether any of the other variables we have been given are a good predictor of the blotching time. It could also be a combination of other variables, found using a PCA analysis.

The first step is to plot blotch against each of the other variables given and look for trends.

Plotting against sex

a1 <- ggplot(sharks, aes(x = sex, y = blotch, fill = sex)) +
  geom_violin(show.legend = FALSE) +
  geom_boxplot(width = 0.1, show.legend = FALSE, fill = "white", alpha = 0.8) +
  theme_minimal() +
  labs(y = "Blotching time / s", x = "Sex") ## Plot a violin plot (with a boxplot) showing distribution of blotching times for each sex

This shows that the blotching time is similar for each sex, with Males exhibiting a slightly longer time.

Plotting against BPM

a2 <- ggplot(sharks, aes(x = BPM, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm",  
              se = TRUE) +
  theme_minimal() +
  labs(y = "Blotching time / s", x = "Heart rate / bpm")

Plotting against weight

a3 <- ggplot(sharks, aes(x = weight, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm",  
              se = TRUE) +   
  theme_minimal() +
  labs(y = "Blotching time / s", x = "Weight / kg")

Plotting against length

 a4 <- ggplot(sharks, aes(x = length, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm", 
              se = TRUE) +  
  theme_minimal() +
  labs(y = "Blotching time / s", x = "Length / cm")

Plotting against air

a5 <- ggplot(sharks, aes(x = air, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm", 
              se = TRUE) +  
  theme_minimal() +
  labs(y = "Blotching time / s", x = "Air Temp. / DegC")

Plotting against water

a6 <- ggplot(sharks, aes(x = water, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm",  
              se = TRUE) + 
  theme_minimal() +
  labs(y = "Blotching time / s", x = "Water Temp. / DegC")

Plotting against meta

a7 <- ggplot(sharks, aes(x = meta, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm",  
              se = TRUE) +  
  theme_minimal() +
  labs(y = "Blotching time / s", x = "Meta / mcg/dl")

Plotting against depth

a8 <- ggplot(sharks, aes(x = depth, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm", 
              se = TRUE) + 
  theme_minimal() +
  labs(y = "Blotching time / s", x = "Depth Hooked / m")

multiplot(a1, a2, a3, a4, a5, a6, a7, a8, cols = 4)

`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

Multiplot for predicting blotching time.

Seeing as there is a trend against depth, let’s investigate further.

Correlation plot for the prev. graphs:

sharks %>%
  select(blotch:depth) %>% ## Only select the numeric variables within sharks
  cor() %>% ## Make the correlation matrix
  round(digits = 2) -> sharkscor ## Round the matrix results to 2 digits and save the output as "sharkscor"

sharkscor

       blotch   BPM weight length   air water  meta depth
blotch   1.00 -0.03   0.01  -0.02 -0.04 -0.05 -0.01  0.71
BPM     -0.03  1.00   0.02  -0.07 -0.07  0.02 -0.01 -0.01
weight   0.01  0.02   1.00  -0.02 -0.05  0.09  0.02 -0.01
length  -0.02 -0.07  -0.02   1.00 -0.03 -0.06  0.00 -0.08
air     -0.04 -0.07  -0.05  -0.03  1.00 -0.06  0.13 -0.01
water   -0.05  0.02   0.09  -0.06 -0.06  1.00  0.02 -0.04
meta    -0.01 -0.01   0.02   0.00  0.13  0.02  1.00  0.01
depth    0.71 -0.01  -0.01  -0.08 -0.01 -0.04  0.01  1.00

corrplot.mixed(sharkscor, 
         upper = "color",
         order = "alphabet", 
         tl.col = "black",
         bg = "gold2") ## Altering the display to make the values easier to read

This shows that only blotching time and depth are correlated.

Normality tests of blotching time and depth

shapiroblotch <- shapiro.test(sharks$blotch)
print(shapiroblotch) ## Conduct shapiro test for blotch time (note that this is different to prev. blotching tests due to a larger dataset)


    Shapiro-Wilk normality test

data:  sharks$blotch
W = 0.99695, p-value = 0.4769

shapirodepth <- shapiro.test(sharks$depth)
print(shapirodepth) ## As above for depth


    Shapiro-Wilk normality test

data:  sharks$depth
W = 0.99746, p-value = 0.6485

So both datasets are normal.

Statistical test of blotching time and depth

Use Pearsons test as the data is normal

cor.test(sharks$depth, sharks$blotch)


    Pearson's product-moment correlation

data:  sharks$depth and sharks$blotch
t = 22.772, df = 498, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6683963 0.7546509
sample estimates:
      cor 
0.7142247

This result shows that there is a strong positive correlation (rho = 0.7142) and it is statistically significant(p-value < 0.05).

Linear regression to find an equation linking blotching time and depth

sharks %>% 
  lm(blotch ~ depth, data = .) %>% ## 
  summary() ## Requires summary to print full output.


Call:
lm(formula = blotch ~ depth, data = .)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.81869 -0.65427 -0.01035  0.58825  2.83116 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.82178    1.11207   8.832   <2e-16 ***
depth        0.50467    0.02216  22.772   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared:  0.5101,    Adjusted R-squared:  0.5091 
F-statistic: 518.6 on 1 and 498 DF,  p-value: < 2.2e-16

This gives us the coefficients: Intercept = 9.82178, depth = 0.50467.

We can add them to a graph using geom_line():

## To do this I'll add a new plot to the graph above of depth vs blotch, using data points that fit y = 9.82178 + 0.50467*x

modelline <- function(x) {
  9.82178 + 0.50467*x} ## The function

x_values <- seq(40, 60) ## Create a sequence of values to cover the necessary range on the graph

y_values <- modelline(x_values) ## Compute the y values using both of the above

model_data <- data.frame(x = x_values, y = y_values) ## Create a dataframe from the calculated values

ggplot(sharks, aes(x = depth, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm",    # Adds trendline; lm makes it a linear model
              se = TRUE) + 
    geom_line(data = model_data, aes(x = x, y =y), color = "red") + ## New geom input for the model data
  theme_minimal() +
  labs(y = "Blotching time / s", x = "Depth Hooked / m")

`geom_smooth()` using formula = 'y ~ x'

Here I’ve reproduced the previous plot but with a new line created from the output of the linear regression. As expected, it follows the same path as that generated by the geom_smooth() linear model method.

ggplot(sharks, aes(x = depth, y = blotch)) +
  geom_point() +
    geom_line(data = model_data, aes(x = x, y =y), color = "red") + ## New geom input for the model data
  theme_minimal() +
  labs(y = "Blotching time / s", x = "Depth Hooked / m")

This plot is only of the modeled line for clarity.

Conducting a PCA on blotching time

sharkpca <- prcomp(sharks[,c(3:10)], ## Perform the PCA on the numeric components of sharks
                   center = TRUE,  
                   scale. = TRUE) ## Standardizes the data

summary(sharkpca) ## Show the main results of the PCA

Importance of components:
                         PC1    PC2    PC3    PC4    PC5    PC6    PC7     PC8
Standard deviation     1.314 1.0887 1.0538 1.0135 0.9741 0.9506 0.9027 0.53084
Proportion of Variance 0.216 0.1482 0.1388 0.1284 0.1186 0.1129 0.1019 0.03522
Cumulative Proportion  0.216 0.3641 0.5030 0.6314 0.7500 0.8629 0.9648 1.00000

fviz_eig(sharkpca) ## Scree plot

fviz_pca_biplot(sharkpca, label = "var", addEllipses = FALSE) ## PCA Biplot showing how the different variables relate to the two dimensions that contribute most to the variation in the dataset

Unfortunately the Scree plot shows that the first two dimensions don’t make up a significant proportion of the explanation of the data. Because of this, the biplot only explains 36.4% of the variation in the data, and thus is fairly ineffective.

However, on the biplot it is nonetheless telling that both blotch and depth have identical eigenvectors.

Using GGally

ggpairs(sharks[,c(2:10)], ggplot2::aes(colour = sex)) +
  theme_minimal() +
  theme(axis.text = element_text(size = 8),
        strip.text = element_text(size = 8))

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

This is extremely messy, and I can’t work out how to tidy it up - by changing text size or altering the transparency of the data on each plot. Seeing as it doesn’t add any new data, I’ll probably leave it out of the final report.

Testing change in temperature between water and air

sharks <- sharks %>% 
  mutate(temp_diff = air - water) ## Create new variable in sharks of the difference in temperature
sharks

# A tibble: 500 × 11
   ID    sex    blotch   BPM weight length   air water  meta depth temp_diff
   <chr> <chr>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>     <dbl>
 1 SH001 Female   37.2   148   74.7   187.  37.7  23.4  64.1  53.2      14.4
 2 SH002 Female   34.5   158   73.4   189.  35.7  21.4  73.7  49.6      14.3
 3 SH003 Female   36.3   125   71.8   284.  34.8  20.1  54.4  49.4      14.7
 4 SH004 Male     35.3   161  105.    171.  36.2  21.6  86.3  50.3      14.5
 5 SH005 Female   37.4   138   67.1   264.  33.6  21.8 108.   49.0      11.9
 6 SH006 Male     33.5   126  110.    270.  36.4  20.9 109.   46.8      15.5
 7 SH007 Male     36.7   166  101.    194.  33.1  21.8  99.7  49.1      11.3
 8 SH008 Female   36.3   135  101.    128.  36.8  21.3  96.3  50.6      15.5
 9 SH009 Male     35.4   132   95.0   269.  35.3  22.2  79.0  49.9      13.1
10 SH010 Female   36.0   127   71.3   163.  35.7  24.6  72.3  51.2      11.1
# ℹ 490 more rows

## Plot the temperature difference against the variables most likely to be related to stress
## Just doing simple plots to quickly see if any correlations are seen.

ggplot(sharks, aes(temp_diff, blotch)) +
  geom_point()

ggplot(sharks, aes(temp_diff, BPM)) +
  geom_point()

ggplot(sharks, aes(temp_diff, meta)) + 
  geom_point()

Potential slight correlation in temperature difference vs blotch - worth investigating further, though it’s likely non-significant.

## Test data for normality

shapirotemp <- shapiro.test(sharks$temp_diff)
shapirotemp


    Shapiro-Wilk normality test

data:  sharks$temp_diff
W = 0.98944, p-value = 0.001154

## Therefore non-normal data, use non-parametric test:

cor.test(sharks$temp_diff, sharks$blotch, method = "spearman")


    Spearman's rank correlation rho

data:  sharks$temp_diff and sharks$blotch
S = 20660142, p-value = 0.8529
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
0.008309217

## Result: not significant.