ARES4011 Submission

Author

Martin Cooper

# load the tidyverse package
library(tidyverse)

Sharks Data

Initial Data Import and checking

# Call data and create data frame. 
library(readxl)
sharks <- read_excel("D:/ARES40011 Rsrch Methods & Data Analysis/Summative/sharks.xlsx")
sharks <-data.frame(sharks)

# Summarize the data frame
summary(sharks)

      ID                sex                blotch           BPM       
 Length:500         Length:500         Min.   :30.78   Min.   :119.0  
 Class :character   Class :character   1st Qu.:34.16   1st Qu.:129.0  
 Mode  :character   Mode  :character   Median :35.05   Median :142.0  
                                       Mean   :35.13   Mean   :141.8  
                                       3rd Qu.:36.05   3rd Qu.:153.2  
                                       Max.   :40.08   Max.   :166.0  
     weight           length           air            water      
 Min.   : 65.10   Min.   :128.3   Min.   :33.00   Min.   :20.01  
 1st Qu.: 75.68   1st Qu.:172.0   1st Qu.:34.42   1st Qu.:21.55  
 Median : 87.82   Median :211.1   Median :35.43   Median :23.11  
 Mean   : 87.94   Mean   :211.0   Mean   :35.54   Mean   :23.02  
 3rd Qu.:100.40   3rd Qu.:251.8   3rd Qu.:36.71   3rd Qu.:24.37  
 Max.   :110.94   Max.   :291.0   Max.   :38.00   Max.   :25.99  
      meta            depth      
 Min.   : 50.03   Min.   :44.64  
 1st Qu.: 67.39   1st Qu.:48.90  
 Median : 82.45   Median :50.14  
 Mean   : 82.04   Mean   :50.14  
 3rd Qu.: 95.97   3rd Qu.:51.35  
 Max.   :112.45   Max.   :56.83

The summary output matches the expectation from the data set so data exploration can continue

#Checking the data structure
str(sharks)

'data.frame':   500 obs. of  10 variables:
 $ ID    : chr  "SH001" "SH002" "SH003" "SH004" ...
 $ sex   : chr  "Female" "Female" "Female" "Male" ...
 $ blotch: num  37.2 34.5 36.3 35.3 37.4 ...
 $ BPM   : num  148 158 125 161 138 126 166 135 132 127 ...
 $ weight: num  74.7 73.4 71.8 104.6 67.1 ...
 $ length: num  187 189 284 171 264 ...
 $ air   : num  37.7 35.7 34.8 36.2 33.6 ...
 $ water : num  23.4 21.4 20.1 21.6 21.8 ...
 $ meta  : num  64.1 73.7 54.4 86.3 108 ...
 $ depth : num  53.2 49.6 49.4 50.3 49 ...

We can see that there are 500 observations of 11 variables, with 1 character variable (sex) and 9 continuous variables.

#Checking for missing data
colSums(is.na(sharks))

    ID    sex blotch    BPM weight length    air  water   meta  depth 
     0      0      0      0      0      0      0      0      0      0

There is no missing data.

Outliers

As there are two data types, we need to check in different ways for outliers

boxplot(blotch ~ sex, ylab = 'Blotch Time (s)', xlab = 'Sex', data = sharks, outpch = 16, las=1)

This plot suggests that there is a difference between the means of blotch time by sex (different means). There are also 3 possible outliers in blotch time.

Note

Note: If doing further analysis of blotch time by sex, we will need to remeber to take out these outliers.

To check for outliers in the continuous variables using Cleveland dotplotts.

#install libraries
library(lattice)
echo=TRUE
results='hide'
# Define column names
Names <- c("BPM", "weight", "length", "air", "water", "meta", "depth")

# Create dot plot
dotplot(as.matrix(sharks[, Names]), 
        groups = FALSE, 
        strip = strip.custom(bg = 'white'), 
        par.strip.text = list(cex = 1.2),
        scales = list(x = list(relation = "free", draw = TRUE),
                      y = list(relation = "free", draw = FALSE)), 
        col = 1, cex = 1, pch = 16,
        xlab = list(label = "Value of the variable", cex = 1.2),
        ylab = list(label = "Order of the data", cex = 1.2))

The data is spread, but there are only possible outliers in the depth dataset. We can test these specifically if we want to conduct any futher analysis.

Normality Tests

# Create frequency polygons using different binning methods
ggplot(data = sharks, aes(x = blotch)) +
  geom_freqpoly(bins = 15, color = "blue", linetype = "solid", size = 1) +
  labs(title = "Frequency Polygon of Blotch Time",
       x = "Blotch Time (s)",
       y = "Frequency")

Our varibale is normal. To confirm:

shapiro.test(sharks$blotch)


    Shapiro-Wilk normality test

data:  sharks$blotch
W = 0.99695, p-value = 0.4769

Data is confirmed as normal.

Multicollinearity among covariates

This is to test if the independant variables are corelated

library(Hmisc)
echo=TRUE
results='hide'
# Define column names
Coll <- c("BPM", "weight", "length", "air", "water", "meta", "depth")

# Calculate correlation matrix with p-values
cor_results <- rcorr(as.matrix(sharks[, Coll]), type = "pearson")

# Correlation coefficients
cor_matrix <- cor_results$r

# Display correlation coefficients
print(cor_matrix)

                BPM       weight      length         air       water
BPM     1.000000000  0.017036558 -0.06856053 -0.06841209  0.02451337
weight  0.017036558  1.000000000 -0.01959676 -0.05264537  0.08633875
length -0.068560532 -0.019596758  1.00000000 -0.03027426 -0.05940708
air    -0.068412093 -0.052645366 -0.03027426  1.00000000 -0.05524051
water   0.024513368  0.086338753 -0.05940708 -0.05524051  1.00000000
meta   -0.006016429  0.019601470  0.00302851  0.12531801  0.02249461
depth  -0.012173520 -0.006057435 -0.08334774 -0.01188199 -0.04088851
               meta        depth
BPM    -0.006016429 -0.012173520
weight  0.019601470 -0.006057435
length  0.003028510 -0.083347736
air     0.125318005 -0.011881989
water   0.022494605 -0.040888511
meta    1.000000000  0.008150764
depth   0.008150764  1.000000000

This is a table of the corrrelation values

# P-values
p_matrix <- cor_results$P

#Display P-values
print(p_matrix)

             BPM     weight    length         air      water        meta
BPM           NA 0.70392824 0.1257632 0.126586449 0.58448465 0.893247627
weight 0.7039282         NA 0.6620083 0.239972797 0.05368554 0.661932090
length 0.1257632 0.66200832        NA 0.499413144 0.18476370 0.946143700
air    0.1265864 0.23997280 0.4994131          NA 0.21755131 0.005012136
water  0.5844847 0.05368554 0.1847637 0.217551315         NA 0.615808758
meta   0.8932476 0.66193209 0.9461437 0.005012136 0.61580876          NA
depth  0.7859781 0.89252444 0.0625624 0.790983481 0.36156444 0.855736961
           depth
BPM    0.7859781
weight 0.8925244
length 0.0625624
air    0.7909835
water  0.3615644
meta   0.8557370
depth         NA

This is a table of the significance of these correlations.

Note

Note: We can see that the only statistically signifcant correlation is between air and meta - Air Temperature and Cortizol level.

I found this easier to spot using graphics

library(GGally)

# Define column names
Names <- c("BPM", "weight", "length", "air", "water", "meta", "depth")

# Create a scatter plot matrix
ggpairs(sharks[, Names],
        lower = list(continuous = wrap("points", size = 1, alpha = 0.6)),
        diag = list(continuous = wrap("barDiag", fill = "lightblue")),
        upper = list(continuous = wrap("cor", size = 4))) +
  theme(panel.background = element_blank(),
        panel.border = element_rect(fill = NA, size = 1),
        strip.background = element_rect(fill = "white", color = "white", size = 1),
        text = element_text(size = 12))

Relations among dependent and independant variables

To see if there are any relationships between the blotch time and the variables collected. Given the difference in blotch time by sex, that split has been inluded to highlight if there are any sex spefic significance, the overall correlation and p-value are also displayed

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$BPM, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

#Plot
ggplot(sharks, aes(x = BPM, y = blotch, color=sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", 
                                        color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and BPM", 
       x = "BPM", 
       y = "Blotch Time",
       color="sex")+
  annotate("text", x = 155, y = 39, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$weight, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = weight, y = blotch, color=sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", 
                                        color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Weight", 
       x = "Weight", 
       y = "Blotch Time",
       color="sex")+
    annotate("text", x = 70, y=39.05, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$length, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = length, y = blotch, color=sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", 
                                        color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Length", 
       x = "Length", 
       y = "Blotch Time",
       color="sex")+
  annotate("text", x = 150, y=39, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$air, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = air, y = blotch, colour=sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", 
                                        color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Air Temperature", 
       x = "Air Tempurature", 
       y = "Blotch Time",
       colour="sex")+
  annotate("text", x = 37, y=39, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$water, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = water, y = blotch, color = sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Surface Water Temperature", 
       x = "Surface Water Temperature", 
       y = "Blotch Time",
       color = "Sex") +
  annotate("text", x = 21.5, y = 39.5, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$meta, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = meta, y = blotch, color = sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Cortisol", 
       x = "Cortisol", 
       y = "Blotch Time",
       color = "Sex") +
  annotate("text", x = 57, y = 39.5, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$depth, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = depth, y = blotch, color = sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Depth of Capture", 
       x = "Depth", 
       y = "Blotch Time",
       color = "Sex") +
  annotate("text", x = 46, y = 39.5, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

Note

Note: We can see that the only statistically signifcant correlation is between blotch time and depth.

#Analysis of Shark Data

t.test(blotch~sex,
       data=sharks,
       var.equal=TRUE)


    Two Sample t-test

data:  blotch by sex
t = -3.023, df = 498, p-value = 0.002632
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
 -0.6326914 -0.1342420
sample estimates:
mean in group Female   mean in group Male 
            34.92294             35.30641

This confimes that there is a significant differnce between male and female blotch times.

# Create a subset for females
sharks_female <- sharks %>%
  filter(sex == "Female")

# Create a subset for males
sharks_male <- sharks %>%
  filter(sex == "Male")

Given the difference between the sex and the significance of the correlation between depth and blotch time, we should calculate the stats for this.

# Run correlation tests for females and males
cor_test_female <- cor.test(sharks_female$depth, sharks_female$blotch)
correlation_female <- cor_test_female$estimate
p_value_female <- cor_test_female$p.value

cor_test_male <- cor.test(sharks_male$depth, sharks_male$blotch)
correlation_male <- cor_test_male$estimate
p_value_male <- cor_test_male$p.value

# Plot for combined data
ggplot(sharks, aes(x = depth, y = blotch, color = sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Depth by Sex", 
       x = "Depth", 
       y = "Blotch Time",
       color = "Sex") +
  annotate("text", x = 48, y = 39.5, 
           label = paste("Female:\nCorrelation:", round(correlation_female, 2), 
                         "\nP-value:", format.pval(p_value_female, digits = 2)), 
           size = 5, color = "blue") +
  annotate("text", x = 54, y = 32.5, 
           label = paste("Male:\nCorrelation:", round(correlation_male, 2), 
                         "\nP-value:", format.pval(p_value_male, digits = 2)), 
           size = 5, color = "red")

`geom_smooth()` using formula = 'y ~ x'

Sharsub

Initial Data Import and checking

# Call data and create data frame. 
library(readxl)
sharksub <- read_excel("D:/ARES40011 Rsrch Methods & Data Analysis/Summative/sharksub (1).xlsx")
sharksub <-data.frame(sharksub)

# Summarize the data frame
summary(sharksub)

      ID                sex               blotch1         blotch2     
 Length:50          Length:50          Min.   :32.49   Min.   :33.47  
 Class :character   Class :character   1st Qu.:34.38   1st Qu.:35.31  
 Mode  :character   Mode  :character   Median :34.94   Median :35.94  
                                       Mean   :35.03   Mean   :35.96  
                                       3rd Qu.:35.90   3rd Qu.:36.78  
                                       Max.   :37.07   Max.   :38.18

str(sharksub)

'data.frame':   50 obs. of  4 variables:
 $ ID     : chr  "SH269" "SH163" "SH008" "SH239" ...
 $ sex    : chr  "Female" "Female" "Female" "Female" ...
 $ blotch1: num  36.1 33.4 36.3 35 35.7 ...
 $ blotch2: num  37.2 34.4 36.5 36 36.8 ...

Checking for missing data

colSums(is.na(sharksub))

     ID     sex blotch1 blotch2 
      0       0       0       0

##Outliers

For each capture event

#Looking for differences between male and felame for each capture

# Create a boxplot with color by sex
ggplot(sharksub, aes(x = sex, y = blotch1, fill = sex)) +
  geom_boxplot(outlier.shape = 16) +
  labs(title = "Boxplot of Blotch1 Time by Sex",
       x = "Sex",
       y = "Blotch1 Time (s)") +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1))

This looks like there may be significance.

#Looking for differences between male and felame for each capture
# Create a boxplot with color by sex
ggplot(sharksub, aes(x = sex, y = blotch2, fill = sex)) +
  geom_boxplot(outlier.shape = 16) +
  labs(title = "Boxplot of Blotch1 Time by Sex",
       x = "Sex",
       y = "Blotch2 Time (s)") +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1))

Note

Note: We can see that there may be a difference between male and female blotch time in both capture events.

##**Normality Tests

#To make it easier, group data by individual
sharksub_long <- sharksub %>%
  pivot_longer(cols = c(blotch1, blotch2), names_to = "variable", values_to = "value")

# Plot histograms
ggplot(data = sharksub_long, aes(x = value, fill = variable)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "identity") +
  facet_wrap(~ variable, scales = "free") +
  scale_fill_manual(values = c("blotch1" = "blue", "blotch2" = "red")) +
  labs(title = "Histograms of Blotch1 and Blotch2 Time",
       x = "Blotch Time (s)",
       y = "Frequency",
       fill = "Blotch Type") +
  theme_minimal()

Note

Note: We can see that this is normally distributed.

ggplot(data = sharksub_long, aes(x = value, fill = sex)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "identity") +
  facet_wrap(sex ~ variable, scales = "free") +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  labs(title = "Histograms of Blotch1 and Blotch2 Time Grouped by Sex",
       x = "Blotch Time (s)",
       y = "Frequency",
       fill = "Sex") +
  theme_minimal()

Note

Note: We can see that this is not normally distributed when grouped by sex.

Looking at Differences between blotch1 and blotch2

# Perform paired t-test
t_test_result2 <- t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
p_value2 <- t_test_result2$p.value
print(t_test_result2)


    Paired t-test

data:  sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384

# Plot box plots with t-test annotation
ggplot(data = sharksub_long, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(alpha = 0.5) +
  facet_wrap(~ variable, scales = "free") +
  scale_fill_manual(values = c("blotch1" = "blue", "blotch2" = "red")) +
  labs(title = "Box Plots of Blotch1 and Blotch2 Time",
       x = "Blotch Type",
       y = "Blotch Time (s)",
       fill = "Blotch Event") +
  theme_minimal() +
  annotate("text", x = 1., y = max(sharksub_long$value), 
           label = paste(format.pval(p_value2, digits = 3)), 
           size = 5, color = "black")

Note

Note: We can see that there is a significant diference between the time of blotch1 and blotch2.

Looking at Differences between blotch1 and blotch2 with sex

# Split the data by sex
sharksub_female <- sharksub_long %>% filter(sex == "Female")
sharksub_male <- sharksub_long %>% filter(sex == "Male")

# Run paired t-tests within each sex
t_test_female <- t.test(sharksub_female$value[sharksub_female$variable == "blotch1"], 
                        sharksub_female$value[sharksub_female$variable == "blotch2"], 
                        paired = TRUE)
#remebering that normality cannot be assumed for the male subset - ranked test used instead of standard t.test
t_test_male <- wilcox.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)

# Create a data frame to store the t-test results
t_test_results3 <- data.frame(
  sex = c("Female", "Male"),
  p_value = c(t_test_female$p.value, t_test_male$p.value)
)
print(t_test_results3)

     sex      p_value
1 Female 1.699676e-13
2   Male 1.606456e-09

# Plot box plot
ggplot(data = sharksub_long, aes(x = sex, y = value, fill = sex)) +
  geom_boxplot(alpha = 0.5, outlier.shape = 16) +
  facet_wrap(~ variable, scales = "free") +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  labs(title = "Box Plots of Blotch1 and Blotch2 Time Grouped by Sex",
       x = "Sex",
       y = "Blotch Time (s)",
       fill = "Sex") +
  theme_minimal()

Note

Note: We can see that there is a significant diference between the time of blotch1 and blotch2 when controlled by sex as well.

It is worth assessing if there is significance in the data between male and female when looking at each blotch event. There was with the larger dataset.

# Tests for blotch1
t_test_ss1 <- t.test(blotch1 ~ sex, data = sharksub, var.equal = TRUE)

# Tests for blotch2
t_test_ss2 <- t.test(blotch2 ~ sex, data = sharksub, var.equal = TRUE)

# Create a data frame to store the t-test results
t_test_results <- data.frame(
  Blotch_Event = c("1", "2"),
  p_value = c(t_test_ss1$p.value, t_test_ss2$p.value)
)

# Print the t-test results
print(t_test_results)

  Blotch_Event   p_value
1            1 0.1499993
2            2 0.2224073

Interestingly, there is no difference between male and female in each of the blotch events.

Modeling Blotch Time

#Clearing the workspace
rm(list = ls()) 

library(tidyverse)
library(caret)
library(mlbench)

output=FALSE