ARES4011 Submission

Author

Martin Cooper

# load the tidyverse package
library(tidyverse)

Sharks Data

Initial Data Import and checking

# Call data and create data frame. 
library(readxl)
sharks <- read_excel("D:/ARES40011 Rsrch Methods & Data Analysis/Summative/sharks.xlsx")
sharks <-data.frame(sharks)

# Summarize the data frame
summary(sharks)

      ID                sex                blotch           BPM       
 Length:500         Length:500         Min.   :30.78   Min.   :119.0  
 Class :character   Class :character   1st Qu.:34.16   1st Qu.:129.0  
 Mode  :character   Mode  :character   Median :35.05   Median :142.0  
                                       Mean   :35.13   Mean   :141.8  
                                       3rd Qu.:36.05   3rd Qu.:153.2  
                                       Max.   :40.08   Max.   :166.0  
     weight           length           air            water      
 Min.   : 65.10   Min.   :128.3   Min.   :33.00   Min.   :20.01  
 1st Qu.: 75.68   1st Qu.:172.0   1st Qu.:34.42   1st Qu.:21.55  
 Median : 87.82   Median :211.1   Median :35.43   Median :23.11  
 Mean   : 87.94   Mean   :211.0   Mean   :35.54   Mean   :23.02  
 3rd Qu.:100.40   3rd Qu.:251.8   3rd Qu.:36.71   3rd Qu.:24.37  
 Max.   :110.94   Max.   :291.0   Max.   :38.00   Max.   :25.99  
      meta            depth      
 Min.   : 50.03   Min.   :44.64  
 1st Qu.: 67.39   1st Qu.:48.90  
 Median : 82.45   Median :50.14  
 Mean   : 82.04   Mean   :50.14  
 3rd Qu.: 95.97   3rd Qu.:51.35  
 Max.   :112.45   Max.   :56.83

The summary output matches the expectation from the data set so data exploration can continue

#Checking the data structure
str(sharks)

'data.frame':   500 obs. of  10 variables:
 $ ID    : chr  "SH001" "SH002" "SH003" "SH004" ...
 $ sex   : chr  "Female" "Female" "Female" "Male" ...
 $ blotch: num  37.2 34.5 36.3 35.3 37.4 ...
 $ BPM   : num  148 158 125 161 138 126 166 135 132 127 ...
 $ weight: num  74.7 73.4 71.8 104.6 67.1 ...
 $ length: num  187 189 284 171 264 ...
 $ air   : num  37.7 35.7 34.8 36.2 33.6 ...
 $ water : num  23.4 21.4 20.1 21.6 21.8 ...
 $ meta  : num  64.1 73.7 54.4 86.3 108 ...
 $ depth : num  53.2 49.6 49.4 50.3 49 ...

We can see that there are 500 observations of 11 variables, with 1 character variable (sex) and 9 continuous variables.

#Checking for missing data
colSums(is.na(sharks))

    ID    sex blotch    BPM weight length    air  water   meta  depth 
     0      0      0      0      0      0      0      0      0      0

There is no missing data.

Outliers

As there are two data types, we need to check in different ways for outliers

boxplot(blotch ~ sex, ylab = 'Blotch Time (s)', xlab = 'Sex', data = sharks, outpch = 16, las=1)

This plot suggests that there is a difference between the means of blotch time by sex (different means). There are also 3 possible outliers in blotch time.

Note

Note: If doing further analysis of blotch time by sex, we will need to remeber to take out these outliers.

To check for outliers in the continuous variables using Cleveland dotplotts.

#install libraries
library(lattice)
echo=TRUE
results='hide'
# Define column names
Names <- c("BPM", "weight", "length", "air", "water", "meta", "depth")

# Create dot plot
dotplot(as.matrix(sharks[, Names]), 
        groups = FALSE, 
        strip = strip.custom(bg = 'white'), 
        par.strip.text = list(cex = 1.2),
        scales = list(x = list(relation = "free", draw = TRUE),
                      y = list(relation = "free", draw = FALSE)), 
        col = 1, cex = 1, pch = 16,
        xlab = list(label = "Value of the variable", cex = 1.2),
        ylab = list(label = "Order of the data", cex = 1.2))

The data is spread, but there are only possible outliers in the depth dataset. We can test these specifically if we want to conduct any futher analysis.

Normality Tests

# Create frequency polygons using different binning methods
ggplot(data = sharks, aes(x = blotch)) +
  geom_freqpoly(bins = 15, color = "blue", linetype = "solid", size = 1) +
  labs(title = "Frequency Polygon of Blotch Time",
       x = "Blotch Time (s)",
       y = "Frequency")

Our varibale is normal. To confirm:

shapiro.test(sharks$blotch)


    Shapiro-Wilk normality test

data:  sharks$blotch
W = 0.99695, p-value = 0.4769

Data is confirmed as normal.

Multicollinearity among covariates

This is to test if the independant variables are corelated

library(Hmisc)
echo=TRUE
results='hide'
# Define column names
Coll <- c("BPM", "weight", "length", "air", "water", "meta", "depth")

# Calculate correlation matrix with p-values
cor_results <- rcorr(as.matrix(sharks[, Coll]), type = "pearson")

# Correlation coefficients
cor_matrix <- cor_results$r

# Display correlation coefficients
print(cor_matrix)

                BPM       weight      length         air       water
BPM     1.000000000  0.017036558 -0.06856053 -0.06841209  0.02451337
weight  0.017036558  1.000000000 -0.01959676 -0.05264537  0.08633875
length -0.068560532 -0.019596758  1.00000000 -0.03027426 -0.05940708
air    -0.068412093 -0.052645366 -0.03027426  1.00000000 -0.05524051
water   0.024513368  0.086338753 -0.05940708 -0.05524051  1.00000000
meta   -0.006016429  0.019601470  0.00302851  0.12531801  0.02249461
depth  -0.012173520 -0.006057435 -0.08334774 -0.01188199 -0.04088851
               meta        depth
BPM    -0.006016429 -0.012173520
weight  0.019601470 -0.006057435
length  0.003028510 -0.083347736
air     0.125318005 -0.011881989
water   0.022494605 -0.040888511
meta    1.000000000  0.008150764
depth   0.008150764  1.000000000

This is a table of the corrrelation values

# P-values
p_matrix <- cor_results$P

#Display P-values
print(p_matrix)

             BPM     weight    length         air      water        meta
BPM           NA 0.70392824 0.1257632 0.126586449 0.58448465 0.893247627
weight 0.7039282         NA 0.6620083 0.239972797 0.05368554 0.661932090
length 0.1257632 0.66200832        NA 0.499413144 0.18476370 0.946143700
air    0.1265864 0.23997280 0.4994131          NA 0.21755131 0.005012136
water  0.5844847 0.05368554 0.1847637 0.217551315         NA 0.615808758
meta   0.8932476 0.66193209 0.9461437 0.005012136 0.61580876          NA
depth  0.7859781 0.89252444 0.0625624 0.790983481 0.36156444 0.855736961
           depth
BPM    0.7859781
weight 0.8925244
length 0.0625624
air    0.7909835
water  0.3615644
meta   0.8557370
depth         NA

This is a table of the significance of these correlations.

Note

Note: We can see that the only statistically signifcant correlation is between air and meta - Air Temperature and Cortizol level.

I found this easier to spot using graphics

library(GGally)

# Define column names
Names <- c("BPM", "weight", "length", "air", "water", "meta", "depth")

# Create a scatter plot matrix
ggpairs(sharks[, Names],
        lower = list(continuous = wrap("points", size = 1, alpha = 0.6)),
        diag = list(continuous = wrap("barDiag", fill = "lightblue")),
        upper = list(continuous = wrap("cor", size = 4))) +
  theme(panel.background = element_blank(),
        panel.border = element_rect(fill = NA, size = 1),
        strip.background = element_rect(fill = "white", color = "white", size = 1),
        text = element_text(size = 12))

Relations among dependent and independant variables

To see if there are any relationships between the blotch time and the variables collected. Given the difference in blotch time by sex, that split has been inluded to highlight if there are any sex spefic significance, the overall correlation and p-value are also displayed

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$BPM, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

#Plot
ggplot(sharks, aes(x = BPM, y = blotch, color=sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", 
                                        color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and BPM", 
       x = "BPM", 
       y = "Blotch Time",
       color="sex")+
  annotate("text", x = 155, y = 39, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$weight, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = weight, y = blotch, color=sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", 
                                        color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Weight", 
       x = "Weight", 
       y = "Blotch Time",
       color="sex")+
    annotate("text", x = 70, y=39.05, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$length, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = length, y = blotch, color=sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", 
                                        color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Length", 
       x = "Length", 
       y = "Blotch Time",
       color="sex")+
  annotate("text", x = 150, y=39, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$air, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = air, y = blotch, colour=sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", 
                                        color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Air Temperature", 
       x = "Air Tempurature", 
       y = "Blotch Time",
       colour="sex")+
  annotate("text", x = 37, y=39, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$water, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = water, y = blotch, color = sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Surface Water Temperature", 
       x = "Surface Water Temperature", 
       y = "Blotch Time",
       color = "Sex") +
  annotate("text", x = 21.5, y = 39.5, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$meta, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = meta, y = blotch, color = sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Cortisol", 
       x = "Cortisol", 
       y = "Blotch Time",
       color = "Sex") +
  annotate("text", x = 57, y = 39.5, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

# Calculate the correlation coefficient and p-value
cor_test <- cor.test(sharks$depth, sharks$blotch)
correlation <- cor_test$estimate
p_value <- cor_test$p.value

ggplot(sharks, aes(x = depth, y = blotch, color = sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Depth of Capture", 
       x = "Depth", 
       y = "Blotch Time",
       color = "Sex") +
  annotate("text", x = 46, y = 39.5, 
           label = paste("Correlation:", round(correlation, 2), "\nP-value:", format.pval(p_value, digits = 2)), 
           size = 5, color = "blue")

Note

Note: We can see that the only statistically signifcant correlation is between blotch time and depth.

#Analysis of Shark Data

t.test(blotch~sex,
       data=sharks,
       var.equal=TRUE)


    Two Sample t-test

data:  blotch by sex
t = -3.023, df = 498, p-value = 0.002632
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
 -0.6326914 -0.1342420
sample estimates:
mean in group Female   mean in group Male 
            34.92294             35.30641

This confimes that there is a significant differnce between male and female blotch times.

# Create a subset for females
sharks_female <- sharks %>%
  filter(sex == "Female")

# Create a subset for males
sharks_male <- sharks %>%
  filter(sex == "Male")

Given the difference between the sex and the significance of the correlation between depth and blotch time, we should calculate the stats for this.

# Run correlation tests for females and males
cor_test_female <- cor.test(sharks_female$depth, sharks_female$blotch)
correlation_female <- cor_test_female$estimate
p_value_female <- cor_test_female$p.value

cor_test_male <- cor.test(sharks_male$depth, sharks_male$blotch)
correlation_male <- cor_test_male$estimate
p_value_male <- cor_test_male$p.value

# Plot for combined data
ggplot(sharks, aes(x = depth, y = blotch, color = sex)) +
  geom_point() +
  geom_smooth(method = 'lm', se = FALSE) +
  theme(panel.background = element_blank()) +
  theme(panel.border = element_rect(fill = NA, size = 1)) +
  theme(strip.background = element_rect(fill = "white", color = "white", size = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Depth by Sex", 
       x = "Depth", 
       y = "Blotch Time",
       color = "Sex") +
  annotate("text", x = 48, y = 39.5, 
           label = paste("Female:\nCorrelation:", round(correlation_female, 2), 
                         "\nP-value:", format.pval(p_value_female, digits = 2)), 
           size = 5, color = "blue") +
  annotate("text", x = 54, y = 32.5, 
           label = paste("Male:\nCorrelation:", round(correlation_male, 2), 
                         "\nP-value:", format.pval(p_value_male, digits = 2)), 
           size = 5, color = "red")

`geom_smooth()` using formula = 'y ~ x'

Sharsub

Initial Data Import and checking

# Call data and create data frame. 
library(readxl)
sharksub <- read_excel("D:/ARES40011 Rsrch Methods & Data Analysis/Summative/sharksub (1).xlsx")
sharksub <-data.frame(sharksub)

# Summarize the data frame
summary(sharksub)

      ID                sex               blotch1         blotch2     
 Length:50          Length:50          Min.   :32.49   Min.   :33.47  
 Class :character   Class :character   1st Qu.:34.38   1st Qu.:35.31  
 Mode  :character   Mode  :character   Median :34.94   Median :35.94  
                                       Mean   :35.03   Mean   :35.96  
                                       3rd Qu.:35.90   3rd Qu.:36.78  
                                       Max.   :37.07   Max.   :38.18

str(sharksub)

'data.frame':   50 obs. of  4 variables:
 $ ID     : chr  "SH269" "SH163" "SH008" "SH239" ...
 $ sex    : chr  "Female" "Female" "Female" "Female" ...
 $ blotch1: num  36.1 33.4 36.3 35 35.7 ...
 $ blotch2: num  37.2 34.4 36.5 36 36.8 ...

Checking for missing data

colSums(is.na(sharksub))

     ID     sex blotch1 blotch2 
      0       0       0       0

##Outliers

For each capture event

#Looking for differences between male and felame for each capture

# Create a boxplot with color by sex
ggplot(sharksub, aes(x = sex, y = blotch1, fill = sex)) +
  geom_boxplot(outlier.shape = 16) +
  labs(title = "Boxplot of Blotch1 Time by Sex",
       x = "Sex",
       y = "Blotch1 Time (s)") +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1))

This looks like there may be significance.

#Looking for differences between male and felame for each capture
# Create a boxplot with color by sex
ggplot(sharksub, aes(x = sex, y = blotch2, fill = sex)) +
  geom_boxplot(outlier.shape = 16) +
  labs(title = "Boxplot of Blotch2 Time by Sex",
       x = "Sex",
       y = "Blotch2 Time (s)") +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1))

Note

Note: We can see that there may be a difference between male and female blotch time in both capture events.

##**Normality Tests

#To make it easier, group data by individual
sharksub_long <- sharksub %>%
  pivot_longer(cols = c(blotch1, blotch2), names_to = "variable", values_to = "value")

# Plot histograms
ggplot(data = sharksub_long, aes(x = value, fill = variable)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "identity") +
  facet_wrap(~ variable, scales = "free") +
  scale_fill_manual(values = c("blotch1" = "blue", "blotch2" = "red")) +
  labs(title = "Histograms of Blotch1 and Blotch2 Time",
       x = "Blotch Time (s)",
       y = "Frequency",
       fill = "Blotch Type") +
  theme_minimal()

Note

Note: We can see that this is normally distributed.

ggplot(data = sharksub_long, aes(x = value, fill = sex)) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "identity") +
  facet_wrap(sex ~ variable, scales = "free") +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  labs(title = "Histograms of Blotch1 and Blotch2 Time Grouped by Sex",
       x = "Blotch Time (s)",
       y = "Frequency",
       fill = "Sex") +
  theme_minimal()

Note

Note: We can see that this is not normally distributed when grouped by sex.

Looking at Differences between blotch1 and blotch2

# Perform paired t-test
t_test_result2 <- t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
p_value2 <- t_test_result2$p.value
print(t_test_result2)


    Paired t-test

data:  sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384

Note

Note: We can see that there is statistically significant differences in the blotch1 and blotch2 times.

# Plot box plots with t-test annotation
ggplot(data = sharksub_long, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(alpha = 0.5) +
  facet_wrap(~ variable, scales = "free") +
  scale_fill_manual(values = c("blotch1" = "blue", "blotch2" = "red")) +
  labs(title = "Box Plots of Blotch1 and Blotch2 Time",
       x = "Blotch Type",
       y = "Blotch Time (s)",
       fill = "Blotch Event") +
  theme_minimal() +
  annotate("text", x = 1., y = max(sharksub_long$value), 
           label = paste(format.pval(p_value2, digits = 3)), 
           size = 5, color = "black")

Note

Note: We can see that there is a significant diference between the time of blotch1 and blotch2.

Looking at Differences between blotch1 and blotch2 with sex

# Split the data by sex
sharksub_female <- sharksub_long %>% filter(sex == "Female")
sharksub_male <- sharksub_long %>% filter(sex == "Male")

# Run paired t-tests within each sex
t_test_female <- t.test(sharksub_female$value[sharksub_female$variable == "blotch1"], 
                        sharksub_female$value[sharksub_female$variable == "blotch2"], 
                        paired = TRUE)
#remebering that normality cannot be assumed for the male subset - ranked test used instead of standard t.test
t_test_male <- wilcox.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)

# Create a data frame to store the t-test results
t_test_results3 <- data.frame(
  sex = c("Female", "Male"),
  p_value = c(t_test_female$p.value, t_test_male$p.value)
)
print(t_test_results3)

     sex      p_value
1 Female 1.699676e-13
2   Male 1.606456e-09

# Plot box plot
ggplot(data = sharksub_long, aes(x = sex, y = value, fill = sex)) +
  geom_boxplot(alpha = 0.5, outlier.shape = 16) +
  facet_wrap(~ variable, scales = "free") +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  labs(title = "Box Plots of Blotch1 and Blotch2 Time Grouped by Sex",
       x = "Sex",
       y = "Blotch Time (s)",
       fill = "Sex") +
  theme_minimal()

Note

Note: We can see that there is a significant diference between the time of blotch1 and blotch2 for both males and females,h the p-vales from the t-test confirming this.

It is worth assessing if there is significance in the data between male and female when looking at each blotch event. There was with the larger dataset.

# Tests for blotch1
t_test_ss1 <- t.test(blotch1 ~ sex, data = sharksub, var.equal = TRUE)

# Tests for blotch2
t_test_ss2 <- t.test(blotch2 ~ sex, data = sharksub, var.equal = TRUE)

# Create a data frame to store the t-test results
t_test_results <- data.frame(
  Blotch_Event = c("1", "2"),
  p_value = c(t_test_ss1$p.value, t_test_ss2$p.value)
)

# Print the t-test results
print(t_test_results)

  Blotch_Event   p_value
1            1 0.1499993
2            2 0.2224073

Interestingly, there is no significant difference between male and female in the individual blotch events.

Modeling Blotch Time

#Clearing the workspace
rm(list = ls()) 

library(tidyverse)
library(caret)
library(mlbench)

# Add Data
sharks <- read_excel("D:/ARES40011 Rsrch Methods & Data Analysis/Summative/sharks.xlsx")
sharks <- data.frame(sharks)

# Set seed for reproducibility
set.seed(234)

# Create training samples
training.samples <- sharks$blotch %>%
  createDataPartition(p = 0.8, list = FALSE)
#Assign Partitioned data to dataframes
train.data<-sharks[training.samples, ]
test.data<-sharks[-training.samples, ]

Linear Regression

From our previous analysis, we know there is a strong correlation between depth and bloch time. Let’s fit the model and check its effectiveness

Also, air and meta were found to have correlated significantly so have been excluded from all models.

model<-lm(blotch~depth, data=train.data)
summary(model)


Call:
lm(formula = blotch ~ depth, data = train.data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.81044 -0.65674 -0.04245  0.57193  2.86632 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.81343    1.24859    8.66   <2e-16 ***
depth        0.48471    0.02487   19.49   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1 on 398 degrees of freedom
Multiple R-squared:  0.4882,    Adjusted R-squared:  0.487 
F-statistic: 379.7 on 1 and 398 DF,  p-value: < 2.2e-16

This is pretty good! lets check its prediction capabilities:

# Create preditctions from model on test data
predictions<-predict(model, test.data)

#Assess Models Accuracy
#MSE
mse<-mean((test.data$blotch-predictions)^2)
#Produce output including label
print(paste("LM1 mse:", mse))

[1] "LM1 mse: 1.00886315360145"

#Root Mean Squared Error
rmse <- sqrt(mse)
print(paste("LM1 RMSE:", rmse))

[1] "LM1 RMSE: 1.00442180064027"

# Calculate R-squared
rss <- sum((test.data$blotch - predictions)^2)
tss <- sum((test.data$blotch - mean(test.data$blotch))^2)
r_squared <- 1 - (rss/tss)
print(paste("LM1 R-squared:", r_squared))

[1] "LM1 R-squared: 0.57775730819532"

This is only explaing approx 57% of the varience so is not that helpful. Lets see if it is improved by the inclusion of other predictors

Multiple Linear Regression

glm_model<-lm(blotch~depth+sex+BPM+weight+weight+length+water, data=train.data)
summary(glm_model)


Call:
lm(formula = blotch ~ depth + sex + BPM + weight + weight + length + 
    water, data = train.data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.89728 -0.64890 -0.03427  0.61504  2.96668 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.246261   1.613731   6.349 5.97e-10 ***
depth        0.486297   0.024793  19.614  < 2e-16 ***
sexMale      0.286089   0.099858   2.865  0.00439 ** 
BPM         -0.003430   0.003517  -0.975  0.32997    
weight       0.003345   0.003729   0.897  0.37028    
length       0.001402   0.001074   1.306  0.19235    
water        0.010075   0.029618   0.340  0.73392    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9919 on 393 degrees of freedom
Multiple R-squared:  0.503, Adjusted R-squared:  0.4954 
F-statistic: 66.28 on 6 and 393 DF,  p-value: < 2.2e-16

This suggests that sex is a significant predictor, so lets rebuilt a more specific model and re-test

glm_model2 <- lm(blotch ~ depth + sex, data = train.data)
summary(glm_model2)


Call:
lm(formula = blotch ~ depth + sex, data = train.data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.93974 -0.63739 -0.02135  0.56798  2.96795 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.69327    1.23865   8.633  < 2e-16 ***
depth        0.48411    0.02466  19.629  < 2e-16 ***
sexMale      0.27946    0.09944   2.810  0.00519 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9916 on 397 degrees of freedom
Multiple R-squared:  0.4982,    Adjusted R-squared:  0.4957 
F-statistic: 197.1 on 2 and 397 DF,  p-value: < 2.2e-16

# Create preditctions from model on test data
predictions2<-predict(glm_model2, test.data)

#Assess Models Accuracy
#MSE
mse<-mean((test.data$blotch-predictions2)^2)
#Produce output including label
print(paste("GLM2 mse:", mse))

[1] "GLM2 mse: 0.971084389631148"

#Root Mean Squared Error
rmse <- sqrt(mse)
print(paste("GLM2 RMSE:", rmse))

[1] "GLM2 RMSE: 0.985436141833223"

# Calculate R-squared
rss <- sum((test.data$blotch - predictions2)^2)
tss <- sum((test.data$blotch - mean(test.data$blotch))^2)
r_squared <- 1 - (rss/tss)
print(paste("GLM2 R-squared:", r_squared))

[1] "GLM2 R-squared: 0.593568973964783"

Whilst this does improve the models accuracy, it still onnly explains 59% of the variation. From our earlier visual checks of teh data, we know that there were no polynomial relationships so we have reached the limit of the predtions possible with GLMs.

Modeling with sex split data

As identified in the ealier analysis, there is a significant difference between male and female blotch time. It is therefore interesting to see if predictions can be applied to one sex.

# Split the data by sex
sharks_female <- sharks %>% filter(sex == "Female")
sharks_male <- sharks %>% filter(sex == "Male")

# Set seed for reproducibility
set.seed(234)

# Create training samples
training.samples_female <- sharks_female$blotch %>%
  createDataPartition(p = 0.8, list = FALSE)

#Assign Partitioned data to dataframes
train.data_female<-sharks_female[training.samples_female, ]
test.data_female<-sharks_female[-training.samples_female, ]

model_fe_lm<-lm(blotch~depth, data=train.data_female)
summary(model_fe_lm)


Call:
lm(formula = blotch ~ depth, data = train.data_female)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.48167 -0.55854  0.04126  0.59489  2.97202 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.58708    1.66676   6.352 1.53e-09 ***
depth        0.48619    0.03329  14.606  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9567 on 190 degrees of freedom
Multiple R-squared:  0.5289,    Adjusted R-squared:  0.5264 
F-statistic: 213.3 on 1 and 190 DF,  p-value: < 2.2e-16

This is not very different from teh one for data including both sexs, but lets check its prediction capabilities:

# Create preditctions from model on test data
predictions_fe_lm <- predict(model_fe_lm, test.data_female)

# Assess Model's Accuracy
# MSE
mse_Fe <- mean((test.data_female$blotch - predictions_fe_lm)^2)
# Produce output including label
print(paste("LM_Fe mse:", mse_Fe))

[1] "LM_Fe mse: 0.954896236658408"

# Root Mean Squared Error
rmse_Fe <- sqrt(mse_Fe)
print(paste("LM_Fe RMSE:", rmse_Fe))

[1] "LM_Fe RMSE: 0.977187922898359"

# Calculate R-squared
rss_Lm_Fe <- sum((test.data_female$blotch - predictions_fe_lm)^2)
tss_Lm_Fe <- sum((test.data_female$blotch - mean(test.data_female$blotch))^2)
r_squared_lm_Fe <- 1 - (rss_Lm_Fe / tss_Lm_Fe)
print(paste("LM_Fe R-squared:", r_squared_lm_Fe))

[1] "LM_Fe R-squared: 0.515810244670255"

This is only explaing approx 51% of the varience so is not that helpful and is owrse than our all data model. Lets see if it is improved by the inclusion of other predictors

Multiple Linear Regression

# Create linear model
glm_model_fe <- lm(blotch ~ BPM + weight + length + water, data = train.data_female)

# Summarize the model
summary(glm_model_fe)


Call:
lm(formula = blotch ~ BPM + weight + length + water, data = train.data_female)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.1677 -0.8189 -0.1133  0.9625  3.8311 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 34.8508524  1.9044964  18.299   <2e-16 ***
BPM         -0.0077815  0.0073821  -1.054    0.293    
weight       0.0012468  0.0074111   0.168    0.867    
length       0.0005759  0.0021346   0.270    0.788    
water        0.0403689  0.0592275   0.682    0.496    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.399 on 187 degrees of freedom
Multiple R-squared:  0.00886,   Adjusted R-squared:  -0.01234 
F-statistic: 0.4179 on 4 and 187 DF,  p-value: 0.7956

This suggests no other significant predictors for the female samples.

# Set seed for reproducibility
set.seed(234)

# Create training samples
training.samples_male <- sharks_male$blotch %>%
  createDataPartition(p = 0.8, list = FALSE)

#Assign Partitioned data to dataframes
train.data_male<-sharks_male[training.samples_male, ]
test.data_male<-sharks_male[-training.samples_male, ]

model_ma_lm<-lm(blotch~depth, data=train.data_male)
summary(model_ma_lm)


Call:
lm(formula = blotch ~ depth, data = train.data_male)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.95147 -0.65302 -0.04974  0.57094  2.59508 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.46885    1.75637   5.391 1.88e-07 ***
depth        0.51437    0.03495  14.716  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.019 on 210 degrees of freedom
Multiple R-squared:  0.5077,    Adjusted R-squared:  0.5053 
F-statistic: 216.6 on 1 and 210 DF,  p-value: < 2.2e-16

And the predictive test

# Create preditctions from model on test data
predictions_ma_lm <- predict(model_ma_lm, test.data_male)

# Assess Model's Accuracy
# MSE
mse_ma <- mean((test.data_male$blotch - predictions_ma_lm)^2)
# Produce output including label
print(paste("LM_ma mse:", mse_ma))

[1] "LM_ma mse: 1.01657017672931"

# Root Mean Squared Error
rmse_ma <- sqrt(mse_ma)
print(paste("LM_ma RMSE:", rmse_ma))

[1] "LM_ma RMSE: 1.00825104846428"

# Calculate R-squared
rss_Lm_ma <- sum((test.data_male$blotch - predictions_ma_lm)^2)
tss_Lm_ma <- sum((test.data_male$blotch - mean(test.data_male$blotch))^2)
r_squared_lm_ma <- 1 - (rss_Lm_ma / tss_Lm_ma)
print(paste("LM_ma R-squared:", r_squared_lm_ma))

[1] "LM_ma R-squared: 0.469320413660358"

This is less accurate than our previous models. Lets see if there are any other significant factors:

# Create linear model
glm_model_ma <- lm(blotch ~ depth + BPM + weight + length + water, data = train.data_male)

# Summarize the model
summary(glm_model_ma)


Call:
lm(formula = blotch ~ depth + BPM + weight + length + water, 
    data = train.data_male)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.94887 -0.64104 -0.06228  0.58686  2.63736 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.3680604  2.3573214   4.398 1.75e-05 ***
depth        0.5132221  0.0356102  14.412  < 2e-16 ***
BPM         -0.0019404  0.0049489  -0.392    0.695    
weight       0.0002594  0.0055450   0.047    0.963    
length       0.0004878  0.0015845   0.308    0.759    
water       -0.0299880  0.0446796  -0.671    0.503    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.026 on 206 degrees of freedom
Multiple R-squared:  0.5096,    Adjusted R-squared:  0.4977 
F-statistic: 42.82 on 5 and 206 DF,  p-value: < 2.2e-16

This suggests no other significant predictors for the male samples either.

General Model Playing

Gamma GML

gammal<-glm(blotch~depth+sex,
            family=Gamma(link=log),
            data=train.data)

# Create preditctions from model on test data
predictions3<-predict(gammal, test.data)

#Assess Models Accuracy
#MSE
mse<-mean((test.data$blotch-predictions3)^2)
#Produce output including label
print(paste("GAMMA GLM mse:", mse))

[1] "GAMMA GLM mse: 999.214085876252"

#Root Mean Squared Error
rmse <- sqrt(mse)
print(paste("GAMMA GLM RMSE:", rmse))

[1] "GAMMA GLM RMSE: 31.6103477658227"

# Calculate R-squared
rss <- sum((test.data$blotch - predictions3)^2)
tss <- sum((test.data$blotch - mean(test.data$blotch))^2)
r_squared <- 1 - (rss/tss)
print(paste("GAMMA GLM R-squared:", r_squared))

[1] "GAMMA GLM R-squared: -417.204236920936"

This is clearly inappropriate!

Decision Trees

# Load necessary library
library(rpart)

# Fit the decision tree model
DT_model <- rpart(blotch ~ depth + sex + BPM + weight + length + water, data = train.data)

# Summarize the model
summary(DT_model)

Call:
rpart(formula = blotch ~ depth + sex + BPM + weight + length + 
    water, data = train.data)
  n= 400 

          CP nsplit rel error    xerror       xstd
1 0.30682387      0 1.0000000 1.0091643 0.07451584
2 0.09293024      1 0.6931761 0.7547603 0.05704714
3 0.08655757      2 0.6002459 0.6957056 0.04812921
4 0.01948587      3 0.5136883 0.5744040 0.04157488
5 0.01716320      4 0.4942024 0.5633858 0.03977897
6 0.01054044      5 0.4770392 0.5441942 0.03931566
7 0.01000000      6 0.4664988 0.5350905 0.03698648

Variable importance
 depth length    BPM weight  water 
    92      4      2      2      1 

Node number 1: 400 observations,    complexity param=0.3068239
  mean=35.12411, MSE=1.944723 
  left son=2 (221 obs) right son=3 (179 obs)
  Primary splits:
      depth  < 50.39554 to the left,  improve=0.306823900, (0 missing)
      sex    splits as  LR,           improve=0.011225570, (0 missing)
      weight < 88.7134  to the left,  improve=0.010281100, (0 missing)
      BPM    < 137.5    to the right, improve=0.007445887, (0 missing)
      water  < 20.9844  to the left,  improve=0.005938598, (0 missing)
  Surrogate splits:
      length < 151.2442 to the right, agree=0.573, adj=0.045, (0 split)
      BPM    < 123.5    to the right, agree=0.562, adj=0.022, (0 split)
      water  < 25.79412 to the left,  agree=0.560, adj=0.017, (0 split)
      weight < 110.581  to the left,  agree=0.557, adj=0.011, (0 split)

Node number 2: 221 observations,    complexity param=0.08655757
  mean=34.42892, MSE=1.31364 
  left son=4 (77 obs) right son=5 (144 obs)
  Primary splits:
      depth  < 48.35314 to the left,  improve=0.23192850, (0 missing)
      BPM    < 124.5    to the left,  improve=0.02452245, (0 missing)
      water  < 25.35719 to the left,  improve=0.01914465, (0 missing)
      sex    splits as  LR,           improve=0.01570431, (0 missing)
      weight < 74.06402 to the right, improve=0.01363231, (0 missing)
  Surrogate splits:
      length < 134.2956 to the left,  agree=0.665, adj=0.039, (0 split)
      water  < 20.14037 to the left,  agree=0.661, adj=0.026, (0 split)
      weight < 110.363  to the right, agree=0.656, adj=0.013, (0 split)

Node number 3: 179 observations,    complexity param=0.09293024
  mean=35.98242, MSE=1.390501 
  left son=6 (123 obs) right son=7 (56 obs)
  Primary splits:
      depth  < 52.37234 to the left,  improve=0.29043590, (0 missing)
      length < 181.6658 to the left,  improve=0.02798548, (0 missing)
      weight < 107.0718 to the left,  improve=0.01740607, (0 missing)
      BPM    < 127.5    to the right, improve=0.01187385, (0 missing)
      sex    splits as  LR,           improve=0.00782871, (0 missing)
  Surrogate splits:
      BPM    < 119.5    to the right, agree=0.698, adj=0.036, (0 split)
      weight < 110.7731 to the left,  agree=0.698, adj=0.036, (0 split)
      length < 273.7339 to the left,  agree=0.698, adj=0.036, (0 split)

Node number 4: 77 observations,    complexity param=0.0171632
  mean=33.67409, MSE=0.8852428 
  left son=8 (15 obs) right son=9 (62 obs)
  Primary splits:
      depth  < 46.56121 to the left,  improve=0.19586770, (0 missing)
      BPM    < 127.5    to the left,  improve=0.15064330, (0 missing)
      water  < 21.0222  to the left,  improve=0.09737779, (0 missing)
      sex    splits as  LR,           improve=0.07784480, (0 missing)
      weight < 90.9203  to the right, improve=0.04637677, (0 missing)
  Surrogate splits:
      BPM    < 122.5    to the left,  agree=0.818, adj=0.067, (0 split)
      weight < 66.26287 to the left,  agree=0.818, adj=0.067, (0 split)

Node number 5: 144 observations,    complexity param=0.01054044
  mean=34.83255, MSE=1.075129 
  left son=10 (27 obs) right son=11 (117 obs)
  Primary splits:
      depth  < 48.98858 to the left,  improve=0.05296068, (0 missing)
      water  < 20.72111 to the right, improve=0.04888154, (0 missing)
      weight < 92.73019 to the left,  improve=0.03249283, (0 missing)
      length < 179.066  to the left,  improve=0.02505633, (0 missing)
      BPM    < 153      to the right, improve=0.02232216, (0 missing)
  Surrogate splits:
      length < 290.0006 to the right, agree=0.826, adj=0.074, (0 split)

Node number 6: 123 observations
  mean=35.55362, MSE=0.8542041 

Node number 7: 56 observations,    complexity param=0.01948587
  mean=36.92424, MSE=1.277557 
  left son=14 (47 obs) right son=15 (9 obs)
  Primary splits:
      depth  < 54.16999 to the left,  improve=0.21186980, (0 missing)
      length < 177.0123 to the left,  improve=0.06709445, (0 missing)
      water  < 21.27713 to the left,  improve=0.05156503, (0 missing)
      BPM    < 142      to the right, improve=0.02526993, (0 missing)
      weight < 100.1431 to the left,  improve=0.01819176, (0 missing)

Node number 8: 15 observations
  mean=32.82752, MSE=0.6336963 

Node number 9: 62 observations
  mean=33.8789, MSE=0.7307611 

Node number 10: 27 observations
  mean=34.33582, MSE=1.055709 

Node number 11: 117 observations
  mean=34.94718, MSE=1.009531 

Node number 14: 47 observations
  mean=36.69658, MSE=0.9547175 

Node number 15: 9 observations
  mean=38.11316, MSE=1.279294

# Fit the refined decision tree model
DT_model1 <- rpart(blotch ~ depth + length, data = train.data)

# Create preditctions from model on test data
predictions4<-predict(DT_model1, test.data)

#Assess Models Accuracy
#MSE
mse<-mean((test.data$blotch-predictions4)^2)
#Produce output including label
print(paste("DT1 mse:", mse))

[1] "DT1 mse: 0.973644768541162"

#Root Mean Squared Error
rmse <- sqrt(mse)
print(paste("DT1 RMSE:", rmse))

[1] "DT1 RMSE: 0.986734396147799"

# Calculate R-squared
rss <- sum((test.data$blotch - predictions4)^2)
tss <- sum((test.data$blotch - mean(test.data$blotch))^2)
r_squared <- 1 - (rss/tss)
print(paste("DT1 R-squared:", r_squared))

[1] "DT1 R-squared: 0.592497370468169"

This is no more accurate than the GLM.