My summative R code

Author

Renad Galal Elsayed Elrayes

Published

January 24, 2025

Importing data into R

Please change your working directory to where the files are

# Set the working directory
Working_directory <- "C:/Users/renad/Desktop/NTU/1 Study/1 slides/1 Rsrch methods and data analyis/summative R/data files/"

# Construct file paths
path1 <- file.path(Working_directory, "sharks.xlsx")
path2 <- file.path(Working_directory, "sharksub.xlsx")

# Print paths to check
print(path1)
[1] "C:/Users/renad/Desktop/NTU/1 Study/1 slides/1 Rsrch methods and data analyis/summative R/data files//sharks.xlsx"
print(path2)
[1] "C:/Users/renad/Desktop/NTU/1 Study/1 slides/1 Rsrch methods and data analyis/summative R/data files//sharksub.xlsx"
# Install and load the `readxl` library
install.packages("readxl")  # Run this line only once if not already installed
Installing package into 'C:/Users/renad/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)
package 'readxl' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\renad\AppData\Local\Temp\RtmpOA5AWT\downloaded_packages
library(readxl)
Warning: package 'readxl' was built under R version 4.4.2
# Import data
sharks_data <- read_excel(path1)
sharksub_data <- read_excel(path2)

# View the imported data
head(sharks_data)
# A tibble: 6 × 10
  ID    sex    blotch   BPM weight length   air water  meta depth
  <chr> <chr>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH001 Female   37.2   148   74.7   187.  37.7  23.4  64.1  53.2
2 SH002 Female   34.5   158   73.4   189.  35.7  21.4  73.7  49.6
3 SH003 Female   36.3   125   71.8   284.  34.8  20.1  54.4  49.4
4 SH004 Male     35.3   161  105.    171.  36.2  21.6  86.3  50.3
5 SH005 Female   37.4   138   67.1   264.  33.6  21.8 108.   49.0
6 SH006 Male     33.5   126  110.    270.  36.4  20.9 109.   46.8
head(sharksub_data)
# A tibble: 6 × 4
  ID    sex    blotch1 blotch2
  <chr> <chr>    <dbl>   <dbl>
1 SH269 Female    36.1    37.2
2 SH163 Female    33.4    34.4
3 SH008 Female    36.3    36.5
4 SH239 Female    35.0    36.0
5 SH332 Female    35.7    36.8
6 SH328 Female    34.9    35.9

Checking and exploring the data

head(sharks_data)
# A tibble: 6 × 10
  ID    sex    blotch   BPM weight length   air water  meta depth
  <chr> <chr>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH001 Female   37.2   148   74.7   187.  37.7  23.4  64.1  53.2
2 SH002 Female   34.5   158   73.4   189.  35.7  21.4  73.7  49.6
3 SH003 Female   36.3   125   71.8   284.  34.8  20.1  54.4  49.4
4 SH004 Male     35.3   161  105.    171.  36.2  21.6  86.3  50.3
5 SH005 Female   37.4   138   67.1   264.  33.6  21.8 108.   49.0
6 SH006 Male     33.5   126  110.    270.  36.4  20.9 109.   46.8
head(sharksub_data) # Check Your Data
# A tibble: 6 × 4
  ID    sex    blotch1 blotch2
  <chr> <chr>    <dbl>   <dbl>
1 SH269 Female    36.1    37.2
2 SH163 Female    33.4    34.4
3 SH008 Female    36.3    36.5
4 SH239 Female    35.0    36.0
5 SH332 Female    35.7    36.8
6 SH328 Female    34.9    35.9
View (sharks_data)
View (sharksub_data) # view your data
str (sharks_data)
tibble [500 × 10] (S3: tbl_df/tbl/data.frame)
 $ ID    : chr [1:500] "SH001" "SH002" "SH003" "SH004" ...
 $ sex   : chr [1:500] "Female" "Female" "Female" "Male" ...
 $ blotch: num [1:500] 37.2 34.5 36.3 35.3 37.4 ...
 $ BPM   : num [1:500] 148 158 125 161 138 126 166 135 132 127 ...
 $ weight: num [1:500] 74.7 73.4 71.8 104.6 67.1 ...
 $ length: num [1:500] 187 189 284 171 264 ...
 $ air   : num [1:500] 37.7 35.7 34.8 36.2 33.6 ...
 $ water : num [1:500] 23.4 21.4 20.1 21.6 21.8 ...
 $ meta  : num [1:500] 64.1 73.7 54.4 86.3 108 ...
 $ depth : num [1:500] 53.2 49.6 49.4 50.3 49 ...
str (sharksub_data) # to display the structure or internal details
tibble [50 × 4] (S3: tbl_df/tbl/data.frame)
 $ ID     : chr [1:50] "SH269" "SH163" "SH008" "SH239" ...
 $ sex    : chr [1:50] "Female" "Female" "Female" "Female" ...
 $ blotch1: num [1:50] 36.1 33.4 36.3 35 35.7 ...
 $ blotch2: num [1:50] 37.2 34.4 36.5 36 36.8 ...
names (sharks_data)
 [1] "ID"     "sex"    "blotch" "BPM"    "weight" "length" "air"    "water" 
 [9] "meta"   "depth" 
names (sharksub_data) # a quick view of the variables names
[1] "ID"      "sex"     "blotch1" "blotch2"

Data Wrangling

Sharksdata wrangling

Checking for missing values

# Check for missing values in the entire dataset
sum(is.na(sharks_data))  # Total number of missing values
[1] 0
  • Sum = 0
  • There are no missing values
# Check for missing values in each column
colSums(is.na(sharks_data))
    ID    sex blotch    BPM weight length    air  water   meta  depth 
     0      0      0      0      0      0      0      0      0      0 
  • There are no missing values

Checking for duplicates

# Check for duplicate rows in the entire dataset
duplicates <- duplicated(sharks_data)
View (duplicates)
sum(duplicates)
[1] 0
  • there are no duplicates

Outliers detection

outliers detected in 2 variables in sharks_data (blotch,depth)

blotch > outliers detected!!

boxplot(sharks_data$blotch, main = "Boxplot for blotch Outliers", 
        ylab = "Values", col = "lightblue")

# Identifying blotch Outliers
# Extract the outlier values based on the IQR method
outlier_values <- function(data, variable) {
  q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE) # First quartile (25%)
  q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE) # Third quartile (75%)
  iqr <- q3 - q1                                      # Interquartile range
  lower_bound <- q1 - 1.5 * iqr                       # Lower threshold
  upper_bound <- q3 + 1.5 * iqr                       # Upper threshold
  
  # Identify the outlier values
  outliers <- data[[variable]][data[[variable]] < lower_bound | data[[variable]] > upper_bound]
  
  # Print thresholds for reference
  cat("Lower Bound:", lower_bound, "Upper Bound:", upper_bound, "\n")
  
  return(outliers)
}

# Apply the function to the "blotch" variable
blotch_outliers <- outlier_values(sharks_data, "blotch")
Lower Bound: 31.33423 Upper Bound: 38.87935 
cat("Outliers in Blotch:\n", blotch_outliers, "\n")
Outliers in Blotch:
 39.83638 39.13972 30.77585 40.08356 

depth > outliers detected!!

boxplot(sharks_data$depth, main = "Boxplot for depth Outliers", 
        ylab = "Values", col = "lightblue")

# Identifying depth Outliers
# Extract the outlier values based on the IQR method
outlier_values <- function(data, variable) {
  q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE) # First quartile (25%)
  q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE) # Third quartile (75%)
  iqr <- q3 - q1                                      # Interquartile range
  lower_bound <- q1 - 1.5 * iqr                       # Lower threshold
  upper_bound <- q3 + 1.5 * iqr                       # Upper threshold
  
  # Identify the outlier values
  outliers <- data[[variable]][data[[variable]] < lower_bound | data[[variable]] > upper_bound]
  
  # Print thresholds for reference
  cat("Lower Bound:", lower_bound, "Upper Bound:", upper_bound, "\n")
  
  return(outliers)
}

# Apply the function to the "depth" variable
depth_outliers <- outlier_values(sharks_data, "depth")
Lower Bound: 45.22479 Upper Bound: 55.01748 
cat("Outliers in depth:\n", depth_outliers, "\n")
Outliers in depth:
 55.30608 44.64474 55.1217 55.02983 45.2239 56.82916 44.87603 

BPM > no outliers

boxplot(sharks_data$BPM, main = "Boxplot for Outliers", 
        ylab = "Values", col = "lightblue")

weight > no outliers

boxplot(sharks_data$weight, main = "Boxplot for Outliers", 
        ylab = "Values", col = "lightblue")

length > no outliers

boxplot(sharks_data$length, main = "Boxplot for Outliers", 
        ylab = "Values", col = "lightblue")

air > no outliers

boxplot(sharks_data$air, main = "Boxplot for Outliers", 
        ylab = "Values", col = "lightblue")

water > no outliers

boxplot(sharks_data$water, main = "Boxplot for Outliers", 
        ylab = "Values", col = "lightblue")

meta

boxplot(sharks_data$meta, main = "Boxplot for Outliers", 
        ylab = "Values", col = "lightblue")

Outliers were addressed in the report

Data handling

library (dplyr) # loading dplyr package to use the group_by function

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Grouping by sex, calculating means, and counting each group

 # Grouping and summarizing
sharks_data_grouped <- sharks_data %>%
  group_by(sex) %>%
  summarize(
    mean_air_temp = mean(air, na.rm = TRUE),
    mean_water_temp = mean(water, na.rm = TRUE),
    count = n()  # Count of observations in each group
  )
print(sharks_data_grouped) # View the grouped summary
# A tibble: 2 × 4
  sex    mean_air_temp mean_water_temp count
  <chr>          <dbl>           <dbl> <int>
1 Female          35.5            23.1   236
2 Male            35.6            22.9   264

Results

  • the number of female sharks = 236
  • the number of male sharks = 264
  • The average air temperature for female sharks was 35.48716°C, and the average water temperature was 23.10948°C
  • The average air temperature for male sharks was 35.57826°C, and the average water temperature was 22.94100°C

Minimum and maximum values calculation

Tip
  • we could calculate min and max by simply rearranging the variables column order in R
  • the other way is to use coding to make it easily reproducible
# Calculate the minimum and maximum for the "BPM" variable
BPM_min <- min(sharks_data$BPM, na.rm = TRUE)
BPM_max <- max(sharks_data$BPM, na.rm = TRUE)
cat("BPM - Min:", BPM_min, ", Max:", BPM_max, "\n")
BPM - Min: 119 , Max: 166 
# Calculate the minimum and maximum for the "weight" variable
weight_min <- min(sharks_data$weight, na.rm = TRUE)
weight_max <- max(sharks_data$weight, na.rm = TRUE)
cat("weight - Min:", weight_min, ", Max:", weight_max, "\n")
weight - Min: 65.10202 , Max: 110.9447 
# Calculate the minimum and maximum for the "length" variable
length_min <- min(sharks_data$length, na.rm = TRUE)
length_max <- max(sharks_data$length, na.rm = TRUE)
cat("length - Min:", length_min, ", Max:", length_max, "\n")
length - Min: 128.2534 , Max: 290.9527 
# Calculate the minimum and maximum for the "meta" variable
meta_min <- min(sharks_data$meta, na.rm = TRUE)
meta_max <- max(sharks_data$meta, na.rm = TRUE)
cat("meta - Min:", meta_min, ", Max:", meta_max, "\n")
meta - Min: 50.026 , Max: 112.445 

Results

  • BPM: Min: 119 , Max: 166
  • weight: Min: 65.10202 , Max: 110.9447
  • length: Min: 128.2534 , Max: 290.9527
  • meta: Min: 50.026 , Max: 112.445

Means calculation

# Calculate the mean of a BPM
mean_value <- mean(sharks_data$BPM, na.rm = TRUE)
cat("Mean of BPM:", mean_value, "\n")
Mean of BPM: 141.762 
# Calculate the mean of a "weight"
mean_value <- mean(sharks_data$"weight", na.rm = TRUE)
cat("Mean of weight:", mean_value, "\n")
Mean of weight: 87.94172 
# Calculate the mean of a "length"
mean_value <- mean(sharks_data$"length", na.rm = TRUE)
cat("Mean of length:", mean_value, "\n")
Mean of length: 211.0442 
# Calculate the mean of a "meta"
mean_value <- mean(sharks_data$"meta", na.rm = TRUE)
cat("Mean of meta:", mean_value, "\n")
Mean of meta: 82.04268 
# Calculate the mean of a "air"
mean_value <- mean(sharks_data$"air", na.rm = TRUE)
cat("Mean of air:", mean_value, "\n")
Mean of air: 35.53526 
# Calculate the mean of a "water"
mean_value <- mean(sharks_data$"water", na.rm = TRUE)
cat("Mean of water:", mean_value, "\n")
Mean of water: 23.02052 

sharksub_data wrangling

Checking for missing values

# Check for missing values in the entire dataset
sum(is.na(sharksub_data))  # Total number of missing values
[1] 0
  • Sum = 0
  • There are no missing values
# Check for missing values in each column
colSums(is.na(sharksub_data))
     ID     sex blotch1 blotch2 
      0       0       0       0 
  • There are no missing values

Checking for duplicates

# Check for duplicate rows in the entire dataset
duplicates_sub <- duplicated(sharksub_data)
View (duplicates_sub)
sum(duplicates_sub)
[1] 0
  • there are no duplicates

no outliers were detected in sharksub dataset

blotch1 > no outliers

boxplot(sharksub_data$blotch1, main = "Boxplot for Outliers", 
        ylab = "Values", col = "lightblue")

blotch2 > no outliers

boxplot(sharksub_data$blotch2, main = "Boxplot for Outliers", 
        ylab = "Values", col = "lightblue")

Research question 1: Is there a correlation between the variables air and water?

  • type of variables? both continuous

My assumption: do we expect a correlation? yes, because they are often influenced by the same environmental factors and exchange heat with each other, but lets analyse them to confirm if this is true: ## Normality of data

shapiro.test(sharks_data$air)  

    Shapiro-Wilk normality test

data:  sharks_data$air
W = 0.95885, p-value = 1.338e-10
shapiro.test(sharks_data$water) # Check for normality to determine whether to use Pearson's correlation (for normally distributed data) or Spearman's correlation (for non-normally distributed data)

    Shapiro-Wilk normality test

data:  sharks_data$water
W = 0.96035, p-value = 2.371e-10
Tip

If the p-value from the Shapiro-Wilk test is < 0.05, the data is not normally distributed.

  • p-value for air = 1.338e-10 and for water = 2.371e-10 which indicates a statistically significant result. The data is not normally distributed (reject the null hypothesis of normality) > we’ll use Spearman’s correlation

testing the correlation

cor.test(sharks_data$air, sharks_data$water, method = "spearman")

    Spearman's rank correlation rho

data:  sharks_data$air and sharks_data$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.05637344 
Tip
  • rho (Spearman’s rank correlation coefficient) measures the strength and direction of the monotonic relationship between two variables
  • rho = 1 : Perfect positive monotonic relationship (as one variable increases, the other increases).
  • rho = -1: Perfect negative monotonic relationship (as one variable increases, the other decreases).
  • rho = 0 : No monotonic relationship between the variables.
  • Reject H₀: If the p-value is less than or equal to 0.05 > there is significant evidence to reject the null hypothesis in favor of the alternative hypothesis.
  • Fail to Reject H₀: If the p-value is greater than 0.05 > there is insufficient evidence to reject the null hypothesis.

Results

  • p-value = 0.2082 > more than 0.05 > not enough evidence to reject > no significant difference
  • rho = -0.05637344 > a very weak negative relationship between air and water (very weak correlation)
  • This means that as air increases, water slightly tends to decrease, but the effect is very small
  • My assumption was not correct

Conclusion

  • There is no strong or significant monotonic relationship between air temperature and water temperature.

Visualising

library(ggplot2) # Load required libraries
Warning: package 'ggplot2' was built under R version 4.4.2
ggplot(sharks_data, aes(x = air, y = water)) +
  geom_point(color = "blue") +
  ggtitle("Correlation between air and water") +
  xlab("air") +
  ylab("water") +
  theme_minimal() # Scatterplot 

Minimum and maximum values calculation for water and air

# Calculate the minimum and maximum for the "air" variable
air_min <- min(sharks_data$air, na.rm = TRUE)
air_max <- max(sharks_data$air, na.rm = TRUE)
cat("Air - Min:", air_min, ", Max:", air_max, "\n")
Air - Min: 33.00454 , Max: 37.99978 
# Calculate the minimum and maximum for the "water" variable
water_min <- min(sharks_data$water, na.rm = TRUE)
water_max <- max(sharks_data$water, na.rm = TRUE)
cat("Water - Min:", water_min, ", Max:", water_max, "\n")
Water - Min: 20.00503 , Max: 25.98523 

Results

  • Air: Min: 33.00454 , Max: 37.99978
  • Water: Min: 20.00503 , Max: 25.98523

Research question 2: Does multiple capture have an effect on blotching time?

my assumption: yes of course it does, as it causes more stress when the shark goes through a stressful event twice

Normality of data

Shapiro-wilk normality test

shapiro.test(sharksub_data$blotch1)  

    Shapiro-Wilk normality test

data:  sharksub_data$blotch1
W = 0.97958, p-value = 0.5345
shapiro.test(sharksub_data$blotch2) # Check for normality to determine which test to use 

    Shapiro-Wilk normality test

data:  sharksub_data$blotch2
W = 0.97936, p-value = 0.5255
  • Null hypothesis: data is normally distributed (no significant deviation from normality)
  • Alternative hypothesis: data is NOTnormally distributed (significant deviation)

Result:

  • sharksub_data$blotch1 p-value = 0.5345
  • sharksub_data$blotch2 p-value = 0.5255
  • p-value is > 0.05, so the data is normally distributed.
  • Since this involves comparing blotching times between two related observations (first and second capture of the same sharks), we should use a paired samples t-test.
  • Aim: Compare the means of two samples and test whether the mean difference between two sets of observations is zero.
Tip

If the p-value > 0.05, you fail to reject the null hypothesis, meaning the data does not significantly deviate from normality and can be considered normal.

Since the data is normally distributed, and we want to evaluate whether the mean difference in blotching time between the first and second captures is statistically significant > we use Paired T- test > and based on that we can tell if there is an effect or not 😊

visualizing the data

library(ggplot2) # Load required libraries
# Basic boxplot for blotching times
boxplot(sharksub_data$blotch1, sharksub_data$blotch2,
        names = c("First Capture", "Second Capture"),
        main = "Boxplot of Blotching Times",
        xlab = "Capture Event",
        ylab = "Blotching Time (seconds)",
        col = c("lightblue", "lightgreen"))

from looking at the boxplot i can already tell that the second capture actually made the shark experience prolonged blotching time > which aligns with my assumption

another way of visualizing the data

  • To overlay individual shark data on the boxplot using ggplot2.
  • Adding lines connecting the points for each shark.
  • This visualizes the trends for individual sharks between the first and second captures.
library(tidyr)

# Reshape sharksub_data to "long format," where each row represents a single observation (blotching time) and includes columns for shark ID, capture event, and blotching time.
sharksub_long <- pivot_longer(
  sharksub_data,
  cols = c(blotch1, blotch2),
  names_to = "Capture_Event",
  values_to = "Blotching_Time"
)

# Rename capture events for better readability
sharksub_long$Capture_Event <- factor(
  sharksub_long$Capture_Event,
  levels = c("blotch1", "blotch2"),
  labels = c("First Capture", "Second Capture")
)
library(ggplot2)

# Boxplot with overlayed individual data, we use ggplot2 to add both the boxplot and the individual lines connecting data points for each shark.
ggplot(sharksub_long, aes(x = Capture_Event, y = Blotching_Time)) +
  geom_boxplot(aes(fill = Capture_Event), alpha = 0.6) +  # Boxplot
  geom_line(aes(group = ID), color = "gray", alpha = 0.5) +  # Lines for individual sharks
  geom_point(aes(color = ID), size = 2) +  # Points for individual blotching times
  labs(
    title = "Blotching Time by Capture Event",
    x = "Capture Event",
    y = "Blotching Time (seconds)"
  ) +
  scale_fill_manual(values = c("lightblue", "lightgreen")) +  # Box colors
  theme_minimal() +
  theme(legend.position = "none")  # Hides the legend for individual sharks

Interpretation This plot shows a boxplots that Summarises statistics for each capture event (median, quartiles, outliers)

  • Lines: How blotching times changed for each individual shark between captures.
  • Points: Actual blotching times for each shark.

Paired T-test:

  • H0 = mean difference between blotching time 1 and 2 is zero
# Perform a paired t-test
t_test_result <- t.test(sharksub_data$blotch1, sharksub_data$blotch2, paired = TRUE)

# Print the result
print(t_test_result)

    Paired t-test

data:  sharksub_data$blotch1 and sharksub_data$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 -1.037176 -0.822301
sample estimates:
mean difference 
     -0.9297384 
# Perform a paired t-test
t_test_result <- t.test(sharksub_data$blotch1, sharksub_data$blotch2, paired = TRUE, mu=0, conf.level = 0.99)

# Print the result
print(t_test_result)

    Paired t-test

data:  sharksub_data$blotch1 and sharksub_data$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
99 percent confidence interval:
 -1.0730161 -0.7864608
sample estimates:
mean difference 
     -0.9297384 

Results

  • alternative hypothesis: true mean difference is not equal to 0 > so yes multiple captures affect blotching time
  • t = -17.39
  • df = 49
  • p-value < 2.2e-16
  • 99 percent confidence interval: -1.0730161 -0.7864608
  • sample estimates: mean difference -0.9297384

If p-value < 0.05, there is a statistically significant difference in blotching times between the two captures. (p-value < 2.2e-16 so yes)

Research question 3: Is it possible to predict blotching time?

i think the way to approach this question is to see what the relationships are between “blotch” and each variable in sharks_data, then based on that we decide whether its predictable or not.

  • using chapter 5 from the module: Choosing the right analysis
  • Steps:
  1. Identify the nature of variables
  2. Foresee the types of analyses
  3. Draw graphics of predicted results

Identify the nature of variables

  • Sex: categorical (binary)
  • Blotch: Continuous
  • BPM: Continuous
  • weight: Continuous
  • length: Continuous
  • air: Continuous
  • water: Continuous
  • meta: Continuous
  • depth: Continuous
str (sharks_data) # checking variables
tibble [500 × 10] (S3: tbl_df/tbl/data.frame)
 $ ID    : chr [1:500] "SH001" "SH002" "SH003" "SH004" ...
 $ sex   : chr [1:500] "Female" "Female" "Female" "Male" ...
 $ blotch: num [1:500] 37.2 34.5 36.3 35.3 37.4 ...
 $ BPM   : num [1:500] 148 158 125 161 138 126 166 135 132 127 ...
 $ weight: num [1:500] 74.7 73.4 71.8 104.6 67.1 ...
 $ length: num [1:500] 187 189 284 171 264 ...
 $ air   : num [1:500] 37.7 35.7 34.8 36.2 33.6 ...
 $ water : num [1:500] 23.4 21.4 20.1 21.6 21.8 ...
 $ meta  : num [1:500] 64.1 73.7 54.4 86.3 108 ...
 $ depth : num [1:500] 53.2 49.6 49.4 50.3 49 ...

Finding associations between “blotch” and the rest of the variables

  1. First, visualization (graphs , models)
  2. Then, correlations
  3. Then, tests

Visualization

Sex and blotch scatterplot

library(ggplot2)
ggplot(sharks_data, aes(x = sex, y = blotch)) +
  geom_point(color = "blue") +
  ggtitle("Correlation between sex and blotch") +
  xlab("sex") +
  ylab("blotch") +
  theme_minimal() # Scatter plot 

Tip

correlation doesn’t imply causation!!!

ANOVA: to test whether there are significant differences in blotch time across different groups of a categorical variable (sex)

# Fit the ANOVA model
anova_model <- aov(blotch ~ sex, data = sharks_data)

# Summary of the ANOVA model
summary(anova_model)
             Df Sum Sq Mean Sq F value  Pr(>F)   
sex           1   18.3  18.323   9.139 0.00263 **
Residuals   498  998.5   2.005                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ANOVA doesn’t answer the question directly. Its just testing the relationship between “blotch” and “sex”

Results:

  • Males had longer blotching time (interesting) > so yes there is a relationship
boxplot(sharks_data$blotch ~ sharks_data$sex, main = "Boxplot of Blotch by sex", 
        ylab = "Values", col = "lightblue") #boxplot

outliers present (interesting)

Testing correlation between “blotch” and other variables

BPM

# scatter plot
plot(sharks_data$BPM, sharks_data$blotch,
     xlab = "BPM", ylab = "Blotch Time", main = "Scatter Plot")

library(ggplot2)
ggplot(sharks_data, aes(x = BPM, y = blotch)) +
  geom_point(color = "blue") +
  ggtitle("Correlation between BPM and blotch") +
  xlab("BPM") +
  ylab("blotch") +
  theme_minimal() # Scatter plot 

library(ggplot2)

# Visualizing the regression lines for males and females
ggplot(sharks_data, aes(x = BPM, y = blotch, color = sex)) +
  geom_point(alpha = 0.6) +  # Data points
  geom_smooth(method = "lm", aes(group = sex), se = FALSE) +  # Separate regression lines for males and females
  labs(title = "Regression of Blotch Time on BPM by Sex",
       x = "BPM", y = "Blotch Time",
       color = "Sex") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

library(ggplot2)
# Visualizing the regression lines for males and females
ggplot(sharks_data, aes(x = weight, y = blotch, color = sex)) +
  geom_point(alpha = 0.6) +  # Data points
  geom_smooth(method = "lm", aes(group = sex), se = FALSE) +  # Separate regression lines for males and females
  labs(title = "Regression of Blotch Time on weight by Sex",
       x = "weight", y = "Blotch Time",
       color = "Sex") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

no correlation between blotch and BPM

weight

# scatter plot
plot(sharks_data$weight, sharks_data$blotch,
     xlab = "weight", ylab = "Blotch Time", main = "Scatter Plot")

library(ggplot2)
ggplot(sharks_data, aes(x = weight, y = blotch)) +
  geom_point(color = "blue") +
  ggtitle("Correlation between weight and blotch") +
  xlab("weight") +
  ylab("blotch") +
  theme_minimal() # Scatter plot 

no correlation between blotch and weight

length

# scatter plot
plot(sharks_data$length, sharks_data$blotch,
     xlab = "length", ylab = "Blotch Time", main = "Scatter Plot")

library(ggplot2)
ggplot(sharks_data, aes(x = length, y = blotch)) +
  geom_point(color = "blue") +
  ggtitle("Correlation between length and blotch") +
  xlab("length") +
  ylab("blotch") +
  theme_minimal() # Scatter plot 

library(ggplot2)
# Visualizing the regression lines for males and females
ggplot(sharks_data, aes(x = length, y = blotch, color = sex)) +
  geom_point(alpha = 0.6) +  # Data points
  geom_smooth(method = "lm", aes(group = sex), se = FALSE) +  # Separate regression lines for males and females
  labs(title = "Regression of Blotch Time on length by Sex",
       x = "length", y = "Blotch Time",
       color = "Sex") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

no correlation between blotch and length

air

# scatter plot
plot(sharks_data$air, sharks_data$blotch,
     xlab = "air", ylab = "Blotch Time", main = "Scatter Plot")

library(ggplot2)
ggplot(sharks_data, aes(x = air, y = blotch)) +
  geom_point(color = "blue") +
  ggtitle("Correlation between air and blotch") +
  xlab("air") +
  ylab("blotch") +
  theme_minimal() # Scatter plot 

library(ggplot2)
# Visualizing the regression lines for males and females
ggplot(sharks_data, aes(x = air, y = blotch, color = sex)) +
  geom_point(alpha = 0.6) +  # Data points
  geom_smooth(method = "lm", aes(group = sex), se = FALSE) +  # Separate regression lines for males and females
  labs(title = "Regression of Blotch Time on air by Sex",
       x = "air", y = "Blotch Time",
       color = "Sex") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

no correlation between blotch and air

water

# scatter plot
plot(sharks_data$water, sharks_data$blotch,
     xlab = "water", ylab = "Blotch Time", main = "Scatter Plot")

library(ggplot2)
ggplot(sharks_data, aes(x = water, y = blotch)) +
  geom_point(color = "blue") +
  ggtitle("Correlation between water and blotch") +
  xlab("water") +
  ylab("blotch") +
  theme_minimal() # Scatter plot 

library(ggplot2)
# Visualizing the regression lines for males and females
ggplot(sharks_data, aes(x = water, y = blotch, color = sex)) +
  geom_point(alpha = 0.6) +  # Data points
  geom_smooth(method = "lm", aes(group = sex), se = FALSE) +  # Separate regression lines for males and females
  labs(title = "Regression of Blotch Time on water by Sex",
       x = "water", y = "Blotch Time",
       color = "Sex") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

no correlation between blotch and water

meta

# scatter plot
plot(sharks_data$meta, sharks_data$blotch,
     xlab = "meta", ylab = "Blotch Time", main = "Scatter Plot")

library(ggplot2)
ggplot(sharks_data, aes(x = meta, y = blotch)) +
  geom_point(color = "blue") +
  ggtitle("Correlation between meta and blotch") +
  xlab("meta") +
  ylab("blotch") +
  theme_minimal() # Scatter plot 

library(ggplot2)
# Visualizing the regression lines for males and females
ggplot(sharks_data, aes(x = meta, y = blotch, color = sex)) +
  geom_point(alpha = 0.6) +  # Data points
  geom_smooth(method = "lm", aes(group = sex), se = FALSE) +  # Separate regression lines for males and females
  labs(title = "Regression of Blotch Time on meta by Sex",
       x = "meta", y = "Blotch Time",
       color = "Sex") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

no correlation between blotch and meta

depth

# scatter plot
plot(sharks_data$depth, sharks_data$blotch,
     xlab = "depth", ylab = "Blotch Time", main = "Scatter Plot")

library(ggplot2)
ggplot(sharks_data, aes(x = depth, y = blotch)) +
  geom_point(color = "blue") +
  ggtitle("Correlation between depth and blotch") +
  xlab("depth") +
  ylab("blotch") +
  theme_minimal() # Scatter plot (upgraded)

library(ggplot2)
# Visualizing the regression lines for males and females
ggplot(sharks_data, aes(x = depth, y = blotch, color = sex)) +
  geom_point(alpha = 0.6) +  # Data points
  geom_smooth(method = "lm", aes(group = sex), se = FALSE) +  # Separate regression lines for males and females
  labs(title = "Regression of Blotch Time on depth by Sex",
       x = "depth", y = "Blotch Time",
       color = "Sex") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

i see positive upward lining! > there is correlation between blotch and depth

now we test the causation

Foresee the types of analyses

modelling: linear regression for depth and blotch

Tip
  • linear regression equation [ y = a + bx ]
  • y = output/response/trying to predict variable (blotch)
  • a = fixed/intercept/constant number (where the line crosses the y axis)
  • b = slope of the line
  • x = input/independent/using to predict variable (BPM, length, etc..)
# Fit the model
model <- lm(blotch ~ depth, data = sharks_data)

# View model summary
summary(model)

Call:
lm(formula = blotch ~ depth, data = sharks_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.81869 -0.65427 -0.01035  0.58825  2.83116 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.82178    1.11207   8.832   <2e-16 ***
depth        0.50467    0.02216  22.772   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared:  0.5101,    Adjusted R-squared:  0.5091 
F-statistic: 518.6 on 1 and 498 DF,  p-value: < 2.2e-16
# visualizing the model
library(ggplot2)

ggplot(sharks_data, aes(x = depth, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue", se = TRUE) +
  labs(title = "Linear Regression of Blotch vs. depth",
       x = "depth",
       y = "Blotch Time") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

based on this > we can use depth to predict blotching time

Linear regression for multiple variables

making this step as a confirmation that the other variables do not have a linear relationship with blotch

# Linear regression model
model <- lm(blotch ~ BPM + weight + length + air + water + meta + depth, data = sharks_data)

# View model summary
summary(model)

Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta + 
    depth, data = sharks_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.83745 -0.66117 -0.00702  0.60110  2.74108 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.1405851  1.8958668   5.876 7.74e-09 ***
BPM         -0.0019723  0.0031890  -0.618    0.537    
weight       0.0016283  0.0033511   0.486    0.627    
length       0.0012295  0.0009710   1.266    0.206    
air         -0.0281474  0.0318707  -0.883    0.378    
water       -0.0188934  0.0270782  -0.698    0.486    
meta        -0.0009712  0.0025951  -0.374    0.708    
depth        0.5061285  0.0223191  22.677  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.002 on 492 degrees of freedom
Multiple R-squared:  0.514, Adjusted R-squared:  0.507 
F-statistic: 74.32 on 7 and 492 DF,  p-value: < 2.2e-16
# Linear model for males
model_male <- lm(blotch ~ BPM + weight + length + air + water + meta + depth,
                 data = subset(sharks_data, sex == "Male"))

# Linear model for females
model_female <- lm(blotch ~ BPM + weight + length + air + water + meta + depth,
                   data = subset(sharks_data, sex == "Female"))

# View summaries of both models
summary(model_male)

Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta + 
    depth, data = subset(sharks_data, sex == "Male"))

Residuals:
     Min       1Q   Median       3Q      Max 
-2.94564 -0.68003 -0.06042  0.57852  2.58234 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.1023247  2.7496753   3.674 0.000291 ***
BPM         -0.0001076  0.0044128  -0.024 0.980559    
weight       0.0029343  0.0048273   0.608 0.543823    
length       0.0007728  0.0014039   0.550 0.582487    
air          0.0170450  0.0443967   0.384 0.701353    
water       -0.0440785  0.0394629  -1.117 0.265058    
meta        -0.0027789  0.0036107  -0.770 0.442236    
depth        0.5065037  0.0318862  15.885  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.023 on 256 degrees of freedom
Multiple R-squared:  0.5057,    Adjusted R-squared:  0.4922 
F-statistic: 37.42 on 7 and 256 DF,  p-value: < 2.2e-16
summary(model_female)

Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta + 
    depth, data = subset(sharks_data, sex == "Female"))

Residuals:
     Min       1Q   Median       3Q      Max 
-2.42269 -0.61408  0.01388  0.70091  2.74980 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 12.707430   2.581993   4.922 1.65e-06 ***
BPM         -0.004454   0.004592  -0.970   0.3332    
weight       0.001167   0.004617   0.253   0.8006    
length       0.001566   0.001328   1.179   0.2395    
air         -0.082418   0.045323  -1.818   0.0703 .  
water        0.011057   0.036922   0.299   0.7649    
meta         0.000895   0.003699   0.242   0.8090    
depth        0.499601   0.030827  16.207  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9593 on 228 degrees of freedom
Multiple R-squared:  0.5399,    Adjusted R-squared:  0.5258 
F-statistic: 38.23 on 7 and 228 DF,  p-value: < 2.2e-16

visualise these models

using base R

# Define predictor variables
predictors <- c("BPM", "weight", "length", "air", "water", "meta", "depth")

# Create scatter plots with regression lines for all predictors
par(mfrow = c(3, 3))  # Arrange plots in a 3x3 grid
for (var in predictors) {
  plot(sharks_data[[var]], sharks_data$blotch,
       main = paste("Blotch vs", var),
       xlab = var, ylab = "Blotch", pch = 16, col = "blue")
  abline(lm(sharks_data$blotch ~ sharks_data[[var]]), col = "red", lwd = 2)  # Add regression line
}
par(mfrow = c(1, 1))  # Reset plotting layout

using ggplot2

# Install ggplot2 if not already installed
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Reshape the data for ggplot2
library(tidyr)
long_data <- sharks_data %>%
  pivot_longer(cols = c(BPM, weight, length, air, water, meta, depth),
               names_to = "Variable", values_to = "Value")

# Create scatter plots with regression lines
ggplot(long_data, aes(x = Value, y = blotch)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  facet_wrap(~Variable, scales = "free_x") +
  labs(title = "Linear Regression for All Variables", x = "Predictor", y = "Blotch") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

this is really cool! 😎👍

Predictive modelling (before splitting)

linear modelling depth and blotch

plot(sharks_data$depth, sharks_data$blotch,
     xlab = "Depth",
     ylab = "Blotching Time",
     main = "Scatterplot of Depth vs Blotching Time")
abline(lm(blotch ~ depth, data = sharks_data), col = "red")  # Adds a regression line

we can tell there is a linear regression = assumption = linear relationship

The result confirms a significant and strong positive correlation.

This suggests it’s appropriate to proceed with linear regression modeling, assuming other assumptions (e.g., linearity, homoscedasticity, normality of residuals) hold. (I test them later)

Results interpretation (before splitting)

  1. Depth is a significant predictor of blotching time

    p<0.05p < 0.05

  2. The relationship is positive: as depth increases, blotching time increases.

  3. The model explains a substantial portion of the variability in blotching time ().

    R2=51.01%R^2 = 51.01%

  4. The model is statistically significant overall.

shapiro.test(sharks_data$depth)

    Shapiro-Wilk normality test

data:  sharks_data$depth
W = 0.99746, p-value = 0.6485
  • Null hypothesis: data is normally distributed (no significant deviation from normality)
  • Alternative hypothesis: data is NOTnormally distributed (significant deviation)
Tip

If the p-value from the Shapiro-Wilk test is < 0.05, the data is not normally distributed.

  • then the data is normal (depth)

Linear regression model

lm_model <- lm(blotch ~ depth, data = sharks_data)
summary(lm_model)

Call:
lm(formula = blotch ~ depth, data = sharks_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.81869 -0.65427 -0.01035  0.58825  2.83116 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.82178    1.11207   8.832   <2e-16 ***
depth        0.50467    0.02216  22.772   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared:  0.5101,    Adjusted R-squared:  0.5091 
F-statistic: 518.6 on 1 and 498 DF,  p-value: < 2.2e-16

Predictive modelling (after splitting)

Depth variable

  • ok we`ll work on depth since it looks like its the only linear one so far

Steps of predictive modelling:

  1. Split the Data into Training and Testing Sets
  2. Visualize the relationship between blotch and depth
  3. Check linearity
  4. Linear regression model
  5. Assess assumptions not yet..
  6. Evaluate model performance
  7. Make predictions
  8. Interpret results

Splitting the data

set.seed(123)  # For reproducibility
sample_index <- sample(1:nrow(sharks_data), size = 0.7 * nrow(sharks_data))  # 70% for training
train_data <- sharks_data[sample_index, ]
test_data <- sharks_data[-sample_index, ]
  • Confirming the Split
nrow(train_data)  # Number of rows in the training dataset
[1] 350
nrow(test_data)   # Number of rows in the testing dataset
[1] 150
nrow(train_data) + nrow(test_data) == nrow(sharks_data)  # Should return TRUE
[1] TRUE
  • View the Training Data
View(train_data)  # Opens the training dataset in the data viewer
head(train_data)  # Displays the first 6 rows of the training dataset
# A tibble: 6 × 10
  ID    sex    blotch   BPM weight length   air water  meta depth
  <chr> <chr>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH415 Female   34.2   158   87.7   161.  33.5  23.1  63.6  49.5
2 SH463 Female   35.1   138  103.    170.  36.6  24.8 102.   51.4
3 SH179 Female   36.1   166  106.    240.  35.2  23.8  65.0  51.2
4 SH014 Male     39.8   121   90.9   193.  34.4  23.4  99.8  55.3
5 SH195 Female   34.4   160  102.    278.  33.1  25.0 103.   49.2
6 SH426 Male     34.5   155  107.    224.  35.7  22.3  95.7  46.7
dim(train_data)   # Shows the dimensions (rows and columns) of the training dataset
[1] 350  10
  • View the Testing Data
View(test_data)  # Opens the testing dataset in the data viewer
head(test_data)  # Displays the first 6 rows of the testing dataset
# A tibble: 6 × 10
  ID    sex    blotch   BPM weight length   air water  meta depth
  <chr> <chr>   <dbl> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
1 SH001 Female   37.2   148   74.7   187.  37.7  23.4  64.1  53.2
2 SH003 Female   36.3   125   71.8   284.  34.8  20.1  54.4  49.4
3 SH006 Male     33.5   126  110.    270.  36.4  20.9 109.   46.8
4 SH008 Female   36.3   135  101.    128.  36.8  21.3  96.3  50.6
5 SH009 Male     35.4   132   95.0   269.  35.3  22.2  79.0  49.9
6 SH012 Female   36.2   131   70.0   235.  33.7  20.2  62.0  50.6
dim(test_data)   # Shows the dimensions (rows and columns) of the testing dataset
[1] 150  10
  • Inspect Specific Variables
  • If you want to view specific variables like depth or blotch in the training or testing datasets:
# Training Data
summary(train_data$depth)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  44.64   48.89   50.10   50.11   51.35   56.83 
summary(train_data$blotch)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  30.78   34.25   35.01   35.12   35.99   40.08 
# Testing Data
summary(test_data$depth)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  45.22   48.93   50.30   50.20   51.31   55.12 
summary(test_data$blotch)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  31.36   34.03   35.27   35.13   36.25   38.66 

Visualize the relationship between blotch and depth

  • Use only the training data for visualization and model fitting:
plot(train_data$depth, train_data$blotch,
     xlab = "Depth",
     ylab = "Blotching Time",
     main = "Scatterplot of Depth vs Blotching Time")
abline(lm(blotch ~ depth, data = train_data), col = "red")

Check linearity

  • normality and histograms and Q-Q
# Histogram for depth and blotch
hist(train_data$depth, main = "Histogram of Depth", xlab = "Depth", col = "skyblue")

hist(train_data$blotch, main = "Histogram of Blotch", xlab = "Blotch Time", col = "skyblue")

# Q-Q plot for depth and blotch
qqnorm(train_data$depth, main = "Q-Q Plot of Depth")
qqline(train_data$depth)

qqnorm(train_data$blotch, main = "Q-Q Plot of Blotch")
qqline(train_data$blotch)

# Shapiro-Wilk test for normality
shapiro.test(train_data$depth)  # Null hypothesis: Normal distribution

    Shapiro-Wilk normality test

data:  train_data$depth
W = 0.99715, p-value = 0.8034
shapiro.test(train_data$blotch)

    Shapiro-Wilk normality test

data:  train_data$blotch
W = 0.9927, p-value = 0.08557
  • data is normally distributed.
Tip

If the p-value from the Shapiro-Wilk test is < 0.05, the data is not normally distributed.

  • Homoscedasticity

Homoscedasticity means the variance of residuals is constant across all fitted values. Fit a simple linear regression model first:

model <- lm(blotch ~ depth, data = train_data)

# Residuals vs. Fitted plot to check homoscedasticity
plot(model, which = 1, main = "Residuals vs Fitted")

# Breusch-Pagan test for heteroscedasticity (optional, requires 'lmtest' package)
library(lmtest)
Loading required package: zoo

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
bptest(model)  # Null hypothesis: Homoscedasticity

    studentized Breusch-Pagan test

data:  model
BP = 2.5623, df = 1, p-value = 0.1094
  • normality of residuals
# Extract residuals
residuals <- resid(model)

# Q-Q plot for residuals
qqnorm(residuals, main = "Q-Q Plot of Residuals")
qqline(residuals)

# Shapiro-Wilk test for residuals
shapiro.test(residuals)

    Shapiro-Wilk normality test

data:  residuals
W = 0.99375, p-value = 0.158
# Histogram of residuals
hist(residuals, main = "Histogram of Residuals", xlab = "Residuals", col = "skyblue")

Residuals are normally distributed We test the normality of the residuals rather than the variables(better)

Interpreting Results

  • Normality: If the Shapiro-Wilk test 𝑝p-value is > 0.05, the variable or residuals are normally distributed. Q-Q plots should have points close to the diagonal line.

  • Homoscedasticity: The residuals vs. fitted plot should show no clear patterns (e.g., no funnel shape).For the Breusch-Pagan test, 𝑝 p-value > 0.05 indicates homoscedasticity. (p-value = 0.1094)

  • Normality of Residuals: Residuals should follow a normal distribution, as confirmed by Q-Q plots, Shapiro-Wilk test, and histograms. (p-value = 0.158)

Linear regression model

  • Fit the model using the training dataset:
lm_model <- lm(blotch ~ depth, data = train_data)
summary(lm_model)

Call:
lm(formula = blotch ~ depth, data = train_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.83179 -0.64675 -0.06487  0.58677  2.81225 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.11171    1.29946   7.781 8.25e-14 ***
depth        0.49915    0.02591  19.264  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9863 on 348 degrees of freedom
Multiple R-squared:  0.5161,    Adjusted R-squared:  0.5147 
F-statistic: 371.1 on 1 and 348 DF,  p-value: < 2.2e-16

Assess assumptions

Evaluate model performance

  • Use the testing dataset to evaluate the model’s predictive performance:
predictions <- predict(lm_model, newdata = test_data)
# Compare predicted vs. actual values
actual_vs_predicted <- data.frame(Actual = test_data$blotch, Predicted = predictions)
print(head(actual_vs_predicted))
    Actual Predicted
1 37.17081  36.67942
2 36.32861  34.78976
3 33.54668  33.49244
4 36.29497  35.36674
5 35.40344  34.99610
6 36.23767  35.36759

Assess Model Performance:

Calculate metrics to assess how well your model predicts blotching time for the test dataset:

# Mean Squared Error (MSE)
mse <- mean((test_data$blotch - predictions)^2)
print(paste("Mean Squared Error:", mse))
[1] "Mean Squared Error: 1.06487810493847"
# R-squared for test data
sst <- sum((test_data$blotch - mean(test_data$blotch))^2)
ssr <- sum((test_data$blotch - predictions)^2)
r2 <- 1 - (ssr / sst)
print(paste("R-squared:", r2))
[1] "R-squared: 0.496536150102437"

Make predictions

Draw graphics of predicted results

  • Fit the Linear Model Using the Training Data: Use the training data to build the regression model:
lm_model <- lm(blotch ~ depth, data = train_data)
summary(lm_model)  # View the model summary

Call:
lm(formula = blotch ~ depth, data = train_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.83179 -0.64675 -0.06487  0.58677  2.81225 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.11171    1.29946   7.781 8.25e-14 ***
depth        0.49915    0.02591  19.264  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9863 on 348 degrees of freedom
Multiple R-squared:  0.5161,    Adjusted R-squared:  0.5147 
F-statistic: 371.1 on 1 and 348 DF,  p-value: < 2.2e-16
  • Make Predictions on the Test Data: Use the model to predict blotch values for the test dataset:
predictions <- predict(lm_model, newdata = test_data)
  • View the Predicted Values: Combine the actual and predicted values to compare them:
results <- data.frame(Actual = test_data$blotch, Predicted = predictions)
head(results)  # View the first few rows
    Actual Predicted
1 37.17081  36.67942
2 36.32861  34.78976
3 33.54668  33.49244
4 36.29497  35.36674
5 35.40344  34.99610
6 36.23767  35.36759
  • Evaluate Model Performance Assess how well your model predicts using metrics like Mean Squared Error (MSE) and R2R^2:
# Mean Squared Error (MSE)
mse <- mean((test_data$blotch - predictions)^2)
print(paste("Mean Squared Error:", mse))
[1] "Mean Squared Error: 1.06487810493847"
# R-squared
sst <- sum((test_data$blotch - mean(test_data$blotch))^2)  # Total sum of squares
ssr <- sum((test_data$blotch - predictions)^2)  # Residual sum of squares
r2 <- 1 - (ssr / sst)
print(paste("R-squared:", r2))
[1] "R-squared: 0.496536150102437"
  • Previous code adjusted to calculate adjusted R-squared and mean absolute error (MAE) as well
# Mean Squared Error (MSE)
mse <- mean((test_data$blotch - predictions)^2)
print(paste("Mean Squared Error (MSE):", mse))
[1] "Mean Squared Error (MSE): 1.06487810493847"
# Mean Absolute Error (MAE)
mae <- mean(abs(test_data$blotch - predictions))
print(paste("Mean Absolute Error (MAE):", mae))
[1] "Mean Absolute Error (MAE): 0.83339546490311"
# R-squared for test data
sst <- sum((test_data$blotch - mean(test_data$blotch))^2)
ssr <- sum((test_data$blotch - predictions)^2)
r2 <- 1 - (ssr / sst)
print(paste("R-squared:", r2))
[1] "R-squared: 0.496536150102437"
# Adjusted R-squared for test data
n <- nrow(test_data) # Number of observations
p <- length(coef(model)) - 1 # Number of predictors (excluding intercept)
adj_r2 <- 1 - ((1 - r2) * (n - 1) / (n - p - 1))
print(paste("Adjusted R-squared:", adj_r2))
[1] "Adjusted R-squared: 0.493134367332859"
  • Optional: Visualize Predictions: Create a scatterplot of Actual vs. Predicted values to visually assess the model’s performance:
plot(test_data$blotch, predictions,
     xlab = "Actual Blotch Values",
     ylab = "Predicted Blotch Values",
     main = "Actual vs. Predicted Blotch Values")
abline(0, 1, col = "red")  # Add a diagonal reference line

Summary of Predictions

  • Predicted Values: These are the blotching times the model estimates based on depth in the test dataset.
  • Evaluation Metrics: Use MSE and to quantify how close the predicted values are to the actual values. (R2R^2) # how do i confirm my prediction model is good?
  • Goodness of fit

Goodness of fit for this model

summary(lm_model)  # Provides R-squared and adjusted R-squared

Call:
lm(formula = blotch ~ depth, data = train_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.83179 -0.64675 -0.06487  0.58677  2.81225 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.11171    1.29946   7.781 8.25e-14 ***
depth        0.49915    0.02591  19.264  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9863 on 348 degrees of freedom
Multiple R-squared:  0.5161,    Adjusted R-squared:  0.5147 
F-statistic: 371.1 on 1 and 348 DF,  p-value: < 2.2e-16

multiple linear regression for other variables

# Fit the model for all data
model_all <- lm(blotch ~ BPM + weight + length + air + water + meta + depth, data = sharks_data)

# Summary of the model
summary(model_all)

Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta + 
    depth, data = sharks_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.83745 -0.66117 -0.00702  0.60110  2.74108 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 11.1405851  1.8958668   5.876 7.74e-09 ***
BPM         -0.0019723  0.0031890  -0.618    0.537    
weight       0.0016283  0.0033511   0.486    0.627    
length       0.0012295  0.0009710   1.266    0.206    
air         -0.0281474  0.0318707  -0.883    0.378    
water       -0.0188934  0.0270782  -0.698    0.486    
meta        -0.0009712  0.0025951  -0.374    0.708    
depth        0.5061285  0.0223191  22.677  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.002 on 492 degrees of freedom
Multiple R-squared:  0.514, Adjusted R-squared:  0.507 
F-statistic: 74.32 on 7 and 492 DF,  p-value: < 2.2e-16
# Diagnostics: Check residuals
plot(model_all, which = 1)  # Residuals vs Fitted

plot(model_all, which = 2)  # Q-Q plot for normality

group by sex and do it again

# Subset data by sex
data_male <- subset(sharks_data, sex == "Male")
data_female <- subset(sharks_data, sex == "Female")

# Fit models for each group
model_male <- lm(blotch ~ BPM + weight + length + air + water + meta + depth, data = data_male)
model_female <- lm(blotch ~ BPM + weight + length + air + water + meta + depth, data = data_female)

# Summaries for each model
summary(model_male)

Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta + 
    depth, data = data_male)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.94564 -0.68003 -0.06042  0.57852  2.58234 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 10.1023247  2.7496753   3.674 0.000291 ***
BPM         -0.0001076  0.0044128  -0.024 0.980559    
weight       0.0029343  0.0048273   0.608 0.543823    
length       0.0007728  0.0014039   0.550 0.582487    
air          0.0170450  0.0443967   0.384 0.701353    
water       -0.0440785  0.0394629  -1.117 0.265058    
meta        -0.0027789  0.0036107  -0.770 0.442236    
depth        0.5065037  0.0318862  15.885  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.023 on 256 degrees of freedom
Multiple R-squared:  0.5057,    Adjusted R-squared:  0.4922 
F-statistic: 37.42 on 7 and 256 DF,  p-value: < 2.2e-16
summary(model_female)

Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta + 
    depth, data = data_female)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.42269 -0.61408  0.01388  0.70091  2.74980 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 12.707430   2.581993   4.922 1.65e-06 ***
BPM         -0.004454   0.004592  -0.970   0.3332    
weight       0.001167   0.004617   0.253   0.8006    
length       0.001566   0.001328   1.179   0.2395    
air         -0.082418   0.045323  -1.818   0.0703 .  
water        0.011057   0.036922   0.299   0.7649    
meta         0.000895   0.003699   0.242   0.8090    
depth        0.499601   0.030827  16.207  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.9593 on 228 degrees of freedom
Multiple R-squared:  0.5399,    Adjusted R-squared:  0.5258 
F-statistic: 38.23 on 7 and 228 DF,  p-value: < 2.2e-16

Interpret results

visualise

install.packages("dotwhisker")
Installing package into 'C:/Users/renad/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)
package 'dotwhisker' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\renad\AppData\Local\Temp\RtmpOA5AWT\downloaded_packages
library(dotwhisker)
Warning: package 'dotwhisker' was built under R version 4.4.2
# Create a coefficient plot
dwplot(list(Male = model_male, Female = model_female)) +
  theme_minimal() +
  labs(title = "Comparison of Coefficients by Sex",
       x = "Coefficient Estimate",
       y = "Predictors") +
  theme(legend.position = "bottom")

accuracy level of the model

predicted_values <- predict(model, newdata = test_data)
actual_values <- test_data$blotch
r_squared_test <- cor(predicted_values, actual_values)^2
print(r_squared_test)
[1] 0.4981283

Visualize the relationship between depth and blotch, with separate regression lines for males and females:

library(ggplot2)

# Example: Depth vs. Blotch by Sex
ggplot(sharks_data, aes(x = depth, y = blotch, color = sex)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() +
  labs(title = "Depth vs. Blotch by Sex",
       x = "Depth (m)",
       y = "Blotch Time (s)")
`geom_smooth()` using formula = 'y ~ x'

comparing models

# Compare R-squared and Adjusted R-squared
cat("Male Sharks: R-squared:", summary(model_male)$r.squared, "\n")
Male Sharks: R-squared: 0.5057271 
cat("Female Sharks: R-squared:", summary(model_female)$r.squared, "\n")
Female Sharks: R-squared: 0.5399398 
# Compare coefficients visually (optional)
library(dotwhisker)
dwplot(list(Male = model_male, Female = model_female))

the end :)