Main Asssessment

Load libraries

library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.4.2
Warning: package 'ggplot2' was built under R version 4.4.2
Warning: package 'tibble' was built under R version 4.4.2
Warning: package 'tidyr' was built under R version 4.4.2
Warning: package 'readr' was built under R version 4.4.2
Warning: package 'purrr' was built under R version 4.4.2
Warning: package 'dplyr' was built under R version 4.4.2
Warning: package 'stringr' was built under R version 4.4.2
Warning: package 'forcats' was built under R version 4.4.2
Warning: package 'lubridate' was built under R version 4.4.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
Warning: package 'readxl' was built under R version 4.4.2

Step 1: Load Data

Set file path

equine_path <- "C:\\Users\\LENOVO\\Desktop\\equine.xlsx"
eqsub_path <- "C:\\Users\\LENOVO\\Desktop\\eqsub.xlsx"

Load the data

equine_data <- read_excel(equine_path)
eqsub_data <- read_excel(eqsub_path)

Structure and summary of the data

str(equine_data)
tibble [498 × 8] (S3: tbl_df/tbl/data.frame)
 $ ID      : chr [1:498] "Eq001" "Eq002" "Eq003" "Eq004" ...
 $ sex     : chr [1:498] "Female" "Female" "Female" "Male" ...
 $ comp    : num [1:498] 37.2 34.5 36.3 35.3 37.4 ...
 $ weight  : num [1:498] 74.7 73.4 71.8 104.6 67.1 ...
 $ irt     : num [1:498] 37.7 35.7 34.8 36.2 33.6 ...
 $ air     : num [1:498] 23.4 21.4 20.1 21.6 21.8 ...
 $ cortisol: num [1:498] 64.1 73.7 54.4 86.3 108 ...
 $ BPM     : num [1:498] 153 150 149 150 149 ...
summary(equine_data)
      ID                sex                 comp           weight      
 Length:498         Length:498         Min.   :30.78   Min.   : 65.10  
 Class :character   Class :character   1st Qu.:34.16   1st Qu.: 75.67  
 Mode  :character   Mode  :character   Median :35.04   Median : 87.82  
                                       Mean   :35.12   Mean   : 87.93  
                                       3rd Qu.:36.05   3rd Qu.:100.34  
                                       Max.   :40.08   Max.   :110.94  
      irt             air           cortisol           BPM       
 Min.   :33.00   Min.   :20.01   Min.   : 50.03   Min.   :144.6  
 1st Qu.:34.42   1st Qu.:21.54   1st Qu.: 67.27   1st Qu.:148.9  
 Median :35.43   Median :23.11   Median : 82.43   Median :150.1  
 Mean   :35.54   Mean   :23.02   Mean   : 82.00   Mean   :150.1  
 3rd Qu.:36.71   3rd Qu.:24.35   3rd Qu.: 95.96   3rd Qu.:151.3  
 Max.   :38.00   Max.   :25.99   Max.   :112.45   Max.   :156.8  
str(eqsub_data)
tibble [498 × 4] (S3: tbl_df/tbl/data.frame)
 $ ID   : chr [1:498] "EQ001" "EQ002" "EQ003" "EQ004" ...
 $ sex  : chr [1:498] "Female" "Female" "Female" "Male" ...
 $ comp : num [1:498] 37.2 34.5 36.3 35.3 37.4 ...
 $ comp2: num [1:498] 31.5 29.6 30.8 30.1 31.7 ...
summary(eqsub_data)
      ID                sex                 comp           comp2      
 Length:498         Length:498         Min.   :30.78   Min.   :27.37  
 Class :character   Class :character   1st Qu.:34.16   1st Qu.:29.23  
 Mode  :character   Mode  :character   Median :35.04   Median :29.99  
                                       Mean   :35.12   Mean   :29.94  
                                       3rd Qu.:36.05   3rd Qu.:30.65  
                                       Max.   :40.08   Max.   :32.81  

Check for missing values

colSums(is.na(equine_data))
      ID      sex     comp   weight      irt      air cortisol      BPM 
       0        0        0        0        0        0        0        0 
colSums(is.na(eqsub_data))
   ID   sex  comp comp2 
    0     0     0     0 

Exploratory Data Analysis

Visualizations for equine_data

hist(equine_data$comp, 
     main = "Distribution of Compliance Time", 
     xlab = "Compliance Time (Seconds)", 
     col = "lightblue", 
     border = "black")

Boxplot for compliance time by sex

ggplot(equine_data, aes(x = sex, y = comp, fill = sex)) +
  geom_boxplot() +
  labs(title = "Compliance Time by Sex", 
       x = "Sex", 
       y = "Compliance Time (seconds)") +
  scale_fill_manual(values = c("pink", "lightblue")) +
  theme_minimal()

Scatterplot for irt vs cortisol

ggplot(equine_data, aes(x = irt, y = cortisol)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Scatterplot: irt vs cortisol", x = "Thermographic Eye Temperature (°C)", y = "Cortisol (mcg/dl)")
`geom_smooth()` using formula = 'y ~ x'

Scatterplot for weight vs BPM

ggplot(equine_data, aes(x = weight, y = BPM)) +
  geom_point(color = "green", size = 3, shape = 19) +
  labs(title = "Scatterplot: Weight vs BPM", 
       x = "Weight (Kg)", 
       y = "BPM (Beats per Minute)") +
  theme_minimal()

Boxplot for compliance times before and after spray

Transform data for before/after comparison

eqsub_long <- eqsub_data %>%
  pivot_longer(cols = c(comp, comp2), names_to = "condition", values_to = "compliance_time") %>%
  mutate(condition = factor(condition, levels = c("comp", "comp2"), labels = c("Before Spray", "After Spray")))

ggplot(eqsub_long, aes(x = condition, y = compliance_time, fill = condition)) +
  geom_boxplot() +
  labs(title = "Compliance Times Before and After Spray", x = "Condition", y = "Compliance Time (seconds)")

Statistical Analysis

(a) Correlation between irt and cortisol (from equine_data)

Shapiro-Wilk test for normality

shapiro.test(equine_data$irt)

    Shapiro-Wilk normality test

data:  equine_data$irt
W = 0.95889, p-value = 1.447e-10
shapiro.test(equine_data$cortisol)

    Shapiro-Wilk normality test

data:  equine_data$cortisol
W = 0.96362, p-value = 9.245e-10

Perform Spearman’s correlation

spearman_result <- cor.test(equine_data$irt, equine_data$cortisol, method = "spearman")
print(spearman_result)

    Spearman's rank correlation rho

data:  equine_data$irt and equine_data$cortisol
S = 17878766, p-value = 0.00332
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.1314346 

Scatterplot for irt vs cortisol

ggplot(equine_data, aes(x = irt, y = cortisol)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Scatterplot: irt vs cortisol", x = "Thermographic Eye Temperature (°C)", y = "Cortisol (mcg/dl)")
`geom_smooth()` using formula = 'y ~ x'

(b) Effect of calming spray (from eqsub_data)

Check normality of compliance time before spray

shapiro.test(eqsub_data$comp)

    Shapiro-Wilk normality test

data:  eqsub_data$comp
W = 0.99692, p-value = 0.4695

Check normality of compliance time after spray

shapiro.test(eqsub_data$comp2)

    Shapiro-Wilk normality test

data:  eqsub_data$comp2
W = 0.99433, p-value = 0.06154

Check normality of the differences

differences <- eqsub_data$comp2 - eqsub_data$comp
shapiro.test(differences)

    Shapiro-Wilk normality test

data:  differences
W = 0.84738, p-value < 2.2e-16

Perform Wilcoxon signed-rank test

wilcoxon_result <- wilcox.test(eqsub_data$comp, eqsub_data$comp2, paired = TRUE)
print(wilcoxon_result)

    Wilcoxon signed rank test with continuity correction

data:  eqsub_data$comp and eqsub_data$comp2
V = 124251, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0

Scatter plot for compliance time before and after spray

ggplot(eqsub_data, aes(x = comp, y = comp2)) +
  geom_point(color = "blue", size = 3, shape = 19) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Scatterplot: Compliance Time Before vs After Spray",
       x = "Compliance Time Before Spray (Seconds)",
       y = "Compliance Time After Spray (Seconds)") +
  theme_minimal()

(c) Predict Compliance Time (from equine_data)

Build a linear regression model

model <- lm(comp ~ irt + cortisol + BPM + weight + air, data = equine_data)
summary(model)

Call:
lm(formula = comp ~ irt + cortisol + BPM + weight + air, data = equine_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.88796 -0.65339  0.00537  0.58312  2.79258 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.910e+01  3.641e+00 -10.739   <2e-16 ***
irt         -3.078e-02  3.191e-02  -0.964    0.335    
cortisol    -8.361e-04  2.602e-03  -0.321    0.748    
BPM          5.048e-01  2.228e-02  22.661   <2e-16 ***
weight       1.325e-03  3.361e-03   0.394    0.694    
air         -2.256e-02  2.708e-02  -0.833    0.405    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.003 on 492 degrees of freedom
Multiple R-squared:  0.5129,    Adjusted R-squared:  0.5079 
F-statistic: 103.6 on 5 and 492 DF,  p-value: < 2.2e-16

Scatterplot of Compliance time vs BPM with Regression Line

ggplot(equine_data, aes(x = BPM, y = comp)) +
  geom_point(color = "blue", size = 3, shape = 19) +
  geom_smooth(method = "lm", color = "red", linetype = "dashed") +
  labs(title = "Compliance Time vs BPM with Regression Line",
       x = "BPM (Beats Per Minute)",
       y = "Compliance Time (Seconds)") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

Generate predictions and add to the dataset

equine_data$predicted_comp <- predict(model, newdata = equine_data)

Check the dataset to confirm the column is added

head(equine_data)
# A tibble: 6 × 9
  ID    sex     comp weight   irt   air cortisol   BPM predicted_comp
  <chr> <chr>  <dbl>  <dbl> <dbl> <dbl>    <dbl> <dbl>          <dbl>
1 Eq001 Female  37.2   74.7  37.7  23.4     64.1  153.           36.6
2 Eq002 Female  34.5   73.4  35.7  21.4     73.7  150.           34.9
3 Eq003 Female  36.3   71.8  34.8  20.1     54.4  149.           34.9
4 Eq004 Male    35.3  105.   36.2  21.6     86.3  150.           35.2
5 Eq005 Female  37.4   67.1  33.6  21.8    108.   149.           34.6
6 Eq006 Male    33.5  110.   36.4  20.9    109.   147.           33.5

Calculate Mean Squared Error

mse <- mean((equine_data$comp - equine_data$predicted_comp)^2)
print(mse)
[1] 0.9945532

Calculate Root Mean Squared Error

rmse <- sqrt(mse)
print(rmse)
[1] 0.9972729

Mean Absolute Error

mae <- mean(abs(equine_data$comp - equine_data$predicted_comp))
print(mae)
[1] 0.7867215

Extract residuals

residuals <- resid(model)

Shapiro-Wilk test for residuals

shapiro_test <- shapiro.test(residuals)
print(shapiro_test)

    Shapiro-Wilk normality test

data:  residuals
W = 0.99755, p-value = 0.6828

Calculate residuals and fitted values

equine_data$residuals <- resid(model)
equine_data$fitted_values <- fitted(model)

Plot residuals vs. fitted values

ggplot(equine_data, aes(x = fitted_values, y = residuals)) +
  geom_point(color = "blue", size = 2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residuals vs Fitted Values",
       x = "Fitted Values",
       y = "Residuals") +
  theme_minimal()

Actual vs Predicted Plot

ggplot(equine_data, aes(x = predicted_comp, y = comp)) +
  geom_point(color = "blue", size = 3) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Actual vs Predicted Compliance Time",
       x = "Predicted Compliance Time (Seconds)",
       y = "Actual Compliance Time (Seconds)") +
  theme_minimal()