EquineData

Author

Shauna Meredith

# CRAN mirror needed setting
options(repos = c(CRAN = "https://cloud.r-project.org/"))

# Installation and reading of the readxl package
install.packages("readxl")

Installing package into 'C:/Users/smere/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)

package 'readxl' successfully unpacked and MD5 sums checked

Warning: cannot remove prior installation of package 'readxl'

Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
C:\Users\smere\AppData\Local\R\win-library\4.4\00LOCK\readxl\libs\x64\readxl.dll
to C:\Users\smere\AppData\Local\R\win-library\4.4\readxl\libs\x64\readxl.dll:
Permission denied

Warning: restored 'readxl'


The downloaded binary packages are in
    C:\Users\smere\AppData\Local\Temp\Rtmpcl9i1D\downloaded_packages

library(readxl)

Equine dataset

Is there a correlation between the variables IRT and Cortisol?

# First dataset 'equine' must be loaded into the document
equine <- readxl::read_excel("C:/Users/smere/OneDrive/Documents/Level 7/Equine_data/equine.xlsx")
head(equine)

# A tibble: 6 × 8
  ID    sex     comp weight   irt   air cortisol   BPM
  <chr> <chr>  <dbl>  <dbl> <dbl> <dbl>    <dbl> <dbl>
1 Eq001 Female  37.2   74.7  37.7  23.4     64.1  153.
2 Eq002 Female  34.5   73.4  35.7  21.4     73.7  150.
3 Eq003 Female  36.3   71.8  34.8  20.1     54.4  149.
4 Eq004 Male    35.3  105.   36.2  21.6     86.3  150.
5 Eq005 Female  37.4   67.1  33.6  21.8    108.   149.
6 Eq006 Male    33.5  110.   36.4  20.9    109.   147.

# Shapiro test for normality must be performed first
shapiro_result <- shapiro.test(equine$cortisol)

# Print result to show numerical findings
print(shapiro_result)


    Shapiro-Wilk normality test

data:  equine$cortisol
W = 0.96362, p-value = 9.245e-10

Data is normally distributed as values that are closer to 1 is normal.

P-value is greater than 0.05 so data is normally distributed.

Histogram with curve:

# Histogram showcassing normality for cortisol variable from 'equine' dataset
hist(equine$cortisol, 
     main = "Normal Distribution Histogram of Cortisol", 
     xlab = "Cortisol", 
     ylab = "Frequency", 
     col = "lightblue", 
     border = "black", 
     prob = TRUE)  

# Distribution curve is then created using the mean and standard deviation
curve(dnorm(x, mean = mean(equine$cortisol), sd = sd(equine$cortisol)),
      col = "red", 
      lwd = 2, 
      add = TRUE)

#Same steps are performed for the 'irt' variable
shapiro_result <- shapiro.test(equine$irt)
print(shapiro_result)


    Shapiro-Wilk normality test

data:  equine$irt
W = 0.95889, p-value = 1.447e-10

Data is normally distributed.

hist(equine$irt, 
     main = "Normal Distribution Histogram of IRT", 
     xlab = "irt", 
     ylab = "Frequency", 
     col = "lightblue", 
     border = "black", 
     prob = TRUE)  

curve(dnorm(x, mean = mean(equine$irt), sd = sd(equine$irt)),
      col = "red", 
      lwd = 2, 
      add = TRUE)

Pearsons correlation coefficient.

# cor.test() performs the test
cor.test(equine$cortisol, equine$irt, method = "pearson")


    Pearson's product-moment correlation

data:  equine$cortisol and equine$irt
t = 2.8881, df = 496, p-value = 0.004046
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.04119998 0.21404903
sample estimates:
      cor 
0.1286011

cor_result <- cor.test(equine$cortisol, equine$irt, method = "pearson")

# To create a scatter plot, install and load the ggplot2 package
install.packages("ggplot2")

Installing package into 'C:/Users/smere/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)

package 'ggplot2' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\smere\AppData\Local\Temp\Rtmpcl9i1D\downloaded_packages

library(ggplot2)

# Create a scatter plot of cortisol vs IRT with a regression line
ggplot(equine, aes(x = cortisol, y = irt)) +
  
  # geom_point() to add data points
  geom_point(color = "blue") +
  
  # geom() smooth to add regression line
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  
  # To add a regression line:
  labs(title = "Correlation between IRT and Cortisol", x = "Cortisol", y = "IRT") +
  theme(plot.title = element_text(hjust = 0.5)) + 
  
  # theme_minimal() to remove excess details
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

Eqsub dataset

Does the application of calming spray have an effect on compliance times?

library(readxl)

# Second dataset 'eqsub' must now be loaded for analysis
eqsub <- read_excel("~/Level 7/Equine_data/eqsub.xlsx")

Paired T-Test

Before calming spray

shapiro.test(eqsub$comp)


    Shapiro-Wilk normality test

data:  eqsub$comp
W = 0.99692, p-value = 0.4695

Data is normally distributed

hist(eqsub$comp, 
     main = "Normal Distribution Histogram of Compliance Time", 
     xlab = "Compliance Time (seconds)", 
     ylab = "Frequency", 
     col = "lightgreen", 
     border = "black", 
     prob = TRUE)


curve(dnorm(x, mean = mean(eqsub$comp), sd = sd(eqsub$comp)),
      col = "blue", 
      lwd = 2, 
      add = TRUE)

After calming spray

shapiro.test(eqsub$comp2)


    Shapiro-Wilk normality test

data:  eqsub$comp2
W = 0.99433, p-value = 0.06154

Data is normally distributed

hist(eqsub$comp2, 
     main = "Normal Distribution Histogram of Compliance Time", 
     xlab = "Compliance Time (seconds)", 
     ylab = "Frequency", 
     col = "lightgreen", 
     border = "black", 
     prob = TRUE)

# Overlay the normal distribution curve for 'comp2'
curve(dnorm(x, mean = mean(eqsub$comp2), sd = sd(eqsub$comp2)),
      col = "blue", 
      lwd = 2, 
      add = TRUE)

T-test performed

t_test_result <- t.test(eqsub$comp, eqsub$comp2, paired = TRUE)
print(t_test_result)


    Paired t-test

data:  eqsub$comp and eqsub$comp2
t = 268.31, df = 497, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
 5.144700 5.220602
sample estimates:
mean difference 
       5.182651

The “e” in scientific notation means “times ten to the power of”.

install.packages("ggplot2")

Warning: package 'ggplot2' is in use and will not be installed

Boxplot for before and after calming spray

The initial plot presented ‘After’ data on the left and ‘Before’ data on the right, so this code creates a boxplot that fixes that error

## A new data frame needs to be created with 'Before' representing the 'comp' data and 'After' representing the 'comp2' data
library(ggplot2)
data_melted <- data.frame(
  comp_time = c(eqsub$comp, eqsub$comp2),
spray_status = rep(c("Before", "After"), each = length(eqsub$comp))  
)

# 'spray_status' changed so that 'Before' appears first
data_melted$spray_status <- factor(data_melted$spray_status, levels = c("Before", "After"))

# Boxplot created with colours
ggplot(data_melted, aes(x = spray_status, y = comp_time, fill = spray_status)) +   
  geom_boxplot() +
  labs(title = "Compliance Time Before and After Calming Spray",
       x = "Spray Phase",
       y = "Time (seconds)") +
  scale_fill_manual(values = c("Before" = "lightblue", "After" = "lightcoral")) +
theme(plot.title = element_text(hjust = 0.5))

Paired Scatter Plot

library(ggplot2)
ggplot(eqsub, aes(x = comp, y = comp2)) +
  geom_point() +  
  
  #Connect data with a line using geom_line()
  geom_line(aes(group = 1), color = "gray") +  
  labs(title = "Paired Compliance Times",
       x = "Before Spray (seconds)",
       y = "After Spray (seconds)") +
  theme(plot.title = element_text(hjust = 0.5))

Is it possible to predict compliance time?

Multiple Linear Regression

Predicted Compliance Time

#Predict data
model <- lm(comp ~ weight + irt + air + cortisol + BPM, data = equine)
summary(model)


Call:
lm(formula = comp ~ weight + irt + air + cortisol + BPM, data = equine)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.88796 -0.65339  0.00537  0.58312  2.79258 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.910e+01  3.641e+00 -10.739   <2e-16 ***
weight       1.325e-03  3.361e-03   0.394    0.694    
irt         -3.078e-02  3.191e-02  -0.964    0.335    
air         -2.256e-02  2.708e-02  -0.833    0.405    
cortisol    -8.361e-04  2.602e-03  -0.321    0.748    
BPM          5.048e-01  2.228e-02  22.661   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.003 on 492 degrees of freedom
Multiple R-squared:  0.5129,    Adjusted R-squared:  0.5079 
F-statistic: 103.6 on 5 and 492 DF,  p-value: < 2.2e-16

#Visualising these predictions as a scatterplot
equine$predicted_comp <- predict(model, newdata = equine)

ggplot(equine, aes(x = comp, y = predicted_comp)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Predicted vs Actual Compliance Time",
       x = "Actual Compliance Time (seconds)",
       y = "Predicted Compliance Time (seconds)")+
  theme(plot.title = element_text(hjust = 0.5))

`geom_smooth()` using formula = 'y ~ x'

#Data shows that BPM is statistically significant and can therefore be a good predictor of stress in horses

model <- lm(comp ~ weight + irt + air + cortisol + BPM, data = equine)
summary(model)


Call:
lm(formula = comp ~ weight + irt + air + cortisol + BPM, data = equine)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.88796 -0.65339  0.00537  0.58312  2.79258 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.910e+01  3.641e+00 -10.739   <2e-16 ***
weight       1.325e-03  3.361e-03   0.394    0.694    
irt         -3.078e-02  3.191e-02  -0.964    0.335    
air         -2.256e-02  2.708e-02  -0.833    0.405    
cortisol    -8.361e-04  2.602e-03  -0.321    0.748    
BPM          5.048e-01  2.228e-02  22.661   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1.003 on 492 degrees of freedom
Multiple R-squared:  0.5129,    Adjusted R-squared:  0.5079 
F-statistic: 103.6 on 5 and 492 DF,  p-value: < 2.2e-16

BPM and Compliance Time in Horses

library(ggplot2)

# Create a linear model (lm) to show relationship between BPM and compliance time

model <- lm(comp ~ BPM, data = equine)
ggplot(equine, aes(x = BPM, y = comp)) +
  geom_point() +                          
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "BPM vs Compliance Time in Horses",
       x = "Beats Per Minute (BPM)",
       y = "Compliance Time (seconds)") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

`geom_smooth()` using formula = 'y ~ x'

Residual plot

model <- lm(comp ~ weight + irt + air + cortisol + BPM, data = equine)

# Generate predicted compliance time
predicted_comp_time <- predict(model, newdata = equine)

# Calculate residuals (Actual - Predicted) temporarily
residuals <- equine$comp - predicted_comp_time


# Create scatterplot of Actual vs Predicted compliance times with Residuals

ggplot(data.frame(predicted_comp_time, residuals), aes(x = predicted_comp_time, y = residuals)) +
  geom_point(color = "blue") +  
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") + 
  labs(title = "Residuals Plot: Actual vs Predicted",
       x = "Predicted Compliance Time (seconds)",
       y = "Residuals (Actual - Predicted)") +   theme(plot.title = element_text(hjust = 0.5)) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

Residual plots help you check if the model assumptions are met. In a good model, residuals should be randomly distributed around 0. This plot shows the difference between the observed and predicted values.

Does sex affect compliance time?

Hypothesis: Horses’ performance (compliance time) varies significantly between male and female horses.

t_test_sex <- t.test(comp2 ~ sex, data = eqsub)
print(t_test_sex)


    Welch Two Sample t-test

data:  comp2 by sex
t = -2.7058, df = 495.98, p-value = 0.007048
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
 -0.43524258 -0.06905908
sample estimates:
mean in group Female   mean in group Male 
            29.81449             30.06665

the p-value is 0.007 so there is a statistically significant difference in compliance time between males and females.

# Boxplot for 'Before' - 'comp'
boxplot(comp ~ sex, data = eqsub, main = "Compliance Time by Sex", ylab = "Compliance Time", col = c("lightblue", "lightgreen"))

#Boxplot for 'After' - 'comp2'
boxplot(comp2 ~ sex, data = eqsub, main = "Compliance Time by Sex", ylab = "Compliance Time", col = c("lightblue", "lightgreen"))