##title: "Equine Research Formative" 

##author: "Laura Marks" 

##date: "2024-10-16" 
# Work from the module folder so the data files can be given as relative paths.
# NOTE(review): this absolute path exists only on the author's machine; a
# project-relative path would make the script portable.
setwd("/Users/Laura/Documents/Equine Masters/Data Analysis & Research Methods/")

# Both data sets are read as tab-separated text with a header row.
equine <- read.table(
  file = "Equine Summative NEW/equine.txt",
  sep = "\t",
  header = TRUE
)

eqsub <- read.table(
  file = "Equine Summative NEW/eqsub.txt",
  sep = "\t",
  header = TRUE
)

Equine Research Formative

Data Analysis

1. Is there a difference in compliance time between mares and geldings?


Hypothesis

It is hypothesised there will be no significant difference in average compliance time based on the sex of the animal (mares and geldings)

Results
# Normality check for the sex comparison: the assumption is assessed on the
# residuals of a linear model of compliance time (comp) on sex.
model <- lm(formula = comp ~ sex, data = equine)

# Pull the residuals out of the fitted model
residuals <- resid(model)

# Visual checks: a histogram, then a normal Q-Q plot with a red reference line
hist(x = residuals, main = "Histogram of Residuals")

qqnorm(y = residuals)
qqline(y = residuals, col = "red")

# Formal check: Shapiro-Wilk test on the same residuals
shapiro.test(residuals)

    Shapiro-Wilk normality test

data:  residuals
W = 0.99738, p-value = 0.6242
# P > 0.05 therefore the residuals are normally distributed 
# The residuals were consistent with normality, so the two sexes are compared
# with an independent two-sample t-test of comp by sex. var.equal = FALSE is
# the default of t.test() and yields the Welch form of the test.
t_test_result <- t.test(
  formula = comp ~ sex,
  data = equine,
  var.equal = FALSE
)

# Display the stored test result
print(t_test_result)

    Welch Two Sample t-test

data:  comp by sex
t = -2.8315, df = 494.83, p-value = 0.004821
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
 -0.6105344 -0.1103336
sample estimates:
mean in group Female   mean in group Male 
            34.94452             35.30496 
# P < 0.05 therefore, there is a significant difference in compliance time between mares and geldings. This result rejects the hypothesis. 
# ggplot2 supplies the plotting functions used below
library(ggplot2)

# Box plot of compliance time for each sex
ggplot(data = equine, mapping = aes(x = sex, y = comp, fill = sex)) +
  # Boxes are filled by sex and outlined in black
  geom_boxplot(colour = "black") +
  # Overlay each group's mean as a red point (shape 18)
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 18,
    size = 3,
    colour = "red"
  ) +
  # Fixed fill colour for each sex
  scale_fill_manual(values = c("Male" = "skyblue", "Female" = "lightpink")) +
  # Axis labels and title
  labs(
    title = "Compliance Time by Gender",
    x = "Sex",
    y = "Compliance Time (s)"
  ) +
  theme_minimal()

2. Are there significant correlations between compliance time and other variables (IRT, cortisol level and heart rate)?


Is there a significant correlation between Thermographic Eye Temperature (Celsius) and Compliance Time (seconds)?

Hypothesis

It is hypothesised there will be no correlation between thermographic eye temperature and compliance time.

Results
# Shapiro-Wilk normality test on thermographic eye temperature (irt)

shapiro.test(equine$irt) 

    Shapiro-Wilk normality test

data:  equine$irt
W = 0.95889, p-value = 1.447e-10
# Shapiro-Wilk normality test on compliance time (comp)
shapiro.test(equine$comp) 

    Shapiro-Wilk normality test

data:  equine$comp
W = 0.99692, p-value = 0.4695
# irt: P < 0.05, so irt is not normally distributed; comp: P > 0.05, so comp is
# consistent with normality. Because one of the two variables is non-normal,
# a Spearman rank correlation is used.

cor.test(equine$irt, equine$comp, method = "spearman") 

    Spearman's rank correlation rho

data:  equine$irt and equine$comp
S = 21203514, p-value = 0.5028
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.03008441 
# P > 0.05 therefore there is not a significant correlation between compliance time and eye temperature. This result is consistent with the hypothesis. 
# Scatterplot of compliance time against IRT with a fitted straight line
library(ggplot2)

ggplot(data = equine, mapping = aes(x = irt, y = comp)) +
  # Plot the observations as points
  geom_point() +
  # Straight-line fit from lm, drawn in blue without a confidence band
  geom_smooth(method = "lm", colour = "blue", se = FALSE) +
  # Title and axis labels
  labs(
    x = "IRT (Celcius)",
    y = "Compliance Time (seconds)",
    title = "Scatterplot of Compliance Time vs. Thermographic Eye Temperature (IRT)"
  )
`geom_smooth()` using formula = 'y ~ x'

Is there a significant correlation between Blood Cortisol Levels (mcg/dL) and Compliance Time (seconds)?

Hypothesis

It is hypothesised there will be a positive correlation: as blood cortisol increases, compliance time will increase.

Results
# Shapiro-Wilk normality test on blood cortisol (cortisol)

shapiro.test(equine$cortisol) 

    Shapiro-Wilk normality test

data:  equine$cortisol
W = 0.96362, p-value = 9.245e-10
# Shapiro-Wilk normality test on compliance time (comp)
shapiro.test(equine$comp) 

    Shapiro-Wilk normality test

data:  equine$comp
W = 0.99692, p-value = 0.4695
# cortisol: P < 0.05, so cortisol is not normally distributed; comp: P > 0.05,
# so comp is consistent with normality. Because one of the two variables is
# non-normal, a Spearman rank correlation is used.
cor.test(equine$cortisol, equine$comp, method = "spearman") 

    Spearman's rank correlation rho

data:  equine$cortisol and equine$comp
S = 21166080, p-value = 0.529
alternative hypothesis: true rho is not equal to 0
sample estimates:
        rho 
-0.02826584 
# P > 0.05 therefore there is not a significant correlation between compliance time and blood cortisol levels. This result rejects the hypothesis.  
# Scatterplot of compliance time against cortisol with a fitted straight line
library(ggplot2)

ggplot(data = equine, mapping = aes(x = cortisol, y = comp)) +
  # Plot the observations as points
  geom_point() +
  # Straight-line fit from lm, drawn in blue without a confidence band
  geom_smooth(method = "lm", colour = "blue", se = FALSE) +
  # Title and axis labels
  labs(
    x = "Cortisol (mcg/dL)",
    y = "Compliance Time (seconds)",
    title = "Scatterplot of Compliance Time vs. Cortisol"
  )
`geom_smooth()` using formula = 'y ~ x'

Is there a significant correlation between Heart Rate (bpm) and Compliance Time (seconds)?

Hypothesis

It is hypothesised there will be a positive correlation: as heart rate increases, compliance time will increase.

Results
# Each variable is tested for normality before choosing the correlation method

# Shapiro-Wilk normality test on heart rate (BPM)

shapiro.test(equine$BPM) 

    Shapiro-Wilk normality test

data:  equine$BPM
W = 0.99739, p-value = 0.6264
# Shapiro-Wilk normality test on compliance time (comp)
shapiro.test(equine$comp) 

    Shapiro-Wilk normality test

data:  equine$comp
W = 0.99692, p-value = 0.4695
# P > 0.05 for both BPM and comp, so both variables are consistent with normality
# With no method argument, cor.test() runs Pearson's product-moment correlation
cor.test(equine$BPM, equine$comp) 

    Pearson's product-moment correlation

data:  equine$BPM and equine$comp
t = 22.767, df = 496, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6689964 0.7552703
sample estimates:
      cor 
0.7148428 
# P < 0.05 therefore there is a significant positive correlation (r = 0.71) between compliance time and heart rate. This result supports the hypothesis.  
# Scatterplot of compliance time against heart rate with a fitted straight line
library(ggplot2)

ggplot(data = equine, mapping = aes(x = BPM, y = comp)) +
  # Plot the observations as points
  geom_point() +
  # Straight-line fit from lm, drawn in blue without a confidence band
  geom_smooth(method = "lm", colour = "blue", se = FALSE) +
  # Title and axis labels
  labs(
    x = "Heart Rate (BPM)",
    y = "Compliance Time (seconds)",
    title = "Scatterplot of Compliance Time vs. Heart Rate "
  )
`geom_smooth()` using formula = 'y ~ x'

3. Is there a correlation between IRT and Cortisol?


Hypothesis

It is hypothesised there will be a correlation between IRT and cortisol. When IRT increases, cortisol will also increase.

Results
# Shapiro-Wilk normality test on thermographic eye temperature (irt)
shapiro.test(equine$irt) 

    Shapiro-Wilk normality test

data:  equine$irt
W = 0.95889, p-value = 1.447e-10
# Shapiro-Wilk normality test on blood cortisol (cortisol)
shapiro.test(equine$cortisol) 

    Shapiro-Wilk normality test

data:  equine$cortisol
W = 0.96362, p-value = 9.245e-10
# P < 0.05 for both irt and cortisol, so neither variable is normally
# distributed and a Spearman rank correlation is used.
cor.test(equine$irt, equine$cortisol, method = "spearman") 

    Spearman's rank correlation rho

data:  equine$irt and equine$cortisol
S = 17878766, p-value = 0.00332
alternative hypothesis: true rho is not equal to 0
sample estimates:
      rho 
0.1314346 
# P < 0.05 therefore there is a significant, though weak, positive correlation (rho = 0.13) between eye temperature and blood cortisol levels. This result supports the hypothesis. 
# ggplot2 supplies the plotting functions used below
library(ggplot2)

# Scatterplot of cortisol against IRT with a fitted straight line
ggplot(data = equine, mapping = aes(x = irt, y = cortisol)) +
  # Plot the observations as points
  geom_point() +
  # Straight-line fit from lm, drawn in blue without a confidence band
  geom_smooth(method = "lm", colour = "blue", se = FALSE) +
  # Title and axis labels
  labs(
    x = "Thermographic Eye Temperature (Celsius)",
    y = "Blood Cortisol Levels (mcg/dl)",
    title = "Correlation between IRT and Cortisol Levels"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

4. Does the application of the calming spray have an effect on compliance time?


Hypothesis

It is hypothesised there will be a difference. When the calming spray is applied, compliance time will decrease.

Results

# Normality check ahead of the calming-spray comparison, run on the residuals
# of a linear model of comp on comp2.
# NOTE(review): the test that follows is paired, and a paired test's normality
# assumption concerns the differences (comp - comp2) rather than regression
# residuals - consider checking the differences instead and re-running.
model <- lm(formula = comp ~ comp2, data = eqsub)

# Pull the residuals out of the fitted model
residuals <- resid(model)

# Visual checks: a histogram, then a normal Q-Q plot with a red reference line
hist(x = residuals, main = "Histogram of Residuals")

qqnorm(y = residuals)
qqline(y = residuals, col = "red")

# Formal check: Shapiro-Wilk test on the same residuals
shapiro.test(residuals)

    Shapiro-Wilk normality test

data:  residuals
W = 0.48102, p-value < 2.2e-16
# P < 0.05 therefore the residuals are not normally distributed 
# The normality check failed, so the two paired sets of compliance times are
# compared with a Wilcoxon signed-rank test (paired = TRUE).
wilcox_test <- wilcox.test(x = eqsub$comp, y = eqsub$comp2, paired = TRUE)

# Display the stored test result
print(wilcox_test)

    Wilcoxon signed rank test with continuity correction

data:  eqsub$comp and eqsub$comp2
V = 124251, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
# P < 0.05 therefore there is a significant difference between compliance time before the calming spray and after the application of the calming spray. This result accepts the hypothesis. 
# Data for the box plot: the two paired compliance-time columns (comp and
# comp2) taken from eqsub, so the figure shows the same observations that were
# tested with wilcox.test() rather than a handful of hand-typed example values.
df <- eqsub[, c("comp", "comp2")]
# Load necessary libraries
library(ggplot2)
library(tidyr)
Warning: package 'tidyr' was built under R version 4.4.1
# Stack comp and comp2 into a single "Value" column, with "Variable" recording
# which of the two columns each value came from
df_long <- pivot_longer(
  df,
  cols = c(comp, comp2),
  names_to = "Variable",
  values_to = "Value"
)

# Box plot of the stacked values, one box per original column
ggplot(
  data = df_long,
  mapping = aes(x = Variable, y = Value, fill = Variable)
) +
  geom_boxplot() +
  # Fixed fill colour per column; the legend is titled "Treatment"
  scale_fill_manual(
    name = "Treatment",
    values = c("comp" = "skyblue", "comp2" = "lightpink")
  ) +
  # Title and axis labels
  labs(
    x = "Treatment",
    y = "Compliance Time",
    title = "The Difference in Compliance Time When Applying a Calming Spray"
  ) +
  theme_minimal()

5. Is it possible to predict compliance time?


Hypothesis

The compliance time can be predicted, as time will decrease with the application of the calming spray.

# Simple linear regression with comp2 as the response and comp as the predictor
model <- lm(formula = comp2 ~ comp, data = eqsub)

# Report the coefficients, residual summary and fit statistics
summary(model)

Call:
lm(formula = comp2 ~ comp, data = eqsub)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.01780 -0.02830  0.03085  0.06742  2.60626 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 4.571268   0.184969   24.71   <2e-16 ***
comp        0.722302   0.005262  137.28   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1678 on 496 degrees of freedom
Multiple R-squared:  0.9744,    Adjusted R-squared:  0.9743 
F-statistic: 1.884e+04 on 1 and 496 DF,  p-value: < 2.2e-16
# ggplot2 supplies the plotting functions used below
library(ggplot2)

# Refit the regression of comp2 on comp. The plot below does not read this
# object; geom_smooth() fits its own lm line.
model <- lm(formula = comp2 ~ comp, data = eqsub)

# Scatterplot of comp2 against comp with the fitted straight line
ggplot(data = eqsub, mapping = aes(x = comp, y = comp2)) +
  # Semi-transparent black points
  geom_point(alpha = 0.5, colour = "black") +
  # Straight-line fit from lm, drawn in blue without a confidence band
  geom_smooth(method = "lm", se = FALSE, colour = "blue") +
  # Title and axis labels
  labs(
    x = "Comp",
    y = "Comp2",
    title = "Linear Regression of Compliance Time"
  ) +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'