# CRAN mirror needed settingoptions(repos =c(CRAN ="https://cloud.r-project.org/"))# Installation and reading of the readxl packageinstall.packages("readxl")
Installing package into 'C:/Users/smere/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)
package 'readxl' successfully unpacked and MD5 sums checked
Warning: cannot remove prior installation of package 'readxl'
Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
C:\Users\smere\AppData\Local\R\win-library\4.4\00LOCK\readxl\libs\x64\readxl.dll
to C:\Users\smere\AppData\Local\R\win-library\4.4\readxl\libs\x64\readxl.dll:
Permission denied
Warning: restored 'readxl'
The downloaded binary packages are in
C:\Users\smere\AppData\Local\Temp\Rtmpcl9i1D\downloaded_packages
library(readxl)
Equine dataset
Is there a correlation between the variables IRT and Cortisol?
# First dataset 'equine' must be loaded into the documentequine <- readxl::read_excel("C:/Users/smere/OneDrive/Documents/Level 7/Equine_data/equine.xlsx")head(equine)
# Shapiro test for normality must be performed firstshapiro_result <-shapiro.test(equine$cortisol)# Print result to show numerical findingsprint(shapiro_result)
Shapiro-Wilk normality test
data: equine$cortisol
W = 0.96362, p-value = 9.245e-10
Data is normally distributed as values that are closer to 1 is normal.
P-value is greater than 0.05 so data is normally distributed.
Histogram with curve:
# Histogram showcassing normality for cortisol variable from 'equine' datasethist(equine$cortisol, main ="Normal Distribution Histogram of Cortisol", xlab ="Cortisol", ylab ="Frequency", col ="lightblue", border ="black", prob =TRUE) # Distribution curve is then created using the mean and standard deviationcurve(dnorm(x, mean =mean(equine$cortisol), sd =sd(equine$cortisol)),col ="red", lwd =2, add =TRUE)
#Same steps are performed for the 'irt' variableshapiro_result <-shapiro.test(equine$irt)print(shapiro_result)
Shapiro-Wilk normality test
data: equine$irt
W = 0.95889, p-value = 1.447e-10
Data is normally distributed.
hist(equine$irt, main ="Normal Distribution Histogram of IRT", xlab ="irt", ylab ="Frequency", col ="lightblue", border ="black", prob =TRUE) curve(dnorm(x, mean =mean(equine$irt), sd =sd(equine$irt)),col ="red", lwd =2, add =TRUE)
Pearsons correlation coefficient.
# cor.test() performs the testcor.test(equine$cortisol, equine$irt, method ="pearson")
Pearson's product-moment correlation
data: equine$cortisol and equine$irt
t = 2.8881, df = 496, p-value = 0.004046
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.04119998 0.21404903
sample estimates:
cor
0.1286011
# To create a scatter plot, install and load the ggplot2 packageinstall.packages("ggplot2")
Installing package into 'C:/Users/smere/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)
package 'ggplot2' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\smere\AppData\Local\Temp\Rtmpcl9i1D\downloaded_packages
library(ggplot2)# Create a scatter plot of cortisol vs IRT with a regression lineggplot(equine, aes(x = cortisol, y = irt)) +# geom_point() to add data pointsgeom_point(color ="blue") +# geom() smooth to add regression linegeom_smooth(method ="lm", color ="red", se =FALSE) +# To add a regression line:labs(title ="Correlation between IRT and Cortisol", x ="Cortisol", y ="IRT") +theme(plot.title =element_text(hjust =0.5)) +# theme_minimal() to remove excess detailstheme_minimal()
`geom_smooth()` using formula = 'y ~ x'
Eqsub dataset
Does the application of calming spray have an effect on compliance times?
library(readxl)# Second dataset 'eqsub' must now be loaded for analysiseqsub <-read_excel("~/Level 7/Equine_data/eqsub.xlsx")
Paired T-Test
Before calming spray
shapiro.test(eqsub$comp)
Shapiro-Wilk normality test
data: eqsub$comp
W = 0.99692, p-value = 0.4695
Data is normally distributed
hist(eqsub$comp, main ="Normal Distribution Histogram of Compliance Time", xlab ="Compliance Time (seconds)", ylab ="Frequency", col ="lightgreen", border ="black", prob =TRUE)curve(dnorm(x, mean =mean(eqsub$comp), sd =sd(eqsub$comp)),col ="blue", lwd =2, add =TRUE)
After calming spray
shapiro.test(eqsub$comp2)
Shapiro-Wilk normality test
data: eqsub$comp2
W = 0.99433, p-value = 0.06154
Data is normally distributed
hist(eqsub$comp2, main ="Normal Distribution Histogram of Compliance Time", xlab ="Compliance Time (seconds)", ylab ="Frequency", col ="lightgreen", border ="black", prob =TRUE)# Overlay the normal distribution curve for 'comp2'curve(dnorm(x, mean =mean(eqsub$comp2), sd =sd(eqsub$comp2)),col ="blue", lwd =2, add =TRUE)
Paired t-test
data: eqsub$comp and eqsub$comp2
t = 268.31, df = 497, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
5.144700 5.220602
sample estimates:
mean difference
5.182651
The “e” in scientific notation means “times ten to the power of”.
install.packages("ggplot2")
Warning: package 'ggplot2' is in use and will not be installed
Boxplot for before and after calming spray
The initial plot presented ‘After’ data on the left and ‘Before’ data on the right, so this code creates a boxplot that fixes that error
## A new data frame needs to be created with 'Before' representing the 'comp' data and 'After' representing the 'comp2' datalibrary(ggplot2)data_melted <-data.frame(comp_time =c(eqsub$comp, eqsub$comp2),spray_status =rep(c("Before", "After"), each =length(eqsub$comp)) )# 'spray_status' changed so that 'Before' appears firstdata_melted$spray_status <-factor(data_melted$spray_status, levels =c("Before", "After"))# Boxplot created with coloursggplot(data_melted, aes(x = spray_status, y = comp_time, fill = spray_status)) +geom_boxplot() +labs(title ="Compliance Time Before and After Calming Spray",x ="Spray Phase",y ="Time (seconds)") +scale_fill_manual(values =c("Before"="lightblue", "After"="lightcoral")) +theme(plot.title =element_text(hjust =0.5))
Paired Scatter Plot
library(ggplot2)ggplot(eqsub, aes(x = comp, y = comp2)) +geom_point() +#Connect data with a line using geom_line()geom_line(aes(group =1), color ="gray") +labs(title ="Paired Compliance Times",x ="Before Spray (seconds)",y ="After Spray (seconds)") +theme(plot.title =element_text(hjust =0.5))
Is it possible to predict compliance time?
Multiple Linear Regression
Predicted Compliance Time
#Predict datamodel <-lm(comp ~ weight + irt + air + cortisol + BPM, data = equine)summary(model)
Call:
lm(formula = comp ~ weight + irt + air + cortisol + BPM, data = equine)
Residuals:
Min 1Q Median 3Q Max
-2.88796 -0.65339 0.00537 0.58312 2.79258
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.910e+01 3.641e+00 -10.739 <2e-16 ***
weight 1.325e-03 3.361e-03 0.394 0.694
irt -3.078e-02 3.191e-02 -0.964 0.335
air -2.256e-02 2.708e-02 -0.833 0.405
cortisol -8.361e-04 2.602e-03 -0.321 0.748
BPM 5.048e-01 2.228e-02 22.661 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.003 on 492 degrees of freedom
Multiple R-squared: 0.5129, Adjusted R-squared: 0.5079
F-statistic: 103.6 on 5 and 492 DF, p-value: < 2.2e-16
#Visualising these predictions as a scatterplotequine$predicted_comp <-predict(model, newdata = equine)ggplot(equine, aes(x = comp, y = predicted_comp)) +geom_point() +geom_smooth(method ="lm", color ="blue") +labs(title ="Predicted vs Actual Compliance Time",x ="Actual Compliance Time (seconds)",y ="Predicted Compliance Time (seconds)")+theme(plot.title =element_text(hjust =0.5))
`geom_smooth()` using formula = 'y ~ x'
#Data shows that BPM is statistically significant and can therefore be a good predictor of stress in horsesmodel <-lm(comp ~ weight + irt + air + cortisol + BPM, data = equine)summary(model)
Call:
lm(formula = comp ~ weight + irt + air + cortisol + BPM, data = equine)
Residuals:
Min 1Q Median 3Q Max
-2.88796 -0.65339 0.00537 0.58312 2.79258
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.910e+01 3.641e+00 -10.739 <2e-16 ***
weight 1.325e-03 3.361e-03 0.394 0.694
irt -3.078e-02 3.191e-02 -0.964 0.335
air -2.256e-02 2.708e-02 -0.833 0.405
cortisol -8.361e-04 2.602e-03 -0.321 0.748
BPM 5.048e-01 2.228e-02 22.661 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.003 on 492 degrees of freedom
Multiple R-squared: 0.5129, Adjusted R-squared: 0.5079
F-statistic: 103.6 on 5 and 492 DF, p-value: < 2.2e-16
BPM and Compliance Time in Horses
library(ggplot2)# Create a linear model (lm) to show relationship between BPM and compliance timemodel <-lm(comp ~ BPM, data = equine)ggplot(equine, aes(x = BPM, y = comp)) +geom_point() +geom_smooth(method ="lm", se =FALSE, color ="blue") +labs(title ="BPM vs Compliance Time in Horses",x ="Beats Per Minute (BPM)",y ="Compliance Time (seconds)") +theme_minimal() +theme(plot.title =element_text(hjust =0.5))
`geom_smooth()` using formula = 'y ~ x'
Residual plot
model <-lm(comp ~ weight + irt + air + cortisol + BPM, data = equine)# Generate predicted compliance timepredicted_comp_time <-predict(model, newdata = equine)# Calculate residuals (Actual - Predicted) temporarilyresiduals <- equine$comp - predicted_comp_time# Create scatterplot of Actual vs Predicted compliance times with Residualsggplot(data.frame(predicted_comp_time, residuals), aes(x = predicted_comp_time, y = residuals)) +geom_point(color ="blue") +geom_hline(yintercept =0, linetype ="dashed", color ="red") +labs(title ="Residuals Plot: Actual vs Predicted",x ="Predicted Compliance Time (seconds)",y ="Residuals (Actual - Predicted)") +theme(plot.title =element_text(hjust =0.5)) +theme_minimal() +theme(plot.title =element_text(hjust =0.5))
Residual plots help you check if the model assumptions are met. In a good model, residuals should be randomly distributed around 0. This plot shows the difference between the observed and predicted values.
Does sex affect compliance time?
Hypothesis: Horses’ performance (compliance time) varies significantly between male and female horses.
t_test_sex <-t.test(comp2 ~ sex, data = eqsub)print(t_test_sex)
Welch Two Sample t-test
data: comp2 by sex
t = -2.7058, df = 495.98, p-value = 0.007048
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
-0.43524258 -0.06905908
sample estimates:
mean in group Female mean in group Male
29.81449 30.06665
the p-value is 0.007 so there is a statistically significant difference in compliance time between males and females.
# Boxplot for 'Before' - 'comp'boxplot(comp ~ sex, data = eqsub, main ="Compliance Time by Sex", ylab ="Compliance Time", col =c("lightblue", "lightgreen"))
#Boxplot for 'After' - 'comp2'boxplot(comp2 ~ sex, data = eqsub, main ="Compliance Time by Sex", ylab ="Compliance Time", col =c("lightblue", "lightgreen"))