# load the tidyverse package
library(tidyverse)
ARES4011 Submission
Martin Cooper
Sharks Data
Initial Data Import and checking
# Read the sharks workbook and store it as a base data frame.
library(readxl)
sharks <- data.frame(
  read_excel("D:/ARES40011 Rsrch Methods & Data Analysis/Summative/sharks.xlsx")
)
# Per-column summary statistics as a first sanity check
summary(sharks)
ID sex blotch BPM
Length:500 Length:500 Min. :30.78 Min. :119.0
Class :character Class :character 1st Qu.:34.16 1st Qu.:129.0
Mode :character Mode :character Median :35.05 Median :142.0
Mean :35.13 Mean :141.8
3rd Qu.:36.05 3rd Qu.:153.2
Max. :40.08 Max. :166.0
weight length air water
Min. : 65.10 Min. :128.3 Min. :33.00 Min. :20.01
1st Qu.: 75.68 1st Qu.:172.0 1st Qu.:34.42 1st Qu.:21.55
Median : 87.82 Median :211.1 Median :35.43 Median :23.11
Mean : 87.94 Mean :211.0 Mean :35.54 Mean :23.02
3rd Qu.:100.40 3rd Qu.:251.8 3rd Qu.:36.71 3rd Qu.:24.37
Max. :110.94 Max. :291.0 Max. :38.00 Max. :25.99
meta depth
Min. : 50.03 Min. :44.64
1st Qu.: 67.39 1st Qu.:48.90
Median : 82.45 Median :50.14
Mean : 82.04 Mean :50.14
3rd Qu.: 95.97 3rd Qu.:51.35
Max. :112.45 Max. :56.83
The summary output matches the expectation from the data set so data exploration can continue
'data.frame': 500 obs. of 10 variables:
$ ID : chr "SH001" "SH002" "SH003" "SH004" ...
$ sex : chr "Female" "Female" "Female" "Male" ...
$ blotch: num 37.2 34.5 36.3 35.3 37.4 ...
$ BPM : num 148 158 125 161 138 126 166 135 132 127 ...
$ weight: num 74.7 73.4 71.8 104.6 67.1 ...
$ length: num 187 189 284 171 264 ...
$ air : num 37.7 35.7 34.8 36.2 33.6 ...
$ water : num 23.4 21.4 20.1 21.6 21.8 ...
$ meta : num 64.1 73.7 54.4 86.3 108 ...
$ depth : num 53.2 49.6 49.4 50.3 49 ...
We can see that there are 500 observations of 10 variables, with 2 character variables (ID and sex) and 8 continuous variables.
ID sex blotch BPM weight length air water meta depth
0 0 0 0 0 0 0 0 0 0
There is no missing data.
Outliers
As there are two data types, we need to check in different ways for outliers
This plot suggests that there is a difference between the means of blotch time by sex (different means). There are also 3 possible outliers in blotch time.
Note: If doing further analysis of blotch time by sex, we will need to remember to take out these outliers.
To check for outliers in the continuous variables, we use Cleveland dotplots.
# Load lattice, which provides dotplot()
# (knitr options such as echo / results belong in the chunk header, not in
# the chunk body, where they only create unused variables.)
library(lattice)
# Continuous covariates to screen for outliers
Names <- c("BPM", "weight", "length", "air", "water", "meta", "depth")
# Cleveland dot plots, one panel per variable: the value is on the x-axis and
# the order of the data on the y-axis, so isolated points stand out.
dotplot(as.matrix(sharks[, Names]),
        groups = FALSE,
        strip = strip.custom(bg = "white"),
        par.strip.text = list(cex = 1.2),
        # Free scales: each variable gets its own axis range
        scales = list(x = list(relation = "free", draw = TRUE),
                      y = list(relation = "free", draw = FALSE)),
        col = 1, cex = 1, pch = 16,
        xlab = list(label = "Value of the variable", cex = 1.2),
        ylab = list(label = "Order of the data", cex = 1.2))
The data is spread, but there are only possible outliers in the depth variable. We can test these specifically if we want to conduct any further analysis.
Normality Tests
# Frequency polygon of blotch time (15 bins) to inspect the shape of the
# distribution. Line thickness is set with `linewidth`; using `size` for
# lines is deprecated since ggplot2 3.4.0.
ggplot(data = sharks, aes(x = blotch)) +
  geom_freqpoly(bins = 15, color = "blue", linetype = "solid", linewidth = 1) +
  labs(title = "Frequency Polygon of Blotch Time",
       x = "Blotch Time (s)",
       y = "Frequency")
Our variable appears to be normally distributed. To confirm:
Shapiro-Wilk normality test
data: sharks$blotch
W = 0.99695, p-value = 0.4769
Data is confirmed as normal.
Multicollinearity among covariates
This is to test whether the independent variables are correlated with each other.
# Hmisc provides rcorr(), which returns correlations together with p-values
# (knitr options such as echo / results belong in the chunk header, not in
# the chunk body, where they only create unused variables.)
library(Hmisc)
# Covariates to check for pairwise correlation
Coll <- c("BPM", "weight", "length", "air", "water", "meta", "depth")
# Pearson correlation matrix with p-values
cor_results <- rcorr(as.matrix(sharks[, Coll]), type = "pearson")
# Correlation coefficients (the `r` element of the rcorr result)
cor_matrix <- cor_results$r
# Display correlation coefficients
print(cor_matrix)
BPM weight length air water
BPM 1.000000000 0.017036558 -0.06856053 -0.06841209 0.02451337
weight 0.017036558 1.000000000 -0.01959676 -0.05264537 0.08633875
length -0.068560532 -0.019596758 1.00000000 -0.03027426 -0.05940708
air -0.068412093 -0.052645366 -0.03027426 1.00000000 -0.05524051
water 0.024513368 0.086338753 -0.05940708 -0.05524051 1.00000000
meta -0.006016429 0.019601470 0.00302851 0.12531801 0.02249461
depth -0.012173520 -0.006057435 -0.08334774 -0.01188199 -0.04088851
meta depth
BPM -0.006016429 -0.012173520
weight 0.019601470 -0.006057435
length 0.003028510 -0.083347736
air 0.125318005 -0.011881989
water 0.022494605 -0.040888511
meta 1.000000000 0.008150764
depth 0.008150764 1.000000000
This is a table of the correlation values.
BPM weight length air water meta
BPM NA 0.70392824 0.1257632 0.126586449 0.58448465 0.893247627
weight 0.7039282 NA 0.6620083 0.239972797 0.05368554 0.661932090
length 0.1257632 0.66200832 NA 0.499413144 0.18476370 0.946143700
air 0.1265864 0.23997280 0.4994131 NA 0.21755131 0.005012136
water 0.5844847 0.05368554 0.1847637 0.217551315 NA 0.615808758
meta 0.8932476 0.66193209 0.9461437 0.005012136 0.61580876 NA
depth 0.7859781 0.89252444 0.0625624 0.790983481 0.36156444 0.855736961
depth
BPM 0.7859781
weight 0.8925244
length 0.0625624
air 0.7909835
water 0.3615644
meta 0.8557370
depth NA
This is a table of the significance of these correlations.
Note: We can see that the only statistically significant correlation is between air and meta, i.e. between air temperature and cortisol level (p = 0.005).
I found this easier to spot using graphics
# GGally provides ggpairs() for scatter plot matrices
library(GGally)
# Covariates to compare pairwise
Names <- c("BPM", "weight", "length", "air", "water", "meta", "depth")
# Scatter plot matrix: point clouds below the diagonal, bar charts on the
# diagonal and correlation coefficients above it.
# Border thickness uses `linewidth`; `size` for lines and borders is
# deprecated since ggplot2 3.4.0.
ggpairs(sharks[, Names],
        lower = list(continuous = wrap("points", size = 1, alpha = 0.6)),
        diag = list(continuous = wrap("barDiag", fill = "lightblue")),
        upper = list(continuous = wrap("cor", size = 4))) +
  theme(panel.background = element_blank(),
        panel.border = element_rect(fill = NA, linewidth = 1),
        strip.background = element_rect(fill = "white", color = "white",
                                        linewidth = 1),
        text = element_text(size = 12))
Relations among dependent and independant variables
To see if there are any relationships between the blotch time and the variables collected. Given the difference in blotch time by sex, that split has been included to highlight any sex-specific relationships; the overall correlation and p-value are also displayed.
# Plot blotch time against one covariate, coloured by sex, with a linear fit
# per sex. The correlation and p-value printed on the plot come from
# cor.test() on all rows of `data` (both sexes pooled).
#
# Arguments:
#   data     Data frame with columns `blotch`, `sex` and the one named by
#            `x_var`.
#   x_var    Name of the covariate column, given as a string.
#   x_label  x-axis label.
#   title    Plot title.
#   label_x, label_y  Position of the correlation annotation, in data units.
#
# Returns the ggplot object; at top level it is printed automatically.
plot_blotch_relationship <- function(data, x_var, x_label, title,
                                     label_x, label_y) {
  cor_test <- cor.test(data[[x_var]], data[["blotch"]])
  cor_label <- paste(
    "Correlation:", round(cor_test$estimate, 2),
    "\nP-value:", format.pval(cor_test$p.value, digits = 2)
  )

  # `.data[[x_var]]` selects the covariate column by its string name.
  ggplot(data, aes(x = .data[[x_var]], y = blotch, color = sex)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    theme(
      panel.background = element_blank(),
      # `linewidth` replaces the deprecated `size` for borders (ggplot2 3.4.0)
      panel.border = element_rect(fill = NA, linewidth = 1),
      strip.background = element_rect(
        fill = "white", color = "white", linewidth = 1
      ),
      text = element_text(size = 16)
    ) +
    labs(title = title, x = x_label, y = "Blotch Time", color = "Sex") +
    annotate("text", x = label_x, y = label_y, label = cor_label,
             size = 5, color = "blue")
}

# One plot per covariate; the annotation position is chosen per plot.
plot_blotch_relationship(
  sharks, "BPM", "BPM",
  "Relationship between Blotch Time and BPM",
  label_x = 155, label_y = 39
)
plot_blotch_relationship(
  sharks, "weight", "Weight",
  "Relationship between Blotch Time and Weight",
  label_x = 70, label_y = 39.05
)
plot_blotch_relationship(
  sharks, "length", "Length",
  "Relationship between Blotch Time and Length",
  label_x = 150, label_y = 39
)
plot_blotch_relationship(
  sharks, "air", "Air Temperature",
  "Relationship between Blotch Time and Air Temperature",
  label_x = 37, label_y = 39
)
plot_blotch_relationship(
  sharks, "water", "Surface Water Temperature",
  "Relationship between Blotch Time and Surface Water Temperature",
  label_x = 21.5, label_y = 39.5
)
plot_blotch_relationship(
  sharks, "meta", "Cortisol",
  "Relationship between Blotch Time and Cortisol",
  label_x = 57, label_y = 39.5
)
plot_blotch_relationship(
  sharks, "depth", "Depth",
  "Relationship between Blotch Time and Depth of Capture",
  label_x = 46, label_y = 39.5
)
Note: We can see that the only statistically significant correlation is between blotch time and depth.
#Analysis of Shark Data
Two Sample t-test
data: blotch by sex
t = -3.023, df = 498, p-value = 0.002632
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
-0.6326914 -0.1342420
sample estimates:
mean in group Female mean in group Male
34.92294 35.30641
This confirms that there is a significant difference between male and female blotch times.
Given the difference between the sex and the significance of the correlation between depth and blotch time, we should calculate the stats for this.
# Correlation between depth and blotch time, computed separately per sex.
# NOTE(review): sharks_female / sharks_male are not created in the visible
# code; presumably they are the per-sex subsets of `sharks` - confirm where
# they are defined.
cor_test_female <- cor.test(sharks_female$depth, sharks_female$blotch)
correlation_female <- cor_test_female$estimate
p_value_female <- cor_test_female$p.value
cor_test_male <- cor.test(sharks_male$depth, sharks_male$blotch)
correlation_male <- cor_test_male$estimate
p_value_male <- cor_test_male$p.value
# Combined plot, coloured by sex, with one linear fit per sex.
# The colour scale is set explicitly so that the points and lines use the
# same colours as the annotations (Female = blue, Male = red); the default
# palette would assign different colours and make the labels misleading.
ggplot(sharks, aes(x = depth, y = blotch, color = sex)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = c("Female" = "blue", "Male" = "red")) +
  theme(panel.background = element_blank()) +
  # `linewidth` replaces the deprecated `size` for borders (ggplot2 3.4.0)
  theme(panel.border = element_rect(fill = NA, linewidth = 1)) +
  theme(strip.background = element_rect(fill = "white", color = "white",
                                        linewidth = 1)) +
  theme(text = element_text(size = 16)) +
  labs(title = "Relationship between Blotch Time and Depth by Sex",
       x = "Depth",
       y = "Blotch Time",
       color = "Sex") +
  annotate("text", x = 48, y = 39.5,
           label = paste("Female:\nCorrelation:", round(correlation_female, 2),
                         "\nP-value:", format.pval(p_value_female, digits = 2)),
           size = 5, color = "blue") +
  annotate("text", x = 54, y = 32.5,
           label = paste("Male:\nCorrelation:", round(correlation_male, 2),
                         "\nP-value:", format.pval(p_value_male, digits = 2)),
           size = 5, color = "red")
`geom_smooth()` using formula = 'y ~ x'
Sharksub
Initial Data Import and checking
# Read the sharksub workbook and store it as a base data frame.
library(readxl)
sharksub <- data.frame(
  read_excel("D:/ARES40011 Rsrch Methods & Data Analysis/Summative/sharksub (1).xlsx")
)
# Per-column summary statistics as a first sanity check
summary(sharksub)
ID sex blotch1 blotch2
Length:50 Length:50 Min. :32.49 Min. :33.47
Class :character Class :character 1st Qu.:34.38 1st Qu.:35.31
Mode :character Mode :character Median :34.94 Median :35.94
Mean :35.03 Mean :35.96
3rd Qu.:35.90 3rd Qu.:36.78
Max. :37.07 Max. :38.18
'data.frame': 50 obs. of 4 variables:
$ ID : chr "SH269" "SH163" "SH008" "SH239" ...
$ sex : chr "Female" "Female" "Female" "Female" ...
$ blotch1: num 36.1 33.4 36.3 35 35.7 ...
$ blotch2: num 37.2 34.4 36.5 36 36.8 ...
Checking for missing data
##Outliers
For each capture event
# Compare blotch1 between male and female sharks.
# One box per sex, filled by sex; outliers drawn as solid points.
ggplot(data = sharksub, mapping = aes(sex, blotch1, fill = sex)) +
  geom_boxplot(outlier.shape = 16) +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  labs(
    title = "Boxplot of Blotch1 Time by Sex",
    x = "Sex",
    y = "Blotch1 Time (s)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1))
This looks like there may be significance.
# Compare blotch2 between male and female sharks.
# One box per sex, filled by sex; outliers drawn as solid points.
# The title names blotch2 so that it matches the plotted variable.
ggplot(sharksub, aes(x = sex, y = blotch2, fill = sex)) +
  geom_boxplot(outlier.shape = 16) +
  labs(title = "Boxplot of Blotch2 Time by Sex",
       x = "Sex",
       y = "Blotch2 Time (s)") +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 0, hjust = 1))
Note: We can see that there may be a difference between male and female blotch time in both capture events.
## Normality Tests
# Reshape to long format: one row per shark and blotch measurement, with the
# measurement name in `variable` and its reading in `value`.
sharksub_long <- pivot_longer(
  sharksub,
  cols = c(blotch1, blotch2),
  names_to = "variable",
  values_to = "value"
)
# Histogram of each blotch measurement in its own panel
ggplot(sharksub_long, aes(value, fill = variable)) +
  geom_histogram(position = "identity", alpha = 0.5, binwidth = 1) +
  scale_fill_manual(values = c("blotch1" = "blue", "blotch2" = "red")) +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal() +
  labs(
    title = "Histograms of Blotch1 and Blotch2 Time",
    x = "Blotch Time (s)",
    y = "Frequency",
    fill = "Blotch Type"
  )
Note: We can see that this is normally distributed.
# Histograms of blotch time, one panel per sex and blotch measurement
ggplot(sharksub_long, aes(value, fill = sex)) +
  geom_histogram(position = "identity", alpha = 0.5, binwidth = 1) +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  facet_wrap(sex ~ variable, scales = "free") +
  theme_minimal() +
  labs(
    title = "Histograms of Blotch1 and Blotch2 Time Grouped by Sex",
    x = "Blotch Time (s)",
    y = "Frequency",
    fill = "Sex"
  )
Note: We can see that this is not normally distributed when grouped by sex.
Looking at Differences between blotch1 and blotch2
# Paired t-test of blotch1 against blotch2 (the two columns are paired by row)
t_test_result2 <- t.test(
  sharksub$blotch1,
  sharksub$blotch2,
  paired = TRUE
)
# Keep the p-value for annotating the box plot
p_value2 <- t_test_result2[["p.value"]]
print(t_test_result2)
Paired t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.037176 -0.822301
sample estimates:
mean difference
-0.9297384
# Box plot of each blotch measurement in its own panel, annotated with the
# paired t-test p-value at the largest observed value
ggplot(sharksub_long, aes(variable, value, fill = variable)) +
  geom_boxplot(alpha = 0.5) +
  scale_fill_manual(values = c("blotch1" = "blue", "blotch2" = "red")) +
  facet_wrap(~ variable, scales = "free") +
  labs(
    title = "Box Plots of Blotch1 and Blotch2 Time",
    x = "Blotch Type",
    y = "Blotch Time (s)",
    fill = "Blotch Event"
  ) +
  theme_minimal() +
  annotate(
    "text",
    x = 1,
    y = max(sharksub_long$value),
    label = format.pval(p_value2, digits = 3),
    size = 5,
    color = "black"
  )
Note: We can see that there is a significant difference between the time of blotch1 and blotch2.
Looking at Differences between blotch1 and blotch2 with sex
# Split the long-format data by sex
sharksub_female <- sharksub_long %>% filter(sex == "Female")
sharksub_male <- sharksub_long %>% filter(sex == "Male")
# Paired t-test of blotch1 vs blotch2 within females.
# Pairing relies on the blotch1 and blotch2 rows being in the same shark
# order within each subset.
t_test_female <- t.test(
  sharksub_female$value[sharksub_female$variable == "blotch1"],
  sharksub_female$value[sharksub_female$variable == "blotch2"],
  paired = TRUE
)
# Normality cannot be assumed for the male subset, so a paired rank-based
# (Wilcoxon) test is used instead of the t-test. It must run on the male rows
# only; using the whole sharksub data set would test both sexes together.
t_test_male <- wilcox.test(
  sharksub_male$value[sharksub_male$variable == "blotch1"],
  sharksub_male$value[sharksub_male$variable == "blotch2"],
  paired = TRUE
)
# Collect the per-sex p-values in one table
t_test_results3 <- data.frame(
  sex = c("Female", "Male"),
  p_value = c(t_test_female$p.value, t_test_male$p.value)
)
print(t_test_results3)
sex p_value
1 Female 1.699676e-13
2 Male 1.606456e-09
# Box plots of blotch time by sex, one panel per blotch measurement
ggplot(sharksub_long, aes(sex, value, fill = sex)) +
  geom_boxplot(outlier.shape = 16, alpha = 0.5) +
  scale_fill_manual(values = c("Female" = "blue", "Male" = "red")) +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal() +
  labs(
    title = "Box Plots of Blotch1 and Blotch2 Time Grouped by Sex",
    x = "Sex",
    y = "Blotch Time (s)",
    fill = "Sex"
  )
Note: We can see that there is a significant difference between the time of blotch1 and blotch2 when controlled by sex as well.
It is worth assessing if there is significance in the data between male and female when looking at each blotch event. There was with the larger dataset.
# Two-sample t-tests (equal variances assumed) comparing the sexes, run
# separately for blotch1 and blotch2
t_test_ss1 <- t.test(blotch1 ~ sex, var.equal = TRUE, data = sharksub)
t_test_ss2 <- t.test(blotch2 ~ sex, var.equal = TRUE, data = sharksub)
# One row per blotch event with its p-value
t_test_results <- data.frame(
  Blotch_Event = c("1", "2"),
  p_value = c(t_test_ss1[["p.value"]], t_test_ss2[["p.value"]])
)
# Show the table
print(t_test_results)
Blotch_Event p_value
1 1 0.1499993
2 2 0.2224073
Interestingly, there is no significant difference between male and female in either of the blotch events.