# Packages Required
library(tidyverse)
library(dplyr)
library(ggplot2)
library(corrplot)
library(psych)
library(GGally)
library(patchwork)
# Importing Data
sharks <- data.frame(read_csv("sharks.csv"))
sharksub <- data.frame(read.csv("sharksub.csv"))
# Data Summary
describe(sharks) # Exploring the dataset to understand range of data
summary(sharks$blotch) # Exploring key variable blotching
hist(sharks$blotch)
# Visualising all correlations
sharks$sex <- factor(sharks$sex, levels = c("Male", "Female"))
ggpairs(sharks, columns = 2:10, cardinality_threshold = 500,
columnLabels = c("Sex", "Blotch Time", "BPM", "Weight", "Length", "Air Temp", "Water Temp", "Meta", "Depth"),
proportions = "auto",
lower = list(combo = "dot"),
upper = list(combo = "box"))
# Indication of relationship between blotch ~ depth and meta ~ air.
# Stress Variables - CORRELATION
cor.test(sharks$blotch, sharks$BPM)
cor.test(sharks$blotch, sharks$meta)
bpmplot <- ggplot(sharks, aes(BPM, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Heart Rate (BPM)", y = "Blotching Time (Seconds)")
metaplot <- ggplot(sharks, aes(meta, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Meta (mcg/dl)", y = "Blotching Time (Seconds)")
stressplot <- bpmplot + metaplot # Creating combined plot
print(stressplot)
# Environmental Variable - CORRELATION
sharks1 <- sharks %>%
mutate(temp_diff = air - water) # create new variable investigating temperature difference
cor.test(sharks$blotch, sharks$air)
cor.test(sharks$blotch, sharks$water)
cor.test(sharks$blotch, sharks$depth)
cor.test(sharks1$blotch, sharks1$temp_diff)
airplot <- ggplot(sharks, aes(air, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Air Temperature (Celcius)", y = "Blotching Time (Seconds)")
waterplot <- ggplot(sharks, aes(water, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Water Temperature (Celcius)", y = "Blotching Time (Seconds)")
depthplot <- ggplot(sharks, aes(depth, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Capture Depth (Metres)", y = "Blotching Time (Seconds)")
tempplot <- ggplot(sharks1, aes(temp_diff, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Temperature Difference (Celcius)", y = "Blotching Time (Seconds)")
envplot <- airplot + waterplot + tempplot + depthplot # Creating combined plot
print(envplot)
# Air and Water Temp Effects
summary(sharks$air)
summary(sharks$water)
# Relationship between air and water temperature
tempcor <- cor.test(sharks$air, sharks$water)
print(tempcor)
airplot1 <- ggplot(sharks, aes(water, air))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Water Temperature (Celcius)", y = "Air Temperature (Celcius)")
cor.test(sharks$meta, sharks$air)
airplot2 <- ggplot(sharks, aes(meta, air))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Meta (mcg/dl)", y = "Air Temperature (Celcius)")
airplot <- airplot1 + airplot2
print(airplot)
# Physiological Variable - CORRELATION + T TEST
cor.test(sharks$blotch, sharks$weight)
cor.test(sharks$blotch, sharks$length)
weightplot <- ggplot(sharks, aes(weight, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Weight (Kilograms)", y = "Blotching Time (Seconds)")
lengthplot <- ggplot(sharks, aes(length, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Length (Centimetres)", y = "Blotching Time (Seconds)")
physplot <- weightplot + lengthplot # Creating combined plot
print(physplot)
# Separating variables by sex
sharksmale <- sharks %>%
filter(sex == "Male") %>%
na.omit()
sharksfemale <- sharks %>%
filter(sex == "Female") %>%
na.omit()
# Comparing variables between sexes
t.test(sharksmale$blotch, sharksfemale$blotch,
alternative = "two.sided",
mu = 0, paired = FALSE)
sexplot <- ggplot(sharks, aes(sex, blotch))+
geom_boxplot()+
labs(x = "Sex", y = "Blotching Time (Seconds)")
# Investigating life stage
sharkslife <- sharks %>% # Assigning adult or juvenile based on length
mutate(life_stage = case_when(
sex == "Female" & length > 180 ~ "adult",
sex == "Female" & length <= 180 ~ "juvenile",
sex == "Male" & length > 170 ~ "adult",
sex == "Male" & length <= 170 ~ "juvenile",
TRUE ~ NA_character_))
table(sharkslife$life_stage) # count how many adult vs juvenile
t.test(blotch ~ life_stage, data = sharkslife)
lifeplot <- ggplot(sharkslife, aes(life_stage, blotch))+
geom_boxplot()+
labs(x = "Life Stage", y = "Blotching Time (Seconds)")
physplot2 <- sexplot + lifeplot # Creating combined plot
print(physplot2)
# Difference between first and second capture
summary(sharksub) # Exploring data
shapiro.test(sharksub$blotch1)
shapiro.test(sharksub$blotch2) # both normally distributed
t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
summary(sharksub$blotch1)
summary(sharksub$blotch2)
sharks_long <- sharksub %>%
pivot_longer(cols = c(blotch1, blotch2),
names_to = "capture",
values_to = "blotch",
names_prefix = "blotch")
captureplot1 <- ggplot(sharks_long, aes(x = capture, y = blotch))+
geom_boxplot()+
labs(x= "Capture", y = "Blotching time (Seconds)")
# Are male and female sharks affected differently by a second capture?
capture_sex <- sharksub %>%
group_by(sex) %>%
summarise(mean_blotch1 = mean(blotch1),
mean_blotch2 = mean(blotch2),
mean_diff = mean(blotch2 - blotch1))
print(capture_sex)
timediff <- sharksub %>%
mutate(timediff = blotch2 - blotch1)
t.test(timediff ~ sex, data = timediff)
captureplot2 <- ggplot(sharks_long, aes(x = capture, y = blotch, fill = sex))+
geom_boxplot()+
labs(x= "Capture", y = "Blotching time (Seconds)")+
scale_fill_manual(values = c("Male" = "darkgrey", "Female" = "azure2"))
captureplot <- captureplot1 + captureplot2 # Creating combined plot
print(captureplot)
# Predicting Blotching Time
blotchprediction <- lm (blotch ~ depth, data = sharks) # Initial linear model based on previous results
summary(blotchprediction)
ggplot(sharks, aes(blotchprediction$residuals))+ # shows normal distribution of residuals
geom_histogram()
# Can the model be improved?
backwardmodel <- lm(blotch ~ sex + BPM + meta + weight + length + air + water + depth + sharks1$temp_diff, sharks)
backwardmodel <- step(backwardmodel, direction = "backward") # Stepwise backward linear regression to find variables that are best for model
# Best fit = length, sex and depth
blotchprediction2 <- lm(blotch ~ length + sex + depth, data = sharks)
summary(blotchprediction2) # slightly improved model
# Length is not contrubuting much to the model - does removing it improve the model?
blotchprediction3 <- lm(blotch ~ sex + depth, data = sharks)
summary(blotchprediction3)
# this is the best model
# Plotting the model
plot <- ggplot(sharks,aes(x=depth,y=blotch, colour = sex))+
geom_point()+
labs(x = "Depth (Metres)", y = "Blotching Time (Seconds)")+
theme(legend.position = "none")
plot2 <- ggplot(sharks,aes(x=depth,y=blotch,shape=sex, colour = sex))+
geom_smooth(method=lm,se=TRUE,fullrange=TRUE,
aes(color=sex))+
labs(x = "Depth (Metres)", y = "Blotching Time (Seconds)")
lmplot <- plot + plot2
print(lmplot)
summary(sharks$depth) # check range of data
depth <- seq(40,60, by=0.5) # creating new data to test predictions
sex <- rep(c("Male", "Female"), length.out = length(depth))
sex <- as.factor(sex)
pred_grid <- expand.grid(depth = depth, sex = sex) # make new values to predict
pred_grid$blotch <-predict(blotchprediction3, new = pred_grid) # predict blotching time
# Check model
predict(blotchprediction3, data.frame(depth = 53.22635, sex = "Female"))
# predicted = 36.51418
# actual = 37.17081
predict(blotchprediction3, data.frame(depth = 50.95442, sex = "Male"))
# predicted = 35.6781
# actual = 35.66700
# Check ends of variables
predict(blotchprediction3, data.frame(depth = 44.64474, sex = "Male"))
# predicted = 32.51239
# actual = 32.53531
# Measure accuracy of the model
MAPE <- mean(abs(blotchprediction3$residuals / sharks$blotch *100))
accuracy <- 100 - MAPE
print(accuracy)
# Plot the predicted model
ggplot(pred_grid, aes(x = depth, y = blotch, colour = sex))+
geom_smooth(method = "lm")+
labs(x = "Depth (m)", y = "Blotching Time (seconds)", colour = "Sex")+
scale_x_continuous(breaks = seq(40, 60, by = 2)) +
scale_y_continuous(breaks = seq(29, 41, by = 1))Sharks Summative
R Code
Analysis displayed without figures and results
Click to expand Analysis
Full analysis displayed with figures and results
Click to expand full analysis
# Packages Required
library(tidyverse)── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.4.4 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.0
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
library(corrplot)Warning: package 'corrplot' was built under R version 4.3.3
corrplot 0.95 loaded
library(psych)Warning: package 'psych' was built under R version 4.3.3
Attaching package: 'psych'
The following objects are masked from 'package:ggplot2':
%+%, alpha
library(GGally)Warning: package 'GGally' was built under R version 4.3.3
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
library(patchwork)Warning: package 'patchwork' was built under R version 4.3.3
# Importing Data
sharks <- data.frame(read_csv("sharks.csv"))Rows: 500 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): ID, sex
dbl (8): blotch, BPM, weight, length, air, water, meta, depth
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sharksub <- data.frame(read.csv("sharksub.csv"))
# Data Summary
describe(sharks) # Exploring the dataset to understand range of data vars n mean sd median trimmed mad min max range skew
ID* 1 500 250.50 144.48 250.50 250.50 185.32 1.00 500.00 499.00 0.00
sex* 2 500 1.53 0.50 2.00 1.53 0.00 1.00 2.00 1.00 -0.11
blotch 3 500 35.13 1.43 35.05 35.11 1.38 30.78 40.08 9.31 0.15
BPM 4 500 141.76 14.14 142.00 141.54 19.27 119.00 166.00 47.00 0.08
weight 5 500 87.94 13.46 87.82 88.03 18.36 65.10 110.94 45.84 -0.02
length 6 500 211.04 46.61 211.07 211.24 59.51 128.25 290.95 162.70 -0.02
air 7 500 35.54 1.43 35.43 35.53 1.71 33.00 38.00 5.00 0.04
water 8 500 23.02 1.67 23.11 23.02 2.08 20.01 25.99 5.98 -0.01
meta 9 500 82.04 17.44 82.45 82.14 20.70 50.03 112.45 62.42 -0.05
depth 10 500 50.14 2.02 50.14 50.13 1.82 44.64 56.83 12.18 0.03
kurtosis se
ID* -1.21 6.46
sex* -1.99 0.02
blotch 0.20 0.06
BPM -1.24 0.63
weight -1.28 0.60
length -1.19 2.08
air -1.14 0.06
water -1.15 0.07
meta -1.10 0.78
depth -0.06 0.09
summary(sharks$blotch) # Exploring key variable blotching Min. 1st Qu. Median Mean 3rd Qu. Max.
30.78 34.16 35.05 35.13 36.05 40.08
hist(sharks$blotch)# Visualising all correlations
sharks$sex <- factor(sharks$sex, levels = c("Male", "Female"))
ggpairs(sharks, columns = 2:10, cardinality_threshold = 500,
columnLabels = c("Sex", "Blotch Time", "BPM", "Weight", "Length", "Air Temp", "Water Temp", "Meta", "Depth"),
proportions = "auto",
lower = list(combo = "dot"),
upper = list(combo = "box"))# Indication of relationship between blotch ~ depth and meta ~ air.
# Stress Variables - CORRELATION
cor.test(sharks$blotch, sharks$BPM)
Pearson's product-moment correlation
data: sharks$blotch and sharks$BPM
t = -0.65406, df = 498, p-value = 0.5134
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.11668743 0.05854438
sample estimates:
cor
-0.02929661
cor.test(sharks$blotch, sharks$meta)
Pearson's product-moment correlation
data: sharks$blotch and sharks$meta
t = -0.21232, df = 498, p-value = 0.8319
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.09712341 0.07824201
sample estimates:
cor
-0.009513855
bpmplot <- ggplot(sharks, aes(BPM, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Heart Rate (BPM)", y = "Blotching Time (Seconds)")
metaplot <- ggplot(sharks, aes(meta, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Meta (mcg/dl)", y = "Blotching Time (Seconds)")
stressplot <- bpmplot + metaplot # Creating combined plot
print(stressplot)`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
# Environmental Variable - CORRELATION
sharks1 <- sharks %>%
mutate(temp_diff = air - water) # create new variable investigating temperature difference
cor.test(sharks$blotch, sharks$air)
Pearson's product-moment correlation
data: sharks$blotch and sharks$air
t = -0.84005, df = 498, p-value = 0.4013
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.12489535 0.05023956
sample estimates:
cor
-0.03761675
cor.test(sharks$blotch, sharks$water)
Pearson's product-moment correlation
data: sharks$blotch and sharks$water
t = -1.1542, df = 498, p-value = 0.249
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.13871605 0.03620077
sample estimates:
cor
-0.05165379
cor.test(sharks$blotch, sharks$depth)
Pearson's product-moment correlation
data: sharks$blotch and sharks$depth
t = 22.772, df = 498, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6683963 0.7546509
sample estimates:
cor
0.7142247
cor.test(sharks1$blotch, sharks1$temp_diff)
Pearson's product-moment correlation
data: sharks1$blotch and sharks1$temp_diff
t = 0.32238, df = 498, p-value = 0.7473
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.07333889 0.10200597
sample estimates:
cor
0.01444459
airplot <- ggplot(sharks, aes(air, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Air Temperature (Celcius)", y = "Blotching Time (Seconds)")
waterplot <- ggplot(sharks, aes(water, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Water Temperature (Celcius)", y = "Blotching Time (Seconds)")
depthplot <- ggplot(sharks, aes(depth, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Capture Depth (Metres)", y = "Blotching Time (Seconds)")
tempplot <- ggplot(sharks1, aes(temp_diff, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Temperature Difference (Celcius)", y = "Blotching Time (Seconds)")
envplot <- airplot + waterplot + tempplot + depthplot # Creating combined plot
print(envplot)`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
# Air and Water Temp Effects
summary(sharks$air) Min. 1st Qu. Median Mean 3rd Qu. Max.
33.00 34.42 35.43 35.54 36.71 38.00
summary(sharks$water) Min. 1st Qu. Median Mean 3rd Qu. Max.
20.01 21.55 23.11 23.02 24.37 25.99
# Relationship between air and water temperature
tempcor <- cor.test(sharks$air, sharks$water)
print(tempcor)
Pearson's product-moment correlation
data: sharks$air and sharks$water
t = -1.2346, df = 498, p-value = 0.2176
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.14224207 0.03260803
sample estimates:
cor
-0.05524051
airplot1 <- ggplot(sharks, aes(water, air))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Water Temperature (Celcius)", y = "Air Temperature (Celcius)")
cor.test(sharks$meta, sharks$air)
Pearson's product-moment correlation
data: sharks$meta and sharks$air
t = 2.8188, df = 498, p-value = 0.005012
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.03804551 0.21069324
sample estimates:
cor
0.125318
airplot2 <- ggplot(sharks, aes(meta, air))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Meta (mcg/dl)", y = "Air Temperature (Celcius)")
airplot <- airplot1 + airplot2
print(airplot)`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
# Physiological Variable - CORRELATION + T TEST
cor.test(sharks$blotch, sharks$weight)
Pearson's product-moment correlation
data: sharks$blotch and sharks$weight
t = 0.20613, df = 498, p-value = 0.8368
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.07851766 0.09684867
sample estimates:
cor
0.009236525
cor.test(sharks$blotch, sharks$length)
Pearson's product-moment correlation
data: sharks$blotch and sharks$length
t = -0.36562, df = 498, p-value = 0.7148
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.1039230 0.0714115
sample estimates:
cor
-0.01638167
weightplot <- ggplot(sharks, aes(weight, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Weight (Kilograms)", y = "Blotching Time (Seconds)")
lengthplot <- ggplot(sharks, aes(length, blotch))+
geom_point()+
geom_smooth(method = "lm")+
labs(x = "Length (Centimetres)", y = "Blotching Time (Seconds)")
physplot <- weightplot + lengthplot # Creating combined plot
print(physplot)`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
# Separating variables by sex
sharksmale <- sharks %>%
filter(sex == "Male") %>%
na.omit()
sharksfemale <- sharks %>%
filter(sex == "Female") %>%
na.omit()
# Comparing variables between sexes
t.test(sharksmale$blotch, sharksfemale$blotch,
alternative = "two.sided",
mu = 0, paired = FALSE)
Welch Two Sample t-test
data: sharksmale$blotch and sharksfemale$blotch
t = 3.0282, df = 494.67, p-value = 0.002589
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.1346620 0.6322714
sample estimates:
mean of x mean of y
35.30641 34.92294
sexplot <- ggplot(sharks, aes(sex, blotch))+
geom_boxplot()+
labs(x = "Sex", y = "Blotching Time (Seconds)")
# Investigating life stage
sharkslife <- sharks %>% # Assigning adult or juvenile based on length
mutate(life_stage = case_when(
sex == "Female" & length > 180 ~ "adult",
sex == "Female" & length <= 180 ~ "juvenile",
sex == "Male" & length > 170 ~ "adult",
sex == "Male" & length <= 170 ~ "juvenile",
TRUE ~ NA_character_))
table(sharkslife$life_stage) # count how many adult vs juvenile
adult juvenile
366 134
t.test(blotch ~ life_stage, data = sharkslife)
Welch Two Sample t-test
data: blotch by life_stage
t = 0.9934, df = 242.26, p-value = 0.3215
alternative hypothesis: true difference in means between group adult and group juvenile is not equal to 0
95 percent confidence interval:
-0.1390418 0.4219664
sample estimates:
mean in group adult mean in group juvenile
35.16332 35.02186
lifeplot <- ggplot(sharkslife, aes(life_stage, blotch))+
geom_boxplot()+
labs(x = "Life Stage", y = "Blotching Time (Seconds)")
physplot2 <- sexplot + lifeplot # Creating combined plot
print(physplot2)# Difference between first and second capture
summary(sharksub) # Exploring data ID sex blotch1 blotch2
Length:50 Length:50 Min. :32.49 Min. :33.47
Class :character Class :character 1st Qu.:34.38 1st Qu.:35.31
Mode :character Mode :character Median :34.94 Median :35.94
Mean :35.03 Mean :35.96
3rd Qu.:35.90 3rd Qu.:36.78
Max. :37.07 Max. :38.18
shapiro.test(sharksub$blotch1)
Shapiro-Wilk normality test
data: sharksub$blotch1
W = 0.97958, p-value = 0.5345
shapiro.test(sharksub$blotch2) # both normally distributed
Shapiro-Wilk normality test
data: sharksub$blotch2
W = 0.97936, p-value = 0.5255
t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
Paired t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.037176 -0.822301
sample estimates:
mean difference
-0.9297384
summary(sharksub$blotch1) Min. 1st Qu. Median Mean 3rd Qu. Max.
32.49 34.38 34.94 35.03 35.90 37.07
summary(sharksub$blotch2) Min. 1st Qu. Median Mean 3rd Qu. Max.
33.47 35.31 35.94 35.96 36.78 38.18
sharks_long <- sharksub %>%
pivot_longer(cols = c(blotch1, blotch2),
names_to = "capture",
values_to = "blotch",
names_prefix = "blotch")
captureplot1 <- ggplot(sharks_long, aes(x = capture, y = blotch))+
geom_boxplot()+
labs(x= "Capture", y = "Blotching time (Seconds)")
# Are male and female sharks affected differently by a second capture?
capture_sex <- sharksub %>%
group_by(sex) %>%
summarise(mean_blotch1 = mean(blotch1),
mean_blotch2 = mean(blotch2),
mean_diff = mean(blotch2 - blotch1))
print(capture_sex)# A tibble: 2 × 4
sex mean_blotch1 mean_blotch2 mean_diff
<chr> <dbl> <dbl> <dbl>
1 Female 34.8 35.8 0.952
2 Male 35.3 36.2 0.908
timediff <- sharksub %>%
mutate(timediff = blotch2 - blotch1)
t.test(timediff ~ sex, data = timediff)
Welch Two Sample t-test
data: timediff by sex
t = 0.40707, df = 44.541, p-value = 0.6859
alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
95 percent confidence interval:
-0.1733770 0.2611792
sample estimates:
mean in group Female mean in group Male
0.9516890 0.9077879
captureplot2 <- ggplot(sharks_long, aes(x = capture, y = blotch, fill = sex))+
geom_boxplot()+
labs(x= "Capture", y = "Blotching time (Seconds)")+
scale_fill_manual(values = c("Male" = "darkgrey", "Female" = "azure2"))
captureplot <- captureplot1 + captureplot2 # Creating combined plot
print(captureplot)# Predicting Blotching Time
blotchprediction <- lm (blotch ~ depth, data = sharks) # Initial linear model based on previous results
summary(blotchprediction)
Call:
lm(formula = blotch ~ depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.81869 -0.65427 -0.01035 0.58825 2.83116
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.82178 1.11207 8.832 <2e-16 ***
depth 0.50467 0.02216 22.772 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1 on 498 degrees of freedom
Multiple R-squared: 0.5101, Adjusted R-squared: 0.5091
F-statistic: 518.6 on 1 and 498 DF, p-value: < 2.2e-16
ggplot(sharks, aes(blotchprediction$residuals))+ # shows normal distribution of residuals
geom_histogram()`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Can the model be improved?
backwardmodel <- lm(blotch ~ sex + BPM + meta + weight + length + air + water + depth + sharks1$temp_diff, sharks)
backwardmodel <- step(backwardmodel, direction = "backward") # Stepwise backward linear regression to find variables that are best for modelStart: AIC=0.08
blotch ~ sex + BPM + meta + weight + length + air + water + depth +
sharks1$temp_diff
Step: AIC=0.08
blotch ~ sex + BPM + meta + weight + length + air + water + depth
Df Sum of Sq RSS AIC
- meta 1 0.20 482.60 -1.71
- weight 1 0.27 482.67 -1.64
- water 1 0.28 482.68 -1.62
- BPM 1 0.43 482.83 -1.47
- air 1 0.95 483.35 -0.93
- length 1 1.81 484.21 -0.04
<none> 482.40 0.08
- sex 1 11.82 494.22 10.18
- depth 1 510.38 992.78 358.95
Step: AIC=-1.71
blotch ~ sex + BPM + weight + length + air + water + depth
Df Sum of Sq RSS AIC
- weight 1 0.26 482.86 -3.44
- water 1 0.30 482.90 -3.40
- BPM 1 0.43 483.03 -3.26
- air 1 1.08 483.68 -2.59
- length 1 1.80 484.40 -1.85
<none> 482.60 -1.71
- sex 1 11.76 494.36 8.33
- depth 1 510.21 992.82 356.97
Step: AIC=-3.44
blotch ~ sex + BPM + length + air + water + depth
Df Sum of Sq RSS AIC
- water 1 0.26 483.11 -5.18
- BPM 1 0.42 483.28 -5.01
- air 1 1.14 483.99 -4.27
- length 1 1.78 484.64 -3.61
<none> 482.86 -3.44
- sex 1 11.73 494.59 6.56
- depth 1 510.13 992.99 355.05
Step: AIC=-5.18
blotch ~ sex + BPM + length + air + depth
Df Sum of Sq RSS AIC
- BPM 1 0.43 483.55 -6.73
- air 1 1.08 484.19 -6.06
- length 1 1.87 484.99 -5.24
<none> 483.11 -5.18
- sex 1 11.93 495.04 5.02
- depth 1 512.18 995.30 354.22
Step: AIC=-6.73
blotch ~ sex + length + air + depth
Df Sum of Sq RSS AIC
- air 1 0.99 484.54 -7.71
<none> 483.55 -6.73
- length 1 2.02 485.56 -6.65
- sex 1 11.89 495.43 3.41
- depth 1 512.95 996.50 352.82
Step: AIC=-7.71
blotch ~ sex + length + depth
Df Sum of Sq RSS AIC
<none> 484.54 -7.71
- length 1 2.11 486.64 -7.54
- sex 1 11.68 496.22 2.20
- depth 1 513.79 998.32 351.73
# Best fit = length, sex and depth
blotchprediction2 <- lm(blotch ~ length + sex + depth, data = sharks)
summary(blotchprediction2) # slightly improved model
Call:
lm(formula = blotch ~ length + sex + depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.96351 -0.64800 -0.02128 0.62242 2.91815
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.6857036 1.1400938 8.496 2.31e-16 ***
length 0.0013987 0.0009527 1.468 0.142713
sexFemale -0.3064815 0.0886293 -3.458 0.000591 ***
depth 0.5043852 0.0219935 22.933 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9884 on 496 degrees of freedom
Multiple R-squared: 0.5235, Adjusted R-squared: 0.5206
F-statistic: 181.6 on 3 and 496 DF, p-value: < 2.2e-16
# Length is not contrubuting much to the model - does removing it improve the model?
blotchprediction3 <- lm(blotch ~ sex + depth, data = sharks)
summary(blotchprediction3)
Call:
lm(formula = blotch ~ sex + depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.96224 -0.64833 -0.01317 0.58882 2.98829
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.11317 1.10357 9.164 < 2e-16 ***
sexFemale -0.30379 0.08871 -3.424 0.000667 ***
depth 0.50172 0.02194 22.864 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9895 on 497 degrees of freedom
Multiple R-squared: 0.5214, Adjusted R-squared: 0.5195
F-statistic: 270.7 on 2 and 497 DF, p-value: < 2.2e-16
# this is the best model
# Plotting the model
plot <- ggplot(sharks,aes(x=depth,y=blotch, colour = sex))+
geom_point()+
labs(x = "Depth (Metres)", y = "Blotching Time (Seconds)")+
theme(legend.position = "none")
plot2 <- ggplot(sharks,aes(x=depth,y=blotch,shape=sex, colour = sex))+
geom_smooth(method=lm,se=TRUE,fullrange=TRUE,
aes(color=sex))+
labs(x = "Depth (Metres)", y = "Blotching Time (Seconds)")
lmplot <- plot + plot2
print(lmplot)`geom_smooth()` using formula = 'y ~ x'
summary(sharks$depth) # check range of data Min. 1st Qu. Median Mean 3rd Qu. Max.
44.64 48.90 50.14 50.14 51.35 56.83
depth <- seq(40,60, by=0.5) # creating new data to test predictions
sex <- rep(c("Male", "Female"), length.out = length(depth))
sex <- as.factor(sex)
pred_grid <- expand.grid(depth = depth, sex = sex) # make new values to predict
pred_grid$blotch <-predict(blotchprediction3, new = pred_grid) # predict blotching time
# Check model
predict(blotchprediction3, data.frame(depth = 53.22635, sex = "Female")) 1
36.51418
# predicted = 36.51418
# actual = 37.17081
predict(blotchprediction3, data.frame(depth = 50.95442, sex = "Male")) 1
35.6781
# predicted = 35.6781
# actual = 35.66700
# Check ends of variables
predict(blotchprediction3, data.frame(depth = 44.64474, sex = "Male")) 1
32.51239
# predicted = 32.51239
# actual = 32.53531
# Measure accuracy of the model
MAPE <- mean(abs(blotchprediction3$residuals / sharks$blotch *100))
accuracy <- 100 - MAPE
print(accuracy)[1] 97.76933
# Plot the predicted model
ggplot(pred_grid, aes(x = depth, y = blotch, colour = sex))+
geom_smooth(method = "lm")+
labs(x = "Depth (m)", y = "Blotching Time (seconds)", colour = "Sex")+
scale_x_continuous(breaks = seq(40, 60, by = 2)) +
scale_y_continuous(breaks = seq(29, 41, by = 1))`geom_smooth()` using formula = 'y ~ x'