# Session setup ----

# Select the CRAN mirror used for any package installation in this session.
options(repos = c(CRAN = "https://cloud.r-project.org"))

# NOTE: `rm(list = ls())` was removed. It only deletes objects from the global
# environment (attached packages and options persist), so it does not give a
# clean session. Restart R or render the document in a fresh session instead.

library(readxl)     # import Excel data
library(conflicted) # report function-name conflicts between packages
library(tidyverse)  # ggplot2 for histograms/plots, dplyr for data handling
Installing package into '/Users/eloise/Library/R/arm64/4.4/library'
(as 'lib' is unspecified)
The downloaded binary packages are in
/var/folders/s2/5m_j2hbx2qqd8264j2_ljrnh0000gn/T//Rtmpxhh2Nn/downloaded_packages
# Outlier screen: one dot-plot panel per continuous variable, with the value
# on the x-axis and the row order on the y-axis.
# lattice is not attached in the visible setup code, so its functions are
# called with an explicit `lattice::` prefix.
Names <- c("BPM", "weight", "length", "air", "water", "meta", "depth")

lattice::dotplot(
  as.matrix(sharks[, Names]),
  groups = FALSE, # keep the data ungrouped
  strip = lattice::strip.custom(
    bg = "white",
    par.strip.text = list(cex = 1.2)
  ),
  scales = list(
    x = list(relation = "free", draw = TRUE),
    y = list(relation = "free", draw = FALSE)
  ),
  col = 1, cex = 1, pch = 16,
  xlab = list(label = "Value of the variable", cex = 1.2),
  ylab = list(label = "Order of the data", cex = 1.2)
)
# Author's reading of the dot plots: depth includes some sharks caught in
# noticeably deeper/shallower water than the rest.

# Visual normality check for the two temperature variables (0.5-wide bins).

# Air temperature
sharks %>%
  ggplot(aes(x = air)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "white", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Histogram of Air Temperature",
    x = "Air Temperature",
    y = "Frequency"
  )

# Water temperature
sharks %>%
  ggplot(aes(x = water)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "white", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Histogram of Water",
    x = "Water",
    y = "Frequency"
  )
# Air temperature against water temperature, with a least-squares line
# (no confidence band) overlaid in red.
ggplot(sharks, aes(x = air, y = water)) +
  geom_point(color = "blue", size = 3) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  theme_minimal() +
  labs(
    title = "Scatter Plot of Air Temperature Compared to Water Temperature",
    x = "Air Temperature (°C)",
    y = "Water Temperature (°C)"
  )
`geom_smooth()` using formula = 'y ~ x'
# Simple linear regression of water temperature on air temperature, followed
# by normality checks on its residuals.
model <- lm(water ~ air, data = sharks)

# Store the residuals as a column so ggplot can map them directly.
sharks[["residuals"]] <- resid(model)

# Histogram of the residuals
sharks %>%
  ggplot(aes(x = residuals)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "white") +
  labs(title = "Histogram of Residuals")

# Q-Q plot of the residuals against the normal distribution
sharks %>%
  ggplot(aes(sample = residuals)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "Q-Q Plot of Residuals")

# Formal test: a small p-value means the residuals are not normal.
shapiro.test(sharks$residuals)
Shapiro-Wilk normality test
data: sharks$residuals
W = 0.96102, p-value = 3.078e-10
# The data were judged not normally distributed, both raw and as regression
# residuals, so the monotonic association between air and water temperature
# is tested with Spearman's rank correlation.
spearman_rank <- cor.test(sharks$air, sharks$water, method = "spearman")
spearman_rank
Spearman's rank correlation rho
data: sharks$air and sharks$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.05637344
# Air and water temperature are not significantly correlated (Spearman's rho = -0.056, p = 0.208).
Question 2
# Distribution of blotching time at the first capture (0.5-wide bins).
sharksub %>%
  ggplot(aes(x = blotch1)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "white", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Histogram of Time to Blotch First",
    x = "Time to Blotch First",
    y = "Frequency"
  )
# Distribution of blotching time at the second capture (0.5-wide bins).
# The title and x-axis label previously said "First" (copied from the blotch1
# plot); they now describe the variable that is actually plotted.
ggplot(sharksub, aes(x = blotch2)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "white", alpha = 0.7) +
  labs(
    title = "Histogram of Time to Blotch for Second Capture",
    x = "Time to Blotch for Second Capture",
    y = "Frequency"
  ) +
  theme_minimal()
# Normal Q-Q plots for the blotching times at each capture.

# First capture
sharksub %>%
  ggplot(aes(sample = blotch1)) +
  geom_qq() +
  geom_qq_line() +
  theme_minimal() +
  labs(title = "Q-Q Plot for Time to Blotch First Caught")

# Second capture
sharksub %>%
  ggplot(aes(sample = blotch2)) +
  geom_qq() +
  geom_qq_line() +
  theme_minimal() +
  labs(title = "Q-Q Plot for Time to Blotch for Second Capture")
# The Q-Q plots suggest both variables are normally distributed; confirm
# blotch1 formally with a Shapiro-Wilk test.
shapiro_test_blotch1 <- shapiro.test(sharksub$blotch1)
shapiro_test_blotch1
Shapiro-Wilk normality test
data: sharksub$blotch1
W = 0.97958, p-value = 0.5345
# blotch1: p = 0.5345, so the null hypothesis of normality is not rejected.
# Repeat the Shapiro-Wilk test for blotch2.
shapiro_test_blotch2 <- shapiro.test(sharksub$blotch2)
shapiro_test_blotch2
Shapiro-Wilk normality test
data: sharksub$blotch2
W = 0.97936, p-value = 0.5255
# blotch2: p = 0.5255, so normality is not rejected for either capture and a
# paired t-test is used to compare blotch1 with blotch2.
# `var.equal` was dropped: it only applies to the two-sample test and has no
# effect when paired = TRUE, so the result is unchanged.
# NOTE(review): the paired test assumes the differences (blotch1 - blotch2)
# are normal; consider checking those as well. This also presumes each row of
# sharksub holds both measurements for one shark; confirm against the data.
t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
Paired t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.037176 -0.822301
sample estimates:
mean difference
-0.9297384
# Blotching time differs significantly between captures (paired t-test, t = -17.39, df = 49, p < 0.05): sharks took on average 0.930 s (3 s.f.) longer to blotch at the second capture than at the first.
Question 3
# Visual normality check for blotching time (0.5-wide bins).
sharks %>%
  ggplot(aes(x = blotch)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "white", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Histogram of Blotch", x = "Blotch", y = "Frequency")
Shapiro-Wilk normality test
data: sharks$blotch
W = 0.99695, p-value = 0.4769
# Blotching time was judged normally distributed. Next, look for a linear
# relationship between blotching time and each continuous variable by eye.

# Layers shared by every scatter plot in this section.
blotch_scatter_layers <- list(geom_point(), theme_minimal())

# BPM vs blotching time
ggplot(sharks, aes(x = BPM, y = blotch)) +
  blotch_scatter_layers +
  labs(
    title = "Scatter plot of BPM vs Blotching Time",
    x = "BPM",
    y = "Blotching Time"
  )

# Air temperature vs blotching time
ggplot(sharks, aes(x = air, y = blotch)) +
  blotch_scatter_layers +
  labs(
    title = "Scatter plot of Air Temperature vs Blotching Time",
    x = "Air Temperature",
    y = "Blotching Time"
  )

# Water temperature vs blotching time
ggplot(sharks, aes(x = water, y = blotch)) +
  blotch_scatter_layers +
  labs(
    title = "Scatter plot of Water Temperature vs Blotching Time",
    x = "Water Temperature",
    y = "Blotching Time"
  )

# Weight vs blotching time
ggplot(sharks, aes(x = weight, y = blotch)) +
  blotch_scatter_layers +
  labs(
    title = "Scatter plot of Weight vs Blotching Time",
    x = "Weight",
    y = "Blotching Time"
  )

# Length vs blotching time
ggplot(sharks, aes(x = length, y = blotch)) +
  blotch_scatter_layers +
  labs(
    title = "Scatter plot of Length vs Blotching Time",
    x = "Length",
    y = "Blotching Time"
  )

# Depth vs blotching time
ggplot(sharks, aes(x = depth, y = blotch)) +
  blotch_scatter_layers +
  labs(
    title = "Scatter plot of Depth vs Blotching Time",
    x = "Depth",
    y = "Blotching Time"
  )
# Scatter plot of meta vs blotching time.
# The x-axis label previously read "Depth" (copied from the depth plot); it
# now names the variable that is actually plotted.
ggplot(sharks, aes(x = meta, y = blotch)) +
  geom_point() +
  labs(
    title = "Scatter plot of Meta vs Blotching Time",
    x = "Meta",
    y = "Blotching Time"
  ) +
  theme_minimal()
# Author's reading of the scatter plots: depth has a linear relationship with
# blotching time.

# Box plot of blotching time for each sex, with a fixed colour per sex.
sharks %>%
  ggplot(aes(x = sex, y = blotch, fill = sex)) +
  geom_boxplot() +
  scale_fill_manual(values = c("Male" = "blue", "Female" = "red")) +
  theme_minimal() +
  labs(
    title = "Blotching Time by Sex",
    x = "Sex",
    y = "Blotch Time (S)"
  )
# Ordinary linear model (`lm`) of blotching time on all seven continuous
# variables, used to see whether depth is a statistically significant
# predictor once the others are included.
# Note: this reassigns `model`, replacing the Question 1 fit.
model <- lm(
  blotch ~ BPM + weight + length + air + water + meta + depth,
  data = sharks
)
summary(model)
Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta +
depth, data = sharks)
Residuals:
Min 1Q Median 3Q Max
-2.83745 -0.66117 -0.00702 0.60110 2.74108
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.1405851 1.8958668 5.876 7.74e-09 ***
BPM -0.0019723 0.0031890 -0.618 0.537
weight 0.0016283 0.0033511 0.486 0.627
length 0.0012295 0.0009710 1.266 0.206
air -0.0281474 0.0318707 -0.883 0.378
water -0.0188934 0.0270782 -0.698 0.486
meta -0.0009712 0.0025951 -0.374 0.708
depth 0.5061285 0.0223191 22.677 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.002 on 492 degrees of freedom
Multiple R-squared: 0.514, Adjusted R-squared: 0.507
F-statistic: 74.32 on 7 and 492 DF, p-value: < 2.2e-16
# Depth at capture is the strongest indicator of blotching time in the
# regression. Compute the pairwise correlations between blotching time and
# the continuous variables to look for any other indicators.
sharks %>%
  select(blotch, BPM, weight, length, air, water, meta, depth) %>%
  cor()
blotch BPM weight length air
blotch 1.000000000 -0.029296612 0.009236525 -0.01638167 -0.03761675
BPM -0.029296612 1.000000000 0.017036558 -0.06856053 -0.06841209
weight 0.009236525 0.017036558 1.000000000 -0.01959676 -0.05264537
length -0.016381675 -0.068560532 -0.019596758 1.00000000 -0.03027426
air -0.037616747 -0.068412093 -0.052645366 -0.03027426 1.00000000
water -0.051653787 0.024513368 0.086338753 -0.05940708 -0.05524051
meta -0.009513855 -0.006016429 0.019601470 0.00302851 0.12531801
depth 0.714224701 -0.012173520 -0.006057435 -0.08334774 -0.01188199
water meta depth
blotch -0.05165379 -0.009513855 0.714224701
BPM 0.02451337 -0.006016429 -0.012173520
weight 0.08633875 0.019601470 -0.006057435
length -0.05940708 0.003028510 -0.083347736
air -0.05524051 0.125318005 -0.011881989
water 1.00000000 0.022494605 -0.040888511
meta 0.02249461 1.000000000 0.008150764
depth -0.04088851 0.008150764 1.000000000
# No variable other than depth shows a notable correlation with blotching
# time, so plot depth against blotching time with a fitted least-squares line.
sharks %>%
  ggplot(aes(x = depth, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red") +
  labs(
    title = "Depth vs Blotching Time",
    x = "Depth (m)",
    y = "Blotching Time (s)"
  )
`geom_smooth()` using formula = 'y ~ x'
# Blotching time increases with depth.

# Q-Q plot for male sharks.
# Add a column holding blotching time for males only; every other row is NA,
# so ggplot warns that those rows are dropped from the plot.
sharks.male <- mutate(
  sharks,
  blotch_male = if_else(sex == "Male", blotch, NA_real_)
)

sharks.male %>%
  ggplot(aes(sample = blotch_male)) +
  geom_qq() +
  geom_qq_line() +
  theme_minimal() +
  labs(title = "Q-Q Plot for Male Blotching Time")
Warning: Removed 236 rows containing non-finite outside the scale range
(`stat_qq()`).
Warning: Removed 236 rows containing non-finite outside the scale range
(`stat_qq_line()`).
# Drop the NAs that fill the non-male rows of blotch_male.
# The original call discarded the result of na.omit(), so nothing was removed
# and the vector was only printed; it is now stored and used for the
# normality test it was prepared for.
blotch_male <- na.omit(sharks.male$blotch_male)
shapiro.test(blotch_male)
Shapiro-Wilk normality test
data: sharks.female$blotch_female
W = 0.99527, p-value = 0.682
# Blotching time was judged normally distributed within each sex.

# Fix the factor level order (Male first, Female second) so results grouped
# by sex are reported in a known order.
sharks[["sex"]] <- factor(sharks[["sex"]], levels = c("Male", "Female"))

# F test comparing the variance of blotching time between the sexes.
f_test_result_sex <- var.test(blotch ~ sex, data = sharks)
f_test_result_sex
F test to compare two variances
data: blotch by sex
F = 1.0626, num df = 263, denom df = 235, p-value = 0.6347
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
0.8273317 1.3623562
sample estimates:
ratio of variances
1.0626
# The F test found no evidence of unequal variances, so an independent
# two-sample t-test with pooled variance compares female and male blotching
# times.
t.test(
  sharks.female$blotch_female,
  sharks.male$blotch_male,
  var.equal = TRUE,
  paired = FALSE
)
Two Sample t-test
data: sharks.female$blotch_female and sharks.male$blotch_male
t = -3.023, df = 498, p-value = 0.002632
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.6326914 -0.1342420
sample estimates:
mean of x mean of y
34.92294 35.30641