library(dplyr)
library(lattice)

# Screen every continuous variable for obvious outliers with one Cleveland
# dot plot per variable (value on the x axis, row order on the y axis).
Names <- c("BPM", "weight", "length", "air", "water", "meta", "depth")

dotplot(
  # A single as.matrix() is enough; the second call was a no-op.
  as.matrix(sharks[, Names]),
  groups = FALSE, # keep the data ungrouped: one panel per variable
  strip = strip.custom(
    bg = "white",
    par.strip.text = list(cex = 1.2)
  ),
  # Each panel gets its own axis range; y tick marks are not drawn.
  scales = list(
    x = list(relation = "free", draw = TRUE),
    y = list(relation = "free", draw = FALSE)
  ),
  col = 1,
  cex = 1,
  pch = 16,
  xlab = list(label = "Value of the variable", cex = 1.2),
  ylab = list(label = "Order of the data", cex = 1.2)
)
# Depth contains some measurements from sharks caught in noticeably
# deeper/shallower water.

# Air temperature against water temperature with a straight-line fit
# (no confidence band).
ggplot(sharks, aes(x = air, y = water)) +
  geom_point(color = "blue", size = 3) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  ggtitle("Scatter Plot of Air Temperature Compared to Water Temperature") +
  xlab("Air Temperature (°C)") +
  ylab("Water Temperature (°C)") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'
# Regress water temperature on air temperature.
model <- lm(water ~ air, data = sharks)

# Store the residuals on `sharks` so the diagnostic plots can map them.
sharks[["residuals"]] <- resid(model)

# Residual histogram (bins 0.5 wide) as a first normality check.
ggplot(sharks, aes(x = residuals)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "white") +
  ggtitle("Histogram of Residuals")
# Normal Q-Q plot of the residuals stored in `sharks$residuals`, with the
# reference line overlaid.
sharks %>%
  ggplot(aes(sample = residuals)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("Q-Q Plot of Residuals")
# Neither the raw data nor the model residuals looked normally distributed,
# so the association between air and water temperature is tested with
# Spearman's rank correlation.
spearman_rank <- cor.test(
  x = sharks$air,
  y = sharks$water,
  method = "spearman"
)
print(spearman_rank)
Spearman's rank correlation rho
data: sharks$air and sharks$water
S = 22007692, p-value = 0.2082
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.05637344
# Air and water temperature are not significantly correlated (Spearman's rho = -0.056, p = 0.208)
Question 2
# Histogram of blotching time at the first capture (bins 0.5 wide).
sharksub %>%
  ggplot(aes(x = blotch1)) +
  geom_histogram(
    binwidth = 0.5,
    fill = "blue",
    color = "white",
    alpha = 0.7
  ) +
  ggtitle("Histogram of Time to Blotch First") +
  xlab("Time to Blotch First") +
  ylab("Frequency") +
  theme_minimal()
# Histogram of blotching time at the second capture (bins 0.5 wide).
# The title and x label previously said "First", copied from the blotch1
# plot, which mislabelled the second-capture data.
ggplot(sharksub, aes(x = blotch2)) +
  geom_histogram(
    binwidth = 0.5,
    fill = "blue",
    color = "white",
    alpha = 0.7
  ) +
  labs(
    title = "Histogram of Time to Blotch Second",
    x = "Time to Blotch Second",
    y = "Frequency"
  ) +
  theme_minimal()
# Normal Q-Q plot of first-capture blotching time, to check normality.
sharksub %>%
  ggplot(aes(sample = blotch1)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("Q-Q Plot for Time to Blotch First Caught") +
  theme_minimal()
# Normal Q-Q plot of second-capture blotching time, to check normality.
sharksub %>%
  ggplot(aes(sample = blotch2)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("Q-Q Plot for Time to Blotch for Second Capture") +
  theme_minimal()
# The Q-Q plots suggest both blotching times are normally distributed.
# Back that up with a Shapiro-Wilk test, first-capture times first.
shapiro_test_blotch1 <- shapiro.test(x = sharksub$blotch1)
print(shapiro_test_blotch1)
Shapiro-Wilk normality test
data: sharksub$blotch1
W = 0.97958, p-value = 0.5345
# First-capture times: p = 0.5345, so the null hypothesis of normality is
# not rejected. Repeat the Shapiro-Wilk test for the second-capture times.
shapiro_test_blotch2 <- shapiro.test(x = sharksub$blotch2)
print(shapiro_test_blotch2)
Shapiro-Wilk normality test
data: sharksub$blotch2
W = 0.97936, p-value = 0.5255
# Second-capture times: p = 0.5255, so normality is not rejected either.
# blotch1 and blotch2 are compared with a paired t-test, which tests whether
# the mean of the differences (blotch1 - blotch2) is zero.
# `var.equal` is dropped: it only applies to the two-sample test and is
# ignored when paired = TRUE, so it has no effect on the result.
t.test(sharksub$blotch1, sharksub$blotch2, paired = TRUE)
Paired t-test
data: sharksub$blotch1 and sharksub$blotch2
t = -17.39, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean difference is not equal to 0
95 percent confidence interval:
-1.037176 -0.822301
sample estimates:
mean difference
-0.9297384
#Blotching time differs significantly between captures (p < 0.05): on average sharks took 0.930 seconds (3 s.f.) longer to blotch at the second capture than at the first
Question 3
# Check whether any continuous variable has a linear relationship with
# blotching time by drawing one scatter plot per variable.

# Scatter plot of `blotch` against the `sharks` column named `x_var`;
# `x_label` is used for both the x axis and the plot title.
scatter_vs_blotch <- function(x_var, x_label) {
  ggplot(sharks, aes(x = .data[[x_var]], y = blotch)) +
    geom_point() +
    labs(
      title = paste0("Scatter plot of ", x_label, " vs Blotching Time"),
      x = x_label,
      y = "Blotching Time"
    ) +
    theme_minimal()
}

# BPM vs blotching time
scatter_vs_blotch("BPM", "BPM")

# Air temperature vs blotching time
scatter_vs_blotch("air", "Air Temperature")

# Water temperature vs blotching time
scatter_vs_blotch("water", "Water Temperature")

# Weight vs blotching time
scatter_vs_blotch("weight", "Weight")

# Length vs blotching time
scatter_vs_blotch("length", "Length")

# Depth vs blotching time
scatter_vs_blotch("depth", "Depth")
# Scatter plot of meta vs blotching time.
# The x axis was labelled "Depth" (copied from the depth plot) although the
# plotted variable is `meta`; the label now matches the data.
ggplot(sharks, aes(x = meta, y = blotch)) +
  geom_point() +
  labs(
    title = "Scatter plot of Meta vs Blotching Time",
    x = "Meta",
    y = "Blotching Time"
  ) +
  theme_minimal()
# Of the continuous variables, depth shows a linear relationship with
# blotching time.

# Box plot of blotching time for each sex, with a fixed colour per sex.
sharks %>%
  ggplot(aes(x = sex, y = blotch, fill = sex)) +
  geom_boxplot() +
  ggtitle("Blotching Time by Sex") +
  xlab("Sex") +
  ylab("Blotch Time (S)") +
  theme_minimal() +
  scale_fill_manual(values = c(Male = "blue", Female = "red"))
# Sex is categorical; recode it numerically for the model:
# "Male" becomes 1 and every other value becomes 2.
sharks.new <- sharks
sharks.new$sex <- ifelse(sharks$sex == "Male", 1, 2)

# Multiple linear regression (lm) of blotching time on every candidate
# predictor, to see whether depth is statistically significant once the
# other variables are accounted for.
model <- lm(
  blotch ~ BPM + weight + length + air + water + meta + depth + sex,
  data = sharks.new
)
summary(model)
Call:
lm(formula = blotch ~ BPM + weight + length + air + water + meta +
depth + sex, data = sharks.new)
Residuals:
Min 1Q Median 3Q Max
-2.97715 -0.66193 -0.00841 0.64123 2.90395
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.7356961 1.8828077 6.233 9.86e-10 ***
BPM -0.0020791 0.0031540 -0.659 0.51009
weight 0.0017281 0.0033143 0.521 0.60231
length 0.0013042 0.0009606 1.358 0.17517
air -0.0310068 0.0315302 -0.983 0.32590
water -0.0143878 0.0268112 -0.537 0.59176
meta -0.0011610 0.0025671 -0.452 0.65127
depth 0.5034077 0.0220870 22.792 < 2e-16 ***
sex -0.3088617 0.0890602 -3.468 0.00057 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.9912 on 491 degrees of freedom
Multiple R-squared: 0.5256, Adjusted R-squared: 0.5178
F-statistic: 67.99 on 8 and 491 DF, p-value: < 2.2e-16
# The strongest indicators of blotching time are capture depth and the sex
# of the shark.

# Pairwise correlations between the continuous variables, to check for any
# other indicator of blotching time.
sharks %>%
  select(blotch, BPM, weight, length, air, water, meta, depth) %>%
  cor()
blotch BPM weight length air
blotch 1.000000000 -0.029296612 0.009236525 -0.01638167 -0.03761675
BPM -0.029296612 1.000000000 0.017036558 -0.06856053 -0.06841209
weight 0.009236525 0.017036558 1.000000000 -0.01959676 -0.05264537
length -0.016381675 -0.068560532 -0.019596758 1.00000000 -0.03027426
air -0.037616747 -0.068412093 -0.052645366 -0.03027426 1.00000000
water -0.051653787 0.024513368 0.086338753 -0.05940708 -0.05524051
meta -0.009513855 -0.006016429 0.019601470 0.00302851 0.12531801
depth 0.714224701 -0.012173520 -0.006057435 -0.08334774 -0.01188199
water meta depth
blotch -0.05165379 -0.009513855 0.714224701
BPM 0.02451337 -0.006016429 -0.012173520
weight 0.08633875 0.019601470 -0.006057435
length -0.05940708 0.003028510 -0.083347736
air -0.05524051 0.125318005 -0.011881989
water 1.00000000 0.022494605 -0.040888511
meta 0.02249461 1.000000000 0.008150764
depth -0.04088851 0.008150764 1.000000000
# The correlation matrix shows no other indicators.

# Depth against blotching time with a fitted straight line.
sharks %>%
  ggplot(aes(x = depth, y = blotch)) +
  geom_point() +
  geom_smooth(method = "lm", color = "red") +
  ggtitle("Depth vs Blotching Time") +
  xlab("Depth") +
  ylab("Blotching Time")
`geom_smooth()` using formula = 'y ~ x'
# Blotching time increases with increased depth.

# Q-Q plot for male sharks.
# `blotch_male` holds the blotching time for male rows and NA for every
# other row, so `sharks.male` keeps the same number of rows as `sharks`.
sharks.male <- sharks %>%
  mutate(blotch_male = if_else(sex == "Male", blotch, NA_real_))

# Plot only the rows that have a male blotching time. Passing the NA-padded
# column straight to ggplot made stat_qq()/stat_qq_line() warn about the
# rows they had to drop.
sharks.male %>%
  filter(!is.na(blotch_male)) %>%
  ggplot(aes(sample = blotch_male)) +
  geom_qq() +
  geom_qq_line() +
  labs(title = "Q-Q Plot for Male Blotching Time") +
  theme_minimal()
Warning: Removed 236 rows containing non-finite outside the scale range
(`stat_qq()`).
Warning: Removed 236 rows containing non-finite outside the scale range
(`stat_qq_line()`).
# Remove the NAs that the female rows carry in `blotch_male` before testing.
# na.omit() returns a new vector and leaves `sharks.male` untouched, so the
# bare call only printed the values and removed nothing; keep the result.
# NOTE(review): shapiro.test() and t.test() already drop missing values
# themselves, so this vector is only needed where NAs are not handled.
blotch_male_complete <- na.omit(sharks.male$blotch_male)
Shapiro-Wilk normality test
data: sharks.female$blotch_female
W = 0.99527, p-value = 0.682
# Blotching time is normally distributed.

# Make sex a factor with a fixed level order (Male, then Female) so that
# group summaries are reported by sex and it is clear which sex blotches
# faster.
sharks[["sex"]] <- factor(sharks[["sex"]], levels = c("Male", "Female"))

# F test comparing the variance of blotching time between the sexes.
f_test_result_sex <- var.test(formula = blotch ~ sex, data = sharks)
print(f_test_result_sex)
F test to compare two variances
data: blotch by sex
F = 1.0626, num df = 263, denom df = 235, p-value = 0.6347
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
0.8273317 1.3623562
sample estimates:
ratio of variances
1.0626
# The F test gives no evidence of unequal variances, so female and male
# blotching times are compared with an independent two-sample t-test that
# pools the variance.
t.test(
  x = sharks.female$blotch_female,
  y = sharks.male$blotch_male,
  paired = FALSE,
  var.equal = TRUE
)
Two Sample t-test
data: sharks.female$blotch_female and sharks.male$blotch_male
t = -3.023, df = 498, p-value = 0.002632
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.6326914 -0.1342420
sample estimates:
mean of x mean of y
34.92294 35.30641