#loading packages
library(tidyverse)
library(ggplot2)
library(Biostatistics)
library(car)#this package allows me to conduct type II and III ANOVA
#loading data
library(readxl)
newt_data <- read_csv("C:/Users/LIEWY/Downloads/Newts.csv")
sheep_data <- read_csv("C:/Users/LIEWY/Downloads/sheep.csv")
weight_data <- read_csv("C:/Users/LIEWY/Downloads/weight.csv")
blooms_data <- read_csv("C:/Users/LIEWY/Downloads/blooms.csv")
Checking data structure
view(newt_data)
str(newt_data)
## spc_tbl_ [87 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ SVL : num [1:87] 108.9 94.7 96.9 110.1 103.7 ...
## $ Crest : num [1:87] 10.12 5.62 2 15.04 3.88 ...
## $ Pond : num [1:87] 4 7 7 10 6 8 7 6 7 2 ...
## $ Date : num [1:87] 27 37 28 24 9 12 32 16 15 25 ...
## $ LSVL : num [1:87] 2.04 1.98 1.99 2.04 2.02 ...
## $ LCREST: num [1:87] 1.005 0.75 0.302 1.177 0.589 ...
## - attr(*, "spec")=
## .. cols(
## .. SVL = col_double(),
## .. Crest = col_double(),
## .. Pond = col_double(),
## .. Date = col_double(),
## .. LSVL = col_double(),
## .. LCREST = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#POND is classed as numeric, should be categorical
pond_code <- as.factor(newt_data$Pond)
Plotting the data
newt_data %>% #plotting raincloud plot of LCREST vs LSVL
ggplot(aes(x = LSVL, y = LCREST))+
geom_point()+
geom_smooth(method = 'lm')+
labs(title = 'Dorsal Crest Size vs Body Size',
x = 'Log of Body size (mm)',
y = 'Log of Doral Crest Size (mm)')+
theme_minimal()
newt_data %>% #plotting raincloud plot of LCREST vs Pond
ggplot(aes(x = pond_code, y = LCREST, colour = pond_code, fill = pond_code))+
geom_boxplot(width = 0.2)+
geom_point(size = 0.5, position = position_nudge(x = -0.2))+
labs(title = 'Dorsal Crest Size vs Pond',
x = 'Pond Sampled',
y = 'Log of Doral Crest Size (mm)')+
theme_minimal()+
theme(legend.position = "none")
A linear model is used to test if there’s a linear relationship between LCREST and LSVL
newt_lm <- lm(LCREST ~ LSVL, data = newt_data)
anova(newt_lm)
## Analysis of Variance Table
##
## Response: LCREST
## Df Sum Sq Mean Sq F value Pr(>F)
## LSVL 1 2.3894 2.38939 45.808 1.579e-09 ***
## Residuals 85 4.4337 0.05216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Diagnostic plots are used to evaluate the fit of the model.
par(mfrow = c(2,2))
plot(newt_lm)
Analysis of diagnostic plots:
Residuals vs Fitted: The spread of residuals around the horizontal line is even with no patterns, indicating a linear relationship.
Q-Q Residuals: The residuals mostly fall along the diagonal line, indicating a normal distribution.
Scale-Location: The residuals are evenly spread along the range of predictors, indicating that the data fulfills the assumption of homoscedasticity (equal variance).
Residuals vs Leverage: Every residual falls within the Cook’s distance line, indicating that there are no influential outliers.
From the results of the linear model, snout-vent length has a statistically significant effect on the dorsal crest height of newts (F1,85 = 45.81, p < 0.001).
Previously, it was determined that there is a statistically significant relationship between newt body length and dorsal crest sizes. Now we try to understand if the pond from which newts are sampled from is a nuisance source of variation in the data.
Performing Type I SS test
#Putting POND first
newt_lm1 <- lm(LCREST ~ pond_code + LSVL, data = newt_data)
anova(newt_lm1)
## Analysis of Variance Table
##
## Response: LCREST
## Df Sum Sq Mean Sq F value Pr(>F)
## pond_code 9 0.3252 0.03613 0.6549 0.7466
## LSVL 1 2.3048 2.30483 41.7751 8.84e-09 ***
## Residuals 76 4.1931 0.05517
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Putting POND second
newt_lm2 <- lm(LCREST ~ LSVL + pond_code, data = newt_data)
anova(newt_lm2)
## Analysis of Variance Table
##
## Response: LCREST
## Df Sum Sq Mean Sq F value Pr(>F)
## LSVL 1 2.3894 2.38939 43.3078 5.35e-09 ***
## pond_code 9 0.2406 0.02674 0.4846 0.8807
## Residuals 76 4.1931 0.05517
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In both of these tests, POND is shown to not have a statistically significant on LCREST (put first:F9,76 = 0.65, p > 0.05; put second:F9,76 = 0.48, p > 0.05), no matter the order it was put into the model.
Performing Type II SS test
Anova(newt_lm1, type = "2")
## Anova Table (Type II tests)
##
## Response: LCREST
## Sum Sq Df F value Pr(>F)
## pond_code 0.2406 9 0.4846 0.8807
## LSVL 2.3048 1 41.7751 8.84e-09 ***
## Residuals 4.1931 76
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
However, since the study design involved sampling multiple newts from the same ponds, observations within ponds may not be independent, and POND should be kept in the model as a blocking factor to account for pond-pond variation.
Plotting data to include DATE to show temporal variation
newt_data %>%
ggplot(aes(x = LSVL, y = LCREST)) +
geom_point(aes(colour = Date)) +
geom_smooth(method = 'lm') +
labs(Title = "Height of dorsal crest against body size across days",
x = "Log of snout-vent length (mm)",
y = "Log of dorsal crest height (mm)") +
theme_minimal()
Adding date as a variable - Performing Type I SS test, where POND should
be considered before LSVL
#Putting DATE first
newt_lm3 <- lm(LCREST ~ Date + pond_code + LSVL, data = newt_data)
anova(newt_lm3)
## Analysis of Variance Table
##
## Response: LCREST
## Df Sum Sq Mean Sq F value Pr(>F)
## Date 1 0.0310 0.03101 0.5613 0.4561
## pond_code 9 0.3223 0.03581 0.6481 0.7524
## LSVL 1 2.3257 2.32567 42.0896 8.336e-09 ***
## Residuals 75 4.1441 0.05526
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Putting DATE second
newt_lm4 <- lm(LCREST ~ pond_code + Date + LSVL, data = newt_data)
anova(newt_lm4)
## Analysis of Variance Table
##
## Response: LCREST
## Df Sum Sq Mean Sq F value Pr(>F)
## pond_code 9 0.3252 0.03613 0.6539 0.7474
## Date 1 0.0281 0.02810 0.5086 0.4779
## LSVL 1 2.3257 2.32567 42.0896 8.336e-09 ***
## Residuals 75 4.1441 0.05526
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In both of these tests, DATE is shown to not have a statistically significant on LCREST (put first:F1,75 = 0.56, p > 0.05; put second:F1,75 = 0.51, p > 0.05), no matter the order it was put into the model. This is visually confirmed by the scatterplot which shows no temporal trend in the dorsal crest height of newts.
Performing Type II SS test
Anova(newt_lm3, type = "2")
## Anova Table (Type II tests)
##
## Response: LCREST
## Sum Sq Df F value Pr(>F)
## Date 0.0490 1 0.8859 0.3496
## pond_code 0.2222 9 0.4467 0.9050
## LSVL 2.3257 1 42.0896 8.336e-09 ***
## Residuals 4.1441 75
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since the dorsal crest is used during the courtship season, it can be assumed that there may be a biological influence on the height of dorsal crest that would follow a temporal trend. Hence DATE should not be excluded from the model to account for these variations.
Checking data structure
str(sheep_data)# sex and obsper are numerical instead of categorical
## spc_tbl_ [120 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Duration : num [1:120] 39 40 39 34 31 37 38 36 36 37 ...
## $ nlookups : num [1:120] 3 7 3 7 6 5 3 4 2 5 ...
## $ sex : num [1:120] 1 1 1 1 1 1 1 1 1 1 ...
## $ sheep : num [1:120] 1 1 1 1 1 1 1 1 1 1 ...
## $ sheepn : num [1:120] 1 1 1 1 1 1 1 1 1 1 ...
## $ obsper : num [1:120] 1 2 3 4 5 6 7 8 9 10 ...
## $ luprate : num [1:120] 0.0769 0.175 0.0769 0.2059 0.1935 ...
## $ Sex2 : num [1:120] 1 1 1 2 2 2 NA NA NA NA ...
## $ sheep2 : num [1:120] 1 2 3 4 5 6 NA NA NA NA ...
## $ avluprate: num [1:120] 0.133 0.235 0.168 0.195 0.217 ...
## - attr(*, "spec")=
## .. cols(
## .. Duration = col_double(),
## .. nlookups = col_double(),
## .. sex = col_double(),
## .. sheep = col_double(),
## .. sheepn = col_double(),
## .. obsper = col_double(),
## .. luprate = col_double(),
## .. Sex2 = col_double(),
## .. sheep2 = col_double(),
## .. avluprate = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
sheep_data$sex <- as.factor(sheep_data$sex)
sheep_data$obsper <- as.factor(sheep_data$obsper)
Raincloud plot of Look Up rate of Male and Female sheep across observational periods
#how does the lookup rate vary with observation period and sex?
sheep_data %>%
ggplot(aes(x = obsper, y = luprate, fill = sex))+
geom_point(size = 0.5, position = position_nudge(x = -0.5))+
geom_boxplot()+
facet_wrap(~sex, labeller = as_labeller(c(`1` = "Female", `2` = "Male"))) +
scale_fill_discrete(labels = c("Female", "Male")) +
labs(title = " Look Up rate of Male and Female sheep across observational periods",
x = "Observation Period",
y = "Look up rate")+
theme_minimal()
Fitting the model
sheep_lm1 <- lm(luprate ~ obsper + sex, data = sheep_data)
anova(sheep_lm1)
## Analysis of Variance Table
##
## Response: luprate
## Df Sum Sq Mean Sq F value Pr(>F)
## obsper 19 0.19192 0.010101 2.4082 0.002655 **
## sex 1 0.13282 0.132816 31.6652 1.707e-07 ***
## Residuals 99 0.41524 0.004194
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From these results, an extremely high residual df of 99 is reported which tells us that the number of independent observations assumed by the model is 120. The dataset provided does consist of 120 observations, but these are gathered from multiple measurements of only 6 sheep and are therefore pseudoreplicates. The current study design hence does not represent the data accurately. Data from each sheep may be correlated and sheep should hence be treated as experimental units.
It is also seen that both obsper and sex have statistically significant effects on the look up rate of the sheep (obsper: F19,99=2.41, P<0.05; sex: F1,99=31.67, P<0.001).
To avoid pseudoreplication, sheep can be treated as experimental units and each have an average number of look ups.
Creating new dataset with averaged luprate
avg_sheep <- sheep_data %>%
group_by(sheep, sex) %>%
summarise(avg_luprate = mean(luprate))
Analysing the new dataset
avg_sheep %>%
ggplot(aes(x = sex, y = avg_luprate, fill = sex))+
geom_point(position = position_nudge(x = -0.5))+
geom_boxplot(size = 0.5)+
scale_fill_discrete(labels = c("Female", "Male")) +
labs(title = " Comparing the Look Up rates of Male and Female sheep",
x = "Sex",
y = "Average Look up rate")+
theme_minimal()
With the removal of observation period as a factor, a single categorical factor (sex) is tested for its statistical significance on the average of 2 groups (female and male). Since the box plot shows that the average look up rate data of either sex are skewed, distribution is not normal and a non-parametric Mann-Whitney U test should be used for analysis.
Performing a non-parametric Mann-Whitney U test
wilcox.test(avg_luprate ~ sex, avg_sheep)
##
## Wilcoxon rank sum exact test
##
## data: avg_luprate by sex
## W = 2, p-value = 0.4
## alternative hypothesis: true location shift is not equal to 0
In contrast to the original linear model’s results that indicate that sex has a statistically significant effect on the look up rate of sheep, this model’s results show that there is no statistically significant difference between the look up rates of the 2 sexes (W = 2, p = 0.4). However, since this model is more considerate of the grower’s study design,its results holds more weight.
Checking data structure
view(weight_data)
str(weight_data)
## spc_tbl_ [39 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ FOREARM: num [1:39] 4.87 4.67 3 3.54 1.72 ...
## $ HT : num [1:39] 159 164 158 148 155 ...
## $ WT : num [1:39] 70.7 63.9 60.4 56.7 56.6 ...
## - attr(*, "spec")=
## .. cols(
## .. FOREARM = col_double(),
## .. HT = col_double(),
## .. WT = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Plotting the data
#plotting FOREARM vs HT
weight_data %>%
ggplot(aes(x = HT, y = FOREARM))+
geom_point()+
geom_smooth(method = 'lm')+
labs(title = 'Effect of Height on Skinfold Thickness',
x = 'Height (cm)',
y = 'Skinfold Thickness (mm)')+
theme_minimal()
#plotting FOREARM vs WT
weight_data %>%
ggplot(aes(x = WT, y = FOREARM))+
geom_point()+
geom_smooth(method = 'lm')+
labs(title = 'Effect of Weight on Skinfold Thickness',
x = 'Weight (kg)',
y = 'Skinfold Thickness (mm)')+
theme_minimal()
Linear models with either HT or WT as the sole explanatory variable are fitted.
#Fitting linear model with HT as explanatory variable
weight_lm1 <- lm(FOREARM ~ HT, data = weight_data)
anova(weight_lm1)
## Analysis of Variance Table
##
## Response: FOREARM
## Df Sum Sq Mean Sq F value Pr(>F)
## HT 1 0.944 0.9439 0.1754 0.6778
## Residuals 37 199.094 5.3809
#Fitting linear model with WT as explanatory variable
weight_lm2 <- lm(FOREARM ~ WT, data = weight_data)
anova(weight_lm2)
## Analysis of Variance Table
##
## Response: FOREARM
## Df Sum Sq Mean Sq F value Pr(>F)
## WT 1 59.137 59.137 15.529 0.000347 ***
## Residuals 37 140.901 3.808
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the results of the 2 linear models, height is shown to not have a statistically significant effect on the forearm skinfold thickness (F1,37 = 0.18, p > 0.05), whereas weight has a statistically significant effect (F1,37 = 15.53, p < 0.001). Hence, WT is a better predictor of obesity.
#HT put first
weight_lm3 <- lm(FOREARM ~ HT + WT, data = weight_data)
anova(weight_lm3)
## Analysis of Variance Table
##
## Response: FOREARM
## Df Sum Sq Mean Sq F value Pr(>F)
## HT 1 0.944 0.944 0.2926 0.5919
## WT 1 82.970 82.970 25.7220 1.207e-05 ***
## Residuals 36 116.124 3.226
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This model where HT is put before WT shows that HT does not have a statistically significant effect on forearm skinfold thickness (F1,36 = 0.29, P > 0.05) while WT does (F1,36 = 25.72, P < 0.001).
#WT put first
weight_lm4 <- lm(FOREARM ~ WT + HT, data = weight_data)
anova(weight_lm4)
## Analysis of Variance Table
##
## Response: FOREARM
## Df Sum Sq Mean Sq F value Pr(>F)
## WT 1 59.137 59.137 18.3334 0.0001314 ***
## HT 1 24.777 24.777 7.6813 0.0087755 **
## Residuals 36 116.124 3.226
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This model where WT is put before HT shows that both WT and HT have statistically significant effects on forearm skinfold thickness (WT: F1,36 = 18.33, P < 0.001; HT: F1,36 = 7.68, P < 0.01).
Since no interactions are assumed, a type II SS (hierarchical) test can be run to test each main effect after accounting for the other.
#Running a type II SS test
Anova(weight_lm3, type = "2")
## Anova Table (Type II tests)
##
## Response: FOREARM
## Sum Sq Df F value Pr(>F)
## HT 24.777 1 7.6813 0.008775 **
## WT 82.970 1 25.7220 1.207e-05 ***
## Residuals 116.124 36
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As mentioned in 3C, the models differ in results, whereby WT and HT have statistically significant effects on forearm skinfold thickness (WT: F1,36 = 18.33, P < 0.001; HT: F1,36 = 7.68, P < 0.01) when WT is put before height. This shows that HT becomes a significant variable after a portion of the variation is first explained by WT.
Checking data structure
view(blooms_data)
str(blooms_data)
## spc_tbl_ [36 × 9] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ SQBLOOMS: num [1:36] 4.36 3.32 3.61 4.12 4.47 ...
## $ BED : num [1:36] 1 1 1 1 1 1 1 1 1 1 ...
## $ WATER : num [1:36] 1 1 1 1 2 2 2 2 3 3 ...
## $ SHADE : num [1:36] 1 2 3 4 1 2 3 4 1 2 ...
## $ ...5 : logi [1:36] NA NA NA NA NA NA ...
## $ SQ2 : num [1:36] 4.36 3.32 3.61 4.12 4.47 ...
## $ B2 : num [1:36] 1 1 1 1 1 1 1 1 1 1 ...
## $ W2 : num [1:36] 1 1 1 1 2 2 2 2 3 3 ...
## $ S2 : num [1:36] 1 2 3 4 1 2 3 4 1 2 ...
## - attr(*, "spec")=
## .. cols(
## .. SQBLOOMS = col_double(),
## .. BED = col_double(),
## .. WATER = col_double(),
## .. SHADE = col_double(),
## .. ...5 = col_logical(),
## .. SQ2 = col_double(),
## .. B2 = col_double(),
## .. W2 = col_double(),
## .. S2 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#both WATER and SHADE should be categorical variables rather than numeric.
blooms_data$WATER <- as.factor(blooms_data$WATER)
blooms_data$SHADE <- as.factor(blooms_data$SHADE)
blooms_data$BED <- as.factor(blooms_data$BED)
Plotting the data
blooms_data %>%
ggplot(aes(x = WATER, y = SQBLOOMS, colour = SHADE))+
geom_boxplot(width = 0.2, position = position_dodge(0.5))+
labs(title = 'Variation in bloom number with level of water and shade',
x = 'Level of water',
y = 'Number of blooms',
colour = 'Shade level')+
theme_minimal()
To check if the data is balanced and analysis is orthogonal, Type I and II SS ANOVA should be conducted and their results compared.
blooms_lm <- lm(SQBLOOMS ~ BED + WATER + SHADE, data = blooms_data)
#conducting type I SS ANOVA
anova(blooms_lm)
## Analysis of Variance Table
##
## Response: SQBLOOMS
## Df Sum Sq Mean Sq F value Pr(>F)
## BED 2 4.1323 2.06614 9.4570 0.0007277 ***
## WATER 2 3.7153 1.85767 8.5029 0.0013016 **
## SHADE 3 1.6465 0.54882 2.5120 0.0789451 .
## Residuals 28 6.1173 0.21848
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#conducting type II SS ANOVA
Anova(blooms_lm, type = "2")
## Anova Table (Type II tests)
##
## Response: SQBLOOMS
## Sum Sq Df F value Pr(>F)
## BED 4.1323 2 9.4570 0.0007277 ***
## WATER 3.7153 2 8.5029 0.0013016 **
## SHADE 1.6465 3 2.5120 0.0789451 .
## Residuals 6.1173 28
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Since Type I and II ANOVAs produced the same results, the analysis is orthogonal.
Fitting the linear model
blooms_lm2 <- lm(SQBLOOMS ~ WATER + SHADE, data = blooms_data)
anova(blooms_lm2)
## Analysis of Variance Table
##
## Response: SQBLOOMS
## Df Sum Sq Mean Sq F value Pr(>F)
## WATER 2 3.7153 1.85767 5.4373 0.009661 **
## SHADE 3 1.6465 0.54882 1.6064 0.208609
## Residuals 30 10.2496 0.34165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The model performed in Qn 4B (SQBLOOMS ~ BED + WATER + SHADE) showed that BED had a significant effect on SQBLOOMS (F2,28 = 9.46, p < 0.001) and should remain in the model.
When BED was removed in Qn 4C’s model, the statistical significance of WATER and SHADE were affected, the variance previously explained by BED is now part of the residual error, and the df increases from 28 to 30.This is not ideal as df and the residual error should be as reduced as possible for the model to detect statistically significant effects of the other predictors, further supporting that it is not worthwhile for BED to be ommitted from the model as a blocking factor.
Furthermore, the grower considered that different beds would have different fertilities in his study design (potential soures of nuisance variation), hence BED should be used as a blocking factor
Checking data structure of the updated dataset
#converting B2, W2 and S2 to categorical data
blooms_data$W2 <- as.factor(blooms_data$W2)
blooms_data$S2 <- as.factor(blooms_data$S2)
blooms_data$B2 <- as.factor(blooms_data$B2)
Conducting type I and II ANOVA as was done for the first model.
#conducting type I ANOVA
blooms_lm3 <- lm(SQ2 ~ B2 + W2 + S2, data = blooms_data)
anova(blooms_lm3)
## Analysis of Variance Table
##
## Response: SQ2
## Df Sum Sq Mean Sq F value Pr(>F)
## B2 2 2.7626 1.38129 8.3790 0.001641 **
## W2 2 5.0793 2.53966 15.4057 4.367e-05 ***
## S2 3 0.8072 0.26906 1.6321 0.207156
## Residuals 25 4.1213 0.16485
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#conducting type II ANOVA
blooms_lm4 <- lm(SQ2 ~ B2 + W2 + S2, data = blooms_data)
Anova(blooms_lm4, type = "2")
## Anova Table (Type II tests)
##
## Response: SQ2
## Sum Sq Df F value Pr(>F)
## B2 2.6490 2 8.0346 0.00202 **
## W2 4.6764 2 14.1836 7.644e-05 ***
## S2 0.8072 3 1.6321 0.20716
## Residuals 4.1213 25
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The othorgonality observed from the pair of ANOVA tests conducted for the original dataset is now lost, as Type I and II ANOVA conducted with the new dataset now produce different results. In the analysis of both sets of data, level of water is shown to have a statistically significant effect on the number of blooms, whereas shade does not have a statistically significant effect.