Load dataset and packages
library(ggplot2)
library(dplyr)
library(readxl)
setwd ("/Users/Megan/Documents/ES3307 Experimental Design & Analysis for Ecology/Assignment 1/")
squirrels <- read_excel("Assignment 1 data.xls", sheet = "Squirrels")
melons <- read_excel("Assignment 1 data.xls", sheet = "Melons")
dioecioustrees <- read_excel("Assignment 1 data.xls", sheet = "Dioecious trees")
Checking the first few rows of the datasets
head (squirrels)
## # A tibble: 6 × 2
## MALE FEMALE
## <dbl> <dbl>
## 1 0.41 0.4
## 2 0.38 0.39
## 3 1.13 0.59
## 4 0.61 0.62
## 5 0.73 0.53
## 6 0.56 0.34
head(melons)
## # A tibble: 6 × 2
## YIELDM VARIETY
## <dbl> <dbl>
## 1 25.1 1
## 2 17.2 1
## 3 26.4 1
## 4 16.1 1
## 5 22.2 1
## 6 15.9 1
head(dioecioustrees)
## # A tibble: 6 × 3
## SEX DBH FLOWERS
## <dbl> <dbl> <dbl>
## 1 2 174 234
## 2 1 171 110
## 3 2 212 512
## 4 1 187 133
## 5 2 227 1107
## 6 1 274 624
Structure of each dataset
str(squirrels)
## tibble [50 × 2] (S3: tbl_df/tbl/data.frame)
## $ MALE : num [1:50] 0.41 0.38 1.13 0.61 0.73 0.56 0.62 0.75 0.5 0.75 ...
## $ FEMALE: num [1:50] 0.4 0.39 0.59 0.62 0.53 0.34 0.41 0.59 0.46 0.63 ...
str(melons)
## tibble [22 × 2] (S3: tbl_df/tbl/data.frame)
## $ YIELDM : num [1:22] 25.1 17.2 26.4 16.1 22.1 ...
## $ VARIETY: num [1:22] 1 1 1 1 1 1 2 2 2 2 ...
str(dioecioustrees)
## tibble [50 × 3] (S3: tbl_df/tbl/data.frame)
## $ SEX : num [1:50] 2 1 2 1 2 1 2 2 1 1 ...
## $ DBH : num [1:50] 174 171 212 187 227 274 86 161 285 182 ...
## $ FLOWERS: num [1:50] 234 110 512 133 1107 ...
Summary of variables in each dataset
summary(squirrels)
## MALE FEMALE
## Min. :0.2900 Min. :0.320
## 1st Qu.:0.4350 1st Qu.:0.410
## Median :0.5600 Median :0.495
## Mean :0.5908 Mean :0.518
## 3rd Qu.:0.7025 3rd Qu.:0.605
## Max. :1.1300 Max. :0.850
summary(melons)
## YIELDM VARIETY
## Min. :15.05 Min. :1.000
## 1st Qu.:22.26 1st Qu.:1.250
## Median :27.82 Median :2.000
## Mean :27.66 Mean :2.455
## 3rd Qu.:32.90 3rd Qu.:3.750
## Max. :43.32 Max. :4.000
summary(dioecioustrees)
## SEX DBH FLOWERS
## Min. :1.0 Min. : 68.0 Min. : 1.0
## 1st Qu.:1.0 1st Qu.:140.5 1st Qu.: 115.8
## Median :2.0 Median :188.0 Median : 253.0
## Mean :1.6 Mean :191.6 Mean : 383.5
## 3rd Qu.:2.0 3rd Qu.:242.0 3rd Qu.: 533.0
## Max. :2.0 Max. :316.0 Max. :1482.0
In a study of squirrels, the weight of 50 female and 50 male squirrels was measured. The observations are in the columns FEMALE and MALE (Squirrels dataset).
Histogram of female and male squirrels
library(gridExtra)
female_hist <- ggplot(squirrels, aes(x = FEMALE)) +
geom_histogram(aes(y = ..density..), bins = 20, fill = "lightblue", color = "black") +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of weight of female squirrels",
x = "Weight (g)",
y = "Density")
male_hist <- ggplot(squirrels, aes(x = MALE)) +
geom_histogram(aes(y = ..density..), bins = 20, fill = "lightblue", color = "black") +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of weight of male squirrels",
x = "Weight (g)",
y = "Density")
grid.arrange(female_hist, male_hist, ncol = 2)
The histograms show that the distribution for both male and female squirrels are right-skewed. There is a larger range of weights for male squirrels (0.25g to 1.1g) compared to the range of weights for female squirrels (0.3g to 0.85g). The mode of weight of female squirrels is about 0.45 which is lower than the mode of male squirrels which is about 0.5.
Boxplots of female and male squirrels
library(tidyr)
#convert data to from wide format where each variable has its own column to
#long format where a column stores the sex of the squirrel and another column has its weight
#this will help to plot the 2 boxplots of female and male squirrels onto the same graph
squirrels_long <- squirrels %>%
pivot_longer(cols = c(MALE, FEMALE), names_to = "Sex", values_to = "Weight")
ggplot(squirrels_long, aes(x = Sex, y = Weight, fill = Sex)) +
geom_boxplot() +
labs(title = "Boxplot of Male and Female Squirrel Weights",
x = "Sex",
y = "Weight (g)") +
scale_fill_manual(values = c("lightblue", "lightgreen"))
The boxplot shows that the median weight of male squirrels is higher than the median weight of female squirrels. The boxplot also shows that there is a larger range of weights of male squirrels than the range of weights of female squirrels.
Conduct a two-sample t-test
squirrels_t <- t.test(squirrels$MALE, squirrels$FEMALE, var.equal = FALSE)
squirrels_tvalue <- squirrels_t$statistic
squirrels_pvalue_ttest<- squirrels_t$p.value
squirrels_df<- squirrels_t$parameter
Conduct a Mann-Whitney test
squirrels_wilcox <- wilcox.test(squirrels$FEMALE,squirrels$MALE, paired=F)
squirrels_wvalue <- squirrels_wilcox$statistic
squirrels_pvalue_wilcox <- squirrels_wilcox$p.value
The histogram plots shows that the data for female and male squirrels are right-skewed, indicating that they are not normally distributed. Hence, using a two-sample t-test is unwise as the data should be normally distributed to use this test.
Print results of t test
paste("The t value is", round(squirrels_tvalue, 3))
## [1] "The t value is 2.166"
paste("The p value of the t-test is", round(squirrels_pvalue_ttest, 3))
## [1] "The p value of the t-test is 0.033"
paste("The df of the t-test is", round(squirrels_df, 3))
## [1] "The df of the t-test is 85.212"
Since the p-value is 0.033 which is less than 0.05, we can conclude from the t-test that there is a statistically significant difference between the weights of female and male squirrels.
Print results of Mann-Whitney test
paste("The w value is", round(squirrels_wvalue, 3))
## [1] "The w value is 1012"
paste("The p value of the Mann-Whitney test is", round(squirrels_pvalue_wilcox, 3))
## [1] "The p value of the Mann-Whitney test is 0.101"
Since the p-value is 0.101 which is more than 0.05, we cannot conclude from the Mann-Whitney test that there is a statistically significant difference between the weights of female and male squirrels.
A Type 1 error occurs where we conclude that there is a statistically significant difference between the weights of female and male squirrels but there is actually no statistically significant difference betweeen the weights of female and male squirrels.
An experiment was performed to compare four melon varieties. It was designed so that each variety was grown in six plots — but two plots growing variety 3 were accidentally destroyed. The data (Melons dataset) are in the columns YIELDM and VARIETY.
Null hypothesis: There is no statistically significant difference in the mean yields of the four melon varieties
Alternative hypothesis: There is a statistically significant difference in the mean yield of at least one melon variety
Plot a boxplot of the four different melon varieties
# factor the variety column
variety_melons <- as.factor(melons$VARIETY)
ggplot(melons, aes(x = variety_melons, y = YIELDM, fill=variety_melons)) +
geom_boxplot() +
labs(title = "Yields of different melon varieties",
x = "Melon Varieties",
y = "Yield",
fill = "Melon Variety")
Based on the boxplot, we can see that melon variety 2 has the highest median yield while melon variety 1 has the lowest median yield.
melons_stats <- melons %>%
group_by(VARIETY) %>%
summarise(
n_melons = n(),
mean_melons = mean(YIELDM, na.rm = TRUE),
sd_melons = sd(YIELDM, na.rm = TRUE),
se_melons = sd_melons / sqrt(n_melons),
ci_lower_melons = mean_melons - qt(0.975, df = n_melons - 1) * se_melons,
ci_upper_melons = mean_melons + qt(0.975, df = n_melons - 1) * se_melons
)
melons_stats
## # A tibble: 4 × 7
## VARIETY n_melons mean_melons sd_melons se_melons ci_lower_melons
## <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 6 20.5 4.69 1.92 15.6
## 2 2 6 37.4 3.95 1.61 33.3
## 3 3 4 20.5 4.76 2.38 12.9
## 4 4 6 29.9 2.23 0.910 27.6
## # ℹ 1 more variable: ci_upper_melons <dbl>
To show these results on a bar plot,
ggplot(melons_stats, aes(x = VARIETY, y = mean_melons, ymin = ci_lower_melons, ymax = ci_upper_melons)) +
geom_bar(stat = "identity", fill = "lightblue", color = "black") +
geom_errorbar(width = 0.1) +
labs(x = "Melon Variety", y = "Mean Yield", title = "Mean Yield of Melon Varieties with 95% CI")
Check if Melons dataset is normally distributed
ggplot(melons, aes(x = YIELDM)) +
geom_histogram(aes(y = ..density..), bins = 20, fill = "lightblue", color = "black") +
geom_density(color = "red", size = 1) +
labs(title = "Distribution of yield of melons")
Based on the histogram plotted, the data is normally distributed. Hence, we can use a 1-way ANOVA test.
Fit the melons data into a linear model where YIELDM is the dependent variable and VARIETY is the independent variable and then conduct as ANOVA test.
lm_model_melons <- lm(YIELDM ~ variety_melons, data = melons)
anova_table <- anova(lm_model_melons)
anova_table
## Analysis of Variance Table
##
## Response: YIELDM
## Df Sum Sq Mean Sq F value Pr(>F)
## variety_melons 3 1115.28 371.76 23.798 1.735e-06 ***
## Residuals 18 281.19 15.62
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(2, 2))
plot(lm_model_melons)
From the Residuals vs Fitted plot, we can see that the residuals are randomly scattered around zero and there is not a clear curve present. This suggest a linear relationship.
From the Q-Q Residuals plot, we can see that most of the points lie along the line and does not deviate much from the line. This indicates that the residuals are normally distributed.
From the Scale-Location plot, we can see that there is some difference in variability as the line is not completely straight but it is likely still homoscedasticity as the line is relatively straight and there is only slight unevenness.
From the Residuals vs Leverage plot, we can see that there are no extreme outliers as most of the points are grouped close together
Based on these plots, we can see that the assumptions of linear models has been met: - There is a linear relationship between the yield and variety of melons. - Yield of melons is normally distributed. - There is homoscedasticity of the data. The linear model is thus a good fit for this data.
F_value_melons <- anova_table$`F value`[1]
df1_melons <- anova_table$Df[1] # numerator df
df2_melons <- anova_table$Df[2] # denominator df
p_value_melons <- anova_table$`Pr(>F)`[1]
paste("The F value is", round(F_value_melons, 3))
## [1] "The F value is 23.798"
paste("The dfs are", round(df1_melons,3), "and", round(df2_melons),". The df of 3 is because there are 4 melon varieties so there are 3 independent comparisons made between group means. The df of 18 is the total sample size minus the number of groups (melon varieties).")
## [1] "The dfs are 3 and 18 . The df of 3 is because there are 4 melon varieties so there are 3 independent comparisons made between group means. The df of 18 is the total sample size minus the number of groups (melon varieties)."
paste("The p value is", round(p_value_melons, 8),". Since the p-value is < 0.05, we can conclude that there is a statistically significant difference in mean yield of the 4 melon varieties.")
## [1] "The p value is 1.73e-06 . Since the p-value is < 0.05, we can conclude that there is a statistically significant difference in mean yield of the 4 melon varieties."
A plant species is dioecious if each individual produces all male flowers or all female flowers. The Dioecious trees dataset contains data from 50 trees of one particular dioecious species, from a ten hectare area of mixed woodland.
For each individual, the SEX was recorded (coded as 1 for male and 2 for female), the diameter at breast height in millimetres (DBH), and the number of flowers on the tree at the time of measurement (FLOWERS).
Plot a boxplot of the number of flowers of male and female trees,
trees_sex <- factor(dioecioustrees$SEX,
#rename 1 and 2 in dataset to male and female
levels = c(1, 2),
labels = c("Male", "Female"))
ggplot(dioecioustrees, aes(x = trees_sex, y = FLOWERS, fill = trees_sex)) +
geom_boxplot() +
labs(title = "Number of flowers by sex of trees", x = "Sex", y = "Number of flowers")+
labs(fill = "Sex") #to change legend title
Based on this boxplot, we can see that the median number of flowers produced by female trees are lower than the median number of flowers produced by male trees.
There is also a greater variability in the number of flowers for female trees than male trees as the whiskers of the boxplot extends further for the female trees than the male trees
Female trees also have some extreme outliers while the male trees do not have extreme outliers, which may possibly indicate that some female trees produce an exceptionally high number of flowers.
Since the number of flowers is count data and we are testing if there is a statistically significant difference in the sex of the tree and the number of flowers produced (1 explanatory variable with 2 groups), we will use the Mann-Whitney U-test.
wilcox.test(FLOWERS ~ SEX, data = dioecioustrees)
##
## Wilcoxon rank sum test with continuity correction
##
## data: FLOWERS by SEX
## W = 298, p-value = 0.9763
## alternative hypothesis: true location shift is not equal to 0
Extract the w statistic and p-value from the test. Df will not be extracted as Mann-Whitney U-test does not have df.
mwtest_trees <- wilcox.test(FLOWERS ~ SEX, data = dioecioustrees)
paste("The W statistic is", mwtest_trees$statistic)
## [1] "The W statistic is 298"
paste("The p-value of the Mann-Whitney U-test is", round(mwtest_trees$p.value, 3))
## [1] "The p-value of the Mann-Whitney U-test is 0.976"
Since the p-value is larger than 0.05, we cannot conclude that there is a statistically significant difference between the number of flowers for male and female trees.