# Attach the packages used by this script: readxl reads the workbook,
# ggplot2 draws the figures and gridExtra arranges several plots on one page.
library(readxl)
library(ggplot2)
library(gridExtra)

# Make the assignment folder the working directory so that the relative
# workbook path used by read_excel() resolves.
# NOTE(review): this absolute path is machine-specific; anyone else running
# the script has to edit it (or open the script from a project rooted here).
setwd("C:/Users/racha/OneDrive/Desktop/NTU/Experimental Design/Assignment 1")

# Read the squirrel data. No sheet is named, so readxl's default sheet is used.
squirrels <- read_excel(path = "Assignment 1 data.xls")
# One 15-bin histogram per column (MALE, FEMALE), kept as separate plot
# objects so they can be placed on the same page afterwards.
hist_male <- ggplot(data = squirrels, mapping = aes(x = MALE)) +
  geom_histogram(bins = 15, colour = "black", fill = "skyblue") +
  labs(
    title = "Histogram of Weight of Male Squirrels",
    x = "Weight / kg",
    y = "Frequency"
  ) +
  theme_minimal()

hist_female <- ggplot(data = squirrels, mapping = aes(x = FEMALE)) +
  geom_histogram(bins = 15, colour = "black", fill = "pink") +
  labs(
    title = "Histogram of Weight of Female Squirrels",
    x = "Weight / kg",
    y = "Frequency"
  ) +
  theme_minimal()

# Place the two histograms one above the other in a single figure.
grid.arrange(grobs = list(hist_male, hist_female), ncol = 1)
# Both weight distributions on shared axes. Each column gets its own
# semi-transparent histogram layer, so the bars are drawn over one another
# (alpha = 0.6 keeps the lower layer visible). The constant strings mapped to
# `fill` create the legend entries that scale_fill_manual() colours.
ggplot(data = squirrels) +
  geom_histogram(
    mapping = aes(x = MALE, fill = "Male"),
    bins = 15, colour = "black", alpha = 0.6
  ) +
  geom_histogram(
    mapping = aes(x = FEMALE, fill = "Female"),
    bins = 15, colour = "black", alpha = 0.6
  ) +
  scale_fill_manual(values = c("Male" = "skyblue", "Female" = "pink")) +
  labs(
    title = "Stacked Histograms of Male and Female Weights",
    x = "Weight / kg",
    y = "Count",
    fill = "Sex"
  ) +
  theme_minimal()
# Side-by-side boxplots of squirrel weight by sex.
# The workbook stores the two groups as separate columns (MALE, FEMALE), so
# they are first stacked into long format: one row per measurement plus a
# `sex` factor. This lets a single geom_boxplot() layer draw both groups from
# a proper `data` argument instead of reaching into `squirrels$...` inside
# aes(), which bypasses ggplot2's data handling.
squirrel_weights <- data.frame(
  # Level order fixes the left-to-right order of the boxes: Male, then Female.
  sex = factor(
    rep(c("Male", "Female"), each = nrow(squirrels)),
    levels = c("Male", "Female")
  ),
  weight = c(squirrels$MALE, squirrels$FEMALE)
)

ggplot(squirrel_weights, aes(x = sex, y = weight, fill = sex)) +
  # The x axis already identifies the groups, so the fill legend is redundant.
  geom_boxplot(show.legend = FALSE) +
  scale_fill_manual(values = c("Male" = "skyblue", "Female" = "pink")) +
  labs(
    title = "Boxplot of Male and Female Squirrel Weights",
    x = "Gender",
    y = "Weight / kg"
  ) +
  theme_minimal()
# Two-sample t-test comparing mean weight of males and females.
# The defaults are spelled out: two-sided alternative, unequal variances
# (i.e. the Welch test) and a 95% confidence interval.
t.test(
  x = squirrels$MALE,
  y = squirrels$FEMALE,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
##
## Welch Two Sample t-test
##
## data: squirrels$MALE and squirrels$FEMALE
## t = 2.1662, df = 85.212, p-value = 0.03309
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.005981917 0.139618083
## sample estimates:
## mean of x mean of y
## 0.5908 0.5180
# Mann-Whitney U test (Wilcoxon rank-sum test) on the same two samples.
# The defaults are spelled out: two-sided, unpaired, with continuity correction.
wilcox.test(
  x = squirrels$MALE,
  y = squirrels$FEMALE,
  alternative = "two.sided",
  paired = FALSE,
  correct = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: squirrels$MALE and squirrels$FEMALE
## W = 1488, p-value = 0.1014
## alternative hypothesis: true location shift is not equal to 0
From the histograms and boxplots, the weights of male squirrels seem right-skewed, which suggests that the data are not normally distributed, making a parametric test unsuitable. For the weights of female squirrels, the skewness is less clear, especially on the histogram. Regardless, we can easily run an additional check for normality for both male and female squirrels using the Shapiro-Wilk test:
# Shapiro-Wilk normality test on the male weights (null hypothesis: the
# sample comes from a normal distribution).
shapiro.test(squirrels$MALE)
##
## Shapiro-Wilk normality test
##
## data: squirrels$MALE
## W = 0.94065, p-value = 0.01429
# Shapiro-Wilk normality test on the female weights (null hypothesis: the
# sample comes from a normal distribution).
shapiro.test(squirrels$FEMALE)
##
## Shapiro-Wilk normality test
##
## data: squirrels$FEMALE
## W = 0.94139, p-value = 0.0153
For both males and females, the p-value is < 0.05, which suggests that the weights for both sexes are not normally distributed and are hence unsuitable for a parametric test.
t = 2.17, df = 85.2, p-value = 0.0331. As the p-value < 0.05, we conclude that there is a statistically significant difference between the weights of male and female squirrels.
W = 1488, p-value = 0.101. As the p-value > 0.05, we conclude that there is no statistically significant difference between the weights of male and female squirrels.
Type I error, as the null hypothesis is rejected incorrectly, resulting in a “false positive”.
# Read the melon yield data from the "Melons" sheet of the same workbook.
melons <- read_excel(path = "Assignment 1 data.xls", sheet = "Melons")
The null hypothesis (H0) is that the mean yield is the same for all varieties of melons, i.e. there is no difference in yield between the different varieties. The alternative hypothesis (H1) is that the mean yield differs between at least two varieties of melons.
# One data frame per melon variety (codes 1-4 in the VARIETY column).
# which() turns the comparison into row positions, so rows where VARIETY is
# missing are left out rather than returned as NA rows.
variety1 <- melons[which(melons$VARIETY == 1), ]
variety2 <- melons[which(melons$VARIETY == 2), ]
variety3 <- melons[which(melons$VARIETY == 3), ]
variety4 <- melons[which(melons$VARIETY == 4), ]
# Boxplots of yield (YIELDM) for each melon variety.
# The plot is drawn from `melons` in a single layer: VARIETY is converted to a
# factor so its numeric codes become discrete x positions, and the same factor
# drives the fill colour. This replaces one hand-written layer per variety
# that pulled columns out with `$` inside aes().
ggplot(
  melons,
  aes(x = factor(VARIETY), y = YIELDM, fill = factor(VARIETY))
) +
  # The x axis already names the variety, so the fill legend is redundant.
  geom_boxplot(show.legend = FALSE) +
  scale_fill_manual(
    values = c(
      "1" = "skyblue",
      "2" = "pink",
      "3" = "forestgreen",
      "4" = "lavender"
    )
  ) +
  labs(
    title = "Boxplots of Yields of Melon Varieties",
    x = "Variety",
    y = "Yield / kg"
  ) +
  theme_minimal()
library(dplyr)
# Descriptive statistics of yield per variety: group size, mean, standard
# deviation, standard error of the mean and a 95% confidence interval.
stats <- melons %>%
  group_by(VARIETY) %>%
  summarise(
    n = n(),
    mean = mean(YIELDM),
    sd = sd(YIELDM),
    # Standard error of the mean; `sd` and `n` are the columns defined above.
    se = sd / sqrt(n),
    # Two-sided 95% interval: mean -/+ t(0.975, n - 1) * se.
    lower_ci = mean - qt(p = 0.975, df = n - 1) * se,
    upper_ci = mean + qt(p = 0.975, df = n - 1) * se
  )

# Show the summary table.
print(x = stats)
## # A tibble: 4 × 7
## VARIETY n mean sd se lower_ci upper_ci
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 6 20.5 4.69 1.92 15.6 25.4
## 2 2 6 37.4 3.95 1.61 33.3 41.5
## 3 3 4 20.5 4.76 2.38 12.9 28.0
## 4 4 6 29.9 2.23 0.910 27.6 32.2
# One-way ANOVA of yield on variety, fitted as a linear model.
# VARIETY holds numeric codes, so it is wrapped in factor() to be treated as
# a categorical predictor rather than as a numeric covariate.
melon_model <- lm(formula = YIELDM ~ factor(VARIETY), data = melons)

# ANOVA table (F test for the variety effect).
anova(object = melon_model)
## Analysis of Variance Table
##
## Response: YIELDM
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(VARIETY) 3 1115.28 371.76 23.798 1.735e-06 ***
## Residuals 18 281.19 15.62
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Draw the four default lm diagnostic plots in one 2 x 2 figure.
# par() returns the previous values of the settings it changes; keeping them
# lets the layout be restored to whatever was active before, instead of
# assuming it was a single 1 x 1 panel.
old_par <- par(mfrow = c(2, 2))
plot(melon_model)
# Restore the plotting layout that was in effect before this block.
par(old_par)
The residuals are approximately normally distributed, as the points mostly lie along the line in the Q-Q plot. In the Residuals vs Fitted plot there is no obvious curve, which suggests that the linearity assumption is met. However, there seems to be some heteroscedasticity, as the same plot shows greater variance at fitted values of ~20 and ~37. Similarly, the Scale-Location plot indicates heteroscedasticity, as the red line dips at fitted values of ~30. Finally, the Residuals vs Leverage plot suggests that some observations, such as 15 and 16, may be extreme values that influence the regression results.
F(3,18) = 23.8, p-value < 0.001. As p-value < 0.001, there is a statistically significant difference between at least two different varieties of melons.
# Read the flower counts from the "Dioecious trees" sheet of the workbook.
trees <- read_excel(path = "Assignment 1 data.xls", sheet = "Dioecious trees")

# Split the trees by the SEX code (1 = male, 2 = female, as labelled in the
# plots). which() yields row positions, so rows with a missing SEX are left
# out rather than returned as NA rows.
male <- trees[which(trees$SEX == 1), ]
female <- trees[which(trees$SEX == 2), ]
# Boxplots of the number of flowers per tree, by sex.
# The plot is drawn from `trees` in a single layer. The numeric SEX code is
# turned into a labelled factor (1 = Male, 2 = Female; level order fixes the
# left-to-right order of the boxes), replacing one hand-built layer per group
# that pulled columns out with `$` inside aes().
# Only rows coded 1 or 2 are kept, so any other code is not plotted.
flower_counts <- subset(trees, SEX %in% c(1, 2))
flower_counts$sex_label <- factor(
  flower_counts$SEX,
  levels = c(1, 2),
  labels = c("Male", "Female")
)

ggplot(flower_counts, aes(x = sex_label, y = FLOWERS, fill = sex_label)) +
  # The x axis already identifies the groups, so the fill legend is redundant.
  geom_boxplot(show.legend = FALSE) +
  scale_fill_manual(values = c("Male" = "skyblue", "Female" = "pink")) +
  labs(
    title = "Boxplot of Male and Female Flowers",
    x = "Sex",
    y = "Number of Flowers"
  ) +
  theme_minimal()
# Flower-count distributions of male and female trees on shared axes.
# Each sex is its own semi-transparent histogram layer, so the bars are drawn
# over one another (alpha = 0.6 keeps the lower layer visible). The constant
# strings mapped to `fill` create the legend entries coloured by
# scale_fill_manual().
ggplot() +
  geom_histogram(
    data = male,
    mapping = aes(x = FLOWERS, fill = "Male"),
    bins = 15, colour = "black", alpha = 0.6
  ) +
  geom_histogram(
    data = female,
    mapping = aes(x = FLOWERS, fill = "Female"),
    bins = 15, colour = "black", alpha = 0.6
  ) +
  scale_fill_manual(values = c("Male" = "skyblue", "Female" = "pink")) +
  labs(
    title = "Stacked Histogram of Male and Female Flowers",
    x = "Number of Flowers",
    y = "Count",
    fill = "Sex"
  ) +
  theme_minimal()
From the boxplot, the median number of flowers on male trees is slightly
higher than that on female trees, but the range of the number of flowers
on female trees is much larger than that on male trees. From the histogram,
the outliers with a high number of flowers on female trees are especially
obvious, with some female trees having more than 1000 flowers.
The hypothesis is that male and female trees produce different numbers of flowers. We are dealing with count data and a single variable (number of flowers) with two groups (male, female). The data for female flowers is also obviously right-skewed, i.e. not normally distributed. As such, we will do a non-parametric Mann-Whitney U-Test.
# Mann-Whitney U test (Wilcoxon rank-sum test) comparing flower counts of
# male and female trees. The defaults are spelled out: two-sided, unpaired,
# with continuity correction.
wilcox.test(
  x = male$FLOWERS,
  y = female$FLOWERS,
  alternative = "two.sided",
  paired = FALSE,
  correct = TRUE
)
##
## Wilcoxon rank sum test with continuity correction
##
## data: male$FLOWERS and female$FLOWERS
## W = 298, p-value = 0.9763
## alternative hypothesis: true location shift is not equal to 0
W = 298, p-value = 0.976. As the p-value > 0.05, we conclude that there is no statistically significant difference between the number of flowers produced by male and female trees.