Question 1

Load data

# Load dataset
# The workbook path below is relative, so run/knit this document from the
# folder that contains "Assignment 1 data.xls" (e.g. open that folder as an
# RStudio project). A setwd() call with one machine's absolute path is
# deliberately not used: it makes the analysis fail on any other computer.
library(readxl)
# No sheet is named here, so read_excel() reads the workbook's first sheet.
squirrels <- read_excel("Assignment 1 data.xls")

1A Plot data with histograms and boxplots

# Load packages
library(ggplot2)
library(gridExtra)

# Helper: one weight histogram of the squirrels data. The two sexes differ
# only in the mapped column, the bar fill and the title.
weight_histogram <- function(mapping, bar_fill, plot_title) {
  ggplot(squirrels, mapping) +
    geom_histogram(bins = 15, color = "black", fill = bar_fill) +
    labs(title = plot_title, x = "Weight / kg", y = "Frequency") +
    theme_minimal()
}

# Plot histograms
hist_male <- weight_histogram(
  aes(x = MALE), "skyblue", "Histogram of Weight of Male Squirrels"
)
hist_female <- weight_histogram(
  aes(x = FEMALE), "pink", "Histogram of Weight of Female Squirrels"
)

# Combine histograms into 1 figure (one above the other)
grid.arrange(hist_male, hist_female, ncol = 1)

# Overlaid histograms
# Bars are stacked only within a single geom_histogram() layer. Two separate
# layers are drawn on top of each other (which is why alpha < 1 is needed to
# see both), so the figure is an overlay and the title is worded accordingly.
ggplot(squirrels) +
  geom_histogram(aes(x = MALE, fill = "Male"),
                 color = "black", alpha = 0.6, bins = 15) +
  geom_histogram(aes(x = FEMALE, fill = "Female"),
                 color = "black", alpha = 0.6, bins = 15) +
  # The constant fill labels above only exist to create legend entries;
  # map them to the colours used elsewhere for each sex.
  scale_fill_manual(values = c("Male" = "skyblue", "Female" = "pink")) +
  labs(title = "Overlaid Histograms of Male and Female Weights",
       x = "Weight / kg", y = "Count", fill = "Sex") +
  theme_minimal()

# Plot boxplots
# Reshape the two weight columns into long format (one row per squirrel, with
# a sex label) so ggplot can map columns by name instead of receiving
# `squirrels$...` vectors and constant factors inside aes().
squirrel_weights <- data.frame(
  sex = factor(rep(c("Male", "Female"), each = nrow(squirrels)),
               levels = c("Male", "Female")),  # fixes the left-to-right order
  weight = c(squirrels$MALE, squirrels$FEMALE)
)

ggplot(squirrel_weights, aes(x = sex, y = weight, fill = sex)) +
  # The x axis already identifies each sex, so the fill legend is suppressed
  geom_boxplot(show.legend = FALSE) +
  scale_fill_manual(values = c("Male" = "skyblue", "Female" = "pink")) +
  labs(title = "Boxplot of Male and Female Squirrel Weights",
       x = "Gender", y = "Weight / kg") +
  theme_minimal()

1B Perform statistical tests

# 2-sample t-test
# Called on two vectors with default arguments, t.test() runs Welch's
# unequal-variance test, as the output header below confirms
t.test(squirrels$MALE, squirrels$FEMALE)
## 
##  Welch Two Sample t-test
## 
## data:  squirrels$MALE and squirrels$FEMALE
## t = 2.1662, df = 85.212, p-value = 0.03309
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.005981917 0.139618083
## sample estimates:
## mean of x mean of y 
##    0.5908    0.5180
# Mann-Whitney test
# (run with wilcox.test(); R reports it as the Wilcoxon rank sum test)
wilcox.test(squirrels$MALE, squirrels$FEMALE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  squirrels$MALE and squirrels$FEMALE
## W = 1488, p-value = 0.1014
## alternative hypothesis: true location shift is not equal to 0

1C Parametric tests are unwise

From the histograms and boxplots, the weights of male squirrels seem right-skewed, which suggests that the data are not normally distributed, making a parametric test unsuitable. For the weights of female squirrels, the skewness is less clear, especially on the histogram. Regardless, we can easily check normality for both male and female squirrels using the Shapiro-Wilk test:

# Shapiro-Wilk test of normality for the male weights
shapiro.test(squirrels$MALE)
## 
##  Shapiro-Wilk normality test
## 
## data:  squirrels$MALE
## W = 0.94065, p-value = 0.01429
# Shapiro-Wilk test of normality for the female weights
shapiro.test(squirrels$FEMALE)
## 
##  Shapiro-Wilk normality test
## 
## data:  squirrels$FEMALE
## W = 0.94139, p-value = 0.0153

For both males and females, the p-value is < 0.05, so we reject the null hypothesis of normality. This suggests that the weights of both sexes are not normally distributed, and hence are unsuitable for a parametric test.

1D Interpretation of 2-sample t-test

t = 2.17, df = 85.2, p-value = 0.0331. As the p-value < 0.05, we conclude that there is a statistically significant difference between the weights of male and female squirrels.

1E Interpretation of Mann-Whitney test

W = 1488, p-value = 0.101. As the p-value > 0.05, we conclude that there is no statistically significant difference between the weights of male and female squirrels.

1F Type of error if t-test is used

Type I error, as the null hypothesis is rejected incorrectly, resulting in a “false positive”.

Question 2

Load data

# Load the "Melons" sheet of the assignment workbook
melons <- read_excel("Assignment 1 data.xls", sheet = "Melons")

2A Hypotheses

The null hypothesis (H0) is that the mean yield is the same for all varieties of melons. The alternative hypothesis (H1) is that the mean yield differs between at least two varieties of melons.

2B Plot data

# Plot boxplots
# VARIETY is stored as a number, so it is converted to a factor to get one
# box per variety. Mapping columns of `melons` directly replaces the four
# per-variety subsets and the `$` references inside aes().
ggplot(melons, aes(x = factor(VARIETY), y = YIELDM,
                   fill = factor(VARIETY))) +
  # The x axis already identifies each variety, so the fill legend is dropped
  geom_boxplot(show.legend = FALSE) +
  scale_fill_manual(values = c("1" = "skyblue", "2" = "pink",
                               "3" = "forestgreen", "4" = "lavender")) +
  labs(title = "Boxplots of Yields of Melon Varieties",
       x = "Variety", y = "Yield / kg") +
  theme_minimal()

2C Descriptive stats

library(dplyr)

# Per-variety descriptive statistics.
# Step 1 collapses each variety to its sample size, mean and SD;
# step 2 derives the standard error and a t-based 95% confidence interval
# (0.975 quantile of the t distribution with n - 1 degrees of freedom).
stats <- melons %>%
  group_by(VARIETY) %>%
  summarise(n = n(), mean = mean(YIELDM), sd = sd(YIELDM)) %>%
  mutate(
    se = sd / sqrt(n),
    lower_ci = mean - qt(0.975, df = n - 1) * se,
    upper_ci = mean + qt(0.975, df = n - 1) * se
  )

# Print descriptive stats
print(stats)
## # A tibble: 4 × 7
##   VARIETY     n  mean    sd    se lower_ci upper_ci
##     <dbl> <int> <dbl> <dbl> <dbl>    <dbl>    <dbl>
## 1       1     6  20.5  4.69 1.92      15.6     25.4
## 2       2     6  37.4  3.95 1.61      33.3     41.5
## 3       3     4  20.5  4.76 2.38      12.9     28.0
## 4       4     6  29.9  2.23 0.910     27.6     32.2

2D ANOVA

# Fit linear model, ensuring that VARIETY is categorical (factor)
# VARIETY is read in as a number, so without factor() lm() would fit a single
# slope across the variety codes instead of a separate mean per variety
melon_model <- lm(YIELDM ~ factor(VARIETY), data = melons)

# ANOVA
# Prints the one-way ANOVA table (F-test for the VARIETY term) of the model
anova(melon_model)
## Analysis of Variance Table
## 
## Response: YIELDM
##                 Df  Sum Sq Mean Sq F value    Pr(>F)    
## factor(VARIETY)  3 1115.28  371.76  23.798 1.735e-06 ***
## Residuals       18  281.19   15.62                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2E Diagnostic plots

# Plot 4 diagnostic plots in 1 figure
# par() returns the settings it replaces; keep them so the layout that was
# active before this chunk can be restored exactly, rather than assuming it
# was a single panel.
old_par <- par(mfrow = c(2, 2))
plot(melon_model)

# Restore the previous plot layout
par(old_par)

The residuals are approximately normal, as the points mostly lie along the line in the Q-Q plot. From the Residuals vs Fitted plot, there is no obvious curve, which suggests that linear relationship assumptions are met. However, there seems to be heteroscedasticity, as the same plot shows greater variance at fitted values ~20 and ~37. Similarly, the Scale-Location plot indicates heteroscedasticity, as the red line dips at fitted values ~30. Finally, the Residuals vs Leverage plot suggests that some points, such as 15 and 16, may be extreme values that influence the regression results.

2F Reporting data

F(3,18) = 23.8, p-value < 0.001. As p-value < 0.001, there is a statistically significant difference between at least two different varieties of melons.

Question 3

Load data

# Load the "Dioecious trees" sheet of the assignment workbook
trees <- read_excel("Assignment 1 data.xls", sheet = "Dioecious trees")

3A Boxplot and histogram

# Subset data based on sex (this analysis treats SEX == 1 as male and
# SEX == 2 as female); both subsets are reused by the later tests and plots
male <- subset(trees, SEX == 1)
female <- subset(trees, SEX == 2)

# Boxplot
# Stack the two subsets into one long data frame with a sex label so ggplot
# can map columns by name instead of receiving `male$...` / `female$...`
# vectors and constant factors inside aes().
flowers_by_sex <- data.frame(
  sex = factor(rep(c("Male", "Female"), times = c(nrow(male), nrow(female))),
               levels = c("Male", "Female")),  # fixes the left-to-right order
  flowers = c(male$FLOWERS, female$FLOWERS)
)

ggplot(flowers_by_sex, aes(x = sex, y = flowers, fill = sex)) +
  # The x axis already identifies each sex, so the fill legend is suppressed
  geom_boxplot(show.legend = FALSE) +
  scale_fill_manual(values = c("Male" = "skyblue", "Female" = "pink")) +
  labs(title = "Boxplot of Male and Female Flowers",
       x = "Sex", y = "Number of Flowers") +
  theme_minimal()

# Overlaid histogram
# Bars are stacked only within a single geom_histogram() layer. The male and
# female layers are separate, so they are drawn on top of each other (hence
# alpha < 1) and the title describes the figure as an overlay.
ggplot(mapping = aes(x = FLOWERS)) +
  geom_histogram(aes(fill = "Male"), data = male,
                 color = "black", alpha = 0.6, bins = 15) +
  geom_histogram(aes(fill = "Female"), data = female,
                 color = "black", alpha = 0.6, bins = 15) +
  # The constant fill labels above only exist to create legend entries
  scale_fill_manual(values = c("Male" = "skyblue", "Female" = "pink")) +
  labs(title = "Overlaid Histogram of Male and Female Flowers",
       x = "Number of Flowers", y = "Count", fill = "Sex") +
  theme_minimal()

From the boxplot, the median number of male flowers is slightly higher than that of female flowers, but the range of number of female flowers is much larger than that of male. From the histogram, the outliers of high number of female flowers are especially obvious, with some female trees having more than 1000 flowers.

3B Hypothesis testing

The hypothesis is that male and female trees produce different numbers of flowers. We are dealing with count data and a single variable (number of flowers) with two groups (male, female). The data for female flowers are also obviously right-skewed, i.e. not normally distributed. As such, we will do a non-parametric Mann-Whitney U-Test.

# Mann-Whitney test
# (run with wilcox.test(); R reports it as the Wilcoxon rank sum test)
wilcox.test(male$FLOWERS, female$FLOWERS)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  male$FLOWERS and female$FLOWERS
## W = 298, p-value = 0.9763
## alternative hypothesis: true location shift is not equal to 0

3C Interpretation

W = 298, p-value = 0.976. As the p-value > 0.05, we conclude that there is no statistically significant difference between the number of flowers produced by male and female trees.