# Task 1 — Create Subsets ----
data(iris)

# 1. Create a subset called setosa containing only observations where
#    Species == "setosa".
setosa <- subset(iris, Species == "setosa")

# 2. Create a subset called versicolor containing only observations where
#    Species == "versicolor".
versicolor <- subset(iris, Species == "versicolor")

# 3. Report the sample size of each subset and the sample mean of
#    Sepal.Length for each subset.
n_setosa <- nrow(setosa)
n_versicolor <- nrow(versicolor)
mean_setosa <- mean(setosa$Sepal.Length)
mean_versicolor <- mean(versicolor$Sepal.Length)

# Console output is kept as `#>` comments so the script still parses and runs.
n_setosa
#> [1] 50
n_versicolor
#> [1] 50
mean_setosa
#> [1] 5.006
mean_versicolor
#> [1] 5.936
# Task 2 — One-Sample t-Test Confidence Interval ----
# 1. 90% confidence interval for the mean Sepal.Length of versicolor.
#    t.test() also tests H0: mu = 0 by default (see the "alternative
#    hypothesis" line in the output); only the interval is used here.
versicolor_test <- t.test(versicolor$Sepal.Length, conf.level = 0.90)

# Console output is kept as `#>` comments so the script still parses and runs.
versicolor_test
#> One Sample t-test
#> data: versicolor$Sepal.Length
#> t = 81.318, df = 49, p-value < 2.2e-16
#> alternative hypothesis: true mean is not equal to 0
#> 90 percent confidence interval:
#> 5.813616 6.058384
#> sample estimates:
#> mean of x
#> 5.936
versicolor_test$estimate
#> mean of x
#> 5.936
versicolor_test$conf.int
#> [1] 5.813616 6.058384
#> attr(,"conf.level")
#> [1] 0.9
versicolor_test$parameter
#> df
#> 49

# 2. Point estimate, confidence interval, and degrees of freedom extracted
#    from the test object.
versicolor_test$estimate
#> mean of x
#> 5.936
versicolor_test$conf.int
#> [1] 5.813616 6.058384
#> attr(,"conf.level")
#> [1] 0.9
versicolor_test$parameter
#> df
#> 49
# Task 3 — Two-Sample Welch t-Test ----
# 1. Compare mean Sepal.Length between setosa (x) and versicolor (y).
#    With no var.equal argument t.test() runs the Welch (unequal-variance)
#    test, and the confidence level defaults to 95%, as the output shows.
two_sample_test <- t.test(setosa$Sepal.Length, versicolor$Sepal.Length)

# Console output is kept as `#>` comments so the script still parses and runs.
two_sample_test
#> Welch Two Sample t-test
#> data: setosa$Sepal.Length and versicolor$Sepal.Length
#> t = -10.521, df = 86.538, p-value < 2.2e-16
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -1.1057074 -0.7542926
#> sample estimates:
#> mean of x mean of y
#> 5.006 5.936
two_sample_test$estimate
#> mean of x mean of y
#> 5.006 5.936
two_sample_test$conf.int
#> [1] -1.1057074 -0.7542926
#> attr(,"conf.level")
#> [1] 0.95
two_sample_test$p.value
#> [1] 3.746743e-17
# 2.
# The estimated difference in means (setosa − versicolor): -0.93
# 95% confidence interval for the difference: (-1.1057074, -0.7542926)
# p-value: 3.746743e-17
# 3.
# The estimated difference in means is approximately -0.93, meaning that setosa sepals are about 0.93 units shorter than versicolor sepals. The 95% confidence interval for the difference is (-1.106, -0.754). Since this confidence interval does not include 0, it suggests a real difference between the groups. The p-value is extremely small (3.75e-17), which is much smaller than 0.05. Therefore, the difference in mean Sepal.Length between setosa and versicolor is statistically significant. The data provide evidence that versicolor flowers have longer sepals on average than setosa flowers.
# Task 4
# (a)
# 1. In this context, a Type I error occurs when we reject the null hypothesis that the two group means are equal even though they actually are equal.
#2. If α = 0.05, the probability of making a Type I error would be 5%.
#3. Rejecting the null hypothesis would be a mistake as it says that they are different when they are actually the same.
# (b) Permutation simulation of the false-rejection rate
# Both species are pooled and the pooled values are randomly re-split into two
# groups, so the two groups come from the same set of values. Each Welch
# t-test that rejects at level alpha is counted as a rejection.
data(iris)
# NOTE: setosa and versicolor are (re)assigned here as plain numeric vectors
# of Sepal.Length, not data frames.
setosa <- iris[iris$Species == "setosa", "Sepal.Length"]
versicolor <- iris[iris$Species == "versicolor", "Sepal.Length"]

# Pooled
pooled <- c(setosa, versicolor)

set.seed(2026)  # fixed seed so the shuffles are reproducible
nrep <- 100
alpha <- 0.05
# Size of the first re-sampled group, taken from the data (50 here) instead
# of being hard-coded in the index ranges.
n_first <- length(setosa)

# One 0/1 rejection indicator per replication.
rejections <- integer(nrep)
for (i in seq_len(nrep)) {
  # Shuffle the pooled values
  shuffled <- sample(pooled)
  # Split into two groups: the first n_first values and the remainder
  g1 <- shuffled[seq_len(n_first)]
  g2 <- shuffled[-seq_len(n_first)]
  # Welch two-sample t-test on the shuffled groups
  t_res <- t.test(g1, g2)
  rejections[i] <- as.integer(t_res$p.value < alpha)
}

num_rejections <- sum(rejections)
Prop_rejections <- mean(rejections)

# Console output is kept as `#>` comments so the script still parses and runs.
num_rejections
#> [1] 6
Prop_rejections
#> [1] 0.06