# load dataset
data(iris)
# scatter plots for the species features

# use only numeric features
data_numeric <- iris[, 1:4]   # takes first 4 columns of data

# convert species names to numbers and assign colors
species_colors <- c("red", "green3", "blue")[as.numeric(as.factor(iris$Species))]

# creates scatter plots
pairs(data_numeric,
      main = "Iris Data (red=setosa, green=versicolor, blue=virginica)",
      pch = 21, bg = species_colors)
# boxplots for each feature
library(ggplot2)
library(gridExtra)  # for arranging plots in one figure

# sepal length
p1 <- ggplot(iris, aes(x = Species, y = Sepal.Length, fill = Species)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Sepal Length by Species")

# sepal width
p2 <- ggplot(iris, aes(x = Species, y = Sepal.Width, fill = Species)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Sepal Width by Species")

# petal length
p3 <- ggplot(iris, aes(x = Species, y = Petal.Length, fill = Species)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Petal Length by Species")

# petal width
p4 <- ggplot(iris, aes(x = Species, y = Petal.Width, fill = Species)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Petal Width by Species")

# put all plots in one figure
grid.arrange(p1, p2, p3, p4, ncol = 2)
# histogram of the data
library(ggplot2)

# petal length
p1 <- ggplot(iris, aes(x = Petal.Length, fill = Species)) +
  geom_histogram(alpha = 0.6, position = "identity", bins = 20) +
  theme_minimal() +
  ggtitle("Histogram of Petal Length")

# petal width
p2 <- ggplot(iris, aes(x = Petal.Width, fill = Species)) +
  geom_histogram(alpha = 0.6, position = "identity", bins = 20) +
  theme_minimal() +
  ggtitle("Histogram of Petal Width")

# plot
library(gridExtra)
grid.arrange(p1, p2, ncol = 2)
# confidence interval for the mean
ci_mean <- function(x, conf_level = 0.95) {
  n <- length(x)
  mean_x <- mean(x)
  se <- sd(x) / sqrt(n)
  alpha <- 1 - conf_level
  t_score <- qt(1 - alpha/2, df = n - 1)
  margin <- t_score * se
  lower <- mean_x - margin
  upper <- mean_x + margin
  return(c(lower = lower, mean = mean_x, upper = upper))
}

# estimate confidence interval for petal length of each species
# split the data by each species
setosa_pl <- iris$Petal.Length[iris$Species == "setosa"]
versicolor_pl <- iris$Petal.Length[iris$Species == "versicolor"]
virginica_pl <- iris$Petal.Length[iris$Species == "virginica"]

# calculate confidence intervals using split data
ci_setosa <- ci_mean(setosa_pl)
ci_versicolor <- ci_mean(versicolor_pl)
ci_virginica <- ci_mean(virginica_pl)

# print results
print('Setosa')
print(ci_setosa)
print('Versicolor')
print(ci_versicolor)
print('Virginica')
print(ci_virginica)
# hypothesis test for comparison of 2 population means
t_test <- function(x1, x2, alternative = "two.sided", alpha = 0.05) {
  n1 <- length(x1)
  n2 <- length(x2)
  mean1 <- mean(x1)
  mean2 <- mean(x2)
  var1 <- var(x1)
  var2 <- var(x2)
  
  # t-statistic
  t_stat <- (mean1 - mean2) / sqrt(var1/n1 + var2/n2)
  
  # degrees of freedom
  df <- (var1/n1 + var2/n2)^2 / ((var1^2)/(n1^2 * (n1 - 1)) + (var2^2)/(n2^2 * (n2 - 1)))
  
  # calculate p-value based on the alternative hypothesis
  if (alternative == "greater") {
    p_value <- 1 - pt(t_stat, df)
  } else if (alternative == "less") {
    p_value <- pt(t_stat, df)
  } else {
    p_value <- 2 * (1 - pt(abs(t_stat), df))
  }
  
  # results
  list(
    t_statistic = t_stat,
    degrees_freedom = df,
    p_value = p_value,
    reject_null = p_value < alpha
  )
}

# hypothesis test 1: Virginica > Versicolor
test1 <- t_test(virginica_pl, versicolor_pl, alternative = "greater")
print("Hypothesis Test for Petal Length: Virginica > Versicolor")
print(test1)

# hypothesis test 2: Versicolor > Setosa
test2 <- t_test(versicolor_pl, setosa_pl, alternative = "greater")
print("Hypothesis Test for Petal Length: Versicolor > Setosa")
print(test2)