#Question3
rm(list=ls())
yield_values <- c(14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48)
frequencies <- c(1, 4, 1, 5, 7, 10, 26, 22, 39, 33, 28, 18, 27, 13, 4, 5, 6, 1)
yield_data <- rep(yield_values, times = frequencies)
table(yield_data)
## yield_data
## 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 
##  1  4  1  5  7 10 26 22 39 33 28 18 27 13  4  5  6  1
#(a)Sample mean, median, and mode
mean_yield <- mean(yield_data)
mean_yield
## [1] 31.864
median_yield <- median(yield_data)
median_yield
## [1] 32
mode_yield <- as.numeric(names(which.max(table(yield_data))))
mode_yield
## [1] 30
#(b)Sample variance, standard deviation, and coefficient of variation
variance_yield <- var(yield_data)
sd_yield <- sd(yield_data)
cv_yield <- (sd_yield / mean_yield) * 100
variance_yield
## [1] 39.35492
sd_yield
## [1] 6.27335
cv_yield
## [1] 19.68789
Minimum= min(yield_data)
Maximum=max(yield_data)
Q_1=quantile(yield_data,0.25)
Q_3=quantile(yield_data,0.75)
Minimum
## [1] 14
Maximum
## [1] 48
Q_1
## 25% 
##  28
Q_3
## 75% 
##  36
#(c)Boxplot
boxplot(yield_data, horizontal = TRUE, main = "Boxplot of Soybean Yields", xlab = "Yield (grams)")

#Question2

tree_data=read.csv("C:/Users/Sabuj Ganguly/OneDrive/Documents/PhD 1st sem/STAT 5034 Inference/treedat.csv")
height <- tree_data$height
normal_kernel <- function(x) {
  (1 / sqrt(2 * pi)) * exp(-0.5 * x^2)    #We have used gaussian kernel here
}


kde_normal <- function(x, data, h) {
  n <- length(data)
  sum_kernel <- sum(normal_kernel((x - data) / h))
  (1 / (n * h)) * sum_kernel
}


kde_normal_vectorized <- Vectorize(kde_normal, vectorize.args = "x") #vectorized it to work over multiple x


x_grid <- seq(min(height) - 10, max(height) + 10, length.out = 1000) #setting up grid to work over multiple x

bandwidths <- c(0.77, 7.7, 77)
kde_list <- lapply(bandwidths, function(h) {
  kde_normal_vectorized(x_grid, height, h)
})


modes <- sapply(1:length(bandwidths), function(i) {
  kde_vals <- kde_list[[i]]  #finding x where KDE is max
  x_grid[which.max(kde_vals)]
})


par(mfrow = c(1, 3))  

for (i in 1:length(bandwidths)) {
  h <- bandwidths[i]
  kde_vals <- kde_list[[i]]
  
  
  hist(height, prob = TRUE, main = paste("KDE with h =", h), 
       xlab = "Height", col = "lightblue", breaks = 20)
  
  lines(x_grid, kde_vals, col = "red", lwd = 2)
}

par(mfrow = c(1, 1))

cat("Modes for each bandwidth:\n")  #for each bandwidth I'm getting the modes
## Modes for each bandwidth:
for (i in 1:length(bandwidths)) {
  cat("h =", bandwidths[i], ": Mode =", round(modes[i], 2), "\n") 
}
## h = 0.77 : Mode = 96.43 
## h = 7.7 : Mode = 96.13 
## h = 77 : Mode = 87.31
n <- length(height)
h_values <- c(0.77, 7.7, 77)
a <- 92
empirical <- mean(height < a)
cat("Empirical P(X < 92) =", empirical, "\n") #This is the empirical probability
## Empirical P(X < 92) = 0.5405405
kde_prob <- function(a, data, h) {
  n <- length(data)
  sum(pnorm((a - data) / h)) / n
}

probs <- sapply(h_values, function(h) {
  kde_prob(a, height, h) #computing for each bandwidth
})

cat("KDE-based P(X < 92):\n")
## KDE-based P(X < 92):
for (i in 1:length(h_values)) {
  cat("h =", h_values[i], ": ", round(probs[i], 4), "\n")
}
## h = 0.77 :  0.5385 
## h = 7.7 :  0.5539 
## h = 77 :  0.5241

#Question1

gamma_pdf<- function(x, shape, scale) {
  dgamma(x, shape = shape, scale = scale)
}

par(mfrow = c(2, 2))

#(a)
x<- seq(0, 10, length.out = 1000)
y1_a<- gamma_pdf(x, 2, 1)
y2_a<- gamma_pdf(x, 4, 0.5)
plot(x, y1_a, type = "l", col = "blue", lwd = 2, ylim = c(0, 0.6), main = "Same mean, different variance and skew", xlab ="x", ylab ="Density")
lines(x, y2_a, col = "red", lwd = 2)


#(b)
y1_b<- gamma_pdf(x, 2, 1)
y2_b<- gamma_pdf(x, 8, 0.5)
plot(x, y1_b, type = "l", col = "blue", lwd = 2, ylim = c(0, 0.6), 
     main = "Same variance, different mean and skew", xlab ="x", ylab= "Density")
lines(x, y2_b, col = "red", lwd = 2)


#(c)
y1_c <- gamma_pdf(x, 2, 1)
y2_c <- gamma_pdf(x, 2, 2)
plot(x, y1_c, type = "l", col = "blue", lwd = 2, ylim = c(0, 0.6), 
     main = "Same skew, different mean and variance", xlab= "x", ylab= "Density")
lines(x, y2_c, col = "red", lwd = 2)