Question 1: Monte Carlo Estimation

1.(a)

set.seed(4060)
theta <- 3
sigma <- 1.5
M <- 1000
N <- 30
p <- means <- sds <- numeric(M)  # p[i] = 1 when the i-th interval covers theta

for (i in 1:M) {
  x <- rnorm(N, theta, sigma)
  means[i] <- mean(x)
  sds[i] <- sd(x)
  C <- c(means[i] - 1.96 * (sds[i] / sqrt(N)), 
         means[i] + 1.96 * (sds[i] / sqrt(N)))
  if (theta >= C[1] && theta <= C[2]) {
    p[i] <- 1
  }
}
cat("Proportion of intervals containing theta (95% CI):", mean(p), "\n")
## Proportion of intervals containing theta (95% CI): 0.934

1.(b)

p2 <- numeric(M)
for (i in 1:M) {
  x <- rnorm(N, theta, sigma)
  means[i] <- mean(x)
  sds[i] <- sd(x)
  C <- c(means[i] - 1.645 * (sds[i] / sqrt(N)), 
         means[i] + 1.645 * (sds[i] / sqrt(N)))
  if (theta >= C[1] && theta <= C[2]) {
    p2[i] <- 1
  }
}
cat("Proportion of intervals containing theta (90% CI):", mean(p2), "\n")
## Proportion of intervals containing theta (90% CI): 0.884

1.(c)

For a 95% confidence interval, the critical value is \(z = 1.96\), leaving 2.5% in each tail.
For a 90% confidence interval, the critical value is \(z = 1.645\), leaving 5% in each tail.
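
These critical values come straight from the standard normal quantile function, as a quick check confirms:

qnorm(0.975)  # 1.959964, the 95% critical value
qnorm(0.95)   # 1.644854, the 90% critical value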

A higher confidence level produces a wider interval, so a larger range of values is considered plausible for the parameter. This results in a higher proportion of intervals containing the true parameter value, but at the cost of reduced precision.

1.(d)

As the sample size \(N\) increases, the standard error \(s/\sqrt{N}\) shrinks, so the intervals become narrower, and the empirical coverage proportions converge to their nominal levels (0.95 and 0.90). At \(N = 30\), using \(z = 1.96\) rather than the slightly larger t critical value accounts for the mild undercoverage (0.934) seen in part (a).

This demonstrates the consistency of the Monte Carlo simulation with increasing sample size and supports the validity of the confidence interval calculations.
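
A sketch of how this could be verified empirically, reusing theta, sigma and M from part (a); the loop variable N2 and the vector hits are illustrative names, not part of the original code:

for (N2 in c(10, 30, 100, 500)) {
  hits <- numeric(M)
  for (i in 1:M) {
    xs <- rnorm(N2, theta, sigma)
    half <- 1.96 * sd(xs) / sqrt(N2)  # half-width of the 95% interval
    hits[i] <- (theta >= mean(xs) - half) && (theta <= mean(xs) + half)
  }
  cat("N =", N2, "empirical 95% coverage:", mean(hits), "\n")
}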

Question 2: Bootstrap Analysis

2.(a)

rm(list = ls())
set.seed(4060)
library(MASS)

x <- Animals
B <- 1000
n <- nrow(Animals)
brain_ms <- body_ms <- numeric(B)  # storage for bootstrap means

for (i in 1:B) {
  ib <- sample(1:n, size = n, replace = TRUE)  # bootstrap resample of row indices
  brw <- x$brain[ib]
  bow <- x$body[ib]
  brain_ms[i] <- mean(brw)
  body_ms[i] <- mean(bow)
}

cat("Bootstrap mean of brain weights:", mean(brain_ms), "\n")
## Bootstrap mean of brain weights: 568.1507
cat("Bootstrap mean of body weights:", mean(body_ms), "\n")
## Bootstrap mean of body weights: 4241.751
cat("Original mean of brain weights:", mean(x$brain), "\n")
## Original mean of brain weights: 574.5214
cat("Original mean of body weights:", mean(x$body), "\n")
## Original mean of body weights: 4278.439

2.(b)

ratio <- numeric(B)
for (i in 1:B) {
  ib <- sample(1:n, size = n, replace = TRUE)
  brw <- x$brain[ib]
  bow <- x$body[ib]
  ratio[i] <- mean(brw / bow)  # mean brain-to-body ratio (brain in g, body in kg)
}

cat("Bootstrap mean ratio:", mean(ratio), "\n")
## Bootstrap mean ratio: 6.24373
cat("Original mean ratio:", mean(x$brain / x$body), "\n")
## Original mean ratio: 6.199137

2.(c)

mean_diff <- mean(body_ms) - mean(x$body)
cat("Difference between bootstrap mean and original mean (body weights):", mean_diff, "\n")
## Difference between bootstrap mean and original mean (body weights): -36.68763
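
This difference is the usual bootstrap estimate of the bias of the sample mean; subtracting it from the original estimate gives a bias-corrected version (a sketch; bias_corrected is an illustrative name):

bias_corrected <- mean(x$body) - mean_diff  # equals 2*mean(x$body) - mean(body_ms)
cat("Bias-corrected mean of body weights:", bias_corrected, "\n")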

2.(d)

ci <- quantile(body_ms, c(0.025, 0.975))  # percentile bootstrap interval
cat("95% confidence interval for body weights:", ci, "\n")
## 95% confidence interval for body weights: 419.4387 10920.69
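
For comparison, the boot package produces the same kind of percentile interval; a minimal sketch, where the statistic function stat_mean is an illustrative name:

library(boot)
stat_mean <- function(d, idx) mean(d$body[idx])  # mean body weight of a resample
bo <- boot(x, stat_mean, R = B)
boot.ci(bo, type = "perc")  # percentile interval, comparable to quantile(body_ms, ...)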

2.(e)

cat("Standard deviation of body weights:", sd(x$body), "\n")
## Standard deviation of body weights: 16480.49
plot(x$body, pch = 16, xlab = "Index of Animal", ylab = "Body Weight (kg)", main = "Body Weight of Animals")
abline(h = 15000, col = "red", lwd = 2)  # all but one animal lies below this line
quartiles <- quantile(x$body)
abline(h = quartiles[4], col = "orange", lwd = 2)  # 75% quantile (third quartile)
legend("topleft", legend = c("All but 1 below", "3rd Quartile"), col = c("red", "orange"), lwd = 2)

Question 3: Spline Analysis

3.(a)

rm(list = ls())
set.seed(4060)
library(splines)
library(MASS)

x <- Boston$nox
y <- Boston$medv

k <- quantile(x, c(0.15, 0.4, 0.6, 0.7, 0.85))  # interior knots at quantiles of nox
B <- bs(x, knots = k)  # cubic B-spline basis (degree 3 by default)
bspl <- lm(y ~ B)
summary(bspl)
## 
## Call:
## lm(formula = y ~ B)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.633  -5.022  -1.840   3.012  32.148 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   23.677      4.559   5.194 3.01e-07 ***
## B1             7.728      6.678   1.157  0.24777    
## B2            -1.641      4.207  -0.390  0.69667    
## B3             8.679      5.478   1.584  0.11380    
## B4           -13.190      4.747  -2.778  0.00567 ** 
## B5            11.145      5.118   2.178  0.02991 *  
## B6           -34.857      6.008  -5.802 1.17e-08 ***
## B7            19.210      8.352   2.300  0.02187 *  
## B8            -7.249      4.961  -1.461  0.14459    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.843 on 497 degrees of freedom
## Multiple R-squared:  0.2844, Adjusted R-squared:  0.2729 
## F-statistic: 24.69 on 8 and 497 DF,  p-value: < 2.2e-16

3.(b)

plot(x, y, pch = 16, xlab = "Nitric Oxide Concentration (nox)", ylab = "Median Value of Homes (medv)", main = "B-Spline Fit")
is <- order(x)
lines(x[is], fitted(bspl)[is], col = "red", lwd = 2)
abline(v = c(0.4, 0.5, 0.6), col = "green")  # the prediction points for part (b), not the knots
legend("topright", legend = c("Fitted Spline", "Prediction points"), col = c("red", "green"), lwd = c(2, 1), bty = "n")

newx <- c(0.4, 0.5, 0.6)
newB <- predict(B, newx)  # evaluate the fitted B-spline basis at the new x values
bspl_2 <- lm(y ~ ., data = as.data.frame(B))  # refit on named basis columns so predict() can match them
pred <- predict(bspl_2, newdata = as.data.frame(newB))
cat("Predictions for new values (0.4, 0.5, 0.6):", pred, "\n")
## Predictions for new values (0.4, 0.5, 0.6): 27.81543 26.52846 24.01868
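
The refit on named columns is needed because lm(y ~ B) treats the whole basis matrix as a single term, which makes predict() awkward for new x values. An alternative sketch keeps bs() inside the formula, so the same basis is rebuilt automatically at prediction time (bspl_f is an illustrative name):

bspl_f <- lm(y ~ bs(x, knots = k))
predict(bspl_f, newdata = data.frame(x = newx))  # should reproduce the predictions above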

3.(c) (i)

pspl <- smooth.spline(x, y)
rss <- sum(resid(pspl)^2)
cat("Residual Sum of Squares (RSS):", rss, "\n")
## Residual Sum of Squares (RSS): 16776.91
plot(x, y, pch = 20, cex = 1.5, xlab = "Nitric Oxide Concentration (nox)", ylab = "Median Value of Homes (medv)", main = "Comparison of Spline Fits")
lines(x[is], fitted(bspl)[is], col = "blue", lwd = 2)
lines(pspl, col = "red", lwd = 2)
legend("topright", legend = c("B-Spline Fit", "Smooth Spline Fit"), col = c("blue", "red"), lwd = 2, bty = "n")

3.(d)

pred2 <- approx(pspl, xout = c(0.4, 0.5, 0.6))$y  # linear interpolation between fitted points
cat("Predictions from smooth spline at (0.4, 0.5, 0.6):", pred2, "\n")
## Predictions from smooth spline at (0.4, 0.5, 0.6): 28.60715 26.1052 23.71711
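
Since approx() only interpolates linearly between fitted points, the predict method for smooth.spline objects, which evaluates the spline itself, is a natural cross-check and may differ slightly:

predict(pspl, x = c(0.4, 0.5, 0.6))$y  # direct evaluation of the smoothing spline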

3.(e)

K <- 5
err <- numeric(K)
N <- nrow(Boston)
folds <- cut(1:N, K, labels = FALSE)  # contiguous (unshuffled) folds

for (k in 1:K) {
  ktrain <- which(folds != k)
  x.train <- x[ktrain]
  x.test <- x[-ktrain]
  y.train <- y[ktrain]
  y.test <- y[-ktrain]
  pspl <- smooth.spline(x.train, y.train)
  pred <- predict(pspl, x.test)$y
  err[k] <- mean((pred - y.test)^2)
}

rmse <- sqrt(mean(err))
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 32.24743
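
Because cut(1:N, K) assigns contiguous blocks of rows to folds, each test set above is an unshuffled slice of Boston. A common variant randomizes the fold assignment first; a sketch, where folds_r, err_r, itest and fit are illustrative names:

folds_r <- cut(sample(1:N), K, labels = FALSE)  # shuffled fold assignment
err_r <- numeric(K)
for (k in 1:K) {
  itest <- which(folds_r == k)
  fit <- smooth.spline(x[-itest], y[-itest])
  err_r[k] <- mean((predict(fit, x[itest])$y - y[itest])^2)
}
sqrt(mean(err_r))  # RMSE under randomized folds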

Question 4: Clustering and PCA Analysis

4.(a)

rm(list = ls())
set.seed(4060)
library(MASS)

x <- Pima.tr
x$type <- NULL
y <- Pima.tr$type

This is a classification problem, since the response y contains the discrete labels No/Yes rather than continuous values.
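
This can be confirmed directly:

is.factor(y)  # TRUE: type is a factor
levels(y)     # "No" "Yes"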

4.(b)

ko <- kmeans(x, centers = 2)
cat("Cluster assignment vs actual labels:\n")
## Cluster assignment vs actual labels:
print(table(ko$cluster, y))
##    y
##      No Yes
##   1 103  23
##   2  29  45
plot(x[, 1:2], pch = 20, cex = 2, col = ko$cluster, 
     xlab = "Number of Pregnancies (npreg)", ylab = "Plasma Glucose (glu)", 
     main = "K-Means Clustering")
legend("topright", legend = c("Cluster 1", "Cluster 2"), 
       col = 1:2, pch = 20, pt.cex = 2, bty = "n")
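
Because k-means relies on Euclidean distance, the unscaled fit is dominated by the variables with the largest spread (glu in particular); a sketch of a standardized alternative for comparison (ko_s is an illustrative name):

ko_s <- kmeans(scale(x), centers = 2)  # k-means on standardized features
table(ko_s$cluster, y)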

4.(c)

Observations from the k-means clustering:

- Two distinct clusters are identified.
- Higher glucose levels are predominantly assigned to one cluster.
- The number of pregnancies appears to have less influence on cluster assignment.
- The clusters overlap around a glucose level of 130, making intermediate cases harder to classify.
- Higher glucose levels are more indicative of diabetes.
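
The glucose observation can be checked numerically from the cluster assignments:

tapply(x$glu, ko$cluster, mean)  # mean glucose level within each cluster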

4.(d)

pca <- prcomp(x, scale = TRUE)
summary(pca) # Output summary of PCA
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.5522 1.2233 0.9549 0.8944 0.83074 0.62435 0.55000
## Proportion of Variance 0.3442 0.2138 0.1303 0.1143 0.09859 0.05569 0.04321
## Cumulative Proportion  0.3442 0.5580 0.6882 0.8025 0.90110 0.95679 1.00000
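
A scree plot shows the same variance decomposition graphically:

screeplot(pca, type = "lines", main = "Scree Plot (scaled PCA)")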

4.(e)

pca.r <- prcomp(x) # Unscaled PCA for comparison
summary(pca.r)
## Importance of components:
##                            PC1     PC2      PC3     PC4     PC5     PC6     PC7
## Standard deviation     32.2816 13.5337 10.65363 8.80415 4.26711 2.63685 0.29758
## Proportion of Variance  0.7229  0.1271  0.07874 0.05377 0.01263 0.00482 0.00006
## Cumulative Proportion   0.7229  0.8500  0.92871 0.98248 0.99512 0.99994 1.00000
influential_features <- apply(abs(pca$rotation), 2, which.max)  # largest |loading| per PC (scaled PCA)
most_influential <- row.names(pca$rotation)[influential_features]

cat("Most influential features for each PC:\n")
## Most influential features for each PC:
print(most_influential)
## [1] "age"  "bmi"  "ped"  "glu"  "bp"   "age"  "skin"

On the scaled PCA, PC1 is most strongly influenced by age, which carries the largest absolute loading on that component, with the remaining variables contributing more evenly. Without scaling, the decomposition is instead driven by the variables with the largest raw variances, which is why the unscaled PC1 alone explains about 72% of the total variance.
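
The contrast between the two decompositions is visible in the first-PC loadings themselves; a sketch:

round(pca$rotation[, 1], 2)    # scaled PCA: loadings of comparable magnitude
round(pca.r$rotation[, 1], 2)  # unscaled PCA: concentrated on the highest-variance variables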