Question 1: Monte Carlo Estimation

1.(a)

set.seed(4060)
theta <- 3
sigma <- 1.5
M <- 1000
N <- 30
p <- means <- sds <- numeric(M)  # p[i] = 1 when the i-th interval covers theta

for (i in 1:M) {
  x <- rnorm(N, theta, sigma)
  means[i] <- mean(x)
  sds[i] <- sd(x)
  C <- c(means[i] - 1.96 * (sds[i] / sqrt(N)), 
         means[i] + 1.96 * (sds[i] / sqrt(N)))
  if (theta >= C[1] && theta <= C[2]) {
    p[i] <- 1
  }
}
cat("Proportion of intervals containing theta (95% CI):", mean(p), "\n")
## Proportion of intervals containing theta (95% CI): 0.934

1.(b)

p2 <- numeric(M)
for (i in 1:M) {
  x <- rnorm(N, theta, sigma)
  means[i] <- mean(x)
  sds[i] <- sd(x)
  C <- c(means[i] - 1.645 * (sds[i] / sqrt(N)), 
         means[i] + 1.645 * (sds[i] / sqrt(N)))
  if (theta >= C[1] && theta <= C[2]) {
    p2[i] <- 1
  }
}
cat("Proportion of intervals containing theta (90% CI):", mean(p2), "\n")
## Proportion of intervals containing theta (90% CI): 0.884

1.(c)

For a 95% confidence interval, the critical value is \(z = 1.96\), leaving 2.5% in each tail.
For a 90% confidence interval, the critical value is \(z = 1.645\), leaving 5% in each tail.
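
These critical values come straight from the standard normal quantile function, as a quick check confirms:

qnorm(0.975)  # 1.959964, the 95% critical value
qnorm(0.95)   # 1.644854, the 90% critical value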

A higher confidence level produces a wider interval, so a larger range of values is considered plausible for the parameter. This results in a higher proportion of intervals containing the true parameter value, but at the cost of reduced precision.

1.(d)

As the sample size \(N\) increases, the standard error \(s/\sqrt{N}\) shrinks, so the intervals become narrower, and the empirical coverage proportions converge to their nominal levels (0.95 and 0.90). At \(N = 30\), using \(z = 1.96\) rather than the slightly larger t critical value accounts for the mild undercoverage (0.934) seen in part (a).

This demonstrates the consistency of the Monte Carlo simulation with increasing sample size and supports the validity of the confidence interval calculations.
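
A sketch of how this could be verified empirically, reusing theta, sigma and M from part (a); the loop variable N2 and the vector hits are illustrative names, not part of the original code:

for (N2 in c(10, 30, 100, 500)) {
  hits <- numeric(M)
  for (i in 1:M) {
    xs <- rnorm(N2, theta, sigma)
    half <- 1.96 * sd(xs) / sqrt(N2)  # half-width of the 95% interval
    hits[i] <- (theta >= mean(xs) - half) && (theta <= mean(xs) + half)
  }
  cat("N =", N2, "empirical 95% coverage:", mean(hits), "\n")
}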

Question 2: Bootstrap Analysis

2.(a)

rm(list = ls())
set.seed(4060)
library(MASS)

x <- Animals
B <- 1000
n <- nrow(Animals)
brain_ms <- body_ms <- numeric(B)  # storage for bootstrap means

for (i in 1:B) {
  ib <- sample(1:n, size = n, replace = TRUE)  # bootstrap resample of row indices
  brw <- x$brain[ib]
  bow <- x$body[ib]
  brain_ms[i] <- mean(brw)
  body_ms[i] <- mean(bow)
}

cat("Bootstrap mean of brain weights:", mean(brain_ms), "\n")
## Bootstrap mean of brain weights: 568.1507
cat("Bootstrap mean of body weights:", mean(body_ms), "\n")
## Bootstrap mean of body weights: 4241.751
cat("Original mean of brain weights:", mean(x$brain), "\n")
## Original mean of brain weights: 574.5214
cat("Original mean of body weights:", mean(x$body), "\n")
## Original mean of body weights: 4278.439

2.(b)

ratio <- numeric(B)
for (i in 1:B) {
  ib <- sample(1:n, size = n, replace = TRUE)
  brw <- x$brain[ib]
  bow <- x$body[ib]
  ratio[i] <- mean(brw / bow)  # mean brain-to-body ratio (brain in g, body in kg)
}

cat("Bootstrap mean ratio:", mean(ratio), "\n")
## Bootstrap mean ratio: 6.24373
cat("Original mean ratio:", mean(x$brain / x$body), "\n")
## Original mean ratio: 6.199137

2.(c)

mean_diff <- mean(body_ms) - mean(x$body)
cat("Difference between bootstrap mean and original mean (body weights):", mean_diff, "\n")
## Difference between bootstrap mean and original mean (body weights): -36.68763
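
This difference is the usual bootstrap estimate of the bias of the sample mean; subtracting it from the original estimate gives a bias-corrected version (a sketch; bias_corrected is an illustrative name):

bias_corrected <- mean(x$body) - mean_diff  # equals 2*mean(x$body) - mean(body_ms)
cat("Bias-corrected mean of body weights:", bias_corrected, "\n")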

2.(d)

ci <- quantile(body_ms, c(0.025, 0.975))  # percentile bootstrap interval
cat("95% confidence interval for body weights:", ci, "\n")
## 95% confidence interval for body weights: 419.4387 10920.69
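
For comparison, the boot package produces the same kind of percentile interval; a minimal sketch, where the statistic function stat_mean is an illustrative name:

library(boot)
stat_mean <- function(d, idx) mean(d$body[idx])  # mean body weight of a resample
bo <- boot(x, stat_mean, R = B)
boot.ci(bo, type = "perc")  # percentile interval, comparable to quantile(body_ms, ...)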

2.(e)

cat("Standard deviation of body weights:", sd(x$body), "\n")
## Standard deviation of body weights: 16480.49
plot(x$body, pch = 16, xlab = "Index of Animal", ylab = "Body Weight (kg)", main = "Body Weight of Animals")
abline(h = 15000, col = "red", lwd = 2)  # all but one animal lies below this line
quartiles <- quantile(x$body)
abline(h = quartiles[4], col = "orange", lwd = 2)  # 75% quantile (third quartile)
legend("topleft", legend = c("All but 1 below", "3rd Quartile"), col = c("red", "orange"), lwd = 2)

Question 3: Spline Analysis

3.(a)

rm(list = ls())
set.seed(4060)
library(splines)
library(MASS)

x <- Boston$nox
y <- Boston$medv

k <- quantile(x, c(0.15, 0.4, 0.6, 0.7, 0.85))  # interior knots at quantiles of nox
B <- bs(x, knots = k)  # cubic B-spline basis (degree 3 by default)
bspl <- lm(y ~ B)
summary(bspl)
## 
## Call:
## lm(formula = y ~ B)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.633  -5.022  -1.840   3.012  32.148 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   23.677      4.559   5.194 3.01e-07 ***
## B1             7.728      6.678   1.157  0.24777    
## B2            -1.641      4.207  -0.390  0.69667    
## B3             8.679      5.478   1.584  0.11380    
## B4           -13.190      4.747  -2.778  0.00567 ** 
## B5            11.145      5.118   2.178  0.02991 *  
## B6           -34.857      6.008  -5.802 1.17e-08 ***
## B7            19.210      8.352   2.300  0.02187 *  
## B8            -7.249      4.961  -1.461  0.14459    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.843 on 497 degrees of freedom
## Multiple R-squared:  0.2844, Adjusted R-squared:  0.2729 
## F-statistic: 24.69 on 8 and 497 DF,  p-value: < 2.2e-16

3.(b)

plot(x, y, pch = 16, xlab = "Nitric Oxide Concentration (nox)", ylab = "Median Value of Homes (medv)", main = "B-Spline Fit")
is <- order(x)
lines(x[is], fitted(bspl)[is], col = "red", lwd = 2)
abline(v = c(0.4, 0.5, 0.6), col = "green")  # the prediction points for part (b), not the knots
legend("topright", legend = c("Fitted Spline", "Prediction points"), col = c("red", "green"), lwd = c(2, 1), bty = "n")

newx <- c(0.4, 0.5, 0.6)
newB <- predict(B, newx)  # evaluate the fitted B-spline basis at the new x values
bspl_2 <- lm(y ~ ., data = as.data.frame(B))  # refit on named basis columns so predict() can match them
pred <- predict(bspl_2, newdata = as.data.frame(newB))
cat("Predictions for new values (0.4, 0.5, 0.6):", pred, "\n")
## Predictions for new values (0.4, 0.5, 0.6): 27.81543 26.52846 24.01868
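
The refit on named columns is needed because lm(y ~ B) treats the whole basis matrix as a single term, which makes predict() awkward for new x values. An alternative sketch keeps bs() inside the formula, so the same basis is rebuilt automatically at prediction time (bspl_f is an illustrative name):

bspl_f <- lm(y ~ bs(x, knots = k))
predict(bspl_f, newdata = data.frame(x = newx))  # should reproduce the predictions above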

3.(c) (i)

pspl <- smooth.spline(x, y)
rss <- sum(resid(pspl)^2)
cat("Residual Sum of Squares (RSS):", rss, "\n")
## Residual Sum of Squares (RSS): 16776.91
plot(x, y, pch = 20, cex = 1.5, xlab = "Nitric Oxide Concentration (nox)", ylab = "Median Value of Homes (medv)", main = "Comparison of Spline Fits")
lines(x[is], fitted(bspl)[is], col = "blue", lwd = 2)
lines(pspl, col = "red", lwd = 2)
legend("topright", legend = c("B-Spline Fit", "Smooth Spline Fit"), col = c("blue", "red"), lwd = 2, bty = "n")

3.(d)

pred2 <- approx(pspl, xout = c(0.4, 0.5, 0.6))$y  # linear interpolation between fitted points
cat("Predictions from smooth spline at (0.4, 0.5, 0.6):", pred2, "\n")
## Predictions from smooth spline at (0.4, 0.5, 0.6): 28.60715 26.1052 23.71711
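
Since approx() only interpolates linearly between fitted points, the predict method for smooth.spline objects, which evaluates the spline itself, is a natural cross-check and may differ slightly:

predict(pspl, x = c(0.4, 0.5, 0.6))$y  # direct evaluation of the smoothing spline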

3.(e)

K <- 5
err <- numeric(K)
N <- nrow(Boston)
folds <- cut(1:N, K, labels = FALSE)  # contiguous (unshuffled) folds

for (k in 1:K) {
  ktrain <- which(folds != k)
  x.train <- x[ktrain]
  x.test <- x[-ktrain]
  y.train <- y[ktrain]
  y.test <- y[-ktrain]
  pspl <- smooth.spline(x.train, y.train)
  pred <- predict(pspl, x.test)$y
  err[k] <- mean((pred - y.test)^2)
}

rmse <- sqrt(mean(err))
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 32.24743
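
Because cut(1:N, K) assigns contiguous blocks of rows to folds, each test set above is an unshuffled slice of Boston. A common variant randomizes the fold assignment first; a sketch, where folds_r, err_r, itest and fit are illustrative names:

folds_r <- cut(sample(1:N), K, labels = FALSE)  # shuffled fold assignment
err_r <- numeric(K)
for (k in 1:K) {
  itest <- which(folds_r == k)
  fit <- smooth.spline(x[-itest], y[-itest])
  err_r[k] <- mean((predict(fit, x[itest])$y - y[itest])^2)
}
sqrt(mean(err_r))  # RMSE under randomized folds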

Question 4: Clustering and PCA Analysis

4.(a)

rm(list = ls())
set.seed(4060)
library(MASS)

x <- Pima.tr
x$type <- NULL
y <- Pima.tr$type

This is a classification problem, since the response y contains the discrete labels No/Yes rather than continuous values.
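
This can be confirmed directly:

is.factor(y)  # TRUE: type is a factor
levels(y)     # "No" "Yes"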

4.(b)

ko <- kmeans(x, centers = 2)
cat("Cluster assignment vs actual labels:\n")
## Cluster assignment vs actual labels:
print(table(ko$cluster, y))
##    y
##      No Yes
##   1 103  23
##   2  29  45
plot(x[, 1:2], pch = 20, cex = 2, col = ko$cluster, 
     xlab = "Number of Pregnancies (npreg)", ylab = "Plasma Glucose (glu)", 
     main = "K-Means Clustering")
legend("topright", legend = c("Cluster 1", "Cluster 2"), 
       col = 1:2, pch = 20, pt.cex = 2, bty = "n")
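
Because k-means relies on Euclidean distance, the unscaled fit is dominated by the variables with the largest spread (glu in particular); a sketch of a standardized alternative for comparison (ko_s is an illustrative name):

ko_s <- kmeans(scale(x), centers = 2)  # k-means on standardized features
table(ko_s$cluster, y)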

4.(c)

Observations from the k-means clustering:

- Two distinct clusters are identified.
- Higher glucose levels are predominantly assigned to one cluster.
- The number of pregnancies appears to have less influence on cluster assignment.
- The clusters overlap around a glucose level of 130, making intermediate cases harder to classify.
- Higher glucose levels are more indicative of diabetes.
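
The glucose observation can be checked numerically from the cluster assignments:

tapply(x$glu, ko$cluster, mean)  # mean glucose level within each cluster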

4.(d)

pca <- prcomp(x, scale = TRUE)
summary(pca) # Output summary of PCA
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5     PC6     PC7
## Standard deviation     1.5522 1.2233 0.9549 0.8944 0.83074 0.62435 0.55000
## Proportion of Variance 0.3442 0.2138 0.1303 0.1143 0.09859 0.05569 0.04321
## Cumulative Proportion  0.3442 0.5580 0.6882 0.8025 0.90110 0.95679 1.00000
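
A scree plot shows the same variance decomposition graphically:

screeplot(pca, type = "lines", main = "Scree Plot (scaled PCA)")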

4.(e)

pca.r <- prcomp(x) # Unscaled PCA for comparison
summary(pca.r)
## Importance of components:
##                            PC1     PC2      PC3     PC4     PC5     PC6     PC7
## Standard deviation     32.2816 13.5337 10.65363 8.80415 4.26711 2.63685 0.29758
## Proportion of Variance  0.7229  0.1271  0.07874 0.05377 0.01263 0.00482 0.00006
## Cumulative Proportion   0.7229  0.8500  0.92871 0.98248 0.99512 0.99994 1.00000
influential_features <- apply(abs(pca$rotation), 2, which.max)  # largest |loading| per PC (scaled PCA)
most_influential <- row.names(pca$rotation)[influential_features]

cat("Most influential features for each PC:\n")
## Most influential features for each PC:
print(most_influential)
## [1] "age"  "bmi"  "ped"  "glu"  "bp"   "age"  "skin"

On the scaled PCA, PC1 is most strongly influenced by age, which carries the largest absolute loading on that component, with the remaining variables contributing more evenly. Without scaling, the decomposition is instead driven by the variables with the largest raw variances, which is why the unscaled PC1 alone explains about 72% of the total variance.
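
The contrast between the two decompositions is visible in the first-PC loadings themselves; a sketch:

round(pca$rotation[, 1], 2)    # scaled PCA: loadings of comparable magnitude
round(pca.r$rotation[, 1], 2)  # unscaled PCA: concentrated on the highest-variance variables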