Question 1: Monte Carlo Estimation
1.(a)
set.seed(4060)
theta <- 3
sigma <- 1.5
M <- 1000
N <- 30
x <- p <- means <- sds <- numeric(M)
for (i in 1:M) {
x <- rnorm(N, theta, sigma)
means[i] <- mean(x)
sds[i] <- sd(x)
C <- c(means[i] - 1.96 * (sds[i] / sqrt(N)),
means[i] + 1.96 * (sds[i] / sqrt(N)))
if (theta >= C[1] && theta <= C[2]) {
p[i] <- 1
}
}
cat("Proportion of intervals containing theta (95% CI):", mean(p), "\n")
## Proportion of intervals containing theta (95% CI): 0.934
1.(b)
p2 <- numeric(M)
for (i in 1:M) {
x <- rnorm(N, theta, sigma)
means[i] <- mean(x)
sds[i] <- sd(x)
C <- c(means[i] - 1.645 * (sds[i] / sqrt(N)),
means[i] + 1.645 * (sds[i] / sqrt(N)))
if (theta >= C[1] && theta <= C[2]) {
p2[i] <- 1
}
}
cat("Proportion of intervals containing theta (90% CI):", mean(p2), "\n")
## Proportion of intervals containing theta (90% CI): 0.884
1.(c)
For a 95% confidence interval, the critical value is \(z = 1.96\), representing the cutoffs for
2.5% in each tail.
For a 90% confidence interval, the critical value is \(z = 1.645\), representing the cutoffs for
5% in each tail.
Larger confidence intervals are more inclusive because the range of values considered plausible for the parameter increases. This results in a higher proportion of intervals containing the true parameter value but at the cost of reduced precision.
1.(d) As the sample size \(N\) increases:
This demonstrates the consistency of the Monte Carlo simulation with increasing sample size and supports the validity of the confidence interval calculations.
Question 2: Bootstrap Analysis
2.(a)
rm(list = ls())
set.seed(4060)
library(MASS)
x <- Animals
B <- 1000
n <- nrow(Animals)
brain_ms <- body_ms <- numeric(B)
for (i in 1:B) {
ib <- sample(1:n, size = n, replace = TRUE)
brw <- x$brain[ib]
bow <- x$body[ib]
brain_ms[i] <- mean(brw)
body_ms[i] <- mean(bow)
}
cat("Bootstrap mean of brain weights:", mean(brain_ms), "\n")
## Bootstrap mean of brain weights: 568.1507
cat("Bootstrap mean of body weights:", mean(body_ms), "\n")
## Bootstrap mean of body weights: 4241.751
cat("Original mean of brain weights:", mean(x$brain), "\n")
## Original mean of brain weights: 574.5214
cat("Original mean of body weights:", mean(x$body), "\n")
## Original mean of body weights: 4278.439
2.(b)
ratio <- numeric(B)
for (i in 1:B) {
ib <- sample(1:n, size = n, replace = TRUE)
brw <- x$brain[ib]
bow <- x$body[ib]
ratio[i] <- mean(brw / bow)
}
cat("Bootstrap mean ratio:", mean(ratio), "\n")
## Bootstrap mean ratio: 6.24373
cat("Original mean ratio:", mean(x$brain / x$body), "\n")
## Original mean ratio: 6.199137
2.(c)
mean_diff <- mean(body_ms) - mean(x$body)
cat("Difference between bootstrap mean and original mean (body weights):", mean_diff, "\n")
## Difference between bootstrap mean and original mean (body weights): -36.68763
2.(d)
ci <- quantile(body_ms, c(0.025, 0.975))
cat("95% confidence interval for body weights:", ci, "\n")
## 95% confidence interval for body weights: 419.4387 10920.69
2.(e)
cat("Standard deviation of body weights:", sd(x$body), "\n")
## Standard deviation of body weights: 16480.49
plot(x$body, pch = 16, xlab = "Index of Animal", ylab = "Body Weight", main = "Body Weight of Animals")
abline(h = 15000, col = "red", lwd = 2)
max_body <- max(x$body)
quartiles <- quantile(x$body)
abline(h = quartiles[3], col = "orange", lwd = 2)
legend("topleft", legend = c("All but 1 below", "3rd Quartile"), col = c("red", "orange"), lwd = 2)
Question 3: Spline Analysis
3.(a)
rm(list = ls())
set.seed(4060)
library(splines)
library(MASS)
x <- Boston$nox
y <- Boston$medv
k <- quantile(x, c(0.15, 0.4, 0.6, 0.7, 0.85))
B <- bs(x, knots = k)
bspl <- lm(y ~ B)
summary(bspl)
##
## Call:
## lm(formula = y ~ B)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.633 -5.022 -1.840 3.012 32.148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.677 4.559 5.194 3.01e-07 ***
## B1 7.728 6.678 1.157 0.24777
## B2 -1.641 4.207 -0.390 0.69667
## B3 8.679 5.478 1.584 0.11380
## B4 -13.190 4.747 -2.778 0.00567 **
## B5 11.145 5.118 2.178 0.02991 *
## B6 -34.857 6.008 -5.802 1.17e-08 ***
## B7 19.210 8.352 2.300 0.02187 *
## B8 -7.249 4.961 -1.461 0.14459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.843 on 497 degrees of freedom
## Multiple R-squared: 0.2844, Adjusted R-squared: 0.2729
## F-statistic: 24.69 on 8 and 497 DF, p-value: < 2.2e-16
3.(b)
plot(x, y, pch = 16, xlab = "Nitric Oxide Concentration (nox)", ylab = "Median Value of Homes (medv)", main = "Spline Fit with Knots")
is <- order(x)
lines(x[is], fitted(bspl)[is], col = "red", lwd = 2)
abline(v = c(0.4, 0.5, 0.6), col = "green")
legend("topright", legend = c("Fitted Spline", "Knots"), col = c("red", "green"), lwd = c(2, 1), bty = "n")
newx <- c(0.4, 0.5, 0.6)
newB <- predict(B, newx)
bspl_2 <- lm(y ~ ., data = as.data.frame(B))
pred <- predict(bspl_2, newdata = as.data.frame(newB))
cat("Predictions for new values (0.4, 0.5, 0.6):", pred, "\n")
## Predictions for new values (0.4, 0.5, 0.6): 27.81543 26.52846 24.01868
3.(c) (i)
pspl <- smooth.spline(x, y)
rss <- sum(resid(pspl)^2)
cat("Residual Sum of Squares (RSS):", rss, "\n")
## Residual Sum of Squares (RSS): 16776.91
plot(x, y, pch = 20, cex = 1.5, xlab = "Nitric Oxide Concentration (nox)", ylab = "Median Value of Homes (medv)", main = "Comparison of Spline Fits")
lines(x[is], fitted(bspl)[is], col = "blue", lwd = 2)
lines(pspl, col = "red", lwd = 2)
legend("topright", legend = c("B-Spline Fit", "Smooth Spline Fit"), col = c("blue", "red"), lwd = 2, bty = "n")
3.(d)
pred2 <- approx(pspl, xout = c(0.4, 0.5, 0.6))$y
cat("Predictions from smooth spline at (0.4, 0.5, 0.6):", pred2, "\n")
## Predictions from smooth spline at (0.4, 0.5, 0.6): 28.60715 26.1052 23.71711
3.(e)
K <- 5
err <- numeric(K)
N <- nrow(Boston)
folds <- cut(1:N, K, labels = FALSE)
for (k in 1:K) {
ktrain <- which(folds != k)
x.train <- x[ktrain]
x.test <- x[-ktrain]
y.train <- y[ktrain]
y.test <- y[-ktrain]
pspl <- smooth.spline(x.train, y.train)
pred <- predict(pspl, x.test)$y
err[k] <- mean((pred - y.test)^2)
}
rmse <- sqrt(mean(err))
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 32.24743
Question 4: Clustering and PCA Analysis
4.(a)
rm(list = ls())
set.seed(4060)
library(MASS)
x <- Pima.tr
x$type <- NULL
y <- Pima.tr$type
Classification problem since y contains labels (YES/NO), which are discrete categories, not continuous values.
4.(b)
ko <- kmeans(x, centers = 2)
cat("Cluster assignment vs actual labels:\n")
## Cluster assignment vs actual labels:
print(table(ko$cluster, y))
## y
## No Yes
## 1 103 23
## 2 29 45
plot(x[, 1:2], pch = 20, cex = 2, col = ko$cluster,
xlab = "Feature 1", ylab = "Feature 2",
main = "K-Means Clustering")
legend("topright", legend = c("Cluster 1", "Cluster 2"),
col = 1:2, pch = 20, pt.cex = 2, bty = "n")
4.(c)
# Analysis of K-Means Clustering:
cat("Clustering observations:\n")
## Clustering observations:
cat("- Two distinct clusters are identified.\n")
## - Two distinct clusters are identified.
cat("- Higher glucose levels are predominantly assigned to one cluster.\n")
## - Higher glucose levels are predominantly assigned to one cluster.
cat("- Number of pregnancies seems to have less influence on cluster assignments.\n")
## - Number of pregnancies seems to have less influence on cluster assignments.
cat("- Overlap occurs around a glucose level of 130, making intermediate cases harder to classify.\n")
## - Overlap occurs around a glucose level of 130, making intermediate cases harder to classify.
cat("- Individuals with higher glucose levels are more indicative of diabetes.\n")
## - Individuals with higher glucose levels are more indicative of diabetes.
4.(d)
pca <- prcomp(x, scale = TRUE)
summary(pca) # Output summary of PCA
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5522 1.2233 0.9549 0.8944 0.83074 0.62435 0.55000
## Proportion of Variance 0.3442 0.2138 0.1303 0.1143 0.09859 0.05569 0.04321
## Cumulative Proportion 0.3442 0.5580 0.6882 0.8025 0.90110 0.95679 1.00000
4.(e)
pca.r <- prcomp(x) # Unscaled PCA for comparison
summary(pca.r)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 32.2816 13.5337 10.65363 8.80415 4.26711 2.63685 0.29758
## Proportion of Variance 0.7229 0.1271 0.07874 0.05377 0.01263 0.00482 0.00006
## Cumulative Proportion 0.7229 0.8500 0.92871 0.98248 0.99512 0.99994 1.00000
influential_features <- apply(abs(pca$rotation), 2, which.max)
most_influential <- row.names(pca$rotation)[influential_features]
cat("Most influential features for each PC:\n")
## Most influential features for each PC:
print(most_influential)
## [1] "age" "bmi" "ped" "glu" "bp" "age" "skin"
PC1 is dominantly influenced by glucose, as it has the largest absolute loading. This suggests that glucose levels are the most significant factor in explaining variance.