Input Data:
data <- na.omit(airquality[, c("Ozone", "Solar.R", "Wind", "Temp")])
# dim(data) # 111 x 4
print(data)
Ozone Solar.R Wind Temp
1 41 190 7.4 67
2 36 118 8.0 72
3 12 149 12.6 74
4 18 313 11.5 62
7 23 299 8.6 65
8 19 99 13.8 59
9 8 19 20.1 61
12 16 256 9.7 69
13 11 290 9.2 66
14 14 274 10.9 68
15 18 65 13.2 58
16 14 334 11.5 64
17 34 307 12.0 66
18 6 78 18.4 57
19 30 322 11.5 68
20 11 44 9.7 62
21 1 8 9.7 59
22 11 320 16.6 73
23 4 25 9.7 61
24 32 92 12.0 61
28 23 13 12.0 67
29 45 252 14.9 81
30 115 223 5.7 79
31 37 279 7.4 76
38 29 127 9.7 82
40 71 291 13.8 90
41 39 323 11.5 87
44 23 148 8.0 82
47 21 191 14.9 77
48 37 284 20.7 72
49 20 37 9.2 65
50 12 120 11.5 73
51 13 137 10.3 76
62 135 269 4.1 84
63 49 248 9.2 85
64 32 236 9.2 81
66 64 175 4.6 83
67 40 314 10.9 83
68 77 276 5.1 88
69 97 267 6.3 92
70 97 272 5.7 92
71 85 175 7.4 89
73 10 264 14.3 73
74 27 175 14.9 81
76 7 48 14.3 80
77 48 260 6.9 81
78 35 274 10.3 82
79 61 285 6.3 84
80 79 187 5.1 87
81 63 220 11.5 85
82 16 7 6.9 74
85 80 294 8.6 86
86 108 223 8.0 85
87 20 81 8.6 82
88 52 82 12.0 86
89 82 213 7.4 88
90 50 275 7.4 86
91 64 253 7.4 83
92 59 254 9.2 81
93 39 83 6.9 81
94 9 24 13.8 81
95 16 77 7.4 82
99 122 255 4.0 89
100 89 229 10.3 90
101 110 207 8.0 90
104 44 192 11.5 86
105 28 273 11.5 82
106 65 157 9.7 80
108 22 71 10.3 77
109 59 51 6.3 79
110 23 115 7.4 76
111 31 244 10.9 78
112 44 190 10.3 78
113 21 259 15.5 77
114 9 36 14.3 72
116 45 212 9.7 79
117 168 238 3.4 81
118 73 215 8.0 86
120 76 203 9.7 97
121 118 225 2.3 94
122 84 237 6.3 96
123 85 188 6.3 94
124 96 167 6.9 91
125 78 197 5.1 92
126 73 183 2.8 93
127 91 189 4.6 93
128 47 95 7.4 87
129 32 92 15.5 84
130 20 252 10.9 80
131 23 220 10.3 78
132 21 230 10.9 75
133 24 259 9.7 73
134 44 236 14.9 81
135 21 259 15.5 76
136 28 238 6.3 77
137 9 24 10.9 71
138 13 112 11.5 71
139 46 237 6.9 78
140 18 224 13.8 67
141 13 27 10.3 76
142 24 238 10.3 68
143 16 201 8.0 82
144 13 238 12.6 64
145 23 14 9.2 71
146 36 139 10.3 81
147 7 49 10.3 69
148 14 20 16.6 63
149 30 193 6.9 70
151 14 191 14.3 75
152 18 131 8.0 76
153 20 223 11.5 68
Given matrix A (m x n) and matrix B (n x p), compute C (m x p) where: c_ij = sum_{k=1}^{n} a_ik * b_kj
Do NOT use the %*% operator anywhere inside mat_mult().
mat_mult <- function(A, B) {
m = nrow(A)
n = ncol(A)
p = ncol(B)
mat_result = matrix(0, nrow=m, ncol=p)
for (i in 1:m) {
for (j in 1:p) {
for (k in 1:n) {
mat_result[i,j] = mat_result[i,j] + A[i,k] * B[k,j]
}
}
}
return(mat_result)
}
# ----- Validation -----
a <- matrix(c(1, 2, 3, 4), nrow = 2)
b <- matrix(c(5, 6, 7, 8), nrow = 2)
c_std <- a %*% b
c_manual <- mat_mult(a, b)
{
cat("Standard result (a %*% b):\n")
print(c_std)
cat("\nManual result mat_mult(a, b):\n")
print(c_manual)
cat("Element-wise match:\n")
print(c_std == c_manual)
}
Standard result (a %*% b):
[,1] [,2]
[1,] 23 31
[2,] 34 46
Manual result mat_mult(a, b):
[,1] [,2]
[1,] 23 31
[2,] 34 46
Element-wise match:
[,1] [,2]
[1,] TRUE TRUE
[2,] TRUE TRUE
Given data matrix X (n x p), follow these steps (use mat_mult() for ALL matrix multiplications):
Column mean vector (p x 1): X_bar = (1/n) * t(X) %*% 1_n where 1_n is a column vector of ones with n elements.
Centered matrix and covariance matrix: X_c = X - 1_n %% t(X_bar) S = (1/(n-1)) t(X_c) %*% X_c
Correlation matrix: D_inv_half = diag(1 / sqrt(diag(S))) # D^{-1/2} R = D^{-1/2} %% S %% D^{-1/2}
Hint: use t() for matrix transpose; use diag(1/sqrt(diag(S))) for D^{-1/2}.
corr_matrix <- function(X) {
X <- as.matrix(X)
n <- nrow(X)
vector1 <- matrix(1, ncol = 1, nrow=n)
# TODO (i): compute column mean vector X_bar
X_bar = (1/n)*(mat_mult(t(X),vector1))
# TODO (ii): compute centered matrix X_c and covariance matrix S
X_c = X - mat_mult(vector1, t(X_bar))
Cov_S <- (1/(n-1))* mat_mult(t(X_c), X_c)
# TODO (iii): compute D^{-1/2} and correlation matrix R
D_12 = diag(1/sqrt(diag(Cov_S)))
D_12S = mat_mult(D_12, Cov_S)
R = mat_mult(D_12S,D_12)
# Return R
return (R)
}
# Apply to air quality data and display
R_hat <- corr_matrix(data)
print(round(R_hat, 6))
[,1] [,2] [,3] [,4]
[1,] 1.000000 0.348342 -0.612497 0.698541
[2,] 0.348342 1.000000 -0.127183 0.294088
[3,] -0.612497 -0.127183 1.000000 -0.497190
[4,] 0.698541 0.294088 -0.497190 1.000000
# Validation
cat("\nElement-wise match with cor():\n")
Element-wise match with cor():
print(round(R_hat, 10) == round(cor(data), 10))
Ozone Solar.R Wind Temp
Ozone TRUE TRUE TRUE TRUE
Solar.R TRUE TRUE TRUE TRUE
Wind TRUE TRUE TRUE TRUE
Temp TRUE TRUE TRUE TRUE
Extend corr_matrix() to also compute a p-value for every pair (i, j). Output: a list with two matrices: $R – correlation matrix (from sub-question b) $P – p-value matrix
Steps for each off-diagonal pair (i, j) with i != j:
Test statistic: t_ij = r_ij * sqrt(n - 2) / sqrt(1 - r_ij^2) where df = n - 2.
p-value (two-sided): p_ij = 1 - 2 * integral_0^{|t_ij|} f(t; df) dt
Trapezoidal approximation: integral ≈ (h/2) * [f(0; df) + 2 * sum_{k=1}^{N-1} f(kh; df) + f(Nh; df)] with h = |t_ij| / N, N = 10000. Use dt(t, df) as the t-distribution pdf.
For diagonal elements (i == j): set p_ii = 0.
Apply to air quality data with alpha = 0.05 and display both matrices. Hint: use dt(t, df) for the t-distribution pdf f(t; df).
trap <- function(a, b, n = 10000, df) {
h <- (b - a) / n
integral <- (f(a, df) + f(b, df)) / 2
for (k in 1:(n-1)) {
integral <- integral + f(a + k*h, df)
}
return(integral * h)
}
corr_test_matrix <- function(X, alpha = 0.05) {
X <- as.matrix(X)
n <- nrow(X)
p <- ncol(X)
df <- n - 2
N <- 10000
R <- corr_matrix(X)
P <- matrix(0, nrow = p, ncol = p)
rownames(P) <- colnames(X)
colnames(P) <- colnames(X)
for (i in 1:p) {
for (j in 1:p) {
if (i == j) {
P[i, j] <- 0
} else {
t_ij <- R[i,j] * sqrt(n - 2) / sqrt(1 - R[i,j]^2)
integral <- trap(0, abs(t_ij), N, df)
P[i,j] <- 1 - 2 * integral
}
}
}
return(list(R = R, P = P))
}
Validation:
# ----- Apply to air quality data -----
results <- corr_test_matrix(data, alpha = 0.05)
{
cat("========== Correlation Matrix (R) ==========\n")
print(round(results$R, 6))
cat("\n========== P-value Matrix (P) ==========\n")
print(round(results$P, 6))
}
========== Correlation Matrix (R) ==========
[,1] [,2] [,3] [,4]
[1,] 1.000000 0.348342 -0.612497 0.698541
[2,] 0.348342 1.000000 -0.127183 0.294088
[3,] -0.612497 -0.127183 1.000000 -0.497190
[4,] 0.698541 0.294088 -0.497190 1.000000
========== P-value Matrix (P) ==========
Ozone Solar.R Wind Temp
Ozone 0.000000 0.000179 0.000000 0.000000
Solar.R 0.000179 0.000000 0.183452 0.001731
Wind 0.000000 0.183452 0.000000 0.000000
Temp 0.000000 0.001731 0.000000 0.000000
# Validation: compare P against cor.test() p-values
cor_mat <- matrix(0, ncol=4, nrow=4)
for (i in 1:4) {
for (j in 1:4) {
if (i == j) {
cor_mat[i,j] <- 0
} else {
cor_mat[i,j] <- cor.test(data[,i], data[,j])$p.value
}
}
}
rownames(cor_mat) <- colnames(cor_mat) <- names(data)
{
cat("\nCorrelation Matrix\n")
print(cor(data))
cat("\nP-value Matrix\n")
print(cor_mat)
}
Correlation Matrix
Ozone Solar.R Wind Temp
Ozone 1.0000000 0.3483417 -0.6124966 0.6985414
Solar.R 0.3483417 1.0000000 -0.1271835 0.2940876
Wind -0.6124966 -0.1271835 1.0000000 -0.4971897
Temp 0.6985414 0.2940876 -0.4971897 1.0000000
P-value Matrix
Ozone Solar.R Wind Temp
Ozone 0.000000e+00 0.0001793109 9.089415e-13 1.552677e-17
Solar.R 1.793109e-04 0.0000000000 1.834520e-01 1.730786e-03
Wind 9.089415e-13 0.1834519762 0.000000e+00 2.841966e-08
Temp 1.552677e-17 0.0017307856 2.841966e-08 0.000000e+00
# Verify answer
round(results$P, 10) == round(cor_mat, 10)
Ozone Solar.R Wind Temp
Ozone TRUE TRUE TRUE TRUE
Solar.R TRUE TRUE FALSE FALSE
Wind TRUE FALSE TRUE TRUE
Temp TRUE FALSE TRUE TRUE