setwd("/Users/junwen/Documents/Multivariate Data Analysis via Matrix Decompositions")
url_data <- "http://www.stat.uchicago.edu/~lekheng/courses/331/hw3/p1.txt"
data <- read.table(url(url_data))
data <- data[colnames(data)[c(1, 2, 5, 6)]]
Covariance
S <- cov(data)
eig <- eigen(S)
eig
## $values
## [1] 304.189519 27.987394 10.868216 2.312595
##
## $vectors
## [,1] [,2] [,3] [,4]
## [1,] 0.01003256 0.07651991 0.027956590 0.9966255469
## [2,] -0.99332913 0.11503294 0.007985478 0.0009432616
## [3,] -0.02416098 -0.14029264 -0.989056096 0.0387590027
## [4,] -0.11230686 -0.98042343 0.144646812 0.0723489409
m = 1
L <- as.matrix(sqrt(eig$values[1]) * eig$vectors[,1])
phi <- S - L %*% t(L)
L
## [,1]
## [1,] 0.1749782
## [2,] -17.3246829
## [3,] -0.4213923
## [4,] -1.9587473
phi
## V1 V2 V5 V6
## V1 2.4693826 0.2509540 -0.5116314 -1.888969
## V2 0.2509540 0.3710405 -0.5374218 -3.143735
## V5 -0.5116314 -0.5374218 11.1859593 2.301196
## V6 -1.8889692 -3.1437346 2.3011960 27.141822
m = 2
L <- matrix(c(sqrt(eig$values[1]) * eig$vectors[,1], sqrt(eig$values[2]) * eig$vectors[,2]), nrow = 4, ncol = 2)
phi <- S - L %*% t(L)
L
## [,1] [,2]
## [1,] 0.1749782 0.4048141
## [2,] -17.3246829 0.6085601
## [3,] -0.4213923 -0.7421918
## [4,] -1.9587473 -5.1867451
phi
## V1 V2 V5 V6
## V1 2.305508154 0.0046003149 -0.21118168 0.21069845
## V2 0.004600315 0.0006951005 -0.08575355 0.01271141
## V5 -0.211181680 -0.0857535537 10.63511070 -1.54836354
## V6 0.210698445 0.0127114138 -1.54836354 0.23949741
Correlation
S <- (cor(data))
eig <- eigen(S)
eig
## $values
## [1] 1.5556393 0.9097107 0.8980289 0.6366211
##
## $vectors
## [,1] [,2] [,3] [,4]
## [1,] 0.4520666 -0.2545336 0.77712849 0.35625792
## [2,] -0.5171406 -0.5465407 0.37124600 -0.54409127
## [3,] -0.3824193 0.7708084 0.50521588 -0.06608137
## [4,] -0.6180266 -0.2058161 -0.05481445 0.75675507
m = 1
L <- as.matrix(sqrt(eig$values[1]) * eig$vectors[,1])
phi <- S - L %*% t(L)
L
## [,1]
## [1,] 0.5638413
## [2,] -0.6450049
## [3,] -0.4769735
## [4,] -0.7708354
phi
## V1 V2 V5 V6
## V1 0.6820829 0.2622385 0.1591125 0.1810360
## V2 0.2622385 0.5839686 -0.1919183 -0.1780689
## V5 0.1591125 -0.1919183 0.7724963 -0.2010258
## V6 0.1810360 -0.1780689 -0.2010258 0.4058129
m = 2
L <- matrix(c(sqrt(eig$values[1]) * eig$vectors[,1], sqrt(eig$values[2]) * eig$vectors[,2]), nrow = 4, ncol = 2)
phi <- S - L %*% t(L)
L
## [,1] [,2]
## [1,] 0.5638413 -0.2427710
## [2,] -0.6450049 -0.5212837
## [3,] -0.4769735 0.7351875
## [4,] -0.7708354 -0.1963048
phi
## V1 V2 V5 V6
## V1 0.6231452 0.1356860 0.33759467 0.13337892
## V2 0.1356860 0.3122319 0.19132301 -0.28039938
## V5 0.3375947 0.1913230 0.23199564 -0.05670501
## V6 0.1333789 -0.2803994 -0.05670501 0.36727729
x <- matrix(c(8, 2, 2, 5), nrow = 2, ncol = 2)
s <- svd(x)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmax <- solve(m)
x <- matrix(c(6, -2, -2, 7), nrow = 2, ncol = 2)
s <- svd(x)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmay <- solve(m)
sigmax %*% matrix(c(3, 1, -1, 3), nrow = 2, ncol = 2) %*% sigmay
## [,1] [,2]
## [1,] 0.4019420 -0.1555726
## [2,] 0.2077044 0.5922643
x <- sigmax %*% matrix(c(3, 1, -1, 3), nrow = 2, ncol = 2) %*% sigmay
s <- svd(x)
s
## $d
## [1] 0.6279151 0.4305822
##
## $u
## [,1] [,2]
## [1,] -0.04147153 0.99913969
## [2,] 0.99913969 0.04147153
##
## $v
## [,1] [,2]
## [1,] 0.3039528 0.9526871
## [2,] 0.9526871 -0.3039528
sigmax %*% s$u[,1]
## [,1]
## [1,] -0.08181554
## [2,] 0.46902996
sigmax %*% s$u[,2]
## [,1]
## [1,] 0.36358645
## [2,] -0.04725593
sigmay %*% s$v[,1]
## [,1]
## [1,] 0.1903917
## [2,] 0.3931389
sigmay %*% s$v[,2]
## [,1]
## [1,] 0.3846577
## [2,] -0.0577626
url_data <- "http://www.stat.uchicago.edu/~lekheng/courses/331/hw3/p3.txt"
data <- read.table(url(url_data))
data <- data[colnames(data)[c(1, 2, 3, 4)]]
cov(data)
## V1 V2 V3 V4
## V1 105616.30 94613.53 87289.71 94230.73
## V2 94613.53 101510.12 76137.10 81064.36
## V3 87289.71 76137.10 91917.09 90352.38
## V4 94230.73 81064.36 90352.38 104227.96
D <- cov(data)
X <- D[c('V1', 'V2'), c('V1', 'V2')]
Y <- D[c('V3', 'V4'), c('V3', 'V4')]
XY <- D[c('V1', 'V2'), c('V3', 'V4')]
s <- svd(X)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaX <- solve(m)
s <- svd(Y)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaY <- solve(m)
G <- sigmaX%*% XY %*% sigmaY
G
## [,1] [,2]
## [1,] 0.5090313 0.6236580
## [2,] 0.3310498 0.2835438
svd(G)
## $d
## [1] 0.91291935 0.06805556
##
## $u
## [,1] [,2]
## [1,] -0.8811069 -0.4729172
## [2,] -0.4729172 0.8811069
##
## $v
## [,1] [,2]
## [1,] -0.6627860 0.7488089
## [2,] -0.7488089 -0.6627860
sigmaX %*% s$u[,1]
## [,1]
## [1,] -0.001271884
## [2,] -0.001908628
sigmaY %*% s$v[,1]
## [,1]
## [1,] -0.001571773
## [2,] -0.001682496
U1 <- t(sigmaX %*% s$u[,1]) %*% t(as.matrix(data[colnames(data)[c(1, 2)]]))
V1 <- t(sigmaY %*% s$v[,1]) %*% t(as.matrix(data[colnames(data)[c(3, 4)]]))
plot(U1, V1)
U2 <- t(sigmaX %*% s$u[,2]) %*% t(as.matrix(data[colnames(data)[c(1, 2)]]))
V2 <- t(sigmaY %*% s$v[,2]) %*% t(as.matrix(data[colnames(data)[c(3, 4)]]))
plot(U2, V2)
D <- cor(data)
D
## V1 V2 V3 V4
## V1 1.0000000 0.9137620 0.8859301 0.8981212
## V2 0.9137620 1.0000000 0.7882129 0.7881034
## V3 0.8859301 0.7882129 1.0000000 0.9231013
## V4 0.8981212 0.7881034 0.9231013 1.0000000
X <- D[c('V1', 'V2'), c('V1', 'V2')]
Y <- D[c('V3', 'V4'), c('V3', 'V4')]
XY <- D[c('V1', 'V2'), c('V3', 'V4')]
s <- svd(X)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaX <- solve(m)
s <- svd(Y)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaY <- solve(m)
G <- sigmaX%*% XY %*% sigmaY
G
## [,1] [,2]
## [1,] 0.5197974 0.6110687
## [2,] 0.3402696 0.2804917
mydata <- data.frame(matrix(c(0.9, 0.6, 0.6, -0.9, 0, 0.6, -0.6, 0), nrow = 4, ncol = 2))
apply(mydata, 2, mean)
## X1 X2
## 0.3 0.0
cov(mydata)
## X1 X2
## X1 0.66 0.00
## X2 0.00 0.24
Y <- matrix(c(1, -0.051, -0.051, 1), nrow = 2, ncol = 2)
s <- svd(Y)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaY <- solve(m)
u <- matrix(c(0.166, 0.694), nrow = 1, ncol = 2)
u %*% sigmaY
## [,1] [,2]
## [1,] 0.183888 0.6989181
svd(u %*% sigmaY)
## $d
## [1] 0.7227042
##
## $u
## [,1]
## [1,] -1
##
## $v
## [,1]
## [1,] -0.2544444
## [2,] -0.9670874
v <- matrix(c(-0.2544, -0.9671), nrow = 1, ncol = 2)
v %*% sigmaY
## [,1] [,2]
## [1,] -0.2793498 -0.9745428
X <- matrix(c(1, -0.291, -0.291, 1), nrow = 2, ncol = 2)
Y <- matrix(c(1, 0.181, 0.181, 1), nrow = 2, ncol = 2)
XY <- matrix(c(0.44, 0.372, -0.205, 0.243), nrow = 2, ncol = 2)
s <- svd(X)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaX <- solve(m)
s <- svd(Y)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaY <- solve(m)
G <- sigmaX%*% XY %*% sigmaY
G
## [,1] [,2]
## [1,] 0.5346741 -0.2240962
## [2,] 0.4376401 0.1806860
svd(G)
## $d
## [1] 0.6939516 0.2805406
##
## $u
## [,1] [,2]
## [1,] -0.7993146 -0.6009128
## [2,] -0.6009128 0.7993146
##
## $v
## [,1] [,2]
## [1,] -0.9948192 0.1016596
## [2,] 0.1016596 0.9948192
sigmaX %*% s$u[,1]
## [,1]
## [1,] -0.839773
## [2,] -0.839773
sigmaY %*% s$v[,1]
## [,1]
## [1,] -0.6506689
## [2,] -0.6506689
url_data <- "http://www.stat.uchicago.edu/~lekheng/courses/331/hw3/p6.txt"
gsbdata <- read.table(url(url_data))
colnames(gsbdata) <- c('GPA', 'GMAT', 'admit')
x1 <- gsbdata[gsbdata$admit == 1, c(1,2)]
x1.mean <- apply(x1, 2, mean)
x1.mean
## GPA GMAT
## 3.403871 561.225806
x2 <- gsbdata[gsbdata$admit == 2, c(1,2)]
x2.mean <- apply(x2, 2, mean)
x2.mean
## GPA GMAT
## 2.4825 447.0714
x3 <- gsbdata[gsbdata$admit == 3, c(1,2)]
x3.mean <- apply(x3, 2, mean)
x3.mean
## GPA GMAT
## 2.992692 446.230769
x <- gsbdata[c(1,2)]
x.mean <- apply(x, 2, mean)
x.mean
## GPA GMAT
## 2.974588 488.447059
x1$GPA <- x1$GPA- x1.mean[1]
x1$GMAT <- x1$GMAT- x1.mean[2]
S1 <- t(as.matrix(x1)) %*% as.matrix(x1)/(length(x1$GPA)-1)
S1
## GPA GMAT
## GPA 0.04355785 5.809677e-02
## GMAT 0.05809677 4.618247e+03
x2$GPA <- x2$GPA- x2.mean[1]
x2$GMAT <- x2$GMAT- x2.mean[2]
S2 <- t(as.matrix(x2)) %*% as.matrix(x2)/(length(x2$GPA)-1)
S2
## GPA GMAT
## GPA 0.03364907 -1.192037
## GMAT -1.19203704 3891.253968
x3$GPA <- x3$GPA- x3.mean[1]
x3$GMAT <- x3$GMAT- x3.mean[2]
S3 <- t(as.matrix(x3)) %*% as.matrix(x3)/(length(x3$GPA)-1)
S3
## GPA GMAT
## GPA 0.02969246 -5.403846
## GMAT -5.40384615 2246.904615
S.pool <- ((length(x1$GPA)-1)*S1 + (length(x2$GPA)-1)*S2 + (length(x3$GPA)-1)*S3)/(length(x$GPA)-3)
S.pool
## GPA GMAT
## GPA 0.03606795 -2.018759
## GMAT -2.01875915 3655.901121
W <- (length(x$GPA)-3)*S.pool
W
## GPA GMAT
## GPA 2.957572 -165.5383
## GMAT -165.538251 299783.8919
B <- length(x1$GPA)*(as.matrix(x1.mean) - as.matrix(x.mean)) %*% t((as.matrix(x1.mean) - as.matrix(x.mean))) + length(x2$GPA)*(as.matrix(x2.mean) - as.matrix(x.mean)) %*% t((as.matrix(x2.mean) - as.matrix(x.mean))) + length(x3$GPA)*(as.matrix(x3.mean) - as.matrix(x.mean)) %*% t((as.matrix(x3.mean) - as.matrix(x.mean)))
B
## GPA GMAT
## GPA 12.50154 1518.744
## GMAT 1518.74390 258471.120
D <- solve(W) %*% B
eigen(D)
## $values
## [1] 5.6460445 0.1906115
##
## $vectors
## [,1] [,2]
## [1,] 0.999998537 -0.999970354
## [2,] 0.001710717 0.007700095
library(ggplot2)
gsbdata$admit <- as.factor(gsbdata$admit)
qplot(GPA, GMAT, data = gsbdata, color = admit)
gsbdata.transformed <- data.frame(as.matrix(gsbdata[c('GPA', 'GMAT')]) %*% solve(t(eigen(D)$vectors)))
colnames(gsbdata.transformed) <- c('Q1', 'Q2')
gsbdata.transformed['admit'] <- gsbdata['admit']
library(ggplot2)
gsbdata.transformed$admit <- as.factor(gsbdata.transformed$admit)
qplot(Q1, Q2, data = gsbdata.transformed, color = admit)