Multivariate Data Analysis HWK3

Q1.

setwd("/Users/junwen/Documents/Multivariate Data Analysis via Matrix Decompositions")
url_data <- "http://www.stat.uchicago.edu/~lekheng/courses/331/hw3/p1.txt"
data <- read.table(url(url_data))
data <- data[colnames(data)[c(1, 2, 5, 6)]]

Covariance

S <- cov(data)
eig <- eigen(S)
eig

## $values
## [1] 304.189519  27.987394  10.868216   2.312595
## 
## $vectors
##             [,1]        [,2]         [,3]         [,4]
## [1,]  0.01003256  0.07651991  0.027956590 0.9966255469
## [2,] -0.99332913  0.11503294  0.007985478 0.0009432616
## [3,] -0.02416098 -0.14029264 -0.989056096 0.0387590027
## [4,] -0.11230686 -0.98042343  0.144646812 0.0723489409

m = 1

L <- as.matrix(sqrt(eig$values[1]) * eig$vectors[,1])
phi <- S - L %*% t(L)
L

##             [,1]
## [1,]   0.1749782
## [2,] -17.3246829
## [3,]  -0.4213923
## [4,]  -1.9587473

phi

##            V1         V2         V5        V6
## V1  2.4693826  0.2509540 -0.5116314 -1.888969
## V2  0.2509540  0.3710405 -0.5374218 -3.143735
## V5 -0.5116314 -0.5374218 11.1859593  2.301196
## V6 -1.8889692 -3.1437346  2.3011960 27.141822

m = 2

L <- matrix(c(sqrt(eig$values[1]) * eig$vectors[,1], sqrt(eig$values[2]) * eig$vectors[,2]), nrow = 4, ncol = 2)
phi <- S - L %*% t(L)
L

##             [,1]       [,2]
## [1,]   0.1749782  0.4048141
## [2,] -17.3246829  0.6085601
## [3,]  -0.4213923 -0.7421918
## [4,]  -1.9587473 -5.1867451

phi

##              V1            V2          V5          V6
## V1  2.305508154  0.0046003149 -0.21118168  0.21069845
## V2  0.004600315  0.0006951005 -0.08575355  0.01271141
## V5 -0.211181680 -0.0857535537 10.63511070 -1.54836354
## V6  0.210698445  0.0127114138 -1.54836354  0.23949741

Correlation

S <- (cor(data))
eig <- eigen(S)
eig

## $values
## [1] 1.5556393 0.9097107 0.8980289 0.6366211
## 
## $vectors
##            [,1]       [,2]        [,3]        [,4]
## [1,]  0.4520666 -0.2545336  0.77712849  0.35625792
## [2,] -0.5171406 -0.5465407  0.37124600 -0.54409127
## [3,] -0.3824193  0.7708084  0.50521588 -0.06608137
## [4,] -0.6180266 -0.2058161 -0.05481445  0.75675507

m = 1

L <- as.matrix(sqrt(eig$values[1]) * eig$vectors[,1])
phi <- S - L %*% t(L)
L

##            [,1]
## [1,]  0.5638413
## [2,] -0.6450049
## [3,] -0.4769735
## [4,] -0.7708354

phi

##           V1         V2         V5         V6
## V1 0.6820829  0.2622385  0.1591125  0.1810360
## V2 0.2622385  0.5839686 -0.1919183 -0.1780689
## V5 0.1591125 -0.1919183  0.7724963 -0.2010258
## V6 0.1810360 -0.1780689 -0.2010258  0.4058129

m = 2

L <- matrix(c(sqrt(eig$values[1]) * eig$vectors[,1], sqrt(eig$values[2]) * eig$vectors[,2]), nrow = 4, ncol = 2)
phi <- S - L %*% t(L)
L

##            [,1]       [,2]
## [1,]  0.5638413 -0.2427710
## [2,] -0.6450049 -0.5212837
## [3,] -0.4769735  0.7351875
## [4,] -0.7708354 -0.1963048

phi

##           V1         V2          V5          V6
## V1 0.6231452  0.1356860  0.33759467  0.13337892
## V2 0.1356860  0.3122319  0.19132301 -0.28039938
## V5 0.3375947  0.1913230  0.23199564 -0.05670501
## V6 0.1333789 -0.2803994 -0.05670501  0.36727729

Q2.

x <- matrix(c(8, 2, 2, 5), nrow = 2, ncol = 2)
s <- svd(x)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmax <- solve(m)

x <- matrix(c(6, -2, -2, 7), nrow = 2, ncol = 2)
s <- svd(x)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmay <- solve(m)
sigmax %*% matrix(c(3, 1, -1, 3), nrow = 2, ncol = 2) %*% sigmay

##           [,1]       [,2]
## [1,] 0.4019420 -0.1555726
## [2,] 0.2077044  0.5922643

x <- sigmax %*% matrix(c(3, 1, -1, 3), nrow = 2, ncol = 2) %*% sigmay
s <- svd(x)
s

## $d
## [1] 0.6279151 0.4305822
## 
## $u
##             [,1]       [,2]
## [1,] -0.04147153 0.99913969
## [2,]  0.99913969 0.04147153
## 
## $v
##           [,1]       [,2]
## [1,] 0.3039528  0.9526871
## [2,] 0.9526871 -0.3039528

sigmax %*% s$u[,1]

##             [,1]
## [1,] -0.08181554
## [2,]  0.46902996

sigmax %*% s$u[,2]

##             [,1]
## [1,]  0.36358645
## [2,] -0.04725593

sigmay %*% s$v[,1]

##           [,1]
## [1,] 0.1903917
## [2,] 0.3931389

sigmay %*% s$v[,2]

##            [,1]
## [1,]  0.3846577
## [2,] -0.0577626

Q3.

url_data <- "http://www.stat.uchicago.edu/~lekheng/courses/331/hw3/p3.txt"
data <- read.table(url(url_data))
data <- data[colnames(data)[c(1, 2, 3, 4)]]

cov(data)

##           V1        V2       V3        V4
## V1 105616.30  94613.53 87289.71  94230.73
## V2  94613.53 101510.12 76137.10  81064.36
## V3  87289.71  76137.10 91917.09  90352.38
## V4  94230.73  81064.36 90352.38 104227.96

D <- cov(data)
X <- D[c('V1', 'V2'), c('V1', 'V2')]
Y <- D[c('V3', 'V4'), c('V3', 'V4')]
XY <- D[c('V1', 'V2'), c('V3', 'V4')]
s <- svd(X)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaX <- solve(m)

s <- svd(Y)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaY <- solve(m)

G <- sigmaX%*% XY %*% sigmaY
G

##           [,1]      [,2]
## [1,] 0.5090313 0.6236580
## [2,] 0.3310498 0.2835438

svd(G)

## $d
## [1] 0.91291935 0.06805556
## 
## $u
##            [,1]       [,2]
## [1,] -0.8811069 -0.4729172
## [2,] -0.4729172  0.8811069
## 
## $v
##            [,1]       [,2]
## [1,] -0.6627860  0.7488089
## [2,] -0.7488089 -0.6627860

sigmaX %*% s$u[,1]

##              [,1]
## [1,] -0.001271884
## [2,] -0.001908628

sigmaY %*% s$v[,1]

##              [,1]
## [1,] -0.001571773
## [2,] -0.001682496

U1 <- t(sigmaX %*% s$u[,1]) %*% t(as.matrix(data[colnames(data)[c(1, 2)]]))
V1 <- t(sigmaY %*% s$v[,1]) %*% t(as.matrix(data[colnames(data)[c(3, 4)]]))
plot(U1, V1)

U2 <- t(sigmaX %*% s$u[,2]) %*% t(as.matrix(data[colnames(data)[c(1, 2)]]))
V2 <- t(sigmaY %*% s$v[,2]) %*% t(as.matrix(data[colnames(data)[c(3, 4)]]))
plot(U2, V2)

D <- cor(data)
D

##           V1        V2        V3        V4
## V1 1.0000000 0.9137620 0.8859301 0.8981212
## V2 0.9137620 1.0000000 0.7882129 0.7881034
## V3 0.8859301 0.7882129 1.0000000 0.9231013
## V4 0.8981212 0.7881034 0.9231013 1.0000000

X <- D[c('V1', 'V2'), c('V1', 'V2')]
Y <- D[c('V3', 'V4'), c('V3', 'V4')]
XY <- D[c('V1', 'V2'), c('V3', 'V4')]
s <- svd(X)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaX <- solve(m)

s <- svd(Y)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaY <- solve(m)

G <- sigmaX%*% XY %*% sigmaY
G

##           [,1]      [,2]
## [1,] 0.5197974 0.6110687
## [2,] 0.3402696 0.2804917

Q4.

mydata <- data.frame(matrix(c(0.9, 0.6, 0.6, -0.9, 0, 0.6, -0.6, 0), nrow = 4, ncol = 2))
apply(mydata, 2, mean)

##  X1  X2 
## 0.3 0.0

cov(mydata)

##      X1   X2
## X1 0.66 0.00
## X2 0.00 0.24

Y <- matrix(c(1, -0.051, -0.051, 1), nrow = 2, ncol = 2)
s <- svd(Y)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaY <- solve(m)
u <- matrix(c(0.166, 0.694), nrow = 1, ncol = 2)
u %*% sigmaY

##          [,1]      [,2]
## [1,] 0.183888 0.6989181

svd(u %*% sigmaY)

## $d
## [1] 0.7227042
## 
## $u
##      [,1]
## [1,]   -1
## 
## $v
##            [,1]
## [1,] -0.2544444
## [2,] -0.9670874

v <- matrix(c(-0.2544, -0.9671), nrow = 1, ncol = 2)
v %*% sigmaY

##            [,1]       [,2]
## [1,] -0.2793498 -0.9745428

X <- matrix(c(1, -0.291, -0.291, 1), nrow = 2, ncol = 2)
Y <- matrix(c(1, 0.181, 0.181, 1), nrow = 2, ncol = 2)
XY <-  matrix(c(0.44, 0.372, -0.205, 0.243), nrow = 2, ncol = 2)
s <- svd(X)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaX <- solve(m)

s <- svd(Y)
m <- s$u %*% diag(sqrt(s$d)) %*% s$u
sigmaY <- solve(m)

G <- sigmaX%*% XY %*% sigmaY
G

##           [,1]       [,2]
## [1,] 0.5346741 -0.2240962
## [2,] 0.4376401  0.1806860

svd(G)

## $d
## [1] 0.6939516 0.2805406
## 
## $u
##            [,1]       [,2]
## [1,] -0.7993146 -0.6009128
## [2,] -0.6009128  0.7993146
## 
## $v
##            [,1]      [,2]
## [1,] -0.9948192 0.1016596
## [2,]  0.1016596 0.9948192

sigmaX %*% s$u[,1]

##           [,1]
## [1,] -0.839773
## [2,] -0.839773

sigmaY %*% s$v[,1]

##            [,1]
## [1,] -0.6506689
## [2,] -0.6506689

Q6.

url_data <- "http://www.stat.uchicago.edu/~lekheng/courses/331/hw3/p6.txt"
gsbdata <- read.table(url(url_data))
colnames(gsbdata) <- c('GPA', 'GMAT', 'admit')

x1 <- gsbdata[gsbdata$admit == 1, c(1,2)]
x1.mean <- apply(x1, 2, mean)
x1.mean

##        GPA       GMAT 
##   3.403871 561.225806

x2 <- gsbdata[gsbdata$admit == 2, c(1,2)]
x2.mean <- apply(x2, 2, mean)
x2.mean

##      GPA     GMAT 
##   2.4825 447.0714

x3 <- gsbdata[gsbdata$admit == 3, c(1,2)]
x3.mean <- apply(x3, 2, mean)
x3.mean

##        GPA       GMAT 
##   2.992692 446.230769

x <- gsbdata[c(1,2)]
x.mean <- apply(x, 2, mean)
x.mean

##        GPA       GMAT 
##   2.974588 488.447059

x1$GPA <- x1$GPA- x1.mean[1] 
x1$GMAT <- x1$GMAT- x1.mean[2]
S1 <- t(as.matrix(x1)) %*% as.matrix(x1)/(length(x1$GPA)-1)
S1

##             GPA         GMAT
## GPA  0.04355785 5.809677e-02
## GMAT 0.05809677 4.618247e+03

x2$GPA <- x2$GPA- x2.mean[1] 
x2$GMAT <- x2$GMAT- x2.mean[2]
S2 <- t(as.matrix(x2)) %*% as.matrix(x2)/(length(x2$GPA)-1)
S2

##              GPA        GMAT
## GPA   0.03364907   -1.192037
## GMAT -1.19203704 3891.253968

x3$GPA <- x3$GPA- x3.mean[1] 
x3$GMAT <- x3$GMAT- x3.mean[2]
S3 <- t(as.matrix(x3)) %*% as.matrix(x3)/(length(x3$GPA)-1)
S3

##              GPA        GMAT
## GPA   0.02969246   -5.403846
## GMAT -5.40384615 2246.904615

S.pool <- ((length(x1$GPA)-1)*S1 + (length(x2$GPA)-1)*S2 + (length(x3$GPA)-1)*S3)/(length(x$GPA)-3)
S.pool

##              GPA        GMAT
## GPA   0.03606795   -2.018759
## GMAT -2.01875915 3655.901121

W <- (length(x$GPA)-3)*S.pool
W

##              GPA        GMAT
## GPA     2.957572   -165.5383
## GMAT -165.538251 299783.8919

B <- length(x1$GPA)*(as.matrix(x1.mean) - as.matrix(x.mean)) %*% t((as.matrix(x1.mean) - as.matrix(x.mean))) + length(x2$GPA)*(as.matrix(x2.mean) - as.matrix(x.mean)) %*% t((as.matrix(x2.mean) - as.matrix(x.mean))) + length(x3$GPA)*(as.matrix(x3.mean) - as.matrix(x.mean)) %*% t((as.matrix(x3.mean) - as.matrix(x.mean)))
B

##             GPA       GMAT
## GPA    12.50154   1518.744
## GMAT 1518.74390 258471.120

D <- solve(W) %*% B
eigen(D)

## $values
## [1] 5.6460445 0.1906115
## 
## $vectors
##             [,1]         [,2]
## [1,] 0.999998537 -0.999970354
## [2,] 0.001710717  0.007700095

library(ggplot2)
gsbdata$admit <- as.factor(gsbdata$admit)
qplot(GPA, GMAT, data = gsbdata, color = admit)

gsbdata.transformed <- data.frame(as.matrix(gsbdata[c('GPA', 'GMAT')]) %*% solve(t(eigen(D)$vectors)))
colnames(gsbdata.transformed) <- c('Q1', 'Q2')
gsbdata.transformed['admit'] <- gsbdata['admit']
library(ggplot2)
gsbdata.transformed$admit <- as.factor(gsbdata.transformed$admit)
qplot(Q1, Q2, data = gsbdata.transformed, color = admit)