\[X_1 = \begin{bmatrix} 3&7 \\2&4 \\ 4&7 \end{bmatrix}, X_2=\begin{bmatrix} 6&9 \\5&7 \\ 4&8 \end{bmatrix}\] Find the discriminant function and use it to classify a new data point \(x_{new}=\begin{bmatrix}2&7\end{bmatrix}\)
x1bar <- colMeans(iris[iris$Species=="setosa",1:4])
x2bar <- colMeans(iris[iris$Species!="setosa",1:4])
table(iris$Species)
##
## setosa versicolor virginica
## 50 50 50
S_pl <- ((49*var(iris[iris$Species=="setosa",1:4])+
99*var(iris[iris$Species!="setosa",1:4]))/148)
solve(S_pl)%*%(t(t(x1bar-x2bar)))
## [,1]
## Sepal.Length 3.186486
## Sepal.Width 11.719430
## Petal.Length -10.841575
## Petal.Width -2.773537
m1 <- matrix(c(3,7,2,4,4,7),byrow=T, nrow=3)
m2 <- matrix(c(6,9,5,7,4,8),byrow=T, nrow=3)
colmean.m1<- colMeans(m1)
colmean.m2<- colMeans(m2)
S_p1<- ((2*var(m1))+2*var(m2))/4
a<-solve(S_p1)%*%(t(t(colmean.m1-colmean.m2)))
z1bar<- t(a)%*%colmean.m1
z2bar<-t(a)%*%colmean.m2
mynew<-t(a)%*%as.matrix(c(2,7))
predicting<- ifelse(abs(mynew-z1bar)>abs(mynew-z2bar),"group2","group1")
predicting
## [,1]
## [1,] "group1"
So, using discriminant analysis, the observation belongs to \(X_1\)
#from https://www.youtube.com/watch?v=Uzj-_OQMXHI
#Example of Spectral Decomposition
A = matrix(c(1,2,3,2,3,5,3,5,6),nrow=3,byrow=T)
P1=t(t(eigen(A)$vectors[,1]))%*%t(eigen(A)$vectors[,1])
P2=t(t(eigen(A)$vectors[,2]))%*%t(eigen(A)$vectors[,2])
P3=t(t(eigen(A)$vectors[,3]))%*%t(eigen(A)$vectors[,3])
a1 = eigen(A)$values[1]*P1+eigen(A)$values[2]*P2+eigen(A)$values[3]*P3
#proof we arrived to the same matrix
A
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 2 3 5
## [3,] 3 5 6
a1
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 2 3 5
## [3,] 3 5 6
\[\sum= \begin{bmatrix}1&-2&0\\-2&5&0\\0&0&1 \end{bmatrix}\] Calculate the principal components and interpret their meaning. What can be gained from knowing these values?
A= matrix(c(1,-2,0,-2,5,0,0,0,1),byrow = T,nrow=3)
prcomp(A)
## Standard deviations (1, .., p=3):
## [1] 3.917433e+00 5.660226e-01 2.451778e-16
##
## Rotation (n x k) = (3 x 3):
## PC1 PC2 PC3
## [1,] -0.38945282 -0.13348418 0.9113224
## [2,] 0.92038207 -0.01882761 0.3905667
## [3,] -0.03497646 0.99087209 0.1301889
summary(prcomp(A))
## Importance of components:
## PC1 PC2 PC3
## Standard deviation 3.9174 0.56602 2.452e-16
## Proportion of Variance 0.9796 0.02045 0.000e+00
## Cumulative Proportion 0.9796 1.00000 1.000e+00
We know that PC1 explains 97% of the variation in the data. And we also know that the column vector 2 \(\vec{v}_2\) is the one that is Highest correlated to PC1. In other words, \(\vec{v}_2\) is the vector that helps to best define the different possible groups that exist in the data.
library(ACSWR)
data("stackloss")
summary(prcomp(stackloss))
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 13.932 4.7916 2.6822 1.40006
## Proportion of Variance 0.858 0.1015 0.0318 0.00866
## Cumulative Proportion 0.858 0.9595 0.9913 1.00000
PC1 explains 85% of the variation in the original dataset
pca.iris<-prcomp(iris[1:4])
#result of pca iris
summary(pca.iris)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 2.0563 0.49262 0.2797 0.15439
## Proportion of Variance 0.9246 0.05307 0.0171 0.00521
## Cumulative Proportion 0.9246 0.97769 0.9948 1.00000
pca.setosa<-prcomp(iris[iris$Species=='setosa',1:4])
pca.versicolor<-prcomp(iris[iris$Species=='versicolor',1:4])
pca.virginica<-prcomp(iris[iris$Species=='virginica',1:4])
#testing for equality of variances:
res <- boxM(iris[, 1:4], iris[, "Species"])
#summary tells us that p is low enough to say that covariance matrices are different from each other
summary(res)
## Summary for Box's M-test of Equality of Covariance Matrices
##
## Chi-Sq: 140.943
## df: 20
## p-value: < 2.2e-16
##
## log of Covariance determinants:
## setosa versicolor virginica pooled
## -13.067360 -10.874325 -8.927058 -9.958539
##
## Eigenvalues:
## setosa versicolor virginica pooled
## 1 0.236455690 0.487873944 0.69525484 0.44356592
## 2 0.036918732 0.072384096 0.10655123 0.08618331
## 3 0.026796399 0.054776085 0.05229543 0.05535235
## 4 0.009033261 0.009790365 0.03426585 0.02236372
##
## Statistics based on eigenvalues:
## setosa versicolor virginica pooled
## product 2.113088e-06 1.893828e-05 0.0001327479 4.732183e-05
## sum 3.092041e-01 6.248245e-01 0.8883673469 6.074653e-01
## precision 5.576122e-03 7.338788e-03 0.0169121236 1.304819e-02
## max 2.364557e-01 4.878739e-01 0.6952548382 4.435659e-01
Vieing differences of variances visually:
dets <- res$logDet
ng <- length(res$logDet)-1
dotchart(dets, xlab = "log determinant")
points(dets , 1:4,
cex=c(rep(1.5, ng), 2.5),
pch=c(rep(16, ng), 15),
col= c(rep("blue", ng), "red"))
Are PCs different among matrices? (setosa, versicolor, virginica)
Assuming different variances:
\[[\bar{X}_1 - \bar{X}_2 ]' \begin{pmatrix}\frac{1}{n_1}S+\frac{1}{n_2}S_2 \end{pmatrix}^{-1}[\bar{X}_1 - \bar{X}_2 ]\]
#assuming diff. variances
#setosa
setosa.means<-as.matrix(colMeans(pca.setosa$rotation))
setosa.covar<- cov(pca.setosa$rotation)
#versicolor
versicolor.means<-as.matrix(colMeans(pca.versicolor$rotation))
versicolor.covar<- cov(pca.versicolor$rotation)
#virginica
virginica.means<-as.matrix(colMeans(pca.virginica$rotation))
virginica.covar<- cov(pca.virginica$rotation)
##########calculating T^2
#setosa vs versicolor T2
t2.setosa.versicolor = t(setosa.means - versicolor.means)%*%solve((setosa.covar/4)+(versicolor.covar/4))%*%(setosa.means - versicolor.means)
#setosa vs virginica T2
t2.setosa.virginica = t(setosa.means - virginica.means)%*%solve((setosa.covar/4)+(virginica.covar/4))%*%(setosa.means - virginica.means)
#virginica vs versicolor T2
t2.versicolor.virginica = t(versicolor.means-virginica.means)%*%solve((versicolor.covar/4)+(virginica.covar/4))%*%(versicolor.means-virginica.means)
Now, we will use the Chi-Square distribution to find the 95% confidence intervals:
\[\bar{x}_i - \bar{y}_i +/- \sqrt{X^2_{crit} \begin{pmatrix} \frac{S^2_{X_i}}{n_X} + \frac{S^2_{Y_i}}{n_Y} \end{pmatrix}}\]
#95% Conf Int Setosa vs Versicolor
#PC1
criticalValueT2<- qchisq(0.95,2)
pc=1
diff.Means.seto.versi.PC1 = setosa.means[pc,] - versicolor.means[pc,]
varPC1.setosa<- setosa.covar[1,pc]
varPC1.versicolor<- versicolor.covar[1,pc]
pc1.upper.seto.versi <- diff.Means.seto.versi.PC1 + sqrt(criticalValueT2 * ((varPC1.setosa^2/4) + (varPC1.versicolor^2/4) ))
pc1.lower.seto.versi <- diff.Means.seto.versi.PC1 - sqrt(criticalValueT2 * ((varPC1.setosa^2/4) + (varPC1.versicolor^2/4) ))
#PC1 is not very diff since 95% conf passes by 0
pc1.lower.seto.versi;pc1.upper.seto.versi
## PC1
## -0.1050786
## PC1
## 0.2387705
#PC2
pc=2
diff.Means.seto.versi.PC2 = setosa.means[pc,] - versicolor.means[pc,]
varPC2.setosa<- setosa.covar[pc,pc]
varPC2.versicolor<- versicolor.covar[pc,pc]
pc2.upper.seto.versi <- diff.Means.seto.versi.PC2 + sqrt(criticalValueT2 * ((varPC2.setosa^2/4) + (varPC2.versicolor^2/4) ))
pc2.lower.seto.versi <- diff.Means.seto.versi.PC2 - sqrt(criticalValueT2 * ((varPC2.setosa^2/4) + (varPC2.versicolor^2/4) ))
#PC2 is not very diff since 95% conf passes by 0
pc2.lower.seto.versi;pc2.upper.seto.versi
## PC2
## -0.5218258
## PC2
## 0.5324237
#PC3
pc=3
diff.Means.seto.versi.PC3 = setosa.means[pc,] - versicolor.means[pc,]
varPC3.setosa<- setosa.covar[pc,pc]
varPC3.versicolor<- versicolor.covar[pc,pc]
pc3.upper.seto.versi <- diff.Means.seto.versi.PC3 + sqrt(criticalValueT2 * ((varPC3.setosa^2/4) + (varPC3.versicolor^2/4) ))
pc3.lower.seto.versi <- diff.Means.seto.versi.PC3 - sqrt(criticalValueT2 * ((varPC3.setosa^2/4) + (varPC3.versicolor^2/4) ))
#PC3 is not very diff since 95% conf passes by 0
pc3.lower.seto.versi;pc3.upper.seto.versi
## PC3
## -0.6583737
## PC3
## 0.3792306
#PC4
pc=4
diff.Means.seto.versi.PC4 = setosa.means[pc,] - versicolor.means[pc,]
varPC4.setosa<- setosa.covar[pc,pc]
varPC4.versicolor<- versicolor.covar[pc,pc]
pc4.upper.seto.versi <- diff.Means.seto.versi.PC4 + sqrt(criticalValueT2 * ((varPC4.setosa^2/4) + (varPC4.versicolor^2/4) ))
pc4.lower.seto.versi <- diff.Means.seto.versi.PC4 - sqrt(criticalValueT2 * ((varPC4.setosa^2/4) + (varPC4.versicolor^2/4) ))
#PC4 is not very diff since 95% conf passes by 0
pc4.lower.seto.versi;pc4.upper.seto.versi
## PC4
## -0.2415984
## PC4
## 0.815016
All principal components are very similar at the 95% conf interval between Setosa and Versicolor.
Next, Setosa and Virginica:
#95% Conf Int Setosa vs Virginica
#PC1
criticalValueT2<- qchisq(0.95,2)
pc=1
diff.Means.seto.versi.PC1 = setosa.means[pc,] - virginica.means[pc,]
varPC1.setosa<- setosa.covar[1,pc]
varPC1.versicolor<- virginica.covar[1,pc]
pc1.upper.seto.versi <- diff.Means.seto.versi.PC1 + sqrt(criticalValueT2 * ((varPC1.setosa^2/4) + (varPC1.versicolor^2/4) ))
pc1.lower.seto.versi <- diff.Means.seto.versi.PC1 - sqrt(criticalValueT2 * ((varPC1.setosa^2/4) + (varPC1.versicolor^2/4) ))
#PC1 is diff since 95% conf interval does not pass by 0
pc1.lower.seto.versi;pc1.upper.seto.versi
## PC1
## -1.010576
## PC1
## -0.6190765
#PC2
pc=2
diff.Means.seto.versi.PC2 = setosa.means[pc,] - virginica.means[pc,]
varPC2.setosa<- setosa.covar[pc,pc]
varPC2.versicolor<- virginica.covar[pc,pc]
pc2.upper.seto.versi <- diff.Means.seto.versi.PC2 + sqrt(criticalValueT2 * ((varPC2.setosa^2/4) + (varPC2.versicolor^2/4) ))
pc2.lower.seto.versi <- diff.Means.seto.versi.PC2 - sqrt(criticalValueT2 * ((varPC2.setosa^2/4) + (varPC2.versicolor^2/4) ))
#PC2 is not very diff since 95% conf interval passes by 0
pc2.lower.seto.versi;pc2.upper.seto.versi
## PC2
## -0.5856551
## PC2
## 0.3681351
#PC3
pc=3
diff.Means.seto.versi.PC3 = setosa.means[pc,] - virginica.means[pc,]
varPC3.setosa<- setosa.covar[pc,pc]
varPC3.versicolor<- virginica.covar[pc,pc]
pc3.upper.seto.versi <- diff.Means.seto.versi.PC3 + sqrt(criticalValueT2 * ((varPC3.setosa^2/4) + (varPC3.versicolor^2/4) ))
pc3.lower.seto.versi <- diff.Means.seto.versi.PC3 - sqrt(criticalValueT2 * ((varPC3.setosa^2/4) + (varPC3.versicolor^2/4) ))
#PC3 is not very diff since 95% conf passes by 0
pc3.lower.seto.versi;pc3.upper.seto.versi
## PC3
## -0.6826041
## PC3
## 0.3618551
#PC4
pc=4
diff.Means.seto.versi.PC4 = setosa.means[pc,] - virginica.means[pc,]
varPC4.setosa<- setosa.covar[pc,pc]
varPC4.versicolor<- virginica.covar[pc,pc]
pc4.upper.seto.versi <- diff.Means.seto.versi.PC4 + sqrt(criticalValueT2 * ((varPC4.setosa^2/4) + (varPC4.versicolor^2/4) ))
pc4.lower.seto.versi <- diff.Means.seto.versi.PC4 - sqrt(criticalValueT2 * ((varPC4.setosa^2/4) + (varPC4.versicolor^2/4) ))
#PC4 is not very diff since 95% conf passes by 0
pc4.lower.seto.versi;pc4.upper.seto.versi
## PC4
## -0.3974639
## PC4
## 0.6916612
The only difference between setosa and virginica was noticed in PC1 (all other PC’s were relatively similar). Assuming that PC1 was equally important as all other principal components, I would say that’s not a big deal. However, PC1 is the variable that explains the highest variation in the data, hence, we could perfectly argue that setosa and virginica are very different given that PC1 is different. That can be noticed by seeing how all variables are inversely proportional to PC1 in Setosa, but quite the opposite with Virginica. Finally, let’s compare Versicolor and Virginica:
#95% Conf Int Setosa vs Virginica
#PC1
criticalValueT2<- qchisq(0.95,2)
pc=1
diff.Means.seto.versi.PC1 = versicolor.means[pc,] - virginica.means[pc,]
varPC1.setosa<- versicolor.covar[1,pc]
varPC1.versicolor<- virginica.covar[1,pc]
pc1.upper.seto.versi <- diff.Means.seto.versi.PC1 + sqrt(criticalValueT2 * ((varPC1.setosa^2/4) + (varPC1.versicolor^2/4) ))
pc1.lower.seto.versi <- diff.Means.seto.versi.PC1 - sqrt(criticalValueT2 * ((varPC1.setosa^2/4) + (varPC1.versicolor^2/4) ))
#PC1 is diff since 95% conf interval does not pass by 0
pc1.lower.seto.versi;pc1.upper.seto.versi
## PC1
## -1.013991
## PC1
## -0.7493536
#PC2
pc=2
diff.Means.seto.versi.PC2 = versicolor.means[pc,] - virginica.means[pc,]
varPC2.setosa<- versicolor.covar[pc,pc]
varPC2.versicolor<- virginica.covar[pc,pc]
pc2.upper.seto.versi <- diff.Means.seto.versi.PC2 + sqrt(criticalValueT2 * ((varPC2.setosa^2/4) + (varPC2.versicolor^2/4) ))
pc2.lower.seto.versi <- diff.Means.seto.versi.PC2 - sqrt(criticalValueT2 * ((varPC2.setosa^2/4) + (varPC2.versicolor^2/4) ))
#PC2 is not very diff since 95% conf interval passes by 0
pc2.lower.seto.versi;pc2.upper.seto.versi
## PC2
## -0.5929355
## PC2
## 0.3648176
#PC3
pc=3
diff.Means.seto.versi.PC3 = versicolor.means[pc,] - virginica.means[pc,]
varPC3.setosa<- versicolor.covar[pc,pc]
varPC3.versicolor<- virginica.covar[pc,pc]
pc3.upper.seto.versi <- diff.Means.seto.versi.PC3 + sqrt(criticalValueT2 * ((varPC3.setosa^2/4) + (varPC3.versicolor^2/4) ))
pc3.lower.seto.versi <- diff.Means.seto.versi.PC3 - sqrt(criticalValueT2 * ((varPC3.setosa^2/4) + (varPC3.versicolor^2/4) ))
#PC3 is not very diff since 95% conf passes by 0
pc3.lower.seto.versi;pc3.upper.seto.versi
## PC3
## -0.5875802
## PC3
## 0.5459744
#PC4
pc=4
diff.Means.seto.versi.PC4 = versicolor.means[pc,] - virginica.means[pc,]
varPC4.setosa<- versicolor.covar[pc,pc]
varPC4.versicolor<- virginica.covar[pc,pc]
pc4.upper.seto.versi <- diff.Means.seto.versi.PC4 + sqrt(criticalValueT2 * ((varPC4.setosa^2/4) + (varPC4.versicolor^2/4) ))
pc4.lower.seto.versi <- diff.Means.seto.versi.PC4 - sqrt(criticalValueT2 * ((varPC4.setosa^2/4) + (varPC4.versicolor^2/4) ))
#PC4 is not very diff since 95% conf passes by 0
pc4.lower.seto.versi;pc4.upper.seto.versi
## PC4
## -0.7001361
## PC4
## 0.4209157
As expected, we can see tha PC1 is very different between virginica and versicolor. Nonetheless, all other PC’s were relatively similar.
library(ACSWR)
data("stiff")
#exploring
str(stiff)
## 'data.frame': 30 obs. of 4 variables:
## $ x1: int 1889 2403 2119 1645 1976 1712 1943 2104 2983 1745 ...
## $ x2: int 1651 2048 1700 1627 1916 1712 1685 1820 2794 1600 ...
## $ x3: int 1561 2087 1815 1110 1614 1439 1271 1717 2412 1384 ...
## $ x4: int 1778 2197 2222 1533 1883 1546 1671 1874 2581 1508 ...
#initial result with outliers
summary(prcomp(stiff))
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 602.6758 163.74996 87.68382 74.50423
## Proportion of Variance 0.9007 0.06649 0.01907 0.01376
## Cumulative Proportion 0.9007 0.96717 0.98624 1.00000
#identifying outliers
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#calculating Maha distance for Data
mahaDistance<-mahalanobis(stiff,colMeans(stiff),var(stiff))
stiff$MahaDistance<- mahaDistance
stiff1<-arrange(stiff,desc(MahaDistance))
#identifying outliers (first 5 outliers)
stiff1$outliers<- c(rep("outlier",5),rep("not-outlier",nrow(stiff1)-5))
no.outlier.stiff1<-stiff1[stiff1$outliers=='not-outlier',]
#PCA without outlier
summary(prcomp(no.outlier.stiff1[1:4]))
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 432.5844 84.91355 76.31435 58.4758
## Proportion of Variance 0.9192 0.03542 0.02861 0.0168
## Cumulative Proportion 0.9192 0.95460 0.98320 1.0000
When removing the outliers we find that PC1 becomes more significant in detecting variation.
N4. Canonical correlation analysis quantifies the correlation between a linear combination of variables in one set with a linear combination of potentially different variables in another set and maximizes such correlation among the space of linear combinations. Use equation (15.6) to give the sample canonical correlations if the sample covariance/variance matrix is: [25 pts]
\[S=\begin{bmatrix}S_{yy}&S_{yx}\\S_{xy}&S_{xx} \end{bmatrix}= \begin{bmatrix}8&2&3&1\\2&5&-1&3\\3&-1&6&-2\\1&3&-2&7 \end{bmatrix}\]
Syy = matrix(c(8,2,2,5),byrow = T,nrow=2)
Syx = matrix(c(3,1,-1,3),byrow = T,nrow=2)
Sxx = matrix(c(6,-2,-2,7),byrow = T,nrow=2)
Sxy = matrix(c(3,-1,1,3),byrow = T,nrow=2)
n.matrix<-solve(Syy)%*%Syx%*%solve(Sxx)%*%Sxy
correlation.final=eigen(n.matrix)$values[1]*eigen(n.matrix)$values[2]
correlation.final
## [1] 0.07309942