Our dataset consists of 40 observations: twenty engineer apprentices and twenty pilots. We will conduct principal component analysis (PCA), factor analysis (FA), and linear and quadratic discriminant analysis (LDA/QDA).
#Read Data
mydata <- read.csv('T5_6_PILOT.DAT.txt', sep = "", header=FALSE)
colnames(mydata) = c("group","intel","form","dynamo","dot","smc","persev")
#Subset Data
engineer <- mydata[which(mydata$group == 1),]
pilot <- mydata[which(mydata$group == 2),]
From the pooled covariance matrix, we obtain the first two principal components as listed below.
\(z_1 = -0.441y_1 - 0.041y_2 + 0.039y_3 - 0.450y_4 + 0.019y_5 - 0.774y_6\)
\(z_2 = 0.190y_1 + 0.038y_2 - 0.031y_3 - 0.892y_4 + 0.001y_5 + 0.407y_6\)
engineer <- engineer[,-1]
pilot <- pilot[,-1]
colMeans(engineer)
## intel form dynamo dot smc persev
## 124.50 38.10 76.20 192.75 53.65 250.30
colMeans(pilot)
## intel form dynamo dot smc persev
## 129.30 31.70 87.40 236.60 44.25 280.20
S1 <- var(engineer);S1
## intel form dynamo dot smc
## intel 384.26316 20.736842 81.31578947 26.605263 -15.92105263
## form 20.73684 68.200000 9.55789474 26.868421 -64.06842105
## dynamo 81.31579 9.557895 121.11578947 -37.789474 0.02105263
## dot 26.60526 26.868421 -37.78947368 1000.197368 -24.35526316
## smc -15.92105 -64.068421 0.02105263 -24.355263 322.45000000
## persev 82.89474 6.705263 -4.06315789 -8.657895 -132.20526316
## persev
## intel 82.894737
## form 6.705263
## dynamo -4.063158
## dot -8.657895
## smc -132.205263
## persev 470.221053
S2 <- var(pilot); S2
## intel form dynamo dot smc persev
## intel 687.800000 69.305263 -52.178947 76.968421 -1.447368 439.20000
## form 69.305263 51.694737 3.336842 -46.968421 -35.342105 56.48421
## dynamo -52.178947 3.336842 110.884211 43.852632 -7.947368 -111.87368
## dot 76.968421 -46.968421 43.852632 792.568421 8.157895 129.76842
## smc -1.447368 -35.342105 -7.947368 8.157895 173.671053 116.21053
## persev 439.200000 56.484211 -111.873684 129.768421 116.210526 1253.74737
n1<-dim(engineer)[1]
n2<-dim(pilot)[1]
p <- dim(engineer)[2]
S_p<-((n1-1)*S1+(n2-1)*S2)/(n1+n2-2);S_p
## intel form dynamo dot smc persev
## intel 536.031579 45.021053 14.568421 51.786842 -8.684211 261.047368
## form 45.021053 59.947368 6.447368 -10.050000 -49.705263 31.594737
## dynamo 14.568421 6.447368 116.000000 3.031579 -3.963158 -57.968421
## dot 51.786842 -10.050000 3.031579 896.382895 -8.098684 60.555263
## smc -8.684211 -49.705263 -3.963158 -8.098684 248.060526 -7.997368
## persev 261.047368 31.594737 -57.968421 60.555263 -7.997368 861.984211
sum(diag(S_p))
## [1] 2718.407
s.eigen <- eigen(S_p)
s.eigen
## eigen() decomposition
## $values
## [1] 1050.59628 858.31581 398.90346 259.14837 108.08917 43.35349
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.44092830 0.1897832542 0.86398819 0.079577443 0.105698146
## [2,] -0.04088997 0.0377138213 0.08242069 -0.227027690 -0.057349251
## [3,] 0.03933709 -0.0313490863 0.14336529 -0.023912831 -0.985372990
## [4,] -0.44974010 -0.8923295867 -0.03264727 0.003570544 0.004135096
## [5,] 0.01936520 0.0009446983 -0.05392188 0.970251939 -0.047798944
## [6,] -0.77441699 0.4066008303 -0.47138542 -0.012347547 -0.110802522
## [,6]
## [1,] 0.07472808
## [2,] -0.96709979
## [3,] 0.07337894
## [4,] -0.01964830
## [5,] -0.23031058
## [6,] 0.01789487
for (s in s.eigen$values) {
print(s / sum(s.eigen$values))
}
## [1] 0.386475
## [1] 0.3157423
## [1] 0.1467416
## [1] 0.09533098
## [1] 0.03976196
## [1] 0.01594813
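As a check, the eigenvalues should sum to the trace of S_p computed above (2718.407), and their cumulative sums give the proportion of variance explained by the first k components. A small sketch using the s.eigen object above:
#Eigenvalues sum to the trace of S_p
sum(s.eigen$values)
#Cumulative proportion of variance explained
cumsum(s.eigen$values) / sum(s.eigen$values)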
The leading columns of the rotation matrix of the object pca, i.e. the eigenvectors, are the principal components of the covariance matrix S. The eigenvalues and the summary of pca below give the cumulative proportion of variance explained by the components.
\(z_1 = -0.212y_1 + 0.039y_2 - 0.080y_3 - 0.776y_4 + 0.096y_5 - 0.580y_6\)
\(z_2 = -0.389y_1 - 0.064y_2 + 0.066y_3 + 0.608y_4 - 0.010y_5 - 0.686y_6\)
\(z_3 = 0.888y_1 + 0.096y_2 + 0.081y_3 + 0.081y_4 + 0.015y_5 - 0.434y_6\)
#Ignoring Groups
S <- cov(mydata[,2:7])
S
## intel form dynamo dot smc persev
## intel 528.19487 35.98974 27.97949 104.42821 -20.03077 291.15385
## form 35.98974 68.91282 -12.09744 -81.75128 -33.00513 -18.28205
## dynamo 27.97949 -12.09744 145.18974 128.88205 -30.85641 29.38462
## dot 104.42821 -81.75128 128.88205 1366.43013 -113.58077 395.18590
## smc -20.03077 -33.00513 -30.85641 -113.58077 264.35641 -79.85897
## persev 291.15385 -18.28205 29.38462 395.18590 -79.85897 1069.11538
sum(diag(S))
## [1] 3442.199
s.eigen <- eigen(S)
s.eigen
## eigen() decomposition
## $values
## [1] 1722.0424 878.3578 401.4386 261.0769 128.9051 50.3785
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.21165160 -0.38949336 0.88819049 0.03082062 -0.04760343
## [2,] 0.03883125 -0.06379320 0.09571590 -0.19128493 -0.14793191
## [3,] -0.08012946 0.06602004 0.08145863 -0.12854488 0.97505667
## [4,] -0.77552673 0.60795970 0.08071120 0.08125631 -0.10891968
## [5,] 0.09593926 -0.01046493 0.01494473 0.96813856 0.10919120
## [6,] -0.58019734 -0.68566916 -0.43426141 0.04518327 0.03644629
## [,6]
## [1,] 0.10677164
## [2,] -0.96269790
## [3,] -0.12379748
## [4,] -0.06295166
## [5,] -0.20309559
## [6,] -0.03572141
for (s in s.eigen$values) {
print(s / sum(s.eigen$values))
}
## [1] 0.5002739
## [1] 0.2551734
## [1] 0.1166227
## [1] 0.07584597
## [1] 0.03744848
## [1] 0.01463556
#Easier Way/Checking Work
pca <- prcomp(mydata[,2:7])
pca
## Standard deviations (1, .., p=6):
## [1] 41.497499 29.637102 20.035932 16.157875 11.353640 7.097781
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5
## intel 0.21165160 -0.38949336 0.88819049 -0.03082062 -0.04760343
## form -0.03883125 -0.06379320 0.09571590 0.19128493 -0.14793191
## dynamo 0.08012946 0.06602004 0.08145863 0.12854488 0.97505667
## dot 0.77552673 0.60795970 0.08071120 -0.08125631 -0.10891968
## smc -0.09593926 -0.01046493 0.01494473 -0.96813856 0.10919120
## persev 0.58019734 -0.68566916 -0.43426141 -0.04518327 0.03644629
## PC6
## intel -0.10677164
## form 0.96269790
## dynamo 0.12379748
## dot 0.06295166
## smc 0.20309559
## persev 0.03572141
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 41.4975 29.6371 20.0359 16.15788 11.35364 7.09778
## Proportion of Variance 0.5003 0.2552 0.1166 0.07585 0.03745 0.01464
## Cumulative Proportion 0.5003 0.7554 0.8721 0.94792 0.98536 1.00000
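As a consistency check (a small sketch using the pca and s.eigen objects above), the squared standard deviations from prcomp should equal the eigenvalues of S:
#prcomp() variances should match the eigenvalues of S computed above
all.equal(pca$sdev^2, s.eigen$values)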
From above, the covariance matrix ignoring groups provides more cumulative explained variance from the first two principal components (75.5% versus 70.2% for the pooled within-group matrix), so I would use that method.
From the scree plot below, the elbow is at 3, so I would use 3 principal components. The first three principal components account for 87.2 percent of the variance, which I believe also supports using 3.
\(z_1 = -0.212y_1 + 0.039y_2 - 0.080y_3 - 0.776y_4 + 0.096y_5 - 0.580y_6\)
\(z_2 = -0.389y_1 - 0.064y_2 + 0.066y_3 + 0.608y_4 - 0.010y_5 - 0.686y_6\)
\(z_3 = 0.888y_1 + 0.096y_2 + 0.081y_3 + 0.081y_4 + 0.015y_5 - 0.434y_6\)
plot(s.eigen$values, xlab = 'Eigenvalue Number', ylab = 'Eigenvalue Size', main = 'Scree Graph')
lines(s.eigen$values)
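The component scores given by the equations above can be obtained by projecting the centered data onto the leading eigenvectors; a sketch using the objects already defined:
#Project the centered data onto the first three eigenvectors to get z1, z2, z3
Y.c <- scale(mydata[, 2:7], center = TRUE, scale = FALSE)
Z <- Y.c %*% s.eigen$vectors[, 1:3]
head(Z)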
3 Factors Extracted Below by the Principal Component Method
#Usually use centered or standardized data
pc.p=prcomp(mydata[,-1],scale=TRUE,center=TRUE) #standardized
pc.p
## Standard deviations (1, .., p=6):
## [1] 1.3323392 1.1637937 1.0356884 0.9026604 0.7284317 0.6726049
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5 PC6
## intel 0.40239072 -0.3964661 0.4617841 -0.3928149 -0.2103062 -0.5187674
## form -0.09715877 -0.7472294 -0.1752970 -0.1315611 -0.2801896 0.5528697
## dynamo 0.38541311 0.2181560 -0.4329575 -0.7177525 0.2585104 0.1855163
## dot 0.54333623 0.3144601 -0.1065065 0.2453920 -0.7066663 0.1869825
## smc -0.31188931 0.3559400 0.6268314 -0.3992852 -0.2012981 0.4279773
## persev 0.53629229 -0.1062657 0.4053555 0.3058981 0.5201339 0.4155385
loading1=pc.p$sdev[1]*pc.p$rotation[,1] #factor loading1
loading1
## intel form dynamo dot smc persev
## 0.5361209 -0.1294484 0.5135010 0.7239081 -0.4155423 0.7145232
loading2=pc.p$sdev[2]*pc.p$rotation[,2] #factor loading2
loading2
## intel form dynamo dot smc persev
## -0.4614047 -0.8696209 0.2538886 0.3659667 0.4142408 -0.1236713
loading3=pc.p$sdev[3]*pc.p$rotation[,3] #factor loading3
loading3
## intel form dynamo dot smc persev
## 0.4782644 -0.1815530 -0.4484090 -0.1103075 0.6492020 0.4198220
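The three loading vectors can equivalently be computed in one step by scaling the columns of the rotation matrix by their standard deviations; a sketch that reproduces loading1 through loading3:
#One-step computation of the loadings: L = rotation %*% diag(sdev)
L.pc <- pc.p$rotation[, 1:3] %*% diag(pc.p$sdev[1:3])
colnames(L.pc) <- c("loading1", "loading2", "loading3")
L.pc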
##Specific variances (the diagonal matrix Psi) and the residual matrix##
#One factor model
1-(loading1)^2
## intel form dynamo dot smc persev
## 0.7125744 0.9832431 0.7363167 0.4759570 0.8273246 0.4894566
#The residual matrix
cor(mydata[,-1]) - loading1 %*% t(loading1)- diag(1-(loading1)^2)
## intel form dynamo dot smc
## intel 0.000000000 0.25803909 -0.1742630 -0.2651810 0.1691759
## form 0.258039085 0.00000000 -0.0544696 -0.1727014 -0.2983237
## dynamo -0.174262961 -0.05446960 0.0000000 -0.0823727 0.0558807
## dot -0.265181037 -0.17270142 -0.0823727 0.0000000 0.1118343
## smc 0.169175904 -0.29832375 0.0558807 0.1118343 0.0000000
## persev 0.004376905 0.02514003 -0.2923254 -0.1902886 0.1466985
## persev
## intel 0.004376905
## form 0.025140034
## dynamo -0.292325403
## dot -0.190288575
## smc 0.146698545
## persev 0.000000000
#Three factor model
1 - (loading1^2 + loading2^2 + loading3^2)
## intel form dynamo dot smc persev
## 0.2709432 0.1940411 0.4707867 0.3298577 0.2342659 0.2979114
#The residual matrix
cor(mydata[,-1]) - cbind(loading1,loading2,loading3) %*% t(cbind(loading1,loading2,loading3)) -
  diag(1 - (loading1^2 + loading2^2 + loading3^2))
## intel form dynamo dot smc
## intel 0.00000000 -0.056377762 0.15734054 -0.04356612 0.04981834
## form -0.05637776 0.000000000 0.08490722 0.12552416 0.17977327
## dynamo 0.15734054 0.084907221 0.00000000 -0.22475036 0.24181771
## dot -0.04356612 0.125524158 -0.22475036 0.00000000 0.03184790
## smc 0.04981834 0.179773272 0.24181771 0.03184790 0.00000000
## persev -0.25347157 -0.006187177 -0.07267468 -0.09871947 -0.07462103
## persev
## intel -0.253471570
## form -0.006187177
## dynamo -0.072674681
## dot -0.098719468
## smc -0.074621033
## persev 0.000000000
##Rotation##
Lpc=cbind(loading1,loading2,loading3)
varimax(Lpc)
## $loadings
##
## Loadings:
## loading1 loading2 loading3
## intel 0.834 -0.170
## form -0.819 0.355
## dynamo -0.724
## dot 0.295 0.195 -0.738
## smc 0.728 0.486
## persev 0.800 -0.238
##
## loading1 loading2 loading3
## SS loadings 1.435 1.275 1.492
## Proportion Var 0.239 0.213 0.249
## Cumulative Var 0.239 0.452 0.700
##
## $rotmat
## [,1] [,2] [,3]
## [1,] 0.6826816 -0.07372227 -0.7269875
## [2,] -0.3504684 0.83996267 -0.4142881
## [3,] 0.6411847 0.53761300 0.5475897
#specific variances after rotation
1-apply(varimax(Lpc)$loadings^2,1,sum)
## intel form dynamo dot smc persev
## 0.2709432 0.1940411 0.4707867 0.3298577 0.2342659 0.2979114
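These match the unrotated three-factor specific variances because varimax is an orthogonal rotation, which leaves the communalities unchanged. A quick check of that fact:
#Orthogonal rotation preserves the row sums of squared loadings
all.equal(rowSums(Lpc^2), rowSums(unclass(varimax(Lpc)$loadings)^2))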
3 Factors Extracted Below by the Maximum Likelihood Method
#The Maximum Likelihood Method
#Using Covariance Matrix S
##Rotation (varimax)#
mlik.prot=factanal(mydata[,-1],factors=3,rotation="varimax") #rotmat
mlik.prot
##
## Call:
## factanal(x = mydata[, -1], factors = 3, rotation = "varimax")
##
## Uniquenesses:
## intel form dynamo dot smc persev
## 0.005 0.703 0.866 0.378 0.005 0.726
##
## Loadings:
## Factor1 Factor2 Factor3
## intel 0.996
## form 0.199 -0.328 -0.387
## dynamo 0.345
## dot 0.784
## smc 0.977 -0.201
## persev 0.367 0.366
##
## Factor1 Factor2 Factor3
## SS loadings 1.178 1.078 1.061
## Proportion Var 0.196 0.180 0.177
## Cumulative Var 0.196 0.376 0.553
##
## The degrees of freedom for the model is 0 and the fit was 0.0165
#specific variance
1-apply(mlik.prot$loadings^2,1,sum)
## intel form dynamo dot smc persev
## 0.004999678 0.702777592 0.866336124 0.377565782 0.004999862 0.726101251
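The same values are stored directly on the fitted object, so they can be read off without recomputing:
#Uniquenesses reported by factanal() itself
mlik.prot$uniquenesses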
I’ve used the psych package below to restate the results in a shorter script. From above, we see that the principal component method explains a higher cumulative proportion of variance than factor analysis by maximum likelihood. Additionally, the principal component method makes no distributional assumptions, which is useful in this scenario.
library(psych)
## Warning: package 'psych' was built under R version 3.5.2
#The Principal Component (Principal Factor) Method
principal(mydata[,-1],nfactors=3,rotate="varimax")
## Principal Components Analysis
## Call: principal(r = mydata[, -1], nfactors = 3, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC3 RC1 RC2 h2 u2 com
## intel -0.06 0.83 0.17 0.73 0.27 1.1
## form -0.35 0.10 0.82 0.81 0.19 1.4
## dynamo 0.72 -0.03 0.07 0.53 0.47 1.0
## dot 0.74 0.30 -0.19 0.67 0.33 1.5
## smc -0.49 -0.01 -0.73 0.77 0.23 1.7
## persev 0.24 0.80 -0.07 0.70 0.30 1.2
##
## RC3 RC1 RC2
## SS loadings 1.49 1.43 1.28
## Proportion Var 0.25 0.24 0.21
## Cumulative Var 0.25 0.49 0.70
## Proportion Explained 0.36 0.34 0.30
## Cumulative Proportion 0.36 0.70 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 3 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.14
## with the empirical chi square 22.6 with prob < NA
##
## Fit based upon off diagonal values = 0.56
#The Maximum Likelihood Method
fa(mydata[,-1],nfactors=3,rotate="varimax",fm="mle")
## Factor Analysis using method = ml
## Call: fa(r = mydata[, -1], nfactors = 3, rotate = "varimax", fm = "mle")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML3 h2 u2 com
## intel 1.00 -0.03 0.05 1.00 0.005 1.0
## form 0.20 -0.33 -0.39 0.30 0.703 2.5
## dynamo 0.08 -0.09 0.35 0.13 0.866 1.2
## dot 0.08 -0.03 0.78 0.62 0.378 1.0
## smc -0.01 0.98 -0.20 1.00 0.005 1.1
## persev 0.37 -0.07 0.37 0.27 0.726 2.1
##
## ML1 ML2 ML3
## SS loadings 1.18 1.08 1.06
## Proportion Var 0.20 0.18 0.18
## Cumulative Var 0.20 0.38 0.55
## Proportion Explained 0.36 0.32 0.32
## Cumulative Proportion 0.36 0.68 1.00
##
## Mean item complexity = 1.5
## Test of the hypothesis that 3 factors are sufficient.
##
## The degrees of freedom for the null model are 15 and the objective function was 0.68 with Chi Square of 24.75
## The degrees of freedom for the model are 0 and the objective function was 0.02
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is NA
##
## The harmonic number of observations is 40 with the empirical chi square 0.76 with prob < NA
## The total number of observations was 40 with Likelihood Chi Square = 0.56 with prob < NA
##
## Tucker Lewis Index of factoring reliability = -Inf
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy
## ML1 ML2 ML3
## Correlation of (regression) scores with factors 1.00 0.99 0.83
## Multiple R square of scores with factors 0.99 0.98 0.69
## Minimum correlation of possible factor scores 0.99 0.96 0.38
The LDA function is \(\hat{z} = 0.0075y_1 + 0.1933y_2 - 0.1292y_3 - 0.0428y_4 + 0.0718y_5 - 0.0491y_6\).
The classification confusion matrix using LDA is shown below.
#########
# Linear Discriminant Function
#########
n_1=dim(engineer)[1];n_1
## [1] 20
n_2=dim(pilot)[1];n_2
## [1] 20
xbar_1 = colMeans(engineer)
xbar_2 = colMeans(pilot)
S_1 = var(engineer)
S_2 = var(pilot)
S_p=((n_1-1)*S_1+(n_2-1)*S_2)/(n_1+n_2-2)
a_hat=solve(S_p)%*%(xbar_1-xbar_2);a_hat
## [,1]
## intel 0.007515209
## form 0.193260187
## dynamo -0.129182304
## dot -0.042785864
## smc 0.071839177
## persev -0.049062257
ybar_1=t(a_hat) %*% xbar_1
ybar_2=t(a_hat) %*% xbar_2
mhat =(1/2)*(ybar_1+ybar_2);mhat
## [,1]
## [1,] -21.55094
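With a_hat and mhat in hand, classification can be done by hand: assign an observation y to group 1 (engineer) when a'y > mhat and to group 2 (pilot) otherwise. A sketch using the objects above, assuming equal priors (matching the 20/20 split):
#Hand classification with the linear discriminant
z.scores <- as.matrix(mydata[, -1]) %*% a_hat
hand.class <- ifelse(z.scores > as.numeric(mhat), 1, 2)
table(mydata$group, hand.class)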
#Confusion Matrix
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.2
table(mydata$group)
##
## 1 2
## 20 20
z=lda(group~.,mydata)
newclass=predict(z,mydata)$class
table(mydata$group,newclass)
## newclass
## 1 2
## 1 18 2
## 2 2 18
mean(newclass == mydata$group)
## [1] 0.9
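The 90% figure is resubstitution accuracy on the training data; lda() also supports leave-one-out cross-validation via CV = TRUE for a less optimistic estimate (a sketch; output not shown):
#Leave-one-out CV: held-out class predictions are returned in $class
z.cv <- lda(group ~ ., mydata, CV = TRUE)
mean(z.cv$class == mydata$group)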
The classification confusion matrix using QDA is shown below.
#Quadratic
(solve(S_1)-solve(S_2))
## intel form dynamo dot
## intel 0.0011005098 0.001930897 -0.0028781276 0.0001634347
## form 0.0019308973 -0.014261930 0.0037659607 -0.0031646953
## dynamo -0.0028781276 0.003765961 -0.0010863433 0.0016114313
## dot 0.0001634347 -0.003164695 0.0016114313 -0.0005338296
## smc -0.0001403828 -0.003589709 0.0011240129 -0.0006428513
## persev -0.0001428524 0.002976418 -0.0007224636 0.0003961864
## smc persev
## intel -0.0001403828 -0.0001428524
## form -0.0035897088 0.0029764178
## dynamo 0.0011240129 -0.0007224636
## dot -0.0006428513 0.0003961864
## smc -0.0036431007 0.0024497932
## persev 0.0024497932 0.0012579371
#Just wanted to see khat, the constant term in the quadratic rule
khat=1/2*(log(det(S_1))-log(det(S_2)))+
1/2*(t(xbar_1)%*%solve(S_1)%*%xbar_1-t(xbar_2)%*%solve(S_2)%*%xbar_2)
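The quadratic rule can also be applied by hand: assign y to the group with the larger discriminant score \(d_i(y) = -\tfrac{1}{2}\log|S_i| - \tfrac{1}{2}(y - \bar{x}_i)'S_i^{-1}(y - \bar{x}_i)\). A sketch assuming equal priors:
#Hand-rolled QDA scores (equal priors assumed)
quad.score <- function(y, xbar, S) {
  -0.5 * log(det(S)) - 0.5 * t(y - xbar) %*% solve(S) %*% (y - xbar)
}
hand.qda <- apply(as.matrix(mydata[, -1]), 1, function(y) {
  if (quad.score(y, xbar_1, S_1) > quad.score(y, xbar_2, S_2)) 1 else 2
})
table(mydata$group, hand.qda)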
#Confusion Matrix
z=qda(group~.,mydata)
newclass=predict(z,mydata)$class
table(mydata$group,newclass)
## newclass
## 1 2
## 1 18 2
## 2 2 18
mean(newclass == mydata$group)
## [1] 0.9
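qda() supports the same leave-one-out cross-validation via CV = TRUE (a sketch; output not shown):
#Leave-one-out CV for the quadratic rule
zq.cv <- qda(group ~ ., mydata, CV = TRUE)
mean(zq.cv$class == mydata$group)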
Both LDA and QDA classify about 90% of the observations correctly, which is good accuracy. Since the linear and quadratic classifiers perform equally well here, either is suited to this problem.