Our dataset consists of 40 observations: twenty engineer apprentices and twenty pilots. We will conduct principal component analysis (PCA), factor analysis (FA), and linear and quadratic discriminant analysis (LDA/QDA).
#Read Data
mydata <- read.csv('T5_6_PILOT.DAT.txt', sep = "", header=FALSE)
colnames(mydata) = c("group","intel","form","dynamo","dot","smc","persev")
#Subset Data
engineer <- mydata[which(mydata$group == 1),]
pilot <- mydata[which(mydata$group == 2),]
From the pooled covariance matrix, we obtain the first two principal components as listed below.
\(z_1 = -0.441y_1 - 0.041y_2 + 0.039y_3 - 0.450y_4 + 0.019y_5 - 0.774y_6\)
\(z_2 = 0.190y_1 + 0.038y_2 - 0.031y_3 - 0.892y_4 + 0.001y_5 + 0.407y_6\)
engineer <- engineer[,-1]
pilot <- pilot[,-1]
colMeans(engineer)
## intel form dynamo dot smc persev
## 124.50 38.10 76.20 192.75 53.65 250.30
colMeans(pilot)
## intel form dynamo dot smc persev
## 129.30 31.70 87.40 236.60 44.25 280.20
S1 <- var(engineer);S1
## intel form dynamo dot smc
## intel 384.26316 20.736842 81.31578947 26.605263 -15.92105263
## form 20.73684 68.200000 9.55789474 26.868421 -64.06842105
## dynamo 81.31579 9.557895 121.11578947 -37.789474 0.02105263
## dot 26.60526 26.868421 -37.78947368 1000.197368 -24.35526316
## smc -15.92105 -64.068421 0.02105263 -24.355263 322.45000000
## persev 82.89474 6.705263 -4.06315789 -8.657895 -132.20526316
## persev
## intel 82.894737
## form 6.705263
## dynamo -4.063158
## dot -8.657895
## smc -132.205263
## persev 470.221053
S2 <- var(pilot); S2
## intel form dynamo dot smc persev
## intel 687.800000 69.305263 -52.178947 76.968421 -1.447368 439.20000
## form 69.305263 51.694737 3.336842 -46.968421 -35.342105 56.48421
## dynamo -52.178947 3.336842 110.884211 43.852632 -7.947368 -111.87368
## dot 76.968421 -46.968421 43.852632 792.568421 8.157895 129.76842
## smc -1.447368 -35.342105 -7.947368 8.157895 173.671053 116.21053
## persev 439.200000 56.484211 -111.873684 129.768421 116.210526 1253.74737
n1<-dim(engineer)[1]
n2<-dim(pilot)[1]
p <- dim(engineer)[2]
S_p<-((n1-1)*S1+(n2-1)*S2)/(n1+n2-2);S_p
## intel form dynamo dot smc persev
## intel 536.031579 45.021053 14.568421 51.786842 -8.684211 261.047368
## form 45.021053 59.947368 6.447368 -10.050000 -49.705263 31.594737
## dynamo 14.568421 6.447368 116.000000 3.031579 -3.963158 -57.968421
## dot 51.786842 -10.050000 3.031579 896.382895 -8.098684 60.555263
## smc -8.684211 -49.705263 -3.963158 -8.098684 248.060526 -7.997368
## persev 261.047368 31.594737 -57.968421 60.555263 -7.997368 861.984211
sum(diag(S_p))
## [1] 2718.407
s.eigen <- eigen(S_p)
s.eigen
## eigen() decomposition
## $values
## [1] 1050.59628 858.31581 398.90346 259.14837 108.08917 43.35349
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.44092830 0.1897832542 0.86398819 0.079577443 0.105698146
## [2,] -0.04088997 0.0377138213 0.08242069 -0.227027690 -0.057349251
## [3,] 0.03933709 -0.0313490863 0.14336529 -0.023912831 -0.985372990
## [4,] -0.44974010 -0.8923295867 -0.03264727 0.003570544 0.004135096
## [5,] 0.01936520 0.0009446983 -0.05392188 0.970251939 -0.047798944
## [6,] -0.77441699 0.4066008303 -0.47138542 -0.012347547 -0.110802522
## [,6]
## [1,] 0.07472808
## [2,] -0.96709979
## [3,] 0.07337894
## [4,] -0.01964830
## [5,] -0.23031058
## [6,] 0.01789487
for (s in s.eigen$values) {
print(s / sum(s.eigen$values))
}
## [1] 0.386475
## [1] 0.3157423
## [1] 0.1467416
## [1] 0.09533098
## [1] 0.03976196
## [1] 0.01594813
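As a check, the eigenvalues should sum to the trace of S_p computed above (2718.407), and their cumulative sums give the proportion of variance explained by the first k components. A small sketch using the s.eigen object above:
#Eigenvalues sum to the trace of S_p
sum(s.eigen$values)
#Cumulative proportion of variance explained
cumsum(s.eigen$values) / sum(s.eigen$values)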
The leading columns of the rotation matrix of the object pca, i.e. the eigenvectors, are the principal components of the covariance matrix S. The eigenvalues and the summary of pca below give the cumulative proportion of variance explained by the components.
\(z_1 = -0.212y_1 + 0.039y_2 - 0.080y_3 - 0.776y_4 + 0.096y_5 - 0.580y_6\)
\(z_2 = -0.389y_1 - 0.064y_2 + 0.066y_3 + 0.608y_4 - 0.010y_5 - 0.686y_6\)
\(z_3 = 0.888y_1 + 0.096y_2 + 0.081y_3 + 0.081y_4 + 0.015y_5 - 0.434y_6\)
#Ignoring Groups
S <- cov(mydata[,2:7])
S
## intel form dynamo dot smc persev
## intel 528.19487 35.98974 27.97949 104.42821 -20.03077 291.15385
## form 35.98974 68.91282 -12.09744 -81.75128 -33.00513 -18.28205
## dynamo 27.97949 -12.09744 145.18974 128.88205 -30.85641 29.38462
## dot 104.42821 -81.75128 128.88205 1366.43013 -113.58077 395.18590
## smc -20.03077 -33.00513 -30.85641 -113.58077 264.35641 -79.85897
## persev 291.15385 -18.28205 29.38462 395.18590 -79.85897 1069.11538
sum(diag(S))
## [1] 3442.199
s.eigen <- eigen(S)
s.eigen
## eigen() decomposition
## $values
## [1] 1722.0424 878.3578 401.4386 261.0769 128.9051 50.3785
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.21165160 -0.38949336 0.88819049 0.03082062 -0.04760343
## [2,] 0.03883125 -0.06379320 0.09571590 -0.19128493 -0.14793191
## [3,] -0.08012946 0.06602004 0.08145863 -0.12854488 0.97505667
## [4,] -0.77552673 0.60795970 0.08071120 0.08125631 -0.10891968
## [5,] 0.09593926 -0.01046493 0.01494473 0.96813856 0.10919120
## [6,] -0.58019734 -0.68566916 -0.43426141 0.04518327 0.03644629
## [,6]
## [1,] 0.10677164
## [2,] -0.96269790
## [3,] -0.12379748
## [4,] -0.06295166
## [5,] -0.20309559
## [6,] -0.03572141
for (s in s.eigen$values) {
print(s / sum(s.eigen$values))
}
## [1] 0.5002739
## [1] 0.2551734
## [1] 0.1166227
## [1] 0.07584597
## [1] 0.03744848
## [1] 0.01463556
#Easier Way/Checking Work
pca <- prcomp(mydata[,2:7])
pca
## Standard deviations (1, .., p=6):
## [1] 41.497499 29.637102 20.035932 16.157875 11.353640 7.097781
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5
## intel 0.21165160 -0.38949336 0.88819049 -0.03082062 -0.04760343
## form -0.03883125 -0.06379320 0.09571590 0.19128493 -0.14793191
## dynamo 0.08012946 0.06602004 0.08145863 0.12854488 0.97505667
## dot 0.77552673 0.60795970 0.08071120 -0.08125631 -0.10891968
## smc -0.09593926 -0.01046493 0.01494473 -0.96813856 0.10919120
## persev 0.58019734 -0.68566916 -0.43426141 -0.04518327 0.03644629
## PC6
## intel -0.10677164
## form 0.96269790
## dynamo 0.12379748
## dot 0.06295166
## smc 0.20309559
## persev 0.03572141
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 41.4975 29.6371 20.0359 16.15788 11.35364 7.09778
## Proportion of Variance 0.5003 0.2552 0.1166 0.07585 0.03745 0.01464
## Cumulative Proportion 0.5003 0.7554 0.8721 0.94792 0.98536 1.00000
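As a consistency check (a small sketch using the pca and s.eigen objects above), the squared standard deviations from prcomp should equal the eigenvalues of S:
#prcomp() variances should match the eigenvalues of S computed above
all.equal(pca$sdev^2, s.eigen$values)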
From above, the covariance matrix ignoring groups provides more cumulative explained variance from the first two principal components (75.5% versus 70.2% for the pooled within-group matrix), so I would use that method.
From the scree plot below, the elbow is at 3, so I would use 3 principal components. The first three principal components account for 87.2 percent of the variance, which I believe also supports using 3.
\(z_1 = -0.212y_1 + 0.039y_2 - 0.080y_3 - 0.776y_4 + 0.096y_5 - 0.580y_6\)
\(z_2 = -0.389y_1 - 0.064y_2 + 0.066y_3 + 0.608y_4 - 0.010y_5 - 0.686y_6\)
\(z_3 = 0.888y_1 + 0.096y_2 + 0.081y_3 + 0.081y_4 + 0.015y_5 - 0.434y_6\)
plot(s.eigen$values, xlab = 'Eigenvalue Number', ylab = 'Eigenvalue Size', main = 'Scree Graph')
lines(s.eigen$values)
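The component scores given by the equations above can be obtained by projecting the centered data onto the leading eigenvectors; a sketch using the objects already defined:
#Project the centered data onto the first three eigenvectors to get z1, z2, z3
Y.c <- scale(mydata[, 2:7], center = TRUE, scale = FALSE)
Z <- Y.c %*% s.eigen$vectors[, 1:3]
head(Z)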
3 Factors Extracted Below by the Principal Component Method
#Usually use centered or standardized data
pc.p=prcomp(mydata[,-1],scale=TRUE,center=TRUE) #standardized
pc.p
## Standard deviations (1, .., p=6):
## [1] 1.3323392 1.1637937 1.0356884 0.9026604 0.7284317 0.6726049
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4 PC5 PC6
## intel 0.40239072 -0.3964661 0.4617841 -0.3928149 -0.2103062 -0.5187674
## form -0.09715877 -0.7472294 -0.1752970 -0.1315611 -0.2801896 0.5528697
## dynamo 0.38541311 0.2181560 -0.4329575 -0.7177525 0.2585104 0.1855163
## dot 0.54333623 0.3144601 -0.1065065 0.2453920 -0.7066663 0.1869825
## smc -0.31188931 0.3559400 0.6268314 -0.3992852 -0.2012981 0.4279773
## persev 0.53629229 -0.1062657 0.4053555 0.3058981 0.5201339 0.4155385
loading1=pc.p$sdev[1]*pc.p$rotation[,1] #factor loading1
loading1
## intel form dynamo dot smc persev
## 0.5361209 -0.1294484 0.5135010 0.7239081 -0.4155423 0.7145232
loading2=pc.p$sdev[2]*pc.p$rotation[,2] #factor loading2
loading2
## intel form dynamo dot smc persev
## -0.4614047 -0.8696209 0.2538886 0.3659667 0.4142408 -0.1236713
loading3=pc.p$sdev[3]*pc.p$rotation[,3] #factor loading3
loading3
## intel form dynamo dot smc persev
## 0.4782644 -0.1815530 -0.4484090 -0.1103075 0.6492020 0.4198220
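The three loading vectors can equivalently be computed in one step by scaling the columns of the rotation matrix by their standard deviations; a sketch that reproduces loading1 through loading3:
#One-step computation of the loadings: L = rotation %*% diag(sdev)
L.pc <- pc.p$rotation[, 1:3] %*% diag(pc.p$sdev[1:3])
colnames(L.pc) <- c("loading1", "loading2", "loading3")
L.pc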
##Specific variances (the diagonal matrix Psi) and the residual matrix##
#One factor model
1-(loading1)^2
## intel form dynamo dot smc persev
## 0.7125744 0.9832431 0.7363167 0.4759570 0.8273246 0.4894566
#The residual matrix
cor(mydata[,-1]) - loading1 %*% t(loading1)- diag(1-(loading1)^2)
## intel form dynamo dot smc
## intel 0.000000000 0.25803909 -0.1742630 -0.2651810 0.1691759
## form 0.258039085 0.00000000 -0.0544696 -0.1727014 -0.2983237
## dynamo -0.174262961 -0.05446960 0.0000000 -0.0823727 0.0558807
## dot -0.265181037 -0.17270142 -0.0823727 0.0000000 0.1118343
## smc 0.169175904 -0.29832375 0.0558807 0.1118343 0.0000000
## persev 0.004376905 0.02514003 -0.2923254 -0.1902886 0.1466985
## persev
## intel 0.004376905
## form 0.025140034
## dynamo -0.292325403
## dot -0.190288575
## smc 0.146698545
## persev 0.000000000
#Three factor model
1 - (loading1^2 + loading2^2 + loading3^2)
## intel form dynamo dot smc persev
## 0.2709432 0.1940411 0.4707867 0.3298577 0.2342659 0.2979114
#The residual matrix
cor(mydata[,-1]) - cbind(loading1,loading2,loading3) %*% t(cbind(loading1,loading2,loading3)) -
  diag(1 - (loading1^2 + loading2^2 + loading3^2))
## intel form dynamo dot smc
## intel 0.00000000 -0.056377762 0.15734054 -0.04356612 0.04981834
## form -0.05637776 0.000000000 0.08490722 0.12552416 0.17977327
## dynamo 0.15734054 0.084907221 0.00000000 -0.22475036 0.24181771
## dot -0.04356612 0.125524158 -0.22475036 0.00000000 0.03184790
## smc 0.04981834 0.179773272 0.24181771 0.03184790 0.00000000
## persev -0.25347157 -0.006187177 -0.07267468 -0.09871947 -0.07462103
## persev
## intel -0.253471570
## form -0.006187177
## dynamo -0.072674681
## dot -0.098719468
## smc -0.074621033
## persev 0.000000000
##Rotation##
Lpc=cbind(loading1,loading2,loading3)
varimax(Lpc)
## $loadings
##
## Loadings:
## loading1 loading2 loading3
## intel 0.834 -0.170
## form -0.819 0.355
## dynamo -0.724
## dot 0.295 0.195 -0.738
## smc 0.728 0.486
## persev 0.800 -0.238
##
## loading1 loading2 loading3
## SS loadings 1.435 1.275 1.492
## Proportion Var 0.239 0.213 0.249
## Cumulative Var 0.239 0.452 0.700
##
## $rotmat
## [,1] [,2] [,3]
## [1,] 0.6826816 -0.07372227 -0.7269875
## [2,] -0.3504684 0.83996267 -0.4142881
## [3,] 0.6411847 0.53761300 0.5475897
#specific variances after rotation
1-apply(varimax(Lpc)$loadings^2,1,sum)
## intel form dynamo dot smc persev
## 0.2709432 0.1940411 0.4707867 0.3298577 0.2342659 0.2979114
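These match the unrotated three-factor specific variances because varimax is an orthogonal rotation, which leaves the communalities unchanged. A quick check of that fact:
#Orthogonal rotation preserves the row sums of squared loadings
all.equal(rowSums(Lpc^2), rowSums(unclass(varimax(Lpc)$loadings)^2))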
3 Factors Extracted Below by the Maximum Likelihood Method
#The Maximum Likelihood Method
#Using Covariance Matrix S
##Rotation (varimax)#
mlik.prot=factanal(mydata[,-1],factors=3,rotation="varimax") #rotmat
mlik.prot
##
## Call:
## factanal(x = mydata[, -1], factors = 3, rotation = "varimax")
##
## Uniquenesses:
## intel form dynamo dot smc persev
## 0.005 0.703 0.866 0.378 0.005 0.726
##
## Loadings:
## Factor1 Factor2 Factor3
## intel 0.996
## form 0.199 -0.328 -0.387
## dynamo 0.345
## dot 0.784
## smc 0.977 -0.201
## persev 0.367 0.366
##
## Factor1 Factor2 Factor3
## SS loadings 1.178 1.078 1.061
## Proportion Var 0.196 0.180 0.177
## Cumulative Var 0.196 0.376 0.553
##
## The degrees of freedom for the model is 0 and the fit was 0.0165
#specific variance
1-apply(mlik.prot$loadings^2,1,sum)
## intel form dynamo dot smc persev
## 0.004999678 0.702777592 0.866336124 0.377565782 0.004999862 0.726101251
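The same values are stored directly on the fitted object, so they can be read off without recomputing:
#Uniquenesses reported by factanal() itself
mlik.prot$uniquenesses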
I’ve used the psych package below to restate the results in a shorter script. From above, we see that the principal component method explains a higher cumulative proportion of variance than factor analysis by maximum likelihood. Additionally, the principal component method makes no distributional assumptions, which is useful in this scenario.
library(psych)
## Warning: package 'psych' was built under R version 3.5.2
#The Principal Component (Principal Factor) Method
principal(mydata[,-1],nfactors=3,rotate="varimax")
## Principal Components Analysis
## Call: principal(r = mydata[, -1], nfactors = 3, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC3 RC1 RC2 h2 u2 com
## intel -0.06 0.83 0.17 0.73 0.27 1.1
## form -0.35 0.10 0.82 0.81 0.19 1.4
## dynamo 0.72 -0.03 0.07 0.53 0.47 1.0
## dot 0.74 0.30 -0.19 0.67 0.33 1.5
## smc -0.49 -0.01 -0.73 0.77 0.23 1.7
## persev 0.24 0.80 -0.07 0.70 0.30 1.2
##
## RC3 RC1 RC2
## SS loadings 1.49 1.43 1.28
## Proportion Var 0.25 0.24 0.21
## Cumulative Var 0.25 0.49 0.70
## Proportion Explained 0.36 0.34 0.30
## Cumulative Proportion 0.36 0.70 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 3 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.14
## with the empirical chi square 22.6 with prob < NA
##
## Fit based upon off diagonal values = 0.56
#The Maximum Likelihood Method
fa(mydata[,-1],nfactors=3,rotate="varimax",fm="mle")
## Factor Analysis using method = ml
## Call: fa(r = mydata[, -1], nfactors = 3, rotate = "varimax", fm = "mle")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML3 h2 u2 com
## intel 1.00 -0.03 0.05 1.00 0.005 1.0
## form 0.20 -0.33 -0.39 0.30 0.703 2.5
## dynamo 0.08 -0.09 0.35 0.13 0.866 1.2
## dot 0.08 -0.03 0.78 0.62 0.378 1.0
## smc -0.01 0.98 -0.20 1.00 0.005 1.1
## persev 0.37 -0.07 0.37 0.27 0.726 2.1
##
## ML1 ML2 ML3
## SS loadings 1.18 1.08 1.06
## Proportion Var 0.20 0.18 0.18
## Cumulative Var 0.20 0.38 0.55
## Proportion Explained 0.36 0.32 0.32
## Cumulative Proportion 0.36 0.68 1.00
##
## Mean item complexity = 1.5
## Test of the hypothesis that 3 factors are sufficient.
##
## The degrees of freedom for the null model are 15 and the objective function was 0.68 with Chi Square of 24.75
## The degrees of freedom for the model are 0 and the objective function was 0.02
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is NA
##
## The harmonic number of observations is 40 with the empirical chi square 0.76 with prob < NA
## The total number of observations was 40 with Likelihood Chi Square = 0.56 with prob < NA
##
## Tucker Lewis Index of factoring reliability = -Inf
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy
## ML1 ML2 ML3
## Correlation of (regression) scores with factors 1.00 0.99 0.83
## Multiple R square of scores with factors 0.99 0.98 0.69
## Minimum correlation of possible factor scores 0.99 0.96 0.38
The LDA function is \(\hat{z} = 0.0075y_1 + 0.1933y_2 - 0.1292y_3 - 0.0428y_4 + 0.0718y_5 - 0.0491y_6\).
The classification confusion matrix using LDA is shown below.
#########
# Linear Discriminant Function
#########
n_1=dim(engineer)[1];n_1
## [1] 20
n_2=dim(pilot)[1];n_2
## [1] 20
xbar_1 = colMeans(engineer)
xbar_2 = colMeans(pilot)
S_1 = var(engineer)
S_2 = var(pilot)
S_p=((n_1-1)*S_1+(n_2-1)*S_2)/(n_1+n_2-2)
a_hat=solve(S_p)%*%(xbar_1-xbar_2);a_hat
## [,1]
## intel 0.007515209
## form 0.193260187
## dynamo -0.129182304
## dot -0.042785864
## smc 0.071839177
## persev -0.049062257
ybar_1=t(a_hat) %*% xbar_1
ybar_2=t(a_hat) %*% xbar_2
mhat =(1/2)*(ybar_1+ybar_2);mhat
## [,1]
## [1,] -21.55094
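With a_hat and mhat in hand, classification can be done by hand: assign an observation y to group 1 (engineer) when a'y > mhat and to group 2 (pilot) otherwise. A sketch using the objects above, assuming equal priors (matching the 20/20 split):
#Hand classification with the linear discriminant
z.scores <- as.matrix(mydata[, -1]) %*% a_hat
hand.class <- ifelse(z.scores > as.numeric(mhat), 1, 2)
table(mydata$group, hand.class)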
#Confusion Matrix
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.2
table(mydata$group)
##
## 1 2
## 20 20
z=lda(group~.,mydata)
newclass=predict(z,mydata)$class
table(mydata$group,newclass)
## newclass
## 1 2
## 1 18 2
## 2 2 18
mean(newclass == mydata$group)
## [1] 0.9
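The 90% figure is resubstitution accuracy on the training data; lda() also supports leave-one-out cross-validation via CV = TRUE for a less optimistic estimate (a sketch; output not shown):
#Leave-one-out CV: held-out class predictions are returned in $class
z.cv <- lda(group ~ ., mydata, CV = TRUE)
mean(z.cv$class == mydata$group)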
The classification confusion matrix using QDA is shown below.
#Quadratic
(solve(S_1)-solve(S_2))
## intel form dynamo dot
## intel 0.0011005098 0.001930897 -0.0028781276 0.0001634347
## form 0.0019308973 -0.014261930 0.0037659607 -0.0031646953
## dynamo -0.0028781276 0.003765961 -0.0010863433 0.0016114313
## dot 0.0001634347 -0.003164695 0.0016114313 -0.0005338296
## smc -0.0001403828 -0.003589709 0.0011240129 -0.0006428513
## persev -0.0001428524 0.002976418 -0.0007224636 0.0003961864
## smc persev
## intel -0.0001403828 -0.0001428524
## form -0.0035897088 0.0029764178
## dynamo 0.0011240129 -0.0007224636
## dot -0.0006428513 0.0003961864
## smc -0.0036431007 0.0024497932
## persev 0.0024497932 0.0012579371
#Just wanted to see khat, the constant term in the quadratic rule
khat=1/2*(log(det(S_1))-log(det(S_2)))+
1/2*(t(xbar_1)%*%solve(S_1)%*%xbar_1-t(xbar_2)%*%solve(S_2)%*%xbar_2)
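The quadratic rule can also be applied by hand: assign y to the group with the larger discriminant score \(d_i(y) = -\tfrac{1}{2}\log|S_i| - \tfrac{1}{2}(y - \bar{x}_i)'S_i^{-1}(y - \bar{x}_i)\). A sketch assuming equal priors:
#Hand-rolled QDA scores (equal priors assumed)
quad.score <- function(y, xbar, S) {
  -0.5 * log(det(S)) - 0.5 * t(y - xbar) %*% solve(S) %*% (y - xbar)
}
hand.qda <- apply(as.matrix(mydata[, -1]), 1, function(y) {
  if (quad.score(y, xbar_1, S_1) > quad.score(y, xbar_2, S_2)) 1 else 2
})
table(mydata$group, hand.qda)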
#Confusion Matrix
z=qda(group~.,mydata)
newclass=predict(z,mydata)$class
table(mydata$group,newclass)
## newclass
## 1 2
## 1 18 2
## 2 2 18
mean(newclass == mydata$group)
## [1] 0.9
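qda() supports the same leave-one-out cross-validation via CV = TRUE (a sketch; output not shown):
#Leave-one-out CV for the quadratic rule
zq.cv <- qda(group ~ ., mydata, CV = TRUE)
mean(zq.cv$class == mydata$group)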
Both LDA and QDA classify about 90% of the observations correctly, which is good accuracy. Since the linear and quadratic classifiers perform equally well here, either is suited to this problem.