Let us compute the PCA manually by applying the spectral decomposition theorem. 1) Standardize each column, i.e. subtract the mean and divide by the standard deviation. 2) Compute the correlation matrix of the columns. 3) Compute the eigenvalues and eigenvectors of the correlation matrix. 4) Each eigenvalue represents the variance captured by the corresponding principal component. 5) Each eigenvector contains the loadings of the variables on the corresponding principal component.
# List the column names as a one-column matrix so each name shows its index.
as.matrix(names(cereals))
## [,1]
## [1,] "record"
## [2,] "name"
## [3,] "mfr"
## [4,] "type"
## [5,] "protein"
## [6,] "fat"
## [7,] "sodium"
## [8,] "fiber"
## [9,] "carbo"
## [10,] "sugars"
## [11,] "potass"
## [12,] "vitamins"
## [13,] "calories"
## [14,] "rating"
# Standardize the nine numeric columns (protein .. calories) of the first
# 70 records: subtract each column mean and divide by its standard deviation.
df <- scale(cereals[1:70, 5:13])
# Full-precision correlation matrix used for the decomposition.
cor.mat <- cor(df)
# 3-decimal copy kept for display only. Decomposing this rounded copy
# perturbs the eigenvalues in the third decimal, so they no longer agree
# with the princomp()/prcomp() variances computed further down.
cor.df <- as.data.frame(round(cor.mat, 3))
eig <- eigen(cor.mat)
# Eigenvalues = variance captured by each principal component.
eig$values
## [1] 2.715 2.062 1.590 1.045 0.622 0.505 0.374 0.059 0.027
# Eigenvectors = loadings of each variable on each component. They are
# determined only up to sign, so whole columns may be negated relative to
# other PCA routines.
eig$vectors
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] -0.367 -0.073 -0.397 -0.40 -0.154 0.052 0.709 0.0099 -0.1075
## [2,] -0.078 -0.532 0.039 -0.41 -0.097 -0.587 -0.326 -0.0152 -0.2842
## [3,] 0.207 -0.165 -0.469 0.39 0.599 -0.393 0.208 -0.0112 -0.0086
## [4,] -0.542 -0.032 -0.195 0.25 0.072 0.154 -0.290 0.6639 -0.2250
## [5,] 0.333 0.262 -0.457 -0.33 0.131 0.325 -0.317 -0.1263 -0.5118
## [6,] 0.117 -0.544 0.305 0.30 0.027 0.407 0.242 -0.1210 -0.5204
## [7,] -0.530 -0.172 -0.213 0.12 0.115 0.201 -0.296 -0.6739 0.1862
## [8,] 0.216 -0.124 -0.445 0.41 -0.750 -0.077 -0.073 -0.0371 0.0250
## [9,] 0.266 -0.524 -0.194 -0.28 0.104 0.396 -0.110 0.2695 0.5376
# Verification with princomp(): PCA on the correlation matrix, keeping scores.
pca1 <- princomp(df, cor = TRUE, scores = TRUE)
# Standard deviation of each component
pca1[["sdev"]]
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 1.65 1.44 1.26 1.02 0.79 0.71 0.61 0.24 0.17
# Squared standard deviations are the component variances; they should equal
# the eigenvalues of the correlation matrix.
pca1[["sdev"]]^2
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 2.715 2.062 1.590 1.045 0.622 0.505 0.374 0.059 0.027
# Loadings of each variable on each component
loadings(pca1)
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## protein 0.367 0.397 0.402 0.154 0.709 0.108
## fat 0.532 0.410 -0.586 -0.328 0.284
## sodium -0.207 0.165 0.469 -0.393 -0.598 -0.394 0.207
## fiber 0.542 0.195 -0.250 0.156 -0.291 0.663 0.227
## carbo -0.333 -0.261 0.457 0.333 -0.131 0.327 -0.316 -0.129 0.511
## sugars -0.117 0.544 -0.305 -0.296 0.406 0.243 -0.123 0.520
## potass 0.530 0.172 0.213 -0.119 -0.115 0.202 -0.294 -0.674 -0.190
## vitamins -0.216 0.124 0.445 -0.406 0.749
## calories -0.266 0.524 0.194 0.276 -0.103 0.396 -0.108 0.272 -0.537
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## SS loadings 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## Proportion Var 0.11 0.11 0.11 0.11 0.11 0.11 0.11 0.11 0.11
## Cumulative Var 0.11 0.22 0.33 0.44 0.56 0.67 0.78 0.89 1.00
# Same PCA with prcomp(): centre and scale the nine numeric columns of the
# first 70 records inside the call.
cereals.pca <- prcomp(
  cereals[1:70, 5:13],
  center = TRUE,
  scale. = TRUE
)
# Components stored in the fitted object
names(cereals.pca)
## [1] "sdev" "rotation" "center" "scale" "x"
# Standard deviation of each principal component
cereals.pca[["sdev"]]
## [1] 1.65 1.44 1.26 1.02 0.79 0.71 0.61 0.24 0.17
# After standardization every column has variance 1, so the total variance
# equals the number of columns.
cereals.pca[["sdev"]]^2  # variance (information) captured by each component
## [1] 2.715 2.062 1.590 1.045 0.622 0.505 0.374 0.059 0.027
sum(cereals.pca[["sdev"]]^2)  # total variance = number of columns
## [1] 9
# Loadings (weights) of each original column on the principal components
cereals.pca[["rotation"]]
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
## protein -0.367 -0.073 -0.397 0.40 -0.154 0.050 -0.709 0.010 0.1083
## fat -0.078 -0.532 0.039 0.41 -0.097 -0.586 0.328 -0.017 0.2843
## sodium 0.207 -0.165 -0.469 -0.39 0.598 -0.394 -0.207 -0.011 0.0088
## fiber -0.542 -0.032 -0.195 -0.25 0.073 0.156 0.291 0.663 0.2275
## carbo 0.333 0.261 -0.457 0.33 0.131 0.327 0.316 -0.129 0.5107
## sugars 0.117 -0.544 0.305 -0.30 0.027 0.406 -0.243 -0.123 0.5197
## potass -0.530 -0.172 -0.213 -0.12 0.115 0.202 0.294 -0.674 -0.1895
## vitamins 0.216 -0.124 -0.445 -0.41 -0.749 -0.076 0.073 -0.037 -0.0244
## calories 0.266 -0.524 -0.194 0.28 0.103 0.396 0.108 0.272 -0.5368
# center: the column means used for centring
cereals.pca[["center"]]
## protein fat sodium fiber carbo sugars potass vitamins
## 2.6 1.0 155.5 2.2 14.6 7.1 98.4 26.4
## calories
## 106.6
# scale: the column standard deviations used for scaling
cereals.pca[["scale"]]
## protein fat sodium fiber carbo sugars potass vitamins
## 1.1 1.1 86.3 2.5 4.0 4.3 70.3 19.9
## calories
## 20.0
# Scores: for each record, the loading-weighted sum of its standardized
# values along each principal component.
scores <- cereals.pca[["x"]]
head(scores, 3L)
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
## 1 -5.0 -0.12 -0.13 -1.7 -0.30 -0.41 -0.076 0.240 -0.015
## 2 -1.7 -2.60 1.95 2.4 -0.50 -1.55 0.823 -0.016 -0.084
## 3 -4.6 -0.19 -1.17 -2.0 0.69 -0.88 -0.123 -0.467 -0.065
# A score can be reproduced by hand. Here: the first record on the first
# principal component, i.e. the sum of its standardized values multiplied by
# the PC1 loadings stored in cereals.pca$rotation.
sc <- sum(
  scale(cereals[1:70, 5:13])[1, ] * cereals.pca$rotation[, "PC1"]
)
sc
## [1] -5
# Importance table: standard deviation, proportion of variance and cumulative
# proportion per component. The former `digits = 2` argument was dropped:
# `digits` belongs to the print method, not to summary.prcomp(), so it never
# affected this table. Use print(summary(cereals.pca), digits = 2) to
# actually reduce the printed precision.
summary(cereals.pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## Standard deviation 1.648 1.436 1.261 1.022 0.7884 0.7107 0.6117 0.2437
## Proportion of Variance 0.302 0.229 0.177 0.116 0.0691 0.0561 0.0416 0.0066
## Cumulative Proportion 0.302 0.531 0.707 0.824 0.8927 0.9488 0.9904 0.9970
## PC9
## Standard deviation 0.16536
## Proportion of Variance 0.00304
## Cumulative Proportion 1.00000
# Compute principal components on two dimensions only (calories and rating),
# using prcomp() defaults.
pcs <- prcomp(data.frame(cereals$calories, cereals$rating))
# `digits` is an argument of the print method, not of summary.prcomp(), so
# the former `digits = 2` had no effect on the table and has been removed.
# Use print(summary(pcs), digits = 2) to change the printed precision.
summary(pcs)
## Importance of components:
## PC1 PC2
## Standard deviation 22.317 8.884
## Proportion of Variance 0.863 0.137
## Cumulative Proportion 0.863 1.000
# Loadings of the two variables. The element name is spelled out in full
# rather than relying on `$` partial matching of "rot" to "rotation".
pcs$rotation
## PC1 PC2
## cereals.calories 0.85 0.53
## cereals.rating -0.53 0.85
# Scores of the records on the two components (this reassigns `scores`).
scores <- pcs[["x"]]
head(scores, n = 10)
## PC1 PC2
## [1,] -44.9 2.195
## [2,] 15.7 -0.385
## [3,] -40.2 -5.403
## [4,] -75.3 12.995
## [5,] 7.0 -5.361
## [6,] 9.6 -9.487
## [7,] 7.7 -6.386
## [8,] 22.6 7.522
## [9,] -17.7 -3.506
## [10,] -20.0 0.043