We will in this lab explore how principal components are calculated and analyzed.
We will be using the concrete data set from the {modeldata} package which is loaded with {tidymodels}.

Compressive Strength of Concrete Mixtures Data

library(ggbiplot)
library(tidymodels)
data(concrete)
head(concrete)
## # A tibble: 6 x 9
##   cement blast_furnace_slag fly_ash water superplasticizer coarse_aggregate
##    <dbl>              <dbl>   <dbl> <dbl>            <dbl>            <dbl>
## 1   540                  0        0   162              2.5            1040 
## 2   540                  0        0   162              2.5            1055 
## 3   332.               142.       0   228              0               932 
## 4   332.               142.       0   228              0               932 
## 5   199.               132.       0   192              0               978.
## 6   266                114        0   228              0               932 
## # … with 3 more variables: fine_aggregate <dbl>, age <int>,
## #   compressive_strength <dbl>
dim(concrete)
## [1] 1030    9

We must delete all non-numeric columns from the data because the prcomp() function can only deal with numeric columns. But the variables ofconcrete are all numeric variables so we don’t do that.

length(map_dbl(concrete, mean)) == length(concrete)
## [1] TRUE

Take a look on variable’s standard deviation.

map_dbl(concrete, sd)
##               cement   blast_furnace_slag              fly_ash 
##           104.506364            86.279342            63.997004 
##                water     superplasticizer     coarse_aggregate 
##            21.354219             5.973841            77.753954 
##       fine_aggregate                  age compressive_strength 
##            80.175980            63.169912            16.705742

Principal Component Analysis

  1. Calculate the PCA of the data
# we scale the variables to have standard deviation one
concrete_pca <- prcomp(~ . -compressive_strength, data = concrete, scale. = TRUE) 
concrete_pca
## Standard deviations (1, .., p=8):
## [1] 1.5099998 1.1899894 1.1576178 1.0070546 0.9754527 0.8889294 0.4217128
## [8] 0.1733285
## 
## Rotation (n x k) = (8 x 8):
##                            PC1         PC2          PC3         PC4         PC5
## cement              0.09840137 -0.11373709  0.814202242  0.05429700  0.14820612
## blast_furnace_slag  0.17726197  0.68605290 -0.171794367  0.36269932 -0.02093167
## fly_ash            -0.39466185 -0.14294751 -0.408220547 -0.22675120  0.54963115
## water               0.54700395  0.05325628 -0.213189748 -0.29606003  0.07022191
## superplasticizer   -0.50594541  0.28292960  0.234596531  0.03727351  0.35461841
## coarse_aggregate    0.03792808 -0.62994342 -0.174087807  0.54580513 -0.03308317
## fine_aggregate     -0.40192597 -0.01939111 -0.004569214 -0.38528206 -0.70123743
## age                 0.29147949 -0.12598089  0.100521372 -0.52791909  0.22801019
##                            PC6         PC7        PC8
## cement              0.20314214  0.22184381 0.44616267
## blast_furnace_slag -0.30488197  0.22836331 0.43738376
## fly_ash             0.18326720  0.35246257 0.38188581
## water               0.36597033 -0.52427468 0.38874117
## superplasticizer   -0.19329372 -0.66464314 0.05174995
## coarse_aggregate   -0.31455942 -0.22684015 0.34931986
## fine_aggregate     -0.09246568 -0.03902583 0.43336994
## age                -0.74390800  0.06936667 0.01288097
# NO scale the variables
concrete_pca_noscale <- prcomp(~ . -compressive_strength, 
                               data = concrete, scale. = FALSE,
                               rank. = 8) # rank is redundant
concrete_pca_noscale
## Standard deviations (1, .., p=8):
## [1] 113.317721  99.045229  85.347671  65.143267  63.080556  34.299040   8.465167
## [8]   3.367569
## 
## Rotation (n x k) = (8 x 8):
##                             PC1          PC2         PC3          PC4
## cement             -0.905635976 -0.032661425 -0.15481502  0.008262832
## blast_furnace_slag  0.262578087 -0.786027767 -0.07292914  0.199072273
## fly_ash             0.238605982  0.303038923  0.05149593 -0.687205212
## water              -0.005558858 -0.076252845  0.04146150 -0.075576473
## superplasticizer    0.001324899  0.005110168 -0.02405395 -0.020499454
## coarse_aggregate    0.009092380  0.274565575  0.76071165  0.480049052
## fine_aggregate      0.210124001  0.450725493 -0.61075565  0.485143132
## age                -0.098366601 -0.069861703  0.11857277 -0.126917182
##                            PC5        PC6           PC7          PC8
## cement             -0.15137697  0.3065328  0.1943716913  0.007940927
## blast_furnace_slag -0.10669449  0.4534733  0.2261792702  0.009265435
## fly_ash            -0.17763067  0.5123768  0.2867363755 -0.005599892
## water               0.09841332 -0.4824372  0.8246348527  0.253515908
## superplasticizer   -0.02292470  0.1044578 -0.2333081039  0.965972907
## coarse_aggregate   -0.07632579  0.2707131  0.1859286436  0.041486203
## fine_aggregate      0.13287750  0.2571217  0.2445811647  0.026836131
## age                 0.94892353  0.2341269 -0.0003404885 -0.002116320

There are quantities in concrete_pca.

names(concrete_pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"        "call"

The center and scale components correspond to the means and standard deviations of the variables that were used for scaling prior to implementing PCA.

concrete_pca$center
##             cement blast_furnace_slag            fly_ash              water 
##          281.16786           73.89583           54.18835          181.56728 
##   superplasticizer   coarse_aggregate     fine_aggregate                age 
##            6.20466          972.91893          773.58049           45.66214
concrete_pca$scale
##             cement blast_furnace_slag            fly_ash              water 
##         104.506364          86.279342          63.997004          21.354219 
##   superplasticizer   coarse_aggregate     fine_aggregate                age 
##           5.973841          77.753954          80.175980          63.169912
  1. Explore the loading, eigenvalues, and final projection using the broom package.

tidy(matrix = "u") returns from the original space into principle components space.

concrete_pca %>%
  tidy(matrix = "u") %>% 
  head()
## # A tibble: 6 x 3
##   row      PC  value
##   <chr> <dbl>  <dbl>
## 1 1         1  0.679
## 2 1         2 -1.46 
## 3 1         3  2.39 
## 4 1         4  1.35 
## 5 1         5  0.397
## 6 1         6  0.443

tidy(matrix = "loadings") returns from the principle components space back into the original space. That is, return the rotation matrix.

concrete_pca %>%
  tidy(matrix = "loadings") # v", "rotation", "loadings" or "variables":
## # A tibble: 64 x 3
##    column                PC   value
##    <chr>              <dbl>   <dbl>
##  1 cement                 1  0.0984
##  2 cement                 2 -0.114 
##  3 cement                 3  0.814 
##  4 cement                 4  0.0543
##  5 cement                 5  0.148 
##  6 cement                 6  0.203 
##  7 cement                 7  0.222 
##  8 cement                 8  0.446 
##  9 blast_furnace_slag     1  0.177 
## 10 blast_furnace_slag     2  0.686 
## # … with 54 more rows

tidy(matrix = "pcs") returns the information about the eigenvalues. Or we can say this way is how we look at the variance explained by each PC

concrete_pca %>%
  tidy(matrix = "pcs") # "d", "eigenvalues" or "pcs": returns information about the eigenvalues.
## # A tibble: 8 x 4
##      PC std.dev percent cumulative
##   <dbl>   <dbl>   <dbl>      <dbl>
## 1     1   1.51  0.285        0.285
## 2     2   1.19  0.177        0.462
## 3     3   1.16  0.168        0.630
## 4     4   1.01  0.127        0.756
## 5     5   0.975 0.119        0.875
## 6     6   0.889 0.0988       0.974
## 7     7   0.422 0.0222       0.996
## 8     8   0.173 0.00376      1

Data Visualization

  1. Visualize the projections. Look at how the scaling of the variables changes the projection
# scaled and not scaled
augment(concrete_pca, newdata = concrete) %>% 
  head()
## # A tibble: 6 x 18
##   .rownames cement blast_furnace_slag fly_ash water superplasticizer
##   <chr>      <dbl>              <dbl>   <dbl> <dbl>            <dbl>
## 1 1           540                  0        0   162              2.5
## 2 2           540                  0        0   162              2.5
## 3 3           332.               142.       0   228              0  
## 4 4           332.               142.       0   228              0  
## 5 5           199.               132.       0   192              0  
## 6 6           266                114        0   228              0  
## # … with 12 more variables: coarse_aggregate <dbl>, fine_aggregate <dbl>,
## #   age <int>, compressive_strength <dbl>, .fittedPC1 <dbl>, .fittedPC2 <dbl>,
## #   .fittedPC3 <dbl>, .fittedPC4 <dbl>, .fittedPC5 <dbl>, .fittedPC6 <dbl>,
## #   .fittedPC7 <dbl>, .fittedPC8 <dbl>
augment(concrete_pca_noscale, newdata = concrete) %>% 
  head()
## # A tibble: 6 x 18
##   .rownames cement blast_furnace_slag fly_ash water superplasticizer
##   <chr>      <dbl>              <dbl>   <dbl> <dbl>            <dbl>
## 1 1           540                  0        0   162              2.5
## 2 2           540                  0        0   162              2.5
## 3 3           332.               142.       0   228              0  
## 4 4           332.               142.       0   228              0  
## 5 5           199.               132.       0   192              0  
## 6 6           266                114        0   228              0  
## # … with 12 more variables: coarse_aggregate <dbl>, fine_aggregate <dbl>,
## #   age <int>, compressive_strength <dbl>, .fittedPC1 <dbl>, .fittedPC2 <dbl>,
## #   .fittedPC3 <dbl>, .fittedPC4 <dbl>, .fittedPC5 <dbl>, .fittedPC6 <dbl>,
## #   .fittedPC7 <dbl>, .fittedPC8 <dbl>
# another way to solely generate outcomes
# predict(concrete_pca_noscale, newdata = concrete)

We can see the location of rotated observations in the first principal direction (PC1) space
(Note: The PC2 axis is the second most important direction and it is orthogonal to the PC1 axis).

concrete_pca %>%
  tidy(matrix = "loadings") %>%
  filter(PC ==1) %>%
  ggplot(aes(value, column)) +
  geom_col() +
  theme_bw()

To see the outcomes on each PCA

concrete_pca %>%
  tidy(matrix = "loadings") %>%
  ggplot(aes(value, column)) +
  geom_col() +
  facet_wrap(~PC)

Only PC1 and PC2 are compared

concrete_pca %>%
  tidy(matrix = "loadings") %>%
  filter(PC <= 2) %>%
  ggplot(aes(value, column)) +
  geom_col() +
  facet_wrap(~PC)

See the outcomes on PC1 and PC2 with no scaling

concrete_pca_noscale %>%
  tidy(matrix = "loadings") %>%
  filter(PC <= 2) %>%
  ggplot(aes(value, column)) +
  geom_col() +
  facet_wrap(~PC)

The first component explains most of the variation in the data, which shows the largest variation.

concrete_pca %>%
  tidy(matrix = "pcs") %>%
  ggplot(aes(PC, percent)) +
  geom_col() +
  theme_bw() +
  scale_x_continuous(breaks = 1:8) +
  scale_y_continuous(
    labels = scales::percent_format(),
    expand = expansion(mult = c(0, 0.01))
  )

map_dbl(concrete, ~length(unique(.x)))
##               cement   blast_furnace_slag              fly_ash 
##                  278                  185                  156 
##                water     superplasticizer     coarse_aggregate 
##                  195                  111                  284 
##       fine_aggregate                  age compressive_strength 
##                  302                   14                  845

Look at the data in PC1 and PC2’s coordinates, we can see water, fine_aggregate, and fly_ash are all contribute to PC1.

ggbiplot(concrete_pca)

  1. Use the {recipes} package to calculate the principal components.
set.seed(1234)
concrete_split <- initial_split(concrete, strata = "compressive_strength")
concrete_train <- training(concrete_split)
concrete_test <- testing(concrete_split)
rec_spec <- recipe(compressive_strength ~., data = concrete_train) %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors(), threshold = 0.75)
lm_spec <- linear_reg() %>%
  set_mode("regression") %>%
  set_engine("lm")
pcr_wf <- workflow() %>%
  add_recipe(rec_spec) %>%
  add_model(lm_spec)
pcr_fit <- fit(pcr_wf, data = concrete_train)
pcr_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_normalize()
## • step_pca()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## stats::lm(formula = ..y ~ ., data = data)
## 
## Coefficients:
## (Intercept)          PC1          PC2          PC3          PC4          PC5  
##     35.8161      -0.6717       1.1604      -8.5265       0.9310       3.8816
augment(pcr_fit, new_data = concrete_test) %>%
  rmse(truth = compressive_strength, estimate = .pred) # root mean squared error
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        12.3
augment(pcr_fit, new_data = concrete_test) %>%
  ggplot(aes(compressive_strength, .pred)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, color = "green") +
  coord_fixed() +
  theme_bw()

References