We will in this lab explore how principal components are calculated and analyzed.
We will be using the concrete data set from the {modeldata} package which is loaded with {tidymodels}.
library(ggbiplot)
library(tidymodels)
data(concrete)
head(concrete)
## # A tibble: 6 x 9
## cement blast_furnace_slag fly_ash water superplasticizer coarse_aggregate
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 540 0 0 162 2.5 1040
## 2 540 0 0 162 2.5 1055
## 3 332. 142. 0 228 0 932
## 4 332. 142. 0 228 0 932
## 5 199. 132. 0 192 0 978.
## 6 266 114 0 228 0 932
## # … with 3 more variables: fine_aggregate <dbl>, age <int>,
## # compressive_strength <dbl>
dim(concrete)
## [1] 1030 9
We must delete all non-numeric columns from the data because the prcomp() function can only deal with numeric columns. But the variables ofconcrete are all numeric variables so we don’t do that.
length(map_dbl(concrete, mean)) == length(concrete)
## [1] TRUE
Take a look on variable’s standard deviation.
map_dbl(concrete, sd)
## cement blast_furnace_slag fly_ash
## 104.506364 86.279342 63.997004
## water superplasticizer coarse_aggregate
## 21.354219 5.973841 77.753954
## fine_aggregate age compressive_strength
## 80.175980 63.169912 16.705742
scale.: TRUE/FALSE: tells R if it should scale the data. That is, divide the standard deviation of each column from the corresponding column. The default is FALSE.# we scale the variables to have standard deviation one
concrete_pca <- prcomp(~ . -compressive_strength, data = concrete, scale. = TRUE)
concrete_pca
## Standard deviations (1, .., p=8):
## [1] 1.5099998 1.1899894 1.1576178 1.0070546 0.9754527 0.8889294 0.4217128
## [8] 0.1733285
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4 PC5
## cement 0.09840137 -0.11373709 0.814202242 0.05429700 0.14820612
## blast_furnace_slag 0.17726197 0.68605290 -0.171794367 0.36269932 -0.02093167
## fly_ash -0.39466185 -0.14294751 -0.408220547 -0.22675120 0.54963115
## water 0.54700395 0.05325628 -0.213189748 -0.29606003 0.07022191
## superplasticizer -0.50594541 0.28292960 0.234596531 0.03727351 0.35461841
## coarse_aggregate 0.03792808 -0.62994342 -0.174087807 0.54580513 -0.03308317
## fine_aggregate -0.40192597 -0.01939111 -0.004569214 -0.38528206 -0.70123743
## age 0.29147949 -0.12598089 0.100521372 -0.52791909 0.22801019
## PC6 PC7 PC8
## cement 0.20314214 0.22184381 0.44616267
## blast_furnace_slag -0.30488197 0.22836331 0.43738376
## fly_ash 0.18326720 0.35246257 0.38188581
## water 0.36597033 -0.52427468 0.38874117
## superplasticizer -0.19329372 -0.66464314 0.05174995
## coarse_aggregate -0.31455942 -0.22684015 0.34931986
## fine_aggregate -0.09246568 -0.03902583 0.43336994
## age -0.74390800 0.06936667 0.01288097
# NO scale the variables
concrete_pca_noscale <- prcomp(~ . -compressive_strength,
data = concrete, scale. = FALSE,
rank. = 8) # rank is redundant
concrete_pca_noscale
## Standard deviations (1, .., p=8):
## [1] 113.317721 99.045229 85.347671 65.143267 63.080556 34.299040 8.465167
## [8] 3.367569
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4
## cement -0.905635976 -0.032661425 -0.15481502 0.008262832
## blast_furnace_slag 0.262578087 -0.786027767 -0.07292914 0.199072273
## fly_ash 0.238605982 0.303038923 0.05149593 -0.687205212
## water -0.005558858 -0.076252845 0.04146150 -0.075576473
## superplasticizer 0.001324899 0.005110168 -0.02405395 -0.020499454
## coarse_aggregate 0.009092380 0.274565575 0.76071165 0.480049052
## fine_aggregate 0.210124001 0.450725493 -0.61075565 0.485143132
## age -0.098366601 -0.069861703 0.11857277 -0.126917182
## PC5 PC6 PC7 PC8
## cement -0.15137697 0.3065328 0.1943716913 0.007940927
## blast_furnace_slag -0.10669449 0.4534733 0.2261792702 0.009265435
## fly_ash -0.17763067 0.5123768 0.2867363755 -0.005599892
## water 0.09841332 -0.4824372 0.8246348527 0.253515908
## superplasticizer -0.02292470 0.1044578 -0.2333081039 0.965972907
## coarse_aggregate -0.07632579 0.2707131 0.1859286436 0.041486203
## fine_aggregate 0.13287750 0.2571217 0.2445811647 0.026836131
## age 0.94892353 0.2341269 -0.0003404885 -0.002116320
There are quantities in concrete_pca.
names(concrete_pca)
## [1] "sdev" "rotation" "center" "scale" "x" "call"
The center and scale components correspond to the means and standard deviations of the variables that were used for scaling prior to implementing PCA.
concrete_pca$center
## cement blast_furnace_slag fly_ash water
## 281.16786 73.89583 54.18835 181.56728
## superplasticizer coarse_aggregate fine_aggregate age
## 6.20466 972.91893 773.58049 45.66214
concrete_pca$scale
## cement blast_furnace_slag fly_ash water
## 104.506364 86.279342 63.997004 21.354219
## superplasticizer coarse_aggregate fine_aggregate age
## 5.973841 77.753954 80.175980 63.169912
tidy(matrix = "u") returns from the original space into principle components space.
concrete_pca %>%
tidy(matrix = "u") %>%
head()
## # A tibble: 6 x 3
## row PC value
## <chr> <dbl> <dbl>
## 1 1 1 0.679
## 2 1 2 -1.46
## 3 1 3 2.39
## 4 1 4 1.35
## 5 1 5 0.397
## 6 1 6 0.443
tidy(matrix = "loadings") returns from the principle components space back into the original space. That is, return the rotation matrix.
concrete_pca %>%
tidy(matrix = "loadings") # v", "rotation", "loadings" or "variables":
## # A tibble: 64 x 3
## column PC value
## <chr> <dbl> <dbl>
## 1 cement 1 0.0984
## 2 cement 2 -0.114
## 3 cement 3 0.814
## 4 cement 4 0.0543
## 5 cement 5 0.148
## 6 cement 6 0.203
## 7 cement 7 0.222
## 8 cement 8 0.446
## 9 blast_furnace_slag 1 0.177
## 10 blast_furnace_slag 2 0.686
## # … with 54 more rows
tidy(matrix = "pcs") returns the information about the eigenvalues. Or we can say this way is how we look at the variance explained by each PC
concrete_pca %>%
tidy(matrix = "pcs") # "d", "eigenvalues" or "pcs": returns information about the eigenvalues.
## # A tibble: 8 x 4
## PC std.dev percent cumulative
## <dbl> <dbl> <dbl> <dbl>
## 1 1 1.51 0.285 0.285
## 2 2 1.19 0.177 0.462
## 3 3 1.16 0.168 0.630
## 4 4 1.01 0.127 0.756
## 5 5 0.975 0.119 0.875
## 6 6 0.889 0.0988 0.974
## 7 7 0.422 0.0222 0.996
## 8 8 0.173 0.00376 1
# scaled and not scaled
augment(concrete_pca, newdata = concrete) %>%
head()
## # A tibble: 6 x 18
## .rownames cement blast_furnace_slag fly_ash water superplasticizer
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 540 0 0 162 2.5
## 2 2 540 0 0 162 2.5
## 3 3 332. 142. 0 228 0
## 4 4 332. 142. 0 228 0
## 5 5 199. 132. 0 192 0
## 6 6 266 114 0 228 0
## # … with 12 more variables: coarse_aggregate <dbl>, fine_aggregate <dbl>,
## # age <int>, compressive_strength <dbl>, .fittedPC1 <dbl>, .fittedPC2 <dbl>,
## # .fittedPC3 <dbl>, .fittedPC4 <dbl>, .fittedPC5 <dbl>, .fittedPC6 <dbl>,
## # .fittedPC7 <dbl>, .fittedPC8 <dbl>
augment(concrete_pca_noscale, newdata = concrete) %>%
head()
## # A tibble: 6 x 18
## .rownames cement blast_furnace_slag fly_ash water superplasticizer
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 540 0 0 162 2.5
## 2 2 540 0 0 162 2.5
## 3 3 332. 142. 0 228 0
## 4 4 332. 142. 0 228 0
## 5 5 199. 132. 0 192 0
## 6 6 266 114 0 228 0
## # … with 12 more variables: coarse_aggregate <dbl>, fine_aggregate <dbl>,
## # age <int>, compressive_strength <dbl>, .fittedPC1 <dbl>, .fittedPC2 <dbl>,
## # .fittedPC3 <dbl>, .fittedPC4 <dbl>, .fittedPC5 <dbl>, .fittedPC6 <dbl>,
## # .fittedPC7 <dbl>, .fittedPC8 <dbl>
# another way to solely generate outcomes
# predict(concrete_pca_noscale, newdata = concrete)
We can see the location of rotated observations in the first principal direction (PC1) space
(Note: The PC2 axis is the second most important direction and it is orthogonal to the PC1 axis).
concrete_pca %>%
tidy(matrix = "loadings") %>%
filter(PC ==1) %>%
ggplot(aes(value, column)) +
geom_col() +
theme_bw()
To see the outcomes on each PCA
concrete_pca %>%
tidy(matrix = "loadings") %>%
ggplot(aes(value, column)) +
geom_col() +
facet_wrap(~PC)
Only PC1 and PC2 are compared
concrete_pca %>%
tidy(matrix = "loadings") %>%
filter(PC <= 2) %>%
ggplot(aes(value, column)) +
geom_col() +
facet_wrap(~PC)
See the outcomes on PC1 and PC2 with no scaling
concrete_pca_noscale %>%
tidy(matrix = "loadings") %>%
filter(PC <= 2) %>%
ggplot(aes(value, column)) +
geom_col() +
facet_wrap(~PC)
The first component explains most of the variation in the data, which shows the largest variation.
concrete_pca %>%
tidy(matrix = "pcs") %>%
ggplot(aes(PC, percent)) +
geom_col() +
theme_bw() +
scale_x_continuous(breaks = 1:8) +
scale_y_continuous(
labels = scales::percent_format(),
expand = expansion(mult = c(0, 0.01))
)
map_dbl(concrete, ~length(unique(.x)))
## cement blast_furnace_slag fly_ash
## 278 185 156
## water superplasticizer coarse_aggregate
## 195 111 284
## fine_aggregate age compressive_strength
## 302 14 845
Look at the data in PC1 and PC2’s coordinates, we can see water, fine_aggregate, and fly_ash are all contribute to PC1.
ggbiplot(concrete_pca)
set.seed(1234)
concrete_split <- initial_split(concrete, strata = "compressive_strength")
concrete_train <- training(concrete_split)
concrete_test <- testing(concrete_split)
threshold = 0.75 indicates that step_pca should generate enough components to capture 75 percent of the variability in the variablesstep_pca creates a specification of a recipe step that will convert numeric data into one or more principal components.rec_spec <- recipe(compressive_strength ~., data = concrete_train) %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors(), threshold = 0.75)
lm_spec <- linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")
pcr_wf <- workflow() %>%
add_recipe(rec_spec) %>%
add_model(lm_spec)
pcr_fit <- fit(pcr_wf, data = concrete_train)
pcr_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: linear_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_normalize()
## • step_pca()
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## stats::lm(formula = ..y ~ ., data = data)
##
## Coefficients:
## (Intercept) PC1 PC2 PC3 PC4 PC5
## 35.8161 -0.6717 1.1604 -8.5265 0.9310 3.8816
augment(pcr_fit, new_data = concrete_test) %>%
rmse(truth = compressive_strength, estimate = .pred) # root mean squared error
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 12.3
augment(pcr_fit, new_data = concrete_test) %>%
ggplot(aes(compressive_strength, .pred)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, color = "green") +
coord_fixed() +
theme_bw()