We will use the brca dataset from the dslabs package. The object brca contains two slots:

library(dslabs)
str(brca)
## List of 2
##  $ x: num [1:569, 1:30] 13.5 13.1 9.5 13 8.2 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:30] "radius_mean" "texture_mean" "perimeter_mean" "area_mean" ...
##  $ y: Factor w/ 2 levels "B","M": 1 1 1 1 1 1 1 1 1 1 ...
dim(brca$x)
## [1] 569  30

Let’s fit PCA to the covariate matrix. Looking at the scree plot, we can see that the first PC explains a large proportion of the variance.

pca_fit <- prcomp(brca$x)
screeplot(pca_fit, type = "lines")

# What percentage of variance explained?
scales::percent(pca_fit$sdev[1]/sum(pca_fit$sdev))
## [1] "84%"

Next, let’s plot the first and second PC and colour each point according to the tumour type. From this graph, we can see that much of the variance is in the direction of PC1, and furthermore, the first PC has some discriminatory properties with respect to the tumour type.

library(tidyverse)
library(broom)

varexp_pct <- scales::percent(pca_fit$sdev[1:2]/sum(pca_fit$sdev))

pca_fit %>% 
    augment %>% 
    bind_cols(class = brca$y) %>% 
    ggplot(aes(x = .fittedPC1,
               y = .fittedPC2)) + 
    geom_point(aes(colour = class),
               alpha = 0.5) +
    coord_fixed() + # Keep both axes on the same scale
    theme_minimal() +
    theme(legend.position = "top") +
    scale_colour_discrete(labels = c("Benign", "Malign"),
                          name = "Tumour type") +
    xlab(paste0("PC1 (", varexp_pct[1], ")")) +
    ylab(paste0("PC2 (", varexp_pct[2], ")"))

In the next sections, we want to identify which covariates contribute the most to the first PC.

1. Looking at loadings

First, we can look at the raw loadings, i.e. the eigenvectors of the sample covariance matrix. If we simply plot them sequentially, we can see that two covariates have a much larger loading than the other ones.

plot(pca_fit$rotation[, "PC1"])

# Which covariates?
which(pca_fit$rotation[, "PC1"] > 0.5)
##  area_mean area_worst 
##          4         24

Therefore, area_mean and area_worst contribute the most towards the first PC.

2. Varimax rotation

varimax_loadings <- varimax(pca_fit$rotation)
varimax_pca <- pca_fit$x %*% varimax_loadings$rotmat
colnames(varimax_pca) <- colnames(pca_fit$x)
as_tibble(varimax_pca) %>% 
    bind_cols(class = brca$y) %>% 
    ggplot(aes(x = PC1,
               y = PC2)) + 
    geom_point(aes(colour = class),
               alpha = 0.5) +
    coord_fixed() +
    theme_minimal()

plot(varimax_loadings$loadings[, "PC1"])

which(varimax_loadings$loadings[, "PC1"] > 0.5)
## area_worst 
##         24
cor(brca$x[, "area_worst"],
    brca$x[, "area_mean"])
## [1] 0.9592133
as_tibble(brca$x) %>% 
    bind_cols(class = brca$y) %>% 
    ggplot(aes(x = area_worst,
               y = area_mean)) + 
    geom_point(aes(colour = class),
               alpha = 0.5) +
    geom_smooth(method = "lm", se = FALSE) +
    theme_minimal()
## `geom_smooth()` using formula 'y ~ x'