1 1. Introduction and Goals

In this session we will:

  • Understand why we use PCA in biology (dimensionality reduction and visualization).
  • Learn how PCA is implemented in R (prcomp()).
  • See why standardization matters.
  • Study duplicated or near-duplicated samples (twins, technical replicates, many samples from the same location).
  • Explore biologically motivated examples:
    • Environmental data
    • Genotype data (population structure)
    • Morphometric / expression-like data
  • Practice with a set of exercises.

This R Markdown is also a lab notebook: it simulates datasets and saves them as CSV files that you can distribute to students.

2 2. Simulated Datasets for Teaching

In this section we generate several datasets that mimic real biological situations.

All CSVs are written into the working directory (check with getwd()).

set.seed(123)

## 2.1 Environmental dataset ------------------------------------------
n_env <- 200
env_data <- data.frame(
  sample_id = paste0("ENV_", seq_len(n_env)),
  temp     = rnorm(n_env, 15, 6),       # °C
  rainfall = rlnorm(n_env, 2, 0.5),     # mm
  pH       = rnorm(n_env, 7, 0.5),      # pH units
  nitrate  = rlnorm(n_env, 0, 0.8)      # mg/L
)

head(env_data)
##   sample_id     temp  rainfall       pH   nitrate
## 1     ENV_1 11.63715 22.184751 6.963222 2.3612941
## 2     ENV_2 13.61894 14.242159 6.415674 0.9783600
## 3     ENV_3 24.35225  6.471627 6.682626 0.9736881
## 4     ENV_4 15.42305  9.694871 6.985579 0.2973474
## 5     ENV_5 15.77573  6.006427 7.335348 1.8819496
## 6     ENV_6 25.29039  5.823355 6.174727 0.8448575
write.csv(env_data, "environmental_data.csv", row.names = FALSE)


## 2.2 Genotype-like dataset (0/1/2) with 3 populations ---------------
n_ind <- 150
n_snp <- 500
pop <- rep(c("PopA", "PopB", "PopC"), each = n_ind/3)

# Base MAFs
maf_A <- runif(n_snp, 0.05, 0.35)
maf_B <- pmin(pmax(maf_A + runif(n_snp, -0.1, 0.1), 0.01), 0.49)
maf_C <- pmin(pmax(maf_A + runif(n_snp, -0.2, 0.2), 0.01), 0.49)



sim_geno <- function(n, maf) {
  p <- maf
  probs <- cbind((1-p)^2, 2*p*(1-p), p^2)
  G <- apply(probs, 1, function(pr) sample(0:2, size = n, replace = TRUE, prob = pr))
  t(G)
}


G_A <- sim_geno(n_ind/3, maf_A)
G_B <- sim_geno(n_ind/3, maf_B)
G_C <- sim_geno(n_ind/3, maf_C)

geno_matrix <- rbind(G_A, G_B, G_C)

colnames(geno_matrix) <- paste0("SNP_", seq_len(ncol(geno_matrix)))
geno_data <- data.frame(
  sample_id = paste0("IND_", seq_len(n_ind)),
  pop       = pop,
  geno_matrix
)

head(geno_data[,1:10])
##   sample_id  pop SNP_1 SNP_2 SNP_3 SNP_4 SNP_5 SNP_6 SNP_7 SNP_8
## 1     IND_1 PopA     0     2     0     1     0     0     1     0
## 2     IND_2 PopA     0     1     0     0     0     0     0     0
## 3     IND_3 PopA     0     0     0     0     0     0     0     0
## 4     IND_4 PopA     0     0     1     0     2     0     0     1
## 5     IND_5 PopA     0     0     0     1     2     1     1     1
## 6     IND_6 PopA     1     0     0     0     0     0     0     0
write.csv(geno_data, "genotype_data.csv", row.names = FALSE)

## 2.3 Expression-like dataset with twins / replicates ----------------
n_expr <- 80
n_genes <- 50

# "True" expression means differ by group
group_expr <- rep(c("Control", "Treatment"), each = n_expr/2)
base_means <- runif(n_genes, 3, 8)    # baseline log-expression
treatment_effect <- rnorm(n_genes, 0, 0.7)

expr_matrix <- matrix(NA, nrow = n_expr, ncol = n_genes)
for (i in seq_len(n_expr)) {
  mu <- base_means
  if (group_expr[i] == "Treatment") {
    mu <- mu + treatment_effect
  }
  expr_matrix[i, ] <- rnorm(n_genes, mean = mu, sd = 0.5)
}

# Add a pair of "twins" with nearly identical expression
twin1 <- rnorm(n_genes, mean = base_means + 0.3, sd = 0.2)
twin2 <- twin1 + rnorm(n_genes, mean = 0, sd = 0.05)

expr_matrix <- rbind(expr_matrix, twin1, twin2)
expr_group  <- c(group_expr, "Twin", "Twin")
expr_id     <- paste0("S", seq_len(nrow(expr_matrix)))

expr_data <- data.frame(
  sample_id = expr_id,
  group     = expr_group,
  expr_matrix
)
colnames(expr_data)[-(1:2)] <- paste0("Gene_", seq_len(n_genes))

head(expr_data[,1:8])
##     sample_id   group   Gene_1   Gene_2   Gene_3   Gene_4   Gene_5   Gene_6
## X          S1 Control 7.668686 3.662886 5.925802 6.171390 5.460782 6.866024
## X.1        S2 Control 7.253561 4.349532 5.583942 6.799561 5.673894 7.255362
## X.2        S3 Control 7.476361 4.000081 5.541735 7.093655 6.094962 6.827718
## X.3        S4 Control 7.104922 4.628845 5.814073 6.617769 5.711166 7.137490
## X.4        S5 Control 7.923649 4.022753 6.015086 7.509066 4.735120 7.641594
## X.5        S6 Control 6.824361 3.734323 5.238527 7.228134 6.122697 5.650121
write.csv(expr_data, "expression_twins_data.csv", row.names = FALSE)


## 2.3b Expression-like dataset with twins / replicates ----------------
n_expr <- 80
n_genes <- 30000

# "True" expression means differ by group
group_expr <- rep(c("Control", "Treatment"), each = n_expr/2)
base_means <- runif(n_genes, 3, 8)    # baseline log-expression
treatment_effect <- rnorm(n_genes, 0, 0.7)

expr_matrix <- matrix(NA, nrow = n_expr, ncol = n_genes)
for (i in seq_len(n_expr)) {
  mu <- base_means
  if (group_expr[i] == "Treatment") {
    mu <- mu + treatment_effect
  }
  expr_matrix[i, ] <- rnorm(n_genes, mean = mu, sd = 2)
}

# Add a pair of "twins" with nearly identical expression
twin1 <- rnorm(n_genes, mean = base_means + 0.3, sd = 2)
twin2 <- twin1 + rnorm(n_genes, mean = 0, sd = 0.0005)

expr_matrix <- rbind(expr_matrix, twin1, twin2)
expr_group  <- c(group_expr, "Twin", "Twin")
expr_id     <- paste0("S", seq_len(nrow(expr_matrix)))

expr_data <- data.frame(
  sample_id = expr_id,
  group     = expr_group,
  expr_matrix
)
colnames(expr_data)[-(1:2)] <- paste0("Gene_", seq_len(n_genes))

head(expr_data[,1:8])
##     sample_id   group   Gene_1     Gene_2   Gene_3   Gene_4   Gene_5    Gene_6
## X          S1 Control 6.589128  3.0279967 4.770101 5.984451 4.753206  6.955786
## X.1        S2 Control 6.100408  3.2619345 7.300809 4.146241 6.996118  9.245594
## X.2        S3 Control 6.942099 -0.4959638 2.794651 1.493286 7.403729  7.009384
## X.3        S4 Control 8.286431  6.6272333 4.577211 4.870890 8.480992 11.661192
## X.4        S5 Control 5.408853  1.6685311 3.091760 2.545861 2.660046  6.100538
## X.5        S6 Control 3.797679  1.8299410 7.398065 5.115125 4.919272  4.336243
write.csv(expr_data, "expression_twins_data2.csv", row.names = FALSE)

#expr_data[c(nrow(expr_data)-1, nrow(expr_data)),]


## 2.4 Dataset with many samples from same location -------------------
n_loc_A <- 80
n_loc_B <- 20
n_loc_C <- 20
n_traits <- 5

location <- c(rep("LocA", n_loc_A),
              rep("LocB", n_loc_B),
              rep("LocC", n_loc_C))

mean_A <- rep(0, n_traits)
mean_B <- c(1, -1, 0.5, 0, 0)
mean_C <- c(-1, 0.5, -0.5, 0, 0.7)

cov_traits <- diag(n_traits) * 1
cov_traits[1,2] <- cov_traits[2,1] <- 0.6

sim_loc <- function(n, mu) {
  MASS::mvrnorm(n, mu = mu, Sigma = cov_traits)
}



traits_A <- sim_loc(n_loc_A, mean_A)
traits_B <- sim_loc(n_loc_B, mean_B)
traits_C <- sim_loc(n_loc_C, mean_C)

traits_matrix <- rbind(traits_A, traits_B, traits_C)

loc_data <- data.frame(
  sample_id = paste0("LOC_", seq_len(nrow(traits_matrix))),
  location  = location,
  traits_matrix
)
colnames(loc_data)[-(1:2)] <- paste0("Trait_", seq_len(n_traits))

head(loc_data)
##   sample_id location    Trait_1    Trait_2    Trait_3     Trait_4    Trait_5
## 1     LOC_1     LocA -1.2267678 -0.7630357  0.8876672  0.59117556 -0.1957244
## 2     LOC_2     LocA  0.1628424 -0.5032241 -0.1993700  0.07126979  0.8512445
## 3     LOC_3     LocA -1.3513255 -0.6145531  0.9950082 -0.14399184 -0.1342373
## 4     LOC_4     LocA  0.7356483 -0.3949018  0.9298862  1.57181404 -1.1403389
## 5     LOC_5     LocA  0.2760041 -1.3305879  0.5958680 -1.19653817  1.5699717
## 6     LOC_6     LocA -0.0915815 -0.3016421 -1.6813246  1.74697532  0.5915271
write.csv(loc_data, "location_unbalanced_data.csv", row.names = FALSE)

We now have four datasets you can share:

  • environmental_data.csv
  • genotype_data.csv
  • expression_twins_data.csv
  • location_unbalanced_data.csv

Each is used below.

3 3. PCA Basics and Standardization

3.1 3.1 Simple 2D geometric illustration

set.seed(42)
Sigma_2d <- matrix(c(4, 3.5,
                     3.5, 4), 2, 2)
X <- MASS::mvrnorm(400, mu = c(0, 0), Sigma = Sigma_2d)
df2 <- data.frame(x1 = X[,1], x2 = X[,2])

ggplot(df2, aes(x1, x2)) +
  geom_point(alpha = 0.3) +
  coord_equal() + theme_pca +
  labs(title = "Correlated 2D data: we want the 'long' direction")

Compute PCA and show axes:

pca2 <- prcomp(df2, center = TRUE, scale. = FALSE)
pca2$rotation
##           PC1        PC2
## x1 -0.7124363 -0.7017368
## x2 -0.7017368  0.7124363
pca2$sdev^2
## [1] 6.9112536 0.4857434
pc1 <- pca2$rotation[,1]
pc2 <- pca2$rotation[,2]

ggplot(df2, aes(x1, x2)) +
  geom_point(alpha = 0.3) +
  geom_segment(aes(x = 0, y = 0,
                   xend = pc1[1]*3, yend = pc1[2]*3),
               arrow = arrow(length = unit(0.15, "cm")),
               color = "red") +
  geom_segment(aes(x = 0, y = 0,
                   xend = pc2[1]*3, yend = pc2[2]*3),
               arrow = arrow(length = unit(0.15, "cm")),
               color = "blue") +
  coord_equal() + theme_pca +
  labs(title = "PCA axes: PC1 (red) = longest direction, PC2 (blue) orthogonal")

3.2 3.2 Standardization with height/weight example

set.seed(1)
n <- 200
height <- rnorm(n, mean = 1700, sd = 200)   # mm
weight <- rnorm(n, mean = 70, sd = 15)   # kg

df_hw <- data.frame(height = height, weight = weight)
apply(df_hw, 2, sd)
##    height    weight 
## 185.81945  15.16057

PCA without scaling:

df_hw
##       height    weight
## 1   1574.709  76.14103
## 2   1736.729  95.33310
## 3   1532.874  93.79883
## 4   2019.056  65.03638
## 5   1765.902  35.72147
## 6   1535.906 107.46492
## 7   1797.486  80.00599
## 8   1847.665  78.11991
## 9   1815.156  69.79901
## 10  1638.922  77.65163
## 11  2002.356  67.53436
## 12  1777.969  76.31042
## 13  1575.752  63.99630
## 14  1257.060  49.44688
## 15  1924.986  84.81757
## 16  1691.013  92.79618
## 17  1696.762  65.36889
## 18  1888.767  51.20065
## 19  1864.244  79.63362
## 20  1818.780  69.32936
## 21  1883.795  44.00172
## 22  1856.427  70.03198
## 23  1714.913  60.54549
## 24  1302.130  64.88547
## 25  1823.965  52.65141
## 26  1688.774  97.04713
## 27  1668.841  65.03302
## 28  1405.850  45.91730
## 29  1604.370  72.95790
## 30  1783.588  73.94763
## 31  1971.736  55.21260
## 32  1679.442  26.66619
## 33  1777.534  60.39277
## 34  1689.239  78.55761
## 35  1424.588  69.10415
## 36  1617.001  68.52732
## 37  1621.142  78.41231
## 38  1688.137  52.20312
## 39  1920.005  86.45166
## 40  1852.635  69.91984
## 41  1667.095  80.60966
## 42  1649.328  85.51162
## 43  1839.393  73.35221
## 44  1811.333  56.81939
## 45  1562.249  87.44447
## 46  1558.501  39.99753
## 47  1772.916  61.82814
## 48  1853.707  66.16494
## 49  1677.531  67.50818
## 50  1876.222  85.30696
## 51  1779.621  72.04333
## 52  1577.595  76.10751
## 53  1768.224  68.95518
## 54  1474.127  66.28503
## 55  1986.605  80.43326
## 56  2096.080  87.19343
## 57  1626.556  33.95356
## 58  1491.173  78.59109
## 59  1813.944  75.62087
## 60  1672.989  63.62098
## 61  2180.324  84.26519
## 62  1692.152  64.16144
## 63  1837.948  65.73504
## 64  1705.600  82.86115
## 65  1551.345  95.79441
## 66  1737.758  74.05082
## 67  1339.008  63.66724
## 68  1993.111  52.16330
## 69  1730.651  65.03451
## 70  2134.522  55.90256
## 71  1795.102  66.11601
## 72  1558.011  75.91569
## 73  1822.145  57.22214
## 74  1513.180 109.73750
## 75  1449.273  72.34018
## 76  1758.289  86.95311
## 77  1611.342  35.66314
## 78  1700.221  81.11502
## 79  1714.868  50.25632
## 80  1582.096  83.79706
## 81  1586.266  75.97195
## 82  1672.964  63.88707
## 83  1935.617  89.86388
## 84  1395.287  59.48152
## 85  1818.789  61.29079
## 86  1766.590  54.98392
## 87  1912.620  59.97732
## 88  1639.163  84.17777
## 89  1774.004  76.50553
## 90  1753.420  85.07739
## 91  1591.496  64.14822
## 92  1941.574  75.64555
## 93  1932.081  73.66247
## 94  1840.043  48.60614
## 95  2017.367  96.67644
## 96  1811.697  72.01671
## 97  1444.682  81.48398
## 98  1585.347  84.32705
## 99  1455.077  69.24151
## 100 1605.320  65.41277
## 101 1575.927  83.40511
## 102 1708.423  54.29053
## 103 1517.816  99.57006
## 104 1731.606  64.24552
## 105 1569.083  94.81218
## 106 2053.457  92.68319
## 107 1843.341  71.24449
## 108 1882.035  78.50831
## 109 1776.837  54.63177
## 110 2036.435  74.84510
## 111 1572.853  85.65419
## 112 1607.671  71.48618
## 113 1986.456  63.18795
## 114 1569.861  60.16327
## 115 1658.524  69.46116
## 116 1621.438  86.03742
## 117 1636.001  62.74038
## 118 1644.177  68.18485
## 119 1798.838  50.58790
## 120 1664.534  77.41469
## 121 1598.809  89.61852
## 122 1968.608  92.45562
## 123 1657.084  82.22054
## 124 1664.089  41.95317
## 125 1679.962  77.23044
## 126 1842.533  76.84203
## 127 1685.287  64.69900
## 128 1692.473  72.55734
## 129 1563.668  57.03946
## 130 1635.146  80.18846
## 131 1712.032  65.09348
## 132 1582.221  46.46377
## 133 1806.299  64.48824
## 134 1396.321  90.46652
## 135 1761.312  64.98578
## 136 1392.710  80.99125
## 137 1639.805  84.19878
## 138 1594.344  70.06598
## 139 1569.581  64.71517
## 140 1688.621  62.05457
## 141 1317.128  81.09384
## 142 1935.317  54.04814
## 143 1367.006  73.69316
## 144 1607.294  65.65751
## 145 1476.816  36.02666
## 146 1549.836  48.86724
## 147 2117.433  83.74029
## 148 1703.479  67.13082
## 149 1442.740  82.04925
## 150 1371.879  98.31212
## 151 1790.037  92.10822
## 152 1696.288  80.15903
## 153 1636.386  75.69944
## 154 1514.128  67.10802
## 155 1402.508  93.66838
## 156 1484.962  78.94351
## 157 1900.006  52.39635
## 158 1575.747  67.66536
## 159 1423.115  41.21635
## 160 2073.858  67.07112
## 161 1785.020  31.11508
## 162 1652.271  89.71003
## 163 1911.697  60.46685
## 164 1877.285  63.55032
## 165 1576.151  67.46023
## 166 2141.220  79.18327
## 167 1648.995  80.17510
## 168 1415.101  78.51928
## 169 1671.120  61.41186
## 170 1741.508  49.55063
## 171 2161.596  64.16917
## 172 1721.160  74.16871
## 173 1791.400  57.65378
## 174 1684.569  68.96739
## 175 1633.200  52.48507
## 176 1693.055  69.87536
## 177 1857.528  71.93283
## 178 2115.049  67.81187
## 179 1905.478  67.54134
## 180 1941.582  96.45328
## 181 1453.735  81.43880
## 182 1896.779  86.67147
## 183 1743.985  56.15190
## 184 1406.550  72.46513
## 185 1804.205  87.32238
## 186 1668.249  69.15218
## 187 1992.917  38.05959
## 188 1546.784  75.17269
## 189 1613.958  41.42567
## 190 1514.778  57.83245
## 191 1664.579  89.86006
## 192 1780.402  79.23455
## 193 1553.650  86.37503
## 194 1866.075  74.59907
## 195 1458.383  68.34762
## 196 1490.403  56.13531
## 197 1988.232  93.89371
## 198 1496.831  70.67516
## 199 1782.395  59.27307
## 200 1623.785  82.97835
pca_hw_ns <- prcomp(df_hw, center = TRUE, scale. = FALSE)
summary(pca_hw_ns)
## Importance of components:
##                             PC1      PC2
## Standard deviation     185.8197 15.15714
## Proportion of Variance   0.9934  0.00661
## Cumulative Proportion    0.9934  1.00000
pca_hw_ns$rotation
##                 PC1          PC2
## height  0.999998482 -0.001742428
## weight -0.001742428 -0.999998482
biplot(pca_hw_ns, main = "Height/Weight – PCA without scaling")

PCA with scaling:

pca_hw_sc <- prcomp(df_hw, center = TRUE, scale. = TRUE)
summary(pca_hw_sc)
## Importance of components:
##                           PC1    PC2
## Standard deviation     1.0106 0.9893
## Proportion of Variance 0.5106 0.4894
## Cumulative Proportion  0.5106 1.0000
pca_hw_sc$rotation
##               PC1        PC2
## height  0.7071068 -0.7071068
## weight -0.7071068 -0.7071068
biplot(pca_hw_sc, main = "Height/Weight – PCA with scaling")

3.3 3.3 Standardization with environmental data

env <- read.csv("environmental_data.csv")
head(env)
##   sample_id     temp  rainfall       pH   nitrate
## 1     ENV_1 11.63715 22.184751 6.963222 2.3612941
## 2     ENV_2 13.61894 14.242159 6.415674 0.9783600
## 3     ENV_3 24.35225  6.471627 6.682626 0.9736881
## 4     ENV_4 15.42305  9.694871 6.985579 0.2973474
## 5     ENV_5 15.77573  6.006427 7.335348 1.8819496
## 6     ENV_6 25.29039  5.823355 6.174727 0.8448575
apply(env[,-1], 2, sd)
##      temp  rainfall        pH   nitrate 
## 5.6589593 4.5185444 0.4824026 1.2430787

PCA without scaling:

pca_env_ns <- prcomp(env[,-1], center = TRUE, scale. = FALSE)
summary(pca_env_ns)
## Importance of components:
##                           PC1    PC2    PC3     PC4
## Standard deviation     5.6628 4.5144 1.2410 0.48132
## Proportion of Variance 0.5914 0.3759 0.0284 0.00427
## Cumulative Proportion  0.5914 0.9673 0.9957 1.00000
pca_env_ns$rotation
##                   PC1          PC2          PC3          PC4
## temp      0.998175993  0.059994780 -0.006169313 -0.002693042
## rainfall -0.059915988  0.998077820  0.014535611 -0.006281396
## pH       -0.002357740 -0.006338454 -0.006612133 -0.999955271
## nitrate  -0.007014414  0.014181534 -0.999853457  0.006538106
scores_env_ns <- as.data.frame(pca_env_ns$x)

ggplot(scores_env_ns, aes(PC1, PC2)) +
  geom_point(alpha = 0.6) +
  theme_pca +
  labs(title = "Environmental PCA – no scaling")

PCA with scaling:

pca_env_sc <- prcomp(env[,-1], center = TRUE, scale. = TRUE)


cor(env[,-1])
##                 temp    rainfall          pH     nitrate
## temp      1.00000000 -0.02728268 -0.03023254 -0.02810249
## rainfall -0.02728268  1.00000000 -0.05647078  0.04976955
## pH       -0.03023254 -0.05647078  1.00000000  0.01228194
## nitrate  -0.02810249  0.04976955  0.01228194  1.00000000
summary(pca_env_sc)
## Importance of components:
##                           PC1    PC2    PC3   PC4
## Standard deviation     1.0371 1.0223 0.9836 0.955
## Proportion of Variance 0.2689 0.2613 0.2419 0.228
## Cumulative Proportion  0.2689 0.5301 0.7720 1.000
pca_env_sc$rotation
##                 PC1        PC2        PC3        PC4
## temp      0.3340527  0.5688132  0.6937431 -0.2891033
## rainfall -0.7077953  0.2034733 -0.1045353 -0.6683538
## pH        0.3077299 -0.7262464  0.2059168 -0.5791949
## nitrate  -0.5410517 -0.3280492  0.6821958  0.3664092
scores_env_sc <- as.data.frame(pca_env_sc$x)

ggplot(scores_env_sc, aes(PC1, PC2)) +
  geom_point(alpha = 0.6) +
  theme_pca +
  labs(title = "Environmental PCA – with scaling")

Loadings:

load_env <- as.data.frame(pca_env_sc$rotation)
load_env$variable <- rownames(load_env)
load_env
##                 PC1        PC2        PC3        PC4 variable
## temp      0.3340527  0.5688132  0.6937431 -0.2891033     temp
## rainfall -0.7077953  0.2034733 -0.1045353 -0.6683538 rainfall
## pH        0.3077299 -0.7262464  0.2059168 -0.5791949       pH
## nitrate  -0.5410517 -0.3280492  0.6821958  0.3664092  nitrate
ggplot(load_env, aes(variable, PC1)) +
  geom_bar(stat = "identity") +
  theme_pca +
  labs(title = "Environmental PCA – Loadings on PC1", x = "Variable", y = "Loading")

4 4. Duplicated / Near-Duplicated Samples

4.1 4.1 Toy 2D example: one twin pair

set.seed(101)
n_main <- 180
main_cloud <- MASS::mvrnorm(
  n_main, mu = c(0,0),
  Sigma = matrix(c(1, 0.2,
                   0.2, 1), 2, 2)
)

twin1 <- c(5, 5)
twin2 <- twin1 + rnorm(2, 0, 0.05)

df_twin <- rbind(main_cloud, twin1, twin2)
df_twin <- as.data.frame(df_twin)
colnames(df_twin) <- c("x1", "x2")
df_twin$label <- c(rep("main", n_main), "twin1", "twin2")

ggplot(df_twin, aes(x1, x2, color = label)) +
  geom_point(size = 2, alpha = 0.8) +
  coord_equal() + theme_pca +
  labs(title = "Main cloud + 'twin' pair far away")

PCA with twins:

pca_with_twin <- prcomp(df_twin[,1:2], center = TRUE, scale. = FALSE)
pca_with_twin$rotation
##          PC1        PC2
## x1 0.6865180 -0.7271128
## x2 0.7271128  0.6865180
pca_with_twin$sdev^2
## [1] 1.6935210 0.6624451

Visualize PCs:

pc1 <- pca_with_twin$rotation[,1]
pc2 <- pca_with_twin$rotation[,2]

ggplot(df_twin, aes(x1, x2, color = label)) +
  geom_point(size = 2, alpha = 0.8) +
  geom_segment(aes(x = 0, y = 0,
                   xend = pc1[1]*3, yend = pc1[2]*3),
               arrow = arrow(length = unit(0.15, "cm")),
               color = "black") +
  geom_segment(aes(x = 0, y = 0,
                   xend = pc2[1]*3, yend = pc2[2]*3),
               arrow = arrow(length = unit(0.15, "cm")),
               color = "black") +
  coord_equal() + theme_pca +
  labs(title = "PC1 is influenced by the twin pair")

PCA without one twin:

df_no_twin <- subset(df_twin, label != "twin2")
pca_no_twin <- prcomp(df_no_twin[,1:2], center = TRUE, scale. = FALSE)
pca_no_twin$rotation
##          PC1        PC2
## x1 0.6786967 -0.7344187
## x2 0.7344187  0.6786967
pca_no_twin$sdev^2
## [1] 1.4269620 0.6657978

4.2 4.2 Expression-like data with twin samples

expr_data <- read.csv("expression_twins_data.csv")
dim(expr_data)
## [1] 82 52
table(expr_data$group)
## 
##   Control Treatment      Twin 
##        40        40         2

PCA (samples × genes):

expr_matrix <- expr_data[,-c(1,2)]
pca_expr <- prcomp(expr_matrix, center = TRUE, scale. = TRUE)

scores_expr <- as.data.frame(pca_expr$x)
scores_expr$group <- expr_data$group

ggplot(scores_expr, aes(PC1, PC2, color = group)) +
  geom_point(size = 2, alpha = 0.7) +
  theme_pca +
  labs(title = "Expression-like PCA with twins")

Remove one twin and re-run:

expr_no_twin <- expr_data[-nrow(expr_data), ]   # drop last row (second twin)

expr_matrix2 <- expr_no_twin[,-c(1,2)]
pca_expr2 <- prcomp(expr_matrix2, center = TRUE, scale. = TRUE)

scores_expr2 <- as.data.frame(pca_expr2$x)
scores_expr2$group <- expr_no_twin$group

ggplot(scores_expr2, aes(PC1, PC2, color = group)) +
  geom_point(size = 2, alpha = 0.7) +
  theme_pca +
  labs(title = "Expression-like PCA after removing one twin")

4.3 4.2b Expression-like data with twin samples

expr_data <- read.csv("expression_twins_data2.csv")
dim(expr_data)
## [1]    82 30002
table(expr_data$group)
## 
##   Control Treatment      Twin 
##        40        40         2

PCA (samples × genes):

expr_matrix <- expr_data[,-c(1,2)]
pca_expr <- prcomp(expr_matrix, center = TRUE, scale. = TRUE)

scores_expr <- as.data.frame(pca_expr$x)
scores_expr$group <- expr_data$group

ggplot(scores_expr, aes(PC1, PC2, color = group)) +
  geom_point(size = 2, alpha = 0.7) +
  theme_pca +
  labs(title = "Expression-like PCA with twins (large dataset)")

Remove one twin and re-run:

expr_no_twin <- expr_data[-nrow(expr_data), ]   # drop last row (second twin)

expr_matrix2 <- expr_no_twin[,-c(1,2)]
pca_expr2 <- prcomp(expr_matrix2, center = TRUE, scale. = TRUE)

scores_expr2 <- as.data.frame(pca_expr2$x)
scores_expr2$group <- expr_no_twin$group

ggplot(scores_expr2, aes(PC1, PC2, color = group)) +
  geom_point(size = 2, alpha = 0.7) +
  theme_pca +
  labs(title = "Expression-like PCA after removing one twin")

4.4 4.3 Many samples from the same location

loc_data <- read.csv("location_unbalanced_data.csv")
table(loc_data$location)
## 
## LocA LocB LocC 
##   80   20   20
head(loc_data)
##   sample_id location    Trait_1    Trait_2    Trait_3     Trait_4    Trait_5
## 1     LOC_1     LocA -1.2267678 -0.7630357  0.8876672  0.59117556 -0.1957244
## 2     LOC_2     LocA  0.1628424 -0.5032241 -0.1993700  0.07126979  0.8512445
## 3     LOC_3     LocA -1.3513255 -0.6145531  0.9950082 -0.14399184 -0.1342373
## 4     LOC_4     LocA  0.7356483 -0.3949018  0.9298862  1.57181404 -1.1403389
## 5     LOC_5     LocA  0.2760041 -1.3305879  0.5958680 -1.19653817  1.5699717
## 6     LOC_6     LocA -0.0915815 -0.3016421 -1.6813246  1.74697532  0.5915271

Run PCA (traits only):

pca_loc <- prcomp(loc_data[,-c(1,2)], center = TRUE, scale. = TRUE)
summary(pca_loc)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5
## Standard deviation     1.2069 1.0816 0.9853 0.9412 0.7191
## Proportion of Variance 0.2913 0.2340 0.1941 0.1772 0.1034
## Cumulative Proportion  0.2913 0.5253 0.7194 0.8966 1.0000
scores_loc <- as.data.frame(pca_loc$x)
scores_loc$location <- loc_data$location

ggplot(scores_loc, aes(PC1, PC2, color = location)) +
  geom_point(size = 2, alpha = 0.7) +
  theme_pca +
  labs(title = "PCA with many samples from LocA")

5 5. PCA Theory by Example: Covariance and Eigenvectors

Xc <- scale(df2, center = TRUE, scale = FALSE)
S <- cov(Xc)

S
##          x1       x2
## x1 3.747110 3.212387
## x2 3.212387 3.649887
eig <- eigen(S)
eig$values
## [1] 6.9112536 0.4857434
eig$vectors
##            [,1]       [,2]
## [1,] -0.7124363  0.7017368
## [2,] -0.7017368 -0.7124363
pca2$sdev^2         # eigenvalues
## [1] 6.9112536 0.4857434
pca2$rotation       # eigenvectors (up to sign)
##           PC1        PC2
## x1 -0.7124363 -0.7017368
## x2 -0.7017368  0.7124363

6 6. Global Optimization vs “Greedy” (3D Example)

set.seed(7)
Sigma3 <- matrix(c(5, 3, 1,
                   3, 4, 0.5,
                   1, 0.5, 1.5), 3, 3)
X3 <- MASS::mvrnorm(1000, mu = c(0,0,0), Sigma = Sigma3)
df3 <- as.data.frame(X3)
colnames(df3) <- paste0("x", 1:3)

pca3 <- prcomp(df3, center = TRUE, scale. = FALSE)
pca3$sdev^2
## [1] 7.462207 1.732005 1.081552
pca3$rotation
##           PC1        PC2        PC3
## x1 -0.7595428 -0.4465788  0.4729293
## x2 -0.6225865  0.7096783 -0.3297616
## x3 -0.1883631 -0.5449074 -0.8170650

Monte Carlo demo:

set.seed(999)
cov3 <- cov(df3)
eigs3 <- eigen(cov3)
lambda_pca12 <- sum(eigs3$values[1:2])

n_rand <- 2000
total_var_rand <- numeric(n_rand)

for (i in seq_len(n_rand)) {
  M <- matrix(rnorm(9), 3, 3)
  Q <- qr.Q(qr(M))
  V <- Q[,1:2]
  total_var_rand[i] <- sum(diag(t(V) %*% cov3 %*% V))
}

max(total_var_rand)
## [1] 9.190394
lambda_pca12
## [1] 9.194212

Histogram:

df_rand <- data.frame(total_var = total_var_rand)

ggplot(df_rand, aes(total_var)) +
  geom_histogram(bins = 40, alpha = 0.5) +
  geom_vline(xintercept = lambda_pca12, color = "red", linetype = 2, size = 1) +
  theme_pca +
  labs(title = "Total variance in random 2D subspaces vs PCA(PC1+PC2)",
       x = "Total variance in 2D subspace",
       y = "Count")

7 7. Biological PCA Examples

7.1 7.1 Environmental PCA (interpretation)

Already run in Section 3.3; use it to discuss ecological gradients, etc.

7.2 7.2 Genotype PCA (population structure)

geno_data <- read.csv("genotype_data.csv")
dim(geno_data)
## [1] 1500   52
table(geno_data$pop)
## 
## PopA PopB PopC 
##  500  500  500
geno_matrix <- as.matrix(geno_data[,-c(1,2)])

pca_geno <- prcomp(geno_matrix, center = TRUE, scale. = FALSE)
summary(pca_geno)
## Importance of components:
##                          PC1     PC2     PC3     PC4     PC5     PC6     PC7
## Standard deviation     1.638 0.64845 0.64306 0.63398 0.63174 0.62467 0.61448
## Proportion of Variance 0.154 0.02415 0.02375 0.02308 0.02292 0.02241 0.02168
## Cumulative Proportion  0.154 0.17818 0.20193 0.22501 0.24793 0.27033 0.29202
##                           PC8     PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.6134 0.61056 0.60942 0.60197 0.59916 0.59739 0.58784
## Proportion of Variance 0.0216 0.02141 0.02133 0.02081 0.02062 0.02049 0.01984
## Cumulative Proportion  0.3136 0.33503 0.35635 0.37716 0.39778 0.41827 0.43811
##                           PC15    PC16    PC17    PC18    PC19    PC20    PC21
## Standard deviation     0.58347 0.57904 0.57335 0.57181 0.56854 0.56546 0.55902
## Proportion of Variance 0.01955 0.01925 0.01888 0.01878 0.01856 0.01836 0.01795
## Cumulative Proportion  0.45766 0.47692 0.49579 0.51457 0.53313 0.55149 0.56944
##                           PC22   PC23   PC24    PC25    PC26   PC27   PC28
## Standard deviation     0.55712 0.5537 0.5504 0.54757 0.54338 0.5409 0.5377
## Proportion of Variance 0.01782 0.0176 0.0174 0.01722 0.01695 0.0168 0.0166
## Cumulative Proportion  0.58726 0.6049 0.6223 0.63948 0.65644 0.6732 0.6898
##                           PC29    PC30   PC31    PC32    PC33    PC34   PC35
## Standard deviation     0.53634 0.52869 0.5278 0.52425 0.52325 0.51713 0.5110
## Proportion of Variance 0.01652 0.01605 0.0160 0.01578 0.01572 0.01536 0.0150
## Cumulative Proportion  0.70636 0.72241 0.7384 0.75419 0.76991 0.78527 0.8003
##                          PC36    PC37    PC38    PC39    PC40    PC41    PC42
## Standard deviation     0.5076 0.50334 0.50141 0.50094 0.49610 0.49241 0.49069
## Proportion of Variance 0.0148 0.01455 0.01444 0.01441 0.01413 0.01392 0.01383
## Cumulative Proportion  0.8151 0.82961 0.84405 0.85846 0.87259 0.88651 0.90034
##                           PC43    PC44   PC45    PC46    PC47    PC48    PC49
## Standard deviation     0.48427 0.47678 0.4703 0.46767 0.46431 0.46050 0.45392
## Proportion of Variance 0.01347 0.01305 0.0127 0.01256 0.01238 0.01218 0.01183
## Cumulative Proportion  0.91381 0.92686 0.9396 0.95212 0.96450 0.97668 0.98851
##                           PC50
## Standard deviation     0.44737
## Proportion of Variance 0.01149
## Cumulative Proportion  1.00000
scores_geno <- as.data.frame(pca_geno$x)
scores_geno$pop <- geno_data$pop

ggplot(scores_geno, aes(PC1, PC2, color = pop)) +
  geom_point(size = 2, alpha = 0.7) +
  theme_pca +
  labs(title = "Genotype PCA – population structure")

7.3 7.3 Morphometric example: iris data

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
iris_num <- iris[,1:4]
pca_iris <- prcomp(iris_num, center = TRUE, scale. = TRUE)
summary(pca_iris)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.7084 0.9560 0.38309 0.14393
## Proportion of Variance 0.7296 0.2285 0.03669 0.00518
## Cumulative Proportion  0.7296 0.9581 0.99482 1.00000
scores_iris <- as.data.frame(pca_iris$x)
scores_iris$Species <- iris$Species

ggplot(scores_iris, aes(PC1, PC2, color = Species)) +
  geom_point(size = 2, alpha = 0.8) +
  theme_pca +
  labs(title = "Iris morphometric PCA", x = "PC1", y = "PC2")

load_iris <- as.data.frame(pca_iris$rotation)
load_iris$Trait <- rownames(load_iris)

ggplot(load_iris, aes(Trait, PC1)) +
  geom_bar(stat = "identity") +
  theme_pca +
  labs(title = "Iris PCA – Loadings on PC1", x = "Trait", y = "Loading")

8 8. Summary

Key points:

  • PCA finds orthogonal directions of maximum variance.
  • PCs are eigenvectors of the covariance matrix; eigenvalues are variances explained.
  • Standardization is critical when variables have different units or scales.
  • Duplicated / near-duplicated samples and unbalanced sampling can distort PCA.
  • The top k PCs span the globally optimal k-dimensional subspace for variance.
  • PCA is central in biological data analysis.

9 9. Exercises

9.1 9.1 In-class exercises

  1. Environmental scaling

    • Load environmental_data.csv.
    • Run PCA
        1. without scaling
        1. with scaling
    • Plot PC1 vs PC2 in both cases.
    • Question: Which variables dominate PC1 in each case, and why?
  2. Effect of twins on expression PCA

    • Use expression_twins_data.csv.
    • Run PCA with all samples; color by group.
    • Remove one “Twin” sample and re-run.
    • Question: How did PC1 and PC2 change? Is Control vs Treatment separation clearer?
  3. Sampling imbalance

    • Use location_unbalanced_data.csv.
    • Run PCA and plot PC1 vs PC2 (color by location).
    • Then subsample LocA down to 20 samples and re-run.
    loc_data <- read.csv("location_unbalanced_data.csv")
    set.seed(1)
    subA <- subset(loc_data, location == "LocA")
    subA <- subA[sample(nrow(subA), 20), ]
    subBC <- subset(loc_data, location != "LocA")
    loc_balanced <- rbind(subA, subBC)
    • Question: What changes in the PCA plot?

9.2 9.2 Homework-style exercises

  1. Interpreting loadings (environmental data)
    • For the scaled environmental PCA, examine loadings for PC1 & PC2.
    • Write 5–8 sentences interpreting what each PC represents.
  2. Genotype PCA and population structure
    • Use genotype_data.csv.
    • Run PCA and plot PC1 vs PC2 (color by pop).
    • Answer:
        1. How much variance do PC1 & PC2 explain?
        1. Do populations cluster clearly?
        1. Repeat PCA on a random subset of 100 SNPs; compare.
  3. Creating your own duplicated samples
    • Take environmental_data.csv.
    • Duplicate ~5 samples, add tiny noise.
    • Run PCA before/after.
    • Question: Do any PCs become dominated by these duplicated samples?
  4. Global vs greedy (advanced)
    • Reproduce the 3D subspace simulation from Section 6.
    • Check if any random 2D subspace has higher total variance than PC1+PC2.
    • Question: What does this say about PCA’s optimality?
  5. PCA as preprocessing for clustering
    • Choose one dataset (environmental, genotype, or iris).
    • Run PCA, keep first 2–3 PCs.
    • Run k-means on PC scores.
    • Question: Is clustering on PCs more interpretable than clustering on raw variables?