In this session we will:
prcomp()).This R Markdown is also a lab notebook: it simulates datasets and saves them as CSV files that you can distribute to students.
In this section we generate several datasets that mimic real biological situations.
All CSVs are written into the working directory (check with
getwd()).
set.seed(123)
## 2.1 Environmental dataset ------------------------------------------
n_env <- 200
env_data <- data.frame(
sample_id = paste0("ENV_", seq_len(n_env)),
temp = rnorm(n_env, 15, 6), # °C
rainfall = rlnorm(n_env, 2, 0.5), # mm
pH = rnorm(n_env, 7, 0.5), # pH units
nitrate = rlnorm(n_env, 0, 0.8) # mg/L
)
head(env_data)
## sample_id temp rainfall pH nitrate
## 1 ENV_1 11.63715 22.184751 6.963222 2.3612941
## 2 ENV_2 13.61894 14.242159 6.415674 0.9783600
## 3 ENV_3 24.35225 6.471627 6.682626 0.9736881
## 4 ENV_4 15.42305 9.694871 6.985579 0.2973474
## 5 ENV_5 15.77573 6.006427 7.335348 1.8819496
## 6 ENV_6 25.29039 5.823355 6.174727 0.8448575
write.csv(env_data, "environmental_data.csv", row.names = FALSE)
## 2.2 Genotype-like dataset (0/1/2) with 3 populations ---------------
n_ind <- 150
n_snp <- 500
pop <- rep(c("PopA", "PopB", "PopC"), each = n_ind/3)
# Base MAFs
maf_A <- runif(n_snp, 0.05, 0.35)
maf_B <- pmin(pmax(maf_A + runif(n_snp, -0.1, 0.1), 0.01), 0.49)
maf_C <- pmin(pmax(maf_A + runif(n_snp, -0.2, 0.2), 0.01), 0.49)
sim_geno <- function(n, maf) {
p <- maf
probs <- cbind((1-p)^2, 2*p*(1-p), p^2)
G <- apply(probs, 1, function(pr) sample(0:2, size = n, replace = TRUE, prob = pr))
t(G)
}
G_A <- sim_geno(n_ind/3, maf_A)
G_B <- sim_geno(n_ind/3, maf_B)
G_C <- sim_geno(n_ind/3, maf_C)
geno_matrix <- rbind(G_A, G_B, G_C)
colnames(geno_matrix) <- paste0("SNP_", seq_len(ncol(geno_matrix)))
geno_data <- data.frame(
sample_id = paste0("IND_", seq_len(n_ind)),
pop = pop,
geno_matrix
)
head(geno_data[,1:10])
## sample_id pop SNP_1 SNP_2 SNP_3 SNP_4 SNP_5 SNP_6 SNP_7 SNP_8
## 1 IND_1 PopA 0 2 0 1 0 0 1 0
## 2 IND_2 PopA 0 1 0 0 0 0 0 0
## 3 IND_3 PopA 0 0 0 0 0 0 0 0
## 4 IND_4 PopA 0 0 1 0 2 0 0 1
## 5 IND_5 PopA 0 0 0 1 2 1 1 1
## 6 IND_6 PopA 1 0 0 0 0 0 0 0
write.csv(geno_data, "genotype_data.csv", row.names = FALSE)
## 2.3 Expression-like dataset with twins / replicates ----------------
n_expr <- 80
n_genes <- 50
# "True" expression means differ by group
group_expr <- rep(c("Control", "Treatment"), each = n_expr/2)
base_means <- runif(n_genes, 3, 8) # baseline log-expression
treatment_effect <- rnorm(n_genes, 0, 0.7)
expr_matrix <- matrix(NA, nrow = n_expr, ncol = n_genes)
for (i in seq_len(n_expr)) {
mu <- base_means
if (group_expr[i] == "Treatment") {
mu <- mu + treatment_effect
}
expr_matrix[i, ] <- rnorm(n_genes, mean = mu, sd = 0.5)
}
# Add a pair of "twins" with nearly identical expression
twin1 <- rnorm(n_genes, mean = base_means + 0.3, sd = 0.2)
twin2 <- twin1 + rnorm(n_genes, mean = 0, sd = 0.05)
expr_matrix <- rbind(expr_matrix, twin1, twin2)
expr_group <- c(group_expr, "Twin", "Twin")
expr_id <- paste0("S", seq_len(nrow(expr_matrix)))
expr_data <- data.frame(
sample_id = expr_id,
group = expr_group,
expr_matrix
)
colnames(expr_data)[-(1:2)] <- paste0("Gene_", seq_len(n_genes))
head(expr_data[,1:8])
## sample_id group Gene_1 Gene_2 Gene_3 Gene_4 Gene_5 Gene_6
## X S1 Control 7.668686 3.662886 5.925802 6.171390 5.460782 6.866024
## X.1 S2 Control 7.253561 4.349532 5.583942 6.799561 5.673894 7.255362
## X.2 S3 Control 7.476361 4.000081 5.541735 7.093655 6.094962 6.827718
## X.3 S4 Control 7.104922 4.628845 5.814073 6.617769 5.711166 7.137490
## X.4 S5 Control 7.923649 4.022753 6.015086 7.509066 4.735120 7.641594
## X.5 S6 Control 6.824361 3.734323 5.238527 7.228134 6.122697 5.650121
write.csv(expr_data, "expression_twins_data.csv", row.names = FALSE)
## 2.3b Expression-like dataset with twins / replicates ----------------
n_expr <- 80
n_genes <- 30000
# "True" expression means differ by group
group_expr <- rep(c("Control", "Treatment"), each = n_expr/2)
base_means <- runif(n_genes, 3, 8) # baseline log-expression
treatment_effect <- rnorm(n_genes, 0, 0.7)
expr_matrix <- matrix(NA, nrow = n_expr, ncol = n_genes)
for (i in seq_len(n_expr)) {
mu <- base_means
if (group_expr[i] == "Treatment") {
mu <- mu + treatment_effect
}
expr_matrix[i, ] <- rnorm(n_genes, mean = mu, sd = 2)
}
# Add a pair of "twins" with nearly identical expression
twin1 <- rnorm(n_genes, mean = base_means + 0.3, sd = 2)
twin2 <- twin1 + rnorm(n_genes, mean = 0, sd = 0.0005)
expr_matrix <- rbind(expr_matrix, twin1, twin2)
expr_group <- c(group_expr, "Twin", "Twin")
expr_id <- paste0("S", seq_len(nrow(expr_matrix)))
expr_data <- data.frame(
sample_id = expr_id,
group = expr_group,
expr_matrix
)
colnames(expr_data)[-(1:2)] <- paste0("Gene_", seq_len(n_genes))
head(expr_data[,1:8])
## sample_id group Gene_1 Gene_2 Gene_3 Gene_4 Gene_5 Gene_6
## X S1 Control 6.589128 3.0279967 4.770101 5.984451 4.753206 6.955786
## X.1 S2 Control 6.100408 3.2619345 7.300809 4.146241 6.996118 9.245594
## X.2 S3 Control 6.942099 -0.4959638 2.794651 1.493286 7.403729 7.009384
## X.3 S4 Control 8.286431 6.6272333 4.577211 4.870890 8.480992 11.661192
## X.4 S5 Control 5.408853 1.6685311 3.091760 2.545861 2.660046 6.100538
## X.5 S6 Control 3.797679 1.8299410 7.398065 5.115125 4.919272 4.336243
write.csv(expr_data, "expression_twins_data2.csv", row.names = FALSE)
#expr_data[c(nrow(expr_data)-1, nrow(expr_data)),]
## 2.4 Dataset with many samples from same location -------------------
n_loc_A <- 80
n_loc_B <- 20
n_loc_C <- 20
n_traits <- 5
location <- c(rep("LocA", n_loc_A),
rep("LocB", n_loc_B),
rep("LocC", n_loc_C))
mean_A <- rep(0, n_traits)
mean_B <- c(1, -1, 0.5, 0, 0)
mean_C <- c(-1, 0.5, -0.5, 0, 0.7)
cov_traits <- diag(n_traits) * 1
cov_traits[1,2] <- cov_traits[2,1] <- 0.6
sim_loc <- function(n, mu) {
MASS::mvrnorm(n, mu = mu, Sigma = cov_traits)
}
traits_A <- sim_loc(n_loc_A, mean_A)
traits_B <- sim_loc(n_loc_B, mean_B)
traits_C <- sim_loc(n_loc_C, mean_C)
traits_matrix <- rbind(traits_A, traits_B, traits_C)
loc_data <- data.frame(
sample_id = paste0("LOC_", seq_len(nrow(traits_matrix))),
location = location,
traits_matrix
)
colnames(loc_data)[-(1:2)] <- paste0("Trait_", seq_len(n_traits))
head(loc_data)
## sample_id location Trait_1 Trait_2 Trait_3 Trait_4 Trait_5
## 1 LOC_1 LocA -1.2267678 -0.7630357 0.8876672 0.59117556 -0.1957244
## 2 LOC_2 LocA 0.1628424 -0.5032241 -0.1993700 0.07126979 0.8512445
## 3 LOC_3 LocA -1.3513255 -0.6145531 0.9950082 -0.14399184 -0.1342373
## 4 LOC_4 LocA 0.7356483 -0.3949018 0.9298862 1.57181404 -1.1403389
## 5 LOC_5 LocA 0.2760041 -1.3305879 0.5958680 -1.19653817 1.5699717
## 6 LOC_6 LocA -0.0915815 -0.3016421 -1.6813246 1.74697532 0.5915271
write.csv(loc_data, "location_unbalanced_data.csv", row.names = FALSE)
We now have four datasets you can share:
environmental_data.csvgenotype_data.csvexpression_twins_data.csvlocation_unbalanced_data.csvEach is used below.
set.seed(42)
Sigma_2d <- matrix(c(4, 3.5,
3.5, 4), 2, 2)
X <- MASS::mvrnorm(400, mu = c(0, 0), Sigma = Sigma_2d)
df2 <- data.frame(x1 = X[,1], x2 = X[,2])
ggplot(df2, aes(x1, x2)) +
geom_point(alpha = 0.3) +
coord_equal() + theme_pca +
labs(title = "Correlated 2D data: we want the 'long' direction")
Compute PCA and show axes:
pca2 <- prcomp(df2, center = TRUE, scale. = FALSE)
pca2$rotation
## PC1 PC2
## x1 -0.7124363 -0.7017368
## x2 -0.7017368 0.7124363
pca2$sdev^2
## [1] 6.9112536 0.4857434
pc1 <- pca2$rotation[,1]
pc2 <- pca2$rotation[,2]
ggplot(df2, aes(x1, x2)) +
geom_point(alpha = 0.3) +
geom_segment(aes(x = 0, y = 0,
xend = pc1[1]*3, yend = pc1[2]*3),
arrow = arrow(length = unit(0.15, "cm")),
color = "red") +
geom_segment(aes(x = 0, y = 0,
xend = pc2[1]*3, yend = pc2[2]*3),
arrow = arrow(length = unit(0.15, "cm")),
color = "blue") +
coord_equal() + theme_pca +
labs(title = "PCA axes: PC1 (red) = longest direction, PC2 (blue) orthogonal")
set.seed(1)
n <- 200
height <- rnorm(n, mean = 1700, sd = 200) # mm
weight <- rnorm(n, mean = 70, sd = 15) # kg
df_hw <- data.frame(height = height, weight = weight)
apply(df_hw, 2, sd)
## height weight
## 185.81945 15.16057
PCA without scaling:
df_hw
## height weight
## 1 1574.709 76.14103
## 2 1736.729 95.33310
## 3 1532.874 93.79883
## 4 2019.056 65.03638
## 5 1765.902 35.72147
## 6 1535.906 107.46492
## 7 1797.486 80.00599
## 8 1847.665 78.11991
## 9 1815.156 69.79901
## 10 1638.922 77.65163
## 11 2002.356 67.53436
## 12 1777.969 76.31042
## 13 1575.752 63.99630
## 14 1257.060 49.44688
## 15 1924.986 84.81757
## 16 1691.013 92.79618
## 17 1696.762 65.36889
## 18 1888.767 51.20065
## 19 1864.244 79.63362
## 20 1818.780 69.32936
## 21 1883.795 44.00172
## 22 1856.427 70.03198
## 23 1714.913 60.54549
## 24 1302.130 64.88547
## 25 1823.965 52.65141
## 26 1688.774 97.04713
## 27 1668.841 65.03302
## 28 1405.850 45.91730
## 29 1604.370 72.95790
## 30 1783.588 73.94763
## 31 1971.736 55.21260
## 32 1679.442 26.66619
## 33 1777.534 60.39277
## 34 1689.239 78.55761
## 35 1424.588 69.10415
## 36 1617.001 68.52732
## 37 1621.142 78.41231
## 38 1688.137 52.20312
## 39 1920.005 86.45166
## 40 1852.635 69.91984
## 41 1667.095 80.60966
## 42 1649.328 85.51162
## 43 1839.393 73.35221
## 44 1811.333 56.81939
## 45 1562.249 87.44447
## 46 1558.501 39.99753
## 47 1772.916 61.82814
## 48 1853.707 66.16494
## 49 1677.531 67.50818
## 50 1876.222 85.30696
## 51 1779.621 72.04333
## 52 1577.595 76.10751
## 53 1768.224 68.95518
## 54 1474.127 66.28503
## 55 1986.605 80.43326
## 56 2096.080 87.19343
## 57 1626.556 33.95356
## 58 1491.173 78.59109
## 59 1813.944 75.62087
## 60 1672.989 63.62098
## 61 2180.324 84.26519
## 62 1692.152 64.16144
## 63 1837.948 65.73504
## 64 1705.600 82.86115
## 65 1551.345 95.79441
## 66 1737.758 74.05082
## 67 1339.008 63.66724
## 68 1993.111 52.16330
## 69 1730.651 65.03451
## 70 2134.522 55.90256
## 71 1795.102 66.11601
## 72 1558.011 75.91569
## 73 1822.145 57.22214
## 74 1513.180 109.73750
## 75 1449.273 72.34018
## 76 1758.289 86.95311
## 77 1611.342 35.66314
## 78 1700.221 81.11502
## 79 1714.868 50.25632
## 80 1582.096 83.79706
## 81 1586.266 75.97195
## 82 1672.964 63.88707
## 83 1935.617 89.86388
## 84 1395.287 59.48152
## 85 1818.789 61.29079
## 86 1766.590 54.98392
## 87 1912.620 59.97732
## 88 1639.163 84.17777
## 89 1774.004 76.50553
## 90 1753.420 85.07739
## 91 1591.496 64.14822
## 92 1941.574 75.64555
## 93 1932.081 73.66247
## 94 1840.043 48.60614
## 95 2017.367 96.67644
## 96 1811.697 72.01671
## 97 1444.682 81.48398
## 98 1585.347 84.32705
## 99 1455.077 69.24151
## 100 1605.320 65.41277
## 101 1575.927 83.40511
## 102 1708.423 54.29053
## 103 1517.816 99.57006
## 104 1731.606 64.24552
## 105 1569.083 94.81218
## 106 2053.457 92.68319
## 107 1843.341 71.24449
## 108 1882.035 78.50831
## 109 1776.837 54.63177
## 110 2036.435 74.84510
## 111 1572.853 85.65419
## 112 1607.671 71.48618
## 113 1986.456 63.18795
## 114 1569.861 60.16327
## 115 1658.524 69.46116
## 116 1621.438 86.03742
## 117 1636.001 62.74038
## 118 1644.177 68.18485
## 119 1798.838 50.58790
## 120 1664.534 77.41469
## 121 1598.809 89.61852
## 122 1968.608 92.45562
## 123 1657.084 82.22054
## 124 1664.089 41.95317
## 125 1679.962 77.23044
## 126 1842.533 76.84203
## 127 1685.287 64.69900
## 128 1692.473 72.55734
## 129 1563.668 57.03946
## 130 1635.146 80.18846
## 131 1712.032 65.09348
## 132 1582.221 46.46377
## 133 1806.299 64.48824
## 134 1396.321 90.46652
## 135 1761.312 64.98578
## 136 1392.710 80.99125
## 137 1639.805 84.19878
## 138 1594.344 70.06598
## 139 1569.581 64.71517
## 140 1688.621 62.05457
## 141 1317.128 81.09384
## 142 1935.317 54.04814
## 143 1367.006 73.69316
## 144 1607.294 65.65751
## 145 1476.816 36.02666
## 146 1549.836 48.86724
## 147 2117.433 83.74029
## 148 1703.479 67.13082
## 149 1442.740 82.04925
## 150 1371.879 98.31212
## 151 1790.037 92.10822
## 152 1696.288 80.15903
## 153 1636.386 75.69944
## 154 1514.128 67.10802
## 155 1402.508 93.66838
## 156 1484.962 78.94351
## 157 1900.006 52.39635
## 158 1575.747 67.66536
## 159 1423.115 41.21635
## 160 2073.858 67.07112
## 161 1785.020 31.11508
## 162 1652.271 89.71003
## 163 1911.697 60.46685
## 164 1877.285 63.55032
## 165 1576.151 67.46023
## 166 2141.220 79.18327
## 167 1648.995 80.17510
## 168 1415.101 78.51928
## 169 1671.120 61.41186
## 170 1741.508 49.55063
## 171 2161.596 64.16917
## 172 1721.160 74.16871
## 173 1791.400 57.65378
## 174 1684.569 68.96739
## 175 1633.200 52.48507
## 176 1693.055 69.87536
## 177 1857.528 71.93283
## 178 2115.049 67.81187
## 179 1905.478 67.54134
## 180 1941.582 96.45328
## 181 1453.735 81.43880
## 182 1896.779 86.67147
## 183 1743.985 56.15190
## 184 1406.550 72.46513
## 185 1804.205 87.32238
## 186 1668.249 69.15218
## 187 1992.917 38.05959
## 188 1546.784 75.17269
## 189 1613.958 41.42567
## 190 1514.778 57.83245
## 191 1664.579 89.86006
## 192 1780.402 79.23455
## 193 1553.650 86.37503
## 194 1866.075 74.59907
## 195 1458.383 68.34762
## 196 1490.403 56.13531
## 197 1988.232 93.89371
## 198 1496.831 70.67516
## 199 1782.395 59.27307
## 200 1623.785 82.97835
pca_hw_ns <- prcomp(df_hw, center = TRUE, scale. = FALSE)
summary(pca_hw_ns)
## Importance of components:
## PC1 PC2
## Standard deviation 185.8197 15.15714
## Proportion of Variance 0.9934 0.00661
## Cumulative Proportion 0.9934 1.00000
pca_hw_ns$rotation
## PC1 PC2
## height 0.999998482 -0.001742428
## weight -0.001742428 -0.999998482
biplot(pca_hw_ns, main = "Height/Weight – PCA without scaling")
PCA with scaling:
pca_hw_sc <- prcomp(df_hw, center = TRUE, scale. = TRUE)
summary(pca_hw_sc)
## Importance of components:
## PC1 PC2
## Standard deviation 1.0106 0.9893
## Proportion of Variance 0.5106 0.4894
## Cumulative Proportion 0.5106 1.0000
pca_hw_sc$rotation
## PC1 PC2
## height 0.7071068 -0.7071068
## weight -0.7071068 -0.7071068
biplot(pca_hw_sc, main = "Height/Weight – PCA with scaling")
env <- read.csv("environmental_data.csv")
head(env)
## sample_id temp rainfall pH nitrate
## 1 ENV_1 11.63715 22.184751 6.963222 2.3612941
## 2 ENV_2 13.61894 14.242159 6.415674 0.9783600
## 3 ENV_3 24.35225 6.471627 6.682626 0.9736881
## 4 ENV_4 15.42305 9.694871 6.985579 0.2973474
## 5 ENV_5 15.77573 6.006427 7.335348 1.8819496
## 6 ENV_6 25.29039 5.823355 6.174727 0.8448575
apply(env[,-1], 2, sd)
## temp rainfall pH nitrate
## 5.6589593 4.5185444 0.4824026 1.2430787
PCA without scaling:
pca_env_ns <- prcomp(env[,-1], center = TRUE, scale. = FALSE)
summary(pca_env_ns)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 5.6628 4.5144 1.2410 0.48132
## Proportion of Variance 0.5914 0.3759 0.0284 0.00427
## Cumulative Proportion 0.5914 0.9673 0.9957 1.00000
pca_env_ns$rotation
## PC1 PC2 PC3 PC4
## temp 0.998175993 0.059994780 -0.006169313 -0.002693042
## rainfall -0.059915988 0.998077820 0.014535611 -0.006281396
## pH -0.002357740 -0.006338454 -0.006612133 -0.999955271
## nitrate -0.007014414 0.014181534 -0.999853457 0.006538106
scores_env_ns <- as.data.frame(pca_env_ns$x)
ggplot(scores_env_ns, aes(PC1, PC2)) +
geom_point(alpha = 0.6) +
theme_pca +
labs(title = "Environmental PCA – no scaling")
PCA with scaling:
pca_env_sc <- prcomp(env[,-1], center = TRUE, scale. = TRUE)
cor(env[,-1])
## temp rainfall pH nitrate
## temp 1.00000000 -0.02728268 -0.03023254 -0.02810249
## rainfall -0.02728268 1.00000000 -0.05647078 0.04976955
## pH -0.03023254 -0.05647078 1.00000000 0.01228194
## nitrate -0.02810249 0.04976955 0.01228194 1.00000000
summary(pca_env_sc)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.0371 1.0223 0.9836 0.955
## Proportion of Variance 0.2689 0.2613 0.2419 0.228
## Cumulative Proportion 0.2689 0.5301 0.7720 1.000
pca_env_sc$rotation
## PC1 PC2 PC3 PC4
## temp 0.3340527 0.5688132 0.6937431 -0.2891033
## rainfall -0.7077953 0.2034733 -0.1045353 -0.6683538
## pH 0.3077299 -0.7262464 0.2059168 -0.5791949
## nitrate -0.5410517 -0.3280492 0.6821958 0.3664092
scores_env_sc <- as.data.frame(pca_env_sc$x)
ggplot(scores_env_sc, aes(PC1, PC2)) +
geom_point(alpha = 0.6) +
theme_pca +
labs(title = "Environmental PCA – with scaling")
Loadings:
load_env <- as.data.frame(pca_env_sc$rotation)
load_env$variable <- rownames(load_env)
load_env
## PC1 PC2 PC3 PC4 variable
## temp 0.3340527 0.5688132 0.6937431 -0.2891033 temp
## rainfall -0.7077953 0.2034733 -0.1045353 -0.6683538 rainfall
## pH 0.3077299 -0.7262464 0.2059168 -0.5791949 pH
## nitrate -0.5410517 -0.3280492 0.6821958 0.3664092 nitrate
ggplot(load_env, aes(variable, PC1)) +
geom_bar(stat = "identity") +
theme_pca +
labs(title = "Environmental PCA – Loadings on PC1", x = "Variable", y = "Loading")
set.seed(101)
n_main <- 180
main_cloud <- MASS::mvrnorm(
n_main, mu = c(0,0),
Sigma = matrix(c(1, 0.2,
0.2, 1), 2, 2)
)
twin1 <- c(5, 5)
twin2 <- twin1 + rnorm(2, 0, 0.05)
df_twin <- rbind(main_cloud, twin1, twin2)
df_twin <- as.data.frame(df_twin)
colnames(df_twin) <- c("x1", "x2")
df_twin$label <- c(rep("main", n_main), "twin1", "twin2")
ggplot(df_twin, aes(x1, x2, color = label)) +
geom_point(size = 2, alpha = 0.8) +
coord_equal() + theme_pca +
labs(title = "Main cloud + 'twin' pair far away")
PCA with twins:
pca_with_twin <- prcomp(df_twin[,1:2], center = TRUE, scale. = FALSE)
pca_with_twin$rotation
## PC1 PC2
## x1 0.6865180 -0.7271128
## x2 0.7271128 0.6865180
pca_with_twin$sdev^2
## [1] 1.6935210 0.6624451
Visualize PCs:
pc1 <- pca_with_twin$rotation[,1]
pc2 <- pca_with_twin$rotation[,2]
ggplot(df_twin, aes(x1, x2, color = label)) +
geom_point(size = 2, alpha = 0.8) +
geom_segment(aes(x = 0, y = 0,
xend = pc1[1]*3, yend = pc1[2]*3),
arrow = arrow(length = unit(0.15, "cm")),
color = "black") +
geom_segment(aes(x = 0, y = 0,
xend = pc2[1]*3, yend = pc2[2]*3),
arrow = arrow(length = unit(0.15, "cm")),
color = "black") +
coord_equal() + theme_pca +
labs(title = "PC1 is influenced by the twin pair")
PCA without one twin:
df_no_twin <- subset(df_twin, label != "twin2")
pca_no_twin <- prcomp(df_no_twin[,1:2], center = TRUE, scale. = FALSE)
pca_no_twin$rotation
## PC1 PC2
## x1 0.6786967 -0.7344187
## x2 0.7344187 0.6786967
pca_no_twin$sdev^2
## [1] 1.4269620 0.6657978
expr_data <- read.csv("expression_twins_data.csv")
dim(expr_data)
## [1] 82 52
table(expr_data$group)
##
## Control Treatment Twin
## 40 40 2
PCA (samples × genes):
expr_matrix <- expr_data[,-c(1,2)]
pca_expr <- prcomp(expr_matrix, center = TRUE, scale. = TRUE)
scores_expr <- as.data.frame(pca_expr$x)
scores_expr$group <- expr_data$group
ggplot(scores_expr, aes(PC1, PC2, color = group)) +
geom_point(size = 2, alpha = 0.7) +
theme_pca +
labs(title = "Expression-like PCA with twins")
Remove one twin and re-run:
expr_no_twin <- expr_data[-nrow(expr_data), ] # drop last row (second twin)
expr_matrix2 <- expr_no_twin[,-c(1,2)]
pca_expr2 <- prcomp(expr_matrix2, center = TRUE, scale. = TRUE)
scores_expr2 <- as.data.frame(pca_expr2$x)
scores_expr2$group <- expr_no_twin$group
ggplot(scores_expr2, aes(PC1, PC2, color = group)) +
geom_point(size = 2, alpha = 0.7) +
theme_pca +
labs(title = "Expression-like PCA after removing one twin")
expr_data <- read.csv("expression_twins_data2.csv")
dim(expr_data)
## [1] 82 30002
table(expr_data$group)
##
## Control Treatment Twin
## 40 40 2
PCA (samples × genes):
expr_matrix <- expr_data[,-c(1,2)]
pca_expr <- prcomp(expr_matrix, center = TRUE, scale. = TRUE)
scores_expr <- as.data.frame(pca_expr$x)
scores_expr$group <- expr_data$group
ggplot(scores_expr, aes(PC1, PC2, color = group)) +
geom_point(size = 2, alpha = 0.7) +
theme_pca +
labs(title = "Expression-like PCA with twins (large dataset)")
Remove one twin and re-run:
expr_no_twin <- expr_data[-nrow(expr_data), ] # drop last row (second twin)
expr_matrix2 <- expr_no_twin[,-c(1,2)]
pca_expr2 <- prcomp(expr_matrix2, center = TRUE, scale. = TRUE)
scores_expr2 <- as.data.frame(pca_expr2$x)
scores_expr2$group <- expr_no_twin$group
ggplot(scores_expr2, aes(PC1, PC2, color = group)) +
geom_point(size = 2, alpha = 0.7) +
theme_pca +
labs(title = "Expression-like PCA after removing one twin")
loc_data <- read.csv("location_unbalanced_data.csv")
table(loc_data$location)
##
## LocA LocB LocC
## 80 20 20
head(loc_data)
## sample_id location Trait_1 Trait_2 Trait_3 Trait_4 Trait_5
## 1 LOC_1 LocA -1.2267678 -0.7630357 0.8876672 0.59117556 -0.1957244
## 2 LOC_2 LocA 0.1628424 -0.5032241 -0.1993700 0.07126979 0.8512445
## 3 LOC_3 LocA -1.3513255 -0.6145531 0.9950082 -0.14399184 -0.1342373
## 4 LOC_4 LocA 0.7356483 -0.3949018 0.9298862 1.57181404 -1.1403389
## 5 LOC_5 LocA 0.2760041 -1.3305879 0.5958680 -1.19653817 1.5699717
## 6 LOC_6 LocA -0.0915815 -0.3016421 -1.6813246 1.74697532 0.5915271
Run PCA (traits only):
pca_loc <- prcomp(loc_data[,-c(1,2)], center = TRUE, scale. = TRUE)
summary(pca_loc)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5
## Standard deviation 1.2069 1.0816 0.9853 0.9412 0.7191
## Proportion of Variance 0.2913 0.2340 0.1941 0.1772 0.1034
## Cumulative Proportion 0.2913 0.5253 0.7194 0.8966 1.0000
scores_loc <- as.data.frame(pca_loc$x)
scores_loc$location <- loc_data$location
ggplot(scores_loc, aes(PC1, PC2, color = location)) +
geom_point(size = 2, alpha = 0.7) +
theme_pca +
labs(title = "PCA with many samples from LocA")
Xc <- scale(df2, center = TRUE, scale = FALSE)
S <- cov(Xc)
S
## x1 x2
## x1 3.747110 3.212387
## x2 3.212387 3.649887
eig <- eigen(S)
eig$values
## [1] 6.9112536 0.4857434
eig$vectors
## [,1] [,2]
## [1,] -0.7124363 0.7017368
## [2,] -0.7017368 -0.7124363
pca2$sdev^2 # eigenvalues
## [1] 6.9112536 0.4857434
pca2$rotation # eigenvectors (up to sign)
## PC1 PC2
## x1 -0.7124363 -0.7017368
## x2 -0.7017368 0.7124363
set.seed(7)
Sigma3 <- matrix(c(5, 3, 1,
3, 4, 0.5,
1, 0.5, 1.5), 3, 3)
X3 <- MASS::mvrnorm(1000, mu = c(0,0,0), Sigma = Sigma3)
df3 <- as.data.frame(X3)
colnames(df3) <- paste0("x", 1:3)
pca3 <- prcomp(df3, center = TRUE, scale. = FALSE)
pca3$sdev^2
## [1] 7.462207 1.732005 1.081552
pca3$rotation
## PC1 PC2 PC3
## x1 -0.7595428 -0.4465788 0.4729293
## x2 -0.6225865 0.7096783 -0.3297616
## x3 -0.1883631 -0.5449074 -0.8170650
Monte Carlo demo:
set.seed(999)
cov3 <- cov(df3)
eigs3 <- eigen(cov3)
lambda_pca12 <- sum(eigs3$values[1:2])
n_rand <- 2000
total_var_rand <- numeric(n_rand)
for (i in seq_len(n_rand)) {
M <- matrix(rnorm(9), 3, 3)
Q <- qr.Q(qr(M))
V <- Q[,1:2]
total_var_rand[i] <- sum(diag(t(V) %*% cov3 %*% V))
}
max(total_var_rand)
## [1] 9.190394
lambda_pca12
## [1] 9.194212
Histogram:
df_rand <- data.frame(total_var = total_var_rand)
ggplot(df_rand, aes(total_var)) +
geom_histogram(bins = 40, alpha = 0.5) +
geom_vline(xintercept = lambda_pca12, color = "red", linetype = 2, size = 1) +
theme_pca +
labs(title = "Total variance in random 2D subspaces vs PCA(PC1+PC2)",
x = "Total variance in 2D subspace",
y = "Count")
Already run in Section 3.3; use it to discuss ecological gradients, etc.
geno_data <- read.csv("genotype_data.csv")
dim(geno_data)
## [1] 1500 52
table(geno_data$pop)
##
## PopA PopB PopC
## 500 500 500
geno_matrix <- as.matrix(geno_data[,-c(1,2)])
pca_geno <- prcomp(geno_matrix, center = TRUE, scale. = FALSE)
summary(pca_geno)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.638 0.64845 0.64306 0.63398 0.63174 0.62467 0.61448
## Proportion of Variance 0.154 0.02415 0.02375 0.02308 0.02292 0.02241 0.02168
## Cumulative Proportion 0.154 0.17818 0.20193 0.22501 0.24793 0.27033 0.29202
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.6134 0.61056 0.60942 0.60197 0.59916 0.59739 0.58784
## Proportion of Variance 0.0216 0.02141 0.02133 0.02081 0.02062 0.02049 0.01984
## Cumulative Proportion 0.3136 0.33503 0.35635 0.37716 0.39778 0.41827 0.43811
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.58347 0.57904 0.57335 0.57181 0.56854 0.56546 0.55902
## Proportion of Variance 0.01955 0.01925 0.01888 0.01878 0.01856 0.01836 0.01795
## Cumulative Proportion 0.45766 0.47692 0.49579 0.51457 0.53313 0.55149 0.56944
## PC22 PC23 PC24 PC25 PC26 PC27 PC28
## Standard deviation 0.55712 0.5537 0.5504 0.54757 0.54338 0.5409 0.5377
## Proportion of Variance 0.01782 0.0176 0.0174 0.01722 0.01695 0.0168 0.0166
## Cumulative Proportion 0.58726 0.6049 0.6223 0.63948 0.65644 0.6732 0.6898
## PC29 PC30 PC31 PC32 PC33 PC34 PC35
## Standard deviation 0.53634 0.52869 0.5278 0.52425 0.52325 0.51713 0.5110
## Proportion of Variance 0.01652 0.01605 0.0160 0.01578 0.01572 0.01536 0.0150
## Cumulative Proportion 0.70636 0.72241 0.7384 0.75419 0.76991 0.78527 0.8003
## PC36 PC37 PC38 PC39 PC40 PC41 PC42
## Standard deviation 0.5076 0.50334 0.50141 0.50094 0.49610 0.49241 0.49069
## Proportion of Variance 0.0148 0.01455 0.01444 0.01441 0.01413 0.01392 0.01383
## Cumulative Proportion 0.8151 0.82961 0.84405 0.85846 0.87259 0.88651 0.90034
## PC43 PC44 PC45 PC46 PC47 PC48 PC49
## Standard deviation 0.48427 0.47678 0.4703 0.46767 0.46431 0.46050 0.45392
## Proportion of Variance 0.01347 0.01305 0.0127 0.01256 0.01238 0.01218 0.01183
## Cumulative Proportion 0.91381 0.92686 0.9396 0.95212 0.96450 0.97668 0.98851
## PC50
## Standard deviation 0.44737
## Proportion of Variance 0.01149
## Cumulative Proportion 1.00000
scores_geno <- as.data.frame(pca_geno$x)
scores_geno$pop <- geno_data$pop
ggplot(scores_geno, aes(PC1, PC2, color = pop)) +
geom_point(size = 2, alpha = 0.7) +
theme_pca +
labs(title = "Genotype PCA – population structure")
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
iris_num <- iris[,1:4]
pca_iris <- prcomp(iris_num, center = TRUE, scale. = TRUE)
summary(pca_iris)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.7084 0.9560 0.38309 0.14393
## Proportion of Variance 0.7296 0.2285 0.03669 0.00518
## Cumulative Proportion 0.7296 0.9581 0.99482 1.00000
scores_iris <- as.data.frame(pca_iris$x)
scores_iris$Species <- iris$Species
ggplot(scores_iris, aes(PC1, PC2, color = Species)) +
geom_point(size = 2, alpha = 0.8) +
theme_pca +
labs(title = "Iris morphometric PCA", x = "PC1", y = "PC2")
load_iris <- as.data.frame(pca_iris$rotation)
load_iris$Trait <- rownames(load_iris)
ggplot(load_iris, aes(Trait, PC1)) +
geom_bar(stat = "identity") +
theme_pca +
labs(title = "Iris PCA – Loadings on PC1", x = "Trait", y = "Loading")
Key points:
Environmental scaling
environmental_data.csv.Effect of twins on expression PCA
expression_twins_data.csv.group.Sampling imbalance
location_unbalanced_data.csv.location).loc_data <- read.csv("location_unbalanced_data.csv")
set.seed(1)
subA <- subset(loc_data, location == "LocA")
subA <- subA[sample(nrow(subA), 20), ]
subBC <- subset(loc_data, location != "LocA")
loc_balanced <- rbind(subA, subBC)
genotype_data.csv.pop).environmental_data.csv.