expr_matrix_file = "test.matrix" #Trinity_trans.TMM.EXPR.matrix"
data = read.table(expr_matrix_file)
log2data = log2(data+1)
Scale the expression data in the columns according to Z-scores and center at zero.
scaled_log2data = scale(log2data, center=T, scale=T)
Now center the expression for each gene
pca_data = t(scale(t(scaled_log2data), center=T, scale=F))
Consider remvoing those genes that have low variance
trans_sd = apply(pca_data, 1, sd)
hist(trans_sd, br=30)
First, transpose the matrix so that the genes (the observations) are the columns and the samples are the rows.
pca_data = t(pca_data)
Now, following how the ‘prcomp’ works under the hood: https://svn.r-project.org/R/trunk/src/library/stats/R/prcomp.R
s = svd(pca_data, nu=0)
sdev = s$d / sqrt(max(1, nrow(pca_data) - 1))
rotation = s$v # eigenvalues or weights or loadings
pc_scores = pca_data %*% rotation
head(rotation)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0.015380900 0.115042231 -0.04071386 -0.016166758 0.026375262
## [2,] 0.063644195 0.117248462 -0.01831308 0.011639030 0.032484166
## [3,] -0.137278043 0.059009336 -0.18996903 0.064540927 -0.001172617
## [4,] -0.156316119 -0.076723777 -0.01728652 0.023962771 -0.026674647
## [5,] 0.001916858 -0.048681511 -0.05613182 -0.012680826 0.012019248
## [6,] -0.032518032 -0.004278177 0.01154936 -0.002433161 0.009549742
## [,6] [,7] [,8] [,9]
## [1,] 0.0279106064 -0.016828122 -0.048915677 0.08417830
## [2,] 0.0354375099 -0.006320862 -0.049945737 0.09110938
## [3,] -0.0691692466 0.198356624 -0.003011987 -0.41107232
## [4,] -0.0843086959 -0.073580533 -0.022794484 0.11443270
## [5,] 0.0885567243 -0.147198369 0.066128228 -0.58500436
## [6,] -0.0004730328 0.010681301 -0.018931128 -0.27937977
head(pc_scores)
## [,1] [,2] [,3] [,4] [,5]
## wt_37_1 -6.717970 0.7209640 1.10175522 1.2286432 2.64543507
## wt_37_2 -6.924955 0.2120236 -0.87416865 -0.4582072 -0.72656328
## wt_37_3 -7.269614 0.3572181 -0.07139531 -0.6523621 -1.83772671
## wt_GSNO_1 2.957313 -3.0995315 3.23410554 -1.5778476 0.04694307
## wt_GSNO_2 3.127170 -3.9599796 0.30566905 2.9802351 -1.17675164
## wt_GSNO_3 2.543771 -4.5140454 -3.02848432 -1.3970307 1.16249076
## [,6] [,7] [,8] [,9]
## wt_37_1 -0.27315091 0.6200814 -0.2403687 7.632783e-16
## wt_37_2 1.72318626 -1.6070618 -0.6000073 6.561592e-16
## wt_37_3 -1.25805546 0.9670404 0.9112224 2.341877e-17
## wt_GSNO_1 -0.86443777 -0.9196316 -0.2327268 9.799561e-16
## wt_GSNO_2 0.50682366 0.2246499 0.0313378 1.017957e-15
## wt_GSNO_3 -0.01217303 0.5283262 0.1151434 6.533402e-16
plot(pc_scores[,1], pc_scores[,2])
Of course, you could just run the ‘prcomp’ like so:
pca = prcomp(pca_data)
rotation = pca$rotation
pc_scores = pca$x
head(rotation)
## PC1 PC2 PC3 PC4
## TR24|c0_g1_i1 0.015380900 0.115042231 -0.04071386 -0.016166758
## TR2779|c0_g1_i1 0.063644195 0.117248462 -0.01831308 0.011639030
## TR127|c1_g1_i1 -0.137278043 0.059009336 -0.18996903 0.064540927
## TR2107|c1_g1_i1 -0.156316119 -0.076723777 -0.01728652 0.023962771
## TR2011|c5_g1_i1 0.001916858 -0.048681511 -0.05613182 -0.012680826
## TR4163|c0_g1_i1 -0.032518032 -0.004278177 0.01154936 -0.002433161
## PC5 PC6 PC7 PC8
## TR24|c0_g1_i1 0.026375262 0.0279106064 -0.016828122 -0.048915677
## TR2779|c0_g1_i1 0.032484166 0.0354375099 -0.006320862 -0.049945737
## TR127|c1_g1_i1 -0.001172617 -0.0691692466 0.198356624 -0.003011987
## TR2107|c1_g1_i1 -0.026674647 -0.0843086959 -0.073580533 -0.022794484
## TR2011|c5_g1_i1 0.012019248 0.0885567243 -0.147198369 0.066128228
## TR4163|c0_g1_i1 0.009549742 -0.0004730328 0.010681301 -0.018931128
## PC9
## TR24|c0_g1_i1 0.01884967
## TR2779|c0_g1_i1 0.10910829
## TR127|c1_g1_i1 -0.44206123
## TR2107|c1_g1_i1 0.31352682
## TR2011|c5_g1_i1 -0.71080312
## TR4163|c0_g1_i1 -0.31861443
head(pc_scores)
## PC1 PC2 PC3 PC4 PC5
## wt_37_1 -6.717970 0.7209640 1.10175522 1.2286432 2.64543507
## wt_37_2 -6.924955 0.2120236 -0.87416865 -0.4582072 -0.72656328
## wt_37_3 -7.269614 0.3572181 -0.07139531 -0.6523621 -1.83772671
## wt_GSNO_1 2.957313 -3.0995315 3.23410554 -1.5778476 0.04694307
## wt_GSNO_2 3.127170 -3.9599796 0.30566905 2.9802351 -1.17675164
## wt_GSNO_3 2.543771 -4.5140454 -3.02848432 -1.3970307 1.16249076
## PC6 PC7 PC8 PC9
## wt_37_1 -0.27315091 0.6200814 -0.2403687 1.158145e-15
## wt_37_2 1.72318626 -1.6070618 -0.6000073 8.080559e-16
## wt_37_3 -1.25805546 0.9670404 0.9112224 1.113259e-15
## wt_GSNO_1 -0.86443777 -0.9196316 -0.2327268 -5.809223e-16
## wt_GSNO_2 0.50682366 0.2246499 0.0313378 1.206893e-15
## wt_GSNO_3 -0.01217303 0.5283262 0.1151434 -5.705614e-16
plot(pc_scores[,1], pc_scores[,2])
Another way to do this the longer way is to use eigen to extract the eigenvalues and eigenvectors from a correlation matrix
trans_cov_matrix = cov(pca_data)
trans_cov_matrix[1:5,1:5] # just for show
## TR24|c0_g1_i1 TR2779|c0_g1_i1 TR127|c1_g1_i1
## TR24|c0_g1_i1 0.15188295 0.17170011 0.02597495
## TR2779|c0_g1_i1 0.17170011 0.25956606 -0.16405131
## TR127|c1_g1_i1 0.02597495 -0.16405131 0.73150779
## TR2107|c1_g1_i1 -0.15814030 -0.37080771 0.55359069
## TR2011|c5_g1_i1 -0.04382272 -0.04770746 -0.04861672
## TR2107|c1_g1_i1 TR2011|c5_g1_i1
## TR24|c0_g1_i1 -0.15814030 -0.04382272
## TR2779|c0_g1_i1 -0.37080771 -0.04770746
## TR127|c1_g1_i1 0.55359069 -0.04861672
## TR2107|c1_g1_i1 0.75789618 0.03199903
## TR2011|c5_g1_i1 0.03199903 0.07460541
myEig = eigen(trans_cov_matrix)
names(myEig)
## [1] "values" "vectors"
#head(myEig$vectors)
scores = pca_data %*% myEig$vectors
scores[,1:15]
## [,1] [,2] [,3] [,4] [,5]
## wt_37_1 -6.717970 -0.7209640 -1.10175522 -1.2286432 -2.64543507
## wt_37_2 -6.924955 -0.2120236 0.87416865 0.4582072 0.72656328
## wt_37_3 -7.269614 -0.3572181 0.07139531 0.6523621 1.83772671
## wt_GSNO_1 2.957313 3.0995315 -3.23410554 1.5778476 -0.04694307
## wt_GSNO_2 3.127170 3.9599796 -0.30566905 -2.9802351 1.17675164
## wt_GSNO_3 2.543771 4.5140454 3.02848432 1.3970307 -1.16249076
## wt_ph8_1 4.322625 -3.0894574 -1.12777727 0.9942290 0.25213595
## wt_ph8_2 4.377269 -3.5323629 0.53911967 -0.5235525 -0.70512047
## wt_ph8_3 3.584390 -3.6615306 1.25613912 -0.3472459 0.56681179
## [,6] [,7] [,8] [,9] [,10]
## wt_37_1 -0.27315091 0.620081425 -0.24036870 9.107298e-18 -5.142750e-15
## wt_37_2 1.72318626 -1.607061817 -0.60000732 1.426376e-15 -5.995665e-15
## wt_37_3 -1.25805546 0.967040377 0.91122239 -1.443290e-15 -1.029206e-15
## wt_GSNO_1 -0.86443777 -0.919631635 -0.23272681 8.413409e-17 9.834053e-15
## wt_GSNO_2 0.50682366 0.224649859 0.03133780 1.245965e-15 2.790184e-15
## wt_GSNO_3 -0.01217303 0.528326169 0.11514339 1.455650e-15 -8.487595e-15
## wt_ph8_1 1.95376326 1.547826894 -0.04378211 -1.973248e-16 2.621248e-15
## wt_ph8_2 -0.15742830 -1.351335589 1.42379318 -2.094679e-16 4.785397e-15
## wt_ph8_3 -1.61852770 -0.009895683 -1.36461182 -3.400058e-16 3.783866e-17
## [,11] [,12] [,13] [,14]
## wt_37_1 8.456777e-16 -2.537033e-15 -7.502679e-16 7.615436e-16
## wt_37_2 6.062859e-16 -5.195497e-16 -8.153200e-17 5.577136e-16
## wt_37_3 9.679757e-16 -2.173609e-15 -1.961972e-15 8.743006e-16
## wt_GSNO_1 2.023989e-15 4.684946e-15 7.927686e-16 -6.292709e-16
## wt_GSNO_2 1.346579e-16 1.796848e-15 1.734723e-16 -4.098284e-16
## wt_GSNO_3 7.602426e-16 -3.240030e-15 6.409803e-16 3.035766e-17
## wt_ph8_1 -8.907805e-16 6.756748e-16 1.186551e-15 -3.764350e-16
## wt_ph8_2 -2.470246e-15 1.885644e-15 4.926615e-16 -1.368697e-15
## wt_ph8_3 -1.434616e-15 -1.150989e-15 7.112366e-17 -9.523632e-16
## [,15]
## wt_37_1 1.731254e-15
## wt_37_2 3.070461e-16
## wt_37_3 -6.071532e-16
## wt_GSNO_1 -4.774826e-15
## wt_GSNO_2 -3.472049e-15
## wt_GSNO_3 1.674876e-15
## wt_ph8_1 2.067790e-15
## wt_ph8_2 1.318390e-16
## wt_ph8_3 3.665471e-15
plot(scores[,1], scores[,2])