exercise 6

Chens Beispiel

Hier müsste man mal jemand anderen fragen

set.seed(99)
x <- matrix(rnorm(15), 5, 3)

prin <- princomp(x)

summary(prin)
## Importance of components:
##                        Comp.1  Comp.2 Comp.3
## Standard deviation     1.6671 0.55137 0.2682
## Proportion of Variance 0.8808 0.09636 0.0228
## Cumulative Proportion  0.8808 0.97720 1.0000

prin$laodings
## NULL
prin$scores
##       Comp.1  Comp.2     Comp.3
## [1,] -0.2853 -0.4550 -5.001e-06
## [2,] -1.7206  0.8594  1.899e-01
## [3,] -1.8083 -0.4837 -2.607e-01
## [4,]  1.5139 -0.3710  3.911e-01
## [5,]  2.3004  0.4503 -3.203e-01


pr <- prcomp(x)
pr$rotation
##          PC1      PC2      PC3
## [1,] 0.08693 -0.03664 -0.99554
## [2,] 0.21320 -0.97549  0.05451
## [3,] 0.97313  0.21699  0.07699
predict(pr)
##          PC1     PC2        PC3
## [1,]  0.2853 -0.4550  5.001e-06
## [2,]  1.7206  0.8594 -1.899e-01
## [3,]  1.8083 -0.4837  2.607e-01
## [4,] -1.5139 -0.3710 -3.911e-01
## [5,] -2.3004  0.4503  3.203e-01

pr2 <- prcomp(t(x))

biplot(prin)

plot of chunk unnamed-chunk-1


layout(matrix(1:2, 2, 1))
plot(prin$scores[, 1:2], xlim = c(-3, 3), ylim = c(-2, 2))
points(prin$loadings[, 1:2], pch = 20)

plot of chunk unnamed-chunk-1

An example from HSAUR

library(HSAUR)
## Loading required package: tools
data("heptathlon", package = "HSAUR")

heptathloaaAqqn$hurdles <- max(heptathlon$hurdles) - heptathlon$hurdles
## Error: object 'heptathloaaAqqn' not found
heptathlon$run200m <- max(heptathlon$run200m) - heptathlon$run200m
heptathlon$run800m <- max(heptathlon$run800m) - heptathlon$run800m
score <- which(colnames(heptathlon) == "score")

plot(heptathlon[, -score])

plot of chunk unnamed-chunk-2


round(cor(heptathlon[, -score]), 2)
##          hurdles highjump  shot run200m longjump javelin run800m
## hurdles     1.00    -0.81 -0.65   -0.77    -0.91   -0.01   -0.78
## highjump   -0.81     1.00  0.44    0.49     0.78    0.00    0.59
## shot       -0.65     0.44  1.00    0.68     0.74    0.27    0.42
## run200m    -0.77     0.49  0.68    1.00     0.82    0.33    0.62
## longjump   -0.91     0.78  0.74    0.82     1.00    0.07    0.70
## javelin    -0.01     0.00  0.27    0.33     0.07    1.00   -0.02
## run800m    -0.78     0.59  0.42    0.62     0.70   -0.02    1.00

How do you interpret the result? There are positive values and negative value in the correlation matrix, what does it mean?

Negativ d.h. mehr Score –> weniger sprünge o.ä.

Look into the graphical representation of correlation matrix in the cases of negative values and positive values, respectively. What do you find?

heptathlon_pca <- prcomp(heptathlon[, -score], scale = TRUE)
print(heptathlon_pca)
## Standard deviations:
## [1] 2.1119 1.0928 0.7218 0.6761 0.4952 0.2701 0.2214
## 
## Rotation:
##               PC1      PC2      PC3      PC4      PC5      PC6      PC7
## hurdles   0.45287 -0.15792 -0.04515  0.02654 -0.09495 -0.78334 -0.38025
## highjump -0.37720  0.24807  0.36778 -0.67999 -0.01880 -0.09940 -0.43393
## shot     -0.36307 -0.28941 -0.67619 -0.12432 -0.51165  0.05086 -0.21762
## run200m  -0.40790 -0.26039 -0.08359  0.36107  0.64983 -0.02496 -0.45338
## longjump -0.45623  0.05587 -0.13932 -0.11129  0.18430 -0.59021  0.61206
## javelin  -0.07541 -0.84169  0.47156 -0.12080 -0.13511  0.02724  0.17295
## run800m  -0.37496  0.22449  0.39586  0.60341 -0.50432 -0.15556 -0.09831
plot(heptathlon_pca)

plot of chunk unnamed-chunk-3

summary(heptathlon_pca)
## Importance of components:
##                          PC1   PC2    PC3    PC4   PC5    PC6   PC7
## Standard deviation     2.112 1.093 0.7218 0.6761 0.495 0.2701 0.221
## Proportion of Variance 0.637 0.171 0.0744 0.0653 0.035 0.0104 0.007
## Cumulative Proportion  0.637 0.808 0.8822 0.9475 0.983 0.9930 1.000
a1 <- heptathlon_pca$rotation[, 1]
a1
##  hurdles highjump     shot  run200m longjump  javelin  run800m 
##  0.45287 -0.37720 -0.36307 -0.40790 -0.45623 -0.07541 -0.37496


#### Extracting the first principal component score from all pre-computed
#### principal components
predict(heptathlon_pca)[, 1]
## Joyner-Kersee (USA)          John (GDR)        Behmer (GDR) 
##           -4.121448           -2.882186           -2.649634 
##  Sablovskaite (URS)   Choubenkova (URS)        Schulz (GDR) 
##           -1.343351           -1.359026           -1.043847 
##       Fleming (AUS)       Greiner (USA)    Lajbnerova (CZE) 
##           -1.100386           -0.923174           -0.530251 
##       Bouraga (URS)       Wijnsma (HOL)     Dimitrova (BUL) 
##           -0.759819           -0.556268           -1.186454 
##      Scheider (SWI)         Braun (FRG)  Ruotsalainen (FIN) 
##            0.015461            0.003774            0.090748 
##        Yuping (CHN)         Hagger (GB)         Brown (USA) 
##           -0.137225            0.171129            0.519253 
##       Mulliner (GB)    Hautenauve (BEL)        Kytola (FIN) 
##            1.125482            1.085698            1.447055 
##      Geremias (BRA)       Hui-Ing (TAI)      Jeong-Mi (KOR) 
##            2.014030            2.880299            2.970119 
##         Launa (PNG) 
##            6.270022
heptathlon_pca$x[, 1]
## Joyner-Kersee (USA)          John (GDR)        Behmer (GDR) 
##           -4.121448           -2.882186           -2.649634 
##  Sablovskaite (URS)   Choubenkova (URS)        Schulz (GDR) 
##           -1.343351           -1.359026           -1.043847 
##       Fleming (AUS)       Greiner (USA)    Lajbnerova (CZE) 
##           -1.100386           -0.923174           -0.530251 
##       Bouraga (URS)       Wijnsma (HOL)     Dimitrova (BUL) 
##           -0.759819           -0.556268           -1.186454 
##      Scheider (SWI)         Braun (FRG)  Ruotsalainen (FIN) 
##            0.015461            0.003774            0.090748 
##        Yuping (CHN)         Hagger (GB)         Brown (USA) 
##           -0.137225            0.171129            0.519253 
##       Mulliner (GB)    Hautenauve (BEL)        Kytola (FIN) 
##            1.125482            1.085698            1.447055 
##      Geremias (BRA)       Hui-Ing (TAI)      Jeong-Mi (KOR) 
##            2.014030            2.880299            2.970119 
##         Launa (PNG) 
##            6.270022

center <- heptathlon_pca$center
scale <- heptathlon_pca$scale

hm <- as.matrix(heptathlon[, -score])
drop((scale(hm, center = center, scale = scale) %*% heptathlon_pca$rotation)[, 
    1])
## Joyner-Kersee (USA)          John (GDR)        Behmer (GDR) 
##           -4.121448           -2.882186           -2.649634 
##  Sablovskaite (URS)   Choubenkova (URS)        Schulz (GDR) 
##           -1.343351           -1.359026           -1.043847 
##       Fleming (AUS)       Greiner (USA)    Lajbnerova (CZE) 
##           -1.100386           -0.923174           -0.530251 
##       Bouraga (URS)       Wijnsma (HOL)     Dimitrova (BUL) 
##           -0.759819           -0.556268           -1.186454 
##      Scheider (SWI)         Braun (FRG)  Ruotsalainen (FIN) 
##            0.015461            0.003774            0.090748 
##        Yuping (CHN)         Hagger (GB)         Brown (USA) 
##           -0.137225            0.171129            0.519253 
##       Mulliner (GB)    Hautenauve (BEL)        Kytola (FIN) 
##            1.125482            1.085698            1.447055 
##      Geremias (BRA)       Hui-Ing (TAI)      Jeong-Mi (KOR) 
##            2.014030            2.880299            2.970119 
##         Launa (PNG) 
##            6.270022

cor(heptathlon$score, heptathlon_pca$x[, 1])
## [1] -0.9911
plot(heptathlon$score, heptathlon_pca$x[, 1])

plot of chunk unnamed-chunk-3


biplot(heptathlon_pca, col = c("gray", "black"))

plot of chunk unnamed-chunk-3

Selbständige Aufgaben

Cancer cell line data Load the tumor cell line data from module 4 and do PCA:

load("~/Dropbox/Uni/Master/HT_Course/Module_4_ANOVA/tumorCellLines_anova.RData")

How should you scale the data, row-wise or column-wise?

Perform PCA.

tpcaTumors = prcomp(t(tumorCellLines), center = T, scale = T)

summary(tpcaTumors)
## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     32.646 29.844 19.9477 18.0815 15.9164 14.9927
## Proportion of Variance  0.187  0.156  0.0697  0.0573  0.0444  0.0394
## Cumulative Proportion   0.187  0.343  0.4123  0.4696  0.5139  0.5533
##                            PC7     PC8     PC9    PC10    PC11    PC12
## Standard deviation     14.4040 13.8198 12.9850 12.8974 12.7450 12.0700
## Proportion of Variance  0.0363  0.0335  0.0295  0.0291  0.0284  0.0255
## Cumulative Proportion   0.5896  0.6231  0.6526  0.6817  0.7102  0.7357
##                           PC13    PC14    PC15    PC16    PC17    PC18
## Standard deviation     12.0122 11.6009 11.2489 11.0931 10.7450 10.6577
## Proportion of Variance  0.0253  0.0236  0.0222  0.0215  0.0202  0.0199
## Cumulative Proportion   0.7610  0.7845  0.8067  0.8283  0.8485  0.8684
##                           PC19    PC20   PC21   PC22   PC23   PC24   PC25
## Standard deviation     10.5118 10.2072 9.8323 9.6369 9.2773 9.0553 8.6450
## Proportion of Variance  0.0193  0.0182 0.0169 0.0163 0.0151 0.0144 0.0131
## Cumulative Proportion   0.8877  0.9060 0.9229 0.9392 0.9542 0.9686 0.9817
##                          PC26    PC27     PC28
## Standard deviation     8.1777 6.13620 3.63e-14
## Proportion of Variance 0.0117 0.00659 0.00e+00
## Cumulative Proportion  0.9934 1.00000 1.00e+00
plot(tpcaTumors)

plot of chunk unnamed-chunk-5

How much variance are explained by first two PCs?

0.48 + 0.11

Plot cell lines on PC1 and PC2. Using different colors indicate different cancer type.

coln <- colnames(tumorCellLines)

coln_split <- strsplit(coln, split = "\\.")

cancerType <- sapply(coln_split, function(x) x[1])
fac <- factor(cancerType)
print(fac)
##  [1] CO CO CO CO CO CO CO ME ME ME ME ME ME ME OV OV OV OV OV OV OV RE RE
## [24] RE RE RE RE RE
## Levels: CO ME OV RE

plot(tpcaTumors$x[, 1], tpcaTumors$x[, 2])

plot of chunk unnamed-chunk-6

plot(predict(tpcaTumors), col = as.numeric(fac), pch = as.numeric(fac))
legend("bottomright", legend = levels(fac), col = unique(as.numeric(fac)), pch = unique(as.numeric(fac)))

plot of chunk unnamed-chunk-6


Which cancer type is most different with others?

How can you detect the genes most strongly associated with melanoma cell lines?