class 6

Load data

# Attach MASS (which provides the `crabs` data set), load the data
# and print a per-column summary.
library("MASS")
data("crabs")
summary(crabs)

##  sp      sex         index            FL             RW      
##  B:100   F:100   Min.   : 1.0   Min.   : 7.2   Min.   : 6.5  
##  O:100   M:100   1st Qu.:13.0   1st Qu.:12.9   1st Qu.:11.0  
##                  Median :25.5   Median :15.6   Median :12.8  
##                  Mean   :25.5   Mean   :15.6   Mean   :12.7  
##                  3rd Qu.:38.0   3rd Qu.:18.1   3rd Qu.:14.3  
##                  Max.   :50.0   Max.   :23.1   Max.   :20.2  
##        CL             CW             BD      
##  Min.   :14.7   Min.   :17.1   Min.   : 6.1  
##  1st Qu.:27.3   1st Qu.:31.5   1st Qu.:11.4  
##  Median :32.1   Median :36.8   Median :13.9  
##  Mean   :32.1   Mean   :36.4   Mean   :14.0  
##  3rd Qu.:37.2   3rd Qu.:42.0   3rd Qu.:16.6  
##  Max.   :47.6   Max.   :54.6   Max.   :21.6

create name

# Build one "<sp>.<sex>" label per row and convert the labels
# into a grouping factor.
nam <- paste(crabs$sp, crabs$sex, sep = ".")
fac <- factor(nam)

plot

# Side-by-side box plots of the first three measurements (FL, RW, CL).
boxplot(crabs[, c("FL", "RW", "CL")])

plot of chunk unnamed-chunk-3

create new data.frame

# Combine the grouping factor with the five measurement columns.
mydataframe <- data.frame(
  mytype = fac,
  crabs[, c("FL", "RW", "CL", "CW", "BD")]
)

Plot the data

# FL split by species/sex group, one fill colour per group.
boxplot(FL ~ mytype, data = mydataframe, col = c(2, 3, 4, 7))

plot of chunk unnamed-chunk-5

More plots

Separate the groups in the graph using different colours and symbols

# Scatter plot of FL against RW; the integer code of each group
# selects both its colour and its plotting symbol.
plot(crabs$FL, crabs$RW, col = as.integer(fac), pch = as.integer(fac))

plot of chunk unnamed-chunk-6

Do all the plots at the same time

# Scatter-plot matrix of all measurement columns (everything except
# sp, sex and index), coloured and marked by group.
pairs(crabs[, 4:8], col = as.integer(fac), pch = as.integer(fac))

plot of chunk unnamed-chunk-7

As in the plots above, we observe that RW behaves differently from the other variables

Load pca stuff

Start with PCA

# PCA on the raw (unscaled) measurement columns, followed by the
# per-component variance summary.
library(stats)
pcaCrabs <- princomp(crabs[, 4:8])
summary(pcaCrabs)

## Importance of components:
##                         Comp.1   Comp.2   Comp.3    Comp.4   Comp.5
## Standard deviation     11.8323 1.135937 0.997631 0.3669098 0.278433
## Proportion of Variance  0.9825 0.009055 0.006984 0.0009447 0.000544
## Cumulative Proportion   0.9825 0.991527 0.998511 0.9994560 1.000000


# Scree plot: one bar per principal component, bar height = its variance.
screeplot(pcaCrabs)

plot of chunk unnamed-chunk-8

The proportion of variance tells how much of the total variance is explained by each principal component
die Höhe der Balken im Balkendiagramm (Screeplot) entspricht den Eigenwerten

Rescale the PCA

# Repeat the PCA on centred and scaled columns so that no variable
# dominates purely because of its measurement range.
pcaCrabs <- princomp(scale(crabs[, 4:8]))
summary(pcaCrabs)

## Importance of components:
##                        Comp.1  Comp.2   Comp.3   Comp.4    Comp.5
## Standard deviation     2.1829 0.38849 0.215406 0.105260 0.0412689
## Proportion of Variance 0.9578 0.03034 0.009327 0.002227 0.0003423
## Cumulative Proportion  0.9578 0.98810 0.997431 0.999658 1.0000000

# Scree plot of the PCA computed on the scaled data.
screeplot(pcaCrabs)

plot of chunk unnamed-chunk-9

In a PCA the original variables don't exist anymore; they are replaced by the principal components

# Biplot of the first two components; observations are labelled with
# their group name instead of their row number.
biplot(pcaCrabs, xlabs = as.character(fac))

plot of chunk unnamed-chunk-10

The red arrows represent the original variables (the columns of the data)
In this plot you can recognize clustered values -> RW behaves differently from the other variables
pcaCrabs$scores holds the coordinates of the observations in the component space

adjusting the plot with colors and shapes

# Scores on components 1 and 2, coloured and marked by group.
plot(pcaCrabs$scores[, c(1, 2)], pch = as.integer(fac), col = as.integer(fac))

plot of chunk unnamed-chunk-11

showing two variables without a special direction

# biplot(pcaCrabs,choices= c(2,3),xlabs= as.numeric(fac))
# biplot(pcaCrabs,choices= c(1,4),xlabs= as.numeric(fac))


# Integer codes of the groups in order of first appearance in the data
# (this is not necessarily the order of levels(fac)).
unique(as.integer(fac))

## [1] 2 1 4 3


# Scores on components 1 and 4, coloured and marked by group.
plot(pcaCrabs$scores[, c(1, 4)], col = as.numeric(fac), pch = as.numeric(fac))
# The points use as.numeric(fac), i.e. level i is drawn with colour/symbol i,
# so the legend must pair levels(fac) with 1..nlevels in level order.
legend("topright", legend = levels(fac), col = seq_along(levels(fac)),
    pch = seq_along(levels(fac)))

plot of chunk unnamed-chunk-12



# Scores on components 1 and 3, coloured and marked by group.
plot(pcaCrabs$scores[, c(1, 3)], col = as.numeric(fac), pch = as.numeric(fac))
# The points use as.numeric(fac), i.e. level i is drawn with colour/symbol i,
# so the legend must pair levels(fac) with 1..nlevels in level order.
legend("topright", legend = levels(fac), col = seq_along(levels(fac)),
    pch = seq_along(levels(fac)))

plot of chunk unnamed-chunk-12


# Scores on components 2 and 3, coloured and marked by group.
plot(pcaCrabs$scores[, c(2, 3)], col = as.numeric(fac), pch = as.numeric(fac))
# The points use as.numeric(fac), i.e. level i is drawn with colour/symbol i,
# so the legend must pair levels(fac) with 1..nlevels in level order.
legend("topright", legend = levels(fac), col = seq_along(levels(fac)),
    pch = seq_along(levels(fac)))

plot of chunk unnamed-chunk-12


# Biplot of components 2 and 3; observations are labelled with the
# integer code of their group.
biplot(pcaCrabs, xlabs = as.integer(fac), choices = c(2, 3))

plot of chunk unnamed-chunk-12

You have to look at all dimensions to find the real distinct difference
bad

# Print the loadings, i.e. the weight of every original variable in
# each principal component.
print(loadings(pcaCrabs))

## 
## Loadings:
##    Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## FL -0.452 -0.138  0.531  0.697       
## RW -0.428  0.898                     
## CL -0.453 -0.268 -0.310        -0.792
## CW -0.451 -0.181 -0.653         0.575
## BD -0.451 -0.264  0.443 -0.707  0.176
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## SS loadings       1.0    1.0    1.0    1.0    1.0
## Proportion Var    0.2    0.2    0.2    0.2    0.2
## Cumulative Var    0.2    0.4    0.6    0.8    1.0

Die Loadings sind eigentlich die besten Informationen:
Sie zeigen die Informationen, die man sonst erst händisch suchen muss

Datenbeispiel 2

Loading data

# Biobase is a Bioconductor package; it was originally installed with:
# source('http://bioconductor.org/biocLite.R') biocLite()
library(Biobase)

## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, duplicated, eval, evalq, Filter, Find, get,
##     intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rep.int, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unlist
## 
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

# Restore the cell-cycle workspace from a user-specific Dropbox path.
# NOTE(review): presumably this defines `yeast`, which is used below -- confirm.
load(file = "~/Dropbox/Uni/Master/HT_Course/Module_6_DataReduction/cellcycle.RData")

compute pca

# PCA of the expression matrix stored in `yeast`.
pcaCho <- prcomp(exprs(yeast))
# Scree plot showing 17 components; `npcs` is spelled out in full
# instead of relying on partial matching of `n`.
plot(pcaCho, npcs = 17)

plot of chunk unnamed-chunk-15

# Standard deviation and (cumulative) proportion of variance per component.
print(summary(pcaCho))

## Importance of components:
##                          PC1   PC2   PC3    PC4    PC5    PC6    PC7
## Standard deviation     1.762 1.628 1.451 1.2111 1.1459 0.9727 0.8654
## Proportion of Variance 0.195 0.167 0.132 0.0922 0.0825 0.0595 0.0471
## Cumulative Proportion  0.195 0.362 0.494 0.5860 0.6685 0.7280 0.7751
##                           PC8   PC9   PC10   PC11   PC12   PC13   PC14
## Standard deviation     0.8263 0.724 0.6975 0.6740 0.6105 0.5794 0.5227
## Proportion of Variance 0.0429 0.033 0.0306 0.0285 0.0234 0.0211 0.0172
## Cumulative Proportion  0.8180 0.851 0.8815 0.9101 0.9335 0.9546 0.9717
##                          PC15   PC16     PC17
## Standard deviation     0.4906 0.4571 4.48e-16
## Proportion of Variance 0.0151 0.0131 0.00e+00
## Cumulative Proportion  0.9869 1.0000 1.00e+00

plot PCA

# Biplot of the first two principal components.
biplot(pcaCho, choices = 1:2)

plot of chunk unnamed-chunk-16

# Biplot of components 1 and 3 with smaller labels for readability.
biplot(pcaCho, cex = 0.5, choices = c(1, 3))

plot of chunk unnamed-chunk-16

Interessant: YMR296C ist in c28-20 und ccdc29_90 genauso stark exprimiert