step5-pam50

rm(list = ls())  ## 魔幻操作，一键清空~
options(stringsAsFactors = F)
load(file = '../input.Rdata')
a[1:4,1:4]

##               SS2_15_0048_A3 SS2_15_0048_A6 SS2_15_0048_A5 SS2_15_0048_A4
## 0610005C13Rik              0              0              0              1
## 0610007P14Rik              0              0             18             11
## 0610009B22Rik              0              0              0              0
## 0610009L18Rik              0              0              0              0

head(df)

##                g plate  n_g all
## SS2_15_0048_A3 1  0048 2624 all
## SS2_15_0048_A6 1  0048 2664 all
## SS2_15_0048_A5 1  0048 3319 all
## SS2_15_0048_A4 2  0048 4447 all
## SS2_15_0048_A1 2  0048 4725 all
## SS2_15_0048_A2 2  0048 5263 all

## 载入第0步准备好的表达矩阵，及细胞的一些属性（hclust分群，plate批次，检测到的细胞基因）
# 注意 变量a是原始的counts矩阵，变量 dat是logCPM后的表达量矩阵。

group_list=df$g
plate=df$plate
table(plate)

## plate
## 0048 0049 
##  384  384

rownames(dat)=toupper(rownames(dat)) ##toupper()函数，把小写字符转换成大写字符
dat[1:4,1:4]

##               SS2_15_0048_A3 SS2_15_0048_A6 SS2_15_0048_A5 SS2_15_0048_A4
## 0610007P14RIK              0              0       8.091727       7.701429
## 0610009B22RIK              0              0       0.000000       0.000000
## 0610009L18RIK              0              0       0.000000       0.000000
## 0610009O20RIK              0              0       6.755828       6.647482

library(genefu)

## Loading required package: survcomp

## Loading required package: survival

## Loading required package: prodlim

## Loading required package: mclust

## Package 'mclust' version 5.4.2
## Type 'citation("mclust")' for citing this R package in publications.

## Loading required package: limma

## Loading required package: biomaRt

## Loading required package: iC10

## Loading required package: pamr

## Loading required package: cluster

## Loading required package: impute

## Loading required package: iC10TrainingData

## Loading required package: AIMS

## Loading required package: e1071

## Loading required package: Biobase

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following object is masked from 'package:limma':
## 
##     plotMA

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

if(T){
  ddata=t(dat)
  ddata[1:4,1:4]
  s=colnames(ddata);head(s);tail(s) ##把实验检测到的基因赋值给S
  ##ERCC指的是spikes in
  library(org.Hs.eg.db) ##人类基因信息的包
  s2g=toTable(org.Hs.egSYMBOL)
  g=s2g[match(s,s2g$symbol),1];head(g) ##取出实验检测到的基因所对应的基因名
 
  # match（x, y）返回的是vector x中每个元素在vector y中对映的位置（positions in y），
  # 如果vector x中存在不在vector y中的元素，该元素处返回的是NA
  # probe Gene.symbol Gene.ID
  
  dannot=data.frame(probe=s,
                    "Gene.Symbol" =s, 
                    "EntrezGene.ID"=g)
  head(dannot)
  #s向量是实验检测基因的基因名字，g向量是标准基因ID
  # 这里s应该和g是一一对应的，制作一个数据框
  
  ddata=ddata[,!is.na(dannot$EntrezGene.ID)] #ID转换，仅保留ID不为NA的
  dim(ddata)
  #制作行为样本，列为实验检测基因（这里的剩下的实验检测基因都有标准基因ID对应）的矩阵。
  #即剔除无基因ID对应的列
  
  # !is.na去除dannot数据框EntrezGene.ID列为NA的行（去除NA值即去除没有标准基因ID对应的实验检测基因名））
  
  dannot=dannot[!is.na(dannot$EntrezGene.ID),] #去除有NA的行，即剔除无对应的基因
  head(dannot)
  ddata[1:4,1:4]
  library(genefu)
  # c("scmgene", "scmod1", "scmod2","pam50", "ssp2006", "ssp2003", "intClust", "AIMS","claudinLow")
  
  s<-molecular.subtyping(sbt.model = "pam50",data=ddata,
                         annot=dannot,do.mapping=TRUE)
  table(s$subtype)
  tmp=as.data.frame(s$subtype)
  subtypes=as.character(s$subtype)
}

## Loading required package: AnnotationDbi

## Loading required package: stats4

## Loading required package: IRanges

## Loading required package: S4Vectors

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## 
## Attaching package: 'IRanges'

## The following object is masked from 'package:grDevices':
## 
##     windows

##

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

## Warning in cor(x = x, y = y, method = method.cor, use = "complete.obs"): 标
## 准差为零

head(df)

##                g plate  n_g all
## SS2_15_0048_A3 1  0048 2624 all
## SS2_15_0048_A6 1  0048 2664 all
## SS2_15_0048_A5 1  0048 3319 all
## SS2_15_0048_A4 2  0048 4447 all
## SS2_15_0048_A1 2  0048 4725 all
## SS2_15_0048_A2 2  0048 5263 all

df$subtypes=subtypes
table(df[,c(1,5)])

##    subtypes
## g   Basal Her2 LumA LumB Normal
##   1     6   17  251   14     23
##   2     5   26  225   25     30
##   3    32   18   54    8     15
##   4     1    0    5    0     13

library(genefu)
pam50genes=pam50$centroids.map[c(1,3)]
pam50genes[pam50genes$probe=='CDCA1',1]='NUF2'
pam50genes[pam50genes$probe=='KNTC2',1]='NDC80'
pam50genes[pam50genes$probe=='ORC6L',1]='ORC6'
x=dat
dim(x)

## [1] 12198   768

x=x[pam50genes$probe[pam50genes$probe %in% rownames(x)] ,]
table(group_list)

## group_list
##   1   2   3   4 
## 311 311 127  19

tmp=data.frame(group=group_list,
               subtypes=subtypes)
rownames(tmp)=colnames(x)

library(pheatmap)


pheatmap(x,show_rownames = T,show_colnames = F,
         annotation_col = tmp
        )

x=t(scale(t(x)))
x[x>1.6]=1.6
x[x< -1.6]= -1.6

pheatmap(x,show_rownames = T,show_colnames = F,
         annotation_col = tmp
         )

step5-pam50

Chard Liu

2019年6月21日