TCGA-PRAD实践

library(data.table)
library(dplyr)

TCGA数据下载后初步处理

Phenotype和Survival Data初步处理

文章中筛选出至少随访了5年了的。下面通过Survival data筛选超过5年的。

Survivaldata01<-data.table::fread(file="PRADsurvival.txt", stringsAsFactors = F)
Survivaldata01 %>% filter(OS.time >= 365*5) %>% rename(Sample_ID = sample,ID = `_PATIENT`)  %>% select(Sample_ID, ID,`OS.time`)-> Survivaldata02
summary(Survivaldata02)
  Sample_ID              ID               OS.time    
 Length:92          Length:92          Min.   :1828  
 Class :character   Class :character   1st Qu.:1964  
 Mode  :character   Mode  :character   Median :2228  
                                       Mean   :2429  
                                       3rd Qu.:2657  
                                       Max.   :5024  

整理BCR数据

Pheno<-data.table::fread(file="PRAD_clinicalMatrix", stringsAsFactors = F)
Pheno %>% select(sampleID, biochemical_recurrence) %>% rename(Sample_ID= sampleID, BCR = biochemical_recurrence) -> Pheno2
summary(Pheno2)
  Sample_ID             BCR           
 Length:566         Length:566        
 Class :character   Class :character  
 Mode  :character   Mode  :character  
merge(Survivaldata02, Pheno2, by = "Sample_ID",all.x = T) %>% filter(BCR != "")->M1
#用随访时间大于5年、存在BCR信息的为条件进行合并和筛选
summary(M1)
  Sample_ID              ID               OS.time         BCR           
 Length:86          Length:86          Min.   :1828   Length:86         
 Class :character   Class :character   1st Qu.:1954   Class :character  
 Mode  :character   Mode  :character   Median :2228   Mode  :character  
                                       Mean   :2436                     
                                       3rd Qu.:2670                     
                                       Max.   :5024                     
Geno01<-data.table::fread(file="HiSeqV2")#读入Illumina RNA-seq数据
ImmunoGene<-data.table::fread(file="ImmunoGene.txt")
#按照文章的要求从Angelova M et al. 2015 文章中获得免疫相关的基因,以此进行筛选。
Geno01 %>% rename(Gene=sample) ->Geno01
merge(ImmunoGene, Geno01, by = "Gene",all.x = T) ->M2
M3<-M2[complete.cases(M2),]#合并数据
r1<-colnames(M3)
r2<-grep(pattern = '-01',x = r1,value = T)#去除癌旁组织
M3 %>% select(Gene,r2) -> M31

head(M31)#试着读取前六行检查

merge BCR和RNA-seq数据并分别将其变为factor和matrix

M4<-t(M3)
colnames(M4)=M4[1,]
M4<-M4[-1,]
M5<-cbind(M4,row.names(M4))
as.data.frame(M5) %>% rename(`Sample_ID` = V788)->M6
merge(M1,M6,by = "Sample_ID",all.x = T)->M7
#将BCR数据和RNA-seq数据合并

M7 %>% dplyr::select(-c(Sample_ID, ID,`OS.time`,BCR))->M8
M7 %>% dplyr::select(BCR) ->M9
M8a<-as.matrix(M8)
GeneInfo <- matrix(as.numeric(M8a),nrow=nrow(M8a), dimnames = list(rownames(M8a),colnames(M8a)))
BCRInfo<-as.factor(M9$BCR)
#mixOmics要求输入的X为matrix,Y为factor,此处把RNA-seq数据变为matrix,BCR变为factor。
class(GeneInfo)
[1] "matrix" "array" 
class(BCRInfo)
[1] "factor"

用mixomics进行sPLS-DA分析

library(mixOmics)
#载入mixOmics

#以下tuning sPLS-DA分析的参数
tune.splsda.prad <- tune.splsda(GeneInfo, BCRInfo, ncomp = 10, validation = 'Mfold', folds = 5, progressBar = F, dist = 'max.dist', measure = "BER", test.keepX = list.keepX, nrepeat = 10, cpus = 2)
error <- tune.splsda.prad$error.rate
ncomp <- tune.splsda.prad$choice.ncomp$ncomp
ncomp
select.keepX <- tune.splsda.prad$choice.keepX[1:ncomp] 
select.keepX
#输出后面sPLS-DA分析需要的参数ncomp和keepX
splsda.prad <- splsda(GeneInfo, BCRInfo, ncomp = ncomp, keepX = select.keepX)
#进行sPLS-DA分析
perf.prad <- perf(splsda.prad, validation = "Mfold", folds = 5,
                   dist = 'max.dist', nrepeat = 200,
                   progressBar = FALSE) 
perf.prad$error.rate
$overall
       max.dist
comp1 0.1589535
comp2 0.2037791
comp3 0.1908721
comp4 0.1940698
comp5 0.1943605
comp6 0.1993023
comp7 0.2013953

$BER
       max.dist
comp1 0.4551290
comp2 0.4663641
comp3 0.4301736
comp4 0.4381250
comp5 0.4336954
comp6 0.4274405
comp7 0.4265327
LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KIyBUQ0dBLVBSQUTlrp7ot7UNCg0KYGBge3J9DQpsaWJyYXJ5KGRhdGEudGFibGUpDQpsaWJyYXJ5KGRwbHlyKQ0KDQpgYGANCg0KIyMgVENHQeaVsOaNruS4i+i9veWQjuWIneatpeWkhOeQhg0KDQojIyMgUGhlbm90eXBl5ZKMU3Vydml2YWwgRGF0YeWIneatpeWkhOeQhg0KDQrmlofnq6DkuK3nrZvpgInlh7roh7PlsJHpmo/orr/kuoY15bm05LqG55qE44CC5LiL6Z2i6YCa6L+HU3Vydml2YWwgZGF0Yeetm+mAiei2hei/hzXlubTnmoTjgIINCg0KYGBge3J9DQpTdXJ2aXZhbGRhdGEwMTwtZGF0YS50YWJsZTo6ZnJlYWQoZmlsZT0iUFJBRHN1cnZpdmFsLnR4dCIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGKQ0KU3Vydml2YWxkYXRhMDEgJT4lIGZpbHRlcihPUy50aW1lID49IDM2NSo1KSAlPiUgcmVuYW1lKFNhbXBsZV9JRCA9IHNhbXBsZSxJRCA9IGBfUEFUSUVOVGApICAlPiUgc2VsZWN0KFNhbXBsZV9JRCwgSUQsYE9TLnRpbWVgKS0+IFN1cnZpdmFsZGF0YTAyDQpzdW1tYXJ5KFN1cnZpdmFsZGF0YTAyKQ0KYGBgDQrmlbTnkIZCQ1LmlbDmja4NCg0KYGBge3J9DQpQaGVubzwtZGF0YS50YWJsZTo6ZnJlYWQoZmlsZT0iUFJBRF9jbGluaWNhbE1hdHJpeCIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGKQ0KUGhlbm8gJT4lIHNlbGVjdChzYW1wbGVJRCwgYmlvY2hlbWljYWxfcmVjdXJyZW5jZSkgJT4lIHJlbmFtZShTYW1wbGVfSUQ9IHNhbXBsZUlELCBCQ1IgPSBiaW9jaGVtaWNhbF9yZWN1cnJlbmNlKSAtPiBQaGVubzINCnN1bW1hcnkoUGhlbm8yKQ0KYGBgDQoNCmBgYHtyfQ0KbWVyZ2UoU3Vydml2YWxkYXRhMDIsIFBoZW5vMiwgYnkgPSAiU2FtcGxlX0lEIixhbGwueCA9IFQpICU+JSBmaWx0ZXIoQkNSICE9ICIiKS0+TTENCiPnlKjpmo/orr/ml7bpl7TlpKfkuo415bm044CB5a2Y5ZyoQkNS5L+h5oGv55qE5Li65p2h5Lu26L+b6KGM5ZCI5bm25ZKM562b6YCJDQpzdW1tYXJ5KE0xKQ0KYGBgDQoNCmBgYHtyfQ0KR2VubzAxPC1kYXRhLnRhYmxlOjpmcmVhZChmaWxlPSJIaVNlcVYyIikj6K+75YWlSWxsdW1pbmEgUk5BLXNlceaVsOaNrg0KSW1tdW5vR2VuZTwtZGF0YS50YWJsZTo6ZnJlYWQoZmlsZT0iSW1tdW5vR2VuZS50eHQiKQ0KI+aMieeFp+aWh+eroOeahOimgeaxguS7jkFuZ2Vsb3ZhIE0gZXQgYWwuIDIwMTUg5paH56ug5Lit6I635b6X5YWN55ar55u45YWz55qE5Z+65Zug77yM5Lul5q2k6L+b6KGM562b6YCJ44CCDQpHZW5vMDEgJT4lIHJlbmFtZShHZW5lPXNhbXBsZSkgLT5HZW5vMDENCm1lcmdlKEltbXVub0dlbmUsIEdlbm8wMSwgYnkgPSAiR2VuZSIsYWxsLnggPSBUKSAtPk0yDQpNMzwtTTJbY29tcGxldGUuY2FzZXMoTTIpLF0j5ZCI5bm25pWw5o2uDQpyMTwtY29sbmFtZXMoTTMpDQpyMjwtZ3JlcChwYXR0ZXJuID0gJy0wMScseCA9IHIxLHZhbHVlID0gVCkj5Y676Zmk55mM5peB57uE57uHDQpNMyAlPiUgc2VsZWN0KEdlbmUscjIpIC0+IE0zMQ0KaGVhZChNMzEpI+ivleedgOivu+WPluWJjeWFreihjOajgOafpQ0KYGBgDQoNCm1lcmdlIEJDUuWSjFJOQS1zZXHmlbDmja7lubbliIbliKvlsIblhbblj5jkuLpmYWN0b3LlkoxtYXRyaXgNCg0KYGBge3J9DQpNNDwtdChNMykNCmNvbG5hbWVzKE00KT1NNFsxLF0NCk00PC1NNFstMSxdDQpNNTwtY2JpbmQoTTQscm93Lm5hbWVzKE00KSkNCmFzLmRhdGEuZnJhbWUoTTUpICU+JSByZW5hbWUoYFNhbXBsZV9JRGAgPSBWNzg4KS0+TTYNCm1lcmdlKE0xLE02LGJ5ID0gIlNhbXBsZV9JRCIsYWxsLnggPSBUKS0+TTcNCiPlsIZCQ1LmlbDmja7lkoxSTkEtc2Vx5pWw5o2u5ZCI5bm2DQoNCk03ICU+JSBkcGx5cjo6c2VsZWN0KC1jKFNhbXBsZV9JRCwgSUQsYE9TLnRpbWVgLEJDUikpLT5NOA0KTTcgJT4lIGRwbHlyOjpzZWxlY3QoQkNSKSAtPk05DQpNOGE8LWFzLm1hdHJpeChNOCkNCkdlbmVJbmZvIDwtIG1hdHJpeChhcy5udW1lcmljKE04YSksbnJvdz1ucm93KE04YSksIGRpbW5hbWVzID0gbGlzdChyb3duYW1lcyhNOGEpLGNvbG5hbWVzKE04YSkpKQ0KQkNSSW5mbzwtYXMuZmFjdG9yKE05JEJDUikNCiNtaXhPbWljc+imgeaxgui+k+WFpeeahFjkuLptYXRyaXjvvIxZ5Li6ZmFjdG9y77yM5q2k5aSE5oqKUk5BLXNlceaVsOaNruWPmOS4um1hdHJpeO+8jEJDUuWPmOS4umZhY3RvcuOAgg0KY2xhc3MoR2VuZUluZm8pDQpjbGFzcyhCQ1JJbmZvKQ0KYGBgDQoNCg0KIyMg55SobWl4b21pY3Pov5vooYxzUExTLURB5YiG5p6QDQpgYGB7cn0NCmxpYnJhcnkobWl4T21pY3MpDQoj6L295YWlbWl4T21pY3MNCg0KI+S7peS4i3R1bmluZyBzUExTLURB5YiG5p6Q55qE5Y+C5pWwDQp0dW5lLnNwbHNkYS5wcmFkIDwtIHR1bmUuc3Bsc2RhKEdlbmVJbmZvLCBCQ1JJbmZvLCBuY29tcCA9IDEwLCB2YWxpZGF0aW9uID0gJ01mb2xkJywgZm9sZHMgPSA1LCBwcm9ncmVzc0JhciA9IEYsIGRpc3QgPSAnbWF4LmRpc3QnLCBtZWFzdXJlID0gIkJFUiIsIHRlc3Qua2VlcFggPSBsaXN0LmtlZXBYLCBucmVwZWF0ID0gMTAsIGNwdXMgPSAyKQ0KZXJyb3IgPC0gdHVuZS5zcGxzZGEucHJhZCRlcnJvci5yYXRlDQpuY29tcCA8LSB0dW5lLnNwbHNkYS5wcmFkJGNob2ljZS5uY29tcCRuY29tcA0KbmNvbXANCnNlbGVjdC5rZWVwWCA8LSB0dW5lLnNwbHNkYS5wcmFkJGNob2ljZS5rZWVwWFsxOm5jb21wXSANCnNlbGVjdC5rZWVwWA0KI+i+k+WHuuWQjumdonNQTFMtREHliIbmnpDpnIDopoHnmoTlj4LmlbBuY29tcOWSjGtlZXBYDQpgYGANCg0KYGBge3J9DQpzcGxzZGEucHJhZCA8LSBzcGxzZGEoR2VuZUluZm8sIEJDUkluZm8sIG5jb21wID0gbmNvbXAsIGtlZXBYID0gc2VsZWN0LmtlZXBYKQ0KI+i/m+ihjHNQTFMtREHliIbmnpANCnBlcmYucHJhZCA8LSBwZXJmKHNwbHNkYS5wcmFkLCB2YWxpZGF0aW9uID0gIk1mb2xkIiwgZm9sZHMgPSA1LA0KICAgICAgICAgICAgICAgICAgIGRpc3QgPSAnbWF4LmRpc3QnLCBucmVwZWF0ID0gMjAwLA0KICAgICAgICAgICAgICAgICAgIHByb2dyZXNzQmFyID0gRkFMU0UpIA0KcGVyZi5wcmFkJGVycm9yLnJhdGUNCg0KYGBgDQoNCg==