library(data.table)
library(dplyr)
文章中筛选出至少随访了5年了的。下面通过Survival data筛选超过5年的。
Survivaldata01<-data.table::fread(file="PRADsurvival.txt", stringsAsFactors = F)
Survivaldata01 %>% filter(OS.time >= 365*5) %>% rename(Sample_ID = sample,ID = `_PATIENT`) %>% select(Sample_ID, ID,`OS.time`)-> Survivaldata02
summary(Survivaldata02)
Sample_ID ID OS.time
Length:92 Length:92 Min. :1828
Class :character Class :character 1st Qu.:1964
Mode :character Mode :character Median :2228
Mean :2429
3rd Qu.:2657
Max. :5024
整理BCR数据
Pheno<-data.table::fread(file="PRAD_clinicalMatrix", stringsAsFactors = F)
Pheno %>% select(sampleID, biochemical_recurrence) %>% rename(Sample_ID= sampleID, BCR = biochemical_recurrence) -> Pheno2
summary(Pheno2)
Sample_ID BCR
Length:566 Length:566
Class :character Class :character
Mode :character Mode :character
merge(Survivaldata02, Pheno2, by = "Sample_ID",all.x = T) %>% filter(BCR != "")->M1
#用随访时间大于5年、存在BCR信息的为条件进行合并和筛选
summary(M1)
Sample_ID ID OS.time BCR
Length:86 Length:86 Min. :1828 Length:86
Class :character Class :character 1st Qu.:1954 Class :character
Mode :character Mode :character Median :2228 Mode :character
Mean :2436
3rd Qu.:2670
Max. :5024
Geno01<-data.table::fread(file="HiSeqV2")#读入Illumina RNA-seq数据
ImmunoGene<-data.table::fread(file="ImmunoGene.txt")
#按照文章的要求从Angelova M et al. 2015 文章中获得免疫相关的基因,以此进行筛选。
Geno01 %>% rename(Gene=sample) ->Geno01
merge(ImmunoGene, Geno01, by = "Gene",all.x = T) ->M2
M3<-M2[complete.cases(M2),]#合并数据
r1<-colnames(M3)
r2<-grep(pattern = '-01',x = r1,value = T)#去除癌旁组织
M3 %>% select(Gene,r2) -> M31
head(M31)#试着读取前六行检查
merge BCR和RNA-seq数据并分别将其变为factor和matrix
M4<-t(M3)
colnames(M4)=M4[1,]
M4<-M4[-1,]
M5<-cbind(M4,row.names(M4))
as.data.frame(M5) %>% rename(`Sample_ID` = V788)->M6
merge(M1,M6,by = "Sample_ID",all.x = T)->M7
#将BCR数据和RNA-seq数据合并
M7 %>% dplyr::select(-c(Sample_ID, ID,`OS.time`,BCR))->M8
M7 %>% dplyr::select(BCR) ->M9
M8a<-as.matrix(M8)
GeneInfo <- matrix(as.numeric(M8a),nrow=nrow(M8a), dimnames = list(rownames(M8a),colnames(M8a)))
BCRInfo<-as.factor(M9$BCR)
#mixOmics要求输入的X为matrix,Y为factor,此处把RNA-seq数据变为matrix,BCR变为factor。
class(GeneInfo)
[1] "matrix" "array"
class(BCRInfo)
[1] "factor"
library(mixOmics)
#载入mixOmics
#以下tuning sPLS-DA分析的参数
tune.splsda.prad <- tune.splsda(GeneInfo, BCRInfo, ncomp = 10, validation = 'Mfold', folds = 5, progressBar = F, dist = 'max.dist', measure = "BER", test.keepX = list.keepX, nrepeat = 10, cpus = 2)
error <- tune.splsda.prad$error.rate
ncomp <- tune.splsda.prad$choice.ncomp$ncomp
ncomp
select.keepX <- tune.splsda.prad$choice.keepX[1:ncomp]
select.keepX
#输出后面sPLS-DA分析需要的参数ncomp和keepX
splsda.prad <- splsda(GeneInfo, BCRInfo, ncomp = ncomp, keepX = select.keepX)
#进行sPLS-DA分析
perf.prad <- perf(splsda.prad, validation = "Mfold", folds = 5,
dist = 'max.dist', nrepeat = 200,
progressBar = FALSE)
perf.prad$error.rate
$overall
max.dist
comp1 0.1589535
comp2 0.2037791
comp3 0.1908721
comp4 0.1940698
comp5 0.1943605
comp6 0.1993023
comp7 0.2013953
$BER
max.dist
comp1 0.4551290
comp2 0.4663641
comp3 0.4301736
comp4 0.4381250
comp5 0.4336954
comp6 0.4274405
comp7 0.4265327