本次项目研究是基于省立医院的一项临床试验所得到的RNA-seq基因表达、病人信息等测序或统计相关的数据,希望对数据进行预处理,探索性数据分析,主要是针对患者是否达到pCR这一指标,构建模型筛选出最显著的biomarker,从而对临床研究,患者的用药指导提出建设性意见。
本次案例所需要的数据包括患者的RNA-seq结果(data),样本编号信息(symbol),以及省立病人信息汇总(pcr)三组数据。RNA-seq结果(data)里记录了患者的ID和基因表达,省立病人信息汇总(pcr)里记录了28名临床试验患者的统计信息,其中包括我们分析的label(是否pCR),样本编号信息(symbol)的作用是将RNA-seq结果(data)与省立病人信息汇总(pcr)按照姓名和ID串联在一起。
getwd()
## [1] "/home/Bob/Jiaxuan/week19_cfDNA_pCR"
setwd("/home/Bob/Jiaxuan/week19_cfDNA_pCR")
#install.packages("gdata")
#library(gdata)
#data <- read.xls("Tissue_RNA_Seq.xlsx", sheet = 1, na.strings = c("NA", "DIV/0!"))
#symbol <- read.xls("Tissue_RNA_Seq.xlsx", sheet = 2, na.strings = c("NA", "DIV/0!"))
#pcr <- read.xls("Tissue_RNA_Seq.xlsx", sheet = 7, na.strings = c("NA", "DIV/0!"))
#save.image("~/Jiaxuan/week19_cfDNA_pCR/RNA-seq_data.RData")
load("~/Jiaxuan/week19_cfDNA_pCR/RNA-seq_data.RData")
读取完数据后,我们最好将raw data保存起来,方便日后提取应用。
str(data)
## 'data.frame': 57820 obs. of 49 variables:
## $ GeneSymbol : Factor w/ 55765 levels "5S_rRNA","7SK",..: 53170 52462 13840 48746 8238 15604 9672 15941 16232 25069 ...
## $ GeneID : Factor w/ 57820 levels "ENSG00000000003.10",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ TC207J0248.T2D4J28XNF1.H004K133: num 4.59 0.53 7.71 5.23 3.21 ...
## $ TC207J0250.T2D4J28XNF1.H004K133: num 4.55 0.4 18.47 10.68 7.9 ...
## $ TC207J0252.R1D4J28XNF1.H004K133: num 2.28 1.85 4.36 1.09 0.69 ...
## $ TC207J0254.R1D4J28XNF1.H004K133: num 2.3 0.27 6.36 1.57 1.53 5.76 13.5 3.65 7.27 4.44 ...
## $ TC207J0256.T2D4J28XNF1.H004K133: num 3.05 1.08 12.63 6.74 5.4 ...
## $ TC207J0258.R1D4J28XNF1.H004K133: num 3.25 4.56 8.62 1.43 1.01 ...
## $ TC207J0260.R1D4J28XNF1.H004K133: num 0 0 8.22 2.48 0.86 1.23 3.4 7.88 8.43 4.12 ...
## $ TC207J0262.T2D4J28XNF1.H004K133: num 27.41 21.26 19.72 4.27 2.85 ...
## $ TC207J0264.R1D4J28XNF1.H004K133: num 5.34 4.33 5.15 1.81 0.62 ...
## $ TC207J0266.R1D4J28XNF1.H004K133: num 1.7 0 7.85 10.57 2.64 ...
## $ TC207J0268.R1D4J28XNF1.H004K133: num 4.41 0 11.55 16.74 8.84 ...
## $ TC207J0270.R1D4J29XNF1.H004K133: num 0.6 0.11 11.95 1.48 0.91 ...
## $ TC207J0272.R1D4J29XNF1.H004K133: num 7.04 2.17 13.4 9.16 11.93 ...
## $ TC207J0274.R1D4J29XNF1.H004K133: num 4.49 0.69 10.54 1.5 0.57 ...
## $ TC207J0276.R1D4J29XNF1.H004K133: num 0 0 0 0.22 0 0.05 0 0 0.2 0.49 ...
## $ TC207J0278.R1D4J29XNF1.H004K133: num 4.29 1.45 8.63 10 2.97 ...
## $ TC207J0280.R1D4J29XNF1.H004K133: num 4.1 0.77 12.13 1.03 0.87 ...
## $ TC207J0282.R1D4J29XNF1.H004K133: num 1.94 0 7.65 4.93 1.75 ...
## $ TC19BG0178.T2D4J22XKF1.H001K133: num 4.03 0.84 6.74 3.46 2.57 ...
## $ TC19BG0179.T2D4J22XKF1.H001K133: num 10.48 0.94 20.85 8.1 3.7 ...
## $ TC19BG0180.T2D4J22XKF1.H001K133: num 19.82 0.24 31.49 11.2 6.14 ...
## $ TC19BG0181.T2D4J22XKF1.H001K133: num 21.97 2.66 21.85 6.05 3.62 ...
## $ TC19BG0183.T2D4J22XKF1.H001K133: num 3.48 0.06 32.1 5.11 7.01 ...
## $ TC19BG0184.T2D4J22XKF1.H001K133: num 16.44 0.16 28.42 9.82 7.24 ...
## $ TC19BG0185.T2D4J22XKF1.H001K133: num 10.05 0.6 16.76 5.74 5.25 ...
## $ TC19BG0186.T2D4J22XKF1.H001K133: num 6.69 0.27 26.32 6.88 6.79 ...
## $ TC19BG0187.T2D4J26XKF1.H001K133: num 6.72 0.11 23.58 13.91 9.03 ...
## $ TC19BG0188.T2D4J26XKF1.H001K133: num 18.1 0.32 26.63 6.03 6.53 ...
## $ TC19BG0189.T2D4J26XKF1.H001K133: num 5.95 0.47 26.46 12.01 7.49 ...
## $ TC19BG0190.T2D4J26XKF1.H001K133: num 3.71 0.05 7.88 6.12 5.22 ...
## $ TC19BG0191.T2D4J26XKF1.H001K133: num 1.37 0 17.46 3.74 3.82 ...
## $ TC19BG0192.T2D4J26XKF1.H001K133: num 2.44 0 6.74 6.26 12.25 ...
## $ TC19BG0193.T2D4J26XKF1.H001K133: num 7.48 0.19 32.42 7.52 12.58 ...
## $ TC19BG0194.T2D4J26XKF1.H001K133: num 4.35 0.06 19.67 8.87 7.31 ...
## $ TC19BG0195.T2D4J26XKF1.H001K133: num 4.33 0 16.3 17.24 5.36 ...
## $ TC19BG0196.T2D4J26XKF1.H001K133: num 3.9 0.78 9.49 8.45 5.39 ...
## $ TC19BG0197.T2D4J26XKF1.H001K133: num 6.75 0 50.39 8.12 10.12 ...
## $ TC19BG0198.T2D4J26XKF1.H001K133: num 3.53 0.15 25.59 5.04 8.6 ...
## $ TC19BG0199.T2D4J26XKF1.H001K133: num 15.09 0.26 11.2 5.73 2.82 ...
## $ TC19BG0200.T2D4J26XKF1.H001K133: num 2.89 0 13.69 4.85 4.54 ...
## $ TC19BG0201.T2D4J26XKF1.H001K133: num 14.44 0 29.34 7.36 6.89 ...
## $ TC19BG0203.T2D4J26XKF1.H001K133: num 7.77 0 24.95 5.99 7.75 ...
## $ TC19BG0204.T2D4J26XKF1.H001K133: num 4.85 0 22.16 9.23 7.67 ...
## $ TC19BG0205.T2D4J26XKF1.H001K133: num 4.66 0.13 16.3 10.38 6.54 ...
## $ TC19BG0206.T2D4J26XKF1.H001K133: num 11.36 1.25 16.01 8.96 5.3 ...
## $ TC19BG0207.T2D4J26XKF1.H001K133: num 11.83 0 18 6.18 4.99 ...
## $ TC19BG0208.T2D4J26XKF1.H001K133: num 5.24 0 16.79 7.39 7.79 ...
dim(data)
## [1] 57820 49
data[1:5,1:5]
## GeneSymbol GeneID TC207J0248.T2D4J28XNF1.H004K133
## 1 TSPAN6 ENSG00000000003.10 4.59
## 2 TNMD ENSG00000000005.5 0.53
## 3 DPM1 ENSG00000000419.8 7.71
## 4 SCYL3 ENSG00000000457.9 5.23
## 5 C1orf112 ENSG00000000460.12 3.21
## TC207J0250.T2D4J28XNF1.H004K133 TC207J0252.R1D4J28XNF1.H004K133
## 1 4.55 2.28
## 2 0.40 1.85
## 3 18.47 4.36
## 4 10.68 1.09
## 5 7.90 0.69
str(pcr)
## 'data.frame': 28 obs. of 15 variables:
## $ 患者姓名 : Factor w/ 28 levels "余昔情","刘婷",..: 8 24 25 20 27 28 6 14 16 15 ...
## $ 编号 : Factor w/ 28 levels "P002","P003",..: 5 15 13 24 21 23 6 7 14 16 ...
## $ 年龄 : int 44 32 59 49 68 69 63 39 48 65 ...
## $ ctDNA状态 : Factor w/ 3 levels "","阳性","阴性": 3 2 2 2 3 2 3 3 3 3 ...
## $ 是否pCR.1是.2否 : int 2 2 2 1 2 2 2 2 2 2 ...
## $ T.0.4. : int 2 2 2 3 2 4 1 2 3 2 ...
## $ N.0.3. : int 1 1 1 1 2 3 1 1 1 1 ...
## $ M.0.1. : int 0 0 0 0 0 0 0 0 0 0 ...
## $ 临床分期.1.2.3.4期 : int 2 2 2 3 2 3 2 2 3 2 ...
## $ ER..1阳.2阴. : int 1 2 2 2 1 1 2 1 1 1 ...
## $ PR..1阳.2阴. : int 1 2 2 2 1 1 2 1 1 2 ...
## $ HER2..1阳.2阴. : int 2 2 2 2 2 2 1 2 1 2 ...
## $ KI67 : num 0.1 0.8 0.4 0.4 0.5 0.3 0.3 0.6 0.3 0.2 ...
## $ 分子分型.4分类..1LA.2LB.3HER2.4TNBC..不知道: int 1 4 4 4 1 2 3 2 2 2 ...
## $ MPR分级 : Factor w/ 11 levels ""," 拒绝手术",..: 8 3 2 10 8 3 4 6 11 6 ...
dim(pcr)
## [1] 28 15
head(pcr)
## 患者姓名 编号 年龄 ctDNA状态 是否pCR.1是.2否 T.0.4. N.0.3. M.0.1.
## 1 崔芳 P006 44 阴性 2 2 1 0
## 2 高吉如 P019 32 阳性 2 2 1 0
## 3 高雅丽(拒绝手术) P015 59 阳性 2 2 1 0
## 4 胡章琼 P037 49 阳性 1 3 1 0
## 5 黄典英 P027 68 阴性 2 2 2 0
## 6 黄珠江 P033 69 阳性 2 4 3 0
## 临床分期.1.2.3.4期 ER..1阳.2阴. PR..1阳.2阴. HER2..1阳.2阴. KI67
## 1 2 1 1 2 0.1
## 2 2 2 2 2 0.8
## 3 2 2 2 2 0.4
## 4 3 2 2 2 0.4
## 5 2 1 1 2 0.5
## 6 3 1 1 2 0.3
## 分子分型.4分类..1LA.2LB.3HER2.4TNBC..不知道 MPR分级
## 1 1 3级 30%-90%
## 2 4 1级
## 3 4 拒绝手术
## 4 4 5级
## 5 1 3级 30%-90%
## 6 2 1级
str(symbol)
## 'data.frame': 47 obs. of 3 variables:
## $ 姓名 : Factor w/ 29 levels "","余昔情","刘婷",..: 1 9 25 25 26 21 21 28 28 29 ...
## $ GeneSymbol: Factor w/ 47 levels "GeneID","TC19BG0178-T2D4J22XKF1-H001K133手术",..: 1 6 34 12 10 43 21 40 18 42 ...
## $ 样本 : Factor w/ 3 levels "","基线","手术": 1 2 3 2 2 3 2 3 2 3 ...
dim(symbol)
## [1] 47 3
head(pcr)
## 患者姓名 编号 年龄 ctDNA状态 是否pCR.1是.2否 T.0.4. N.0.3. M.0.1.
## 1 崔芳 P006 44 阴性 2 2 1 0
## 2 高吉如 P019 32 阳性 2 2 1 0
## 3 高雅丽(拒绝手术) P015 59 阳性 2 2 1 0
## 4 胡章琼 P037 49 阳性 1 3 1 0
## 5 黄典英 P027 68 阴性 2 2 2 0
## 6 黄珠江 P033 69 阳性 2 4 3 0
## 临床分期.1.2.3.4期 ER..1阳.2阴. PR..1阳.2阴. HER2..1阳.2阴. KI67
## 1 2 1 1 2 0.1
## 2 2 2 2 2 0.8
## 3 2 2 2 2 0.4
## 4 3 2 2 2 0.4
## 5 2 1 1 2 0.5
## 6 3 1 1 2 0.3
## 分子分型.4分类..1LA.2LB.3HER2.4TNBC..不知道 MPR分级
## 1 1 3级 30%-90%
## 2 4 1级
## 3 4 拒绝手术
## 4 4 5级
## 5 1 3级 30%-90%
## 6 2 1级
查看基因表达数据data中会发现GeneSymbol的levels有55765个,而数据data的dimension中GeneSymbol的个数是57820个,这表示在测序过程中出现了基因重复的情况。在这种情况下,我们可以将这些基因合并在一起取他们的平均值:
# duplicated GeneSymbol get average:
library(limma)
df <- avereps(data[,-c(1,2)], data$GeneSymbol)
dim(df)
## [1] 55765 47
df[1:5,1:5]
## TC207J0248.T2D4J28XNF1.H004K133 TC207J0250.T2D4J28XNF1.H004K133
## TSPAN6 4.59 4.55
## TNMD 0.53 0.40
## DPM1 7.71 18.47
## SCYL3 5.23 10.68
## C1orf112 3.21 7.90
## TC207J0252.R1D4J28XNF1.H004K133 TC207J0254.R1D4J28XNF1.H004K133
## TSPAN6 2.28 2.30
## TNMD 1.85 0.27
## DPM1 4.36 6.36
## SCYL3 1.09 1.57
## C1orf112 0.69 1.53
## TC207J0256.T2D4J28XNF1.H004K133
## TSPAN6 3.05
## TNMD 1.08
## DPM1 12.63
## SCYL3 6.74
## C1orf112 5.40
因为基因表达数据data变量太多,预处理后有57820个基因,所以这组数据要单独进行处理。而对于样本编号信息(symbol)和病人信息汇总(pcr)的数据,我们可以先将这两组数据按照姓名进行整合:
首先将直接手术,或是无疗效的患者从数据集中剔除出去,不纳入分析范围:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
symbol <- symbol[-c(1,5,21,29),-3]
colnames(symbol) <- c("name","GeneSymbol")
# remove unnecessary words
id <- symbol$GeneSymbol
library(stringr)
id <- str_replace(id, "基线", "")
id <- str_replace(id, "手术", "")
id <- str_replace(id, "-", ".")
id <- str_replace(id, "-", ".")
symbol$id <- id
head(symbol)
## name GeneSymbol id
## 2 崔芳 TC19BG0184-T2D4J22XKF1-H001K133基线 TC19BG0184.T2D4J22XKF1.H001K133
## 3 高吉如 TC207J0256-T2D4J28XNF1-H004K133手术 TC207J0256.T2D4J28XNF1.H004K133
## 4 高吉如 TC19BG0190-T2D4J26XKF1-H001K133基线 TC19BG0190.T2D4J26XKF1.H001K133
## 6 胡章琼 TC207J0274-R1D4J29XNF1-H004K133手术 TC207J0274.R1D4J29XNF1.H004K133
## 7 胡章琼 TC19BG0199-T2D4J26XKF1-H001K133基线 TC19BG0199.T2D4J26XKF1.H001K133
## 8 黄典英 TC207J0268-R1D4J28XNF1-H004K133手术 TC207J0268.R1D4J28XNF1.H004K133
pcr数据中MPR和label(pcr)意义相同,不纳入分析中:
pcr <- pcr[-c(3,13,18), -c(2,15)]
dim(pcr)
## [1] 25 13
colnames(pcr) <- c("name", "age", "ctDNA", "label", "T", "N", "M", "stage", "ER",
"PR", "HER2", "KI67", "mtype")
pcr[15,3] <- "阴性"
for (i in 1:length(symbol$name)) {
for (j in 1:length(pcr$name)) {
if(symbol$name[i] == pcr$name[j]){
symbol$age[i] = pcr$age[j]
symbol$ctNDA[i] = pcr$ctNDA[j]
symbol$T[i] = pcr$T[j]
symbol$N[i] = pcr$N[j]
symbol$M[i] = pcr$M[j]
symbol$stage[i] = pcr$stage[j]
symbol$ER[i] = pcr$ER[j]
symbol$PR[i] = pcr$PR[j]
symbol$HER2[i] = pcr$HER2[j]
symbol$KI67[i] = pcr$KI67[j]
symbol$mtype[i] = pcr$mtype[j]
symbol$label[i] = pcr$label[j]
}
}
}
head(symbol)
## name GeneSymbol id
## 2 崔芳 TC19BG0184-T2D4J22XKF1-H001K133基线 TC19BG0184.T2D4J22XKF1.H001K133
## 3 高吉如 TC207J0256-T2D4J28XNF1-H004K133手术 TC207J0256.T2D4J28XNF1.H004K133
## 4 高吉如 TC19BG0190-T2D4J26XKF1-H001K133基线 TC19BG0190.T2D4J26XKF1.H001K133
## 6 胡章琼 TC207J0274-R1D4J29XNF1-H004K133手术 TC207J0274.R1D4J29XNF1.H004K133
## 7 胡章琼 TC19BG0199-T2D4J26XKF1-H001K133基线 TC19BG0199.T2D4J26XKF1.H001K133
## 8 黄典英 TC207J0268-R1D4J28XNF1-H004K133手术 TC207J0268.R1D4J28XNF1.H004K133
## age T N M stage ER PR HER2 KI67 mtype label
## 2 44 2 1 0 2 1 1 2 0.1 1 2
## 3 32 2 1 0 2 2 2 2 0.8 4 2
## 4 32 2 1 0 2 2 2 2 0.8 4 2
## 6 49 3 1 0 3 2 2 2 0.4 4 1
## 7 49 3 1 0 3 2 2 2 0.4 4 1
## 8 68 2 2 0 2 1 1 2 0.5 1 2
我们首先按照symbol里ID名对基因表达数据进行筛选,然后进行对基因表达进行标准化处理,方法选择为mean/std scale。
df <- t(df)
df <- df[symbol$id,]
dim(df)
## [1] 43 55765
mean <- apply(df, 2, mean)
std <- apply(df, 2, sd)
df <- scale(df, center = mean, scale = std)
#df[is.na(df)] <- 0
# delete column where exists NaN value:
na_flag <- apply(is.na(df), 2, sum)
df <- df[,which(na_flag == 0)]
dim(df)
## [1] 43 42601
数据标准化后,我们将每个基因对应label(是否pCR)进行ANOVA差异分析,并将p值进行矫正,方法选择Benjamini & Hochberg法(BH法), 之后按照adjP值选择adjP值小于0.05的基因并按大小排列进行后续分析:
## ANOVA filter gene marker:
label <- symbol$label
p <- NULL
for (i in 1:ncol(df)) {
fit <- aov(label~df[,i])
p <- c(p, summary(fit)[[1]][5][1,])
}
adjp <- p.adjust(p, method = "BH")
mydata <- rbind(df, adjp)
dim(mydata)
## [1] 44 42601
# gene filter/p<0.05
b <- (mydata[44,] <= 0.05)
mydata <- as.data.frame(mydata[,b])
c <- sort(mydata[44,])
row.names(c) <- "adj_P_value"
c[,1:10]
## C1QTNF4 RPL18AP3 CLEC2L RPL13AP25 RPS12 EEF1A1P6
## adj_P_value 0.003482694 0.003482694 0.003482694 0.004378604 0.0146689 0.0146689
## RPL36 RPL37A AC093106.7 RPS28
## adj_P_value 0.0171684 0.0171684 0.0171684 0.0171684
#write.csv(c, "ANOVA_GeneMarker_by_adjP.csv")
#gene <- read.csv("ANOVA_GeneMarker_by_adjP.csv", header = T, row.names = 1)
gene <- colnames(c)
df <- df[,gene]
df <- as.data.frame(df)
label <- as.factor(symbol$label)
df$label <- label
将标准化后的,基因筛选后的基因表达数据final clean data,与之前合并过的symbol按照患者编号进行合并:
age <- symbol$age
T <- symbol$T
N <- symbol$N
stage <- symbol$stage
ER <- symbol$ER
PR <- symbol$PR
HER2 <- symbol$HER2
KI67 <- symbol$KI67
mtype <- symbol$mtype
df$age <- age
df$T <- T
df$N <- N
df$stage <-stage
df$ER <- ER
df$PR <- PR
df$HER2 <- HER2
df$KI67 <- KI67
df$mtype <- mtype
dim(df)
## [1] 43 39
我们使用逻辑回归来训练模型,对我们数据集一共38个变量进行筛选:
df$label <- as.numeric(df$label)
df$age <- as.numeric(df$age)
df$T <- as.numeric(df$T)
df$N <- as.numeric(df$N)
df$stage <- as.numeric(df$stage)
df$ER <- as.numeric(df$ER)
df$PR <- as.numeric(df$PR)
df$HER2 <- as.numeric(df$HER2)
df$KI67 <- as.numeric(df$KI67)
df$mtype <- as.numeric(df$mtype)
#df <- df[,-c(31:39)]
model <- glm(label~., data = df)
summary(model)
##
## Call:
## glm(formula = label ~ ., data = df)
##
## Deviance Residuals:
## TC19BG0184.T2D4J22XKF1.H001K133 TC207J0256.T2D4J28XNF1.H004K133
## 0.005478 0.047872
## TC19BG0190.T2D4J26XKF1.H001K133 TC207J0274.R1D4J29XNF1.H004K133
## -0.029864 0.007280
## TC19BG0199.T2D4J26XKF1.H001K133 TC207J0268.R1D4J28XNF1.H004K133
## -0.009168 -0.061650
## TC19BG0196.T2D4J26XKF1.H001K133 TC207J0272.R1D4J29XNF1.H004K133
## 0.041133 -0.015838
## TC19BG0198.T2D4J26XKF1.H001K133 TC19BG0205.T2D4J26XKF1.H001K133
## 0.006950 0.007973
## TC19BG0181.T2D4J22XKF1.H001K133 TC19BG0185.T2D4J22XKF1.H001K133
## 0.006796 -0.030169
## TC207J0254.R1D4J28XNF1.H004K133 TC19BG0189.T2D4J26XKF1.H001K133
## 0.000164 -0.000103
## TC207J0258.R1D4J28XNF1.H004K133 TC19BG0191.T2D4J26XKF1.H001K133
## -0.007974 -0.011569
## TC207J0252.R1D4J28XNF1.H004K133 TC207J0280.R1D4J29XNF1.H004K133
## 0.009450 -0.008491
## TC207J0282.R1D4J29XNF1.H004K133 TC19BG0203.T2D4J26XKF1.H001K133
## 0.003272 -0.020756
## TC19BG0179.T2D4J22XKF1.H001K133 TC19BG0183.T2D4J22XKF1.H001K133
## -0.001446 0.017825
## TC207J0264.R1D4J28XNF1.H004K133 TC19BG0194.T2D4J26XKF1.H001K133
## -0.005713 0.006055
## TC19BG0178.T2D4J22XKF1.H001K133 TC207J0278.R1D4J29XNF1.H004K133
## -0.009374 -0.006177
## TC19BG0201.T2D4J26XKF1.H001K133 TC207J0276.R1D4J29XNF1.H004K133
## 0.010041 -0.009082
## TC19BG0200.T2D4J26XKF1.H001K133 TC207J0250.T2D4J28XNF1.H004K133
## 0.012147 -0.009458
## TC19BG0187.T2D4J26XKF1.H001K133 TC19BG0204.T2D4J26XKF1.H001K133
## 0.038368 0.037241
## TC207J0262.T2D4J28XNF1.H004K133 TC19BG0193.T2D4J26XKF1.H001K133
## 0.012506 0.002595
## TC207J0260.R1D4J28XNF1.H004K133 TC19BG0192.T2D4J26XKF1.H001K133
## 0.002445 -0.016287
## TC207J0270.R1D4J29XNF1.H004K133 TC19BG0197.T2D4J26XKF1.H001K133
## -0.018438 0.001577
## TC19BG0186.T2D4J22XKF1.H001K133 TC207J0248.T2D4J28XNF1.H004K133
## -0.002940 0.005652
## TC19BG0206.T2D4J26XKF1.H001K133 TC207J0266.R1D4J28XNF1.H004K133
## -0.035901 0.029064
## TC19BG0195.T2D4J26XKF1.H001K133
## -0.001486
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.512065 0.560189 4.484 0.01095 *
## C1QTNF4 0.036345 0.056249 0.646 0.55338
## RPL18AP3 -0.314276 0.179972 -1.746 0.15570
## CLEC2L -0.251410 0.068689 -3.660 0.02158 *
## RPL13AP25 -0.052300 0.066522 -0.786 0.47571
## RPS12 0.065735 0.121948 0.539 0.61846
## EEF1A1P6 -0.289937 0.078621 -3.688 0.02106 *
## RPL36 -0.126051 0.136089 -0.926 0.40675
## RPL37A -0.450269 0.190018 -2.370 0.07685 .
## AC093106.7 -0.053686 0.043955 -1.221 0.28900
## RPS28 -0.088138 0.141869 -0.621 0.56808
## RPS27A -0.532689 0.261437 -2.038 0.11125
## FBL -0.508792 0.169930 -2.994 0.04017 *
## RPS10 -0.208203 0.118251 -1.761 0.15309
## RPL13 0.514247 0.210088 2.448 0.07061 .
## TOMM7 -0.096825 0.200536 -0.483 0.65445
## SYCE1L -0.080352 0.118820 -0.676 0.53597
## RPS18 0.273132 0.159435 1.713 0.16185
## RPS16 0.753290 0.199001 3.785 0.01935 *
## RPL18A 0.273894 0.076924 3.561 0.02357 *
## RPL12 0.430301 0.114620 3.754 0.01988 *
## UBA52 0.101434 0.146934 0.690 0.52795
## RPL24P4 -0.108329 0.075421 -1.436 0.22426
## RPS9 0.138650 0.135645 1.022 0.36449
## HNRNPA1 0.157747 0.163008 0.968 0.38798
## RPL13AP5 -0.405850 0.129394 -3.137 0.03496 *
## `RNU1-14P` -0.064956 0.114685 -0.566 0.60140
## RPS29 0.242408 0.118331 2.049 0.10987
## RPL5 0.049051 0.176222 0.278 0.79454
## `RP11-51O6.1` 0.156528 0.113325 1.381 0.23935
## age -0.022398 0.003963 -5.651 0.00483 **
## T -0.147541 0.085287 -1.730 0.15869
## N 0.313440 0.117466 2.668 0.05590 .
## stage 0.107124 0.225117 0.476 0.65900
## ER -0.280576 0.102539 -2.736 0.05210 .
## PR 0.381452 0.192537 1.981 0.11864
## HER2 0.079283 0.105046 0.755 0.49241
## KI67 0.863710 0.336302 2.568 0.06209 .
## mtype -0.229853 0.116707 -1.969 0.12024
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.004427658)
##
## Null deviance: 5.860465 on 42 degrees of freedom
## Residual deviance: 0.017711 on 4 degrees of freedom
## AIC: -133.15
##
## Number of Fisher Scoring iterations: 2
根据对model进行summary统计,我们筛选p值小于或约等于0.05的变量重新建模。根据结果,我们选择CLEC2L,EEF1A1P6,RPL37A,FBL,RPL13,RPS16,RPL18A,RPL12,RPL13AP5,age,N,ER,KI67进行建模:
model2 <- glm(label~ CLEC2L + EEF1A1P6 + RPL37A + FBL +
+ RPS16 + RPL18A + RPL12 + RPL13AP5 +
age + N + ER + KI67, data = df)
summary(model2)
##
## Call:
## glm(formula = label ~ CLEC2L + EEF1A1P6 + RPL37A + FBL + +RPS16 +
## RPL18A + RPL12 + RPL13AP5 + age + N + ER + KI67, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.49655 -0.08277 -0.00173 0.07917 0.35474
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2735049 0.2178630 10.435 1.68e-11 ***
## CLEC2L -0.2449086 0.0368324 -6.649 2.31e-07 ***
## EEF1A1P6 -0.1437896 0.0668370 -2.151 0.03962 *
## RPL37A -0.4517205 0.1398223 -3.231 0.00299 **
## FBL -0.2366576 0.1089162 -2.173 0.03782 *
## RPS16 0.4127877 0.1545059 2.672 0.01208 *
## RPL18A 0.0824971 0.0564310 1.462 0.15416
## RPL12 0.1483133 0.0905457 1.638 0.11187
## RPL13AP5 -0.0001417 0.1062973 -0.001 0.99895
## age -0.0081110 0.0046092 -1.760 0.08865 .
## N 0.0799929 0.0688648 1.162 0.25456
## ER -0.2770591 0.0863552 -3.208 0.00317 **
## KI67 0.6705565 0.2096666 3.198 0.00325 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.03383569)
##
## Null deviance: 5.8605 on 42 degrees of freedom
## Residual deviance: 1.0151 on 30 degrees of freedom
## AIC: -11.06
##
## Number of Fisher Scoring iterations: 2
继续根据P值选择变量:
model3 <- glm(label~ CLEC2L + EEF1A1P6 + RPL37A + FBL +
+ RPS16 + ER + KI67, data = df)
summary(model3)
##
## Call:
## glm(formula = label ~ CLEC2L + EEF1A1P6 + RPL37A + FBL + +RPS16 +
## ER + KI67, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.46497 -0.07791 0.02175 0.08349 0.38271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.97529 0.09619 20.534 < 2e-16 ***
## CLEC2L -0.22380 0.03380 -6.622 1.18e-07 ***
## EEF1A1P6 -0.09476 0.05745 -1.649 0.10800
## RPL37A -0.27241 0.10125 -2.690 0.01086 *
## FBL -0.09832 0.07462 -1.318 0.19617
## RPS16 0.26894 0.13348 2.015 0.05165 .
## ER -0.24684 0.08100 -3.047 0.00437 **
## KI67 0.51962 0.19372 2.682 0.01108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.03375105)
##
## Null deviance: 5.8605 on 42 degrees of freedom
## Residual deviance: 1.1813 on 35 degrees of freedom
## AIC: -14.539
##
## Number of Fisher Scoring iterations: 2
再次剔除掉EEF1A1P6和FBL进行建模:
model4 <- glm(label~ CLEC2L + RPL37A + ER + KI67, data = df)
summary(model4)
##
## Call:
## glm(formula = label ~ CLEC2L + RPL37A + ER + KI67, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.51823 -0.07929 -0.00857 0.08023 0.47617
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.98666 0.09254 21.467 < 2e-16 ***
## CLEC2L -0.23327 0.03409 -6.843 4.01e-08 ***
## RPL37A -0.17222 0.03100 -5.555 2.32e-06 ***
## ER -0.21584 0.07928 -2.722 0.00973 **
## KI67 0.37039 0.18454 2.007 0.05189 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.03565618)
##
## Null deviance: 5.8605 on 42 degrees of freedom
## Residual deviance: 1.3549 on 38 degrees of freedom
## AIC: -14.641
##
## Number of Fisher Scoring iterations: 2
最终,我们选择基因CLEC2L和RPL37A,患者信息ER和KI67作为最终变量进行建模预测:
library(caret)
set.seed(23)
trainX <- createDataPartition(df$label, p=2/3,list=FALSE)
train <- df[trainX,]
test <- df[-trainX,]
fit <- glm(label~ CLEC2L + RPL37A + ER + KI67, data = train)
summary(fit)
##
## Call:
## glm(formula = label ~ CLEC2L + RPL37A + ER + KI67, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.59889 -0.08354 0.00470 0.09522 0.44518
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.93769 0.12521 15.476 5.49e-14 ***
## CLEC2L -0.29367 0.05563 -5.279 2.05e-05 ***
## RPL37A -0.13059 0.05123 -2.549 0.0176 *
## ER -0.21043 0.10515 -2.001 0.0568 .
## KI67 0.43820 0.25961 1.688 0.1044
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.04482331)
##
## Null deviance: 4.1379 on 28 degrees of freedom
## Residual deviance: 1.0758 on 24 degrees of freedom
## AIC: -1.2354
##
## Number of Fisher Scoring iterations: 2
#predict on the test set
LRtest<-predict.glm(fit,test[,-30], type="response")
# Convert probabilities into class values.
LRprediction <- cut(LRtest, c(-Inf,1.5,Inf), labels=c("1","2"))
test$label <- as.factor(test$label)
LRprediction <- as.factor(LRprediction)
confusionMatrix(LRprediction,test$label)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 2 0
## 2 0 12
##
## Accuracy : 1
## 95% CI : (0.7684, 1)
## No Information Rate : 0.8571
## P-Value [Acc > NIR] : 0.1155
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1429
## Detection Rate : 0.1429
## Detection Prevalence : 0.1429
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 1
##
## ROC curve
{par(pty = "s")
r = roc(as.numeric(LRprediction), as.numeric(test$label))
plot.roc(r, print.auc=TRUE, auc.polygon=TRUE,
auc.polygon.col=rgb(.3,0,.8,0.2),
legacy.axes = T)
}
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
## Warning in if (legacy.axes) {: the condition has length > 1 and only the first
## element will be used
3折交叉验证(3-fold Cross Validation),因为数据samples较小,所以只能分成3份
k = 3
n = floor(nrow(df)/k)
error.cv = rep(NA, k)
i = 1
df <- as.data.frame(t(df))
df <- sample(df)
df <- as.data.frame(t(df))
for (i in 1:k) {
s1 = ((i - 1) * n + 1)
s2 = (i*n)
subset = s1:s2
cv.train = df[-subset, ]
cv.test = df[subset, ]
lr.cv <- glm(label~ CLEC2L + RPL37A + ER + KI67, data = train)
LRtest<-predict.glm(lr.cv,cv.test[,-30], type="response")
LRprediction <- cut(LRtest, c(-Inf,1.5,Inf), labels=c("1","2"))
r = roc(as.numeric(LRprediction), as.numeric(cv.test$label))
print(r$auc)
}
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
## Area under the curve: 1
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
## Area under the curve: 0.9583
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
## Area under the curve: 1