项目描述与目标

本次项目研究是基于省立医院的一项临床试验所得到的RNA-seq基因表达、病人信息等测序或统计相关的数据,希望对数据进行预处理,探索性数据分析,主要是针对患者是否达到pCR这一指标,构建模型筛选出最显著的biomarker,从而对临床研究,患者的用药指导提出建设性意见。

数据说明

本次案例所需要的数据包括患者的RNA-seq结果(data),样本编号信息(symbol),以及省立病人信息汇总(pcr)三组数据。RNA-seq结果(data)里记录了患者的ID和基因表达,省立病人信息汇总(pcr)里记录了28名临床试验患者的统计信息,其中包括我们分析的label(是否pCR),样本编号信息(symbol)的作用是将RNA-seq结果(data)与省立病人信息汇总(pcr)按照姓名和ID串联在一起。

读取数据

getwd()
## [1] "/home/Bob/Jiaxuan/week19_cfDNA_pCR"
setwd("/home/Bob/Jiaxuan/week19_cfDNA_pCR")

#install.packages("gdata")
#library(gdata)

#data <- read.xls("Tissue_RNA_Seq.xlsx", sheet = 1, na.strings = c("NA", "DIV/0!"))

#symbol <- read.xls("Tissue_RNA_Seq.xlsx", sheet = 2, na.strings = c("NA", "DIV/0!"))

#pcr <-  read.xls("Tissue_RNA_Seq.xlsx", sheet = 7, na.strings = c("NA", "DIV/0!"))

#save.image("~/Jiaxuan/week19_cfDNA_pCR/RNA-seq_data.RData")
load("~/Jiaxuan/week19_cfDNA_pCR/RNA-seq_data.RData")

读取完数据后,我们最好将raw data保存起来,方便日后提取应用。

查看数据

str(data)
## 'data.frame':    57820 obs. of  49 variables:
##  $ GeneSymbol                     : Factor w/ 55765 levels "5S_rRNA","7SK",..: 53170 52462 13840 48746 8238 15604 9672 15941 16232 25069 ...
##  $ GeneID                         : Factor w/ 57820 levels "ENSG00000000003.10",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ TC207J0248.T2D4J28XNF1.H004K133: num  4.59 0.53 7.71 5.23 3.21 ...
##  $ TC207J0250.T2D4J28XNF1.H004K133: num  4.55 0.4 18.47 10.68 7.9 ...
##  $ TC207J0252.R1D4J28XNF1.H004K133: num  2.28 1.85 4.36 1.09 0.69 ...
##  $ TC207J0254.R1D4J28XNF1.H004K133: num  2.3 0.27 6.36 1.57 1.53 5.76 13.5 3.65 7.27 4.44 ...
##  $ TC207J0256.T2D4J28XNF1.H004K133: num  3.05 1.08 12.63 6.74 5.4 ...
##  $ TC207J0258.R1D4J28XNF1.H004K133: num  3.25 4.56 8.62 1.43 1.01 ...
##  $ TC207J0260.R1D4J28XNF1.H004K133: num  0 0 8.22 2.48 0.86 1.23 3.4 7.88 8.43 4.12 ...
##  $ TC207J0262.T2D4J28XNF1.H004K133: num  27.41 21.26 19.72 4.27 2.85 ...
##  $ TC207J0264.R1D4J28XNF1.H004K133: num  5.34 4.33 5.15 1.81 0.62 ...
##  $ TC207J0266.R1D4J28XNF1.H004K133: num  1.7 0 7.85 10.57 2.64 ...
##  $ TC207J0268.R1D4J28XNF1.H004K133: num  4.41 0 11.55 16.74 8.84 ...
##  $ TC207J0270.R1D4J29XNF1.H004K133: num  0.6 0.11 11.95 1.48 0.91 ...
##  $ TC207J0272.R1D4J29XNF1.H004K133: num  7.04 2.17 13.4 9.16 11.93 ...
##  $ TC207J0274.R1D4J29XNF1.H004K133: num  4.49 0.69 10.54 1.5 0.57 ...
##  $ TC207J0276.R1D4J29XNF1.H004K133: num  0 0 0 0.22 0 0.05 0 0 0.2 0.49 ...
##  $ TC207J0278.R1D4J29XNF1.H004K133: num  4.29 1.45 8.63 10 2.97 ...
##  $ TC207J0280.R1D4J29XNF1.H004K133: num  4.1 0.77 12.13 1.03 0.87 ...
##  $ TC207J0282.R1D4J29XNF1.H004K133: num  1.94 0 7.65 4.93 1.75 ...
##  $ TC19BG0178.T2D4J22XKF1.H001K133: num  4.03 0.84 6.74 3.46 2.57 ...
##  $ TC19BG0179.T2D4J22XKF1.H001K133: num  10.48 0.94 20.85 8.1 3.7 ...
##  $ TC19BG0180.T2D4J22XKF1.H001K133: num  19.82 0.24 31.49 11.2 6.14 ...
##  $ TC19BG0181.T2D4J22XKF1.H001K133: num  21.97 2.66 21.85 6.05 3.62 ...
##  $ TC19BG0183.T2D4J22XKF1.H001K133: num  3.48 0.06 32.1 5.11 7.01 ...
##  $ TC19BG0184.T2D4J22XKF1.H001K133: num  16.44 0.16 28.42 9.82 7.24 ...
##  $ TC19BG0185.T2D4J22XKF1.H001K133: num  10.05 0.6 16.76 5.74 5.25 ...
##  $ TC19BG0186.T2D4J22XKF1.H001K133: num  6.69 0.27 26.32 6.88 6.79 ...
##  $ TC19BG0187.T2D4J26XKF1.H001K133: num  6.72 0.11 23.58 13.91 9.03 ...
##  $ TC19BG0188.T2D4J26XKF1.H001K133: num  18.1 0.32 26.63 6.03 6.53 ...
##  $ TC19BG0189.T2D4J26XKF1.H001K133: num  5.95 0.47 26.46 12.01 7.49 ...
##  $ TC19BG0190.T2D4J26XKF1.H001K133: num  3.71 0.05 7.88 6.12 5.22 ...
##  $ TC19BG0191.T2D4J26XKF1.H001K133: num  1.37 0 17.46 3.74 3.82 ...
##  $ TC19BG0192.T2D4J26XKF1.H001K133: num  2.44 0 6.74 6.26 12.25 ...
##  $ TC19BG0193.T2D4J26XKF1.H001K133: num  7.48 0.19 32.42 7.52 12.58 ...
##  $ TC19BG0194.T2D4J26XKF1.H001K133: num  4.35 0.06 19.67 8.87 7.31 ...
##  $ TC19BG0195.T2D4J26XKF1.H001K133: num  4.33 0 16.3 17.24 5.36 ...
##  $ TC19BG0196.T2D4J26XKF1.H001K133: num  3.9 0.78 9.49 8.45 5.39 ...
##  $ TC19BG0197.T2D4J26XKF1.H001K133: num  6.75 0 50.39 8.12 10.12 ...
##  $ TC19BG0198.T2D4J26XKF1.H001K133: num  3.53 0.15 25.59 5.04 8.6 ...
##  $ TC19BG0199.T2D4J26XKF1.H001K133: num  15.09 0.26 11.2 5.73 2.82 ...
##  $ TC19BG0200.T2D4J26XKF1.H001K133: num  2.89 0 13.69 4.85 4.54 ...
##  $ TC19BG0201.T2D4J26XKF1.H001K133: num  14.44 0 29.34 7.36 6.89 ...
##  $ TC19BG0203.T2D4J26XKF1.H001K133: num  7.77 0 24.95 5.99 7.75 ...
##  $ TC19BG0204.T2D4J26XKF1.H001K133: num  4.85 0 22.16 9.23 7.67 ...
##  $ TC19BG0205.T2D4J26XKF1.H001K133: num  4.66 0.13 16.3 10.38 6.54 ...
##  $ TC19BG0206.T2D4J26XKF1.H001K133: num  11.36 1.25 16.01 8.96 5.3 ...
##  $ TC19BG0207.T2D4J26XKF1.H001K133: num  11.83 0 18 6.18 4.99 ...
##  $ TC19BG0208.T2D4J26XKF1.H001K133: num  5.24 0 16.79 7.39 7.79 ...
dim(data)
## [1] 57820    49
data[1:5,1:5]
##   GeneSymbol             GeneID TC207J0248.T2D4J28XNF1.H004K133
## 1     TSPAN6 ENSG00000000003.10                            4.59
## 2       TNMD  ENSG00000000005.5                            0.53
## 3       DPM1  ENSG00000000419.8                            7.71
## 4      SCYL3  ENSG00000000457.9                            5.23
## 5   C1orf112 ENSG00000000460.12                            3.21
##   TC207J0250.T2D4J28XNF1.H004K133 TC207J0252.R1D4J28XNF1.H004K133
## 1                            4.55                            2.28
## 2                            0.40                            1.85
## 3                           18.47                            4.36
## 4                           10.68                            1.09
## 5                            7.90                            0.69
str(pcr)
## 'data.frame':    28 obs. of  15 variables:
##  $ 患者姓名                                   : Factor w/ 28 levels "余昔情","刘婷",..: 8 24 25 20 27 28 6 14 16 15 ...
##  $ 编号                                       : Factor w/ 28 levels "P002","P003",..: 5 15 13 24 21 23 6 7 14 16 ...
##  $ 年龄                                       : int  44 32 59 49 68 69 63 39 48 65 ...
##  $ ctDNA状态                                  : Factor w/ 3 levels "","阳性","阴性": 3 2 2 2 3 2 3 3 3 3 ...
##  $ 是否pCR.1是.2否                            : int  2 2 2 1 2 2 2 2 2 2 ...
##  $ T.0.4.                                     : int  2 2 2 3 2 4 1 2 3 2 ...
##  $ N.0.3.                                     : int  1 1 1 1 2 3 1 1 1 1 ...
##  $ M.0.1.                                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ 临床分期.1.2.3.4期                         : int  2 2 2 3 2 3 2 2 3 2 ...
##  $ ER..1阳.2阴.                               : int  1 2 2 2 1 1 2 1 1 1 ...
##  $ PR..1阳.2阴.                               : int  1 2 2 2 1 1 2 1 1 2 ...
##  $ HER2..1阳.2阴.                             : int  2 2 2 2 2 2 1 2 1 2 ...
##  $ KI67                                       : num  0.1 0.8 0.4 0.4 0.5 0.3 0.3 0.6 0.3 0.2 ...
##  $ 分子分型.4分类..1LA.2LB.3HER2.4TNBC..不知道: int  1 4 4 4 1 2 3 2 2 2 ...
##  $ MPR分级                                    : Factor w/ 11 levels "","   拒绝手术",..: 8 3 2 10 8 3 4 6 11 6 ...
dim(pcr)
## [1] 28 15
head(pcr)
##           患者姓名 编号 年龄 ctDNA状态 是否pCR.1是.2否 T.0.4. N.0.3. M.0.1.
## 1             崔芳 P006   44      阴性               2      2      1      0
## 2           高吉如 P019   32      阳性               2      2      1      0
## 3 高雅丽(拒绝手术) P015   59      阳性               2      2      1      0
## 4           胡章琼 P037   49      阳性               1      3      1      0
## 5           黄典英 P027   68      阴性               2      2      2      0
## 6           黄珠江 P033   69      阳性               2      4      3      0
##   临床分期.1.2.3.4期 ER..1阳.2阴. PR..1阳.2阴. HER2..1阳.2阴. KI67
## 1                  2            1            1              2  0.1
## 2                  2            2            2              2  0.8
## 3                  2            2            2              2  0.4
## 4                  3            2            2              2  0.4
## 5                  2            1            1              2  0.5
## 6                  3            1            1              2  0.3
##   分子分型.4分类..1LA.2LB.3HER2.4TNBC..不知道      MPR分级
## 1                                           1 3级  30%-90%
## 2                                           4          1级
## 3                                           4     拒绝手术
## 4                                           4          5级
## 5                                           1 3级  30%-90%
## 6                                           2          1级
str(symbol)
## 'data.frame':    47 obs. of  3 variables:
##  $ 姓名      : Factor w/ 29 levels "","余昔情","刘婷",..: 1 9 25 25 26 21 21 28 28 29 ...
##  $ GeneSymbol: Factor w/ 47 levels "GeneID","TC19BG0178-T2D4J22XKF1-H001K133手术",..: 1 6 34 12 10 43 21 40 18 42 ...
##  $ 样本      : Factor w/ 3 levels "","基线","手术": 1 2 3 2 2 3 2 3 2 3 ...
dim(symbol)
## [1] 47  3
head(pcr)
##           患者姓名 编号 年龄 ctDNA状态 是否pCR.1是.2否 T.0.4. N.0.3. M.0.1.
## 1             崔芳 P006   44      阴性               2      2      1      0
## 2           高吉如 P019   32      阳性               2      2      1      0
## 3 高雅丽(拒绝手术) P015   59      阳性               2      2      1      0
## 4           胡章琼 P037   49      阳性               1      3      1      0
## 5           黄典英 P027   68      阴性               2      2      2      0
## 6           黄珠江 P033   69      阳性               2      4      3      0
##   临床分期.1.2.3.4期 ER..1阳.2阴. PR..1阳.2阴. HER2..1阳.2阴. KI67
## 1                  2            1            1              2  0.1
## 2                  2            2            2              2  0.8
## 3                  2            2            2              2  0.4
## 4                  3            2            2              2  0.4
## 5                  2            1            1              2  0.5
## 6                  3            1            1              2  0.3
##   分子分型.4分类..1LA.2LB.3HER2.4TNBC..不知道      MPR分级
## 1                                           1 3级  30%-90%
## 2                                           4          1级
## 3                                           4     拒绝手术
## 4                                           4          5级
## 5                                           1 3级  30%-90%
## 6                                           2          1级

数据清理与整合

RNA-seq数据处理

查看基因表达数据data中会发现GeneSymbol的levels有55765个,而数据data的dimension中GeneSymbol的个数是57820个,这表示在测序过程中出现了基因重复的情况。在这种情况下,我们可以将这些基因合并在一起取他们的平均值:

# duplicated GeneSymbol get average:
library(limma)
df <- avereps(data[,-c(1,2)], data$GeneSymbol)
dim(df)
## [1] 55765    47
df[1:5,1:5]
##          TC207J0248.T2D4J28XNF1.H004K133 TC207J0250.T2D4J28XNF1.H004K133
## TSPAN6                              4.59                            4.55
## TNMD                                0.53                            0.40
## DPM1                                7.71                           18.47
## SCYL3                               5.23                           10.68
## C1orf112                            3.21                            7.90
##          TC207J0252.R1D4J28XNF1.H004K133 TC207J0254.R1D4J28XNF1.H004K133
## TSPAN6                              2.28                            2.30
## TNMD                                1.85                            0.27
## DPM1                                4.36                            6.36
## SCYL3                               1.09                            1.57
## C1orf112                            0.69                            1.53
##          TC207J0256.T2D4J28XNF1.H004K133
## TSPAN6                              3.05
## TNMD                                1.08
## DPM1                               12.63
## SCYL3                               6.74
## C1orf112                            5.40

样本编号信息(symbol)和病人信息汇总(pcr)的数据处理

因为基因表达数据data变量太多,预处理后有57820个基因,所以这组数据要单独进行处理。而对于样本编号信息(symbol)和病人信息汇总(pcr)的数据,我们可以先将这两组数据按照姓名进行整合:

数据合并

首先将直接手术,或是无疗效的患者从数据集中剔除出去,不纳入分析范围:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
symbol <- symbol[-c(1,5,21,29),-3]
colnames(symbol) <- c("name","GeneSymbol")

# remove unnecessary words
id <- symbol$GeneSymbol
library(stringr)
id <- str_replace(id, "基线", "")
id <- str_replace(id, "手术", "")
id <- str_replace(id, "-", ".")
id <- str_replace(id, "-", ".")
symbol$id <- id
head(symbol)
##     name                          GeneSymbol                              id
## 2   崔芳 TC19BG0184-T2D4J22XKF1-H001K133基线 TC19BG0184.T2D4J22XKF1.H001K133
## 3 高吉如 TC207J0256-T2D4J28XNF1-H004K133手术 TC207J0256.T2D4J28XNF1.H004K133
## 4 高吉如 TC19BG0190-T2D4J26XKF1-H001K133基线 TC19BG0190.T2D4J26XKF1.H001K133
## 6 胡章琼 TC207J0274-R1D4J29XNF1-H004K133手术 TC207J0274.R1D4J29XNF1.H004K133
## 7 胡章琼 TC19BG0199-T2D4J26XKF1-H001K133基线 TC19BG0199.T2D4J26XKF1.H001K133
## 8 黄典英 TC207J0268-R1D4J28XNF1-H004K133手术 TC207J0268.R1D4J28XNF1.H004K133

pcr数据中MPR和label(pcr)意义相同,不纳入分析中:

pcr <- pcr[-c(3,13,18), -c(2,15)]
dim(pcr)
## [1] 25 13
colnames(pcr) <- c("name", "age", "ctDNA", "label", "T", "N", "M", "stage", "ER", 
                   "PR", "HER2", "KI67", "mtype")
pcr[15,3] <- "阴性"


for (i in 1:length(symbol$name)) {
  for (j in 1:length(pcr$name)) {
    if(symbol$name[i] == pcr$name[j]){
      symbol$age[i] = pcr$age[j]
      symbol$ctNDA[i] = pcr$ctNDA[j]
      symbol$T[i] = pcr$T[j]
      symbol$N[i] = pcr$N[j]
      symbol$M[i] = pcr$M[j]
      symbol$stage[i] = pcr$stage[j]
      symbol$ER[i] = pcr$ER[j]
      symbol$PR[i] = pcr$PR[j]
      symbol$HER2[i] = pcr$HER2[j]
      symbol$KI67[i] = pcr$KI67[j]
      symbol$mtype[i] = pcr$mtype[j]
      symbol$label[i] = pcr$label[j]
    }
  }
}
head(symbol)
##     name                          GeneSymbol                              id
## 2   崔芳 TC19BG0184-T2D4J22XKF1-H001K133基线 TC19BG0184.T2D4J22XKF1.H001K133
## 3 高吉如 TC207J0256-T2D4J28XNF1-H004K133手术 TC207J0256.T2D4J28XNF1.H004K133
## 4 高吉如 TC19BG0190-T2D4J26XKF1-H001K133基线 TC19BG0190.T2D4J26XKF1.H001K133
## 6 胡章琼 TC207J0274-R1D4J29XNF1-H004K133手术 TC207J0274.R1D4J29XNF1.H004K133
## 7 胡章琼 TC19BG0199-T2D4J26XKF1-H001K133基线 TC19BG0199.T2D4J26XKF1.H001K133
## 8 黄典英 TC207J0268-R1D4J28XNF1-H004K133手术 TC207J0268.R1D4J28XNF1.H004K133
##   age T N M stage ER PR HER2 KI67 mtype label
## 2  44 2 1 0     2  1  1    2  0.1     1     2
## 3  32 2 1 0     2  2  2    2  0.8     4     2
## 4  32 2 1 0     2  2  2    2  0.8     4     2
## 6  49 3 1 0     3  2  2    2  0.4     4     1
## 7  49 3 1 0     3  2  2    2  0.4     4     1
## 8  68 2 2 0     2  1  1    2  0.5     1     2

RNA-seq数据分析

预处理

我们首先按照symbol里ID名对基因表达数据进行筛选,然后进行对基因表达进行标准化处理,方法选择为mean/std scale。

df <- t(df)
df <- df[symbol$id,]
dim(df)
## [1]    43 55765
mean <- apply(df, 2, mean)
std <- apply(df, 2, sd)
df <- scale(df, center = mean, scale = std)
#df[is.na(df)] <- 0

# delete column where exists NaN value:
na_flag <- apply(is.na(df), 2, sum)
df <- df[,which(na_flag == 0)]
dim(df)
## [1]    43 42601

ANOVA

数据标准化后,我们将每个基因对应label(是否pCR)进行ANOVA差异分析,并将p值进行矫正,方法选择Benjamini & Hochberg法(BH法), 之后按照adjP值选择adjP值小于0.05的基因并按大小排列进行后续分析:

## ANOVA filter gene marker:
label <- symbol$label
p <- NULL
for (i in 1:ncol(df)) {
  fit <- aov(label~df[,i])
  p <- c(p, summary(fit)[[1]][5][1,])
}

adjp <- p.adjust(p, method = "BH")
mydata <- rbind(df, adjp)
dim(mydata)
## [1]    44 42601
# gene filter/p<0.05
b <- (mydata[44,] <= 0.05)
mydata <- as.data.frame(mydata[,b])
c <- sort(mydata[44,])
row.names(c) <- "adj_P_value"
c[,1:10]
##                 C1QTNF4    RPL18AP3      CLEC2L   RPL13AP25     RPS12  EEF1A1P6
## adj_P_value 0.003482694 0.003482694 0.003482694 0.004378604 0.0146689 0.0146689
##                 RPL36    RPL37A AC093106.7     RPS28
## adj_P_value 0.0171684 0.0171684  0.0171684 0.0171684
#write.csv(c, "ANOVA_GeneMarker_by_adjP.csv")

#gene <- read.csv("ANOVA_GeneMarker_by_adjP.csv", header = T, row.names = 1)
gene <- colnames(c)
df <- df[,gene]
df <- as.data.frame(df)
label <- as.factor(symbol$label)
df$label <- label

数据合并

将标准化后的,基因筛选后的基因表达数据final clean data,与之前合并过的symbol按照患者编号进行合并:

age <- symbol$age
T <- symbol$T
N <- symbol$N
stage <- symbol$stage
ER <- symbol$ER
PR <- symbol$PR
HER2 <- symbol$HER2
KI67 <- symbol$KI67
mtype <- symbol$mtype

df$age <- age
df$T <- T
df$N <- N
df$stage <-stage
df$ER <- ER
df$PR <- PR
df$HER2 <- HER2
df$KI67 <- KI67
df$mtype <- mtype

dim(df)
## [1] 43 39

Logistic

我们使用逻辑回归来训练模型,对我们数据集一共38个变量进行筛选:

df$label <- as.numeric(df$label)
df$age <- as.numeric(df$age)
df$T <- as.numeric(df$T)
df$N <- as.numeric(df$N)
df$stage <- as.numeric(df$stage)
df$ER <- as.numeric(df$ER)
df$PR <- as.numeric(df$PR)
df$HER2 <- as.numeric(df$HER2)
df$KI67 <- as.numeric(df$KI67)
df$mtype <- as.numeric(df$mtype)

#df <- df[,-c(31:39)]
model <- glm(label~., data = df)
summary(model)
## 
## Call:
## glm(formula = label ~ ., data = df)
## 
## Deviance Residuals: 
## TC19BG0184.T2D4J22XKF1.H001K133  TC207J0256.T2D4J28XNF1.H004K133  
##                        0.005478                         0.047872  
## TC19BG0190.T2D4J26XKF1.H001K133  TC207J0274.R1D4J29XNF1.H004K133  
##                       -0.029864                         0.007280  
## TC19BG0199.T2D4J26XKF1.H001K133  TC207J0268.R1D4J28XNF1.H004K133  
##                       -0.009168                        -0.061650  
## TC19BG0196.T2D4J26XKF1.H001K133  TC207J0272.R1D4J29XNF1.H004K133  
##                        0.041133                        -0.015838  
## TC19BG0198.T2D4J26XKF1.H001K133  TC19BG0205.T2D4J26XKF1.H001K133  
##                        0.006950                         0.007973  
## TC19BG0181.T2D4J22XKF1.H001K133  TC19BG0185.T2D4J22XKF1.H001K133  
##                        0.006796                        -0.030169  
## TC207J0254.R1D4J28XNF1.H004K133  TC19BG0189.T2D4J26XKF1.H001K133  
##                        0.000164                        -0.000103  
## TC207J0258.R1D4J28XNF1.H004K133  TC19BG0191.T2D4J26XKF1.H001K133  
##                       -0.007974                        -0.011569  
## TC207J0252.R1D4J28XNF1.H004K133  TC207J0280.R1D4J29XNF1.H004K133  
##                        0.009450                        -0.008491  
## TC207J0282.R1D4J29XNF1.H004K133  TC19BG0203.T2D4J26XKF1.H001K133  
##                        0.003272                        -0.020756  
## TC19BG0179.T2D4J22XKF1.H001K133  TC19BG0183.T2D4J22XKF1.H001K133  
##                       -0.001446                         0.017825  
## TC207J0264.R1D4J28XNF1.H004K133  TC19BG0194.T2D4J26XKF1.H001K133  
##                       -0.005713                         0.006055  
## TC19BG0178.T2D4J22XKF1.H001K133  TC207J0278.R1D4J29XNF1.H004K133  
##                       -0.009374                        -0.006177  
## TC19BG0201.T2D4J26XKF1.H001K133  TC207J0276.R1D4J29XNF1.H004K133  
##                        0.010041                        -0.009082  
## TC19BG0200.T2D4J26XKF1.H001K133  TC207J0250.T2D4J28XNF1.H004K133  
##                        0.012147                        -0.009458  
## TC19BG0187.T2D4J26XKF1.H001K133  TC19BG0204.T2D4J26XKF1.H001K133  
##                        0.038368                         0.037241  
## TC207J0262.T2D4J28XNF1.H004K133  TC19BG0193.T2D4J26XKF1.H001K133  
##                        0.012506                         0.002595  
## TC207J0260.R1D4J28XNF1.H004K133  TC19BG0192.T2D4J26XKF1.H001K133  
##                        0.002445                        -0.016287  
## TC207J0270.R1D4J29XNF1.H004K133  TC19BG0197.T2D4J26XKF1.H001K133  
##                       -0.018438                         0.001577  
## TC19BG0186.T2D4J22XKF1.H001K133  TC207J0248.T2D4J28XNF1.H004K133  
##                       -0.002940                         0.005652  
## TC19BG0206.T2D4J26XKF1.H001K133  TC207J0266.R1D4J28XNF1.H004K133  
##                       -0.035901                         0.029064  
## TC19BG0195.T2D4J26XKF1.H001K133  
##                       -0.001486  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    2.512065   0.560189   4.484  0.01095 * 
## C1QTNF4        0.036345   0.056249   0.646  0.55338   
## RPL18AP3      -0.314276   0.179972  -1.746  0.15570   
## CLEC2L        -0.251410   0.068689  -3.660  0.02158 * 
## RPL13AP25     -0.052300   0.066522  -0.786  0.47571   
## RPS12          0.065735   0.121948   0.539  0.61846   
## EEF1A1P6      -0.289937   0.078621  -3.688  0.02106 * 
## RPL36         -0.126051   0.136089  -0.926  0.40675   
## RPL37A        -0.450269   0.190018  -2.370  0.07685 . 
## AC093106.7    -0.053686   0.043955  -1.221  0.28900   
## RPS28         -0.088138   0.141869  -0.621  0.56808   
## RPS27A        -0.532689   0.261437  -2.038  0.11125   
## FBL           -0.508792   0.169930  -2.994  0.04017 * 
## RPS10         -0.208203   0.118251  -1.761  0.15309   
## RPL13          0.514247   0.210088   2.448  0.07061 . 
## TOMM7         -0.096825   0.200536  -0.483  0.65445   
## SYCE1L        -0.080352   0.118820  -0.676  0.53597   
## RPS18          0.273132   0.159435   1.713  0.16185   
## RPS16          0.753290   0.199001   3.785  0.01935 * 
## RPL18A         0.273894   0.076924   3.561  0.02357 * 
## RPL12          0.430301   0.114620   3.754  0.01988 * 
## UBA52          0.101434   0.146934   0.690  0.52795   
## RPL24P4       -0.108329   0.075421  -1.436  0.22426   
## RPS9           0.138650   0.135645   1.022  0.36449   
## HNRNPA1        0.157747   0.163008   0.968  0.38798   
## RPL13AP5      -0.405850   0.129394  -3.137  0.03496 * 
## `RNU1-14P`    -0.064956   0.114685  -0.566  0.60140   
## RPS29          0.242408   0.118331   2.049  0.10987   
## RPL5           0.049051   0.176222   0.278  0.79454   
## `RP11-51O6.1`  0.156528   0.113325   1.381  0.23935   
## age           -0.022398   0.003963  -5.651  0.00483 **
## T             -0.147541   0.085287  -1.730  0.15869   
## N              0.313440   0.117466   2.668  0.05590 . 
## stage          0.107124   0.225117   0.476  0.65900   
## ER            -0.280576   0.102539  -2.736  0.05210 . 
## PR             0.381452   0.192537   1.981  0.11864   
## HER2           0.079283   0.105046   0.755  0.49241   
## KI67           0.863710   0.336302   2.568  0.06209 . 
## mtype         -0.229853   0.116707  -1.969  0.12024   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.004427658)
## 
##     Null deviance: 5.860465  on 42  degrees of freedom
## Residual deviance: 0.017711  on  4  degrees of freedom
## AIC: -133.15
## 
## Number of Fisher Scoring iterations: 2

根据对model进行summary统计,我们筛选p值小于或约等于0.05的变量重新建模。根据结果,我们选择CLEC2L,EEF1A1P6,RPL37A,FBL,RPL13,RPS16,RPL18A,RPL12,RPL13AP5,age,N,ER,KI67进行建模:

model2 <- glm(label~ CLEC2L + EEF1A1P6 + RPL37A + FBL + 
               + RPS16 + RPL18A + RPL12 + RPL13AP5 + 
               age + N + ER + KI67, data = df)
summary(model2)
## 
## Call:
## glm(formula = label ~ CLEC2L + EEF1A1P6 + RPL37A + FBL + +RPS16 + 
##     RPL18A + RPL12 + RPL13AP5 + age + N + ER + KI67, data = df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.49655  -0.08277  -0.00173   0.07917   0.35474  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.2735049  0.2178630  10.435 1.68e-11 ***
## CLEC2L      -0.2449086  0.0368324  -6.649 2.31e-07 ***
## EEF1A1P6    -0.1437896  0.0668370  -2.151  0.03962 *  
## RPL37A      -0.4517205  0.1398223  -3.231  0.00299 ** 
## FBL         -0.2366576  0.1089162  -2.173  0.03782 *  
## RPS16        0.4127877  0.1545059   2.672  0.01208 *  
## RPL18A       0.0824971  0.0564310   1.462  0.15416    
## RPL12        0.1483133  0.0905457   1.638  0.11187    
## RPL13AP5    -0.0001417  0.1062973  -0.001  0.99895    
## age         -0.0081110  0.0046092  -1.760  0.08865 .  
## N            0.0799929  0.0688648   1.162  0.25456    
## ER          -0.2770591  0.0863552  -3.208  0.00317 ** 
## KI67         0.6705565  0.2096666   3.198  0.00325 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.03383569)
## 
##     Null deviance: 5.8605  on 42  degrees of freedom
## Residual deviance: 1.0151  on 30  degrees of freedom
## AIC: -11.06
## 
## Number of Fisher Scoring iterations: 2

继续根据P值选择变量:

model3 <- glm(label~ CLEC2L + EEF1A1P6 + RPL37A + FBL + 
               + RPS16 + ER + KI67, data = df)
summary(model3)
## 
## Call:
## glm(formula = label ~ CLEC2L + EEF1A1P6 + RPL37A + FBL + +RPS16 + 
##     ER + KI67, data = df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.46497  -0.07791   0.02175   0.08349   0.38271  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.97529    0.09619  20.534  < 2e-16 ***
## CLEC2L      -0.22380    0.03380  -6.622 1.18e-07 ***
## EEF1A1P6    -0.09476    0.05745  -1.649  0.10800    
## RPL37A      -0.27241    0.10125  -2.690  0.01086 *  
## FBL         -0.09832    0.07462  -1.318  0.19617    
## RPS16        0.26894    0.13348   2.015  0.05165 .  
## ER          -0.24684    0.08100  -3.047  0.00437 ** 
## KI67         0.51962    0.19372   2.682  0.01108 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.03375105)
## 
##     Null deviance: 5.8605  on 42  degrees of freedom
## Residual deviance: 1.1813  on 35  degrees of freedom
## AIC: -14.539
## 
## Number of Fisher Scoring iterations: 2

再次剔除掉EEF1A1P6和FBL进行建模:

model4 <- glm(label~ CLEC2L + RPL37A + ER + KI67, data = df)
summary(model4)
## 
## Call:
## glm(formula = label ~ CLEC2L + RPL37A + ER + KI67, data = df)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.51823  -0.07929  -0.00857   0.08023   0.47617  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.98666    0.09254  21.467  < 2e-16 ***
## CLEC2L      -0.23327    0.03409  -6.843 4.01e-08 ***
## RPL37A      -0.17222    0.03100  -5.555 2.32e-06 ***
## ER          -0.21584    0.07928  -2.722  0.00973 ** 
## KI67         0.37039    0.18454   2.007  0.05189 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.03565618)
## 
##     Null deviance: 5.8605  on 42  degrees of freedom
## Residual deviance: 1.3549  on 38  degrees of freedom
## AIC: -14.641
## 
## Number of Fisher Scoring iterations: 2

最终,我们选择基因CLEC2L和RPL37A,患者信息ER和KI67作为最终变量进行建模预测:

library(caret)
set.seed(23)
trainX <- createDataPartition(df$label, p=2/3,list=FALSE) 
train <- df[trainX,]
test <- df[-trainX,]

fit <- glm(label~ CLEC2L + RPL37A + ER + KI67, data = train)
summary(fit)
## 
## Call:
## glm(formula = label ~ CLEC2L + RPL37A + ER + KI67, data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.59889  -0.08354   0.00470   0.09522   0.44518  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.93769    0.12521  15.476 5.49e-14 ***
## CLEC2L      -0.29367    0.05563  -5.279 2.05e-05 ***
## RPL37A      -0.13059    0.05123  -2.549   0.0176 *  
## ER          -0.21043    0.10515  -2.001   0.0568 .  
## KI67         0.43820    0.25961   1.688   0.1044    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.04482331)
## 
##     Null deviance: 4.1379  on 28  degrees of freedom
## Residual deviance: 1.0758  on 24  degrees of freedom
## AIC: -1.2354
## 
## Number of Fisher Scoring iterations: 2
#predict on the test set
LRtest<-predict.glm(fit,test[,-30], type="response")
# Convert probabilities into class values. 
LRprediction <- cut(LRtest, c(-Inf,1.5,Inf), labels=c("1","2"))

test$label <- as.factor(test$label)
LRprediction <- as.factor(LRprediction)
confusionMatrix(LRprediction,test$label)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2
##          1  2  0
##          2  0 12
##                                      
##                Accuracy : 1          
##                  95% CI : (0.7684, 1)
##     No Information Rate : 0.8571     
##     P-Value [Acc > NIR] : 0.1155     
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.1429     
##          Detection Rate : 0.1429     
##    Detection Prevalence : 0.1429     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 1          
## 
## ROC curve
{par(pty = "s")
r = roc(as.numeric(LRprediction), as.numeric(test$label))
plot.roc(r, print.auc=TRUE, auc.polygon=TRUE,
           auc.polygon.col=rgb(.3,0,.8,0.2),
           legacy.axes = T)
}
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
## Warning in if (legacy.axes) {: the condition has length > 1 and only the first
## element will be used

3-folds

3折交叉验证(3-fold Cross Validation),因为数据samples较小,所以只能分成3份

k = 3
n = floor(nrow(df)/k)
error.cv = rep(NA, k)
i = 1

df <- as.data.frame(t(df))
df <- sample(df)
df <- as.data.frame(t(df))

for (i in 1:k) {
  
  s1 = ((i - 1) * n + 1)
  s2 = (i*n)
  subset = s1:s2
  
  cv.train = df[-subset, ]
  cv.test = df[subset, ]
  
  lr.cv <- glm(label~ CLEC2L + RPL37A + ER + KI67, data = train)
  
  LRtest<-predict.glm(lr.cv,cv.test[,-30], type="response")
  LRprediction <- cut(LRtest, c(-Inf,1.5,Inf), labels=c("1","2"))
  
  
  r = roc(as.numeric(LRprediction), as.numeric(cv.test$label))
  print(r$auc)
}
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
## Area under the curve: 1
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
## Area under the curve: 0.9583
## Setting levels: control = 1, case = 2
## Setting direction: controls < cases
## Area under the curve: 1