准备

# Session setup: set the working directory, clear the workspace, run garbage
# collection, then attach the packages used by the script.
# NOTE(review): setwd() ties the script to one machine's directory layout and
# rm(list = ls()) deletes every object in the session it is run from; both are
# kept here unchanged because later steps may rely on them.
setwd("D:/R/R-4.0.5/bin/project_writing/dataProcess")
rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 421397 22.6     882125 47.2   643731 34.4
## Vcells 767387  5.9    8388608 64.0  1720850 13.2
# Packages are attached quietly (start-up messages suppressed).
suppressMessages(library(survival)) 
suppressMessages(library(ggplot2))
suppressMessages(library(ggpubr))
suppressMessages(library(survminer))
suppressMessages(library(tidyverse))
# NOTE(review): plyr is attached after tidyverse (which attaches dplyr). plyr
# exports verbs with the same names as dplyr's (e.g. mutate, arrange,
# summarise), so unqualified calls later in the script resolve to the plyr
# versions -- TODO confirm this is intended.
suppressMessages(library(plyr))

1-临床数据处理

1.1 临床数据下载

1.1.1 表型数据处理(phenotype)

step-1:变量处理

# Remove redundant data: read the raw phenotype table and drop unused columns.
# The file is read with read_tsv() and converted to a plain data.frame.
pdata <- read_tsv("D:/R/R-4.0.5/bin/project_writing/data/rawdata/TCGA-HNSC.GDC_phenotype.tsv.gz") %>% as.data.frame()
## Rows: 612 Columns: 139
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (93): submitter_id.samples, additional_pharmaceutical_therapy, additiona...
## dbl (40): age_at_initial_pathologic_diagnosis, day_of_dcc_upload, day_of_for...
## lgl  (6): withdrawn, releasable.project, days_to_sample_procurement.samples,...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Two passes of positional column selection. The indices in the second pass
# refer to positions in the result of the first pass, not to the raw file.
# NOTE(review): both index vectors depend on the exact column order of this
# particular download; a file with a different layout would silently keep
# other columns -- verify against the source file before reuse.
pdata <- pdata[,c(1,6,7,13:16,22,29:32,34,37:41,45:51,53:55,58:64,70,72:76,83,89:94,97:101,103:106,108,110,112,114:119)]
pdata <- pdata[,c(1:7,11,12,17:19,26:29,34,35,39,40,45,46,47,48,55,58,59,62:66)]
# Check the remaining dimensions (rows x columns).
dim(pdata)
## [1] 612  32
## 612 32

# Select the analysis variables and give them short names.
# Each entry reads: new column name = column name in the phenotype table.
# Columns are kept in the order listed here; everything else is dropped.
rename_map <- c(
  sampleID = "submitter_id.samples",
  age = "age_at_initial_pathologic_diagnosis",
  gender = "gender.demographic",
  race = "race.demographic",
  Stage = "clinical_stage",
  AJCC_T = "clinical_T",
  AJCC_N = "clinical_N",
  AJCC_M = "clinical_M",
  Grade = "neoplasm_histologic_grade",
  Alcohol_history = "alcohol_history.exposures",
  Alcohol_amount = "amount_of_alcohol_consumption_per_day",
  Smoke_history = "tobacco_smoking_history",
  hpv_status_ish = "hpv_status_by_ish_testing",
  hpv_status_p16 = "hpv_status_by_p16_testing",
  site = "primary_site",
  radiation_therapy = "radiation_therapy",
  therapy_outcome = "primary_therapy_outcome_success",
  lymphovascular_invasion_present = "lymphovascular_invasion_present"
)
pdata <- pdata[unname(rename_map)]
names(pdata) <- names(rename_map)
rm(rename_map)
# Quick overview of the renamed table.
summary(pdata)
##    sampleID              age           gender              race          
##  Length:612         Min.   :19.00   Length:612         Length:612        
##  Class :character   1st Qu.:53.00   Class :character   Class :character  
##  Mode  :character   Median :61.00   Mode  :character   Mode  :character  
##                     Mean   :61.02                                        
##                     3rd Qu.:68.00                                        
##                     Max.   :90.00                                        
##                     NA's   :1                                            
##     Stage              AJCC_T             AJCC_N             AJCC_M         
##  Length:612         Length:612         Length:612         Length:612        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     Grade           Alcohol_history    Alcohol_amount     Smoke_history  
##  Length:612         Length:612         Length:612         Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:2.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :2.000  
##                                                           Mean   :2.494  
##                                                           3rd Qu.:4.000  
##                                                           Max.   :5.000  
##                                                           NA's   :15     
##  hpv_status_ish     hpv_status_p16         site           radiation_therapy 
##  Length:612         Length:612         Length:612         Length:612        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  therapy_outcome    lymphovascular_invasion_present
##  Length:612         Length:612                     
##  Class :character   Class :character               
##  Mode  :character   Mode  :character               
##                                                    
##                                                    
##                                                    
## 
str(pdata)
## 'data.frame':    612 obs. of  18 variables:
##  $ sampleID                       : chr  "TCGA-DQ-5631-01A" "TCGA-BA-7269-01A" "TCGA-BA-A4IH-01A" "TCGA-CV-6954-01A" ...
##  $ age                            : num  52 61 57 59 59 79 82 77 57 63 ...
##  $ gender                         : chr  "male" "male" "male" "male" ...
##  $ race                           : chr  "white" "white" "white" "black or african american" ...
##  $ Stage                          : chr  "Stage IVA" "Stage III" "Stage IVA" "Stage IVA" ...
##  $ AJCC_T                         : chr  "T3" "T2" "T2" "T4a" ...
##  $ AJCC_N                         : chr  "N2b" "N1" "N2b" "N1" ...
##  $ AJCC_M                         : chr  "M0" "M0" "M0" "M0" ...
##  $ Grade                          : chr  "G3" "G1" "G3" "G2" ...
##  $ Alcohol_history                : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Alcohol_amount                 : chr  "1" "2" "4" "3" ...
##  $ Smoke_history                  : num  4 2 4 3 3 4 3 1 3 1 ...
##  $ hpv_status_ish                 : chr  NA NA "Positive" NA ...
##  $ hpv_status_p16                 : chr  NA NA "Positive" NA ...
##  $ site                           : chr  "Other and unspecified parts of tongue" "Other and unspecified parts of tongue" "Tonsil" "Other and unspecified parts of tongue" ...
##  $ radiation_therapy              : chr  "YES" "YES" "YES" "YES" ...
##  $ therapy_outcome                : chr  "Complete Remission/Response" "Complete Remission/Response" "Complete Remission/Response" "Complete Remission/Response" ...
##  $ lymphovascular_invasion_present: chr  "YES" "NO" "YES" "NO" ...
pdata[1:4,1:4]
##           sampleID age gender                      race
## 1 TCGA-DQ-5631-01A  52   male                     white
## 2 TCGA-BA-7269-01A  61   male                     white
## 3 TCGA-BA-A4IH-01A  57   male                     white
## 4 TCGA-CV-6954-01A  59   male black or african american

step-2 对象重编码

# Recode the clinical variables and convert them to analysis-ready types.
#   agegroup : "<=65" / ">65", derived from age
#   Stage    : Stage I-III -> 1-3, every other non-missing stage -> 4
#   AJCC_T   : T1-T3 -> 1-3, TX -> NA, every other non-missing value -> 4
#   AJCC_N   : N0 -> 0, N1 -> 1, N3 -> 3, NX -> NA, every other value -> 2
#   AJCC_M   : M0 -> 0, M1 -> 1, anything else -> NA
#   Grade    : G1-G4 -> 1-4, anything else -> NA
#   HPV      : combined result of the p16 and ISH tests (see below)
# In the `==` based chains a missing input stays missing, because a
# comparison with NA yields NA and ifelse() passes that NA through.
pdata <- pdata %>% mutate(
  agegroup = factor(ifelse(age > 65, ">65", "<=65"), levels = c("<=65", ">65")),
  Stage = factor(ifelse(Stage == "Stage I", 1,
                 ifelse(Stage == "Stage II", 2,
                 ifelse(Stage == "Stage III", 3, 4)))),
  gender = as.factor(gender),
  race = factor(ifelse(race == "not reported", NA, race), ordered = FALSE),
  Alcohol_amount = as.numeric(Alcohol_amount),
  site = as.factor(site),
  AJCC_T = factor(ifelse(AJCC_T == "T1", 1,
                  ifelse(AJCC_T == "T2", 2,
                  ifelse(AJCC_T == "T3", 3,
                  ifelse(AJCC_T == "TX", NA, 4))))),
  AJCC_N = factor(ifelse(AJCC_N == "N0", 0,
                  ifelse(AJCC_N == "N1", 1,
                  ifelse(AJCC_N == "N3", 3,
                  ifelse(AJCC_N == "NX", NA, 2))))),
  AJCC_M = factor(ifelse(AJCC_M == "M0", 0,
                  ifelse(AJCC_M == "M1", 1, NA))),
  Grade = factor(ifelse(Grade == "G1", 1,
                 ifelse(Grade == "G2", 2,
                 ifelse(Grade == "G3", 3,
                 ifelse(Grade == "G4", 4, NA))))),
  Alcohol_history = factor(ifelse(Alcohol_history == "Not Reported", NA, Alcohol_history)),
  Smoke_history = factor(Smoke_history),
  # Positive if either test is positive, otherwise Negative if either test
  # is negative. %in% is used instead of == so that a missing result from
  # one test cannot turn the condition into NA: with ==, NA | FALSE is NA,
  # which discarded samples such as p16 = NA / ISH = "Negative".
  # All remaining combinations keep their previous outcome: "Not Report"
  # when both tests have a value, NA when at least one test is missing.
  HPV = factor(
    ifelse(hpv_status_p16 %in% "Positive" | hpv_status_ish %in% "Positive", "Positive",
    ifelse(hpv_status_p16 %in% "Negative" | hpv_status_ish %in% "Negative", "Negative",
    ifelse(is.na(hpv_status_p16) | is.na(hpv_status_ish), NA_character_, "Not Report")))
  ),
  lymphovascular_invasion_present = factor(lymphovascular_invasion_present),
  radiation_therapy = as.factor(radiation_therapy),
  # Ordered from best to worst response.
  therapy_outcome = factor(therapy_outcome,
                           levels = c("Complete Remission/Response","Partial Remission/Response","Stable Disease","Persistent Disease","Progressive Disease"),
                           ordered = TRUE)
)

# Put agegroup next to age and drop the two raw HPV test columns now that
# the combined HPV column exists. Columns are named explicitly so the result
# does not depend on column positions.
pdata <- pdata %>% select(
  sampleID, age, agegroup, gender, race, Stage, AJCC_T, AJCC_N, AJCC_M,
  Grade, Alcohol_history, Alcohol_amount, Smoke_history, site,
  radiation_therapy, therapy_outcome, lymphovascular_invasion_present, HPV
)

# Anatomical site: collapse the detailed primary-site levels into four groups.
# Groups are chosen by position within levels(pdata$site); anything not listed
# falls into "other".
# NOTE(review): the indices assume the level set (and its sort order) of this
# particular dataset -- verify them if the input data changes.
site_levels <- levels(pdata$site)
site_group <- rep("other", length(pdata$site))
# Assigned from lowest to highest priority so that "oral cavity" wins,
# then "pharynx", then "larynx".
site_group[pdata$site %in% site_levels[c(5, 6)]] <- "larynx"
site_group[pdata$site %in% site_levels[c(8, 13)]] <- "pharynx"
site_group[pdata$site %in% site_levels[c(1, 3, 4, 7, 9, 10, 11, 12)]] <- "oral cavity"
pdata$SiteClassify <- factor(site_group)
rm(site_levels, site_group)

# Therapy outcome: collapse the five ordered response levels into two groups.
# Levels 1-2 (complete / partial remission) -> "Remission",
# levels 3-5 (stable / persistent / progressive disease) -> "Resistent",
# missing outcomes stay NA. The label spelling is kept as in the original data
# product.
outcome_levels <- levels(pdata$therapy_outcome)
outcome_group <- rep(NA_character_, length(pdata$therapy_outcome))
outcome_group[pdata$therapy_outcome %in% outcome_levels[3:5]] <- "Resistent"
outcome_group[pdata$therapy_outcome %in% outcome_levels[1:2]] <- "Remission"
pdata$OUTCOME <- factor(outcome_group)
rm(outcome_levels, outcome_group)

# Processed clinical information: inspect the structure of the recoded table.
str(pdata)
## 'data.frame':    612 obs. of  20 variables:
##  $ sampleID                       : chr  "TCGA-DQ-5631-01A" "TCGA-BA-7269-01A" "TCGA-BA-A4IH-01A" "TCGA-CV-6954-01A" ...
##  $ age                            : num  52 61 57 59 59 79 82 77 57 63 ...
##  $ agegroup                       : Factor w/ 2 levels "<=65",">65": 1 1 1 1 1 2 2 2 1 1 ...
##  $ gender                         : Factor w/ 2 levels "female","male": 2 2 2 2 2 1 1 1 2 2 ...
##  $ race                           : Factor w/ 4 levels "american indian or alaska native",..: 4 4 4 3 3 4 4 4 4 4 ...
##  $ Stage                          : Factor w/ 4 levels "1","2","3","4": 4 3 4 4 4 4 4 2 3 3 ...
##  $ AJCC_T                         : Factor w/ 4 levels "1","2","3","4": 3 2 2 4 4 4 4 2 3 3 ...
##  $ AJCC_N                         : Factor w/ 4 levels "0","1","2","3": 3 2 3 2 2 1 1 1 1 1 ...
##  $ AJCC_M                         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Grade                          : Factor w/ 4 levels "1","2","3","4": 3 1 3 2 2 2 3 2 2 3 ...
##  $ Alcohol_history                : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 1 2 ...
##  $ Alcohol_amount                 : num  1 2 4 3 3 4 NA 0 NA 5 ...
##  $ Smoke_history                  : Factor w/ 5 levels "1","2","3","4",..: 4 2 4 3 3 4 3 1 3 1 ...
##  $ site                           : Factor w/ 13 levels "Base of tongue",..: 11 11 13 11 11 10 3 10 4 1 ...
##  $ radiation_therapy              : Factor w/ 2 levels "NO","YES": 2 2 2 2 2 2 2 1 1 2 ...
##  $ therapy_outcome                : Ord.factor w/ 5 levels "Complete Remission/Response"<..: 1 1 1 1 1 1 1 1 NA 5 ...
##  $ lymphovascular_invasion_present: Factor w/ 2 levels "NO","YES": 2 1 2 1 1 1 NA 1 NA 2 ...
##  $ HPV                            : Factor w/ 2 levels "Negative","Positive": NA NA 2 NA NA NA NA NA NA NA ...
##  $ SiteClassify                   : Factor w/ 4 levels "larynx","oral cavity",..: 2 2 4 2 2 2 2 2 2 2 ...
##  $ OUTCOME                        : Factor w/ 2 levels "Remission","Resistent": 1 1 1 1 1 1 1 1 NA 2 ...
summary(pdata)
##    sampleID              age        agegroup      gender   
##  Length:612         Min.   :19.00   <=65:403   female:168  
##  Class :character   1st Qu.:53.00   >65 :208   male  :444  
##  Mode  :character   Median :61.00   NA's:  1               
##                     Mean   :61.02                          
##                     3rd Qu.:68.00                          
##                     Max.   :90.00                          
##                     NA's   :1                              
##                                race      Stage      AJCC_T     AJCC_N   
##  american indian or alaska native:  2   1   : 23   1   : 41   0   :293  
##  asian                           : 12   2   :120   2   :176   1   :101  
##  black or african american       : 58   3   :137   3   :171   2   :184  
##  white                           :522   4   :318   4   :208   3   : 10  
##  NA's                            : 18   NA's: 14   NA's: 16   NA's: 24  
##                                                                         
##                                                                         
##   AJCC_M     Grade     Alcohol_history Alcohol_amount   Smoke_history
##  0   :580   1   : 75   No  :197        Min.   : 0.000   1   :138     
##  1   :  6   2   :356   Yes :401        1st Qu.: 0.000   2   :199     
##  NA's: 26   3   :148   NA's: 14        Median : 2.000   3   : 89     
##             4   :  7                   Mean   : 3.218   4   :169     
##             NA's: 26                   3rd Qu.: 4.000   5   :  2     
##                                        Max.   :33.000   NA's: 15     
##                                        NA's   :356                   
##                                                           site    
##  Other and unspecified parts of tongue                      :158  
##  Larynx                                                     :144  
##  Other and ill-defined sites in lip, oral cavity and pharynx: 90  
##  Floor of mouth                                             : 62  
##  Tonsil                                                     : 47  
##  Other and unspecified parts of mouth                       : 44  
##  (Other)                                                    : 67  
##  radiation_therapy                    therapy_outcome
##  NO  :166          Complete Remission/Response:369   
##  YES :302          Partial Remission/Response :  7   
##  NA's:144          Stable Disease             :  5   
##                    Persistent Disease         :  5   
##                    Progressive Disease        : 33   
##                    NA's                       :193   
##                                                      
##  lymphovascular_invasion_present       HPV           SiteClassify
##  NO  :269                        Negative: 57   larynx     :153  
##  YES :148                        Positive: 45   oral cavity:401  
##  NA's:195                        NA's    :510   other      :  1  
##                                                 pharynx    : 57  
##                                                                  
##                                                                  
##                                                                  
##       OUTCOME   
##  Remission:376  
##  Resistent: 43  
##  NA's     :193  
##                 
##                 
##                 
## 
head(pdata)
##           sampleID age agegroup gender                      race Stage AJCC_T
## 1 TCGA-DQ-5631-01A  52     <=65   male                     white     4      3
## 2 TCGA-BA-7269-01A  61     <=65   male                     white     3      2
## 3 TCGA-BA-A4IH-01A  57     <=65   male                     white     4      2
## 4 TCGA-CV-6954-01A  59     <=65   male black or african american     4      4
## 5 TCGA-CV-6954-11A  59     <=65   male black or african american     4      4
## 6 TCGA-CN-4740-01A  79      >65 female                     white     4      4
##   AJCC_N AJCC_M Grade Alcohol_history Alcohol_amount Smoke_history
## 1      2      0     3             Yes              1             4
## 2      1      0     1             Yes              2             2
## 3      2      0     3             Yes              4             4
## 4      1      0     2             Yes              3             3
## 5      1      0     2             Yes              3             3
## 6      0      0     2             Yes              4             4
##                                    site radiation_therapy
## 1 Other and unspecified parts of tongue               YES
## 2 Other and unspecified parts of tongue               YES
## 3                                Tonsil               YES
## 4 Other and unspecified parts of tongue               YES
## 5 Other and unspecified parts of tongue               YES
## 6  Other and unspecified parts of mouth               YES
##               therapy_outcome lymphovascular_invasion_present      HPV
## 1 Complete Remission/Response                             YES     <NA>
## 2 Complete Remission/Response                              NO     <NA>
## 3 Complete Remission/Response                             YES Positive
## 4 Complete Remission/Response                              NO     <NA>
## 5 Complete Remission/Response                              NO     <NA>
## 6 Complete Remission/Response                              NO     <NA>
##   SiteClassify   OUTCOME
## 1  oral cavity Remission
## 2  oral cavity Remission
## 3      pharynx Remission
## 4  oral cavity Remission
## 5  oral cavity Remission
## 6  oral cavity Remission

表格说明

1. 吸烟史的定义

  • Lifelong Non-smoker (less than 100 cigarettes smoked in Lifetime) = 1:从未吸烟

  • Current smoker (includes daily smokers and non-daily smokers or occasional smokers) = 2:当前吸烟(包括规律吸烟者和偶尔吸烟者)

  • Current reformed smoker for > 15 years (greater than 15 years) = 3:戒烟>15年

  • Current reformed smoker for ≤15 years (less than or equal to 15 years) = 4:戒烟≤15年

  • Current reformed smoker, duration not specified = 5:戒烟,戒烟时间不详

  • Smoker at Diagnosis = 6:确诊时吸烟

  • Smoking History not documented = 7:不详

作者:Angeladaddy

链接:https://www.jianshu.com/p/116eac65f4d8

来源:简书 著作权归作者所有。

商业转载请联系作者获得授权,非商业转载请注明出处。

2.NA值的处理

暂时不能删除,否则缺失的数据太多了,根据具体的应用再处理吧

step-4:合并临床表型与生存数据

# Add survival times: read the curated survival table as a plain data.frame.
pd <- read_tsv("D:/R/R-4.0.5/bin/project_writing/data/rawdata/curated survival data.txt") %>% as.data.frame()
## Rows: 604 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (2): sample, _PATIENT
## dbl (8): OS, OS.time, DSS, DSS.time, DFI, DFI.time, PFI, PFI.time
## lgl (1): Redaction
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Prepare the survival table: use the sample ID as row name, drop the 11th
# column (presumably `Redaction` -- TODO confirm against the file) and rename
# the first column to `sampleID`.
rownames(pd) <- pd$sample
pd <- pd[,-11]
colnames(pd)[1] <- "sampleID"

# Join key: the first 15 characters of the phenotype sample ID.
ID <- pdata$sampleID
pdata$ID <- substr(ID,1,15)
# Keep only samples present in both tables, then reorder pd to follow pdata.
pd <- pd[pd$sampleID %in% pdata$ID,]
pdata <- pdata[pdata$ID %in% pd$sampleID,]
pd <- pd[match(pdata$ID,pd$sampleID),]
# Sanity check: every row of pd lines up with the same row of pdata.
table(pd$sampleID == pdata$ID)
## 
## TRUE 
##  604
# NOTE(review): if pdata$ID contains repeated keys, the match() step above
# repeats the corresponding pd rows and merge() then returns every pairing of
# them; the per-patient de-duplication below removes such extra rows.
pdata <- merge.data.frame(pdata,pd,by.x="ID",by.y = "sampleID")

# L = characters 14-16 of the sample ID. Sort by patient and L, then keep the
# first row of each patient so that one sample per patient remains.
pdata$L <- substr(pdata$'sampleID',14,16)
pdata <- pdata %>% arrange(pdata$'_PATIENT',pdata$L)
pdata <- pdata[!duplicated(pdata$'_PATIENT'),]
# Positional reorder of the merged table; the helper columns ID, _PATIENT and
# L are not selected and therefore dropped.
# NOTE(review): the indices depend on the column order produced by the
# preceding steps -- re-check them if any earlier step changes.
pdata <- pdata %>% select(2:15,20,16:17,21,18:19,23:30)

str(pdata)
## 'data.frame':    528 obs. of  28 variables:
##  $ sampleID                       : chr  "TCGA-4P-AA8J-01A" "TCGA-BA-4074-01A" "TCGA-BA-4075-01A" "TCGA-BA-4076-01A" ...
##  $ age                            : num  66 69 49 39 45 83 47 72 56 51 ...
##  $ agegroup                       : Factor w/ 2 levels "<=65",">65": 2 2 1 1 1 2 1 2 1 1 ...
##  $ gender                         : Factor w/ 2 levels "female","male": 2 2 2 2 1 2 2 2 2 2 ...
##  $ race                           : Factor w/ 4 levels "american indian or alaska native",..: 3 4 3 4 4 4 4 4 4 4 ...
##  $ Stage                          : Factor w/ 4 levels "1","2","3","4": 4 4 4 4 4 4 4 4 4 3 ...
##  $ AJCC_T                         : Factor w/ 4 levels "1","2","3","4": 4 3 4 3 4 2 3 4 4 2 ...
##  $ AJCC_N                         : Factor w/ 4 levels "0","1","2","3": 3 3 2 3 4 3 3 1 1 2 ...
##  $ AJCC_M                         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Grade                          : Factor w/ 4 levels "1","2","3","4": 2 3 2 2 2 2 2 1 2 2 ...
##  $ Alcohol_history                : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 1 2 2 2 2 ...
##  $ Alcohol_amount                 : num  NA NA 5 NA 0 NA 7 0 0 0 ...
##  $ Smoke_history                  : Factor w/ 5 levels "1","2","3","4",..: NA 2 2 2 4 4 2 3 1 1 ...
##  $ site                           : Factor w/ 13 levels "Base of tongue",..: 11 11 11 6 11 6 3 10 9 13 ...
##  $ SiteClassify                   : Factor w/ 4 levels "larynx","oral cavity",..: 2 2 2 1 2 1 2 2 2 4 ...
##  $ radiation_therapy              : Factor w/ 2 levels "NO","YES": 1 2 1 2 1 2 2 2 2 NA ...
##  $ therapy_outcome                : Ord.factor w/ 5 levels "Complete Remission/Response"<..: 1 1 5 1 4 NA 1 1 1 NA ...
##  $ OUTCOME                        : Factor w/ 2 levels "Remission","Resistent": 1 1 2 1 2 NA 1 1 1 NA ...
##  $ lymphovascular_invasion_present: Factor w/ 2 levels "NO","YES": 2 NA NA NA NA 1 1 1 1 NA ...
##  $ HPV                            : Factor w/ 2 levels "Negative","Positive": NA NA NA NA NA NA NA NA NA NA ...
##  $ OS                             : num  0 1 1 1 1 1 1 0 0 1 ...
##  $ OS.time                        : num  102 462 283 415 1134 ...
##  $ DSS                            : num  0 1 1 1 1 1 1 0 0 1 ...
##  $ DSS.time                       : num  102 462 283 415 1134 ...
##  $ DFI                            : num  NA NA NA NA NA NA NA NA NA 1 ...
##  $ DFI.time                       : num  NA NA NA NA NA ...
##  $ PFI                            : num  0 1 1 1 1 1 1 1 0 1 ...
##  $ PFI.time                       : num  102 396 236 286 1134 ...
summary(pdata)
##    sampleID              age        agegroup      gender   
##  Length:528         Min.   :19.00   <=65:345   female:142  
##  Class :character   1st Qu.:53.00   >65 :182   male  :386  
##  Mode  :character   Median :61.00   NA's:  1               
##                     Mean   :60.91                          
##                     3rd Qu.:69.00                          
##                     Max.   :90.00                          
##                     NA's   :1                              
##                                race      Stage      AJCC_T     AJCC_N   
##  american indian or alaska native:  2   1   : 21   1   : 37   0   :246  
##  asian                           : 11   2   : 99   2   :152   1   : 85  
##  black or african american       : 48   3   :107   3   :139   2   :166  
##  white                           :452   4   :287   4   :184   3   :  9  
##  NA's                            : 15   NA's: 14   NA's: 16   NA's: 22  
##                                                                         
##                                                                         
##   AJCC_M     Grade     Alcohol_history Alcohol_amount   Smoke_history
##  0   :496   1   : 63   No  :165        Min.   : 0.000   1   :122     
##  1   :  6   2   :311   Yes :352        1st Qu.: 0.000   2   :178     
##  NA's: 26   3   :125   NA's: 11        Median : 2.000   3   : 73     
##             4   :  7                   Mean   : 3.239   4   :140     
##             NA's: 22                   3rd Qu.: 5.000   5   :  2     
##                                        Max.   :33.000   NA's: 13     
##                                        NA's   :307                   
##                                                           site    
##  Other and unspecified parts of tongue                      :132  
##  Larynx                                                     :117  
##  Other and ill-defined sites in lip, oral cavity and pharynx: 71  
##  Floor of mouth                                             : 56  
##  Tonsil                                                     : 46  
##  Other and unspecified parts of mouth                       : 43  
##  (Other)                                                    : 63  
##       SiteClassify radiation_therapy                    therapy_outcome
##  larynx     :126   NO  :144          Complete Remission/Response:338   
##  oral cavity:345   YES :272          Partial Remission/Response :  6   
##  other      :  1   NA's:112          Stable Disease             :  5   
##  pharynx    : 56                     Persistent Disease         :  4   
##                                      Progressive Disease        : 30   
##                                      NA's                       :145   
##                                                                        
##       OUTCOME    lymphovascular_invasion_present       HPV     
##  Remission:344   NO  :232                        Negative: 57  
##  Resistent: 39   YES :124                        Positive: 44  
##  NA's     :145   NA's:172                        NA's    :427  
##                                                                
##                                                                
##                                                                
##                                                                
##        OS            OS.time            DSS           DSS.time     
##  Min.   :0.0000   Min.   :   2.0   Min.   :0.000   Min.   :   2.0  
##  1st Qu.:0.0000   1st Qu.: 378.5   1st Qu.:0.000   1st Qu.: 378.5  
##  Median :0.0000   Median : 644.0   Median :0.000   Median : 644.0  
##  Mean   :0.4223   Mean   : 913.5   Mean   :0.259   Mean   : 913.5  
##  3rd Qu.:1.0000   3rd Qu.:1175.5   3rd Qu.:1.000   3rd Qu.:1175.5  
##  Max.   :1.0000   Max.   :6417.0   Max.   :1.000   Max.   :6417.0  
##                   NA's   :1        NA's   :26      NA's   :1       
##       DFI           DFI.time           PFI           PFI.time     
##  Min.   :0.000   Min.   :  56.0   Min.   :0.000   Min.   :   2.0  
##  1st Qu.:0.000   1st Qu.: 389.0   1st Qu.:0.000   1st Qu.: 250.5  
##  Median :0.000   Median : 717.0   Median :0.000   Median : 552.0  
##  Mean   :0.209   Mean   : 937.9   Mean   :0.375   Mean   : 821.1  
##  3rd Qu.:0.000   3rd Qu.:1152.0   3rd Qu.:1.000   3rd Qu.:1106.0  
##  Max.   :1.000   Max.   :5480.0   Max.   :1.000   Max.   :6417.0  
##  NA's   :394     NA's   :395                      NA's   :1
head(pdata)
##           sampleID age agegroup gender                      race Stage AJCC_T
## 1 TCGA-4P-AA8J-01A  66      >65   male black or african american     4      4
## 2 TCGA-BA-4074-01A  69      >65   male                     white     4      3
## 3 TCGA-BA-4075-01A  49     <=65   male black or african american     4      4
## 4 TCGA-BA-4076-01A  39     <=65   male                     white     4      3
## 5 TCGA-BA-4077-01B  45     <=65 female                     white     4      4
## 6 TCGA-BA-4078-01A  83      >65   male                     white     4      2
##   AJCC_N AJCC_M Grade Alcohol_history Alcohol_amount Smoke_history
## 1      2      0     2              No             NA          <NA>
## 2      2      0     3             Yes             NA             2
## 3      1      0     2             Yes              5             2
## 4      2      0     2             Yes             NA             2
## 5      3      0     2             Yes              0             4
## 6      2      0     2              No             NA             4
##                                    site SiteClassify radiation_therapy
## 1 Other and unspecified parts of tongue  oral cavity                NO
## 2 Other and unspecified parts of tongue  oral cavity               YES
## 3 Other and unspecified parts of tongue  oral cavity                NO
## 4                                Larynx       larynx               YES
## 5 Other and unspecified parts of tongue  oral cavity                NO
## 6                                Larynx       larynx               YES
##               therapy_outcome   OUTCOME lymphovascular_invasion_present  HPV OS
## 1 Complete Remission/Response Remission                             YES <NA>  0
## 2 Complete Remission/Response Remission                            <NA> <NA>  1
## 3         Progressive Disease Resistent                            <NA> <NA>  1
## 4 Complete Remission/Response Remission                            <NA> <NA>  1
## 5          Persistent Disease Resistent                            <NA> <NA>  1
## 6                        <NA>      <NA>                              NO <NA>  1
##   OS.time DSS DSS.time DFI DFI.time PFI PFI.time
## 1     102   0      102  NA       NA   0      102
## 2     462   1      462  NA       NA   1      396
## 3     283   1      283  NA       NA   1      236
## 4     415   1      415  NA       NA   1      286
## 5    1134   1     1134  NA       NA   1     1134
## 6     276   1      276  NA       NA   1      276
# Save the final clinical table (phenotype + survival); it can be reused later.
# Note: write.csv() writes row names by default, so the file starts with an
# unnamed row-name column.
write.csv(pdata,file="D:/R/R-4.0.5/bin/project_writing/data/TCGA_phenotype_processed_528.csv")

得到临床表型信息+生存信息的表格

if (FALSE) {
  # Disabled section: add the expression of one gene and a high/low expression
  # group to the clinical table.

  # GENEMERGE: merge one gene's expression into the clinical table.
  #
  # Arguments:
  #   x         : a single row identifier (gene) of the expression table.
  #   expr_data : expression table with genes in rows and samples in columns;
  #               column names must be sample IDs. Defaults to the global
  #               `exprSet`, as before.
  #   clin_data : clinical table with a `sampleID` column. Defaults to the
  #               global `pdata`, as before.
  # Returns:
  #   `clin_data` restricted to samples that have expression values, with two
  #   extra columns: `expr` (expression of x) and `group` (factor: 1 = above
  #   the rounded median, 2 = otherwise). Row names are set to sampleID.
  # Side effect: prints the rounded median used as the cut-off.
  GENEMERGE <- function(x, expr_data = exprSet, clin_data = pdata) {
    stopifnot(length(x) == 1L)

    # drop = FALSE keeps a one-row table, so t() always yields one column
    # with the sample IDs as row names.
    gene_df <- as.data.frame(t(expr_data[x, , drop = FALSE]))
    colnames(gene_df) <- "expr"
    gene_df$paID <- rownames(gene_df)

    # Grouping: high expression = 1, low expression = 2.
    # na.rm = TRUE so that one missing value does not make the cut-off NA.
    Median <- round(median(gene_df$expr, na.rm = TRUE), 2)
    print(paste("median of", x, "=", Median))
    gene_df$group <- factor(ifelse(gene_df$expr > Median, 1, 2), ordered = FALSE)

    # Merge the two tables on the sample ID. The former re-indexing of
    # gene_df by rownames(pdata) was removed: with default row names
    # ("1", "2", ...) it matched no sample and left the merge empty.
    merged <- merge.data.frame(clin_data, gene_df, by.x = "sampleID", by.y = "paID")
    rownames(merged) <- merged$sampleID
    merged
  }
  pdata <- GENEMERGE("HOXC6")
  head(pdata)
}