# Session setup: set the working directory, clear the workspace, then run the
# garbage collector.
# NOTE(review): every file path in the visible part of this script is absolute,
# so setwd() looks unnecessary here — confirm before removing.
# NOTE(review): rm(list = ls()) deletes every object in the calling environment,
# which is destructive if this script is sourced into a live session.
setwd("D:/R/R-4.0.5/bin/project_writing/dataProcess")
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 421397 22.6 882125 47.2 643731 34.4
## Vcells 767387 5.9 8388608 64.0 1720850 13.2
# Attach the packages used by this script, silencing their startup messages.
# NOTE(review): plyr is attached after tidyverse (which attaches dplyr), so
# unqualified calls to mutate() and arrange() below resolve to the plyr
# versions, not the dplyr ones. No plyr-only function is called in the visible
# code — confirm whether plyr is needed at all, or attach it before tidyverse.
suppressMessages(library(survival))
suppressMessages(library(ggplot2))
suppressMessages(library(ggpubr))
suppressMessages(library(survminer))
suppressMessages(library(tidyverse))
suppressMessages(library(plyr))
内容:(1)phenotype;(2)survival
phenotype各变量的意义:https://blog.csdn.net/tuanzide5233/article/details/104183840
phenotype:对象名称为 pdata,共 612 条观测
# Load the raw TCGA-HNSC GDC phenotype table. read_tsv() returns a tibble, which
# is converted to a plain data.frame; redundant variables are dropped in the
# steps that follow.
pdata <- as.data.frame(
  read_tsv("D:/R/R-4.0.5/bin/project_writing/data/rawdata/TCGA-HNSC.GDC_phenotype.tsv.gz")
)
## Rows: 612 Columns: 139
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (93): submitter_id.samples, additional_pharmaceutical_therapy, additiona...
## dbl (40): age_at_initial_pathologic_diagnosis, day_of_dcc_upload, day_of_for...
## lgl (6): withdrawn, releasable.project, days_to_sample_procurement.samples,...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Prune the phenotype table by column position. The original two passes are
# composed into a single index: `second_pass` holds positions within the
# columns kept by `first_pass`, so first_pass[second_pass] maps them back to
# positions in the raw table.
# NOTE(review): positions are tied to the column layout of this particular
# download — confirm them if the input file changes.
first_pass <- c(1, 6, 7, 13:16, 22, 29:32, 34, 37:41, 45:51, 53:55, 58:64, 70,
                72:76, 83, 89:94, 97:101, 103:106, 108, 110, 112, 114:119)
second_pass <- c(1:7, 11, 12, 17:19, 26:29, 34, 35, 39, 40, 45, 46, 47, 48, 55,
                 58, 59, 62:66)
pdata <- pdata[, first_pass[second_pass]]
rm(first_pass, second_pass)
# Report the remaining number of rows and columns.
dim(pdata)
## [1] 612 32
## 612 32
# Keep the 18 clinical variables used downstream and give them short names
# (new_name = original_column). Column order follows the order listed here.
pdata <- select(
  pdata,
  sampleID = submitter_id.samples,
  age = age_at_initial_pathologic_diagnosis,
  gender = gender.demographic,
  race = race.demographic,
  Stage = clinical_stage,
  AJCC_T = clinical_T,
  AJCC_N = clinical_N,
  AJCC_M = clinical_M,
  Grade = neoplasm_histologic_grade,
  Alcohol_history = alcohol_history.exposures,
  Alcohol_amount = amount_of_alcohol_consumption_per_day,
  Smoke_history = tobacco_smoking_history,
  hpv_status_ish = hpv_status_by_ish_testing,
  hpv_status_p16 = hpv_status_by_p16_testing,
  site = primary_site,
  radiation_therapy,
  therapy_outcome = primary_therapy_outcome_success,
  lymphovascular_invasion_present
)
# Per-column overview of the renamed table.
summary(pdata)
## sampleID age gender race
## Length:612 Min. :19.00 Length:612 Length:612
## Class :character 1st Qu.:53.00 Class :character Class :character
## Mode :character Median :61.00 Mode :character Mode :character
## Mean :61.02
## 3rd Qu.:68.00
## Max. :90.00
## NA's :1
## Stage AJCC_T AJCC_N AJCC_M
## Length:612 Length:612 Length:612 Length:612
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Grade Alcohol_history Alcohol_amount Smoke_history
## Length:612 Length:612 Length:612 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Mode :character Median :2.000
## Mean :2.494
## 3rd Qu.:4.000
## Max. :5.000
## NA's :15
## hpv_status_ish hpv_status_p16 site radiation_therapy
## Length:612 Length:612 Length:612 Length:612
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## therapy_outcome lymphovascular_invasion_present
## Length:612 Length:612
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
# Show the type and first values of each selected column before recoding.
str(pdata)
## 'data.frame': 612 obs. of 18 variables:
## $ sampleID : chr "TCGA-DQ-5631-01A" "TCGA-BA-7269-01A" "TCGA-BA-A4IH-01A" "TCGA-CV-6954-01A" ...
## $ age : num 52 61 57 59 59 79 82 77 57 63 ...
## $ gender : chr "male" "male" "male" "male" ...
## $ race : chr "white" "white" "white" "black or african american" ...
## $ Stage : chr "Stage IVA" "Stage III" "Stage IVA" "Stage IVA" ...
## $ AJCC_T : chr "T3" "T2" "T2" "T4a" ...
## $ AJCC_N : chr "N2b" "N1" "N2b" "N1" ...
## $ AJCC_M : chr "M0" "M0" "M0" "M0" ...
## $ Grade : chr "G3" "G1" "G3" "G2" ...
## $ Alcohol_history : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Alcohol_amount : chr "1" "2" "4" "3" ...
## $ Smoke_history : num 4 2 4 3 3 4 3 1 3 1 ...
## $ hpv_status_ish : chr NA NA "Positive" NA ...
## $ hpv_status_p16 : chr NA NA "Positive" NA ...
## $ site : chr "Other and unspecified parts of tongue" "Other and unspecified parts of tongue" "Tonsil" "Other and unspecified parts of tongue" ...
## $ radiation_therapy : chr "YES" "YES" "YES" "YES" ...
## $ therapy_outcome : chr "Complete Remission/Response" "Complete Remission/Response" "Complete Remission/Response" "Complete Remission/Response" ...
## $ lymphovascular_invasion_present: chr "YES" "NO" "YES" "NO" ...
# Preview the first four rows and first four columns.
pdata[1:4,1:4]
## sampleID age gender race
## 1 TCGA-DQ-5631-01A 52 male white
## 2 TCGA-BA-7269-01A 61 male white
## 3 TCGA-BA-A4IH-01A 57 male white
## 4 TCGA-CV-6954-01A 59 male black or african american
# Recode the clinical variables and set their data types.
#   agegroup        : "<=65" / ">65", derived from age
#   Stage           : "Stage I/II/III" -> 1/2/3; any other non-missing value -> 4
#   AJCC_T          : T1/T2/T3 -> 1/2/3; "TX" -> NA; any other non-missing value -> 4
#   AJCC_N          : N0/N1/N3 -> 0/1/3; "NX" -> NA; any other non-missing value -> 2
#   AJCC_M          : M0/M1 -> 0/1; anything else -> NA
#   Grade           : G1..G4 -> 1..4; anything else -> NA
#   race, Alcohol_history : "not reported" / "Not Reported" -> NA
#   therapy_outcome : ordered factor, Complete Remission < ... < Progressive Disease
# Missing inputs stay NA in every recode above because ifelse() propagates NA.
pdata <- pdata %>% mutate(
  agegroup = factor(ifelse(age > 65, ">65", "<=65"), levels = c("<=65", ">65")),
  Stage = factor(ifelse(Stage == "Stage I", 1,
                 ifelse(Stage == "Stage II", 2,
                 ifelse(Stage == "Stage III", 3, 4)))),
  gender = as.factor(gender),
  race = factor(ifelse(race == "not reported", NA, race), ordered = FALSE),
  Alcohol_amount = as.numeric(Alcohol_amount),
  site = as.factor(site),
  AJCC_T = factor(ifelse(AJCC_T == "T1", 1,
                  ifelse(AJCC_T == "T2", 2,
                  ifelse(AJCC_T == "T3", 3,
                  ifelse(AJCC_T == "TX", NA, 4))))),
  AJCC_N = factor(ifelse(AJCC_N == "N0", 0,
                  ifelse(AJCC_N == "N1", 1,
                  ifelse(AJCC_N == "N3", 3,
                  ifelse(AJCC_N == "NX", NA, 2))))),
  AJCC_M = factor(ifelse(AJCC_M == "M0", 0,
                  ifelse(AJCC_M == "M1", 1, NA))),
  Grade = factor(ifelse(Grade == "G1", 1,
                 ifelse(Grade == "G2", 2,
                 ifelse(Grade == "G3", 3,
                 ifelse(Grade == "G4", 4, NA))))),
  Alcohol_history = factor(ifelse(Alcohol_history == "Not Reported", NA, Alcohol_history)),
  Smoke_history = factor(Smoke_history),
  # Combined HPV status from the p16 and ISH tests: "Positive" if either test is
  # positive, otherwise "Negative" if either is negative. %in% is used instead
  # of == because `NA == "Negative" | FALSE` is NA, which previously turned a
  # sample that was negative on one test and untested on the other into NA.
  # Samples whose remaining status cannot be decided because a test is missing
  # stay NA; "Not Report" is kept for samples where both tests carry some other
  # non-missing value. Recorded counts of "Negative" may therefore increase.
  HPV = factor(ifelse(hpv_status_p16 %in% "Positive" | hpv_status_ish %in% "Positive", "Positive",
               ifelse(hpv_status_p16 %in% "Negative" | hpv_status_ish %in% "Negative", "Negative",
               ifelse(is.na(hpv_status_p16) | is.na(hpv_status_ish), NA_character_, "Not Report")))),
  lymphovascular_invasion_present = factor(lymphovascular_invasion_present),
  radiation_therapy = as.factor(radiation_therapy),
  therapy_outcome = factor(therapy_outcome,
                           levels = c("Complete Remission/Response", "Partial Remission/Response",
                                      "Stable Disease", "Persistent Disease", "Progressive Disease"),
                           ordered = TRUE)
)
# Reorder columns (agegroup next to age) and drop the two raw HPV test columns.
# Columns are selected by name so the step does not depend on column positions.
pdata <- pdata %>% select(
  sampleID, age, agegroup, gender, race, Stage, AJCC_T, AJCC_N, AJCC_M, Grade,
  Alcohol_history, Alcohol_amount, Smoke_history,
  site, radiation_therapy, therapy_outcome, lymphovascular_invasion_present,
  HPV
)
# Anatomical site: collapse the primary-site factor into four groups.
# Groups are defined by position within levels(pdata$site); rows that match no
# group (including missing sites) are labelled "other".
# NOTE(review): the positions assume the 13 alphabetically ordered site levels
# of this dataset — confirm if the input changes.
site_levels <- levels(pdata$site)
site_group <- rep("other", nrow(pdata))
site_group[pdata$site %in% site_levels[c(5, 6)]] <- "larynx"
site_group[pdata$site %in% site_levels[c(8, 13)]] <- "pharynx"
site_group[pdata$site %in% site_levels[c(1, 3, 4, 7, 9, 10, 11, 12)]] <- "oral cavity"
pdata$SiteClassify <- factor(site_group)
rm(site_levels, site_group)
# Therapy outcome: collapse the five ordered response levels into two groups.
# Levels 1-2 (complete/partial remission) -> "Remission"; levels 3-5
# (stable/persistent/progressive disease) -> "Resistent"; missing stays NA.
outcome_levels <- levels(pdata$therapy_outcome)
outcome_group <- rep(NA_character_, nrow(pdata))
outcome_group[pdata$therapy_outcome %in% outcome_levels[3:5]] <- "Resistent"
outcome_group[pdata$therapy_outcome %in% outcome_levels[1:2]] <- "Remission"
pdata$OUTCOME <- factor(outcome_group)
rm(outcome_levels, outcome_group)
# Processed clinical information: show the structure of the recoded table.
str(pdata)
## 'data.frame': 612 obs. of 20 variables:
## $ sampleID : chr "TCGA-DQ-5631-01A" "TCGA-BA-7269-01A" "TCGA-BA-A4IH-01A" "TCGA-CV-6954-01A" ...
## $ age : num 52 61 57 59 59 79 82 77 57 63 ...
## $ agegroup : Factor w/ 2 levels "<=65",">65": 1 1 1 1 1 2 2 2 1 1 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 2 2 2 1 1 1 2 2 ...
## $ race : Factor w/ 4 levels "american indian or alaska native",..: 4 4 4 3 3 4 4 4 4 4 ...
## $ Stage : Factor w/ 4 levels "1","2","3","4": 4 3 4 4 4 4 4 2 3 3 ...
## $ AJCC_T : Factor w/ 4 levels "1","2","3","4": 3 2 2 4 4 4 4 2 3 3 ...
## $ AJCC_N : Factor w/ 4 levels "0","1","2","3": 3 2 3 2 2 1 1 1 1 1 ...
## $ AJCC_M : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Grade : Factor w/ 4 levels "1","2","3","4": 3 1 3 2 2 2 3 2 2 3 ...
## $ Alcohol_history : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 1 2 ...
## $ Alcohol_amount : num 1 2 4 3 3 4 NA 0 NA 5 ...
## $ Smoke_history : Factor w/ 5 levels "1","2","3","4",..: 4 2 4 3 3 4 3 1 3 1 ...
## $ site : Factor w/ 13 levels "Base of tongue",..: 11 11 13 11 11 10 3 10 4 1 ...
## $ radiation_therapy : Factor w/ 2 levels "NO","YES": 2 2 2 2 2 2 2 1 1 2 ...
## $ therapy_outcome : Ord.factor w/ 5 levels "Complete Remission/Response"<..: 1 1 1 1 1 1 1 1 NA 5 ...
## $ lymphovascular_invasion_present: Factor w/ 2 levels "NO","YES": 2 1 2 1 1 1 NA 1 NA 2 ...
## $ HPV : Factor w/ 2 levels "Negative","Positive": NA NA 2 NA NA NA NA NA NA NA ...
## $ SiteClassify : Factor w/ 4 levels "larynx","oral cavity",..: 2 2 4 2 2 2 2 2 2 2 ...
## $ OUTCOME : Factor w/ 2 levels "Remission","Resistent": 1 1 1 1 1 1 1 1 NA 2 ...
# Per-column summary of the recoded table (level counts and NA counts).
summary(pdata)
## sampleID age agegroup gender
## Length:612 Min. :19.00 <=65:403 female:168
## Class :character 1st Qu.:53.00 >65 :208 male :444
## Mode :character Median :61.00 NA's: 1
## Mean :61.02
## 3rd Qu.:68.00
## Max. :90.00
## NA's :1
## race Stage AJCC_T AJCC_N
## american indian or alaska native: 2 1 : 23 1 : 41 0 :293
## asian : 12 2 :120 2 :176 1 :101
## black or african american : 58 3 :137 3 :171 2 :184
## white :522 4 :318 4 :208 3 : 10
## NA's : 18 NA's: 14 NA's: 16 NA's: 24
##
##
## AJCC_M Grade Alcohol_history Alcohol_amount Smoke_history
## 0 :580 1 : 75 No :197 Min. : 0.000 1 :138
## 1 : 6 2 :356 Yes :401 1st Qu.: 0.000 2 :199
## NA's: 26 3 :148 NA's: 14 Median : 2.000 3 : 89
## 4 : 7 Mean : 3.218 4 :169
## NA's: 26 3rd Qu.: 4.000 5 : 2
## Max. :33.000 NA's: 15
## NA's :356
## site
## Other and unspecified parts of tongue :158
## Larynx :144
## Other and ill-defined sites in lip, oral cavity and pharynx: 90
## Floor of mouth : 62
## Tonsil : 47
## Other and unspecified parts of mouth : 44
## (Other) : 67
## radiation_therapy therapy_outcome
## NO :166 Complete Remission/Response:369
## YES :302 Partial Remission/Response : 7
## NA's:144 Stable Disease : 5
## Persistent Disease : 5
## Progressive Disease : 33
## NA's :193
##
## lymphovascular_invasion_present HPV SiteClassify
## NO :269 Negative: 57 larynx :153
## YES :148 Positive: 45 oral cavity:401
## NA's:195 NA's :510 other : 1
## pharynx : 57
##
##
##
## OUTCOME
## Remission:376
## Resistent: 43
## NA's :193
##
##
##
##
# Preview the first rows of the recoded table.
head(pdata)
## sampleID age agegroup gender race Stage AJCC_T
## 1 TCGA-DQ-5631-01A 52 <=65 male white 4 3
## 2 TCGA-BA-7269-01A 61 <=65 male white 3 2
## 3 TCGA-BA-A4IH-01A 57 <=65 male white 4 2
## 4 TCGA-CV-6954-01A 59 <=65 male black or african american 4 4
## 5 TCGA-CV-6954-11A 59 <=65 male black or african american 4 4
## 6 TCGA-CN-4740-01A 79 >65 female white 4 4
## AJCC_N AJCC_M Grade Alcohol_history Alcohol_amount Smoke_history
## 1 2 0 3 Yes 1 4
## 2 1 0 1 Yes 2 2
## 3 2 0 3 Yes 4 4
## 4 1 0 2 Yes 3 3
## 5 1 0 2 Yes 3 3
## 6 0 0 2 Yes 4 4
## site radiation_therapy
## 1 Other and unspecified parts of tongue YES
## 2 Other and unspecified parts of tongue YES
## 3 Tonsil YES
## 4 Other and unspecified parts of tongue YES
## 5 Other and unspecified parts of tongue YES
## 6 Other and unspecified parts of mouth YES
## therapy_outcome lymphovascular_invasion_present HPV
## 1 Complete Remission/Response YES <NA>
## 2 Complete Remission/Response NO <NA>
## 3 Complete Remission/Response YES Positive
## 4 Complete Remission/Response NO <NA>
## 5 Complete Remission/Response NO <NA>
## 6 Complete Remission/Response NO <NA>
## SiteClassify OUTCOME
## 1 oral cavity Remission
## 2 oral cavity Remission
## 3 pharynx Remission
## 4 oral cavity Remission
## 5 oral cavity Remission
## 6 oral cavity Remission
1. 吸烟史的定义
Lifelong Non-smoker (less than 100 cigarettes smoked in Lifetime) = 1:从未吸烟
Current smoker (includes daily smokers and non-daily smokers or occasional smokers) = 2:当前吸烟(包括规律吸烟者和偶尔吸烟者)
Current reformed smoker for > 15 years (greater than 15 years) = 3:戒烟>15年
Current reformed smoker for ≤15 years (less than or equal to 15 years) = 4:戒烟≤15年
Current reformed smoker, duration not specified = 5:戒烟,戒烟时间不详
Smoker at Diagnosis = 6:确诊时吸烟
Smoking History not documented = 7:不详
作者:Angeladaddy
链接:https://www.jianshu.com/p/116eac65f4d8
来源:简书 著作权归作者所有。
商业转载请联系作者获得授权,非商业转载请注明出处。
2.NA值的处理
暂时不能删除,否则缺失的数据太多了,根据具体的应用再处理吧
# Add survival times: load the curated survival table and convert the tibble
# returned by read_tsv() to a plain data.frame.
pd <- as.data.frame(
  read_tsv("D:/R/R-4.0.5/bin/project_writing/data/rawdata/curated survival data.txt")
)
## Rows: 604 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (2): sample, _PATIENT
## dbl (8): OS, OS.time, DSS, DSS.time, DFI, DFI.time, PFI, PFI.time
## lgl (1): Redaction
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Prepare the survival table: use the sample identifier as row name, drop the
# 11th column, and rename the first column to sampleID.
rownames(pd) <- pd[["sample"]]
pd <- pd[, -11]
names(pd)[1] <- "sampleID"
# Build a 15-character join key from the phenotype sample ID (the survival
# table is matched on this shortened identifier).
ID <- pdata$sampleID
pdata$ID <- substring(ID, 1, 15)
# Keep only identifiers present in both tables, then put the survival rows in
# the same order as the phenotype rows.
shared_ids <- intersect(pd$sampleID, pdata$ID)
pd <- pd[pd$sampleID %in% shared_ids, ]
pdata <- pdata[pdata$ID %in% shared_ids, ]
rm(shared_ids)
pd <- pd[match(pdata$ID, pd$sampleID), ]
# Sanity check: every row pair should now carry the same identifier.
table(pd$sampleID == pdata$ID)
##
## TRUE
## 604
# Attach the survival columns to the phenotype table via the shortened key.
pdata <- merge(pdata, pd, by.x = "ID", by.y = "sampleID")
# L holds characters 14-16 of the full sample ID. Sorting by patient and then
# by L, and keeping the first row per patient, leaves one sample per patient
# (the one whose L sorts first).
pdata$L <- substring(pdata$sampleID, 14, 16)
pdata <- arrange(pdata, `_PATIENT`, L)
pdata <- pdata[!duplicated(pdata[["_PATIENT"]]), ]
# Final column order; the helper columns ID, _PATIENT and L are dropped.
pdata <- select(
  pdata,
  sampleID, age, agegroup, gender, race, Stage, AJCC_T, AJCC_N, AJCC_M, Grade,
  Alcohol_history, Alcohol_amount, Smoke_history, site,
  SiteClassify,
  radiation_therapy, therapy_outcome,
  OUTCOME,
  lymphovascular_invasion_present, HPV,
  OS, OS.time, DSS, DSS.time, DFI, DFI.time, PFI, PFI.time
)
str(pdata)
## 'data.frame': 528 obs. of 28 variables:
## $ sampleID : chr "TCGA-4P-AA8J-01A" "TCGA-BA-4074-01A" "TCGA-BA-4075-01A" "TCGA-BA-4076-01A" ...
## $ age : num 66 69 49 39 45 83 47 72 56 51 ...
## $ agegroup : Factor w/ 2 levels "<=65",">65": 2 2 1 1 1 2 1 2 1 1 ...
## $ gender : Factor w/ 2 levels "female","male": 2 2 2 2 1 2 2 2 2 2 ...
## $ race : Factor w/ 4 levels "american indian or alaska native",..: 3 4 3 4 4 4 4 4 4 4 ...
## $ Stage : Factor w/ 4 levels "1","2","3","4": 4 4 4 4 4 4 4 4 4 3 ...
## $ AJCC_T : Factor w/ 4 levels "1","2","3","4": 4 3 4 3 4 2 3 4 4 2 ...
## $ AJCC_N : Factor w/ 4 levels "0","1","2","3": 3 3 2 3 4 3 3 1 1 2 ...
## $ AJCC_M : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Grade : Factor w/ 4 levels "1","2","3","4": 2 3 2 2 2 2 2 1 2 2 ...
## $ Alcohol_history : Factor w/ 2 levels "No","Yes": 1 2 2 2 2 1 2 2 2 2 ...
## $ Alcohol_amount : num NA NA 5 NA 0 NA 7 0 0 0 ...
## $ Smoke_history : Factor w/ 5 levels "1","2","3","4",..: NA 2 2 2 4 4 2 3 1 1 ...
## $ site : Factor w/ 13 levels "Base of tongue",..: 11 11 11 6 11 6 3 10 9 13 ...
## $ SiteClassify : Factor w/ 4 levels "larynx","oral cavity",..: 2 2 2 1 2 1 2 2 2 4 ...
## $ radiation_therapy : Factor w/ 2 levels "NO","YES": 1 2 1 2 1 2 2 2 2 NA ...
## $ therapy_outcome : Ord.factor w/ 5 levels "Complete Remission/Response"<..: 1 1 5 1 4 NA 1 1 1 NA ...
## $ OUTCOME : Factor w/ 2 levels "Remission","Resistent": 1 1 2 1 2 NA 1 1 1 NA ...
## $ lymphovascular_invasion_present: Factor w/ 2 levels "NO","YES": 2 NA NA NA NA 1 1 1 1 NA ...
## $ HPV : Factor w/ 2 levels "Negative","Positive": NA NA NA NA NA NA NA NA NA NA ...
## $ OS : num 0 1 1 1 1 1 1 0 0 1 ...
## $ OS.time : num 102 462 283 415 1134 ...
## $ DSS : num 0 1 1 1 1 1 1 0 0 1 ...
## $ DSS.time : num 102 462 283 415 1134 ...
## $ DFI : num NA NA NA NA NA NA NA NA NA 1 ...
## $ DFI.time : num NA NA NA NA NA ...
## $ PFI : num 0 1 1 1 1 1 1 1 0 1 ...
## $ PFI.time : num 102 396 236 286 1134 ...
# Per-column summary of the merged phenotype + survival table.
summary(pdata)
## sampleID age agegroup gender
## Length:528 Min. :19.00 <=65:345 female:142
## Class :character 1st Qu.:53.00 >65 :182 male :386
## Mode :character Median :61.00 NA's: 1
## Mean :60.91
## 3rd Qu.:69.00
## Max. :90.00
## NA's :1
## race Stage AJCC_T AJCC_N
## american indian or alaska native: 2 1 : 21 1 : 37 0 :246
## asian : 11 2 : 99 2 :152 1 : 85
## black or african american : 48 3 :107 3 :139 2 :166
## white :452 4 :287 4 :184 3 : 9
## NA's : 15 NA's: 14 NA's: 16 NA's: 22
##
##
## AJCC_M Grade Alcohol_history Alcohol_amount Smoke_history
## 0 :496 1 : 63 No :165 Min. : 0.000 1 :122
## 1 : 6 2 :311 Yes :352 1st Qu.: 0.000 2 :178
## NA's: 26 3 :125 NA's: 11 Median : 2.000 3 : 73
## 4 : 7 Mean : 3.239 4 :140
## NA's: 22 3rd Qu.: 5.000 5 : 2
## Max. :33.000 NA's: 13
## NA's :307
## site
## Other and unspecified parts of tongue :132
## Larynx :117
## Other and ill-defined sites in lip, oral cavity and pharynx: 71
## Floor of mouth : 56
## Tonsil : 46
## Other and unspecified parts of mouth : 43
## (Other) : 63
## SiteClassify radiation_therapy therapy_outcome
## larynx :126 NO :144 Complete Remission/Response:338
## oral cavity:345 YES :272 Partial Remission/Response : 6
## other : 1 NA's:112 Stable Disease : 5
## pharynx : 56 Persistent Disease : 4
## Progressive Disease : 30
## NA's :145
##
## OUTCOME lymphovascular_invasion_present HPV
## Remission:344 NO :232 Negative: 57
## Resistent: 39 YES :124 Positive: 44
## NA's :145 NA's:172 NA's :427
##
##
##
##
## OS OS.time DSS DSS.time
## Min. :0.0000 Min. : 2.0 Min. :0.000 Min. : 2.0
## 1st Qu.:0.0000 1st Qu.: 378.5 1st Qu.:0.000 1st Qu.: 378.5
## Median :0.0000 Median : 644.0 Median :0.000 Median : 644.0
## Mean :0.4223 Mean : 913.5 Mean :0.259 Mean : 913.5
## 3rd Qu.:1.0000 3rd Qu.:1175.5 3rd Qu.:1.000 3rd Qu.:1175.5
## Max. :1.0000 Max. :6417.0 Max. :1.000 Max. :6417.0
## NA's :1 NA's :26 NA's :1
## DFI DFI.time PFI PFI.time
## Min. :0.000 Min. : 56.0 Min. :0.000 Min. : 2.0
## 1st Qu.:0.000 1st Qu.: 389.0 1st Qu.:0.000 1st Qu.: 250.5
## Median :0.000 Median : 717.0 Median :0.000 Median : 552.0
## Mean :0.209 Mean : 937.9 Mean :0.375 Mean : 821.1
## 3rd Qu.:0.000 3rd Qu.:1152.0 3rd Qu.:1.000 3rd Qu.:1106.0
## Max. :1.000 Max. :5480.0 Max. :1.000 Max. :6417.0
## NA's :394 NA's :395 NA's :1
# Preview the first rows of the merged table.
head(pdata)
## sampleID age agegroup gender race Stage AJCC_T
## 1 TCGA-4P-AA8J-01A 66 >65 male black or african american 4 4
## 2 TCGA-BA-4074-01A 69 >65 male white 4 3
## 3 TCGA-BA-4075-01A 49 <=65 male black or african american 4 4
## 4 TCGA-BA-4076-01A 39 <=65 male white 4 3
## 5 TCGA-BA-4077-01B 45 <=65 female white 4 4
## 6 TCGA-BA-4078-01A 83 >65 male white 4 2
## AJCC_N AJCC_M Grade Alcohol_history Alcohol_amount Smoke_history
## 1 2 0 2 No NA <NA>
## 2 2 0 3 Yes NA 2
## 3 1 0 2 Yes 5 2
## 4 2 0 2 Yes NA 2
## 5 3 0 2 Yes 0 4
## 6 2 0 2 No NA 4
## site SiteClassify radiation_therapy
## 1 Other and unspecified parts of tongue oral cavity NO
## 2 Other and unspecified parts of tongue oral cavity YES
## 3 Other and unspecified parts of tongue oral cavity NO
## 4 Larynx larynx YES
## 5 Other and unspecified parts of tongue oral cavity NO
## 6 Larynx larynx YES
## therapy_outcome OUTCOME lymphovascular_invasion_present HPV OS
## 1 Complete Remission/Response Remission YES <NA> 0
## 2 Complete Remission/Response Remission <NA> <NA> 1
## 3 Progressive Disease Resistent <NA> <NA> 1
## 4 Complete Remission/Response Remission <NA> <NA> 1
## 5 Persistent Disease Resistent <NA> <NA> 1
## 6 <NA> <NA> NO <NA> 1
## OS.time DSS DSS.time DFI DFI.time PFI PFI.time
## 1 102 0 102 NA NA 0 102
## 2 462 1 462 NA NA 1 396
## 3 283 1 283 NA NA 1 236
## 4 415 1 415 NA NA 1 286
## 5 1134 1 1134 NA NA 1 1134
## 6 276 1 276 NA NA 1 276
# Save the final clinical table (phenotype + survival) so later analyses can
# reuse it. Row names are written as the first, unnamed column (write.csv
# default).
write.csv(
  x = pdata,
  file = "D:/R/R-4.0.5/bin/project_writing/data/TCGA_phenotype_processed_528.csv"
)
得到临床表型信息+生存信息的表格
if (FALSE) {
  # Disabled example: add one gene's expression level and a high/low expression
  # group to the clinical table.

  # GENEMERGE: merge a gene's expression and its median-split group into the
  # clinical table.
  #
  # Args:
  #   x:    Gene name (single string); must be a row name of `expr`.
  #   expr: Expression table with genes as rows and samples as columns.
  #         Defaults to the global `exprSet`, as in the original version.
  #   clin: Clinical table with a `sampleID` column. Defaults to the global
  #         `pdata`, as in the original version.
  #
  # Returns:
  #   `clin` inner-joined with the gene's data on sampleID, with extra columns
  #   `expr` (expression value) and `group` (factor: 1 = above the median,
  #   2 = at or below the median), and sampleID as row names.
  GENEMERGE <- function(x, expr = exprSet, clin = pdata) {
    if (length(x) != 1L || !x %in% rownames(expr)) {
      stop("gene not found in expression data: ", paste(x, collapse = ", "),
           call. = FALSE)
    }
    # drop = FALSE keeps a one-row table for both matrices and data frames, so
    # the transpose always has one row per sample.
    gene_df <- as.data.frame(t(expr[x, , drop = FALSE]))
    colnames(gene_df) <- "expr"
    gene_df$paID <- rownames(gene_df)
    # Split at the exact median; rounding is applied to the printed value only,
    # so values lying between the rounded and the exact median are grouped
    # correctly. na.rm = TRUE keeps a single missing value from turning the
    # median, and with it every group label, into NA.
    expr_median <- median(gene_df$expr, na.rm = TRUE)
    print(paste("median of", x, "=", round(expr_median, 2)))
    gene_df$group <- factor(ifelse(gene_df$expr > expr_median, 1, 2), ordered = FALSE)
    # Merge the two tables. The earlier gene_df[rownames(pdata), ] step was
    # removed: it indexed by the clinical table's row names, which are not
    # sample IDs unless a previous call set them, so it produced all-NA rows and
    # an empty merge. The inner join below already restricts to shared samples.
    merged <- merge.data.frame(clin, gene_df, by.x = "sampleID", by.y = "paID")
    rownames(merged) <- merged$sampleID
    merged
  }
  pdata <- GENEMERGE("HOXC6")
  head(pdata)
}