### TCGA-LIHC数据整理
# Unpack the GDC download and create the output folders.
# The project directory is named once so the working directory and the
# extraction target cannot drift apart.
work_dir <- "C:/Users/石源方/Desktop/数据搬家/华理工/班级-华/各科课程作业/高等生物信息学-注意PDF格式/24-12-19-practice6/6"
setwd(work_dir) # every relative path below resolves against this directory
tar_file <- "./gdc_download_20241223_170223.170226.tar.gz"
getwd()
## [1] "C:/Users/石源方/Desktop/数据搬家/华理工/班级-华/各科课程作业/高等生物信息学-注意PDF格式/24-12-19-practice6/6"
# Fail early with a clear message when the archive is not where we expect it.
if (!file.exists(tar_file)) {
  stop("Archive not found: ", tar_file, call. = FALSE)
}
# Directory that receives the extracted files (read later as ./untar/...).
extract_dir <- file.path(work_dir, "untar")
untar(tar_file, exdir = extract_dir) # decompress the tar.gz into extract_dir
# Output folders for the count/TPM/differential-expression matrices.
# showWarnings = FALSE keeps re-runs quiet when the folders already exist.
dir.create("Matrix_txt", showWarnings = FALSE) # txt outputs
dir.create("Matrix_csv", showWarnings = FALSE) # csv outputs
# 数据整理
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the sample information table.
sample_sheet <- fread("./gdc_sample_sheet.2024-12-23.tsv")
# The barcode is the first 15 characters of the Sample ID.
sample_sheet$Barcode <- substr(sample_sheet[["Sample ID"]], 1, 15)
# De-duplicate: keep only the first row seen for each barcode.
is_first_barcode <- !duplicated(sample_sheet$Barcode)
sample_sheet1 <- filter(sample_sheet, is_first_barcode)
# Keep barcodes whose last two characters are 01 (tumor sample),
# 11 (normal sample) or 06 (metastatic sample).
has_wanted_type <- grepl("01$|11$|06$", sample_sheet1$Barcode)
sample_sheet2 <- filter(sample_sheet1, has_wanted_type)
# Merge the per-sample expression files into one table: one row per gene,
# three annotation columns followed by one column per sample barcode.
gene_keys <- c("gene_id", "gene_name", "gene_type")
# "unstranded" holds the count values; set this to "tpm_unstranded" to merge
# TPM values instead.
value_col <- "unstranded"
# Seed the merged table with the annotation columns of an arbitrary file.
# The first four data rows are dropped here and in every per-sample file.
TCGA_LIHC_Exp <- fread("./untar/00fb4b52-e6a4-4ad9-bbed-584e25851aca/ba056a7d-5370-4fe9-af1e-2e3de42e205f.rna_seq.augmented_star_gene_counts.tsv")
TCGA_LIHC_Exp <- TCGA_LIHC_Exp[!1:4, gene_keys, with = FALSE]
# seq_len() iterates zero times on an empty sheet, whereas 1:nrow() would
# visit rows 1 and 0.
for (i in seq_len(nrow(sample_sheet2))) {
  folder_name <- sample_sheet2$`File ID`[i]
  file_name <- sample_sheet2$`File Name`[i]
  sample_name <- sample_sheet2$Barcode[i]
  data1 <- fread(file.path("./untar", folder_name, file_name))
  # with = FALSE makes data.table treat the character vector as column names.
  data2 <- data1[!1:4, c(gene_keys, value_col), with = FALSE]
  colnames(data2)[4] <- sample_name # label the value column with the barcode
  # Explicit keys: no per-iteration "Joining with" message, and the join
  # cannot silently pick up an extra shared column.
  TCGA_LIHC_Exp <- inner_join(TCGA_LIHC_Exp, data2, by = gene_keys)
}
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
## Joining with `by = join_by(gene_id, gene_name, gene_type)`
# Filter genes by the fraction of samples in which they have a zero value.
# Columns 1-3 are annotation; the remaining columns are samples.
zero_percentage <- rowMeans(TCGA_LIHC_Exp[, 4:ncol(TCGA_LIHC_Exp)] == 0)
# Keep genes that are zero in fewer than 60% of the samples.
expr_filtered <- TCGA_LIHC_Exp[zero_percentage < 0.6, ]
# Load the packages used below; avereps() comes from limma. These calls only
# attach the packages, so they must already be installed (e.g. via BiocManager).
library(BiocManager)
library(limma)
library(edgeR)
# Average the rows that share a gene name and use the gene names as row names.
# This runs on the filtered table so the zero-fraction filter above actually
# takes effect (previously the unfiltered table was used and the filter result
# was overwritten).
TCGA_LIHC_Exp1 <- avereps(expr_filtered[, -c(1:3)], ID = expr_filtered$gene_name)
# Drop low-expression genes: keep rows whose mean across samples exceeds 100.
# drop = FALSE keeps a matrix even if only one gene passes the threshold.
TCGA_LIHC_Exp1 <- TCGA_LIHC_Exp1[rowMeans(TCGA_LIHC_Exp1) > 100, , drop = FALSE]
# Build the sample groups from characters 14-15 of each column name (barcode).
library(stringr)
sample_type <- substr(colnames(TCGA_LIHC_Exp1), 14, 15)
tumor <- colnames(TCGA_LIHC_Exp1)[sample_type == "01"]
normal <- colnames(TCGA_LIHC_Exp1)[sample_type == "11"]
# Extract the tumor and normal columns. drop = FALSE keeps a one-sample group
# as a one-column matrix, so its barcode survives as the column name in cbind().
tumor_sample <- TCGA_LIHC_Exp1[, tumor, drop = FALSE]
normal_sample <- TCGA_LIHC_Exp1[, normal, drop = FALSE]
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(normal_sample)
}
# Combine the two groups, tumor columns first and normal columns after.
exprSet_by_group <- cbind(tumor_sample, normal_sample)
# Prepend the gene names as a first column called gene_name. cbind() of a
# character vector with a numeric matrix yields a character matrix.
gene_name <- rownames(exprSet_by_group)
exprSet <- cbind(gene_name, exprSet_by_group)
### Store the expression table in two formats (txt and csv).
# Build the output table from the numeric matrix rather than from the
# character matrix exprSet, so the expression values are written as numbers
# instead of strings. gene_name is the first column; check.names = FALSE keeps
# the sample barcodes unchanged as column names.
expr_out <- data.frame(
  gene_name = rownames(exprSet_by_group),
  exprSet_by_group,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
# fwrite() is given a data.table copy of the same table.
exprSetDT <- as.data.table(expr_out)
fwrite(exprSetDT, "./Matrix_txt/TCGA_LIHC_Count.txt") # txt format
# NOTE(review): the Tpm files below receive the same table as the Count files.
# They only contain TPM values if the merge loop was run on the
# "tpm_unstranded" column instead of "unstranded" - confirm before using them.
fwrite(exprSetDT, "./Matrix_txt/TCGA_LIHC_Tpm.txt") # txt format
write.csv(expr_out, "./Matrix_csv/TCGA_LIHC_Count.csv", row.names = FALSE) # csv format
write.csv(expr_out, "./Matrix_csv/TCGA_LIHC_Tpm.csv", row.names = FALSE) # csv format
remove(list = ls()) # clear every object from the current environment
setwd("C:/Users/石源方/Desktop/数据搬家/华理工/班级-华/各科课程作业/高等生物信息学-注意PDF格式/24-12-19-practice6/6") # set the working directory
getwd()
## [1] "C:/Users/石源方/Desktop/数据搬家/华理工/班级-华/各科课程作业/高等生物信息学-注意PDF格式/24-12-19-practice6/6"
# Extract clinical.cart.2024-12-23.tar.gz into the clinical_data directory.
extract_dir <- "./clinical_data" # where the extracted files are placed
# Use extract_dir instead of repeating the directory name as a second literal.
untar("./clinical.cart.2024-12-23.tar.gz", exdir = extract_dir)
# Extract the identifiers needed to prepare the clinical data for survival
# analysis. The metadata JSON file is read from the working directory.
library(readr) # provides read_tsv(), used to read the clinical table
library(jsonlite)
json <- jsonlite::fromJSON("metadata.cart.2024-12-23.json") # read the JSON file
entities <- json$associated_entities
# Pull one column, selected by position, out of every associated_entities
# element.
pull_entity_column <- function(idx) {
  sapply(entities, function(entity) unlist(entity[, idx]))
}
entity_submitter_id <- pull_entity_column(1)
case_id <- pull_entity_column(3)
# Two-column table: entity_submitter_id next to the matching case_id.
sample_case <- t(rbind(entity_submitter_id, case_id))
# Read the clinical table (a tab-separated file) from the extracted archive.
# The path is relative to the working directory set above, so it no longer
# repeats the absolute project path.
clinical <- read_tsv("./clinical_data/clinical.tsv")
## New names:
## Rows: 742 Columns: 197
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "\t" chr
## (197): case_id, case_submitter_id, project_id, age_at_index, age_is_obfu...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `margin_distance` -> `margin_distance...96`
## • `margins_involved_site` -> `margins_involved_site...97`
## • `residual_disease` -> `residual_disease...121`
## • `margin_distance` -> `margin_distance...163`
## • `margins_involved_site` -> `margins_involved_site...165`
## • `residual_disease` -> `residual_disease...176`
# Keep only the first row for each case_id and convert to a plain data frame.
first_per_case <- !duplicated(clinical$case_id)
clinical <- as.data.frame(clinical[first_per_case, ])
str(sample_case) # inspect the structure of sample_case
## chr [1:424, 1:2] "TCGA-FV-A3I0-01A-11R-A22L-07" ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "entity_submitter_id" "case_id"
str(clinical) # inspect the structure of clinical
## 'data.frame': 371 obs. of 197 variables:
## $ case_id : chr "0004d251-3f70-4395-b175-c94c2f5b1b81" "001887aa-36d0-463f-8bca-dec7043b4f2e" "00f209c3-041a-4c6e-8b0f-6383eb3c85fc" "010d5817-9a3f-4a06-8643-653116c044ff" ...
## $ case_submitter_id : chr "TCGA-DD-AAVP" "TCGA-DD-A4NP" "TCGA-BC-4073" "TCGA-MI-A75E" ...
## $ project_id : chr "TCGA-LIHC" "TCGA-LIHC" "TCGA-LIHC" "TCGA-LIHC" ...
## $ age_at_index : chr "48" "32" "73" "61" ...
## $ age_is_obfuscated : chr "'--" "'--" "'--" "'--" ...
## $ cause_of_death : chr "'--" "'--" "'--" "'--" ...
## $ cause_of_death_source : chr "'--" "'--" "'--" "'--" ...
## $ country_of_birth : chr "'--" "'--" "'--" "'--" ...
## $ country_of_residence_at_enrollment : chr "'--" "'--" "'--" "'--" ...
## $ days_to_birth : chr "-17833" "-11838" "-26795" "-22457" ...
## $ days_to_death : chr "'--" "'--" "'--" "'--" ...
## $ education_level : chr "'--" "'--" "'--" "'--" ...
## $ ethnicity : chr "not hispanic or latino" "not hispanic or latino" "not hispanic or latino" "not hispanic or latino" ...
## $ gender : chr "male" "male" "male" "male" ...
## $ marital_status : chr "'--" "'--" "'--" "'--" ...
## $ occupation_duration_years : chr "'--" "'--" "'--" "'--" ...
## $ population_group : chr "'--" "'--" "'--" "'--" ...
## $ premature_at_birth : chr "'--" "'--" "'--" "'--" ...
## $ race : chr "asian" "white" "white" "white" ...
## $ vital_status : chr "Alive" "Alive" "Alive" "Alive" ...
## $ weeks_gestation_at_birth : chr "'--" "'--" "'--" "'--" ...
## $ year_of_birth : chr "1959" "1973" "1936" "1952" ...
## $ year_of_death : chr "'--" "'--" "'--" "'--" ...
## $ adrenal_hormone : chr "'--" "'--" "'--" "'--" ...
## $ age_at_diagnosis : chr "17833" "11838" "26795" "22457" ...
## $ ajcc_clinical_m : chr "'--" "'--" "'--" "'--" ...
## $ ajcc_clinical_n : chr "'--" "'--" "'--" "'--" ...
## $ ajcc_clinical_stage : chr "'--" "'--" "'--" "'--" ...
## $ ajcc_clinical_t : chr "'--" "'--" "'--" "'--" ...
## $ ajcc_pathologic_m : chr "M0" "M0" "MX" "M0" ...
## $ ajcc_pathologic_n : chr "N0" "N0" "N0" "N0" ...
## $ ajcc_pathologic_stage : chr "Stage I" "Stage I" "Stage IIIA" "Stage IIIC" ...
## $ ajcc_pathologic_t : chr "T1" "T1" "T3" "T4" ...
## $ ajcc_serum_tumor_markers : chr "'--" "'--" "'--" "'--" ...
## $ ajcc_staging_system_edition : chr "6th" "6th" "6th" "7th" ...
## $ ann_arbor_b_symptoms : chr "'--" "'--" "'--" "'--" ...
## $ ann_arbor_b_symptoms_described : chr "'--" "'--" "'--" "'--" ...
## $ ann_arbor_clinical_stage : chr "'--" "'--" "'--" "'--" ...
## $ ann_arbor_extranodal_involvement : chr "'--" "'--" "'--" "'--" ...
## $ ann_arbor_pathologic_stage : chr "'--" "'--" "'--" "'--" ...
## $ best_overall_response : chr "'--" "'--" "'--" "'--" ...
## $ burkitt_lymphoma_clinical_variant : chr "'--" "'--" "'--" "'--" ...
## $ calgb_risk_group : chr "'--" "'--" "'--" "'--" ...
## $ cancer_detection_method : chr "'--" "'--" "'--" "'--" ...
## $ child_pugh_classification : chr "'--" "'--" "'--" "'--" ...
## $ clark_level : chr "'--" "'--" "'--" "'--" ...
## $ classification_of_tumor : chr "not reported" "not reported" "not reported" "not reported" ...
## $ cog_liver_stage : chr "'--" "'--" "'--" "'--" ...
## $ cog_neuroblastoma_risk_group : chr "'--" "'--" "'--" "'--" ...
## $ cog_renal_stage : chr "'--" "'--" "'--" "'--" ...
## $ cog_rhabdomyosarcoma_risk_group : chr "'--" "'--" "'--" "'--" ...
## $ contiguous_organ_invaded : chr "'--" "'--" "'--" "'--" ...
## $ days_to_best_overall_response : chr "'--" "'--" "'--" "'--" ...
## $ days_to_diagnosis : chr "0" "0" "0" "0" ...
## $ days_to_last_follow_up : chr "2752.0" "3308.0" "849.0" "507.0" ...
## $ days_to_last_known_disease_status : chr "'--" "'--" "'--" "'--" ...
## $ days_to_recurrence : chr "'--" "'--" "'--" "'--" ...
## $ diagnosis_is_primary_disease : chr "'--" "'--" "'--" "'--" ...
## $ double_expressor_lymphoma : chr "'--" "'--" "'--" "'--" ...
## $ double_hit_lymphoma : chr "'--" "'--" "'--" "'--" ...
## $ eln_risk_classification : chr "'--" "'--" "'--" "'--" ...
## $ enneking_msts_grade : chr "'--" "'--" "'--" "'--" ...
## $ enneking_msts_metastasis : chr "'--" "'--" "'--" "'--" ...
## $ enneking_msts_stage : chr "'--" "'--" "'--" "'--" ...
## $ enneking_msts_tumor_site : chr "'--" "'--" "'--" "'--" ...
## $ ensat_clinical_m : chr "'--" "'--" "'--" "'--" ...
## $ ensat_pathologic_n : chr "'--" "'--" "'--" "'--" ...
## $ ensat_pathologic_stage : chr "'--" "'--" "'--" "'--" ...
## $ ensat_pathologic_t : chr "'--" "'--" "'--" "'--" ...
## $ esophageal_columnar_dysplasia_degree : chr "'--" "'--" "'--" "'--" ...
## $ esophageal_columnar_metaplasia_present : chr "'--" "'--" "'--" "'--" ...
## $ fab_morphology_code : chr "'--" "'--" "'--" "'--" ...
## $ figo_stage : chr "'--" "'--" "'--" "'--" ...
## $ figo_staging_edition_year : chr "'--" "'--" "'--" "'--" ...
## $ first_symptom_longest_duration : chr "'--" "'--" "'--" "'--" ...
## $ first_symptom_prior_to_diagnosis : chr "'--" "'--" "'--" "'--" ...
## $ gastric_esophageal_junction_involvement : chr "'--" "'--" "'--" "'--" ...
## $ gleason_grade_group : chr "'--" "'--" "'--" "'--" ...
## $ gleason_grade_tertiary : chr "'--" "'--" "'--" "'--" ...
## $ gleason_patterns_percent : chr "'--" "'--" "'--" "'--" ...
## $ gleason_score : chr "'--" "'--" "'--" "'--" ...
## $ goblet_cells_columnar_mucosa_present : chr "'--" "'--" "'--" "'--" ...
## $ icd_10_code : chr "C22.0" "C22.0" "C22.0" "C22.0" ...
## $ igcccg_stage : chr "'--" "'--" "'--" "'--" ...
## $ inpc_grade : chr "'--" "'--" "'--" "'--" ...
## $ inpc_histologic_group : chr "'--" "'--" "'--" "'--" ...
## $ inrg_stage : chr "'--" "'--" "'--" "'--" ...
## $ inss_stage : chr "'--" "'--" "'--" "'--" ...
## $ international_prognostic_index : chr "'--" "'--" "'--" "'--" ...
## $ irs_group : chr "'--" "'--" "'--" "'--" ...
## $ irs_stage : chr "'--" "'--" "'--" "'--" ...
## $ ishak_fibrosis_score : chr "'--" "'--" "'--" "'--" ...
## $ iss_stage : chr "'--" "'--" "'--" "'--" ...
## $ last_known_disease_status : chr "not reported" "not reported" "not reported" "not reported" ...
## $ laterality : chr "'--" "'--" "'--" "'--" ...
## $ margin_distance...96 : chr "'--" "'--" "'--" "'--" ...
## $ margins_involved_site...97 : chr "'--" "'--" "'--" "'--" ...
## $ masaoka_stage : chr "'--" "'--" "'--" "'--" ...
## $ max_tumor_bulk_site : chr "'--" "'--" "'--" "'--" ...
## [list output truncated]
# Turn the sample/case character matrix into a data frame so it can be joined
# by column name.
sample_case <- as.data.frame(sample_case)
# Make sure the join key is a character vector on both sides.
sample_case$case_id <- as.character(sample_case$case_id)
clinical$case_id <- as.character(clinical$case_id)
# Left join: every row of sample_case is kept; the clinical columns are NA
# where no case_id matches. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
matrix <- merge(sample_case, clinical, by = "case_id", all.x = TRUE)
# List the available clinical column names.
colnames(clinical)
## [1] "case_id"
## [2] "case_submitter_id"
## [3] "project_id"
## [4] "age_at_index"
## [5] "age_is_obfuscated"
## [6] "cause_of_death"
## [7] "cause_of_death_source"
## [8] "country_of_birth"
## [9] "country_of_residence_at_enrollment"
## [10] "days_to_birth"
## [11] "days_to_death"
## [12] "education_level"
## [13] "ethnicity"
## [14] "gender"
## [15] "marital_status"
## [16] "occupation_duration_years"
## [17] "population_group"
## [18] "premature_at_birth"
## [19] "race"
## [20] "vital_status"
## [21] "weeks_gestation_at_birth"
## [22] "year_of_birth"
## [23] "year_of_death"
## [24] "adrenal_hormone"
## [25] "age_at_diagnosis"
## [26] "ajcc_clinical_m"
## [27] "ajcc_clinical_n"
## [28] "ajcc_clinical_stage"
## [29] "ajcc_clinical_t"
## [30] "ajcc_pathologic_m"
## [31] "ajcc_pathologic_n"
## [32] "ajcc_pathologic_stage"
## [33] "ajcc_pathologic_t"
## [34] "ajcc_serum_tumor_markers"
## [35] "ajcc_staging_system_edition"
## [36] "ann_arbor_b_symptoms"
## [37] "ann_arbor_b_symptoms_described"
## [38] "ann_arbor_clinical_stage"
## [39] "ann_arbor_extranodal_involvement"
## [40] "ann_arbor_pathologic_stage"
## [41] "best_overall_response"
## [42] "burkitt_lymphoma_clinical_variant"
## [43] "calgb_risk_group"
## [44] "cancer_detection_method"
## [45] "child_pugh_classification"
## [46] "clark_level"
## [47] "classification_of_tumor"
## [48] "cog_liver_stage"
## [49] "cog_neuroblastoma_risk_group"
## [50] "cog_renal_stage"
## [51] "cog_rhabdomyosarcoma_risk_group"
## [52] "contiguous_organ_invaded"
## [53] "days_to_best_overall_response"
## [54] "days_to_diagnosis"
## [55] "days_to_last_follow_up"
## [56] "days_to_last_known_disease_status"
## [57] "days_to_recurrence"
## [58] "diagnosis_is_primary_disease"
## [59] "double_expressor_lymphoma"
## [60] "double_hit_lymphoma"
## [61] "eln_risk_classification"
## [62] "enneking_msts_grade"
## [63] "enneking_msts_metastasis"
## [64] "enneking_msts_stage"
## [65] "enneking_msts_tumor_site"
## [66] "ensat_clinical_m"
## [67] "ensat_pathologic_n"
## [68] "ensat_pathologic_stage"
## [69] "ensat_pathologic_t"
## [70] "esophageal_columnar_dysplasia_degree"
## [71] "esophageal_columnar_metaplasia_present"
## [72] "fab_morphology_code"
## [73] "figo_stage"
## [74] "figo_staging_edition_year"
## [75] "first_symptom_longest_duration"
## [76] "first_symptom_prior_to_diagnosis"
## [77] "gastric_esophageal_junction_involvement"
## [78] "gleason_grade_group"
## [79] "gleason_grade_tertiary"
## [80] "gleason_patterns_percent"
## [81] "gleason_score"
## [82] "goblet_cells_columnar_mucosa_present"
## [83] "icd_10_code"
## [84] "igcccg_stage"
## [85] "inpc_grade"
## [86] "inpc_histologic_group"
## [87] "inrg_stage"
## [88] "inss_stage"
## [89] "international_prognostic_index"
## [90] "irs_group"
## [91] "irs_stage"
## [92] "ishak_fibrosis_score"
## [93] "iss_stage"
## [94] "last_known_disease_status"
## [95] "laterality"
## [96] "margin_distance...96"
## [97] "margins_involved_site...97"
## [98] "masaoka_stage"
## [99] "max_tumor_bulk_site"
## [100] "medulloblastoma_molecular_classification"
## [101] "melanoma_known_primary"
## [102] "metastasis_at_diagnosis"
## [103] "metastasis_at_diagnosis_site"
## [104] "method_of_diagnosis"
## [105] "micropapillary_features"
## [106] "mitosis_karyorrhexis_index"
## [107] "mitotic_count"
## [108] "morphology"
## [109] "ovarian_specimen_status"
## [110] "ovarian_surface_involvement"
## [111] "papillary_renal_cell_type"
## [112] "pediatric_kidney_staging"
## [113] "peritoneal_fluid_cytological_status"
## [114] "pregnant_at_diagnosis"
## [115] "primary_diagnosis"
## [116] "primary_disease"
## [117] "primary_gleason_grade"
## [118] "prior_malignancy"
## [119] "prior_treatment"
## [120] "progression_or_recurrence"
## [121] "residual_disease...121"
## [122] "satellite_nodule_present"
## [123] "secondary_gleason_grade"
## [124] "site_of_resection_or_biopsy"
## [125] "sites_of_involvement"
## [126] "sites_of_involvement_count"
## [127] "supratentorial_localization"
## [128] "synchronous_malignancy"
## [129] "tissue_or_organ_of_origin"
## [130] "tumor_burden"
## [131] "tumor_confined_to_organ_of_origin"
## [132] "tumor_depth"
## [133] "tumor_focality"
## [134] "tumor_grade"
## [135] "tumor_grade_category"
## [136] "tumor_of_origin"
## [137] "tumor_regression_grade"
## [138] "uicc_clinical_m"
## [139] "uicc_clinical_n"
## [140] "uicc_clinical_stage"
## [141] "uicc_clinical_t"
## [142] "uicc_pathologic_m"
## [143] "uicc_pathologic_n"
## [144] "uicc_pathologic_stage"
## [145] "uicc_pathologic_t"
## [146] "uicc_staging_system_edition"
## [147] "ulceration_indicator"
## [148] "weiss_assessment_findings"
## [149] "weiss_assessment_score"
## [150] "who_cns_grade"
## [151] "who_nte_grade"
## [152] "wilms_tumor_histologic_subtype"
## [153] "year_of_diagnosis"
## [154] "chemo_concurrent_to_radiation"
## [155] "clinical_trial_indicator"
## [156] "course_number"
## [157] "days_to_treatment_end"
## [158] "days_to_treatment_start"
## [159] "drug_category"
## [160] "embolic_agent"
## [161] "initial_disease_status"
## [162] "lesions_treated_number"
## [163] "margin_distance...163"
## [164] "margin_status"
## [165] "margins_involved_site...165"
## [166] "number_of_cycles"
## [167] "number_of_fractions"
## [168] "prescribed_dose"
## [169] "prescribed_dose_units"
## [170] "pretreatment"
## [171] "protocol_identifier"
## [172] "radiosensitizing_agent"
## [173] "reason_treatment_ended"
## [174] "reason_treatment_not_given"
## [175] "regimen_or_line_of_therapy"
## [176] "residual_disease...176"
## [177] "route_of_administration"
## [178] "therapeutic_agents"
## [179] "therapeutic_level_achieved"
## [180] "therapeutic_levels_achieved"
## [181] "therapeutic_target_level"
## [182] "timepoint_category"
## [183] "treatment_anatomic_site"
## [184] "treatment_anatomic_sites"
## [185] "treatment_arm"
## [186] "treatment_dose"
## [187] "treatment_dose_max"
## [188] "treatment_dose_units"
## [189] "treatment_duration"
## [190] "treatment_effect"
## [191] "treatment_effect_indicator"
## [192] "treatment_frequency"
## [193] "treatment_intent_type"
## [194] "treatment_or_therapy"
## [195] "treatment_outcome"
## [196] "treatment_outcome_duration"
## [197] "treatment_type"
# Clinical fields carried forward into the survival table.
demo <- c(
  "case_submitter_id", "age_at_index", "ethnicity", "gender", "race",
  "vital_status", "days_to_death", "days_to_last_follow_up",
  "ajcc_pathologic_stage", "ajcc_pathologic_t", "ajcc_pathologic_m",
  "ajcc_pathologic_n", "treatment_type"
)
# Restrict the merged table to those columns, in the order listed above.
matrix <- matrix[demo]
# Preview the first six rows to check structure and content.
head(matrix, n = 6L)
## case_submitter_id age_at_index ethnicity gender race
## 1 TCGA-DD-AAVP 48 not hispanic or latino male asian
## 2 TCGA-DD-A4NP 32 not hispanic or latino male white
## 3 TCGA-BC-4073 73 not hispanic or latino male white
## 4 TCGA-MI-A75E 61 not hispanic or latino male white
## 5 TCGA-G3-A7M8 31 not hispanic or latino male asian
## 6 TCGA-DD-A11C 69 not hispanic or latino male white
## vital_status days_to_death days_to_last_follow_up ajcc_pathologic_stage
## 1 Alive '-- 2752.0 Stage I
## 2 Alive '-- 3308.0 Stage I
## 3 Alive '-- 849.0 Stage IIIA
## 4 Alive '-- 507.0 Stage IIIC
## 5 Alive '-- 430.0 Stage I
## 6 Alive '-- 662.0 Stage I
## ajcc_pathologic_t ajcc_pathologic_m ajcc_pathologic_n
## 1 T1 M0 N0
## 2 T1 M0 N0
## 3 T3 MX N0
## 4 T4 M0 N0
## 5 T1 MX NX
## 6 T1 M0 N0
## treatment_type
## 1 Pharmaceutical Therapy, NOS
## 2 Radiation Therapy, NOS
## 3 Radiation Therapy, NOS
## 4 Pharmaceutical Therapy, NOS
## 5 Radiation Therapy, NOS
## 6 Radiation Therapy, NOS
# Give the thirteen columns short analysis names; the new names are assigned
# by position.
names(matrix) <- c(
  "ID", "Age", "Ethnicity", "Gender", "Race",
  "Status", "days_to_death", "days_to_last_follow_up",
  "Stage", "T", "M", "N", "Treatment"
)
# Keep only rows whose Status is "Alive" or "Dead"; any other outcome
# (e.g. "Not Reported") is dropped.
matrix <- matrix[matrix$Status %in% c("Alive", "Dead"), ]
# Convert the follow-up, death and age columns from text to numbers so they
# can be used to derive the survival information.
# The clinical table marks missing values with the literal placeholder "'--".
# That placeholder is mapped to NA explicitly, so as.numeric() now only warns
# when a value is genuinely malformed instead of warning on every run.
matrix[c("days_to_last_follow_up", "days_to_death", "Age")] <- lapply(
  matrix[c("days_to_last_follow_up", "days_to_death", "Age")],
  function(values) {
    values[values %in% "'--"] <- NA
    values <- as.numeric(values)
    # Missing values are stored as 0, exactly as before.
    # NOTE(review): a stored 0 cannot be told apart from a real age or
    # follow-up time of 0 — confirm downstream analyses expect this.
    values[is.na(values)] <- 0
    values
  }
)
# Derive the survival time in days from Status:
#   - rows with Status "Alive" take days_to_last_follow_up;
#   - every other row (e.g. "Dead") takes days_to_death.
matrix$days <- with(
  matrix,
  ifelse(Status == "Alive", days_to_last_follow_up, days_to_death)
)
## Add the three columns needed for survival analysis: event flag, months, years.
# OS is 0 when Status is "Alive" and 1 otherwise.
matrix[["OS"]] <- as.numeric(matrix[["Status"]] != "Alive")
# Survival time in months (days / 30), rounded to a whole number.
matrix[["month"]] <- round(matrix[["days"]] / 30, digits = 0)
# Survival time in completed years (months / 12, rounded down).
matrix[["OS.time"]] <- floor(matrix[["month"]] / 12)
# Save the cleaned clinical table as CSV, without row names. The file is
# written relative to the working directory, like the script's other inputs,
# instead of to a hard-coded user-specific absolute path.
write.csv(matrix, "clinical_clear.csv", row.names = FALSE)