00.Background Introduction and Purpose Statement

Methods and Data Source Description

Visualization and Interpretation of Results

01.Extraction of cell pyroptosis genes

# install.packages('knitr')
library(knitr)
#if (!requireNamespace("BiocManager", quietly = TRUE))
#    install.packages("BiocManager")
#BiocManager::install("limma")


library(limma)        # Load the package

# Read input files and process the data
rt=read.table("symbol.txt", header=T, sep="\t", check.names=F)
rt=as.matrix(rt)
rownames(rt)=rt[,1]
exp=rt[,2:ncol(rt)]
dimnames=list(rownames(exp), colnames(exp))
data=matrix(as.numeric(as.matrix(exp)), nrow=nrow(exp), dimnames=dimnames)
data=avereps(data)
data=data[rowMeans(data)>0,]


# Extract gene expression of pyroptosis genes
gene=read.table("gene.txt", header=F, sep="\t", check.names=F)
sameGene=intersect(as.vector(gene[,1]), rownames(data))
geneExp=data[sameGene,]

# Output the results
out=rbind(ID=colnames(geneExp),geneExp)
write.table(out,file="tcga.pyroptosisExp.txt",sep="\t",quote=F,col.names=F)

#Viewing the Output Results.
# Read the first 6 lines of the output file
output_file <- "tcga.pyroptosisExp.txt"
output_data <- read.table(output_file, header = FALSE, sep = "\t")
# head(output_data)
# Structure and overall summary of the output file
# str(output_data)
# summary(output_data)

02.Differential analysis of cell pyroptosis genes

# # Check if the 'BiocManager' package is installed, and if not, install it.
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")


# BiocManager::install("limma")
# install.packages("pheatmap")

# Load the required packages.
library(limma)
library(pheatmap)
## Warning: 程辑包'pheatmap'是用R版本4.3.2 来建造的
# Set the expression file and working directory.
expFile = "tcga.pyroptosisExp.txt"  # Gene expression file
#setwd("D:\\University_Malaya\\pyroptosis\\10.TCGAdiff")  # Set the working directory

# Read the input file.
rt <- read.table(expFile, header = TRUE, sep = "\t", check.names = FALSE)
rt <- as.matrix(rt)
rownames(rt) <- rt[, 1]
exp <- rt[, 2:ncol(rt)] 
dimnames <- list(rownames(exp), colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)), nrow = nrow(exp), dimnames = dimnames)
data <- avereps(data)

# Determine the number of normal and tumor samples.
group <- sapply(strsplit(colnames(data), "\\-"), "[", 4)
group <- sapply(strsplit(group, ""), "[", 1)
group <- gsub("2", "1", group)
conNum <- length(group[group == 1])       # Number of normal samples
treatNum <- length(group[group == 0])     # Number of tumor samples
sampleType <- c(rep(1, conNum), rep(2, treatNum))

# Perform differential analysis.
sigVec <- c()
outTab <- data.frame()
for (i in rownames(data)) {
  if (sd(data[i, ]) < 0.001) {
    next
  }
  wilcoxTest <- wilcox.test(data[i, ] ~ sampleType)
  pvalue <- wilcoxTest$p.value
  if (pvalue < 0.05) {
    Sig <- ifelse(pvalue < 0.001, "***", ifelse(pvalue < 0.01, "**", ifelse(pvalue < 0.05, "*", "")) )
    sigVec <- c(sigVec, paste0(i, Sig))
    conGeneMeans <- mean(data[i, 1:conNum])
    treatGeneMeans <- mean(data[i, (conNum + 1):ncol(data)])
    logFC <- log2(treatGeneMeans) - log2(conGeneMeans)
    outTab <- rbind(outTab, cbind(gene = i, conMean = conGeneMeans, treatMean = treatGeneMeans, logFC = logFC, pValue = pvalue))
  }
}

# ... [update code] ...

# Output the differential analysis results as tables
write.table(outTab, file = "diff.xls", sep = "\t", row.names = FALSE, quote = FALSE)
write.table(outTab, file = "diff.txt", sep = "\t", row.names = FALSE, quote = FALSE)

# Display the differential analysis results table
library(gridExtra)
grid.table(outTab)

# Output the expression file of differential genes
exp <- data[as.vector(outTab[, 1]), ]
expOut <- rbind(ID = colnames(exp), exp)
write.table(expOut, file = "diffGeneExp.txt", sep = "\t", col.names = FALSE, quote = FALSE)

# Create and display a heatmap of differential genes
exp <- log2(exp + 0.1)
row.names(exp) <- sigVec
Type <- c(rep("Normal", conNum), rep("Tumor", treatNum))
names(Type) <- colnames(data)
Type <- as.data.frame(Type)

# Display heatmap in R Markdown
library(pheatmap)
pheatmap(exp, 
         annotation = Type, 
         color = colorRampPalette(c(rep("blue", 5), "white", rep("red", 5)))(50),
         cluster_cols = FALSE,
         cluster_rows = TRUE,
         scale = "row",
         show_colnames = FALSE,
         show_rownames = TRUE,
         fontsize = 8,
         fontsize_row = 8,
         fontsize_col = 8)

03.Correlation

# Install necessary packages
#install.packages("reshape2")
#install.packages("igraph")

# Load required libraries
library(igraph)
## 
## 载入程辑包:'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(reshape2)

# Specify input files and parameters
expFile = "diffGeneExp.txt"     # Expression input files
cutoff = 0.2                    # Relevance threshold
out = "correlation.pdf"         # Output result files
#setwd("D:\\University_Malaya\\pyroptosis\\12.correlationn")

# Read input data from a tab-separated file
data = read.table(expFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)

# Remove normal samples based on the sample naming convention
group = sapply(strsplit(colnames(data), "\\-"), "[", 4)
group = sapply(strsplit(group, ""), "[", 1)
group = gsub("2", "1", group)
rt = data[, group == 0]

# Calculate the correlation matrix between genes
data = t(rt)
cordata = cor(data)

# Keep only the upper triangular part of the correlation matrix
mydata = cordata
upper = upper.tri(mydata)
mydata[upper] = NA

# Convert the correlation matrix to a data frame
df = data.frame(gene = rownames(mydata), mydata)
dfmeltdata = melt(df, id = "gene")
dfmeltdata = dfmeltdata[!is.na(dfmeltdata$value), ]
dfmeltdata = dfmeltdata[dfmeltdata$gene != dfmeltdata$variable, ]
dfmeltdata = dfmeltdata[abs(dfmeltdata$value) > cutoff, ]

# Define the nodes and edges of the network graph
corweight = dfmeltdata$value
weight = corweight+abs(min(corweight))+5
d = data.frame(p1=dfmeltdata$gene,p2=dfmeltdata$variable,weight=dfmeltdata$value)
g = graph.data.frame(dfmeltdata,directed = FALSE)

# Set colors, node sizes, and font sizes
E(g)$weight = weight
E(g)$color = ifelse(corweight > 0, rgb(254/255, 67/255, 101/255, abs(corweight)), rgb(0/255, 0/255, 255/255, abs(corweight)))
V(g)$size = 8
V(g)$shape = "circle"
V(g)$lable.cex = 1.2
V(g)$color = "white"

# Visualize the network graph
pdf(out, width = 7, height = 6)
layout(matrix(c(1, 1, 1, 0, 2, 0), byrow = TRUE, nc = 3), height = c(6, 1), width = c(3, 4, 3))
par(mar = c(1.5, 2, 2, 2))
vertex.frame.color = NA
plot(g, layout = layout_nicely, vertex.label.cex = V(g)$lable.cex, edge.width = E(g)$weight, edge.arrow.size = 0, vertex.label.color = "black", vertex.frame.color = vertex.frame.color, edge.color = E(g)$color, vertex.label.cex = V(g)$lable.cex, vertex.label.font = 2, vertex.size = V(g)$size, edge.curved = 0.4)

# Create a color legend
color_legend = c(rgb(254/255,67/255,101/255,seq(1,0,by=-0.01)),rgb(0/255,0/255,255/255,seq(0,1,by=0.01)))
par(mar=c(2,2,1,2),xpd = T,cex.axis=1.6,las=1)
barplot(rep(1,length(color_legend)),border = NA, space = 0,ylab="",xlab="",xlim=c(1,length(color_legend)),horiz=FALSE,
        axes = F, col=color_legend,main="")
axis(3,at=seq(1,length(color_legend),length=5),c(1,0.5,0,-0.5,-1),tick=FALSE)
dev.off()
## png 
##   2
# ... [update code] ...

# Save the network graph as an image and display it
out_image = "correlation.png"  # Define the output image file name

# Save the plot as a PNG file
png(out_image, width = 700, height = 600)
plot(g, layout = layout_nicely, vertex.label.cex = V(g)$lable.cex, edge.width = E(g)$weight, edge.arrow.size = 0, vertex.label.color = "black", vertex.frame.color = vertex.frame.color, edge.color = E(g)$color, vertex.label.cex = V(g)$lable.cex, vertex.label.font = 2, vertex.size = V(g)$size, edge.curved = 0.4)
dev.off()
## png 
##   2
# Display the saved image in R Markdown
knitr::include_graphics(out_image)

04. Cluster

# install.packages("survival")
# 
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("limma")

# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install(version = '3.18')
# BiocManager::install("ConsensusClusterPlus")


#Referencing packages
library(limma)
library(survival)
## Warning: 程辑包'survival'是用R版本4.3.2 来建造的
library(ConsensusClusterPlus)
# install.packages(ConsensusClusterPlus)
expFile="tcga.pyroptosisExp.txt"     # Expression data files
cliFile="time_tcga.txt"                   # Survival data files
workDir="D:\\University_Malaya\\pyroptosis"       #Working Catalog
#setwd(workDir)       # Set up a working directory

# Read expression input files
data=read.table(expFile, header=T, sep="\t", check.names=F, row.names=1)

# Delete normal samples
group=sapply(strsplit(colnames(data),"\\-"), "[", 4)
group=sapply(strsplit(group,""), "[", 1)
group=gsub("2", "1", group)
data=data[,group==0]
data=t(data)
rownames(data)=gsub("(.*?)\\-(.*?)\\-(.*?)\\-.*", "\\1\\-\\2\\-\\3", rownames(data))
data=avereps(data)
data=log2(data+1)

#Read survival data
cli=read.table(cliFile,sep="\t",check.names=F,header=T,row.names=1)      #Read clinical files

# Data merge and output results
sameSample=intersect(row.names(data),row.names(cli))
data=data[sameSample,]
cli=cli[sameSample,]
rt=cbind(cli,data)

# Single factor COX analysis
sigGenes=c()
for(i in colnames(rt)[3:ncol(rt)]){
  cox=coxph(Surv(futime, fustat) ~ rt[,i], data = rt)
  coxSummary=summary(cox)
  coxP=coxSummary$coefficients[,"Pr(>|z|)"]
  if(coxP<0.05){ sigGenes=c(sigGenes,i) }
}

#Clustering
maxK=9
data=t(data[,sigGenes])
results=ConsensusClusterPlus(data,
                             maxK=maxK,
                             reps=50,
                             pItem=0.8,
                             pFeature=1,
                             title=workDir,
                             clusterAlg="pam",
                             distance="euclidean",
                             seed=123456,
                             plot="png")
## end fraction
## clustered
## clustered
## clustered
## clustered
## clustered
## clustered
## clustered
## clustered
# Output typing results
clusterNum=2         # Into several categories
Cluster=results[[clusterNum]][["consensusClass"]]
Cluster=as.data.frame(Cluster)
Cluster[,1]=paste0("C", Cluster[,1])
ClusterOut=rbind(ID=colnames(Cluster), Cluster)
write.table(ClusterOut, file="cluster.txt", sep="\t", quote=F, col.names=F)


# Read and display the first five rows of the output file
cluster_data <- read.table("cluster.txt", header = FALSE, sep = "\t")
# head(cluster_data)

# Display the structure of the data frame
# str(cluster_data)

# Display summary statistics of the data
summary(cluster_data)
##       V1                 V2           
##  Length:1091        Length:1091       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character

05. ClusterSur

# # Install packages
# install.packages("survival")
# install.packages("survminer")

# Load libraries
library(survival)
library(survminer)
## Warning: 程辑包'survminer'是用R版本4.3.2 来建造的
## 载入需要的程辑包:ggplot2
## Warning: 程辑包'ggplot2'是用R版本4.3.2 来建造的
## 载入需要的程辑包:ggpubr
## 
## 载入程辑包:'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
# Define input files
ClusterFile = "cluster.txt"   # Clustering results file
cliFile = "time_tcga.txt"          # Survival data file

# Read input files
Cluster = read.table(ClusterFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)
cli = read.table(cliFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)
colnames(cli) = c("futime", "fustat")
cli$futime = cli$futime / 365

# Merge data
sameSample = intersect(row.names(Cluster), row.names(cli))
rt = cbind(cli[sameSample,, drop = FALSE], Cluster[sameSample,, drop = FALSE])

# Survival analysis
length = length(levels(factor(rt$Cluster)))
diff = survdiff(Surv(futime, fustat) ~ Cluster, data = rt)
pValue = 1 - pchisq(diff$chisq, df = length - 1)
if (pValue < 0.001) {
  pValue = "p<0.001"
} else {
  pValue = paste0("p=", sprintf("%.03f", pValue))
}
fit <- survfit(Surv(futime, fustat) ~ Cluster, data = rt)

# Plot survival curves
bioCol = c("#0066FF", "#FF9900", "#FF0000", "#6E568C", "#7CC767", "#223D6C", "#D20A13", "#FFD121", "#088247", "#11AA4D")
bioCol = bioCol[1:length]
surPlot = ggsurvplot(fit,
                     data = rt,
                     conf.int = FALSE,
                     pval = pValue,
                     pval.size = 6,
                     legend.title = "Cluster",
                     legend.labs = levels(factor(rt[,"Cluster"])),
                     legend = c(0.8, 0.8),
                     font.legend = 10,
                     xlab = "Time (years)",
                     break.time.by = 1,
                     palette = bioCol,
                     surv.median.line = "hv",
                     risk.table = TRUE,
                     cumevents = FALSE,
                     risk.table.height = 0.25
)

# Save plot as PDF and show in R Markdown
pdf(file = "survival.pdf", onefile = FALSE, width = 7, height = 5.5)
print(surPlot)
dev.off()
## png 
##   2
# Show plot in R Markdown
print(surPlot)

06. ClusterDiff

# Check if the 'BiocManager' package is available, and install it if necessary
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("limma")
# 
# # Install the 'ggplot2' package
# install.packages("ggplot2")

# Load required packages
library(limma)
library(ggplot2)

# Define input file paths
expFile = "tcga.pyroptosisExp.txt"        # Expression input file
cluFile = "cluster.txt"       # Clustering result file
logFCfilter = 0.585           # logFC filtering threshold (logFC = 0.585: Fold change = 1.5)
fdrFilter = 0.05              # FDR filtering threshold

# Read expression data file and format it
rt = read.table(expFile, header = TRUE, sep = "\t", check.names = FALSE)
rt = as.matrix(rt)
rownames(rt) = rt[, 1]
exp = rt[, 2:ncol(rt)]
dimnames = list(rownames(exp), colnames(exp))
data = matrix(as.numeric(as.matrix(exp)), nrow = nrow(exp), dimnames = dimnames)
data = avereps(data)

# Remove normal samples
group = sapply(strsplit(colnames(data), "\\-"), "[", 4)
group = sapply(strsplit(group, ""), "[", 1)
group = gsub("2", "1", group)
data = data[, group == 0]
colnames(data) = gsub("(.*?)\\-(.*?)\\-(.*?)\\-.*", "\\1\\-\\2\\-\\3", colnames(data))

# Read clustering result file
Type = read.table(cluFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)
sameSample = intersect(colnames(data), row.names(Type))
data = data[, sameSample, drop = FALSE]
Type = Type[sameSample,, drop = FALSE]

# Extract samples from different clusters
c1 = Type[Type$Cluster == "C1", , drop = FALSE]
c2 = Type[Type$Cluster == "C2", , drop = FALSE]
dataC1 = data[, row.names(c1)]
dataC2 = data[, row.names(c2)]
data = cbind(dataC1, dataC2)
data = data[rowMeans(data) > 0.5,]
conNum = ncol(dataC1)
treatNum = ncol(dataC2)
Type = c(rep(1, conNum), rep(2, treatNum))

# Perform differential analysis
outTab = data.frame()
for (i in row.names(data)) {
  rt = data.frame(expression = data[i, ], Type = Type)
  wilcoxTest = wilcox.test(expression ~ Type, data = rt)
  pvalue = wilcoxTest$p.value
  conGeneMeans = mean(data[i, 1:conNum])
  treatGeneMeans = mean(data[i, (conNum + 1):ncol(data)])
  logFC = log2(treatGeneMeans) - log2(conGeneMeans)
  conMed = median(data[i, 1:conNum])
  treatMed = median(data[i, (conNum + 1):ncol(data)])
  diffMed = treatMed - conMed
  if (((logFC > 0) & (diffMed > 0)) | ((logFC < 0) & (diffMed < 0))) {
    outTab = rbind(outTab, cbind(gene = i, Mean1 = conGeneMeans, Mean2 = treatGeneMeans, logFC = logFC, pValue = pvalue))
  }
}
pValue = outTab[, "pValue"]
fdr = p.adjust(as.numeric(as.vector(pValue)), method = "fdr")
outTab = cbind(outTab, fdr = fdr)

# Output differential table
outDiff = outTab[(abs(as.numeric(as.vector(outTab$logFC))) > logFCfilter & as.numeric(as.vector(outTab$fdr)) < fdrFilter), ]
write.table(outDiff, file = "diff.txt", sep = "\t", row.names = FALSE, quote = FALSE)

# Output expression file for differentially expressed genes
diffExp = rbind(ID = colnames(data[as.vector(outDiff[, 1]), ]), data[as.vector(outDiff[, 1]), ])
write.table(diffExp, file = "diffGeneExp.txt", sep = "\t", col.names = FALSE, quote = FALSE)



# Read and display the first five rows of the output file
output_file <- "diff.txt"
head_data <- read.table(output_file, header = TRUE, sep = "\t")
head(head_data)
##    gene     Mean1     Mean2     logFC        pValue           fdr
## 1 CASP1 4.0079569  9.053108 1.1755462  3.482304e-76  2.925136e-75
## 2 CASP4 4.5081481  7.684835 0.7694793  2.967513e-61  1.780508e-60
## 3  GZMB 0.9494651  9.880166 3.3793484 2.549220e-115 5.353363e-114
## 4  IL18 3.2757382  7.531955 1.2012044  3.079212e-74  2.155448e-73
## 5  IL1B 1.3867886  2.308930 0.7354768  9.865222e-20  3.187226e-19
## 6  IRF1 6.6328428 17.256115 1.3794085  2.191896e-97  2.301491e-96
# View the structure and data summary of the file
str(head_data)
## 'data.frame':    14 obs. of  6 variables:
##  $ gene  : chr  "CASP1" "CASP4" "GZMB" "IL18" ...
##  $ Mean1 : num  4.008 4.508 0.949 3.276 1.387 ...
##  $ Mean2 : num  9.05 7.68 9.88 7.53 2.31 ...
##  $ logFC : num  1.176 0.769 3.379 1.201 0.735 ...
##  $ pValue: num  3.48e-76 2.97e-61 2.55e-115 3.08e-74 9.87e-20 ...
##  $ fdr   : num  2.93e-75 1.78e-60 5.35e-114 2.16e-73 3.19e-19 ...
summary(head_data)
##      gene               Mean1            Mean2            logFC       
##  Length:14          Min.   :0.6361   Min.   : 1.158   Min.   :0.7355  
##  Class :character   1st Qu.:0.9995   1st Qu.: 2.364   1st Qu.:0.9193  
##  Mode  :character   Median :1.2740   Median : 2.923   Median :1.0243  
##                     Mean   :2.2349   Mean   : 6.473   Mean   :1.3762  
##                     3rd Qu.:3.5378   3rd Qu.: 8.711   3rd Qu.:1.3349  
##                     Max.   :6.6328   Max.   :21.414   Max.   :3.3793  
##      pValue               fdr           
##  Min.   :0.000e+00   Min.   :0.000e+00  
##  1st Qu.:0.000e+00   1st Qu.:0.000e+00  
##  Median :0.000e+00   Median :0.000e+00  
##  Mean   :4.008e-20   Mean   :1.153e-19  
##  3rd Qu.:0.000e+00   3rd Qu.:0.000e+00  
##  Max.   :4.625e-19   Max.   :1.295e-18
# Read and display the first five rows of another output file
output_file2 <- "diffGeneExp.txt"
head_data2 <- read.table(output_file2, header = FALSE, sep = "\t")
# head(head_data2)

# View the structure and data summary of the file
# str(head_data2)
# summary(head_data2)

07. ClusterHeatmap

# Check if the 'BiocManager' package is available, and install it if necessary
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("limma")

# Install the 'pheatmap' package
#install.packages("pheatmap")

# Load required packages
library(limma)
library(pheatmap)

# Define input file paths
expFile = "diffGeneExp.txt"  # Expression data file
ClusterFile = "cluster.txt"  # Clustering data file
cliFile = "clinical.txt"     # Clinical data file

# Read the expression data file
exp = read.table(expFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)

# Read the clustering data
Cluster = read.table(ClusterFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)

# Read the clinical data file
cli = read.table(cliFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)

# Merge data
samSample = intersect(row.names(Cluster), row.names(cli))
cli = cli[samSample,, drop = FALSE]
Cluster = Cluster[samSample,, drop = FALSE]
Type = cbind(Cluster, cli)
Type = Type[order(Type$Cluster),, drop = FALSE]
exp = exp[, row.names(Type)]

# Iterate through clinical traits to obtain significance labels
sigVec = c("Cluster")
for (clinical in colnames(Type[, 2:ncol(Type)])) {
  data = Type[c("Cluster", clinical)]
  colnames(data) = c("Cluster", "clinical")
  data = data[data[,"clinical"] != "unknow", ]
  tableStat = table(data)
  stat = chisq.test(tableStat)
  pvalue = stat$p.value
  Sig = ifelse(pvalue < 0.001, "***", ifelse(pvalue < 0.01, "**", ifelse(pvalue < 0.05, "*", "")))
  sigVec = c(sigVec, paste0(clinical, Sig))
}
## Warning in chisq.test(tableStat): Chi-squared近似算法有可能不准

## Warning in chisq.test(tableStat): Chi-squared近似算法有可能不准
colnames(Type) = sigVec

# Define heatmap annotation colors
colorList = list()
bioCol = c("#0066FF", "#FF0000", "#ed1299", "#0dbc21", "#246b93", "#cc8e12", "#d561dd",
           "#6ad157", "#f7aa5d", "#9ed84e", "#39ba30", "#373bbf", "#a1ce4c", "#ef3bb6", "#d66551",
           "#1a918f", "#ddd53e", "#ff66fc", "#2927c4", "#57e559", "#8e3af4", "#f9a270", "#22547f", "#db5e92",
           "#4aef7b", "#e86502", "#99db27", "#e07233", "#8249aa", "#cebb10", "#03827f", "#931635", "#ff523f",
           "#edd05e", "#6f25e8", "#0dbc21", "#167275", "#280f7a", "#6373ed", "#5b910f", "#7b34c1", "#0cf29a", "#d80fc1",
           "#dd27ce", "#07a301", "#391c82", "#2baeb5", "#925bea", "#09f9f5", "#63ff4f")
j=0
for(cli in colnames(Type[,1:ncol(Type)])){
  cliLength=length(levels(factor(Type[,cli])))
  cliCol=bioCol[(j+1):(j+cliLength)]
  j=j+cliLength
  names(cliCol)=levels(factor(Type[,cli]))
  if("unknow" %in% levels(factor(Type[,cli]))){
    cliCol["unknow"]="grey75"}
  colorList[[cli]]=cliCol
}

# Visualize the heatmap
pdf("heatmap.pdf", width = 8.5, height = 6)
exp = log2(exp + 0.01)
pheatmap(exp,
         annotation = Type,
         annotation_colors = colorList,
         color = colorRampPalette(c(rep("blue", 5), "white", rep("red", 5)))(100),
         cluster_cols = FALSE,
         cluster_rows = FALSE,
         scale = "row",
         show_colnames = FALSE,
         show_rownames = TRUE,
         fontsize = 6,
         fontsize_row = 1,
         fontsize_col = 6)
dev.off()
## pdf 
##   3
# Display the heatmap in R Markdown
heatmap_07=pheatmap(exp,
         annotation = Type,
         annotation_colors = colorList,
         color = colorRampPalette(c(rep("blue", 5), "white", rep("red", 5)))(100),
         cluster_cols = FALSE,
         cluster_rows = FALSE,
         scale = "row",
         show_colnames = FALSE,
         show_rownames = TRUE,
         fontsize = 6,
         fontsize_row = 1,
         fontsize_col = 6)
print(heatmap_07)

08. Typing differential gene expression

# # Install required packages if not already installed
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("limma")
# 
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("sva")

# Load necessary packages
library(limma)
library(sva)
## 载入需要的程辑包:mgcv
## 载入需要的程辑包:nlme
## This is mgcv 1.8-42. For overview type 'help("mgcv-package")'.
## 载入需要的程辑包:genefilter
## 载入需要的程辑包:BiocParallel
# Set file paths and working directory
tcgaExpFile = "symbol.txt"      # TCGA expression data file
geoExpFile = "geoMatrix.txt"    # GEO expression data file
geneFile = "diff.txt"           # Gene list file

rt=read.table(tcgaExpFile, header=T, sep="\t", check.names=F)
rt=as.matrix(rt)
rownames(rt)=rt[,1]
exp=rt[,2:ncol(rt)]
dimnames=list(rownames(exp),colnames(exp))
tcga=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
tcga=avereps(tcga)
tcga=log2(tcga+1) 

# Remove normal samples
group = sapply(strsplit(colnames(tcga), "\\-"), "[", 4)
group = sapply(strsplit(group, ""), "[", 1)
group = gsub("2", "1", group)
tcga = tcga[, group == 0]
tcga = t(tcga)
rownames(tcga) = gsub("(.*?)\\-(.*?)\\-(.*?)\\-.*", "\\1\\-\\2\\-\\3", rownames(tcga))
tcga = t(avereps(tcga))

# Read GEO gene expression file and process the data
rt = read.table(geoExpFile, header = TRUE, sep = "\t", check.names = FALSE)
rt = as.matrix(rt)
rownames(rt) = rt[, 1]
exp = rt[, 2:ncol(rt)]
dimnames = list(rownames(exp), colnames(exp))
geo = matrix(as.numeric(as.matrix(exp)), nrow = nrow(exp), dimnames = dimnames)
geo = avereps(geo)

# If GEO data is not log2-transformed, automatically apply log2 transformation
qx = as.numeric(quantile(geo, c(0, 0.25, 0.5, 0.75, 0.99, 1.0), na.rm = TRUE))
LogC = (qx[5] > 100) || ((qx[6] - qx[1]) > 50 && qx[2] > 0)
if (LogC) {
  geo[geo < 0] = 0
  geo = log2(geo + 1)
}
geo = normalizeBetweenArrays(geo)

# Get the intersection of genes and their expression in both TCGA and GEO datasets
sameGene = intersect(row.names(tcga), row.names(geo))
tcgaOut = tcga[sameGene,]
geoOut = geo[sameGene,]

# Batch correction
all = cbind(tcgaOut, geoOut)
batchType = c(rep(1, ncol(tcgaOut)), rep(2, ncol(geoOut)))
outTab = ComBat(all, batchType, par.prior = TRUE)
## Found 47 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
tcgaOut = outTab[, colnames(tcgaOut)]
tcgaOut[tcgaOut < 0] = 0
geoOut = outTab[, colnames(geoOut)]
geoOut[geoOut < 0] = 0

# Output the corrected data
tcgaTab = rbind(ID = colnames(tcgaOut), tcgaOut)
write.table(tcgaTab, file = "TCGA.normalize.txt", sep = "\t", quote = FALSE, col.names = FALSE)
geoTab = rbind(ID = colnames(geoOut), geoOut)
write.table(geoTab, file = "GEO.normalize.txt", sep = "\t", quote = FALSE, col.names = FALSE)

# Get the expression of differentially expressed genes
gene = read.table(geneFile, header = TRUE, sep = "\t", check.names = FALSE)
sameGene = intersect(as.vector(gene[, 1]), rownames(tcgaOut))
tcgaShareExp = tcgaOut[sameGene,]
geoShareExp = geoOut[sameGene,]

# Output the expression of differentially expressed genes
tcgaShareExp = rbind(ID = colnames(tcgaShareExp), tcgaShareExp)
write.table(tcgaShareExp, file = "TCGA.share.txt", sep = "\t", quote = FALSE, col.names = FALSE)
geoShareExp = rbind(ID = colnames(geoShareExp), geoShareExp)
write.table(geoShareExp, file = "GEO.share.txt", sep = "\t", quote = FALSE, col.names = FALSE)



# Read output files
tcga_normalize <- read.table("TCGA.normalize.txt", header = FALSE, sep = "\t")
geo_normalize <- read.table("GEO.normalize.txt", header = FALSE, sep = "\t")
tcga_share <- read.table("TCGA.share.txt", header = FALSE, sep = "\t")
geo_share <- read.table("GEO.share.txt", header = FALSE, sep = "\t")

# Display the first five rows of the output files
# head(tcga_normalize)
# head(geo_normalize)
# head(tcga_share)
# head(geo_share)

# Display the structure of the output files
# str(tcga_normalize)
# str(geo_normalize)
# str(tcga_share)
# str(geo_share)

# Display the summary of the output files
# summary(tcga_normalize)
# summary(geo_normalize)
# summary(tcga_share)
# summary(geo_share)

09. tcgaMergeTime

# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("limma")

library(limma)               # Load the package
expFile = "tcga.share.txt"     # Expression data file
cliFile = "time_tcga.txt"           # Survival data file

# Read the expression file and preprocess the input file
rt = read.table(expFile, header = TRUE, sep = "\t", check.names = FALSE)
rt = as.matrix(rt)
rownames(rt) = rt[, 1]
exp = rt[, 2:ncol(rt)]
dimnames = list(rownames(exp), colnames(exp))
data = matrix(as.numeric(as.matrix(exp)), nrow = nrow(exp), dimnames = dimnames)
data = avereps(data)
data = data[rowMeans(data) > 0,]
data = t(data)



rownames(data)=gsub("(.*?)\\-(.*?)\\-(.*?)\\-.*", "\\1\\-\\2\\-\\3", rownames(data))


# Read the survival data
cli = read.table(cliFile, sep = "\t", check.names = FALSE, header = TRUE, row.names = 1)     # Read the clinical file

# Merge the data and output the result
sameSample = intersect(row.names(data), row.names(cli))
data = data[sameSample,]
cli = cli[sameSample,]
out = cbind(cli, data)
out = cbind(id = row.names(out), out)
write.table(out, file = "tcga.expTime.txt", sep = "\t", row.names = FALSE, quote = FALSE)



# Run the R code
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("limma")

library(limma)               # Load the package
expFile = "tcga.share.txt"     # Expression data file
cliFile = "time_tcga.txt"           # Survival data file

# Read the expression file and preprocess the input file
rt = read.table(expFile, header = TRUE, sep = "\t", check.names = FALSE)
rt = as.matrix(rt)
rownames(rt) = rt[, 1]
exp = rt[, 2:ncol(rt)]
dimnames = list(rownames(exp), colnames(exp))
data = matrix(as.numeric(as.matrix(exp)), nrow = nrow(exp), dimnames = dimnames)
data = avereps(data)
data = data[rowMeans(data) > 0,]
data = t(data)

rownames(data)=gsub("(.*?)\\-(.*?)\\-(.*?)\\-.*", "\\1\\-\\2\\-\\3", rownames(data))

# Read the survival data
cli = read.table(cliFile, sep = "\t", check.names = FALSE, header = TRUE, row.names = 1)     # Read the clinical file

# Merge the data and output the result
sameSample = intersect(row.names(data), row.names(cli))
data = data[sameSample,]
cli = cli[sameSample,]
out = cbind(cli, data)
out = cbind(id = row.names(out), out)
write.table(out, file = "tcga.expTime.txt", sep = "\t", row.names = FALSE, quote = FALSE)

# Show the first five rows of the output file
# head(out)

# Display the structure of the output file
# str(out)

# View the summary statistics of the output file
summary(out)
##       id                futime           fustat           CASP1       
##  Length:1090        Min.   :  -7.0   Min.   :0.0000   Min.   :0.4887  
##  Class :character   1st Qu.: 446.0   1st Qu.:0.0000   1st Qu.:1.9989  
##  Mode  :character   Median : 823.5   Median :0.0000   Median :2.5485  
##                     Mean   :1231.8   Mean   :0.1367   Mean   :2.6065  
##                     3rd Qu.:1673.0   3rd Qu.:0.0000   3rd Qu.:3.1660  
##                     Max.   :8605.0   Max.   :1.0000   Max.   :5.2916  
##      CASP4             GZMB             IL18             IL1B       
##  Min.   :0.5875   Min.   :0.1860   Min.   :0.2892   Min.   :0.2644  
##  1st Qu.:2.3568   1st Qu.:0.6261   1st Qu.:1.6536   1st Qu.:0.8237  
##  Median :2.8377   Median :1.1427   Median :2.1724   Median :1.2301  
##  Mean   :2.7949   Mean   :1.5888   Mean   :2.2846   Mean   :1.3719  
##  3rd Qu.:3.2158   3rd Qu.:2.2003   3rd Qu.:2.8345   3rd Qu.:1.8250  
##  Max.   :4.8504   Max.   :6.5148   Max.   :5.4780   Max.   :4.5064  
##       IRF1            AIM2            GSDMC             IL6        
##  Min.   :1.184   Min.   :0.2194   Min.   :0.2925   Min.   :0.2028  
##  1st Qu.:2.884   1st Qu.:0.5278   1st Qu.:0.4723   1st Qu.:0.4190  
##  Median :3.392   Median :0.8802   Median :0.6789   Median :0.7357  
##  Mean   :3.463   Mean   :1.1920   Mean   :1.0491   Mean   :1.0702  
##  3rd Qu.:3.981   3rd Qu.:1.4941   3rd Qu.:1.2437   3rd Qu.:1.3454  
##  Max.   :6.329   Max.   :6.5610   Max.   :6.2548   Max.   :7.0604  
##      NLRC4            NLRP1            NLRP3             TNF        
##  Min.   :0.3405   Min.   :0.3044   Min.   :0.2581   Min.   :0.1493  
##  1st Qu.:0.7022   1st Qu.:0.9466   1st Qu.:0.6698   1st Qu.:0.6573  
##  Median :0.8949   Median :1.2440   Median :0.9305   Median :1.0588  
##  Mean   :0.9533   Mean   :1.3208   Mean   :1.0095   Mean   :1.2509  
##  3rd Qu.:1.1287   3rd Qu.:1.6077   3rd Qu.:1.2581   3rd Qu.:1.6260  
##  Max.   :3.5609   Max.   :4.0188   Max.   :3.1320   Max.   :5.2421  
##       GZMA       
##  Min.   :0.1795  
##  1st Qu.:1.7484  
##  Median :2.5966  
##  Mean   :2.7481  
##  3rd Qu.:3.5818  
##  Max.   :7.0045

10. geoMergeTime

# # Check if the "BiocManager" package is available; if not, install it
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("limma")

library(limma)                # Load the package
expFile = "geo.share.txt"     # Expression data file
cliFile = "time_geo.txt"          # Survival data file

# Read the expression file and preprocess the input file
rt = read.table(expFile, sep = "\t", header = TRUE, check.names = FALSE)
rt = as.matrix(rt)
rownames(rt) = rt[, 1]
exp = rt[, 2:ncol(rt)]
dimnames = list(rownames(exp), colnames(exp))
data = matrix(as.numeric(as.matrix(exp)), nrow = nrow(exp), dimnames = dimnames)
data = avereps(data)
data = data[rowMeans(data) > 0, ]
data = t(data)

# Read the survival data
cli = read.table(cliFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)     # Read the clinical file

# Merge the data and output the results
sameSample = intersect(row.names(data), row.names(cli))
data = data[sameSample, ]
cli = cli[sameSample, ]
out = cbind(cli, data)
out = cbind(id = row.names(out), out)
write.table(out, file = "geo.expTime.txt", sep = "\t", row.names = FALSE, quote = FALSE)
#==============================================================================#
# Read the output file
output_file <- "geo.expTime.txt"
output_data <- read.table(output_file, sep = "\t", header = TRUE)

# Display the first five rows of data
# head(output_data)

# Show the structure and summary of the data
# str(output_data)
# summary(output_data)

12. Construction of prognostic model

# Install packages
# install.packages("glmnet")
# install.packages("survival")
# install.packages("neuralnet")
# install.packages("rpart.plot")

# Load libraries
library("neuralnet")
## Warning: 程辑包'neuralnet'是用R版本4.3.2 来建造的
library("glmnet")
## Warning: 程辑包'glmnet'是用R版本4.3.2 来建造的
## 载入需要的程辑包:Matrix
## Loaded glmnet 4.1-8
library("survival")
library("rpart")
library(rpart.plot)
## Warning: 程辑包'rpart.plot'是用R版本4.3.2 来建造的
coxSigFile = "tcga.uniSigExp.txt"     # Expression file of TCGA significant genes for Cox analysis
geoFile = "geo.expTime.txt"           # Input file from the GEO database

# Read files and prepare the input files
rt = read.table(coxSigFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)
geo = read.table(geoFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)
sameGene = intersect(colnames(rt)[3:ncol(rt)], colnames(geo)[3:ncol(geo)])
rt = rt[, c("futime", "fustat", sameGene)]
rt$futime[rt$futime <= 0] = 0.003

sameCol = intersect(colnames(rt), colnames(geo))
geo2 = subset(geo, select = c(sameCol))
geo2$futime <- geo2$futime / 365
tcga2 = subset(rt, select = c(sameCol))
all_data = rbind(geo2, tcga2)
all_data$futime[all_data$futime < 1] = 0.5

set.seed(110)  # Set a random seed for reproducibility

# Split the data into a 70% training set and a 30% test set
sample_size <- floor(0.7 * nrow(all_data))  # Calculate the size of the training set
train_index <- sample(1:nrow(all_data), sample_size)  # Generate random indices for the training set
train_set <- all_data[train_index, ]  # Create the training set
test_set <- all_data[-train_index, ]  # Create the test set

fustat1 <- train_set$fustat
futime1 <- train_set$futime
surdata1 <- Surv(train_set$futime, train_set$fustat)
fustat2 <- test_set$fustat
futime2 <- test_set$futime
surdata2 <- Surv(test_set$futime, test_set$fustat)
train1 <- cbind(train_set[, 3:ncol(train_set)], fustat1)
train2 <- cbind(train_set[, 3:ncol(train_set)], futime1)
test1 <- cbind(test_set[, 3:ncol(test_set)], fustat2)
test2 <- cbind(test_set[, 3:ncol(test_set)], futime2)

# Build decision trees for predicting survival status
tree_model1 <- rpart(train1$fustat1 ~ ., data = train1, method = "class")
# Build decision trees for predicting survival time
tree_model2 <- rpart(train2$futime1 ~ ., data = train2, method = "anova")

# Visualize the decision tree for survival status prediction
prp(tree_model1, type = 1, extra = "auto")

# Visualize the decision tree for survival time prediction
prp(tree_model2, type = 2, extra = "auto")

# Predict survival status for the test set
predicted_class_test <- predict(tree_model1, test1[, 1:ncol(test1) - 1], type = "class")
# Predict survival time for the test set
predicted_time_test <- predict(tree_model2, test1[, 1:ncol(test1) - 1])
# Predict survival status for the training set
predicted_class_train <- predict(tree_model1, train1[, 1:ncol(train1) - 1], type = "class")
# Predict survival time for the training set
predicted_time_train <- predict(tree_model2, train1[, 1:ncol(train1) - 1])
risk_test = as.vector(ifelse(predicted_class_test == 1, "high", "low"))
risk_train = as.vector(ifelse(predicted_class_train == 1, "high", "low"))

# Combine the predictions with the original data for the test set
outTab1 = cbind(train_set, riskScore = as.vector(predicted_time_train), risk = risk_train)
# Combine the predictions with the original data for the training set
outTab2 = cbind(test_set, riskScore = as.vector(predicted_class_test), risk = risk_test)

# Write the results to files
write.table(cbind(id = rownames(outTab1), outTab1), file = "trainRisk.txt", sep = "\t", quote = FALSE, row.names = FALSE)
write.table(cbind(id = rownames(outTab2), outTab2), file = "testRisk.txt", sep = "\t", quote = FALSE, row.names = FALSE)

# View the first five lines of the output file
# head(read.table("trainRisk.txt", header = TRUE, sep = "\t"))

# View the structure and overall data of the file
str(read.table("trainRisk.txt", header = TRUE, sep = "\t"))
## 'data.frame':    835 obs. of  18 variables:
##  $ id       : chr  "TCGA-E2-A15R" "TCGA-BH-A1FD" "TCGA-A8-A07C" "TCGA-A8-A06P" ...
##  $ futime   : num  4.75 2.76 2.83 1.08 3.62 ...
##  $ fustat   : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ CASP1    : num  1.15 2.44 2.19 1.83 2.26 ...
##  $ CASP4    : num  1.03 2.41 3.01 2.36 2.64 ...
##  $ GZMB     : num  0.434 0.46 3.09 0.548 0.604 ...
##  $ IL18     : num  0.946 1.72 2.308 2.27 1.128 ...
##  $ IL1B     : num  1.123 1.392 1.61 0.703 0.653 ...
##  $ IRF1     : num  1.97 4.67 2.72 2.42 2.93 ...
##  $ AIM2     : num  0.232 1.175 1.711 0.553 0.367 ...
##  $ GSDMC    : num  0.414 1.097 0.599 0.447 0.552 ...
##  $ IL6      : num  0.673 0.692 1.015 0.447 0.551 ...
##  $ NLRC4    : num  0.486 0.974 1.191 0.853 0.599 ...
##  $ NLRP1    : num  0.896 0.899 0.955 1.348 1.028 ...
##  $ NLRP3    : num  0.627 0.88 0.923 1 0.69 ...
##  $ GZMA     : num  0.521 0.911 3.845 1.805 2.025 ...
##  $ riskScore: num  7.27 2.36 2.36 1.51 2.76 ...
##  $ risk     : chr  "low" "high" "low" "low" ...

13. Survival Analysis

# Install packages
# install.packages("survival")
# install.packages("survminer")

# Load libraries
library(survival)
library(survminer)


# Function to plot survival curves
bioSurvival = function(inputFile = NULL, outFile = NULL) {
  # Read the input file
  rt = read.table(inputFile, header = TRUE, sep = "\t", check.names = FALSE)
  
  # Compare survival differences between high and low-risk groups and obtain significance p-value
  diff = survdiff(Surv(futime, fustat) ~ risk, data = rt)
  pValue = 1 - pchisq(diff$chisq, df = 1)
  if (pValue < 0.001) {
    pValue = "p < 0.001"
  } else {
    pValue = paste0("p = ", sprintf("%.03f", pValue))
  }
  fit = survfit(Surv(futime, fustat) ~ risk, data = rt)
  
  # Plot survival curves
  surPlot = ggsurvplot(fit,
                       data = rt,
                       conf.int = TRUE,
                       pval = pValue,
                       pval.size = 6,
                       legend.title = "Risk",
                       legend.labs = c("High risk", "Low risk"),
                       xlab = "Time (years)",
                       break.time.by = 1,
                       palette = c("red", "blue"),
                       risk.table = TRUE,
                       risk.table.title = "",
                       risk.table.height = 0.25
  )
  
 # Save plot as PDF
  pdf(file = outFile, onefile = FALSE, width = 6.5, height = 5.5)
  print(surPlot)
  dev.off()

  # Print plot in R Markdown
  print(surPlot)
}

# Generate survival plots for the training and test datasets
bioSurvival(inputFile = "trainRisk.txt", outFile = "trainSurv.pdf")

bioSurvival(inputFile = "testRisk.txt", outFile = "testSurv.pdf")

14. ROC curve

#Install necessary packages
# install.packages("survival")
# install.packages("survminer")
# install.packages("timeROC")

# Load required libraries
library(survival)
library(survminer)
library(timeROC)
## Warning: 程辑包'timeROC'是用R版本4.3.2 来建造的
## 
## 载入程辑包:'timeROC'
## The following object is masked from 'package:igraph':
## 
##     compare
# Define a function for drawing ROC curves
bioROC1 = function(inputFile = NULL, rocFile = NULL) {
  # Read the input file
  rt = read.table(inputFile, header = TRUE, sep = "\t", check.names = FALSE)
  # Generate ROC curves
  ROC_rt = timeROC(T = rt$futime, delta = rt$fustat,
                   marker = ifelse(rt$risk=='low',0,1), cause = 1,
                   weighting = 'aalen',
                   times = c(1, 3, 5), ROC = TRUE)
  pdf(file = rocFile, width = 10, height = 10)
  plot(ROC_rt, time = 1, col = 'green', title = FALSE, lwd = 2)
  plot(ROC_rt, time = 3, col = 'blue', add = TRUE, title = FALSE, lwd = 2)
  plot(ROC_rt, time = 5, col = 'red', add = TRUE, title = FALSE, lwd = 2)
  legend('bottomright',
         c(paste0('AUC at 1 year: ', sprintf("%.03f", ROC_rt$AUC[1])),
           paste0('AUC at 3 years: ', sprintf("%.03f", ROC_rt$AUC[2])),
           paste0('AUC at 5 years: ', sprintf("%.03f", ROC_rt$AUC[3]))),
         col = c("green", "blue", "red"), lwd = 2, bty = 'n')
  dev.off()
  
    # Display the ROC curves in R Markdown
  plotROC(ROC_rt, NULL)
}

# Helper function to plot ROC curves
plotROC = function(ROC_rt, rocFile) {
  plot(ROC_rt, time = 1, col = 'green', title = FALSE, lwd = 2)
  plot(ROC_rt, time = 3, col = 'blue', add = TRUE, title = FALSE, lwd = 2)
  plot(ROC_rt, time = 5, col = 'red', add = TRUE, title = FALSE, lwd = 2)
  legend('bottomright',
         c(paste0('AUC at 1 year: ', sprintf("%.03f", ROC_rt$AUC[1])),
           paste0('AUC at 3 years: ', sprintf("%.03f", ROC_rt$AUC[2])),
           paste0('AUC at 5 years: ', sprintf("%.03f", ROC_rt$AUC[3]))),
         col = c("green", "blue", "red"), lwd = 2, bty = 'n')
}


# Define a function for drawing ROC curves
bioROC2 = function(inputFile = NULL, rocFile = NULL) {
  # Read the input file
  rt = read.table(inputFile, header = TRUE, sep = "\t", check.names = FALSE)
  # Generate ROC curves
  ROC_rt = timeROC(T = rt$futime, delta = rt$fustat,
                   marker = ifelse(rt$risk=='low',0,1), cause = 1,
                   weighting = 'aalen',
                   times = c(1, 3, 5), ROC = TRUE)
  pdf(file = rocFile, width = 10, height = 10)
  plot(ROC_rt, time = 1, col = 'green', title = FALSE, lwd = 2)
  plot(ROC_rt, time = 3, col = 'blue', add = TRUE, title = FALSE, lwd = 2)
  plot(ROC_rt, time = 5, col = 'red', add = TRUE, title = FALSE, lwd = 2)
  legend('bottomright',
         c(paste0('AUC at 1 year: ', sprintf("%.03f", ROC_rt$AUC[1])),
           paste0('AUC at 3 years: ', sprintf("%.03f", ROC_rt$AUC[2])),
           paste0('AUC at 5 years: ', sprintf("%.03f", ROC_rt$AUC[3]))),
         col = c("green", "blue", "red"), lwd = 2, bty = 'n')
  dev.off()
  
    # Display the ROC curves in R Markdown
  plotROC(ROC_rt, NULL)
}

# Helper function to plot ROC curves
plotROC = function(ROC_rt, rocFile) {
  plot(ROC_rt, time = 1, col = 'green', title = FALSE, lwd = 2)
  plot(ROC_rt, time = 3, col = 'blue', add = TRUE, title = FALSE, lwd = 2)
  plot(ROC_rt, time = 5, col = 'red', add = TRUE, title = FALSE, lwd = 2)
  legend('bottomright',
         c(paste0('AUC at 1 year: ', sprintf("%.03f", ROC_rt$AUC[1])),
           paste0('AUC at 3 years: ', sprintf("%.03f", ROC_rt$AUC[2])),
           paste0('AUC at 5 years: ', sprintf("%.03f", ROC_rt$AUC[3]))),
         col = c("green", "blue", "red"), lwd = 2, bty = 'n')
}

# Generate ROC curves for the training data
bioROC1(inputFile = "trainRisk.txt", rocFile = "train.ROC.pdf")

# Generate ROC curves for the test data
bioROC2(inputFile = "testRisk.txt", rocFile = "test.ROC.pdf")

15. Risk curve

# Install the 'pheatmap' package
#install.packages("pheatmap")

# Load required libraries
library(pheatmap)


# Define a function for plotting risk curves
bioRiskPlot = function(inputFile = NULL, riskScoreFile = NULL, survStatFile = NULL) {
  rt = read.table(inputFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)  # Read the input file
  rt$riskScore[rt$riskScore > quantile(rt$riskScore, 0.99)] = quantile(rt$riskScore, 0.99)
  rt$risk = factor(rt$risk, levels = c("low", "high"))
  rt = rt[order(rt$riskScore), ]  # Sort samples by risk score
  
  # Calculate lengths of low and high risk groups
  riskClass = rt[, "risk"]
  lowLength = length(riskClass[riskClass == "low"])
  highLength = length(riskClass[riskClass == "high"])
  lowMax = max(rt$riskScore[riskClass == "low"])
  line = rt[, "riskScore"]
  
  # Plot risk curves and save as PDF
  pdf(file = riskScoreFile, width = 6, height = 5)
  plotRiskCurve(line, riskClass, lowLength, highLength, lowMax)
  dev.off()
  
  plotRiskCurve(line, riskClass, lowLength, highLength, lowMax)  # Plot on screen
  
  # Plot survival status and save as PDF
  color = as.vector(rt$fustat)
  color[color == 1] = "red"
  color[color == 0] = "blue"
  
  pdf(file = survStatFile, width = 6, height = 5)
  plotSurvivalStatus(rt$futime, color, lowLength)
  dev.off()
  
  plotSurvivalStatus(rt$futime, color, lowLength)  # Plot on screen
}

plotRiskCurve = function(line, riskClass, lowLength, highLength, lowMax) {
  plot(line, type = "p", pch = 20,
       xlab = "Patients (increasing risk score)", ylab = "Risk score",
       col = c(rep("blue", lowLength), rep("red", highLength)))
  abline(h = lowMax, v = lowLength, lty = 2)
  legend("topleft", c("High risk", "Low Risk"), bty = "n", pch = 19, col = c("red", "blue"), cex = 1.2)
}

plotSurvivalStatus = function(time, color, lowLength) {
  plot(time, pch = 19,
       xlab = "Patients (increasing risk score)", ylab = "Survival time (years)",
       col = color)
  legend("topleft", c("Dead", "Alive"), bty = "n", pch = 19, col = c("red", "blue"), cex = 1.2)
  abline(v = lowLength, lty = 2)
}

# Call the function to plot risk curves for the training and test data
bioRiskPlot(inputFile = "trainRisk.txt",
            riskScoreFile = "train.riskScore.pdf",
            survStatFile = "train.survStat.pdf")

bioRiskPlot(inputFile = "testRisk.txt",
            riskScoreFile = "test.riskScore.pdf",
            survStatFile = "test.survStat.pdf")

16. PCA and tSNE analysis

# Install the required packages
# install.packages("Rtsne")
# install.packages("ggplot2")

# Load the necessary libraries
library(Rtsne)
## Warning: 程辑包'Rtsne'是用R版本4.3.2 来建造的
library(ggplot2)

# Define a function for performing PCA and t-SNE analysis
bioPCA = function(inputFile = NULL, pcaFile = NULL, tsneFile = NULL) {
  # Read the input file and extract data
  rt = read.table(inputFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)
  data = rt[, c(3:(ncol(rt) - 2))]  # Fix the subsetting here
  risk = rt[, "risk"]
  
  # Perform PCA analysis
  data.pca = prcomp(data, scale. = TRUE)
  pcaPredict = predict(data.pca)
  PCA = data.frame(PC1 = pcaPredict[, 1], PC2 = pcaPredict[, 2], risk = risk)
  
  # Plot the PCA graph and save as PDF
  p_pca = ggplot(data = PCA, aes(PC1, PC2)) + geom_point(aes(color = risk)) +
    scale_colour_manual(name = "Risk", values = c("red", "blue")) +
    theme_bw() +
    theme(plot.margin = unit(rep(1.5, 4), 'lines')) +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

  pdf(file = pcaFile, height = 4.5, width = 5.5)
  print(p_pca)
  dev.off()

  # Print the PCA plot to R Markdown
  print(p_pca)
  
  # Perform t-SNE analysis
  tsneOut = Rtsne(data, dims = 2, perplexity = 10, verbose = FALSE, max_iter = 500, check_duplicates = FALSE)
  tsne = data.frame(tSNE1 = tsneOut$Y[, 1], tSNE2 = tsneOut$Y[, 2], risk = risk)
  
  # Plot the t-SNE graph and save as PDF
  p_tsne = ggplot(data = tsne, aes(tSNE1, tSNE2)) + geom_point(aes(color = risk)) +
    scale_colour_manual(name = "Risk", values = c("red", "blue")) +
    theme_bw() +
    theme(plot.margin = unit(rep(1.5, 4), 'lines')) +
    theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

  pdf(file = tsneFile, height = 4.5, width = 5.5)
  print(p_tsne)
  dev.off()

  # Print the t-SNE plot to R Markdown
  print(p_tsne)
}

# Call the function to perform PCA and t-SNE analysis for the training data
bioPCA(inputFile = "trainRisk.txt", pcaFile = "train.PCA.pdf", tsneFile = "train.t-SNE.pdf")

# Call the function to perform PCA and t-SNE analysis for the test data
bioPCA(inputFile = "testRisk.txt", pcaFile = "test.PCA.pdf", tsneFile = "test.t-SNE.pdf")

Conclusion Section