Quarto AI base GPT-4.1

一、生成示例数据

生存示例数据，模拟一个两组对比的临床试验：对照组 vs 实验组。
生成基因表达示例数据，模拟 50 个基因在 20 个样本中的表达量，并带有 Z-score 标准化数据。
模拟具有明显生物学分层特征的高维数据，用于 PCA 分析。
生成疾病分期基因表达示例数据，模拟 GeneA 在 I-IV 期中的表达变化。

# 1. 生成生存分析示例数据 (Kaplan-Meier Curve Data)
# 模拟一个两组对比的临床试验：对照组 vs 实验组
set.seed(123)
n <- 100
survival_data <- data.frame(
  patient_id = paste0("P", 1:n),
  group = factor(rep(c("Control", "Treatment"), each = n/2)),
  # 模拟实验组生存时间更长
  time = c(rexp(n/2, rate = 0.05), rexp(n/2, rate = 0.03)),
  # 1表示死亡事件，0表示截尾 (Censored)
  status = sample(c(0, 1), n, replace = TRUE, prob = c(0.2, 0.8))
)

# 保存为 CSV
write.csv(survival_data, "survival_example_data.csv", row.names = FALSE)
print("生存分析数据已生成: survival_example_data.csv")

[1] "生存分析数据已生成: survival_example_data.csv"

# 2. 生成热图示例数据 (Genomic Heatmap Data)
# 模拟 50 个基因在 20 个样本中的表达量，并带有 Z-score 缩放特征
set.seed(456)
genes <- paste0("Gene_", 1:50)
samples <- paste0("Sample_", 1:20)

# 创建表达矩阵 (基础表达 + 随机噪声)
exp_matrix <- matrix(rnorm(50 * 20), nrow = 50, ncol = 20)
rownames(exp_matrix) <- genes
colnames(exp_matrix) <- samples

# 模拟前 10 个基因在后 10 个样本中高表达 (聚类特征)
exp_matrix[1:10, 11:20] <- exp_matrix[1:10, 11:20] + 2

# 生成列注释 (样本临床特征)
sample_annotation <- data.frame(
  Sample_ID = samples,
  Stage = factor(rep(c("Stage_I", "Stage_IV"), each = 10)),
  Gender = sample(c("Male", "Female"), 20, replace = TRUE)
)

# 保存矩阵和注释
write.csv(exp_matrix, "heatmap_expression_matrix.csv")
write.csv(sample_annotation, "heatmap_sample_metadata.csv", row.names = FALSE)
print("热图数据已生成: heatmap_expression_matrix.csv & heatmap_sample_metadata.csv")

[1] "热图数据已生成: heatmap_expression_matrix.csv & heatmap_sample_metadata.csv"

# 3. 生成 PCA 示例数据 (Dimensionality Reduction Data)
# 模拟具有明显生物学分层特征的高维数据
set.seed(789)
# 模拟 3 种亚型：A, B, C
subtypes <- factor(rep(c("Subtype_A", "Subtype_B", "Subtype_C"), length.out = 60))
# 模拟 100 个特征
n_features <- 100
pca_matrix <- matrix(rnorm(60 * n_features), nrow = 60, ncol = n_features)

# 为不同亚型添加特征位移，以便在 PCA 图上形成聚类
pca_matrix[subtypes == "Subtype_A", 1:20] <- pca_matrix[subtypes == "Subtype_A", 1:20] + 3
pca_matrix[subtypes == "Subtype_B", 21:40] <- pca_matrix[subtypes == "Subtype_B", 21:40] - 3
pca_matrix[subtypes == "Subtype_C", 41:60] <- pca_matrix[subtypes == "Subtype_C", 41:60] + 4

pca_data <- data.frame(pca_matrix)
pca_data$Subtype <- subtypes

write.csv(pca_data, "pca_example_data.csv", row.names = FALSE)
print("PCA 示例数据已生成: pca_example_data.csv")

[1] "PCA 示例数据已生成: pca_example_data.csv"

# 4. 生成 ggpubr 分期表达示例数据 (Gene Expression across Stages)
set.seed(101)
n_samples <- 120
# 模拟四个分期，样本量均等
stages <- factor(rep(c("Stage I", "Stage II", "Stage III", "Stage IV"), each = 30), 
                 levels = c("Stage I", "Stage II", "Stage III", "Stage IV"))

# 模拟 GeneA 的表达量随分期增加而上升的趋势 (临床意义)
gene_a_base <- c(rnorm(30, mean = 2, sd = 0.5),   # Stage I
                 rnorm(30, mean = 2.5, sd = 0.6), # Stage II
                 rnorm(30, mean = 4, sd = 0.8),   # Stage III
                 rnorm(30, mean = 5.5, sd = 1.0)) # Stage IV

ggpubr_data <- data.frame(
  Patient_ID = paste0("S", 1:n_samples),
  Stage = stages,
  GeneA = gene_a_base
)

# 保存为 CSV
write.csv(ggpubr_data, "ggpubr_stage_data.csv", row.names = FALSE)
print("ggpubr 数据已生成: ggpubr_stage_data.csv")

[1] "ggpubr 数据已生成: ggpubr_stage_data.csv"

二、低精度提示词

帮我画个ggpubr_data数据的箱线图, 用蓝色

library(ggpubr)

ggboxplot(ggpubr_data, 
          x = "Stage", 
          y = "GeneA",
          fill = "blue",
          add = "jitter",
          ylab = "GeneA Expression",
          xlab = "Clinical Stage",
          title = "Gene Expression across Cancer Stages")

三、高精度提示词

使用 ggpubr包，绘制 ggpubr_data 数据, GeneA 在 I-IV 期中的表达。添加 Wilcoxon 检验。使用 Nature NPG 色板。图例置于底部

> str(ggpubr_data)
'data.frame':   120 obs. of  3 variables:
 $ Patient_ID: chr  "S1" "S2" "S3" "S4" ...
 $ Stage     : Factor w/ 4 levels "Stage I","Stage II",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ GeneA     : num  1.84 2.28 1.66 2.11 2.16 ...

library(ggpubr)
library(ggsci)

# 分组两两比较
my_comparisons <- list(
  c("Stage I", "Stage II"),
  c("Stage II", "Stage III"),
  c("Stage III", "Stage IV"),
  c("Stage I", "Stage IV")
)

p <- ggboxplot(ggpubr_data, 
          x = "Stage", 
          y = "GeneA",
          color = "Stage",
          palette = "npg",
          add = "jitter",
          ylab = "GeneA Expression",
          xlab = "Clinical Stage",
          legend = "bottom",
          title = "GeneA Expression across Cancer Stages") +
  stat_compare_means(comparisons = my_comparisons, method = "wilcox.test") +
  stat_compare_means(label.y = max(ggpubr_data$GeneA) + 0.5) # 全局p值

print(p)

四、绘制Kaplan-Meier生存曲线

针对survival_data生成数据，绘制Kaplan-Meier生存曲线，比较对照组和实验组的生存差异，并风险表 (Risk Table)，添加Log-rank检验结果，截尾标记：必须清晰辨识

> str(survival_data)
'data.frame':   100 obs. of  4 variables:
 $ patient_id: chr  "P1" "P2" "P3" "P4" ...
 $ group     : Factor w/ 2 levels "Control","Treatment": 1 1 1 1 1 1 1 1 1 1 ...
 $ time      : num  16.869 11.532 26.581 0.632 1.124 ...
 $ status    : num  0 1 1 1 1 1 0 1 0 1 ...

# 加载必要包
library(survival)
library(survminer)

# 拟合生存曲线
fit <- survfit(Surv(time, status) ~ group, data = survival_data)

# 绘制Kaplan-Meier曲线，添加风险表、截尾标记和Log-rank检验
ggsurvplot(
  fit,
  data = survival_data,
  risk.table = TRUE,           # 显示风险表
  risk.table.col = "strata",   # 风险表按分组着色，消除警告
  pval = TRUE,                # 显示Log-rank检验p值
  conf.int = TRUE,            # 显示置信区间
  surv.median.line = "hv",    # 显示中位生存线
  legend.title = "Group",
  legend.labs = c("Control", "Treatment"),
  palette = c("#0072B5", "#E18727"), # Nature NPG色板风格
  xlab = "Time",
  ylab = "Survival Probability",
  risk.table.height = 0.25,
  risk.table.y.text.col = TRUE,
  risk.table.y.text = FALSE,
  censor.shape = "|",          # 截尾标记为竖线
  censor.size = 4,             # 截尾标记大小
  ggtheme = theme_bw()
)

> ggsurvplot(
  fit,
  data = survival_data)
> ggsurvplot(
  fit,
  data = survival_data,
  risk.table = TRUE)
Ignoring unknown labels:
• colour : "Strata"

回复：这个警告是 survminer 包内部的一个已知问题。当使用 risk.table = TRUE 时，包内部会尝试设置一些图层属性导致这个警告。虽然不影响图形显示，但可以通过添加 risk.table.col = “strata” 参数来消除警告并让风险表按分组着色。

五、ComplexHeatmap 热图示例

你是一位精通生物信息学和 R 语言可视化的专家，擅长使用 ComplexHeatmap 包进行多组学数据的深度叙事。

# Data Context
我有两个数据集：
1. 表达矩阵 (exp_matrix)：50 个基因 x 20 个样本，已进行过 Z-score 标准化。
2. 样本注释 (sample_annotation)：包含 Sample_ID, Stage (Stage_I, Stage_IV), Gender。
3. 基因注释：前 25 个基因属于 "Pathway_Alpha"，后 25 个基因属于 "Pathway_Beta"。

# Task
请使用 ComplexHeatmap 绘制一张具有高度叙事性的热图。

# Requirements
1. **热图切分 (Splitting)**：
   - 列切分 (Column Split)：根据样本的 Stage 进行切分。
   - 行切分 (Row Split)：根据基因所属的 Pathway 进行切分。
2. **注释栏 (Annotations)**：
   - 顶部注释：显示 Stage 和 Gender 的颜色条，使用 Nature 风格的色板。
   - 底部注释：为每个样本添加一个模拟的 "Immune Score" 的柱状图 (anno_barplot)。
3. **视觉细节**：
   - 使用环球通用的蓝-白-红 (Blue-White-Red) 渐变色表示表达量。
   - 在热图中展示行列聚类树，并隐藏样本名称 (Show_column_names = FALSE)。
   - 调整图例位置，确保热图主体清晰。
4. **输出**：请提供完整的 R 代码，包含数据模拟过程和 Heatmap() 函数的详细参数。

# 加载必要的包
library(ComplexHeatmap)
library(circlize)
library(RColorBrewer)

# 读取已生成的数据
exp_matrix <- as.matrix(read.csv("heatmap_expression_matrix.csv", row.names = 1))
sample_annotation <- read.csv("heatmap_sample_metadata.csv")

# 对表达矩阵进行 Z-score 标准化（按行）
exp_matrix_scaled <- t(scale(t(exp_matrix)))

# 创建基因注释：前 25 个基因属于 Pathway_Alpha，后 25 个属于 Pathway_Beta
gene_pathway <- data.frame(
  Gene = rownames(exp_matrix_scaled),
  Pathway = c(rep("Pathway_Alpha", 25), rep("Pathway_Beta", 25))
)

# 模拟 Immune Score（每个样本一个分数）
set.seed(2024)
immune_score <- runif(ncol(exp_matrix_scaled), min = 0, max = 100)

# === 1. 定义颜色方案 ===
# 热图主体：蓝-白-红渐变
col_fun <- colorRamp2(c(-2, 0, 2), c("#2166AC", "white", "#B2182B"))

# 顶部注释颜色：Nature NPG 风格
stage_colors <- c("Stage_I" = "#E64B35", "Stage_IV" = "#4DBBD5")
gender_colors <- c("Male" = "#00A087", "Female" = "#F39B7F")

# === 2. 构建顶部注释 (Top Annotation) ===
top_annotation <- HeatmapAnnotation(
  Stage = sample_annotation$Stage,
  Gender = sample_annotation$Gender,
  col = list(
    Stage = stage_colors,
    Gender = gender_colors
  ),
  annotation_name_side = "left",
  annotation_name_gp = gpar(fontsize = 10, fontface = "bold")
)

# === 3. 构建底部注释 (Bottom Annotation) - Immune Score 柱状图 ===
bottom_annotation <- HeatmapAnnotation(
  `Immune Score` = anno_barplot(
    immune_score,
    gp = gpar(fill = "#8491B4", col = NA),
    height = unit(2, "cm"),
    axis_param = list(side = "right", gp = gpar(fontsize = 8))
  ),
  annotation_name_side = "left",
  annotation_name_gp = gpar(fontsize = 10, fontface = "bold")
)

# === 4. 构建左侧注释 (Row Annotation) - Pathway ===
row_annotation <- rowAnnotation(
  Pathway = gene_pathway$Pathway,
  col = list(Pathway = c("Pathway_Alpha" = "#7570B3", "Pathway_Beta" = "#E7298A")),
  annotation_name_gp = gpar(fontsize = 10, fontface = "bold"),
  show_legend = TRUE
)

# === 5. 绘制 ComplexHeatmap ===
ht <- Heatmap(
  exp_matrix_scaled,
  name = "Z-score",
  
  # 颜色设置
  col = col_fun,
  
  # 聚类设置
  cluster_rows = TRUE,
  cluster_columns = TRUE,
  show_row_dend = TRUE,
  show_column_dend = TRUE,
  
  # 切分设置
  row_split = gene_pathway$Pathway,
  column_split = sample_annotation$Stage,
  row_title = "Pathway",
  column_title = "Clinical Stage",
  
  # 标签显示
  show_row_names = TRUE,
  show_column_names = FALSE,  # 隐藏样本名称
  row_names_gp = gpar(fontsize = 8),
  
  # 添加注释
  top_annotation = top_annotation,
  bottom_annotation = bottom_annotation,
  left_annotation = row_annotation,
  
  # 热图边框
  border = TRUE,
  rect_gp = gpar(col = "white", lwd = 0.5),
  
  # 图例设置
  heatmap_legend_param = list(
    title = "Expression\n(Z-score)",
    title_gp = gpar(fontsize = 10, fontface = "bold"),
    labels_gp = gpar(fontsize = 9),
    legend_height = unit(4, "cm"),
    legend_direction = "vertical"
  ),
  
  # 行列标题样式
  row_title_gp = gpar(fontsize = 12, fontface = "bold"),
  column_title_gp = gpar(fontsize = 12, fontface = "bold")
)

# 绘制热图
draw(ht, 
     heatmap_legend_side = "right",
     annotation_legend_side = "right",
     merge_legend = TRUE)

六、PCA 数据可视化

# Role
你是一位资深生物信息学数据分析师，擅长使用 R 语言进行高维数据的降维分析与可视化。

# Data Context
我有一个基因表达数据集 `pca_matrix`（60 个样本 x 100 个特征），以及样本的亚型标注 `subtypes`（Subtype_A, B, C）。

# Task
请执行 PCA 分析，并绘制一张专业的“PCA 方差解释与贡献率图（Scree Plot & Cumulative Variance Plot）”。

# Requirements
1. **PCA 计算**：使用 `prcomp` 函数对数据进行中心化和标准化处理（scale. = TRUE）。
2. **可视化要求**：
   - 绘制一张组合图：柱状图表示每个主成分（PC1, PC2...）的个体方差贡献率，折线图表示累积方差贡献率。
   - 在折线图上标注“拐点（Elbow Point）”，并添加一条水平虚线表示 80% 的累积解释阈值。
3. **审美细节**：
   - 使用 Nature 风格配色（推荐使用 `ggsci` 包中的 `scale_fill_npg`）。
   - 坐标轴需清晰标注“Variance Explained (%)”。
   - 字体和布局需符合学术论文出版标准（如 `theme_bw()` 或 `theme_classic()`）。
4. **进阶要求**：除了方差图，请额外生成一张 PC1 vs PC2 的散点图，并根据 `subtypes` 进行上色，同时添加置信椭圆（stat_ellipse）。

# Output
请提供完整的 R 代码，包含数据预处理、PCA 计算及基于 ggplot2 的绘图过程。

# 加载必要包
library(ggplot2)
library(ggsci)
library(ggrepel)

# 读取数据
pca_data <- read.csv("pca_example_data.csv")
subtypes <- as.factor(pca_data$Subtype)
pca_matrix <- as.matrix(pca_data[, 1:100])

# 1. PCA 计算
pca_res <- prcomp(pca_matrix, center = TRUE, scale. = TRUE)

# 2. 计算方差贡献率
var_explained <- pca_res$sdev^2 / sum(pca_res$sdev^2)
cum_var <- cumsum(var_explained)
pc_df <- data.frame(
  PC = paste0("PC", 1:length(var_explained)),
  Variance = var_explained * 100,
  Cumulative = cum_var * 100,
  idx = 1:length(var_explained)
)

# 3. 找到拐点（Elbow Point）
elbow <- which.max(diff(diff(pc_df$Variance)) < 0) + 1

# 4. 绘制 Scree Plot & Cumulative Variance Plot
library(tidyr)
pc_long <- pivot_longer(pc_df, cols = c("Variance", "Cumulative"), names_to = "Type", values_to = "Value")

ggplot() +
  geom_col(data = subset(pc_long, Type == "Variance"),
           aes(x = idx, y = Value, fill = "Variance"), width = 0.7) +
  geom_line(data = subset(pc_long, Type == "Cumulative"),
            aes(x = idx, y = Value, color = "Cumulative"), size = 1.2) +
  geom_point(data = subset(pc_long, Type == "Cumulative"),
             aes(x = idx, y = Value, color = "Cumulative"), size = 2) +
  geom_vline(xintercept = elbow, linetype = "dashed", color = "#E64B35", size = 1) +
  geom_hline(yintercept = 80, linetype = "dotted", color = "grey40", size = 1) +
  annotate("text", x = elbow + 1, y = pc_df$Cumulative[elbow], label = "Elbow", color = "#E64B35", hjust = 0) +
  scale_x_continuous(breaks = 1:10, labels = paste0("PC", 1:10), expand = c(0.01, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 100)) +
  scale_fill_npg(name = NULL, labels = "Variance Explained") +
  scale_color_npg(name = NULL, labels = "Cumulative Variance") +
  labs(x = "Principal Component", y = "Variance Explained (%)",
       title = "Scree Plot & Cumulative Variance Explained") +
  theme_bw(base_size = 13) +
  theme(legend.position = "top",
        axis.text.x = element_text(angle = 45, hjust = 1))

# 5. PC1 vs PC2 散点图（亚型上色，含置信椭圆）
pca_scores <- as.data.frame(pca_res$x)
pca_scores$Subtype <- subtypes

ggplot(pca_scores, aes(x = PC1, y = PC2, color = Subtype, fill = Subtype)) +
  geom_point(size = 2.5, alpha = 0.8) +
  stat_ellipse(aes(fill = Subtype), geom = "polygon", alpha = 0.2, color = NA) +
  scale_color_npg() +
  scale_fill_npg() +
  labs(x = paste0("PC1 (", round(var_explained[1] * 100, 1), "%)"),
       y = paste0("PC2 (", round(var_explained[2] * 100, 1), "%)"),
       title = "PCA: PC1 vs PC2 by Subtype") +
  theme_bw(base_size = 13) +
  theme(legend.position = "top")