library(Seurat) library(magrittr) library(dplyr) library(ggplot2) library(ggpubr) library(DESeq2) library(EnhancedVolcano) library(clusterProfiler) library(org.Hs.eg.db)

setwd(“/Users/katehsia/Desktop”) getwd()

#save workspace saveRDS(pbmc.integrated, file = “pbmc_integrated_annotated.rds”) save.image(file = “pbmc_analysis_workspace.RData”)

#load workspace load(“~/Downloads/PBMC.integrated.RData”)

un-annotated UMAP

DimPlot(pbmc.integrated)

UMAP w/ cluster numbers

clusternum_dimplot <- DimPlot(pbmc.integrated, reduction = “umap”, label = TRUE, label.size = 4) pdf(“clusternum_dimplot.pdf”, width = 10, height = 10) print(clusternum_dimplot) dev.off()

CONT vs. PAH conditions

pbmc.integrated[[]] table(pbmc.integrated$subject, pbmc.integrated$condition)

TO-DO 2/6/25

what are the cell type of each cluster

what clusters are more or less in PAH patients

stores genes for each cluster

all_markers <- list()

2/19/25

pbmc.markers <- FindAllMarkers(pbmc.integrated, only.pos = TRUE) pbmc.markers %>% group_by(cluster) %>% dplyr::filter(avg_log2FC > 1) %>%

clusters <- unique(pbmc.markers$cluster) for (cluster in clusters) { cluster_info <- pbmc.markers %>% filter(cluster == !!cluster, avg_log2FC > 1) %>% arrange(desc(avg_log2FC)) %>% slice_head(n = 5)

cat(“Cluster”, cluster, “top 5 markers:”) print(cluster_info) cat(“”) }

feature plots

cell_types <- list( naivecd4t = c(“IL7R”, “CCR7”), cd14mono = c(“CD14”, “LYZ”), memorycd4t = c(“IL7R”, “S100A4”), b = c(“MS4A1”), cd8t = c(“CD8A”), fcgr3amono = c(“FCGR3A”, “MS4A7”), nk = c(“GNLY”, “NKG7”), dc = c(“FCER1A”, “CST3”), platelet = c(“PPBP”) )

plot_list <- list()

for (cell_type in names(cell_types)) { plot_list[[cell_type]] <- FeaturePlot(pbmc.integrated, features = cell_types[[cell_type]], ncol = 1, max.cutoff = 10) + coord_fixed(ratio = 1) }

print(plot_list)

convert feature plots to PDFs

pdf(“all_feature_plots.pdf”, width = 10, height = 10) for (plot in plot_list) { print(plot) } dev.off()

make heatmap

pbmc.markers %>% group_by(cluster) %>% dplyr::filter(avg_log2FC > 1) %>% slice_head(n = 10) %>% ungroup() -> top10 heatmap_plot <- DoHeatmap(pbmc.integrated, features = top10$gene) + NoLegend() print(heatmap_plot)

convert heatmap plot to PDF

pdf(“heatmap_plot.pdf”, width = 20, height = 15) print(heatmap_plot) dev.off()

reset to original cluster numbers

Idents(pbmc.integrated) <- pbmc.integrated$seurat_clusters

print(levels(Idents(pbmc.integrated)))

print(table(Idents(pbmc.integrated))) print(colnames(pbmc.integrated@meta.data))

annotating for UMAP

current_idents <- levels(Idents(pbmc.integrated)) new.cluster.ids <- c(“Naive CD4 T”, “CD14+ Mono”, “Memory CD4 T”, “CD8 T”, “CD8 T”, “NK”, “Memory CD4 T”, “NK”, “B”, “B”, “Memory CD4 T”, “CD8 T”, “FCGR3A+ Mono”, “Memory CD4 T”, “NK”, “Treg”, “Platelet”, “DC”, “NK”, “Platelet”, “DC”, “CD8 T”, “CD34 progenitor”, “Platelet”, “Naive CD4 T”) names(new.cluster.ids) <- current_idents pbmc.integrated <- RenameIdents(pbmc.integrated, new.cluster.ids)

annotated_umap <- DimPlot(pbmc.integrated, reduction = “umap”, label = TRUE, pt.size = 0.5) pdf(“new_labeled_umap.pdf”, width = 10, height = 8) print(annotated_umap) dev.off()

checking markers for regulatory T cells

FeaturePlot(pbmc.integrated, features = “FOXP3”) tregmarker_plot <- FeaturePlot(pbmc.integrated, features = “FOXP3”, order = TRUE) print(tregmarker_plot)

pdf(“tregmarker.pdf”, width = 10, height = 8) print(tregmarker_plot) dev.off()

BOX PLOTS (PAH vs CONT)

table(pbmc.integrated$subject, pbmc.integrated$condition)

subject_info <- data.frame(subject = c(“S1”, “S10”, “S11”, “S12”, “S2”, “S3”, “S4”, “S5”, “S6”, “S7”, “S8”, “S9”), condition = c(“CONT”, “CONT”, “CONT”, “CONT”, “PAH”, “PAH”, “PAH”, “CONT”, “PAH”, “PAH”, “PAH”, “CONT”) )

cell_percentages <- prop.table(table(Idents(pbmc.integrated), pbmc.integrated$subject), margin = 2) * 100

cell_percentages_df <- as.data.frame.matrix(cell_percentages) cell_percentages_df$cell_type <- rownames(cell_percentages_df) cell_percentages_df <- reshape2::melt(cell_percentages_df, id.vars = “cell_type”, variable.name = “subject”, value.name = “percentage”) cell_percentages_df <- merge(cell_percentages_df, subject_info, by = “subject”)

ggplot(cell_percentages_df, aes(x = condition, y = percentage, fill = condition)) + geom_boxplot() + geom_jitter(width = 0.2, alpha = 0.5) + facet_wrap(~cell_type, scales = “free_y”) + theme_bw() + labs(x = “Condition”, y = “Percentage of Cells”, title = “Cell Type Percentages: CONT vs PAH”) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) ggsave(“cell_type_percentages_boxplot.pdf”, width = 12, height = 10)

STATISTICAL ANALYSIS

Wilcoxon test

wilcoxon_results <- cell_percentages_df %>% group_by(cell_type) %>% summarise(p_value = wilcox.test(percentage ~ condition)$p.value, CONT_median = median(percentage[condition == “CONT”]), PAH_median = median(percentage[condition == “PAH”])) print(wilcoxon_results)

Student’s t-test

ttest_results <- cell_percentages_df %>% group_by(cell_type) %>% summarise( p_value = t.test(percentage ~ condition)$p.value, CONT_mean = mean(percentage[condition == "CONT"]), PAH_mean = mean(percentage[condition == "PAH"]), CONT_sd = sd(percentage[condition == "CONT"]), PAH_sd = sd(percentage[condition == "PAH"]) ) %>% arrange(p_value) print(ttest_results) # # # above as a CSV file # stat_results <- cell_percentages_df %>% # group_by(cell_type) %>% # summarise( # wilcox_p = compare_means(percentage ~ condition, data = ., method = "wilcox.test")$p, # t_test_p = compare_means(percentage ~ condition, data = ., method = “t.test”)$p, # CONT_median = median(percentage[condition == “CONT”]), # PAH_median = median(percentage[condition == “PAH”]), # CONT_mean = mean(percentage[condition == “CONT”]), # PAH_mean = mean(percentage[condition == “PAH”]), # CONT_sd = sd(percentage[condition == “CONT”]), # PAH_sd = sd(percentage[condition == “PAH”]) # ) %>% # arrange(wilcox_p) # write.csv(stat_results, “statistical_test_results.csv”, row.names = FALSE)

differential abundant analysis

treg_cells <- subset(pbmc.integrated, idents = “Treg”) treg_counts <- GetAssayData(treg_cells, assay = “RNA”, slot = “counts”) treg_metadata <- treg_cells@meta.data dds <- DESeqDataSetFromMatrix(countData = counts, colData = treg_cells@meta.data, design = ~ condition) dds <- dds[rowSums(counts(dds)) > 10, ] # filter lowly-expressed genes dds <- DESeq(dds) results_DE <- results(dds, contrast = c(“condition”, “PAH”, “CONT”)) results_DE <- results_DE[order(results_DE$padj), ] write.csv(as.data.frame(results_DE), “Treg_DESeq2_results.csv”)

volcano plot for Treg

treg_vp <- EnhancedVolcano( results_DE, lab = rownames(results_DE), x = ‘log2FoldChange’, y = ‘pvalue’, title = ‘Treg Differential Expression: PAH vs CONT’, xlim = c(-1.75, 1.75), ylim = c(0, 12), pCutoff = 0.05, FCcutoff = 0.56, pointSize = 3.0, labSize = 6 ) print(treg_vp)

pdf(“treg_vp.pdf”, width = 10, height = 8) print(treg_vp) dev.off()

extract genes with padj < 0.05, |log2FC| > 0.2

sig_genes <- subset(as.data.frame(results_DE), padj < 0.05 & abs(log2FoldChange) > 0.2)

separate into up and downregulated genes

up_genes <- rownames(sig_genes[sig_genes$log2FoldChange > 0.2,]) down_genes <- rownames(sig_genes[sig_genes$log2FoldChange < -0.2,])

4/27/25

translating biological IDs

x <- c(“GPX3”, “GLRX”, “LBP”, “CRYAB”, “DEFB1”, “HCLS1”, “SOD2”, “HSPA2”, “ORM1”, “IGFBP1”, “PTHLH”, “GPC3”, “IGFBP3”,“TOB1”, “MITF”, “NDRG1”, “NR1H4”, “FGFR3”, “PVR”, “IL6”, “PTPRM”, “ERBB2”, “NID2”, “LAMB1”, “COMP”, “PLS3”, “MCAM”, “SPP1”, “LAMC1”, “COL4A2”, “COL4A1”, “MYOC”, “ANXA4”, “TFPI2”, “CST6”, “SLPI”, “TIMP2”, “CPM”, “GGT1”, “NNMT”, “MAL”, “EEF1A2”, “HGD”, “TCN2”, “CDA”, “PCCA”, “CRYM”, “PDXK”, “STC1”, “WARS”, “HMOX1”, “FXYD2”, “RBP4”, “SLC6A12”, “KDELR3”, “ITM2B”) eg = bitr(x, fromType=“SYMBOL”, toType=“ENTREZID”, OrgDb=“org.Hs.eg.db”) head(eg)

keytypes(org.Hs.eg.db) ids <- bitr(x, fromType=“SYMBOL”, toType=c(“UNIPROT”, “ENSEMBL”), OrgDb=“org.Hs.eg.db”) head(ids) m <- bitr(“GO:0006805”, fromType=“GO”, toType = “SYMBOL”, OrgDb=org.Hs.eg.db) dim(m) head(m)

data(gcSample) hg <- gcSample[[1]] head(hg) eg2np <- bitr_kegg(hg, fromType=‘kegg’, toType=‘ncbi-proteinid’, organism=‘hsa’) head(eg2np)

bitr_kegg(“Z5100”, fromType=“kegg”, toType=‘ncbi-proteinid’, organism=‘ece’) ## kegg ncbi-proteinid ## 1 Z5100 AAG58814

GO enrichment analysis

library(clusterProfiler) data(geneList, package=“DOSE”) gene <- names(geneList)[abs(geneList) > 2]

Entrez gene ID

head(gene)

ggo <- groupGO(gene = gene, OrgDb = org.Hs.eg.db, ont = “CC”, level = 3, readable = TRUE)

head(ggo)

ego <- enrichGO(gene = gene, universe = names(geneList), OrgDb = org.Hs.eg.db, ont = “CC”, pAdjustMethod = “BH”, pvalueCutoff = 0.01, qvalueCutoff = 0.05, readable = TRUE) head(ego)

gene.df <- bitr(gene, fromType = “ENTREZID”, toType = c(“ENSEMBL”, “SYMBOL”), OrgDb = org.Hs.eg.db)

ego2 <- enrichGO(gene = gene.df$ENSEMBL, OrgDb = org.Hs.eg.db, keyType = ‘ENSEMBL’, ont = “CC”, pAdjustMethod = “BH”, pvalueCutoff = 0.01, qvalueCutoff = 0.05) head(ego2, 3)

ego3 <- gseGO(geneList = geneList, OrgDb = org.Hs.eg.db, ont = “CC”, minGSSize = 100, maxGSSize = 500, pvalueCutoff = 0.05, verbose = FALSE)

ego_plot <- goplot(ego) pdf(“ego_plot.pdf”, width = 10, height = 8) print(ego_plot) dev.off()

enrichment analysis visualization

DOWNREGULATED

entrez_ids <- bitr(down_genes, fromType = “SYMBOL”, toType = “ENTREZID”, OrgDb = org.Hs.eg.db)

edo <- enrichGO(gene = entrez_ids$ENTREZID, OrgDb = org.Hs.eg.db, keyType = “ENTREZID”, ont = “BP”, pAdjustMethod = “BH”)

top 10 most significant biological processes

DOWNea_bp <- barplot(edo, showCategory=10) + ggtitle(“Top GO Biological Processes Enriched in Treg Downregulated Genes”) print(DOWNea_bp)

pdf(“DOWNea_bp.pdf”, width = 10, height = 8) print(DOWNea_bp) dev.off()

UPREGULATED

UPentrez_ids <- bitr(up_genes, fromType = “SYMBOL”, toType = “ENTREZID”, OrgDb = org.Hs.eg.db)

edo <- enrichGO(gene = UPentrez_ids$ENTREZID, OrgDb = org.Hs.eg.db, keyType = “ENTREZID”, ont = “BP”, pAdjustMethod = “BH”, pvalueCutoff = 0.1, # from default 0.05 qvalueCutoff = 0.3 # from default 0.2 )

top 10 most significant biological processes

UPea_bp <- barplot(edo, showCategory=10) + ggtitle(“Top GO Biological Processes Enriched in Treg Upregulated Genes”) print(UPea_bp)

pdf(“UPea_bp.pdf”, width = 10, height = 8) print(UPea_bp) dev.off()

hsia_kate_biotech2025_pahanalysis

Kate Hsia

2025-05-05

un-annotated UMAP

UMAP w/ cluster numbers

CONT vs. PAH conditions

TO-DO 2/6/25

what are the cell type of each cluster

what clusters are more or less in PAH patients

stores genes for each cluster

2/19/25

feature plots

convert feature plots to PDFs

make heatmap

convert heatmap plot to PDF

reset to original cluster numbers

annotating for UMAP

checking markers for regulatory T cells

BOX PLOTS (PAH vs CONT)

STATISTICAL ANALYSIS

Wilcoxon test

Student’s t-test

differential abundant analysis

volcano plot for Treg

extract genes with padj < 0.05, |log2FC| > 0.2

separate into up and downregulated genes

4/27/25

translating biological IDs

GO enrichment analysis

Entrez gene ID

enrichment analysis visualization

DOWNREGULATED

top 10 most significant biological processes

UPREGULATED

top 10 most significant biological processes