Francesca Lo Bianco
2025-07-23
gtf_path <-“Homo_sapiens.GRCh38.111.gtf.gz” gtf <- read.delim(gzfile(gtf_path), header = FALSE, comment.char = “#”)
unzip(“filtered_feature_bc_matrix.zip”,exdir = “filtered_feature_bc_matrix”)
if (!requireNamespace(“DropletUtils”, quietly = TRUE)) { BiocManager::install(“DropletUtils”) } library(DropletUtils)
matrix_dir <-“filtered_feature_bc_matrix/filtered_feature_bc_matrix” sce <- read10xCounts(matrix_dir)
sce
gtf <-read.table(gzfile(“C:/Users/Utente/Desktop/magistrale/programming/exam/Homo_sapiens.GRCh38.111.gtf.gz”), header = FALSE, sep = “, comment.char =”#“, stringsAsFactors = FALSE, quote =”“)
head(gtf)
library(stringr)
extract_characteristics <- function(attr_string, key) { pattern <- paste0(key, ” “([^\”]+)”“) match <- regmatches(attr_string, regexec(pattern, attr_string)) sapply(match, function(x) if(length(x) > 1) x[2] else NA) }
gtf\(gene_id <- extract_characteristics(gtf\)V9, “gene_id”) gtf\(gene_biotype <- extract_characteristics(gtf\)V9,“gene_biotype”)
protein_coding_genes <- unique(gtf\(gene_id[gtf\)gene_biotype == “protein_coding”])
filtered_matrix <- sce[rownames(sce) %in% protein_coding_genes, ]
ls() class(sce) dim(sce) dim(filtered_matrix)
[1] 36601 10194 [1] 19250 10194
library(SingleCellExperiment) counts_matrix <- counts(sce)
genes_over_3 <-colSums(counts_matrix >= 3)
summary(genes_over_3)
library(ggplot2)
df <- data.frame(GenesOver3UMI = genes_over_3)
ggplot(df, aes(x = ““, y = GenesOver3UMI)) + geom_violin(fill =”blue”, alpha = 0.6) + geom_boxplot(width = 0.1, outlier.shape = NA) + labs( title = “Number of Genes with ≥3 UMIs”, y = “Genes with ≥3 UMIs”, x = “” ) + theme_minimal(12)
gtf\(gene_name <- extract_characteristics(gtf\)V9, “gene_name”)
ribosomal_proteins <-unique(gtf\(gene_id[grepl("^RP[SL]", gtf\)gene_name)])
ribosomal_pseudogenes <- unique(gtf\(gene_id[grepl("^RP[SL]", gtf\)gene_name) & grepl(“pseudogene”,gtf$gene_biotype) ])
mitochondrial_genes <- unique(gtf\(gene_id[grepl("^MT-", gtf\)gene_name)])
all_genes_to_remove <- unique(c(ribosomal_proteins, ribosomal_pseudogenes, mitochondrial_genes)) summary_table <- data.frame( Category =c(“Ribosomal Proteins”, “Ribosomal Pseudogenes”, “Mitochondrial Genes”),Genes_Removed = c(length(ribosomal_proteins),length(ribosomal_pseudogenes), length(mitochondrial_genes)) )
print(summary_table) | | categories | genes removed | |:——|:———————-|:————–| | 1 | Ribosomal Proteins | 1732 | | 2 | Ribosomal Pseudogenes | 1614 | | 3 | Mitochondrial Genes | 37 | ## filtereing them out
valid_genes <- !is.na(rownames(counts_matrix)) filtered_matrix <- counts_matrix[ valid_genes &!(rownames(counts_matrix) %in% all_genes_to_remove),]
dim(sce) dim(filtered_matrix)
[1] 36601 10194 [1] 36480 10194
install.packages(“BiocManager”) BiocManager::install(c(“DropletUtils”,“scater”,“scran”,“SingleCellExperiment”)) a library(DropletUtils) library(scater) library(SingleCellExperiment)
sce <- logNormCounts(sce)
sce <- runPCA(sce)
var_explained <-attr(reducedDim(sce, “PCA”), “percentVar”)
barplot(var_explained[1:20], names.arg = 1:20, xlab = “Principal Component”, ylab = “Percentage of Variance Explained”, main = “Variance Explained by First 20 PCs”, col = “red”)
var_explained [1] 33.7911121 13.5315540 4.1787460 3.2193831 1.4789011 1.1716284 0.9223103 0.7084933 [9] 0.5936406 0.5133497 0.4682541 0.4036942 0.3596480 0.3245573 0.3142087 0.2694902 [17] 0.2592534 0.2471285 0.2406565 0.2343458 0.2238528 0.2169301 0.2149126 0.2039220 [25] 0.1923200 0.1894952 0.1847988 0.1801468 0.1748084 0.1723348 0.1714931 0.1650353 [33] 0.1637611 0.1621283 0.1579898 0.1561988 0.1543730 0.1533566 0.1514067 0.1493944 [41] 0.1476268 0.1467160 0.1456808 0.1448550 0.1436084 0.1419941 0.1411057 0.1407010 [49] 0.1392670 0.1386287
BiocManager::install(c(“scater”, “scran”)) a library(scater) sce <- runUMAP(sce, dimred = “PCA”, n_dimred = 6)
library(ggplot2) umap_coords <- as.data.frame(reducedDim(sce, “UMAP”)) colnames(umap_coords) <- c(“UMAP1”, “UMAP2”)
ggplot(umap_coords, aes(x = UMAP1, y = UMAP2)) + geom_point(size = 0.3, alpha = 1) + labs(title = “UMAP Based on First 6 Principal Components”) + theme_minimal(12)
library(SingleCellExperiment) library(scran) library(scater) library(igraph) snn_graph <-buildSNNGraph(sce, use.dimred = “PCA”, k = 10)
clusters <- igraph::cluster_walktrap(snn_graph)$membership
colLabels(sce) <- factor(clusters)
umap_df <- as.data.frame(reducedDim(sce, “UMAP”)) umap_df$Cluster <- colLabels(sce)
library(ggplot2) ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = Cluster)) + geom_point(size = 0.3, alpha = 1) + labs(title = “UMAP Colored by Clusters (Walktrap, k=10)”) + theme_minimal(10)
BiocManager::install(“org.Hs.eg.db”) a ## selecting all library(org.Hs.eg.db) library(SingleCellExperiment)
ensembl_ids <- rownames(sce)
ensembl_ids <- sub(“\..*”, ““, ensembl_ids)
gene_symbols <- mapIds( org.Hs.eg.db, keys = ensembl_ids, column = “SYMBOL”, keytype = “ENSEMBL”, multiVals = “first” )
rownames(sce) <- gene_symbols
sce <- sce[!is.na(rownames(sce)), ]
library(celldex) library(SingleR) ref <- celldex::HumanPrimaryCellAtlasData() pred <- SingleR(test = sce, ref = ref, labels = ref\(label.main) sce\)predicted_cell_type <- pred$labels
library(ggplot2) umap_df <-as.data.frame(reducedDim(sce, “UMAP”)) umap_df\(CellType <- sce\)predicted_cell_type
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = CellType)) + geom_point(size = 0.3, alpha = 1) + labs(title = “UMAP Colored by Predicted Cell Types”) + theme_minimal(10)
markers_genes <- toupper(trimws(gtf\(gene_symbol)) panglao_genes <- toupper(trimws(panglao\)official.gene.symbol))
matched_genes <- intersect(markers_genes, panglao_genes)
matched_table <- panglao[toupper(panglao$official.gene.symbol) %in% matched_genes, ]
celltype_counts <- as.data.frame(table(matched_table\(cell.type)) celltype_counts <- celltype_counts[order(-celltype_counts\)Freq), ] print(celltype_counts)
library(ggplot2) colnames(celltype_counts) <- c(“CellType”, “Count”)
ggplot(celltype_counts, aes(x = reorder(CellType, -Count), y = Count)) + geom_bar(stat = “identity”, fill = “purple”) + theme_minimal() + labs(title = “Cell Type Frequency (from PanglaoDB markers)”, x = “Cell Type”, y = “Number of Marker Genes”) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
top25 <-head(celltype_counts[order(-celltype_counts$Count), ], 25)
ggplot(top25, aes(x = reorder(CellType, -Count), y = Count)) + geom_bar(stat = “identity”, fill = “yellow”) + theme_minimal() + labs(title = “Top 25 Cell Types by Marker Frequency”, x = “Cell Type”, y = “Number of Marker Genes”) + theme(axis.text.x = element_text(angle = 45, hjust = 1))