gtf_path <-“C:/Users/Utente/Desktop/magistrale/programming/exam/Homo_sapiens.GRCh38.111.gtf.gz” gtf <- read.delim(gzfile(gtf_path), header = FALSE, comment.char = “#”)
“C:/Users/Utente/Desktop/magistrale/programming/exam/filtered_feature_bc_matrix”)
if (!requireNamespace(“DropletUtils”, quietly = TRUE)) { BiocManager::install(“DropletUtils”) } library(DropletUtils)
matrix_dir <-“C:/Users/Utente/Desktop/magistrale/programming/exam/filtered_feature_bc_matrix/filtered_feature_bc_matrix” sce <- read10xCounts(matrix_dir)
sce
gtf <-read.table(gzfile(“C:/Users/Utente/Desktop/magistrale/programming/exam/Homo_sapiens.GRCh38.111.gtf.gz”), header = FALSE, sep = “, comment.char =”#“, stringsAsFactors = FALSE, quote =”“)
head(gtf)
library(stringr)
extract_characteristics <- function(attr_string, key) { pattern <- paste0(key, ” “([^\”]+)”“) match <- regmatches(attr_string, regexec(pattern, attr_string)) sapply(match, function(x) if(length(x) > 1) x[2] else NA) }
gtf\(gene_id <- extract_characteristics(gtf\)V9, “gene_id”) gtf\(gene_biotype <- extract_characteristics(gtf\)V9,“gene_biotype”)
protein_coding_genes <- unique(gtf\(gene_id[gtf\)gene_biotype == “protein_coding”])
filtered_matrix <- sce[rownames(sce) %in% protein_coding_genes, ]
ls() class(sce) dim(sce) dim(filtered_matrix)
library(SingleCellExperiment) counts_matrix <- counts(sce)
genes_over_3 <-colSums(counts_matrix >= 3)
summary(genes_over_3)
library(ggplot2)
df <- data.frame(GenesOver3UMI = genes_over_3)
ggplot(df, aes(x = ““, y = GenesOver3UMI)) + geom_violin(fill =”blue”, alpha = 0.6) + geom_boxplot(width = 0.1, outlier.shape = NA) + labs( title = “Number of Genes with ≥3 UMIs”, y = “Genes with ≥3 UMIs”, x = “” ) + theme_minimal(12)
gtf\(gene_name <- extract_characteristics(gtf\)V9, “gene_name”)
ribosomal_proteins <-unique(gtf\(gene_id[grepl("^RP[SL]", gtf\)gene_name)])
ribosomal_pseudogenes <- unique(gtf\(gene_id[grepl("^RP[SL]", gtf\)gene_name) & grepl(“pseudogene”,gtf$gene_biotype) ])
mitochondrial_genes <- unique(gtf\(gene_id[grepl("^MT-", gtf\)gene_name)])
all_genes_to_remove <- unique(c(ribosomal_proteins, ribosomal_pseudogenes, mitochondrial_genes)) summary_table <- data.frame( Category =c(“Ribosomal Proteins”, “Ribosomal Pseudogenes”, “Mitochondrial Genes”),Genes_Removed = c(length(ribosomal_proteins),length(ribosomal_pseudogenes), length(mitochondrial_genes)) )
print(summary_table)
valid_genes <- !is.na(rownames(counts_matrix)) filtered_matrix <- counts_matrix[ valid_genes &!(rownames(counts_matrix) %in% all_genes_to_remove),]
dim(sce) dim(filtered_matrix)
install.packages(“BiocManager”) BiocManager::install(c(“DropletUtils”,“scater”,“scran”,“SingleCellExperiment”)) a library(DropletUtils) library(scater) library(SingleCellExperiment)
sce <- logNormCounts(sce)
sce <- runPCA(sce)
var_explained <-attr(reducedDim(sce, “PCA”), “percentVar”)
barplot(var_explained[1:20], names.arg = 1:20, xlab = “Principal Component”, ylab = “Percentage of Variance Explained”, main = “Variance Explained by First 20 PCs”, col = “red”)
var_explained
BiocManager::install(c(“scater”, “scran”)) a library(scater) sce <- runUMAP(sce, dimred = “PCA”, n_dimred = 6)
library(ggplot2) umap_coords <- as.data.frame(reducedDim(sce, “UMAP”)) colnames(umap_coords) <- c(“UMAP1”, “UMAP2”)
ggplot(umap_coords, aes(x = UMAP1, y = UMAP2)) + geom_point(size = 0.3, alpha = 1) + labs(title = “UMAP Based on First 6 Principal Components”) + theme_minimal(12)
BiocManager::install(“org.Hs.eg.db”) a ## selecting all library(org.Hs.eg.db) library(SingleCellExperiment)
ensembl_ids <- rownames(sce)
ensembl_ids <- sub(“\..*”, ““, ensembl_ids)
gene_symbols <- mapIds( org.Hs.eg.db, keys = ensembl_ids, column = “SYMBOL”, keytype = “ENSEMBL”, multiVals = “first” )
rownames(sce) <- gene_symbols
sce <- sce[!is.na(rownames(sce)), ]
library(celldex) library(SingleR) ref <- celldex::HumanPrimaryCellAtlasData() pred <- SingleR(test = sce, ref = ref, labels = ref\(label.main) sce\)predicted_cell_type <- pred$labels
library(ggplot2) umap_df <-as.data.frame(reducedDim(sce, “UMAP”)) umap_df\(CellType <- sce\)predicted_cell_type
ggplot(umap_df, aes(x = UMAP1, y = UMAP2, color = CellType)) + geom_point(size = 0.3, alpha = 1) + labs(title = “UMAP Colored by Predicted Cell Types”) + theme_minimal(10)
markers_genes <- toupper(trimws(gtf\(gene_symbol)) panglao_genes <- toupper(trimws(panglao\)official.gene.symbol))
matched_genes <- intersect(markers_genes, panglao_genes)
matched_table <- panglao[toupper(panglao$official.gene.symbol) %in% matched_genes, ]
celltype_counts <- as.data.frame(table(matched_table\(cell.type)) celltype_counts <- celltype_counts[order(-celltype_counts\)Freq), ] print(celltype_counts)
library(ggplot2) colnames(celltype_counts) <- c(“CellType”, “Count”)
ggplot(celltype_counts, aes(x = reorder(CellType, -Count), y = Count)) + geom_bar(stat = “identity”, fill = “yellow”) + theme_minimal() + labs(title = “Cell Type Frequency (from PanglaoDB markers)”, x = “Cell Type”, y = “Number of Marker Genes”) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
top25 <-head(celltype_counts[order(-celltype_counts$Count), ], 25)
ggplot(top25, aes(x = reorder(CellType, -Count), y = Count)) + geom_bar(stat = “identity”, fill = “yellow”) + theme_minimal() + labs(title = “Top 25 Cell Types by Marker Frequency”, x = “Cell Type”, y = “Number of Marker Genes”) + theme(axis.text.x = element_text(angle = 45, hjust = 1))