1. We will first use the pasilla data from Bioconductor. You can laod the gene-level read count matrix using the script below.
In this exercise we use K-Means clustering on randomly generated data
using the kmeans()
function.
## Loading required package: DEXSeq
## Loading required package: BiocParallel
## Loading required package: Biobase
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: DESeq2
## Loading required package: AnnotationDbi
## Loading required package: RColorBrewer
pasCts <- system.file ("extdata", "pasilla_gene_counts.tsv",
package ="pasilla", mustWork = TRUE )
cts <- as.matrix ( read.csv(pasCts, sep="\t", row.names="gene_id"))
t_cts <- t(cts)## Package: pasilla
## Title: Data package with per-exon and per-gene read counts of RNA-seq
## samples of Pasilla knock-down by Brooks et al., Genome Research
## 2011.
## Version: 1.30.0
## Author: Wolfgang Huber, Alejandro Reyes
## Maintainer: Alejandro Reyes <alejandro.reyes.ds@gmail.com>
## Description: This package provides per-exon and per-gene read counts
## computed for selected genes from RNA-seq data that were
## presented in the article "Conservation of an RNA regulatory map
## between Drosophila and mammals" by Brooks AN, Yang L, Duff MO,
## Hansen KD, Park JW, Dudoit S, Brenner SE, Graveley BR, Genome
## Res. 2011 Feb;21(2):193-202, Epub 2010 Oct 4, PMID: 20921232.
## The experiment studied the effect of RNAi knockdown of Pasilla,
## the Drosophila melanogaster ortholog of mammalian NOVA1 and
## NOVA2, on the transcriptome. The package vignette describes
## how the data provided here were derived from the RNA-Seq read
## sequence data that are provided by NCBI Gene Expression Omnibus
## under accession numbers GSM461176 to GSM461181.
## biocViews: ExperimentData, Genome, Drosophila_melanogaster_Data,
## RNASeqData
## License: LGPL
## Suggests: rmarkdown, BiocStyle, knitr, roxygen2
## Depends: R (>= 3.5.0), DEXSeq
## VignetteBuilder: knitr
## RoxygenNote: 7.2.3
## Encoding: UTF-8
## git_url: https://git.bioconductor.org/packages/pasilla
## git_branch: RELEASE_3_18
## git_last_commit: edbc4d8
## git_last_commit_date: 2023-10-24
## Date/Publication: 2023-10-26
## NeedsCompilation: no
## Packaged: 2023-10-26 15:29:26 UTC; biocbuild
## Built: R 4.3.2; ; 2024-02-10 21:58:07 UTC; windows
##
## -- File: C:/Users/18327/AppData/Local/R/win-library/4.3/pasilla/Meta/package.rds
## [1] "The Model organism being studied is Drosophila_melanogaster"
## [1] "Number of genes: 14599"
## [1] "Number of samples: 7"
## [1] "Sample names are : untreated1, untreated2, untreated3, untreated4, treated1, treated2, treated3"
## [1] "So there were three treated samples and four untreated samples that were assayed"
##Method 1: Total Count
sample_tc <- matrix(rowSums(t_cts, na.rm = TRUE),nrow = nrow(t_cts))
tc_denom <- colSums(sample_tc)
n_sample_tc <- sample_tc/tc_denom
##Method 2: Median Ratio
col_sums <- colSums(t_cts, na.rm = TRUE)
# Create a vector to store Theta_j values
Theta_j <- numeric(length(col_sums))
l_cts<-log(t_cts)
l_cts[is.infinite(l_cts)] <- NA
# Apply condition to each column sum
for (i in seq_along(col_sums)) {
if (col_sums[i] %in% c(0, 1)) {
Theta_j[i] <- col_sums[i]
} else {
Theta_j[i] <- exp(mean(l_cts[, i], na.rm = TRUE))
}
}
theta_jv <- Theta_j
# Reshape Theta_j into a matrix
Theta_j <- matrix(Theta_j, ncol = ncol(t_cts))
# Apply condition to each element
for (j in 1:14599)
{
for (i in 1:7)
{
if (Theta_j[j] %in% c(0))
{
t_cts[i,j] <- t_cts[i,j] }
else
{
t_cts[i,j] <- t_cts[i,j]/Theta_j[j] }
}
}
sample_med <- matrix(rowMedians(t_cts, na.rm = TRUE),nrow = nrow(t_cts))
## Method 3 using 75th Quantile
#sample_quantile <- apply(t_cts, 2, function(x) quantile(x, 0.75) / quantile(rowSums(t_cts), 0.75))
sample_quantile <-apply(t_cts, 1, function(x) quantile(x, 0.75))
# Combine results into a data frame
library_size_factors <- data.frame(
Total_Count = sample_tc,
Median_Ratio = sample_med,
Quantile = sample_quantile
)
# Generate pairwise plot
pairs(library_size_factors)
2. We will reuse the toy single-cell sequencing dataset of mouse liver
from the Liver Cell Atlas (Guilliams et al., Cell 2022). The data has
been subsetted, cleaned, and normalized. You will have the opportunity
to learn these processes later on! For now, you can access the processed
data from the “Data” tab under “Modules” in Canvas and use the readRDS
function to read them. (a) You have previously carried out principal
component analysis on this data. Now perform hierarchical clustering on
the top 10 principal components, using complete linkage and Euclidean
distance. Visualize your result. Bonus point: show colors on the
dendrogram that correspond to the true underlying cell types.
pbmc3=readRDS('C:/STAT646/mouse_liver_cts.RDS')
tpbmc3 <- t(pbmc3)
pbmc4=readRDS('C:/STAT646/mouse_liver_annot.RDS')
library(irlba)## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
# prcomp_irlba
pbmc3.pc2=prcomp_irlba(tpbmc3, n=10)
#install.packages("dendextend")
suppressPackageStartupMessages(library(dendextend))
# Perform hierarchical clustering
hc.complete <- hclust(dist(as.matrix(pbmc3.pc2$x), method = "euclidean"), method = "complete")
# Cut dendrogram into 4 clusters since we know 4 cell types
clusters <- cutree(hc.complete, k = 4)
avg_dend_obj <- as.dendrogram(hc.complete)
avg_col_dend <- color_branches(avg_dend_obj,col = pbmc4)
plot(avg_col_dend, labels = NULL)
legend("topright", legend = levels(pbmc4), col = unique(pbmc4), pch=16, title = "Cell Types")
title('Hierarchical Clustering Dendogram for 10 PCAs', line = 3.4)set.seed(2)
# Perform kmeans clustering
x <- as.matrix(pbmc3.pc2$x)
dd <-as.matrix(as.dist(1 - cor(t(x))))
km.out <- kmeans(dd, 4,nstart = 20)
plot(dd, col = (km.out$cluster + 1), main = "K-Means Clustering Results with K=4", xlab = "", ylab = "", pch = 20, cex = 2)library(cluster)
silhouette_score <- function(k){
km <- kmeans(x, centers = k, nstart=20)
ss <- silhouette(km$cluster, dist(x))
mean(ss[, 3])
}
k <- 2:10
avg_sil <- sapply(k, silhouette_score)
plot(k, type='b', avg_sil, xlab='Number of clusters', ylab='Average Silhouette Scores', frame=FALSE)## [1] 0.2843647
## pbmc4
## Endothelial cells Fibroblasts Hepatocytes Kupffer cells
## 1 0 0 885 0
## 2 0 0 512 0
## 3 0 0 1407 0
## 4 424 412 8 154