In this exercise we use K-Means clustering on randomly generated data
using the kmeans()
function.
## Loading required package: DEXSeq
## Loading required package: BiocParallel
## Loading required package: Biobase
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
##
## rowMedians
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: DESeq2
## Loading required package: AnnotationDbi
## Loading required package: RColorBrewer
pasCts <- system.file ("extdata", "pasilla_gene_counts.tsv",
package ="pasilla", mustWork = TRUE )
cts <- as.matrix ( read.csv(pasCts, sep="\t", row.names="gene_id"))
t_cts <- t(cts)## Package: pasilla
## Title: Data package with per-exon and per-gene read counts of RNA-seq
## samples of Pasilla knock-down by Brooks et al., Genome Research
## 2011.
## Version: 1.30.0
## Author: Wolfgang Huber, Alejandro Reyes
## Maintainer: Alejandro Reyes <alejandro.reyes.ds@gmail.com>
## Description: This package provides per-exon and per-gene read counts
## computed for selected genes from RNA-seq data that were
## presented in the article "Conservation of an RNA regulatory map
## between Drosophila and mammals" by Brooks AN, Yang L, Duff MO,
## Hansen KD, Park JW, Dudoit S, Brenner SE, Graveley BR, Genome
## Res. 2011 Feb;21(2):193-202, Epub 2010 Oct 4, PMID: 20921232.
## The experiment studied the effect of RNAi knockdown of Pasilla,
## the Drosophila melanogaster ortholog of mammalian NOVA1 and
## NOVA2, on the transcriptome. The package vignette describes
## how the data provided here were derived from the RNA-Seq read
## sequence data that are provided by NCBI Gene Expression Omnibus
## under accession numbers GSM461176 to GSM461181.
## biocViews: ExperimentData, Genome, Drosophila_melanogaster_Data,
## RNASeqData
## License: LGPL
## Suggests: rmarkdown, BiocStyle, knitr, roxygen2
## Depends: R (>= 3.5.0), DEXSeq
## VignetteBuilder: knitr
## RoxygenNote: 7.2.3
## Encoding: UTF-8
## git_url: https://git.bioconductor.org/packages/pasilla
## git_branch: RELEASE_3_18
## git_last_commit: edbc4d8
## git_last_commit_date: 2023-10-24
## Date/Publication: 2023-10-26
## NeedsCompilation: no
## Packaged: 2023-10-26 15:29:26 UTC; biocbuild
## Built: R 4.3.2; ; 2024-02-10 21:58:07 UTC; windows
##
## -- File: C:/Users/18327/AppData/Local/R/win-library/4.3/pasilla/Meta/package.rds
## [1] "The Model organism being studied is Drosophila_melanogaster from package info."
## [1] "Number of genes: 14599"
## [1] "Number of samples: 7"
## [1] "Sample names are : untreated1, untreated2, untreated3, untreated4, treated1, treated2, treated3"
## [1] "So there were three treated samples and four untreated samples that were assayed"
##Method 1: Total Count
sample_tc <- matrix(rowSums(t_cts, na.rm = TRUE),nrow = nrow(t_cts))
tc_denom <- colSums(sample_tc)
n_sample_tc <- sample_tc/tc_denom
##Method 2: Median Ratio
col_sums <- colSums(t_cts, na.rm = TRUE)
# Create a vector to store Theta_j values
Theta_j <- numeric(length(col_sums))
l_cts<-log(t_cts)
l_cts[is.infinite(l_cts)] <- NA
# Apply condition to each column sum
for (i in seq_along(col_sums)) {
if (col_sums[i] %in% c(0, 1)) {
Theta_j[i] <- col_sums[i]
} else {
Theta_j[i] <- exp(mean(l_cts[, i], na.rm = TRUE))
}
}
theta_jv <- Theta_j
# Reshape Theta_j into a matrix
Theta_j <- matrix(Theta_j, ncol = ncol(t_cts))
# Apply condition to each element
for (j in 1:14599)
{
for (i in 1:7)
{
if (Theta_j[j] %in% c(0))
{
t_cts[i,j] <- t_cts[i,j] }
else
{
t_cts[i,j] <- t_cts[i,j]/Theta_j[j] }
}
}
sample_med <- matrix(rowMedians(t_cts, na.rm = TRUE),nrow = nrow(t_cts))
## Method 3 using 75th Quantile
#sample_quantile <- apply(t_cts, 2, function(x) quantile(x, 0.75) / quantile(rowSums(t_cts), 0.75))
sample_quantile <-apply(t_cts, 1, function(x) quantile(x, 0.75))
# Combine results into a data frame
library_size_factors <- data.frame(
Total_Count = sample_tc,
Median_Ratio = sample_med,
Quantile = sample_quantile
)
# Generate pairwise plot
#pairs(library_size_factors)
pairs(library_size_factors, main = "Pairs Plot of Library Size Factors") pbmc3=readRDS('C:/STAT646/mouse_liver_cts.RDS')
tpbmc3 <- t(pbmc3)
pbmc4=readRDS('C:/STAT646/mouse_liver_annot.RDS')
library(irlba)## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
# prcomp_irlba
pbmc3.pc2=prcomp_irlba(tpbmc3, n=10)
#install.packages("dendextend")
suppressPackageStartupMessages(library(dendextend))
# Perform hierarchical clustering
hc.complete <- hclust(dist(as.matrix(pbmc3.pc2$x), method = "euclidean"), method = "complete")
# Cut dendrogram into 4 clusters since we know 4 cell types
clusters <- cutree(hc.complete, k = 4)
avg_dend_obj <- as.dendrogram(hc.complete)
avg_col_dend <- color_branches(avg_dend_obj,col = pbmc4)
plot(avg_col_dend, labels = NULL)
legend("topright", legend = levels(pbmc4), col = unique(pbmc4), pch=16, title = "Cell Types")
title('Hierarchical Clustering Dendogram for 10 PCAs', line = 3.4)set.seed(2)
# Perform kmeans clustering
x <- as.matrix(pbmc3.pc2$x)
dd <-as.matrix(as.dist(1 - cor(t(x))))
km.out <- kmeans(dd, 4,nstart = 20)
plot(dd, col = (km.out$cluster + 1), main = "K-Means Clustering Results with K=4", xlab = "", ylab = "", pch = 20, cex = 2)
legend("topleft", legend = unique(km.out$cluster), col = unique(km.out$cluster), pch = 20, cex = 1, title = "Clusters")library(cluster)
silhouette_score <- function(k){
km <- kmeans(x, centers = k, nstart=20)
ss <- silhouette(km$cluster, dist(x))
mean(ss[, 3])
}
k <- 2:10
avg_sil <- sapply(k, silhouette_score)
plot(k, type='b', avg_sil, xlab='Number of clusters', ylab='Average Silhouette Scores', frame=FALSE)
title("Average Silhouette Scores for Different Numbers of Clusters")## [1] "Based on the plot for Silhouette Score, the optimum number of clusters would be 3"
library(aricode)
km.outfinal <- kmeans(dd, 3,nstart = 20)
print(paste("Adjusted Rand Index (ARI) with 3 clusters for kmeans is"))## [1] "Adjusted Rand Index (ARI) with 3 clusters for kmeans is"
## [1] 0.3875469
## [1] "Confusion Matrix is"
## cell_types
## kmclusters Endothelial cells Fibroblasts Hepatocytes Kupffer cells
## 1 0 0 1488 0
## 2 424 412 18 154
## 3 0 0 1306 0