Load Data

#read data
PRJDB5546 <- read.table("~/Documents/IGDB/lab_works/SMY/PRJDB5546/05.analysis/PRJDB5546.umi_counts.txt",header = T,row.names=1, stringsAsFactors=FALSE)

#use all pc gene
all_pc_gene <- read.table("~/Documents/IGDB/lab_works/SMY/genes/mm10.all_pc.gene_id2gene_name.txt", header=FALSE, row.names=1, stringsAsFactors=FALSE)
colnames(all_pc_gene) <- c("GeneName")
all_pc_gene <- all_pc_gene[!duplicated(all_pc_gene$GeneName),,drop=FALSE]

#preprocess
pos <- rownames(PRJDB5546) %in% rownames(all_pc_gene)
PRJDB5546 <- PRJDB5546[pos,]
rownames(PRJDB5546) <- all_pc_gene[rownames(PRJDB5546),1]
PRJDB5546 <- data.matrix(PRJDB5546)

PRJDB5546 <- PRJDB5546[rowSums(PRJDB5546) > 1, ]
PRJDB5546 <- log2(PRJDB5546+1)

Plot expression

par(cex.axis=0.7,cex.lab=0.7)
boxplot(PRJDB5546,col=rep(2:4,each=3), ylab="log2(Number of Transcripts + 1)", las=2)

Normalization

par(cex.axis=0.7,cex.lab=0.7)
norm_PRJDB5546 = normalize.quantiles(as.matrix(PRJDB5546))
colnames(norm_PRJDB5546) = colnames(PRJDB5546)
boxplot(norm_PRJDB5546,col=rep(2:4,each=3), ylab="log2(Number of Transcripts + 1)", las=2)

Heatmap to evaluate sample distance

dist = dist(t(PRJDB5546))
heatmap(as.matrix(dist), Colv= NA, Rowv=NA, cexRow = 0.8, cexCol = 0.8)

Hierachical Clustering

Hclust = hclust(dist)
dend = as.dendrogram(Hclust)
dend = color_labels(Hclust,3,col=1:3)
plot(dend)

Correlation between samples

#correlation plot
M = cor(PRJDB5546)
corrplot(M, method = "ellipse", order = "hclust")