library(limma)
library(edgeR)
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
counts <- read.table("ProstCa_030921_2.txt")
file<-"ProstCa_030921_2.txt"
dim(counts)
## [1] 58779 6
x<-DGEList(counts)
library(limma)
library(Glimma)
library(edgeR)
library(Homo.sapiens)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
##
## plotMA
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: OrganismDbi
## Loading required package: GenomicFeatures
## Loading required package: GenomeInfoDb
## Loading required package: GenomicRanges
## Loading required package: GO.db
##
## Loading required package: org.Hs.eg.db
##
## Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
library(stringr)
setwd("/Users/dongzeyuan/Desktop/TRGN_lab/")
library(Homo.sapiens)
geneid <- rownames(x)
gene <- str_match(geneid, "(\\w*).*")
geneid <- gene[,2]
genes <- select(Homo.sapiens,
keys=geneid,
columns=c("SYMBOL","TXCHROM"),
keytype="ENSEMBL")
## 'select()' returned many:many mapping between keys and columns
dim(genes)
## [1] 60213 3
genes <- genes[!duplicated(genes$ENSEMBL),]
x$genes <- genes
samplenames <- substring(colnames(x), 6, nchar(colnames(x)))
cpm <- cpm(x)
lcpm <- cpm(x, log=TRUE)
L <- mean(x$samples$lib.size) * 1e-6
M <- median(x$samples$lib.size) * 1e-6
c(L, M)
## [1] 43.72306 40.43761
summary(lcpm)
## KRPSUSCSL_0034_02_PRCPRI1TR_R00146S8A2M0000P0000_C1_NEBKAP_A01034.proj.Aligned.out.sorted.md.bam.htSeqCounts
## Min. :-4.4503
## 1st Qu.:-4.4503
## Median :-1.0828
## Mean :-0.4579
## 3rd Qu.: 2.2291
## Max. :15.7358
## KRPSUSCSL_0035_02_PRCPRI1TR_R00146S8A2M0000P0000_C1_NEBKAP_A01036.proj.Aligned.out.sorted.md.bam.htSeqCounts
## Min. :-4.450
## 1st Qu.:-4.450
## Median :-2.849
## Mean :-1.130
## 3rd Qu.: 2.017
## Max. :15.899
## KRPSUSCSL_0036_02_PRCPRI1TR_R00146S8A2M0000P0000_C1_NEBKAP_A01047.proj.Aligned.out.sorted.md.bam.htSeqCounts
## Min. :-4.450
## 1st Qu.:-4.450
## Median :-2.514
## Mean :-0.967
## 3rd Qu.: 2.257
## Max. :16.464
## KRPSUSCSL_0034_02_PRCPRI1TR_R00146S8A2M0000P0000_T1_NEBKAP_A01033.proj.Aligned.out.sorted.md.bam.htSeqCounts
## Min. :-4.4503
## 1st Qu.:-4.4503
## Median :-1.9200
## Mean :-0.8838
## 3rd Qu.: 2.1005
## Max. :16.3088
## KRPSUSCSL_0035_02_PRCPRI1TR_R00146S8A2M0000P0000_T1_NEBKAP_A01035.proj.Aligned.out.sorted.md.bam.htSeqCounts
## Min. :-4.4503
## 1st Qu.:-4.4503
## Median : 0.6721
## Mean : 0.2754
## 3rd Qu.: 3.1734
## Max. :15.5588
## KRPSUSCSL_0036_02_PRCPRI1TR_R00146S8A2M0000P0000_T1_NEBKAP_A01046.proj.Aligned.out.sorted.md.bam.htSeqCounts
## Min. :-4.4503
## 1st Qu.:-4.4503
## Median :-2.3416
## Mean :-0.9174
## 3rd Qu.: 2.0058
## Max. :16.3109
keep.exprs <- filterByExpr(x)
## Warning in filterByExpr.DGEList(x): All samples appear to belong to the same
## group.
x <- x[keep.exprs,, keep.lib.sizes=FALSE]
dim(x)
## [1] 20432 6
lcpm.cutoff <- log2(5/M + 2/L)
library(RColorBrewer)
nsamples <- ncol(x)
col <- brewer.pal(nsamples, "Paired")
par(mfrow=c(1,2))
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", samplenames, text.col=col, bty="n")
lcpm <- cpm(x, log=TRUE)
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. Filtered data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
legend("topright", samplenames, text.col=col, bty="n")

x <- calcNormFactors(x, method = "TMM")
x$samples$norm.factors
## [1] 1.0258504 0.9216353 1.0571712 0.9899505 1.1484898 0.8799761
x2 <- x
x2$samples$norm.factors <- 1
x2$counts[,1] <- ceiling(x2$counts[,1]*0.05)
x2$counts[,2] <- x2$counts[,2]*5
par(mfrow=c(1,2))
lcpm <- cpm(x2, log=TRUE)
boxplot(lcpm, las=2, col=col, main="")
title(main="A. Example: Unnormalised data",ylab="Log-cpm")
x2 <- calcNormFactors(x2)
x2$samples$norm.factors
## [1] 0.06698684 5.66930728 1.32495094 1.22640411 1.47484214 1.09875684
lcpm <- cpm(x2, log=TRUE)
boxplot(lcpm, las=2, col=col, main="")
title(main="B. Example: Normalised data",ylab="Log-cpm")
