package installation:
library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(ggplot2)
library(ggrepel)
library("pheatmap")
library(EnhancedVolcano)
library(RColorBrewer)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:GenomicRanges':
##
## subtract
library("edgeR")
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
##
## plotMA
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(AnnotationDbi)
library(org.Hs.eg.db)
##
library("biomaRt")
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:IRanges':
##
## space
## The following object is masked from 'package:S4Vectors':
##
## space
## The following object is masked from 'package:stats':
##
## lowess
DESeq2::plotMA
## nonstandardGenericFunction for "plotMA" defined from package "BiocGenerics"
##
## function (object, ...)
## {
## standardGeneric("plotMA")
## }
## <bytecode: 0x116064708>
## <environment: 0x11605dc00>
## Methods may be defined for arguments: object
## Use showMethods(plotMA) for currently available ones.
packageVersion("DESeq2")
## [1] '1.44.0'
read in counts file:
Counts <- as.matrix(read.csv("/Users/ewamble/Desktop/RC_RNAseq/data/mapped/counts/RC43N_Complete2.featureCounts.csv", row.names = "Geneid", header = TRUE, sep = ","))
#View
#Counts
pre-processing for DESeq2:
Counts <- Counts[which(rowSums(Counts) > 50),]
#View
#Counts
edit ENSEMBL ID row names:
rownames(Counts) <- gsub("\\..*","",rownames(Counts))
#View
#Counts
subset grouped data:
Group1 <- subset(Counts, select = c(RC43N1, RC43N2, RC43N3, RC43N4, RC43T1, RC43T2, RC43T4))
Group2 <- subset(Counts, select = c(RC43N1, RC43N2, RC43N3, RC43N4, RC43N.8d.DMEM1, RC43N.8d.DMEM2, RC43N.8d.DMEM3, RC43N.8d.DMEM4))
Group3 <- subset(Counts, select = c(RC43T1, RC43T2, RC43T4, RC43T.8d.DMEM1, RC43T.8d.DMEM2, RC43T.8d.DMEM3, RC43T.8d.DMEM4))
Group4 <- subset(Counts, select = c(RC43N.8d.DMEM1, RC43N.8d.DMEM2, RC43N.8d.DMEM3, RC43N.8d.DMEM4, RC43T.8d.DMEM1, RC43T.8d.DMEM2, RC43T.8d.DMEM3, RC43T.8d.DMEM4))
create matrix of data:
Group1 <- as.matrix(Group1)
Group2 <- as.matrix(Group2)
Group3 <- as.matrix(Group3)
Group4 <- as.matrix(Group4)
assign the conditions of the experiment:
condition1 <- factor(c("Control", "Control", "Control", "Control", "Tumor", "Tumor", "Tumor"))
condition2 <- factor(c("Control", "Control", "Control", "Control", "Treatment", "Treatment", "Treatment", "Treatment"))
condition3 <- factor(c("Tumor", "Tumor", "Tumor", "Treatment", "Treatment", "Treatment", "Treatment"))
condition4 <- factor(c("DMEM_Control", "DMEM_Control", "DMEM_Control", "DMEM_Control", "DMEM_Tumor", "DMEM_Tumor", "DMEM_Tumor", "DMEM_Tumor"))
make data frame for conditions and samples:
coldata1 <- data.frame(row.names = colnames(Group1), condition1)
coldata2 <- data.frame(row.names = colnames(Group2), condition2)
coldata3 <- data.frame(row.names = colnames(Group3), condition3)
coldata4 <- data.frame(row.names = colnames(Group4), condition4)
create DESeq2 matrix:
dds1 <- DESeqDataSetFromMatrix(countData = Group1, colData = coldata1, design = ~condition1)
dds2 <- DESeqDataSetFromMatrix(countData = Group2, colData = coldata2, design = ~condition2)
dds3 <- DESeqDataSetFromMatrix(countData = Group3, colData = coldata3, design = ~condition3)
dds4 <- DESeqDataSetFromMatrix(countData = Group4, colData = coldata4, design = ~condition4)
run DESeq function:
dds1 <-DESeq(dds1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds2 <-DESeq(dds2)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds3 <-DESeq(dds3)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
dds4 <-DESeq(dds4)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
save the normalized read counts:
normalized_Counts1 <- counts(dds1, normalized = TRUE)
normalized_Counts2 <- counts(dds2, normalized = TRUE)
normalized_Counts3 <- counts(dds3, normalized = TRUE)
normalized_Counts4 <- counts(dds4, normalized = TRUE)
write csv files:
write.csv(normalized_Counts1, 'ControlvsTumor_normalized_counts.csv')
write.csv(normalized_Counts2, 'ControlvsTreatment_RC43N_normalized_counts.csv')
write.csv(normalized_Counts3, 'TumorvsTreatment_RC43T_normalized_counts.csv')
write.csv(normalized_Counts4, 'Treatment_RC43NvsTreatment_RC43T_normalized_counts.csv')
read in csv files and set parameters:
res1 <- results(dds1, contrast = c('condition1', 'Control', 'Tumor'), alpha = 0.05)
res2 <- results(dds2, contrast = c('condition2', 'Control', 'Treatment'), alpha = 0.05)
res3 <- results(dds3, contrast = c('condition3', 'Tumor', 'Treatment'), alpha = 0.05)
res4 <- results(dds4, contrast = c('condition4', 'DMEM_Control', 'DMEM_Tumor'), alpha = 0.05)
ordering the results table by the smallest padj-value:
reordered1 <- res1[order(res1$padj),]
reordered2 <- res2[order(res2$padj),]
reordered3 <- res3[order(res3$padj),]
reordered4 <- res4[order(res4$padj),]
view tables:
reordered1
## log2 fold change (MLE): condition1 Control vs Tumor
## Wald test p-value: condition1 Control vs Tumor
## DataFrame with 20867 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000075340 1416.287 6.11097 0.236210 25.8709 1.41644e-147
## ENSG00000129514 530.723 -4.91367 0.189755 -25.8948 7.62744e-148
## ENSG00000106991 805.240 6.35620 0.272518 23.3240 2.53194e-120
## ENSG00000124134 943.516 7.44837 0.326759 22.7947 5.17607e-115
## ENSG00000187134 2682.634 3.35759 0.152513 22.0151 2.06540e-107
## ... ... ... ... ... ...
## ENSG00000215414 441.260068 -4.438049 1.36256 -3.2571325 NA
## ENSG00000210049 1.152670 -1.682264 1.83172 -0.9184043 0.358407
## ENSG00000210107 1.022524 1.191988 1.85778 0.6416201 0.521120
## ENSG00000210117 0.779581 -1.506123 1.71139 -0.8800561 0.378829
## ENSG00000210156 1.885336 -0.101351 1.23815 -0.0818573 0.934760
## padj
## <numeric>
## ENSG00000075340 1.37926e-143
## ENSG00000129514 1.37926e-143
## ENSG00000106991 1.64365e-116
## ENSG00000124134 2.52010e-111
## ENSG00000187134 8.04473e-104
## ... ...
## ENSG00000215414 NA
## ENSG00000210049 NA
## ENSG00000210107 NA
## ENSG00000210117 NA
## ENSG00000210156 NA
reordered2
## log2 fold change (MLE): condition2 Control vs Treatment
## Wald test p-value: condition2 Control vs Treatment
## DataFrame with 20867 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000148346 4634.61 -6.66077 0.208003 -32.0225 5.30555e-225
## ENSG00000156804 5502.23 -3.75204 0.132451 -28.3277 1.57585e-176
## ENSG00000103034 3019.76 -4.15300 0.149247 -27.8264 2.08176e-170
## ENSG00000157601 10018.89 -4.72908 0.173194 -27.3051 3.68857e-164
## ENSG00000130513 2239.73 -5.62205 0.223676 -25.1348 2.07215e-139
## ... ... ... ... ... ...
## ENSG00000215568 0.00000 NA NA NA NA
## ENSG00000236754 8.83752 -1.996252 1.235924 -1.615190 NA
## ENSG00000278948 122.63640 -2.151011 0.652992 -3.294082 NA
## ENSG00000182916 0.00000 NA NA NA NA
## ENSG00000169084 183.22936 -0.786863 1.334253 -0.589741 NA
## padj
## <numeric>
## ENSG00000148346 1.10037e-220
## ENSG00000156804 1.63416e-172
## ENSG00000103034 1.43919e-166
## ENSG00000157601 1.91252e-160
## ENSG00000130513 8.59527e-136
## ... ...
## ENSG00000215568 NA
## ENSG00000236754 NA
## ENSG00000278948 NA
## ENSG00000182916 NA
## ENSG00000169084 NA
reordered3
## log2 fold change (MLE): condition3 Tumor vs Treatment
## Wald test p-value: condition3 Tumor vs Treatment
## DataFrame with 20867 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000177614 6621.8 -5.69583 0.1475827 -38.5942 0
## ENSG00000189143 16791.5 -3.58272 0.0885237 -40.4718 0
## ENSG00000149591 12905.9 -7.27922 0.1712628 -42.5032 0
## ENSG00000013588 41477.6 -3.24702 0.0783965 -41.4179 0
## ENSG00000167767 10195.4 -3.49215 0.0872989 -40.0022 0
## ... ... ... ... ... ...
## ENSG00000196406 0.438199 -1.002902 1.85986 -0.539236 0.589724
## ENSG00000002586 514.759939 -0.548812 1.47386 -0.372364 NA
## ENSG00000215414 464.767549 4.019597 1.35332 2.970178 NA
## ENSG00000210107 0.923801 -1.054345 1.71451 -0.614954 0.538585
## ENSG00000210117 1.034389 0.750476 1.38620 0.541392 0.588237
## padj
## <numeric>
## ENSG00000177614 0
## ENSG00000189143 0
## ENSG00000149591 0
## ENSG00000013588 0
## ENSG00000167767 0
## ... ...
## ENSG00000196406 NA
## ENSG00000002586 NA
## ENSG00000215414 NA
## ENSG00000210107 NA
## ENSG00000210117 NA
reordered4
## log2 fold change (MLE): condition4 DMEM_Control vs DMEM_Tumor
## Wald test p-value: condition4 DMEM_Control vs DMEM_Tumor
## DataFrame with 20867 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000204941 1289.579 -6.36499 0.188140 -33.8312 6.85901e-251
## ENSG00000144476 1705.926 5.34255 0.166100 32.1647 5.50868e-227
## ENSG00000158786 1091.256 -3.82749 0.125756 -30.4357 1.85097e-203
## ENSG00000148677 567.624 -5.03273 0.171472 -29.3502 2.37673e-189
## ENSG00000118849 1695.505 -4.58680 0.157041 -29.2077 1.54945e-187
## ... ... ... ... ... ...
## ENSG00000280441 194.81385 2.849387 1.159058 2.45836 NA
## ENSG00000142188 24.38037 -2.578583 1.112013 -2.31884 NA
## ENSG00000236754 9.29026 2.037665 1.103569 1.84643 NA
## ENSG00000278948 172.64145 0.651496 0.626938 1.03917 NA
## ENSG00000215115 61.59791 -2.672531 0.830076 -3.21962 NA
## padj
## <numeric>
## ENSG00000204941 1.42386e-246
## ENSG00000144476 5.71773e-223
## ENSG00000158786 1.28081e-199
## ENSG00000148677 1.23346e-185
## ENSG00000118849 6.43302e-184
## ... ...
## ENSG00000280441 NA
## ENSG00000142188 NA
## ENSG00000236754 NA
## ENSG00000278948 NA
## ENSG00000215115 NA
generate MA plot (for gene expression changes in terms of log fold change on y-axis and normalized expression counts on x-axis: RC43N vs RC43T (MA plot):
DESeq2::plotMA(res1, cex = 0.7, ylim = c(-10, 10), main = "RC43N vs RC43T \n (MA plot)")
abline (h=c(-1,1), col="red", lwd=3)
DESeq2::plotMA(res2, cex = 0.7, ylim = c(-10, 10), main = "RC43N vs 8 Day DMEM Treatment \n (MA plot)")
abline(h=c(-1, 1), col = "red",lwd = 3)
DESeq2::plotMA(res3, cex = 0.7, ylim = c(-10, 10), main = "RC43T vs 8 Day DMEM Treatment \n (MA plot)")
abline(h=c(-1, 1), col = "red",lwd = 3)
DESeq2::plotMA(res4, cex = 0.7, ylim = c(-10, 10), main = "RC43N DMEM Treatment vs \n RC43T DMEM Treatment \n (MA plot)")
abline(h=c(-1, 1), col = "red",lwd = 3)
generate MA plot and remove noise associated with log2 fold changes from low count genes without required filtering thresholds:
resultsNames(dds1)
## [1] "Intercept" "condition1_Tumor_vs_Control"
resultsNames(dds2)
## [1] "Intercept" "condition2_Treatment_vs_Control"
resultsNames(dds3)
## [1] "Intercept" "condition3_Tumor_vs_Treatment"
resultsNames(dds4)
## [1] "Intercept"
## [2] "condition4_DMEM_Tumor_vs_DMEM_Control"
*apeglm- Heavy-tailed prior distributions for sequence count data: removing the noise and preserving large differences when ranking genes: RC43N vs RC43T (MA plot):
#Shrinkage of effect size
res1LFC <- lfcShrink(dds = dds1, coef = 2, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
DESeq2::plotMA(res1LFC, cex =0.7, ylim= c(-10, 10),main = "RC43N vs RC43T \n (MA plot RN)")
abline (h=c(-1, 1), col = "red", lwd = 2)
lfcShrink(dds2, coef = 2, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## log2 fold change (MAP): condition2 Treatment vs Control
## Wald test p-value: condition2 Treatment vs Control
## DataFrame with 20867 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000227232 384.60089 0.0619329 0.161246 0.69563268 0.787394081
## ENSG00000278267 5.51363 0.1705986 0.608193 0.66070962 0.760946106
## ENSG00000233750 1.40433 -0.1694630 0.746924 0.51559544 0.639598630
## ENSG00000268903 25.70287 1.9872959 0.598649 0.00010207 0.000597346
## ENSG00000269981 14.34092 -0.0266832 0.579342 0.94186608 0.962755175
## ... ... ... ... ... ...
## ENSG00000198695 22424.0082 0.708335 0.250948 2.40393e-03 9.12807e-03
## ENSG00000210194 20.8239 0.435487 0.389085 1.96913e-01 3.15218e-01
## ENSG00000198727 52583.9777 -0.259060 0.333927 3.81061e-01 5.15404e-01
## ENSG00000210195 68.6341 2.330344 0.467775 6.36681e-08 7.49845e-07
## ENSG00000210196 1023.7574 -0.196084 0.220187 3.51147e-01 4.85876e-01
DESeq2::plotMA(res1LFC, cex =0.7, ylim= c(-10, 10),main = "RC43N vs 8 Day DMEM Treatment \n (MA plot RN)")
abline (h=c(-1, 1), col = "red", lwd = 2)
lfcShrink(dds3, coef = 2, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## log2 fold change (MAP): condition3 Tumor vs Treatment
## Wald test p-value: condition3 Tumor vs Treatment
## DataFrame with 20867 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000227232 284.39836 -0.0122665 0.198573 0.948468 0.968680
## ENSG00000278267 3.79801 0.0665160 0.628975 0.831632 0.894633
## ENSG00000233750 6.77519 0.3904684 0.614918 0.353542 0.489004
## ENSG00000268903 34.65685 -0.0278862 0.405333 0.936975 0.962461
## ENSG00000269981 40.94799 -0.0463471 0.371887 0.889962 0.932857
## ... ... ... ... ... ...
## ENSG00000198695 14846.4444 -1.2905942 0.0882189 6.58724e-49 3.67607e-47
## ENSG00000210194 15.8674 -1.0812105 0.5416965 1.27532e-02 3.15650e-02
## ENSG00000198727 28620.0994 0.0364073 0.1005423 7.16044e-01 8.10330e-01
## ENSG00000210195 29.2136 -0.8421250 0.4048899 1.88271e-02 4.42296e-02
## ENSG00000210196 1443.8644 -0.7337048 0.1415139 1.63009e-07 1.02707e-06
DESeq2::plotMA(res1LFC, cex =0.7, ylim= c(-10, 10),main = "RC43T vs 8 Day DMEM Treatment \n (MA plot)")
abline (h=c(-1, 1), col = "red", lwd = 2)
lfcShrink(dds4, coef = 2, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
## log2 fold change (MAP): condition4 DMEM Tumor vs DMEM Control
## Wald test p-value: condition4 DMEM Tumor vs DMEM Control
## DataFrame with 20867 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000227232 422.98050 0.0776363 0.128162 0.53966595 0.67487503
## ENSG00000278267 6.34266 -0.0349633 0.610488 0.92416038 0.95517278
## ENSG00000233750 3.19420 1.3730043 1.385494 0.03080737 0.07831621
## ENSG00000268903 51.24619 0.2817783 0.415962 0.42440822 0.57321342
## ENSG00000269981 41.74411 1.8605831 0.717418 0.00105577 0.00454139
## ... ... ... ... ... ...
## ENSG00000198695 29443.6426 -0.00920489 0.234458 9.62410e-01 9.78915e-01
## ENSG00000210194 26.7622 0.08346588 0.323317 7.78083e-01 8.58157e-01
## ENSG00000198727 45271.0421 -0.18499224 0.332769 5.30157e-01 6.66961e-01
## ENSG00000210195 90.8450 -0.87381046 0.422536 1.57771e-02 4.54694e-02
## ENSG00000210196 1798.9156 1.35823257 0.205744 1.04562e-11 1.70378e-10
DESeq2::plotMA(res1LFC, cex =0.7, ylim= c(-10, 10),main = "RC43N 8 Day DMEM Treatment vs \n RC43T8 Day DMEM Treatment \n (MA plot RN)")
abline (h=c(-1,1), col = "red", lwd = 2)
create a dispersion plot (used to measure the spread or variability of the data:
dis.plot1<- plotDispEsts(dds1, main = "RC43N vs RC43T \n (dispersion plot)")
dis.plot2<- plotDispEsts(dds2, main = "RC43N vs 8 Day DMEM Treatment RC43N \n (dispersion plot)")
dis.plot3<- plotDispEsts(dds3, main = "RC43T vs 8 Day DMEM Treatment RC43T \n (dispersion plot)")
dis.plot4<- plotDispEsts(dds4, main = "8 Day DMEM Treatment RC43N vs 8 Day DMEM Treatment RC43T \n (dispersion plot)")
create Principle Component Analysis plot:
rld1 <- rlogTransformation(dds1, blind = FALSE)
rld2 <- rlogTransformation(dds2, blind = FALSE)
rld3 <- rlogTransformation(dds3, blind = FALSE)
rld4 <- rlogTransformation(dds4, blind = FALSE)
Plot formation:
PCA1 <- plotPCA(rld1, intgroup = "condition1")
## using ntop=500 top features by variance
PCA1 + geom_text(aes(label = name), size = 2.5) +
ggtitle("RC43N vs RC43T \n (PCA plot)")
PCA2 <- plotPCA(rld2, intgroup = "condition2")
## using ntop=500 top features by variance
PCA2 + geom_text(aes(label = name), size = 2.5) +
ggtitle("RC43N vs 8 Day DMEM Treatment RC43N \n (PCA plot)")
PCA3 <- plotPCA(rld3, intgroup = "condition3")
## using ntop=500 top features by variance
PCA3 + geom_text(aes(label = name), size = 2.5) +
ggtitle("RC43T vs 8 Day DMEM Treatment RC43T \n (PCA plot)")
PCA4 <- plotPCA(rld4, intgroup = "condition4")
## using ntop=500 top features by variance
PCA4 + geom_text(aes(label = name), size = 2.5) +
ggtitle("8 Day DMEM Treatment RC43N vs \n 8 Day DMEM Treatment RC43T \n (PCA plot)")
create Volcano plot: convert ensembl IDs to gene symbols:
res1.df <- as.data.frame(res1)
res1.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(res1.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
res2.df <- as.data.frame(res2)
res2.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(res2.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
res3.df <- as.data.frame(res3)
res3.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(res3.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
res4.df <- as.data.frame(res4)
res4.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(res4.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
RC43N vs RC43T (Volcano plot)
EnhancedVolcano(res1,
lab = res1.df$symbol,
x = "log2FoldChange",
y = "padj",
title = "RC43N vs RC43T \n (Volcano plot)",
pCutoff = 1e-1,
FCcutoff = 1)
EnhancedVolcano(res2,
lab = res2.df$symbol,
x = "log2FoldChange",
y = "padj",
title = "RC43N vs 8 Day DMEM Treatment RC43N \n (Volcano plot)",
pCutoff = 1e-1,
FCcutoff = 1)
EnhancedVolcano(res3,
lab = res2.df$symbol,
x = "log2FoldChange",
y = "padj",
title = "RC43T vs 8 Day DMEM Treatment RC43T \n (Volcano plot)",
pCutoff = 1e-1,
FCcutoff = 1)
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...
EnhancedVolcano(res4,
lab = res2.df$symbol,
x = "log2FoldChange",
y = "padj",
title = "8 Day DMEM Treatment RC43N vs 8 Day DMEM Treatment RC43T \n (Volcano plot)",
pCutoff = 1e-1,
FCcutoff = 1)
create heatmap with clustering and visualization based on normalized counts:
sampleDists1 <- dist(t(assay(rld1)))
sampleDists2 <- dist(t(assay(rld2)))
sampleDists3 <- dist(t(assay(rld3)))
sampleDists4 <- dist(t(assay(rld4)))
#Create matrix
sampleDistsMatrix1 <- as.matrix(sampleDists1)
sampleDistsMatrix2 <- as.matrix(sampleDists2)
sampleDistsMatrix3 <- as.matrix(sampleDists3)
sampleDistsMatrix4 <- as.matrix(sampleDists4)
#Creat heatmap
hmat.1 <- pheatmap(sampleDistsMatrix1,
clustering_distance_rows = sampleDists1,
clustering_distance_cols = sampleDists1,
main = "RC43N vs RC43T \n (pheatmap normalized)")
hmat.2 <- pheatmap(sampleDistsMatrix2,
clustering_distance_rows = sampleDists2,
clustering_distance_cols = sampleDists2,
main = "RC43N vs 8 Day DMEM Treatment RC43N \n (pheatmap normalized)")
hmat.3 <- pheatmap(sampleDistsMatrix3,
clustering_distance_rows = sampleDists3,
clustering_distance_cols = sampleDists3,
main = "RC43T vs 8 Day DMEM Treatment RC43T \n (pheatmap normalized)")
hmat.4 <- pheatmap(sampleDistsMatrix4,
clustering_distance_rows = sampleDists4,
clustering_distance_cols = sampleDists4,
main = "8 Day DMEM Treatment RC43N vs \n 8 Day DMEM Treatment RC43T \n (pheatmap normalized)")
create heatmap of log transformed normalized counts:
top_genes1 <- res1[order(res1$padj),][1:20,]
top_genes2 <- res2[order(res2$padj),][1:20,]
top_genes3 <- res3[order(res3$padj),][1:20,]
top_genes4 <- res4[order(res4$padj),][1:20,]
#make data frame and replace ENSEMBL id with SYMBOL
top_genes1 <- as.data.frame(top_genes1)
symbol1 <- mapIds(org.Hs.eg.db, keys = row.names(top_genes1), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
top_genes2 <- as.data.frame(top_genes2)
symbol2 <- mapIds(org.Hs.eg.db, keys = row.names(top_genes2), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
top_genes3 <- as.data.frame(top_genes3)
symbol3 <- mapIds(org.Hs.eg.db, keys = row.names(top_genes3), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
top_genes4 <- as.data.frame(top_genes4)
symbol4 <- mapIds(org.Hs.eg.db, keys = row.names(top_genes4), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
#Extract gene names from top genes
gene_names1 <- rownames(top_genes1)
gene_names2 <- rownames(top_genes2)
gene_names3 <- rownames(top_genes3)
gene_names4 <- rownames(top_genes4)
#Subset the assay data
subset_data1 <- assay(rld1)[gene_names1, ]
subset_data2 <- assay(rld2)[gene_names2, ]
subset_data3 <- assay(rld3)[gene_names3, ]
subset_data4 <- assay(rld4)[gene_names4, ]
use heatmap function:
pheatmap(subset_data1,
main = "RC43N vs RC43T \n (heatmap)",
cluster_rows = TRUE,
show_rownames=TRUE,
cluster_cols = TRUE,
fontsize_row = 8,
labels_row = symbol1,
annotation_col = coldata1)
pheatmap(subset_data2,
main = "RC43N vs 8 Day DMEM Treatment RC43N \n (heatmap)",
cluster_rows = TRUE,
show_rownames=TRUE,
cluster_cols = TRUE,
fontsize_row = 8,
labels_row = symbol2,
annotation_col = coldata2)
pheatmap(subset_data3,
main = "RC43T vs 8 Day DMEM Treatment RC43T \n (heatmap)",
cluster_rows = TRUE,
show_rownames=TRUE,
cluster_cols = TRUE,
fontsize_row = 8,
labels_row = symbol3,
annotation_col = coldata3)
pheatmap(subset_data4,
main = "8 Day DMEM Treatment RC43N vs 8 Day DMEM Treatment RC43T \n (heatmap)",
cluster_rows = TRUE,
show_rownames=TRUE,
cluster_cols = TRUE,
fontsize_row = 8,
labels_row = symbol4,
annotation_col = coldata4)