RNA_Seq_Pipeline

package installation:

library("DESeq2")

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

library(ggplot2)
library(ggrepel)
library("pheatmap")
library(EnhancedVolcano)
library(RColorBrewer)
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:GenomicRanges':
## 
##     subtract

library("edgeR")

## Loading required package: limma

## 
## Attaching package: 'limma'

## The following object is masked from 'package:DESeq2':
## 
##     plotMA

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

library(AnnotationDbi)
library(org.Hs.eg.db)

##

library("biomaRt")
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:IRanges':
## 
##     space

## The following object is masked from 'package:S4Vectors':
## 
##     space

## The following object is masked from 'package:stats':
## 
##     lowess

DESeq2::plotMA

## nonstandardGenericFunction for "plotMA" defined from package "BiocGenerics"
## 
## function (object, ...) 
## {
##     standardGeneric("plotMA")
## }
## <bytecode: 0x116064708>
## <environment: 0x11605dc00>
## Methods may be defined for arguments: object
## Use  showMethods(plotMA)  for currently available ones.

packageVersion("DESeq2")

## [1] '1.44.0'

read in counts file:

Counts <- as.matrix(read.csv("/Users/ewamble/Desktop/RC_RNAseq/data/mapped/counts/RC43N_Complete2.featureCounts.csv", row.names = "Geneid", header = TRUE, sep = ","))
#View
#Counts

pre-processing for DESeq2:

Counts <- Counts[which(rowSums(Counts) > 50),]
#View
#Counts

edit ENSEMBL ID row names:

rownames(Counts) <- gsub("\\..*","",rownames(Counts))
#View
#Counts

subset grouped data:

Group1 <- subset(Counts, select = c(RC43N1, RC43N2, RC43N3, RC43N4, RC43T1, RC43T2, RC43T4))
Group2 <- subset(Counts, select = c(RC43N1, RC43N2, RC43N3, RC43N4, RC43N.8d.DMEM1, RC43N.8d.DMEM2, RC43N.8d.DMEM3, RC43N.8d.DMEM4))
Group3 <- subset(Counts, select = c(RC43T1, RC43T2, RC43T4, RC43T.8d.DMEM1, RC43T.8d.DMEM2, RC43T.8d.DMEM3, RC43T.8d.DMEM4))
Group4 <- subset(Counts, select = c(RC43N.8d.DMEM1, RC43N.8d.DMEM2, RC43N.8d.DMEM3, RC43N.8d.DMEM4, RC43T.8d.DMEM1, RC43T.8d.DMEM2, RC43T.8d.DMEM3, RC43T.8d.DMEM4))

create matrix of data:

Group1 <- as.matrix(Group1)
Group2 <- as.matrix(Group2)
Group3 <- as.matrix(Group3)
Group4 <- as.matrix(Group4)

assign the conditions of the experiment:

condition1 <- factor(c("Control", "Control", "Control", "Control", "Tumor", "Tumor", "Tumor"))
condition2 <- factor(c("Control", "Control", "Control", "Control", "Treatment", "Treatment", "Treatment", "Treatment"))
condition3 <- factor(c("Tumor", "Tumor", "Tumor", "Treatment", "Treatment", "Treatment", "Treatment"))
condition4 <- factor(c("DMEM_Control", "DMEM_Control", "DMEM_Control", "DMEM_Control", "DMEM_Tumor", "DMEM_Tumor", "DMEM_Tumor", "DMEM_Tumor"))

make data frame for conditions and samples:

coldata1 <- data.frame(row.names = colnames(Group1), condition1)
coldata2 <- data.frame(row.names = colnames(Group2), condition2)
coldata3 <- data.frame(row.names = colnames(Group3), condition3)
coldata4 <- data.frame(row.names = colnames(Group4), condition4)

create DESeq2 matrix:

dds1 <- DESeqDataSetFromMatrix(countData = Group1, colData = coldata1, design = ~condition1)
dds2 <- DESeqDataSetFromMatrix(countData = Group2, colData = coldata2, design = ~condition2)
dds3 <- DESeqDataSetFromMatrix(countData = Group3, colData = coldata3, design = ~condition3)
dds4 <- DESeqDataSetFromMatrix(countData = Group4, colData = coldata4, design = ~condition4)

run DESeq function:

dds1 <-DESeq(dds1)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

dds2 <-DESeq(dds2)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

dds3 <-DESeq(dds3)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

dds4 <-DESeq(dds4)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

save the normalized read counts:

normalized_Counts1 <- counts(dds1, normalized = TRUE)
normalized_Counts2 <- counts(dds2, normalized = TRUE)
normalized_Counts3 <- counts(dds3, normalized = TRUE)
normalized_Counts4 <- counts(dds4, normalized = TRUE)

write csv files:

write.csv(normalized_Counts1, 'ControlvsTumor_normalized_counts.csv')
write.csv(normalized_Counts2, 'ControlvsTreatment_RC43N_normalized_counts.csv')
write.csv(normalized_Counts3, 'TumorvsTreatment_RC43T_normalized_counts.csv')
write.csv(normalized_Counts4, 'Treatment_RC43NvsTreatment_RC43T_normalized_counts.csv')

read in csv files and set parameters:

res1 <- results(dds1, contrast = c('condition1', 'Control', 'Tumor'), alpha = 0.05)
res2 <- results(dds2, contrast = c('condition2', 'Control', 'Treatment'), alpha = 0.05)
res3 <- results(dds3, contrast = c('condition3', 'Tumor', 'Treatment'), alpha = 0.05)
res4 <- results(dds4, contrast = c('condition4', 'DMEM_Control', 'DMEM_Tumor'), alpha = 0.05)

ordering the results table by the smallest padj-value:

reordered1 <- res1[order(res1$padj),]
reordered2 <- res2[order(res2$padj),]
reordered3 <- res3[order(res3$padj),]
reordered4 <- res4[order(res4$padj),]

view tables:

reordered1

## log2 fold change (MLE): condition1 Control vs Tumor 
## Wald test p-value: condition1 Control vs Tumor 
## DataFrame with 20867 rows and 6 columns
##                   baseMean log2FoldChange     lfcSE       stat       pvalue
##                  <numeric>      <numeric> <numeric>  <numeric>    <numeric>
## ENSG00000075340   1416.287        6.11097  0.236210    25.8709 1.41644e-147
## ENSG00000129514    530.723       -4.91367  0.189755   -25.8948 7.62744e-148
## ENSG00000106991    805.240        6.35620  0.272518    23.3240 2.53194e-120
## ENSG00000124134    943.516        7.44837  0.326759    22.7947 5.17607e-115
## ENSG00000187134   2682.634        3.35759  0.152513    22.0151 2.06540e-107
## ...                    ...            ...       ...        ...          ...
## ENSG00000215414 441.260068      -4.438049   1.36256 -3.2571325           NA
## ENSG00000210049   1.152670      -1.682264   1.83172 -0.9184043     0.358407
## ENSG00000210107   1.022524       1.191988   1.85778  0.6416201     0.521120
## ENSG00000210117   0.779581      -1.506123   1.71139 -0.8800561     0.378829
## ENSG00000210156   1.885336      -0.101351   1.23815 -0.0818573     0.934760
##                         padj
##                    <numeric>
## ENSG00000075340 1.37926e-143
## ENSG00000129514 1.37926e-143
## ENSG00000106991 1.64365e-116
## ENSG00000124134 2.52010e-111
## ENSG00000187134 8.04473e-104
## ...                      ...
## ENSG00000215414           NA
## ENSG00000210049           NA
## ENSG00000210107           NA
## ENSG00000210117           NA
## ENSG00000210156           NA

reordered2

## log2 fold change (MLE): condition2 Control vs Treatment 
## Wald test p-value: condition2 Control vs Treatment 
## DataFrame with 20867 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat       pvalue
##                 <numeric>      <numeric> <numeric> <numeric>    <numeric>
## ENSG00000148346   4634.61       -6.66077  0.208003  -32.0225 5.30555e-225
## ENSG00000156804   5502.23       -3.75204  0.132451  -28.3277 1.57585e-176
## ENSG00000103034   3019.76       -4.15300  0.149247  -27.8264 2.08176e-170
## ENSG00000157601  10018.89       -4.72908  0.173194  -27.3051 3.68857e-164
## ENSG00000130513   2239.73       -5.62205  0.223676  -25.1348 2.07215e-139
## ...                   ...            ...       ...       ...          ...
## ENSG00000215568   0.00000             NA        NA        NA           NA
## ENSG00000236754   8.83752      -1.996252  1.235924 -1.615190           NA
## ENSG00000278948 122.63640      -2.151011  0.652992 -3.294082           NA
## ENSG00000182916   0.00000             NA        NA        NA           NA
## ENSG00000169084 183.22936      -0.786863  1.334253 -0.589741           NA
##                         padj
##                    <numeric>
## ENSG00000148346 1.10037e-220
## ENSG00000156804 1.63416e-172
## ENSG00000103034 1.43919e-166
## ENSG00000157601 1.91252e-160
## ENSG00000130513 8.59527e-136
## ...                      ...
## ENSG00000215568           NA
## ENSG00000236754           NA
## ENSG00000278948           NA
## ENSG00000182916           NA
## ENSG00000169084           NA

reordered3

## log2 fold change (MLE): condition3 Tumor vs Treatment 
## Wald test p-value: condition3 Tumor vs Treatment 
## DataFrame with 20867 rows and 6 columns
##                   baseMean log2FoldChange     lfcSE      stat    pvalue
##                  <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSG00000177614     6621.8       -5.69583 0.1475827  -38.5942         0
## ENSG00000189143    16791.5       -3.58272 0.0885237  -40.4718         0
## ENSG00000149591    12905.9       -7.27922 0.1712628  -42.5032         0
## ENSG00000013588    41477.6       -3.24702 0.0783965  -41.4179         0
## ENSG00000167767    10195.4       -3.49215 0.0872989  -40.0022         0
## ...                    ...            ...       ...       ...       ...
## ENSG00000196406   0.438199      -1.002902   1.85986 -0.539236  0.589724
## ENSG00000002586 514.759939      -0.548812   1.47386 -0.372364        NA
## ENSG00000215414 464.767549       4.019597   1.35332  2.970178        NA
## ENSG00000210107   0.923801      -1.054345   1.71451 -0.614954  0.538585
## ENSG00000210117   1.034389       0.750476   1.38620  0.541392  0.588237
##                      padj
##                 <numeric>
## ENSG00000177614         0
## ENSG00000189143         0
## ENSG00000149591         0
## ENSG00000013588         0
## ENSG00000167767         0
## ...                   ...
## ENSG00000196406        NA
## ENSG00000002586        NA
## ENSG00000215414        NA
## ENSG00000210107        NA
## ENSG00000210117        NA

reordered4

## log2 fold change (MLE): condition4 DMEM_Control vs DMEM_Tumor 
## Wald test p-value: condition4 DMEM_Control vs DMEM_Tumor 
## DataFrame with 20867 rows and 6 columns
##                  baseMean log2FoldChange     lfcSE      stat       pvalue
##                 <numeric>      <numeric> <numeric> <numeric>    <numeric>
## ENSG00000204941  1289.579       -6.36499  0.188140  -33.8312 6.85901e-251
## ENSG00000144476  1705.926        5.34255  0.166100   32.1647 5.50868e-227
## ENSG00000158786  1091.256       -3.82749  0.125756  -30.4357 1.85097e-203
## ENSG00000148677   567.624       -5.03273  0.171472  -29.3502 2.37673e-189
## ENSG00000118849  1695.505       -4.58680  0.157041  -29.2077 1.54945e-187
## ...                   ...            ...       ...       ...          ...
## ENSG00000280441 194.81385       2.849387  1.159058   2.45836           NA
## ENSG00000142188  24.38037      -2.578583  1.112013  -2.31884           NA
## ENSG00000236754   9.29026       2.037665  1.103569   1.84643           NA
## ENSG00000278948 172.64145       0.651496  0.626938   1.03917           NA
## ENSG00000215115  61.59791      -2.672531  0.830076  -3.21962           NA
##                         padj
##                    <numeric>
## ENSG00000204941 1.42386e-246
## ENSG00000144476 5.71773e-223
## ENSG00000158786 1.28081e-199
## ENSG00000148677 1.23346e-185
## ENSG00000118849 6.43302e-184
## ...                      ...
## ENSG00000280441           NA
## ENSG00000142188           NA
## ENSG00000236754           NA
## ENSG00000278948           NA
## ENSG00000215115           NA

generate MA plot (for gene expression changes in terms of log fold change on y-axis and normalized expression counts on x-axis: RC43N vs RC43T (MA plot):

DESeq2::plotMA(res1, cex = 0.7, ylim = c(-10, 10), main = "RC43N vs RC43T \n (MA plot)")
abline (h=c(-1,1), col="red", lwd=3)

DESeq2::plotMA(res2, cex = 0.7, ylim = c(-10, 10), main = "RC43N vs 8 Day DMEM Treatment \n (MA plot)")
abline(h=c(-1, 1), col = "red",lwd = 3)

DESeq2::plotMA(res3, cex = 0.7, ylim = c(-10, 10), main = "RC43T vs 8 Day DMEM Treatment \n (MA plot)")
abline(h=c(-1, 1), col = "red",lwd = 3)

DESeq2::plotMA(res4, cex = 0.7, ylim = c(-10, 10), main = "RC43N DMEM Treatment vs \n RC43T DMEM Treatment \n (MA plot)")
abline(h=c(-1, 1), col = "red",lwd = 3)

generate MA plot and remove noise associated with log2 fold changes from low count genes without required filtering thresholds:

resultsNames(dds1)

## [1] "Intercept"                   "condition1_Tumor_vs_Control"

resultsNames(dds2)

## [1] "Intercept"                       "condition2_Treatment_vs_Control"

resultsNames(dds3)

## [1] "Intercept"                     "condition3_Tumor_vs_Treatment"

resultsNames(dds4)

## [1] "Intercept"                            
## [2] "condition4_DMEM_Tumor_vs_DMEM_Control"

*apeglm- Heavy-tailed prior distributions for sequence count data: removing the noise and preserving large differences when ranking genes: RC43N vs RC43T (MA plot):

#Shrinkage of effect size
res1LFC <- lfcShrink(dds = dds1, coef = 2, type = "apeglm")

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

DESeq2::plotMA(res1LFC, cex =0.7, ylim= c(-10, 10),main = "RC43N  vs RC43T \n (MA plot RN)")
abline (h=c(-1, 1), col = "red", lwd = 2)

lfcShrink(dds2, coef = 2, type = "apeglm")

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

## log2 fold change (MAP): condition2 Treatment vs Control 
## Wald test p-value: condition2 Treatment vs Control 
## DataFrame with 20867 rows and 5 columns
##                   baseMean log2FoldChange     lfcSE      pvalue        padj
##                  <numeric>      <numeric> <numeric>   <numeric>   <numeric>
## ENSG00000227232  384.60089      0.0619329  0.161246  0.69563268 0.787394081
## ENSG00000278267    5.51363      0.1705986  0.608193  0.66070962 0.760946106
## ENSG00000233750    1.40433     -0.1694630  0.746924  0.51559544 0.639598630
## ENSG00000268903   25.70287      1.9872959  0.598649  0.00010207 0.000597346
## ENSG00000269981   14.34092     -0.0266832  0.579342  0.94186608 0.962755175
## ...                    ...            ...       ...         ...         ...
## ENSG00000198695 22424.0082       0.708335  0.250948 2.40393e-03 9.12807e-03
## ENSG00000210194    20.8239       0.435487  0.389085 1.96913e-01 3.15218e-01
## ENSG00000198727 52583.9777      -0.259060  0.333927 3.81061e-01 5.15404e-01
## ENSG00000210195    68.6341       2.330344  0.467775 6.36681e-08 7.49845e-07
## ENSG00000210196  1023.7574      -0.196084  0.220187 3.51147e-01 4.85876e-01

DESeq2::plotMA(res1LFC, cex =0.7, ylim= c(-10, 10),main = "RC43N  vs 8 Day DMEM Treatment \n (MA plot RN)")
abline (h=c(-1, 1), col = "red", lwd = 2)

lfcShrink(dds3, coef = 2, type = "apeglm")

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

## log2 fold change (MAP): condition3 Tumor vs Treatment 
## Wald test p-value: condition3 Tumor vs Treatment 
## DataFrame with 20867 rows and 5 columns
##                   baseMean log2FoldChange     lfcSE      pvalue        padj
##                  <numeric>      <numeric> <numeric>   <numeric>   <numeric>
## ENSG00000227232  284.39836     -0.0122665  0.198573    0.948468    0.968680
## ENSG00000278267    3.79801      0.0665160  0.628975    0.831632    0.894633
## ENSG00000233750    6.77519      0.3904684  0.614918    0.353542    0.489004
## ENSG00000268903   34.65685     -0.0278862  0.405333    0.936975    0.962461
## ENSG00000269981   40.94799     -0.0463471  0.371887    0.889962    0.932857
## ...                    ...            ...       ...         ...         ...
## ENSG00000198695 14846.4444     -1.2905942 0.0882189 6.58724e-49 3.67607e-47
## ENSG00000210194    15.8674     -1.0812105 0.5416965 1.27532e-02 3.15650e-02
## ENSG00000198727 28620.0994      0.0364073 0.1005423 7.16044e-01 8.10330e-01
## ENSG00000210195    29.2136     -0.8421250 0.4048899 1.88271e-02 4.42296e-02
## ENSG00000210196  1443.8644     -0.7337048 0.1415139 1.63009e-07 1.02707e-06

DESeq2::plotMA(res1LFC, cex =0.7, ylim= c(-10, 10),main = "RC43T  vs 8 Day DMEM Treatment \n (MA plot)")
abline (h=c(-1, 1), col = "red", lwd = 2)

lfcShrink(dds4, coef = 2, type = "apeglm")

## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

## log2 fold change (MAP): condition4 DMEM Tumor vs DMEM Control 
## Wald test p-value: condition4 DMEM Tumor vs DMEM Control 
## DataFrame with 20867 rows and 5 columns
##                   baseMean log2FoldChange     lfcSE      pvalue        padj
##                  <numeric>      <numeric> <numeric>   <numeric>   <numeric>
## ENSG00000227232  422.98050      0.0776363  0.128162  0.53966595  0.67487503
## ENSG00000278267    6.34266     -0.0349633  0.610488  0.92416038  0.95517278
## ENSG00000233750    3.19420      1.3730043  1.385494  0.03080737  0.07831621
## ENSG00000268903   51.24619      0.2817783  0.415962  0.42440822  0.57321342
## ENSG00000269981   41.74411      1.8605831  0.717418  0.00105577  0.00454139
## ...                    ...            ...       ...         ...         ...
## ENSG00000198695 29443.6426    -0.00920489  0.234458 9.62410e-01 9.78915e-01
## ENSG00000210194    26.7622     0.08346588  0.323317 7.78083e-01 8.58157e-01
## ENSG00000198727 45271.0421    -0.18499224  0.332769 5.30157e-01 6.66961e-01
## ENSG00000210195    90.8450    -0.87381046  0.422536 1.57771e-02 4.54694e-02
## ENSG00000210196  1798.9156     1.35823257  0.205744 1.04562e-11 1.70378e-10

DESeq2::plotMA(res1LFC, cex =0.7, ylim= c(-10, 10),main = "RC43N 8 Day DMEM Treatment  vs \n RC43T8 Day DMEM Treatment \n (MA plot RN)")
abline (h=c(-1,1), col = "red", lwd = 2)

create a dispersion plot (used to measure the spread or variability of the data:

dis.plot1<- plotDispEsts(dds1, main = "RC43N vs RC43T \n (dispersion plot)")

dis.plot2<- plotDispEsts(dds2, main = "RC43N vs 8 Day DMEM Treatment RC43N \n (dispersion plot)")

dis.plot3<- plotDispEsts(dds3, main = "RC43T vs 8 Day DMEM Treatment RC43T \n (dispersion plot)")

dis.plot4<- plotDispEsts(dds4, main = "8 Day DMEM Treatment RC43N vs 8 Day DMEM Treatment RC43T \n (dispersion plot)")

create Principle Component Analysis plot:

rld1 <- rlogTransformation(dds1, blind = FALSE)
rld2 <- rlogTransformation(dds2, blind = FALSE)
rld3 <- rlogTransformation(dds3, blind = FALSE)
rld4 <- rlogTransformation(dds4, blind = FALSE)

Plot formation:

PCA1 <- plotPCA(rld1, intgroup = "condition1")

## using ntop=500 top features by variance

PCA1 + geom_text(aes(label = name), size = 2.5) + 
  ggtitle("RC43N vs RC43T \n (PCA plot)")

PCA2 <- plotPCA(rld2, intgroup = "condition2")

## using ntop=500 top features by variance

PCA2 + geom_text(aes(label = name), size = 2.5) + 
  ggtitle("RC43N vs 8 Day DMEM Treatment RC43N \n (PCA plot)")

PCA3 <- plotPCA(rld3, intgroup = "condition3")

## using ntop=500 top features by variance

PCA3 + geom_text(aes(label = name), size = 2.5) + 
  ggtitle("RC43T vs 8 Day DMEM Treatment RC43T \n (PCA plot)")

PCA4 <- plotPCA(rld4, intgroup = "condition4")

## using ntop=500 top features by variance

PCA4 + geom_text(aes(label = name), size = 2.5) + 
  ggtitle("8 Day DMEM Treatment RC43N vs \n 8 Day DMEM Treatment RC43T \n (PCA plot)")

create Volcano plot: convert ensembl IDs to gene symbols:

res1.df <- as.data.frame(res1)
res1.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(res1.df), keytype = "ENSEMBL", column = "SYMBOL")

## 'select()' returned 1:many mapping between keys and columns

res2.df <- as.data.frame(res2)
res2.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(res2.df), keytype = "ENSEMBL", column = "SYMBOL")

## 'select()' returned 1:many mapping between keys and columns

res3.df <- as.data.frame(res3)
res3.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(res3.df), keytype = "ENSEMBL", column = "SYMBOL")

## 'select()' returned 1:many mapping between keys and columns

res4.df <- as.data.frame(res4)
res4.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(res4.df), keytype = "ENSEMBL", column = "SYMBOL")

## 'select()' returned 1:many mapping between keys and columns

RC43N vs RC43T (Volcano plot)

EnhancedVolcano(res1,
                lab = res1.df$symbol,
                x = "log2FoldChange",
                y = "padj",
                title = "RC43N vs RC43T \n (Volcano plot)",
                pCutoff = 1e-1,
                FCcutoff = 1)

EnhancedVolcano(res2,
                lab = res2.df$symbol,
                x = "log2FoldChange",
                y = "padj", 
                title = "RC43N vs 8 Day DMEM Treatment RC43N \n (Volcano plot)",
                pCutoff = 1e-1,
                FCcutoff = 1)

EnhancedVolcano(res3,
                lab = res2.df$symbol,
                x = "log2FoldChange",
                y = "padj",
                title = "RC43T vs 8 Day DMEM Treatment RC43T \n (Volcano plot)",
                pCutoff = 1e-1,
                FCcutoff = 1)

## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...

EnhancedVolcano(res4,
                lab = res2.df$symbol,
                x = "log2FoldChange",
                y = "padj",
                title = "8 Day DMEM Treatment RC43N vs 8 Day DMEM Treatment RC43T \n (Volcano plot)",
                pCutoff = 1e-1,
                FCcutoff = 1)

create heatmap with clustering and visualization based on normalized counts:

sampleDists1 <- dist(t(assay(rld1)))
sampleDists2 <- dist(t(assay(rld2)))
sampleDists3 <- dist(t(assay(rld3)))
sampleDists4 <- dist(t(assay(rld4)))
#Create matrix
sampleDistsMatrix1 <- as.matrix(sampleDists1)
sampleDistsMatrix2 <- as.matrix(sampleDists2)
sampleDistsMatrix3 <- as.matrix(sampleDists3)
sampleDistsMatrix4 <- as.matrix(sampleDists4)
#Creat heatmap
hmat.1 <- pheatmap(sampleDistsMatrix1,
         clustering_distance_rows = sampleDists1,
         clustering_distance_cols = sampleDists1,
         main = "RC43N vs RC43T \n (pheatmap normalized)")

hmat.2 <- pheatmap(sampleDistsMatrix2,
         clustering_distance_rows = sampleDists2,
         clustering_distance_cols = sampleDists2,
         main = "RC43N vs 8 Day DMEM Treatment RC43N \n (pheatmap normalized)")

hmat.3 <- pheatmap(sampleDistsMatrix3,
         clustering_distance_rows = sampleDists3,
         clustering_distance_cols = sampleDists3,
         main = "RC43T vs 8 Day DMEM Treatment RC43T \n (pheatmap normalized)")

hmat.4 <- pheatmap(sampleDistsMatrix4,
         clustering_distance_rows = sampleDists4,
         clustering_distance_cols = sampleDists4,
         main = "8 Day DMEM Treatment RC43N vs \n 8 Day DMEM Treatment RC43T \n (pheatmap normalized)")

create heatmap of log transformed normalized counts:

top_genes1 <- res1[order(res1$padj),][1:20,]
top_genes2 <- res2[order(res2$padj),][1:20,]
top_genes3 <- res3[order(res3$padj),][1:20,]
top_genes4 <- res4[order(res4$padj),][1:20,]
#make data frame and replace ENSEMBL id with SYMBOL
top_genes1 <- as.data.frame(top_genes1)
symbol1 <- mapIds(org.Hs.eg.db, keys = row.names(top_genes1), keytype = "ENSEMBL", column = "SYMBOL")

## 'select()' returned 1:1 mapping between keys and columns

top_genes2 <- as.data.frame(top_genes2)
symbol2 <- mapIds(org.Hs.eg.db, keys = row.names(top_genes2), keytype = "ENSEMBL", column = "SYMBOL")

## 'select()' returned 1:1 mapping between keys and columns

top_genes3 <- as.data.frame(top_genes3)
symbol3 <- mapIds(org.Hs.eg.db, keys = row.names(top_genes3), keytype = "ENSEMBL", column = "SYMBOL")

## 'select()' returned 1:1 mapping between keys and columns

top_genes4 <- as.data.frame(top_genes4)
symbol4 <- mapIds(org.Hs.eg.db, keys = row.names(top_genes4), keytype = "ENSEMBL", column = "SYMBOL")

## 'select()' returned 1:1 mapping between keys and columns

#Extract gene names from top genes
gene_names1 <- rownames(top_genes1)
gene_names2 <- rownames(top_genes2)
gene_names3 <- rownames(top_genes3)
gene_names4 <- rownames(top_genes4)
#Subset the assay data
subset_data1 <- assay(rld1)[gene_names1, ]
subset_data2 <- assay(rld2)[gene_names2, ]
subset_data3 <- assay(rld3)[gene_names3, ]
subset_data4 <- assay(rld4)[gene_names4, ]

use heatmap function:

pheatmap(subset_data1, 
         main = "RC43N vs RC43T \n (heatmap)",
         cluster_rows = TRUE, 
         show_rownames=TRUE, 
         cluster_cols = TRUE, 
         fontsize_row = 8,
         labels_row = symbol1,
         annotation_col = coldata1)

pheatmap(subset_data2, 
         main = "RC43N vs 8 Day DMEM Treatment RC43N \n (heatmap)",
         cluster_rows = TRUE, 
         show_rownames=TRUE, 
         cluster_cols = TRUE, 
         fontsize_row = 8,
         labels_row = symbol2,
         annotation_col = coldata2)

pheatmap(subset_data3, 
         main = "RC43T vs 8 Day DMEM Treatment RC43T \n (heatmap)",
         cluster_rows = TRUE, 
         show_rownames=TRUE, 
         cluster_cols = TRUE, 
         fontsize_row = 8,
         labels_row = symbol3,
         annotation_col = coldata3)

pheatmap(subset_data4, 
         main = "8 Day DMEM Treatment RC43N vs 8 Day DMEM Treatment RC43T \n (heatmap)",
         cluster_rows = TRUE, 
         show_rownames=TRUE, 
         cluster_cols = TRUE, 
         fontsize_row = 8,
         labels_row = symbol4,
         annotation_col = coldata4)

RNA_Seq_Pipeline

Ezekiel Wmable

2024-08-19