load packages:
library(ggvenn)
## Loading required package: ggplot2
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
library(stringr)
library(purrr)
library(RColorBrewer)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(EnhancedVolcano)
## Loading required package: ggrepel
library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
##
## space
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:purrr':
##
## reduce
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(AnnotationDbi)
library("org.Hs.eg.db")
##
read in counts file:
Counts <- as.matrix(read.csv("/Users/ewamble/Desktop/THP1_RNAseq/data/mapped/counts/THP1.featureCounts.csv", row.names = "Geneid", header = TRUE, sep = ","))
edit ENSEMBL ID row names:
rownames(Counts) <- gsub("\\..*","",rownames(Counts))
subset grouped data:
Set1 <- subset(Counts, select = c(M0_HCC38_T1, M0_HCC38_T2, M0_HCC38_T3, M0_HCC70_T1, M0_HCC70_T2, M0_HCC70_T3))
Set2 <- subset(Counts, select = c(M0_HCC38_T1, M0_HCC38_T2, M0_HCC38_T3, M0_MDA_MB231_T1, M0_MDA_MB231_T2, M0_MDA_MB231_T3))
Set3 <- subset(Counts, select = c(M0_HCC70_T1, M0_HCC70_T2, M0_HCC70_T3, M0_MDA_MB468_T1, M0_MDA_MB468_T2, M0_MDA_MB468_T3))
Set4 <- subset(Counts, select = c(M0_MDA_MB231_T1, M0_MDA_MB231_T2, M0_MDA_MB231_T3, M0_MDA_MB468_T1, M0_MDA_MB468_T2, M0_MDA_MB468_T3))
create matrix of data:
Set1 <- as.matrix(Set1)
Set2 <- as.matrix(Set2)
Set3 <- as.matrix(Set3)
Set4 <- as.matrix(Set4)
assign the conditions of the experiment:
cond1 <- factor(c("HCC38", "HCC38", "HCC38", "HCC70", "HCC70", "HCC70"))
cond2 <- factor(c("HCC38", "HCC38", "HCC38", "MDA_MB_231", "MDA_MB_231", "MDA_MB_231"))
cond3 <- factor(c("HCC70", "HCC70", "HCC70", "MDA_MB_468", "MDA_MB_468", "MDA_MB_468"))
cond4 <- factor(c("MDA_MB_231", "MDA_MB_231", "MDA_MB_231", "MDA_MB_468", "MDA_MB_468", "MDA_MB_468"))
make data frame for conditions and samples:
cold1 <- data.frame(row.names = colnames(Set1), cond1)
cold2 <- data.frame(row.names = colnames(Set2), cond2)
cold3 <- data.frame(row.names = colnames(Set3), cond3)
cold4 <- data.frame(row.names = colnames(Set4), cond4)
create DESeq2 matrix:
de1 <- DESeqDataSetFromMatrix(countData = Set1, colData = cold1, design = ~cond1)
## Warning in DESeqDataSet(se, design = design, ignoreRank): 46 duplicate rownames
## were renamed by adding numbers
de2 <- DESeqDataSetFromMatrix(countData = Set2, colData = cold2, design = ~cond2)
## Warning in DESeqDataSet(se, design = design, ignoreRank): 46 duplicate rownames
## were renamed by adding numbers
de3 <- DESeqDataSetFromMatrix(countData = Set3, colData = cold3, design = ~cond3)
## Warning in DESeqDataSet(se, design = design, ignoreRank): 46 duplicate rownames
## were renamed by adding numbers
de4 <- DESeqDataSetFromMatrix(countData = Set4, colData = cold4, design = ~cond4)
## Warning in DESeqDataSet(se, design = design, ignoreRank): 46 duplicate rownames
## were renamed by adding numbers
run DESeq function:
de1 <-DESeq(de1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
de2 <-DESeq(de2)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
de3 <-DESeq(de3)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
de4 <-DESeq(de4)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
save the normalized read counts:
r1 <- counts(de1, normalized = TRUE)
r2 <- counts(de2, normalized = TRUE)
r3 <- counts(de3, normalized = TRUE)
r4 <- counts(de4, normalized = TRUE)
read in csv files and set parameters:
r1 <- results(de1, contrast = c('cond1', 'HCC38', 'HCC70'), alpha = 0.05)
r2 <- results(de2, contrast = c('cond2', 'HCC38', 'MDA_MB_231'), alpha = 0.05)
r3 <- results(de3, contrast = c('cond3', 'HCC70', 'MDA_MB_468'), alpha = 0.05)
r4 <- results(de4, contrast = c('cond4', 'MDA_MB_231', 'MDA_MB_468'), alpha = 0.05)
filter the results table by p-value and Log2FoldChange:
r1 <- r1[order(r1$pvalue <= 0.01 & r1$log2FoldChange > 2),]
r2 <- r2[order(r2$pvalue <= 0.01 & r2$log2FoldChange > 2),]
r3 <- r3[order(r3$pvalue <= 0.01 & r3$log2FoldChange > 2),]
r4 <- r4[order(r4$pvalue <= 0.01 & r4$log2FoldChange > 2),]
genes without required filtering thresholds:
resultsNames(de1)
## [1] "Intercept" "cond1_HCC70_vs_HCC38"
resultsNames(de2)
## [1] "Intercept" "cond2_MDA_MB_231_vs_HCC38"
resultsNames(de3)
## [1] "Intercept" "cond3_MDA_MB_468_vs_HCC70"
resultsNames(de4)
## [1] "Intercept" "cond4_MDA_MB_468_vs_MDA_MB_231"
create Principle Component Analysis plot:
rd1 <- rlogTransformation(de1, blind = FALSE)
rd2 <- rlogTransformation(de2, blind = FALSE)
rd3 <- rlogTransformation(de3, blind = FALSE)
rd4 <- rlogTransformation(de4, blind = FALSE)
plot formation:
PCA1 <- plotPCA(rd1, intgroup = "cond1")
## using ntop=500 top features by variance
PCA1 + geom_text(aes(label = name), size = 2.5) +
ggtitle("HCC38 vs HCC70 \n (PCA plot)")
PCA2 <- plotPCA(rd2, intgroup = "cond2")
## using ntop=500 top features by variance
PCA2 + geom_text(aes(label = name), size = 2.5) +
ggtitle("HCC38 vs MDA_MB_231 \n (PCA plot)")
PCA3 <- plotPCA(rd3, intgroup = "cond3")
## using ntop=500 top features by variance
PCA3 + geom_text(aes(label = name), size = 2.5) +
ggtitle("HCC70 vs MDA_MB_468 \n (PCA plot)")
PCA4 <- plotPCA(rd4, intgroup = "cond4")
## using ntop=500 top features by variance
PCA4 + geom_text(aes(label = name), size = 2.5) +
ggtitle("MDA_MB_231 vs MDA_MB_468 \n (PCA plot)")
create Volcano plot: convert ensembl IDs to gene symbols:
r1.df <- as.data.frame(r1)
r1.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(r1.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
r2.df <- as.data.frame(r2)
r2.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(r2.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
r3.df <- as.data.frame(r3)
r3.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(r3.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
r4.df <- as.data.frame(r4)
r4.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(r4.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
volcano plot:
EnhancedVolcano(r1,
lab = r1.df$symbol,
x = "log2FoldChange",
y = "padj",
title = "HCC38 vs HCC70 \n (Volcano plot)",
pCutoff = 1e-1,
FCcutoff = 1)
EnhancedVolcano(r2,
lab = r2.df$symbol,
x = "log2FoldChange",
y = "padj",
title = "HCC38 vs MDA-MB-231 \n (Volcano plot)",
pCutoff = 1e-1,
FCcutoff = 1)
EnhancedVolcano(r3,
lab = r3.df$symbol,
x = "log2FoldChange",
y = "padj",
title = "HCC70 vs MDA-MB-468 \n (Volcano plot)",
pCutoff = 1e-1,
FCcutoff = 1)
EnhancedVolcano(r4,
lab = r4.df$symbol,
x = "log2FoldChange",
y = "padj",
title = "MDA-MB-231 vs MDA-MB-468 \n (Volcano plot)",
pCutoff = 1e-1,
FCcutoff = 1)
create heatmap of log transformed normalized counts:
#Remove NAs
r1 <- na.omit(as.data.frame(r1))
r2 <- na.omit(as.data.frame(r2))
r3 <- na.omit(as.data.frame(r3))
r4 <- na.omit(as.data.frame(r4))
# Step 1: Apply filter conditions
fd1 <- r1[r1$pvalue <= 0.01 & r1$log2FoldChange > 2, ]
fd2 <- r2[r2$pvalue <= 0.01 & r2$log2FoldChange > 2, ]
fd3 <- r3[r3$pvalue <= 0.01 & r3$log2FoldChange > 2, ]
fd4 <- r4[r4$pvalue <= 0.01 & r4$log2FoldChange > 2, ]
# Step 2: Sort by a specific column, for example, by pvalue (or use another column as needed)
tg1 <- fd1[order(fd1$pvalue), ]
tg2 <- fd2[order(fd2$pvalue), ]
tg3 <- fd3[order(fd3$pvalue), ]
tg4 <- fd4[order(fd4$pvalue), ]
tg1$symbol <- mapIds(org.Hs.eg.db, keys = row.names(tg1), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
tg2$symbol <- mapIds(org.Hs.eg.db, keys = row.names(tg2), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
tg3$symbol <- mapIds(org.Hs.eg.db, keys = row.names(tg3), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
tg4$symbol <- mapIds(org.Hs.eg.db, keys = row.names(tg4), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
SET1 <- tg1$symbol
SET2 <- tg2$symbol
SET3 <- tg3$symbol
SET4 <- tg4$symbol
SET1 <- as.matrix(na.omit(SET1))
SET2 <- as.matrix(na.omit(SET2))
SET3 <- as.matrix(na.omit(SET3))
SET4 <- as.matrix(na.omit(SET4))
make data frame with gene symbol and ENSEMBL ID: (Version 1)
venn <- list("HCC38 vs HCC70" = SET1,
"HCC38 vs MDA-MB-231" = SET2,
"HCC70 vs MDA-MB-468" = SET3,
"MDA-MB-231 vs MDA-MB-468" = SET4
)
ggvenn(venn)
Extract gene names from different sections:
only_in_set1 <- as.matrix(setdiff(SET1, SET2))
only_in_set2 <- as.matrix(setdiff(SET2, SET3))
only_in_set3 <-as.matrix( setdiff(SET3, SET2))
only_in_set4 <- as.matrix(setdiff(SET4, SET3))
common_genes1 <- as.matrix(intersect(SET2, SET3))
common_genes2 <- as.matrix(intersect(SET1, SET4))