load packages:

library(ggvenn)
## Loading required package: ggplot2
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
library(stringr)
library(purrr)
library(RColorBrewer)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(EnhancedVolcano)
## Loading required package: ggrepel
library("DESeq2")
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
## 
##     space
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:purrr':
## 
##     reduce
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(AnnotationDbi)
library("org.Hs.eg.db")
## 

read in counts file:

Counts <- as.matrix(read.csv("/Users/ewamble/Desktop/THP1_RNAseq/data/mapped/counts/THP1.featureCounts.csv", row.names = "Geneid", header = TRUE, sep = ","))

edit ENSEMBL ID row names:

rownames(Counts) <- gsub("\\..*","",rownames(Counts))

subset grouped data:

Set1 <- subset(Counts, select = c(M0_HCC38_T1, M0_HCC38_T2, M0_HCC38_T3, M0_HCC70_T1, M0_HCC70_T2, M0_HCC70_T3))
Set2 <- subset(Counts, select = c(M0_HCC38_T1, M0_HCC38_T2, M0_HCC38_T3, M0_MDA_MB231_T1, M0_MDA_MB231_T2, M0_MDA_MB231_T3))
Set3 <- subset(Counts, select = c(M0_HCC70_T1, M0_HCC70_T2, M0_HCC70_T3, M0_MDA_MB468_T1, M0_MDA_MB468_T2, M0_MDA_MB468_T3))
Set4 <- subset(Counts, select = c(M0_MDA_MB231_T1, M0_MDA_MB231_T2, M0_MDA_MB231_T3, M0_MDA_MB468_T1, M0_MDA_MB468_T2, M0_MDA_MB468_T3))

create matrix of data:

Set1 <- as.matrix(Set1)
Set2 <- as.matrix(Set2)
Set3 <- as.matrix(Set3)
Set4 <- as.matrix(Set4)

assign the conditions of the experiment:

cond1 <- factor(c("HCC38", "HCC38", "HCC38", "HCC70", "HCC70", "HCC70"))
cond2 <- factor(c("HCC38", "HCC38", "HCC38", "MDA_MB_231", "MDA_MB_231", "MDA_MB_231"))
cond3 <- factor(c("HCC70", "HCC70", "HCC70", "MDA_MB_468", "MDA_MB_468", "MDA_MB_468"))
cond4 <- factor(c("MDA_MB_231", "MDA_MB_231", "MDA_MB_231", "MDA_MB_468", "MDA_MB_468", "MDA_MB_468"))

make data frame for conditions and samples:

cold1 <- data.frame(row.names = colnames(Set1), cond1)
cold2 <- data.frame(row.names = colnames(Set2), cond2)
cold3 <- data.frame(row.names = colnames(Set3), cond3)
cold4 <- data.frame(row.names = colnames(Set4), cond4)

create DESeq2 matrix:

de1 <- DESeqDataSetFromMatrix(countData = Set1, colData = cold1, design = ~cond1)
## Warning in DESeqDataSet(se, design = design, ignoreRank): 46 duplicate rownames
## were renamed by adding numbers
de2 <- DESeqDataSetFromMatrix(countData = Set2, colData = cold2, design = ~cond2)
## Warning in DESeqDataSet(se, design = design, ignoreRank): 46 duplicate rownames
## were renamed by adding numbers
de3 <- DESeqDataSetFromMatrix(countData = Set3, colData = cold3, design = ~cond3)
## Warning in DESeqDataSet(se, design = design, ignoreRank): 46 duplicate rownames
## were renamed by adding numbers
de4 <- DESeqDataSetFromMatrix(countData = Set4, colData = cold4, design = ~cond4)
## Warning in DESeqDataSet(se, design = design, ignoreRank): 46 duplicate rownames
## were renamed by adding numbers

run DESeq function:

de1 <-DESeq(de1)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
de2 <-DESeq(de2)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
de3 <-DESeq(de3)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
de4 <-DESeq(de4)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

save the normalized read counts:

r1 <- counts(de1, normalized = TRUE)
r2 <- counts(de2, normalized = TRUE)
r3 <- counts(de3, normalized = TRUE)
r4 <- counts(de4, normalized = TRUE)

read in csv files and set parameters:

r1 <- results(de1, contrast = c('cond1', 'HCC38', 'HCC70'), alpha = 0.05)
r2 <- results(de2, contrast = c('cond2', 'HCC38', 'MDA_MB_231'), alpha = 0.05)
r3 <- results(de3, contrast = c('cond3', 'HCC70', 'MDA_MB_468'), alpha = 0.05)
r4 <- results(de4, contrast = c('cond4', 'MDA_MB_231', 'MDA_MB_468'), alpha = 0.05)

filter the results table by p-value and Log2FoldChange:

r1 <- r1[order(r1$pvalue <= 0.01 & r1$log2FoldChange > 2),]
r2 <- r2[order(r2$pvalue <= 0.01 & r2$log2FoldChange > 2),]
r3 <- r3[order(r3$pvalue <= 0.01 & r3$log2FoldChange > 2),]
r4 <- r4[order(r4$pvalue <= 0.01 & r4$log2FoldChange > 2),]

genes without required filtering thresholds:

resultsNames(de1)
## [1] "Intercept"            "cond1_HCC70_vs_HCC38"
resultsNames(de2)
## [1] "Intercept"                 "cond2_MDA_MB_231_vs_HCC38"
resultsNames(de3)
## [1] "Intercept"                 "cond3_MDA_MB_468_vs_HCC70"
resultsNames(de4)
## [1] "Intercept"                      "cond4_MDA_MB_468_vs_MDA_MB_231"

create Principle Component Analysis plot:

rd1 <- rlogTransformation(de1, blind = FALSE)
rd2 <- rlogTransformation(de2, blind = FALSE)
rd3 <- rlogTransformation(de3, blind = FALSE)
rd4 <- rlogTransformation(de4, blind = FALSE)

plot formation:

PCA1 <- plotPCA(rd1, intgroup = "cond1")
## using ntop=500 top features by variance
PCA1 + geom_text(aes(label = name), size = 2.5) + 
  ggtitle("HCC38 vs HCC70 \n (PCA plot)")

PCA2 <- plotPCA(rd2, intgroup = "cond2")
## using ntop=500 top features by variance
PCA2 + geom_text(aes(label = name), size = 2.5) + 
  ggtitle("HCC38 vs MDA_MB_231 \n (PCA plot)")

PCA3 <- plotPCA(rd3, intgroup = "cond3")
## using ntop=500 top features by variance
PCA3 + geom_text(aes(label = name), size = 2.5) + 
  ggtitle("HCC70 vs MDA_MB_468 \n (PCA plot)")

PCA4 <- plotPCA(rd4, intgroup = "cond4")
## using ntop=500 top features by variance
PCA4 + geom_text(aes(label = name), size = 2.5) + 
  ggtitle("MDA_MB_231 vs MDA_MB_468 \n (PCA plot)")

create Volcano plot: convert ensembl IDs to gene symbols:

r1.df <- as.data.frame(r1)
r1.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(r1.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
r2.df <- as.data.frame(r2)
r2.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(r2.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
r3.df <- as.data.frame(r3)
r3.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(r3.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
r4.df <- as.data.frame(r4)
r4.df$symbol <- mapIds(org.Hs.eg.db, keys = row.names(r4.df), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns

volcano plot:

EnhancedVolcano(r1,
                lab = r1.df$symbol,
                x = "log2FoldChange",
                y = "padj",
                title = "HCC38 vs HCC70 \n (Volcano plot)",
                pCutoff = 1e-1,
                FCcutoff = 1)

EnhancedVolcano(r2,
                lab = r2.df$symbol,
                x = "log2FoldChange",
                y = "padj", 
                title = "HCC38 vs MDA-MB-231 \n (Volcano plot)",
                pCutoff = 1e-1,
                FCcutoff = 1)

EnhancedVolcano(r3,
                lab = r3.df$symbol,
                x = "log2FoldChange",
                y = "padj",
                title = "HCC70 vs MDA-MB-468 \n (Volcano plot)",
                pCutoff = 1e-1,
                FCcutoff = 1)

EnhancedVolcano(r4,
                lab = r4.df$symbol,
                x = "log2FoldChange",
                y = "padj",
                title = "MDA-MB-231 vs MDA-MB-468 \n (Volcano plot)",
                pCutoff = 1e-1,
                FCcutoff = 1)

create heatmap of log transformed normalized counts:

#Remove NAs
r1 <- na.omit(as.data.frame(r1))
r2 <- na.omit(as.data.frame(r2))
r3 <- na.omit(as.data.frame(r3))
r4 <- na.omit(as.data.frame(r4))

# Step 1: Apply filter conditions
fd1 <- r1[r1$pvalue <= 0.01 & r1$log2FoldChange > 2, ]
fd2 <- r2[r2$pvalue <= 0.01 & r2$log2FoldChange > 2, ]
fd3 <- r3[r3$pvalue <= 0.01 & r3$log2FoldChange > 2, ]
fd4 <- r4[r4$pvalue <= 0.01 & r4$log2FoldChange > 2, ]

# Step 2: Sort by a specific column, for example, by pvalue (or use another column as needed)
tg1 <- fd1[order(fd1$pvalue), ]
tg2 <- fd2[order(fd2$pvalue), ]
tg3 <- fd3[order(fd3$pvalue), ]
tg4 <- fd4[order(fd4$pvalue), ]

tg1$symbol <- mapIds(org.Hs.eg.db, keys = row.names(tg1), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
tg2$symbol <- mapIds(org.Hs.eg.db, keys = row.names(tg2), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
tg3$symbol <- mapIds(org.Hs.eg.db, keys = row.names(tg3), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
tg4$symbol <- mapIds(org.Hs.eg.db, keys = row.names(tg4), keytype = "ENSEMBL", column = "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
SET1 <- tg1$symbol
SET2 <- tg2$symbol
SET3 <- tg3$symbol
SET4 <- tg4$symbol

SET1 <- as.matrix(na.omit(SET1))
SET2 <- as.matrix(na.omit(SET2))
SET3 <- as.matrix(na.omit(SET3))
SET4 <- as.matrix(na.omit(SET4))

make data frame with gene symbol and ENSEMBL ID: (Version 1)

venn <- list("HCC38 vs HCC70" = SET1, 
                    "HCC38 vs MDA-MB-231" = SET2, 
                    "HCC70 vs MDA-MB-468" = SET3, 
                    "MDA-MB-231 vs MDA-MB-468" = SET4
)
ggvenn(venn)

Extract gene names from different sections:

only_in_set1 <- as.matrix(setdiff(SET1, SET2))
only_in_set2 <- as.matrix(setdiff(SET2, SET3))
only_in_set3 <-as.matrix( setdiff(SET3, SET2))
only_in_set4 <- as.matrix(setdiff(SET4, SET3))
common_genes1 <- as.matrix(intersect(SET2, SET3))
common_genes2 <- as.matrix(intersect(SET1, SET4))