DESeq

Loading packages

#if installation is need, uncomment the following
#if (!require("DESeq2")) install.packages("DESeq2"); library(DESeq2)
#if (!require("apeglm")) install.packages("apeglm"); library(apeglm)
#if (!require("ggplot2")) install.packages("ggplot2"); library(ggplot2)
#if (!require("pheatmap")) install.packages("pheatmap"); library(pheatmap)

library(DESeq2)

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

library(apeglm)
library(ggplot2)
library(pheatmap)

Loading in the files

#load count matrix and reference 
#tximport summerization of counts = 'intALL.csv'
#'ref.txt' is the reference file needed by deseq

#data <- read.csv("intAll.csv", header=T, row.names = 1)
#info <- read.table("ref.txt", header = T, sep ="\t")

data <- read.csv("roundedMergedCountFile.csv", header=T, row.names = 1)
info <- read.table("refFile.txt", header = T, sep ="\t")
head(data)

##                           chvi_count cabr_count stlu_count arja_count
## sp|A0A024RBG1|NUD4B_HUMAN       3674        858          0          0
## sp|A0A061I403|FICD_CRIGR           0          0          0          0
## sp|A0A075B6H9|LV469_HUMAN          0          0          0          0
## sp|A0A075B6I0|LV861_HUMAN          0          0          0          2
## sp|A0A075B6I1|LV460_HUMAN          0          0          0          0
## sp|A0A075B6I3|LVK55_HUMAN          1          0          0          0
##                           loth_count ange_count glso_count nole_count
## sp|A0A024RBG1|NUD4B_HUMAN       4949       1230          0       1859
## sp|A0A061I403|FICD_CRIGR           0          0          0          0
## sp|A0A075B6H9|LV469_HUMAN          0          0          0          0
## sp|A0A075B6I0|LV861_HUMAN          0          2          2          0
## sp|A0A075B6I1|LV460_HUMAN          0          0          0          0
## sp|A0A075B6I3|LVK55_HUMAN          0          1          0          0
##                           mobl_count mawa_count ptpa_count dero_count
## sp|A0A024RBG1|NUD4B_HUMAN        854        128         28         18
## sp|A0A061I403|FICD_CRIGR           0          0          0          8
## sp|A0A075B6H9|LV469_HUMAN          0          0         26          0
## sp|A0A075B6I0|LV861_HUMAN          1          8       1631          2
## sp|A0A075B6I1|LV460_HUMAN          0          0        235          0
## sp|A0A075B6I3|LVK55_HUMAN          0          0         14          0

Running DESeq

de <- DESeqDataSetFromMatrix(data, info, ~diet)

## converting counts to integer mode

## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

Editing DESeq output

#removes low expressed genes
keep <- rowSums(counts(de)) >= 100
de <- de[keep,]
deSeqData <- DESeq(de)

## estimating size factors

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## final dispersion estimates

##   Note: levels of factors in the design contain characters other than
##   letters, numbers, '_' and '.'. It is recommended (but not required) to use
##   only letters, numbers, and delimiters '_' or '.', as these are safe characters
##   for column names in R. [This is a message, not a warning or an error]

## fitting model and testing

###un-comment to export normalized read count
normCounts <- counts(deSeqData, normalized = T)
write.csv(normCounts, "normal.allCounts.txm.csv")

#p value less than .05 is diff. exp.   
result <- results(deSeqData, alpha = 0.05)

# order based on p adjusted value 
resOrdered <- result[order(result$padj),]

###un-comment to export ordered file 
##write.csv(resOrdered, "deSeq.order.csv")

Editing data for plots

#files generated above 
normCount <- read.csv("normal.allCounts.txm.csv", row.names = 1)
deSeqRes <- read.csv("deSeq.order.csv", row.names = 1)

#if padj is <= .05, the value is significant 
deSeqRes$Significant <- ifelse(deSeqRes$padj <= 0.05, "Yes", "No")
#taking out any na values 
deSeqRes <- na.omit(deSeqRes)
head(deSeqRes)

##                          baseMean log2FoldChange     lfcSE       stat
## sp|P62629|EF1A1_CRIGR 382512.2689      -9.713983 0.6999627 -13.877859
## sp|Q9JK95|PERP_MOUSE   34527.8387     -11.476806 0.8907129 -12.884967
## sp|P15104|GLNA_HUMAN   30268.1036     -12.563098 1.1393125 -11.026911
## sp|Q6EWQ7|IF5A1_BOVIN  13133.7191      -6.217528 0.5929447 -10.485848
## sp|P31939|PUR9_HUMAN     588.3423      -5.272017 0.5369890  -9.817737
## sp|Q76B49|CD63_FELCA    1583.5959      -9.136213 0.9370022  -9.750471
##                             pvalue         padj Significant
## sp|P62629|EF1A1_CRIGR 8.628312e-44 1.409003e-39         Yes
## sp|Q9JK95|PERP_MOUSE  5.469912e-38 4.466184e-34         Yes
## sp|P15104|GLNA_HUMAN  2.834340e-28 1.542826e-24         Yes
## sp|Q6EWQ7|IF5A1_BOVIN 1.003418e-25 4.096456e-22         Yes
## sp|P31939|PUR9_HUMAN  9.443974e-23 3.084402e-19         Yes
## sp|Q76B49|CD63_FELCA  1.836148e-22 4.997383e-19         Yes

##Creating Volcano plot

#baseMean is the normalized count values, dividing by size factor, taken over all samples.. use log bc it needs to be put on a log scale 
#log2foldchage is the effective size estimate
#ggplot(deSeqRes, aes( x = log10(baseMean), y = log2FoldChange, color = Significant)) + geom_point()

#volcano plot..

ggplot(deSeqRes, aes(x = log2FoldChange, y = -log10(padj), color = Significant)) + geom_point() #+ scale_color_brewer(palette = "YlGn")

## Creating heatmap

#editing data for heat map 
signi <- subset(deSeqRes, padj <= 0.05)
allSig <- merge(normCount, signi, by = 0)
sigCounts <- allSig[,2:13]
row.names(sigCounts) <- allSig$Row.names

#creating heatmap
#log2 looks at exponents instead of raw numbers, is used to  normalize the data .. if there is a 0 it wont work so you have to do +1
#scale compares expression within a col/row... finds median read count of a row/col.. doesn't look at raw numbers
pheatmap(log2(sigCounts + 1), scale = "row", show_rownames = F, treeheight_row = 0, treeheight_col = 50) #, color = hcl.colors(50, "BluYl"))

Looking at the outliers from Volcano plot

#adding in column to show -log10(padj) values that are seen on the volcano plot above
deSeqRes$neglog10 <- (-log10(deSeqRes$padj))
##neglog10 values above 10 are observed to be outliers according to the volcano plot 
head(deSeqRes)

##                          baseMean log2FoldChange     lfcSE       stat
## sp|P62629|EF1A1_CRIGR 382512.2689      -9.713983 0.6999627 -13.877859
## sp|Q9JK95|PERP_MOUSE   34527.8387     -11.476806 0.8907129 -12.884967
## sp|P15104|GLNA_HUMAN   30268.1036     -12.563098 1.1393125 -11.026911
## sp|Q6EWQ7|IF5A1_BOVIN  13133.7191      -6.217528 0.5929447 -10.485848
## sp|P31939|PUR9_HUMAN     588.3423      -5.272017 0.5369890  -9.817737
## sp|Q76B49|CD63_FELCA    1583.5959      -9.136213 0.9370022  -9.750471
##                             pvalue         padj Significant neglog10
## sp|P62629|EF1A1_CRIGR 8.628312e-44 1.409003e-39         Yes 38.85109
## sp|Q9JK95|PERP_MOUSE  5.469912e-38 4.466184e-34         Yes 33.35006
## sp|P15104|GLNA_HUMAN  2.834340e-28 1.542826e-24         Yes 23.81168
## sp|Q6EWQ7|IF5A1_BOVIN 1.003418e-25 4.096456e-22         Yes 21.38759
## sp|P31939|PUR9_HUMAN  9.443974e-23 3.084402e-19         Yes 18.51083
## sp|Q76B49|CD63_FELCA  1.836148e-22 4.997383e-19         Yes 18.30126

Plotting just the outliers

signi <- subset(deSeqRes, neglog10 >= 10)
allSig <- merge(normCount, signi, by = 0)
sigCounts <- allSig[,2:13]
row.names(sigCounts) <- allSig$Row.names
pheatmap(log2(sigCounts + 1), scale = "row", show_rownames = F,treeheight_row = 0) #,color = hcl.colors(50, "YlGn"))