#if installation is need, uncomment the following
#if (!require("DESeq2")) install.packages("DESeq2"); library(DESeq2)
#if (!require("apeglm")) install.packages("apeglm"); library(apeglm)
#if (!require("ggplot2")) install.packages("ggplot2"); library(ggplot2)
#if (!require("pheatmap")) install.packages("pheatmap"); library(pheatmap)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
library(apeglm)
library(ggplot2)
library(pheatmap)
#load count matrix and reference
#tximport summerization of counts = 'intALL.csv'
#'ref.txt' is the reference file needed by deseq
#data <- read.csv("intAll.csv", header=T, row.names = 1)
#info <- read.table("ref.txt", header = T, sep ="\t")
data <- read.csv("roundedMergedCountFile.csv", header=T, row.names = 1)
info <- read.table("refFile.txt", header = T, sep ="\t")
head(data)
## chvi_count cabr_count stlu_count arja_count
## sp|A0A024RBG1|NUD4B_HUMAN 3674 858 0 0
## sp|A0A061I403|FICD_CRIGR 0 0 0 0
## sp|A0A075B6H9|LV469_HUMAN 0 0 0 0
## sp|A0A075B6I0|LV861_HUMAN 0 0 0 2
## sp|A0A075B6I1|LV460_HUMAN 0 0 0 0
## sp|A0A075B6I3|LVK55_HUMAN 1 0 0 0
## loth_count ange_count glso_count nole_count
## sp|A0A024RBG1|NUD4B_HUMAN 4949 1230 0 1859
## sp|A0A061I403|FICD_CRIGR 0 0 0 0
## sp|A0A075B6H9|LV469_HUMAN 0 0 0 0
## sp|A0A075B6I0|LV861_HUMAN 0 2 2 0
## sp|A0A075B6I1|LV460_HUMAN 0 0 0 0
## sp|A0A075B6I3|LVK55_HUMAN 0 1 0 0
## mobl_count mawa_count ptpa_count dero_count
## sp|A0A024RBG1|NUD4B_HUMAN 854 128 28 18
## sp|A0A061I403|FICD_CRIGR 0 0 0 8
## sp|A0A075B6H9|LV469_HUMAN 0 0 26 0
## sp|A0A075B6I0|LV861_HUMAN 1 8 1631 2
## sp|A0A075B6I1|LV460_HUMAN 0 0 235 0
## sp|A0A075B6I3|LVK55_HUMAN 0 0 14 0
de <- DESeqDataSetFromMatrix(data, info, ~diet)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
#removes low expressed genes
keep <- rowSums(counts(de)) >= 100
de <- de[keep,]
deSeqData <- DESeq(de)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
###un-comment to export normalized read count
normCounts <- counts(deSeqData, normalized = T)
write.csv(normCounts, "normal.allCounts.txm.csv")
#p value less than .05 is diff. exp.
result <- results(deSeqData, alpha = 0.05)
# order based on p adjusted value
resOrdered <- result[order(result$padj),]
###un-comment to export ordered file
##write.csv(resOrdered, "deSeq.order.csv")
#files generated above
normCount <- read.csv("normal.allCounts.txm.csv", row.names = 1)
deSeqRes <- read.csv("deSeq.order.csv", row.names = 1)
#if padj is <= .05, the value is significant
deSeqRes$Significant <- ifelse(deSeqRes$padj <= 0.05, "Yes", "No")
#taking out any na values
deSeqRes <- na.omit(deSeqRes)
head(deSeqRes)
## baseMean log2FoldChange lfcSE stat
## sp|P62629|EF1A1_CRIGR 382512.2689 -9.713983 0.6999627 -13.877859
## sp|Q9JK95|PERP_MOUSE 34527.8387 -11.476806 0.8907129 -12.884967
## sp|P15104|GLNA_HUMAN 30268.1036 -12.563098 1.1393125 -11.026911
## sp|Q6EWQ7|IF5A1_BOVIN 13133.7191 -6.217528 0.5929447 -10.485848
## sp|P31939|PUR9_HUMAN 588.3423 -5.272017 0.5369890 -9.817737
## sp|Q76B49|CD63_FELCA 1583.5959 -9.136213 0.9370022 -9.750471
## pvalue padj Significant
## sp|P62629|EF1A1_CRIGR 8.628312e-44 1.409003e-39 Yes
## sp|Q9JK95|PERP_MOUSE 5.469912e-38 4.466184e-34 Yes
## sp|P15104|GLNA_HUMAN 2.834340e-28 1.542826e-24 Yes
## sp|Q6EWQ7|IF5A1_BOVIN 1.003418e-25 4.096456e-22 Yes
## sp|P31939|PUR9_HUMAN 9.443974e-23 3.084402e-19 Yes
## sp|Q76B49|CD63_FELCA 1.836148e-22 4.997383e-19 Yes
##Creating Volcano plot
#baseMean is the normalized count values, dividing by size factor, taken over all samples.. use log bc it needs to be put on a log scale
#log2foldchage is the effective size estimate
#ggplot(deSeqRes, aes( x = log10(baseMean), y = log2FoldChange, color = Significant)) + geom_point()
#volcano plot..
ggplot(deSeqRes, aes(x = log2FoldChange, y = -log10(padj), color = Significant)) + geom_point() #+ scale_color_brewer(palette = "YlGn")
## Creating heatmap
#editing data for heat map
signi <- subset(deSeqRes, padj <= 0.05)
allSig <- merge(normCount, signi, by = 0)
sigCounts <- allSig[,2:13]
row.names(sigCounts) <- allSig$Row.names
#creating heatmap
#log2 looks at exponents instead of raw numbers, is used to normalize the data .. if there is a 0 it wont work so you have to do +1
#scale compares expression within a col/row... finds median read count of a row/col.. doesn't look at raw numbers
pheatmap(log2(sigCounts + 1), scale = "row", show_rownames = F, treeheight_row = 0, treeheight_col = 50) #, color = hcl.colors(50, "BluYl"))
#adding in column to show -log10(padj) values that are seen on the volcano plot above
deSeqRes$neglog10 <- (-log10(deSeqRes$padj))
##neglog10 values above 10 are observed to be outliers according to the volcano plot
head(deSeqRes)
## baseMean log2FoldChange lfcSE stat
## sp|P62629|EF1A1_CRIGR 382512.2689 -9.713983 0.6999627 -13.877859
## sp|Q9JK95|PERP_MOUSE 34527.8387 -11.476806 0.8907129 -12.884967
## sp|P15104|GLNA_HUMAN 30268.1036 -12.563098 1.1393125 -11.026911
## sp|Q6EWQ7|IF5A1_BOVIN 13133.7191 -6.217528 0.5929447 -10.485848
## sp|P31939|PUR9_HUMAN 588.3423 -5.272017 0.5369890 -9.817737
## sp|Q76B49|CD63_FELCA 1583.5959 -9.136213 0.9370022 -9.750471
## pvalue padj Significant neglog10
## sp|P62629|EF1A1_CRIGR 8.628312e-44 1.409003e-39 Yes 38.85109
## sp|Q9JK95|PERP_MOUSE 5.469912e-38 4.466184e-34 Yes 33.35006
## sp|P15104|GLNA_HUMAN 2.834340e-28 1.542826e-24 Yes 23.81168
## sp|Q6EWQ7|IF5A1_BOVIN 1.003418e-25 4.096456e-22 Yes 21.38759
## sp|P31939|PUR9_HUMAN 9.443974e-23 3.084402e-19 Yes 18.51083
## sp|Q76B49|CD63_FELCA 1.836148e-22 4.997383e-19 Yes 18.30126
signi <- subset(deSeqRes, neglog10 >= 10)
allSig <- merge(normCount, signi, by = 0)
sigCounts <- allSig[,2:13]
row.names(sigCounts) <- allSig$Row.names
pheatmap(log2(sigCounts + 1), scale = "row", show_rownames = F,treeheight_row = 0) #,color = hcl.colors(50, "YlGn"))