script for nominal data

load DESeq2

library(DESeq2)

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, as.vector, cbind,
##     colnames, do.call, duplicated, eval, evalq, Filter, Find, get,
##     grep, grepl, intersect, is.unsorted, lapply, lengths, Map,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unlist, unsplit

## Loading required package: IRanges

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: Rcpp

## Loading required package: RcppArmadillo

Read in csv Ensembl count file

curData <- read.csv (file= "C:/Users/Jim/P68 Ensembl counts/P68A Ensembl defNASH.csv")
dim(curData)

## [1] 56300    96

curData[1:5,1:5]

##          geneName lib2009 lib2010 lib2011 lib2012
## 1         defNASH       0       0       1       0
## 2 ENSG00000000003     198     363     675     290
## 3 ENSG00000000005       0       2       0       2
## 4 ENSG00000000419      58      89     163      93
## 5 ENSG00000000457      62      89     167      99

Create count matrix, assign design data

rawCounts<-curData[2:nrow(curData), 2:ncol(curData)]
rownames(rawCounts)<-as.character(curData[2:nrow(curData),1])
designData<-t(curData[1,2:ncol(curData)])
colnames(designData)<-as.character(curData[1,1])
table(designData[,1])

## 
##  0  1 
## 43 52

designData<-as.data.frame(designData)

rawCounts<-as.matrix(rawCounts)

remove rows with null values

remRows<-which(apply(rawCounts, 1, sum)==0)

rawCountsSub<-rawCounts[-remRows,]

length(remRows)

## [1] 17749

countsCompare1<-rawCountsSub
designData1<-designData
colnames(designData1)

## [1] "defNASH"

colnames(designData1)[1]<-"defNASH"

Run algorithim

firstDESeq<-DESeqDataSetFromMatrix(countData=countsCompare1, colData=designData1, design=~defNASH)

## the design formula contains a numeric variable with integer values,
##   specifying a model with increasing fold change for higher values.
##   did you mean for this to be a factor? if so, first convert
##   this variable to a factor using the factor() function

firstDESeq$defNASH<-factor(firstDESeq$defNASH, levels=c(0,1))
firstDESeq<-DESeq(firstDESeq)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

## -- replacing outliers and refitting for 383 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)

## estimating dispersions

## fitting model and testing

res1<-results(firstDESeq)

ANALYSIS

MA plot (log2 FC over normalized mean expression)

plotMA(res1, main="DESeq2 defNASH", ylim=c(-2,2))

adjust P values, order and write results file

sum(res1$padj < 0.05, na.rm=T)

## [1] 534

res1Ordered<-res1[order(res1$padj),]
head (res1Ordered)

## log2 fold change (MAP): defNASH 1 vs 0 
## Wald test p-value: defNASH 1 vs 0 
## DataFrame with 6 rows and 6 columns
##                   baseMean log2FoldChange      lfcSE      stat
##                  <numeric>      <numeric>  <numeric> <numeric>
## ENSG00000213606   9.110225      0.6712725 0.09498844  7.066886
## ENSG00000083857 954.096228      0.5364576 0.07841993  6.840832
## ENSG00000198074  83.225461      0.6216194 0.09071975  6.852084
## ENSG00000173391   4.615674      0.6804876 0.10132357  6.715985
## ENSG00000165912  65.513518     -0.4965216 0.07805611 -6.361086
## ENSG00000115009   2.762327      0.6071290 0.09649355  6.291913
##                       pvalue         padj
##                    <numeric>    <numeric>
## ENSG00000213606 1.584485e-12 3.501078e-08
## ENSG00000083857 7.873440e-12 5.799051e-08
## ENSG00000198074 7.278158e-12 5.799051e-08
## ENSG00000173391 1.867996e-11 1.031881e-07
## ENSG00000165912 2.003329e-10 8.853110e-07
## ENSG00000115009 3.135777e-10 1.154802e-06

write.csv(res1Ordered, file="C:/Users/Jim/P68 Ensembl counts/P68A Ensembl defNASH DESeq2 results.csv")

NAFLD liver Gene Expression using DESeq2

Jim Nelson

October 19, 2014

script for nominal data

load DESeq2

Read in csv Ensembl count file

Create count matrix, assign design data

remove rows with null values

Run algorithim

ANALYSIS

MA plot (log2 FC over normalized mean expression)

adjust P values, order and write results file