Read counts

Read raw counts from htcount.csv file, that contains:

htcount <- read.csv("htcount.csv") # read file
tag_type <- names(htcount)[1] # locus_tag or old_locus_tag?
row.names(htcount) <- htcount[,1]
htcount <- htcount[,-1]
head(htcount)
##               splA_1 splA_2 splA_3 splB_1 splB_2 splB_3
## OG1RF_RS00015   1239   1499   1005   1185   1218   1160
## OG1RF_RS00020   1473   1770   1072   1209   1267   1315
## OG1RF_RS00025    235    273    250    223    243    240
## OG1RF_RS00030    776    873    789    784    798    897
## OG1RF_RS00035   1868   2115   1801   1933   2013   2009
## OG1RF_RS00040   3177   3692   2919   3158   3468   3337

DE analysis

Sample Info

Create sampleInfo data frame, where:

  • ROW NAMES indicate sample full name (incl. replicate number), as in htcount (e.g. splA_1, splA_2, splA_3, splB_1, splB_2, splB_3).
  • group indicates sample groupping (e.g. splA, splB).
  • counts indicates total number of counts
# "auto" sampleInfo extracted from sample names
library(stringr)
sampleInfo <- data.frame(group = str_remove(names(htcount), "_\\d+"),
                         reads = colSums(htcount))
sampleInfo
##        group   reads
## splA_1  splA 2327934
## splA_2  splA 2928045
## splA_3  splA 2035158
## splB_1  splB 2202843
## splB_2  splB 2478059
## splB_3  splB 2571678

In the example above I used regular expression "_\\d+" (means everything before _ followed by any number of digits) to extract group from sample full name (so e.g. splA is extracted from splA_1, splA_2 and splA_3).

Of course sampleInfo can be also created manually:

# "manual" sampleInfo
sampleInfo <- data.frame(group = c("splA", "splA", "splA", "splB", "splB", "splB"))
row.names(sampleInfo) <- c("splA_1", "splA_2", "splA_3", "splB_1", "splB_2", "splB_3")
sampleInfo$counts <- colSums(htcount)

Create DGEList object

Create DGEList object dgList to combine data:

library(edgeR)
dgList <- DGEList(counts = htcount, group = sampleInfo$group)

Normalize counts

dgList <- calcNormFactors(dgList, method="TMM")

MDS plot

Examine data using multidimensional scaling. Look for outliers or weird clustering (e.g. indicating mixed saples) .

plotMDS(dgList)

Estimate common and tagwise dispersion

dgList <- estimateCommonDisp(dgList)
dgList <- estimateGLMTrendedDisp(dgList)
dgList <- estimateTagwiseDisp(dgList)
plotBCV(dgList)

Test

## Exact test on HTS filtered data
dgeTest <- exactTest(dgList)
edgeR_result <- topTags(dgeTest, n = nrow(dgeTest$table))
edgeR_result <- as.data.frame(edgeR_result)
head(edgeR_result)
##                   logFC    logCPM       PValue          FDR
## OG1RF_RS00400 -5.983438  9.147997 7.838820e-65 2.024767e-61
## OG1RF_RS03265 -3.633666 12.972384 1.647485e-63 2.127727e-60
## OG1RF_RS00405 -6.115077  5.398205 7.433365e-36 6.400127e-33
## OG1RF_RS10325  3.722214  6.038759 2.669296e-34 1.723698e-31
## OG1RF_RS10330  3.317968  5.564405 9.943868e-26 5.137002e-23
## OG1RF_RS06520 -4.138438  9.988277 1.819638e-19 7.833543e-17

Annotate

Prepare results

First convert row names (with locus tags) to separate column. To be sure that I use correct tags (old vs new) I will use previously saved tag_type variable, keeping the type which was read from htcount.csv file.

library(tidyverse)
edgeR_result <- rownames_to_column(edgeR_result, tag_type)
head(edgeR_result)
##       locus_tag     logFC    logCPM       PValue          FDR
## 1 OG1RF_RS00400 -5.983438  9.147997 7.838820e-65 2.024767e-61
## 2 OG1RF_RS03265 -3.633666 12.972384 1.647485e-63 2.127727e-60
## 3 OG1RF_RS00405 -6.115077  5.398205 7.433365e-36 6.400127e-33
## 4 OG1RF_RS10325  3.722214  6.038759 2.669296e-34 1.723698e-31
## 5 OG1RF_RS10330  3.317968  5.564405 9.943868e-26 5.137002e-23
## 6 OG1RF_RS06520 -4.138438  9.988277 1.819638e-19 7.833543e-17

Use annotation file (OG1RF_annotation_2020-12-07.csv), that contains the following columns:

  • locus_tag
  • old_locus_tag
  • gene_name
  • gene_product
  • (optionally) operon_id

Merge simply merge it with the result using left_join. I will not define by = parameter, so join will be done by either old_locus_tag or locus_tag (depending which one was used).

annotation <- read.csv("OG1RF_annotation_2020-12-07.csv")
edgeR_result_annotated <- edgeR_result %>%
  left_join(annotation) %>%
  select(contains("locus_tag"), everything())
## Joining, by = "locus_tag"
head(edgeR_result_annotated)
##       locus_tag old_locus_tag     logFC    logCPM       PValue          FDR
## 1 OG1RF_RS00400   OG1RF_10079 -5.983438  9.147997 7.838820e-65 2.024767e-61
## 2 OG1RF_RS03265   OG1RF_10627 -3.633666 12.972384 1.647485e-63 2.127727e-60
## 3 OG1RF_RS00405   OG1RF_10080 -6.115077  5.398205 7.433365e-36 6.400127e-33
## 4 OG1RF_RS10325   OG1RF_12018  3.722214  6.038759 2.669296e-34 1.723698e-31
## 5 OG1RF_RS10330   OG1RF_12019  3.317968  5.564405 9.943868e-26 5.137002e-23
## 6 OG1RF_RS06520   OG1RF_11260 -4.138438  9.988277 1.819638e-19 7.833543e-17
##   gene_name                                       gene_product operon_id
## 1      <NA>                                    MFS transporter   1028012
## 2       aad bifunctional acetaldehyde-CoA/alcoholdehydrogenase   3586064
## 3      <NA>                               hypothetical protein   1028012
## 4      glxK                                   glycerate kinase   1028438
## 5      <NA>                               GntP family permease   1028438
## 6      <NA>                        ECF transporter S component   3586258

Prepare for Database export

For Database export, no annotations are necessary (they will be added automatically).

# reorder results based on locus
edgeR_result <- edgeR_result[order(edgeR_result[,tag_type]),] 

# extract normalized counts
counts_normalized <- as.data.frame(cpm(dgList, normalized.lib.sizes=FALSE)) %>%
  rownames_to_column(tag_type)

library(xlsx)
write.xlsx(edgeR_result, file = "results.xlsx", sheet = "data", row.names = FALSE)
write.xlsx(counts_normalized, file = "results.xlsx", sheet = "counts", row.names = FALSE, append = TRUE)

Before export file still needs adding metadata - I will provide instructions separately.