Read raw counts from htcount.csv file, that contains:
locus_tag) or old (old_locus_tag) versionsplA, splB) and their replicates (i.e. here as splA_1, splA_2, splA_3, etc…)htcount <- read.csv("htcount.csv") # read file
tag_type <- names(htcount)[1] # locus_tag or old_locus_tag?
row.names(htcount) <- htcount[,1]
htcount <- htcount[,-1]
head(htcount)
## splA_1 splA_2 splA_3 splB_1 splB_2 splB_3
## OG1RF_RS00015 1239 1499 1005 1185 1218 1160
## OG1RF_RS00020 1473 1770 1072 1209 1267 1315
## OG1RF_RS00025 235 273 250 223 243 240
## OG1RF_RS00030 776 873 789 784 798 897
## OG1RF_RS00035 1868 2115 1801 1933 2013 2009
## OG1RF_RS00040 3177 3692 2919 3158 3468 3337
Create sampleInfo data frame, where:
htcount (e.g. splA_1, splA_2, splA_3, splB_1, splB_2, splB_3).group indicates sample groupping (e.g. splA, splB).counts indicates total number of counts# "auto" sampleInfo extracted from sample names
library(stringr)
sampleInfo <- data.frame(group = str_remove(names(htcount), "_\\d+"),
reads = colSums(htcount))
sampleInfo
## group reads
## splA_1 splA 2327934
## splA_2 splA 2928045
## splA_3 splA 2035158
## splB_1 splB 2202843
## splB_2 splB 2478059
## splB_3 splB 2571678
In the example above I used regular expression "_\\d+" (means everything before _ followed by any number of digits) to extract group from sample full name (so e.g. splA is extracted from splA_1, splA_2 and splA_3).
Of course sampleInfo can be also created manually:
# "manual" sampleInfo
sampleInfo <- data.frame(group = c("splA", "splA", "splA", "splB", "splB", "splB"))
row.names(sampleInfo) <- c("splA_1", "splA_2", "splA_3", "splB_1", "splB_2", "splB_3")
sampleInfo$counts <- colSums(htcount)
Create DGEList object dgList to combine data:
library(edgeR)
dgList <- DGEList(counts = htcount, group = sampleInfo$group)
dgList <- calcNormFactors(dgList, method="TMM")
Examine data using multidimensional scaling. Look for outliers or weird clustering (e.g. indicating mixed saples) .
plotMDS(dgList)
dgList <- estimateCommonDisp(dgList)
dgList <- estimateGLMTrendedDisp(dgList)
dgList <- estimateTagwiseDisp(dgList)
plotBCV(dgList)
## Exact test on HTS filtered data
dgeTest <- exactTest(dgList)
edgeR_result <- topTags(dgeTest, n = nrow(dgeTest$table))
edgeR_result <- as.data.frame(edgeR_result)
head(edgeR_result)
## logFC logCPM PValue FDR
## OG1RF_RS00400 -5.983438 9.147997 7.838820e-65 2.024767e-61
## OG1RF_RS03265 -3.633666 12.972384 1.647485e-63 2.127727e-60
## OG1RF_RS00405 -6.115077 5.398205 7.433365e-36 6.400127e-33
## OG1RF_RS10325 3.722214 6.038759 2.669296e-34 1.723698e-31
## OG1RF_RS10330 3.317968 5.564405 9.943868e-26 5.137002e-23
## OG1RF_RS06520 -4.138438 9.988277 1.819638e-19 7.833543e-17
First convert row names (with locus tags) to separate column. To be sure that I use correct tags (old vs new) I will use previously saved tag_type variable, keeping the type which was read from htcount.csv file.
library(tidyverse)
edgeR_result <- rownames_to_column(edgeR_result, tag_type)
head(edgeR_result)
## locus_tag logFC logCPM PValue FDR
## 1 OG1RF_RS00400 -5.983438 9.147997 7.838820e-65 2.024767e-61
## 2 OG1RF_RS03265 -3.633666 12.972384 1.647485e-63 2.127727e-60
## 3 OG1RF_RS00405 -6.115077 5.398205 7.433365e-36 6.400127e-33
## 4 OG1RF_RS10325 3.722214 6.038759 2.669296e-34 1.723698e-31
## 5 OG1RF_RS10330 3.317968 5.564405 9.943868e-26 5.137002e-23
## 6 OG1RF_RS06520 -4.138438 9.988277 1.819638e-19 7.833543e-17
Use annotation file (OG1RF_annotation_2020-12-07.csv), that contains the following columns:
locus_tagold_locus_taggene_namegene_productoperon_idMerge simply merge it with the result using left_join. I will not define by = parameter, so join will be done by either old_locus_tag or locus_tag (depending which one was used).
annotation <- read.csv("OG1RF_annotation_2020-12-07.csv")
edgeR_result_annotated <- edgeR_result %>%
left_join(annotation) %>%
select(contains("locus_tag"), everything())
## Joining, by = "locus_tag"
head(edgeR_result_annotated)
## locus_tag old_locus_tag logFC logCPM PValue FDR
## 1 OG1RF_RS00400 OG1RF_10079 -5.983438 9.147997 7.838820e-65 2.024767e-61
## 2 OG1RF_RS03265 OG1RF_10627 -3.633666 12.972384 1.647485e-63 2.127727e-60
## 3 OG1RF_RS00405 OG1RF_10080 -6.115077 5.398205 7.433365e-36 6.400127e-33
## 4 OG1RF_RS10325 OG1RF_12018 3.722214 6.038759 2.669296e-34 1.723698e-31
## 5 OG1RF_RS10330 OG1RF_12019 3.317968 5.564405 9.943868e-26 5.137002e-23
## 6 OG1RF_RS06520 OG1RF_11260 -4.138438 9.988277 1.819638e-19 7.833543e-17
## gene_name gene_product operon_id
## 1 <NA> MFS transporter 1028012
## 2 aad bifunctional acetaldehyde-CoA/alcoholdehydrogenase 3586064
## 3 <NA> hypothetical protein 1028012
## 4 glxK glycerate kinase 1028438
## 5 <NA> GntP family permease 1028438
## 6 <NA> ECF transporter S component 3586258
For Database export, no annotations are necessary (they will be added automatically).
# reorder results based on locus
edgeR_result <- edgeR_result[order(edgeR_result[,tag_type]),]
# extract normalized counts
counts_normalized <- as.data.frame(cpm(dgList, normalized.lib.sizes=FALSE)) %>%
rownames_to_column(tag_type)
library(xlsx)
write.xlsx(edgeR_result, file = "results.xlsx", sheet = "data", row.names = FALSE)
write.xlsx(counts_normalized, file = "results.xlsx", sheet = "counts", row.names = FALSE, append = TRUE)
Before export file still needs adding metadata - I will provide instructions separately.