# module load cutadapt; module load trimgalore; ls *fastq.gz | awk '{split($0,a,"_"); print a[3]}' | uniq | parallel --eta -j1 trim_galore --paired SO_9588_{}_R1.fastq.gz SO_9588_{}_R2.fastq.gz --fastqc > trim_galore.log
# module load STAR/2.7.3a; STAR --runThreadN 15 --runMode genomeGenerate --genomeDir ~/third_party_sofware/hg38/karyotypic/ --genomeFastaFiles ~/third_party_sofware/hg38/karyotypic/Homo_sapiens_assembly38.fasta --sjdbGTFfile ~/third_party_sofware/hg38/karyotypic/hg38_ucsc.gtf --sjdbOverhang 100
# module load STAR/2.7.3a; for i in $(ls *_val_*fq.gz | rev | cut -c 16- | rev | uniq); do mkdir -p alignment_hg38/$i; cd alignment_hg38/$i; STAR --genomeDir ~/third_party_sofware/hg38/karyotypic/ --readFilesIn ../../$i'_R1_val_1.fq.gz' ../../$i'_R2_val_2.fq.gz' --readFilesCommand zcat --genomeLoad LoadAndKeep --runThreadN 42 --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM 40000000000 --outReadsUnmapped Fastx; cd ../../; done
Import libraries
Include filtering, scaling, dimensionality reduction
counts_scaled =
read_csv("company_derived_normalised_counts.csv") %>%
pivot_longer(cols = -Gene_Name, names_to = "sample", values_to = "count") %>%
# Add factor of interest
inner_join(
tibble(
sample = c("W1", "W2", "W3", "Ko1", "Ko2", "Ko3"),
type = c(rep("Wild_type", 3), rep("Knock_out", 3))
),
by="sample"
) %>%
as_SummarizedExperiment(.sample = sample, .transcript = Gene_Name , .abundance = count) %>%
# Filter
identify_abundant(factor_of_interest=type) %>%
# Scale
scale_abundance() %>%
# Reduce dimensions
reduce_dimensions(method = "PCA")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## Gene_Name = col_character(),
## W1 = col_double(),
## W2 = col_double(),
## W3 = col_double(),
## Ko1 = col_double(),
## Ko2 = col_double(),
## Ko3 = col_double()
## )
## Getting the 500 most variable genes
## Fraction of variance explained by the selected principal components
## # A tibble: 2 x 2
## `Fraction of variance` PC
## <dbl> <int>
## 1 0.560 1
## 2 0.265 2
## tidybulk says: to access the raw results do `attr(..., "internals")$PCA`
The libraries are correctly scaled and have quite low depth
counts_scaled %>%
keep_abundant(factor_of_interest=type) %>%
ggplot(aes(count_scaled+1, group=sample, color=type)) +
geom_density() +
scale_x_log10() +
scale_color_brewer(palette = "Set1") +
theme_bw()
The biological groups are quite separated, except for Ko1
counts_scaled %>%
as_tibble() %>%
nanny::subset(sample) %>%
ggplot(aes(PC1,PC2, label=sample)) +
geom_point(aes(color=type)) +
ggrepel::geom_text_repel() +
scale_color_brewer(palette = "Set1") +
theme_bw()
ggsave(
"PCA.pdf",
useDingbats=FALSE,
units = c("mm"),
width = 183 ,
height = 183,
limitsize = FALSE
)
The most variable genes put Ko1 more similar to wild-type
plot_heatmap =
counts_scaled %>%
keep_variable(.abundance = counts_scaled, top = 500) %>%
as_tibble() %>%
heatmap(
.column = sample,
.row = feature,
.value = count_scaled,
transform = log1p
) %>%
add_tile(type, palette = c("#E41A1C" ,"#377EB8"))
## Getting the 500 most variable genes
plot_heatmap
plot_heatmap %>% save_pdf("heatmap_variable_genes.pdf")
## tidyHeatmap says: saving 7 x 5 in image
Robust and stringent analyses show a total of 522 downregulated genes and 468 up-regulated genes with a 2-fold difference between wild-type and knock-out
counts_de =
counts_scaled %>%
test_differential_abundance(
~0+type,
method="edger_robust_likelihood_ratio",
.contrasts = "typeKnock_out-typeWild_type",
test_above_log2_fold_change = 1,
omit_contrast_in_colnames=TRUE
)
## =====================================
## tidybulk says: All testing methods use raw counts, irrespective of if scale_abundance
## or adjust_abundance have been calculated. Therefore, it is essential to add covariates
## such as batch effects (if applicable) in the formula.
## =====================================
## tidybulk says: The design column names are "typeKnock_out, typeWild_type"
## tidybulk says: to access the raw results (fitted GLM) do `attr(..., "internals")$edger_robust_likelihood_ratio`
counts_de %>%
as_tibble() %>%
nanny::subset(feature) %>%
arrange(FDR) %>%
describe_transcript(feature) %>%
dplyr::select(feature, description, logFC, FDR) %>%
DT::datatable()
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
These genes define the two biological groups as expected
# Heatmap
plot_heatmap_de =
counts_de %>%
filter(FDR<0.05)%>% # & abs(logFC) >=2
as_tibble() %>%
heatmap(
.column = sample,
.row = feature,
.value = count_scaled,
transform = log1p
) %>%
add_tile(type, palette = c("#E41A1C" ,"#377EB8"))
#%>%
# InteractiveComplexHeatmap::ht_shiny()
plot_heatmap_de
plot_heatmap_de %>% save_pdf("heatmap_differential_genes.pdf", width = 183, height = 183, units = "mm")
topgenes_symbols <-
counts_de %>%
as_tibble() %>%
nanny::subset(feature) %>%
filter(FDR<0.05) %>%
arrange(desc(abs(logFC))) %>%
pull(feature)
counts_de %>%
as_tibble() %>%
nanny::subset(feature) %>%
# Subset data
mutate(significant = case_when(
FDR<0.05 & logFC < 0 ~ "Down-regulated",
FDR<0.05 & logFC > 0 ~ "Up-regulated",
TRUE ~ ""
)) %>%
mutate(feature = ifelse(feature %in% topgenes_symbols, as.character(feature), "")) %>%
# Plot
ggplot(aes(x = logFC, y = PValue, label=feature)) +
geom_vline(xintercept = c(-1, 1), linetype="dashed", color="#8c8c8c") +
geom_hline(yintercept = mean(c(1.24E-4, 3.74E-4)), linetype="dashed", color="#8c8c8c") +
geom_point(aes(color = significant, alpha=significant)) +
ggrepel::geom_text_repel() +
# Custom scales
xlab("Log 2 fold change") +
theme_bw() +
scale_y_continuous(trans = "log10_reverse") +
scale_color_manual(values=c("black", "#1c4966", "#e11f28")) +
scale_size_discrete(range = c(0, 2))
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.
## Warning: Removed 9546 rows containing missing values (geom_point).
## Warning: Removed 9546 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 149 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggsave(
"volcano_plot.pdf",
useDingbats=FALSE,
units = c("mm"),
width = 183 ,
height = 183,
limitsize = FALSE
)
## Warning: Using alpha for a discrete variable is not advised.
## Warning: Removed 9546 rows containing missing values (geom_point).
## Warning: Removed 9546 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 141 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
EGSEA uses many algorithms on many datasets (GSEA, GO, KEGG). The pathways are ranked by the overall gene set that scores well in most algorithms. The web_page column includes the web page where ll information of that gene set are.
Please download from the email the compressed directory, and open index.html with a browser to investigate the results for each pathway in the table below.
counts_gene_rank =
counts_de %>%
symbol_to_entrez(feature) %>%
filter(PValue %>% is.na %>% `!`) %>%
test_gene_rank(
.entrez = entrez,
.arrange_desc = logFC ,
species="Homo sapiens",
gene_collections = c("H", "C2", "C5")
)
Curated aging gene sets, look p.adjust < 0.05
aging_sets =
counts_gene_rank %>%
filter(gs_cat == "C2" ) %>%
unnest(test) %>%
rowid_to_column(var = "GSEA rank") %>%
filter(grepl("aging", ID, ignore.case = T)) %>%
filter(p.adjust < 0.05)
aging_sets %>%
dplyr::select(-fit) %>%
DT::datatable()
aging_sets %>%
mutate(aging = if_else(grepl("UP", ID), "Aging up", "Aging down")) %>%
ggplot(aes(forcats::fct_reorder(ID, enrichmentScore), enrichmentScore, fill=p.adjust, size=Count)) +
#ggplot(aes(forcats::fct_reorder(ID, enrichmentScore), enrichmentScore,size=Count, color=p.adjust )) +
geom_hline(yintercept = 0, color="#c8c8c8", linetype="dashed") +
geom_point(shape = 21) +
facet_grid(~ aging , scales = "free", space = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
scale_fill_distiller( trans = "log10_reverse", palette = "Spectral") +
#scale_y_discrete(labels = label_func) +
ylab("Enrichment score") +
xlab(NULL) +
scale_size(range=c(3, 8))
ggsave(
"aging_sets_point.pdf",
useDingbats=FALSE,
units = c("mm"),
width = 183/2 ,
height = 183/2,
limitsize = FALSE
)
aging_sets %>%
mutate(plot = pmap(
list(fit, ID, idx_for_plotting, p.adjust),
~ enrichplot::gseaplot2(
..1,
geneSetID = ..3,
title = sprintf("%s \nadj pvalue %s", ..2, round(..4, 2)),
base_size = 6
)
)) %>%
pull(plot)%>%
wrap_plots()
##
## Registered S3 method overwritten by 'treeio':
## method from
## root.phylo ape
ggsave(
"aging_sets_trends.pdf",
useDingbats=FALSE,
units = c("mm"),
width = 183 ,
height = 183,
limitsize = FALSE
)
All curated gene sets, look p.adjust < 0.05
counts_gene_rank %>%
filter(gs_cat == "C2" ) %>%
dplyr::select(-fit) %>%
unnest(test) %>%
DT::datatable()
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
All hallmarks, look p.adjust < 0.05
counts_gene_rank %>%
filter(gs_cat == "H" ) %>%
dplyr::select(-fit) %>%
unnest(test) %>%
DT::datatable()
counts_de %>% get_bibliography() %>%
c(counts_gene_rank %>% get_bibliography()) %>%
unique()
## @Article{tidybulk,
## title = {tidybulk: an R tidy framework for modular transcriptomic data analysis},
## author = {Stefano Mangiola and Ramyar Molania and Ruining Dong and Maria A. Doyle & Anthony T. Papenfuss},
## journal = {Genome Biology},
## year = {2021},
## volume = {22},
## number = {42},
## url = {https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02233-7},
## }
## @article{wickham2019welcome,
## title={Welcome to the Tidyverse},
## author={Wickham, Hadley and Averick, Mara and Bryan, Jennifer and Chang, Winston and McGowan, Lucy D'Agostino and Francois, Romain and Grolemund, Garrett and Hayes, Alex and Henry, Lionel and Hester, Jim and others},
## journal={Journal of Open Source Software},
## volume={4},
## number={43},
## pages={1686},
## year={2019}
## }
## @article{robinson2010edger,
## title={edgeR: a Bioconductor package for differential expression analysis of digital gene expression data},
## author={Robinson, Mark D and McCarthy, Davis J and Smyth, Gordon K},
## journal={Bioinformatics},
## volume={26},
## number={1},
## pages={139--140},
## year={2010},
## publisher={Oxford University Press}
## }
## @article{robinson2010scaling,
## title={A scaling normalization method for differential expression analysis of RNA-seq data},
## author={Robinson, Mark D and Oshlack, Alicia},
## journal={Genome biology},
## volume={11},
## number={3},
## pages={1--9},
## year={2010},
## publisher={BioMed Central}
## }
## @Manual{,
## title = {R: A Language and Environment for Statistical Computing},
## author = {{R Core Team}},
## organization = {R Foundation for Statistical Computing},
## address = {Vienna, Austria},
## year = {2020},
## url = {https://www.R-project.org/},
## }
## @article{zhou2014robustly,
## title={Robustly detecting differential expression in RNA sequencing data using observation weights},
## author={Zhou, Xiaobei and Lindsay, Helen and Robinson, Mark D},
## journal={Nucleic acids research},
## volume={42},
## number={11},
## pages={e91--e91},
## year={2014},
## publisher={Oxford University Press}
## }
## @article{McCarthy2009,
## doi = {10.1093/bioinformatics/btp053},
## url = {https://doi.org/10.1093/bioinformatics/btp053},
## year = {2009},
## month = jan,
## publisher = {Oxford University Press ({OUP})},
## volume = {25},
## number = {6},
## pages = {765--771},
## author = {D. J. McCarthy and G. K. Smyth},
## title = {Testing significance relative to a fold-change threshold is a {TREAT}},
## journal = {Bioinformatics}
## }
## @Article{tidybulk,
## title = {tidybulk: an R tidy framework for modular transcriptomic data analysis},
## author = {Stefano Mangiola and Ramyar Molania and Ruining Dong and Maria A. Doyle & Anthony T. Papenfuss},
## journal = {Genome Biology},
## year = {2021},
## volume = {22},
## number = {42},
## url = {https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02233-7},
## }
## @article{wickham2019welcome,
## title={Welcome to the Tidyverse},
## author={Wickham, Hadley and Averick, Mara and Bryan, Jennifer and Chang, Winston and McGowan, Lucy D'Agostino and Francois, Romain and Grolemund, Garrett and Hayes, Alex and Henry, Lionel and Hester, Jim and others},
## journal={Journal of Open Source Software},
## volume={4},
## number={43},
## pages={1686},
## year={2019}
## }
## @Article{,
## title = {clusterProfiler: an R package for comparing biological themes among gene clusters},
## author = {Guangchuang Yu and Li-Gen Wang and Yanyan Han and Qing-Yu He},
## journal = {OMICS: A Journal of Integrative Biology},
## year = {2012},
## volume = {16},
## number = {5},
## pages = {284-287},
## pmid = {22455463},
## doi = {10.1089/omi.2011.0118},
## }
## @Manual{,
## title = {msigdbr: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format},
## author = {Igor Dolgalev},
## year = {2020},
## note = {R package version 7.1.1},
## url = {https://CRAN.R-project.org/package=msigdbr},
## }
## @Manual{,
## title = {enrichplot: Visualization of Functional Enrichment Result},
## author = {Guangchuang Yu},
## year = {2021},
## note = {R package version 1.11.2.994},
## url = {https://yulab-smu.top/biomedical-knowledge-mining-book/},
## }
## NULL
sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS: /stornext/System/data/apps/R/R-4.0.5/lib64/R/lib/libRblas.so
## LAPACK: /stornext/System/data/apps/R/R-4.0.5/lib64/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] patchwork_1.1.1 DT_0.17
## [3] tidyHeatmap_1.2.2 tidybulk_1.3.1
## [5] tidySummarizedExperiment_1.2.1 SummarizedExperiment_1.20.0
## [7] Biobase_2.50.0 GenomicRanges_1.42.0
## [9] GenomeInfoDb_1.26.4 IRanges_2.24.1
## [11] S4Vectors_0.28.1 BiocGenerics_0.36.0
## [13] MatrixGenerics_1.2.1 matrixStats_0.58.0
## [15] forcats_0.5.1 stringr_1.4.0
## [17] dplyr_1.0.5 purrr_0.3.4
## [19] readr_1.4.0 tidyr_1.1.3
## [21] tibble_3.1.0 ggplot2_3.3.3
## [23] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] shadowtext_0.0.7 readxl_1.3.1 backports_1.2.1
## [4] circlize_0.4.12 fastmatch_1.1-0 plyr_1.8.6
## [7] igraph_1.2.6 lazyeval_0.2.2 splines_4.0.5
## [10] BiocParallel_1.24.1 crosstalk_1.1.1 digest_0.6.27
## [13] htmltools_0.5.1.1 GOSemSim_2.16.1 viridis_0.5.1
## [16] GO.db_3.12.1 fansi_0.4.2 magrittr_2.0.1
## [19] memoise_2.0.0 cluster_2.1.1 limma_3.46.0
## [22] ComplexHeatmap_2.6.2 graphlayouts_0.7.1 modelr_0.1.8
## [25] enrichplot_1.11.2.994 colorspace_2.0-0 blob_1.2.1
## [28] rvest_1.0.0 ggrepel_0.9.1 haven_2.3.1
## [31] xfun_0.22 crayon_1.4.1 RCurl_1.98-1.3
## [34] jsonlite_1.7.2 scatterpie_0.1.5 org.Mm.eg.db_3.12.0
## [37] ape_5.4-1 glue_1.4.2 polyclip_1.10-0
## [40] gtable_0.3.0 zlibbioc_1.36.0 XVector_0.30.0
## [43] GetoptLong_1.0.5 DelayedArray_0.16.3 shape_1.4.5
## [46] DOSE_3.16.0 scales_1.1.1 DBI_1.1.1
## [49] edgeR_3.32.1 Rcpp_1.0.6 widyr_0.1.3
## [52] viridisLite_0.3.0 clue_0.3-58 tidytree_0.3.3
## [55] bit_4.0.4 preprocessCore_1.52.1 htmlwidgets_1.5.3
## [58] httr_1.4.2 fgsea_1.17.1 RColorBrewer_1.1-2
## [61] ellipsis_0.3.1 pkgconfig_2.0.3 farver_2.1.0
## [64] sass_0.3.1 dbplyr_2.1.0 locfit_1.5-9.4
## [67] utf8_1.2.1 reshape2_1.4.4 tidyselect_1.1.0
## [70] labeling_0.4.2 rlang_0.4.10 nanny_0.1.8
## [73] AnnotationDbi_1.52.0 munsell_0.5.0 cellranger_1.1.0
## [76] tools_4.0.5 cachem_1.0.4 cli_2.4.0
## [79] generics_0.1.0 RSQLite_2.2.5 broom_0.7.6
## [82] evaluate_0.14 fastmap_1.1.0 yaml_2.2.1
## [85] ggtree_2.4.1 org.Hs.eg.db_3.12.0 knitr_1.31
## [88] bit64_4.0.5 fs_1.5.0 tidygraph_1.2.0
## [91] ggraph_2.0.5 nlme_3.1-152 aplot_0.0.6
## [94] DO.db_2.9 xml2_1.3.2 compiler_4.0.5
## [97] rstudioapi_0.13 plotly_4.9.3 png_0.1-7
## [100] treeio_1.14.3 reprex_2.0.0 tweenr_1.0.2
## [103] bslib_0.2.4 stringi_1.5.3 highr_0.8
## [106] lattice_0.20-41 Matrix_1.3-2 vctrs_0.3.7
## [109] pillar_1.5.1 lifecycle_1.0.0 BiocManager_1.30.12
## [112] jquerylib_0.1.3 GlobalOptions_0.1.2 cowplot_1.1.1
## [115] data.table_1.14.0 bitops_1.0-6 qvalue_2.22.0
## [118] R6_2.5.0 gridExtra_2.3 MASS_7.3-53.1
## [121] gtools_3.8.2 assertthat_0.2.1 rjson_0.2.20
## [124] withr_2.4.1 GenomeInfoDbData_1.2.4 hms_1.0.0
## [127] grid_4.0.5 rvcheck_0.1.8 rmarkdown_2.7
## [130] Cairo_1.5-12.2 Rtsne_0.15 ggforce_0.3.3
## [133] lubridate_1.7.10