Preprocessing

# module load cutadapt; module load trimgalore; ls *fastq.gz | awk '{split($0,a,"_"); print a[3]}' | uniq | parallel --eta -j1 trim_galore --paired SO_9588_{}_R1.fastq.gz SO_9588_{}_R2.fastq.gz --fastqc > trim_galore.log

# module load STAR/2.7.3a; STAR --runThreadN 15 --runMode genomeGenerate --genomeDir  ~/third_party_sofware/hg38/karyotypic/ --genomeFastaFiles ~/third_party_sofware/hg38/karyotypic/Homo_sapiens_assembly38.fasta --sjdbGTFfile ~/third_party_sofware/hg38/karyotypic/hg38_ucsc.gtf --sjdbOverhang 100

# module load STAR/2.7.3a; for i in $(ls *_val_*fq.gz | rev | cut -c 16- | rev | uniq); do mkdir -p alignment_hg38/$i; cd alignment_hg38/$i; STAR --genomeDir ~/third_party_sofware/hg38/karyotypic/ --readFilesIn ../../$i'_R1_val_1.fq.gz' ../../$i'_R2_val_2.fq.gz' --readFilesCommand zcat --genomeLoad LoadAndKeep --runThreadN 42 --outSAMtype BAM SortedByCoordinate --limitBAMsortRAM 40000000000 --outReadsUnmapped Fastx; cd ../../; done

Import libraries

Gene counting and Data preprocessing

Include filtering, scaling, dimensionality reduction

counts_scaled = 
  read_csv("company_derived_normalised_counts.csv") %>%
  pivot_longer(cols = -Gene_Name, names_to = "sample", values_to = "count") %>%
  
  # Add factor of interest
  inner_join(
    tibble(
      sample = c("W1", "W2", "W3", "Ko1", "Ko2", "Ko3"),
      type = c(rep("Wild_type", 3), rep("Knock_out", 3))
    ),
    by="sample"
  ) %>%
  as_SummarizedExperiment(.sample = sample, .transcript = Gene_Name , .abundance = count) %>%
  
  # Filter
  identify_abundant(factor_of_interest=type) %>%
  
  # Scale
  scale_abundance() %>%
  
  # Reduce dimensions
  reduce_dimensions(method = "PCA")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Gene_Name = col_character(),
##   W1 = col_double(),
##   W2 = col_double(),
##   W3 = col_double(),
##   Ko1 = col_double(),
##   Ko2 = col_double(),
##   Ko3 = col_double()
## )

## Getting the 500 most variable genes

## Fraction of variance explained by the selected principal components

## # A tibble: 2 x 2
##   `Fraction of variance`    PC
##                    <dbl> <int>
## 1                  0.560     1
## 2                  0.265     2

## tidybulk says: to access the raw results do `attr(..., "internals")$PCA`

Density plot

The libraries are correctly scaled and have quite low depth

counts_scaled %>%
  keep_abundant(factor_of_interest=type) %>%
  ggplot(aes(count_scaled+1, group=sample, color=type)) +
  geom_density() +
  scale_x_log10() +
  scale_color_brewer(palette = "Set1") +
  theme_bw()

Principal component analysis

The biological groups are quite separated, except for Ko1

counts_scaled %>%
  as_tibble() %>%
  nanny::subset(sample) %>%
  ggplot(aes(PC1,PC2, label=sample)) +
  geom_point(aes(color=type)) +
  ggrepel::geom_text_repel() +
  scale_color_brewer(palette = "Set1") +
  theme_bw()

ggsave(
  "PCA.pdf",
  useDingbats=FALSE,
  units = c("mm"),
  width = 183 ,
  height = 183,
  limitsize = FALSE
)

Heatmap

The most variable genes put Ko1 more similar to wild-type

plot_heatmap = 
  counts_scaled %>%
  keep_variable(.abundance = counts_scaled, top = 500) %>%
  as_tibble() %>%
  heatmap(
    .column = sample,
    .row = feature,
    .value = count_scaled,
    transform = log1p
  ) %>%
  add_tile(type, palette  = c("#E41A1C" ,"#377EB8"))

## Getting the 500 most variable genes

plot_heatmap

plot_heatmap %>% save_pdf("heatmap_variable_genes.pdf")

## tidyHeatmap says: saving 7 x 5 in image

Differential gene feature-abundance

Robust and stringent analyses show a total of 522 downregulated genes and 468 up-regulated genes with a 2-fold difference between wild-type and knock-out

counts_de = 
  counts_scaled %>%
  test_differential_abundance(
    ~0+type, 
    method="edger_robust_likelihood_ratio", 
    .contrasts = "typeKnock_out-typeWild_type", 
    test_above_log2_fold_change = 1,
    omit_contrast_in_colnames=TRUE
  )

## =====================================
## tidybulk says: All testing methods use raw counts, irrespective of if scale_abundance 
## or adjust_abundance have been calculated. Therefore, it is essential to add covariates 
## such as batch effects (if applicable) in the formula.
## =====================================

## tidybulk says: The design column names are "typeKnock_out, typeWild_type"

## tidybulk says: to access the raw results (fitted GLM) do `attr(..., "internals")$edger_robust_likelihood_ratio`

counts_de %>%
  as_tibble() %>%
  nanny::subset(feature) %>%
  arrange(FDR) %>%
  describe_transcript(feature) %>%
  dplyr::select(feature, description, logFC, FDR) %>%
  DT::datatable()

## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Gene feature-abundance distribution

Heatmap with significant genes

These genes define the two biological groups as expected

# Heatmap
plot_heatmap_de = 
  counts_de %>%
  filter(FDR<0.05)%>% #  & abs(logFC) >=2
  as_tibble() %>%
  heatmap(
    .column = sample,
    .row = feature,
    .value = count_scaled,
    transform = log1p
  ) %>%
  add_tile(type, palette  = c("#E41A1C" ,"#377EB8")) 
#%>%
#   InteractiveComplexHeatmap::ht_shiny()

plot_heatmap_de

plot_heatmap_de %>% save_pdf("heatmap_differential_genes.pdf", width = 183, height = 183, units = "mm")

Volcano plot

topgenes_symbols <-
  counts_de %>%
  as_tibble() %>%
  nanny::subset(feature)  %>%
  filter(FDR<0.05) %>%
  arrange(desc(abs(logFC)))  %>%
  pull(feature)

counts_de %>%
  as_tibble() %>%
  nanny::subset(feature) %>%
  
  # Subset data
  mutate(significant = case_when(
    FDR<0.05 & logFC < 0 ~ "Down-regulated", 
    FDR<0.05 & logFC > 0 ~ "Up-regulated",
    TRUE ~ ""
  )) %>%
  mutate(feature = ifelse(feature %in% topgenes_symbols, as.character(feature), "")) %>%
  
  # Plot
  ggplot(aes(x = logFC, y = PValue, label=feature)) +
  geom_vline(xintercept = c(-1, 1), linetype="dashed", color="#8c8c8c") +
  geom_hline(yintercept = mean(c(1.24E-4, 3.74E-4)), linetype="dashed", color="#8c8c8c") +
  geom_point(aes(color = significant,  alpha=significant)) +
  ggrepel::geom_text_repel() +
  
  
  # Custom scales
  xlab("Log 2 fold change") +
  theme_bw() +
  scale_y_continuous(trans = "log10_reverse") +
  scale_color_manual(values=c("black", "#1c4966", "#e11f28")) +
  scale_size_discrete(range = c(0, 2))

## Warning: Using size for a discrete variable is not advised.

## Warning: Using alpha for a discrete variable is not advised.

## Warning: Removed 9546 rows containing missing values (geom_point).

## Warning: Removed 9546 rows containing missing values (geom_text_repel).

## Warning: ggrepel: 149 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

ggsave(
  "volcano_plot.pdf",
  useDingbats=FALSE,
  units = c("mm"),
  width = 183 ,
  height = 183,
  limitsize = FALSE
)

## Warning: Using alpha for a discrete variable is not advised.

## Warning: Removed 9546 rows containing missing values (geom_point).

## Warning: Removed 9546 rows containing missing values (geom_text_repel).

## Warning: ggrepel: 141 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

EGSEA gene enrichment

EGSEA uses many algorithms on many datasets (GSEA, GO, KEGG). The pathways are ranked by the overall gene set that scores well in most algorithms. The web_page column includes the web page where ll information of that gene set are.

Please download from the email the compressed directory, and open index.html with a browser to investigate the results for each pathway in the table below.

counts_gene_rank = 
  counts_de %>%
  symbol_to_entrez(feature) %>%
  filter(PValue   %>% is.na %>% `!`) %>%
  test_gene_rank(
    .entrez = entrez,
    .arrange_desc = logFC ,
    species="Homo sapiens",
    gene_collections = c("H", "C2", "C5")
  )

Curated aging gene sets, look p.adjust < 0.05

aging_sets = 
  counts_gene_rank %>%
  filter(gs_cat  == "C2" ) %>%
  unnest(test) %>%
  rowid_to_column(var = "GSEA rank") %>%
  filter(grepl("aging", ID, ignore.case = T)) %>%
  filter(p.adjust < 0.05) 

aging_sets %>%
    dplyr::select(-fit) %>%
  DT::datatable()

aging_sets %>%
  mutate(aging = if_else(grepl("UP", ID), "Aging up", "Aging down")) %>%
  ggplot(aes(forcats::fct_reorder(ID, enrichmentScore), enrichmentScore, fill=p.adjust, size=Count)) +
  #ggplot(aes(forcats::fct_reorder(ID, enrichmentScore), enrichmentScore,size=Count, color=p.adjust )) +
  geom_hline(yintercept = 0, color="#c8c8c8", linetype="dashed") +
  geom_point(shape = 21) +
  facet_grid(~ aging , scales = "free", space = "free") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  scale_fill_distiller( trans = "log10_reverse", palette = "Spectral") +
  #scale_y_discrete(labels = label_func) +
  ylab("Enrichment score") + 
  xlab(NULL) +
  scale_size(range=c(3, 8))

ggsave(
  "aging_sets_point.pdf",
  useDingbats=FALSE,
  units = c("mm"),
  width = 183/2 ,
  height = 183/2,
  limitsize = FALSE
)

aging_sets %>% 
  mutate(plot = pmap(
    list(fit, ID, idx_for_plotting, p.adjust), 
    ~ enrichplot::gseaplot2(
      ..1, 
      geneSetID = ..3, 
      title = sprintf("%s \nadj pvalue %s", ..2, round(..4, 2)),
      base_size = 6
    ) 
  )) %>%
  pull(plot)%>%
  wrap_plots()

##

## Registered S3 method overwritten by 'treeio':
##   method     from
##   root.phylo ape

ggsave(
  "aging_sets_trends.pdf",
  useDingbats=FALSE,
  units = c("mm"),
  width = 183 ,
  height = 183,
  limitsize = FALSE
)

All curated gene sets, look p.adjust < 0.05

counts_gene_rank %>%
  filter(gs_cat  == "C2" ) %>%
  dplyr::select(-fit) %>%
  unnest(test) %>%
  DT::datatable()

## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

All hallmarks, look p.adjust < 0.05

counts_gene_rank %>%
  filter(gs_cat  == "H" ) %>%
  dplyr::select(-fit) %>%
  unnest(test) %>%
  DT::datatable()

Bibliography

counts_de %>% get_bibliography() %>%
  c(counts_gene_rank %>% get_bibliography()) %>%
  unique()

##  @Article{tidybulk,
##   title = {tidybulk: an R tidy framework for modular transcriptomic data analysis},
##   author = {Stefano Mangiola and Ramyar Molania and Ruining Dong and Maria A. Doyle & Anthony T. Papenfuss},
##   journal = {Genome Biology},
##   year = {2021},
##   volume = {22},
##   number = {42},
##   url = {https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02233-7},
##   }
## @article{wickham2019welcome,
##   title={Welcome to the Tidyverse},
##   author={Wickham, Hadley and Averick, Mara and Bryan, Jennifer and Chang, Winston and McGowan, Lucy D'Agostino and Francois, Romain and Grolemund, Garrett and Hayes, Alex and Henry, Lionel and Hester, Jim and others},
##   journal={Journal of Open Source Software},
##   volume={4},
##   number={43},
##   pages={1686},
##   year={2019}
##  }
## @article{robinson2010edger,
##   title={edgeR: a Bioconductor package for differential expression analysis of digital gene expression data},
##   author={Robinson, Mark D and McCarthy, Davis J and Smyth, Gordon K},
##   journal={Bioinformatics},
##   volume={26},
##   number={1},
##   pages={139--140},
##   year={2010},
##   publisher={Oxford University Press}
##  }
## @article{robinson2010scaling,
##   title={A scaling normalization method for differential expression analysis of RNA-seq data},
##   author={Robinson, Mark D and Oshlack, Alicia},
##   journal={Genome biology},
##   volume={11},
##   number={3},
##   pages={1--9},
##   year={2010},
##   publisher={BioMed Central}
##  }
## @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2020},
##     url = {https://www.R-project.org/},
##   }
## @article{zhou2014robustly,
##   title={Robustly detecting differential expression in RNA sequencing data using observation weights},
##   author={Zhou, Xiaobei and Lindsay, Helen and Robinson, Mark D},
##   journal={Nucleic acids research},
##   volume={42},
##   number={11},
##   pages={e91--e91},
##   year={2014},
##   publisher={Oxford University Press}
##  }
## @article{McCarthy2009,
##   doi = {10.1093/bioinformatics/btp053},
##   url = {https://doi.org/10.1093/bioinformatics/btp053},
##   year = {2009},
##   month = jan,
##   publisher = {Oxford University Press ({OUP})},
##   volume = {25},
##   number = {6},
##   pages = {765--771},
##   author = {D. J. McCarthy and G. K. Smyth},
##   title = {Testing significance relative to a fold-change threshold is a {TREAT}},
##   journal = {Bioinformatics}
## }
##  @Article{tidybulk,
##   title = {tidybulk: an R tidy framework for modular transcriptomic data analysis},
##   author = {Stefano Mangiola and Ramyar Molania and Ruining Dong and Maria A. Doyle & Anthony T. Papenfuss},
##   journal = {Genome Biology},
##   year = {2021},
##   volume = {22},
##   number = {42},
##   url = {https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02233-7},
##   }
## @article{wickham2019welcome,
##   title={Welcome to the Tidyverse},
##   author={Wickham, Hadley and Averick, Mara and Bryan, Jennifer and Chang, Winston and McGowan, Lucy D'Agostino and Francois, Romain and Grolemund, Garrett and Hayes, Alex and Henry, Lionel and Hester, Jim and others},
##   journal={Journal of Open Source Software},
##   volume={4},
##   number={43},
##   pages={1686},
##   year={2019}
##  }
## @Article{,
##     title = {clusterProfiler: an R package for comparing biological themes among gene clusters},
##     author = {Guangchuang Yu and Li-Gen Wang and Yanyan Han and Qing-Yu He},
##     journal = {OMICS: A Journal of Integrative Biology},
##     year = {2012},
##     volume = {16},
##     number = {5},
##     pages = {284-287},
##     pmid = {22455463},
##     doi = {10.1089/omi.2011.0118},
##   }
## @Manual{,
##     title = {msigdbr: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format},
##     author = {Igor Dolgalev},
##     year = {2020},
##     note = {R package version 7.1.1},
##     url = {https://CRAN.R-project.org/package=msigdbr},
##   }
##   @Manual{,
##     title = {enrichplot: Visualization of Functional Enrichment Result},
##     author = {Guangchuang Yu},
##     year = {2021},
##     note = {R package version 1.11.2.994},
##     url = {https://yulab-smu.top/biomedical-knowledge-mining-book/},
##   }

## NULL

Session information

sessionInfo()

## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS:   /stornext/System/data/apps/R/R-4.0.5/lib64/R/lib/libRblas.so
## LAPACK: /stornext/System/data/apps/R/R-4.0.5/lib64/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] patchwork_1.1.1                DT_0.17                       
##  [3] tidyHeatmap_1.2.2              tidybulk_1.3.1                
##  [5] tidySummarizedExperiment_1.2.1 SummarizedExperiment_1.20.0   
##  [7] Biobase_2.50.0                 GenomicRanges_1.42.0          
##  [9] GenomeInfoDb_1.26.4            IRanges_2.24.1                
## [11] S4Vectors_0.28.1               BiocGenerics_0.36.0           
## [13] MatrixGenerics_1.2.1           matrixStats_0.58.0            
## [15] forcats_0.5.1                  stringr_1.4.0                 
## [17] dplyr_1.0.5                    purrr_0.3.4                   
## [19] readr_1.4.0                    tidyr_1.1.3                   
## [21] tibble_3.1.0                   ggplot2_3.3.3                 
## [23] tidyverse_1.3.0               
## 
## loaded via a namespace (and not attached):
##   [1] shadowtext_0.0.7       readxl_1.3.1           backports_1.2.1       
##   [4] circlize_0.4.12        fastmatch_1.1-0        plyr_1.8.6            
##   [7] igraph_1.2.6           lazyeval_0.2.2         splines_4.0.5         
##  [10] BiocParallel_1.24.1    crosstalk_1.1.1        digest_0.6.27         
##  [13] htmltools_0.5.1.1      GOSemSim_2.16.1        viridis_0.5.1         
##  [16] GO.db_3.12.1           fansi_0.4.2            magrittr_2.0.1        
##  [19] memoise_2.0.0          cluster_2.1.1          limma_3.46.0          
##  [22] ComplexHeatmap_2.6.2   graphlayouts_0.7.1     modelr_0.1.8          
##  [25] enrichplot_1.11.2.994  colorspace_2.0-0       blob_1.2.1            
##  [28] rvest_1.0.0            ggrepel_0.9.1          haven_2.3.1           
##  [31] xfun_0.22              crayon_1.4.1           RCurl_1.98-1.3        
##  [34] jsonlite_1.7.2         scatterpie_0.1.5       org.Mm.eg.db_3.12.0   
##  [37] ape_5.4-1              glue_1.4.2             polyclip_1.10-0       
##  [40] gtable_0.3.0           zlibbioc_1.36.0        XVector_0.30.0        
##  [43] GetoptLong_1.0.5       DelayedArray_0.16.3    shape_1.4.5           
##  [46] DOSE_3.16.0            scales_1.1.1           DBI_1.1.1             
##  [49] edgeR_3.32.1           Rcpp_1.0.6             widyr_0.1.3           
##  [52] viridisLite_0.3.0      clue_0.3-58            tidytree_0.3.3        
##  [55] bit_4.0.4              preprocessCore_1.52.1  htmlwidgets_1.5.3     
##  [58] httr_1.4.2             fgsea_1.17.1           RColorBrewer_1.1-2    
##  [61] ellipsis_0.3.1         pkgconfig_2.0.3        farver_2.1.0          
##  [64] sass_0.3.1             dbplyr_2.1.0           locfit_1.5-9.4        
##  [67] utf8_1.2.1             reshape2_1.4.4         tidyselect_1.1.0      
##  [70] labeling_0.4.2         rlang_0.4.10           nanny_0.1.8           
##  [73] AnnotationDbi_1.52.0   munsell_0.5.0          cellranger_1.1.0      
##  [76] tools_4.0.5            cachem_1.0.4           cli_2.4.0             
##  [79] generics_0.1.0         RSQLite_2.2.5          broom_0.7.6           
##  [82] evaluate_0.14          fastmap_1.1.0          yaml_2.2.1            
##  [85] ggtree_2.4.1           org.Hs.eg.db_3.12.0    knitr_1.31            
##  [88] bit64_4.0.5            fs_1.5.0               tidygraph_1.2.0       
##  [91] ggraph_2.0.5           nlme_3.1-152           aplot_0.0.6           
##  [94] DO.db_2.9              xml2_1.3.2             compiler_4.0.5        
##  [97] rstudioapi_0.13        plotly_4.9.3           png_0.1-7             
## [100] treeio_1.14.3          reprex_2.0.0           tweenr_1.0.2          
## [103] bslib_0.2.4            stringi_1.5.3          highr_0.8             
## [106] lattice_0.20-41        Matrix_1.3-2           vctrs_0.3.7           
## [109] pillar_1.5.1           lifecycle_1.0.0        BiocManager_1.30.12   
## [112] jquerylib_0.1.3        GlobalOptions_0.1.2    cowplot_1.1.1         
## [115] data.table_1.14.0      bitops_1.0-6           qvalue_2.22.0         
## [118] R6_2.5.0               gridExtra_2.3          MASS_7.3-53.1         
## [121] gtools_3.8.2           assertthat_0.2.1       rjson_0.2.20          
## [124] withr_2.4.1            GenomeInfoDbData_1.2.4 hms_1.0.0             
## [127] grid_4.0.5             rvcheck_0.1.8          rmarkdown_2.7         
## [130] Cairo_1.5-12.2         Rtsne_0.15             ggforce_0.3.3         
## [133] lubridate_1.7.10

Report