1 ) 🧬 Introduction to HCC and cfDNA

1.1 Project Overview

This script provides the core analytical pipeline for a PhD thesis developing cell-free DNA (cfDNA)-based biomarkers for hepatocellular carcinoma (HCC). It analyzes cfDNA fragmentomic features derived from whole-genome sequencing data across three cohorts: healthy controls (n=150), patients with chronic liver disease (n=186), and patients with HCC (n=139). The primary objective is to identify and validate robust biomarkers that accurately distinguish between these clinical groups, addressing the critical need for non-invasive HCC diagnostics.

2 ) 🧬 Data Preparation and Loading Packages

2.1 Install Necessary Packages

#' Install (when missing) and attach a set of packages
#'
#' For every name in `packages`, the package is installed if it is not yet
#' available and is then attached with `library()`.
#'
#' @param packages Character vector of package names.
#' @param bioconductor If `TRUE`, missing packages are installed with
#'   `BiocManager::install()` (installing BiocManager first when needed);
#'   otherwise `install.packages()` is used.
#' @return `packages`, invisibly. The function is called for its side effects:
#'   it may set the `repos` option, install packages and attach them.
install_and_load <- function(packages, bioconductor = FALSE) {
  # Make sure a usable CRAN mirror is configured. The entry can be absent
  # altogether (NULL option, or a vector without a "CRAN" name), NA, or the
  # "@CRAN@" placeholder; a plain `==` comparison would yield NA for the first
  # cases and break the `if`. Other configured repositories are preserved.
  repos <- getOption("repos")
  cran <- if ("CRAN" %in% names(repos)) repos[["CRAN"]] else NA_character_
  if (is.na(cran) || identical(cran, "@CRAN@")) {
    if (is.null(repos)) {
      repos <- character()
    }
    repos["CRAN"] <- "https://cran.r-project.org"
    options(repos = repos)
  }

  for (package in packages) {
    # Install only when the package cannot be found
    if (!requireNamespace(package, quietly = TRUE)) {
      message("Installing package: ", package)

      # Install from the appropriate source
      if (bioconductor) {
        if (!requireNamespace("BiocManager", quietly = TRUE)) {
          install.packages("BiocManager")
        }
        BiocManager::install(package, update = FALSE, ask = FALSE)
      } else {
        install.packages(package)
      }

      # install.packages() reports many failures as warnings only, so verify
      # the result and fail with an explicit message.
      if (!requireNamespace(package, quietly = TRUE)) {
        stop("Package '", package, "' could not be installed.", call. = FALSE)
      }
    }

    # Attach the package quietly
    suppressWarnings(suppressPackageStartupMessages(
      library(package, character.only = TRUE)
    ))
  }

  invisible(packages)
}

# --- Package Installation ---

# 1. remotes goes first: the GitHub installations further down call it
install_and_load("remotes")

# 2. maftools: fetch the GitHub build only when no installed copy is found,
#    then attach it without startup chatter
if (!requireNamespace("maftools", quietly = TRUE)) {
  message("Installing maftools from GitHub...")
  remotes::install_github(repo = "PoisonAlien/maftools", quiet = TRUE)
}
suppressPackageStartupMessages(suppressWarnings(library("maftools")))


# 3. Define CRAN and Bioconductor packages
# Every package is listed exactly once and under the repository that actually
# distributes it, so CRAN packages are installed with install.packages() and
# only genuine Bioconductor packages go through BiocManager.
cran_packages <- c(
  "dunn.test", "PMCMRplus", "FSA", "rstatix",
  "evaluate", "knitr", "dplyr", "ggplot2", "magrittr", "grid",
  "broom.helpers", "modelsummary", "kableExtra", "tinytex", "ggplotify",
  "tidyverse", "car", "multcomp", "agricolae", "coin", "moments",
  "nortest", "reshape2", "dendextend", "viridis", "tidytext",
  # Survival-analysis and plotting helpers (all distributed on CRAN)
  "ggsurvfit", "gtsummary", "tidycmprsk", "ranger", "survminer",
  "pheatmap", "patchwork", "RColorBrewer"
)

bioc_packages <- c("survcomp")

# 4. Install and load packages from CRAN and Bioconductor
install_and_load(cran_packages)
install_and_load(bioc_packages, bioconductor = TRUE)

# 5. Install TinyTeX for PDF generation if no LaTeX distribution is available
if (!requireNamespace("tinytex", quietly = TRUE)) {
  install.packages("tinytex")
}
# tinytex::is_tinytex() is FALSE for any other LaTeX distribution (MacTeX,
# TeX Live, MiKTeX, ...). install_tinytex() with its default force = FALSE
# refuses to install when it detects one, which would abort a non-interactive
# run. TinyTeX is therefore installed only when no LaTeX toolchain is found on
# the PATH; an existing distribution is used as-is.
if (!tinytex::is_tinytex() &&
    !any(nzchar(Sys.which(c("tlmgr", "pdflatex", "xelatex", "lualatex"))))) {
  tinytex::install_tinytex()
}

# 6. Install UHHformats from GitHub (uham-bio/UHHformats)
# UHHformats is treated as optional: a failed installation (no network, build
# error, ...) is reported and the script continues, so that the warning in
# step 7 is reached instead of the whole run aborting.
if (!requireNamespace("UHHformats", quietly = TRUE)) {
  message("Installing UHHformats from GitHub (uham-bio/UHHformats)...")
  # build_vignettes = FALSE for a faster, non-interactive installation.
  tryCatch(
    remotes::install_github(
      "uham-bio/UHHformats",
      quiet = TRUE,
      build_vignettes = FALSE
    ),
    error = function(e) {
      message("UHHformats installation failed: ", conditionMessage(e))
    }
  )
}

# 7. Load UHHformats when it is available; otherwise warn and carry on
if (requireNamespace("UHHformats", quietly = TRUE)) {
  suppressWarnings(suppressPackageStartupMessages(library(UHHformats)))
} else {
  warning("UHHformats package could not be installed or loaded.")
}


# --- Bibliography Generation ---

# Make sure the output folder for the bibliography is present
if (!dir.exists("bib")) {
  dir.create("bib")
}

# Packages to cite: everything attached right now, plus the two rendering
# packages, which may be in use without being attached. UHHformats needs no
# special handling: whenever it is attached, .packages() already reports it.
all_packages <- union(.packages(), c("rmarkdown", "knitr"))

# Write the .bib file
message("Writing bibliography file to bib/packages.bib...")
knitr::write_bib(x = all_packages, file = "bib/packages.bib")

message("Package setup complete.")

2.2 Load and Inspect Data

# Import the training set (one row per sample) from its CSV file.
# NOTE(review): the path is absolute and machine-specific; the file must exist
# at exactly this location for the script to run.
data <- utils::read.csv(
  file = "/Users/sultanalharbi/Library/CloudStorage/OneDrive-Personal/Projects/Thesis_Chapters/Chapter 3 (Diagnostic Indicators for HCC)/HCC_Diagnostic_Tables/Training_set_final_01062026.csv"
)

# Print a compact overview of the columns and their types
utils::str(data)
## 'data.frame':    475 obs. of  362 variables:
##  $ Sample_ID       : chr  "CGPLH1000P_Healthy" "CGPLH1001P_Healthy" "CGPLH1002P_Healthy" "CGPLH1003P_Healthy" ...
##  $ Group           : chr  "Healthy" "Healthy" "Healthy" "Healthy" ...
##  $ TF_Score        : num  0.00127 0.00105 0.00113 0.00156 0.00102 ...
##  $ DELFI_Score     : num  0.225 0.233 0.206 0.258 0.23 ...
##  $ mtcfDNA_fraction: num  0.000713 0.000513 0.0011 0.000732 0.003268 ...
##  $ P.1.10.         : num  1.92e-06 2.76e-06 2.85e-06 2.16e-06 2.21e-06 2.56e-06 2.16e-06 2.10e-06 2.50e-06 2.72e-06 ...
##  $ P.1.20.         : num  2.52e-05 2.68e-05 2.46e-05 2.67e-05 3.30e-05 2.56e-05 2.13e-05 1.95e-05 2.11e-05 1.85e-05 ...
##  $ P.1.30.         : num  7.86e-06 1.03e-05 1.06e-05 7.79e-06 8.09e-06 9.05e-06 7.92e-06 7.20e-06 9.69e-06 9.05e-06 ...
##  $ P.1.40.         : num  2.57e-05 2.90e-05 2.95e-05 2.56e-05 3.20e-05 3.73e-05 2.64e-05 2.40e-05 2.58e-05 2.38e-05 ...
##  $ P.1.50.         : num  5.72e-05 6.03e-05 5.76e-05 5.61e-05 6.95e-05 7.63e-05 5.38e-05 4.71e-05 5.69e-05 4.95e-05 ...
##  $ P.1.60.         : num  0.000108 0.000106 0.000102 0.000108 0.00013 ...
##  $ P.1.70.         : num  0.00019 0.000181 0.000171 0.000193 0.000231 ...
##  $ P.1.80.         : num  0.000349 0.000326 0.000299 0.000355 0.000425 ...
##  $ P.1.90.         : num  0.000721 0.000859 0.000584 0.000886 0.001087 ...
##  $ P.1.100.        : num  0.00226 0.00244 0.00189 0.00271 0.00317 ...
##  $ P.10.20.        : num  2.86e-06 3.85e-06 3.97e-06 2.63e-06 2.69e-06 2.78e-06 2.47e-06 2.66e-06 3.25e-06 2.90e-06 ...
##  $ P.20.30.        : num  3.08e-06 3.73e-06 3.78e-06 2.99e-06 3.19e-06 3.71e-06 3.29e-06 2.43e-06 3.94e-06 3.43e-06 ...
##  $ P.20.40.        : num  8.12e-05 8.14e-05 7.81e-05 7.97e-05 9.62e-05 7.68e-05 6.98e-05 5.97e-05 6.08e-05 5.71e-05 ...
##  $ P.30.40.        : num  1.78e-05 1.87e-05 1.89e-05 1.78e-05 2.39e-05 2.83e-05 1.85e-05 1.68e-05 1.62e-05 1.48e-05 ...
##  $ P.30.60.        : num  1.01e-04 9.60e-05 9.12e-05 1.00e-04 1.22e-04 ...
##  $ P.40.50.        : num  3.15e-05 3.13e-05 2.80e-05 3.04e-05 3.75e-05 3.90e-05 2.74e-05 2.31e-05 3.10e-05 2.57e-05 ...
##  $ P.40.60.        : num  0.00019 0.000177 0.000176 0.000186 0.000219 ...
##  $ P.40.80.        : num  0.000323 0.000297 0.00027 0.000329 0.000393 ...
##  $ P.50.60.        : num  5.13e-05 4.60e-05 4.42e-05 5.21e-05 6.09e-05 5.85e-05 4.35e-05 3.89e-05 4.74e-05 4.54e-05 ...
##  $ P.50.100.       : num  0.0022 0.00238 0.00183 0.00265 0.0031 ...
##  $ P.60.70.        : num  8.15e-05 7.49e-05 6.96e-05 8.46e-05 1.01e-04 ...
##  $ P.60.80.        : num  0.000363 0.000338 0.000321 0.000365 0.00043 ...
##  $ P.60.90.        : num  0.000612 0.000753 0.000482 0.000778 0.000956 ...
##  $ P.60.120.       : num  0.0154 0.015 0.0128 0.018 0.0194 ...
##  $ P.70.80.        : num  0.000159 0.000145 0.000128 0.000162 0.000194 ...
##  $ P.70.140.       : num  0.0779 0.0814 0.0706 0.0915 0.0869 ...
##  $ P.80.90.        : num  0.000372 0.000533 0.000285 0.000531 0.000662 ...
##  $ P.80.100.       : num  0.00197 0.00218 0.00167 0.00241 0.00282 ...
##  $ P.80.120.       : num  0.0151 0.0147 0.0126 0.0178 0.0191 ...
##  $ P.80.160.       : num  0.321 0.321 0.301 0.341 0.31 ...
##  $ P.90.100.       : num  0.00154 0.00158 0.0013 0.00182 0.00209 ...
##  $ P.90.120.       : num  0.0148 0.0142 0.0123 0.0172 0.0184 ...
##  $ P.90.180.       : num  0.815 0.783 0.789 0.805 0.755 ...
##  $ P.100.110.      : num  0.0043 0.00399 0.00346 0.00494 0.00527 ...
##  $ P.100.120.      : num  0.013 0.0122 0.0108 0.0151 0.0159 ...
##  $ P.100.150.      : num  0.172 0.177 0.16 0.193 0.175 ...
##  $ P.100.200.      : num  0.956 0.942 0.946 0.951 0.93 ...
##  $ P.110.120.      : num  0.00893 0.00863 0.00754 0.01049 0.01107 ...
##  $ P.120.130.      : num  0.0179 0.0189 0.016 0.0215 0.0212 ...
##  $ P.120.140.      : num  0.0609 0.0638 0.0558 0.0713 0.0653 ...
##  $ P.120.150.      : num  0.159 0.164 0.149 0.177 0.159 ...
##  $ P.120.160.      : num  0.305 0.306 0.289 0.323 0.291 ...
##  $ P.120.180.      : num  0.8 0.769 0.777 0.788 0.737 ...
##  $ P.130.140.      : num  0.0447 0.0476 0.0419 0.0521 0.0464 ...
##  $ P.140.150.      : num  0.0963 0.0979 0.091 0.1036 0.0915 ...
##  $ P.140.160.      : num  0.236 0.23 0.222 0.242 0.216 ...
##  $ P.140.210.      : num  0.9 0.889 0.902 0.884 0.877 ...
##  $ P.150.160.      : num  0.146 0.142 0.14 0.146 0.132 ...
##  $ P.150.180.      : num  0.641 0.604 0.628 0.61 0.578 ...
##  $ P.150.200.      : num  0.784 0.765 0.786 0.759 0.755 ...
##  $ P.160.170.      : num  0.275 0.254 0.266 0.257 0.238 ...
##  $ P.160.180.      : num  0.481 0.444 0.471 0.451 0.43 ...
##  $ P.160.200.      : num  0.637 0.623 0.647 0.613 0.623 ...
##  $ P.160.240.      : num  0.675 0.672 0.693 0.653 0.68 ...
##  $ P.170.180.      : num  0.22 0.208 0.222 0.207 0.208 ...
##  $ P.180.190.      : num  0.0994 0.1079 0.1083 0.1018 0.1165 ...
##  $ P.180.200.      : num  0.138 0.154 0.152 0.143 0.17 ...
##  $ P.180.210.      : num  0.163 0.187 0.183 0.17 0.207 ...
##  $ P.180.240.      : num  0.18 0.21 0.205 0.188 0.234 ...
##  $ P.180.270.      : num  0.183 0.214 0.209 0.192 0.239 ...
##  $ P.190.200.      : num  0.0428 0.0528 0.0499 0.0464 0.0604 ...
##  $ P.200.210.      : num  0.0204 0.0267 0.0249 0.0222 0.0306 ...
##  $ P.200.220.      : num  0.0295 0.0379 0.0359 0.0315 0.0441 ...
##  $ P.200.240.      : num  0.0376 0.049 0.0465 0.04 0.0571 ...
##  $ P.200.250.      : num  0.039 0.0508 0.0484 0.0416 0.0594 ...
##  $ P.200.300.      : num  0.042 0.0557 0.0518 0.046 0.0667 ...
##  $ P.210.220.      : num  0.00991 0.01304 0.01238 0.01044 0.01515 ...
##  $ P.210.240.      : num  0.0172 0.0223 0.0215 0.0178 0.0265 ...
##  $ P.210.280.      : num  0.0216 0.0277 0.0269 0.0227 0.0327 ...
##  $ P.220.230.      : num  0.00472 0.00621 0.00604 0.00489 0.0075 ...
##  $ P.220.240.      : num  0.00705 0.00887 0.0088 0.00712 0.01093 ...
##  $ P.230.240.      : num  0.00253 0.00307 0.0031 0.00247 0.00384 ...
##  $ P.240.250.      : num  0.00142 0.00183 0.00192 0.0016 0.00229 ...
##  $ P.240.260.      : num  0.00249 0.003 0.0031 0.00266 0.00373 ...
##  $ P.240.270.      : num  0.00357 0.00422 0.00435 0.00379 0.00506 ...
##  $ P.240.280.      : num  0.0044 0.00536 0.00537 0.0049 0.00624 ...
##  $ P.240.300.      : num  0.0044 0.00672 0.00537 0.00606 0.00964 ...
##  $ P.240.320.      : num  0.0044 0.00672 0.00537 0.00606 0.00964 ...
##  $ P.250.260.      : num  0.00114 0.0013 0.00131 0.00115 0.00158 ...
##  $ P.250.300.      : num  0.00298 0.00489 0.00345 0.00445 0.00735 ...
##  $ P.260.270.      : num  0.000999 0.001083 0.001122 0.001045 0.001196 ...
##  $ P.260.280.      : num  0.002 0.00213 0.00218 0.00208 0.00229 ...
##  $ P.270.280.      : num  0.000833 0.001141 0.00102 0.001107 0.001177 ...
##  $ P.270.300.      : num  0.000833 0.002501 0.00102 0.002262 0.004578 ...
##  $ P.270.360.      : num  0.000833 0.002501 0.00102 0.002262 0.004578 ...
##  $ P.280.290.      : num  0 0.00136 0 0.00116 0.00148 ...
##  $ P.280.300.      : num  0.00366 0.00375 0.00361 0.0036 0.00353 ...
##  $ P.280.320.      : num  0 0.00136 0 0.00116 0.0034 ...
##  $ P.280.350.      : num  0 0.00136 0 0.00116 0.0034 ...
##  $ P.290.300.      : num  0 0 0 0 0.00192 ...
##  $ P.300.310.      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ P.300.320.      : num  0.00758 0.00831 0.00767 0.00752 0.00754 ...
##  $ P.300.330.      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ P.300.350.      : num  0 0 0 0 0 0 0 0 0 0 ...
##   [list output truncated]
# Show the first six rows of the imported table
utils::head(data, n = 6L)
##            Sample_ID   Group TF_Score DELFI_Score mtcfDNA_fraction  P.1.10.
## 1 CGPLH1000P_Healthy Healthy 0.001271    0.224852         0.000713 1.92e-06
## 2 CGPLH1001P_Healthy Healthy 0.001048    0.232668         0.000513 2.76e-06
## 3 CGPLH1002P_Healthy Healthy 0.001135    0.206212         0.001100 2.85e-06
## 4 CGPLH1003P_Healthy Healthy 0.001559    0.257690         0.000732 2.16e-06
## 5 CGPLH1004P_Healthy Healthy 0.001024    0.230369         0.003268 2.21e-06
## 6 CGPLH1005P_Healthy Healthy 0.001137    0.223513         0.000627 2.56e-06
##    P.1.20.  P.1.30.  P.1.40.  P.1.50.    P.1.60.    P.1.70.    P.1.80.
## 1 2.52e-05 7.86e-06 2.57e-05 5.72e-05 0.00010848 0.00018996 0.00034861
## 2 2.68e-05 1.03e-05 2.90e-05 6.03e-05 0.00010637 0.00018123 0.00032618
## 3 2.46e-05 1.06e-05 2.95e-05 5.76e-05 0.00010179 0.00017138 0.00029913
## 4 2.67e-05 7.79e-06 2.56e-05 5.61e-05 0.00010820 0.00019276 0.00035508
## 5 3.30e-05 8.09e-06 3.20e-05 6.95e-05 0.00013048 0.00023127 0.00042499
## 6 2.56e-05 9.05e-06 3.73e-05 7.63e-05 0.00013486 0.00021398 0.00035623
##      P.1.90.   P.1.100. P.10.20. P.20.30. P.20.40. P.30.40.   P.30.60. P.40.50.
## 1 0.00072056 0.00226138 2.86e-06 3.08e-06 8.12e-05 1.78e-05 0.00010062 3.15e-05
## 2 0.00085920 0.00243872 3.85e-06 3.73e-06 8.14e-05 1.87e-05 0.00009600 3.13e-05
## 3 0.00058365 0.00188779 3.97e-06 3.78e-06 7.81e-05 1.89e-05 0.00009120 2.80e-05
## 4 0.00088615 0.00270756 2.63e-06 2.99e-06 7.97e-05 1.78e-05 0.00010042 3.04e-05
## 5 0.00108656 0.00317432 2.69e-06 3.19e-06 9.62e-05 2.39e-05 0.00012239 3.75e-05
## 6 0.00082914 0.00229093 2.78e-06 3.71e-06 7.68e-05 2.83e-05 0.00012581 3.90e-05
##     P.40.60.   P.40.80. P.50.60.  P.50.100.  P.60.70.   P.60.80.   P.60.90.
## 1 0.00019020 0.00032294 5.13e-05 0.00220422 0.0000815 0.00036289 0.00061207
## 2 0.00017702 0.00029717 4.60e-05 0.00237837 0.0000749 0.00033776 0.00075284
## 3 0.00017594 0.00026958 4.42e-05 0.00183021 0.0000696 0.00032051 0.00048186
## 4 0.00018552 0.00032945 5.21e-05 0.00265149 0.0000846 0.00036547 0.00077794
## 5 0.00021883 0.00039295 6.09e-05 0.00310479 0.0001008 0.00042968 0.00095609
## 6 0.00017804 0.00031889 5.85e-05 0.00221458 0.0000791 0.00031872 0.00069428
##    P.60.120.   P.70.80.  P.70.140.   P.80.90.  P.80.100.  P.80.120. P.80.160.
## 1 0.01537820 0.00015865 0.07794972 0.00037195 0.00197408 0.01513807 0.3205178
## 2 0.01495560 0.00014495 0.08136730 0.00053302 0.00217635 0.01473579 0.3210052
## 3 0.01278487 0.00012775 0.07057450 0.00028452 0.00166980 0.01258753 0.3011379
## 4 0.01802736 0.00016232 0.09150606 0.00053107 0.00240517 0.01778048 0.3410335
## 5 0.01938781 0.00019371 0.08693940 0.00066158 0.00281820 0.01909330 0.3101904
## 6 0.01450267 0.00014225 0.07979761 0.00047291 0.00197823 0.01428130 0.3088499
##    P.90.100.  P.90.120. P.90.180. P.100.110. P.100.120. P.100.150. P.100.200.
## 1 0.00154083 0.01476612 0.8150515 0.00429833 0.01297260  0.1722271  0.9557386
## 2 0.00157952 0.01420276 0.7827151 0.00399373 0.01223760  0.1770198  0.9418744
## 3 0.00130415 0.01230300 0.7893111 0.00346127 0.01077330  0.1598179  0.9462750
## 4 0.00182141 0.01724941 0.8048547 0.00494235 0.01506375  0.1925962  0.9512647
## 5 0.00208776 0.01843172 0.7552904 0.00527143 0.01594124  0.1754659  0.9300836
## 6 0.00146179 0.01380840 0.7618336 0.00372940 0.01195419  0.1719612  0.9334588
##   P.110.120. P.120.130. P.120.140. P.120.150. P.120.160. P.120.180. P.130.140.
## 1 0.00892697 0.01792847 0.06089326  0.1590018  0.3053798  0.8002853 0.04472454
## 2 0.00862952 0.01887180 0.06377190  0.1643966  0.3062694  0.7685123 0.04761476
## 3 0.00753759 0.01597703 0.05579454  0.1488190  0.2885504  0.7770081 0.04188220
## 4 0.01048565 0.02149663 0.07131890  0.1771682  0.3232530  0.7876053 0.05206662
## 5 0.01107253 0.02122286 0.06533301  0.1591220  0.2910971  0.7368586 0.04642953
## 6 0.00861721 0.01940487 0.06273912  0.1596146  0.2945686  0.7480252 0.04596919
##   P.140.150. P.140.160. P.140.210. P.150.160. P.150.180. P.150.200. P.160.170.
## 1 0.09634883  0.2357995  0.9002941  0.1463779  0.6412835  0.7835115  0.2751536
## 2 0.09791000  0.2300145  0.8894173  0.1418729  0.6041157  0.7648546  0.2538522
## 3 0.09095976  0.2222594  0.9023600  0.1397314  0.6281891  0.7864572  0.2661824
## 4 0.10360499  0.2421560  0.8844456  0.1460848  0.6104370  0.7586685  0.2569860
## 5 0.09146957  0.2156335  0.8767035  0.1319751  0.5777367  0.7546177  0.2376068
## 6 0.09424053  0.2200718  0.8855377  0.1349540  0.5884106  0.7614976  0.2419553
##   P.160.180. P.160.200. P.160.240. P.170.180. P.180.190. P.180.200. P.180.210.
## 1  0.4813707  0.6371336  0.6747355  0.2197520 0.09943374  0.1379590  0.1626618
## 2  0.4438643  0.6229817  0.6719493  0.2083907 0.10791691  0.1538091  0.1873916
## 3  0.4709955  0.6467258  0.6931947  0.2222753 0.10832770  0.1521923  0.1832111
## 4  0.4505759  0.6125837  0.6525562  0.2073662 0.10180785  0.1433754  0.1704035
## 5  0.4303480  0.6226425  0.6797481  0.2081548 0.11648551  0.1704273  0.2074972
## 6  0.4357395  0.6265435  0.6824650  0.2115013 0.11524121  0.1657891  0.2028865
##   P.180.240. P.180.270. P.190.200. P.200.210. P.200.220. P.200.240. P.200.250.
## 1  0.1798299  0.1833952 0.04279428 0.02043379 0.02946796 0.03760186 0.03902489
## 2  0.2097064  0.2139251 0.05282196 0.02665271 0.03794541 0.04896758 0.05079826
## 3  0.2047370  0.2090856 0.04994038 0.02494302 0.03587537 0.04646890 0.04838731
## 4  0.1882039  0.1919971 0.04642362 0.02217208 0.03152955 0.03997246 0.04157358
## 5  0.2339866  0.2390447 0.06039549 0.03061622 0.04409517 0.05710556 0.05939157
## 6  0.2290084  0.2343700 0.05784576 0.02979956 0.04281935 0.05592144 0.05823635
##   P.200.300. P.210.220. P.210.240. P.210.280. P.220.230. P.220.240. P.230.240.
## 1 0.04199998 0.00991246 0.01716807 0.02156618 0.00472396 0.00705261 0.00253164
## 2 0.05568687 0.01304123 0.02231487 0.02767407 0.00620512 0.00887315 0.00306851
## 3 0.05183716 0.01238131 0.02152589 0.02689415 0.00603965 0.00879559 0.00310493
## 4 0.04602770 0.01043997 0.01780038 0.02270057 0.00489064 0.00712120 0.00246977
## 5 0.06674208 0.01515415 0.02648934 0.03272438 0.00749696 0.01093269 0.00383823
## 6 0.06425028 0.01496026 0.02612189 0.03287465 0.00738315 0.01067695 0.00377848
##   P.240.250. P.240.260. P.240.270. P.240.280. P.240.300. P.240.320. P.250.260.
## 1 0.00142303 0.00248932 0.00356536 0.00439812 0.00439812 0.00439812 0.00114338
## 2 0.00183069 0.00300115 0.00421866 0.00535920 0.00671929 0.00671929 0.00130496
## 3 0.00191841 0.00310136 0.00434861 0.00536826 0.00536826 0.00536826 0.00130781
## 4 0.00160112 0.00266020 0.00379321 0.00490019 0.00605525 0.00605525 0.00114722
## 5 0.00228601 0.00372674 0.00505813 0.00623504 0.00963652 0.00963652 0.00157565
## 6 0.00231491 0.00383380 0.00536159 0.00675276 0.00832884 0.00832884 0.00168950
##   P.250.300. P.260.270. P.260.280. P.270.280. P.270.300. P.270.360. P.280.290.
## 1 0.00297509 0.00099895 0.00199601 0.00083276 0.00083276 0.00083276 0.00000000
## 2 0.00488861 0.00108301 0.00212631 0.00114055 0.00250064 0.00250064 0.00136009
## 3 0.00344985 0.00112239 0.00217718 0.00101965 0.00101965 0.00101965 0.00000000
## 4 0.00445412 0.00104487 0.00207949 0.00110698 0.00226204 0.00226204 0.00115506
## 5 0.00735051 0.00119647 0.00228592 0.00117691 0.00457839 0.00457839 0.00148196
## 6 0.00601393 0.00135718 0.00262894 0.00139118 0.00296726 0.00296726 0.00157608
##   P.280.300. P.280.320. P.280.350. P.290.300. P.300.310. P.300.320. P.300.330.
## 1 0.00365540 0.00000000 0.00000000 0.00000000          0 0.00757653          0
## 2 0.00374697 0.00136009 0.00136009 0.00000000          0 0.00830685          0
## 3 0.00361225 0.00000000 0.00000000 0.00000000          0 0.00766506          0
## 4 0.00360190 0.00115506 0.00115506 0.00000000          0 0.00751969          0
## 5 0.00352748 0.00340148 0.00340148 0.00191952          0 0.00753975          0
## 6 0.00413155 0.00157608 0.00157608 0.00000000          0 0.00826162          0
##   P.300.350. P.300.360. P.300.400. P.310.320. P.320.330. P.320.340. P.320.360.
## 1          0          0          0          0          0 0.00771922          0
## 2          0          0          0          0          0 0.01052846          0
## 3          0          0          0          0          0 0.00890304          0
## 4          0          0          0          0          0 0.00874517          0
## 5          0          0          0          0          0 0.00919688          0
## 6          0          0          0          0          0 0.01002904          0
##   P.320.400.        AACC        ACCC       CCCT        CTAA        TAAC
## 1          0 0.003143340 0.003796098 0.01160814 0.003353300 0.002407976
## 2          0 0.003270072 0.003847939 0.01147993 0.003222585 0.002438626
## 3          0 0.003227095 0.003806106 0.01159249 0.003332356 0.002349842
## 4          0 0.003214644 0.003820024 0.01137570 0.003280539 0.002496725
## 5          0 0.003312022 0.003976018 0.01182085 0.003274980 0.002498459
## 6          0 0.003258333 0.004091867 0.01236269 0.002900330 0.002368610
##          CCCC        CCTA        AGCC        ATCC       CCTC        AAAC
## 1 0.008116026 0.007619187 0.003965200 0.000682021 0.01057766 0.005357880
## 2 0.007779957 0.008131660 0.003893866 0.000720632 0.01044118 0.005669073
## 3 0.007786994 0.007705751 0.003970617 0.000731007 0.01029219 0.005507635
## 4 0.007704763 0.007862202 0.003879691 0.000696311 0.01027094 0.005543094
## 5 0.007843958 0.007937819 0.003893364 0.000730640 0.01029901 0.005559666
## 6 0.008698478 0.008276515 0.004015813 0.000647489 0.01134925 0.005377047
##          TACC       CCCA        GGCG        TCCC       CCTG        TGTA
## 1 0.002221607 0.01599779 0.000982856 0.002685428 0.01499361 0.005243176
## 2 0.002174759 0.01603900 0.000960930 0.002484062 0.01466085 0.005191527
## 3 0.002135104 0.01580311 0.001007948 0.002490876 0.01456118 0.005199914
## 4 0.002285280 0.01590348 0.000929112 0.002594528 0.01441496 0.005187107
## 5 0.002325028 0.01606972 0.000819623 0.002621185 0.01475627 0.005544642
## 6 0.002287620 0.01725183 0.000985393 0.002819663 0.01623076 0.005298354
##          GCCA        CACC        TGTG        CTGC        GAGC        GCAC
## 1 0.006634208 0.005338965 0.006685072 0.002419808 0.002377139 0.003512217
## 2 0.006685120 0.005174561 0.006295583 0.002036618 0.002244544 0.003399065
## 3 0.006727032 0.005280163 0.006392916 0.002259609 0.002324746 0.003412470
## 4 0.006392771 0.005251900 0.006360705 0.002154120 0.002310018 0.003368372
## 5 0.006260472 0.005278301 0.006810724 0.002084326 0.002194663 0.003315070
## 6 0.007091456 0.005495741 0.006615800 0.001996861 0.002335464 0.003692086
##          CACT        GAGG        AGTG        GGCC       CCAG        ACAG
## 1 0.006860647 0.004493395 0.004860461 0.004245889 0.01352731 0.006097119
## 2 0.006919334 0.004213183 0.005063661 0.004012020 0.01283922 0.006252359
## 3 0.006992908 0.004370662 0.005029584 0.004250947 0.01287564 0.006022268
## 4 0.006969073 0.004330431 0.004959480 0.003886799 0.01289435 0.006255211
## 5 0.007092915 0.004159884 0.004863522 0.003843969 0.01320733 0.006250288
## 6 0.006986652 0.004413781 0.004766078 0.004425069 0.01409552 0.006310755
##          AGAG        CTAG        ACGG        CATC        AGTC        TCCA
## 1 0.006745735 0.002222899 0.000584211 0.004747910 0.003175111 0.003421416
## 2 0.006632864 0.001989073 0.000557534 0.005109270 0.003304134 0.003323094
## 3 0.006739833 0.002138697 0.000562921 0.004951090 0.003284710 0.003282551
## 4 0.006654320 0.002050073 0.000577874 0.004841137 0.003231692 0.003406965
## 5 0.006625299 0.002025105 0.000558697 0.004825387 0.003224598 0.003496408
## 6 0.006412724 0.001764874 0.000598712 0.005058327 0.003202414 0.003517060
##          GTCA       CCTT       CCAA        GCCT        GGGC        CGCT
## 1 0.002818883 0.01250933 0.01106532 0.007529410 0.003123356 0.001046355
## 2 0.002889293 0.01304993 0.01126807 0.007200240 0.002885058 0.000954990
## 3 0.002935296 0.01271220 0.01083966 0.007386966 0.003017113 0.001033803
## 4 0.002797622 0.01271180 0.01115337 0.007094603 0.002858169 0.000978321
## 5 0.002730303 0.01304532 0.01128002 0.007273356 0.002703266 0.001016345
## 6 0.002793785 0.01336671 0.01164134 0.008093395 0.003097215 0.001066400
##          CTGG        GCCC        GTGC       CATT        TCTG        AGGG
## 1 0.003139391 0.004462544 0.002414671 0.01046451 0.003878363 0.003918849
## 2 0.002461517 0.004069508 0.002291887 0.01142773 0.003670433 0.003677182
## 3 0.002811702 0.004204431 0.002403493 0.01134326 0.003564955 0.003847731
## 4 0.002716803 0.004060421 0.002302470 0.01101025 0.003882286 0.003759980
## 5 0.002592534 0.004162305 0.002253149 0.01140778 0.003832350 0.003692555
## 6 0.002355941 0.004819395 0.002376794 0.01076693 0.003982810 0.003734851
##          TAGA        GGTG        AACG        TGAG        AAGA        CAGA
## 1 0.003380409 0.004947759 0.000754279 0.006734704 0.006303620 0.008585118
## 2 0.003164794 0.005185978 0.000780302 0.005915570 0.006444331 0.008039474
## 3 0.003238733 0.005253785 0.000794795 0.006183001 0.006347884 0.008313586
## 4 0.003537747 0.004854678 0.000794760 0.006420371 0.006457486 0.008403701
## 5 0.003278817 0.004381904 0.000784392 0.006389779 0.006523550 0.008210345
## 6 0.003051812 0.004990204 0.000790697 0.006184864 0.006216575 0.007946598
##          AAGG        GACA        GCGC        CGTT        CAGC        CAGT
## 1 0.004485928 0.004365255 0.000469641 0.002548662 0.005870412 0.005770747
## 2 0.004442671 0.004629845 0.000387803 0.002488599 0.005343859 0.005469051
## 3 0.004489450 0.004608153 0.000410958 0.002615870 0.005562922 0.005639730
## 4 0.004517059 0.004501993 0.000407429 0.002508818 0.005538345 0.005656090
## 5 0.004491022 0.004213852 0.000383742 0.002470399 0.005464605 0.005732111
## 6 0.004422970 0.004411018 0.000452471 0.002454553 0.005650255 0.005510834
##          AATA        GGAG        CGAA        GTGA        CCAC        GCTG
## 1 0.005788816 0.006804987 0.001601526 0.003861304 0.008828200 0.007389971
## 2 0.006378635 0.006759053 0.001735538 0.003818762 0.008715770 0.007072330
## 3 0.006024933 0.006990716 0.001660663 0.003892106 0.008596243 0.007155379
## 4 0.006295266 0.006531228 0.001670716 0.003766459 0.008536025 0.006997648
## 5 0.006391374 0.006142783 0.001611851 0.003663491 0.008647679 0.007010063
## 6 0.005864763 0.006809598 0.001933691 0.003747252 0.009290819 0.008023011
##          TGCC        AAGC        AGAA       AAAA        GAAG        GCTC
## 1 0.003279705 0.003344748 0.008898632 0.01537159 0.005572843 0.003854103
## 2 0.002931953 0.003308426 0.009042588 0.01607889 0.005705461 0.003703425
## 3 0.003037386 0.003313095 0.009011738 0.01553056 0.005734500 0.003758864
## 4 0.003117092 0.003356844 0.009109480 0.01611030 0.005626996 0.003601047
## 5 0.003211420 0.003331475 0.009042573 0.01640061 0.005396850 0.003627256
## 6 0.003270949 0.003290905 0.008527956 0.01533631 0.005594479 0.004289883
##          TGGG        GGCA        AATG        CCGG        GGAT        AACA
## 1 0.004933848 0.004251382 0.006137902 0.000873526 0.004451142 0.006032596
## 2 0.003932913 0.004584352 0.006716061 0.000716745 0.004675830 0.006387248
## 3 0.004268842 0.004531564 0.006496052 0.000781926 0.004759492 0.006243213
## 4 0.004637367 0.004345733 0.006742562 0.000768795 0.004434730 0.006304435
## 5 0.004293566 0.003777612 0.006551197 0.000754110 0.004234883 0.006400178
## 6 0.004206793 0.004139011 0.006614974 0.000843853 0.004436537 0.006098885
##          CATG       CCAT        GGGT        CTGA        CAGG        TATT
## 1 0.007569391 0.01128457 0.003155915 0.003912688 0.008035380 0.005848722
## 2 0.007712974 0.01187048 0.003029124 0.003478287 0.007206974 0.006243717
## 3 0.007687215 0.01169651 0.003155853 0.003744562 0.007630410 0.006085179
## 4 0.007624526 0.01129186 0.003009050 0.003667366 0.007531582 0.006206820
## 5 0.007697475 0.01194304 0.002939198 0.003544548 0.007410821 0.006552649
## 6 0.007713516 0.01195988 0.003111051 0.003300991 0.007547839 0.006084116
##          GAGA        GAAC        GGCT        GGGA        AGAC        GCAG
## 1 0.005167785 0.002494101 0.005539112 0.004814155 0.003923169 0.006299507
## 2 0.005152465 0.002712653 0.005647251 0.004797452 0.003990964 0.005873343
## 3 0.005211164 0.002677094 0.005841650 0.004886257 0.003973940 0.005930281
## 4 0.005180061 0.002573329 0.005347880 0.004711644 0.003940600 0.006087093
## 5 0.004961919 0.002383979 0.005169021 0.004326054 0.003890244 0.005862924
## 6 0.005163885 0.002573463 0.005771821 0.004711971 0.003840032 0.006476970
##          CCGC        CGAG        TTTC        ATTC        TGCA        AAAT
## 1 0.000780988 0.001698712 0.002097520 0.004608946 0.004911830 0.009192594
## 2 0.000663275 0.001557858 0.001966919 0.005006217 0.004635438 0.009945799
## 3 0.000707412 0.001656210 0.002008019 0.004944647 0.004786225 0.009544890
## 4 0.000684972 0.001561495 0.001981427 0.004724131 0.004824907 0.009840248
## 5 0.000676726 0.001605970 0.002014724 0.004918853 0.004921363 0.010060388
## 6 0.000768588 0.001746318 0.001911366 0.004648783 0.004816614 0.009305355
##          ACAA        GAAA        AGCA       CAAA        GCTA        CGCC
## 1 0.006171337 0.009231818 0.004941999 0.01255090 0.004053121 0.001372325
## 2 0.006592761 0.009595945 0.005133476 0.01278388 0.004324733 0.001209696
## 3 0.006243989 0.009501882 0.005102344 0.01253070 0.004200940 0.001293523
## 4 0.006473716 0.009570788 0.005064974 0.01293974 0.004098873 0.001246522
## 5 0.006604250 0.009334410 0.004909440 0.01279444 0.004015445 0.001251793
## 6 0.006419005 0.009337729 0.004796083 0.01231372 0.004400304 0.001417351
##          ACCA        CCCG        TCTC        CAAC        ACTA        GCGG
## 1 0.005080725 0.002558713 0.004093697 0.004593921 0.003474833 0.000541721
## 2 0.005531871 0.002353609 0.003950371 0.004703822 0.003940544 0.000458932
## 3 0.005282322 0.002456233 0.003846770 0.004657570 0.003678801 0.000489553
## 4 0.005406855 0.002390051 0.003987497 0.004627994 0.003819568 0.000479850
## 5 0.005290564 0.002400080 0.004031402 0.004614537 0.003729472 0.000456321
## 6 0.005317161 0.002739068 0.004370855 0.004564186 0.003614075 0.000531917
##          TGCT        GAGT        CGTG        AGGC        TAGC        CTCC
## 1 0.004325941 0.003243480 0.002110406 0.003842968 0.001805963 0.002159489
## 2 0.004062166 0.003199101 0.001972093 0.003630330 0.001673354 0.001993856
## 3 0.004259482 0.003272380 0.002070609 0.003752337 0.001685743 0.002144330
## 4 0.004174895 0.003256257 0.001947095 0.003688031 0.001852704 0.002020790
## 5 0.004510995 0.003161684 0.002028960 0.003627884 0.001702574 0.002017395
## 6 0.004353663 0.003251747 0.002182206 0.003715039 0.001645785 0.001915673
##          TCCT        ACTG        CAAG        AAGT        CGCA        TAAT
## 1 0.003568604 0.005243309 0.007984219 0.003997788 0.001026445 0.004573285
## 2 0.003372489 0.005609996 0.007733004 0.004044608 0.000968979 0.004696495
## 3 0.003351195 0.005423103 0.007838908 0.004001128 0.001014576 0.004560540
## 4 0.003523876 0.005578275 0.007859608 0.004078271 0.000965456 0.004872795
## 5 0.003807285 0.005541325 0.007854877 0.004148732 0.000972704 0.005070965
## 6 0.003721141 0.005561190 0.007730224 0.003914526 0.001058307 0.004602698
##          GTTT        TGAT        CTAT        TAGG        TGTT        AGAT
## 1 0.006804869 0.005122993 0.002581428 0.002513891 0.006762155 0.004930196
## 2 0.007163880 0.004912789 0.002579209 0.002274791 0.006630292 0.005145254
## 3 0.007267858 0.005027679 0.002694044 0.002375951 0.006805324 0.005074701
## 4 0.006817075 0.005012117 0.002571599 0.002517138 0.006435092 0.005081550
## 5 0.007155040 0.005347042 0.002652436 0.002315740 0.007359453 0.005120773
## 6 0.006923698 0.004932652 0.002270303 0.002179389 0.006792262 0.004786393
##          GGTA        GTTG        GTGG        CTCA        CTCT        GCAT
## 1 0.002870060 0.003376564 0.003566437 0.003094198 0.003005772 0.005038442
## 2 0.003267485 0.003474703 0.003299379 0.002995279 0.002939974 0.005116698
## 3 0.003148685 0.003504777 0.003448141 0.003177830 0.003165484 0.005095109
## 4 0.003022386 0.003325375 0.003382133 0.003027215 0.002953976 0.004974562
## 5 0.002662297 0.003288373 0.003290668 0.002925554 0.003027519 0.004999535
## 6 0.002761195 0.003375233 0.003406279 0.002669330 0.002699300 0.005241681
##          GCAA        GTTA        ACTC        CTTT        GATG        ATGG
## 1 0.005440724 0.002943015 0.004358262 0.006245362 0.003388352 0.004121460
## 2 0.005410740 0.003249334 0.004681451 0.006543135 0.003652007 0.004176217
## 3 0.005303607 0.003161048 0.004520534 0.006687937 0.003617392 0.004168506
## 4 0.005406147 0.003056497 0.004627696 0.006318227 0.003450826 0.004197746
## 5 0.005343133 0.002993722 0.004616205 0.006716541 0.003263003 0.004076584
## 6 0.005668901 0.002966407 0.004668901 0.006188615 0.003516342 0.004084061
##          ACAT        CTTA        ACCT        ACAC        TCTA        ATAA
## 1 0.005855655 0.002969234 0.002813078 0.004193852 0.002857366 0.005663778
## 2 0.006451875 0.003194448 0.003164432 0.004411904 0.002929612 0.005981439
## 3 0.006128811 0.003155583 0.003017956 0.004255299 0.002717927 0.005753238
## 4 0.006252238 0.003046275 0.003132394 0.004321986 0.002908809 0.005880477
## 5 0.006392468 0.003103288 0.003324196 0.004322701 0.003021261 0.006047005
## 6 0.006090774 0.002898769 0.003164602 0.004342751 0.003071032 0.005362798
##          ATTA        GCTT        TCTT        GGAA        GGAC        TTTT
## 1 0.004417292 0.006258664 0.004927835 0.008536094 0.002799435 0.004319542
## 2 0.004802583 0.006384170 0.004928078 0.008946009 0.002894148 0.004151216
## 3 0.004590292 0.006404351 0.004755073 0.009092054 0.002931029 0.004150173
## 4 0.004651993 0.006191502 0.004919361 0.008771971 0.002763495 0.004236433
## 5 0.004743898 0.006371150 0.005252016 0.008266213 0.002542547 0.004380218
## 6 0.004346556 0.006788260 0.005233443 0.008831690 0.002844446 0.003957291
##          ATTG        TCAT        TGAA        AGGA        ATAT        ATGA
## 1 0.003410815 0.003462585 0.007052969 0.005396273 0.001468234 0.004358856
## 2 0.003641516 0.003463807 0.006487209 0.005243061 0.001615794 0.004504147
## 3 0.003536992 0.003275265 0.006565186 0.005330356 0.001570025 0.004406081
## 4 0.003532023 0.003488476 0.007023487 0.005361357 0.001596078 0.004431255
## 5 0.003521527 0.003661071 0.006983456 0.005220604 0.001602779 0.004426046
## 6 0.003338839 0.003686720 0.006574793 0.005117297 0.001264129 0.004168622
##          TTAT        GGGG        GAAT        ACTT        CAAT        AAAG
## 1 0.001884501 0.003891977 0.007088838 0.005756035 0.006839506 0.007868787
## 2 0.001683528 0.003620677 0.008070227 0.006459737 0.007241238 0.008088941
## 3 0.001756647 0.003809209 0.007980009 0.006153975 0.007079549 0.007985002
## 4 0.001788964 0.003609932 0.007957836 0.006370329 0.007171097 0.008081400
## 5 0.001832920 0.003408374 0.007465953 0.006501640 0.007291196 0.008188220
## 6 0.001504532 0.003772753 0.007980372 0.006167098 0.006893226 0.007762598
##          TTTA        TAAA        ATGT        TGGC        TGAC        CGGT
## 1 0.002411895 0.007073487 0.000852236 0.003108970 0.003063718 0.000777142
## 2 0.002372222 0.006915808 0.000852732 0.002516794 0.002814146 0.000706340
## 3 0.002373151 0.006773970 0.000867622 0.002580705 0.002834471 0.000743252
## 4 0.002399141 0.007473299 0.000857021 0.003117674 0.002970048 0.000725689
## 5 0.002478647 0.007416666 0.000897491 0.002863841 0.003010378 0.000717014
## 6 0.002256466 0.006744598 0.000711161 0.002816953 0.002951494 0.000755326
##          GTAA        CGGA        AACT        CTTG        AGGT        TATG
## 1 0.003434688 0.000932764 0.001593925 0.002965998 0.003529409 0.003608111
## 2 0.003576210 0.000850709 0.001798907 0.002914236 0.003419398 0.003684407
## 3 0.003547938 0.000902080 0.001733929 0.003038415 0.003488873 0.003518017
## 4 0.003477027 0.000883568 0.001809645 0.002861472 0.003461363 0.003760027
## 5 0.003447711 0.000873376 0.001887264 0.002906155 0.003446528 0.003909733
## 6 0.003387975 0.000928468 0.001701560 0.002768320 0.003368180 0.003671520
##          AGTT        GATT        GGTT        TACA        GACC        AATC
## 1 0.004926187 0.003791570 0.005096551 0.004399461 0.002212877 0.002817636
## 2 0.005298467 0.004195235 0.005350579 0.004393435 0.002190193 0.003135515
## 3 0.005220761 0.004219526 0.005533558 0.004309540 0.002231612 0.003013841
## 4 0.005191180 0.004010253 0.005006958 0.004617017 0.002138347 0.003025705
## 5 0.005297593 0.003946578 0.005075573 0.004729375 0.002171606 0.002954958
## 6 0.004907671 0.003997041 0.005246849 0.004477059 0.002376758 0.003027746
##          TATA        CACA        TATC        TCAC        AATT        GTGT
## 1 0.004613623 0.009214834 0.002801572 0.002501613 0.006263356 0.003720202
## 2 0.004940174 0.009294475 0.002928355 0.002391891 0.006918467 0.003716114
## 3 0.004665784 0.009290449 0.002805950 0.002318472 0.006598176 0.003784602
## 4 0.004947386 0.009417611 0.002852602 0.002516383 0.006797464 0.003675182
## 5 0.005246691 0.009327507 0.002948127 0.002436781 0.006945589 0.003606494
## 6 0.004891197 0.009197249 0.002884825 0.002564024 0.006410678 0.003569982
##          AGTA        CCGT        TAAG        TTAA        CTGT        GTTC
## 1 0.003639689 0.000793340 0.003465896 0.002201460 0.003017456 0.002446695
## 2 0.003923823 0.000710603 0.003317025 0.001881063 0.002687595 0.002508802
## 3 0.003771868 0.000763289 0.003297005 0.001942089 0.002947524 0.002568898
## 4 0.003833282 0.000724855 0.003573447 0.002099722 0.002813223 0.002373617
## 5 0.003792583 0.000759291 0.003464440 0.002026647 0.002886038 0.002427799
## 6 0.003539815 0.000799796 0.003210256 0.001674390 0.002531685 0.002520738
##          CCGA        GATA        AGCG        CGGG        CATA        AGCT
## 1 0.000815625 0.002538328 0.000681931 0.001455318 0.006407486 0.001709179
## 2 0.000720444 0.002872139 0.000661512 0.001264019 0.006941632 0.001887176
## 3 0.000783107 0.002760463 0.000690141 0.001369589 0.006593403 0.001869582
## 4 0.000749216 0.002735455 0.000677675 0.001311929 0.006819576 0.001857045
## 5 0.000764509 0.002534927 0.000644185 0.001288028 0.006842890 0.001869783
## 6 0.000821403 0.002565352 0.000654380 0.001421784 0.006536425 0.001716653
##          TGCG        ATAG        ATGC        TTGC        TGTC        TCAA
## 1 0.000538098 0.003205443 0.002823308 0.000860461 0.004168954 0.002978721
## 2 0.000472950 0.003298570 0.002795604 0.000710892 0.004003985 0.002820099
## 3 0.000515105 0.003231868 0.002821838 0.000745849 0.004095240 0.002686252
## 4 0.000495797 0.003257405 0.002805501 0.000822769 0.003930881 0.003067695
## 5 0.000494671 0.003240627 0.002783661 0.000734387 0.004233251 0.002936258
## 6 0.000505034 0.002960216 0.002700987 0.000658849 0.004228741 0.002908047
##          GGTC        TTCT        TTTG        GACT        TTGT        CACG
## 1 0.002783281 0.001367129 0.001797337 0.003526529 0.001388775 0.001412218
## 2 0.002856502 0.001184883 0.001611314 0.003687587 0.001172469 0.001317590
## 3 0.002935903 0.001281278 0.001657964 0.003724086 0.001229382 0.001386843
## 4 0.002616561 0.001266039 0.001696995 0.003682149 0.001306708 0.001372367
## 5 0.002539122 0.001284640 0.001648875 0.003583473 0.001319853 0.001342570
## 6 0.002919873 0.001067585 0.001471332 0.003720333 0.001089515 0.001416507
##          CGTA        CTTC        ATTT        CGTC        GTAT        TTGA
## 1 0.001143585 0.003182208 0.002335332 0.001130119 0.003120135 0.001778750
## 2 0.001148032 0.003212093 0.002808192 0.001070168 0.003326794 0.001518435
## 3 0.001149521 0.003322338 0.002718331 0.001111724 0.003276917 0.001592406
## 4 0.001145668 0.003123367 0.002668554 0.001036415 0.003198367 0.001710740
## 5 0.001163276 0.003180815 0.002815791 0.001067300 0.003165395 0.001555911
## 6 0.001172927 0.003076631 0.002409312 0.001176929 0.003006929 0.001420868
##          TGGT        GTAC        TTAG        TCGT        CTAC        TTGG
## 1 0.003090664 0.001526447 0.001202718 0.000187111 0.001599611 0.001142130
## 2 0.002531780 0.001625693 0.000949398 0.000154514 0.001548595 0.000848209
## 3 0.002615803 0.001625042 0.001018303 0.000160008 0.001597483 0.000939994
## 4 0.003127173 0.001560127 0.001120238 0.000184048 0.001532149 0.001066405
## 5 0.002994026 0.001527206 0.000969603 0.000175385 0.001508470 0.000838862
## 6 0.002695639 0.001516879 0.000790249 0.000170755 0.001379862 0.000706351
##          ACGC        CGCG        TCAG        ATAC        TGGA        GTAG
## 1 0.000441818 0.000303092 0.003048025 0.002594047 0.005320793 0.002728082
## 2 0.000404205 0.000261989 0.002736354 0.002725212 0.004508598 0.002706642
## 3 0.000424029 0.000286486 0.002702882 0.002646077 0.004725237 0.002772438
## 4 0.000422558 0.000266833 0.003089398 0.002663930 0.005365446 0.002662121
## 5 0.000414438 0.000259259 0.002765427 0.002643597 0.004909888 0.002600244
## 6 0.000448325 0.000308867 0.002775498 0.002426253 0.004697991 0.002633474
##          TACT        ACGT        GATC        ATCT        CGGC        CGAC
## 1 0.003054157 0.000659276 0.002869213 0.000989329 0.001023625 0.000602636
## 2 0.003084068 0.000643794 0.001605692 0.001123306 0.000894193 0.000567534
## 3 0.003028042 0.000650320 0.001308787 0.001138895 0.000943604 0.000586482
## 4 0.003205523 0.000650720 0.001102860 0.001105266 0.000919833 0.000577890
## 5 0.003376424 0.000652091 0.000986044 0.001180219 0.000891574 0.000578438
## 6 0.003171044 0.000659154 0.001108807 0.000939613 0.001025609 0.000623746
##          CGAT        ATCA        ACCG        TTCA        GTCC        GCCG
## 1 0.001001177 0.000798477 0.000658355 0.001315582 0.002078130 0.001028569
## 2 0.001040268 0.000919830 0.000682221 0.001148740 0.002017904 0.000950655
## 3 0.001057365 0.000883426 0.000685402 0.001239434 0.002088824 0.000972090
## 4 0.000929096 0.000860985 0.000687504 0.001227053 0.001983518 0.000944461
## 5 0.001032266 0.000816217 0.000667170 0.001213424 0.002012393 0.000899158
## 6 0.001075714 0.000724620 0.000700501 0.001020656 0.002107479 0.001115142
##          TAGT        ACGA        GCGT        CTCG        TCCG         TCGG
## 1 0.001897017 0.000445203 0.000541825 0.000452151 0.000314836 1.449760e-04
## 2 0.001755741 0.000429105 0.000486679 0.000405621 0.000270559 9.666526e-05
## 3 0.001751655 0.000433761 0.000510703 0.000462281 0.000290551 1.147230e-04
## 4 0.002029047 0.000443553 0.000492259 0.000428754 0.000302596 1.258440e-04
## 5 0.001925095 0.000440632 0.000490853 0.000403232 0.000288555 1.039900e-04
## 6 0.001704163 0.000450227 0.000548930 0.000403424 0.000301294 1.038710e-04
##          GTCT        TTCC         TCGC        GCGA        GACG        TTAC
## 1 0.001125680 0.001298820 1.218600e-04 0.000423290 0.000618106 0.000954172
## 2 0.001215737 0.001205823 9.069684e-05 0.000372340 0.000598504 0.000838743
## 3 0.001325181 0.001321673 9.800849e-05 0.000402036 0.000629187 0.000853758
## 4 0.001128699 0.001176067 1.063750e-04 0.000389690 0.000605333 0.000889465
## 5 0.001153988 0.001284245 1.001710e-04 0.000378005 0.000569993 0.000862367
## 6 0.001140750 0.001124222 1.083220e-04 0.000436552 0.000650522 0.000756080
##          TCGA        TACG        TTCG        GTCG        ATCG
## 1 0.000247343 0.000384822 0.000173007 0.000321636 0.000144962
## 2 0.000222970 0.000362282 0.000154977 0.000305733 0.000181928
## 3 0.000223930 0.000377985 0.000162757 0.000329073 0.000164899
## 4 0.000250430 0.000398811 0.000147736 0.000304530 0.000148900
## 5 0.000212033 0.000381698 0.000146035 0.000299367 0.000130454
## 6 0.000244082 0.000374388 0.000132925 0.000321178 0.000142401
# List the distinct labels found in the Group column, in order of first
# appearance. `[[ ]]` extracts the column by its exact name.
unique(data[["Group"]])
## [1] "Healthy" "CLD"     "HCC"
# Column-by-column overview of the whole table: character columns report
# length/class/mode, numeric columns report min, quartiles, mean and max.
summary(object = data)
##   Sample_ID            Group              TF_Score         DELFI_Score     
##  Length:475         Length:475         Min.   :0.001012   Min.   :0.09211  
##  Class :character   Class :character   1st Qu.:0.001572   1st Qu.:0.19719  
##  Mode  :character   Mode  :character   Median :0.010420   Median :0.23311  
##                                        Mean   :0.031400   Mean   :0.25739  
##                                        3rd Qu.:0.014970   3rd Qu.:0.28785  
##                                        Max.   :0.568900   Max.   :0.93001  
##  mtcfDNA_fraction      P.1.10.             P.1.20.             P.1.30.         
##  Min.   :0.000286   Min.   :1.000e-06   Min.   :4.490e-06   Min.   :6.490e-06  
##  1st Qu.:0.001092   1st Qu.:2.430e-06   1st Qu.:2.340e-05   1st Qu.:1.340e-05  
##  Median :0.002286   Median :2.960e-06   Median :2.800e-05   Median :4.120e-05  
##  Mean   :0.003830   Mean   :6.212e-06   Mean   :4.899e-05   Mean   :7.829e-05  
##  3rd Qu.:0.004765   3rd Qu.:3.985e-06   3rd Qu.:3.345e-05   3rd Qu.:5.360e-05  
##  Max.   :0.069923   Max.   :2.121e-04   Max.   :1.907e-03   Max.   :3.988e-03  
##     P.1.40.             P.1.50.             P.1.60.         
##  Min.   :1.780e-05   Min.   :3.780e-05   Min.   :0.0000703  
##  1st Qu.:3.475e-05   1st Qu.:6.895e-05   1st Qu.:0.0001302  
##  Median :5.880e-05   Median :1.013e-04   Median :0.0004005  
##  Mean   :1.126e-04   Mean   :2.693e-04   Mean   :0.0012657  
##  3rd Qu.:7.550e-05   3rd Qu.:1.565e-04   3rd Qu.:0.0007458  
##  Max.   :5.058e-03   Max.   :6.737e-03   Max.   :0.0207264  
##     P.1.70.             P.1.80.             P.1.90.             P.1.100.       
##  Min.   :0.0001194   Min.   :0.0002007   Min.   :0.0003532   Min.   :0.000720  
##  1st Qu.:0.0002159   1st Qu.:0.0003901   1st Qu.:0.0009353   1st Qu.:0.002497  
##  Median :0.0010549   Median :0.0023869   Median :0.0048540   Median :0.009615  
##  Mean   :0.0027901   Mean   :0.0050515   Mean   :0.0084436   Mean   :0.014177  
##  3rd Qu.:0.0020041   3rd Qu.:0.0042368   3rd Qu.:0.0082918   3rd Qu.:0.015368  
##  Max.   :0.0393278   Max.   :0.0608565   Max.   :0.0876294   Max.   :0.125969  
##     P.10.20.            P.20.30.            P.20.40.        
##  Min.   :2.040e-06   Min.   :2.430e-06   Min.   :1.360e-05  
##  1st Qu.:4.580e-06   1st Qu.:4.715e-06   1st Qu.:3.630e-05  
##  Median :2.060e-05   Median :1.650e-05   Median :5.160e-05  
##  Mean   :3.546e-05   Mean   :3.662e-05   Mean   :8.966e-05  
##  3rd Qu.:2.640e-05   3rd Qu.:2.555e-05   3rd Qu.:8.240e-05  
##  Max.   :1.892e-03   Max.   :2.081e-03   Max.   :3.151e-03  
##     P.30.40.            P.30.60.            P.40.50.        
##  Min.   :6.170e-06   Min.   :0.0000638   Min.   :1.350e-05  
##  1st Qu.:1.280e-05   1st Qu.:0.0001178   1st Qu.:3.165e-05  
##  Median :1.770e-05   Median :0.0003527   Median :4.090e-05  
##  Mean   :3.428e-05   Mean   :0.0011874   Mean   :1.567e-04  
##  3rd Qu.:2.545e-05   3rd Qu.:0.0006953   3rd Qu.:7.645e-05  
##  Max.   :1.070e-03   Max.   :0.0187529   Max.   :2.704e-03  
##     P.40.60.            P.40.80.            P.50.60.        
##  Min.   :0.0000949   Min.   :0.0001828   Min.   :3.250e-05  
##  1st Qu.:0.0001945   1st Qu.:0.0003524   1st Qu.:5.745e-05  
##  Median :0.0003388   Median :0.0023045   Median :2.951e-04  
##  Mean   :0.0011849   Mean   :0.0049389   Mean   :9.964e-04  
##  3rd Qu.:0.0006764   3rd Qu.:0.0040765   3rd Qu.:5.905e-04  
##  Max.   :0.0183577   Max.   :0.0584878   Max.   :1.565e-02  
##    P.50.100.            P.60.70.            P.60.80.        
##  Min.   :0.0006822   Min.   :0.0000490   Min.   :0.0002108  
##  1st Qu.:0.0024342   1st Qu.:0.0000901   1st Qu.:0.0003888  
##  Median :0.0094142   Median :0.0006482   Median :0.0019409  
##  Mean   :0.0139074   Mean   :0.0015244   Mean   :0.0038227  
##  3rd Qu.:0.0152070   3rd Qu.:0.0012002   3rd Qu.:0.0035351  
##  Max.   :0.1208964   Max.   :0.0186014   Max.   :0.0401301  
##     P.60.90.           P.60.120.           P.70.80.           P.70.140.      
##  Min.   :0.0002829   Min.   :0.004097   Min.   :0.0000813   Min.   :0.02530  
##  1st Qu.:0.0008043   1st Qu.:0.015009   1st Qu.:0.0001684   1st Qu.:0.07192  
##  Median :0.0043589   Median :0.027026   Median :0.0012942   Median :0.09071  
##  Mean   :0.0071779   Mean   :0.033651   Mean   :0.0022614   Mean   :0.10069  
##  3rd Qu.:0.0074344   3rd Qu.:0.039751   3rd Qu.:0.0022951   3rd Qu.:0.11967  
##  Max.   :0.0669030   Max.   :0.188646   Max.   :0.0215287   Max.   :0.30717  
##     P.80.90.           P.80.100.           P.80.120.          P.80.160.     
##  Min.   :0.0001525   Min.   :0.0005754   Min.   :0.003919   Min.   :0.1438  
##  1st Qu.:0.0005505   1st Qu.:0.0022011   1st Qu.:0.014685   1st Qu.:0.2862  
##  Median :0.0024191   Median :0.0071709   Median :0.024816   Median :0.3166  
##  Mean   :0.0033920   Mean   :0.0091426   Mean   :0.029865   Mean   :0.3283  
##  3rd Qu.:0.0040296   3rd Qu.:0.0111257   3rd Qu.:0.036613   3rd Qu.:0.3641  
##  Max.   :0.0267729   Max.   :0.0651127   Max.   :0.148516   Max.   :0.6114  
##    P.90.100.           P.90.120.          P.90.180.        P.100.110.      
##  Min.   :0.0003668   Min.   :0.003663   Min.   :0.3597   Min.   :0.001097  
##  1st Qu.:0.0015975   1st Qu.:0.013905   1st Qu.:0.7415   1st Qu.:0.004072  
##  Median :0.0046647   Median :0.022505   Median :0.7744   Median :0.007180  
##  Mean   :0.0057332   Mean   :0.026473   Mean   :0.7689   Mean   :0.008503  
##  3rd Qu.:0.0071214   3rd Qu.:0.033114   3rd Qu.:0.8127   3rd Qu.:0.010733  
##  Max.   :0.0383398   Max.   :0.121743   Max.   :0.9139   Max.   :0.042918  
##    P.100.120.         P.100.150.        P.100.200.       P.110.120.      
##  Min.   :0.002977   Min.   :0.07425   Min.   :0.5034   Min.   :0.001956  
##  1st Qu.:0.011932   1st Qu.:0.15224   1st Qu.:0.9049   1st Qu.:0.007829  
##  Median :0.017795   Median :0.17365   Median :0.9268   Median :0.010691  
##  Mean   :0.020635   Mean   :0.18216   Mean   :0.9124   Mean   :0.012237  
##  3rd Qu.:0.026043   3rd Qu.:0.20532   3rd Qu.:0.9402   3rd Qu.:0.015053  
##  Max.   :0.083403   Max.   :0.42563   Max.   :0.9663   Max.   :0.043520  
##    P.120.130.         P.120.140.        P.120.150.        P.120.160.    
##  Min.   :0.004614   Min.   :0.01966   Min.   :0.06511   Min.   :0.1285  
##  1st Qu.:0.015847   1st Qu.:0.05407   1st Qu.:0.13757   1st Qu.:0.2669  
##  Median :0.019396   Median :0.06396   Median :0.15628   Median :0.2921  
##  Mean   :0.020971   Mean   :0.06781   Mean   :0.16142   Mean   :0.2984  
##  3rd Qu.:0.024316   3rd Qu.:0.07678   3rd Qu.:0.17781   3rd Qu.:0.3233  
##  Max.   :0.068162   Max.   :0.19530   Max.   :0.36252   Max.   :0.5243  
##    P.120.180.       P.130.140.        P.140.150.        P.140.160.     
##  Min.   :0.3398   Min.   :0.01666   Min.   :0.03749   Min.   :0.09206  
##  1st Qu.:0.7173   1st Qu.:0.03895   1st Qu.:0.08252   1st Qu.:0.20563  
##  Median :0.7508   Median :0.04518   Median :0.09128   Median :0.22406  
##  Mean   :0.7425   Mean   :0.04760   Mean   :0.09285   Mean   :0.22676  
##  3rd Qu.:0.7827   3rd Qu.:0.05264   3rd Qu.:0.10142   3rd Qu.:0.24816  
##  Max.   :0.8747   Max.   :0.12732   Max.   :0.16722   Max.   :0.36099  
##    P.140.210.       P.150.160.        P.150.180.       P.150.200.    
##  Min.   :0.4718   Min.   :0.05457   Min.   :0.2619   Min.   :0.3927  
##  1st Qu.:0.8230   1st Qu.:0.12734   1st Qu.:0.5623   1st Qu.:0.7001  
##  Median :0.8661   Median :0.13677   Median :0.5912   Median :0.7493  
##  Mean   :0.8483   Mean   :0.13700   Mean   :0.5810   Mean   :0.7303  
##  3rd Qu.:0.8914   3rd Qu.:0.14790   3rd Qu.:0.6141   3rd Qu.:0.7737  
##  Max.   :0.9346   Max.   :0.20759   Max.   :0.6862   Max.   :0.8385  
##    P.160.170.        P.160.180.       P.160.200.       P.160.240.    
##  Min.   :0.09594   Min.   :0.2016   Min.   :0.3136   Min.   :0.3387  
##  1st Qu.:0.23303   1st Qu.:0.4205   1st Qu.:0.5627   1st Qu.:0.6013  
##  Median :0.24886   Median :0.4453   Median :0.6091   Median :0.6539  
##  Mean   :0.24588   Mean   :0.4375   Mean   :0.5933   Mean   :0.6419  
##  3rd Qu.:0.26339   3rd Qu.:0.4652   3rd Qu.:0.6403   3rd Qu.:0.6923  
##  Max.   :0.31414   Max.   :0.5349   Max.   :0.7191   Max.   :0.8037  
##    P.170.180.        P.180.190.        P.180.200.        P.180.210.     
##  Min.   :0.09431   Min.   :0.03215   Min.   :0.04516   Min.   :0.05195  
##  1st Qu.:0.18568   1st Qu.:0.08723   1st Qu.:0.12473   1st Qu.:0.14382  
##  Median :0.20458   Median :0.10255   Median :0.15043   Median :0.17870  
##  Mean   :0.19816   Mean   :0.10027   Mean   :0.14646   Mean   :0.17436  
##  3rd Qu.:0.21658   3rd Qu.:0.11545   3rd Qu.:0.16969   3rd Qu.:0.20525  
##  Max.   :0.24567   Max.   :0.15057   Max.   :0.21596   Max.   :0.28035  
##    P.180.240.        P.180.270.        P.190.200.        P.200.210.      
##  Min.   :0.05914   Min.   :0.06244   Min.   :0.01301   Min.   :0.006796  
##  1st Qu.:0.16157   1st Qu.:0.16795   1st Qu.:0.03841   1st Qu.:0.018590  
##  Median :0.20085   Median :0.20772   Median :0.04940   Median :0.024842  
##  Mean   :0.19785   Mean   :0.20494   Mean   :0.04895   Mean   :0.025141  
##  3rd Qu.:0.23250   3rd Qu.:0.23993   3rd Qu.:0.05888   3rd Qu.:0.030166  
##  Max.   :0.33012   Max.   :0.35282   Max.   :0.08716   Max.   :0.062060  
##    P.200.220.        P.200.240.        P.200.250.        P.200.300.     
##  Min.   :0.01049   Min.   :0.01398   Min.   :0.01520   Min.   :0.02070  
##  1st Qu.:0.02832   1st Qu.:0.03666   1st Qu.:0.03926   1st Qu.:0.04800  
##  Median :0.03666   Median :0.04693   Median :0.04937   Median :0.06059  
##  Mean   :0.03732   Mean   :0.04863   Mean   :0.05134   Mean   :0.06293  
##  3rd Qu.:0.04409   3rd Qu.:0.05775   3rd Qu.:0.06025   3rd Qu.:0.07285  
##  Max.   :0.10214   Max.   :0.14022   Max.   :0.14922   Max.   :0.18598  
##    P.210.220.         P.210.240.         P.210.280.        P.220.230.      
##  Min.   :0.003690   Min.   :0.007185   Min.   :0.01214   Min.   :0.002069  
##  1st Qu.:0.009567   1st Qu.:0.017700   1st Qu.:0.02518   1st Qu.:0.005037  
##  Median :0.012334   Median :0.022283   Median :0.03089   Median :0.006302  
##  Mean   :0.012914   Mean   :0.023487   Mean   :0.03280   Mean   :0.006720  
##  3rd Qu.:0.015353   3rd Qu.:0.027239   3rd Qu.:0.03723   3rd Qu.:0.007778  
##  Max.   :0.040079   Max.   :0.078164   Max.   :0.10793   Max.   :0.024022  
##    P.220.240.         P.230.240.         P.240.250.          P.240.260.      
##  Min.   :0.003495   Min.   :0.001380   Min.   :0.0009052   Min.   :0.001522  
##  1st Qu.:0.007821   1st Qu.:0.002887   1st Qu.:0.0019966   1st Qu.:0.003383  
##  Median :0.009750   Median :0.003526   Median :0.0024283   Median :0.004361  
##  Mean   :0.010393   Mean   :0.003853   Mean   :0.0027116   Mean   :0.004906  
##  3rd Qu.:0.011775   3rd Qu.:0.004363   3rd Qu.:0.0030301   3rd Qu.:0.005590  
##  Max.   :0.038085   Max.   :0.014063   Max.   :0.0135096   Max.   :0.026305  
##    P.240.270.         P.240.280.         P.240.300.         P.240.320.      
##  Min.   :0.002140   Min.   :0.002484   Min.   :0.002484   Min.   :0.002484  
##  1st Qu.:0.004768   1st Qu.:0.006119   1st Qu.:0.007329   1st Qu.:0.007329  
##  Median :0.006241   Median :0.008057   Median :0.011649   Median :0.012103  
##  Mean   :0.007090   Mean   :0.009316   Mean   :0.014298   Mean   :0.018305  
##  3rd Qu.:0.008209   3rd Qu.:0.010961   3rd Qu.:0.018539   3rd Qu.:0.023329  
##  Max.   :0.038724   Max.   :0.051314   Max.   :0.078829   Max.   :0.107244  
##    P.250.260.          P.250.300.         P.260.270.          P.260.280.      
##  Min.   :0.0006169   Min.   :0.001425   Min.   :0.0005529   Min.   :0.000667  
##  1st Qu.:0.0014817   1st Qu.:0.005416   1st Qu.:0.0012646   1st Qu.:0.002464  
##  Median :0.0019652   Median :0.009189   Median :0.0017676   Median :0.003624  
##  Mean   :0.0022549   Mean   :0.011587   Mean   :0.0021237   Mean   :0.004336  
##  3rd Qu.:0.0026474   3rd Qu.:0.015818   3rd Qu.:0.0026121   3rd Qu.:0.005340  
##  Max.   :0.0127957   Max.   :0.065320   Max.   :0.0124183   Max.   :0.025008  
##    P.270.280.         P.270.300.         P.270.360.         P.280.290.      
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.001257   1st Qu.:0.002652   1st Qu.:0.002652   1st Qu.:0.001386  
##  Median :0.001933   Median :0.005593   Median :0.005905   Median :0.002208  
##  Mean   :0.002226   Mean   :0.007208   Mean   :0.014879   Mean   :0.002474  
##  3rd Qu.:0.002719   3rd Qu.:0.010774   3rd Qu.:0.015829   3rd Qu.:0.003402  
##  Max.   :0.012590   Max.   :0.040106   Max.   :0.146189   Max.   :0.013262  
##    P.280.300.         P.280.320.         P.280.350.         P.290.300.      
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.003208   1st Qu.:0.001432   1st Qu.:0.001432   1st Qu.:0.000000  
##  Median :0.004303   Median :0.004073   Median :0.004073   Median :0.001986  
##  Mean   :0.005643   Mean   :0.008989   Mean   :0.011968   Mean   :0.002508  
##  3rd Qu.:0.008241   3rd Qu.:0.013398   3rd Qu.:0.013398   3rd Qu.:0.004671  
##  Max.   :0.027515   Max.   :0.061544   Max.   :0.114419   Max.   :0.014253  
##    P.300.310.         P.300.320.         P.300.330.         P.300.350.      
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.006900   Median :0.000000   Median :0.000000  
##  Mean   :0.002271   Mean   :0.006576   Mean   :0.005311   Mean   :0.006985  
##  3rd Qu.:0.004935   3rd Qu.:0.009404   3rd Qu.:0.005342   3rd Qu.:0.005342  
##  Max.   :0.018048   Max.   :0.039102   Max.   :0.059337   Max.   :0.094000  
##    P.300.360.         P.300.400.         P.310.320.         P.320.330.      
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.000000   Median :0.000000   Median :0.000000   Median :0.000000  
##  Mean   :0.007671   Mean   :0.009427   Mean   :0.001735   Mean   :0.001304  
##  3rd Qu.:0.005342   3rd Qu.:0.005342   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :0.123057   Max.   :0.214529   Max.   :0.021054   Max.   :0.020461  
##    P.320.340.         P.320.360.         P.320.400.            AACC         
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.000000   Min.   :0.002290  
##  1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.002466  
##  Median :0.000000   Median :0.000000   Median :0.000000   Median :0.002516  
##  Mean   :0.005737   Mean   :0.003664   Mean   :0.005421   Mean   :0.002728  
##  3rd Qu.:0.010506   3rd Qu.:0.000000   3rd Qu.:0.000000   3rd Qu.:0.003160  
##  Max.   :0.042049   Max.   :0.098087   Max.   :0.189560   Max.   :0.004468  
##       ACCC               CCCT               CTAA               TAAC         
##  Min.   :0.002532   Min.   :0.005934   Min.   :0.002772   Min.   :0.001588  
##  1st Qu.:0.002992   1st Qu.:0.009717   1st Qu.:0.003328   1st Qu.:0.002476  
##  Median :0.003203   Median :0.010378   Median :0.004983   Median :0.002921  
##  Mean   :0.003358   Mean   :0.010539   Mean   :0.004616   Mean   :0.002820  
##  3rd Qu.:0.003789   3rd Qu.:0.011449   3rd Qu.:0.005403   3rd Qu.:0.003124  
##  Max.   :0.004946   Max.   :0.013820   Max.   :0.014935   Max.   :0.003546  
##       CCCC               CCTA               AGCC               ATCC          
##  Min.   :0.004245   Min.   :0.004579   Min.   :0.003008   Min.   :0.0005892  
##  1st Qu.:0.006861   1st Qu.:0.006195   1st Qu.:0.003269   1st Qu.:0.0006949  
##  Median :0.007263   Median :0.006522   Median :0.003465   Median :0.0022321  
##  Mean   :0.007478   Mean   :0.006808   Mean   :0.003545   Mean   :0.0017716  
##  3rd Qu.:0.007990   3rd Qu.:0.007738   3rd Qu.:0.003803   3rd Qu.:0.0022786  
##  Max.   :0.011879   Max.   :0.009345   Max.   :0.004958   Max.   :0.0033270  
##       CCTC               AAAC               TACC               CCCA         
##  Min.   :0.007092   Min.   :0.003158   Min.   :0.001492   Min.   :0.005945  
##  1st Qu.:0.008446   1st Qu.:0.004403   1st Qu.:0.002253   1st Qu.:0.014634  
##  Median :0.008979   Median :0.004536   Median :0.002412   Median :0.015347  
##  Mean   :0.009261   Mean   :0.004756   Mean   :0.002365   Mean   :0.015452  
##  3rd Qu.:0.010152   3rd Qu.:0.005478   3rd Qu.:0.002478   3rd Qu.:0.016171  
##  Max.   :0.012475   Max.   :0.006611   Max.   :0.002906   Max.   :0.022195  
##       GGCG                TCCC               CCTG               TGTA         
##  Min.   :0.0007021   Min.   :0.001072   Min.   :0.009325   Min.   :0.003602  
##  1st Qu.:0.0008685   1st Qu.:0.002681   1st Qu.:0.011570   1st Qu.:0.005315  
##  Median :0.0009227   Median :0.003710   Median :0.012336   Median :0.005568  
##  Mean   :0.0009743   Mean   :0.003490   Mean   :0.012919   Mean   :0.005522  
##  3rd Qu.:0.0010016   3rd Qu.:0.003874   3rd Qu.:0.014414   3rd Qu.:0.005744  
##  Max.   :0.0026048   Max.   :0.005178   Max.   :0.017603   Max.   :0.007028  
##       GCCA               CACC               TGTG               CTGC         
##  Min.   :0.003932   Min.   :0.003512   Min.   :0.003809   Min.   :0.001672  
##  1st Qu.:0.004816   1st Qu.:0.004505   1st Qu.:0.006595   1st Qu.:0.002210  
##  Median :0.005172   Median :0.004756   Median :0.006724   Median :0.004626  
##  Mean   :0.005559   Mean   :0.004845   Mean   :0.006737   Mean   :0.004060  
##  3rd Qu.:0.006461   3rd Qu.:0.005152   3rd Qu.:0.006878   3rd Qu.:0.004949  
##  Max.   :0.008169   Max.   :0.006986   Max.   :0.007907   Max.   :0.007656  
##       GAGC               GCAC               CACT               GAGG         
##  Min.   :0.001950   Min.   :0.002038   Min.   :0.004403   Min.   :0.002952  
##  1st Qu.:0.002150   1st Qu.:0.002441   1st Qu.:0.005687   1st Qu.:0.003272  
##  Median :0.002212   Median :0.002591   Median :0.005782   Median :0.003511  
##  Mean   :0.002257   Mean   :0.002833   Mean   :0.006100   Mean   :0.003714  
##  3rd Qu.:0.002319   3rd Qu.:0.003386   3rd Qu.:0.006794   3rd Qu.:0.004210  
##  Max.   :0.003079   Max.   :0.004007   Max.   :0.007436   Max.   :0.005047  
##       AGTG               GGCC               CCAG              ACAG         
##  Min.   :0.004068   Min.   :0.002570   Min.   :0.00611   Min.   :0.004382  
##  1st Qu.:0.004353   1st Qu.:0.003117   1st Qu.:0.01176   1st Qu.:0.004984  
##  Median :0.004451   Median :0.003387   Median :0.01233   Median :0.005101  
##  Mean   :0.004595   Mean   :0.003584   Mean   :0.01250   Mean   :0.005426  
##  3rd Qu.:0.004830   3rd Qu.:0.004038   3rd Qu.:0.01319   3rd Qu.:0.006181  
##  Max.   :0.006224   Max.   :0.005604   Max.   :0.01758   Max.   :0.007346  
##       AGAG               CTAG               ACGG                CATC         
##  Min.   :0.005228   Min.   :0.001515   Min.   :0.0003943   Min.   :0.004301  
##  1st Qu.:0.005612   1st Qu.:0.002145   1st Qu.:0.0004651   1st Qu.:0.004725  
##  Median :0.005822   Median :0.003485   Median :0.0005022   Median :0.004811  
##  Mean   :0.006010   Mean   :0.003209   Mean   :0.0005161   Mean   :0.004820  
##  3rd Qu.:0.006487   3rd Qu.:0.003798   3rd Qu.:0.0005536   3rd Qu.:0.004908  
##  Max.   :0.008505   Max.   :0.009402   Max.   :0.0009250   Max.   :0.005986  
##       AGTC               TCCA               GTCA               CCTT         
##  Min.   :0.002357   Min.   :0.001764   Min.   :0.002164   Min.   :0.007326  
##  1st Qu.:0.002655   1st Qu.:0.003534   1st Qu.:0.002447   1st Qu.:0.009408  
##  Median :0.002692   Median :0.004896   Median :0.002481   Median :0.009793  
##  Mean   :0.002849   Mean   :0.004499   Mean   :0.002584   Mean   :0.010578  
##  3rd Qu.:0.003167   3rd Qu.:0.005050   3rd Qu.:0.002773   3rd Qu.:0.012512  
##  Max.   :0.004020   Max.   :0.005698   Max.   :0.003502   Max.   :0.015071  
##       CCAA               GCCT               GGGC               CGCT          
##  Min.   :0.004922   Min.   :0.003850   Min.   :0.002087   Min.   :0.0004568  
##  1st Qu.:0.009861   1st Qu.:0.004834   1st Qu.:0.002422   1st Qu.:0.0008891  
##  Median :0.010216   Median :0.005223   Median :0.002594   Median :0.0009504  
##  Mean   :0.010356   Mean   :0.005822   Mean   :0.002721   Mean   :0.0009688  
##  3rd Qu.:0.011045   3rd Qu.:0.007174   3rd Qu.:0.002969   3rd Qu.:0.0010167  
##  Max.   :0.012734   Max.   :0.008381   Max.   :0.004459   Max.   :0.0018322  
##       CTGG               GCCC               GTGC               CATT         
##  Min.   :0.001780   Min.   :0.002334   Min.   :0.001831   Min.   :0.006098  
##  1st Qu.:0.002786   1st Qu.:0.002926   1st Qu.:0.001984   1st Qu.:0.008153  
##  Median :0.005823   Median :0.003211   Median :0.002067   Median :0.008397  
##  Mean   :0.005188   Mean   :0.003519   Mean   :0.002141   Mean   :0.009032  
##  3rd Qu.:0.006414   3rd Qu.:0.004174   3rd Qu.:0.002296   3rd Qu.:0.010694  
##  Max.   :0.010821   Max.   :0.005435   Max.   :0.002820   Max.   :0.011593  
##       TCTG               AGGG               TAGA               GGTG         
##  Min.   :0.001076   Min.   :0.002883   Min.   :0.002359   Min.   :0.003833  
##  1st Qu.:0.003968   1st Qu.:0.003194   1st Qu.:0.003225   1st Qu.:0.004516  
##  Median :0.004668   Median :0.003478   Median :0.003781   Median :0.004719  
##  Mean   :0.004454   Mean   :0.003494   Mean   :0.003623   Mean   :0.004890  
##  3rd Qu.:0.004796   3rd Qu.:0.003701   3rd Qu.:0.003928   3rd Qu.:0.005255  
##  Max.   :0.005653   Max.   :0.005293   Max.   :0.004527   Max.   :0.007730  
##       AACG                TGAG               AAGA               CAGA         
##  Min.   :0.0005399   Min.   :0.003878   Min.   :0.002888   Min.   :0.003683  
##  1st Qu.:0.0005845   1st Qu.:0.006436   1st Qu.:0.004730   1st Qu.:0.007163  
##  Median :0.0006092   Median :0.006828   Median :0.004835   Median :0.007437  
##  Mean   :0.0006565   Mean   :0.006768   Mean   :0.005229   Mean   :0.007548  
##  3rd Qu.:0.0007627   3rd Qu.:0.007068   3rd Qu.:0.006299   3rd Qu.:0.007980  
##  Max.   :0.0009039   Max.   :0.008380   Max.   :0.007090   Max.   :0.008875  
##       AAGG               GACA               GCGC                CGTT         
##  Min.   :0.002632   Min.   :0.003091   Min.   :0.0002795   Min.   :0.001348  
##  1st Qu.:0.003379   1st Qu.:0.003515   1st Qu.:0.0003486   1st Qu.:0.001746  
##  Median :0.003475   Median :0.003615   Median :0.0003723   Median :0.001811  
##  Mean   :0.003756   Mean   :0.003869   Mean   :0.0003900   Mean   :0.002017  
##  3rd Qu.:0.004366   3rd Qu.:0.004379   3rd Qu.:0.0004031   3rd Qu.:0.002455  
##  Max.   :0.006034   Max.   :0.005882   Max.   :0.0011260   Max.   :0.002908  
##       CAGC               CAGT               AATA               GGAG         
##  Min.   :0.003023   Min.   :0.003351   Min.   :0.002025   Min.   :0.005213  
##  1st Qu.:0.004625   1st Qu.:0.004508   1st Qu.:0.004589   1st Qu.:0.005826  
##  Median :0.004937   Median :0.004624   Median :0.004913   Median :0.006128  
##  Mean   :0.005044   Mean   :0.004876   Mean   :0.005062   Mean   :0.006370  
##  3rd Qu.:0.005397   3rd Qu.:0.005484   3rd Qu.:0.006057   3rd Qu.:0.006916  
##  Max.   :0.007197   Max.   :0.005966   Max.   :0.007106   Max.   :0.009093  
##       CGAA                GTGA               CCAC               GCTG         
##  Min.   :0.0008035   Min.   :0.003051   Min.   :0.004622   Min.   :0.003630  
##  1st Qu.:0.0014848   1st Qu.:0.003193   1st Qu.:0.007805   1st Qu.:0.004323  
##  Median :0.0015785   Median :0.003241   Median :0.008178   Median :0.004685  
##  Mean   :0.0016074   Mean   :0.003411   Mean   :0.008249   Mean   :0.005418  
##  3rd Qu.:0.0017051   3rd Qu.:0.003709   3rd Qu.:0.008720   3rd Qu.:0.007022  
##  Max.   :0.0031049   Max.   :0.004573   Max.   :0.010842   Max.   :0.008234  
##       TGCC               AAGC               AGAA               AAAA         
##  Min.   :0.001791   Min.   :0.002136   Min.   :0.005071   Min.   :0.006633  
##  1st Qu.:0.003193   1st Qu.:0.002549   1st Qu.:0.006958   1st Qu.:0.010990  
##  Median :0.003870   Median :0.002591   Median :0.007215   Median :0.011409  
##  Mean   :0.003750   Mean   :0.002802   Mean   :0.007598   Mean   :0.012414  
##  3rd Qu.:0.004021   3rd Qu.:0.003272   3rd Qu.:0.008727   3rd Qu.:0.015507  
##  Max.   :0.005624   Max.   :0.003631   Max.   :0.010270   Max.   :0.021323  
##       GAAG               GCTC               TGGG               GGCA         
##  Min.   :0.003386   Min.   :0.002168   Min.   :0.003154   Min.   :0.003385  
##  1st Qu.:0.004457   1st Qu.:0.002572   1st Qu.:0.004385   1st Qu.:0.004372  
##  Median :0.004561   Median :0.002770   Median :0.004967   Median :0.004565  
##  Mean   :0.004899   Mean   :0.003040   Mean   :0.004954   Mean   :0.004617  
##  3rd Qu.:0.005553   3rd Qu.:0.003604   3rd Qu.:0.005294   3rd Qu.:0.004765  
##  Max.   :0.006829   Max.   :0.004348   Max.   :0.007953   Max.   :0.007440  
##       AATG               CCGG                GGAT               AACA         
##  Min.   :0.003621   Min.   :0.0003995   Min.   :0.003631   Min.   :0.003239  
##  1st Qu.:0.004949   1st Qu.:0.0007632   1st Qu.:0.003920   1st Qu.:0.004833  
##  Median :0.005122   Median :0.0008379   Median :0.003993   Median :0.004979  
##  Mean   :0.005507   Mean   :0.0008962   Mean   :0.004160   Mean   :0.005278  
##  3rd Qu.:0.006457   3rd Qu.:0.0009317   3rd Qu.:0.004391   3rd Qu.:0.006200  
##  Max.   :0.007580   Max.   :0.0024997   Max.   :0.006385   Max.   :0.007025  
##       CATG               CCAT               GGGT               CTGA         
##  Min.   :0.005508   Min.   :0.008862   Min.   :0.002200   Min.   :0.002299  
##  1st Qu.:0.006324   1st Qu.:0.010265   1st Qu.:0.002406   1st Qu.:0.003744  
##  Median :0.006422   Median :0.010669   Median :0.002553   Median :0.006206  
##  Mean   :0.006771   Mean   :0.010797   Mean   :0.002709   Mean   :0.005571  
##  3rd Qu.:0.007569   3rd Qu.:0.011445   3rd Qu.:0.003011   3rd Qu.:0.006574  
##  Max.   :0.007959   Max.   :0.012870   Max.   :0.004523   Max.   :0.008925  
##       CAGG               TATT               GAGA               GAAC         
##  Min.   :0.004196   Min.   :0.003360   Min.   :0.003549   Min.   :0.002257  
##  1st Qu.:0.006668   1st Qu.:0.006199   1st Qu.:0.003908   1st Qu.:0.002693  
##  Median :0.007057   Median :0.006716   Median :0.004017   Median :0.002786  
##  Mean   :0.007166   Mean   :0.006831   Mean   :0.004347   Mean   :0.002755  
##  3rd Qu.:0.007428   3rd Qu.:0.007624   3rd Qu.:0.005035   3rd Qu.:0.002839  
##  Max.   :0.011134   Max.   :0.008731   Max.   :0.006200   Max.   :0.003410  
##       GGCT               GGGA               AGAC               GCAG         
##  Min.   :0.003737   Min.   :0.003888   Min.   :0.003176   Min.   :0.003249  
##  1st Qu.:0.004398   1st Qu.:0.004238   1st Qu.:0.003327   1st Qu.:0.003877  
##  Median :0.004713   Median :0.004399   Median :0.003399   Median :0.004134  
##  Mean   :0.004956   Mean   :0.004571   Mean   :0.003549   Mean   :0.004701  
##  3rd Qu.:0.005527   3rd Qu.:0.004865   3rd Qu.:0.003844   3rd Qu.:0.005919  
##  Max.   :0.007272   Max.   :0.006563   Max.   :0.004904   Max.   :0.006796  
##       CCGC                CGAG                TTTC                ATTC         
##  Min.   :0.0003767   Min.   :0.0007919   Min.   :0.0006224   Min.   :0.003152  
##  1st Qu.:0.0006862   1st Qu.:0.0015214   1st Qu.:0.0020804   1st Qu.:0.003874  
##  Median :0.0007320   Median :0.0015763   Median :0.0045618   Median :0.004002  
##  Mean   :0.0007805   Mean   :0.0016194   Mean   :0.0038022   Mean   :0.004194  
##  3rd Qu.:0.0007902   3rd Qu.:0.0016660   3rd Qu.:0.0048003   3rd Qu.:0.004678  
##  Max.   :0.0022275   Max.   :0.0033590   Max.   :0.0052895   Max.   :0.006564  
##       TGCA               AAAT               ACAA               GAAA         
##  Min.   :0.002892   Min.   :0.003869   Min.   :0.003378   Min.   :0.004225  
##  1st Qu.:0.004915   1st Qu.:0.007370   1st Qu.:0.004948   1st Qu.:0.006664  
##  Median :0.005466   Median :0.007763   Median :0.005190   Median :0.006839  
##  Mean   :0.005324   Mean   :0.008035   Mean   :0.005476   Mean   :0.007570  
##  3rd Qu.:0.005615   3rd Qu.:0.009525   3rd Qu.:0.006471   3rd Qu.:0.009355  
##  Max.   :0.006753   Max.   :0.011041   Max.   :0.007702   Max.   :0.011033  
##       AGCA               CAAA               GCTA               CGCC          
##  Min.   :0.004101   Min.   :0.005595   Min.   :0.002250   Min.   :0.0005664  
##  1st Qu.:0.004374   1st Qu.:0.010473   1st Qu.:0.002690   1st Qu.:0.0011759  
##  Median :0.004437   Median :0.010901   Median :0.002820   Median :0.0012245  
##  Mean   :0.004615   Mean   :0.011142   Mean   :0.003219   Mean   :0.0012841  
##  3rd Qu.:0.004898   3rd Qu.:0.012408   3rd Qu.:0.004123   3rd Qu.:0.0013162  
##  Max.   :0.005900   Max.   :0.013543   Max.   :0.005170   Max.   :0.0034118  
##       ACCA               CCCG                TCTC               CAAC         
##  Min.   :0.004005   Min.   :0.0009275   Min.   :0.001246   Min.   :0.002704  
##  1st Qu.:0.004539   1st Qu.:0.0022363   1st Qu.:0.004226   1st Qu.:0.004042  
##  Median :0.004683   Median :0.0023451   Median :0.005046   Median :0.004128  
##  Mean   :0.004852   Mean   :0.0024635   Mean   :0.004803   Mean   :0.004230  
##  3rd Qu.:0.005230   3rd Qu.:0.0025097   3rd Qu.:0.005203   3rd Qu.:0.004548  
##  Max.   :0.006669   Max.   :0.0060046   Max.   :0.005823   Max.   :0.004841  
##       ACTA               GCGG                TGCT               GAGT         
##  Min.   :0.002007   Min.   :0.0003187   Min.   :0.002898   Min.   :0.002132  
##  1st Qu.:0.002904   1st Qu.:0.0004096   1st Qu.:0.004372   1st Qu.:0.002348  
##  Median :0.003072   Median :0.0004385   Median :0.004914   Median :0.002412  
##  Mean   :0.003205   Mean   :0.0004605   Mean   :0.004760   Mean   :0.002661  
##  3rd Qu.:0.003672   3rd Qu.:0.0004813   3rd Qu.:0.005019   3rd Qu.:0.003172  
##  Max.   :0.004638   Max.   :0.0012246   Max.   :0.006109   Max.   :0.003835  
##       CGTG               AGGC               TAGC               CTCC         
##  Min.   :0.001164   Min.   :0.002964   Min.   :0.001383   Min.   :0.001731  
##  1st Qu.:0.001728   1st Qu.:0.003300   1st Qu.:0.001711   1st Qu.:0.002026  
##  Median :0.001836   Median :0.003512   Median :0.002002   Median :0.004517  
##  Mean   :0.001887   Mean   :0.003546   Mean   :0.001918   Mean   :0.003910  
##  3rd Qu.:0.002028   3rd Qu.:0.003666   3rd Qu.:0.002052   3rd Qu.:0.004752  
##  Max.   :0.003025   Max.   :0.005682   Max.   :0.002423   Max.   :0.007973  
##       TCCT               ACTG               CAAG               AAGT         
##  Min.   :0.001614   Min.   :0.003530   Min.   :0.004549   Min.   :0.002092  
##  1st Qu.:0.003718   1st Qu.:0.004050   1st Qu.:0.006985   1st Qu.:0.003049  
##  Median :0.004812   Median :0.004202   Median :0.007281   Median :0.003111  
##  Mean   :0.004484   Mean   :0.004554   Mean   :0.007327   Mean   :0.003352  
##  3rd Qu.:0.004959   3rd Qu.:0.005375   3rd Qu.:0.007702   3rd Qu.:0.004012  
##  Max.   :0.005655   Max.   :0.006635   Max.   :0.008624   Max.   :0.004463  
##       CGCA                TAAT               GTTT               TGAT         
##  Min.   :0.0003577   Min.   :0.003165   Min.   :0.003451   Min.   :0.004650  
##  1st Qu.:0.0008449   1st Qu.:0.004815   1st Qu.:0.004931   1st Qu.:0.005176  
##  Median :0.0009012   Median :0.005578   Median :0.005167   Median :0.006095  
##  Mean   :0.0009236   Mean   :0.005565   Mean   :0.005622   Mean   :0.005841  
##  3rd Qu.:0.0009897   3rd Qu.:0.006312   3rd Qu.:0.006808   3rd Qu.:0.006328  
##  Max.   :0.0015208   Max.   :0.007239   Max.   :0.008885   Max.   :0.007156  
##       CTAT               TAGG               TGTT               AGAT         
##  Min.   :0.002168   Min.   :0.001507   Min.   :0.004916   Min.   :0.002883  
##  1st Qu.:0.002605   1st Qu.:0.002303   1st Qu.:0.006774   1st Qu.:0.004128  
##  Median :0.004055   Median :0.002455   Median :0.007129   Median :0.004246  
##  Mean   :0.003742   Mean   :0.002428   Mean   :0.007036   Mean   :0.004399  
##  3rd Qu.:0.004444   3rd Qu.:0.002535   3rd Qu.:0.007311   3rd Qu.:0.004909  
##  Max.   :0.013611   Max.   :0.003059   Max.   :0.009400   Max.   :0.007347  
##       GGTA               GTTG               GTGG               CTCA         
##  Min.   :0.001911   Min.   :0.002383   Min.   :0.002501   Min.   :0.002534  
##  1st Qu.:0.002818   1st Qu.:0.002659   1st Qu.:0.002757   1st Qu.:0.003036  
##  Median :0.002910   Median :0.002709   Median :0.002928   Median :0.005875  
##  Mean   :0.002960   Mean   :0.002917   Mean   :0.003043   Mean   :0.005158  
##  3rd Qu.:0.003024   3rd Qu.:0.003340   3rd Qu.:0.003326   3rd Qu.:0.006223  
##  Max.   :0.005146   Max.   :0.004011   Max.   :0.004162   Max.   :0.010311  
##       CTCT               GCAT               GCAA               GTTA         
##  Min.   :0.002565   Min.   :0.002832   Min.   :0.002878   Min.   :0.001224  
##  1st Qu.:0.002978   1st Qu.:0.003418   1st Qu.:0.003455   1st Qu.:0.002402  
##  Median :0.005690   Median :0.003546   Median :0.003589   Median :0.002536  
##  Mean   :0.004969   Mean   :0.004001   Mean   :0.004142   Mean   :0.002631  
##  3rd Qu.:0.005950   3rd Qu.:0.005075   3rd Qu.:0.005446   3rd Qu.:0.003018  
##  Max.   :0.011078   Max.   :0.006111   Max.   :0.006242   Max.   :0.004413  
##       ACTC               CTTT               GATG               ATGG         
##  Min.   :0.002940   Min.   :0.004677   Min.   :0.003103   Min.   :0.003294  
##  1st Qu.:0.003222   1st Qu.:0.006509   1st Qu.:0.003491   1st Qu.:0.003711  
##  Median :0.003376   Median :0.009196   Median :0.003563   Median :0.003911  
##  Mean   :0.003684   Mean   :0.008464   Mean   :0.003578   Mean   :0.003923  
##  3rd Qu.:0.004406   3rd Qu.:0.009652   3rd Qu.:0.003660   3rd Qu.:0.004113  
##  Max.   :0.005409   Max.   :0.015564   Max.   :0.004669   Max.   :0.005528  
##       ACAT               CTTA               ACCT               ACAC         
##  Min.   :0.003048   Min.   :0.002539   Min.   :0.002553   Min.   :0.003159  
##  1st Qu.:0.004754   1st Qu.:0.003157   1st Qu.:0.003288   1st Qu.:0.003553  
##  Median :0.005031   Median :0.004586   Median :0.004017   Median :0.003655  
##  Mean   :0.005274   Mean   :0.004192   Mean   :0.003827   Mean   :0.003845  
##  3rd Qu.:0.006177   3rd Qu.:0.004916   3rd Qu.:0.004217   3rd Qu.:0.004296  
##  Max.   :0.008669   Max.   :0.007894   Max.   :0.005017   Max.   :0.005082  
##       TCTA               ATAA               ATTA               GCTT         
##  Min.   :0.001057   Min.   :0.002379   Min.   :0.002035   Min.   :0.002709  
##  1st Qu.:0.003055   1st Qu.:0.004800   1st Qu.:0.003722   1st Qu.:0.003454  
##  Median :0.003628   Median :0.005111   Median :0.003930   Median :0.003647  
##  Mean   :0.003457   Mean   :0.005064   Mean   :0.003962   Mean   :0.004428  
##  3rd Qu.:0.003826   3rd Qu.:0.005713   3rd Qu.:0.004550   3rd Qu.:0.006183  
##  Max.   :0.004201   Max.   :0.009312   Max.   :0.006654   Max.   :0.007491  
##       TCTT               GGAA               GGAC               TTTT         
##  Min.   :0.001912   Min.   :0.006592   Min.   :0.002369   Min.   :0.002078  
##  1st Qu.:0.005118   1st Qu.:0.007076   1st Qu.:0.002598   1st Qu.:0.004387  
##  Median :0.005911   Median :0.007323   Median :0.002684   Median :0.009058  
##  Mean   :0.005651   Mean   :0.007740   Mean   :0.002757   Mean   :0.007744  
##  3rd Qu.:0.006120   3rd Qu.:0.008566   3rd Qu.:0.002922   3rd Qu.:0.009894  
##  Max.   :0.006985   Max.   :0.010799   Max.   :0.003794   Max.   :0.010851  
##       ATTG               TCAT               TGAA               AGGA         
##  Min.   :0.001789   Min.   :0.001998   Min.   :0.004616   Min.   :0.004087  
##  1st Qu.:0.002834   1st Qu.:0.003698   1st Qu.:0.006866   1st Qu.:0.004369  
##  Median :0.002916   Median :0.004974   Median :0.007839   Median :0.004569  
##  Mean   :0.003034   Mean   :0.004556   Mean   :0.007621   Mean   :0.004736  
##  3rd Qu.:0.003477   3rd Qu.:0.005158   3rd Qu.:0.008253   3rd Qu.:0.005170  
##  Max.   :0.004327   Max.   :0.005431   Max.   :0.009684   Max.   :0.005918  
##       ATAT               ATGA               TTAT               GGGG         
##  Min.   :0.001264   Min.   :0.002323   Min.   :0.001131   Min.   :0.002588  
##  1st Qu.:0.001622   1st Qu.:0.003648   1st Qu.:0.001872   1st Qu.:0.002958  
##  Median :0.004074   Median :0.003749   Median :0.004123   Median :0.003197  
##  Mean   :0.003464   Mean   :0.003873   Mean   :0.003520   Mean   :0.003394  
##  3rd Qu.:0.004671   3rd Qu.:0.004360   3rd Qu.:0.004645   3rd Qu.:0.003712  
##  Max.   :0.011717   Max.   :0.004906   Max.   :0.005929   Max.   :0.007454  
##       GAAT               ACTT               CAAT               AAAG         
##  Min.   :0.005301   Min.   :0.002992   Min.   :0.004277   Min.   :0.004128  
##  1st Qu.:0.006139   1st Qu.:0.004218   1st Qu.:0.006040   1st Qu.:0.006284  
##  Median :0.006481   Median :0.004422   Median :0.006245   Median :0.006442  
##  Mean   :0.006788   Mean   :0.004869   Mean   :0.006332   Mean   :0.006815  
##  3rd Qu.:0.007480   3rd Qu.:0.005899   3rd Qu.:0.006976   3rd Qu.:0.007933  
##  Max.   :0.009877   Max.   :0.007253   Max.   :0.007446   Max.   :0.009388  
##       TTTA                TAAA               ATGT                TGGC         
##  Min.   :0.0009052   Min.   :0.004142   Min.   :0.0007112   Min.   :0.002076  
##  1st Qu.:0.0025056   1st Qu.:0.006997   1st Qu.:0.0008906   1st Qu.:0.002809  
##  Median :0.0049148   Median :0.007659   Median :0.0034715   Median :0.003368  
##  Mean   :0.0043734   Mean   :0.007749   Mean   :0.0027029   Mean   :0.003279  
##  3rd Qu.:0.0056841   3rd Qu.:0.008641   3rd Qu.:0.0036681   3rd Qu.:0.003512  
##  Max.   :0.0062992   Max.   :0.010517   Max.   :0.0051020   Max.   :0.005112  
##       TGAC               CGGT                GTAA               CGGA          
##  Min.   :0.002165   Min.   :0.0003545   Min.   :0.002074   Min.   :0.0004008  
##  1st Qu.:0.003000   1st Qu.:0.0006181   1st Qu.:0.002859   1st Qu.:0.0008425  
##  Median :0.003558   Median :0.0006548   Median :0.002941   Median :0.0008720  
##  Mean   :0.003378   Mean   :0.0006744   Mean   :0.003057   Mean   :0.0008980  
##  3rd Qu.:0.003615   3rd Qu.:0.0007208   3rd Qu.:0.003469   3rd Qu.:0.0009194  
##  Max.   :0.004034   Max.   :0.0010933   Max.   :0.004325   Max.   :0.0016371  
##       AACT               CTTG               AGGT               TATG         
##  Min.   :0.001517   Min.   :0.002114   Min.   :0.002636   Min.   :0.001937  
##  1st Qu.:0.001812   1st Qu.:0.002934   1st Qu.:0.002813   1st Qu.:0.003786  
##  Median :0.003355   Median :0.005025   Median :0.002919   Median :0.003988  
##  Mean   :0.002881   Mean   :0.004479   Mean   :0.003059   Mean   :0.003932  
##  3rd Qu.:0.003500   3rd Qu.:0.005254   3rd Qu.:0.003388   3rd Qu.:0.004183  
##  Max.   :0.003924   Max.   :0.008019   Max.   :0.003840   Max.   :0.004561  
##       AGTT               GATT               GGTT               TACA         
##  Min.   :0.002618   Min.   :0.002547   Min.   :0.003094   Min.   :0.002774  
##  1st Qu.:0.003827   1st Qu.:0.003368   1st Qu.:0.003640   1st Qu.:0.004547  
##  Median :0.003951   Median :0.003478   Median :0.003756   Median :0.004832  
##  Mean   :0.004229   Mean   :0.003581   Mean   :0.004183   Mean   :0.004771  
##  3rd Qu.:0.004960   3rd Qu.:0.003862   3rd Qu.:0.005052   3rd Qu.:0.005040  
##  Max.   :0.006067   Max.   :0.005001   Max.   :0.006589   Max.   :0.005829  
##       GACC               AATC               TATA               CACA         
##  Min.   :0.001657   Min.   :0.002331   Min.   :0.002174   Min.   :0.004728  
##  1st Qu.:0.001936   1st Qu.:0.002906   1st Qu.:0.004887   1st Qu.:0.007731  
##  Median :0.002046   Median :0.002993   Median :0.005302   Median :0.007874  
##  Mean   :0.002076   Mean   :0.002974   Mean   :0.005240   Mean   :0.008236  
##  3rd Qu.:0.002217   3rd Qu.:0.003052   3rd Qu.:0.005764   3rd Qu.:0.009067  
##  Max.   :0.002617   Max.   :0.004196   Max.   :0.006743   Max.   :0.009935  
##       TATC               TCAC                AATT               GTGT         
##  Min.   :0.001737   Min.   :0.0009431   Min.   :0.002470   Min.   :0.002974  
##  1st Qu.:0.002942   1st Qu.:0.0025234   1st Qu.:0.004656   1st Qu.:0.003128  
##  Median :0.003284   Median :0.0035309   Median :0.004921   Median :0.003184  
##  Mean   :0.003213   Mean   :0.0032265   Mean   :0.005255   Mean   :0.003331  
##  3rd Qu.:0.003508   3rd Qu.:0.0036285   3rd Qu.:0.006487   3rd Qu.:0.003612  
##  Max.   :0.003915   Max.   :0.0041136   Max.   :0.007856   Max.   :0.004384  
##       AGTA               CCGT                TAAG               TTAA         
##  Min.   :0.002214   Min.   :0.0004765   Min.   :0.001705   Min.   :0.001328  
##  1st Qu.:0.003185   1st Qu.:0.0007311   1st Qu.:0.003386   1st Qu.:0.002101  
##  Median :0.003280   Median :0.0007613   Median :0.003686   Median :0.003988  
##  Mean   :0.003371   Mean   :0.0007878   Mean   :0.003606   Mean   :0.003521  
##  3rd Qu.:0.003735   3rd Qu.:0.0008073   3rd Qu.:0.003858   3rd Qu.:0.004551  
##  Max.   :0.004812   Max.   :0.0014894   Max.   :0.004340   Max.   :0.005592  
##       CTGT               GTTC               CCGA                GATA         
##  Min.   :0.002329   Min.   :0.002151   Min.   :0.0003616   Min.   :0.001348  
##  1st Qu.:0.002864   1st Qu.:0.002356   1st Qu.:0.0007457   1st Qu.:0.002486  
##  Median :0.005629   Median :0.002397   Median :0.0007883   Median :0.002599  
##  Mean   :0.004931   Mean   :0.002408   Mean   :0.0008184   Mean   :0.002580  
##  3rd Qu.:0.005971   3rd Qu.:0.002447   3rd Qu.:0.0008428   3rd Qu.:0.002707  
##  Max.   :0.009707   Max.   :0.003632   Max.   :0.0017440   Max.   :0.003838  
##       AGCG                CGGG                CATA               AGCT         
##  Min.   :0.0005254   Min.   :0.0006588   Min.   :0.003179   Min.   :0.001549  
##  1st Qu.:0.0005984   1st Qu.:0.0012456   1st Qu.:0.005235   1st Qu.:0.001870  
##  Median :0.0006273   Median :0.0012935   Median :0.005485   Median :0.003561  
##  Mean   :0.0006486   Mean   :0.0013664   Mean   :0.005677   Mean   :0.003071  
##  3rd Qu.:0.0006736   3rd Qu.:0.0013792   3rd Qu.:0.006620   3rd Qu.:0.003650  
##  Max.   :0.0012577   Max.   :0.0033410   Max.   :0.007298   Max.   :0.004267  
##       TGCG                ATAG               ATGC               TTGC          
##  Min.   :0.0001903   Min.   :0.001707   Min.   :0.001868   Min.   :0.0002764  
##  1st Qu.:0.0005027   1st Qu.:0.002781   1st Qu.:0.002280   1st Qu.:0.0007983  
##  Median :0.0005331   Median :0.002965   Median :0.002341   Median :0.0022271  
##  Mean   :0.0005476   Mean   :0.002949   Mean   :0.002462   Mean   :0.0017803  
##  3rd Qu.:0.0005587   3rd Qu.:0.003191   3rd Qu.:0.002756   3rd Qu.:0.0022753  
##  Max.   :0.0009778   Max.   :0.004920   Max.   :0.003102   Max.   :0.0024776  
##       TGTC               TCAA               GGTC               TTCT          
##  Min.   :0.002844   Min.   :0.001474   Min.   :0.002216   Min.   :0.0006882  
##  1st Qu.:0.004170   1st Qu.:0.003040   1st Qu.:0.002594   1st Qu.:0.0012814  
##  Median :0.004356   Median :0.004238   Median :0.002687   Median :0.0045398  
##  Mean   :0.004321   Mean   :0.003855   Mean   :0.002751   Mean   :0.0035484  
##  3rd Qu.:0.004453   3rd Qu.:0.004445   3rd Qu.:0.002892   3rd Qu.:0.0047297  
##  Max.   :0.005171   Max.   :0.004628   Max.   :0.003634   Max.   :0.0055298  
##       TTTG                GACT               TTGT               CACG          
##  Min.   :0.0006146   Min.   :0.002266   Min.   :0.000503   Min.   :0.0006426  
##  1st Qu.:0.0017310   1st Qu.:0.002601   1st Qu.:0.001320   1st Qu.:0.0011636  
##  Median :0.0046628   Median :0.002689   Median :0.003545   Median :0.0012436  
##  Mean   :0.0037387   Mean   :0.002952   Mean   :0.002859   Mean   :0.0012650  
##  3rd Qu.:0.0048058   3rd Qu.:0.003549   3rd Qu.:0.003691   3rd Qu.:0.0013430  
##  Max.   :0.0051625   Max.   :0.004275   Max.   :0.004085   Max.   :0.0021641  
##       CGTA                CTTC               ATTT               CGTC          
##  Min.   :0.0006038   Min.   :0.002296   Min.   :0.002098   Min.   :0.0007170  
##  1st Qu.:0.0008710   1st Qu.:0.003172   1st Qu.:0.002704   1st Qu.:0.0009202  
##  Median :0.0009004   Median :0.005309   Median :0.006143   Median :0.0009849  
##  Mean   :0.0009690   Mean   :0.004728   Mean   :0.005188   Mean   :0.0010024  
##  3rd Qu.:0.0011404   3rd Qu.:0.005492   3rd Qu.:0.006839   3rd Qu.:0.0010753  
##  Max.   :0.0014488   Max.   :0.008466   Max.   :0.012129   Max.   :0.0016191  
##       GTAT               TTGA                TGGT               GTAC         
##  Min.   :0.001581   Min.   :0.0004935   Min.   :0.002294   Min.   :0.001014  
##  1st Qu.:0.002676   1st Qu.:0.0016770   1st Qu.:0.002883   1st Qu.:0.001341  
##  Median :0.002809   Median :0.0034272   Median :0.003532   Median :0.001381  
##  Mean   :0.002842   Mean   :0.0028896   Mean   :0.003346   Mean   :0.001428  
##  3rd Qu.:0.003147   3rd Qu.:0.0035855   3rd Qu.:0.003643   3rd Qu.:0.001557  
##  Max.   :0.005572   Max.   :0.0039643   Max.   :0.004377   Max.   :0.002108  
##       TTAG                TCGT                CTAC               TTGG          
##  Min.   :0.0003805   Min.   :8.181e-05   Min.   :0.001289   Min.   :0.0003397  
##  1st Qu.:0.0010875   1st Qu.:1.696e-04   1st Qu.:0.001569   1st Qu.:0.0009718  
##  Median :0.0023496   Median :4.200e-04   Median :0.003060   Median :0.0026845  
##  Mean   :0.0019530   Mean   :3.476e-04   Mean   :0.002676   Mean   :0.0021643  
##  3rd Qu.:0.0024648   3rd Qu.:4.349e-04   3rd Qu.:0.003221   3rd Qu.:0.0027672  
##  Max.   :0.0029018   Max.   :6.536e-04   Max.   :0.007059   Max.   :0.0033687  
##       ACGC                CGCG                TCAG               ATAC         
##  Min.   :0.0003036   Min.   :0.0001145   Min.   :0.001009   Min.   :0.001491  
##  1st Qu.:0.0003594   1st Qu.:0.0002623   1st Qu.:0.002899   1st Qu.:0.002290  
##  Median :0.0003832   Median :0.0002765   Median :0.003739   Median :0.002405  
##  Mean   :0.0003914   Mean   :0.0002997   Mean   :0.003481   Mean   :0.002401  
##  3rd Qu.:0.0004123   3rd Qu.:0.0003013   3rd Qu.:0.003845   3rd Qu.:0.002622  
##  Max.   :0.0007267   Max.   :0.0009874   Max.   :0.004673   Max.   :0.004557  
##       TGGA               GTAG               TACT               ACGT          
##  Min.   :0.003977   Min.   :0.002104   Min.   :0.002230   Min.   :0.0004811  
##  1st Qu.:0.004958   1st Qu.:0.002202   1st Qu.:0.003179   1st Qu.:0.0005127  
##  Median :0.006041   Median :0.002245   Median :0.003483   Median :0.0005250  
##  Mean   :0.005815   Mean   :0.002376   Mean   :0.003409   Mean   :0.0005599  
##  3rd Qu.:0.006398   3rd Qu.:0.002642   3rd Qu.:0.003636   3rd Qu.:0.0006371  
##  Max.   :0.008511   Max.   :0.003261   Max.   :0.004307   Max.   :0.0007101  
##       GATC                ATCT                CGGC          
##  Min.   :0.0009177   Min.   :0.0008322   Min.   :0.0004467  
##  1st Qu.:0.0018457   1st Qu.:0.0010949   1st Qu.:0.0008398  
##  Median :0.0019858   Median :0.0032574   Median :0.0008817  
##  Mean   :0.0018882   Mean   :0.0026068   Mean   :0.0009242  
##  3rd Qu.:0.0020487   3rd Qu.:0.0034151   3rd Qu.:0.0009554  
##  Max.   :0.0035019   Max.   :0.0056962   Max.   :0.0022846  
##       CGAC                CGAT               ATCA                ACCG          
##  Min.   :0.0002929   Min.   :0.000884   Min.   :0.0006918   Min.   :0.0004739  
##  1st Qu.:0.0005518   1st Qu.:0.001026   1st Qu.:0.0009202   1st Qu.:0.0005649  
##  Median :0.0005705   Median :0.001060   Median :0.0032631   Median :0.0006138  
##  Mean   :0.0005795   Mean   :0.001075   Mean   :0.0025481   Mean   :0.0006232  
##  3rd Qu.:0.0005975   3rd Qu.:0.001102   3rd Qu.:0.0034032   3rd Qu.:0.0006665  
##  Max.   :0.0009464   Max.   :0.001916   Max.   :0.0048867   Max.   :0.0010016  
##       TTCA                GTCC               GCCG                TAGT         
##  Min.   :0.0006472   Min.   :0.001528   Min.   :0.0005879   Min.   :0.001512  
##  1st Qu.:0.0012519   1st Qu.:0.001645   1st Qu.:0.0007359   1st Qu.:0.001841  
##  Median :0.0040287   Median :0.001741   Median :0.0008148   Median :0.002311  
##  Mean   :0.0032096   Mean   :0.001803   Mean   :0.0008662   Mean   :0.002161  
##  3rd Qu.:0.0042349   3rd Qu.:0.001986   3rd Qu.:0.0009691   3rd Qu.:0.002413  
##  Max.   :0.0057198   Max.   :0.002193   Max.   :0.0021263   Max.   :0.002606  
##       ACGA                GCGT                CTCG          
##  Min.   :0.0003693   Min.   :0.0003422   Min.   :0.0002895  
##  1st Qu.:0.0004040   1st Qu.:0.0003990   1st Qu.:0.0004305  
##  Median :0.0004170   Median :0.0004292   Median :0.0009141  
##  Mean   :0.0004211   Mean   :0.0004485   Mean   :0.0008130  
##  3rd Qu.:0.0004366   3rd Qu.:0.0004932   3rd Qu.:0.0009866  
##  Max.   :0.0005431   Max.   :0.0008635   Max.   :0.0023204  
##       TCCG                TCGG                GTCT               TTCC          
##  Min.   :0.0001150   Min.   :5.283e-05   Min.   :0.000874   Min.   :0.0006142  
##  1st Qu.:0.0002974   1st Qu.:1.149e-04   1st Qu.:0.001235   1st Qu.:0.0012657  
##  Median :0.0004463   Median :3.381e-04   Median :0.002919   Median :0.0035202  
##  Mean   :0.0004227   Mean   :2.827e-04   Mean   :0.002404   Mean   :0.0028661  
##  3rd Qu.:0.0004742   3rd Qu.:3.560e-04   3rd Qu.:0.002987   3rd Qu.:0.0036743  
##  Max.   :0.0009986   Max.   :7.922e-04   Max.   :0.004203   Max.   :0.0043989  
##       TCGC                GCGA                GACG          
##  Min.   :3.912e-05   Min.   :0.0002849   Min.   :0.0003813  
##  1st Qu.:1.015e-04   1st Qu.:0.0003421   1st Qu.:0.0004607  
##  Median :2.830e-04   Median :0.0003618   Median :0.0005039  
##  Mean   :2.396e-04   Mean   :0.0003707   Mean   :0.0005291  
##  3rd Qu.:2.986e-04   3rd Qu.:0.0003896   3rd Qu.:0.0005982  
##  Max.   :6.348e-04   Max.   :0.0007549   Max.   :0.0008905  
##       TTAC                TCGA                TACG          
##  Min.   :0.0003610   Min.   :9.825e-05   Min.   :0.0002086  
##  1st Qu.:0.0009079   1st Qu.:2.412e-04   1st Qu.:0.0003623  
##  Median :0.0021535   Median :6.637e-04   Median :0.0003698  
##  Mean   :0.0017814   Mean   :5.638e-04   Mean   :0.0003706  
##  3rd Qu.:0.0022875   3rd Qu.:7.262e-04   3rd Qu.:0.0003784  
##  Max.   :0.0026693   Max.   :1.823e-03   Max.   :0.0004968  
##       TTCG                GTCG                ATCG          
##  Min.   :5.807e-05   Min.   :0.0002218   Min.   :0.0001089  
##  1st Qu.:1.659e-04   1st Qu.:0.0002531   1st Qu.:0.0001660  
##  Median :5.011e-04   Median :0.0002728   Median :0.0004993  
##  Mean   :4.096e-04   Mean   :0.0002817   Mean   :0.0004114  
##  3rd Qu.:5.246e-04   3rd Qu.:0.0003069   3rd Qu.:0.0005314  
##  Max.   :1.094e-03   Max.   :0.0004958   Max.   :0.0012315
# Keep only the samples that have a DELFI_Score; rows where it is NA are dropped
data_clean <- filter(data, !is.na(DELFI_Score))

3 ) 🧬 DELFI, TF_Score and mtcfDNA fraction by Group

3.1 Normality test

library(ggpubr)
library(rstatix)
library(tidyverse)
library(nortest)

# Fix the level order of Group to Healthy, then CLD, then HCC
data_clean[["Group"]] <- factor(
  x = data_clean[["Group"]],
  levels = c("Healthy", "CLD", "HCC")
)

# ============================================
# 1. NORMALITY TESTING
# ============================================

# Run a set of normality diagnostics on one numeric column and print a report.
#
# Arguments:
#   data           Data frame holding a `Group` column and the column named by
#                  `variable_name`.
#   variable_name  Single string: name of the numeric column to examine.
#
# Printed to the console, in order:
#   1. Shapiro-Wilk test on the pooled values.
#   2. Shapiro-Wilk test within each level of `Group`.
#   3. Anderson-Darling test on the pooled values (nortest::ad.test).
#   4. Per-group outliers found by rstatix::identify_outliers().
#
# Returns a list with:
#   overall_normal  TRUE when the pooled Shapiro-Wilk p-value is >= 0.05.
#   group_normal    TRUE when every per-group Shapiro-Wilk p-value is >= 0.05.
#   has_outliers    TRUE when identify_outliers() returned at least one row.
#   recommendation  "Use parametric tests (ANOVA)" only when both flags above
#                   are TRUE, otherwise the Kruskal-Wallis recommendation.
#
# NOTE(review): the pooled tests ignore the group structure, while ANOVA's
# normality assumption concerns the within-group residuals. Confirm that the
# pooled result is really meant to gate the recommendation.
test_normality <- function(data, variable_name) {
  # Fail early with a readable message instead of an opaque error from
  # shapiro.test() further down.
  if (!is.data.frame(data)) {
    stop("`data` must be a data frame.", call. = FALSE)
  }
  if (!is.character(variable_name) || length(variable_name) != 1L) {
    stop("`variable_name` must be a single string.", call. = FALSE)
  }
  if (!variable_name %in% names(data)) {
    stop("Column '", variable_name, "' not found in `data`.", call. = FALSE)
  }
  if (!"Group" %in% names(data)) {
    stop("`data` must contain a 'Group' column.", call. = FALSE)
  }
  if (!is.numeric(data[[variable_name]])) {
    stop("Column '", variable_name, "' must be numeric.", call. = FALSE)
  }

  # Significance threshold; the report text below quotes 0.05 literally.
  alpha <- 0.05

  # Scalar verdict text shared by the pooled tests.
  describe_normality <- function(p_value) {
    if (p_value < alpha) {
      "NOT normally distributed (p < 0.05)"
    } else {
      "Normally distributed (p >= 0.05)"
    }
  }

  rule <- strrep("=", 50)
  cat("\n", rule, "\n")
  cat("NORMALITY TESTS FOR:", variable_name, "\n")
  cat(rule, "\n\n")

  # Get the variable
  var_data <- data[[variable_name]]

  # 1. Overall Shapiro-Wilk test
  overall_shapiro <- shapiro.test(var_data)
  cat("Overall Shapiro-Wilk test:\n")
  cat(sprintf(
    "  W = %.4f, p-value = %.4f\n",
    overall_shapiro$statistic, overall_shapiro$p.value
  ))
  cat(sprintf(
    "  Conclusion: %s\n\n",
    describe_normality(overall_shapiro$p.value)
  ))

  # 2. Shapiro-Wilk test by group
  # The test is run once per group and kept in a temporary list column, then
  # unpacked into the statistic, p-value and label columns.
  cat("Shapiro-Wilk test by group:\n")
  shapiro_by_group <- data %>%
    group_by(Group) %>%
    summarise(
      n = n(),
      shapiro = list(shapiro.test(.data[[variable_name]])),
      .groups = "drop"
    ) %>%
    mutate(
      W_statistic = vapply(
        shapiro, function(res) unname(res$statistic), numeric(1)
      ),
      p_value = vapply(shapiro, function(res) res$p.value, numeric(1)),
      normality = ifelse(p_value < alpha, "NOT Normal", "Normal")
    ) %>%
    select(-shapiro)
  print(as.data.frame(shapiro_by_group))

  # 3. Anderson-Darling test (more sensitive to tails)
  ad_test <- ad.test(var_data)
  cat("\nAnderson-Darling test:\n")
  cat(sprintf(
    "  A = %.4f, p-value = %.4f\n",
    ad_test$statistic, ad_test$p.value
  ))
  cat(sprintf("  Conclusion: %s\n", describe_normality(ad_test$p.value)))

  # 4. Check for outliers
  outliers <- data %>%
    group_by(Group) %>%
    identify_outliers(all_of(variable_name))
  has_outliers <- nrow(outliers) > 0

  cat("\nOutlier Detection:\n")
  if (has_outliers) {
    cat(sprintf(
      "  Found %d outliers (%d extreme)\n",
      nrow(outliers),
      sum(outliers$is.extreme)
    ))
    print(
      outliers %>%
        select(Group, all_of(variable_name), is.outlier, is.extreme)
    )
  } else {
    cat("  No outliers detected\n")
  }

  # Return results
  overall_normal <- overall_shapiro$p.value >= alpha
  group_normal <- all(shapiro_by_group$p_value >= alpha)

  list(
    overall_normal = overall_normal,
    group_normal = group_normal,
    has_outliers = has_outliers,
    recommendation = if (overall_normal && group_normal) {
      "Use parametric tests (ANOVA)"
    } else {
      "Use non-parametric tests (Kruskal-Wallis)"
    }
  )
}

# Test normality for all three biomarkers
delfi_norm <- test_normality(data_clean, "DELFI_Score")
## 
##  ================================================== 
## NORMALITY TESTS FOR: DELFI_Score 
## ================================================== 
## 
## Overall Shapiro-Wilk test:
##   W = 0.7977, p-value = 0.0000
##   Conclusion: NOT normally distributed (p < 0.05)
## 
## Shapiro-Wilk test by group:
##     Group   n W_statistic      p_value  normality
## 1 Healthy 150   0.9897750 3.468464e-01     Normal
## 2     CLD 186   0.8528740 2.016845e-12 NOT Normal
## 3     HCC 139   0.8641797 5.804626e-10 NOT Normal
## 
## Anderson-Darling test:
##   A = 20.9481, p-value = 0.0000
##   Conclusion: NOT normally distributed (p < 0.05)
## 
## Outlier Detection:
##   Found 18 outliers (3 extreme)
## # A tibble: 18 × 4
##    Group   DELFI_Score is.outlier is.extreme
##    <fct>         <dbl> <lgl>      <lgl>     
##  1 Healthy      0.0921 TRUE       FALSE     
##  2 CLD          0.437  TRUE       FALSE     
##  3 CLD          0.404  TRUE       FALSE     
##  4 CLD          0.410  TRUE       FALSE     
##  5 CLD          0.411  TRUE       FALSE     
##  6 CLD          0.464  TRUE       FALSE     
##  7 CLD          0.703  TRUE       TRUE      
##  8 CLD          0.459  TRUE       FALSE     
##  9 CLD          0.446  TRUE       FALSE     
## 10 HCC          0.583  TRUE       FALSE     
## 11 HCC          0.903  TRUE       TRUE      
## 12 HCC          0.623  TRUE       FALSE     
## 13 HCC          0.647  TRUE       FALSE     
## 14 HCC          0.677  TRUE       FALSE     
## 15 HCC          0.930  TRUE       TRUE      
## 16 HCC          0.622  TRUE       FALSE     
## 17 HCC          0.609  TRUE       FALSE     
## 18 HCC          0.649  TRUE       FALSE
tf_norm <- test_normality(data_clean, "TF_Score")
## 
##  ================================================== 
## NORMALITY TESTS FOR: TF_Score 
## ================================================== 
## 
## Overall Shapiro-Wilk test:
##   W = 0.5058, p-value = 0.0000
##   Conclusion: NOT normally distributed (p < 0.05)
## 
## Shapiro-Wilk test by group:
##     Group   n W_statistic      p_value  normality
## 1 Healthy 150   0.6584843 3.472946e-17 NOT Normal
## 2     CLD 186   0.4059785 1.925713e-24 NOT Normal
## 3     HCC 139   0.7732059 2.190308e-13 NOT Normal
## 
## Anderson-Darling test:
##   A = 83.9776, p-value = 0.0000
##   Conclusion: NOT normally distributed (p < 0.05)
## 
## Outlier Detection:
##   Found 44 outliers (20 extreme)
## # A tibble: 44 × 4
##    Group   TF_Score is.outlier is.extreme
##    <fct>      <dbl> <lgl>      <lgl>     
##  1 Healthy  0.00251 TRUE       FALSE     
##  2 Healthy  0.00264 TRUE       FALSE     
##  3 Healthy  0.00256 TRUE       FALSE     
##  4 Healthy  0.00211 TRUE       FALSE     
##  5 Healthy  0.00444 TRUE       TRUE      
##  6 Healthy  0.00307 TRUE       TRUE      
##  7 Healthy  0.00249 TRUE       FALSE     
##  8 Healthy  0.00219 TRUE       FALSE     
##  9 Healthy  0.00410 TRUE       TRUE      
## 10 Healthy  0.00254 TRUE       FALSE     
## # ℹ 34 more rows
mt_norm <- test_normality(data_clean, "mtcfDNA_fraction")
## 
##  ================================================== 
## NORMALITY TESTS FOR: mtcfDNA_fraction 
## ================================================== 
## 
## Overall Shapiro-Wilk test:
##   W = 0.5517, p-value = 0.0000
##   Conclusion: NOT normally distributed (p < 0.05)
## 
## Shapiro-Wilk test by group:
##     Group   n W_statistic      p_value  normality
## 1 Healthy 150   0.8531280 6.176345e-11 NOT Normal
## 2     CLD 186   0.7973768 8.620336e-15 NOT Normal
## 3     HCC 139   0.5804583 3.009384e-18 NOT Normal
## 
## Anderson-Darling test:
##   A = 47.5761, p-value = 0.0000
##   Conclusion: NOT normally distributed (p < 0.05)
## 
## Outlier Detection:
##   Found 31 outliers (12 extreme)
## # A tibble: 31 × 4
##    Group   mtcfDNA_fraction is.outlier is.extreme
##    <fct>              <dbl> <lgl>      <lgl>     
##  1 Healthy          0.00327 TRUE       TRUE      
##  2 Healthy          0.00315 TRUE       TRUE      
##  3 Healthy          0.00272 TRUE       FALSE     
##  4 Healthy          0.00229 TRUE       FALSE     
##  5 Healthy          0.00296 TRUE       FALSE     
##  6 Healthy          0.00221 TRUE       FALSE     
##  7 Healthy          0.00233 TRUE       FALSE     
##  8 Healthy          0.00256 TRUE       FALSE     
##  9 Healthy          0.00268 TRUE       FALSE     
## 10 Healthy          0.00228 TRUE       FALSE     
## # ℹ 21 more rows
# Summary of recommendations
cat("\n", paste(rep("=", 50), collapse=""), "\n")
## 
##  ==================================================
cat("STATISTICAL TEST RECOMMENDATIONS\n")
## STATISTICAL TEST RECOMMENDATIONS
cat(paste(rep("=", 50), collapse=""), "\n")
## ==================================================
cat("DELFI_Score:", delfi_norm$recommendation, "\n")
## DELFI_Score: Use non-parametric tests (Kruskal-Wallis)
cat("TF_Score:", tf_norm$recommendation, "\n")
## TF_Score: Use non-parametric tests (Kruskal-Wallis)
cat("mtcfDNA_fraction:", mt_norm$recommendation, "\n")
## mtcfDNA_fraction: Use non-parametric tests (Kruskal-Wallis)

🕵🏻 The Shapiro-Wilk test assesses the null hypothesis that a sample came from a normally distributed population. A small p-value (typically < 0.05) means you reject the null hypothesis and conclude the data is not normally distributed.

CLD: The DELFI_Score p-value is 2.02 × 10⁻¹² (2.02e-12), which is far below 0.05. These data are not normally distributed.

HCC: The DELFI_Score p-value is 5.80 × 10⁻¹⁰ (5.80e-10), also far below 0.05. These data are also not normally distributed.

Healthy: The p-value is 0.347, which is greater than 0.05. This data is consistent with a normal distribution.

Since a parametric test like a one-way ANOVA assumes that the data within every group are approximately normally distributed, it is not suitable for my analysis: for DELFI_Score two of the three groups (CLD and HCC) violate this assumption, and for TF_Score and mtcfDNA_fraction all three groups do.

Therefore, the correct choice is Kruskal-Wallis test.

3.2 Visualization of Normality

# ============================================
# 2. VISUAL ASSESSMENT OF NORMALITY
# ============================================

# Build a stacked diagnostic figure for one variable: a Q-Q plot, a histogram
# with density curve and a boxplot, each split or coloured by `Group`.
# `variable_name` is the column name as a string; the combined patchwork
# object is returned.
create_normality_plots <- function(data, variable_name) {
  group_palette <- c("#4CAF50", "#FFC107", "#F44336")

  # Top panel: Q-Q plot, one facet per group
  qq_panel <- ggqqplot(
    data,
    x = variable_name,
    color = "Group",
    palette = group_palette,
    facet.by = "Group"
  )
  qq_panel <- qq_panel + labs(
    title = paste("Q-Q Plot:", variable_name),
    subtitle = "Points should follow the diagonal line for normal distribution"
  )

  # Middle panel: histogram with density curve, one facet per group
  hist_panel <- gghistogram(
    data,
    x = variable_name,
    fill = "Group",
    palette = group_palette,
    facet.by = "Group",
    add_density = TRUE,
    bins = 15
  )
  hist_panel <- hist_panel + labs(
    title = paste("Histogram:", variable_name),
    subtitle = "Check for bell-shaped distribution"
  )

  # Bottom panel: boxplot with jittered points to show outliers and symmetry
  box_panel <- ggboxplot(
    data,
    x = "Group",
    y = variable_name,
    fill = "Group",
    palette = group_palette,
    add = "jitter",
    add.params = list(alpha = 0.3)
  )
  box_panel <- box_panel +
    labs(
      title = paste("Boxplot:", variable_name),
      subtitle = "Check for outliers and symmetry"
    ) +
    theme(legend.position = "none")

  # patchwork supplies the `/` stacking operator and plot_annotation()
  library(patchwork)
  figure_title <- paste("Normality Assessment for", variable_name)
  stacked <- qq_panel / hist_panel / box_panel
  stacked + plot_annotation(
    title = figure_title,
    theme = theme(plot.title = element_text(size = 16, face = "bold"))
  )
}

# Build the diagnostic figure for each of the three biomarkers
delfi_norm_plots <- create_normality_plots(data = data_clean, variable_name = "DELFI_Score")
tf_norm_plots <- create_normality_plots(data = data_clean, variable_name = "TF_Score")
mt_norm_plots <- create_normality_plots(data = data_clean, variable_name = "mtcfDNA_fraction")

# Show each figure, then write it to a 12 x 14 inch PNG at 300 dpi
print(delfi_norm_plots)
ggsave(
  filename = "delfi_normality_assessment.png",
  plot = delfi_norm_plots,
  width = 12, height = 14, dpi = 300
)

print(tf_norm_plots)
ggsave(
  filename = "tf_normality_assessment.png",
  plot = tf_norm_plots,
  width = 12, height = 14, dpi = 300
)

print(mt_norm_plots)
ggsave(
  filename = "mt_normality_assessment.png",
  plot = mt_norm_plots,
  width = 12, height = 14, dpi = 300
)

🕵🏻 The Shapiro-Wilk test assesses the null hypothesis that a sample came from a normally distributed population. A small p-value (typically < 0.05) means you reject the null hypothesis and conclude the data is not normally distributed.

CLD: The DELFI_Score p-value is 2.02 × 10⁻¹² (2.02e-12), which is far below 0.05. These data are not normally distributed.

HCC: The DELFI_Score p-value is 5.80 × 10⁻¹⁰ (5.80e-10), also far below 0.05. These data are also not normally distributed.

Healthy: The p-value is 0.347, which is greater than 0.05. This data is consistent with a normal distribution.

Since a parametric test like a one-way ANOVA requires that the data in all groups be approximately normally distributed, it is not suitable for my analysis. Two of my three groups violate this critical assumption.

Therefore, the correct choice is Kruskal-Wallis test.

3.3 Homogeneity of Variance Test

# ============================================
# 3. HOMOGENEITY OF VARIANCE TESTING
# ============================================

cat("\n", paste(rep("=", 50), collapse=""), "\n")
## 
##  ==================================================
cat("HOMOGENEITY OF VARIANCE TESTS\n")
## HOMOGENEITY OF VARIANCE TESTS
cat(paste(rep("=", 50), collapse=""), "\n\n")
## ==================================================
# Levene's test of equal variances across Group, one call per biomarker
levene_delfi <- levene_test(data_clean, DELFI_Score ~ Group)
levene_tf <- levene_test(data_clean, TF_Score ~ Group)
levene_mt <- levene_test(data_clean, mtcfDNA_fraction ~ Group)

cat("Levene's Test Results:\n")
## Levene's Test Results:
cat("(H0: variances are equal across groups)\n\n")
## (H0: variances are equal across groups)
cat("DELFI_Score:\n")
## DELFI_Score:
print(levene_delfi)
## # A tibble: 1 × 4
##     df1   df2 statistic        p
##   <int> <int>     <dbl>    <dbl>
## 1     2   472      40.7 4.83e-17
cat(sprintf("Conclusion: %s\n\n", 
            ifelse(levene_delfi$p < 0.05, 
                   "Variances are NOT equal (use Welch's ANOVA or non-parametric)",
                   "Variances are equal (can use standard ANOVA if normal)")))
## Conclusion: Variances are NOT equal (use Welch's ANOVA or non-parametric)
cat("TF_Score:\n")
## TF_Score:
print(levene_tf)
## # A tibble: 1 × 4
##     df1   df2 statistic        p
##   <int> <int>     <dbl>    <dbl>
## 1     2   472      94.5 2.95e-35
cat(sprintf("Conclusion: %s\n\n", 
            ifelse(levene_tf$p < 0.05, 
                   "Variances are NOT equal (use Welch's ANOVA or non-parametric)",
                   "Variances are equal (can use standard ANOVA if normal)")))
## Conclusion: Variances are NOT equal (use Welch's ANOVA or non-parametric)
cat("mtcfDNA_fraction:\n")
## mtcfDNA_fraction:
print(levene_mt)
## # A tibble: 1 × 4
##     df1   df2 statistic        p
##   <int> <int>     <dbl>    <dbl>
## 1     2   472      26.9 8.82e-12
cat(sprintf("Conclusion: %s\n\n", 
            ifelse(levene_mt$p < 0.05, 
                   "Variances are NOT equal (use Welch's ANOVA or non-parametric)",
                   "Variances are equal (can use standard ANOVA if normal)")))
## Conclusion: Variances are NOT equal (use Welch's ANOVA or non-parametric)

3.4 Final Statistical Decision Tree

## 
##  ---------------------------------------- 
## Decision for DELFI_Score :
## ---------------------------------------- 
## ✗ Data is NOT normally distributed
## → RECOMMENDATION: Use Kruskal-Wallis with Dunn's post-hoc
## 
##  ---------------------------------------- 
## Decision for TF_Score :
## ---------------------------------------- 
## ✗ Data is NOT normally distributed
## → RECOMMENDATION: Use Kruskal-Wallis with Dunn's post-hoc
## 
##  ---------------------------------------- 
## Decision for mtcfDNA_fraction :
## ---------------------------------------- 
## ✗ Data is NOT normally distributed
## → RECOMMENDATION: Use Kruskal-Wallis with Dunn's post-hoc
## 
## 
##  ==================================================
## FINAL STATISTICAL TEST RECOMMENDATIONS
## ==================================================
##          Biomarker Normal_Distribution Equal_Variances Has_Outliers
## 1      DELFI_Score               FALSE           FALSE         TRUE
## 2         TF_Score               FALSE           FALSE         TRUE
## 3 mtcfDNA_fraction               FALSE           FALSE         TRUE
##   Recommended_Test
## 1   Kruskal-Wallis
## 2   Kruskal-Wallis
## 3   Kruskal-Wallis

3.5 Apply the Appropriate Tests Based on Results

# ============================================
# 5. APPLY APPROPRIATE STATISTICAL TESTS
# ============================================

# Run and print the omnibus test and matching post-hoc test for one biomarker.
#
# Args:
#   data:          data frame with a `Group` column and the response column.
#   variable_name: single string naming the response column.
#   test_type:     "ANOVA" -> one-way ANOVA + Tukey's HSD;
#                  "Welch" -> Welch's ANOVA + Games-Howell;
#                  any other value -> Kruskal-Wallis + Dunn's test
#                  (Bonferroni adjusted). Unrecognised values therefore fall
#                  through to the non-parametric branch without a warning.
#
# Returns:
#   Invisibly, the value returned by print() for the post-hoc result.
perform_appropriate_analysis <- function(data, variable_name, test_type) {
  # Fail early with a readable message instead of an "object not found" error
  stopifnot(
    is.data.frame(data),
    is.character(variable_name),
    length(variable_name) == 1L
  )
  missing_cols <- setdiff(c("Group", variable_name), names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in data: ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  banner <- paste(rep("=", 50), collapse = "")
  cat("\n", banner, "\n")
  cat("ANALYSIS FOR", variable_name, "using", test_type, "\n")
  cat(banner, "\n\n")

  # Build `<variable_name> ~ Group` once; every branch uses the same formula
  model_formula <- as.formula(paste(variable_name, "~ Group"))

  if (test_type == "ANOVA") {
    # Parametric ANOVA
    aov_result <- aov(model_formula, data = data)
    cat("One-way ANOVA:\n")
    print(summary(aov_result))

    # Tukey's HSD post-hoc
    tukey_result <- TukeyHSD(aov_result)
    cat("\nTukey's HSD Post-hoc:\n")
    print(tukey_result)

  } else if (test_type == "Welch") {
    # Welch's ANOVA (does not assume equal variances)
    welch_result <- oneway.test(model_formula, data = data, var.equal = FALSE)
    cat("Welch's ANOVA:\n")
    print(welch_result)

    # Games-Howell post-hoc; namespace-qualified so nothing is attached here
    gh_result <- rstatix::games_howell_test(data, model_formula)
    cat("\nGames-Howell Post-hoc:\n")
    print(gh_result)

  } else {
    # Kruskal-Wallis (non-parametric)
    kw_result <- kruskal.test(model_formula, data = data)
    cat("Kruskal-Wallis test:\n")
    print(kw_result)

    # Dunn's post-hoc
    dunn_result <- rstatix::dunn_test(data, model_formula,
                                      p.adjust.method = "bonferroni")
    cat("\nDunn's Post-hoc (Bonferroni adjusted):\n")
    print(dunn_result)
  }
}

# Apply appropriate tests based on decisions
perform_appropriate_analysis(data_clean, "DELFI_Score", delfi_decision)
## 
##  ================================================== 
## ANALYSIS FOR DELFI_Score using Kruskal-Wallis 
## ================================================== 
## 
## Kruskal-Wallis test:
## 
##  Kruskal-Wallis rank sum test
## 
## data:  DELFI_Score by Group
## Kruskal-Wallis chi-squared = 166.62, df = 2, p-value < 2.2e-16
## 
## 
## Dunn's Post-hoc (Bonferroni adjusted):
## # A tibble: 3 × 9
##   .y.         group1 group2    n1    n2 statistic        p    p.adj p.adj.signif
## * <chr>       <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
## 1 DELFI_Score Healt… CLD      150   186     10.2  1.91e-24 5.72e-24 ****        
## 2 DELFI_Score Healt… HCC      150   139     12.0  2.26e-33 6.78e-33 ****        
## 3 DELFI_Score CLD    HCC      186   139      2.65 7.98e- 3 2.40e- 2 *
perform_appropriate_analysis(data_clean, "TF_Score", tf_decision)
## 
##  ================================================== 
## ANALYSIS FOR TF_Score using Kruskal-Wallis 
## ================================================== 
## 
## Kruskal-Wallis test:
## 
##  Kruskal-Wallis rank sum test
## 
## data:  TF_Score by Group
## Kruskal-Wallis chi-squared = 370.14, df = 2, p-value < 2.2e-16
## 
## 
## Dunn's Post-hoc (Bonferroni adjusted):
## # A tibble: 3 × 9
##   .y.      group1  group2    n1    n2 statistic        p    p.adj p.adj.signif
## * <chr>    <chr>   <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
## 1 TF_Score Healthy CLD      150   186     12.3  9.43e-35 2.83e-34 ****        
## 2 TF_Score Healthy HCC      150   139     19.0  1.18e-80 3.54e-80 ****        
## 3 TF_Score CLD     HCC      186   139      7.94 2.09e-15 6.28e-15 ****
perform_appropriate_analysis(data_clean, "mtcfDNA_fraction", mt_decision)
## 
##  ================================================== 
## ANALYSIS FOR mtcfDNA_fraction using Kruskal-Wallis 
## ================================================== 
## 
## Kruskal-Wallis test:
## 
##  Kruskal-Wallis rank sum test
## 
## data:  mtcfDNA_fraction by Group
## Kruskal-Wallis chi-squared = 234.91, df = 2, p-value < 2.2e-16
## 
## 
## Dunn's Post-hoc (Bonferroni adjusted):
## # A tibble: 3 × 9
##   .y.         group1 group2    n1    n2 statistic        p    p.adj p.adj.signif
## * <chr>       <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
## 1 mtcfDNA_fr… Healt… CLD      150   186     11.5  2.18e-30 6.53e-30 ****        
## 2 mtcfDNA_fr… Healt… HCC      150   139     14.6  1.74e-48 5.22e-48 ****        
## 3 mtcfDNA_fr… CLD    HCC      186   139      4.15 3.30e- 5 9.90e- 5 ****

3.6 Plots

library(ggpubr)
library(rstatix)
library(tidyverse)
library(patchwork)

# Ensure groups are properly ordered
data_clean$Group <- factor(data_clean$Group,
                           levels = c("Healthy", "CLD", "HCC"))

# ============================================
# Shared helpers for the three biomarker boxplots
# ============================================

# Dunn's post-hoc test (Bonferroni adjusted) of `variable` across Group, with
# the bracket coordinates that stat_pvalue_manual() needs.
# `variable` is the column name as a string.
dunn_with_positions <- function(data, variable, step_increase = 0.08) {
  dunn_formula <- as.formula(paste(variable, "~ Group"))
  data %>%
    dunn_test(dunn_formula, p.adjust.method = "bonferroni") %>%
    add_significance() %>%
    add_xy_position(x = "Group", step.increase = step_increase)
}

# Boxplot of `variable` by Group with jittered points and significance
# brackets taken from `dunn_result`. `label` is the human-readable biomarker
# name used in the title and on the y axis.
# NOTE(review): the Kruskal-Wallis p-value in the subtitle is a fixed string,
# not computed from `data`; update it if the data change.
biomarker_boxplot <- function(data, variable, label, dunn_result) {
  ggboxplot(data,
            x = "Group",
            y = variable,
            fill = "Group",
            palette = c("#4CAF50", "#FFC107", "#F44336"),
            add = "jitter",
            add.params = list(alpha = 0.3, size = 1.5)) +
    stat_pvalue_manual(dunn_result,
                       label = "p.adj.signif",
                       tip.length = 0.01,
                       hide.ns = FALSE,
                       size = 4) +
    labs(title = paste(label, "Distribution by Group"),
         subtitle = "Kruskal-Wallis: p < 2.2e-16 | Post-hoc: Dunn's test (Bonferroni adjusted)",
         y = label,
         x = "Group") +
    # Extra headroom at the top so the brackets are not clipped
    scale_y_continuous(expand = expansion(mult = c(0.05, 0.15))) +
    theme_classic() +
    theme(legend.position = "none",
          plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
          plot.subtitle = element_text(hjust = 0.5, size = 10, face = "italic"),
          axis.title = element_text(size = 12),
          axis.text = element_text(size = 11))
}

# ============================================
# 1. DELFI_Score Boxplot
# ============================================

dunn_delfi <- dunn_with_positions(data_clean, "DELFI_Score")
boxplot_delfi <- biomarker_boxplot(data_clean, "DELFI_Score", "DELFI Score", dunn_delfi)

print(boxplot_delfi)

ggsave("delfi_score_boxplot_final.png", boxplot_delfi, width = 8, height = 6, dpi = 300)

# ============================================
# 2. TF_Score Boxplot
# ============================================

dunn_tf <- dunn_with_positions(data_clean, "TF_Score")
boxplot_tf <- biomarker_boxplot(data_clean, "TF_Score", "TF Score", dunn_tf)

print(boxplot_tf)

ggsave("tf_score_boxplot_final.png", boxplot_tf, width = 8, height = 6, dpi = 300)

# ============================================
# 3. mtcfDNA_fraction Boxplot
# ============================================

dunn_mt <- dunn_with_positions(data_clean, "mtcfDNA_fraction")
boxplot_mt <- biomarker_boxplot(data_clean, "mtcfDNA_fraction", "mtcfDNA Fraction", dunn_mt)

print(boxplot_mt)

ggsave("mtcfdna_fraction_boxplot_final.png", boxplot_mt, width = 8, height = 6, dpi = 300)


# ============================================
# 7. Create Summary Table
# ============================================

# Stack the adjusted Dunn p-values of the three biomarkers into one table:
# one row per biomarker and pairwise comparison, with p.adj as formatted text
summary_pvalues <- list(
  DELFI_Score = dunn_delfi,
  TF_Score = dunn_tf,
  mtcfDNA_fraction = dunn_mt
) %>%
  lapply(function(dunn_tbl) {
    select(dunn_tbl, group1, group2, p.adj, p.adj.signif)
  }) %>%
  # The list names become the Biomarker column
  bind_rows(.id = "Biomarker") %>%
  mutate(
    Comparison = paste(group1, "vs", group2),
    p.adj = format(p.adj, scientific = TRUE, digits = 3)
  ) %>%
  select(Biomarker, Comparison, p.adj, p.adj.signif) %>%
  arrange(Biomarker, Comparison)

print("Summary of Post-hoc Comparisons:")
## [1] "Summary of Post-hoc Comparisons:"
print(summary_pvalues)
## # A tibble: 9 × 4
##   Biomarker        Comparison     p.adj    p.adj.signif
##   <chr>            <chr>          <chr>    <chr>       
## 1 DELFI_Score      CLD vs HCC     2.40e-02 *           
## 2 DELFI_Score      Healthy vs CLD 5.72e-24 ****        
## 3 DELFI_Score      Healthy vs HCC 6.78e-33 ****        
## 4 TF_Score         CLD vs HCC     6.28e-15 ****        
## 5 TF_Score         Healthy vs CLD 2.83e-34 ****        
## 6 TF_Score         Healthy vs HCC 3.54e-80 ****        
## 7 mtcfDNA_fraction CLD vs HCC     9.90e-05 ****        
## 8 mtcfDNA_fraction Healthy vs CLD 6.53e-30 ****        
## 9 mtcfDNA_fraction Healthy vs HCC 5.22e-48 ****
# Save summary table as CSV
write.csv(summary_pvalues, "post_hoc_summary.csv", row.names = FALSE)
# ================================
# Panel-tagged boxplots with clear brackets
# ================================
library(ggpubr)
library(rstatix)
library(tidyverse)
library(patchwork)

# Ensure groups are ordered
data_clean$Group <- factor(data_clean$Group, levels = c("Healthy", "CLD", "HCC"))

# Scale the `y.position` column by (1 + y_buffer); for positive positions this
# moves the significance brackets upward. Input without a `y.position` column
# is returned unchanged.
raise_brackets <- function(df, y_buffer = 0.12) {
  if ("y.position" %in% names(df)) {
    df <- mutate(df, y.position = y.position * (1 + y_buffer))
  }
  df
}

# -------------------------------
# Shared helpers for the enhanced (panel) boxplots
# -------------------------------

# Dunn's post-hoc test (Bonferroni adjusted) of `variable` across Group, with
# widely spaced bracket positions that are then scaled by raise_brackets().
# `variable` is the column name as a string.
enhanced_dunn <- function(data, variable) {
  dunn_formula <- as.formula(paste(variable, "~ Group"))
  data %>%
    dunn_test(dunn_formula, p.adjust.method = "bonferroni") %>%
    add_significance() %>%
    add_xy_position(x = "Group", step.increase = 0.18) %>%
    raise_brackets(y_buffer = 0.15)
}

# Boxplot of `variable` by Group with jittered points and thick significance
# brackets from `dunn_result`; `y_label` is the y-axis title.
enhanced_boxplot <- function(data, variable, y_label, dunn_result) {
  ggboxplot(
    data, x = "Group", y = variable,
    fill = "Group",
    palette = c("#4CAF50", "#FFC107", "#F44336"),
    alpha = 0.8, width = 0.7,
    add = "jitter",
    add.params = list(alpha = 0.4, size = 1.2, color = "darkgray"),
    outlier.shape = NA
  ) +
    stat_pvalue_manual(
      dunn_result,
      label = "p.adj.signif",
      tip.length = 0.02,
      hide.ns = FALSE,
      size = 5.5,
      bracket.size = 1.2
    ) +
    labs(title = "", y = y_label, x = "") +
    # Extra headroom at the top so the raised brackets are not clipped
    scale_y_continuous(
      expand = expansion(mult = c(0.05, 0.22)),
      breaks = scales::pretty_breaks(n = 6)
    ) +
    theme_classic(base_size = 11) +
    theme(
      legend.position = "none",
      plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
      axis.title.y = element_text(size = 16),
      axis.text = element_text(size = 15),
      axis.text.x = element_text(size = 15)
    )
}

# -------------------------------
# 1) DELFI Score
# -------------------------------
dunn_delfi <- enhanced_dunn(data_clean, "DELFI_Score")
boxplot_delfi_enhanced <- enhanced_boxplot(
  data_clean, "DELFI_Score", "DELFI Score", dunn_delfi
)

# -------------------------------
# 2) TF Score
# -------------------------------
dunn_tf <- enhanced_dunn(data_clean, "TF_Score")
boxplot_tf_enhanced <- enhanced_boxplot(
  data_clean, "TF_Score", "TF Score", dunn_tf
)

# -------------------------------
# 3) mtcfDNA Fraction
# -------------------------------
dunn_mt <- enhanced_dunn(data_clean, "mtcfDNA_fraction")
boxplot_mt_enhanced <- enhanced_boxplot(
  data_clean, "mtcfDNA_fraction", "mtcfDNA Fraction", dunn_mt
)

# -------------------------------
# 4) Summary statistics table
# -------------------------------
# Descriptive statistics per biomarker and group, rounded to 4 decimals.
# The three biomarker columns are stacked into long format first so that a
# single summarise() call covers all of them.
summary_stats <- data_clean %>%
  pivot_longer(
    cols = all_of(c("DELFI_Score", "TF_Score", "mtcfDNA_fraction")),
    names_to = "Biomarker",
    values_to = "Value"
  ) %>%
  group_by(Biomarker, Group) %>%
  summarise(
    n = n(),
    Mean = mean(Value),
    SD = sd(Value),
    Median = median(Value),
    Q1 = quantile(Value, probs = 0.25),
    Q3 = quantile(Value, probs = 0.75),
    Min = min(Value),
    Max = max(Value),
    .groups = "drop"
  ) %>%
  mutate(across(where(is.numeric), function(col) round(col, digits = 4)))

print("Summary Statistics by Biomarker and Group:")
## [1] "Summary Statistics by Biomarker and Group:"
print(summary_stats)
## # A tibble: 9 × 10
##   Biomarker        Group      n   Mean     SD Median     Q1     Q3    Min    Max
##   <chr>            <fct>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 DELFI_Score      Healt…   150 0.194  0.0316 0.190  0.170  0.219  0.0921 0.267 
## 2 DELFI_Score      CLD      186 0.264  0.0681 0.248  0.220  0.292  0.165  0.703 
## 3 DELFI_Score      HCC      139 0.318  0.136  0.287  0.231  0.369  0.124  0.93  
## 4 TF_Score         Healt…   150 0.0014 0.0005 0.0013 0.0011 0.0015 0.001  0.0044
## 5 TF_Score         CLD      186 0.0151 0.0178 0.0107 0.0096 0.0119 0.0034 0.151 
## 6 TF_Score         HCC      139 0.0856 0.0951 0.0514 0.0138 0.127  0.0056 0.569 
## 7 mtcfDNA_fraction Healt…   150 0.0011 0.0006 0.0009 0.0007 0.0013 0.0003 0.0033
## 8 mtcfDNA_fraction CLD      186 0.0039 0.0032 0.0029 0.0017 0.0049 0.0005 0.0226
## 9 mtcfDNA_fraction HCC      139 0.0068 0.0078 0.0046 0.0028 0.0075 0.0008 0.0699
write.csv(summary_stats, "biomarker_summary_statistics.csv", row.names = FALSE)

# -------------------------------
# 5) Combine with A/B/C panel tags
# -------------------------------
# Three-panel figure (DELFI | TF | mtcfDNA) tagged A, B, C.
# Despite the object name, every panel has its legend switched off.
combined_with_legend <- local({
  no_legend <- theme(legend.position = "none")

  panels <-
    (boxplot_delfi_enhanced + no_legend) +
    (boxplot_tf_enhanced + no_legend) +
    (boxplot_mt_enhanced + theme(
      legend.position = "none",
      legend.title = element_blank(),
      legend.text  = element_text(size = 10)
    ))

  # One row of three equally wide panels
  panel_layout <- plot_layout(ncol = 3, widths = c(1, 1, 1), guides = "collect")

  panel_annotation <- plot_annotation(
    tag_levels = "A",  # adds A, B, C
    title = "",
    subtitle = "",
    caption = "",
    theme = theme(
      plot.tag = element_text(face = "bold", size = 28, hjust = -0.2, vjust = 1.2),
      plot.caption = element_text(size = 18, hjust = 0.5, color = "gray40")
    )
  )

  panels + panel_layout + panel_annotation
})

print(combined_with_legend)

ggsave(
  "biomarkers_horizontal_with_legend.png",
  combined_with_legend,
  width = 15, height = 12, dpi = 300, bg = "white"
)

4 ) 🧬 End Motif Analysis

4.1 Load and Inspect End Motif Data

# Load required libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(reshape2)  # For melt function
library(pheatmap)
library(viridis)
library(RColorBrewer)
library(gridExtra)
library(dendextend)


# Import the end-motif table from disk (text columns stay character)
my_data <- read.csv(
  file = "/Users/sultanalharbi/Library/CloudStorage/OneDrive-Personal/Projects/Thesis_Chapters/Chapter 3 (Diagnostic Indicators for HCC)/HCC_Diagnostic_Tables/EndMotif_only.csv",
  stringsAsFactors = FALSE
)

# Show the first five column names as a quick sanity check
print("Column names in data:")
## [1] "Column names in data:"
print(names(my_data)[seq_len(5)])
## [1] "Sample_ID" "Group"     "AACC"      "ACCC"      "CCCT"
# Pick the ID column: "Sample" wins, then "Sample_ID", otherwise the first
# column of the table (intersect() keeps the preference order given here)
id_col <- c(
  intersect(c("Sample", "Sample_ID"), names(my_data)),
  names(my_data)[1]
)[1]

print(paste("Using ID column:", id_col))
## [1] "Using ID column: Sample_ID"
# Mirror the ID into a Sample_ID column when it is stored under another name
if (!(id_col == "Sample_ID")) {
  my_data[["Sample_ID"]] <- my_data[[id_col]]
}


# Tabulate how many samples fall into each group
print("Groups present in the data:")
## [1] "Groups present in the data:"
print(table(my_data[["Group"]]))
## 
##     CLD     HCC Healthy 
##     186     139     150
# Warn (and list what is available) if any expected group is absent
if (any(!(c("HCC", "CLD", "Healthy") %in% my_data[["Group"]]))) {
  warning("Not all three groups (HCC, CLD, Healthy) are present in the data!")
  print("Available groups:")
  print(unique(my_data[["Group"]]))
}
# ========== STATISTICAL ANALYSIS ==========

# Every column that is not sample metadata is treated as a motif column
metadata_cols <- c("Sample_ID", "Group", id_col)
motif_cols <- setdiff(x = names(my_data), y = metadata_cols)
print(paste("Number of motif columns found:", length(motif_cols)))
## [1] "Number of motif columns found: 256"
print("First 5 motif columns:")
## [1] "First 5 motif columns:"
print(head(motif_cols, n = 5))
## [1] "AAAA" "AAAC" "AAAG" "AAAT" "AACA"
# Reshape to long format: one row per (sample, motif) pair, keeping the
# sample ID and group alongside each frequency value
data_melted <- pivot_longer(
  select(my_data, Sample_ID, Group, all_of(motif_cols)),
  cols = all_of(motif_cols),
  names_to = "Motif",
  values_to = "Frequency"
)

# Statistical analysis for each motif.
# One summarise() group per motif; for each motif this computes:
#   - mean and standard deviation of the frequency in HCC, CLD and Healthy
#     (NA values ignored)
#   - variance of the frequency over all samples
#   - log2 fold changes between group means (a 0.0001 pseudocount is added
#     to numerator and denominator)
#   - p-value of a one-way ANOVA, Frequency ~ Group (NA if the model cannot
#     be fitted)
#   - p-value of a CLD-vs-HCC t-test with t.test() defaults (NA on error)
#   - which group has the highest mean
motif_statistics <- data_melted %>%
  group_by(Motif) %>%
  summarise(
    # Calculate means for each group
    mean_HCC = mean(Frequency[Group == "HCC"], na.rm = TRUE),
    mean_CLD = mean(Frequency[Group == "CLD"], na.rm = TRUE),
    mean_Healthy = mean(Frequency[Group == "Healthy"], na.rm = TRUE),

    # Calculate standard deviations
    sd_HCC = sd(Frequency[Group == "HCC"], na.rm = TRUE),
    sd_CLD = sd(Frequency[Group == "CLD"], na.rm = TRUE),
    sd_Healthy = sd(Frequency[Group == "Healthy"], na.rm = TRUE),

    # Overall variance
    variance_across_groups = var(Frequency, na.rm = TRUE),

    # Calculate fold changes
    fc_HCC_vs_Healthy = log2((mean_HCC + 0.0001) / (mean_Healthy + 0.0001)),
    fc_CLD_vs_Healthy = log2((mean_CLD + 0.0001) / (mean_Healthy + 0.0001)),
    fc_HCC_vs_CLD = log2((mean_HCC + 0.0001) / (mean_CLD + 0.0001)),

    # Maximum absolute fold change
    max_abs_fc = max(abs(c(fc_HCC_vs_Healthy, fc_CLD_vs_Healthy, fc_HCC_vs_CLD))),

    # CLD vs HCC specific fold change (absolute)
    abs_fc_CLD_HCC = abs(fc_HCC_vs_CLD),

    # Perform ANOVA.
    # Inside a grouped summarise(), Frequency and Group already contain only
    # the current motif's rows, so the model is fitted on them directly
    # rather than re-filtering the whole of data_melted once per motif.
    anova_p = tryCatch({
      if (length(unique(Group)) > 1 && length(Frequency) > 3) {
        aov_result <- aov(
          Frequency ~ Group,
          data = data.frame(Frequency = Frequency, Group = Group)
        )
        summary(aov_result)[[1]][["Pr(>F)"]][1]
      } else {
        NA_real_
      }
    }, error = function(e) NA_real_),

    # Perform t-test for CLD vs HCC
    ttest_CLD_HCC_p = tryCatch({
      t.test(Frequency[Group == "CLD"], Frequency[Group == "HCC"])$p.value
    }, error = function(e) NA_real_),

    # Identify which group has highest mean (ties resolve in the order
    # HCC, CLD, Healthy; "Equal" is the fallback when no comparison is TRUE)
    highest_group = case_when(
      mean_HCC == max(c(mean_HCC, mean_CLD, mean_Healthy), na.rm = TRUE) ~ "HCC",
      mean_CLD == max(c(mean_HCC, mean_CLD, mean_Healthy), na.rm = TRUE) ~ "CLD",
      mean_Healthy == max(c(mean_HCC, mean_CLD, mean_Healthy), na.rm = TRUE) ~ "Healthy",
      TRUE ~ "Equal"
    )
  ) %>%
  ungroup()

# Benjamini-Hochberg (FDR) adjustment of both p-value sets, followed by two
# ranking scores that multiply an absolute log2 fold change by -log10 of the
# adjusted p-value.
# NOTE(review): the 0.0001 offset bounds -log10(p + 0.0001) at 4, so motifs
# whose adjusted p-values are far below 1e-4 all receive practically the same
# significance weight and are then ranked by fold change alone — confirm this
# is the intended behaviour of the "combined" score.
motif_statistics <- motif_statistics %>%
  mutate(
    anova_p_adj = p.adjust(anova_p, method = "BH"),
    ttest_CLD_HCC_p_adj = p.adjust(ttest_CLD_HCC_p, method = "BH"),
    # Overall score: largest pairwise fold change x ANOVA significance
    combined_score = max_abs_fc * -log10(anova_p_adj + 0.0001),
    # CLD-vs-HCC score: that contrast's fold change x its t-test significance
    CLD_HCC_score = abs_fc_CLD_HCC * -log10(ttest_CLD_HCC_p_adj + 0.0001)
  )
# The 50 best-scoring motifs among those with a usable adjusted ANOVA p-value
top_50_combined <- motif_statistics %>%
  filter(!is.na(anova_p_adj)) %>%
  arrange(desc(combined_score)) %>%
  slice_head(n = 50)

print("Top 50 most significant motifs selected")
## [1] "Top 50 most significant motifs selected"
print(paste("Mean fold change (HCC vs Healthy):", round(mean(top_50_combined[["fc_HCC_vs_Healthy"]]), digits = 3)))
## [1] "Mean fold change (HCC vs Healthy): -0.047"
print(paste("Mean fold change (CLD vs Healthy):", round(mean(top_50_combined[["fc_CLD_vs_Healthy"]]), digits = 3)))
## [1] "Mean fold change (CLD vs Healthy): -0.094"
print(paste("Mean fold change (HCC vs CLD):", round(mean(top_50_combined[["fc_HCC_vs_CLD"]]), digits = 3)))
## [1] "Mean fold change (HCC vs CLD): 0.047"

4.2 Heatmap Plots

# ========== PREPARE DATA FOR HEATMAPS ==========

# Sample-by-motif table: Sample_ID is moved into the row names and then
# removed as a column, leaving only motif frequencies
heatmap_data <- my_data[, c("Sample_ID", motif_cols), drop = FALSE]
rownames(heatmap_data) <- heatmap_data[["Sample_ID"]]
heatmap_data[["Sample_ID"]] <- NULL

# Order in which the clinical groups are displayed
group_order <- c("Healthy", "CLD", "HCC")

# Row annotation: one Group factor value per sample, keyed by sample ID,
# with factor levels following group_order
annotation_row <- data.frame(
  Group = factor(my_data[["Group"]], levels = group_order),
  row.names = my_data[["Sample_ID"]]
)

# ========== CLUSTERING FUNCTION ==========

# Order samples group by group, clustering the samples inside each group.
#
# Args:
#   annotation_data: data frame whose row names are sample IDs and whose
#     `Group` column holds each sample's group label.
#   zscore_matrix: numeric matrix with samples in rows (row names = sample
#     IDs); only samples present in both inputs are used.
#   group_order: character vector of group labels, in the order the groups
#     should appear in the result.
#
# Returns:
#   Character vector of sample IDs, groups concatenated in `group_order`.
#   Groups with more than two samples are ordered by hierarchical clustering
#   (ward.D2) on the distance 1 - correlation between sample profiles;
#   smaller groups keep their input order. NULL when no sample matches.
#
# If the correlation distance of a group contains non-finite values (e.g. a
# sample with a constant profile or missing values), hclust() cannot run;
# a warning is issued and that group keeps its input order instead of
# aborting the whole ordering.
order_samples_by_group_enhanced <- function(annotation_data, zscore_matrix, group_order) {
  order_one_group <- function(grp) {
    # which() drops NA group labels instead of producing NA sample names
    in_group <- which(annotation_data$Group == grp)
    group_sample_names <- intersect(
      rownames(annotation_data)[in_group],
      rownames(zscore_matrix)
    )

    if (length(group_sample_names) == 0) {
      return(NULL)
    }
    if (length(group_sample_names) <= 2) {
      # Too few samples to cluster meaningfully
      return(group_sample_names)
    }

    group_matrix <- zscore_matrix[group_sample_names, , drop = FALSE]
    # Use correlation distance for better separation
    dist_matrix <- as.dist(1 - cor(t(group_matrix)))

    if (!all(is.finite(dist_matrix))) {
      warning(
        "Correlation distance undefined for some samples in group '", grp,
        "'; keeping input order for this group.",
        call. = FALSE
      )
      return(group_sample_names)
    }

    hc <- hclust(dist_matrix, method = "ward.D2")
    group_sample_names[hc$order]
  }

  unlist(lapply(group_order, order_one_group), use.names = FALSE)
}

# Names of the top-50 motifs, restricted to columns present in heatmap_data
top_50_motif_names <- as.character(top_50_combined[["Motif"]])
available_motifs <- intersect(top_50_motif_names, colnames(heatmap_data))

print(paste("Found", length(available_motifs), "of", length(top_50_motif_names), "top motifs in data"))
## [1] "Found 50 of 50 top motifs in data"
# Keep only those motif columns
heatmap_data_top50 <- heatmap_data[available_motifs]

# Column-wise z-scores: each motif is centred and scaled across samples
heatmap_data_zscore_50 <- scale(heatmap_data_top50, center = TRUE, scale = TRUE)

# Sample order: groups in group_order, clustered within each group
ordered_samples_50 <- order_samples_by_group_enhanced(
  annotation_data = annotation_row,
  zscore_matrix = heatmap_data_zscore_50,
  group_order = group_order
)

# Cluster the motifs (columns) on Euclidean distance with complete linkage
motif_matrix <- heatmap_data_zscore_50[ordered_samples_50, ]
motif_dist <- motif_matrix %>%
  t() %>%
  dist(method = "euclidean")
motif_hclust <- hclust(motif_dist, method = "complete")
motif_dendro <- as.dendrogram(motif_hclust)

# Motif names in dendrogram (leaf) order
motif_order_clustered <- labels(motif_dendro)

# Perform hierarchical clustering within each group for samples.
#
# Args:
#   data_matrix: numeric matrix with samples in rows (row names = sample IDs).
#   group_info: data frame whose row names are sample IDs and whose `Group`
#     column holds each sample's group label.
#   group_levels: group labels to include, in output order. Defaults to the
#     three clinical groups, so existing two-argument calls behave as before.
#
# Returns:
#   Character vector of sample IDs, groups concatenated in `group_levels`
#   order. Groups with more than two samples are ordered by complete-linkage
#   hierarchical clustering on Euclidean distance; smaller groups keep their
#   input order. Samples whose group is not in `group_levels` are omitted.
cluster_samples_within_group <- function(data_matrix, group_info,
                                         group_levels = c("Healthy", "CLD", "HCC")) {
  order_within <- function(grp) {
    # which() drops NA group labels instead of producing NA sample names
    group_samples <- rownames(group_info)[which(group_info$Group == grp)]
    group_samples <- intersect(group_samples, rownames(data_matrix))

    if (length(group_samples) <= 2) {
      # Too few samples to cluster; keep them as they are
      return(group_samples)
    }

    # drop = FALSE keeps a matrix even when there is a single column
    group_data <- data_matrix[group_samples, , drop = FALSE]
    sample_dist <- dist(group_data, method = "euclidean")
    sample_hclust <- hclust(sample_dist, method = "complete")
    group_samples[sample_hclust$order]
  }

  unlist(lapply(group_levels, order_within), use.names = FALSE)
}

# Sample order for the x-axis: clustered within Healthy, CLD and HCC
samples_clustered <- cluster_samples_within_group(
  data_matrix = heatmap_data_zscore_50,
  group_info = annotation_row
)

# Long-format z-scores (one row per sample x motif) with the group label
# joined on; the final mutate() fixes the plotting order of samples (cluster
# order), motifs (dendrogram order) and groups via factor levels
heatmap_long_clustered <- heatmap_data_zscore_50 %>%
  as.data.frame() %>%
  mutate(Sample_ID = rownames(.)) %>%
  pivot_longer(cols = -Sample_ID, names_to = "Motif", values_to = "Z_score") %>%
  left_join(my_data[, c("Sample_ID", "Group")], by = "Sample_ID") %>%
  mutate(
    Sample_ID = factor(Sample_ID, levels = samples_clustered),
    Motif = factor(Motif, levels = motif_order_clustered),
    Group = factor(Group, levels = c("Healthy", "CLD", "HCC"))
  )

# Tile heatmap of motif z-scores: samples on x (one facet per group, facet
# width proportional to group size), motifs on y
p_heatmap_clustered <- ggplot(
  data = heatmap_long_clustered,
  mapping = aes(x = Sample_ID, y = Motif, fill = Z_score)
) +
  geom_tile(color = NA) +  # no tile borders
  scale_fill_gradient2(
    name = "Z-score",
    low = "darkblue",
    mid = "white",
    high = "darkred",
    midpoint = 0,
    limits = c(-3, 3),
    breaks = c(-3, -2, -1, 0, 1, 2, 3),
    oob = scales::squish  # values beyond the limits take the end colours
  ) +
  facet_grid(. ~ Group, scales = "free_x", space = "free_x") +
  labs(
    title = "",
    subtitle = "",
    x = "Samples(grouped and clustered)",
    y = "5'End Motifs"
  ) +
  theme_minimal() +
  theme(
    # Titles
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 11, hjust = 0.5, face = "italic"),
    # Axes: sample labels are hidden, motif labels kept
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 12, hjust = 1),
    axis.ticks = element_blank(),
    axis.title.x = element_text(size = 18, face = "bold", margin = margin(t = 10)),
    axis.title.y = element_text(size = 18, face = "bold", margin = margin(r = 10)),
    # Facet strips
    strip.text = element_text(size = 14, face = "bold", color = "white"),
    strip.background = element_rect(fill = "gray30", color = "gray30"),
    # Legend
    legend.position = "right",
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 14),
    legend.key.height = unit(1.2, "cm"),
    legend.key.width = unit(0.6, "cm"),
    # Panels and backgrounds
    panel.spacing.x = unit(0.3, "lines"),
    panel.grid = element_blank(),
    panel.background = element_rect(fill = "white"),
    plot.background = element_rect(fill = "white")
  )

print(p_heatmap_clustered)

ggsave(
  filename = "ggplot_heatmap_top50_clustered.png",
  plot = p_heatmap_clustered,
  width = 16, height = 12, dpi = 500
)
library(ggplot2)
library(dplyr)
library(tidytext)

# BH-adjusted CLD-vs-HCC q-value for every motif
sig_df <- motif_statistics %>%
  select(Motif, q = ttest_CLD_HCC_p_adj) %>%
  mutate(Motif = as.character(Motif))

# From the top-50 set, take up to 10 motifs per direction with the largest
# absolute HCC-vs-CLD fold change, attach significance stars, and order the
# motifs by fold change for the x-axis
top10 <- top_50_combined %>%
  mutate(Motif = as.character(Motif)) %>%
  filter(!is.na(fc_HCC_vs_CLD)) %>%
  mutate(Direction = if_else(fc_HCC_vs_CLD > 0, "Up in HCC", "Up in CLD")) %>%
  group_by(Direction) %>%
  slice_max(order_by = abs(fc_HCC_vs_CLD), n = 10, with_ties = FALSE) %>%
  ungroup() %>%
  left_join(sig_df, by = "Motif") %>%
  mutate(
    # Star coding of the q-value; missing q is shown as "ns"
    star = case_when(
      is.na(q) ~ "ns",
      q < 1e-4 ~ "****",
      q < 1e-3 ~ "***",
      q < 1e-2 ~ "**",
      q < 0.05 ~ "*",
      TRUE ~ "ns"
    ),
    star_col = if_else(star == "ns", "grey40", "black")
  ) %>%
  arrange(fc_HCC_vs_CLD) %>%
  mutate(Motif = factor(Motif, levels = Motif))

# Vertical offset for the star labels: 6% of the largest absolute fold change
pad <- 0.06 * max(abs(top10[["fc_HCC_vs_CLD"]]), na.rm = TRUE)

# Bar plot of the HCC-vs-CLD log2 fold change per motif, coloured by
# direction, with significance stars placed just beyond the end of each bar
p <- ggplot(
  data = top10,
  mapping = aes(x = Motif, y = fc_HCC_vs_CLD, fill = Direction)
) +
  geom_col(width = 0.7, color = "grey20") +
  geom_hline(yintercept = 0, linewidth = 1.1, color = "black") +
  geom_text(
    mapping = aes(
      label = star,
      # above positive bars, below the others
      y = fc_HCC_vs_CLD + ifelse(fc_HCC_vs_CLD > 0, pad, -pad),
      color = star_col
    ),
    size = 6,
    fontface = "bold",
    show.legend = FALSE
  ) +
  scale_fill_manual(values = c("Up in HCC" = "#E74C3C", "Up in CLD" = "#F39C12")) +
  scale_color_identity() +  # star_col already holds literal colour names
  labs(
    title = "",
    subtitle = "",
    x = "5' End motif",
    y = "Log2 fold change"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "top",
    legend.title = element_blank(),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12),
    panel.grid.major.y = element_line(color = "grey85"),
    panel.grid.minor = element_blank(),
    plot.margin = margin(10, 20, 10, 10)
  )

print(p)

ggsave(
  filename = "top10_FC_barplot_CLD_HCC.png",
  plot = p,
  width = 11, height = 8, dpi = 500
)
# Select top 10 motifs for trajectory visualization
top_10_trajectory <- top_50_combined %>%
  arrange(desc(combined_score)) %>%
  slice_head(n = 10)

# Long format: one row per (motif, group) holding that group's mean value.
# names_prefix strips the leading "mean_" so Group holds the bare group name,
# which is then ordered Healthy -> CLD -> HCC for the x-axis.
trajectory_data <- top_10_trajectory %>%
  select(Motif, mean_Healthy, mean_CLD, mean_HCC) %>%
  pivot_longer(
    cols = starts_with("mean_"),
    names_to = "Group",
    names_prefix = "mean_",
    values_to = "Mean_Expression"
  ) %>%
  mutate(Group = factor(Group, levels = c("Healthy", "CLD", "HCC")))

# One line per motif across the three groups; y is log2 of the group mean
# with a 0.0001 pseudocount
p_trajectory <- ggplot(trajectory_data,
                       aes(x = Group, y = log2(Mean_Expression + 0.0001),
                           group = Motif, color = Motif)) +
  # linewidth (not size) sets line thickness; `size` for lines is deprecated
  # since ggplot2 3.4.0 and the rest of this file already uses linewidth
  geom_line(linewidth = 1.2, alpha = 0.8) +
  geom_point(size = 3) +
  scale_color_viridis_d(option = "turbo") +
  labs(
    title = "Expression Trajectories: Top 10 Motifs",
    subtitle = "Disease progression from Healthy to HCC",
    x = "Disease State",
    y = "Log2(Mean Expression)",
    color = "Motif"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    plot.subtitle = element_text(size = 11, hjust = 0.5),
    legend.position = "right",
    legend.text = element_text(size = 8)
  )

print(p_trajectory)

ggsave("expression_trajectories_top10.png", p_trajectory, width = 12, height = 8, dpi = 500)
library(ggplot2)
library(dplyr)
library(tidyr)
library(gridExtra)
library(ggpubr)
library(ggsignif)

# ---- Select top 6 motifs ----
# The six motifs with the highest combined score
top_6_motifs <- top_50_combined %>%
  arrange(desc(combined_score)) %>%
  slice_head(n = 6)

print("Top 6 motifs selected for violin plots:")
## [1] "Top 6 motifs selected for violin plots:"
top_6_motifs %>%
  select(Motif, highest_group, fc_HCC_vs_CLD, anova_p_adj) %>%
  print()
## # A tibble: 6 × 4
##   Motif highest_group fc_HCC_vs_CLD anova_p_adj
##   <chr> <chr>                 <dbl>       <dbl>
## 1 GCTT  Healthy             -0.0855   1.70e-254
## 2 CTGG  HCC                  0.233    7.90e- 82
## 3 CTCG  HCC                  0.206    5.06e- 73
## 4 CCGG  HCC                  0.236    4.39e- 52
## 5 GCTG  Healthy             -0.0158   1.29e-195
## 6 GCTC  Healthy             -0.0291   2.98e-196
# ---- Prepare data ----
# Per-sample frequencies of the six motifs, each row tagged with the motif's
# highest-mean group; Percentage is the frequency multiplied by 100
violin_data <- data_melted %>%
  filter(Motif %in% top_6_motifs[["Motif"]]) %>%
  left_join(select(top_6_motifs, Motif, highest_group), by = "Motif") %>%
  mutate(
    Group = factor(Group, levels = c("Healthy", "CLD", "HCC")),
    Percentage = 100 * Frequency  # still numeric, but we will not print "%"
  )

# Fill colour per group and the three pairwise comparisons to annotate
group_colors <- c("Healthy" = "#27AE60", "CLD" = "#F39C12", "HCC" = "#E74C3C")
comparisons <- list(
  c("Healthy", "CLD"),
  c("CLD", "HCC"),
  c("Healthy", "HCC")
)
# ---------- Plot function ----------
# Build the violin plot for one motif.
#
# Args:
#   motif_name: motif to plot (matched against the `Motif` column).
#   data_subset: long-format data with columns Motif, Group, Percentage and
#     highest_group.
#
# Returns:
#   A ggplot object: violin + narrow boxplot + jittered points per group,
#   with pairwise t-test significance labels for the comparisons listed in
#   the global `comparisons`, coloured by the global `group_colors`.
create_violin_plot <- function(motif_name, data_subset) {
  motif_rows <- data_subset %>% filter(Motif == motif_name)

  # Group reported in the subtitle (same value on every row of this motif)
  top_group <- motif_rows$highest_group[1]

  # Leave 35% headroom above the largest value for the significance brackets
  upper_bound <- max(motif_rows$Percentage, na.rm = TRUE) * 1.35
  y_range <- c(0, upper_bound)

  base_plot <- ggplot(motif_rows, aes(x = Group, y = Percentage, fill = Group)) +
    geom_violin(trim = FALSE, width = 0.75, alpha = 0.9, color = "white") +
    geom_boxplot(
      width = 0.16, fill = "white", alpha = 0.9,
      outlier.shape = NA, linewidth = 0.4
    ) +
    geom_jitter(width = 0.08, alpha = 0.35, size = 0.9) +
    scale_fill_manual(values = group_colors) +
    scale_y_continuous(limits = y_range, expand = c(0.02, 0))

  # Pairwise significance stars; non-significant pairs are labelled "ns"
  annotated_plot <- base_plot +
    stat_compare_means(
      comparisons = comparisons,
      method = "t.test",
      label = "p.signif",
      hide.ns = FALSE,
      bracket.size = 0.6,
      step.increase = 0.15,
      size = 7,
      color = "black"
    )

  annotated_plot +
    labs(
      title = paste0("Motif: ", motif_name),
      subtitle = paste0("Highest in: ", top_group),
      x = "Group",
      y = "Frequency (%)"
    ) +
    coord_cartesian(clip = "off") +  # keep labels that extend past the panel
    theme_minimal(base_size = 12) +
    theme(
      legend.position = "none",
      plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
      plot.subtitle = element_text(size = 11, hjust = 0.5),
      axis.title.x = element_text(size = 13, face = "bold", margin = margin(t = 6)),
      axis.title.y = element_text(size = 13, face = "bold", margin = margin(r = 6)),
      axis.text.x = element_text(size = 12, face = "bold"),
      axis.text.y = element_text(size = 12),
      panel.grid.major.x = element_blank(),
      panel.grid.minor = element_blank(),
      plot.margin = margin(10, 20, 10, 10)
    )
}

# ---------- Build plots ----------
# One violin plot per selected motif, in ranking order
plot_list <- lapply(top_6_motifs$Motif, create_violin_plot, data_subset = violin_data)

# Arrange 2x3 grid.
# grid.arrange() draws the figure on the current device and returns the
# assembled gtable, so no print() call follows: printing the gtable only
# emits a text listing of its layout, not the figure.
combined_violin_plots <- grid.arrange(
  grobs = plot_list,
  ncol = 3, nrow = 2,
  top = grid::textGrob("",
                       gp = grid::gpar(fontsize = 16, fontface = "bold"))
)

# Save final figure
ggsave("top6_violin_plots.png", combined_violin_plots,
       width = 15, height = 10, dpi = 500)
# One-row overview of the top-50 motif set: size, mean log2 fold change for
# each contrast (3 decimals), how many motifs exceed +/-0.5 log2 fold change
# between HCC and CLD, and how many have a CLD-vs-HCC q-value below 0.05
summary_top50 <- top_50_combined %>%
  summarise(
    Total_Motifs = n(),
    Mean_FC_HCC_vs_CLD = round(mean(fc_HCC_vs_CLD), digits = 3),
    Mean_FC_HCC_vs_Healthy = round(mean(fc_HCC_vs_Healthy), digits = 3),
    Mean_FC_CLD_vs_Healthy = round(mean(fc_CLD_vs_Healthy), digits = 3),
    Motifs_Higher_in_HCC = sum(fc_HCC_vs_CLD > 0.5),
    Motifs_Higher_in_CLD = sum(fc_HCC_vs_CLD < -0.5),
    Significant_CLD_HCC = sum(ttest_CLD_HCC_p_adj < 0.05, na.rm = TRUE)
  )

# Export the full top-50 table and its summary (no row names in either file)
write.csv(x = top_50_combined, file = "top50_significant_motifs.csv", row.names = FALSE)
write.csv(x = summary_top50, file = "top50_summary_statistics.csv", row.names = FALSE)



# Console report: banner, the one-row summary, then the five motifs with the
# most positive and the five with the most negative HCC-vs-CLD fold change
cat("\n================================================================================\n")
## 
## ================================================================================
cat("TOP 50 MOTIFS SUMMARY\n")
## TOP 50 MOTIFS SUMMARY
cat("================================================================================\n")
## ================================================================================
print(summary_top50)
## # A tibble: 1 × 7
##   Total_Motifs Mean_FC_HCC_vs_CLD Mean_FC_HCC_vs_Healthy Mean_FC_CLD_vs_Healthy
##          <int>              <dbl>                  <dbl>                  <dbl>
## 1           50              0.047                 -0.047                 -0.094
## # ℹ 3 more variables: Motifs_Higher_in_HCC <int>, Motifs_Higher_in_CLD <int>,
## #   Significant_CLD_HCC <int>
cat("\nTop 5 motifs with highest HCC vs CLD fold change:\n")
## 
## Top 5 motifs with highest HCC vs CLD fold change:
top_50_combined %>%
  arrange(desc(fc_HCC_vs_CLD)) %>%
  slice_head(n = 5) %>%
  select(Motif, fc_HCC_vs_CLD, ttest_CLD_HCC_p_adj) %>%
  print()
## # A tibble: 5 × 3
##   Motif fc_HCC_vs_CLD ttest_CLD_HCC_p_adj
##   <chr>         <dbl>               <dbl>
## 1 CCGG          0.236            1.27e- 5
## 2 CTGG          0.233            9.52e- 8
## 3 CTCG          0.206            3.49e- 6
## 4 TTGG          0.205            7.45e- 7
## 5 ATGG          0.192            2.99e-11
cat("\nTop 5 motifs with highest CLD vs HCC fold change:\n")
## 
## Top 5 motifs with highest CLD vs HCC fold change:
top_50_combined %>%
  arrange(fc_HCC_vs_CLD) %>%
  slice_head(n = 5) %>%
  select(Motif, fc_HCC_vs_CLD, ttest_CLD_HCC_p_adj) %>%
  print()
## # A tibble: 5 × 3
##   Motif fc_HCC_vs_CLD ttest_CLD_HCC_p_adj
##   <chr>         <dbl>               <dbl>
## 1 ACTT        -0.130          0.000000265
## 2 TCTA        -0.121          0.000000265
## 3 TATG        -0.101          0.0000127  
## 4 GCAT        -0.0928         0.00000193 
## 5 TCTT        -0.0886         0.00000168
cat("================================================================================\n")
## ================================================================================