# CREADOR: SAMANTHA CARRILLO GARNICA
# GRAFICA DE VOLCANO

library(pacman)

p_load(
  "readr",
  "ggplot2",
  "ggrepel",
  "dplyr",
  "matrixTests",
  "grid"
)

datos <- read_csv("https://raw.githubusercontent.com/ManuelLaraMVZ/Transcript-mica/refs/heads/main/datos_miRNAs.csv")
## Rows: 363 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Gen, Condicion
## dbl (6): Cx1, Cx2, Cx3, T1, T2, T3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(datos)
## # A tibble: 6 × 8
##   Gen                Condicion   Cx1   Cx2   Cx3    T1    T2    T3
##   <chr>              <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 U6 snRNA-001973    Control    13.8  11.7  11.9  13.2  13.0  12.6
## 2 ath-miR159a-000338 Target     35    35    35    35    35    35  
## 3 hsa-let-7a-000377  Target     20.5  21.0  21.0  20.4  19.6  20.9
## 4 hsa-let-7b-002619  Target     18.4  19.0  19.1  18.3  17.4  19.0
## 5 hsa-let-7c-000379  Target     22.2  23.7  23.8  22.9  22.0  24.0
## 6 hsa-let-7d-002283  Target     22.2  22.7  21.9  22.0  21.2  21.9
# Extracción de controles
Controles <- datos %>%
  filter(Condicion == "Control")

head(Controles)
## # A tibble: 3 × 8
##   Gen             Condicion   Cx1   Cx2   Cx3    T1    T2    T3
##   <chr>           <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 U6 snRNA-001973 Control    13.8  11.7  11.9  13.2  13.0  12.6
## 2 RNU44-001094    Control    17.3  16.9  16.8  17.7  17.3  16.8
## 3 RNU48-001006    Control    15.1  16.4  17.0  16.0  15.1  16.4
# Promedio de controles
promedio_controles <- Controles %>%
  summarise(
    Mean_C1 = mean(Cx1),
    Mean_C2 = mean(Cx2),
    Mean_C3 = mean(Cx3),
    Mean_T1 = mean(T1),
    Mean_T2 = mean(T2),
    Mean_T3 = mean(T3)
  ) %>%
  mutate(Gen = "Promedio_controles") %>%
  select(Gen, Mean_C1, Mean_C2, Mean_C3, Mean_T1, Mean_T2, Mean_T3)

promedio_controles
## # A tibble: 1 × 7
##   Gen                Mean_C1 Mean_C2 Mean_C3 Mean_T1 Mean_T2 Mean_T3
##   <chr>                <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 Promedio_controles    15.4    15.0    15.3    15.6    15.1    15.3
# Genes target
genes <- datos %>%
  filter(Condicion == "Target") %>%
  select(-2)

head(genes)
## # A tibble: 6 × 7
##   Gen                  Cx1   Cx2   Cx3    T1    T2    T3
##   <chr>              <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ath-miR159a-000338  35    35    35    35    35    35  
## 2 hsa-let-7a-000377   20.5  21.0  21.0  20.4  19.6  20.9
## 3 hsa-let-7b-002619   18.4  19.0  19.1  18.3  17.4  19.0
## 4 hsa-let-7c-000379   22.2  23.7  23.8  22.9  22.0  24.0
## 5 hsa-let-7d-002283   22.2  22.7  21.9  22.0  21.2  21.9
## 6 hsa-let-7e-002406   18.0  18.4  18.5  18.0  17.3  18.6
# Cálculo de 2^-DCT
DCT <- genes %>%
  mutate(
    DCT_C1 = 2^-(Cx1 - promedio_controles$Mean_C1),
    DCT_C2 = 2^-(Cx2 - promedio_controles$Mean_C2),
    DCT_C3 = 2^-(Cx3 - promedio_controles$Mean_C3),
    DCT_T1 = 2^-(T1 - promedio_controles$Mean_T1),
    DCT_T2 = 2^-(T2 - promedio_controles$Mean_T2),
    DCT_T3 = 2^-(T3 - promedio_controles$Mean_T3)
  ) %>%
  select(Gen, DCT_C1, DCT_C2, DCT_C3, DCT_T1, DCT_T2, DCT_T3)

head(DCT)
## # A tibble: 6 × 7
##   Gen                    DCT_C1      DCT_C2     DCT_C3    DCT_T1  DCT_T2  DCT_T3
##   <chr>                   <dbl>       <dbl>      <dbl>     <dbl>   <dbl>   <dbl>
## 1 ath-miR159a-000338 0.00000124 0.000000960 0.00000114   1.47e-6 1.03e-6 1.16e-6
## 2 hsa-let-7a-000377  0.0289     0.0157      0.0185       3.70e-2 4.49e-2 1.98e-2
## 3 hsa-let-7b-002619  0.127      0.0615      0.0677       1.62e-1 1.98e-1 7.75e-2
## 4 hsa-let-7c-000379  0.00900    0.00243     0.00272      6.40e-3 8.67e-3 2.39e-3
## 5 hsa-let-7d-002283  0.00893    0.00492     0.0102       1.23e-2 1.46e-2 9.87e-3
## 6 hsa-let-7e-002406  0.167      0.0960      0.108        1.91e-1 2.21e-1 1.03e-1
# Promedios
promedio_genes <- DCT %>%
  mutate(
    Mean_DCT_Cx = (DCT_C1 + DCT_C2 + DCT_C3) / 3,
    Mean_DCT_Tx = (DCT_T1 + DCT_T2 + DCT_T3) / 3
  )

head(promedio_genes)
## # A tibble: 6 × 9
##   Gen     DCT_C1  DCT_C2  DCT_C3  DCT_T1  DCT_T2  DCT_T3 Mean_DCT_Cx Mean_DCT_Tx
##   <chr>    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>       <dbl>       <dbl>
## 1 ath-m… 1.24e-6 9.60e-7 1.14e-6 1.47e-6 1.03e-6 1.16e-6  0.00000111  0.00000122
## 2 hsa-l… 2.89e-2 1.57e-2 1.85e-2 3.70e-2 4.49e-2 1.98e-2  0.0210      0.0339    
## 3 hsa-l… 1.27e-1 6.15e-2 6.77e-2 1.62e-1 1.98e-1 7.75e-2  0.0856      0.146     
## 4 hsa-l… 9.00e-3 2.43e-3 2.72e-3 6.40e-3 8.67e-3 2.39e-3  0.00472     0.00582   
## 5 hsa-l… 8.93e-3 4.92e-3 1.02e-2 1.23e-2 1.46e-2 9.87e-3  0.00802     0.0123    
## 6 hsa-l… 1.67e-1 9.60e-2 1.08e-1 1.91e-1 2.21e-1 1.03e-1  0.123       0.172
# T-test Welch
tvalue_gen <- row_t_welch(
  promedio_genes[, c("DCT_C1", "DCT_C2", "DCT_C3")],
  promedio_genes[, c("DCT_T1", "DCT_T2", "DCT_T3")]
)

head(tvalue_gen)
##   obs.x obs.y obs.tot       mean.x       mean.y     mean.diff        var.x
## 1     3     3       6 1.113319e-06 1.220085e-06 -1.067666e-07 2.043512e-14
## 2     3     3       6 2.104945e-02 3.390214e-02 -1.285269e-02 4.856207e-05
## 3     3     3       6 8.558097e-02 1.457383e-01 -6.015735e-02 1.327202e-03
## 4     3     3       6 4.716674e-03 5.820710e-03 -1.104036e-03 1.377166e-05
## 5     3     3       6 8.019463e-03 1.226943e-02 -4.249971e-03 7.633725e-06
## 6     3     3       6 1.234439e-01 1.716779e-01 -4.823397e-02 1.436199e-03
##          var.y       stderr       df  statistic    pvalue      conf.low
## 1 5.127956e-14 1.546121e-07 3.375569 -0.6905449 0.5343619 -5.692448e-07
## 2 1.636924e-04 8.411390e-03 3.090675 -1.5280102 0.2213573 -3.918263e-02
## 3 3.822375e-03 4.143097e-02 3.239447 -1.4519899 0.2359597 -1.866636e-01
## 4 1.011733e-05 2.821878e-03 3.908539 -0.3912417 0.7160073 -9.011661e-03
## 5 5.688217e-06 2.107284e-03 3.916473 -2.0168001 0.1154112 -1.015027e-02
## 6 3.763194e-03 4.163089e-02 3.332494 -1.1586103 0.3228507 -1.735489e-01
##      conf.high mean.null alternative conf.level
## 1 3.557117e-07         0   two.sided       0.95
## 2 1.347725e-02         0   two.sided       0.95
## 3 6.634885e-02         0   two.sided       0.95
## 4 6.803589e-03         0   two.sided       0.95
## 5 1.650329e-03         0   two.sided       0.95
## 6 7.708100e-02         0   two.sided       0.95
# Fold change y p-value
FCyPV <- promedio_genes %>%
  select(Gen, Mean_DCT_Cx, Mean_DCT_Tx) %>%
  mutate(
    p_value = tvalue_gen$pvalue,
    Fold_change = Mean_DCT_Tx / Mean_DCT_Cx
  )

head(FCyPV)
## # A tibble: 6 × 5
##   Gen                Mean_DCT_Cx Mean_DCT_Tx p_value Fold_change
##   <chr>                    <dbl>       <dbl>   <dbl>       <dbl>
## 1 ath-miR159a-000338  0.00000111  0.00000122   0.534        1.10
## 2 hsa-let-7a-000377   0.0210      0.0339       0.221        1.61
## 3 hsa-let-7b-002619   0.0856      0.146        0.236        1.70
## 4 hsa-let-7c-000379   0.00472     0.00582      0.716        1.23
## 5 hsa-let-7d-002283   0.00802     0.0123       0.115        1.53
## 6 hsa-let-7e-002406   0.123       0.172        0.323        1.39
# Logs
Logs <- FCyPV %>%
  mutate(
    LPV = -log10(p_value),
    LFC = log2(Fold_change)
  ) %>%
  select(Gen, p_value, Fold_change, LPV, LFC)

head(Logs)
## # A tibble: 6 × 5
##   Gen                p_value Fold_change   LPV   LFC
##   <chr>                <dbl>       <dbl> <dbl> <dbl>
## 1 ath-miR159a-000338   0.534        1.10 0.272 0.132
## 2 hsa-let-7a-000377    0.221        1.61 0.655 0.688
## 3 hsa-let-7b-002619    0.236        1.70 0.627 0.768
## 4 hsa-let-7c-000379    0.716        1.23 0.145 0.303
## 5 hsa-let-7d-002283    0.115        1.53 0.938 0.613
## 6 hsa-let-7e-002406    0.323        1.39 0.491 0.476
# Gráfica base
volcano <- ggplot(Logs, aes(x = LFC, y = LPV)) +
  geom_point(size = 2, color = "gray", alpha = 0.7) +
  theme_classic(base_size = 12) +
  labs(
    title = "Analisis comparativo de miRNAs",
    caption = "Creador: Samantha Carrillo",
    x = "Log2 (2-DDCT)",
    y = "-Log10(valor de p)"
  )

volcano

# Umbrales
limite_p <- 0.05
limite_FC <- 1.5

# Down-regulated
down_regulated <- Logs %>%
  filter(
    LFC < -log2(limite_FC),
    LPV > -log10(limite_p)
  )

head(down_regulated)
## # A tibble: 1 × 5
##   Gen                   p_value Fold_change   LPV    LFC
##   <chr>                   <dbl>       <dbl> <dbl>  <dbl>
## 1 hsa-miR-502-3p-002083  0.0327       0.634  1.49 -0.658
# Up-regulated
up_regulated <- Logs %>%
  filter(
    LFC > log2(limite_FC),
    LPV > -log10(limite_p)
  )

head(up_regulated)
## # A tibble: 2 × 5
##   Gen                 p_value Fold_change   LPV   LFC
##   <chr>                 <dbl>       <dbl> <dbl> <dbl>
## 1 hsa-miR-148a-000470  0.0355        5.52  1.45  2.46
## 2 hsa-miR-429-001024   0.0455        3.09  1.34  1.63
# Top genes
top_down_regulated <- down_regulated %>%
  arrange(desc(LPV)) %>%
  head(5)

top_down_regulated
## # A tibble: 1 × 5
##   Gen                   p_value Fold_change   LPV    LFC
##   <chr>                   <dbl>       <dbl> <dbl>  <dbl>
## 1 hsa-miR-502-3p-002083  0.0327       0.634  1.49 -0.658
top_up_regulated <- up_regulated %>%
  arrange(desc(LPV)) %>%
  head(5)

top_up_regulated
## # A tibble: 2 × 5
##   Gen                 p_value Fold_change   LPV   LFC
##   <chr>                 <dbl>       <dbl> <dbl> <dbl>
## 1 hsa-miR-148a-000470  0.0355        5.52  1.45  2.46
## 2 hsa-miR-429-001024   0.0455        3.09  1.34  1.63
# Mejora gráfica
volcano2 <- volcano +
  geom_hline(yintercept = -log10(limite_p), linetype = "dashed") +
  geom_vline(xintercept = c(-log2(limite_FC), log2(limite_FC)), linetype = "dashed")

volcano2

volcano3 <- volcano2 +
  geom_point(data = up_regulated, aes(x = LFC, y = LPV), color = "#E64B35B2", size = 3) +
  geom_point(data = down_regulated, aes(x = LFC, y = LPV), color = "#4DBBD5B2", size = 3)

volcano3

# Etiquetas
volcano4 <- volcano3 +
  geom_label_repel(
    data = top_up_regulated,
    aes(x = LFC, y = LPV, label = Gen),
    box.padding = unit(0.35, "lines"),
    point.padding = unit(0.5, "lines"),
    segment.color = "grey50",
    segment.size = 0.2,
    nudge_x = 0.5,
    nudge_y = 0.5
  ) +
  geom_label_repel(
    data = top_down_regulated,
    aes(x = LFC, y = LPV, label = Gen),
    box.padding = unit(0.35, "lines"),
    point.padding = unit(0.5, "lines"),
    segment.color = "grey50",
    segment.size = 0.2,
    nudge_x = -0.5,
    nudge_y = 0.5
  )

volcano4

# Guardar
ggsave("Volcano4.jpeg", plot = volcano4, height = 5, width = 6, dpi = 300)