library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- read_tsv("ENCFF166SFX.tsv", show_col_types = FALSE)

glimpse(df)
## Rows: 59,526
## Columns: 17
## $ gene_id                                <chr> "10904", "12954", "12956", "129…
## $ `transcript_id(s)`                     <chr> "10904", "12954", "12956", "129…
## $ length                                 <dbl> 93, 94, 72, 82, 73, 72, 74, 82,…
## $ effective_length                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ expected_count                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ TPM                                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ FPKM                                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ posterior_mean_count                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ posterior_standard_deviation_of_count  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ pme_TPM                                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ pme_FPKM                               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ TPM_ci_lower_bound                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ TPM_ci_upper_bound                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ TPM_coefficient_of_quartile_variation  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ FPKM_ci_lower_bound                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ FPKM_ci_upper_bound                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ FPKM_coefficient_of_quartile_variation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
df2 <- df %>%
  mutate(
    log2tpm = log2(TPM + 1),   
    gene = gene_id
  ) %>%
  filter(TPM > 0)
sum(df2$TPM > 0)
## [1] 28089
summary(df2$TPM)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##      0.01      0.33      1.88     35.60      9.65 267430.21
p1 <- ggplot(df2, aes(x = log2tpm)) +
  geom_density(fill = "lightblue", alpha = 0.6) +
  geom_rug(alpha = 0.05) +
  theme_minimal() +
  labs(
    title = "Global Expression Distribution (DLPFC, 90+ Female)",
    x = "log2(TPM + 1)",
    y = "Density"
  )

p1

ggsave("plot1_expression_distribution.png", p1, width = 6, height = 4)
top_genes <- df2 %>%
  arrange(desc(TPM)) %>%
  slice_head(n = 25) %>%
  mutate(gene = fct_reorder(gene, TPM))

p2 <- ggplot(top_genes, aes(x = gene, y = TPM)) +
  geom_col(fill = "lightpink") +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Top 25 Expressed Genes in DLPFC (TPM)",
    x = "Gene ID",
    y = "TPM"
  )

p2

ggsave("plot2_top_genes.png", p2, width = 6, height = 6)
df_len <- df2 %>%
  filter(!is.na(length)) %>%
  mutate(
    len_q = ntile(length, 4),
    len_q = factor(len_q, labels = c("Q1: shortest", "Q2", "Q3", "Q4: longest"))
  )

p3 <- ggplot(df_len, aes(x = len_q, y = log2tpm)) +
  geom_violin(fill = "lightblue", alpha = 0.6) +
  stat_summary(fun = median, geom = "point", color = "violet", size = 1.5) +
  theme_minimal() +
  labs(
    title = "Expression Distribution by Gene Length Quartile",
    x = "Gene Length Quartile",
    y = "log2(TPM + 1)"
  )

p3

ggsave("plot3_expression_vs_length.png", p3, width = 6, height = 4)