library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- read_tsv("ENCFF166SFX.tsv", show_col_types = FALSE)
glimpse(df)
## Rows: 59,526
## Columns: 17
## $ gene_id <chr> "10904", "12954", "12956", "129…
## $ `transcript_id(s)` <chr> "10904", "12954", "12956", "129…
## $ length <dbl> 93, 94, 72, 82, 73, 72, 74, 82,…
## $ effective_length <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ expected_count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ TPM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ FPKM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ posterior_mean_count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ posterior_standard_deviation_of_count <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ pme_TPM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ pme_FPKM <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ TPM_ci_lower_bound <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ TPM_ci_upper_bound <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ TPM_coefficient_of_quartile_variation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ FPKM_ci_lower_bound <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ FPKM_ci_upper_bound <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ FPKM_coefficient_of_quartile_variation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
df2 <- df %>%
mutate(
log2tpm = log2(TPM + 1),
gene = gene_id
) %>%
filter(TPM > 0)
sum(df2$TPM > 0)
## [1] 28089
summary(df2$TPM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01 0.33 1.88 35.60 9.65 267430.21
p1 <- ggplot(df2, aes(x = log2tpm)) +
geom_density(fill = "lightblue", alpha = 0.6) +
geom_rug(alpha = 0.05) +
theme_minimal() +
labs(
title = "Global Expression Distribution (DLPFC, 90+ Female)",
x = "log2(TPM + 1)",
y = "Density"
)
p1

ggsave("plot1_expression_distribution.png", p1, width = 6, height = 4)
top_genes <- df2 %>%
arrange(desc(TPM)) %>%
slice_head(n = 25) %>%
mutate(gene = fct_reorder(gene, TPM))
p2 <- ggplot(top_genes, aes(x = gene, y = TPM)) +
geom_col(fill = "lightpink") +
coord_flip() +
theme_minimal() +
labs(
title = "Top 25 Expressed Genes in DLPFC (TPM)",
x = "Gene ID",
y = "TPM"
)
p2

ggsave("plot2_top_genes.png", p2, width = 6, height = 6)
df_len <- df2 %>%
filter(!is.na(length)) %>%
mutate(
len_q = ntile(length, 4),
len_q = factor(len_q, labels = c("Q1: shortest", "Q2", "Q3", "Q4: longest"))
)
p3 <- ggplot(df_len, aes(x = len_q, y = log2tpm)) +
geom_violin(fill = "lightblue", alpha = 0.6) +
stat_summary(fun = median, geom = "point", color = "violet", size = 1.5) +
theme_minimal() +
labs(
title = "Expression Distribution by Gene Length Quartile",
x = "Gene Length Quartile",
y = "log2(TPM + 1)"
)
p3

ggsave("plot3_expression_vs_length.png", p3, width = 6, height = 4)