This report analyzed the m.2639 mitochondrial mutation across different haplogroups using data from mutect2.haplogroup1.tab and mutect2.03.merge.vcf.
First import data, then merge haplogroup information with genetic variant data. Extract m.2639 mutation status for all samples.
# Load clean haplogroup data
haplo_clean <- read_tsv("data/mutect2.haplogroup1.tab")
# Load VCF data
vcf <- read.vcfR("data/mutect2.03.merge.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 47
## header_line: 48
## variant count: 4394
## column count: 3807
##
Meta line 47 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 4394
## Character matrix gt cols: 3807
## skip: 0
## nrows: 4394
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant: 4394
## All variants processed
# Clean sample IDs
haplo_clean <- haplo_clean %>%
mutate(IID_clean = str_split_i(Run, "_", 1))
# Extract m.2639 genotypes
m2639_pos <- "2639"
m2639_index <- which(vcf@fix[, "POS"] == m2639_pos)
gt <- extract.gt(vcf, element = "GT")
m2639_gt <- gt[m2639_index, ]
mutation_data <- data.frame(
IID_vcf = names(m2639_gt),
m2639_mutation = ifelse(grepl("1", m2639_gt), 1, 0)
) %>%
mutate(IID_clean = str_split_i(IID_vcf, "_", 1))
# Merge data
final_data <- haplo_clean %>%
left_join(mutation_data, by = "IID_clean") %>%
filter(!is.na(m2639_mutation))
Summary the merge data:
total_samples <- nrow(final_data)
mutated_samples <- sum(final_data$m2639_mutation)
mutation_rate <- mutated_samples / total_samples * 100
cat("**Total samples analyzed:**", total_samples, "\n\n")
## **Total samples analyzed:** 3797
cat("**Samples with m.2639 mutation:**", mutated_samples, "\n\n")
## **Samples with m.2639 mutation:** 7
cat("**Overall mutation frequency:**", round(mutation_rate, 2), "%\n")
## **Overall mutation frequency:** 0.18 %
Calculated mutation frequencies across haplogroups
haplo_summary <- final_data %>%
group_by(haplogroup) %>%
summarise(
n_samples = n(),
n_mutated = sum(m2639_mutation),
mutation_freq = n_mutated / n_samples * 100
) %>%
arrange(desc(mutation_freq))
# Display top haplogroups with mutations
haplo_summary_mutated <- haplo_summary %>%
filter(n_mutated > 0)
haplo_summary %>%
knitr::kable(
col.names = c("Haplogroup", "Total Samples", "Mutated Samples", "Mutation Frequency (%)"),
digits = 2
)
| Haplogroup | Total Samples | Mutated Samples | Mutation Frequency (%) |
|---|---|---|---|
| N | 9 | 7 | 77.78 |
| A | 9 | 0 | 0.00 |
| B | 3 | 0 | 0.00 |
| C | 3 | 0 | 0.00 |
| D | 1 | 0 | 0.00 |
| H | 1655 | 0 | 0.00 |
| HV | 77 | 0 | 0.00 |
| I | 128 | 0 | 0.00 |
| J | 424 | 0 | 0.00 |
| K | 319 | 0 | 0.00 |
| L0 | 1 | 0 | 0.00 |
| L1 | 3 | 0 | 0.00 |
| L2 | 8 | 0 | 0.00 |
| L3 | 6 | 0 | 0.00 |
| M | 10 | 0 | 0.00 |
| R | 11 | 0 | 0.00 |
| T | 380 | 0 | 0.00 |
| U | 479 | 0 | 0.00 |
| V | 140 | 0 | 0.00 |
| W | 86 | 0 | 0.00 |
| X | 45 | 0 | 0.00 |
The m.2639 mutation shows an high frequency (77.78%) within the N haplogroup.
This represents a significant enrichment pattern compared to other haplogroups.
However, with the small number in the N haplogroup, the result may need further validation.
Load in data for Dolorisk
# Cohort 1: euNeuP
haplo_eu <- read_tsv("data/euNeuP/mutect2.haplogroup1.tab")
vcf_eu <- read.vcfR("data/euNeuP/mutect2.03.merge.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 63
## header_line: 64
## variant count: 504
## column count: 692
##
Meta line 63 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 504
## Character matrix gt cols: 692
## skip: 0
## nrows: 504
## row_num: 0
##
Processed variant: 504
## All variants processed
# Cohort 2: eunoNeuP
haplo_euno <- read_tsv("data/eunoNeuP/mutect2.haplogroup1.tab")
vcf_euno <- read.vcfR("data/eunoNeuP/mutect2.03.merge.vcf")
## Scanning file to determine attributes.
## File attributes:
## meta lines: 63
## header_line: 64
## variant count: 353
## column count: 361
##
Meta line 63 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 353
## Character matrix gt cols: 361
## skip: 0
## nrows: 353
## row_num: 0
##
Processed variant: 353
## All variants processed
m2639_index_eu <- which(vcf_eu@fix[, "POS"] == m2639_pos)
gt <- extract.gt(vcf_eu, element = "GT")
m2639_gt <- gt[m2639_index_eu, ]
mutation_data_eu <- data.frame(
IID_vcf = names(m2639_gt),
m2639_mutation = ifelse(grepl("1", m2639_gt), 1, 0)
)
# Merge data
eu_data <- haplo_eu %>%
left_join(mutation_data_eu, by = c("Run" = "IID_vcf") )%>%
filter(!is.na(m2639_mutation))
cat("NeuP sample:", nrow(eu_data), "mutation:", sum(eu_data$m2639_mutation, na.rm = TRUE), "\n")
## NeuP sample: 682 mutation: 13
m2639_index_euno <- which(vcf_euno@fix[, "POS"] == m2639_pos)
gt <- extract.gt(vcf_euno, element = "GT")
m2639_gt <- gt[m2639_index_euno, ]
mutation_data_euno <- data.frame(
IID_vcf = names(m2639_gt),
m2639_mutation = ifelse(grepl("1", m2639_gt), 1, 0)
)
# Merge data
euno_data <- haplo_euno %>%
left_join(mutation_data_euno, by = c("Run" = "IID_vcf") )%>%
filter(!is.na(m2639_mutation))
cat("No NeuP sample:", nrow(euno_data), "mutation:", sum(euno_data$m2639_mutation, na.rm = TRUE), "\n")
## No NeuP sample: 352 mutation: 4
final_data_eu <- bind_rows(eu_data, euno_data)
total_samples <- nrow(final_data_eu)
mutated_samples <- sum(final_data_eu$m2639_mutation)
mutation_rate <- mutated_samples / total_samples * 100
cat("**Total samples analyzed:**", total_samples, "\n\n")
## **Total samples analyzed:** 1034
cat("**Samples with m.2639 mutation:**", mutated_samples, "\n\n")
## **Samples with m.2639 mutation:** 17
cat("**Overall mutation frequency:**", round(mutation_rate, 2), "%\n")
## **Overall mutation frequency:** 1.64 %
haplo_summary_eu <- final_data_eu %>%
group_by(haplogroup) %>%
summarise(
n_samples = n(),
n_mutated = sum(m2639_mutation),
mutation_freq = n_mutated / n_samples * 100
) %>%
arrange(desc(mutation_freq))
haplo_summary_eu %>%
knitr::kable(
col.names = c("Haplogroup", "Total Samples", "Mutated Samples", "Mutation Frequency (%)"),
digits = 2
)
| Haplogroup | Total Samples | Mutated Samples | Mutation Frequency (%) |
|---|---|---|---|
| N | 22 | 17 | 77.27 |
| A | 3 | 0 | 0.00 |
| B | 1 | 0 | 0.00 |
| C | 1 | 0 | 0.00 |
| D | 2 | 0 | 0.00 |
| H | 427 | 0 | 0.00 |
| HV | 29 | 0 | 0.00 |
| I | 27 | 0 | 0.00 |
| J | 94 | 0 | 0.00 |
| JT | 1 | 0 | 0.00 |
| K | 121 | 0 | 0.00 |
| L0 | 1 | 0 | 0.00 |
| L2 | 7 | 0 | 0.00 |
| L3 | 2 | 0 | 0.00 |
| M | 7 | 0 | 0.00 |
| R | 6 | 0 | 0.00 |
| T | 87 | 0 | 0.00 |
| U | 140 | 0 | 0.00 |
| V | 18 | 0 | 0.00 |
| W | 11 | 0 | 0.00 |
| X | 25 | 0 | 0.00 |
| Z | 2 | 0 | 0.00 |
# 计算 haplogroup 分布比例
haplo_dist_main <- final_data %>%
count(haplogroup, name = "n_main") %>%
mutate(pct_main = n_main / sum(n_main) * 100)
haplo_dist_eu <- final_data_eu %>%
count(haplogroup, name = "n_eu") %>%
mutate(pct_eu = n_eu / sum(n_eu) * 100)
# 合并
haplo_compare <- full_join(haplo_dist_main, haplo_dist_eu, by = "haplogroup") %>%
mutate(across(c(n_main, n_eu, pct_main, pct_eu), ~replace_na(., 0))) %>%
arrange(desc(pct_main))
# 输出比较表
haplo_compare %>%
knitr::kable(
col.names = c("Haplogroup",
"UKB (n)", "UKB (%)",
"Dolorisk (n)", "Dolorisk (%)"),
digits = 2
)
| Haplogroup | UKB (n) | UKB (%) | Dolorisk (n) | Dolorisk (%) |
|---|---|---|---|---|
| H | 1655 | 43.59 | 427 | 41.30 |
| U | 479 | 12.62 | 140 | 13.54 |
| J | 424 | 11.17 | 94 | 9.09 |
| T | 380 | 10.01 | 87 | 8.41 |
| K | 319 | 8.40 | 121 | 11.70 |
| V | 140 | 3.69 | 18 | 1.74 |
| I | 128 | 3.37 | 27 | 2.61 |
| W | 86 | 2.26 | 11 | 1.06 |
| HV | 77 | 2.03 | 29 | 2.80 |
| X | 45 | 1.19 | 25 | 2.42 |
| R | 11 | 0.29 | 6 | 0.58 |
| M | 10 | 0.26 | 7 | 0.68 |
| A | 9 | 0.24 | 3 | 0.29 |
| N | 9 | 0.24 | 22 | 2.13 |
| L2 | 8 | 0.21 | 7 | 0.68 |
| L3 | 6 | 0.16 | 2 | 0.19 |
| B | 3 | 0.08 | 1 | 0.10 |
| C | 3 | 0.08 | 1 | 0.10 |
| L1 | 3 | 0.08 | 0 | 0.00 |
| D | 1 | 0.03 | 2 | 0.19 |
| L0 | 1 | 0.03 | 1 | 0.10 |
| JT | 0 | 0.00 | 1 | 0.10 |
| Z | 0 | 0.00 | 2 | 0.19 |
haplo_long <- haplo_compare %>%
select(haplogroup, pct_main, pct_eu) %>%
pivot_longer(cols = starts_with("pct"),
names_to = "dataset", values_to = "percentage") %>%
mutate(dataset = recode(dataset,
pct_main = "UKB",
pct_eu = "Dolorisk"))
ggplot(haplo_long, aes(x = reorder(haplogroup, -percentage),
y = percentage,
fill = dataset)) +
geom_col(position = "dodge") +
coord_flip() +
labs(x = "Haplogroup", y = "Sample Proportion (%)",
title = "Comparison of Haplogroup Distribution") +
theme_minimal()
haplo_table <- haplo_compare %>%
select(haplogroup, n_main, n_eu) %>%
column_to_rownames("haplogroup") %>%
as.matrix()
chisq.test(haplo_table)
##
## Pearson's Chi-squared test
##
## data: haplo_table
## X-squared = 117.66, df = 22, p-value = 4.627e-15
Note that there is some small number (<5) haplogroup, the chi-square test may not accurate.