This report analyzed the m.2639 mitochondrial mutation across different haplogroups using data from mutect2.haplogroup1.tab and mutect2.03.merge.vcf.

Data processing

First import data, then merge haplogroup information with genetic variant data. Extract m.2639 mutation status for all samples.

# Load clean haplogroup data
haplo_clean <- read_tsv("data/mutect2.haplogroup1.tab")

# Load VCF data
vcf <- read.vcfR("data/mutect2.03.merge.vcf")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 47
##   header_line: 48
##   variant count: 4394
##   column count: 3807
## 
Meta line 47 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 4394
##   Character matrix gt cols: 3807
##   skip: 0
##   nrows: 4394
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant: 4394
## All variants processed
# Clean sample IDs
haplo_clean <- haplo_clean %>%
  mutate(IID_clean = str_split_i(Run, "_", 1))

# Extract m.2639 genotypes
m2639_pos <- "2639"
m2639_index <- which(vcf@fix[, "POS"] == m2639_pos)
gt <- extract.gt(vcf, element = "GT")
m2639_gt <- gt[m2639_index, ]

mutation_data <- data.frame(
  IID_vcf = names(m2639_gt),
  m2639_mutation = ifelse(grepl("1", m2639_gt), 1, 0)
) %>%
  mutate(IID_clean = str_split_i(IID_vcf, "_", 1))

# Merge data
final_data <- haplo_clean %>%
  left_join(mutation_data, by = "IID_clean") %>%
  filter(!is.na(m2639_mutation))

Summary the merge data:

total_samples <- nrow(final_data)
mutated_samples <- sum(final_data$m2639_mutation)
mutation_rate <- mutated_samples / total_samples * 100

cat("**Total samples analyzed:**", total_samples, "\n\n")
## **Total samples analyzed:** 3797
cat("**Samples with m.2639 mutation:**", mutated_samples, "\n\n") 
## **Samples with m.2639 mutation:** 7
cat("**Overall mutation frequency:**", round(mutation_rate, 2), "%\n")
## **Overall mutation frequency:** 0.18 %

Calculated mutation frequencies across haplogroups

haplo_summary <- final_data %>%
  group_by(haplogroup) %>%
  summarise(
    n_samples = n(),
    n_mutated = sum(m2639_mutation),
    mutation_freq = n_mutated / n_samples * 100
  ) %>%
  arrange(desc(mutation_freq))

# Display top haplogroups with mutations
haplo_summary_mutated <- haplo_summary %>%
  filter(n_mutated > 0)



haplo_summary %>%
  knitr::kable(
    col.names = c("Haplogroup", "Total Samples", "Mutated Samples", "Mutation Frequency (%)"),
    digits = 2
  )
Haplogroup Total Samples Mutated Samples Mutation Frequency (%)
N 9 7 77.78
A 9 0 0.00
B 3 0 0.00
C 3 0 0.00
D 1 0 0.00
H 1655 0 0.00
HV 77 0 0.00
I 128 0 0.00
J 424 0 0.00
K 319 0 0.00
L0 1 0 0.00
L1 3 0 0.00
L2 8 0 0.00
L3 6 0 0.00
M 10 0 0.00
R 11 0 0.00
T 380 0 0.00
U 479 0 0.00
V 140 0 0.00
W 86 0 0.00
X 45 0 0.00

Specific Enrichment in N Haplogroup

The m.2639 mutation shows an high frequency (77.78%) within the N haplogroup.

This represents a significant enrichment pattern compared to other haplogroups.

However, with the small number in the N haplogroup, the result may need further validation.

Dolorisk

Load in data for Dolorisk

# Cohort 1: euNeuP
haplo_eu <- read_tsv("data/euNeuP/mutect2.haplogroup1.tab")
vcf_eu <- read.vcfR("data/euNeuP/mutect2.03.merge.vcf")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 63
##   header_line: 64
##   variant count: 504
##   column count: 692
## 
Meta line 63 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 504
##   Character matrix gt cols: 692
##   skip: 0
##   nrows: 504
##   row_num: 0
## 
Processed variant: 504
## All variants processed
# Cohort 2: eunoNeuP
haplo_euno <- read_tsv("data/eunoNeuP/mutect2.haplogroup1.tab")
vcf_euno <- read.vcfR("data/eunoNeuP/mutect2.03.merge.vcf")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 63
##   header_line: 64
##   variant count: 353
##   column count: 361
## 
Meta line 63 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 353
##   Character matrix gt cols: 361
##   skip: 0
##   nrows: 353
##   row_num: 0
## 
Processed variant: 353
## All variants processed
m2639_index_eu <- which(vcf_eu@fix[, "POS"] == m2639_pos)

gt <- extract.gt(vcf_eu, element = "GT")
m2639_gt <- gt[m2639_index_eu, ]

mutation_data_eu <- data.frame(
  IID_vcf = names(m2639_gt),
  m2639_mutation = ifelse(grepl("1", m2639_gt), 1, 0)
) 

# Merge data
eu_data <- haplo_eu %>%
  left_join(mutation_data_eu, by = c("Run" = "IID_vcf") )%>%
  filter(!is.na(m2639_mutation))

cat("NeuP sample:", nrow(eu_data), "mutation:", sum(eu_data$m2639_mutation, na.rm = TRUE), "\n")
## NeuP sample: 682 mutation: 13
m2639_index_euno <- which(vcf_euno@fix[, "POS"] == m2639_pos)

gt <- extract.gt(vcf_euno, element = "GT")
m2639_gt <- gt[m2639_index_euno, ]

mutation_data_euno <- data.frame(
  IID_vcf = names(m2639_gt),
  m2639_mutation = ifelse(grepl("1", m2639_gt), 1, 0)
) 

# Merge data
euno_data <- haplo_euno %>%
  left_join(mutation_data_euno, by = c("Run" = "IID_vcf") )%>%
  filter(!is.na(m2639_mutation))

cat("No NeuP sample:", nrow(euno_data), "mutation:", sum(euno_data$m2639_mutation, na.rm = TRUE), "\n")
## No NeuP sample: 352 mutation: 4
final_data_eu <- bind_rows(eu_data, euno_data)


total_samples <- nrow(final_data_eu)
mutated_samples <- sum(final_data_eu$m2639_mutation)
mutation_rate <- mutated_samples / total_samples * 100

cat("**Total samples analyzed:**", total_samples, "\n\n")
## **Total samples analyzed:** 1034
cat("**Samples with m.2639 mutation:**", mutated_samples, "\n\n") 
## **Samples with m.2639 mutation:** 17
cat("**Overall mutation frequency:**", round(mutation_rate, 2), "%\n")
## **Overall mutation frequency:** 1.64 %

Specific Enrichment in N Haplogroup

haplo_summary_eu <- final_data_eu %>%
  group_by(haplogroup) %>%
  summarise(
    n_samples = n(),
    n_mutated = sum(m2639_mutation),
    mutation_freq = n_mutated / n_samples * 100
  ) %>%
  arrange(desc(mutation_freq))


haplo_summary_eu %>%
  knitr::kable(
    col.names = c("Haplogroup", "Total Samples", "Mutated Samples", "Mutation Frequency (%)"),
    digits = 2
  )
Haplogroup Total Samples Mutated Samples Mutation Frequency (%)
N 22 17 77.27
A 3 0 0.00
B 1 0 0.00
C 1 0 0.00
D 2 0 0.00
H 427 0 0.00
HV 29 0 0.00
I 27 0 0.00
J 94 0 0.00
JT 1 0 0.00
K 121 0 0.00
L0 1 0 0.00
L2 7 0 0.00
L3 2 0 0.00
M 7 0 0.00
R 6 0 0.00
T 87 0 0.00
U 140 0 0.00
V 18 0 0.00
W 11 0 0.00
X 25 0 0.00
Z 2 0 0.00

Comparison of N Haplogroup frequency

Haplogroup distribution

# 计算 haplogroup 分布比例
haplo_dist_main <- final_data %>%
  count(haplogroup, name = "n_main") %>%
  mutate(pct_main = n_main / sum(n_main) * 100)

haplo_dist_eu <- final_data_eu %>%
  count(haplogroup, name = "n_eu") %>%
  mutate(pct_eu = n_eu / sum(n_eu) * 100)

# 合并
haplo_compare <- full_join(haplo_dist_main, haplo_dist_eu, by = "haplogroup") %>%
  mutate(across(c(n_main, n_eu, pct_main, pct_eu), ~replace_na(., 0))) %>%
  arrange(desc(pct_main))

# 输出比较表
haplo_compare %>%
  knitr::kable(
    col.names = c("Haplogroup", 
                  "UKB (n)", "UKB (%)",
                  "Dolorisk (n)", "Dolorisk (%)"),
    digits = 2
  )
Haplogroup UKB (n) UKB (%) Dolorisk (n) Dolorisk (%)
H 1655 43.59 427 41.30
U 479 12.62 140 13.54
J 424 11.17 94 9.09
T 380 10.01 87 8.41
K 319 8.40 121 11.70
V 140 3.69 18 1.74
I 128 3.37 27 2.61
W 86 2.26 11 1.06
HV 77 2.03 29 2.80
X 45 1.19 25 2.42
R 11 0.29 6 0.58
M 10 0.26 7 0.68
A 9 0.24 3 0.29
N 9 0.24 22 2.13
L2 8 0.21 7 0.68
L3 6 0.16 2 0.19
B 3 0.08 1 0.10
C 3 0.08 1 0.10
L1 3 0.08 0 0.00
D 1 0.03 2 0.19
L0 1 0.03 1 0.10
JT 0 0.00 1 0.10
Z 0 0.00 2 0.19
haplo_long <- haplo_compare %>%
  select(haplogroup, pct_main, pct_eu) %>%
  pivot_longer(cols = starts_with("pct"),
               names_to = "dataset", values_to = "percentage") %>%
  mutate(dataset = recode(dataset,
                          pct_main = "UKB",
                          pct_eu = "Dolorisk"))

ggplot(haplo_long, aes(x = reorder(haplogroup, -percentage),
                       y = percentage,
                       fill = dataset)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(x = "Haplogroup", y = "Sample Proportion (%)",
       title = "Comparison of Haplogroup Distribution") +
  theme_minimal()

Chi-square test

haplo_table <- haplo_compare %>%
  select(haplogroup, n_main, n_eu) %>%
  column_to_rownames("haplogroup") %>%
  as.matrix()

chisq.test(haplo_table)
## 
##  Pearson's Chi-squared test
## 
## data:  haplo_table
## X-squared = 117.66, df = 22, p-value = 4.627e-15

Note that there is some small number (<5) haplogroup, the chi-square test may not accurate.