m.2639 Mutation Analysis Report

This report analyzed the m.2639 mitochondrial mutation across different haplogroups using data from mutect2.haplogroup1.tab and mutect2.03.merge.vcf.

Data processing

First import data, then merge haplogroup information with genetic variant data. Extract m.2639 mutation status for all samples.

# Load clean haplogroup data
haplo_clean <- read_tsv("data/mutect2.haplogroup1.tab")

# Load VCF data
vcf <- read.vcfR("data/mutect2.03.merge.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 47
##   header_line: 48
##   variant count: 4394
##   column count: 3807
## 
Meta line 47 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 4394
##   Character matrix gt cols: 3807
##   skip: 0
##   nrows: 4394
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant: 4394
## All variants processed

# Clean sample IDs
haplo_clean <- haplo_clean %>%
  mutate(IID_clean = str_split_i(Run, "_", 1))

# Extract m.2639 genotypes
m2639_pos <- "2639"
m2639_index <- which(vcf@fix[, "POS"] == m2639_pos)
gt <- extract.gt(vcf, element = "GT")
m2639_gt <- gt[m2639_index, ]

mutation_data <- data.frame(
  IID_vcf = names(m2639_gt),
  m2639_mutation = ifelse(grepl("1", m2639_gt), 1, 0)
) %>%
  mutate(IID_clean = str_split_i(IID_vcf, "_", 1))

# Merge data
final_data <- haplo_clean %>%
  left_join(mutation_data, by = "IID_clean") %>%
  filter(!is.na(m2639_mutation))

Summary the merge data:

total_samples <- nrow(final_data)
mutated_samples <- sum(final_data$m2639_mutation)
mutation_rate <- mutated_samples / total_samples * 100

cat("**Total samples analyzed:**", total_samples, "\n\n")

## **Total samples analyzed:** 3797

cat("**Samples with m.2639 mutation:**", mutated_samples, "\n\n")

## **Samples with m.2639 mutation:** 7

cat("**Overall mutation frequency:**", round(mutation_rate, 2), "%\n")

## **Overall mutation frequency:** 0.18 %

Calculated mutation frequencies across haplogroups

haplo_summary <- final_data %>%
  group_by(haplogroup) %>%
  summarise(
    n_samples = n(),
    n_mutated = sum(m2639_mutation),
    mutation_freq = n_mutated / n_samples * 100
  ) %>%
  arrange(desc(mutation_freq))

# Display top haplogroups with mutations
haplo_summary_mutated <- haplo_summary %>%
  filter(n_mutated > 0)



haplo_summary %>%
  knitr::kable(
    col.names = c("Haplogroup", "Total Samples", "Mutated Samples", "Mutation Frequency (%)"),
    digits = 2
  )

Haplogroup	Total Samples	Mutated Samples	Mutation Frequency (%)
N	9	7	77.78
A	9	0	0.00
B	3	0	0.00
C	3	0	0.00
D	1	0	0.00
H	1655	0	0.00
HV	77	0	0.00
I	128	0	0.00
J	424	0	0.00
K	319	0	0.00
L0	1	0	0.00
L1	3	0	0.00
L2	8	0	0.00
L3	6	0	0.00
M	10	0	0.00
R	11	0	0.00
T	380	0	0.00
U	479	0	0.00
V	140	0	0.00
W	86	0	0.00
X	45	0	0.00

Specific Enrichment in N Haplogroup

The m.2639 mutation shows an high frequency (77.78%) within the N haplogroup.

This represents a significant enrichment pattern compared to other haplogroups.

However, with the small number in the N haplogroup, the result may need further validation.

Dolorisk

Load in data for Dolorisk

# Cohort 1: euNeuP
haplo_eu <- read_tsv("data/euNeuP/mutect2.haplogroup1.tab")
vcf_eu <- read.vcfR("data/euNeuP/mutect2.03.merge.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 63
##   header_line: 64
##   variant count: 504
##   column count: 692
## 
Meta line 63 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 504
##   Character matrix gt cols: 692
##   skip: 0
##   nrows: 504
##   row_num: 0
## 
Processed variant: 504
## All variants processed

# Cohort 2: eunoNeuP
haplo_euno <- read_tsv("data/eunoNeuP/mutect2.haplogroup1.tab")
vcf_euno <- read.vcfR("data/eunoNeuP/mutect2.03.merge.vcf")

## Scanning file to determine attributes.
## File attributes:
##   meta lines: 63
##   header_line: 64
##   variant count: 353
##   column count: 361
## 
Meta line 63 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 353
##   Character matrix gt cols: 361
##   skip: 0
##   nrows: 353
##   row_num: 0
## 
Processed variant: 353
## All variants processed

m2639_index_eu <- which(vcf_eu@fix[, "POS"] == m2639_pos)

gt <- extract.gt(vcf_eu, element = "GT")
m2639_gt <- gt[m2639_index_eu, ]

mutation_data_eu <- data.frame(
  IID_vcf = names(m2639_gt),
  m2639_mutation = ifelse(grepl("1", m2639_gt), 1, 0)
) 

# Merge data
eu_data <- haplo_eu %>%
  left_join(mutation_data_eu, by = c("Run" = "IID_vcf") )%>%
  filter(!is.na(m2639_mutation))

cat("NeuP sample:", nrow(eu_data), "mutation:", sum(eu_data$m2639_mutation, na.rm = TRUE), "\n")

## NeuP sample: 682 mutation: 13

m2639_index_euno <- which(vcf_euno@fix[, "POS"] == m2639_pos)

gt <- extract.gt(vcf_euno, element = "GT")
m2639_gt <- gt[m2639_index_euno, ]

mutation_data_euno <- data.frame(
  IID_vcf = names(m2639_gt),
  m2639_mutation = ifelse(grepl("1", m2639_gt), 1, 0)
) 

# Merge data
euno_data <- haplo_euno %>%
  left_join(mutation_data_euno, by = c("Run" = "IID_vcf") )%>%
  filter(!is.na(m2639_mutation))

cat("No NeuP sample:", nrow(euno_data), "mutation:", sum(euno_data$m2639_mutation, na.rm = TRUE), "\n")

## No NeuP sample: 352 mutation: 4

final_data_eu <- bind_rows(eu_data, euno_data)


total_samples <- nrow(final_data_eu)
mutated_samples <- sum(final_data_eu$m2639_mutation)
mutation_rate <- mutated_samples / total_samples * 100

cat("**Total samples analyzed:**", total_samples, "\n\n")

## **Total samples analyzed:** 1034

cat("**Samples with m.2639 mutation:**", mutated_samples, "\n\n")

## **Samples with m.2639 mutation:** 17

cat("**Overall mutation frequency:**", round(mutation_rate, 2), "%\n")

## **Overall mutation frequency:** 1.64 %

Specific Enrichment in N Haplogroup

haplo_summary_eu <- final_data_eu %>%
  group_by(haplogroup) %>%
  summarise(
    n_samples = n(),
    n_mutated = sum(m2639_mutation),
    mutation_freq = n_mutated / n_samples * 100
  ) %>%
  arrange(desc(mutation_freq))


haplo_summary_eu %>%
  knitr::kable(
    col.names = c("Haplogroup", "Total Samples", "Mutated Samples", "Mutation Frequency (%)"),
    digits = 2
  )

Haplogroup	Total Samples	Mutated Samples	Mutation Frequency (%)
N	22	17	77.27
A	3	0	0.00
B	1	0	0.00
C	1	0	0.00
D	2	0	0.00
H	427	0	0.00
HV	29	0	0.00
I	27	0	0.00
J	94	0	0.00
JT	1	0	0.00
K	121	0	0.00
L0	1	0	0.00
L2	7	0	0.00
L3	2	0	0.00
M	7	0	0.00
R	6	0	0.00
T	87	0	0.00
U	140	0	0.00
V	18	0	0.00
W	11	0	0.00
X	25	0	0.00
Z	2	0	0.00

Comparison of N Haplogroup frequency

Haplogroup distribution

# 计算 haplogroup 分布比例
haplo_dist_main <- final_data %>%
  count(haplogroup, name = "n_main") %>%
  mutate(pct_main = n_main / sum(n_main) * 100)

haplo_dist_eu <- final_data_eu %>%
  count(haplogroup, name = "n_eu") %>%
  mutate(pct_eu = n_eu / sum(n_eu) * 100)

# 合并
haplo_compare <- full_join(haplo_dist_main, haplo_dist_eu, by = "haplogroup") %>%
  mutate(across(c(n_main, n_eu, pct_main, pct_eu), ~replace_na(., 0))) %>%
  arrange(desc(pct_main))

# 输出比较表
haplo_compare %>%
  knitr::kable(
    col.names = c("Haplogroup", 
                  "UKB (n)", "UKB (%)",
                  "Dolorisk (n)", "Dolorisk (%)"),
    digits = 2
  )

Haplogroup	UKB (n)	UKB (%)	Dolorisk (n)	Dolorisk (%)
H	1655	43.59	427	41.30
U	479	12.62	140	13.54
J	424	11.17	94	9.09
T	380	10.01	87	8.41
K	319	8.40	121	11.70
V	140	3.69	18	1.74
I	128	3.37	27	2.61
W	86	2.26	11	1.06
HV	77	2.03	29	2.80
X	45	1.19	25	2.42
R	11	0.29	6	0.58
M	10	0.26	7	0.68
A	9	0.24	3	0.29
N	9	0.24	22	2.13
L2	8	0.21	7	0.68
L3	6	0.16	2	0.19
B	3	0.08	1	0.10
C	3	0.08	1	0.10
L1	3	0.08	0	0.00
D	1	0.03	2	0.19
L0	1	0.03	1	0.10
JT	0	0.00	1	0.10
Z	0	0.00	2	0.19

haplo_long <- haplo_compare %>%
  select(haplogroup, pct_main, pct_eu) %>%
  pivot_longer(cols = starts_with("pct"),
               names_to = "dataset", values_to = "percentage") %>%
  mutate(dataset = recode(dataset,
                          pct_main = "UKB",
                          pct_eu = "Dolorisk"))

ggplot(haplo_long, aes(x = reorder(haplogroup, -percentage),
                       y = percentage,
                       fill = dataset)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(x = "Haplogroup", y = "Sample Proportion (%)",
       title = "Comparison of Haplogroup Distribution") +
  theme_minimal()

Chi-square test

haplo_table <- haplo_compare %>%
  select(haplogroup, n_main, n_eu) %>%
  column_to_rownames("haplogroup") %>%
  as.matrix()

chisq.test(haplo_table)

## 
##  Pearson's Chi-squared test
## 
## data:  haplo_table
## X-squared = 117.66, df = 22, p-value = 4.627e-15

Note that there is some small number (<5) haplogroup, the chi-square test may not accurate.

m.2639 Mutation Analysis Report

Hening Cui

December 04, 2025

Data processing

Specific Enrichment in N Haplogroup

Dolorisk

Specific Enrichment in N Haplogroup

Comparison of N Haplogroup frequency

Haplogroup distribution

Chi-square test