Software

library(ggplot2)
library(dplyr)
library(tidyr)
library(knitr)
library(stringr)

Data

## CanFam 3.1
setwd("C:/Users/edlarsen/Documents/StructuralVariants")
vcf_Cfam3 <- read.csv("PTCLandCTRL_StructuralVariants_CanFam3.filtered.ann.csv")
valid_chromosomes <- c(as.character(1:38), "X")
vcf_Cfam3 <- vcf_Cfam3[vcf_Cfam3$X.CHROM %in% valid_chromosomes, ]

# Exclude any variants only called in controls 
## define ptcl and ctrl columns
ptcl_samples <- c("CF215206",   "CF215393", "CF215422", "CF215442", "CF217246") # must correspond to column names in csv file
ctrl_samples <- c("CF51135",    "CF51406",  "CF53469",  "CF54434")

## Exclude rows where all PTCL samples are "0/0:" (meaning no variant call)
ptcl_vcf_Cfam3 <- vcf_Cfam3 %>%
  filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("0/0:"))))

## Exclude rows where all PTCL samples are "./.:" (meaning missing data)
ptcl_vcf_Cfam3 <- ptcl_vcf_Cfam3 %>%
  filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("./.:"))))

## CanFam 4
vcf_Cfam4 <- read.csv("PTCLandCTRL_StructuralVariants_CanFam4.filtered.ann.csv")
vcf_Cfam4 <- vcf_Cfam4[vcf_Cfam4$X.CHROM %in% valid_chromosomes, ]
ptcl_vcf_Cfam4 <- vcf_Cfam4 %>%
  filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("0/0:"))))
ptcl_vcf_Cfam4 <- ptcl_vcf_Cfam4 %>%
  filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("./.:"))))

Evaluate overlap of variants called in both PTCL and control samples

# Include only rows where variants were called in both a PTCL sample and a control sample
shared_vcf_Cfam3 <- vcf_Cfam3 %>%
  # Keep rows where at least one PTCL sample has a variant
  filter(if_any(all_of(ptcl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))) %>%
  # Keep rows where at least one control sample has a variant
  filter(if_any(all_of(ctrl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:")))))

shared_vcf_Cfam4 <- vcf_Cfam4 %>%
  # Keep rows where at least one PTCL sample has a variant
  filter(if_any(all_of(ptcl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))) %>%
  # Keep rows where at least one control sample has a variant
  filter(if_any(all_of(ctrl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:")))))

Number of variants per chromosome

# CanFam 3.1
shared_variant_counts_Cfam3 <- table(shared_vcf_Cfam3$X.CHROM)
shared_variant_df_Cfam3 <- as.data.frame(shared_variant_counts_Cfam3)
colnames(shared_variant_df_Cfam3) <- c("Chromosome", "VariantCount")

print(ggplot(shared_variant_df_Cfam3, aes(x = Chromosome, y = VariantCount)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome for variants called in \nboth PTCLs and controls (CanFam 3.1)",
    x = "Chromosome",
    y = "Number of Variants"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)))

# CanFam 4
shared_variant_counts_Cfam4 <- table(shared_vcf_Cfam4$X.CHROM)
shared_variant_df_Cfam4 <- as.data.frame(shared_variant_counts_Cfam4)
colnames(shared_variant_df_Cfam4) <- c("Chromosome", "VariantCount")

print(ggplot(shared_variant_df_Cfam4, aes(x = Chromosome, y = VariantCount)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome for variants called in \nboth PTCLs and controls (CanFam4)",
    x = "Chromosome",
    y = "Number of Variants"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)))

Types of variants by chromosome

## CanFam 3.1
# Extract the variant type (SVTYPE) from the INFO column
shared_vcf_Cfam3$SVTYPE <- sapply(shared_vcf_Cfam3$INFO, function(info) {
  match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
  if (length(match) > 0) {
    sub("SVTYPE=", "", match)
  } else {
    NA
  }
})

shared_vcf_Cfam3 <- shared_vcf_Cfam3[!is.na(shared_vcf_Cfam3$SVTYPE),] # remove NAs

# Summarize the counts of each variant type per chromosome
shared_variant_summary_Cfam3 <- shared_vcf_Cfam3 %>%
  group_by(X.CHROM, SVTYPE) %>%
  summarise(Count = n()) %>%
  ungroup()

# Rename chromosome column for better usability in plotting
colnames(shared_variant_summary_Cfam3)[1] <- "Chromosome"

# Ensure the chromosome column is ordered correctly
shared_variant_summary_Cfam3$Chromosome <- factor(shared_variant_summary_Cfam3$Chromosome, levels = c(as.character(1:38), "X"))

# Plot the number of variants per chromosome by type

ggplot(shared_variant_summary_Cfam3, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome by type for variants called in \nboth PTCLs and controls (CanFam 3.1)",
    x = "Chromosome",
    y = "Number of Variants",
    fill = "Variant Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set2")

kable(shared_variant_summary_Cfam3, caption = "Variant count per chromosome, shared Samples (CanFam 3.1)")
Variant count per chromosome, shared Samples (CanFam 3.1)
Chromosome SVTYPE Count
1 BND 31
1 DEL 1788
1 DUP 177
1 INS 2062
1 INV 22
10 BND 8
10 DEL 991
10 DUP 111
10 INS 1196
10 INV 11
11 BND 21
11 DEL 1125
11 DUP 116
11 INS 1236
11 INV 4
12 BND 29
12 DEL 1405
12 DUP 146
12 INS 1609
12 INV 15
13 BND 8
13 DEL 1001
13 DUP 119
13 INS 1119
13 INV 7
14 BND 13
14 DEL 961
14 DUP 102
14 INS 1133
14 INV 9
15 BND 12
15 DEL 1035
15 DUP 131
15 INS 1203
15 INV 1
16 BND 33
16 DEL 1069
16 DUP 124
16 INS 1279
16 INV 6
17 BND 13
17 DEL 1131
17 DUP 127
17 INS 1265
17 INV 13
18 BND 20
18 DEL 998
18 DUP 120
18 INS 1064
18 INV 10
19 BND 7
19 DEL 921
19 DUP 104
19 INS 1134
19 INV 8
2 BND 19
2 DEL 1282
2 DUP 136
2 INS 1491
2 INV 8
20 BND 14
20 DEL 927
20 DUP 70
20 INS 936
20 INV 10
21 BND 18
21 DEL 917
21 DUP 97
21 INS 1036
21 INV 7
22 BND 12
22 DEL 1019
22 DUP 125
22 INS 1160
22 INV 8
23 BND 15
23 DEL 855
23 DUP 100
23 INS 1090
23 INV 9
24 BND 9
24 DEL 791
24 DUP 51
24 INS 929
24 INV 4
25 BND 15
25 DEL 785
25 DUP 110
25 INS 988
25 INV 3
26 BND 31
26 DEL 895
26 DUP 72
26 INS 1002
26 INV 7
27 BND 14
27 DEL 934
27 DUP 73
27 INS 1152
27 INV 6
28 BND 4
28 DEL 654
28 DUP 54
28 INS 737
28 INV 4
29 BND 12
29 DEL 762
29 DUP 89
29 INS 884
29 INV 4
3 BND 19
3 DEL 1408
3 DUP 154
3 INS 1643
3 INV 10
30 BND 8
30 DEL 690
30 DUP 51
30 INS 854
30 INV 4
31 BND 9
31 DEL 698
31 DUP 109
31 INS 851
31 INV 2
32 BND 8
32 DEL 891
32 DUP 60
32 INS 982
32 INV 5
33 BND 8
33 DEL 596
33 DUP 81
33 INS 704
33 INV 3
34 BND 7
34 DEL 755
34 DUP 78
34 INS 875
34 INV 5
35 BND 5
35 DEL 511
35 DUP 38
35 INS 593
35 INV 2
36 BND 7
36 DEL 517
36 DUP 46
36 INS 592
36 INV 4
37 BND 4
37 DEL 438
37 DUP 60
37 INS 552
37 INV 5
38 BND 1
38 DEL 480
38 DUP 56
38 INS 546
38 INV 2
4 BND 22
4 DEL 1350
4 DUP 146
4 INS 1575
4 INV 1
5 BND 20
5 DEL 1297
5 DUP 138
5 INS 1438
5 INV 5
6 BND 15
6 DEL 1217
6 DUP 137
6 INS 1379
6 INV 8
7 BND 19
7 DEL 1176
7 DUP 135
7 INS 1507
7 INV 9
8 BND 62
8 DEL 1448
8 DUP 113
8 INS 1689
8 INV 17
9 BND 44
9 DEL 1097
9 DUP 96
9 INS 1252
9 INV 20
X BND 75
X DEL 1465
X DUP 120
X INS 1852
X INV 30
shared_total_variant_summary_Cfam3 <- shared_vcf_Cfam3 %>%
  group_by(SVTYPE) %>%
  summarise(Total_Count = n()) %>%
  ungroup()

kable(shared_total_variant_summary_Cfam3, caption = "Total variant count by type, shared Samples (CanFam 3.1)")
Total variant count by type, shared Samples (CanFam 3.1)
SVTYPE Total_Count
BND 691
DEL 38280
DUP 3972
INS 44589
INV 308
## CanFam4
# Extract the variant type (SVTYPE) from the INFO column
shared_vcf_Cfam4$SVTYPE <- sapply(shared_vcf_Cfam4$INFO, function(info) {
  match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
  if (length(match) > 0) {
    sub("SVTYPE=", "", match)
  } else {
    NA
  }
})

shared_vcf_Cfam4 <- shared_vcf_Cfam4[!is.na(shared_vcf_Cfam4$SVTYPE),] # remove NAs

# Summarize the counts of each variant type per chromosome
shared_variant_summary_Cfam4 <- shared_vcf_Cfam4 %>%
  group_by(X.CHROM, SVTYPE) %>%
  summarise(Count = n()) %>%
  ungroup()

# Rename chromosome column for better usability in plotting
colnames(shared_variant_summary_Cfam4)[1] <- "Chromosome"

# Ensure the chromosome column is ordered correctly
shared_variant_summary_Cfam4$Chromosome <- factor(shared_variant_summary_Cfam4$Chromosome, levels = c(as.character(1:38), "X"))

# Plot the number of variants per chromosome by type

ggplot(shared_variant_summary_Cfam4, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome by type for variants called in \nboth PTCLs and controls (CanFam4)",
    x = "Chromosome",
    y = "Number of Variants",
    fill = "Variant Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set2")

kable(shared_variant_summary_Cfam4, caption = "Variant count per chromosome, shared Samples (CanFam4)")
Variant count per chromosome, shared Samples (CanFam4)
Chromosome SVTYPE Count
1 BND 24
1 DEL 1922
1 DUP 199
1 INS 1994
1 INV 16
10 BND 13
10 DEL 1088
10 DUP 128
10 INS 1183
10 INV 14
11 BND 16
11 DEL 1167
11 DUP 106
11 INS 1274
11 INV 3
12 BND 12
12 DEL 1500
12 DUP 145
12 INS 1593
12 INV 11
13 BND 13
13 DEL 1033
13 DUP 128
13 INS 1061
13 INV 8
14 BND 19
14 DEL 998
14 DUP 99
14 INS 1106
14 INV 9
15 BND 8
15 DEL 1067
15 DUP 129
15 INS 1171
15 INV 7
16 BND 8
16 DEL 1024
16 DUP 119
16 INS 1100
16 INV 5
17 BND 12
17 DEL 1148
17 DUP 108
17 INS 1194
17 INV 8
18 BND 41
18 DEL 1109
18 DUP 120
18 INS 1098
18 INV 11
19 BND 20
19 DEL 1067
19 DUP 135
19 INS 1096
19 INV 8
2 BND 15
2 DEL 1283
2 DUP 137
2 INS 1347
2 INV 11
20 BND 9
20 DEL 925
20 DUP 96
20 INS 990
20 INV 6
21 BND 13
21 DEL 1019
21 DUP 113
21 INS 1089
21 INV 8
22 BND 15
22 DEL 1083
22 DUP 110
22 INS 1115
22 INV 11
23 BND 6
23 DEL 943
23 DUP 87
23 INS 1055
23 INV 9
24 BND 11
24 DEL 866
24 DUP 79
24 INS 985
24 INV 4
25 BND 9
25 DEL 825
25 DUP 114
25 INS 977
25 INV 4
26 BND 22
26 DEL 894
26 DUP 85
26 INS 933
26 INV 7
27 BND 13
27 DEL 1020
27 DUP 69
27 INS 1138
27 INV 4
28 BND 8
28 DEL 769
28 DUP 72
28 INS 720
28 INV 4
29 BND 9
29 DEL 892
29 DUP 99
29 INS 877
29 INV 2
3 BND 15
3 DEL 1437
3 DUP 160
3 INS 1638
3 INV 10
30 BND 6
30 DEL 819
30 DUP 69
30 INS 828
30 INV 5
31 BND 5
31 DEL 873
31 DUP 98
31 INS 835
31 INV 7
32 BND 16
32 DEL 1058
32 DUP 103
32 INS 1093
32 INV 4
33 BND 4
33 DEL 818
33 DUP 113
33 INS 716
33 INV 6
34 BND 12
34 DEL 787
34 DUP 110
34 INS 922
34 INV 5
35 BND 33
35 DEL 722
35 DUP 76
35 INS 772
35 INV 1
36 BND 3
36 DEL 631
36 DUP 62
36 INS 631
36 INV 9
37 BND 2
37 DEL 492
37 DUP 68
37 INS 561
37 INV 2
38 BND 5
38 DEL 608
38 DUP 57
38 INS 556
38 INV 4
4 BND 13
4 DEL 1424
4 DUP 157
4 INS 1564
4 INV 8
5 BND 10
5 DEL 1295
5 DUP 134
5 INS 1378
5 INV 5
6 BND 14
6 DEL 1263
6 DUP 136
6 INS 1331
6 INV 9
7 BND 17
7 DEL 1270
7 DUP 125
7 INS 1462
7 INV 7
8 BND 40
8 DEL 1441
8 DUP 133
8 INS 1675
8 INV 16
9 BND 34
9 DEL 975
9 DUP 87
9 INS 1240
9 INV 15
X BND 55
X DEL 1521
X DUP 126
X INS 1543
X INV 28
shared_total_variant_summary_Cfam4 <- shared_vcf_Cfam4 %>%
  group_by(SVTYPE) %>%
  summarise(Total_Count = n()) %>%
  ungroup()

kable(shared_total_variant_summary_Cfam4, caption = "Total variant count by type, shared Samples (CanFam4)")
Total variant count by type, shared Samples (CanFam4)
SVTYPE Total_Count
BND 600
DEL 41076
DUP 4291
INS 43841
INV 311

Evaluate tumor-specific variants not called in controls

# Exclude rows where ANY control sample has a variant (i.e., not 0/0: or ./.:)
ptcl_only_vcf_Cfam3 <- ptcl_vcf_Cfam3 %>%
  filter(if_all(all_of(ctrl_samples), ~ str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))

ptcl_only_vcf_Cfam4 <- ptcl_vcf_Cfam4 %>%
  filter(if_all(all_of(ctrl_samples), ~ str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))

Number of variants per chromosome

# CanFam 3.1
ptcl_only_variant_counts_Cfam3 <- table(ptcl_only_vcf_Cfam3$X.CHROM)
ptcl_only_variant_df_Cfam3 <- as.data.frame(ptcl_only_variant_counts_Cfam3)
colnames(ptcl_only_variant_df_Cfam3) <- c("Chromosome", "VariantCount")

print(ggplot(ptcl_only_variant_df_Cfam3, aes(x = Chromosome, y = VariantCount)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome, PTCL-specific variants (CanFam 3.1)",
    x = "Chromosome",
    y = "Number of Variants"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)))

# CanFam 4
ptcl_only_variant_counts_Cfam4 <- table(ptcl_only_vcf_Cfam4$X.CHROM)
ptcl_only_variant_df_Cfam4 <- as.data.frame(ptcl_only_variant_counts_Cfam4)
colnames(ptcl_only_variant_df_Cfam4) <- c("Chromosome", "VariantCount")

print(ggplot(ptcl_only_variant_df_Cfam4, aes(x = Chromosome, y = VariantCount)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome, PTCL-specific variants (CanFam4)",
    x = "Chromosome",
    y = "Number of Variants"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)))

Types of variants by chromosome

## CanFam 3.1
# Extract the variant type (SVTYPE) from the INFO column
ptcl_only_vcf_Cfam3$SVTYPE <- sapply(ptcl_only_vcf_Cfam3$INFO, function(info) {
  match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
  if (length(match) > 0) {
    sub("SVTYPE=", "", match)
  } else {
    NA
  }
})

ptcl_only_vcf_Cfam3 <- ptcl_only_vcf_Cfam3[!is.na(ptcl_only_vcf_Cfam3$SVTYPE),] # remove NAs

# Summarize the counts of each variant type per chromosome
ptcl_only_variant_summary_Cfam3 <- ptcl_only_vcf_Cfam3 %>%
  group_by(X.CHROM, SVTYPE) %>%
  summarise(Count = n()) %>%
  ungroup()

# Rename chromosome column for better usability in plotting
colnames(ptcl_only_variant_summary_Cfam3)[1] <- "Chromosome"

# Ensure the chromosome column is ordered correctly
ptcl_only_variant_summary_Cfam3$Chromosome <- factor(ptcl_only_variant_summary_Cfam3$Chromosome, levels = c(as.character(1:38), "X"))

# Plot the number of variants per chromosome by type
ggplot(ptcl_only_variant_summary_Cfam3, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "Structural variants per chromosome by type,\nPTCL-specific variants (CanFam 3.1)",
    x = "Chromosome",
    y = "Number of Variants",
    fill = "Variant Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set2")

kable(ptcl_only_variant_summary_Cfam3, caption = "Variant count per chromosome, PTCL-specific variants (CanFam 3.1)")
Variant count per chromosome, PTCL-specific variants (CanFam 3.1)
Chromosome SVTYPE Count
1 BND 4
1 DEL 478
1 DUP 83
1 INS 1276
1 INV 4
10 BND 2
10 DEL 220
10 DUP 49
10 INS 638
11 BND 34
11 DEL 234
11 DUP 36
11 INS 662
11 INV 6
12 BND 6
12 DEL 285
12 DUP 51
12 INS 916
12 INV 4
13 BND 6
13 DEL 193
13 DUP 36
13 INS 673
13 INV 1
14 BND 4
14 DEL 211
14 DUP 46
14 INS 669
14 INV 3
15 BND 2
15 DEL 279
15 DUP 53
15 INS 810
15 INV 3
16 BND 14
16 DEL 199
16 DUP 30
16 INS 581
17 BND 11
17 DEL 258
17 DUP 48
17 INS 806
17 INV 3
18 BND 12
18 DEL 196
18 DUP 35
18 INS 613
18 INV 6
19 BND 3
19 DEL 188
19 DUP 36
19 INS 555
19 INV 3
2 BND 1
2 DEL 279
2 DUP 49
2 INS 928
2 INV 5
20 BND 7
20 DEL 182
20 DUP 29
20 INS 682
20 INV 3
21 BND 5
21 DEL 265
21 DUP 52
21 INS 747
21 INV 2
22 BND 3
22 DEL 246
22 DUP 58
22 INS 696
22 INV 1
23 BND 2
23 DEL 165
23 DUP 30
23 INS 601
23 INV 2
24 BND 3
24 DEL 147
24 DUP 25
24 INS 492
24 INV 1
25 BND 5
25 DEL 162
25 DUP 33
25 INS 459
25 INV 3
26 BND 8
26 DEL 210
26 DUP 38
26 INS 672
26 INV 5
27 BND 5
27 DEL 180
27 DUP 30
27 INS 607
27 INV 1
28 BND 2
28 DEL 107
28 DUP 34
28 INS 404
28 INV 1
29 BND 2
29 DEL 195
29 DUP 37
29 INS 576
29 INV 2
3 BND 8
3 DEL 293
3 DUP 67
3 INS 958
3 INV 4
30 BND 2
30 DEL 115
30 DUP 27
30 INS 500
30 INV 1
31 BND 1
31 DEL 202
31 DUP 42
31 INS 584
31 INV 2
32 BND 6
32 DEL 163
32 DUP 28
32 INS 503
32 INV 2
33 BND 6
33 DEL 123
33 DUP 40
33 INS 438
33 INV 2
34 BND 3
34 DEL 144
34 DUP 22
34 INS 427
34 INV 4
35 DEL 101
35 DUP 22
35 INS 358
35 INV 2
36 BND 2
36 DEL 116
36 DUP 20
36 INS 335
37 BND 1
37 DEL 154
37 DUP 21
37 INS 343
38 BND 4
38 DEL 116
38 DUP 21
38 INS 278
38 INV 4
4 BND 23
4 DEL 325
4 DUP 62
4 INS 1042
4 INV 5
5 BND 7
5 DEL 317
5 DUP 61
5 INS 940
5 INV 3
6 BND 5
6 DEL 308
6 DUP 54
6 INS 811
6 INV 2
7 BND 2
7 DEL 291
7 DUP 50
7 INS 863
8 BND 30
8 DEL 279
8 DUP 58
8 INS 1000
8 INV 5
9 BND 1
9 DEL 173
9 DUP 33
9 INS 666
9 INV 5
X BND 19
X DEL 230
X DUP 40
X INS 770
X INV 4
ptcl_only_total_variant_summary_Cfam3 <- ptcl_only_vcf_Cfam3 %>%
  group_by(SVTYPE) %>%
  summarise(Total_Count = n()) %>%
  ungroup()

kable(ptcl_only_total_variant_summary_Cfam3, caption = "Total variant count by type, PTCL-specific variants (CanFam 3.1)")
Total variant count by type, PTCL-specific variants (CanFam 3.1)
SVTYPE Total_Count
BND 261
DEL 8329
DUP 1586
INS 25879
INV 104
## CanFam4
# Extract the variant type (SVTYPE) from the INFO column
ptcl_only_vcf_Cfam4$SVTYPE <- sapply(ptcl_only_vcf_Cfam4$INFO, function(info) {
  match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
  if (length(match) > 0) {
    sub("SVTYPE=", "", match)
  } else {
    NA
  }
})

ptcl_only_vcf_Cfam4 <- ptcl_only_vcf_Cfam4[!is.na(ptcl_only_vcf_Cfam4$SVTYPE),] # remove NAs

# Summarize the counts of each variant type per chromosome
ptcl_only_variant_summary_Cfam4 <- ptcl_only_vcf_Cfam4 %>%
  group_by(X.CHROM, SVTYPE) %>%
  summarise(Count = n()) %>%
  ungroup()

# Rename chromosome column for better usability in plotting
colnames(ptcl_only_variant_summary_Cfam4)[1] <- "Chromosome"

# Ensure the chromosome column is ordered correctly
ptcl_only_variant_summary_Cfam4$Chromosome <- factor(ptcl_only_variant_summary_Cfam4$Chromosome, levels = c(as.character(1:38), "X"))

# Plot the number of variants per chromosome by type
ggplot(ptcl_only_variant_summary_Cfam4, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "Structural variants per chromosome by type,\nPTCL-specific variants (CanFam4)",
    x = "Chromosome",
    y = "Number of Variants",
    fill = "Variant Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set2")

kable(ptcl_only_variant_summary_Cfam4, caption = "Variant count per chromosome, PTCL-specific variants (CanFam4)")
Variant count per chromosome, PTCL-specific variants (CanFam4)
Chromosome SVTYPE Count
1 BND 6
1 DEL 440
1 DUP 79
1 INS 1191
1 INV 4
10 BND 1
10 DEL 225
10 DUP 53
10 INS 627
10 INV 3
11 BND 39
11 DEL 210
11 DUP 42
11 INS 576
11 INV 9
12 BND 10
12 DEL 281
12 DUP 59
12 INS 890
12 INV 2
13 BND 7
13 DEL 201
13 DUP 34
13 INS 627
13 INV 1
14 BND 3
14 DEL 216
14 DUP 39
14 INS 637
14 INV 1
15 BND 4
15 DEL 279
15 DUP 69
15 INS 762
15 INV 3
16 BND 16
16 DEL 174
16 DUP 27
16 INS 509
16 INV 1
17 BND 11
17 DEL 245
17 DUP 58
17 INS 759
17 INV 2
18 BND 11
18 DEL 205
18 DUP 53
18 INS 599
18 INV 8
19 BND 2
19 DEL 209
19 DUP 64
19 INS 584
19 INV 4
2 BND 2
2 DEL 273
2 DUP 56
2 INS 870
2 INV 6
20 BND 5
20 DEL 169
20 DUP 31
20 INS 643
20 INV 3
21 BND 5
21 DEL 233
21 DUP 49
21 INS 716
21 INV 3
22 BND 1
22 DEL 268
22 DUP 57
22 INS 689
22 INV 1
23 DEL 177
23 DUP 35
23 INS 596
23 INV 4
24 BND 2
24 DEL 134
24 DUP 27
24 INS 459
24 INV 1
25 BND 8
25 DEL 171
25 DUP 47
25 INS 434
25 INV 1
26 BND 6
26 DEL 200
26 DUP 30
26 INS 638
26 INV 4
27 BND 5
27 DEL 183
27 DUP 32
27 INS 561
27 INV 3
28 BND 2
28 DEL 116
28 DUP 32
28 INS 399
28 INV 4
29 BND 6
29 DEL 207
29 DUP 46
29 INS 538
29 INV 2
3 BND 15
3 DEL 280
3 DUP 78
3 INS 932
3 INV 2
30 BND 2
30 DEL 127
30 DUP 36
30 INS 478
31 BND 1
31 DEL 218
31 DUP 48
31 INS 580
31 INV 3
32 BND 10
32 DEL 204
32 DUP 36
32 INS 495
32 INV 3
33 BND 4
33 DEL 137
33 DUP 34
33 INS 417
33 INV 2
34 BND 2
34 DEL 133
34 DUP 30
34 INS 409
34 INV 4
35 BND 9
35 DEL 169
35 DUP 50
35 INS 398
35 INV 4
36 BND 3
36 DEL 125
36 DUP 39
36 INS 341
37 DEL 142
37 DUP 39
37 INS 324
37 INV 1
38 BND 3
38 DEL 111
38 DUP 26
38 INS 268
38 INV 3
4 BND 23
4 DEL 307
4 DUP 73
4 INS 974
4 INV 3
5 BND 7
5 DEL 318
5 DUP 60
5 INS 881
5 INV 1
6 BND 9
6 DEL 318
6 DUP 50
6 INS 803
6 INV 2
7 BND 5
7 DEL 273
7 DUP 53
7 INS 826
8 BND 41
8 DEL 380
8 DUP 68
8 INS 1155
8 INV 3
9 BND 2
9 DEL 195
9 DUP 41
9 INS 655
9 INV 4
X BND 13
X DEL 229
X DUP 51
X INS 717
X INV 5
ptcl_only_total_variant_summary_Cfam4 <- ptcl_only_vcf_Cfam4 %>%
  group_by(SVTYPE) %>%
  summarise(Total_Count = n()) %>%
  ungroup()

kable(ptcl_only_total_variant_summary_Cfam4, caption = "Total variant count by type, PTCL-specific variants (CanFam4)")
Total variant count by type, PTCL-specific variants (CanFam4)
SVTYPE Total_Count
BND 301
DEL 8482
DUP 1831
INS 24957
INV 110

Identify shared fusion partners with RNA-seq data

## CanFam 3.1
# read in csv file(s) of RNA fusion calls
star_rna_fusions <- read.csv("CanFam31_StarFusionSummary.csv")
fc_rna_fusions <- read.csv("fusionCatcher-results-ptcl-no-ctrls.csv")

# get list of implicated genes
star_genes_3 <- star_rna_fusions$X3_prime_gene
star_genes_5 <- star_rna_fusions$X5_prime_gene
fc_genes_3 <- fc_rna_fusions$X3_prime_gene
fc_genes_5 <- fc_rna_fusions$X5_prime_gene

all_star_genes <- c(star_genes_3, star_genes_5)
all_star_genes <- unique(all_star_genes)
all_fc_genes <- c(fc_genes_3, fc_genes_5)
all_fc_genes <- unique(all_fc_genes)

# compare to genes in INFO column of BND variant file
bnd_vcf <- read.csv("PTCLandCTRL_BND_Variants.CanFam3.filtered.ann.bcsq.csv")

extract_gene <- function(info) {
  # Find the BCSQ field
  bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
  # Extract the gene name (second element after "|")
  if (!is.na(bcsc_field)) {
    gene <- strsplit(bcsc_field, "\\|")[[1]][2]
    return(gene)
  }
  return(NA)
}

bnd_vcf <- bnd_vcf %>%
  mutate(Gene = sapply(INFO, extract_gene))

shared_star_bnd_Cfam3 <- bnd_vcf %>%
  filter(Gene %in% all_star_genes)

shared_fc_bnd_Cfam3 <- bnd_vcf %>%
  filter(Gene %in% all_fc_genes)

# print summary
print(unique(shared_star_bnd_Cfam3$Gene))
##  [1] "PALM2AKAP2"         "ASCC3"              "FBXW7"             
##  [4] "ENSCAFG00000014478" "ENSCAFG00000048749" "CPT1A"             
##  [7] "CD6"                "MAML3"              "TNFRSF1B"          
## [10] "MITF"               "DOCK3"              "NBEA"              
## [13] "TOX"                "ENSCAFG00000032474" "UIMC1"             
## [16] "ENSCAFG00000025128" "ENSCAFG00000030258" "ENSCAFG00000028509"
## [19] "ENSCAFG00000028642" "ENSCAFG00000044010"
print(unique(shared_fc_bnd_Cfam3$Gene))
##  [1] "CDO1"     "KCNQ5"    "SND1"     "CRADD"    "GATB"     "FBXW7"   
##  [7] "TRBV28"   "DGUOK"    "MAGI2"    "IGHMBP2"  "CPT1A"    "CD6"     
## [13] "MAML3"    "ZRANB3"   "VPS13D"   "TNFRSF1B" "MITF"     "ITPR2"   
## [19] "TOX"      "FAM172A"  "UBE2E3"   "ATP6V0E1" "RABGAP1L" "ACACA"   
## [25] "ANOS1"
# export
write.csv(shared_star_bnd_Cfam3, file = "Spr25_shared_PacBio_STARFusion_calls.CanFam3.csv")
write.csv(shared_fc_bnd_Cfam3, file = "Spr25_shared_PacBio_FusionCatcher_calls.CanFam3.csv")
## CanFam4
# read in csv file of RNA fusion calls
star_rna_fusions <- read.csv("CanFam4_StarFusionSummary.csv")

# get list of implicated genes
genes_3 <- star_rna_fusions$X3_prime_gene
genes_5 <- star_rna_fusions$X5_prime_gene
all_genes <- c(genes_3, genes_5)
all_genes <- unique(all_genes)

# compare to genes in INFO column of BND variant file
bnd_vcf <- read.csv("PTCLandCTRL_BND_Variants.CanFam4.filtered.ann.bcsq.csv")

extract_gene <- function(info) {
  # Find the BCSQ field
  bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
  # Extract the gene name (second element after "|")
  if (!is.na(bcsc_field)) {
    gene <- strsplit(bcsc_field, "\\|")[[1]][2]
    return(gene)
  }
  return(NA)
}

bnd_vcf <- bnd_vcf %>%
  mutate(Gene = sapply(INFO, extract_gene))

shared_bnd_Cfam4 <- bnd_vcf %>%
  filter(Gene %in% all_genes)

# print summary
print(unique(shared_bnd_Cfam4$Gene))
##  [1] "ENSCAFG00805005290" "ELMO1"              "FBXW7"             
##  [4] "ENSCAFG00805017629" "ENSCAFG00805023029" "ENSCAFG00805002052"
##  [7] "TNFRSF1B"           "ENSCAFG00805023894" "ENSCAFG00805023901"
## [10] "TOX"                "UIMC1"              "RABGAP1L"          
## [13] "ENSCAFG00805008332" "ENSCAFG00805008471" "ENSCAFG00805008786"
## [16] "ENSCAFG00805005681"
# export
write.csv(shared_bnd_Cfam4, file = "Spr25_shared_PacBio_STARFusion_calls.CanFam4.csv")

Identify structural variants implicating cancer-associated genes

# list of cancer-associated genes from OncoKB
oncokb <- read.csv("cancerGeneList.csv")
oncokb_genes <- oncokb$Hugo.Symbol

extract_gene <- function(info) {
  # Find the BCSQ field
  bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
  # Extract the gene name (second element after "|")
  if (!is.na(bcsc_field)) {
    gene <- strsplit(bcsc_field, "\\|")[[1]][2]
    return(gene)
  }
  return(NA)
}

# Use the vcf objects of PTCL-specific variants
onco_vcf_Cfam3 <- ptcl_only_vcf_Cfam3 %>%
  mutate(Gene = sapply(INFO, extract_gene))

onco_vcf_Cfam3 <- onco_vcf_Cfam3 %>%
  filter(Gene %in% oncokb_genes)

onco_vcf_Cfam4 <- ptcl_only_vcf_Cfam4 %>%
  mutate(Gene = sapply(INFO, extract_gene))

onco_vcf_Cfam4 <- onco_vcf_Cfam4 %>%
  filter(Gene %in% oncokb_genes)

# export
write.csv(onco_vcf_Cfam3, file = "Spr25_CanFam3_PTCL_SVs_in_Cancer_Genes.csv")
write.csv(onco_vcf_Cfam4, file = "Spr25_CanFam4_PTCL_SVs_in_Cancer_Genes.csv")

Identify shared variable genes between CanFam3.1 and CanFam4 alignments

# Extract BCSQ gene annotation and SVTYPE from INFO field and place in separate columns called 'Gene' and 'SVTYPE'
extract_gene <- function(info) {
  # Find the BCSQ field
  bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
  # Extract the gene name (second element after "|")
  if (!is.na(bcsc_field)) {
    gene <- strsplit(bcsc_field, "\\|")[[1]][2]
    return(gene)
  }
  return(NA)
}

extract_variant_type <- function(info) {
  # Find the SVTYPE field
  svtype_field <- stringr::str_extract(info, "SVTYPE=[^;]+")
  # Extract the variant type (first element after "=")
  if (!is.na(svtype_field)) {
    var_type <- strsplit(svtype_field, "=")[[1]][2]
    return(var_type)
  }
  return(NA)
}


# Use the vcf objects of PTCL-specific variants
Cfam3_vcf_alt <- ptcl_only_vcf_Cfam3 %>%
  mutate(Gene = sapply(INFO, extract_gene))
Cfam3_vcf_alt <- Cfam3_vcf_alt %>%
  mutate(SVTYPE = sapply(INFO, extract_variant_type))

Cfam4_vcf_alt <- ptcl_only_vcf_Cfam4 %>%
  mutate(Gene = sapply(INFO, extract_gene))
Cfam4_vcf_alt <- Cfam4_vcf_alt %>%
  mutate(SVTYPE = sapply(INFO, extract_variant_type))

# Combine Gene and SVTYPE info into one column
Cfam3_vcf_alt <- Cfam3_vcf_alt %>%
  mutate(
    SVTYPE_Gene = paste(SVTYPE, Gene, sep = "_")
  )

Cfam4_vcf_alt <- Cfam4_vcf_alt %>%
  mutate(
    SVTYPE_Gene = paste(SVTYPE, Gene, sep = "_")
  )


# Create vector of SVTYPE_Gene column contents
Cfam3_variantGenes <- unique(Cfam3_vcf_alt$SVTYPE_Gene)
Cfam4_variantGenes <- unique(Cfam4_vcf_alt$SVTYPE_Gene)

# Filter variant calls of each alignment to include only those of the same variant type in shared genes
Cfam3_shared <- Cfam3_vcf_alt %>%
  filter(SVTYPE_Gene %in% Cfam4_variantGenes) %>%
  filter(Gene != "NA")

Cfam4_shared <- Cfam4_vcf_alt %>%
  filter(SVTYPE_Gene %in% Cfam3_variantGenes) %>%
  filter(Gene != "NA")

# print summary of total number of shared variable genes
paste("Number of genes with the same variant type called in CanFam3.1 and CanFam4: ", length(unique(Cfam3_shared$Gene)), sep = " ")
## [1] "Number of genes with the same variant type called in CanFam3.1 and CanFam4:  2869"
# export
write.csv(Cfam3_shared, file = "Spr25_CanFam3_PacBio_PTCL_Variants_in_Shared_Genes_with_CanFam4.csv")
write.csv(Cfam4_shared, file = "Spr25_CanFam4_PacBio_PTCL_Variants_in_Shared_Genes_with_CanFam3.csv")

# Limit to OncoKB cancer-associated genes
Cfam3_shared_oncokb <- Cfam3_shared %>%
  filter(Gene %in% oncokb_genes)

Cfam4_shared_oncokb <- Cfam4_shared %>%
  filter(Gene %in% oncokb_genes)

paste("Number of cancer-associated genes with same variant type called in CanFam3.1 and CanFam4:", length(unique(Cfam3_shared_oncokb$Gene)), sep = " ")
## [1] "Number of cancer-associated genes with same variant type called in CanFam3.1 and CanFam4: 274"
# export
write.csv(Cfam3_shared_oncokb, file = "Spr25_CanFam3_PacBio_PTCL_Variants_in_Shared_OncoKB_Genes_with_CanFam4.csv")
write.csv(Cfam4_shared_oncokb, file = "Spr25_CanFam4_PacBio_PTCL_Variants_in_Shared_OncoKB_Genes_with_CanFam3.csv")

Citations

sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] stringr_1.5.1 knitr_1.49    tidyr_1.3.1   dplyr_1.1.4   ggplot2_3.5.1
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.5        cli_3.6.2          rlang_1.1.3        xfun_0.49         
##  [5] stringi_1.8.4      purrr_1.0.2        generics_0.1.3     jsonlite_1.8.9    
##  [9] labeling_0.4.3     glue_1.8.0         colorspace_2.1-1   htmltools_0.5.8.1 
## [13] sass_0.4.9         scales_1.3.0       rmarkdown_2.29     grid_4.4.0        
## [17] evaluate_1.0.3     munsell_0.5.1      jquerylib_0.1.4    tibble_3.2.1      
## [21] fastmap_1.2.0      yaml_2.3.10        lifecycle_1.0.4    compiler_4.4.0    
## [25] codetools_0.2-20   RColorBrewer_1.1-3 pkgconfig_2.0.3    rstudioapi_0.17.1 
## [29] farver_2.1.2       digest_0.6.35      R6_2.5.1           tidyselect_1.2.1  
## [33] pillar_1.10.1      magrittr_2.0.3     bslib_0.8.0        withr_3.0.2       
## [37] tools_4.4.0        gtable_0.3.6       cachem_1.1.0
citation()
## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.