Software

library(ggplot2)
library(dplyr)
library(tidyr)
library(knitr)
library(stringr)

Data

## CanFam 3.1
setwd("C:/Users/edlarsen/Documents/StructuralVariants")
vcf_Cfam3 <- read.csv("PTCLandCTRL_StructuralVariants_CanFam3.filtered.ann.csv")
valid_chromosomes <- c(as.character(1:38), "X")
vcf_Cfam3 <- vcf_Cfam3[vcf_Cfam3$X.CHROM %in% valid_chromosomes, ]

# Exclude any variants only called in controls 
## define ptcl and ctrl columns
ptcl_samples <- c("CF215206",   "CF215393", "CF215422", "CF215442", "CF217246") # must correspond to column names in csv file
ctrl_samples <- c("CF51135",    "CF51406",  "CF53469",  "CF54434")

## Exclude rows where all PTCL samples are "0/0:" (meaning no variant call)
ptcl_vcf_Cfam3 <- vcf_Cfam3 %>%
  filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("0/0:"))))

## Exclude rows where all PTCL samples are "./.:" (meaning missing data)
ptcl_vcf_Cfam3 <- ptcl_vcf_Cfam3 %>%
  filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("./.:"))))

## CanFam 4
vcf_Cfam4 <- read.csv("PTCLandCTRL_StructuralVariants_CanFam4.filtered.ann.csv")
vcf_Cfam4 <- vcf_Cfam4[vcf_Cfam4$X.CHROM %in% valid_chromosomes, ]
ptcl_vcf_Cfam4 <- vcf_Cfam4 %>%
  filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("0/0:"))))
ptcl_vcf_Cfam4 <- ptcl_vcf_Cfam4 %>%
  filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("./.:"))))

Evaluate overlap of variants called in both PTCL and control samples

# Include only rows where variants were called in both a PTCL sample and a control sample
shared_vcf_Cfam3 <- vcf_Cfam3 %>%
  # Keep rows where at least one PTCL sample has a variant
  filter(if_any(all_of(ptcl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))) %>%
  # Keep rows where at least one control sample has a variant
  filter(if_any(all_of(ctrl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:")))))

shared_vcf_Cfam4 <- vcf_Cfam4 %>%
  # Keep rows where at least one PTCL sample has a variant
  filter(if_any(all_of(ptcl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))) %>%
  # Keep rows where at least one control sample has a variant
  filter(if_any(all_of(ctrl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:")))))

Number of variants per chromosome

# CanFam 3.1
shared_variant_counts_Cfam3 <- table(shared_vcf_Cfam3$X.CHROM)
shared_variant_df_Cfam3 <- as.data.frame(shared_variant_counts_Cfam3)
colnames(shared_variant_df_Cfam3) <- c("Chromosome", "VariantCount")

print(ggplot(shared_variant_df_Cfam3, aes(x = Chromosome, y = VariantCount)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome for variants called in \nboth PTCLs and controls (CanFam 3.1)",
    x = "Chromosome",
    y = "Number of Variants"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)))

# CanFam 4
shared_variant_counts_Cfam4 <- table(shared_vcf_Cfam4$X.CHROM)
shared_variant_df_Cfam4 <- as.data.frame(shared_variant_counts_Cfam4)
colnames(shared_variant_df_Cfam4) <- c("Chromosome", "VariantCount")

print(ggplot(shared_variant_df_Cfam4, aes(x = Chromosome, y = VariantCount)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome for variants called in \nboth PTCLs and controls (CanFam4)",
    x = "Chromosome",
    y = "Number of Variants"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)))

Types of variants by chromosome

## CanFam 3.1
# Extract the variant type (SVTYPE) from the INFO column
shared_vcf_Cfam3$SVTYPE <- sapply(shared_vcf_Cfam3$INFO, function(info) {
  match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
  if (length(match) > 0) {
    sub("SVTYPE=", "", match)
  } else {
    NA
  }
})

shared_vcf_Cfam3 <- shared_vcf_Cfam3[!is.na(shared_vcf_Cfam3$SVTYPE),] # remove NAs

# Summarize the counts of each variant type per chromosome
shared_variant_summary_Cfam3 <- shared_vcf_Cfam3 %>%
  group_by(X.CHROM, SVTYPE) %>%
  summarise(Count = n()) %>%
  ungroup()

# Rename chromosome column for better usability in plotting
colnames(shared_variant_summary_Cfam3)[1] <- "Chromosome"

# Ensure the chromosome column is ordered correctly
shared_variant_summary_Cfam3$Chromosome <- factor(shared_variant_summary_Cfam3$Chromosome, levels = c(as.character(1:38), "X"))

# Plot the number of variants per chromosome by type

ggplot(shared_variant_summary_Cfam3, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome by type for variants called in \nboth PTCLs and controls (CanFam 3.1)",
    x = "Chromosome",
    y = "Number of Variants",
    fill = "Variant Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set2")

kable(shared_variant_summary_Cfam3, caption = "Variant count per chromosome, shared Samples (CanFam 3.1)")

Variant count per chromosome, shared Samples (CanFam 3.1)
Chromosome	SVTYPE	Count
1	BND	31
1	DEL	1788
1	DUP	177
1	INS	2062
1	INV	22
10	BND	8
10	DEL	991
10	DUP	111
10	INS	1196
10	INV	11
11	BND	21
11	DEL	1125
11	DUP	116
11	INS	1236
11	INV	4
12	BND	29
12	DEL	1405
12	DUP	146
12	INS	1609
12	INV	15
13	BND	8
13	DEL	1001
13	DUP	119
13	INS	1119
13	INV	7
14	BND	13
14	DEL	961
14	DUP	102
14	INS	1133
14	INV	9
15	BND	12
15	DEL	1035
15	DUP	131
15	INS	1203
15	INV	1
16	BND	33
16	DEL	1069
16	DUP	124
16	INS	1279
16	INV	6
17	BND	13
17	DEL	1131
17	DUP	127
17	INS	1265
17	INV	13
18	BND	20
18	DEL	998
18	DUP	120
18	INS	1064
18	INV	10
19	BND	7
19	DEL	921
19	DUP	104
19	INS	1134
19	INV	8
2	BND	19
2	DEL	1282
2	DUP	136
2	INS	1491
2	INV	8
20	BND	14
20	DEL	927
20	DUP	70
20	INS	936
20	INV	10
21	BND	18
21	DEL	917
21	DUP	97
21	INS	1036
21	INV	7
22	BND	12
22	DEL	1019
22	DUP	125
22	INS	1160
22	INV	8
23	BND	15
23	DEL	855
23	DUP	100
23	INS	1090
23	INV	9
24	BND	9
24	DEL	791
24	DUP	51
24	INS	929
24	INV	4
25	BND	15
25	DEL	785
25	DUP	110
25	INS	988
25	INV	3
26	BND	31
26	DEL	895
26	DUP	72
26	INS	1002
26	INV	7
27	BND	14
27	DEL	934
27	DUP	73
27	INS	1152
27	INV	6
28	BND	4
28	DEL	654
28	DUP	54
28	INS	737
28	INV	4
29	BND	12
29	DEL	762
29	DUP	89
29	INS	884
29	INV	4
3	BND	19
3	DEL	1408
3	DUP	154
3	INS	1643
3	INV	10
30	BND	8
30	DEL	690
30	DUP	51
30	INS	854
30	INV	4
31	BND	9
31	DEL	698
31	DUP	109
31	INS	851
31	INV	2
32	BND	8
32	DEL	891
32	DUP	60
32	INS	982
32	INV	5
33	BND	8
33	DEL	596
33	DUP	81
33	INS	704
33	INV	3
34	BND	7
34	DEL	755
34	DUP	78
34	INS	875
34	INV	5
35	BND	5
35	DEL	511
35	DUP	38
35	INS	593
35	INV	2
36	BND	7
36	DEL	517
36	DUP	46
36	INS	592
36	INV	4
37	BND	4
37	DEL	438
37	DUP	60
37	INS	552
37	INV	5
38	BND	1
38	DEL	480
38	DUP	56
38	INS	546
38	INV	2
4	BND	22
4	DEL	1350
4	DUP	146
4	INS	1575
4	INV	1
5	BND	20
5	DEL	1297
5	DUP	138
5	INS	1438
5	INV	5
6	BND	15
6	DEL	1217
6	DUP	137
6	INS	1379
6	INV	8
7	BND	19
7	DEL	1176
7	DUP	135
7	INS	1507
7	INV	9
8	BND	62
8	DEL	1448
8	DUP	113
8	INS	1689
8	INV	17
9	BND	44
9	DEL	1097
9	DUP	96
9	INS	1252
9	INV	20
X	BND	75
X	DEL	1465
X	DUP	120
X	INS	1852
X	INV	30

shared_total_variant_summary_Cfam3 <- shared_vcf_Cfam3 %>%
  group_by(SVTYPE) %>%
  summarise(Total_Count = n()) %>%
  ungroup()

kable(shared_total_variant_summary_Cfam3, caption = "Total variant count by type, shared Samples (CanFam 3.1)")

Total variant count by type, shared Samples (CanFam 3.1)
SVTYPE	Total_Count
BND	691
DEL	38280
DUP	3972
INS	44589
INV	308

## CanFam4
# Extract the variant type (SVTYPE) from the INFO column
shared_vcf_Cfam4$SVTYPE <- sapply(shared_vcf_Cfam4$INFO, function(info) {
  match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
  if (length(match) > 0) {
    sub("SVTYPE=", "", match)
  } else {
    NA
  }
})

shared_vcf_Cfam4 <- shared_vcf_Cfam4[!is.na(shared_vcf_Cfam4$SVTYPE),] # remove NAs

# Summarize the counts of each variant type per chromosome
shared_variant_summary_Cfam4 <- shared_vcf_Cfam4 %>%
  group_by(X.CHROM, SVTYPE) %>%
  summarise(Count = n()) %>%
  ungroup()

# Rename chromosome column for better usability in plotting
colnames(shared_variant_summary_Cfam4)[1] <- "Chromosome"

# Ensure the chromosome column is ordered correctly
shared_variant_summary_Cfam4$Chromosome <- factor(shared_variant_summary_Cfam4$Chromosome, levels = c(as.character(1:38), "X"))

# Plot the number of variants per chromosome by type

ggplot(shared_variant_summary_Cfam4, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome by type for variants called in \nboth PTCLs and controls (CanFam4)",
    x = "Chromosome",
    y = "Number of Variants",
    fill = "Variant Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set2")

kable(shared_variant_summary_Cfam4, caption = "Variant count per chromosome, shared Samples (CanFam4)")

Variant count per chromosome, shared Samples (CanFam4)
Chromosome	SVTYPE	Count
1	BND	24
1	DEL	1922
1	DUP	199
1	INS	1994
1	INV	16
10	BND	13
10	DEL	1088
10	DUP	128
10	INS	1183
10	INV	14
11	BND	16
11	DEL	1167
11	DUP	106
11	INS	1274
11	INV	3
12	BND	12
12	DEL	1500
12	DUP	145
12	INS	1593
12	INV	11
13	BND	13
13	DEL	1033
13	DUP	128
13	INS	1061
13	INV	8
14	BND	19
14	DEL	998
14	DUP	99
14	INS	1106
14	INV	9
15	BND	8
15	DEL	1067
15	DUP	129
15	INS	1171
15	INV	7
16	BND	8
16	DEL	1024
16	DUP	119
16	INS	1100
16	INV	5
17	BND	12
17	DEL	1148
17	DUP	108
17	INS	1194
17	INV	8
18	BND	41
18	DEL	1109
18	DUP	120
18	INS	1098
18	INV	11
19	BND	20
19	DEL	1067
19	DUP	135
19	INS	1096
19	INV	8
2	BND	15
2	DEL	1283
2	DUP	137
2	INS	1347
2	INV	11
20	BND	9
20	DEL	925
20	DUP	96
20	INS	990
20	INV	6
21	BND	13
21	DEL	1019
21	DUP	113
21	INS	1089
21	INV	8
22	BND	15
22	DEL	1083
22	DUP	110
22	INS	1115
22	INV	11
23	BND	6
23	DEL	943
23	DUP	87
23	INS	1055
23	INV	9
24	BND	11
24	DEL	866
24	DUP	79
24	INS	985
24	INV	4
25	BND	9
25	DEL	825
25	DUP	114
25	INS	977
25	INV	4
26	BND	22
26	DEL	894
26	DUP	85
26	INS	933
26	INV	7
27	BND	13
27	DEL	1020
27	DUP	69
27	INS	1138
27	INV	4
28	BND	8
28	DEL	769
28	DUP	72
28	INS	720
28	INV	4
29	BND	9
29	DEL	892
29	DUP	99
29	INS	877
29	INV	2
3	BND	15
3	DEL	1437
3	DUP	160
3	INS	1638
3	INV	10
30	BND	6
30	DEL	819
30	DUP	69
30	INS	828
30	INV	5
31	BND	5
31	DEL	873
31	DUP	98
31	INS	835
31	INV	7
32	BND	16
32	DEL	1058
32	DUP	103
32	INS	1093
32	INV	4
33	BND	4
33	DEL	818
33	DUP	113
33	INS	716
33	INV	6
34	BND	12
34	DEL	787
34	DUP	110
34	INS	922
34	INV	5
35	BND	33
35	DEL	722
35	DUP	76
35	INS	772
35	INV	1
36	BND	3
36	DEL	631
36	DUP	62
36	INS	631
36	INV	9
37	BND	2
37	DEL	492
37	DUP	68
37	INS	561
37	INV	2
38	BND	5
38	DEL	608
38	DUP	57
38	INS	556
38	INV	4
4	BND	13
4	DEL	1424
4	DUP	157
4	INS	1564
4	INV	8
5	BND	10
5	DEL	1295
5	DUP	134
5	INS	1378
5	INV	5
6	BND	14
6	DEL	1263
6	DUP	136
6	INS	1331
6	INV	9
7	BND	17
7	DEL	1270
7	DUP	125
7	INS	1462
7	INV	7
8	BND	40
8	DEL	1441
8	DUP	133
8	INS	1675
8	INV	16
9	BND	34
9	DEL	975
9	DUP	87
9	INS	1240
9	INV	15
X	BND	55
X	DEL	1521
X	DUP	126
X	INS	1543
X	INV	28

shared_total_variant_summary_Cfam4 <- shared_vcf_Cfam4 %>%
  group_by(SVTYPE) %>%
  summarise(Total_Count = n()) %>%
  ungroup()

kable(shared_total_variant_summary_Cfam4, caption = "Total variant count by type, shared Samples (CanFam4)")

Total variant count by type, shared Samples (CanFam4)
SVTYPE	Total_Count
BND	600
DEL	41076
DUP	4291
INS	43841
INV	311

Evaluate tumor-specific variants not called in controls

# Exclude rows where ANY control sample has a variant (i.e., not 0/0: or ./.:)
ptcl_only_vcf_Cfam3 <- ptcl_vcf_Cfam3 %>%
  filter(if_all(all_of(ctrl_samples), ~ str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))

ptcl_only_vcf_Cfam4 <- ptcl_vcf_Cfam4 %>%
  filter(if_all(all_of(ctrl_samples), ~ str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))

Number of variants per chromosome

# CanFam 3.1
ptcl_only_variant_counts_Cfam3 <- table(ptcl_only_vcf_Cfam3$X.CHROM)
ptcl_only_variant_df_Cfam3 <- as.data.frame(ptcl_only_variant_counts_Cfam3)
colnames(ptcl_only_variant_df_Cfam3) <- c("Chromosome", "VariantCount")

print(ggplot(ptcl_only_variant_df_Cfam3, aes(x = Chromosome, y = VariantCount)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome, PTCL-specific variants (CanFam 3.1)",
    x = "Chromosome",
    y = "Number of Variants"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)))

# CanFam 4
ptcl_only_variant_counts_Cfam4 <- table(ptcl_only_vcf_Cfam4$X.CHROM)
ptcl_only_variant_df_Cfam4 <- as.data.frame(ptcl_only_variant_counts_Cfam4)
colnames(ptcl_only_variant_df_Cfam4) <- c("Chromosome", "VariantCount")

print(ggplot(ptcl_only_variant_df_Cfam4, aes(x = Chromosome, y = VariantCount)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  theme_minimal() +
  labs(
    title = "Structural variant calls per chromosome, PTCL-specific variants (CanFam4)",
    x = "Chromosome",
    y = "Number of Variants"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)))

Types of variants by chromosome

## CanFam 3.1
# Extract the variant type (SVTYPE) from the INFO column
ptcl_only_vcf_Cfam3$SVTYPE <- sapply(ptcl_only_vcf_Cfam3$INFO, function(info) {
  match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
  if (length(match) > 0) {
    sub("SVTYPE=", "", match)
  } else {
    NA
  }
})

ptcl_only_vcf_Cfam3 <- ptcl_only_vcf_Cfam3[!is.na(ptcl_only_vcf_Cfam3$SVTYPE),] # remove NAs

# Summarize the counts of each variant type per chromosome
ptcl_only_variant_summary_Cfam3 <- ptcl_only_vcf_Cfam3 %>%
  group_by(X.CHROM, SVTYPE) %>%
  summarise(Count = n()) %>%
  ungroup()

# Rename chromosome column for better usability in plotting
colnames(ptcl_only_variant_summary_Cfam3)[1] <- "Chromosome"

# Ensure the chromosome column is ordered correctly
ptcl_only_variant_summary_Cfam3$Chromosome <- factor(ptcl_only_variant_summary_Cfam3$Chromosome, levels = c(as.character(1:38), "X"))

# Plot the number of variants per chromosome by type
ggplot(ptcl_only_variant_summary_Cfam3, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "Structural variants per chromosome by type,\nPTCL-specific variants (CanFam 3.1)",
    x = "Chromosome",
    y = "Number of Variants",
    fill = "Variant Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set2")

kable(ptcl_only_variant_summary_Cfam3, caption = "Variant count per chromosome, PTCL-specific variants (CanFam 3.1)")

Variant count per chromosome, PTCL-specific variants (CanFam 3.1)
Chromosome	SVTYPE	Count
1	BND	4
1	DEL	478
1	DUP	83
1	INS	1276
1	INV	4
10	BND	2
10	DEL	220
10	DUP	49
10	INS	638
11	BND	34
11	DEL	234
11	DUP	36
11	INS	662
11	INV	6
12	BND	6
12	DEL	285
12	DUP	51
12	INS	916
12	INV	4
13	BND	6
13	DEL	193
13	DUP	36
13	INS	673
13	INV	1
14	BND	4
14	DEL	211
14	DUP	46
14	INS	669
14	INV	3
15	BND	2
15	DEL	279
15	DUP	53
15	INS	810
15	INV	3
16	BND	14
16	DEL	199
16	DUP	30
16	INS	581
17	BND	11
17	DEL	258
17	DUP	48
17	INS	806
17	INV	3
18	BND	12
18	DEL	196
18	DUP	35
18	INS	613
18	INV	6
19	BND	3
19	DEL	188
19	DUP	36
19	INS	555
19	INV	3
2	BND	1
2	DEL	279
2	DUP	49
2	INS	928
2	INV	5
20	BND	7
20	DEL	182
20	DUP	29
20	INS	682
20	INV	3
21	BND	5
21	DEL	265
21	DUP	52
21	INS	747
21	INV	2
22	BND	3
22	DEL	246
22	DUP	58
22	INS	696
22	INV	1
23	BND	2
23	DEL	165
23	DUP	30
23	INS	601
23	INV	2
24	BND	3
24	DEL	147
24	DUP	25
24	INS	492
24	INV	1
25	BND	5
25	DEL	162
25	DUP	33
25	INS	459
25	INV	3
26	BND	8
26	DEL	210
26	DUP	38
26	INS	672
26	INV	5
27	BND	5
27	DEL	180
27	DUP	30
27	INS	607
27	INV	1
28	BND	2
28	DEL	107
28	DUP	34
28	INS	404
28	INV	1
29	BND	2
29	DEL	195
29	DUP	37
29	INS	576
29	INV	2
3	BND	8
3	DEL	293
3	DUP	67
3	INS	958
3	INV	4
30	BND	2
30	DEL	115
30	DUP	27
30	INS	500
30	INV	1
31	BND	1
31	DEL	202
31	DUP	42
31	INS	584
31	INV	2
32	BND	6
32	DEL	163
32	DUP	28
32	INS	503
32	INV	2
33	BND	6
33	DEL	123
33	DUP	40
33	INS	438
33	INV	2
34	BND	3
34	DEL	144
34	DUP	22
34	INS	427
34	INV	4
35	DEL	101
35	DUP	22
35	INS	358
35	INV	2
36	BND	2
36	DEL	116
36	DUP	20
36	INS	335
37	BND	1
37	DEL	154
37	DUP	21
37	INS	343
38	BND	4
38	DEL	116
38	DUP	21
38	INS	278
38	INV	4
4	BND	23
4	DEL	325
4	DUP	62
4	INS	1042
4	INV	5
5	BND	7
5	DEL	317
5	DUP	61
5	INS	940
5	INV	3
6	BND	5
6	DEL	308
6	DUP	54
6	INS	811
6	INV	2
7	BND	2
7	DEL	291
7	DUP	50
7	INS	863
8	BND	30
8	DEL	279
8	DUP	58
8	INS	1000
8	INV	5
9	BND	1
9	DEL	173
9	DUP	33
9	INS	666
9	INV	5
X	BND	19
X	DEL	230
X	DUP	40
X	INS	770
X	INV	4

ptcl_only_total_variant_summary_Cfam3 <- ptcl_only_vcf_Cfam3 %>%
  group_by(SVTYPE) %>%
  summarise(Total_Count = n()) %>%
  ungroup()

kable(ptcl_only_total_variant_summary_Cfam3, caption = "Total variant count by type, PTCL-specific variants (CanFam 3.1)")

Total variant count by type, PTCL-specific variants (CanFam 3.1)
SVTYPE	Total_Count
BND	261
DEL	8329
DUP	1586
INS	25879
INV	104

## CanFam4
# Extract the variant type (SVTYPE) from the INFO column
ptcl_only_vcf_Cfam4$SVTYPE <- sapply(ptcl_only_vcf_Cfam4$INFO, function(info) {
  match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
  if (length(match) > 0) {
    sub("SVTYPE=", "", match)
  } else {
    NA
  }
})

ptcl_only_vcf_Cfam4 <- ptcl_only_vcf_Cfam4[!is.na(ptcl_only_vcf_Cfam4$SVTYPE),] # remove NAs

# Summarize the counts of each variant type per chromosome
ptcl_only_variant_summary_Cfam4 <- ptcl_only_vcf_Cfam4 %>%
  group_by(X.CHROM, SVTYPE) %>%
  summarise(Count = n()) %>%
  ungroup()

# Rename chromosome column for better usability in plotting
colnames(ptcl_only_variant_summary_Cfam4)[1] <- "Chromosome"

# Ensure the chromosome column is ordered correctly
ptcl_only_variant_summary_Cfam4$Chromosome <- factor(ptcl_only_variant_summary_Cfam4$Chromosome, levels = c(as.character(1:38), "X"))

# Plot the number of variants per chromosome by type
ggplot(ptcl_only_variant_summary_Cfam4, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  labs(
    title = "Structural variants per chromosome by type,\nPTCL-specific variants (CanFam4)",
    x = "Chromosome",
    y = "Number of Variants",
    fill = "Variant Type"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Set2")

kable(ptcl_only_variant_summary_Cfam4, caption = "Variant count per chromosome, PTCL-specific variants (CanFam4)")

Variant count per chromosome, PTCL-specific variants (CanFam4)
Chromosome	SVTYPE	Count
1	BND	6
1	DEL	440
1	DUP	79
1	INS	1191
1	INV	4
10	BND	1
10	DEL	225
10	DUP	53
10	INS	627
10	INV	3
11	BND	39
11	DEL	210
11	DUP	42
11	INS	576
11	INV	9
12	BND	10
12	DEL	281
12	DUP	59
12	INS	890
12	INV	2
13	BND	7
13	DEL	201
13	DUP	34
13	INS	627
13	INV	1
14	BND	3
14	DEL	216
14	DUP	39
14	INS	637
14	INV	1
15	BND	4
15	DEL	279
15	DUP	69
15	INS	762
15	INV	3
16	BND	16
16	DEL	174
16	DUP	27
16	INS	509
16	INV	1
17	BND	11
17	DEL	245
17	DUP	58
17	INS	759
17	INV	2
18	BND	11
18	DEL	205
18	DUP	53
18	INS	599
18	INV	8
19	BND	2
19	DEL	209
19	DUP	64
19	INS	584
19	INV	4
2	BND	2
2	DEL	273
2	DUP	56
2	INS	870
2	INV	6
20	BND	5
20	DEL	169
20	DUP	31
20	INS	643
20	INV	3
21	BND	5
21	DEL	233
21	DUP	49
21	INS	716
21	INV	3
22	BND	1
22	DEL	268
22	DUP	57
22	INS	689
22	INV	1
23	DEL	177
23	DUP	35
23	INS	596
23	INV	4
24	BND	2
24	DEL	134
24	DUP	27
24	INS	459
24	INV	1
25	BND	8
25	DEL	171
25	DUP	47
25	INS	434
25	INV	1
26	BND	6
26	DEL	200
26	DUP	30
26	INS	638
26	INV	4
27	BND	5
27	DEL	183
27	DUP	32
27	INS	561
27	INV	3
28	BND	2
28	DEL	116
28	DUP	32
28	INS	399
28	INV	4
29	BND	6
29	DEL	207
29	DUP	46
29	INS	538
29	INV	2
3	BND	15
3	DEL	280
3	DUP	78
3	INS	932
3	INV	2
30	BND	2
30	DEL	127
30	DUP	36
30	INS	478
31	BND	1
31	DEL	218
31	DUP	48
31	INS	580
31	INV	3
32	BND	10
32	DEL	204
32	DUP	36
32	INS	495
32	INV	3
33	BND	4
33	DEL	137
33	DUP	34
33	INS	417
33	INV	2
34	BND	2
34	DEL	133
34	DUP	30
34	INS	409
34	INV	4
35	BND	9
35	DEL	169
35	DUP	50
35	INS	398
35	INV	4
36	BND	3
36	DEL	125
36	DUP	39
36	INS	341
37	DEL	142
37	DUP	39
37	INS	324
37	INV	1
38	BND	3
38	DEL	111
38	DUP	26
38	INS	268
38	INV	3
4	BND	23
4	DEL	307
4	DUP	73
4	INS	974
4	INV	3
5	BND	7
5	DEL	318
5	DUP	60
5	INS	881
5	INV	1
6	BND	9
6	DEL	318
6	DUP	50
6	INS	803
6	INV	2
7	BND	5
7	DEL	273
7	DUP	53
7	INS	826
8	BND	41
8	DEL	380
8	DUP	68
8	INS	1155
8	INV	3
9	BND	2
9	DEL	195
9	DUP	41
9	INS	655
9	INV	4
X	BND	13
X	DEL	229
X	DUP	51
X	INS	717
X	INV	5

ptcl_only_total_variant_summary_Cfam4 <- ptcl_only_vcf_Cfam4 %>%
  group_by(SVTYPE) %>%
  summarise(Total_Count = n()) %>%
  ungroup()

kable(ptcl_only_total_variant_summary_Cfam4, caption = "Total variant count by type, PTCL-specific variants (CanFam4)")

Total variant count by type, PTCL-specific variants (CanFam4)
SVTYPE	Total_Count
BND	301
DEL	8482
DUP	1831
INS	24957
INV	110

Identify shared fusion partners with RNA-seq data

## CanFam 3.1
# read in csv file(s) of RNA fusion calls
star_rna_fusions <- read.csv("CanFam31_StarFusionSummary.csv")
fc_rna_fusions <- read.csv("fusionCatcher-results-ptcl-no-ctrls.csv")

# get list of implicated genes
star_genes_3 <- star_rna_fusions$X3_prime_gene
star_genes_5 <- star_rna_fusions$X5_prime_gene
fc_genes_3 <- fc_rna_fusions$X3_prime_gene
fc_genes_5 <- fc_rna_fusions$X5_prime_gene

all_star_genes <- c(star_genes_3, star_genes_5)
all_star_genes <- unique(all_star_genes)
all_fc_genes <- c(fc_genes_3, fc_genes_5)
all_fc_genes <- unique(all_fc_genes)

# compare to genes in INFO column of BND variant file
bnd_vcf <- read.csv("PTCLandCTRL_BND_Variants.CanFam3.filtered.ann.bcsq.csv")

extract_gene <- function(info) {
  # Find the BCSQ field
  bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
  # Extract the gene name (second element after "|")
  if (!is.na(bcsc_field)) {
    gene <- strsplit(bcsc_field, "\\|")[[1]][2]
    return(gene)
  }
  return(NA)
}

bnd_vcf <- bnd_vcf %>%
  mutate(Gene = sapply(INFO, extract_gene))

shared_star_bnd_Cfam3 <- bnd_vcf %>%
  filter(Gene %in% all_star_genes)

shared_fc_bnd_Cfam3 <- bnd_vcf %>%
  filter(Gene %in% all_fc_genes)

# print summary
print(unique(shared_star_bnd_Cfam3$Gene))

##  [1] "PALM2AKAP2"         "ASCC3"              "FBXW7"             
##  [4] "ENSCAFG00000014478" "ENSCAFG00000048749" "CPT1A"             
##  [7] "CD6"                "MAML3"              "TNFRSF1B"          
## [10] "MITF"               "DOCK3"              "NBEA"              
## [13] "TOX"                "ENSCAFG00000032474" "UIMC1"             
## [16] "ENSCAFG00000025128" "ENSCAFG00000030258" "ENSCAFG00000028509"
## [19] "ENSCAFG00000028642" "ENSCAFG00000044010"

print(unique(shared_fc_bnd_Cfam3$Gene))

##  [1] "CDO1"     "KCNQ5"    "SND1"     "CRADD"    "GATB"     "FBXW7"   
##  [7] "TRBV28"   "DGUOK"    "MAGI2"    "IGHMBP2"  "CPT1A"    "CD6"     
## [13] "MAML3"    "ZRANB3"   "VPS13D"   "TNFRSF1B" "MITF"     "ITPR2"   
## [19] "TOX"      "FAM172A"  "UBE2E3"   "ATP6V0E1" "RABGAP1L" "ACACA"   
## [25] "ANOS1"

# export
write.csv(shared_star_bnd_Cfam3, file = "Spr25_shared_PacBio_STARFusion_calls.CanFam3.csv")
write.csv(shared_fc_bnd_Cfam3, file = "Spr25_shared_PacBio_FusionCatcher_calls.CanFam3.csv")

## CanFam4
# read in csv file of RNA fusion calls
star_rna_fusions <- read.csv("CanFam4_StarFusionSummary.csv")

# get list of implicated genes
genes_3 <- star_rna_fusions$X3_prime_gene
genes_5 <- star_rna_fusions$X5_prime_gene
all_genes <- c(genes_3, genes_5)
all_genes <- unique(all_genes)

# compare to genes in INFO column of BND variant file
bnd_vcf <- read.csv("PTCLandCTRL_BND_Variants.CanFam4.filtered.ann.bcsq.csv")

extract_gene <- function(info) {
  # Find the BCSQ field
  bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
  # Extract the gene name (second element after "|")
  if (!is.na(bcsc_field)) {
    gene <- strsplit(bcsc_field, "\\|")[[1]][2]
    return(gene)
  }
  return(NA)
}

bnd_vcf <- bnd_vcf %>%
  mutate(Gene = sapply(INFO, extract_gene))

shared_bnd_Cfam4 <- bnd_vcf %>%
  filter(Gene %in% all_genes)

# print summary
print(unique(shared_bnd_Cfam4$Gene))

##  [1] "ENSCAFG00805005290" "ELMO1"              "FBXW7"             
##  [4] "ENSCAFG00805017629" "ENSCAFG00805023029" "ENSCAFG00805002052"
##  [7] "TNFRSF1B"           "ENSCAFG00805023894" "ENSCAFG00805023901"
## [10] "TOX"                "UIMC1"              "RABGAP1L"          
## [13] "ENSCAFG00805008332" "ENSCAFG00805008471" "ENSCAFG00805008786"
## [16] "ENSCAFG00805005681"

# export
write.csv(shared_bnd_Cfam4, file = "Spr25_shared_PacBio_STARFusion_calls.CanFam4.csv")

Identify structural variants implicating cancer-associated genes

# list of cancer-associated genes from OncoKB
oncokb <- read.csv("cancerGeneList.csv")
oncokb_genes <- oncokb$Hugo.Symbol

extract_gene <- function(info) {
  # Find the BCSQ field
  bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
  # Extract the gene name (second element after "|")
  if (!is.na(bcsc_field)) {
    gene <- strsplit(bcsc_field, "\\|")[[1]][2]
    return(gene)
  }
  return(NA)
}

# Use the vcf objects of PTCL-specific variants
onco_vcf_Cfam3 <- ptcl_only_vcf_Cfam3 %>%
  mutate(Gene = sapply(INFO, extract_gene))

onco_vcf_Cfam3 <- onco_vcf_Cfam3 %>%
  filter(Gene %in% oncokb_genes)

onco_vcf_Cfam4 <- ptcl_only_vcf_Cfam4 %>%
  mutate(Gene = sapply(INFO, extract_gene))

onco_vcf_Cfam4 <- onco_vcf_Cfam4 %>%
  filter(Gene %in% oncokb_genes)

# export
write.csv(onco_vcf_Cfam3, file = "Spr25_CanFam3_PTCL_SVs_in_Cancer_Genes.csv")
write.csv(onco_vcf_Cfam4, file = "Spr25_CanFam4_PTCL_SVs_in_Cancer_Genes.csv")

Identify shared variable genes between CanFam3.1 and CanFam4 alignments

# Extract BCSQ gene annotation and SVTYPE from INFO field and place in separate columns called 'Gene' and 'SVTYPE'
extract_gene <- function(info) {
  # Find the BCSQ field
  bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
  # Extract the gene name (second element after "|")
  if (!is.na(bcsc_field)) {
    gene <- strsplit(bcsc_field, "\\|")[[1]][2]
    return(gene)
  }
  return(NA)
}

extract_variant_type <- function(info) {
  # Find the SVTYPE field
  svtype_field <- stringr::str_extract(info, "SVTYPE=[^;]+")
  # Extract the variant type (first element after "=")
  if (!is.na(svtype_field)) {
    var_type <- strsplit(svtype_field, "=")[[1]][2]
    return(var_type)
  }
  return(NA)
}


# Use the vcf objects of PTCL-specific variants
Cfam3_vcf_alt <- ptcl_only_vcf_Cfam3 %>%
  mutate(Gene = sapply(INFO, extract_gene))
Cfam3_vcf_alt <- Cfam3_vcf_alt %>%
  mutate(SVTYPE = sapply(INFO, extract_variant_type))

Cfam4_vcf_alt <- ptcl_only_vcf_Cfam4 %>%
  mutate(Gene = sapply(INFO, extract_gene))
Cfam4_vcf_alt <- Cfam4_vcf_alt %>%
  mutate(SVTYPE = sapply(INFO, extract_variant_type))

# Combine Gene and SVTYPE info into one column
Cfam3_vcf_alt <- Cfam3_vcf_alt %>%
  mutate(
    SVTYPE_Gene = paste(SVTYPE, Gene, sep = "_")
  )

Cfam4_vcf_alt <- Cfam4_vcf_alt %>%
  mutate(
    SVTYPE_Gene = paste(SVTYPE, Gene, sep = "_")
  )


# Create vector of SVTYPE_Gene column contents
Cfam3_variantGenes <- unique(Cfam3_vcf_alt$SVTYPE_Gene)
Cfam4_variantGenes <- unique(Cfam4_vcf_alt$SVTYPE_Gene)

# Filter variant calls of each alignment to include only those of the same variant type in shared genes
Cfam3_shared <- Cfam3_vcf_alt %>%
  filter(SVTYPE_Gene %in% Cfam4_variantGenes) %>%
  filter(Gene != "NA")

Cfam4_shared <- Cfam4_vcf_alt %>%
  filter(SVTYPE_Gene %in% Cfam3_variantGenes) %>%
  filter(Gene != "NA")

# print summary of total number of shared variable genes
paste("Number of genes with the same variant type called in CanFam3.1 and CanFam4: ", length(unique(Cfam3_shared$Gene)), sep = " ")

## [1] "Number of genes with the same variant type called in CanFam3.1 and CanFam4:  2869"

# export
write.csv(Cfam3_shared, file = "Spr25_CanFam3_PacBio_PTCL_Variants_in_Shared_Genes_with_CanFam4.csv")
write.csv(Cfam4_shared, file = "Spr25_CanFam4_PacBio_PTCL_Variants_in_Shared_Genes_with_CanFam3.csv")

# Limit to OncoKB cancer-associated genes
Cfam3_shared_oncokb <- Cfam3_shared %>%
  filter(Gene %in% oncokb_genes)

Cfam4_shared_oncokb <- Cfam4_shared %>%
  filter(Gene %in% oncokb_genes)

paste("Number of cancer-associated genes with same variant type called in CanFam3.1 and CanFam4:", length(unique(Cfam3_shared_oncokb$Gene)), sep = " ")

## [1] "Number of cancer-associated genes with same variant type called in CanFam3.1 and CanFam4: 274"

# export
write.csv(Cfam3_shared_oncokb, file = "Spr25_CanFam3_PacBio_PTCL_Variants_in_Shared_OncoKB_Genes_with_CanFam4.csv")
write.csv(Cfam4_shared_oncokb, file = "Spr25_CanFam4_PacBio_PTCL_Variants_in_Shared_OncoKB_Genes_with_CanFam3.csv")

Citations

sessionInfo()

## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Denver
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] stringr_1.5.1 knitr_1.49    tidyr_1.3.1   dplyr_1.1.4   ggplot2_3.5.1
## 
## loaded via a namespace (and not attached):
##  [1] vctrs_0.6.5        cli_3.6.2          rlang_1.1.3        xfun_0.49         
##  [5] stringi_1.8.4      purrr_1.0.2        generics_0.1.3     jsonlite_1.8.9    
##  [9] labeling_0.4.3     glue_1.8.0         colorspace_2.1-1   htmltools_0.5.8.1 
## [13] sass_0.4.9         scales_1.3.0       rmarkdown_2.29     grid_4.4.0        
## [17] evaluate_1.0.3     munsell_0.5.1      jquerylib_0.1.4    tibble_3.2.1      
## [21] fastmap_1.2.0      yaml_2.3.10        lifecycle_1.0.4    compiler_4.4.0    
## [25] codetools_0.2-20   RColorBrewer_1.1-3 pkgconfig_2.0.3    rstudioapi_0.17.1 
## [29] farver_2.1.2       digest_0.6.35      R6_2.5.1           tidyselect_1.2.1  
## [33] pillar_1.10.1      magrittr_2.0.3     bslib_0.8.0        withr_3.0.2       
## [37] tools_4.4.0        gtable_0.3.6       cachem_1.1.0

citation()

## To cite R in publications use:
## 
##   R Core Team (2024). _R: A Language and Environment for Statistical
##   Computing_. R Foundation for Statistical Computing, Vienna, Austria.
##   <https://www.R-project.org/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {R: A Language and Environment for Statistical Computing},
##     author = {{R Core Team}},
##     organization = {R Foundation for Statistical Computing},
##     address = {Vienna, Austria},
##     year = {2024},
##     url = {https://www.R-project.org/},
##   }
## 
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.

Summarize VCF

Eileen Owens

2025-04-30

Software

Data

Evaluate overlap of variants called in both PTCL and control samples

Number of variants per chromosome

Types of variants by chromosome

Evaluate tumor-specific variants not called in controls

Number of variants per chromosome

Types of variants by chromosome

Identify shared fusion partners with RNA-seq data

Identify structural variants implicating cancer-associated genes

Identify shared variable genes between CanFam3.1 and CanFam4 alignments

Citations