Software
library(ggplot2)
library(dplyr)
library(tidyr)
library(knitr)
library(stringr)
Data
## CanFam 3.1
setwd("C:/Users/edlarsen/Documents/StructuralVariants")
vcf_Cfam3 <- read.csv("PTCLandCTRL_StructuralVariants_CanFam3.filtered.ann.csv")
valid_chromosomes <- c(as.character(1:38), "X")
vcf_Cfam3 <- vcf_Cfam3[vcf_Cfam3$X.CHROM %in% valid_chromosomes, ]
# Exclude any variants only called in controls
## define ptcl and ctrl columns
ptcl_samples <- c("CF215206", "CF215393", "CF215422", "CF215442", "CF217246") # must correspond to column names in csv file
ctrl_samples <- c("CF51135", "CF51406", "CF53469", "CF54434")
## Exclude rows where all PTCL samples are "0/0:" (meaning no variant call)
ptcl_vcf_Cfam3 <- vcf_Cfam3 %>%
filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("0/0:"))))
## Exclude rows where all PTCL samples are "./.:" (meaning missing data)
ptcl_vcf_Cfam3 <- ptcl_vcf_Cfam3 %>%
filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("./.:"))))
## CanFam 4
vcf_Cfam4 <- read.csv("PTCLandCTRL_StructuralVariants_CanFam4.filtered.ann.csv")
vcf_Cfam4 <- vcf_Cfam4[vcf_Cfam4$X.CHROM %in% valid_chromosomes, ]
ptcl_vcf_Cfam4 <- vcf_Cfam4 %>%
filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("0/0:"))))
ptcl_vcf_Cfam4 <- ptcl_vcf_Cfam4 %>%
filter(!if_all(all_of(ptcl_samples), ~ str_starts(.x, fixed("./.:"))))
Evaluate overlap of variants called in both PTCL and control
samples
# Include only rows where variants were called in both a PTCL sample and a control sample
shared_vcf_Cfam3 <- vcf_Cfam3 %>%
# Keep rows where at least one PTCL sample has a variant
filter(if_any(all_of(ptcl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))) %>%
# Keep rows where at least one control sample has a variant
filter(if_any(all_of(ctrl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:")))))
shared_vcf_Cfam4 <- vcf_Cfam4 %>%
# Keep rows where at least one PTCL sample has a variant
filter(if_any(all_of(ptcl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))) %>%
# Keep rows where at least one control sample has a variant
filter(if_any(all_of(ctrl_samples), ~ !(str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:")))))
Number of variants per chromosome
# CanFam 3.1
shared_variant_counts_Cfam3 <- table(shared_vcf_Cfam3$X.CHROM)
shared_variant_df_Cfam3 <- as.data.frame(shared_variant_counts_Cfam3)
colnames(shared_variant_df_Cfam3) <- c("Chromosome", "VariantCount")
print(ggplot(shared_variant_df_Cfam3, aes(x = Chromosome, y = VariantCount)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
labs(
title = "Structural variant calls per chromosome for variants called in \nboth PTCLs and controls (CanFam 3.1)",
x = "Chromosome",
y = "Number of Variants"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)))

# CanFam 4
shared_variant_counts_Cfam4 <- table(shared_vcf_Cfam4$X.CHROM)
shared_variant_df_Cfam4 <- as.data.frame(shared_variant_counts_Cfam4)
colnames(shared_variant_df_Cfam4) <- c("Chromosome", "VariantCount")
print(ggplot(shared_variant_df_Cfam4, aes(x = Chromosome, y = VariantCount)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
labs(
title = "Structural variant calls per chromosome for variants called in \nboth PTCLs and controls (CanFam4)",
x = "Chromosome",
y = "Number of Variants"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)))

Types of variants by chromosome
## CanFam 3.1
# Extract the variant type (SVTYPE) from the INFO column
shared_vcf_Cfam3$SVTYPE <- sapply(shared_vcf_Cfam3$INFO, function(info) {
match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
if (length(match) > 0) {
sub("SVTYPE=", "", match)
} else {
NA
}
})
shared_vcf_Cfam3 <- shared_vcf_Cfam3[!is.na(shared_vcf_Cfam3$SVTYPE),] # remove NAs
# Summarize the counts of each variant type per chromosome
shared_variant_summary_Cfam3 <- shared_vcf_Cfam3 %>%
group_by(X.CHROM, SVTYPE) %>%
summarise(Count = n()) %>%
ungroup()
# Rename chromosome column for better usability in plotting
colnames(shared_variant_summary_Cfam3)[1] <- "Chromosome"
# Ensure the chromosome column is ordered correctly
shared_variant_summary_Cfam3$Chromosome <- factor(shared_variant_summary_Cfam3$Chromosome, levels = c(as.character(1:38), "X"))
# Plot the number of variants per chromosome by type
ggplot(shared_variant_summary_Cfam3, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(
title = "Structural variant calls per chromosome by type for variants called in \nboth PTCLs and controls (CanFam 3.1)",
x = "Chromosome",
y = "Number of Variants",
fill = "Variant Type"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
scale_fill_brewer(palette = "Set2")

kable(shared_variant_summary_Cfam3, caption = "Variant count per chromosome, shared Samples (CanFam 3.1)")
Variant count per chromosome, shared Samples (CanFam
3.1)
| 1 |
BND |
31 |
| 1 |
DEL |
1788 |
| 1 |
DUP |
177 |
| 1 |
INS |
2062 |
| 1 |
INV |
22 |
| 10 |
BND |
8 |
| 10 |
DEL |
991 |
| 10 |
DUP |
111 |
| 10 |
INS |
1196 |
| 10 |
INV |
11 |
| 11 |
BND |
21 |
| 11 |
DEL |
1125 |
| 11 |
DUP |
116 |
| 11 |
INS |
1236 |
| 11 |
INV |
4 |
| 12 |
BND |
29 |
| 12 |
DEL |
1405 |
| 12 |
DUP |
146 |
| 12 |
INS |
1609 |
| 12 |
INV |
15 |
| 13 |
BND |
8 |
| 13 |
DEL |
1001 |
| 13 |
DUP |
119 |
| 13 |
INS |
1119 |
| 13 |
INV |
7 |
| 14 |
BND |
13 |
| 14 |
DEL |
961 |
| 14 |
DUP |
102 |
| 14 |
INS |
1133 |
| 14 |
INV |
9 |
| 15 |
BND |
12 |
| 15 |
DEL |
1035 |
| 15 |
DUP |
131 |
| 15 |
INS |
1203 |
| 15 |
INV |
1 |
| 16 |
BND |
33 |
| 16 |
DEL |
1069 |
| 16 |
DUP |
124 |
| 16 |
INS |
1279 |
| 16 |
INV |
6 |
| 17 |
BND |
13 |
| 17 |
DEL |
1131 |
| 17 |
DUP |
127 |
| 17 |
INS |
1265 |
| 17 |
INV |
13 |
| 18 |
BND |
20 |
| 18 |
DEL |
998 |
| 18 |
DUP |
120 |
| 18 |
INS |
1064 |
| 18 |
INV |
10 |
| 19 |
BND |
7 |
| 19 |
DEL |
921 |
| 19 |
DUP |
104 |
| 19 |
INS |
1134 |
| 19 |
INV |
8 |
| 2 |
BND |
19 |
| 2 |
DEL |
1282 |
| 2 |
DUP |
136 |
| 2 |
INS |
1491 |
| 2 |
INV |
8 |
| 20 |
BND |
14 |
| 20 |
DEL |
927 |
| 20 |
DUP |
70 |
| 20 |
INS |
936 |
| 20 |
INV |
10 |
| 21 |
BND |
18 |
| 21 |
DEL |
917 |
| 21 |
DUP |
97 |
| 21 |
INS |
1036 |
| 21 |
INV |
7 |
| 22 |
BND |
12 |
| 22 |
DEL |
1019 |
| 22 |
DUP |
125 |
| 22 |
INS |
1160 |
| 22 |
INV |
8 |
| 23 |
BND |
15 |
| 23 |
DEL |
855 |
| 23 |
DUP |
100 |
| 23 |
INS |
1090 |
| 23 |
INV |
9 |
| 24 |
BND |
9 |
| 24 |
DEL |
791 |
| 24 |
DUP |
51 |
| 24 |
INS |
929 |
| 24 |
INV |
4 |
| 25 |
BND |
15 |
| 25 |
DEL |
785 |
| 25 |
DUP |
110 |
| 25 |
INS |
988 |
| 25 |
INV |
3 |
| 26 |
BND |
31 |
| 26 |
DEL |
895 |
| 26 |
DUP |
72 |
| 26 |
INS |
1002 |
| 26 |
INV |
7 |
| 27 |
BND |
14 |
| 27 |
DEL |
934 |
| 27 |
DUP |
73 |
| 27 |
INS |
1152 |
| 27 |
INV |
6 |
| 28 |
BND |
4 |
| 28 |
DEL |
654 |
| 28 |
DUP |
54 |
| 28 |
INS |
737 |
| 28 |
INV |
4 |
| 29 |
BND |
12 |
| 29 |
DEL |
762 |
| 29 |
DUP |
89 |
| 29 |
INS |
884 |
| 29 |
INV |
4 |
| 3 |
BND |
19 |
| 3 |
DEL |
1408 |
| 3 |
DUP |
154 |
| 3 |
INS |
1643 |
| 3 |
INV |
10 |
| 30 |
BND |
8 |
| 30 |
DEL |
690 |
| 30 |
DUP |
51 |
| 30 |
INS |
854 |
| 30 |
INV |
4 |
| 31 |
BND |
9 |
| 31 |
DEL |
698 |
| 31 |
DUP |
109 |
| 31 |
INS |
851 |
| 31 |
INV |
2 |
| 32 |
BND |
8 |
| 32 |
DEL |
891 |
| 32 |
DUP |
60 |
| 32 |
INS |
982 |
| 32 |
INV |
5 |
| 33 |
BND |
8 |
| 33 |
DEL |
596 |
| 33 |
DUP |
81 |
| 33 |
INS |
704 |
| 33 |
INV |
3 |
| 34 |
BND |
7 |
| 34 |
DEL |
755 |
| 34 |
DUP |
78 |
| 34 |
INS |
875 |
| 34 |
INV |
5 |
| 35 |
BND |
5 |
| 35 |
DEL |
511 |
| 35 |
DUP |
38 |
| 35 |
INS |
593 |
| 35 |
INV |
2 |
| 36 |
BND |
7 |
| 36 |
DEL |
517 |
| 36 |
DUP |
46 |
| 36 |
INS |
592 |
| 36 |
INV |
4 |
| 37 |
BND |
4 |
| 37 |
DEL |
438 |
| 37 |
DUP |
60 |
| 37 |
INS |
552 |
| 37 |
INV |
5 |
| 38 |
BND |
1 |
| 38 |
DEL |
480 |
| 38 |
DUP |
56 |
| 38 |
INS |
546 |
| 38 |
INV |
2 |
| 4 |
BND |
22 |
| 4 |
DEL |
1350 |
| 4 |
DUP |
146 |
| 4 |
INS |
1575 |
| 4 |
INV |
1 |
| 5 |
BND |
20 |
| 5 |
DEL |
1297 |
| 5 |
DUP |
138 |
| 5 |
INS |
1438 |
| 5 |
INV |
5 |
| 6 |
BND |
15 |
| 6 |
DEL |
1217 |
| 6 |
DUP |
137 |
| 6 |
INS |
1379 |
| 6 |
INV |
8 |
| 7 |
BND |
19 |
| 7 |
DEL |
1176 |
| 7 |
DUP |
135 |
| 7 |
INS |
1507 |
| 7 |
INV |
9 |
| 8 |
BND |
62 |
| 8 |
DEL |
1448 |
| 8 |
DUP |
113 |
| 8 |
INS |
1689 |
| 8 |
INV |
17 |
| 9 |
BND |
44 |
| 9 |
DEL |
1097 |
| 9 |
DUP |
96 |
| 9 |
INS |
1252 |
| 9 |
INV |
20 |
| X |
BND |
75 |
| X |
DEL |
1465 |
| X |
DUP |
120 |
| X |
INS |
1852 |
| X |
INV |
30 |
shared_total_variant_summary_Cfam3 <- shared_vcf_Cfam3 %>%
group_by(SVTYPE) %>%
summarise(Total_Count = n()) %>%
ungroup()
kable(shared_total_variant_summary_Cfam3, caption = "Total variant count by type, shared Samples (CanFam 3.1)")
Total variant count by type, shared Samples (CanFam
3.1)
| BND |
691 |
| DEL |
38280 |
| DUP |
3972 |
| INS |
44589 |
| INV |
308 |
## CanFam4
# Extract the variant type (SVTYPE) from the INFO column
shared_vcf_Cfam4$SVTYPE <- sapply(shared_vcf_Cfam4$INFO, function(info) {
match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
if (length(match) > 0) {
sub("SVTYPE=", "", match)
} else {
NA
}
})
shared_vcf_Cfam4 <- shared_vcf_Cfam4[!is.na(shared_vcf_Cfam4$SVTYPE),] # remove NAs
# Summarize the counts of each variant type per chromosome
shared_variant_summary_Cfam4 <- shared_vcf_Cfam4 %>%
group_by(X.CHROM, SVTYPE) %>%
summarise(Count = n()) %>%
ungroup()
# Rename chromosome column for better usability in plotting
colnames(shared_variant_summary_Cfam4)[1] <- "Chromosome"
# Ensure the chromosome column is ordered correctly
shared_variant_summary_Cfam4$Chromosome <- factor(shared_variant_summary_Cfam4$Chromosome, levels = c(as.character(1:38), "X"))
# Plot the number of variants per chromosome by type
ggplot(shared_variant_summary_Cfam4, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(
title = "Structural variant calls per chromosome by type for variants called in \nboth PTCLs and controls (CanFam4)",
x = "Chromosome",
y = "Number of Variants",
fill = "Variant Type"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
scale_fill_brewer(palette = "Set2")

kable(shared_variant_summary_Cfam4, caption = "Variant count per chromosome, shared Samples (CanFam4)")
Variant count per chromosome, shared Samples
(CanFam4)
| 1 |
BND |
24 |
| 1 |
DEL |
1922 |
| 1 |
DUP |
199 |
| 1 |
INS |
1994 |
| 1 |
INV |
16 |
| 10 |
BND |
13 |
| 10 |
DEL |
1088 |
| 10 |
DUP |
128 |
| 10 |
INS |
1183 |
| 10 |
INV |
14 |
| 11 |
BND |
16 |
| 11 |
DEL |
1167 |
| 11 |
DUP |
106 |
| 11 |
INS |
1274 |
| 11 |
INV |
3 |
| 12 |
BND |
12 |
| 12 |
DEL |
1500 |
| 12 |
DUP |
145 |
| 12 |
INS |
1593 |
| 12 |
INV |
11 |
| 13 |
BND |
13 |
| 13 |
DEL |
1033 |
| 13 |
DUP |
128 |
| 13 |
INS |
1061 |
| 13 |
INV |
8 |
| 14 |
BND |
19 |
| 14 |
DEL |
998 |
| 14 |
DUP |
99 |
| 14 |
INS |
1106 |
| 14 |
INV |
9 |
| 15 |
BND |
8 |
| 15 |
DEL |
1067 |
| 15 |
DUP |
129 |
| 15 |
INS |
1171 |
| 15 |
INV |
7 |
| 16 |
BND |
8 |
| 16 |
DEL |
1024 |
| 16 |
DUP |
119 |
| 16 |
INS |
1100 |
| 16 |
INV |
5 |
| 17 |
BND |
12 |
| 17 |
DEL |
1148 |
| 17 |
DUP |
108 |
| 17 |
INS |
1194 |
| 17 |
INV |
8 |
| 18 |
BND |
41 |
| 18 |
DEL |
1109 |
| 18 |
DUP |
120 |
| 18 |
INS |
1098 |
| 18 |
INV |
11 |
| 19 |
BND |
20 |
| 19 |
DEL |
1067 |
| 19 |
DUP |
135 |
| 19 |
INS |
1096 |
| 19 |
INV |
8 |
| 2 |
BND |
15 |
| 2 |
DEL |
1283 |
| 2 |
DUP |
137 |
| 2 |
INS |
1347 |
| 2 |
INV |
11 |
| 20 |
BND |
9 |
| 20 |
DEL |
925 |
| 20 |
DUP |
96 |
| 20 |
INS |
990 |
| 20 |
INV |
6 |
| 21 |
BND |
13 |
| 21 |
DEL |
1019 |
| 21 |
DUP |
113 |
| 21 |
INS |
1089 |
| 21 |
INV |
8 |
| 22 |
BND |
15 |
| 22 |
DEL |
1083 |
| 22 |
DUP |
110 |
| 22 |
INS |
1115 |
| 22 |
INV |
11 |
| 23 |
BND |
6 |
| 23 |
DEL |
943 |
| 23 |
DUP |
87 |
| 23 |
INS |
1055 |
| 23 |
INV |
9 |
| 24 |
BND |
11 |
| 24 |
DEL |
866 |
| 24 |
DUP |
79 |
| 24 |
INS |
985 |
| 24 |
INV |
4 |
| 25 |
BND |
9 |
| 25 |
DEL |
825 |
| 25 |
DUP |
114 |
| 25 |
INS |
977 |
| 25 |
INV |
4 |
| 26 |
BND |
22 |
| 26 |
DEL |
894 |
| 26 |
DUP |
85 |
| 26 |
INS |
933 |
| 26 |
INV |
7 |
| 27 |
BND |
13 |
| 27 |
DEL |
1020 |
| 27 |
DUP |
69 |
| 27 |
INS |
1138 |
| 27 |
INV |
4 |
| 28 |
BND |
8 |
| 28 |
DEL |
769 |
| 28 |
DUP |
72 |
| 28 |
INS |
720 |
| 28 |
INV |
4 |
| 29 |
BND |
9 |
| 29 |
DEL |
892 |
| 29 |
DUP |
99 |
| 29 |
INS |
877 |
| 29 |
INV |
2 |
| 3 |
BND |
15 |
| 3 |
DEL |
1437 |
| 3 |
DUP |
160 |
| 3 |
INS |
1638 |
| 3 |
INV |
10 |
| 30 |
BND |
6 |
| 30 |
DEL |
819 |
| 30 |
DUP |
69 |
| 30 |
INS |
828 |
| 30 |
INV |
5 |
| 31 |
BND |
5 |
| 31 |
DEL |
873 |
| 31 |
DUP |
98 |
| 31 |
INS |
835 |
| 31 |
INV |
7 |
| 32 |
BND |
16 |
| 32 |
DEL |
1058 |
| 32 |
DUP |
103 |
| 32 |
INS |
1093 |
| 32 |
INV |
4 |
| 33 |
BND |
4 |
| 33 |
DEL |
818 |
| 33 |
DUP |
113 |
| 33 |
INS |
716 |
| 33 |
INV |
6 |
| 34 |
BND |
12 |
| 34 |
DEL |
787 |
| 34 |
DUP |
110 |
| 34 |
INS |
922 |
| 34 |
INV |
5 |
| 35 |
BND |
33 |
| 35 |
DEL |
722 |
| 35 |
DUP |
76 |
| 35 |
INS |
772 |
| 35 |
INV |
1 |
| 36 |
BND |
3 |
| 36 |
DEL |
631 |
| 36 |
DUP |
62 |
| 36 |
INS |
631 |
| 36 |
INV |
9 |
| 37 |
BND |
2 |
| 37 |
DEL |
492 |
| 37 |
DUP |
68 |
| 37 |
INS |
561 |
| 37 |
INV |
2 |
| 38 |
BND |
5 |
| 38 |
DEL |
608 |
| 38 |
DUP |
57 |
| 38 |
INS |
556 |
| 38 |
INV |
4 |
| 4 |
BND |
13 |
| 4 |
DEL |
1424 |
| 4 |
DUP |
157 |
| 4 |
INS |
1564 |
| 4 |
INV |
8 |
| 5 |
BND |
10 |
| 5 |
DEL |
1295 |
| 5 |
DUP |
134 |
| 5 |
INS |
1378 |
| 5 |
INV |
5 |
| 6 |
BND |
14 |
| 6 |
DEL |
1263 |
| 6 |
DUP |
136 |
| 6 |
INS |
1331 |
| 6 |
INV |
9 |
| 7 |
BND |
17 |
| 7 |
DEL |
1270 |
| 7 |
DUP |
125 |
| 7 |
INS |
1462 |
| 7 |
INV |
7 |
| 8 |
BND |
40 |
| 8 |
DEL |
1441 |
| 8 |
DUP |
133 |
| 8 |
INS |
1675 |
| 8 |
INV |
16 |
| 9 |
BND |
34 |
| 9 |
DEL |
975 |
| 9 |
DUP |
87 |
| 9 |
INS |
1240 |
| 9 |
INV |
15 |
| X |
BND |
55 |
| X |
DEL |
1521 |
| X |
DUP |
126 |
| X |
INS |
1543 |
| X |
INV |
28 |
shared_total_variant_summary_Cfam4 <- shared_vcf_Cfam4 %>%
group_by(SVTYPE) %>%
summarise(Total_Count = n()) %>%
ungroup()
kable(shared_total_variant_summary_Cfam4, caption = "Total variant count by type, shared Samples (CanFam4)")
Total variant count by type, shared Samples (CanFam4)
| BND |
600 |
| DEL |
41076 |
| DUP |
4291 |
| INS |
43841 |
| INV |
311 |
Evaluate tumor-specific variants not called in controls
# Exclude rows where ANY control sample has a variant (i.e., not 0/0: or ./.:)
ptcl_only_vcf_Cfam3 <- ptcl_vcf_Cfam3 %>%
filter(if_all(all_of(ctrl_samples), ~ str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))
ptcl_only_vcf_Cfam4 <- ptcl_vcf_Cfam4 %>%
filter(if_all(all_of(ctrl_samples), ~ str_starts(.x, fixed("0/0:")) | str_starts(.x, fixed("./.:"))))
Number of variants per chromosome
# CanFam 3.1
ptcl_only_variant_counts_Cfam3 <- table(ptcl_only_vcf_Cfam3$X.CHROM)
ptcl_only_variant_df_Cfam3 <- as.data.frame(ptcl_only_variant_counts_Cfam3)
colnames(ptcl_only_variant_df_Cfam3) <- c("Chromosome", "VariantCount")
print(ggplot(ptcl_only_variant_df_Cfam3, aes(x = Chromosome, y = VariantCount)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
labs(
title = "Structural variant calls per chromosome, PTCL-specific variants (CanFam 3.1)",
x = "Chromosome",
y = "Number of Variants"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)))

# CanFam 4
ptcl_only_variant_counts_Cfam4 <- table(ptcl_only_vcf_Cfam4$X.CHROM)
ptcl_only_variant_df_Cfam4 <- as.data.frame(ptcl_only_variant_counts_Cfam4)
colnames(ptcl_only_variant_df_Cfam4) <- c("Chromosome", "VariantCount")
print(ggplot(ptcl_only_variant_df_Cfam4, aes(x = Chromosome, y = VariantCount)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
labs(
title = "Structural variant calls per chromosome, PTCL-specific variants (CanFam4)",
x = "Chromosome",
y = "Number of Variants"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)))

Types of variants by chromosome
## CanFam 3.1
# Extract the variant type (SVTYPE) from the INFO column
ptcl_only_vcf_Cfam3$SVTYPE <- sapply(ptcl_only_vcf_Cfam3$INFO, function(info) {
match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
if (length(match) > 0) {
sub("SVTYPE=", "", match)
} else {
NA
}
})
ptcl_only_vcf_Cfam3 <- ptcl_only_vcf_Cfam3[!is.na(ptcl_only_vcf_Cfam3$SVTYPE),] # remove NAs
# Summarize the counts of each variant type per chromosome
ptcl_only_variant_summary_Cfam3 <- ptcl_only_vcf_Cfam3 %>%
group_by(X.CHROM, SVTYPE) %>%
summarise(Count = n()) %>%
ungroup()
# Rename chromosome column for better usability in plotting
colnames(ptcl_only_variant_summary_Cfam3)[1] <- "Chromosome"
# Ensure the chromosome column is ordered correctly
ptcl_only_variant_summary_Cfam3$Chromosome <- factor(ptcl_only_variant_summary_Cfam3$Chromosome, levels = c(as.character(1:38), "X"))
# Plot the number of variants per chromosome by type
ggplot(ptcl_only_variant_summary_Cfam3, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(
title = "Structural variants per chromosome by type,\nPTCL-specific variants (CanFam 3.1)",
x = "Chromosome",
y = "Number of Variants",
fill = "Variant Type"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
scale_fill_brewer(palette = "Set2")

kable(ptcl_only_variant_summary_Cfam3, caption = "Variant count per chromosome, PTCL-specific variants (CanFam 3.1)")
Variant count per chromosome, PTCL-specific variants (CanFam
3.1)
| 1 |
BND |
4 |
| 1 |
DEL |
478 |
| 1 |
DUP |
83 |
| 1 |
INS |
1276 |
| 1 |
INV |
4 |
| 10 |
BND |
2 |
| 10 |
DEL |
220 |
| 10 |
DUP |
49 |
| 10 |
INS |
638 |
| 11 |
BND |
34 |
| 11 |
DEL |
234 |
| 11 |
DUP |
36 |
| 11 |
INS |
662 |
| 11 |
INV |
6 |
| 12 |
BND |
6 |
| 12 |
DEL |
285 |
| 12 |
DUP |
51 |
| 12 |
INS |
916 |
| 12 |
INV |
4 |
| 13 |
BND |
6 |
| 13 |
DEL |
193 |
| 13 |
DUP |
36 |
| 13 |
INS |
673 |
| 13 |
INV |
1 |
| 14 |
BND |
4 |
| 14 |
DEL |
211 |
| 14 |
DUP |
46 |
| 14 |
INS |
669 |
| 14 |
INV |
3 |
| 15 |
BND |
2 |
| 15 |
DEL |
279 |
| 15 |
DUP |
53 |
| 15 |
INS |
810 |
| 15 |
INV |
3 |
| 16 |
BND |
14 |
| 16 |
DEL |
199 |
| 16 |
DUP |
30 |
| 16 |
INS |
581 |
| 17 |
BND |
11 |
| 17 |
DEL |
258 |
| 17 |
DUP |
48 |
| 17 |
INS |
806 |
| 17 |
INV |
3 |
| 18 |
BND |
12 |
| 18 |
DEL |
196 |
| 18 |
DUP |
35 |
| 18 |
INS |
613 |
| 18 |
INV |
6 |
| 19 |
BND |
3 |
| 19 |
DEL |
188 |
| 19 |
DUP |
36 |
| 19 |
INS |
555 |
| 19 |
INV |
3 |
| 2 |
BND |
1 |
| 2 |
DEL |
279 |
| 2 |
DUP |
49 |
| 2 |
INS |
928 |
| 2 |
INV |
5 |
| 20 |
BND |
7 |
| 20 |
DEL |
182 |
| 20 |
DUP |
29 |
| 20 |
INS |
682 |
| 20 |
INV |
3 |
| 21 |
BND |
5 |
| 21 |
DEL |
265 |
| 21 |
DUP |
52 |
| 21 |
INS |
747 |
| 21 |
INV |
2 |
| 22 |
BND |
3 |
| 22 |
DEL |
246 |
| 22 |
DUP |
58 |
| 22 |
INS |
696 |
| 22 |
INV |
1 |
| 23 |
BND |
2 |
| 23 |
DEL |
165 |
| 23 |
DUP |
30 |
| 23 |
INS |
601 |
| 23 |
INV |
2 |
| 24 |
BND |
3 |
| 24 |
DEL |
147 |
| 24 |
DUP |
25 |
| 24 |
INS |
492 |
| 24 |
INV |
1 |
| 25 |
BND |
5 |
| 25 |
DEL |
162 |
| 25 |
DUP |
33 |
| 25 |
INS |
459 |
| 25 |
INV |
3 |
| 26 |
BND |
8 |
| 26 |
DEL |
210 |
| 26 |
DUP |
38 |
| 26 |
INS |
672 |
| 26 |
INV |
5 |
| 27 |
BND |
5 |
| 27 |
DEL |
180 |
| 27 |
DUP |
30 |
| 27 |
INS |
607 |
| 27 |
INV |
1 |
| 28 |
BND |
2 |
| 28 |
DEL |
107 |
| 28 |
DUP |
34 |
| 28 |
INS |
404 |
| 28 |
INV |
1 |
| 29 |
BND |
2 |
| 29 |
DEL |
195 |
| 29 |
DUP |
37 |
| 29 |
INS |
576 |
| 29 |
INV |
2 |
| 3 |
BND |
8 |
| 3 |
DEL |
293 |
| 3 |
DUP |
67 |
| 3 |
INS |
958 |
| 3 |
INV |
4 |
| 30 |
BND |
2 |
| 30 |
DEL |
115 |
| 30 |
DUP |
27 |
| 30 |
INS |
500 |
| 30 |
INV |
1 |
| 31 |
BND |
1 |
| 31 |
DEL |
202 |
| 31 |
DUP |
42 |
| 31 |
INS |
584 |
| 31 |
INV |
2 |
| 32 |
BND |
6 |
| 32 |
DEL |
163 |
| 32 |
DUP |
28 |
| 32 |
INS |
503 |
| 32 |
INV |
2 |
| 33 |
BND |
6 |
| 33 |
DEL |
123 |
| 33 |
DUP |
40 |
| 33 |
INS |
438 |
| 33 |
INV |
2 |
| 34 |
BND |
3 |
| 34 |
DEL |
144 |
| 34 |
DUP |
22 |
| 34 |
INS |
427 |
| 34 |
INV |
4 |
| 35 |
DEL |
101 |
| 35 |
DUP |
22 |
| 35 |
INS |
358 |
| 35 |
INV |
2 |
| 36 |
BND |
2 |
| 36 |
DEL |
116 |
| 36 |
DUP |
20 |
| 36 |
INS |
335 |
| 37 |
BND |
1 |
| 37 |
DEL |
154 |
| 37 |
DUP |
21 |
| 37 |
INS |
343 |
| 38 |
BND |
4 |
| 38 |
DEL |
116 |
| 38 |
DUP |
21 |
| 38 |
INS |
278 |
| 38 |
INV |
4 |
| 4 |
BND |
23 |
| 4 |
DEL |
325 |
| 4 |
DUP |
62 |
| 4 |
INS |
1042 |
| 4 |
INV |
5 |
| 5 |
BND |
7 |
| 5 |
DEL |
317 |
| 5 |
DUP |
61 |
| 5 |
INS |
940 |
| 5 |
INV |
3 |
| 6 |
BND |
5 |
| 6 |
DEL |
308 |
| 6 |
DUP |
54 |
| 6 |
INS |
811 |
| 6 |
INV |
2 |
| 7 |
BND |
2 |
| 7 |
DEL |
291 |
| 7 |
DUP |
50 |
| 7 |
INS |
863 |
| 8 |
BND |
30 |
| 8 |
DEL |
279 |
| 8 |
DUP |
58 |
| 8 |
INS |
1000 |
| 8 |
INV |
5 |
| 9 |
BND |
1 |
| 9 |
DEL |
173 |
| 9 |
DUP |
33 |
| 9 |
INS |
666 |
| 9 |
INV |
5 |
| X |
BND |
19 |
| X |
DEL |
230 |
| X |
DUP |
40 |
| X |
INS |
770 |
| X |
INV |
4 |
ptcl_only_total_variant_summary_Cfam3 <- ptcl_only_vcf_Cfam3 %>%
group_by(SVTYPE) %>%
summarise(Total_Count = n()) %>%
ungroup()
kable(ptcl_only_total_variant_summary_Cfam3, caption = "Total variant count by type, PTCL-specific variants (CanFam 3.1)")
Total variant count by type, PTCL-specific variants (CanFam
3.1)
| BND |
261 |
| DEL |
8329 |
| DUP |
1586 |
| INS |
25879 |
| INV |
104 |
## CanFam4
# Extract the variant type (SVTYPE) from the INFO column
ptcl_only_vcf_Cfam4$SVTYPE <- sapply(ptcl_only_vcf_Cfam4$INFO, function(info) {
match <- regmatches(info, regexpr("SVTYPE=[^;]+", info))
if (length(match) > 0) {
sub("SVTYPE=", "", match)
} else {
NA
}
})
ptcl_only_vcf_Cfam4 <- ptcl_only_vcf_Cfam4[!is.na(ptcl_only_vcf_Cfam4$SVTYPE),] # remove NAs
# Summarize the counts of each variant type per chromosome
ptcl_only_variant_summary_Cfam4 <- ptcl_only_vcf_Cfam4 %>%
group_by(X.CHROM, SVTYPE) %>%
summarise(Count = n()) %>%
ungroup()
# Rename chromosome column for better usability in plotting
colnames(ptcl_only_variant_summary_Cfam4)[1] <- "Chromosome"
# Ensure the chromosome column is ordered correctly
ptcl_only_variant_summary_Cfam4$Chromosome <- factor(ptcl_only_variant_summary_Cfam4$Chromosome, levels = c(as.character(1:38), "X"))
# Plot the number of variants per chromosome by type
ggplot(ptcl_only_variant_summary_Cfam4, aes(x = Chromosome, y = Count, fill = SVTYPE)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(
title = "Structural variants per chromosome by type,\nPTCL-specific variants (CanFam4)",
x = "Chromosome",
y = "Number of Variants",
fill = "Variant Type"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5)) +
scale_fill_brewer(palette = "Set2")

kable(ptcl_only_variant_summary_Cfam4, caption = "Variant count per chromosome, PTCL-specific variants (CanFam4)")
Variant count per chromosome, PTCL-specific variants
(CanFam4)
| 1 |
BND |
6 |
| 1 |
DEL |
440 |
| 1 |
DUP |
79 |
| 1 |
INS |
1191 |
| 1 |
INV |
4 |
| 10 |
BND |
1 |
| 10 |
DEL |
225 |
| 10 |
DUP |
53 |
| 10 |
INS |
627 |
| 10 |
INV |
3 |
| 11 |
BND |
39 |
| 11 |
DEL |
210 |
| 11 |
DUP |
42 |
| 11 |
INS |
576 |
| 11 |
INV |
9 |
| 12 |
BND |
10 |
| 12 |
DEL |
281 |
| 12 |
DUP |
59 |
| 12 |
INS |
890 |
| 12 |
INV |
2 |
| 13 |
BND |
7 |
| 13 |
DEL |
201 |
| 13 |
DUP |
34 |
| 13 |
INS |
627 |
| 13 |
INV |
1 |
| 14 |
BND |
3 |
| 14 |
DEL |
216 |
| 14 |
DUP |
39 |
| 14 |
INS |
637 |
| 14 |
INV |
1 |
| 15 |
BND |
4 |
| 15 |
DEL |
279 |
| 15 |
DUP |
69 |
| 15 |
INS |
762 |
| 15 |
INV |
3 |
| 16 |
BND |
16 |
| 16 |
DEL |
174 |
| 16 |
DUP |
27 |
| 16 |
INS |
509 |
| 16 |
INV |
1 |
| 17 |
BND |
11 |
| 17 |
DEL |
245 |
| 17 |
DUP |
58 |
| 17 |
INS |
759 |
| 17 |
INV |
2 |
| 18 |
BND |
11 |
| 18 |
DEL |
205 |
| 18 |
DUP |
53 |
| 18 |
INS |
599 |
| 18 |
INV |
8 |
| 19 |
BND |
2 |
| 19 |
DEL |
209 |
| 19 |
DUP |
64 |
| 19 |
INS |
584 |
| 19 |
INV |
4 |
| 2 |
BND |
2 |
| 2 |
DEL |
273 |
| 2 |
DUP |
56 |
| 2 |
INS |
870 |
| 2 |
INV |
6 |
| 20 |
BND |
5 |
| 20 |
DEL |
169 |
| 20 |
DUP |
31 |
| 20 |
INS |
643 |
| 20 |
INV |
3 |
| 21 |
BND |
5 |
| 21 |
DEL |
233 |
| 21 |
DUP |
49 |
| 21 |
INS |
716 |
| 21 |
INV |
3 |
| 22 |
BND |
1 |
| 22 |
DEL |
268 |
| 22 |
DUP |
57 |
| 22 |
INS |
689 |
| 22 |
INV |
1 |
| 23 |
DEL |
177 |
| 23 |
DUP |
35 |
| 23 |
INS |
596 |
| 23 |
INV |
4 |
| 24 |
BND |
2 |
| 24 |
DEL |
134 |
| 24 |
DUP |
27 |
| 24 |
INS |
459 |
| 24 |
INV |
1 |
| 25 |
BND |
8 |
| 25 |
DEL |
171 |
| 25 |
DUP |
47 |
| 25 |
INS |
434 |
| 25 |
INV |
1 |
| 26 |
BND |
6 |
| 26 |
DEL |
200 |
| 26 |
DUP |
30 |
| 26 |
INS |
638 |
| 26 |
INV |
4 |
| 27 |
BND |
5 |
| 27 |
DEL |
183 |
| 27 |
DUP |
32 |
| 27 |
INS |
561 |
| 27 |
INV |
3 |
| 28 |
BND |
2 |
| 28 |
DEL |
116 |
| 28 |
DUP |
32 |
| 28 |
INS |
399 |
| 28 |
INV |
4 |
| 29 |
BND |
6 |
| 29 |
DEL |
207 |
| 29 |
DUP |
46 |
| 29 |
INS |
538 |
| 29 |
INV |
2 |
| 3 |
BND |
15 |
| 3 |
DEL |
280 |
| 3 |
DUP |
78 |
| 3 |
INS |
932 |
| 3 |
INV |
2 |
| 30 |
BND |
2 |
| 30 |
DEL |
127 |
| 30 |
DUP |
36 |
| 30 |
INS |
478 |
| 31 |
BND |
1 |
| 31 |
DEL |
218 |
| 31 |
DUP |
48 |
| 31 |
INS |
580 |
| 31 |
INV |
3 |
| 32 |
BND |
10 |
| 32 |
DEL |
204 |
| 32 |
DUP |
36 |
| 32 |
INS |
495 |
| 32 |
INV |
3 |
| 33 |
BND |
4 |
| 33 |
DEL |
137 |
| 33 |
DUP |
34 |
| 33 |
INS |
417 |
| 33 |
INV |
2 |
| 34 |
BND |
2 |
| 34 |
DEL |
133 |
| 34 |
DUP |
30 |
| 34 |
INS |
409 |
| 34 |
INV |
4 |
| 35 |
BND |
9 |
| 35 |
DEL |
169 |
| 35 |
DUP |
50 |
| 35 |
INS |
398 |
| 35 |
INV |
4 |
| 36 |
BND |
3 |
| 36 |
DEL |
125 |
| 36 |
DUP |
39 |
| 36 |
INS |
341 |
| 37 |
DEL |
142 |
| 37 |
DUP |
39 |
| 37 |
INS |
324 |
| 37 |
INV |
1 |
| 38 |
BND |
3 |
| 38 |
DEL |
111 |
| 38 |
DUP |
26 |
| 38 |
INS |
268 |
| 38 |
INV |
3 |
| 4 |
BND |
23 |
| 4 |
DEL |
307 |
| 4 |
DUP |
73 |
| 4 |
INS |
974 |
| 4 |
INV |
3 |
| 5 |
BND |
7 |
| 5 |
DEL |
318 |
| 5 |
DUP |
60 |
| 5 |
INS |
881 |
| 5 |
INV |
1 |
| 6 |
BND |
9 |
| 6 |
DEL |
318 |
| 6 |
DUP |
50 |
| 6 |
INS |
803 |
| 6 |
INV |
2 |
| 7 |
BND |
5 |
| 7 |
DEL |
273 |
| 7 |
DUP |
53 |
| 7 |
INS |
826 |
| 8 |
BND |
41 |
| 8 |
DEL |
380 |
| 8 |
DUP |
68 |
| 8 |
INS |
1155 |
| 8 |
INV |
3 |
| 9 |
BND |
2 |
| 9 |
DEL |
195 |
| 9 |
DUP |
41 |
| 9 |
INS |
655 |
| 9 |
INV |
4 |
| X |
BND |
13 |
| X |
DEL |
229 |
| X |
DUP |
51 |
| X |
INS |
717 |
| X |
INV |
5 |
ptcl_only_total_variant_summary_Cfam4 <- ptcl_only_vcf_Cfam4 %>%
group_by(SVTYPE) %>%
summarise(Total_Count = n()) %>%
ungroup()
kable(ptcl_only_total_variant_summary_Cfam4, caption = "Total variant count by type, PTCL-specific variants (CanFam4)")
Total variant count by type, PTCL-specific variants
(CanFam4)
| BND |
301 |
| DEL |
8482 |
| DUP |
1831 |
| INS |
24957 |
| INV |
110 |
Identify shared fusion partners with RNA-seq data
## CanFam 3.1
# read in csv file(s) of RNA fusion calls
star_rna_fusions <- read.csv("CanFam31_StarFusionSummary.csv")
fc_rna_fusions <- read.csv("fusionCatcher-results-ptcl-no-ctrls.csv")
# get list of implicated genes
star_genes_3 <- star_rna_fusions$X3_prime_gene
star_genes_5 <- star_rna_fusions$X5_prime_gene
fc_genes_3 <- fc_rna_fusions$X3_prime_gene
fc_genes_5 <- fc_rna_fusions$X5_prime_gene
all_star_genes <- c(star_genes_3, star_genes_5)
all_star_genes <- unique(all_star_genes)
all_fc_genes <- c(fc_genes_3, fc_genes_5)
all_fc_genes <- unique(all_fc_genes)
# compare to genes in INFO column of BND variant file
bnd_vcf <- read.csv("PTCLandCTRL_BND_Variants.CanFam3.filtered.ann.bcsq.csv")
extract_gene <- function(info) {
# Find the BCSQ field
bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
# Extract the gene name (second element after "|")
if (!is.na(bcsc_field)) {
gene <- strsplit(bcsc_field, "\\|")[[1]][2]
return(gene)
}
return(NA)
}
bnd_vcf <- bnd_vcf %>%
mutate(Gene = sapply(INFO, extract_gene))
shared_star_bnd_Cfam3 <- bnd_vcf %>%
filter(Gene %in% all_star_genes)
shared_fc_bnd_Cfam3 <- bnd_vcf %>%
filter(Gene %in% all_fc_genes)
# print summary
print(unique(shared_star_bnd_Cfam3$Gene))
## [1] "PALM2AKAP2" "ASCC3" "FBXW7"
## [4] "ENSCAFG00000014478" "ENSCAFG00000048749" "CPT1A"
## [7] "CD6" "MAML3" "TNFRSF1B"
## [10] "MITF" "DOCK3" "NBEA"
## [13] "TOX" "ENSCAFG00000032474" "UIMC1"
## [16] "ENSCAFG00000025128" "ENSCAFG00000030258" "ENSCAFG00000028509"
## [19] "ENSCAFG00000028642" "ENSCAFG00000044010"
print(unique(shared_fc_bnd_Cfam3$Gene))
## [1] "CDO1" "KCNQ5" "SND1" "CRADD" "GATB" "FBXW7"
## [7] "TRBV28" "DGUOK" "MAGI2" "IGHMBP2" "CPT1A" "CD6"
## [13] "MAML3" "ZRANB3" "VPS13D" "TNFRSF1B" "MITF" "ITPR2"
## [19] "TOX" "FAM172A" "UBE2E3" "ATP6V0E1" "RABGAP1L" "ACACA"
## [25] "ANOS1"
# export
write.csv(shared_star_bnd_Cfam3, file = "Spr25_shared_PacBio_STARFusion_calls.CanFam3.csv")
write.csv(shared_fc_bnd_Cfam3, file = "Spr25_shared_PacBio_FusionCatcher_calls.CanFam3.csv")
## CanFam4
# read in csv file of RNA fusion calls
star_rna_fusions <- read.csv("CanFam4_StarFusionSummary.csv")
# get list of implicated genes
genes_3 <- star_rna_fusions$X3_prime_gene
genes_5 <- star_rna_fusions$X5_prime_gene
all_genes <- c(genes_3, genes_5)
all_genes <- unique(all_genes)
# compare to genes in INFO column of BND variant file
bnd_vcf <- read.csv("PTCLandCTRL_BND_Variants.CanFam4.filtered.ann.bcsq.csv")
extract_gene <- function(info) {
# Find the BCSQ field
bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
# Extract the gene name (second element after "|")
if (!is.na(bcsc_field)) {
gene <- strsplit(bcsc_field, "\\|")[[1]][2]
return(gene)
}
return(NA)
}
bnd_vcf <- bnd_vcf %>%
mutate(Gene = sapply(INFO, extract_gene))
shared_bnd_Cfam4 <- bnd_vcf %>%
filter(Gene %in% all_genes)
# print summary
print(unique(shared_bnd_Cfam4$Gene))
## [1] "ENSCAFG00805005290" "ELMO1" "FBXW7"
## [4] "ENSCAFG00805017629" "ENSCAFG00805023029" "ENSCAFG00805002052"
## [7] "TNFRSF1B" "ENSCAFG00805023894" "ENSCAFG00805023901"
## [10] "TOX" "UIMC1" "RABGAP1L"
## [13] "ENSCAFG00805008332" "ENSCAFG00805008471" "ENSCAFG00805008786"
## [16] "ENSCAFG00805005681"
# export
write.csv(shared_bnd_Cfam4, file = "Spr25_shared_PacBio_STARFusion_calls.CanFam4.csv")
Identify structural variants implicating cancer-associated
genes
# list of cancer-associated genes from OncoKB
oncokb <- read.csv("cancerGeneList.csv")
oncokb_genes <- oncokb$Hugo.Symbol
extract_gene <- function(info) {
# Find the BCSQ field
bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
# Extract the gene name (second element after "|")
if (!is.na(bcsc_field)) {
gene <- strsplit(bcsc_field, "\\|")[[1]][2]
return(gene)
}
return(NA)
}
# Use the vcf objects of PTCL-specific variants
onco_vcf_Cfam3 <- ptcl_only_vcf_Cfam3 %>%
mutate(Gene = sapply(INFO, extract_gene))
onco_vcf_Cfam3 <- onco_vcf_Cfam3 %>%
filter(Gene %in% oncokb_genes)
onco_vcf_Cfam4 <- ptcl_only_vcf_Cfam4 %>%
mutate(Gene = sapply(INFO, extract_gene))
onco_vcf_Cfam4 <- onco_vcf_Cfam4 %>%
filter(Gene %in% oncokb_genes)
# export
write.csv(onco_vcf_Cfam3, file = "Spr25_CanFam3_PTCL_SVs_in_Cancer_Genes.csv")
write.csv(onco_vcf_Cfam4, file = "Spr25_CanFam4_PTCL_SVs_in_Cancer_Genes.csv")
Identify shared variable genes between CanFam3.1 and CanFam4
alignments
# Extract BCSQ gene annotation and SVTYPE from INFO field and place in separate columns called 'Gene' and 'SVTYPE'
extract_gene <- function(info) {
# Find the BCSQ field
bcsc_field <- stringr::str_extract(info, "BCSQ=[^;]+")
# Extract the gene name (second element after "|")
if (!is.na(bcsc_field)) {
gene <- strsplit(bcsc_field, "\\|")[[1]][2]
return(gene)
}
return(NA)
}
extract_variant_type <- function(info) {
# Find the SVTYPE field
svtype_field <- stringr::str_extract(info, "SVTYPE=[^;]+")
# Extract the variant type (first element after "=")
if (!is.na(svtype_field)) {
var_type <- strsplit(svtype_field, "=")[[1]][2]
return(var_type)
}
return(NA)
}
# Use the vcf objects of PTCL-specific variants
Cfam3_vcf_alt <- ptcl_only_vcf_Cfam3 %>%
mutate(Gene = sapply(INFO, extract_gene))
Cfam3_vcf_alt <- Cfam3_vcf_alt %>%
mutate(SVTYPE = sapply(INFO, extract_variant_type))
Cfam4_vcf_alt <- ptcl_only_vcf_Cfam4 %>%
mutate(Gene = sapply(INFO, extract_gene))
Cfam4_vcf_alt <- Cfam4_vcf_alt %>%
mutate(SVTYPE = sapply(INFO, extract_variant_type))
# Combine Gene and SVTYPE info into one column
Cfam3_vcf_alt <- Cfam3_vcf_alt %>%
mutate(
SVTYPE_Gene = paste(SVTYPE, Gene, sep = "_")
)
Cfam4_vcf_alt <- Cfam4_vcf_alt %>%
mutate(
SVTYPE_Gene = paste(SVTYPE, Gene, sep = "_")
)
# Create vector of SVTYPE_Gene column contents
Cfam3_variantGenes <- unique(Cfam3_vcf_alt$SVTYPE_Gene)
Cfam4_variantGenes <- unique(Cfam4_vcf_alt$SVTYPE_Gene)
# Filter variant calls of each alignment to include only those of the same variant type in shared genes
Cfam3_shared <- Cfam3_vcf_alt %>%
filter(SVTYPE_Gene %in% Cfam4_variantGenes) %>%
filter(Gene != "NA")
Cfam4_shared <- Cfam4_vcf_alt %>%
filter(SVTYPE_Gene %in% Cfam3_variantGenes) %>%
filter(Gene != "NA")
# print summary of total number of shared variable genes
paste("Number of genes with the same variant type called in CanFam3.1 and CanFam4: ", length(unique(Cfam3_shared$Gene)), sep = " ")
## [1] "Number of genes with the same variant type called in CanFam3.1 and CanFam4: 2869"
# export
write.csv(Cfam3_shared, file = "Spr25_CanFam3_PacBio_PTCL_Variants_in_Shared_Genes_with_CanFam4.csv")
write.csv(Cfam4_shared, file = "Spr25_CanFam4_PacBio_PTCL_Variants_in_Shared_Genes_with_CanFam3.csv")
# Limit to OncoKB cancer-associated genes
Cfam3_shared_oncokb <- Cfam3_shared %>%
filter(Gene %in% oncokb_genes)
Cfam4_shared_oncokb <- Cfam4_shared %>%
filter(Gene %in% oncokb_genes)
paste("Number of cancer-associated genes with same variant type called in CanFam3.1 and CanFam4:", length(unique(Cfam3_shared_oncokb$Gene)), sep = " ")
## [1] "Number of cancer-associated genes with same variant type called in CanFam3.1 and CanFam4: 274"
# export
write.csv(Cfam3_shared_oncokb, file = "Spr25_CanFam3_PacBio_PTCL_Variants_in_Shared_OncoKB_Genes_with_CanFam4.csv")
write.csv(Cfam4_shared_oncokb, file = "Spr25_CanFam4_PacBio_PTCL_Variants_in_Shared_OncoKB_Genes_with_CanFam3.csv")
Citations
sessionInfo()
## R version 4.4.0 (2024-04-24 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Denver
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] stringr_1.5.1 knitr_1.49 tidyr_1.3.1 dplyr_1.1.4 ggplot2_3.5.1
##
## loaded via a namespace (and not attached):
## [1] vctrs_0.6.5 cli_3.6.2 rlang_1.1.3 xfun_0.49
## [5] stringi_1.8.4 purrr_1.0.2 generics_0.1.3 jsonlite_1.8.9
## [9] labeling_0.4.3 glue_1.8.0 colorspace_2.1-1 htmltools_0.5.8.1
## [13] sass_0.4.9 scales_1.3.0 rmarkdown_2.29 grid_4.4.0
## [17] evaluate_1.0.3 munsell_0.5.1 jquerylib_0.1.4 tibble_3.2.1
## [21] fastmap_1.2.0 yaml_2.3.10 lifecycle_1.0.4 compiler_4.4.0
## [25] codetools_0.2-20 RColorBrewer_1.1-3 pkgconfig_2.0.3 rstudioapi_0.17.1
## [29] farver_2.1.2 digest_0.6.35 R6_2.5.1 tidyselect_1.2.1
## [33] pillar_1.10.1 magrittr_2.0.3 bslib_0.8.0 withr_3.0.2
## [37] tools_4.4.0 gtable_0.3.6 cachem_1.1.0
citation()
## To cite R in publications use:
##
## R Core Team (2024). _R: A Language and Environment for Statistical
## Computing_. R Foundation for Statistical Computing, Vienna, Austria.
## <https://www.R-project.org/>.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {R: A Language and Environment for Statistical Computing},
## author = {{R Core Team}},
## organization = {R Foundation for Statistical Computing},
## address = {Vienna, Austria},
## year = {2024},
## url = {https://www.R-project.org/},
## }
##
## We have invested a lot of time and effort in creating R, please cite it
## when using it for data analysis. See also 'citation("pkgname")' for
## citing R packages.