In statistics, we generally have two kinds of visualization:
“A picture is worth a thousand words”
library(tidyverse)
library(gridExtra)
# Use the gray theme with a base font size of 18 for all following plots.
theme_set(theme_gray(base_size = 18))

# Read the tab-separated BED file. It has no header row, quoting is disabled,
# and character columns are converted to factors.
data <- read.delim(
  "variants_from_assembly.bed",
  header = FALSE,
  quote = "",
  stringsAsFactors = TRUE
)

# Preview the first few rows.
head(data)
V1 V2 V3 V4 V5 V6 V7 V8 V9
1 6 103832058 103832059 SV1 185 + Insertion 0 185
2 6 102958468 102958469 SV2 317 + Insertion -14 303
3 6 102741692 102741693 SV3 130 + Deletion 130 0
4 6 102283759 102283760 SV4 1271 + Insertion -12 1259
5 6 101194032 101194033 SV5 2864 + Insertion -13 2851
6 6 101056644 101056645 SV6 265 + Insertion 0 265
# Column names
# Replace the default V1..V9 names with descriptive ones.
colnames(data) <- c(
  "chrom", "start", "stop",
  "name", "size", "strand",
  "type", "ref.dist", "query.dist"
)

# Preview again to confirm the new names.
head(data)
chrom start stop name size strand type ref.dist query.dist
1 6 103832058 103832059 SV1 185 + Insertion 0 185
2 6 102958468 102958469 SV2 317 + Insertion -14 303
3 6 102741692 102741693 SV3 130 + Deletion 130 0
4 6 102283759 102283760 SV4 1271 + Insertion -12 1259
5 6 101194032 101194033 SV5 2864 + Insertion -13 2851
6 6 101056644 101056645 SV6 265 + Insertion 0 265
# Data structures
# Show the type and first values of every column.
dplyr::glimpse(data)
Rows: 9,556
Columns: 9
$ chrom <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6…
$ start <int> 103832058, 102958468, 102741692, 102283759, 101194032, 101…
$ stop <int> 103832059, 102958469, 102741693, 102283760, 101194033, 101…
$ name <fct> SV1, SV2, SV3, SV4, SV5, SV6, SV7, SV8, SV9, SV10, SV11, S…
$ size <int> 185, 317, 130, 1271, 2864, 265, 334, 248, 362, 31, 499, 16…
$ strand <fct> +, +, +, +, +, +, +, +, +, +, +, +, +, +, +, +, +, +, +, +…
$ type <fct> Insertion, Insertion, Deletion, Insertion, Insertion, Inse…
$ ref.dist <int> 0, -14, 130, -12, -13, 0, 0, 8915, 0, 1939, -6, -1141, -2,…
$ query.dist <int> 185, 303, 0, 1259, 2851, 265, 334, 8667, 362, 1908, 493, 4…
# Summary
# Per-column summary: level counts for factors, quantiles and mean for numbers.
summary(object = data)
chrom start stop name
1 : 779 Min. : 55378 Min. : 55379 SV1 : 1
2 : 702 1st Qu.: 28750831 1st Qu.: 28750832 SV10 : 1
7 : 596 Median : 67213567 Median : 67213568 SV100 : 1
6 : 568 Mean : 77646932 Mean : 77646933 SV1000 : 1
3 : 565 3rd Qu.:116282726 3rd Qu.:116282726 SV10000: 1
4 : 530 Max. :249205989 Max. :249205990 SV10001: 1
(Other):5816 (Other):9550
size strand type ref.dist
Min. : 1.0 +:9556 Contraction:1677 Min. :-11736
1st Qu.: 136.0 Deletion :2257 1st Qu.: -9
Median : 307.0 Expansion :2479 Median : 3
Mean : 681.0 Insertion :3143 Mean : 1077
3rd Qu.: 580.2 3rd Qu.: 348
Max. :9951.0 Max. :157902
query.dist
Min. :-18243.0
1st Qu.: 0.0
Median : 161.0
Mean : 1236.4
3rd Qu.: 758.2
Max. :163991.0
# Bar chart of the number of variants per chromosome, before any filtering.
data %>%
  ggplot(aes(x = chrom)) +
  geom_bar()

# Filtering Data
# Keep only rows whose chromosome is 1-22, X, Y or MT.
data <- data %>%
  filter(chrom %in% c(1:22, "X", "Y", "MT"))

data %>%
  ggplot(aes(x = chrom)) +
  geom_bar()

# Ordering Chromosomes
# Rebuild the factor with explicit levels so the bars follow this order.
data <- data %>%
  mutate(chrom = factor(chrom, levels = c(1:22, "X", "Y", "MT")))

data %>%
  ggplot(aes(x = chrom)) +
  geom_bar()
# Xlab, Ylab, Title
# Same bar chart, now with a title and axis labels.
data %>%
  ggplot(aes(x = chrom)) +
  geom_bar() +
  ggtitle("Chromosomes Distribution") +
  xlab("Chromosomes") +
  ylab("Counts")

# Centering title
# hjust = 0.5 centers the title horizontally.
data %>%
  ggplot(aes(x = chrom)) +
  geom_bar() +
  ggtitle("Chromosomes Distribution") +
  xlab("Chromosomes") +
  ylab("Counts") +
  theme(plot.title = element_text(hjust = 0.5))
# Color by categorical variable
# Stacked bars: each chromosome's bar is split by variant type.
data %>%
  ggplot(aes(x = chrom, fill = type)) +
  geom_bar() +
  labs(
    title = "Chromosomes Distributions",
    x = "Chromosomes",
    y = "Counts"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Ordering type
# Explicit factor levels fix the order of the types in the plots.
data <- data %>%
  mutate(
    type = factor(
      type,
      levels = c("Insertion", "Deletion", "Expansion", "Contraction")
    )
  )

# Color by categorical variable (same plot, drawn again after reordering)
data %>%
  ggplot(aes(x = chrom, fill = type)) +
  geom_bar() +
  labs(
    title = "Chromosomes Distributions",
    x = "Chromosomes",
    y = "Counts"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Color by categorical variable, with the legend title set to "Type"
data %>%
  ggplot(aes(x = chrom, fill = type)) +
  geom_bar() +
  labs(
    title = "Chromosomes Distributions",
    x = "Chromosomes",
    y = "Counts"
  ) +
  theme(plot.title = element_text(hjust = 0.5)) +
  guides(fill = guide_legend(title = "Type"))
# Set the `repr.plot.width` and `repr.plot.height` options.
#
# width:  value stored in `repr.plot.width`.
# heigth: value stored in `repr.plot.height`.
#
# Returns the result of the `options()` call, i.e. the previous values of the
# two options, invisibly.
# NOTE(review): `heigth` looks like a misspelling of "height"; it is kept
# because callers may pass the argument by name.
# NOTE(review): presumably these options control the plot size in a Jupyter
# (repr / IRkernel) notebook -- confirm for your front end.
fig <- function(width, heigth) {
  plot_size <- list(repr.plot.width = width, repr.plot.height = heigth)
  options(plot_size)
}
# Facet Wrap
# Set the plot size as a separate statement before drawing. `fig()` returns
# the result of `options()` (the previous option values), so adding it to the
# plot with `+` only works while those options are unset and fails once they
# hold numbers (for example when this code is run a second time).
fig(20, 6)

# One panel per variant type, each showing the per-chromosome counts.
ggplot(data, aes(x = chrom, fill = type)) +
  geom_bar() +
  labs(
    title = "Chromosomes Distributions",
    x = "Chromosomes",
    y = "Counts"
  ) +
  guides(fill = guide_legend(title = "Type")) +
  facet_wrap(~type)
# Facet Grid
# One row of panels per variant type; the x axis labels are rotated 90 degrees.
data %>%
  ggplot(aes(x = chrom, fill = type)) +
  geom_bar() +
  labs(
    title = "Chromosomes Distributions",
    x = "Chromosomes",
    y = "Counts"
  ) +
  guides(fill = guide_legend(title = "Type")) +
  facet_grid(rows = vars(type)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Scatter plot of query distance against reference distance, full range.
data %>%
  ggplot(aes(x = ref.dist, y = query.dist)) +
  geom_point()

# Same scatter, colored by type and limited to [-500, 500] on both axes.
# Points outside the limits are dropped, which produces the warning below.
data %>%
  ggplot(aes(x = ref.dist, y = query.dist, color = type)) +
  geom_point() +
  lims(x = c(-500, 500), y = c(-500, 500))
Warning: Removed 3920 rows containing missing values (geom_point).
# Box plot of variant size for each type.
data %>%
  ggplot(aes(x = type, y = size)) +
  geom_boxplot()

# Same box plot with the axes flipped (types on the vertical axis).
data %>%
  ggplot(aes(x = type, y = size)) +
  geom_boxplot() +
  coord_flip()

# Violin plot of variant size for each type.
data %>%
  ggplot(aes(x = type, y = size)) +
  geom_violin()

# Violin plot with the axes flipped.
data %>%
  ggplot(aes(x = type, y = size)) +
  geom_violin() +
  coord_flip()

# Flipped violin plot with a smaller bandwidth adjustment (0.2) and a log10
# size scale.
data %>%
  ggplot(aes(x = type, y = size)) +
  geom_violin(adjust = 0.2) +
  scale_y_log10() +
  coord_flip()
# Overlaid, semi-transparent density curves of size, one per type.
# Sizes outside [0, 500] are removed by the scale limits (see warning below).
data %>%
  ggplot(aes(x = size, fill = type)) +
  geom_density(alpha = 0.5) +
  scale_x_continuous(limits = c(0, 500))
Warning: Removed 2660 rows containing non-finite values (stat_density).
# Density of size limited to [0, 500], one wrapped panel per type.
data %>%
  ggplot(aes(x = size, fill = type)) +
  geom_density(alpha = 0.5) +
  scale_x_continuous(limits = c(0, 500)) +
  facet_wrap(vars(type))
Warning: Removed 2660 rows containing non-finite values (stat_density).
# Density of size limited to [0, 500], one row of the grid per type.
data %>%
  ggplot(aes(x = size, fill = type)) +
  geom_density(alpha = 0.5) +
  scale_x_continuous(limits = c(0, 500)) +
  facet_grid(rows = vars(type))
Warning: Removed 2660 rows containing non-finite values (stat_density).
# Density of size limited to [0, 500], one column of the grid per type.
data %>%
  ggplot(aes(x = size, fill = type)) +
  geom_density(alpha = 0.5) +
  scale_x_continuous(limits = c(0, 500)) +
  facet_grid(cols = vars(type))
Warning: Removed 2660 rows containing non-finite values (stat_density).
# Density of size limited to [0, 500] in a grid: types as rows, chromosomes
# as columns.
data %>%
  ggplot(aes(x = size, fill = type)) +
  geom_density(alpha = 0.5) +
  scale_x_continuous(limits = c(0, 500)) +
  facet_grid(rows = vars(type), cols = vars(chrom))
Warning: Removed 2660 rows containing non-finite values (stat_density).
# Count the rows of each variant type (summary of a factor gives level counts)
# and draw the counts as a base-graphics pie chart.
type_counts <- summary(data[["type"]])
pie(x = type_counts)
# Panel A: stacked bar chart of variant counts per chromosome, filled by type.
f1 <- ggplot(data, aes(x = chrom, fill = type)) +
  geom_bar() +
  labs(
    title = "Distribution of Chromosomes",
    x = "Chromosomes",
    y = "Counts",
    tag = "A"
  ) +
  guides(fill = guide_legend(title = "Type"))

# Panel B: density of variant size by type, limited to sizes in [0, 500].
# The y axis of geom_density() is a density estimate, not a probability
# (its values are not bounded by 1), so it is labelled "Density".
f2 <- ggplot(data, aes(x = size, fill = type)) +
  geom_density(alpha = 0.5) +
  xlim(0, 500) +
  labs(
    title = "Density Plot of Size",
    x = "Size",
    y = "Density",
    tag = "B"
  ) +
  guides(fill = guide_legend(title = "Type"))

# Stack the two panels vertically in a single figure.
grid.arrange(f1, f2, nrow = 2, ncol = 1)
Warning: Removed 2660 rows containing non-finite values (stat_density).