# vcf file as usual
vcf_file <- "https://ecogenomics.biodiv.tw/exercises/Saccer.GeoOrigin.chr1_50k_200k.SNP.vcf"
# metadata as usual
metadata_file <- "https://ecogenomics.biodiv.tw/exercises/Saccer.distinct.GeoOrigin.full.meta.txt"
# Read the actual data into R
vcf <- read.vcfR(vcf_file, verbose = FALSE)
## Local file Saccer.GeoOrigin.chr1_50k_200k.SNP.vcf found.
## Using this local copy instead of retrieving a remote copy.
metadata <- read.delim(metadata_file, header = TRUE)
vcf.gl <- vcfR2genlight(vcf)
## Warning in vcfR2genlight(vcf): Found 201 loci with more than two alleles.
## Objects of class genlight only support loci with two alleles.
## 201 loci will be omitted from the genlight object.
pop(vcf.gl) <- metadata$Country
ploidy(vcf.gl) <- 1
vcf.pca <- glPca(vcf.gl, nf = 10)
vcf.pca
## === PCA of genlight object ===
## Class: list of type glPca
## Call ($call):glPca(x = vcf.gl, nf = 10)
##
## Eigenvalues ($eig):
## 34.971 13.791 10.047 7.054 5.718 4.967 ...
##
## Principal components ($scores):
## matrix with 140 rows (individuals) and 10 columns (axes)
##
## Principal axes ($loadings):
## matrix with 2161 rows (SNPs) and 10 columns (axes)
# convert scores of vcf.pca into a tibble
vcf.pca.scores <- as_tibble(vcf.pca$scores)
# add the Country data into a column of vcf.pca.scores tibble
vcf.pca.scores$Country <- metadata$Country
# We will also determine the variance each PC contributes the data
barplot(100 * vcf.pca$eig / sum(vcf.pca$eig), col="forestgreen")
title(ylab = "Percent of variance explained")
title(xlab = "Eigenvalues")
# Lets extract the variance associated with the top 4 PCs, so we can use them in our plots.
# first we sum all the eigenvalues
eig.total <- sum(vcf.pca$eig)
# sum the variance
PC1.variance <- formatC(head(vcf.pca$eig)[1]/eig.total * 100)
PC2.variance <- formatC(head(vcf.pca$eig)[2]/eig.total * 100)
PC3.variance <- formatC(head(vcf.pca$eig)[3]/eig.total * 100)
PC4.variance <- formatC(head(vcf.pca$eig)[4]/eig.total * 100)
# plot PC1 and PC2 with each sample as a point
plot12 <- ggplot(vcf.pca.scores, aes(PC1, PC2)) + geom_point()
plot12
# We’ll add some axis labels, and incorporate the variance information to describe
# the relative importance of the spread of the data
xlabel <- paste0("PC1 variance = ",PC1.variance,"%")
ylabel <- paste0("PC2 variance = ", PC2.variance, "%")
plot12 <- plot12 + labs(x = xlabel, y = ylabel)
plot12
cols <- colorRampPalette(brewer.pal(8, "Set1"))(33)
plot12 <- plot12 + geom_point(aes(col = Country)) + scale_colour_manual(values=cols)
plot12
plot34 <- ggplot(vcf.pca.scores, aes(PC3, PC4)) +
geom_point(aes(col = Country)) +
labs(x = paste0("PC3 variance = ", PC3.variance,"%"), y = paste0("PC4 variance = ", PC4.variance, "%")) +
scale_colour_manual(values = cols)
plot12 + plot34
# Calculate the mean value of the principal components for each Country.
# We can use this to make some labels for our plots
means <- vcf.pca.scores %>% group_by(Country) %>%
summarize(meanPC1 = mean(PC1), meanPC2 = mean(PC2),meanPC3 = mean(PC3), meanPC4 = mean(PC4))
# Lets make a slightly different plot that our first comparison of PC1 and PC2,
plot12.2 <- ggplot(vcf.pca.scores, aes(PC1, PC2, col = Country)) +
labs(x = paste0("PC1 variance = ", PC1.variance, "%"), y = paste0("PC2 variance = ", PC2.variance, "%")) +
scale_colour_manual(values = cols) +
stat_ellipse(level = 0.95, size = 1) +
geom_label_repel(data = means,aes(
means$meanPC1, means$meanPC2, col = means$Country, label = means$Country))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot12
plot12.2
## Warning: Use of `means$meanPC1` is discouraged.
## ℹ Use `meanPC1` instead.
## Warning: Use of `means$meanPC2` is discouraged.
## ℹ Use `meanPC2` instead.
## Warning: Use of `means$Country` is discouraged.
## ℹ Use `Country` instead.
## Use of `means$Country` is discouraged.
## ℹ Use `Country` instead.
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Warning in MASS::cov.trob(data[, vars], wt = weight * nrow(data)): Probable
## convergence failure
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Warning: Removed 28 rows containing missing values or values outside the scale range
## (`geom_path()`).
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Generated pairwise distances between samples that we will plot in a tree format
tree_data <- aboot(vcf.gl, tree = "upgma", distance = bitwise.dist, sample = 100, showtree = F, cutoff = 50)
## Running bootstraps: 100 / 100
## Calculating bootstrap values... done.
#--- make and plot the tree
tree_plot <- ggtree(tree_data) +
geom_tiplab(size = 2, color = cols[pop(vcf.gl)]) +
xlim(-0.1, 0.3) +
geom_nodelab(size = 2, nudge_x = -0.006, nudge_y = 1) +
theme_tree2(legend.position = 'centre')
## Warning: `aes_()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`
## ℹ The deprecated feature was likely used in the ggtree package.
## Please report the issue at <https://github.com/YuLab-SMU/ggtree/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in fortify(data, ...): Arguments in `...` must be used.
## ✖ Problematic arguments:
## • as.Date = as.Date
## • yscale_mapping = yscale_mapping
## • hang = hang
## ℹ Did you misspell an argument name?
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggtree package.
## Please report the issue at <https://github.com/YuLab-SMU/ggtree/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
tree_plot
myDiff_pops <- genetic_diff(vcf,pops = vcf.gl@pop)
AF_data <- myDiff_pops[,c(1:19)]
AF_data <- melt(AF_data)
## Using CHROM, POS as id variables
colnames(AF_data) <- c("CHROM","POS","country","allele_frequency")
AF_data$country <- gsub("Hs_","", AF_data$country)
# extract the latitude and longitude for each country from the metadata file
coords <- data.frame(metadata$Country, metadata$Latitude, metadata$Longitude)
coords <- unique(coords)
colnames(coords) <- c("country","latitude","longitude")
# join the allele frequency data and the latitude/longitude data together
AF_data_coords <- dplyr::left_join(AF_data, coords, by = "country")
## Warning in dplyr::left_join(AF_data, coords, by = "country"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2363 of `x` matches multiple rows in `y`.
## ℹ Row 51 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# lets have a look at the new data.
head(AF_data_coords)
## CHROM POS country allele_frequency latitude longitude
## 1 chr1 50064 Argentina 0.0 -32.88946 -68.84584
## 2 chr1 50126 Argentina 0.0 -32.88946 -68.84584
## 3 chr1 50141 Argentina 0.0 -32.88946 -68.84584
## 4 chr1 50154 Argentina 0.0 -32.88946 -68.84584
## 5 chr1 50159 Argentina 0.0 -32.88946 -68.84584
## 6 chr1 52310 Argentina 0.5 -32.88946 -68.84584
# map() function is slighly different to other functions. Here the code is adding one layer at a time,
# first the map, then axes, then points, and finally legends
par(fg = "black")
map("world", col = "grey85", fill = TRUE, border = FALSE)
map.axes()
points(metadata$Longitude, metadata$Latitude, cex = 1.5, pch = 20, col = cols[pop(vcf.gl)])
legend( x = "left", legend = unique(pop(vcf.gl)),
col = cols[unique(pop(vcf.gl))], lwd = "1", lty = 0, pch = 20, box.lwd = 0, cex = 1)
snp_loadings <- as_tibble(data.frame(vcf.gl@loc.names, vcf.pca$loadings[,1:2]))
snp_loadings
## # A tibble: 2,161 × 3
## vcf.gl.loc.names Axis1 Axis2
## <chr> <dbl> <dbl>
## 1 chr1:50064:A:G -0.00437 -0.00848
## 2 chr1:50126:A:G -0.0251 0.0253
## 3 chr1:50141:G:A -0.000201 0.0992
## 4 chr1:50154:A:T -0.000588 0.00857
## 5 chr1:50159:C:G 0 0
## 6 chr1:52310:A:G 0.0714 -0.0266
## 7 chr1:52323:A:G 0 0
## 8 chr1:52329:T:C 0 0
## 9 chr1:52339:G:A -0.00447 -0.000699
## 10 chr1:52354:C:T 0 0
## # ℹ 2,151 more rows
snp_loadings %>% arrange(desc(Axis1))
## # A tibble: 2,161 × 3
## vcf.gl.loc.names Axis1 Axis2
## <chr> <dbl> <dbl>
## 1 chr1:119344:G:A 0.136 -0.0396
## 2 chr1:56878:T:C 0.127 0.0143
## 3 chr1:158503:C:T 0.127 -0.0409
## 4 chr1:152970:C:T 0.125 -0.0416
## 5 chr1:136153:C:G 0.125 -0.0401
## 6 chr1:132831:A:G 0.124 -0.0324
## 7 chr1:136195:C:T 0.123 -0.0335
## 8 chr1:56871:C:T 0.122 0.0256
## 9 chr1:158341:T:C 0.121 -0.0530
## 10 chr1:54319:T:C 0.120 -0.0500
## # ℹ 2,151 more rows
# select a SNP of interest based on its position
AF_SNP_coords <- AF_data_coords[AF_data_coords$POS == "194241",]
# Remake your map, but this time, we’ll add a pie chart describing
# the population allele frequency per country.
par(fg = "black")
map("world", col = "grey85", fill = TRUE, border = FALSE)
map.axes()
points(metadata$longitude, metadata$latitude, cex = 1.5, pch = 20, col = cols[pop(vcf.gl)])
for (i in 1:nrow(AF_SNP_coords)){
add.pie(z = c(AF_SNP_coords$allele_frequency[i],
1-AF_SNP_coords$allele_frequency[i]),
x = AF_SNP_coords$longitude[i]+10,
y = AF_SNP_coords$latitude[i],
radius = 5, col = c(alpha("orange", 0.5), alpha("blue", 0.5)), labels = "")
}
legend(title="Country", x = "topleft",
legend = unique(pop(vcf.gl)),
col = cols[unique(pop(vcf.gl))], pch = 20,
box.lwd = 0, cex = 0.9)
legend(title="Allele frequency", x = "bottomleft",
legend = c("reference","variant"),
col = c(alpha("blue", 0.5), alpha("orange", 0.5)), pch = 15, box.lwd = 0, cex = 0.9)