This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(vcfR)
##
## ***** *** vcfR *** *****
## This is vcfR 1.13.0
## browseVignettes('vcfR') # Documentation
## citation('vcfR') # Citation
## ***** ***** ***** *****
Make sure to check working directory and make sure your files are located there.
getwd()
## [1] "/Users/ekoneduncan/Desktop/compbio"
list.files()
## [1] "07-mean_imputation.Rmd"
## [2] "09-PCA_worked_example-SNPs-part1.Rmd"
## [3] "1.159051856-159301856.ALL.chr1_GRCh38.genotypes.20170504.vcf.gz"
## [4] "1540_120122-1.pdf"
## [5] "1540_final_project_Final_Report_template.pdf"
## [6] "1540_final_report_flowchart.pdf"
## [7] "17.12071392-12311392.ALL.chr17_GRCh38.genotypes.20170504.vcf"
## [8] "17.12071392-12311392.ALL.chr17_GRCh38.genotypes.20170504.vcf.gz"
## [9] "2.136483646-136733646.ALL.chr2_GRCh38.genotypes.20170504.vcf.gz"
## [10] "all_loci-1.vcf"
## [11] "center_function.R"
## [12] "code_checkpoint_vcfR.html"
## [13] "code_checkpoint_vcfR.Rmd"
## [14] "feature_engineering_intro_2_functions-part2.Rmd"
## [15] "feature_engineering.Rmd"
## [16] "final_report_template.Rmd"
## [17] "fst_exploration_in_class-STUDENT.Rmd"
## [18] "fst_exploration_in_class.Rmd"
## [19] "lecture-introd2RStudio-with_scripts.pdf"
## [20] "line_of_best_fit_example-tibet_allele_freq.pdf"
## [21] "my_snps"
## [22] "Navarro_regression_part01.pdf"
## [23] "PCA_analysis_in_class_work-for_students.pdf"
## [24] "PCA_with_SNPs_handout_worksheet_031122-1.docx"
## [25] "PCA_with_SNPs_handout_worksheet_031122-1.pdf"
## [26] "PCA-missing_data-KEY.Rmd"
## [27] "PCA-missing_data.Rmd"
## [28] "R_data_structures_vectors_intro.pdf"
## [29] "R_Directory"
## [30] "r_help_hclust_intro-vs2.pdf"
## [31] "removing_fixed_alleles.html"
## [32] "removing_fixed_alleles.Rmd"
## [33] "rsconnect"
## [34] "summary_stats.pdf"
## [35] "test.docx"
## [36] "test.html"
## [37] "test.Rmd"
## [38] "test2.Rmd"
## [39] "transpose_1000_genomes.Rmd"
## [40] "vcfR_test.vcf"
## [41] "vcfR_test.vcf.gz"
## [42] "vegan_PCA_amino_acids-STUDENT.html"
## [43] "vegan_PCA_amino_acids-STUDENT.Rmd"
## [44] "vegan_pca_with_msleep-STUDENT.html"
## [45] "vegan_pca_with_msleep-STUDENT.Rmd"
## [46] "walsh2017morphology.RData"
## [47] "week08_cluster_analysis-1.pdf"
## [48] "What is computational biology_exert.pdf"
Load the data into an object called “genome_snps” using the read.vcfR () function
genome_snps<- read.vcfR("17.12071392-12311392.ALL.chr17_GRCh38.genotypes.20170504.vcf",
convertNA = T) #TODO
## Scanning file to determine attributes.
## File attributes:
## meta lines: 130
## header_line: 131
## variant count: 7493
## column count: 2513
##
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 7493
## Character matrix gt cols: 2513
## skip: 0
## nrows: 7493
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant: 7493
## All variants processed
warning("If this didn't work, you may not have set your working directory to the location of the vcf file")
## Warning: If this didn't work, you may not have set your working directory to the
## location of the vcf file
Examine the data
head(genome_snps)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.1"
## [1] "##FILTER=<ID=PASS,Description=\"All filters passed\">"
## [1] "##fileDate=20150218"
## [1] "##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/refe [Truncated]"
## [1] "##source=1000GenomesPhase3Pipeline"
## [1] "##contig=<ID=1,assembly=b37,length=249250621>"
## [1] "First 6 rows."
## [1]
## [1] "***** Fixed section *****"
## CHROM POS ID REF ALT QUAL FILTER
## [1,] "17" "12071513" "rs546498019" "C" "T" "100" "PASS"
## [2,] "17" "12071514" "rs564276031" "G" "A" "100" "PASS"
## [3,] "17" "12071539" "rs548813685" "C" "T" "100" "PASS"
## [4,] "17" "12071587" "rs184242686" "A" "G" "100" "PASS"
## [5,] "17" "12071609" "rs568210762" "G" "A" "100" "PASS"
## [6,] "17" "12071702" "rs529373806" "T" "C" "100" "PASS"
## [1]
## [1] "***** Genotype section *****"
## FORMAT HG00096 HG00097 HG00099 HG00100 HG00101
## [1,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [2,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [3,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [4,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [5,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [6,] "GT" "0|0" "0|0" "0|0" "0|0" "0|0"
## [1] "First 6 columns only."
## [1]
## [1] "Unique GT formats:"
## [1] "GT"
## [1]
View just the metadata
genome_snps@meta
## [1] "##fileformat=VCFv4.1"
## [2] "##FILTER=<ID=PASS,Description=\"All filters passed\">"
## [3] "##fileDate=20150218"
## [4] "##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
## [5] "##source=1000GenomesPhase3Pipeline"
## [6] "##contig=<ID=1,assembly=b37,length=249250621>"
## [7] "##contig=<ID=2,assembly=b37,length=243199373>"
## [8] "##contig=<ID=3,assembly=b37,length=198022430>"
## [9] "##contig=<ID=4,assembly=b37,length=191154276>"
## [10] "##contig=<ID=5,assembly=b37,length=180915260>"
## [11] "##contig=<ID=6,assembly=b37,length=171115067>"
## [12] "##contig=<ID=7,assembly=b37,length=159138663>"
## [13] "##contig=<ID=8,assembly=b37,length=146364022>"
## [14] "##contig=<ID=9,assembly=b37,length=141213431>"
## [15] "##contig=<ID=10,assembly=b37,length=135534747>"
## [16] "##contig=<ID=11,assembly=b37,length=135006516>"
## [17] "##contig=<ID=12,assembly=b37,length=133851895>"
## [18] "##contig=<ID=13,assembly=b37,length=115169878>"
## [19] "##contig=<ID=14,assembly=b37,length=107349540>"
## [20] "##contig=<ID=15,assembly=b37,length=102531392>"
## [21] "##contig=<ID=16,assembly=b37,length=90354753>"
## [22] "##contig=<ID=17,assembly=b37,length=81195210>"
## [23] "##contig=<ID=18,assembly=b37,length=78077248>"
## [24] "##contig=<ID=19,assembly=b37,length=59128983>"
## [25] "##contig=<ID=20,assembly=b37,length=63025520>"
## [26] "##contig=<ID=21,assembly=b37,length=48129895>"
## [27] "##contig=<ID=22,assembly=b37,length=51304566>"
## [28] "##contig=<ID=GL000191.1,assembly=b37,length=106433>"
## [29] "##contig=<ID=GL000192.1,assembly=b37,length=547496>"
## [30] "##contig=<ID=GL000193.1,assembly=b37,length=189789>"
## [31] "##contig=<ID=GL000194.1,assembly=b37,length=191469>"
## [32] "##contig=<ID=GL000195.1,assembly=b37,length=182896>"
## [33] "##contig=<ID=GL000196.1,assembly=b37,length=38914>"
## [34] "##contig=<ID=GL000197.1,assembly=b37,length=37175>"
## [35] "##contig=<ID=GL000198.1,assembly=b37,length=90085>"
## [36] "##contig=<ID=GL000199.1,assembly=b37,length=169874>"
## [37] "##contig=<ID=GL000200.1,assembly=b37,length=187035>"
## [38] "##contig=<ID=GL000201.1,assembly=b37,length=36148>"
## [39] "##contig=<ID=GL000202.1,assembly=b37,length=40103>"
## [40] "##contig=<ID=GL000203.1,assembly=b37,length=37498>"
## [41] "##contig=<ID=GL000204.1,assembly=b37,length=81310>"
## [42] "##contig=<ID=GL000205.1,assembly=b37,length=174588>"
## [43] "##contig=<ID=GL000206.1,assembly=b37,length=41001>"
## [44] "##contig=<ID=GL000207.1,assembly=b37,length=4262>"
## [45] "##contig=<ID=GL000208.1,assembly=b37,length=92689>"
## [46] "##contig=<ID=GL000209.1,assembly=b37,length=159169>"
## [47] "##contig=<ID=GL000210.1,assembly=b37,length=27682>"
## [48] "##contig=<ID=GL000211.1,assembly=b37,length=166566>"
## [49] "##contig=<ID=GL000212.1,assembly=b37,length=186858>"
## [50] "##contig=<ID=GL000213.1,assembly=b37,length=164239>"
## [51] "##contig=<ID=GL000214.1,assembly=b37,length=137718>"
## [52] "##contig=<ID=GL000215.1,assembly=b37,length=172545>"
## [53] "##contig=<ID=GL000216.1,assembly=b37,length=172294>"
## [54] "##contig=<ID=GL000217.1,assembly=b37,length=172149>"
## [55] "##contig=<ID=GL000218.1,assembly=b37,length=161147>"
## [56] "##contig=<ID=GL000219.1,assembly=b37,length=179198>"
## [57] "##contig=<ID=GL000220.1,assembly=b37,length=161802>"
## [58] "##contig=<ID=GL000221.1,assembly=b37,length=155397>"
## [59] "##contig=<ID=GL000222.1,assembly=b37,length=186861>"
## [60] "##contig=<ID=GL000223.1,assembly=b37,length=180455>"
## [61] "##contig=<ID=GL000224.1,assembly=b37,length=179693>"
## [62] "##contig=<ID=GL000225.1,assembly=b37,length=211173>"
## [63] "##contig=<ID=GL000226.1,assembly=b37,length=15008>"
## [64] "##contig=<ID=GL000227.1,assembly=b37,length=128374>"
## [65] "##contig=<ID=GL000228.1,assembly=b37,length=129120>"
## [66] "##contig=<ID=GL000229.1,assembly=b37,length=19913>"
## [67] "##contig=<ID=GL000230.1,assembly=b37,length=43691>"
## [68] "##contig=<ID=GL000231.1,assembly=b37,length=27386>"
## [69] "##contig=<ID=GL000232.1,assembly=b37,length=40652>"
## [70] "##contig=<ID=GL000233.1,assembly=b37,length=45941>"
## [71] "##contig=<ID=GL000234.1,assembly=b37,length=40531>"
## [72] "##contig=<ID=GL000235.1,assembly=b37,length=34474>"
## [73] "##contig=<ID=GL000236.1,assembly=b37,length=41934>"
## [74] "##contig=<ID=GL000237.1,assembly=b37,length=45867>"
## [75] "##contig=<ID=GL000238.1,assembly=b37,length=39939>"
## [76] "##contig=<ID=GL000239.1,assembly=b37,length=33824>"
## [77] "##contig=<ID=GL000240.1,assembly=b37,length=41933>"
## [78] "##contig=<ID=GL000241.1,assembly=b37,length=42152>"
## [79] "##contig=<ID=GL000242.1,assembly=b37,length=43523>"
## [80] "##contig=<ID=GL000243.1,assembly=b37,length=43341>"
## [81] "##contig=<ID=GL000244.1,assembly=b37,length=39929>"
## [82] "##contig=<ID=GL000245.1,assembly=b37,length=36651>"
## [83] "##contig=<ID=GL000246.1,assembly=b37,length=38154>"
## [84] "##contig=<ID=GL000247.1,assembly=b37,length=36422>"
## [85] "##contig=<ID=GL000248.1,assembly=b37,length=39786>"
## [86] "##contig=<ID=GL000249.1,assembly=b37,length=38502>"
## [87] "##contig=<ID=MT,assembly=b37,length=16569>"
## [88] "##contig=<ID=NC_007605,assembly=b37,length=171823>"
## [89] "##contig=<ID=X,assembly=b37,length=155270560>"
## [90] "##contig=<ID=Y,assembly=b37,length=59373566>"
## [91] "##contig=<ID=hs37d5,assembly=b37,length=35477943>"
## [92] "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
## [93] "##INFO=<ID=CIEND,Number=2,Type=Integer,Description=\"Confidence interval around END for imprecise variants\">"
## [94] "##INFO=<ID=CIPOS,Number=2,Type=Integer,Description=\"Confidence interval around POS for imprecise variants\">"
## [95] "##INFO=<ID=CS,Number=1,Type=String,Description=\"Source call set.\">"
## [96] "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End coordinate of this variant\">"
## [97] "##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description=\"Imprecise structural variation\">"
## [98] "##INFO=<ID=MC,Number=.,Type=String,Description=\"Merged calls.\">"
## [99] "##INFO=<ID=MEINFO,Number=4,Type=String,Description=\"Mobile element info of the form NAME,START,END<POLARITY; If there is only 5' OR 3' support for this call, will be NULL NULL for START and END\">"
## [100] "##INFO=<ID=MEND,Number=1,Type=Integer,Description=\"Mitochondrial end coordinate of inserted sequence\">"
## [101] "##INFO=<ID=MLEN,Number=1,Type=Integer,Description=\"Estimated length of mitochondrial insert\">"
## [102] "##INFO=<ID=MSTART,Number=1,Type=Integer,Description=\"Mitochondrial start coordinate of inserted sequence\">"
## [103] "##INFO=<ID=SVLEN,Number=.,Type=Integer,Description=\"SV length. It is only calculated for structural variation MEIs. For other types of SVs; one may calculate the SV length by INFO:END-START+1, or by finding the difference between lengthes of REF and ALT alleles\">"
## [104] "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">"
## [105] "##INFO=<ID=TSD,Number=1,Type=String,Description=\"Precise Target Site Duplication for bases, if unknown, value will be NULL\">"
## [106] "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Total number of alternate alleles in called genotypes\">"
## [107] "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1)\">"
## [108] "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">"
## [109] "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">"
## [110] "##INFO=<ID=EAS_AF,Number=A,Type=Float,Description=\"Allele frequency in the EAS populations calculated from AC and AN, in the range (0,1)\">"
## [111] "##INFO=<ID=EUR_AF,Number=A,Type=Float,Description=\"Allele frequency in the EUR populations calculated from AC and AN, in the range (0,1)\">"
## [112] "##INFO=<ID=AFR_AF,Number=A,Type=Float,Description=\"Allele frequency in the AFR populations calculated from AC and AN, in the range (0,1)\">"
## [113] "##INFO=<ID=AMR_AF,Number=A,Type=Float,Description=\"Allele frequency in the AMR populations calculated from AC and AN, in the range (0,1)\">"
## [114] "##INFO=<ID=SAS_AF,Number=A,Type=Float,Description=\"Allele frequency in the SAS populations calculated from AC and AN, in the range (0,1)\">"
## [115] "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total read depth; only low coverage data were counted towards the DP, exome data were not used\">"
## [116] "##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele. Format: AA|REF|ALT|IndelType. AA: Ancestral allele, REF:Reference Allele, ALT:Alternate Allele, IndelType:Type of Indel (REF, ALT and IndelType are only defined for indels)\">"
## [117] "##INFO=<ID=VT,Number=.,Type=String,Description=\"indicates what type of variant the line represents\">"
## [118] "##INFO=<ID=EX_TARGET,Number=0,Type=Flag,Description=\"indicates whether a variant is within the exon pull down target boundaries\">"
## [119] "##INFO=<ID=MULTI_ALLELIC,Number=0,Type=Flag,Description=\"indicates whether a site is multi-allelic\">"
## [120] "##INFO=<ID=STRAND_FLIP,Number=0,Type=Flag,Description=\"Indicates that the reference strand has changed between GRCh37 and GRCh38\">"
## [121] "##INFO=<ID=REF_SWITCH,Number=0,Type=Flag,Description=\"Indicates that the reference allele has changed\">"
## [122] "##INFO=<ID=DEPRECATED_RSID,Number=.,Type=String,Description=\"dbsnp rs IDs that have been merged into other rs IDs or do not map to GRCh38\">"
## [123] "##INFO=<ID=RSID_REMOVED,Number=.,Type=String,Description=\"dbsnp rs IDs removed from this variant, due to either the variant splitting up or being deprecated/merged\">"
## [124] "##INFO=<ID=GRCH37_38_REF_STRING_MATCH,Number=0,Type=Flag,Description=\"Indicates reference allele in origin GRCh37 vcf string-matches reference allele in dbsnp GRCh38 vcf\">"
## [125] "##INFO=<ID=NOT_ALL_RSIDS_STRAND_CHANGE_OR_REF_SWITCH,Number=0,Type=Flag,Description=\"Indicates only some of the rs IDs in origin GRCh37 vcf switched strands or switched strands and changed reference allele. This would result in rs IDs being split into multiple lines\">"
## [126] "##INFO=<ID=GRCH37_POS,Number=1,Type=Integer,Description=\"Position in origin GRCh37 vcf\">"
## [127] "##INFO=<ID=GRCH37_REF,Number=1,Type=String,Description=\"Representation of reference allele in origin GRCh37 vcf\">"
## [128] "##INFO=<ID=ALLELE_TRANSFORM,Number=0,Type=Flag,Description=\"Indicates that at least some of the alleles have changed in how they're represented, e.g. through left shifting.\">"
## [129] "##INFO=<ID=REF_NEW_ALLELE,Number=0,Type=Flag,Description=\"Indicates that the reference allele is an allele not present in the origin GRCh37 vcf\">"
## [130] "##INFO=<ID=CHROM_CHANGE_BETWEEN_ASSEMBLIES,Number=.,Type=String,Description=\"dbsnp rs IDs that are mapped to a different chromosome between GRCh37 and GRCh38\">"
Extract the genotype scores
genome_snps_num <- extract.gt(genome_snps,
element = "GT",
IDtoRowNames = F,
as.numeric = T,
convertNA = T)
Display the sample names in a compact way with gsub()
colnames(genome_snps_num) <- gsub("sample_", "",
colnames(genome_snps_num))
colnames(genome_snps_num) <- gsub("_", "",
colnames(genome_snps_num))
this shows a small view of the data
genome_snps_num[1:10, 1:4]
## HG00096 HG00097 HG00099 HG00100
## [1,] 0 0 0 0
## [2,] 0 0 0 0
## [3,] 0 0 0 0
## [4,] 0 0 0 0
## [5,] 0 0 0 0
## [6,] 0 0 0 0
## [7,] 0 0 0 0
## [8,] 0 0 0 0
## [9,] 0 0 0 0
## [10,] 0 0 0 0
we can call summary on part of the data
summary(genome_snps_num[, 1:5])
## HG00096 HG00097 HG00099 HG00100
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.05018 Mean :0.05258 Mean :0.05071 Mean :0.05445
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :2.00000 Max. :2.00000 Max. :2.00000 Max. :2.00000
## HG00101
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.04791
## 3rd Qu.:0.00000
## Max. :2.00000
reformat using the transpose function ‘t()’
genome_snps_num_t <- t(genome_snps_num)
check output
genome_snps_num_t[1:10, 1:4]
## [,1] [,2] [,3] [,4]
## HG00096 0 0 0 0
## HG00097 0 0 0 0
## HG00099 0 0 0 0
## HG00100 0 0 0 0
## HG00101 0 0 0 0
## HG00102 0 0 0 0
## HG00103 0 0 0 0
## HG00105 0 0 0 0
## HG00106 0 0 0 0
## HG00107 0 0 0 0