This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: ## Loading the necessary R packages
library(vegan)
## Warning: package 'vegan' was built under R version 4.2.2
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-4
library(ggplot2)
library(ggpubr)
getwd()
## [1] "C:/Users/slmg/OneDrive/Desktop/my snps"
list.files(pattern="vcf")
## [1] "20.17246903-17486903.ALL.chr20_GRCh38.genotypes.20170504.vcf.gz"
## [2] "vcf_num.csv"
## [3] "vcf_num_df.csv"
## [4] "vcf_num_df2.csv"
## [5] "vcf_scaled.csv"
vcf<- vcfR::read.vcfR("20.17246903-17486903.ALL.chr20_GRCh38.genotypes.20170504.vcf.gz",convertNA = T)
## Scanning file to determine attributes.
## File attributes:
## meta lines: 130
## header_line: 131
## variant count: 7220
## column count: 2513
##
Meta line 130 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
## Character matrix gt rows: 7220
## Character matrix gt cols: 2513
## skip: 0
## nrows: 7220
## row_num: 0
##
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant: 7220
## All variants processed
vcf_num<-vcfR::extract.gt(vcf,element="GT", IDtoRowNames=F,as.numeric=T,convertNA=T)
write.csv(vcf_num,file="vcf_num.csv",row.names=F)
list.files()
## [1] "1000genomes_people_info2-1.csv"
## [2] "20.17246903-17486903.ALL.chr20_GRCh38.genotypes.20170504.vcf.gz"
## [3] "final_report_template.Rmd"
## [4] "loading SNPS.Rmd"
## [5] "vcf_num.csv"
## [6] "vcf_num_df.csv"
## [7] "vcf_num_df2.csv"
## [8] "vcf_scaled.csv"
## [9] "Workflow.html"
## [10] "Workflow.Rmd"
vcf_num_t<-t(vcf_num)
vcf_num_df<-data.frame(vcf_num_t)
sample<-row.names(vcf_num_df)
vcf_num_df<-data.frame(sample,vcf_num_df)
getwd()
## [1] "C:/Users/slmg/OneDrive/Desktop/my snps"
write.csv(vcf_num_df,file="vcf_num_df.csv", row.names=F)
list.files()
## [1] "1000genomes_people_info2-1.csv"
## [2] "20.17246903-17486903.ALL.chr20_GRCh38.genotypes.20170504.vcf.gz"
## [3] "final_report_template.Rmd"
## [4] "loading SNPS.Rmd"
## [5] "vcf_num.csv"
## [6] "vcf_num_df.csv"
## [7] "vcf_num_df2.csv"
## [8] "vcf_scaled.csv"
## [9] "Workflow.html"
## [10] "Workflow.Rmd"
pop_meta<-read.csv(file='1000genomes_people_info2-1.csv')
names(pop_meta)
## [1] "pop" "super_pop" "sample" "sex" "lat" "lng"
names(vcf_num_df)[1:10]
## [1] "sample" "X1" "X2" "X3" "X4" "X5" "X6" "X7"
## [9] "X8" "X9"
vcf_num_df2<-merge(pop_meta,vcf_num_df,by="sample")
nrow(vcf_num_df)==nrow(vcf_num_df2)
## [1] TRUE
names(vcf_num_df2)[1:15]
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X1" "X2" "X3" "X4" "X5" "X6"
## [13] "X7" "X8" "X9"
getwd()
## [1] "C:/Users/slmg/OneDrive/Desktop/my snps"
write.csv(vcf_num_df2,file="vcf_num_df2.csv", row.names=F)
list.files()
## [1] "1000genomes_people_info2-1.csv"
## [2] "20.17246903-17486903.ALL.chr20_GRCh38.genotypes.20170504.vcf.gz"
## [3] "final_report_template.Rmd"
## [4] "loading SNPS.Rmd"
## [5] "vcf_num.csv"
## [6] "vcf_num_df.csv"
## [7] "vcf_num_df2.csv"
## [8] "vcf_scaled.csv"
## [9] "Workflow.html"
## [10] "Workflow.Rmd"
invar_omit<-function(x)
{
cat("Dataframe of dim", dim(x), "processed...\n")
sds<-apply(x,2,sd,na.rm=TRUE)
i_var0<-which(sds==0)
cat(length(i_var0),"columns removed \n")
if(length(i_var0)>0)
{
x<-x[,-i_var0]
}
return (x)
}
names(vcf_num_df2)[1:10]
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X1" "X2" "X3" "X4"
vcf_noinvar<-vcf_num_df2
vcf_noinvar<-vcf_noinvar[,-c(1:6)]
vcf_noinvar<-invar_omit(vcf_noinvar)
## Dataframe of dim 2504 7220 processed...
## 1756 columns removed
vcf_noinvar<-data.frame(vcf_num_df2[,c(1:6)],
vcf_noinvar)
my_meta_N_invar_cols<-1756
# find_NAs<-function(x)
# {
# NAs_TF<-is.na(x)
# i_NA<-which(NAs_TF==TRUE)
# N_NA<-length(i_NA)
# return(i_NA)
# }
#
# N_rows<-nrow(vcf_noinvar) ##rows are individuals
# N_NA<-rep(x=0,times=N_rows)
# N_SNPs<-ncol(vcf_noinvar) ## columns are the SNPS
# for(i in 1:N_rows)
# {
# i_NA<-find_NAs(vcf_noinvar[i,])
# N_NA_i<-length(i_NA)
# N_NA[i]<-N_NA_i
# }
# N_NA_i
#
# cutoff50<-N_SNPs*0.5
# percent_NA<-N_NA/N_SNPs*100
# any(percent_NA>50)
# mean(percent_NA)
# my_meta_N_meanNA_rows<-mean(percent_NA)
mean_imputation<-function(df)
{
cat("This may take some time...")
n_cols<-ncol(df)
for(i in 1:n_cols)
{
column_i<-df[,i]
mean_i<-mean(column_i,na.rm=TRUE)
NAs_i<-which(is.na(column_i))
N_NAs<-length(NAs_i)
column_i[NAs_i]<- mean_i
df[,i]<-column_i
}
return(df)
}
names(vcf_noinvar)[1:10]
## [1] "sample" "pop" "super_pop" "sex" "lat" "lng"
## [7] "X1" "X2" "X5" "X6"
vcf_noNA<-vcf_noinvar
vcf_noNA[,-c(1:6)]<-mean_imputation(vcf_noinvar[,-c(1:6)])
## This may take some time...
vcf_noNA <- vcf_noinvar
vcf_scaled<-vcf_noNA
vcf_scaled[,-c(1:6)]<-scale(vcf_noNA[,-c(1:6)])
write.csv(vcf_scaled, file = "vcf_scaled.csv",
row.names = F)
vcf_pca<-prcomp(vcf_scaled[,-c(1:6)])
screeplot(vcf_pca)
PCA_variation<-function(pca_summary,PCs=2)
{
var_explained<-pca_summary$importance[2,1:PCs]*100
var_explained<-round(var_explained,3)
return(var_explained)
}
vcf_pca_summary<-summary(vcf_pca)
var_out<-PCA_variation(vcf_pca_summary,PCs=500)
N_columns<-ncol(vcf_scaled)
cut_off<-1/N_columns*100
i_cut_off<-which(var_out<cut_off)
i_cut_off<-min(i_cut_off)
## Warning in min(i_cut_off): no non-missing arguments to min; returning Inf
my_meta_N_meanNA_rowsPCs<-i_cut_off
my_meta_var_PC123<-var_out[c(1,2,3)]
#plot percentage variation
barplot(var_out,main="Percent variation (%) scree plot",
ylab="Percent variation (%) explained",
names.arg=1:length(var_out))
abline(h=cut_off,col=2,lwd=2)
abline(v=i_cut_off)
legend("topright",col=c(2,1),lty=c(1,1),
legend=c("Vertical line: cutoff",
"Horizontal line: 1st value below cutoff"))
#plot cumulative variation
cumulative_variation<-cumsum(var_out)
plot(cumulative_variation, type="l")
vcf_pca_scores<-vegan::scores(vcf_pca)
vcf_pca_scores2<-data.frame(super_pop=vcf_noNA$super_pop,vcf_pca_scores)
my_meta_var_PC123[1]
## PC1
## 2.439
my_meta_var_PC123[2]
## PC2
## 1.777
my_meta_var_PC123[3]
## PC3
## 1.637
#plot the results on scatter plot
# PC2 vs PC1
ggpubr::ggscatter(data=vcf_pca_scores2,y="PC2", x="PC1",
color="super_pop", shape="super_pop",
main="PCA Scatterplot", xlab="PC1(2.4% of variation)",
ylab="PC2(1.8% of variation)")
# PC2 vs PC3
ggpubr::ggscatter(data=vcf_pca_scores2, y="PC3", x="PC2",
color="super_pop", shape="super_pop", main="PCA Scatterplot",
xlab="PC2(1.8% of variation)",
ylab="PC3 (1.6% of variation)")
# PC3 vs PC1
ggpubr::ggscatter(data=vcf_pca_scores2, y="PC3", x="PC1",
color="super_pop", shape="super_pop", main="PCA Scatterplot",
xlab="PC1(2.4% of variation)",
ylab="PC3 (1.6% of variation)")