library(breedTools)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data("GWBC_ref_C")
load("~/Documents/rproject/statcourse/additional_ref_geno.RData")
breed_duroc<-solve_composition(durocMarcGenoDose,GWBC_ref_C, ped = NULL, groups = NULL, mia = FALSE,sire = FALSE, dam = FALSE)
breed_duroc1 = as.data.frame(breed_duroc[,1])
names(breed_duroc1) <- c("ratio")
number=breed_duroc1%>%filter(ratio>=0.90)%>%summarize(n = n())
sum=breed_duroc1%>%summarize(n = n())
# ratio of > 90%
number[1,1]/sum[1,1]
## [1] 0.9318182
breed_hampshire<-solve_composition(hampshireMarcGenoDose,GWBC_ref_C, ped = NULL, groups = NULL, mia = FALSE,sire = FALSE, dam = FALSE)
breed_hampshire1 = as.data.frame(breed_hampshire[,2])
names(breed_hampshire1) <- c("ratio")
number=breed_hampshire1%>%filter(ratio>=0.90)%>%summarize(n = n())
sum=breed_hampshire1%>%summarize(n = n())
# ratio of > 90%
number[1,1]/sum[1,1]
## [1] 1
breed_landrace<-solve_composition(landraceMarcGenoDose,GWBC_ref_C, ped = NULL, groups = NULL, mia = FALSE,sire = FALSE, dam = FALSE)
breed_landrace1 = as.data.frame(breed_landrace[,3])
names(breed_landrace1) <- c("ratio")
number=breed_landrace1%>%filter(ratio>=0.90)%>%summarize(n = n())
sum=breed_landrace1%>%summarize(n = n())
# ratio of > 90%
number[1,1]/sum[1,1]
## [1] 0.8153846
breed_landrace1 <- read.csv('breed_landrace1.csv', as.is=T)
breed_landrace1$breed <- factor(breed_landrace1$breed, levels=c('Duroc', 'Hampshire', 'Yorkshire','Landrace'))
ggplot(breed_landrace1, aes(fill=breed, y=rate, x=number)) +
geom_bar(position=position_stack(reverse = T), stat="identity") +
scale_fill_manual(values = c("#F8766D","#7CAE00","#C77CFF","#00BFC4"))+
geom_hline(yintercept=0.10, linetype="dashed", color = "red",linewidth = 0.5) +
theme_minimal()
breed_yorkshire<-solve_composition(yorkshireMarcGenoDose,GWBC_ref_C, ped = NULL, groups = NULL, mia = FALSE,sire = FALSE, dam = FALSE)
breed_yorkshire1 = as.data.frame(breed_yorkshire[,4])
names(breed_yorkshire1) <- c("ratio")
number=breed_yorkshire1%>%filter(ratio>=0.90)%>%summarize(n = n())
sum=breed_yorkshire1%>%summarize(n = n())
# ratio of > 90%
number[1,1]/sum[1,1]
## [1] 0.8938053
breed_yorkshire1 <- read.csv('breed_yorkshire1.csv', as.is=T)
ggplot(breed_yorkshire1, aes(fill=breed, y=rate, x=number)) +
geom_bar(position =position_stack(reverse = T), stat="identity") +
scale_fill_manual(values = c("#F8766D","#7CAE00","#C77CFF","#00BFC4"))+
geom_hline(yintercept=0.10, linetype="dashed", color = "red",linewidth = 0.5) +
theme_minimal()
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(smartsnp)
new<-rbind(durocMarcGenoDose,hampshireMarcGenoDose)
new1<-rbind(new,landraceMarcGenoDose)
breedMarcGenoDose<-rbind(new1,yorkshireMarcGenoDose)
# row (SNP) or column (sample) 、
# Transpose
breedMarcGenoDose1<-t(breedMarcGenoDose)
dim(breedMarcGenoDose1)
## [1] 62163 283
# 62163 SNPs, 283 animals
#filter SNP by missing rate (0.1)
#NA proportion of each row
miss_snp<-rowMeans(is.na(breedMarcGenoDose1))
breedMarcGenoDose2<-breedMarcGenoDose1[miss_snp<=0.05,]
dim(breedMarcGenoDose2) # 43797
## [1] 43797 283
#filter IDs by missing rate (0.1) #caution: groupings will need to be adjusted
miss_id<-colMeans(is.na(breedMarcGenoDose2))
breedMarcGenoDose3<-breedMarcGenoDose2[,miss_id<=0.1]
dim(breedMarcGenoDose3) # 43797 251
## [1] 43797 251
#filter for low MAF (0)
#MAF:Minor allele frequency
MAF<-rowMeans(breedMarcGenoDose3,na.rm = T)/2 # remove NA values
MAF<-pmin(MAF,1-MAF)
breedMarcGenoDose4<-breedMarcGenoDose3[MAF!=0,]
# breedMarcGenoDose4 with filter, replace all na with 9
breedMarcGenoDose4[is.na(breedMarcGenoDose4)] <- 9
dim(breedMarcGenoDose4)
## [1] 43690 251
# random
sel_snp<-sample(x = 1:nrow(breedMarcGenoDose4),10000,replace = F)
breedMarcGenoDose5<-breedMarcGenoDose4[sel_snp,]
dim(breedMarcGenoDose4) # 43690 251
## [1] 43690 251
dim(breedMarcGenoDose5) # 10000 251
## [1] 10000 251
breedMarcGenoDose5<-t(apply(breedMarcGenoDose5,1,function(x) {x[is.na(x)]<-round(mean(x,na.rm=T))
return(x)}))
dim(breedMarcGenoDose5) # 10000 251
## [1] 10000 251
write.table(breedMarcGenoDose4,"breedMarcGenoDose.txt",row.names = F, col.names=F)
# breedMarcGenoDose1 without filter
breedMarcGenoDose1[is.na(breedMarcGenoDose1)] <- 9
write.table(breedMarcGenoDose1,"breedMarcGenoDose1.txt",row.names = F, col.names=F)
#genotype is matrix with SNP in ROWS
grp<- c(rep("duroc", 88), rep("hampshire", 17),rep("landrace", 65),rep("yorkshire", 113))
#this should be a vector of length equal to number of animals
#each element is the breed of the animal
#new grp after filter
grp1<-grp[miss_id<=0.1]
pc<-smart_pca("breedMarcGenoDose.txt",sample_group = grp1,missing_impute = "mean",pc_axes = 3)
## Checking argument options selected...
## Argument options are correct...
## Loading data...
## Imported 43690 SNP by 251 sample genotype matrix
## Time elapsed: 0h 0m 0s
## Filtering data...
## No samples projected after PCA computation
## 43690 SNPs included in PCA computation
## 1 SNPs omitted from PCA computation
## 251 samples included in PCA computation
## Completed data filtering
## Time elapsed: 0h 0m 0s
## Scanning for invariant SNPs...
## Scan complete: no invariant SNPs found
## Time elapsed: 0h 0m 1s
## Checking for missing values...
## 34882 SNPs contain missing values
## Imputing SNPs with missing values...
## Imputation with means completed: 102174 missing values imputed
## Time elapsed: 0h 0m 1s
## Scaling values by SNP...
## Centering and scaling by drift dispersion...
## Completed scaling using drift
## Time elapsed: 0h 0m 1s
## Computing singular value decomposition using RSpectra...
## Completed singular value decomposition using RSpectra
## Time elapsed: 0h 0m 2s
## Extracting eigenvalues and eigenvectors...
## Eigenvalues and eigenvectors extracted
## Time elapsed: 0h 0m 2s
## Tabulating PCA outputs...
## Completed tabulations of PCA outputs...
head(pc$pca.sample_coordinates)
## Group Class PC1 PC2 PC3
## 1 duroc PCA -223.9920 -45.66172 -3.748034
## 2 duroc PCA -232.6987 -48.80585 -6.154142
## 3 duroc PCA -235.2646 -50.85526 -8.915176
## 4 duroc PCA -233.6797 -49.32080 -6.079461
## 5 duroc PCA -238.4734 -54.23686 -10.055835
## 6 duroc PCA -232.5837 -48.43587 -6.782698
ggplot(pc$pca.sample_coordinates,aes(x=PC1,y=PC2,color=Group))+
geom_point(size = 0.1)
# snp without filter
pc1<-smart_pca("breedMarcGenoDose1.txt",sample_group = grp,missing_impute = "mean",pc_axes = 3)
## Checking argument options selected...
## Argument options are correct...
## Loading data...
## Imported 62163 SNP by 283 sample genotype matrix
## Time elapsed: 0h 0m 0s
## Filtering data...
## No samples projected after PCA computation
## 62163 SNPs included in PCA computation
## 1 SNPs omitted from PCA computation
## 283 samples included in PCA computation
## Completed data filtering
## Time elapsed: 0h 0m 1s
## Scanning for invariant SNPs...
## Scan complete: removed 2355 invariant SNPs
## Time elapsed: 0h 0m 1s
## Checking for missing values...
## 59665 SNPs contain missing values
## Imputing SNPs with missing values...
## Imputation with means completed: 776932 missing values imputed
## Time elapsed: 0h 0m 1s
## Scaling values by SNP...
## Centering and scaling by drift dispersion...
## Completed scaling using drift
## Time elapsed: 0h 0m 1s
## Computing singular value decomposition using RSpectra...
## Completed singular value decomposition using RSpectra
## Time elapsed: 0h 0m 2s
## Extracting eigenvalues and eigenvectors...
## Eigenvalues and eigenvectors extracted
## Time elapsed: 0h 0m 2s
## Tabulating PCA outputs...
## Completed tabulations of PCA outputs...
ggplot(pc1$pca.sample_coordinates,aes(x=PC1,y=PC2,color=Group))+
geom_point(size = 0.1)