library("cowplot")
library("tidyverse")
library("viridis")
library("VIM")
gendata<-function(){
hapdat<-matrix(unlist(lapply(strsplit(system(paste("~/src/msdir/ms 50 1 -t 3 -r 4 10000 -I 5 10 10 10 10 10 -ma 0.05 0.05 1 0.01 0 0.05 1 0.05 0.01 0.01 0.025 0.05 1 0 0.01 0.01 0 0.01 1 0.05 0 0.01 0 0.025 1 | tail -50"),intern=T),''),as.numeric)),nrow=50,byrow=T)
genodat<-hapdat[seq(1,50,2),]+hapdat[seq(2,50,2),] # make diploid
write.table(t(genodat),file="~/crap.geno", quote=F, row.names=F, col.names=F, sep=" ")
return(genodat)
}
make data and plot real
mydat<-matrix(as.numeric(gendata()),ncol=25,byrow=TRUE)
bob<-read.table("~/crap.geno")
sum(bob-mydat) #unit test: should be zero!
## [1] 0
sites=length(mydat)
set_missing=sample(1:sites,0.3*sites)
misdat<-mydat
misdat[set_missing]<-NA
dist_dat<-dist(t(mydat))
fit<-cmdscale(dist_dat,eig=TRUE, k=2)
plot(fit$points[,1],fit$points[,1], xlab="Coordinate 1", ylab="Coordinate 2",
main="real data",pch=19,col=c(rep(rgb(1,0,0,0.5),15),rep(rgb(0,0,1,0.5),10)))
image(mydat,col=viridis(3))
now plot with miss ind
dist_dat<-dist(t(misdat))
misfit<-cmdscale(dist_dat,eig=TRUE, k=2)
plot(misfit$points[,1],misfit$points[,1], xlab="Coordinate 1", ylab="Coordinate 2",
main="with missing data",pch=19,col=c(rep(rgb(1,0,0,0.5),15),rep(rgb(0,0,1,0.5),10)))
image(misdat,col=viridis(3))
random fill in
randat<-misdat
for(i in 1:ncol(misdat)){
var=i
nas<-which(is.na(misdat[,var]))
randat[,var][nas]<-sample(na.omit(randat[,var]),length(nas))
}
dist_dat<-dist(t(randat))
ranfit<-cmdscale(dist_dat,eig=TRUE, k=2)
plot(ranfit$points[,1],ranfit$points[,1], xlab="Coordinate 1", ylab="Coordinate 2",
main="random imputation",pch=19,col=c(rep(rgb(1,0,0,0.5),15),rep(rgb(0,0,1,0.5),10)))
image(randat,col=viridis(3))
knn
kdat<-kNN(data.frame(misdat),k=2)
dist_dat<-dist(t(kdat[,1:25]))
kfit<-cmdscale(dist_dat,eig=TRUE, k=2)
plot(kfit$points[,1],kfit$points[,1], xlab="Coordinate 1", ylab="Coordinate 2",
main="knn imputation",pch=19,col=c(rep(rgb(1,0,0,0.5),15),rep(rgb(0,0,1,0.5),10)))
image(as.matrix(kdat[,1:25]),col=viridis(3))
compare: knn imputation wins
paste("kdat:", round(sum(kdat[,1:25]==mydat)/length(mydat)*100,2),"% agreement")
## [1] "kdat: 97.58 % agreement"
paste("randat:", round(sum(mydat==randat)/length(mydat)*100,2),"% agreement")
## [1] "randat: 84.26 % agreement"