library(tidyverse)
# Load the genotype comparison table. Later steps read the columns I_geno,
# S_geno, Sanger, Accession and SNP from `x`.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
x <- read.csv("~/Downloads/suptablu.csv", header = TRUE)

# 1 Calculate genotype stats by accession

# 1.0.1 Per-accession genotype concordance (excluding hets), computed over
# all sites and over Sanger alt sites only; the two results are then merged.

# Per-accession concordance between the I_geno and S_geno calls.
# A row scores 1 when the two calls are equal and 0 otherwise; rows whose
# I_geno contains "/" (het calls) are dropped before averaging.

# Concordance over every remaining site.
accession_all <- x %>%
  mutate(correct = as.numeric(I_geno == S_geno)) %>%
  filter(!grepl("/", I_geno)) %>%
  group_by(Accession) %>%
  summarise(accuracy = mean(correct))

# Concordance over the rows where Sanger == "alt" only.
accession_alt <- x %>%
  mutate(correct = as.numeric(I_geno == S_geno)) %>%
  filter(Sanger == "alt", !grepl("/", I_geno)) %>%
  group_by(Accession) %>%
  summarise(alt_accuracy = mean(correct))

# merge() keeps only the accessions that appear in both summaries.
accession_dat <- merge(
  x = accession_all,
  y = accession_alt,
  by = "Accession"
)

# 1.0.2 Per-SNP accuracies (excluding hets), then merge.

# Per-SNP concordance between the I_geno and S_geno calls.
# A row scores 1 when the two calls are equal and 0 otherwise; rows whose
# I_geno contains "/" (het calls) are dropped before averaging.
# NOTE(review): a missing genotype makes `correct` NA and therefore the whole
# group mean NA -- confirm whether na.rm = TRUE is wanted.

# Concordance over the rows where Sanger == "alt" only.
snp_alt <- x %>%
  mutate(correct = ifelse(I_geno == S_geno, 1, 0)) %>%
  filter(Sanger == "alt", !grepl("/", I_geno)) %>%
  group_by(SNP) %>%
  summarize(alt_accuracy = mean(correct))

# Concordance over every non-het row.
snp_all <- x %>%
  mutate(correct = ifelse(I_geno == S_geno, 1, 0)) %>%
  filter(!grepl("/", I_geno)) %>%
  group_by(SNP) %>%
  summarize(accuracy = mean(correct))

# Keep the SNPs present in both summaries, then reshape to long form: one row
# per SNP per metric, with the metric name in `type` and its number in `value`.
# pivot_longer() replaces the superseded gather(); the columns are the same,
# but rows are ordered SNP by SNP rather than metric by metric.
snp_dat <- merge(snp_all, snp_alt, by = "SNP") %>%
  pivot_longer(
    cols = c(accuracy, alt_accuracy),
    names_to = "type",
    values_to = "value"
  )

# 1.0.3 Heterozygosity per accession, merged with accuracy

# Fraction of each accession's rows whose I_geno contains "/" (het calls).
accession_het <- x %>%
  group_by(Accession) %>%
  summarise(hets = mean(grepl("/", I_geno)))

# Attach the het fraction to the per-accession accuracy table.
accession_dat <- merge(
  x = accession_dat,
  y = accession_het,
  by = "Accession"
)

# 1.1 Read sequencing depth data

# Load the sequencing depth table. It is later merged on its first column,
# and the plots read Subspecies and Average.depth from it.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas `TRUE` is a reserved word.
y <- read.csv("~/Desktop/suptablu3.csv", header = TRUE)

# 1.1.1 Merge with genotype statistics

# Join the per-accession statistics to the depth table. `by = 1` matches on
# the first column of each input (Accession in accession_dat).
# NOTE(review): presumably the first column of `y` holds the same accession
# identifiers -- confirm against the depth file.
xy <- merge(accession_dat, y, by = 1)

# Long form for faceted plots: one row per accession per metric, with the
# metric name in `type` and its number in `value`.
# pivot_longer() replaces the superseded gather(); the columns are the same,
# but rows are ordered accession by accession rather than metric by metric.
xyz <- pivot_longer(
  xy,
  cols = c(accuracy, alt_accuracy),
  names_to = "type",
  values_to = "value"
)

# 2 Graphs

# 2.0.1 Accuracy by subspecies – maize better than teosinte

# Boxplots of the per-accession accuracy values, one panel per metric in
# `type`. No x aesthetic is mapped; the `group` aesthetic is what splits the
# data into one box per subspecies.
ggplot(data = xyz, mapping = aes(y = value, colour = Subspecies)) +
  geom_boxplot(mapping = aes(group = Subspecies)) +
  theme_minimal() +
  facet_wrap(~ type)

# 2.0.2 Accuracy vs. depth – not much here

# Scatter of accuracy against average depth, one panel per metric in `type`.
# The colour mapping also groups the data, so a separate straight-line fit
# (no confidence band) is drawn for each subspecies.
ggplot(
  data = xyz,
  mapping = aes(x = Average.depth, y = value, colour = Subspecies)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal() +
  facet_wrap(~ type)

# 2.0.3 Het calls by subspecies

# Boxplots of the per-accession het fraction. No x aesthetic is mapped; the
# `group` aesthetic is what splits the data into one box per subspecies.
ggplot(data = xy, mapping = aes(y = hets, colour = Subspecies)) +
  geom_boxplot(mapping = aes(group = Subspecies)) +
  theme_minimal()

# 2.0.4 Accuracy by SNP

# One point per SNP per metric: SNP on the x axis, the accuracy value on the
# y axis, coloured by which metric (`type`) the point belongs to.
ggplot(
  data = snp_dat,
  mapping = aes(x = SNP, y = value, colour = type)
) +
  geom_point() +
  theme_minimal()