Reading teosinte data

library(tidyverse)
library(cowplot)
library(qtl)

teosinte.xo<-read.table(file="xo_ZeaGBSv27raw_RareAllelesC2TeoCurated_depth_AGPv4_filtered0210.txt",header=TRUE,stringsAsFactors=FALSE)
head(teosinte.xo)
##   taxon chr  start     end
## 1 A0151   1 337863  369777
## 2 A0415   1 337863  369777
## 3 A1190   1 337863  369777
## 4 A2718   1 337863  369777
## 5 A0134   1 337863 1035713
## 6 A0557   1 334927 1035713
teosinte.parentage<-read.table(file="Parentage_for_imputeR.csv",header=TRUE,stringsAsFactors=FALSE,sep=",")
head(teosinte.parentage)
##    Taxa     Mother     Father
## 1 A0003 PC_N14_ID2 PC_N14_ID2
## 2 A0004 PC_J48_ID2 PC_J48_ID2
## 3 A0008 PC_J13_ID1 PC_K60_ID1
## 4 A0013 PC_N10_ID2 PC_N10_ID2
## 5 A0014 PC_K60_ID1 PC_K60_ID1
## 6 A0015 PC_J10_ID1 PC_M15_ID1

Transforming teosinte data

teosinte.merged.xo<-merge(teosinte.xo,teosinte.parentage,by.x="taxon",by.y="Taxa")


teosinte.child.counts<- teosinte.parentage %>% 
  group_by(Mother) %>%
  summarise(family_progeny_count = length(Mother))

teosinte.selfed.counts<-teosinte.parentage[teosinte.parentage$Mother==teosinte.parentage$Father,] %>%
   group_by(Mother) %>%
  summarise(family_selfed_count = length(Mother))

teosinte.xo.by.mom<- teosinte.merged.xo %>% 
  group_by(Mother) %>%
  summarise(family_xo_count = length(Mother))
 

 
teosinte.merged.xo<-inner_join(teosinte.merged.xo,teosinte.child.counts)

teosinte.merged.xo<-inner_join(teosinte.merged.xo,teosinte.selfed.counts)

teosinte.merged.xo<-inner_join(teosinte.merged.xo,teosinte.xo.by.mom)

#head(teosinte.merged.xo)

teosinte.merged.moms<-inner_join(teosinte.xo.by.mom,teosinte.child.counts)
dim(teosinte.merged.moms)
## [1] 51  3
teosinte.merged.moms<-inner_join(teosinte.merged.moms,teosinte.selfed.counts)
head(teosinte.merged.moms)
## # A tibble: 6 × 4
##       Mother family_xo_count family_progeny_count family_selfed_count
##        <chr>           <int>                <int>               <int>
## 1 PC_I05_ID1            2791                  104                  22
## 2 PC_I06_ID1            1793                   74                  37
## 3 PC_I08_ID1            2241                   83                  39
## 4 PC_I11_ID2            3381                  128                  43
## 5 PC_I50_ID1            2431                   91                  12
## 6 PC_I50_ID2            2360                   72                  54
dim(teosinte.merged.moms)
## [1] 49  4
teosinte.xo.over.progeny.count<-data.frame(rep("teosinte",dim(teosinte.merged.moms)[1]),teosinte.merged.moms$family_xo_count/teosinte.merged.moms$family_progeny_count)
names(teosinte.xo.over.progeny.count)<-c("teo.or.maize","xo_over_family_count")

head(teosinte.xo.over.progeny.count)
##   teo.or.maize xo_over_family_count
## 1     teosinte             26.83654
## 2     teosinte             24.22973
## 3     teosinte             27.00000
## 4     teosinte             26.41406
## 5     teosinte             26.71429
## 6     teosinte             32.77778

Reading maize landraces data

landraces.parentage<-read.table(file="landrace_parentage_info.txt",header=TRUE,stringsAsFactors=FALSE,sep="\t")
head(landraces.parentage)
##                  proid                                    parent1
## 1 13FL991195:250305857 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054
## 2 14FL991792:250411065 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054
## 3 14FL992565:250411841 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054
## 4 14FL991871:250411166 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054
## 5 14FL990555:250409701 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054
## 6 14FL992680:250411969 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054
##                                      parent2 progeny parent_1 parent_2
## 1 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054    1004        1        1
## 2 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054    4014        1        1
## 3 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054    4587        1        1
## 4 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054    4082        1        1
## 5 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054    3021        1        1
## 6 164_8_Tuxpeno_El_Aguacatito_Mex_:250358054    4682        1        1
##      qq_uu nloci
## 1 4062.908 24864
## 2 4013.364 24799
## 3 3859.624 24625
## 4 3968.140 24755
## 5 3986.508 24618
## 6 3971.058 24849
landraces.xo<-read.table(file="xo_Combined_LR13_LR14_parents2_AGPv4_filter0219.txt",
               header=TRUE,stringsAsFactors=FALSE)
head(landraces.xo)
##                  taxon chr   start     end
## 1 13FL990708:250295003   1  262738 1023028
## 2 13FL991485:250349616   1  262855 1030583
## 3 14FL991631:250410929   1  262855 1218925
## 4 14FL991263:250410523   1 1030583 1228056
## 5 14FL990731:250409914   1  262855 1508268
## 6 14FL991166:250410414   1 1228478 1508268

Transforming maize landraces data

#landraces_xo_by_taxon<-landraces.xo %>% 
#  group_by(taxon) %>% 
 #   summarise(xo_by_taxon_count = length(taxon))

#write.table(landraces_xo_by_taxon,file="landraces_xo_by_taxon.txt",sep="\t",quote = #FALSE,row.names = FALSE)


landraces.merged.xo<-merge(landraces.xo,landraces.parentage[1:3],by.x="taxon",by.y="proid")

landraces.child.counts<- landraces.parentage %>% 
  group_by(parent1) %>%
  summarise(family_progeny_count = length(parent1))

landraces.selfed.counts<-landraces.parentage[landraces.parentage$parent1==landraces.parentage$parent2,] %>%
   group_by(parent1) %>%
  summarise(family_selfed_count = length(parent1))


#head(landraces.child.counts)
#head(merged.landraces.xo)

landraces.xo.by.mom<- landraces.merged.xo %>% 
  group_by(parent1) %>%
  summarise(family_xo_count = length(parent1))

landraces.merged.xo<-inner_join(landraces.merged.xo,landraces.child.counts)
landraces.merged.xo<-inner_join(landraces.merged.xo,landraces.selfed.counts)
landraces.merged.xo<-inner_join(landraces.merged.xo,landraces.xo.by.mom)
#head(landraces.merged.xo)



#head(landraces.xo.by.mom)

landraces.merged.moms<-inner_join(landraces.xo.by.mom,landraces.child.counts)
landraces.merged.moms<-inner_join(landraces.merged.moms,landraces.selfed.counts)

head(landraces.merged.moms)
## # A tibble: 6 × 4
##                                       parent1 family_xo_count
##                                         <chr>           <int>
## 1  164_1_Tuxpeno_El_Aguacatito_Mex_:250426647           13359
## 2 164_11_Tuxpeno_El_Aguacatito_Mex_:250358056            4096
## 3 164_14_Tuxpeno_El_Aguacatito_Mex_:250426648            4322
## 4  164_2_Tuxpeno_El_Aguacatito_Mex_:250358051            6038
## 5  164_4_Tuxpeno_El_Aguacatito_Mex_:250358052            4830
## 6  164_8_Tuxpeno_El_Aguacatito_Mex_:250358054            6867
## # ... with 2 more variables: family_progeny_count <int>,
## #   family_selfed_count <int>
landraces.xo.over.progeny.count<-data.frame(rep("landraces",dim(landraces.merged.moms)[1]),landraces.merged.moms$family_xo_count/landraces.merged.moms$family_progeny_count)
names(landraces.xo.over.progeny.count)<-c("teo.or.maize","xo_over_family_count")

head(landraces.xo.over.progeny.count)
##   teo.or.maize xo_over_family_count
## 1    landraces             26.55865
## 2    landraces             29.68116
## 3    landraces             24.41808
## 4    landraces             27.19820
## 5    landraces             27.60000
## 6    landraces             27.03543

Recombination rate plot for teosinte,30 bins total

ggplot(teosinte.xo,aes(x=start))+geom_histogram(bins=500)+facet_wrap(~chr,ncol=2)

Recombination rate plot for maize landraces,30 bins total

ggplot(landraces.xo,aes(x=start))+geom_histogram(bins=500)+facet_wrap(~chr,ncol=2)

Per family xo divided by family count

dim(teosinte.xo.over.progeny.count)
## [1] 49  2
dim(landraces.xo.over.progeny.count)
## [1] 35  2
xo.over.progeny.count<-rbind(teosinte.xo.over.progeny.count,landraces.xo.over.progeny.count)
head(xo.over.progeny.count)
##   teo.or.maize xo_over_family_count
## 1     teosinte             26.83654
## 2     teosinte             24.22973
## 3     teosinte             27.00000
## 4     teosinte             26.41406
## 5     teosinte             26.71429
## 6     teosinte             32.77778
tail(xo.over.progeny.count)
##    teo.or.maize xo_over_family_count
## 79    landraces             28.52941
## 80    landraces             26.93750
## 81    landraces             29.87879
## 82    landraces             27.38636
## 83    landraces             27.65385
## 84    landraces             27.72727
boxplot(xo.over.progeny.count$xo_over_family_count ~xo.over.progeny.count$teo.or.maize,
        main="Per family xo/progeny.count")

Filtering for family.selfed.count>20

  teosinte.merged.moms<-teosinte.merged.moms[teosinte.merged.moms$family_selfed_count>20,]
  teosinte.xo.over.progeny.count<-data.frame(rep("teosinte",dim(teosinte.merged.moms)[1]),teosinte.merged.moms$family_xo_count/teosinte.merged.moms$family_progeny_count)
names(teosinte.xo.over.progeny.count)<-c("teo.or.maize","xo_over_family_count")

  landraces.merged.moms<-landraces.merged.moms[landraces.merged.moms$family_selfed_count>20,]
landraces.xo.over.progeny.count<-data.frame(rep("landraces",dim(landraces.merged.moms)[1]),landraces.merged.moms$family_xo_count/landraces.merged.moms$family_progeny_count)
names(landraces.xo.over.progeny.count)<-c("teo.or.maize","xo_over_family_count")

dim(teosinte.xo.over.progeny.count)
## [1] 41  2
dim(landraces.xo.over.progeny.count)
## [1] 24  2

And plot again

xo.over.progeny.count<-rbind(teosinte.xo.over.progeny.count,landraces.xo.over.progeny.count)
head(xo.over.progeny.count)
##   teo.or.maize xo_over_family_count
## 1     teosinte             26.83654
## 2     teosinte             24.22973
## 3     teosinte             27.00000
## 4     teosinte             26.41406
## 5     teosinte             32.77778
## 6     teosinte             25.58559
tail(xo.over.progeny.count)
##    teo.or.maize xo_over_family_count
## 60    landraces             28.52941
## 61    landraces             26.93750
## 62    landraces             29.87879
## 63    landraces             27.38636
## 64    landraces             27.65385
## 65    landraces             27.72727
boxplot(xo.over.progeny.count$xo_over_family_count ~xo.over.progeny.count$teo.or.maize,
        main="Per family xo/progeny.count")

#to pinpoint outlier landrace families
landraces.xo.over.progeny.count<-data.frame(landraces.merged.moms$parent1,landraces.merged.moms$family_xo_count/landraces.merged.moms$family_progeny_count)
names(landraces.xo.over.progeny.count)[2]<-"xo_over_family_count"
landraces.xo.over.progeny.count[landraces.xo.over.progeny.count$xo_over_family_count>40,]
## [1] landraces.merged.moms.parent1 xo_over_family_count         
## <0 rows> (or 0-length row.names)
offending_families<- as.character(landraces.xo.over.progeny.count[landraces.xo.over.progeny.count$xo_over_family_count>40,]$landraces.merged.moms.parent1)

offending_families
## character(0)

Remove outlier families and plot again

landraces.xo.over.progeny.count<-landraces.xo.over.progeny.count[landraces.xo.over.progeny.count$xo_over_family_count<40,]
  landraces.xo.over.progeny.count<-data.frame(rep("landraces",dim(landraces.xo.over.progeny.count)[1]),landraces.xo.over.progeny.count$xo_over_family_count)
                                            
names(landraces.xo.over.progeny.count)<-c("teo.or.maize","xo_over_family_count")
xo.over.progeny.count<-rbind(teosinte.xo.over.progeny.count,landraces.xo.over.progeny.count)

boxplot(xo.over.progeny.count$xo_over_family_count ~xo.over.progeny.count$teo.or.maize,
        main="Per family xo/progeny.count")

Getting the medians

landraces.values<-xo.over.progeny.count[xo.over.progeny.count$teo.or.maize=="landraces",]$xo_over_family_count
median(landraces.values)
## [1] 27.20403
teosinte.values<-xo.over.progeny.count[xo.over.progeny.count$teo.or.maize=="teosinte",]$xo_over_family_count
median(teosinte.values)
## [1] 26.83654
wilcox.test(landraces.values,teosinte.values)
## 
##  Wilcoxon rank sum test
## 
## data:  landraces.values and teosinte.values
## W = 539, p-value = 0.5301
## alternative hypothesis: true location shift is not equal to 0

Filtering xo events, removing families with <20 selfed progeny and of the three landrace families with outlier xo/progeny.count

dim(teosinte.merged.xo)
## [1] 126736      9
dim(landraces.merged.xo)
## [1] 122206      9
#number of teosinte families
length(unique(teosinte.merged.xo$Mother))
## [1] 49
#number of landrace families
length(unique(landraces.merged.xo$parent1))
## [1] 35
teosinte.merged.xo<-teosinte.merged.xo[teosinte.merged.xo$family_selfed_count>20,]
landraces.merged.xo<-landraces.merged.xo[landraces.merged.xo$family_selfed_count>20
                                         & !(landraces.merged.xo$parent1 %in% offending_families),]


dim(teosinte.merged.xo)
## [1] 106294      9
dim(landraces.merged.xo)
## [1] 93874     9
teosinte.moms<-unique(teosinte.merged.xo$Mother)
#number of teosinte families left after filtering
length(teosinte.moms)
## [1] 41
#number of landrace families left after filtering
landraces.moms<-unique(landraces.merged.xo$parent1)
length(landraces.moms)
## [1] 24

Plotting filtered data

Recombination rate plot for teosinte,30 bins total

ggplot(teosinte.merged.xo,aes(x=start))+geom_histogram(bins=500)+facet_wrap(~chr,ncol=2)

Recombination rate plot for maize landraces,30 bins total

ggplot(landraces.merged.xo,aes(x=start))+geom_histogram(bins=500)+facet_wrap(~chr,ncol=2)

And now actually plotting by family

Chr2, families 1:10

ggplot(data=filter(teosinte.merged.xo,chr==2,Mother %in% teosinte.moms[1:10],start>0),aes(x=start,y=(..density..)))+geom_freqpoly(mapping=aes(color=Mother),binwidth=1e7)

#+theme(legend.position="none")

Chr2, families 11:20

 ggplot(data=filter(teosinte.merged.xo,chr==2,Mother %in% teosinte.moms[11:20],start>0 ),aes(x=start,y=..density..))+geom_freqpoly(mapping=aes(color=Mother),binwidth=1e7)

#+theme(legend.position="none")

Chr2, families 21:30

 ggplot(data=filter(teosinte.merged.xo,chr==2,Mother %in% teosinte.moms[21:30],start>0 ),aes(x=start,y=..density..))+geom_freqpoly(mapping=aes(color=Mother),binwidth=1e7)

Chr2, families 31:41

 ggplot(data=filter(teosinte.merged.xo,chr==2,Mother %in% teosinte.moms[31:40],start>0 ),aes(x=start,y=..density..))+geom_freqpoly(mapping=aes(color=Mother),binwidth=1e7)

Chr2, families ALL FAMILIES

 ggplot(data=filter(teosinte.merged.xo,chr==2),aes(x=start,y=..density..))+geom_freqpoly(mapping=aes(color=Mother),alpha=1/4,binwidth=1e7,show.legend=FALSE)

ALL FAMILIES ALL CHROMOSOMES

 ggplot(data=teosinte.merged.xo,aes(x=start,y=..density..))+geom_freqpoly(mapping=aes(color=Mother),alpha=1/4,binwidth=1e7,show.legend=FALSE)+facet_wrap(~chr,ncol=2,scales = "free")