library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read data, remove NA and stupid columns

x<-read.table("/Users/jri/Desktop/SupplementalDataset2_v2.txt",header=T) %>% filter(Tissue == "Leaf",is.na(TeosinteDepth.Parent)==FALSE,is.na(MaizeDepth.Parent)==FALSE,is.na(logRatio.Hybrid)==FALSE) %>% select(-NoSNPs.Parent,-TeosinteDepth.Parent,-MaizeDepth.Parent,-Tissue,-TeosinteDepth.Hybrid,-NoSNPs.Hybrid,-MaizeSequenceObtained,-HybridSequenceObtained,-TeosinteSequenceObtained,-MaizeDepth.Hybrid)
head(x)
##               Gene MaizeParent TeosinteParent logRatio.Parent BET.p.Parent
## 1 AC147602.5_FG004         B73           TI09      -0.6199690 1.231914e-03
## 2 AC147602.5_FG004         Ki3           TI09      -3.1096245 5.578276e-30
## 3 AC147602.5_FG004        Mo17           TI09       0.1383282 4.118196e-01
## 4 AC147602.5_FG004        Oh43           TI09      -1.3351842 4.718840e-11
## 5 AC147602.5_FG004        Oh43           TI15      -0.3174822 2.143149e-01
## 6 AC148152.3_FG001        Mo17           TI09      -1.5849625 1.459961e-01
##   logRatio.Hybrid BET.p.Hybrid
## 1     -0.36572912 6.119416e-08
## 2     -0.35981553 5.798913e-05
## 3     -0.73118324 6.217471e-05
## 4      0.05703095 8.015050e-01
## 5     -0.25063680 7.289346e-03
## 6     -2.58496250 2.500000e-01

Add columns for hybrid name, whether the hybrid and parents are in same direction (“direction”), whether genes in the same direction shows sig. diff. between maize and teo (“domestication”), and remove parent columns

y<-mutate(x,hybrid=paste(TeosinteParent,MaizeParent,sep="_"),direction=ifelse(logRatio.Parent*logRatio.Hybrid>0,"same","different")) %>% mutate(domestication=ifelse(direction=="same",ifelse(BET.p.Parent<0.05,1,0),NA)) %>% select(-MaizeParent,-TeosinteParent)
head(y)
##               Gene logRatio.Parent BET.p.Parent logRatio.Hybrid
## 1 AC147602.5_FG004      -0.6199690 1.231914e-03     -0.36572912
## 2 AC147602.5_FG004      -3.1096245 5.578276e-30     -0.35981553
## 3 AC147602.5_FG004       0.1383282 4.118196e-01     -0.73118324
## 4 AC147602.5_FG004      -1.3351842 4.718840e-11      0.05703095
## 5 AC147602.5_FG004      -0.3174822 2.143149e-01     -0.25063680
## 6 AC148152.3_FG001      -1.5849625 1.459961e-01     -2.58496250
##   BET.p.Hybrid    hybrid direction domestication
## 1 6.119416e-08  TI09_B73      same             1
## 2 5.798913e-05  TI09_Ki3      same             1
## 3 6.217471e-05 TI09_Mo17 different            NA
## 4 8.015050e-01 TI09_Oh43 different            NA
## 5 7.289346e-03 TI15_Oh43      same             0
## 6 2.500000e-01 TI09_Mo17      same             0

Now for each gene, count how many comparisons there are with data (“count”), how many are in the same direction, the proportion of those in the same direction that are diff. between maize and teosinte (“dom”), and whether more than 70% are in the same direction as required by the ABC list (“abc”)

bob<-group_by(y,Gene) %>% summarise(count=n(),same=sum(direction=="same"),dom=mean(domestication,na.rm=T)) %>%mutate(abc=ifelse(same>=0.7*count,"ABC","no"))
head(bob)
## Source: local data frame [6 x 5]
## 
##               Gene count  same       dom   abc
##             (fctr) (int) (int)     (dbl) (chr)
## 1 AC147602.5_FG004     5     3 0.6666667    no
## 2 AC148152.3_FG001     1     1 0.0000000   ABC
## 3 AC148152.3_FG005    15     9 0.4444444    no
## 4 AC148152.3_FG008    23    14 0.9285714    no
## 5 AC148167.6_FG001    28    16 0.4375000    no
## 6 AC149475.2_FG002     9     3 1.0000000    no

I don’t really know how to decide which genes are “domestication genes” since many show significant diffs between maize and teosinte in only some comparisons. Zak does a weighted average of depth across all hybrids but that sounds painful and I’m lazy. Let’s see what this list looks like. I’m going with \(>50\)% of the genes in the same direction between hybrid and parent.

sue=filter(bob,dom>0.5,abc=="ABC") 
head(sue)
## Source: local data frame [6 x 5]
## 
##               Gene count  same       dom   abc
##             (fctr) (int) (int)     (dbl) (chr)
## 1 AC149475.2_FG005    29    21 0.6190476   ABC
## 2 AC149475.2_FG007     1     1 1.0000000   ABC
## 3 AC149828.2_FG002    21    17 0.8823529   ABC
## 4 AC149829.2_FG006    29    26 0.7307692   ABC
## 5 AC155376.2_FG005    23    17 0.5294118   ABC
## 6 AC155434.2_FG006    25    20 0.6000000   ABC
count(sue)
## Source: local data frame [1 x 1]
## 
##       n
##   (int)
## 1  3925