library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Read data, remove NA and stupid columns
x<-read.table("/Users/jri/Desktop/SupplementalDataset2_v2.txt",header=T) %>% filter(Tissue == "Leaf",is.na(TeosinteDepth.Parent)==FALSE,is.na(MaizeDepth.Parent)==FALSE,is.na(logRatio.Hybrid)==FALSE) %>% select(-NoSNPs.Parent,-TeosinteDepth.Parent,-MaizeDepth.Parent,-Tissue,-TeosinteDepth.Hybrid,-NoSNPs.Hybrid,-MaizeSequenceObtained,-HybridSequenceObtained,-TeosinteSequenceObtained,-MaizeDepth.Hybrid)
head(x)
## Gene MaizeParent TeosinteParent logRatio.Parent BET.p.Parent
## 1 AC147602.5_FG004 B73 TI09 -0.6199690 1.231914e-03
## 2 AC147602.5_FG004 Ki3 TI09 -3.1096245 5.578276e-30
## 3 AC147602.5_FG004 Mo17 TI09 0.1383282 4.118196e-01
## 4 AC147602.5_FG004 Oh43 TI09 -1.3351842 4.718840e-11
## 5 AC147602.5_FG004 Oh43 TI15 -0.3174822 2.143149e-01
## 6 AC148152.3_FG001 Mo17 TI09 -1.5849625 1.459961e-01
## logRatio.Hybrid BET.p.Hybrid
## 1 -0.36572912 6.119416e-08
## 2 -0.35981553 5.798913e-05
## 3 -0.73118324 6.217471e-05
## 4 0.05703095 8.015050e-01
## 5 -0.25063680 7.289346e-03
## 6 -2.58496250 2.500000e-01
Add columns for hybrid name, whether the hybrid and parents are in same direction (“direction”), whether genes in the same direction shows sig. diff. between maize and teo (“domestication”), and remove parent columns
y<-mutate(x,hybrid=paste(TeosinteParent,MaizeParent,sep="_"),direction=ifelse(logRatio.Parent*logRatio.Hybrid>0,"same","different")) %>% mutate(domestication=ifelse(direction=="same",ifelse(BET.p.Parent<0.05,1,0),NA)) %>% select(-MaizeParent,-TeosinteParent)
head(y)
## Gene logRatio.Parent BET.p.Parent logRatio.Hybrid
## 1 AC147602.5_FG004 -0.6199690 1.231914e-03 -0.36572912
## 2 AC147602.5_FG004 -3.1096245 5.578276e-30 -0.35981553
## 3 AC147602.5_FG004 0.1383282 4.118196e-01 -0.73118324
## 4 AC147602.5_FG004 -1.3351842 4.718840e-11 0.05703095
## 5 AC147602.5_FG004 -0.3174822 2.143149e-01 -0.25063680
## 6 AC148152.3_FG001 -1.5849625 1.459961e-01 -2.58496250
## BET.p.Hybrid hybrid direction domestication
## 1 6.119416e-08 TI09_B73 same 1
## 2 5.798913e-05 TI09_Ki3 same 1
## 3 6.217471e-05 TI09_Mo17 different NA
## 4 8.015050e-01 TI09_Oh43 different NA
## 5 7.289346e-03 TI15_Oh43 same 0
## 6 2.500000e-01 TI09_Mo17 same 0
Now for each gene, count how many comparisons there are with data (“count”), how many are in the same direction, the proportion of those in the same direction that are diff. between maize and teosinte (“dom”), and whether more than 70% are in the same direction as required by the ABC list (“abc”)
bob<-group_by(y,Gene) %>% summarise(count=n(),same=sum(direction=="same"),dom=mean(domestication,na.rm=T)) %>%mutate(abc=ifelse(same>=0.7*count,"ABC","no"))
head(bob)
## Source: local data frame [6 x 5]
##
## Gene count same dom abc
## (fctr) (int) (int) (dbl) (chr)
## 1 AC147602.5_FG004 5 3 0.6666667 no
## 2 AC148152.3_FG001 1 1 0.0000000 ABC
## 3 AC148152.3_FG005 15 9 0.4444444 no
## 4 AC148152.3_FG008 23 14 0.9285714 no
## 5 AC148167.6_FG001 28 16 0.4375000 no
## 6 AC149475.2_FG002 9 3 1.0000000 no
I don’t really know how to decide which genes are “domestication genes” since many show significant diffs between maize and teosinte in only some comparisons. Zak does a weighted average of depth across all hybrids but that sounds painful and I’m lazy. Let’s see what this list looks like. I’m going with \(>50\)% of the genes in the same direction between hybrid and parent.
sue=filter(bob,dom>0.5,abc=="ABC")
head(sue)
## Source: local data frame [6 x 5]
##
## Gene count same dom abc
## (fctr) (int) (int) (dbl) (chr)
## 1 AC149475.2_FG005 29 21 0.6190476 ABC
## 2 AC149475.2_FG007 1 1 1.0000000 ABC
## 3 AC149828.2_FG002 21 17 0.8823529 ABC
## 4 AC149829.2_FG006 29 26 0.7307692 ABC
## 5 AC155376.2_FG005 23 17 0.5294118 ABC
## 6 AC155434.2_FG006 25 20 0.6000000 ABC
count(sue)
## Source: local data frame [1 x 1]
##
## n
## (int)
## 1 3925