Purifying selection deleterious alleles

Set dir! Ggplot!

setwd("~/projects/R_reports")
library(ggplot2)

Load maize sorghum (or rice) dn/ds from COGE, fix stuff (see comments). Get deleterious allele and association data from Figshare.

# dn/ds from Coge. separate by ||. remove extra columns with no header (WTF
# Coge??). Remove GEVO link in header (no data, WTF Coge??). Remove # and
# extra lines of headers (WTF Coge??)  note $_=~s/_T\\d+//g; which removes
# transcript info!!

# rice:
# http://genomevolution.org/CoGe//data/diags//16890/9199/16890_9199.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.gcoords.ks
# sorghum:
# http://genomevolution.org/CoGe//data/diags//16896/9199/16896_9199.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.gcoords.ks
system("curl -o - http://genomevolution.org/CoGe//data/diags//16896/9199/16896_9199.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.gcoords.ks | tail +5 | perl -ne 'BEGIN{$kn=0}; $kn+=$_=~m/Kn/ ? 1 : 0; $_=~s/#//g; $_=~s/\\|\\|/\t/g; $_=~s/_T\\d+//g; if( $_=~m/Kn/ ){ $_=~s/GEVO_link//; print $_ unless $kn>1; } else{ @array=split(/\\t/,$_); splice(@array,10,1); splice(@array,21,1); print join(\"\\t\",@array); }'  > data/dnds.txt ")
dnds <- read.table("data/dnds.txt", header = T)

# set 'name2' to be 'Gene'
names(dnds)[which(names(dnds) == "name2")] = "Gene"

# heterosis data f/m Sofiane
system("curl -o data/het_data.txt http://files.figshare.com/1293877/het_data.txt")
het <- read.table("data/het_data.txt", header = T)

Filter matches and join

# minimum %ID
temp <- subset(dnds, dnds$percent_id2 > 70)

# length
temp <- subset(temp, abs(temp$stop2.1 - temp$start2.1) > 500)

# Ks
temp <- subset(temp, temp$Ks > 0)

# join
het_dnds <- merge(het, temp, by = "Gene")
knks <- het_dnds$Kn/het_dnds$Ks

“C” = complementation (presence of deleterious alleles). More positive selection for “C” than “NotC”

chisq.test(c(length(subset(knks, het_dnds$DelAll == "C" & knks > 1)), length(subset(knks, 
    het_dnds$DelAll == "NotC" & knks > 1)), table(het_dnds$DelAll)[1], table(het_dnds$DelAll)[2]))

## 
##  Chi-squared test for given probabilities
## 
## data:  c(length(subset(knks, het_dnds$DelAll == "C" & knks > 1)), length(subset(knks,     het_dnds$DelAll == "NotC" & knks > 1)), table(het_dnds$DelAll)[1],     table(het_dnds$DelAll)[2])
## X-squared = 12424, df = 3, p-value < 2.2e-16

dn/ds significantly higher for “C” genes than “NotC”

other_loci = data.frame(knks, het_dnds$DelAll)
colnames(other_loci) = c("knks", "del")

# with all loci
summary(lm(data = other_loci, knks ~ del))

## 
## Call:
## lm(formula = knks ~ del, data = other_loci)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -0.22  -0.14  -0.07   0.02  98.80 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.2249     0.0219   10.25   <2e-16 ***
## delNotC      -0.0135     0.0371   -0.36     0.72    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.82 on 10574 degrees of freedom
## Multiple R-squared:  1.25e-05,   Adjusted R-squared:  -8.21e-05 
## F-statistic: 0.132 on 1 and 10574 DF,  p-value: 0.716


# without positive selected
other_loci <- subset(other_loci, other_loci$knks <= 1)
summary(lm(data = other_loci, knks ~ del))

## 
## Call:
## lm(formula = knks ~ del, data = other_loci)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.1844 -0.0947 -0.0306  0.0622  0.8267 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.18444    0.00162  113.64  < 2e-16 ***
## delNotC     -0.02081    0.00275   -7.58  3.9e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.134 on 10547 degrees of freedom
## Multiple R-squared:  0.00541,    Adjusted R-squared:  0.00532 
## F-statistic: 57.4 on 1 and 10547 DF,  p-value: 3.87e-14


cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", 
    "#D55E00", "#CC79A7")
ggplot(data = other_loci, aes(knks, fill = del)) + geom_histogram(alpha = 0.8) + 
    scale_fill_manual("", values = cbPalette, labels = c("Del", "None")) + xlab(expression(K[N]/K[S]))  #+geom_vline(xintercept=median(other_loci$knks[other_loci$del=='NotC']),colour=cbPalette[2],size=1)+geom_vline(xintercept=median(other_loci$knks[other_loci$del=='C']),colour=cbPalette[1],size=1)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-5