Set dir! Ggplot!
setwd("~/projects/R_reports")
library(ggplot2)
Load maize sorghum (or rice) dn/ds from COGE, fix stuff (see comments). Get deleterious allele and association data from Figshare.
# dn/ds from Coge. separate by ||. remove extra columns with no header (WTF
# Coge??). Remove GEVO link in header (no data, WTF Coge??). Remove # and
# extra lines of headers (WTF Coge??) note $_=~s/_T\\d+//g; which removes
# transcript info!!
# rice:
# http://genomevolution.org/CoGe//data/diags//16890/9199/16890_9199.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.gcoords.ks
# sorghum:
# http://genomevolution.org/CoGe//data/diags//16896/9199/16896_9199.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.gcoords.ks
system("curl -o - http://genomevolution.org/CoGe//data/diags//16896/9199/16896_9199.CDS-CDS.last.tdd10.cs0.filtered.dag.all.go_D20_g10_A5.aligncoords.gcoords.ks | tail +5 | perl -ne 'BEGIN{$kn=0}; $kn+=$_=~m/Kn/ ? 1 : 0; $_=~s/#//g; $_=~s/\\|\\|/\t/g; $_=~s/_T\\d+//g; if( $_=~m/Kn/ ){ $_=~s/GEVO_link//; print $_ unless $kn>1; } else{ @array=split(/\\t/,$_); splice(@array,10,1); splice(@array,21,1); print join(\"\\t\",@array); }' > data/dnds.txt ")
dnds <- read.table("data/dnds.txt", header = T)
# set 'name2' to be 'Gene'
names(dnds)[which(names(dnds) == "name2")] = "Gene"
# heterosis data f/m Sofiane
system("curl -o data/het_data.txt http://files.figshare.com/1293877/het_data.txt")
het <- read.table("data/het_data.txt", header = T)
Filter matches and join
# minimum %ID
temp <- subset(dnds, dnds$percent_id2 > 70)
# length
temp <- subset(temp, abs(temp$stop2.1 - temp$start2.1) > 500)
# Ks
temp <- subset(temp, temp$Ks > 0)
# join
het_dnds <- merge(het, temp, by = "Gene")
knks <- het_dnds$Kn/het_dnds$Ks
“C” = complementation (presence of deleterious alleles). More positive selection for “C” than “NotC”
chisq.test(c(length(subset(knks, het_dnds$DelAll == "C" & knks > 1)), length(subset(knks,
het_dnds$DelAll == "NotC" & knks > 1)), table(het_dnds$DelAll)[1], table(het_dnds$DelAll)[2]))
##
## Chi-squared test for given probabilities
##
## data: c(length(subset(knks, het_dnds$DelAll == "C" & knks > 1)), length(subset(knks, het_dnds$DelAll == "NotC" & knks > 1)), table(het_dnds$DelAll)[1], table(het_dnds$DelAll)[2])
## X-squared = 12424, df = 3, p-value < 2.2e-16
dn/ds significantly higher for “C” genes than “NotC”
other_loci = data.frame(knks, het_dnds$DelAll)
colnames(other_loci) = c("knks", "del")
# with all loci
summary(lm(data = other_loci, knks ~ del))
##
## Call:
## lm(formula = knks ~ del, data = other_loci)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.22 -0.14 -0.07 0.02 98.80
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2249 0.0219 10.25 <2e-16 ***
## delNotC -0.0135 0.0371 -0.36 0.72
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.82 on 10574 degrees of freedom
## Multiple R-squared: 1.25e-05, Adjusted R-squared: -8.21e-05
## F-statistic: 0.132 on 1 and 10574 DF, p-value: 0.716
# without positive selected
other_loci <- subset(other_loci, other_loci$knks <= 1)
summary(lm(data = other_loci, knks ~ del))
##
## Call:
## lm(formula = knks ~ del, data = other_loci)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.1844 -0.0947 -0.0306 0.0622 0.8267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.18444 0.00162 113.64 < 2e-16 ***
## delNotC -0.02081 0.00275 -7.58 3.9e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.134 on 10547 degrees of freedom
## Multiple R-squared: 0.00541, Adjusted R-squared: 0.00532
## F-statistic: 57.4 on 1 and 10547 DF, p-value: 3.87e-14
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2",
"#D55E00", "#CC79A7")
ggplot(data = other_loci, aes(knks, fill = del)) + geom_histogram(alpha = 0.8) +
scale_fill_manual("", values = cbPalette, labels = c("Del", "None")) + xlab(expression(K[N]/K[S])) #+geom_vline(xintercept=median(other_loci$knks[other_loci$del=='NotC']),colour=cbPalette[2],size=1)+geom_vline(xintercept=median(other_loci$knks[other_loci$del=='C']),colour=cbPalette[1],size=1)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.