Lovell — Nov 17, 2014, 6:26 PM
# gene expression remove outliers
rm(list=ls())
library(outliers)
library(car)
Warning: package 'car' was built under R version 3.1.1
library(ggplot2)
setwd("~/Desktop/dropbox/pickle2014")
counts.w.outliers<-read.csv("Hal_Fil_DD_Count_Matrix 2.csv")
#get categories
genos<-colnames(counts.w.outliers)[-1]
geno<-ifelse(grepl("HAL",genos),"Hal",ifelse(grepl("FIL",genos),"Fil","AP13"))
trt<-ifelse(grepl("Wet",genos),"Wet","dry")
time<-ifelse(grepl("Mid",genos),"Mid",ifelse(grepl("Pre",genos),"Pre","Eve"))
info<-data.frame(cbind(genos,geno,trt,time))
counts<-counts.w.outliers
outlier.info<-data.frame()
for(i in 2:length(colnames(counts))){
in.dat<-counts[,c(1,i)]
in.dat$log.tr<-log(in.dat[,2])
dat<-in.dat$log.tr[!is.infinite(in.dat$log.tr)]
p.out<-0
n.out<-0
while(p.out<0.05/length(rownames(counts))){
p.out<-chisq.out.test(dat[!is.na(dat)], variance=var(dat[!is.na(dat)]))$p.value
dat<-dat[!dat==max(dat)]
n.out<-n.out+1
}
if(n.out>0){
counts[,i][counts[,i] %in% rev(sort(in.dat[,2]))[1:n.out]]<-NA
}
outlier.info<-rbind(outlier.info,data.frame(colnames(counts)[i],n.out,sum(counts.w.outliers[,i]),sum(counts[,i], na.rm=T)))
}
colnames(outlier.info)<-c("genos","n.outliers","library.size.initial","library.size.no.outliers")
print(outlier.info)
genos n.outliers library.size.initial library.size.no.outliers
1 HAL229DryPre 2 550180 457273
2 HAL219WetPre 1 160815 158679
3 HAL28WetEve 2 328068 311060
4 HAL220DryPre 2 740629 677101
5 HAL25DryMid 2 2128973 1745820
6 HAL25DryEve 1 667235 656797
7 HAL27DryPre 1 326328 322835
8 HAL211WetEve 1 276397 273882
9 HAL215WetMid 2 664322 608083
10 HAL219WetMid 2 613023 534555
11 HAL223DryMid 1 222872 220404
12 HAL28WetMid 1 324281 321820
13 HAL29DryPre 1 1137593 1121192
14 HAL226WetEve 2 213479 207763
15 HAL221DryPre 1 587819 571337
16 HAL22WetMid 1 163703 162727
17 HAL215WetEve 1 790952 770782
18 HAL213WetPre 1 817202 804122
19 HAL213WetEve 1 165033 163578
20 HAL212DryEve 14 2436 2106
21 HAL222WetPre 1 1123843 1081069
22 HAL216DryPre 1 609795 602331
23 HAL222WetMid 2 58252 56925
24 HAL24WetEve 2 675494 633178
25 HAL218DryPre 2 310262 293368
26 HAL220DryMid 5 41749 39941
27 HAL228WetEve 2 487077 454284
28 HAL225DryPre 1 248290 243012
29 HAL214DryEve 2 352992 306193
30 HAL210WetEve 2 410821 394848
31 HAL24WetPre 3 305295 279846
32 HAL221DryEve 2 861017 804802
33 HAL29DryMid 2 241480 223471
34 HAL27DryEve 1 426161 420036
35 HAL22WetEve 1 1214801 1175712
36 HAL220DryEve 2 246801 239513
37 HAL21DryPre 1 56676 56238
38 HAL224WetEve 1 139851 138699
39 HAL25DryPre 1 387716 378505
40 HAL222WetEve 1 1667996 1622178
41 HAL230WetPre 1 316151 311892
42 HAL224WetMid 1 286007 284181
43 HAL23DryEve 1 1506981 1412053
44 HAL223DryEve 3 135 112
45 HAL214DryPre 2 326073 312935
46 HAL226WetMid 22 6866 5656
47 HAL227DryPre 1 167582 165474
48 HAL230WetMid 2 614493 579761
49 HAL225DryMid 2 1554949 1343524
50 HAL210WetPre 5 87507 79590
51 HAL23DryPre 1 1816866 1751384
52 HAL29DryEve 2 872183 747396
53 HAL213WetMid 1 268927 266646
54 HAL226WetPre 1 316239 311754
55 HAL23DryMid 2 204618 185450
56 HAL227DryEve 1 1122982 1086102
57 HAL22WetPre 1 1356907 1318269
58 HAL24WetMid 1 214440 210879
59 HAL218DryEve 1 245980 241669
60 HAL216DryEve 2 193163 187594
61 HAL224WetPre 2 105442 102960
62 HAL219WetEve 2 883693 822622
63 HAL218DryMid 2 808487 681672
64 HAL230WetEve 2 414571 396473
65 HAL211WetPre 11 1450 1217
66 HAL214DryMid 11 9603 8446
67 HAL229DryEve 1 350048 343163
68 HAL217WetMid 1 396354 390615
69 HAL212DryPre 2 408252 353185
70 HAL212DryMid 2 359261 298293
71 HAL227DryMid 1 324904 322049
72 HAL26WetPre 1 247308 241867
73 HAL28WetPre 2 59373 57965
74 HAL21DryMid 2 243325 232868
75 HAL27DryMid 9 8588 7733
76 HAL223DryPre 5 57129 54410
77 HAL26WetEve 8 762 671
78 HAL225DryEve 1 443304 434766
79 HAL211WetMid 1 1159566 1107969
80 HAL26WetMid 1 66501 66099
81 HAL229DryMid 2 467298 437746
82 HAL228WetMid 1 387287 380338
83 HAL221DryMid 1 309589 307751
84 HAL217WetEve 2 761227 663191
85 HAL217WetPre 1 120362 119145
86 HAL216DryMid 2 540726 499107
87 HAL210WetMid 3 34215 33438
88 HAL21DryEve 1 142734 141592
89 HAL228WetPre 14 5594 4757
90 HAL215WetPre 1 1854443 1781424
91 FIL242DryPre 1 647821 635792
92 FIL243WetPre 2 673534 564566
93 FIL248DryEve 1 146644 144683
94 FIL249WetPre 1 449179 434560
95 FIL245WetMid 1 267838 263978
96 FIL247WetPre 1 107440 106498
97 FIL236WetPre 3 84245 77974
98 FIL232WetEve 1 23119 22934
99 FIL232WetPre 1 612058 603368
100 FIL240WetPre 1 438051 428123
101 FIL254WetEve 1 485859 480886
102 FIL251DryMid 11 13499 11819
103 FIL249WetEve 1 277327 274930
104 FIL245WetEve 2 171803 166863
105 FIL248DryMid 1 210565 209728
106 FIL231DryPre 2 202681 181959
107 FIL236WetEve 2 446401 409265
108 FIL241WetMid 2 62615 60385
109 FIL237DryPre 1 1853917 1759193
110 FIL255DryPre 3 157 138
111 FIL247WetEve 1 429009 424097
112 FIL244DryEve 2 371433 355262
113 FIL250DryPre 1 425237 419412
114 FIL246DryPre 2 479234 453647
115 FIL243WetMid 1 510519 507142
116 FIL255DryEve 3 128877 124068
117 FIL235DryMid 1 308895 306710
118 FIL259DryEve 1 137464 136287
119 FIL248DryPre 1 464027 452495
120 FIL233DryEve 1 83404 82835
121 FIL242DryMid 1 177618 175677
122 FIL243WetEve 1 748695 740058
123 FIL258WetMid 2 552072 485121
124 FIL237DryMid 1 101306 100079
125 FIL251DryPre 1 549092 544517
126 FIL239DryEve 2 246509 239612
127 FIL232WetMid 1 264656 259609
128 FIL257DryMid 1 574265 553064
129 FIL237DryEve 2 1035512 936596
130 FIL256WetEve 2 271859 262554
131 FIL259DryPre 7 15352 14507
132 FIL256WetMid 2 114084 103518
133 FIL236WetMid 1 187701 185623
134 FIL238WetPre 1 852021 819878
135 FIL231DryEve 1 299142 295324
136 FIL253DryMid 1 162225 161416
137 FIL239DryPre 1 510196 506517
138 FIL231DryMid 2 238438 227051
139 FIL244DryPre 2 340542 313712
140 FIL252WetMid 2 487660 428603
141 FIL245WetPre 1 251573 248689
142 FIL235DryPre 1 317803 313233
143 FIL257DryEve 1 1013158 988929
144 FIL250DryMid 1 230745 228501
145 FIL242DryEve 1 245291 241593
146 FIL240WetMid 11 796 659
147 FIL244DryMid 1 348221 343541
148 FIL233DryPre 1 391264 385326
149 FIL247WetMid 1 422711 416185
150 FIL240WetEve 1 452083 448651
151 FIL254WetPre 1 206184 203328
152 FIL233DryMid 1 163723 162069
153 FIL234WetEve 1 1124484 1089674
154 FIL260WetMid 2 1031199 945648
155 FIL246DryMid 1 416968 411943
156 FIL253DryEve 1 246477 243678
157 FIL256WetPre 1 189867 187433
158 FIL258WetPre 2 1397877 1234380
159 FIL254WetMid 1 980504 961721
160 FIL251DryEve 1 273214 271698
161 FIL260WetEve 1 780101 749617
162 FIL252WetEve 1 946791 931044
163 FIL249WetMid 2 392959 366267
164 FIL234WetPre 1 783604 745320
165 FIL257DryPre 1 1695244 1641154
166 FIL252WetPre 2 327490 305975
167 FIL234WetMid 1 358683 352925
168 FIL238WetMid 1 2014308 1963216
169 FIL259DryMid 14 6689 5844
170 FIL246DryEve 1 168119 166802
171 FIL255DryMid 1 550949 548223
172 FIL241WetPre 1 164034 161980
173 FIL260WetPre 2 386035 363832
174 FIL238WetEve 1 104893 104193
175 FIL250DryEve 1 201798 199614
176 FIL253DryPre 2 538675 501829
177 FIL258WetEve 1 455656 450366
178 FIL239DryMid 2 508448 485571
179 FIL235DryEve 2 561011 441708
180 FIL241WetEve 1 916935 879679
all.outlier.info<-merge(info,outlier.info, by="genos")
all.outlier.info$genos <-factor(all.outlier.info$genos, levels=all.outlier.info[order(all.outlier.info$library.size.initial), "genos"])
ggplot(all.outlier.info, aes(x=genos, y=library.size.initial, color=geno, shape=geno))+
geom_point( size=4)+
facet_grid(trt~time, scales="free_x")+
scale_shape(solid=FALSE)+theme_bw()
all.outlier.info$genos <-factor(all.outlier.info$genos, levels=all.outlier.info[order(all.outlier.info$library.size.no.outliers), "genos"])
ggplot(all.outlier.info, aes(x=genos, y=library.size.no.outliers, color=geno, shape=geno))+
geom_point( size=4)+
facet_grid(trt~time, scales="free_x")+
scale_shape(solid=FALSE)+theme_bw()
all.outlier.info$genos <-factor(all.outlier.info$genos, levels=all.outlier.info[order(all.outlier.info$n.outliers), "genos"])
ggplot(all.outlier.info, aes(x=genos, y=n.outliers, color=geno, shape=geno))+
geom_point( size=4)+
facet_grid(trt~time, scales="free_x")+
scale_shape(solid=FALSE)+theme_bw()
plot(all.outlier.info$library.size.initial,all.outlier.info$library.size.no.outliers)
#only the larger libraries are seriously affected by removing outliers
hist(all.outlier.info$library.size.initial, breaks=100)
hist(all.outlier.info$library.size.no.outliers, breaks=100)
hist(all.outlier.info$library.size.no.outliers, breaks=100, xlim=c(0,200000))
#there is a relatively distinct cuttoff where 25 libraries look to have low size. Remove these
library.size<-apply(counts.w.outliers[,-1],2,sum)
print(small.libs<-names(library.size[!library.size>100000]))
[1] "HAL212DryEve" "HAL222WetMid" "HAL220DryMid" "HAL21DryPre"
[5] "HAL223DryEve" "HAL226WetMid" "HAL210WetPre" "HAL211WetPre"
[9] "HAL214DryMid" "HAL28WetPre" "HAL27DryMid" "HAL223DryPre"
[13] "HAL26WetEve" "HAL26WetMid" "HAL210WetMid" "HAL228WetPre"
[17] "FIL236WetPre" "FIL232WetEve" "FIL251DryMid" "FIL241WetMid"
[21] "FIL255DryPre" "FIL233DryEve" "FIL259DryPre" "FIL240WetMid"
[25] "FIL259DryMid"
#here is a summary of the makeup of these genotypes
table(info[info$genos %in% small.libs,c(3,4,2)])
, , geno = Fil
time
trt Eve Mid Pre
dry 1 2 2
Wet 1 2 1
, , geno = Hal
time
trt Eve Mid Pre
dry 2 3 2
Wet 1 4 4
counts<-counts[,all.outlier.info$genos[all.outlier.info$library.size.no.outliers>100000]]
write.csv(counts, file="pickle2014_rawcounts_nooutliers.csv", row.names=F)