pickle2014_draft1_countsdataprocessing.R

Lovell — Nov 17, 2014, 6:26 PM

# gene expression remove outliers
rm(list=ls())
library(outliers)
library(car)
Warning: package 'car' was built under R version 3.1.1
library(ggplot2)
setwd("~/Desktop/dropbox/pickle2014")
counts.w.outliers<-read.csv("Hal_Fil_DD_Count_Matrix 2.csv")
#get categories
genos<-colnames(counts.w.outliers)[-1]
geno<-ifelse(grepl("HAL",genos),"Hal",ifelse(grepl("FIL",genos),"Fil","AP13"))
trt<-ifelse(grepl("Wet",genos),"Wet","dry")
time<-ifelse(grepl("Mid",genos),"Mid",ifelse(grepl("Pre",genos),"Pre","Eve"))
info<-data.frame(cbind(genos,geno,trt,time))
counts<-counts.w.outliers
outlier.info<-data.frame()
for(i in 2:length(colnames(counts))){
  in.dat<-counts[,c(1,i)]
  in.dat$log.tr<-log(in.dat[,2])
  dat<-in.dat$log.tr[!is.infinite(in.dat$log.tr)]
  p.out<-0
  n.out<-0
  while(p.out<0.05/length(rownames(counts))){
    p.out<-chisq.out.test(dat[!is.na(dat)], variance=var(dat[!is.na(dat)]))$p.value
    dat<-dat[!dat==max(dat)]
    n.out<-n.out+1
  }
  if(n.out>0){
    counts[,i][counts[,i] %in% rev(sort(in.dat[,2]))[1:n.out]]<-NA
  }
  outlier.info<-rbind(outlier.info,data.frame(colnames(counts)[i],n.out,sum(counts.w.outliers[,i]),sum(counts[,i], na.rm=T)))
}
colnames(outlier.info)<-c("genos","n.outliers","library.size.initial","library.size.no.outliers")
print(outlier.info)
           genos n.outliers library.size.initial library.size.no.outliers
1   HAL229DryPre          2               550180                   457273
2   HAL219WetPre          1               160815                   158679
3    HAL28WetEve          2               328068                   311060
4   HAL220DryPre          2               740629                   677101
5    HAL25DryMid          2              2128973                  1745820
6    HAL25DryEve          1               667235                   656797
7    HAL27DryPre          1               326328                   322835
8   HAL211WetEve          1               276397                   273882
9   HAL215WetMid          2               664322                   608083
10  HAL219WetMid          2               613023                   534555
11  HAL223DryMid          1               222872                   220404
12   HAL28WetMid          1               324281                   321820
13   HAL29DryPre          1              1137593                  1121192
14  HAL226WetEve          2               213479                   207763
15  HAL221DryPre          1               587819                   571337
16   HAL22WetMid          1               163703                   162727
17  HAL215WetEve          1               790952                   770782
18  HAL213WetPre          1               817202                   804122
19  HAL213WetEve          1               165033                   163578
20  HAL212DryEve         14                 2436                     2106
21  HAL222WetPre          1              1123843                  1081069
22  HAL216DryPre          1               609795                   602331
23  HAL222WetMid          2                58252                    56925
24   HAL24WetEve          2               675494                   633178
25  HAL218DryPre          2               310262                   293368
26  HAL220DryMid          5                41749                    39941
27  HAL228WetEve          2               487077                   454284
28  HAL225DryPre          1               248290                   243012
29  HAL214DryEve          2               352992                   306193
30  HAL210WetEve          2               410821                   394848
31   HAL24WetPre          3               305295                   279846
32  HAL221DryEve          2               861017                   804802
33   HAL29DryMid          2               241480                   223471
34   HAL27DryEve          1               426161                   420036
35   HAL22WetEve          1              1214801                  1175712
36  HAL220DryEve          2               246801                   239513
37   HAL21DryPre          1                56676                    56238
38  HAL224WetEve          1               139851                   138699
39   HAL25DryPre          1               387716                   378505
40  HAL222WetEve          1              1667996                  1622178
41  HAL230WetPre          1               316151                   311892
42  HAL224WetMid          1               286007                   284181
43   HAL23DryEve          1              1506981                  1412053
44  HAL223DryEve          3                  135                      112
45  HAL214DryPre          2               326073                   312935
46  HAL226WetMid         22                 6866                     5656
47  HAL227DryPre          1               167582                   165474
48  HAL230WetMid          2               614493                   579761
49  HAL225DryMid          2              1554949                  1343524
50  HAL210WetPre          5                87507                    79590
51   HAL23DryPre          1              1816866                  1751384
52   HAL29DryEve          2               872183                   747396
53  HAL213WetMid          1               268927                   266646
54  HAL226WetPre          1               316239                   311754
55   HAL23DryMid          2               204618                   185450
56  HAL227DryEve          1              1122982                  1086102
57   HAL22WetPre          1              1356907                  1318269
58   HAL24WetMid          1               214440                   210879
59  HAL218DryEve          1               245980                   241669
60  HAL216DryEve          2               193163                   187594
61  HAL224WetPre          2               105442                   102960
62  HAL219WetEve          2               883693                   822622
63  HAL218DryMid          2               808487                   681672
64  HAL230WetEve          2               414571                   396473
65  HAL211WetPre         11                 1450                     1217
66  HAL214DryMid         11                 9603                     8446
67  HAL229DryEve          1               350048                   343163
68  HAL217WetMid          1               396354                   390615
69  HAL212DryPre          2               408252                   353185
70  HAL212DryMid          2               359261                   298293
71  HAL227DryMid          1               324904                   322049
72   HAL26WetPre          1               247308                   241867
73   HAL28WetPre          2                59373                    57965
74   HAL21DryMid          2               243325                   232868
75   HAL27DryMid          9                 8588                     7733
76  HAL223DryPre          5                57129                    54410
77   HAL26WetEve          8                  762                      671
78  HAL225DryEve          1               443304                   434766
79  HAL211WetMid          1              1159566                  1107969
80   HAL26WetMid          1                66501                    66099
81  HAL229DryMid          2               467298                   437746
82  HAL228WetMid          1               387287                   380338
83  HAL221DryMid          1               309589                   307751
84  HAL217WetEve          2               761227                   663191
85  HAL217WetPre          1               120362                   119145
86  HAL216DryMid          2               540726                   499107
87  HAL210WetMid          3                34215                    33438
88   HAL21DryEve          1               142734                   141592
89  HAL228WetPre         14                 5594                     4757
90  HAL215WetPre          1              1854443                  1781424
91  FIL242DryPre          1               647821                   635792
92  FIL243WetPre          2               673534                   564566
93  FIL248DryEve          1               146644                   144683
94  FIL249WetPre          1               449179                   434560
95  FIL245WetMid          1               267838                   263978
96  FIL247WetPre          1               107440                   106498
97  FIL236WetPre          3                84245                    77974
98  FIL232WetEve          1                23119                    22934
99  FIL232WetPre          1               612058                   603368
100 FIL240WetPre          1               438051                   428123
101 FIL254WetEve          1               485859                   480886
102 FIL251DryMid         11                13499                    11819
103 FIL249WetEve          1               277327                   274930
104 FIL245WetEve          2               171803                   166863
105 FIL248DryMid          1               210565                   209728
106 FIL231DryPre          2               202681                   181959
107 FIL236WetEve          2               446401                   409265
108 FIL241WetMid          2                62615                    60385
109 FIL237DryPre          1              1853917                  1759193
110 FIL255DryPre          3                  157                      138
111 FIL247WetEve          1               429009                   424097
112 FIL244DryEve          2               371433                   355262
113 FIL250DryPre          1               425237                   419412
114 FIL246DryPre          2               479234                   453647
115 FIL243WetMid          1               510519                   507142
116 FIL255DryEve          3               128877                   124068
117 FIL235DryMid          1               308895                   306710
118 FIL259DryEve          1               137464                   136287
119 FIL248DryPre          1               464027                   452495
120 FIL233DryEve          1                83404                    82835
121 FIL242DryMid          1               177618                   175677
122 FIL243WetEve          1               748695                   740058
123 FIL258WetMid          2               552072                   485121
124 FIL237DryMid          1               101306                   100079
125 FIL251DryPre          1               549092                   544517
126 FIL239DryEve          2               246509                   239612
127 FIL232WetMid          1               264656                   259609
128 FIL257DryMid          1               574265                   553064
129 FIL237DryEve          2              1035512                   936596
130 FIL256WetEve          2               271859                   262554
131 FIL259DryPre          7                15352                    14507
132 FIL256WetMid          2               114084                   103518
133 FIL236WetMid          1               187701                   185623
134 FIL238WetPre          1               852021                   819878
135 FIL231DryEve          1               299142                   295324
136 FIL253DryMid          1               162225                   161416
137 FIL239DryPre          1               510196                   506517
138 FIL231DryMid          2               238438                   227051
139 FIL244DryPre          2               340542                   313712
140 FIL252WetMid          2               487660                   428603
141 FIL245WetPre          1               251573                   248689
142 FIL235DryPre          1               317803                   313233
143 FIL257DryEve          1              1013158                   988929
144 FIL250DryMid          1               230745                   228501
145 FIL242DryEve          1               245291                   241593
146 FIL240WetMid         11                  796                      659
147 FIL244DryMid          1               348221                   343541
148 FIL233DryPre          1               391264                   385326
149 FIL247WetMid          1               422711                   416185
150 FIL240WetEve          1               452083                   448651
151 FIL254WetPre          1               206184                   203328
152 FIL233DryMid          1               163723                   162069
153 FIL234WetEve          1              1124484                  1089674
154 FIL260WetMid          2              1031199                   945648
155 FIL246DryMid          1               416968                   411943
156 FIL253DryEve          1               246477                   243678
157 FIL256WetPre          1               189867                   187433
158 FIL258WetPre          2              1397877                  1234380
159 FIL254WetMid          1               980504                   961721
160 FIL251DryEve          1               273214                   271698
161 FIL260WetEve          1               780101                   749617
162 FIL252WetEve          1               946791                   931044
163 FIL249WetMid          2               392959                   366267
164 FIL234WetPre          1               783604                   745320
165 FIL257DryPre          1              1695244                  1641154
166 FIL252WetPre          2               327490                   305975
167 FIL234WetMid          1               358683                   352925
168 FIL238WetMid          1              2014308                  1963216
169 FIL259DryMid         14                 6689                     5844
170 FIL246DryEve          1               168119                   166802
171 FIL255DryMid          1               550949                   548223
172 FIL241WetPre          1               164034                   161980
173 FIL260WetPre          2               386035                   363832
174 FIL238WetEve          1               104893                   104193
175 FIL250DryEve          1               201798                   199614
176 FIL253DryPre          2               538675                   501829
177 FIL258WetEve          1               455656                   450366
178 FIL239DryMid          2               508448                   485571
179 FIL235DryEve          2               561011                   441708
180 FIL241WetEve          1               916935                   879679
all.outlier.info<-merge(info,outlier.info, by="genos")
all.outlier.info$genos <-factor(all.outlier.info$genos, levels=all.outlier.info[order(all.outlier.info$library.size.initial), "genos"])
ggplot(all.outlier.info, aes(x=genos, y=library.size.initial, color=geno, shape=geno))+
  geom_point( size=4)+
  facet_grid(trt~time, scales="free_x")+
  scale_shape(solid=FALSE)+theme_bw()

plot of chunk unnamed-chunk-1

all.outlier.info$genos <-factor(all.outlier.info$genos, levels=all.outlier.info[order(all.outlier.info$library.size.no.outliers), "genos"])
ggplot(all.outlier.info, aes(x=genos, y=library.size.no.outliers, color=geno, shape=geno))+
  geom_point( size=4)+
  facet_grid(trt~time, scales="free_x")+
  scale_shape(solid=FALSE)+theme_bw()

plot of chunk unnamed-chunk-1

all.outlier.info$genos <-factor(all.outlier.info$genos, levels=all.outlier.info[order(all.outlier.info$n.outliers), "genos"])
ggplot(all.outlier.info, aes(x=genos, y=n.outliers, color=geno, shape=geno))+
  geom_point( size=4)+
  facet_grid(trt~time, scales="free_x")+
  scale_shape(solid=FALSE)+theme_bw()

plot of chunk unnamed-chunk-1

plot(all.outlier.info$library.size.initial,all.outlier.info$library.size.no.outliers)

plot of chunk unnamed-chunk-1

#only the larger libraries are seriously affected by removing outliers 
hist(all.outlier.info$library.size.initial, breaks=100)

plot of chunk unnamed-chunk-1

hist(all.outlier.info$library.size.no.outliers, breaks=100)

plot of chunk unnamed-chunk-1

hist(all.outlier.info$library.size.no.outliers, breaks=100, xlim=c(0,200000))

plot of chunk unnamed-chunk-1

#there is a relatively distinct cuttoff where 25 libraries look to have low size. Remove these
library.size<-apply(counts.w.outliers[,-1],2,sum)
print(small.libs<-names(library.size[!library.size>100000]))
 [1] "HAL212DryEve" "HAL222WetMid" "HAL220DryMid" "HAL21DryPre" 
 [5] "HAL223DryEve" "HAL226WetMid" "HAL210WetPre" "HAL211WetPre"
 [9] "HAL214DryMid" "HAL28WetPre"  "HAL27DryMid"  "HAL223DryPre"
[13] "HAL26WetEve"  "HAL26WetMid"  "HAL210WetMid" "HAL228WetPre"
[17] "FIL236WetPre" "FIL232WetEve" "FIL251DryMid" "FIL241WetMid"
[21] "FIL255DryPre" "FIL233DryEve" "FIL259DryPre" "FIL240WetMid"
[25] "FIL259DryMid"
#here is a summary of the makeup of these genotypes
table(info[info$genos %in% small.libs,c(3,4,2)])
, , geno = Fil

     time
trt   Eve Mid Pre
  dry   1   2   2
  Wet   1   2   1

, , geno = Hal

     time
trt   Eve Mid Pre
  dry   2   3   2
  Wet   1   4   4
counts<-counts[,all.outlier.info$genos[all.outlier.info$library.size.no.outliers>100000]]
write.csv(counts, file="pickle2014_rawcounts_nooutliers.csv", row.names=F)