OK, I have read in the data from 'CS_data_1.txt', which contains summary data for each probe along with a gene symbol and a GenBank accession. The data is not log transformed.
cs = read.table("CS_data_1.txt", sep = "\t", header = T)
colnames(cs)
## [1] "PROBE_ID" "SYMBOL"
## [3] "S00022895.AVG_Signal" "S00022895.Detection.Pval"
## [5] "S00022895.BEAD_STDERR" "S00022895.Avg_NBEADS"
## [7] "S00023115.AVG_Signal" "S00023115.Detection.Pval"
## [9] "S00023115.BEAD_STDERR" "S00023115.Avg_NBEADS"
## [11] "S00021034.AVG_Signal" "S00021034.Detection.Pval"
## [13] "S00021034.BEAD_STDERR" "S00021034.Avg_NBEADS"
## [15] "S00021429.AVG_Signal" "S00021429.Detection.Pval"
## [17] "S00021429.BEAD_STDERR" "S00021429.Avg_NBEADS"
## [19] "S00022313.AVG_Signal" "S00022313.Detection.Pval"
## [21] "S00022313.BEAD_STDERR" "S00022313.Avg_NBEADS"
## [23] "S00022335.AVG_Signal" "S00022335.Detection.Pval"
## [25] "S00022335.BEAD_STDERR" "S00022335.Avg_NBEADS"
## [27] "S00022355.AVG_Signal" "S00022355.Detection.Pval"
## [29] "S00022355.BEAD_STDERR" "S00022355.Avg_NBEADS"
## [31] "S00022512.AVG_Signal" "S00022512.Detection.Pval"
## [33] "S00022512.BEAD_STDERR" "S00022512.Avg_NBEADS"
## [35] "S00022892.AVG_Signal" "S00022892.Detection.Pval"
## [37] "S00022892.BEAD_STDERR" "S00022892.Avg_NBEADS"
## [39] "S00022342.AVG_Signal" "S00022342.Detection.Pval"
## [41] "S00022342.BEAD_STDERR" "S00022342.Avg_NBEADS"
## [43] "S00022367.AVG_Signal" "S00022367.Detection.Pval"
## [45] "S00022367.BEAD_STDERR" "S00022367.Avg_NBEADS"
## [47] "S00021192.AVG_Signal" "S00021192.Detection.Pval"
## [49] "S00021192.BEAD_STDERR" "S00021192.Avg_NBEADS"
## [51] "S00021459.60.AVG_Signal" "S00021459.60.Detection.Pval"
## [53] "S00021459.60.BEAD_STDERR" "S00021459.60.Avg_NBEADS"
## [55] "S00023112.AVG_Signal" "S00023112.Detection.Pval"
## [57] "S00023112.BEAD_STDERR" "S00023112.Avg_NBEADS"
## [59] "S00022886.AVG_Signal" "S00022886.Detection.Pval"
## [61] "S00022886.BEAD_STDERR" "S00022886.Avg_NBEADS"
## [63] "S00022351.AVG_Signal" "S00022351.Detection.Pval"
## [65] "S00022351.BEAD_STDERR" "S00022351.Avg_NBEADS"
## [67] "S00019661.AVG_Signal" "S00019661.Detection.Pval"
## [69] "S00019661.BEAD_STDERR" "S00019661.Avg_NBEADS"
## [71] "S00018175.AVG_Signal" "S00018175.Detection.Pval"
## [73] "S00018175.BEAD_STDERR" "S00018175.Avg_NBEADS"
## [75] "S0022394.AVG_Signal" "S0022394.Detection.Pval"
## [77] "S0022394.BEAD_STDERR" "S0022394.Avg_NBEADS"
## [79] "S00022375.AVG_Signal" "S00022375.Detection.Pval"
## [81] "S00022375.BEAD_STDERR" "S00022375.Avg_NBEADS"
## [83] "S00022333.AVG_Signal" "S00022333.Detection.Pval"
## [85] "S00022333.BEAD_STDERR" "S00022333.Avg_NBEADS"
## [87] "S00023122.AVG_Signal" "S00023122.Detection.Pval"
## [89] "S00023122.BEAD_STDERR" "S00023122.Avg_NBEADS"
## [91] "SW1353_A.AVG_Signal" "SW1353_A.Detection.Pval"
## [93] "SW1353_A.BEAD_STDERR" "SW1353_A.Avg_NBEADS"
## [95] "SW1353_B.AVG_Signal" "SW1353_B.Detection.Pval"
## [97] "SW1353_B.BEAD_STDERR" "SW1353_B.Avg_NBEADS"
## [99] "SEARCH_KEY"
# the first 8 columns and the last one (containing the GenBank accession);
# note there are 2 A1BG and 3 A1CF rows with different probes
head(cs[, c(1:8, 99)])
## PROBE_ID SYMBOL S00022895.AVG_Signal S00022895.Detection.Pval
## 1 ILMN_1762337 7A5 118.5 0.16234
## 2 ILMN_2055271 A1BG 129.6 0.01948
## 3 ILMN_1736007 A1BG 104.1 0.64805
## 4 ILMN_2383229 A1CF 117.8 0.19221
## 5 ILMN_1806310 A1CF 107.1 0.54416
## 6 ILMN_1779670 A1CF 100.3 0.73636
## S00022895.BEAD_STDERR S00022895.Avg_NBEADS S00023115.AVG_Signal
## 1 7.370 15 120.3
## 2 7.109 19 127.1
## 3 4.971 24 106.0
## 4 5.370 24 134.6
## 5 6.481 17 122.1
## 6 4.251 14 104.4
## S00023115.Detection.Pval SEARCH_KEY
## 1 0.22208 NM_182762.2
## 2 0.09740 NM_130786.2
## 3 0.67662 NM_130786.2
## 4 0.01818 NM_138932.1
## 5 0.17922 NM_138933.1
## 6 0.70260 NM_138933.1
As I don't know how this data was created (background corrected, normalised, summarised, etc.), I will do a few plots to examine the QC:
# I am log2 transforming the data, though this still leaves it pretty
# right-skewed; seq(3, 95, 4) picks out the AVG_Signal columns
boxplot(log2(cs[, seq(3, 95, 4)]), names = F)
# the second is a bit speculative as I don't know how the standard errors
# are calculated; perhaps (probably) it is across the beads for each probe,
# e.g. NBEADS = 15
boxplot(log2(cs[, seq(5, 97, 4)]), names = F)
# the number of beads (NBEADS) used per probe summary seems pretty consistent
boxplot(cs[, seq(6, 98, 4)], names = F)
# I'm not sure what to make of the detection p-values; I would have thought
# about 40-50% should be < 0.05. They seem consistent...ish across samples.
boxplot(cs[, seq(4, 96, 4)], names = F)
My impression from these plots is that the actual quality of the hybridisations is similar across all the samples, but that the data is not that well normalised. I'm not sure, though, whether it was supposed to be normalised already or whether this is the raw data prior to normalisation. For the moment there is not a lot I can do with the p-value, STDERR or NBEADS data, so I'll set it aside. I will take the AVG_Signal data, log2 transform it and then quantile normalise it to an 'average' hybridisation.
# the expression data matrix
ave.m = cs[, seq(3, 95, 4)]
# Log2 transform the data
ave.m = log2(ave.m)
# calculate an 'average' hybridisation to normalise the rest to, kept in
# sorted order
ave.hyb = sort(rowMeans(ave.m))
# I spent 20 mins messing up here by putting order(x) instead of rank(x)
ave.mq = apply(ave.m, 2, function(x) ave.hyb[rank(x)])
I messed this up the first time I did it by mixing up the rank() and order() commands. I do this a lot… someday I will learn and remember the difference.
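For my own future reference, here is a minimal sketch of the difference (the vector x below is just a toy example, not the data above):
x = c(5.2, 1.1, 3.7)
rank(x)   # 3 1 2 : the rank each element of x takes when sorted
order(x)  # 2 3 1 : the indices that sort x, so x[order(x)] is sorted
# in the normalisation above, ave.hyb[rank(x)] replaces each value with the
# reference value of the same rank; using order(x) instead scrambles them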
# a quick pairs plot to check it looks correct, i.e. that the ordering is
# right this time
pairs(ave.mq[1:1000, 1:4])
# I notice that the column names are pretty redundant; I'll use just the
# first 9 characters
colnames(ave.mq) = substr(colnames(ave.mq), 1, 9)
# and it certainly looks like it is normalised now
boxplot(ave.mq[, 1:4])
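As an extra numerical check (my addition): because every column has been mapped onto the same set of reference values, the per-column quantiles should now be essentially identical.
# the quantiles of the first few normalised columns should all match
round(apply(ave.mq[, 1:4], 2, quantile), 3)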
# I'll put this all together in a data.frame with probe, gene symbol and
# accession as the first three columns and the newly normalised expression
# values to the right of these
cs.data = data.frame(cs[, c(1:2, 99)], ave.mq)
head(cs.data[, 1:6])
## PROBE_ID SYMBOL SEARCH_KEY S00022895 S00023115 S00021034
## 1 ILMN_1762337 7A5 NM_182762.2 6.994 6.978 6.990
## 2 ILMN_2055271 A1BG NM_130786.2 7.138 7.048 6.958
## 3 ILMN_1736007 A1BG NM_130786.2 6.833 6.825 6.902
## 4 ILMN_2383229 A1CF NM_138932.1 6.985 7.127 7.061
## 5 ILMN_1806310 A1CF NM_138933.1 6.869 6.996 6.930
## 6 ILMN_1779670 A1CF NM_138933.1 6.786 6.807 6.689
Perhaps hard quantile normalisation is a bit harsh, but… I don't know how the data was treated before, so it's possibly the best option here. As you asked me to have a look at this data de novo before looking at mutation status, I'll do clustering and MDS to see whether there are discernible groups.
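For the record, had I been given the raw Illumina summary output (ideally with the control-probe profile), the more standard route would be limma's read.ilmn()/neqc() pipeline, which does normexp background correction, quantile normalisation and log2 transformation in one step. A rough sketch of what that might look like; the file name is hypothetical and I'm assuming the column labels match this export:
library(limma)
# hypothetical raw summary file; neqc() can use negative control probes or,
# failing that, the detection p-values to estimate the background model
raw = read.ilmn(files = "CS_data_raw.txt", probeid = "PROBE_ID",
    expr = "AVG_Signal", other.columns = "Detection")
cs.neqc = neqc(raw)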
# there are 3 columns of annotation and 24 samples
samps = 4:27
# I'll filter the data before clustering using the coefficient of variation
coefvar = apply(cs.data[, samps], 1, function(x) sd(x)/mean(x))
# this time I use order() instead of rank() correctly!
coefvar.index = order(coefvar, decreasing = T)
quantile(coefvar, 0.75)
## 75%
## 0.03398
# based upon this plot I'm going to use about a quarter of the data to
# construct a clustering, i.e. roughly the 10000 highest-varying probes
plot(density(coefvar))
The cophenetic correlation measures the agreement between the original distance matrix and the distances implied by the dendrogram (the cophenetic distances). A high value is an indication that the clustering captures the structure in the data well. I'm going to try different numbers of probes to see if I can find one that produces a 'better' clustering.
cop.v = vector()
for (i in seq(500, 40000, 500)) {
    d = as.dist(1 - cor(cs.data[coefvar.index[1:i], samps]))
    # as I use 1 - cor there is no need for scaling
    hc = hclust(d, method = "average")
    cop = cophenetic(hc)
    cop.v = c(cop.v, cor(d, cop))
}
plot(seq(500, 40000, 500), cop.v, type = "l")
OK, based on the relationship between the number of probes used and the cophenetic correlation, I will use the 1000 highest-varying probes. I also tried method = 'complete' and method = 'single', but I haven't shown that here.
d1k = as.dist(1 - cor(cs.data[coefvar.index[1:1000], samps]))
hc1k.sin = hclust(d1k, method = "single")
hc1k.ave = hclust(d1k, method = "average")
hc1k.com = hclust(d1k, method = "complete")
plot(hc1k.sin, hang = -1, sub = "1000 highest varying genes, single linkage")
plot(hc1k.ave, hang = -1, sub = "1000 highest varying genes, average linkage")
plot(hc1k.com, hang = -1, sub = "1000 highest varying genes, complete linkage")
I now examine the same distance matrix using the multidimensional scaling function cmdscale().
cmd1k = cmdscale(d1k)
# I used identify() for labelling points interactively, but that won't come
# through in an auto-generated report:
# plot(cmd1k)
# identify(cmd1k[, 1], cmd1k[, 2], rownames(cmd1k))
OK... it's pretty clear that SW1353_A and _B are replicates. Then there is a handful of samples that appear a little different (18175, 23115, 22335). I'm not all that happy that the clustering changes quite a lot depending on which linkage method is used. The multidimensional scaling is probably the closest to the truth for these samples. Having done this blind so far, I'm now going to look for the sample info and see if I can make sense of the clustering.
OK, from looking at some other files I see that the mutational status is as follows:
mut_status = as.factor(c(rep("MUT", 16), rep("WT", 6), rep("CONTROL", 2)))
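A quick sanity check that this factor lines up with the sample columns; it assumes the column order of cs.data really is the 16 MUT samples, then the 6 WT samples, then the two SW1353 controls:
# eyeball the sample-to-status mapping
data.frame(sample = colnames(cs.data)[samps], mut_status)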
# if I plot the cmdscale result coloured by mutational status it looks like so...
cmd1k.df = data.frame(mut_status, x = cmd1k[, 1], y = cmd1k[, 2])
library(ggplot2)
qplot(x, y, data = cmd1k.df, colour = mut_status, size = I(5)) + theme_bw()
Yeah!???! This is all a bit unconvincing. I can try this plot with a range of different numbers of probes to see whether one of them separates MUT and WT any better.
library(animation)
saveHTML({
    for (i in seq(500, 40000, 500)) {
        d = as.dist(1 - cor(cs.data[coefvar.index[1:i], samps]))
        cmd = cmdscale(d)
        cmd.df = data.frame(mut_status, x = cmd[, 1], y = cmd[, 2])
        print(qplot(x, y, data = cmd.df, colour = mut_status, size = I(5),
            main = paste(i, "highest varying probes")) + theme_bw())
    }
}, img.name = "cmd_plot", imgdir = "cmd_dir", outdir = getwd(),
    htmlfile = "cmd.html", autobrowse = FALSE)
## HTML file created at: /Users/stephenhenderson/Google Drive/Guilhamon
## HT12/cmd.html
You can view the animations for the different probe numbers, from 500 to 40000, in this locally linked HTML file. From these I reckon that beyond about 2000 probes the pattern stabilises and the rest is noise. Even at best, the difference between MUT and WT doesn't jump out at you. On reflection I'm a bit concerned, remembering the strangely low detection p-values. I don't know much about Illumina chips, but if this were Affy then ~40-50% of probesets would be Present (P) with p-values less than 0.05. Here we have…
apply(cs[, seq(4, 96, 4)], 2, function(x) sum(x <= 0.05)/length(x))
## S00022895.Detection.Pval S00023115.Detection.Pval
## 0.3845 0.3857
## S00021034.Detection.Pval S00021429.Detection.Pval
## 0.3512 0.3924
## S00022313.Detection.Pval S00022335.Detection.Pval
## 0.3680 0.3674
## S00022355.Detection.Pval S00022512.Detection.Pval
## 0.3408 0.3753
## S00022892.Detection.Pval S00022342.Detection.Pval
## 0.3759 0.4032
## S00022367.Detection.Pval S00021192.Detection.Pval
## 0.3711 0.3115
## S00021459.60.Detection.Pval S00023112.Detection.Pval
## 0.3062 0.3928
## S00022886.Detection.Pval S00022351.Detection.Pval
## 0.2986 0.4045
## S00019661.Detection.Pval S00018175.Detection.Pval
## 0.4109 0.3990
## S0022394.Detection.Pval S00022375.Detection.Pval
## 0.3974 0.3959
## S00022333.Detection.Pval S00023122.Detection.Pval
## 0.3568 0.4168
## SW1353_A.Detection.Pval SW1353_B.Detection.Pval
## 0.3257 0.3413
Actually we have ~35-40%, which I suppose is OK. I'll move on now and do a bit of limma to see if there are statistically significant differences between MUT and WT.
library(limma)
design = model.matrix(~mut_status - 1)
colnames(design) = levels(mut_status)
fit1 = lmFit(cs.data[, samps], design)
cm = makeContrasts(MUT - WT, WT - CONTROL, levels = design)
fit2 = contrasts.fit(fit1, cm)
fit3 = eBayes(fit2)
# this is the toptable and volcanoplot of MUT - WT
toptable(fit3, coef = 1, genelist = cs.data$SYMBOL)
## ID logFC t P.Value adj.P.Val B
## 11599 GFRA1 -0.3348 -5.023 4.578e-05 0.9991 1.08824
## 3008 C12orf35 -0.5968 -4.795 8.041e-05 0.9991 0.71254
## 43999 TNK1 0.1570 4.787 8.209e-05 0.9991 0.69872
## 9806 FAM80B -0.5836 -4.756 8.843e-05 0.9991 0.64871
## 31348 MAPK10 -0.9042 -4.619 1.243e-04 0.9991 0.41897
## 8822 EIF4H 0.4685 4.577 1.380e-04 0.9991 0.34800
## 7892 DIO3OS -0.2351 -4.359 2.368e-04 0.9991 -0.02101
## 45842 WFIKKN2 -0.1395 -4.333 2.522e-04 0.9991 -0.06421
## 679 AGPAT2 0.3916 4.314 2.647e-04 0.9991 -0.09757
## 16560 HSD17B12 -0.4649 -4.222 3.318e-04 0.9991 -0.25343
volcanoplot(fit3, coef = 1, highlight = 10, names = cs.data$SYMBOL)
# and this is the same except now for WT - CONTROL
toptable(fit3, coef = 2, genelist = cs.data$SYMBOL)
## ID logFC t P.Value adj.P.Val B
## 12956 HKDC1 -3.887 -61.40 1.029e-26 4.859e-22 42.52
## 38576 RCVRN -3.638 -52.47 3.550e-25 8.381e-21 40.95
## 44407 TRIML2 -3.021 -49.14 1.553e-24 2.444e-20 40.22
## 6953 CT47A7 -2.847 -47.32 3.614e-24 4.266e-20 39.77
## 7313 CYP4F11 -2.522 -44.29 1.597e-23 1.320e-19 38.95
## 18474 KRT81 -6.431 -44.19 1.677e-23 1.320e-19 38.92
## 31460 MAT1A -1.750 -35.90 1.754e-21 1.183e-17 36.03
## 3671 C1orf94 -2.425 -34.49 4.269e-21 2.492e-17 35.42
## 8307 DPPA4 -2.122 -34.33 4.749e-21 2.492e-17 35.35
## 43897 TMPRSS3 -1.637 -33.87 6.406e-21 3.025e-17 35.14
volcanoplot(fit3, coef = 2, highlight = 10, names = cs.data$SYMBOL)
So, sadly, I'm not seeing much discernible difference between WT and MUT, though the controls stick out as obviously different. I don't know what the previous people who looked at this saw; possibly someone thought there were significant genes to follow up and another didn't. Unless there are obvious outlier samples that perhaps lack MUT penetrance or expression or something, I can't see it either.
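To put a number on that impression, here is a quick count of probes passing the default 5% FDR in each contrast (my addition, using limma's decideTests()):
# counts of down / not significant / up probes per contrast at 5% BH FDR
summary(decideTests(fit3))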
Still, it's been interesting… particularly teaching myself the animation package.