model for females

read in data

setwd("/Users/seanmaguire/Desktop/spring2013_Bioinformatics/bisulfate/new_CpGfiles_1/")
clusterAll <- read.csv("clusterData.csv")
geneExp <- read.csv("geneExp_fixedIDs.csv", header = TRUE)

Change sample IDs to the row names, get rid of the sample ID column.

row.names(clusterAll) <- clusterAll[, 1]
clusterAll <- clusterAll[, -1]
head(clusterAll)

##           f11       f12      f13 f17    f18    f20      f25       f30
## 3022 0.368243 0.3381579 0.306991 0.3 0.2041 0.2252 0.237458 0.3125000
## 3062 0.013410 0.0003306 0.002037 0.0 0.0000 0.0000 0.004914 0.0032017
## 3071 0.000000 0.0004847 0.000000 0.0 0.0000 0.0000 0.000000 0.0009901
## 3080 0.001672 0.0003163 0.003419 0.0 0.0000 0.1118 0.000000 0.0036969
## 3090 0.000000 0.0000000 0.001538 0.0 0.0000 0.0000 0.003960 0.0008787
## 3131 0.001248 0.0003140 0.004734 0.0 0.0000 0.0000 0.000000 0.0033670
##           f32      f35        f7       f8        f9    m13      m14
## 3022 0.430769 0.238095 0.3262519 0.325000 0.3528369 0.3755 0.359116
## 3062 0.000000 0.006803 0.0000000 0.009317 0.0006849 0.0000 0.003026
## 3071 0.009709 0.000000 0.0000000 0.002924 0.0000000 0.0000 0.000000
## 3080 0.205607 0.092025 0.0009381 0.010870 0.0006031 0.0000 0.003654
## 3090 0.000000 0.137931 0.0008703 0.000000 0.0000000 0.0000 0.002232
## 3131 0.005882 0.000000 0.0006761 0.003745 0.0008787 0.0000 0.004092
##            m15      m16       m18      m19       m21      m22       m24
## 3022 0.3582157 0.285714 0.3676190 0.389423 0.3765978 0.365231 0.3574219
## 3062 0.0017947 0.010989 0.0009381 0.000000 0.0955564 0.000000 0.0007321
## 3071 0.0000000 0.000000 0.0004371 0.001422 0.0003682 0.000000 0.0006930
## 3080 0.0006513 0.000000 0.0008210 0.002597 0.1941880 0.131579 0.0000000
## 3090 0.0009186 0.000000 0.0019062 0.000000 0.1289318 0.001017 0.0006112
## 3131 0.0007269 0.006944 0.0000000 0.059380 0.1245742 0.000800 0.0019093
##            m26      m27      m28        m4        m5       m6       m9
## 3022 0.3571705 0.248276 0.303571 0.2154088 0.3104727 0.344398 0.307580
## 3062 0.0125107 0.005348 0.008333 0.0031250 0.0052632 0.001277 0.007366
## 3071 0.0288925 0.005051 0.003802 0.0241546 0.0007215 0.003492 0.001682
## 3080 0.0039900 0.004717 0.000000 0.0009166 0.1002786 0.001074 0.071372
## 3090 0.0004632 0.000000 0.000000 0.0000000 0.0000000 0.003906 0.000000
## 3131 0.0007196 0.000000 0.000000 0.0006094 0.0010230 0.001500 0.013348

Seperate Males and Females

clusterMale <- clusterAll[, -(1:13)]
clusterFem <- clusterAll[, 1:13]

function for convienient use later

grabP <- function(corObj) {
    return(corObj$p.value)
}

Correlate each cpg site to geneExp. Keep a list of only the ones that have p<0.1.

PCing <- geneExp$P_Cing[match(colnames(clusterFem), geneExp$ID)]
cortests <- apply(clusterFem, MARGIN = 1, FUN = cor.test, y = PCing)
pvals <- lapply(cortests, FUN = grabP)
sigP <- (names(pvals[which(pvals < 0.1)]))
tClusterFem <- t(clusterFem)
sigCluster <- tClusterFem[, sigP]
sigCluster <- as.data.frame(sigCluster)
sigCluster$PCing <- PCing

Rename the cpg sites because R does not like numbers as column names.

names(sigCluster) <- c(paste("cpg", 1:15, sep = ""), "PCing")
head(sigCluster)

##          cpg1      cpg2      cpg3     cpg4      cpg5      cpg6     cpg7
## f11 0.0027285 0.0016863 0.0044843 0.005629 0.0005958 0.0020772 0.000000
## f12 0.0007603 0.0003429 0.0005631 0.001491 0.0002355 0.0004959 0.186577
## f13 0.0000000 0.0013141 0.0011541 0.002743 0.0000000 0.0003734 0.001058
## f17 0.0000000 0.0000000 0.0000000 0.000000 0.0000000 0.0000000 0.000000
## f18 0.0000000 0.0000000 0.0000000 0.000000 0.0000000 0.0000000 0.000000
## f20 0.0054348 0.0000000 0.0000000 0.002160 0.0020661 0.0000000 0.000000
##         cpg8      cpg9     cpg10    cpg11     cpg12     cpg13     cpg14
## f11 0.004642 0.1853568 0.0009363 0.003696 0.0021086 0.0015508 0.0966726
## f12 0.001445 0.0001554 0.0003435 0.001537 0.0006444 0.0003778 0.0004511
## f13 0.001105 0.0036140 0.0014937 0.001396 0.0025847 0.0021534 0.0027572
## f17 0.000000 0.0000000 0.0000000 0.000000 0.0000000 0.0000000 0.0000000
## f18 0.000000 0.0000000 0.0000000 0.000000 0.0000000 0.0000000 0.0000000
## f20 0.001980 0.1394892 0.0019569 0.013333 0.0014599 0.0000000 0.1077763
##        cpg15 PCing
## f11 0.001841 -1.08
## f12 0.475448  1.47
## f13 0.002060 -0.08
## f17 0.032787  0.99
## f18 0.000000  0.78
## f20 0.109445 -0.78

Forward fitting procedure. We fit each cpg site in turn and examine the AIC of the model. In each round we keep only the cpg site that improves the model the most. We continue until no significant improvments can be made to the model. For example:

testModel1 <- lm(PCing ~ 1, data = sigCluster)
AIC(testModel1)

## [1] 40.1

testModel2 <- lm(PCing ~ cpg1, data = sigCluster)
AIC(testModel2)

## [1] 31.42

anova(testModel1, testModel2)

## Analysis of Variance Table
## 
## Model 1: PCing ~ 1
## Model 2: PCing ~ cpg1
##   Res.Df   RSS Df Sum of Sq  F Pr(>F)   
## 1     12 12.24                          
## 2     11  5.38  1      6.85 14 0.0032 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Adding cpg1 to the model lowers the AIC and the anova table indicates that is strongly significantly better than the model of the intercept alone. We continued this process adding one factor at a time until the model could not be improved anymore, leading to the following model:

model <- (lm(PCing ~ (cpg1 + cpg3 + cpg7 + cpg12), data = sigCluster))
summary(model)

## 
## Call:
## lm(formula = PCing ~ (cpg1 + cpg3 + cpg7 + cpg12), data = sigCluster)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3068 -0.2088 -0.0265  0.1571  0.3844 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.028      0.145    7.09  0.00010 ***
## cpg1        -180.722     35.339   -5.11  0.00091 ***
## cpg3        -146.410     59.929   -2.44  0.04037 *  
## cpg7           5.887      1.534    3.84  0.00496 ** 
## cpg12       -355.438     97.624   -3.64  0.00658 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.301 on 8 degrees of freedom
## Multiple R-squared: 0.941,   Adjusted R-squared: 0.911 
## F-statistic: 31.7 on 4 and 8 DF,  p-value: 5.94e-05

require(effects)

## Loading required package: effects

## Loading required package: lattice

## Loading required package: grid

## Loading required package: MASS

## Loading required package: nnet

## Loading required package: colorspace

## Attaching package: 'effects'

## The following object(s) are masked from 'package:datasets':
## 
## Titanic

eff1 <- allEffects(model)


plot(eff1)

plot of chunk unnamed-chunk-9

Bootstrapping: leaving two females out of the model creating the model and then using it to predict the holdouts, this helps to ensure that we didn't overfit the model

realData <- vector()
predictData <- vector()
for (j in 1:20) {
    rand <- sample(1:length(sigCluster[, 1]), size = 2)
    sigCluster2 <- sigCluster[-rand, ]
    testData <- sigCluster[rand, -16]
    realData <- c(realData, sigCluster[rand, 16])
    model <- (lm(PCing ~ (cpg1 + cpg3 + cpg7 + cpg12), data = sigCluster2))
    predictData <- c(predictData, predict(model, newdata = testData))
}

real data plotted against fake data

plot(realData, predictData, ylim = c(-2, 2), pch = 16, xlab = "real data", ylab = "predictions of the model")
abline(lm(predictData ~ realData))

plot of chunk unnamed-chunk-11

summary of the model

summary(lm(realData ~ predictData))

## 
## Call:
## lm(formula = realData ~ predictData)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.725 -0.419 -0.146  0.499  0.695 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.0436     0.0769   -0.57     0.57    
## predictData   0.7733     0.0672   11.50  6.1e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Residual standard error: 0.485 on 38 degrees of freedom
## Multiple R-squared: 0.777,   Adjusted R-squared: 0.771 
## F-statistic:  132 on 1 and 38 DF,  p-value: 6.07e-14