setwd("/Users/seanmaguire/Desktop/spring2013_Bioinformatics/bisulfate/new_CpGfiles_1/")
clusterAll <- read.csv("clusterData.csv")
geneExp <- read.csv("geneExp_fixedIDs.csv", header = TRUE)
Change sample IDs to the row names, get rid of the sample ID column.
row.names(clusterAll) <- clusterAll[, 1]
clusterAll <- clusterAll[, -1]
head(clusterAll)
## f11 f12 f13 f17 f18 f20 f25 f30
## 3022 0.368243 0.3381579 0.306991 0.3 0.2041 0.2252 0.237458 0.3125000
## 3062 0.013410 0.0003306 0.002037 0.0 0.0000 0.0000 0.004914 0.0032017
## 3071 0.000000 0.0004847 0.000000 0.0 0.0000 0.0000 0.000000 0.0009901
## 3080 0.001672 0.0003163 0.003419 0.0 0.0000 0.1118 0.000000 0.0036969
## 3090 0.000000 0.0000000 0.001538 0.0 0.0000 0.0000 0.003960 0.0008787
## 3131 0.001248 0.0003140 0.004734 0.0 0.0000 0.0000 0.000000 0.0033670
## f32 f35 f7 f8 f9 m13 m14
## 3022 0.430769 0.238095 0.3262519 0.325000 0.3528369 0.3755 0.359116
## 3062 0.000000 0.006803 0.0000000 0.009317 0.0006849 0.0000 0.003026
## 3071 0.009709 0.000000 0.0000000 0.002924 0.0000000 0.0000 0.000000
## 3080 0.205607 0.092025 0.0009381 0.010870 0.0006031 0.0000 0.003654
## 3090 0.000000 0.137931 0.0008703 0.000000 0.0000000 0.0000 0.002232
## 3131 0.005882 0.000000 0.0006761 0.003745 0.0008787 0.0000 0.004092
## m15 m16 m18 m19 m21 m22 m24
## 3022 0.3582157 0.285714 0.3676190 0.389423 0.3765978 0.365231 0.3574219
## 3062 0.0017947 0.010989 0.0009381 0.000000 0.0955564 0.000000 0.0007321
## 3071 0.0000000 0.000000 0.0004371 0.001422 0.0003682 0.000000 0.0006930
## 3080 0.0006513 0.000000 0.0008210 0.002597 0.1941880 0.131579 0.0000000
## 3090 0.0009186 0.000000 0.0019062 0.000000 0.1289318 0.001017 0.0006112
## 3131 0.0007269 0.006944 0.0000000 0.059380 0.1245742 0.000800 0.0019093
## m26 m27 m28 m4 m5 m6 m9
## 3022 0.3571705 0.248276 0.303571 0.2154088 0.3104727 0.344398 0.307580
## 3062 0.0125107 0.005348 0.008333 0.0031250 0.0052632 0.001277 0.007366
## 3071 0.0288925 0.005051 0.003802 0.0241546 0.0007215 0.003492 0.001682
## 3080 0.0039900 0.004717 0.000000 0.0009166 0.1002786 0.001074 0.071372
## 3090 0.0004632 0.000000 0.000000 0.0000000 0.0000000 0.003906 0.000000
## 3131 0.0007196 0.000000 0.000000 0.0006094 0.0010230 0.001500 0.013348
Seperate Males and Females
clusterMale <- clusterAll[, -(1:13)]
clusterFem <- clusterAll[, 1:13]
function for convienient use later
grabP <- function(corObj) {
return(corObj$p.value)
}
Correlate each cpg site to geneExp. Keep a list of only the ones that have p<0.1.
PCing <- geneExp$P_Cing[match(colnames(clusterFem), geneExp$ID)]
cortests <- apply(clusterFem, MARGIN = 1, FUN = cor.test, y = PCing)
pvals <- lapply(cortests, FUN = grabP)
sigP <- (names(pvals[which(pvals < 0.1)]))
tClusterFem <- t(clusterFem)
sigCluster <- tClusterFem[, sigP]
sigCluster <- as.data.frame(sigCluster)
sigCluster$PCing <- PCing
Rename the cpg sites because R does not like numbers as column names.
names(sigCluster) <- c(paste("cpg", 1:15, sep = ""), "PCing")
head(sigCluster)
## cpg1 cpg2 cpg3 cpg4 cpg5 cpg6 cpg7
## f11 0.0027285 0.0016863 0.0044843 0.005629 0.0005958 0.0020772 0.000000
## f12 0.0007603 0.0003429 0.0005631 0.001491 0.0002355 0.0004959 0.186577
## f13 0.0000000 0.0013141 0.0011541 0.002743 0.0000000 0.0003734 0.001058
## f17 0.0000000 0.0000000 0.0000000 0.000000 0.0000000 0.0000000 0.000000
## f18 0.0000000 0.0000000 0.0000000 0.000000 0.0000000 0.0000000 0.000000
## f20 0.0054348 0.0000000 0.0000000 0.002160 0.0020661 0.0000000 0.000000
## cpg8 cpg9 cpg10 cpg11 cpg12 cpg13 cpg14
## f11 0.004642 0.1853568 0.0009363 0.003696 0.0021086 0.0015508 0.0966726
## f12 0.001445 0.0001554 0.0003435 0.001537 0.0006444 0.0003778 0.0004511
## f13 0.001105 0.0036140 0.0014937 0.001396 0.0025847 0.0021534 0.0027572
## f17 0.000000 0.0000000 0.0000000 0.000000 0.0000000 0.0000000 0.0000000
## f18 0.000000 0.0000000 0.0000000 0.000000 0.0000000 0.0000000 0.0000000
## f20 0.001980 0.1394892 0.0019569 0.013333 0.0014599 0.0000000 0.1077763
## cpg15 PCing
## f11 0.001841 -1.08
## f12 0.475448 1.47
## f13 0.002060 -0.08
## f17 0.032787 0.99
## f18 0.000000 0.78
## f20 0.109445 -0.78
Forward fitting procedure. We fit each cpg site in turn and examine the AIC of the model. In each round we keep only the cpg site that improves the model the most. We continue until no significant improvments can be made to the model. For example:
testModel1 <- lm(PCing ~ 1, data = sigCluster)
AIC(testModel1)
## [1] 40.1
testModel2 <- lm(PCing ~ cpg1, data = sigCluster)
AIC(testModel2)
## [1] 31.42
anova(testModel1, testModel2)
## Analysis of Variance Table
##
## Model 1: PCing ~ 1
## Model 2: PCing ~ cpg1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 12 12.24
## 2 11 5.38 1 6.85 14 0.0032 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Adding cpg1 to the model lowers the AIC and the anova table indicates that is strongly significantly better than the model of the intercept alone. We continued this process adding one factor at a time until the model could not be improved anymore, leading to the following model:
model <- (lm(PCing ~ (cpg1 + cpg3 + cpg7 + cpg12), data = sigCluster))
summary(model)
##
## Call:
## lm(formula = PCing ~ (cpg1 + cpg3 + cpg7 + cpg12), data = sigCluster)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3068 -0.2088 -0.0265 0.1571 0.3844
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.028 0.145 7.09 0.00010 ***
## cpg1 -180.722 35.339 -5.11 0.00091 ***
## cpg3 -146.410 59.929 -2.44 0.04037 *
## cpg7 5.887 1.534 3.84 0.00496 **
## cpg12 -355.438 97.624 -3.64 0.00658 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.301 on 8 degrees of freedom
## Multiple R-squared: 0.941, Adjusted R-squared: 0.911
## F-statistic: 31.7 on 4 and 8 DF, p-value: 5.94e-05
require(effects)
## Loading required package: effects
## Loading required package: lattice
## Loading required package: grid
## Loading required package: MASS
## Loading required package: nnet
## Loading required package: colorspace
## Attaching package: 'effects'
## The following object(s) are masked from 'package:datasets':
##
## Titanic
eff1 <- allEffects(model)
plot(eff1)
Bootstrapping: leaving two females out of the model creating the model and then using it to predict the holdouts, this helps to ensure that we didn't overfit the model
realData <- vector()
predictData <- vector()
for (j in 1:20) {
rand <- sample(1:length(sigCluster[, 1]), size = 2)
sigCluster2 <- sigCluster[-rand, ]
testData <- sigCluster[rand, -16]
realData <- c(realData, sigCluster[rand, 16])
model <- (lm(PCing ~ (cpg1 + cpg3 + cpg7 + cpg12), data = sigCluster2))
predictData <- c(predictData, predict(model, newdata = testData))
}
plot(realData, predictData, ylim = c(-2, 2), pch = 16, xlab = "real data", ylab = "predictions of the model")
abline(lm(predictData ~ realData))
summary(lm(realData ~ predictData))
##
## Call:
## lm(formula = realData ~ predictData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.725 -0.419 -0.146 0.499 0.695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0436 0.0769 -0.57 0.57
## predictData 0.7733 0.0672 11.50 6.1e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.485 on 38 degrees of freedom
## Multiple R-squared: 0.777, Adjusted R-squared: 0.771
## F-statistic: 132 on 1 and 38 DF, p-value: 6.07e-14