Run Ittai’s information
## Determine DE genes (for tissue analysis)
# Factors that will go in the lm
chimp_human_heart <- c(1:8)
index_sample <- c(1:8)
FDR_level <- FDR_level
# Get variables
#species <- as.data.frame(samples[,2])
#tissue <- as.data.frame(samples[,3])
# Sort tissue samples
# tissue <- as.data.frame(samples[,2])
# tissue <- tissue[chimp_human_heart,]
# tissue_no_extra <- droplevels.factor(tissue)
# Run the linear model in limma
# design <- model.matrix(~ as.factor(tissue_no_extra))
# fit_all <- lmFit(cpm.voom.cyclic, design)
# fit_all <- eBayes(fit_all)
# Get results
# HvC_Heart_fit_all = topTable(fit_all, coef=2, adjust="BH", number=Inf, sort.by="none")
# HvC_Heart_fit_all_5perc <-
#HvC_Heart_fit_all[which(HvC_Heart_fit_all$adj.P.Val < FDR_level), ]
# Find which DE genes have methylation values associated with it
# human_chimp_heart <- rownames(exprs) %in% HvC_Heart_fit_all_5perc$genes
# human_chimp_heart <- as.data.frame(human_chimp_heart)
# counts_genes_in <- cbind(exprs, human_chimp_heart)
# counts_genes_in_cutoff <- subset(counts_genes_in, human_chimp_heart ==
#"TRUE")
# counts_genes_in_cutoff <- counts_genes_in_cutoff[,1:79]
# expression_values_only <- counts_genes_in_cutoff[,2:32]
# methylation_values_only <- counts_genes_in_cutoff[,49:79]
# dim(expression_values_only)
# dim(methylation_values_only)
# Rename is that we can use Joyce's code
exprs_subset <- cpm.voom.cyclic
methyl_subset <- methyl
# check agreement between methylation data and expression data
all.equal(rownames(methyl_subset), rownames(exprs_subset))
## [1] TRUE
all.equal(colnames(methyl_subset), colnames(exprs_subset))
## [1] "8 string mismatches"
# <---- sample data matching
#samples_sub <- samples[samples$Sample_ID %in% colnames(exprs_subset),]
exprs_pair <- exprs_subset
methyl_pair <- methyl_subset
#species <- droplevels.factor(samples$Species[index_sample])
#tissue <- droplevels.factor(samples$Tissue[index_sample])
#RIN <- samples$RIN[index_sample]
# Extract parameters
N <- ncol(exprs_pair)
ngenes <- nrow(methyl_pair)
# <---- Model 1: exprs ~ tissue
# specify tissue coding
design_1 <- model.matrix(~design[,2])
design_1[design_1[,2]==0,2] <- -1
model_1 <- lmFit(exprs_pair, design_1)
design_1
## (Intercept) design[, 2]
## 1 1 1
## 2 1 1
## 3 1 -1
## 4 1 -1
## 5 1 1
## 6 1 1
## 7 1 -1
## 8 1 -1
## attr(,"assign")
## [1] 0 1
# <---- Model 3: exprs corrected for methylation ~ tissue
# specify design matrix
resid_exprs <- array(0, dim = dim(exprs_pair))
for (index in 1:nrow(exprs_pair)){
resid_exprs[index,] <- lm(exprs_pair[index, ] ~ methyl_pair[index, ])$resid
}
# Prior to this, the residuals were 0! Basically, it was trying to run a lm for each gene AND each individual separately. It was weird.
rownames(resid_exprs) <- rownames(exprs_pair)
model_3 <- lmFit(resid_exprs, design_1)
head(resid_exprs)
## [,1] [,2] [,3] [,4] [,5] [,6]
## 1 -0.21084985 -0.41758764 0.40995603 0.2620344 -0.16367381 -0.25712892
## 2 0.05976573 0.94140912 -0.58735707 -0.9538911 1.04582472 0.47120940
## 3 0.08726226 -0.02085278 0.01368733 0.2207150 -0.41198541 0.03745494
## 4 0.25912664 0.41105152 -0.06644049 -0.4755928 0.25247599 0.49328858
## 5 0.70984383 -0.08534722 -0.33681375 0.3070084 0.53580783 -0.14689933
## 6 -0.13378570 0.45995851 -0.19013191 -0.4516463 0.06971536 0.36710801
## [,7] [,8]
## 1 0.33058054 0.04666922
## 2 -0.07893134 -0.89802946
## 3 -0.19804824 0.27176690
## 4 -0.46438207 -0.40952733
## 5 -0.72022332 -0.26337641
## 6 0.03347689 -0.15469487
## Effect size differences
# get effect sizes
beta1 <- coef(model_1[,2])
beta3 <- coef(model_3[,2])
se_beta1 <- se_beta2 <- se_beta3 <- cov_beta13 <- vector("numeric", ngenes)
# <---- get variances of beta1^S tissue effect
se_beta1 <- model_1$sigma*sqrt((solve(t(design_1)%*%design_1))[2,2])
head(cbind(se_beta1,model_1$sigma*sqrt(model_1$cov.coefficients[2,2])))
## se_beta1
## 1 0.04498742 0.04498742
## 2 0.18270096 0.18270096
## 3 0.05901818 0.05901818
## 4 0.06034491 0.06034491
## 5 0.12512973 0.12512973
## 6 0.04983582 0.04983582
# <---- get variances of beta2 methylation effect
for (g in 1:length(se_beta2)) {
design_2g <- model.matrix(~ as.numeric(methyl_pair[g,]))
sigma_g <- model_1$sigma[g]
se_beta2[index] <- sigma_g*sqrt((solve(t(design_2g)%*%design_2g))[2,2])
}
# <---- get variances of beta3 tissue effect
A <- solve(t(design_1)%*%design_1)%*%t(design_1)
A
## 1 2 3 4 5 6 7 8
## (Intercept) 0.125 0.125 0.125 0.125 0.125 0.125 0.125 0.125
## design[, 2] 0.125 0.125 -0.125 -0.125 0.125 0.125 -0.125 -0.125
contr.vector <- array(c(0,1), dim=c(2,1))
# compute beta3 by hand
for (g in 1:length(se_beta3)) {
# No longer transposed
M_g <- (methyl_pair[g,])
design_2g <- model.matrix(~ as.numeric(M_g))
A_2g <- solve(t(design_2g)%*%design_2g)%*%t(design_2g)
sigma_g <- model_1$sigma[g]
var_beta2g <- (se_beta2^2)[g]
var_part1 <- (se_beta1[g])^2
var_part2 <- ( A%*%M_g%*%var_beta2g%*%t(M_g)%*%t(A) )[2,2]
var_part3 <- ( 2*(sigma_g^2)*A%*%t(A_2g)%*%contr.vector%*%t(M_g)%*%t(A) )[2,2]
se_beta3[g] <- sqrt(var_part1 + var_part2 + var_part3)
}
M_g
## A_21792_HIC B_28126_HIC C_3649_HIC D_40300_HIC E_28815_HIC F_28834_HIC
## 0.7359793 1.2133200 0.3658041 0.1312820 0.7136109 0.7585443
## G_3624_HIC H_3651_HIC
## 0.2500032 0.2697561
design_2g
## (Intercept) as.numeric(M_g)
## 1 1 0.7359793
## 2 1 1.2133200
## 3 1 0.3658041
## 4 1 0.1312820
## 5 1 0.7136109
## 6 1 0.7585443
## 7 1 0.2500032
## 8 1 0.2697561
## attr(,"assign")
## [1] 0 1
A_2g
## 1 2 3 4 5
## (Intercept) 0.01602557 -0.2710621 0.2386605 0.3797095 0.02947863
## as.numeric(M_g) 0.19642553 0.7138988 -0.2048722 -0.4591118 0.17217650
## 6 7 8
## (Intercept) 0.002454247 0.3083068 0.2964268
## as.numeric(M_g) 0.220887733 -0.3304090 -0.3089955
sigma_g
## 1537
## 0.3391738
var_beta2g
## [1] 0.1247108
var_part1
## 1537
## 0.01437986
var_part2
## [1] 0.01126712
var_part3
## [1] 0.02253425
se_beta3[g]
## [1] 0.2195022
# cov(beta1,beta3)
for (g in 1:length(cov_beta13)) {
# No longer transposed
M_g <- (methyl_pair[g,])
design_2g <- model.matrix(~ as.numeric(M_g))
A_2g <- solve(t(design_2g)%*%design_2g)%*%t(design_2g)
sigma_g <- model_1$sigma[g]
var_part1 <- (se_beta1[g])^2
cov_beta13[g] <- var_part1 - (sigma_g^2)*((A %*% t(A_2g) %*%contr.vector%*%t(M_g)%*%t(A))[2,2])
}
M_g
## A_21792_HIC B_28126_HIC C_3649_HIC D_40300_HIC E_28815_HIC F_28834_HIC
## 0.7359793 1.2133200 0.3658041 0.1312820 0.7136109 0.7585443
## G_3624_HIC H_3651_HIC
## 0.2500032 0.2697561
design_2g
## (Intercept) as.numeric(M_g)
## 1 1 0.7359793
## 2 1 1.2133200
## 3 1 0.3658041
## 4 1 0.1312820
## 5 1 0.7136109
## 6 1 0.7585443
## 7 1 0.2500032
## 8 1 0.2697561
## attr(,"assign")
## [1] 0 1
A_2g
## 1 2 3 4 5
## (Intercept) 0.01602557 -0.2710621 0.2386605 0.3797095 0.02947863
## as.numeric(M_g) 0.19642553 0.7138988 -0.2048722 -0.4591118 0.17217650
## 6 7 8
## (Intercept) 0.002454247 0.3083068 0.2964268
## as.numeric(M_g) 0.220887733 -0.3304090 -0.3089955
sigma_g
## 1537
## 0.3391738
var_part1
## 1537
## 0.01437986
head(cov_beta13)
## [1] 0.001913866 0.031565377 0.000943251 0.003545917 0.008777301 0.001359960
# cov(beta1-beta3)
cov_diff_sqrt <- sqrt(se_beta1^2 + se_beta3^2 - 2*cov_beta13)
beta_diff <- beta1-beta3
## Run adaptive shrinkage (with Matthew Stephens' package ashr)
fit_ash <- ash(as.vector(beta_diff), cov_diff_sqrt, mode=0)
## Due to absence of package REBayes, switching to EM algorithm
names(fit_ash$result)
## [1] "betahat" "sebetahat" "NegativeProb" "PositiveProb"
## [5] "lfsr" "svalue" "lfdr" "qvalue"
## [9] "PosteriorMean" "PosteriorSD"
summary(fit_ash$result$svalue)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.02664 0.08948 0.09538 0.15337 0.25011
# lfsr is the minimum of PositiveProb and NegativeProb
# look at genes at various cut-off, say svalue < .01 etc. do they make sense?
summary(fit_ash$result$svalue < 0.05)
## Mode FALSE TRUE
## logical 1009 528
ind_sig <- (fit_ash$result$svalue < FSR_level)
ind_sig[ind_sig == TRUE] <- "red"
ind_sig[ind_sig == FALSE] <- "black"
make_plot <- cbind(beta1-beta3, cov_diff_sqrt, ind_sig, fit_ash$result$lfdr)
colnames(make_plot) <- c("Diff", "SE", "Sign", "LFDR")
make_plot[,1] <- as.numeric(make_plot[,1])
make_plot[,2] <- as.numeric(make_plot[,2])
make_plot[,3] <- as.character(make_plot[,3])
make_plot[,4] <- as.numeric(make_plot[,4])
plot(x=make_plot[,1], y=make_plot[,2],
ylab = "Standard error of the difference",
xlab = "Difference in effect size from 2 lms",
main = "Difference of effect size (DE genes only)",
pch=16, cex=.6, col = make_plot[,3])
legend("topleft", legend=c("s < 0.05", "s >= 0.05"), col=c("red", "black"), pch=16:16, cex=0.8)

Run Lauren’s information
# import sample labels
samples <- read.delim("/Users/laurenblake/Desktop/Reg_Evo_Primates/data/Sample_info_RNAseq_RIN.txt")
# expression
exprs <- read.table("/Users/laurenblake/Desktop/Reg_Evo_Primates/data/human_chimp_orth_exp_methyl_7725_hum.txt", sep="")
# methylation data
methyl <- read.csv("/Users/laurenblake/Desktop/Reg_Evo_Primates/data/chimp_human_orth_7725_avg_methyl_per_ts_gene.txt", sep="")
# Normalized gene expression data
cpm.voom.cyclic <- readRDS("/Users/laurenblake/Desktop/Reg_Evo_Primates/data/human_chimp_orth_cpm_voom_cyclic.rds")
Function that will perform the interspecies analysis- DE genes
two_species <- c(1, 5, 9, 13, 20, 24, 28)
## Determine DE genes (for tissue analysis)
# Factors that will go in the lm
chimp_human_heart <- two_species
index_sample <- chimp_human_heart
FDR_level <- FDR_level
# Get variables
species <- as.data.frame(samples[,2])
tissue <- as.data.frame(samples[,3])
RIN <- as.data.frame(samples[,4])
# Sort tissue samples
tissue <- as.data.frame(samples[,2])
tissue <- tissue[chimp_human_heart,]
tissue_no_extra <- droplevels.factor(tissue)
# Run the linear model in limma
design <- model.matrix(~ as.factor(tissue_no_extra) + RIN[chimp_human_heart,])
fit_all <- lmFit(cpm.voom.cyclic[,chimp_human_heart], design)
fit_all <- eBayes(fit_all)
# Get results
HvC_Heart_fit_all = topTable(fit_all, coef=2, adjust="BH", number=Inf, sort.by="none")
HvC_Heart_fit_all_5perc <-
HvC_Heart_fit_all[which(HvC_Heart_fit_all$adj.P.Val < FDR_level), ]
# Find which DE genes have methylation values associated with it
human_chimp_heart <- rownames(exprs) %in% HvC_Heart_fit_all_5perc$genes
human_chimp_heart <- as.data.frame(human_chimp_heart)
counts_genes_in <- cbind(exprs, human_chimp_heart)
counts_genes_in_cutoff <- subset(counts_genes_in, human_chimp_heart ==
"TRUE")
counts_genes_in_cutoff <- counts_genes_in_cutoff[,1:79]
expression_values_only <- counts_genes_in_cutoff[,2:32]
methylation_values_only <- counts_genes_in_cutoff[,49:79]
dim(expression_values_only)
## [1] 841 31
dim(methylation_values_only)
## [1] 841 31
# Rename is that we can use Joyce's code
exprs_subset <- expression_values_only
methyl_subset <- methylation_values_only
# check agreement between methylation data and expression data
all.equal(rownames(methyl_subset), rownames(exprs_subset))
## [1] TRUE
all.equal(colnames(methyl_subset), colnames(exprs_subset))
## [1] "31 string mismatches"
# <---- sample data matching
samples_sub <- samples[samples$Sample_ID %in% colnames(exprs_subset),]
exprs_pair <- exprs_subset[,index_sample]
methyl_pair <- methyl_subset[,index_sample]
species <- droplevels.factor(samples$Species[index_sample])
tissue <- droplevels.factor(samples$Tissue[index_sample])
RIN <- samples$RIN[index_sample]
# Extract parameters
N <- ncol(exprs_pair)
ngenes <- nrow(methyl_pair)
# <---- Model 1: exprs ~ tissue
# specify tissue coding
design_1 <- model.matrix(~species)
design_1[design_1[,2]==0,2] <- -1
model_1 <- lmFit(exprs_pair, design_1)
# <---- Model 3: exprs corrected for methylation ~ tissue
# specify design matrix
resid_exprs <- array(0, dim = dim(exprs_pair))
for (index in 1:nrow(exprs_pair)){
resid_exprs[index,] <- lm(t(exprs_pair[index, ]) ~ t(methyl_pair[index, ]))$resid
}
rownames(resid_exprs) <- rownames(exprs_pair)
model_3 <- lmFit(resid_exprs, design_1)
## Effect size differences
# get effect sizes
beta1 <- coef(model_1[,2])
beta3 <- coef(model_3[,2])
se_beta1 <- se_beta2 <- se_beta3 <- cov_beta13 <- vector("numeric", ngenes)
# <---- get variances of beta1^S tissue effect
se_beta1 <- model_1$sigma*sqrt((solve(t(design_1)%*%design_1))[2,2])
head(cbind(se_beta1,model_1$sigma*sqrt(model_1$cov.coefficients[2,2])))
## se_beta1
## ENSG00000001460 0.20282819 0.20282819
## ENSG00000001461 0.08017058 0.08017058
## ENSG00000002549 0.15558525 0.15558525
## ENSG00000004660 0.24901178 0.24901178
## ENSG00000005249 0.18313310 0.18313310
## ENSG00000006715 0.12843370 0.12843370
# <---- get variances of beta2 methylation effect
for (g in 1:length(se_beta2)) {
design_2g <- model.matrix(~ as.numeric(methyl_pair[g,]))
sigma_g <- model_1$sigma[g]
se_beta2[index] <- sigma_g*sqrt((solve(t(design_2g)%*%design_2g))[2,2])
}
# <---- get variances of beta3 tissue effect
A <- solve(t(design_1)%*%design_1)%*%t(design_1)
A
## 1 2 3 4 5 6 7
## (Intercept) 0.125 0.125 0.125 0.125 0.1666667 0.1666667 0.1666667
## specieshuman -0.125 -0.125 -0.125 -0.125 0.1666667 0.1666667 0.1666667
contr.vector <- array(c(0,1), dim=c(2,1))
# compute beta3 by hand
for (g in 1:length(se_beta3)) {
M_g <- t(methyl_pair[g,])
design_2g <- model.matrix(~ as.numeric(M_g))
A_2g <- solve(t(design_2g)%*%design_2g)%*%t(design_2g)
sigma_g <- model_1$sigma[g]
var_beta2g <- (se_beta2^2)[g]
var_part1 <- (se_beta1[g])^2
var_part2 <- ( A%*%M_g%*%var_beta2g%*%t(M_g)%*%t(A) )[2,2]
var_part3 <- ( 2*(sigma_g^2)*A%*%t(A_2g)%*%contr.vector%*%t(M_g)%*%t(A) )[2,2]
se_beta3[g] <- sqrt(var_part1 + var_part2 + var_part3)
}
M_g
## ENSG00000258289
## C1H.y 0.02474320
## C2H.y 0.03117289
## C3H.y 0.02145714
## C4H.y 0.02420818
## H2H.y 0.05260557
## H3H.y 0.01832013
## H4H.y 0.03945830
design_2g
## (Intercept) as.numeric(M_g)
## 1 1 0.02474320
## 2 1 0.03117289
## 3 1 0.02145714
## 4 1 0.02420818
## 5 1 0.05260557
## 6 1 0.01832013
## 7 1 0.03945830
## attr(,"assign")
## [1] 0 1
A_2g
## 1 2 3 4 5
## (Intercept) 0.3351809 0.1118732 0.4493078 0.3537624 -0.6324979
## as.numeric(M_g) -6.3513483 1.0232222 -10.1203055 -6.9649877 25.6055235
## 6 7
## (Intercept) 0.5582583 -0.1758847
## as.numeric(M_g) -13.7183150 10.5262109
sigma_g
## ENSG00000258289
## 0.1489349
var_beta2g
## [1] 25.44129
var_part1
## ENSG00000258289
## 0.003234818
var_part2
## [1] 0.0008264884
var_part3
## [1] 0.001652977
se_beta3[g]
## [1] 0.07559288
# cov(beta1,beta3)
for (g in 1:length(cov_beta13)) {
M_g <- t(methyl_pair[g,])
design_2g <- model.matrix(~ as.numeric(M_g))
A_2g <- solve(t(design_2g)%*%design_2g)%*%t(design_2g)
sigma_g <- model_1$sigma[g]
var_part1 <- (se_beta1[g])^2
cov_beta13[g] <- var_part1 - (sigma_g^2)*((A %*% t(A_2g) %*%contr.vector%*%t(M_g)%*%t(A))[2,2])
}
M_g
## ENSG00000258289
## C1H.y 0.02474320
## C2H.y 0.03117289
## C3H.y 0.02145714
## C4H.y 0.02420818
## H2H.y 0.05260557
## H3H.y 0.01832013
## H4H.y 0.03945830
design_2g
## (Intercept) as.numeric(M_g)
## 1 1 0.02474320
## 2 1 0.03117289
## 3 1 0.02145714
## 4 1 0.02420818
## 5 1 0.05260557
## 6 1 0.01832013
## 7 1 0.03945830
## attr(,"assign")
## [1] 0 1
A_2g
## 1 2 3 4 5
## (Intercept) 0.3351809 0.1118732 0.4493078 0.3537624 -0.6324979
## as.numeric(M_g) -6.3513483 1.0232222 -10.1203055 -6.9649877 25.6055235
## 6 7
## (Intercept) 0.5582583 -0.1758847
## as.numeric(M_g) -13.7183150 10.5262109
sigma_g
## ENSG00000258289
## 0.1489349
var_part1
## ENSG00000258289
## 0.003234818
head(cov_beta13)
## [1] 0.038281376 0.002943356 0.023614060 0.035912007 0.031561566 0.015360660
# cov(beta1-beta3)
cov_diff_sqrt <- sqrt(se_beta1^2 + se_beta3^2 - 2*cov_beta13)
beta_diff <- beta1-beta3
## Run adaptive shrinkage (with Matthew Stephens' package ashr)
fit_ash <- ash(as.vector(beta_diff), cov_diff_sqrt, mode=0)
## Due to absence of package REBayes, switching to EM algorithm
names(fit_ash$result)
## [1] "betahat" "sebetahat" "NegativeProb" "PositiveProb"
## [5] "lfsr" "svalue" "lfdr" "qvalue"
## [9] "PosteriorMean" "PosteriorSD"
# lfsr is the minimum of PositiveProb and NegativeProb
# look at genes at various cut-off, say svalue < .01 etc. do they make sense?
ind_sig <- (fit_ash$result$svalue < FSR_level)