Run Ittai’s information

## Determine DE genes (for tissue analysis)

# Factors that will go in the lm
chimp_human_heart <- c(1:8)
index_sample <- c(1:8)
FDR_level <- FDR_level

# Get variables
#species <- as.data.frame(samples[,2])
#tissue <- as.data.frame(samples[,3])


# Sort tissue samples
# tissue <- as.data.frame(samples[,2])
#  tissue <- tissue[chimp_human_heart,]
#  tissue_no_extra <- droplevels.factor(tissue)

# Run the linear model in limma
#  design <- model.matrix(~ as.factor(tissue_no_extra))
#  fit_all <- lmFit(cpm.voom.cyclic, design)
#  fit_all <- eBayes(fit_all)

# Get results
#  HvC_Heart_fit_all = topTable(fit_all, coef=2, adjust="BH", number=Inf, sort.by="none")
#  HvC_Heart_fit_all_5perc <-
#HvC_Heart_fit_all[which(HvC_Heart_fit_all$adj.P.Val < FDR_level), ]

# Find which DE genes have methylation values associated with it
#    human_chimp_heart <-  rownames(exprs) %in% HvC_Heart_fit_all_5perc$genes
#  human_chimp_heart <- as.data.frame(human_chimp_heart)
#  counts_genes_in <- cbind(exprs, human_chimp_heart)
#  counts_genes_in_cutoff <- subset(counts_genes_in, human_chimp_heart ==
#"TRUE")
#  counts_genes_in_cutoff <- counts_genes_in_cutoff[,1:79]
#  expression_values_only <- counts_genes_in_cutoff[,2:32]
#  methylation_values_only <- counts_genes_in_cutoff[,49:79]
#  dim(expression_values_only)
#  dim(methylation_values_only)
  
# Rename is that we can use Joyce's code
exprs_subset <- cpm.voom.cyclic
methyl_subset <- methyl
  
# check agreement between methylation data and expression data
all.equal(rownames(methyl_subset), rownames(exprs_subset))
## [1] TRUE
all.equal(colnames(methyl_subset), colnames(exprs_subset))
## [1] "8 string mismatches"
# <---- sample data matching
#samples_sub <- samples[samples$Sample_ID %in% colnames(exprs_subset),]


exprs_pair <- exprs_subset
methyl_pair <- methyl_subset
#species <- droplevels.factor(samples$Species[index_sample])
#tissue <- droplevels.factor(samples$Tissue[index_sample])
#RIN <- samples$RIN[index_sample]

 

# Extract parameters
N <- ncol(exprs_pair)
ngenes <- nrow(methyl_pair)
# <---- Model 1: exprs ~ tissue
# specify tissue coding
design_1 <- model.matrix(~design[,2])
design_1[design_1[,2]==0,2] <- -1
model_1 <- lmFit(exprs_pair, design_1)

design_1
##   (Intercept) design[, 2]
## 1           1           1
## 2           1           1
## 3           1          -1
## 4           1          -1
## 5           1           1
## 6           1           1
## 7           1          -1
## 8           1          -1
## attr(,"assign")
## [1] 0 1
# <---- Model 3: exprs corrected for methylation ~ tissue
# specify design matrix
resid_exprs <- array(0, dim = dim(exprs_pair))
for (index in 1:nrow(exprs_pair)){
  resid_exprs[index,] <- lm(exprs_pair[index, ] ~ methyl_pair[index, ])$resid
}

# Prior to this, the residuals were 0! Basically, it was trying to run a lm for each gene AND each individual separately. It was weird. 

rownames(resid_exprs) <- rownames(exprs_pair)
model_3 <- lmFit(resid_exprs, design_1)  

head(resid_exprs)
##          [,1]        [,2]        [,3]       [,4]        [,5]        [,6]
## 1 -0.21084985 -0.41758764  0.40995603  0.2620344 -0.16367381 -0.25712892
## 2  0.05976573  0.94140912 -0.58735707 -0.9538911  1.04582472  0.47120940
## 3  0.08726226 -0.02085278  0.01368733  0.2207150 -0.41198541  0.03745494
## 4  0.25912664  0.41105152 -0.06644049 -0.4755928  0.25247599  0.49328858
## 5  0.70984383 -0.08534722 -0.33681375  0.3070084  0.53580783 -0.14689933
## 6 -0.13378570  0.45995851 -0.19013191 -0.4516463  0.06971536  0.36710801
##          [,7]        [,8]
## 1  0.33058054  0.04666922
## 2 -0.07893134 -0.89802946
## 3 -0.19804824  0.27176690
## 4 -0.46438207 -0.40952733
## 5 -0.72022332 -0.26337641
## 6  0.03347689 -0.15469487
## Effect size differences

# get effect sizes
beta1 <- coef(model_1[,2])
beta3 <- coef(model_3[,2])
se_beta1 <- se_beta2 <- se_beta3 <- cov_beta13 <- vector("numeric", ngenes)
# <---- get variances of beta1^S tissue effect
se_beta1 <- model_1$sigma*sqrt((solve(t(design_1)%*%design_1))[2,2])
head(cbind(se_beta1,model_1$sigma*sqrt(model_1$cov.coefficients[2,2])))
##     se_beta1           
## 1 0.04498742 0.04498742
## 2 0.18270096 0.18270096
## 3 0.05901818 0.05901818
## 4 0.06034491 0.06034491
## 5 0.12512973 0.12512973
## 6 0.04983582 0.04983582
# <---- get variances of beta2 methylation effect
for (g in 1:length(se_beta2)) {
  design_2g <- model.matrix(~ as.numeric(methyl_pair[g,]))  
  sigma_g <- model_1$sigma[g]
  se_beta2[index] <- sigma_g*sqrt((solve(t(design_2g)%*%design_2g))[2,2])
}
# <---- get variances of beta3 tissue effect
A <- solve(t(design_1)%*%design_1)%*%t(design_1)
A
##                 1     2      3      4     5     6      7      8
## (Intercept) 0.125 0.125  0.125  0.125 0.125 0.125  0.125  0.125
## design[, 2] 0.125 0.125 -0.125 -0.125 0.125 0.125 -0.125 -0.125
contr.vector <- array(c(0,1), dim=c(2,1))
# compute beta3 by hand
for (g in 1:length(se_beta3)) {
  # No longer transposed
  M_g <- (methyl_pair[g,])
  design_2g <- model.matrix(~ as.numeric(M_g))  
  A_2g <- solve(t(design_2g)%*%design_2g)%*%t(design_2g)
  sigma_g <- model_1$sigma[g]
  var_beta2g <- (se_beta2^2)[g]
  var_part1 <- (se_beta1[g])^2
  var_part2 <- ( A%*%M_g%*%var_beta2g%*%t(M_g)%*%t(A) )[2,2]
  var_part3 <- ( 2*(sigma_g^2)*A%*%t(A_2g)%*%contr.vector%*%t(M_g)%*%t(A) )[2,2]
  se_beta3[g] <- sqrt(var_part1 + var_part2 + var_part3)
}

M_g
## A_21792_HIC B_28126_HIC  C_3649_HIC D_40300_HIC E_28815_HIC F_28834_HIC 
##   0.7359793   1.2133200   0.3658041   0.1312820   0.7136109   0.7585443 
##  G_3624_HIC  H_3651_HIC 
##   0.2500032   0.2697561
design_2g
##   (Intercept) as.numeric(M_g)
## 1           1       0.7359793
## 2           1       1.2133200
## 3           1       0.3658041
## 4           1       0.1312820
## 5           1       0.7136109
## 6           1       0.7585443
## 7           1       0.2500032
## 8           1       0.2697561
## attr(,"assign")
## [1] 0 1
A_2g
##                          1          2          3          4          5
## (Intercept)     0.01602557 -0.2710621  0.2386605  0.3797095 0.02947863
## as.numeric(M_g) 0.19642553  0.7138988 -0.2048722 -0.4591118 0.17217650
##                           6          7          8
## (Intercept)     0.002454247  0.3083068  0.2964268
## as.numeric(M_g) 0.220887733 -0.3304090 -0.3089955
sigma_g
##      1537 
## 0.3391738
var_beta2g
## [1] 0.1247108
var_part1
##       1537 
## 0.01437986
var_part2
## [1] 0.01126712
var_part3
## [1] 0.02253425
se_beta3[g]
## [1] 0.2195022
# cov(beta1,beta3)
for (g in 1:length(cov_beta13)) {
  # No longer transposed
  M_g <- (methyl_pair[g,])
  design_2g <- model.matrix(~ as.numeric(M_g))  
  A_2g <- solve(t(design_2g)%*%design_2g)%*%t(design_2g)
  sigma_g <- model_1$sigma[g]
  var_part1 <- (se_beta1[g])^2
  cov_beta13[g] <- var_part1 - (sigma_g^2)*((A %*% t(A_2g) %*%contr.vector%*%t(M_g)%*%t(A))[2,2])
}
M_g
## A_21792_HIC B_28126_HIC  C_3649_HIC D_40300_HIC E_28815_HIC F_28834_HIC 
##   0.7359793   1.2133200   0.3658041   0.1312820   0.7136109   0.7585443 
##  G_3624_HIC  H_3651_HIC 
##   0.2500032   0.2697561
design_2g
##   (Intercept) as.numeric(M_g)
## 1           1       0.7359793
## 2           1       1.2133200
## 3           1       0.3658041
## 4           1       0.1312820
## 5           1       0.7136109
## 6           1       0.7585443
## 7           1       0.2500032
## 8           1       0.2697561
## attr(,"assign")
## [1] 0 1
A_2g
##                          1          2          3          4          5
## (Intercept)     0.01602557 -0.2710621  0.2386605  0.3797095 0.02947863
## as.numeric(M_g) 0.19642553  0.7138988 -0.2048722 -0.4591118 0.17217650
##                           6          7          8
## (Intercept)     0.002454247  0.3083068  0.2964268
## as.numeric(M_g) 0.220887733 -0.3304090 -0.3089955
sigma_g
##      1537 
## 0.3391738
var_part1
##       1537 
## 0.01437986
head(cov_beta13)
## [1] 0.001913866 0.031565377 0.000943251 0.003545917 0.008777301 0.001359960
# cov(beta1-beta3)
cov_diff_sqrt <- sqrt(se_beta1^2 + se_beta3^2 - 2*cov_beta13)
beta_diff <- beta1-beta3

## Run adaptive shrinkage (with Matthew Stephens' package ashr)

fit_ash <- ash(as.vector(beta_diff), cov_diff_sqrt, mode=0)
## Due to absence of package REBayes, switching to EM algorithm
names(fit_ash$result)
##  [1] "betahat"       "sebetahat"     "NegativeProb"  "PositiveProb" 
##  [5] "lfsr"          "svalue"        "lfdr"          "qvalue"       
##  [9] "PosteriorMean" "PosteriorSD"
summary(fit_ash$result$svalue)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00000 0.02664 0.08948 0.09538 0.15337 0.25011
# lfsr is the minimum of PositiveProb and NegativeProb
# look at genes at various cut-off, say svalue < .01 etc. do they make sense?

summary(fit_ash$result$svalue < 0.05)
##    Mode   FALSE    TRUE 
## logical    1009     528
ind_sig <- (fit_ash$result$svalue < FSR_level)
ind_sig[ind_sig == TRUE] <- "red"
ind_sig[ind_sig == FALSE] <- "black"

make_plot <- cbind(beta1-beta3, cov_diff_sqrt, ind_sig, fit_ash$result$lfdr)
colnames(make_plot) <- c("Diff", "SE", "Sign", "LFDR")
make_plot[,1] <- as.numeric(make_plot[,1])
make_plot[,2] <- as.numeric(make_plot[,2])
make_plot[,3] <- as.character(make_plot[,3])
make_plot[,4] <- as.numeric(make_plot[,4])


plot(x=make_plot[,1], y=make_plot[,2],
     ylab = "Standard error of the difference",
     xlab = "Difference in effect size from 2 lms", 
     main = "Difference of effect size (DE genes only)",
     pch=16, cex=.6, col = make_plot[,3])
legend("topleft", legend=c("s < 0.05", "s >= 0.05"), col=c("red", "black"), pch=16:16, cex=0.8)

Run Lauren’s information

# import sample labels
samples <- read.delim("/Users/laurenblake/Desktop/Reg_Evo_Primates/data/Sample_info_RNAseq_RIN.txt")
# expression
exprs <- read.table("/Users/laurenblake/Desktop/Reg_Evo_Primates/data/human_chimp_orth_exp_methyl_7725_hum.txt", sep="")
# methylation data
methyl <- read.csv("/Users/laurenblake/Desktop/Reg_Evo_Primates/data/chimp_human_orth_7725_avg_methyl_per_ts_gene.txt", sep="")

# Normalized gene expression data
cpm.voom.cyclic <- readRDS("/Users/laurenblake/Desktop/Reg_Evo_Primates/data/human_chimp_orth_cpm_voom_cyclic.rds")

Function that will perform the interspecies analysis- DE genes

two_species <- c(1, 5, 9, 13, 20, 24, 28)


## Determine DE genes (for tissue analysis)

# Factors that will go in the lm
chimp_human_heart <- two_species
index_sample <- chimp_human_heart
FDR_level <- FDR_level

# Get variables
species <- as.data.frame(samples[,2])
tissue <- as.data.frame(samples[,3])
RIN <- as.data.frame(samples[,4])

# Sort tissue samples
  tissue <- as.data.frame(samples[,2])
  tissue <- tissue[chimp_human_heart,]
  tissue_no_extra <- droplevels.factor(tissue)

# Run the linear model in limma
  design <- model.matrix(~ as.factor(tissue_no_extra) + RIN[chimp_human_heart,])
  fit_all <- lmFit(cpm.voom.cyclic[,chimp_human_heart], design)
  fit_all <- eBayes(fit_all)

# Get results
  HvC_Heart_fit_all = topTable(fit_all, coef=2, adjust="BH", number=Inf, sort.by="none")
  HvC_Heart_fit_all_5perc <-
HvC_Heart_fit_all[which(HvC_Heart_fit_all$adj.P.Val < FDR_level), ]

# Find which DE genes have methylation values associated with it
    human_chimp_heart <-  rownames(exprs) %in% HvC_Heart_fit_all_5perc$genes
  human_chimp_heart <- as.data.frame(human_chimp_heart)
  counts_genes_in <- cbind(exprs, human_chimp_heart)
  counts_genes_in_cutoff <- subset(counts_genes_in, human_chimp_heart ==
"TRUE")
  counts_genes_in_cutoff <- counts_genes_in_cutoff[,1:79]
  expression_values_only <- counts_genes_in_cutoff[,2:32]
  methylation_values_only <- counts_genes_in_cutoff[,49:79]
  dim(expression_values_only)
## [1] 841  31
  dim(methylation_values_only)
## [1] 841  31
# Rename is that we can use Joyce's code
exprs_subset <- expression_values_only
methyl_subset <- methylation_values_only
  
# check agreement between methylation data and expression data
all.equal(rownames(methyl_subset), rownames(exprs_subset))
## [1] TRUE
all.equal(colnames(methyl_subset), colnames(exprs_subset))
## [1] "31 string mismatches"
# <---- sample data matching
samples_sub <- samples[samples$Sample_ID %in% colnames(exprs_subset),]


exprs_pair <- exprs_subset[,index_sample]
methyl_pair <- methyl_subset[,index_sample]
species <- droplevels.factor(samples$Species[index_sample])
tissue <- droplevels.factor(samples$Tissue[index_sample])
RIN <- samples$RIN[index_sample]


# Extract parameters
N <- ncol(exprs_pair)
ngenes <- nrow(methyl_pair)
# <---- Model 1: exprs ~ tissue
# specify tissue coding
design_1 <- model.matrix(~species)
design_1[design_1[,2]==0,2] <- -1
model_1 <- lmFit(exprs_pair, design_1)

# <---- Model 3: exprs corrected for methylation ~ tissue
# specify design matrix
resid_exprs <- array(0, dim = dim(exprs_pair))
for (index in 1:nrow(exprs_pair)){
  resid_exprs[index,] <- lm(t(exprs_pair[index, ]) ~ t(methyl_pair[index, ]))$resid
}
rownames(resid_exprs) <- rownames(exprs_pair)
model_3 <- lmFit(resid_exprs, design_1)  


## Effect size differences

# get effect sizes
beta1 <- coef(model_1[,2])
beta3 <- coef(model_3[,2])
se_beta1 <- se_beta2 <- se_beta3 <- cov_beta13 <- vector("numeric", ngenes)
# <---- get variances of beta1^S tissue effect
se_beta1 <- model_1$sigma*sqrt((solve(t(design_1)%*%design_1))[2,2])
head(cbind(se_beta1,model_1$sigma*sqrt(model_1$cov.coefficients[2,2])))
##                   se_beta1           
## ENSG00000001460 0.20282819 0.20282819
## ENSG00000001461 0.08017058 0.08017058
## ENSG00000002549 0.15558525 0.15558525
## ENSG00000004660 0.24901178 0.24901178
## ENSG00000005249 0.18313310 0.18313310
## ENSG00000006715 0.12843370 0.12843370
# <---- get variances of beta2 methylation effect
for (g in 1:length(se_beta2)) {
  design_2g <- model.matrix(~ as.numeric(methyl_pair[g,]))  
  sigma_g <- model_1$sigma[g]
  se_beta2[index] <- sigma_g*sqrt((solve(t(design_2g)%*%design_2g))[2,2])
}
# <---- get variances of beta3 tissue effect
A <- solve(t(design_1)%*%design_1)%*%t(design_1)
A
##                   1      2      3      4         5         6         7
## (Intercept)   0.125  0.125  0.125  0.125 0.1666667 0.1666667 0.1666667
## specieshuman -0.125 -0.125 -0.125 -0.125 0.1666667 0.1666667 0.1666667
contr.vector <- array(c(0,1), dim=c(2,1))
# compute beta3 by hand
for (g in 1:length(se_beta3)) {
  M_g <- t(methyl_pair[g,])
  design_2g <- model.matrix(~ as.numeric(M_g))  
  A_2g <- solve(t(design_2g)%*%design_2g)%*%t(design_2g)
  sigma_g <- model_1$sigma[g]
  var_beta2g <- (se_beta2^2)[g]
  var_part1 <- (se_beta1[g])^2
  var_part2 <- ( A%*%M_g%*%var_beta2g%*%t(M_g)%*%t(A) )[2,2]
  var_part3 <- ( 2*(sigma_g^2)*A%*%t(A_2g)%*%contr.vector%*%t(M_g)%*%t(A) )[2,2]
  se_beta3[g] <- sqrt(var_part1 + var_part2 + var_part3)
}


M_g
##       ENSG00000258289
## C1H.y      0.02474320
## C2H.y      0.03117289
## C3H.y      0.02145714
## C4H.y      0.02420818
## H2H.y      0.05260557
## H3H.y      0.01832013
## H4H.y      0.03945830
design_2g
##   (Intercept) as.numeric(M_g)
## 1           1      0.02474320
## 2           1      0.03117289
## 3           1      0.02145714
## 4           1      0.02420818
## 5           1      0.05260557
## 6           1      0.01832013
## 7           1      0.03945830
## attr(,"assign")
## [1] 0 1
A_2g
##                          1         2           3          4          5
## (Intercept)      0.3351809 0.1118732   0.4493078  0.3537624 -0.6324979
## as.numeric(M_g) -6.3513483 1.0232222 -10.1203055 -6.9649877 25.6055235
##                           6          7
## (Intercept)       0.5582583 -0.1758847
## as.numeric(M_g) -13.7183150 10.5262109
sigma_g
## ENSG00000258289 
##       0.1489349
var_beta2g
## [1] 25.44129
var_part1
## ENSG00000258289 
##     0.003234818
var_part2
## [1] 0.0008264884
var_part3
## [1] 0.001652977
se_beta3[g]
## [1] 0.07559288
# cov(beta1,beta3)
for (g in 1:length(cov_beta13)) {
  M_g <- t(methyl_pair[g,])
  design_2g <- model.matrix(~ as.numeric(M_g))  
  A_2g <- solve(t(design_2g)%*%design_2g)%*%t(design_2g)
  sigma_g <- model_1$sigma[g]
  var_part1 <- (se_beta1[g])^2
  cov_beta13[g] <- var_part1 - (sigma_g^2)*((A %*% t(A_2g) %*%contr.vector%*%t(M_g)%*%t(A))[2,2])
}
  
M_g
##       ENSG00000258289
## C1H.y      0.02474320
## C2H.y      0.03117289
## C3H.y      0.02145714
## C4H.y      0.02420818
## H2H.y      0.05260557
## H3H.y      0.01832013
## H4H.y      0.03945830
design_2g
##   (Intercept) as.numeric(M_g)
## 1           1      0.02474320
## 2           1      0.03117289
## 3           1      0.02145714
## 4           1      0.02420818
## 5           1      0.05260557
## 6           1      0.01832013
## 7           1      0.03945830
## attr(,"assign")
## [1] 0 1
A_2g
##                          1         2           3          4          5
## (Intercept)      0.3351809 0.1118732   0.4493078  0.3537624 -0.6324979
## as.numeric(M_g) -6.3513483 1.0232222 -10.1203055 -6.9649877 25.6055235
##                           6          7
## (Intercept)       0.5582583 -0.1758847
## as.numeric(M_g) -13.7183150 10.5262109
sigma_g
## ENSG00000258289 
##       0.1489349
var_part1
## ENSG00000258289 
##     0.003234818
head(cov_beta13)
## [1] 0.038281376 0.002943356 0.023614060 0.035912007 0.031561566 0.015360660
# cov(beta1-beta3)
cov_diff_sqrt <- sqrt(se_beta1^2 + se_beta3^2 - 2*cov_beta13)
beta_diff <- beta1-beta3

## Run adaptive shrinkage (with Matthew Stephens' package ashr)

fit_ash <- ash(as.vector(beta_diff), cov_diff_sqrt, mode=0)
## Due to absence of package REBayes, switching to EM algorithm
names(fit_ash$result)
##  [1] "betahat"       "sebetahat"     "NegativeProb"  "PositiveProb" 
##  [5] "lfsr"          "svalue"        "lfdr"          "qvalue"       
##  [9] "PosteriorMean" "PosteriorSD"
# lfsr is the minimum of PositiveProb and NegativeProb
# look at genes at various cut-off, say svalue < .01 etc. do they make sense?

ind_sig <- (fit_ash$result$svalue < FSR_level)