rm(list = ls())
setwd("~/Dropbox/professional/astress/jacobs/rna_pilot/shanahan_data/")
dataDirectory <- "~/Dropbox/professional/astress/jacobs/rna_pilot/shanahan_data/"

# Expression features are high dimensional - more parameters than observations or 'overparameterized' - and strongly correlated. 
# These properties mean non-identifiability and high variance (overfitting). 

#  establish that the microarray really adds something to the predictive performance of known risk factors
# P-value of the test: exact, asymptotic, permutation arguments.
# Warning: Regulation of gene expression in higher eukariotes is very complex and far from being well understood. We have incomplete knowledge of cell signalling networks.
# We don't have enough data for a hold-out set, to validate our the predictions of our algorithm. 
# The PAXgene Blood RNA System consists of a blood collection tube (PAXgene Blood RNA Tube) and a nucleic acid purification kit (PAXgene Blood RNA Kit). 
# It is intended for the collection, transport, and storage of blood and stabilization of intracellular RNA in a closed tube and subsequent isolation and purification of intracellular RNA from whole blood for RT-PCR used in molecular diagnostic testing.

# Infer dependence within/between two multivariate systems A, B: e.g. predicting components/subsystems/structure of A from B (gene set expression increases from phenotype/exposure/treatment).
# Infer causal dependence?
# Which properties of the joint (A, B) can be safely assumed - topology of A? A|B conditional distributions? - and which are hypothetical.

# Test complex responses to multivariate traits/treatments.
# In general, there are two assumptions for causal inference from observational data: no unmeasured confounding variables and/or measured instrumental variable
# Constrain prediction with prior data raw or distilled/reasoned - e.g. into network information (PPI network smoothing over a network structure).
# Signal pathway view is limited: pathways are connected to form a large network
# regulatory motifs, such as positive and negative feedback and feedforward loops, that process information. Key regulators of plasticity were highly connected nodes required for the formation of regulatory motifs, indicating the potential importance of such motifs in determining cellular choices between homeostasis and plasticity.

# One central challenge of systems biology is to identify the subset of key components and regulatory interactions whose perturbation or tuning leads to significant functional changes (e.g., changes in a crop's fitness under environmental stress or changes in the state of malfunctioning cells, thereby combating disease). 
# Mathematical modeling can assist in this process by integrating the behavior of multiple components into a comprehensive model that goes beyond human intuition, and also by addressing questions that are not yet accessible to experimental analysis.
# In recent years, theoretical and computational analysis of biochemical networks has been successfully applied to well-defined metabolic pathways, signal transduction, and gene regulatory networks [1–3]. 
# In parallel, high-throughput experimental methods have enabled the construction of genome-scale maps of transcription factor–DNA and protein–protein interactions [4,5]. 
# The former are quantitative, dynamic descriptions of experimentally well-studied cellular pathways with relatively few components, while the latter are static maps of potential interactions with no information about their timing or kinetics. 
# Here we introduce a novel approach that stands in the middle ground of the above-mentioned methods by incorporating the synthesis and dynamic modeling of complex cellular networks that contain diverse, yet only qualitatively known regulatory interactions.
# Explore the extent to which the network topology determines the dynamic behavior of the system.
# Methods: dynamic simulation. 

# Networks which connect molecular components within a cell
##########################################################################################
# Physical molectular interaction networks, i.e. cell-signalling, gene-regulatory, protein-protein interaction networks:
# Network reconstruction from literature: networks manually by experts, semi-automatic identifiattion, completely automatically by natural language processing
# Network reconstruction from experimental data: 

# Epistasis network (not direct physical, but still somehow connects molecular components) 

# Time-series and/or experimental data used to infer a directed and signed graph

# Network to connect cell networks to other things, e.g. drugs to targets
#############################################
# bipartite graphs
# proteins operate as part of highly interconnected cellular networks referred to as interactome networks
# disease-gene products
# update the single drug–single target paradigm, just as single protein–single function relations are somewhat limited

# nodes connected through abstract interactions, genes connected based on which disease is caused by thier mutation. 
#############################################

# Functional association networks 
#############################################
# network representation which connect molecules with more abstract quantities can be used for data integration (Tanay 2004 PNAS)

library.cran <- function(p) { 

    if (!p %in% installed.packages()) { install.packages(p) } 
    library(p, character.only=1)

}

source("https://bioconductor.org/biocLite.R") # first time install

## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help

library.bioconductor <- function(p) { 

    if (!p %in% installed.packages()) { biocLite(p) } 
    library(p, character.only=1)

}

library.bioconductor("gplots")

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library.bioconductor("devtools")
library.bioconductor("Biobase")

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colnames,
##     do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unsplit, which, which.max,
##     which.min

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

library.bioconductor("dendextend")

## 
## ---------------------
## Welcome to dendextend version 1.3.0
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------

## 
## Attaching package: 'dendextend'

## The following object is masked from 'package:stats':
## 
##     cutree

library.bioconductor("Glimma")
library.cran("tidyr")
library.cran("Hmisc")

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:Biobase':
## 
##     combine, contents

## The following object is masked from 'package:BiocGenerics':
## 
##     combine

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

#The 53-gene "CTRA" set listed below includes (a) 19 proinflammatory genes which are upregulated in CTRA "on average" (b) 31 genes involved in type I IFN responses down-regulated in the CTRA (c) 3 genes involved in antibody synthesis down-regulated in the CTRA. These molecules have been historically designated by their HGNC names (HUGO gene nomenclature committee).
inflamatory     = c("IL1A", "IL1B", "IL6", "IL8", "TNF", "PTGS1", "PTGS2",
                    "FOS", "FOSB", "FOSL1", "FOSL2", "JUN", "JUNB", "JUND",
                    "NFKB1", "NFKB2", "REL", "RELA", "RELB")
interferonTypeI = c("GBP1", "IFI16", "IFI27", "IFI27L1", "IFI27L2", "IFI30",
                    "IFI35", "IFI44", "IFI44L", "IFI6", "IFIH1", "IFIT1",
                    "IFIT2", "IFIT3", "IFIT5", "IFIT1L", "IFITM1", "IFITM2",
                    "IFITM3", "IFITM4P", "IFITM5", "IFNB1", "IRF2", "IRF7",
                    "IRF8", "MX1", "MX2", "OAS1", "OAS2", "OAS3", "OASL")
antibody        = c("IGJ", "IGLL1", "IGLL3")
ctra0           = c(inflamatory, interferonTypeI, antibody)

ctraCore0       = c("IRF7", "JUN", "IGJ", "IL8", "IL1B", "FOSB", "FOSL2", "IFIT3", "IFI35", "IFI44L", "MX1", "OAS2")
ctraCore        = c("IRF7", "JUN", "JCHAIN", "CXCL8", "IL1B", "FOSB", "FOSL2", "IFIT3", "IFI35", "IFI44L", "MX1", "OAS2")


#To connect our data to previous CTRA publications we must relate three sets of gene identifiers: names according to the HGNC (as above), names according to the Illumina Human HT-12 v4 BeadArray (used for original CTRA publications) and names according to our own HuGene 2.0 ST array V1. To do this we queried the Ensembl Genes 85 database (homo sapiens, archived in July 2016) via the biomaRt interface. 
library("dplyr")

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:Hmisc':
## 
##     combine, src, summarize

## The following object is masked from 'package:Biobase':
## 
##     combine

## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

options(dplyr.print_max = 15)
library.bioconductor("Biobase")
eData   <- exprs # convenience
library.bioconductor("biomaRt")
ensembl <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", 
                   dataset = "hsapiens_gene_ensembl", 
                   host = "jul2016.archive.ensembl.org")


#First note that 4 of 53 HGNC names have been replaced since the original CTRA publications.
a = getBM(attributes = c("hgnc_symbol"),
          filters = "hgnc_symbol",
          values = ctra0,
          mart = ensembl) %>% tbl_df
(missing = setdiff(ctra0, a$hgnc_symbol))

## [1] "IL8"    "IFIT1L" "IGJ"    "IGLL3"

# We will therefore use the newer names (confirming they are all present).
inflamatory     = replace(inflamatory, inflamatory == "IL8", "CXCL8")
interferonTypeI = replace(interferonTypeI , interferonTypeI == "IFIT1L", "IFIT1B")
antibody        = replace(antibody, antibody == "IGJ", "JCHAIN")
antibody        = replace(antibody, antibody == "IGLL3", "IGLL3P")
ctra            = c(inflamatory, interferonTypeI, antibody)

a = getBM(attributes = c("hgnc_symbol"),
          filters = "hgnc_symbol",
          values = ctra,
          mart = ensembl) %>% tbl_df
length(setdiff(ctra, a$hgnc_symbol)) == 0 # check all present now

## [1] TRUE

#Oddly, Illumina Human HT-12 v4 BeadArrays do not assay 2 of 53 CTRA genes, according to the Ensembl Genes 85 database (NAs below).
(a <- getBM(attributes = c("hgnc_symbol", "illumina_humanht_12_v4"),
            filters = "hgnc_symbol",
            values = c("IFI30", "IFIH1"),
            mart = ensembl))

##   hgnc_symbol illumina_humanht_12_v4
## 1       IFI30                     NA
## 2       IFIH1                     NA

#Confirm that IFI30 and IFIH1 are the only two genes not assayed by Illumina Human HT-12 v4 BeadArrays ("illumina_humanht_12_v4"). 
a <- getBM(attributes = c("hgnc_symbol", "illumina_humanht_12_v4"),
           filters = "hgnc_symbol",
           values = ctra,
           mart = ensembl) %>% tbl_df
a <- a %>% mutate(illumina_humanht_12_v4 = replace(illumina_humanht_12_v4, illumina_humanht_12_v4 =="", NA))
b <- a %>% filter(!is.na(illumina_humanht_12_v4)) # remove NA
(missing_ctra = setdiff(ctra, b$hgnc_symbol))

## [1] "IFI30" "IFIH1"

#Similarly, our HuGene 2.0 ST array V1 ("affy_hugene_2_0_st_v1") did not assay 3 of 53 CTRA genes.
a <- getBM(attributes = c("hgnc_symbol", "affy_hugene_2_0_st_v1"),
           filters = "hgnc_symbol",
           values = ctra,
           mart = ensembl) %>% tbl_df
b <- a %>% filter(!is.na(affy_hugene_2_0_st_v1)) # filter out missing values
(missing_ctra          = setdiff(ctra, b$hgnc_symbol))

## [1] "GBP1"   "MX2"    "IGLL3P"

(found_ctra            = setdiff(ctra, missing_ctra))

##  [1] "IL1A"    "IL1B"    "IL6"     "CXCL8"   "TNF"     "PTGS1"   "PTGS2"  
##  [8] "FOS"     "FOSB"    "FOSL1"   "FOSL2"   "JUN"     "JUNB"    "JUND"   
## [15] "NFKB1"   "NFKB2"   "REL"     "RELA"    "RELB"    "IFI16"   "IFI27"  
## [22] "IFI27L1" "IFI27L2" "IFI30"   "IFI35"   "IFI44"   "IFI44L"  "IFI6"   
## [29] "IFIH1"   "IFIT1"   "IFIT2"   "IFIT3"   "IFIT5"   "IFIT1B"  "IFITM1" 
## [36] "IFITM2"  "IFITM3"  "IFITM4P" "IFITM5"  "IFNB1"   "IRF2"    "IRF7"   
## [43] "IRF8"    "MX1"     "OAS1"    "OAS2"    "OAS3"    "OASL"    "JCHAIN" 
## [50] "IGLL1"

# (found_inflamatory     = setdiff(inflamatory, missing_ctra))
# (found_interferonTypeI = setdiff(interferonTypeI, missing_ctra))
# (found_antibody        = setdiff(antibody, missing_ctra))
# setequal(found_ctra, union_all(found_antibody, found_inflamatory, found_interferonTypeI))

# above is also on see rnaAnnotationr.RMd
#########################################

# "Tidy data": Hadley wickam and relational databases
# row = observational unit (one type of observational unit per table)
# col = variable/attribute

# # Each value belongs to both variable and observation
# common messiness: column headers are values not variable names (i.e. there is logical structure across columns, i.e.mutually exclusive)
# wide represents features horrizontally.
# Multiple variables stored in one column is messy, but there should be multiple values in each column. 
# The most important function in tidyr is gather(). It should be used when you have columns that are not variables and you want to collapse them into key-value pairs.
# 
# 
# "ExpressionSet" data is not tidy
# col = sample unit
# row = variable

load(file = "~/Dropbox/professional/astress/jacobs/rna_pilot/shanahan_data/dat.RData")
load(file = "~/Dropbox/professional/astress/jacobs/rna_pilot/shanahan_data/feat.RData")

library.cran("ggplot2")

# check extraneous bmi.dta file is redundant with what we already have
library.cran("readstata13") # load from stata
datBmi <- read.dta13("bmi.dta") # from  michael shanahan
left_join(pData(dat), datBmi, by = "IDNum") %>% dplyr::select(bmi.y, bmi.x) %>% mutate(difff = bmi.y-bmi.x) # the bmi in dat and datBmi are equal (up to rounding error)

##       bmi.y   bmi.x         difff
## 1  12.85531 12.8553  1.139374e-05
## 2  14.18973 14.1897  3.445892e-05
## 3  12.83677 12.8368 -2.994232e-05
## 4  19.71369 19.7137 -1.401062e-05
## 5  13.00550 13.0055 -1.602173e-07
## 6  18.91965 18.9196  4.721680e-05
## 7  15.91152 15.9115  2.286530e-05
## 8  14.20525 14.2052  4.787903e-05
## 9  13.78431 13.7843  1.415558e-05
## 10 12.63848 12.6385 -1.695251e-05
## 11 12.69575 12.6957  4.546814e-05
## 12 18.75143 18.7514  3.241882e-05
## 13 14.63973 14.6397  3.331451e-05
## 14 16.39249 16.3925 -1.152039e-05
## 15 15.93015 15.9301  4.621735e-05
## 16 14.32487 14.3249 -2.512207e-05
## 17 15.59034 15.5903  3.966064e-05
## 18 12.73161 12.7316  9.344482e-06
## 19 15.36717 15.3672 -3.443451e-05
## 20 14.55585 14.5558  4.526062e-05
## 21 14.60934 14.6093  3.589935e-05
## 22 17.40762 17.4076  1.947632e-05
## 23 19.14368 19.1437 -2.324219e-05
## 24 13.87395 13.8740 -4.904175e-05
## 25 15.07893 15.0789  2.513275e-05
## 26 13.88633 13.8863  3.441925e-05
## 27 14.82876 14.8288 -3.794556e-05
## 28 16.94716 16.9472 -4.309387e-05
## 29 17.43950 17.4395  8.087158e-07
## 30 13.35859 13.3586 -6.059265e-06
## 31 16.46633 16.4663  2.957458e-05

dat$IDNum <- as.character(dat$IDNum)
# if(0) {
# 
#     ################################################################################
#     ## Checks 
#     ################################################################################
# 
#     # unpack
#     edata  <- dat %>% eData
#     fdata  <- dat %>% fData
#     pdata  <- dat %>% pData
# 
#     # Sample data
#     pdata %>% is.na %>% colSums # which columns have NA
#     sum(pdata$chen_R_A_Avg == "?") # but NA is
# 
#     # Gene data
#     edata %>% is.na  %>% rowSums %>% table # which rows have missing values
# 
#     # Empirical distributions
#     boxplot(edata)                         # marginal
#     lg2edata = log2(edata +1)
#     boxplot(edata)                         # marginal
#     boxplot(lg2edata)                      # marginal
#     qqplot(edata[, 1], edata[, 2])         # bivariate (confirms quantile normalization)
#     abline(c(0, 1))
#     heatmap(edata, Rowv = NA, Colv = NA)   # multivariate (no odd samples)
# 
#     search = c("Nsun3", "Polrmt")
#     a <- getBM(attributes = c("hgnc_symbol", 'affy_hugene_2_0_st_v1'),
#                filters = "hgnc_symbol",
#                values = search,
#                mart = ensembl) %>% tbl_df
# 
#     #External check: Correlate Y-chromosome genes with phenotype gender (Select all Affymetrix identifiers on the "affy_hugene_2_0_st_v1" chip and for genes located on Y chromosome).
#     a <- getBM(attributes = c("chromosome_name", 'affy_hugene_2_0_st_v1'),
#                filters = c('chromosome_name'),
#                values = list("Y"),
#                mart = ensembl) %>% tbl_df
#     a <- a %>% 
#     filter(!is.na(affy_hugene_2_0_st_v1)) %>% 
#     mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1))
#     yChromeExpression <- dat[a$affy_hugene_2_0_st_v1, ] %>% 
#     eData %>% 
#     tbl_df %>% 
#     summarize_each(funs(mean)) %>% 
#     unlist 
#     title("should split sample by gender")
# 
# }

listAttributes(ensembl, page = "feature_page")

##                                     name
## 1                        ensembl_gene_id
## 2                  ensembl_transcript_id
## 3                     ensembl_peptide_id
## 4                        ensembl_exon_id
## 5                            description
## 6                        chromosome_name
## 7                         start_position
## 8                           end_position
## 9                                 strand
## 10                                  band
## 11                      transcript_start
## 12                        transcript_end
## 13              transcription_start_site
## 14                     transcript_length
## 15                        transcript_tsl
## 16              transcript_gencode_basic
## 17                     transcript_appris
## 18                    external_gene_name
## 19                  external_gene_source
## 20              external_transcript_name
## 21       external_transcript_source_name
## 22                      transcript_count
## 23                 percentage_gc_content
## 24                          gene_biotype
## 25                    transcript_biotype
## 26                                source
## 27                     transcript_source
## 28                                status
## 29                     transcript_status
## 30                               version
## 31                    transcript_version
## 32                 phenotype_description
## 33                           source_name
## 34                     study_external_id
## 35                                 go_id
## 36                             name_1006
## 37                       definition_1006
## 38                       go_linkage_type
## 39                        namespace_1003
## 40                  goslim_goa_accession
## 41                goslim_goa_description
## 42                          arrayexpress
## 43                                chembl
## 44         clone_based_ensembl_gene_name
## 45   clone_based_ensembl_transcript_name
## 46            clone_based_vega_gene_name
## 47      clone_based_vega_transcript_name
## 48                                  ccds
## 49                             dbass3_id
## 50                           dbass3_name
## 51                             dbass5_id
## 52                           dbass5_name
## 53                                  embl
## 54                     ens_hs_transcript
## 55                    ens_hs_translation
## 56                          ens_lrg_gene
## 57                    ens_lrg_transcript
## 58                            entrezgene
## 59            entrezgene_transcript_name
## 60                                   epd
## 61                                genedb
## 62                                   hpa
## 63                                  ottg
## 64                                  ottt
## 65                                  ottp
## 66                               hgnc_id
## 67                           hgnc_symbol
## 68                  hgnc_transcript_name
## 69                                merops
## 70                    mim_gene_accession
## 71                  mim_gene_description
## 72                     mirbase_accession
## 73                            mirbase_id
## 74               mirbase_transcript_name
## 75                            mim_morbid
## 76                                   pdb
## 77                            protein_id
## 78                              reactome
## 79                         reactome_gene
## 80                   reactome_transcript
## 81                           refseq_mrna
## 82                 refseq_mrna_predicted
## 83                          refseq_ncrna
## 84                refseq_ncrna_predicted
## 85                        refseq_peptide
## 86              refseq_peptide_predicted
## 87                                  rfam
## 88                  rfam_transcript_name
## 89                            rnacentral
## 90                                  ucsc
## 91                               unigene
## 92                               uniparc
## 93                      uniprot_sptrembl
## 94                     uniprot_swissprot
## 95                      uniprot_genename
## 96                         wikigene_name
## 97                           wikigene_id
## 98                  wikigene_description
## 99                       agilent_cgh_44b
## 100    efg_agilent_sureprint_g3_ge_8x60k
## 101 efg_agilent_sureprint_g3_ge_8x60k_v2
## 102     efg_agilent_wholegenome_4x44k_v1
## 103     efg_agilent_wholegenome_4x44k_v2
## 104                         affy_hc_g110
## 105                        affy_hg_focus
## 106                  affy_hg_u133_plus_2
## 107                      affy_hg_u133a_2
## 108                        affy_hg_u133a
## 109                        affy_hg_u133b
## 110                       affy_hg_u95av2
## 111                         affy_hg_u95b
## 112                         affy_hg_u95c
## 113                         affy_hg_u95d
## 114                         affy_hg_u95e
## 115                         affy_hg_u95a
## 116                        affy_hugenefl
## 117                         affy_hta_2_0
## 118                  affy_huex_1_0_st_v2
## 119                affy_hugene_1_0_st_v1
## 120                affy_hugene_2_0_st_v1
## 121                       affy_primeview
## 122                        affy_u133_x3p
## 123                             codelink
## 124                illumina_humanwg_6_v1
## 125                illumina_humanwg_6_v2
## 126                illumina_humanwg_6_v3
## 127               illumina_humanht_12_v3
## 128               illumina_humanht_12_v4
## 129               illumina_humanref_8_v3
## 130        illumina_human_methylation_27
## 131       illumina_human_methylation_450
## 132                     phalanx_onearray
## 133                               family
## 134                   family_description
## 135                                pirsf
## 136                          pirsf_start
## 137                            pirsf_end
## 138                          superfamily
## 139                    superfamily_start
## 140                      superfamily_end
## 141                                smart
## 142                          smart_start
## 143                            smart_end
## 144                                hamap
## 145                          hamap_start
## 146                            hamap_end
## 147                              profile
## 148                        profile_start
## 149                          profile_end
## 150                              prosite
## 151                        prosite_start
## 152                          prosite_end
## 153                               prints
## 154                         prints_start
## 155                           prints_end
## 156                                 pfam
## 157                           pfam_start
## 158                             pfam_end
## 159                              tigrfam
## 160                        tigrfam_start
## 161                          tigrfam_end
## 162                               gene3d
## 163                         gene3d_start
## 164                           gene3d_end
## 165                           hmmpanther
## 166                     hmmpanther_start
## 167                       hmmpanther_end
## 168                          blastprodom
## 169                    blastprodom_start
## 170                      blastprodom_end
## 171                             interpro
## 172           interpro_short_description
## 173                 interpro_description
## 174                       interpro_start
## 175                         interpro_end
## 176                       low_complexity
## 177                 low_complexity_start
## 178                   low_complexity_end
## 179                 transmembrane_domain
## 180           transmembrane_domain_start
## 181             transmembrane_domain_end
## 182                        signal_domain
## 183                  signal_domain_start
## 184                    signal_domain_end
## 185                               ncoils
## 186                         ncoils_start
## 187                           ncoils_end
##                                           description         page
## 1                                     Ensembl Gene ID feature_page
## 2                               Ensembl Transcript ID feature_page
## 3                                  Ensembl Protein ID feature_page
## 4                                     Ensembl Exon ID feature_page
## 5                                         Description feature_page
## 6                                     Chromosome Name feature_page
## 7                                     Gene Start (bp) feature_page
## 8                                       Gene End (bp) feature_page
## 9                                              Strand feature_page
## 10                                               Band feature_page
## 11                              Transcript Start (bp) feature_page
## 12                                Transcript End (bp) feature_page
## 13                     Transcription Start Site (TSS) feature_page
## 14         Transcript length (including UTRs and CDS) feature_page
## 15                     Transcript Support Level (TSL) feature_page
## 16                           GENCODE basic annotation feature_page
## 17                                  APPRIS annotation feature_page
## 18                               Associated Gene Name feature_page
## 19                             Associated Gene Source feature_page
## 20                         Associated Transcript Name feature_page
## 21                       Associated Transcript Source feature_page
## 22                                   Transcript count feature_page
## 23                                       % GC content feature_page
## 24                                          Gene type feature_page
## 25                                    Transcript type feature_page
## 26                                      Source (gene) feature_page
## 27                                Source (transcript) feature_page
## 28                                      Status (gene) feature_page
## 29                                Status (transcript) feature_page
## 30                                     Version (gene) feature_page
## 31                               Version (transcript) feature_page
## 32                              Phenotype description feature_page
## 33                                        Source name feature_page
## 34                           Study External Reference feature_page
## 35                                  GO Term Accession feature_page
## 36                                       GO Term Name feature_page
## 37                                 GO Term Definition feature_page
## 38                              GO Term Evidence Code feature_page
## 39                                          GO domain feature_page
## 40                            GOSlim GOA Accession(s) feature_page
## 41                             GOSlim GOA Description feature_page
## 42                                       ArrayExpress feature_page
## 43                                       ChEMBL ID(s) feature_page
## 44                      Clone based Ensembl gene name feature_page
## 45                Clone based Ensembl transcript name feature_page
## 46                         Clone based VEGA gene name feature_page
## 47                   Clone based VEGA transcript name feature_page
## 48                                            CCDS ID feature_page
## 49  Database of Aberrant 3' Splice Sites (DBASS3) IDs feature_page
## 50                                   DBASS3 Gene Name feature_page
## 51  Database of Aberrant 5' Splice Sites (DBASS5) IDs feature_page
## 52                                   DBASS5 Gene Name feature_page
## 53                                  EMBL (Genbank) ID feature_page
## 54                       Ensembl Human Transcript IDs feature_page
## 55                      Ensembl Human Translation IDs feature_page
## 56                           LRG to Ensembl link gene feature_page
## 57                     LRG to Ensembl link transcript feature_page
## 58                                      EntrezGene ID feature_page
## 59                      EntrezGene transcript name ID feature_page
## 60           Eukaryotic Promoter Database (EPD) ID(s) feature_page
## 61                                             GeneDB feature_page
## 62                    Human Protein Atlas Antibody ID feature_page
## 63                             VEGA gene ID(s) (OTTG) feature_page
## 64                       VEGA transcript ID(s) (OTTT) feature_page
## 65                          VEGA protein ID(s) (OTTP) feature_page
## 66                                         HGNC ID(s) feature_page
## 67                                        HGNC symbol feature_page
## 68                               HGNC transcript name feature_page
## 69                                          MEROPS ID feature_page
## 70                                 MIM Gene Accession feature_page
## 71                               MIM Gene Description feature_page
## 72                               miRBase Accession(s) feature_page
## 73                                      miRBase ID(s) feature_page
## 74                            miRBase transcript name feature_page
## 75                                         MIM MORBID feature_page
## 76                                             PDB ID feature_page
## 77               Protein (Genbank) ID [e.g. AAA02487] feature_page
## 78                                        Reactome ID feature_page
## 79                                   Reactome gene ID feature_page
## 80                             Reactome transcript ID feature_page
## 81                    RefSeq mRNA [e.g. NM_001195597] feature_page
## 82          RefSeq mRNA predicted [e.g. XM_001125684] feature_page
## 83                      RefSeq ncRNA [e.g. NR_002834] feature_page
## 84            RefSeq ncRNA predicted [e.g. XR_108264] feature_page
## 85              RefSeq Protein ID [e.g. NP_001005353] feature_page
## 86    RefSeq Predicted Protein ID [e.g. XP_001720922] feature_page
## 87                                            Rfam ID feature_page
## 88                               Rfam transcript name feature_page
## 89                                      RNACentral ID feature_page
## 90                                            UCSC ID feature_page
## 91                                         Unigene ID feature_page
## 92                                            UniParc feature_page
## 93                           UniProt/TrEMBL Accession feature_page
## 94                        UniProt/SwissProt Accession feature_page
## 95                               UniProt Accession ID feature_page
## 96                                      WikiGene Name feature_page
## 97                                        WikiGene ID feature_page
## 98                               WikiGene Description feature_page
## 99                              Agilent CGH 44b probe feature_page
## 100               Agilent SurePrint G3 GE 8x60k probe feature_page
## 101            Agilent SurePrint G3 GE 8x60k v2 probe feature_page
## 102                Agilent WholeGenome 4x44k v1 probe feature_page
## 103                Agilent WholeGenome 4x44k v2 probe feature_page
## 104                             Affy HC G110 probeset feature_page
## 105                            Affy HG FOCUS probeset feature_page
## 106                      Affy HG U133-PLUS-2 probeset feature_page
## 107                          Affy HG U133A_2 probeset feature_page
## 108                            Affy HG U133A probeset feature_page
## 109                            Affy HG U133B probeset feature_page
## 110                           Affy HG U95AV2 probeset feature_page
## 111                             Affy HG U95B probeset feature_page
## 112                             Affy HG U95C probeset feature_page
## 113                             Affy HG U95D probeset feature_page
## 114                             Affy HG U95E probeset feature_page
## 115                             Affy HG U95A probeset feature_page
## 116                           Affy HuGene FL probeset feature_page
## 117                             Affy HTA-2_0 probeset feature_page
## 118                      Affy HuEx 1_0 st v2 probeset feature_page
## 119                    Affy HuGene 1_0 st v1 probeset feature_page
## 120                    Affy HuGene 2_0 st v1 probeset feature_page
## 121                                    Affy primeview feature_page
## 122                            Affy U133 X3P probeset feature_page
## 123                                    Codelink probe feature_page
## 124                       Illumina HumanWG 6 v1 probe feature_page
## 125                       Illumina HumanWG 6 v2 probe feature_page
## 126                       Illumina HumanWG 6 v3 probe feature_page
## 127                    Illumina Human HT 12 V3 probe  feature_page
## 128                    Illumina Human HT 12 V4 probe  feature_page
## 129                     Illumina Human Ref 8 V3 probe feature_page
## 130               Illumina human methylation 27 probe feature_page
## 131              Illumina human methylation 450 probe feature_page
## 132                            Phalanx OneArray probe feature_page
## 133                      Ensembl Protein Family ID(s) feature_page
## 134                        Ensembl Family Description feature_page
## 135                                          PIRSF ID feature_page
## 136                                       PIRSF start feature_page
## 137                                         PIRSF end feature_page
## 138                                    SUPERFAMILY ID feature_page
## 139                                 SUPERFAMILY start feature_page
## 140                                   SUPERFAMILY end feature_page
## 141                                          SMART ID feature_page
## 142                                       SMART start feature_page
## 143                                         SMART end feature_page
## 144                                HAMAP Accession ID feature_page
## 145                                       HAMAP start feature_page
## 146                                         HAMAP end feature_page
## 147                                         Pfscan ID feature_page
## 148                                      Pfscan start feature_page
## 149                                        Pfscan end feature_page
## 150                                    ScanProsite ID feature_page
## 151                                 ScanProsite start feature_page
## 152                                   ScanProsite end feature_page
## 153                                         PRINTS ID feature_page
## 154                                      PRINTS start feature_page
## 155                                        PRINTS end feature_page
## 156                                           Pfam ID feature_page
## 157                                        Pfam start feature_page
## 158                                          Pfam end feature_page
## 159                                        TIGRFAM ID feature_page
## 160                                     TIGRFAM start feature_page
## 161                                       TIGRFAM end feature_page
## 162                                         Gene3D ID feature_page
## 163                                      Gene3D start feature_page
## 164                                        Gene3D end feature_page
## 165                                     HMMPanther ID feature_page
## 166                                  HMMPanther start feature_page
## 167                                    HMMPanther end feature_page
## 168                                    BlastProdom ID feature_page
## 169                                 BlastProdom start feature_page
## 170                                   BlastProdom end feature_page
## 171                                       Interpro ID feature_page
## 172                        Interpro Short Description feature_page
## 173                              Interpro Description feature_page
## 174                                    Interpro start feature_page
## 175                                      Interpro end feature_page
## 176                              low complexity (SEG) feature_page
## 177                        low complexity (SEG) start feature_page
## 178                          low complexity (SEG) end feature_page
## 179                      Transmembrane domain (tmhmm) feature_page
## 180                Transmembrane domain (tmhmm) start feature_page
## 181                  Transmembrane domain (tmhmm) end feature_page
## 182                                    signal peptide feature_page
## 183                              signal peptide start feature_page
## 184                                signal peptide end feature_page
## 185                              coiled coil (ncoils) feature_page
## 186                        coiled coil (ncoils) start feature_page
## 187                          coiled coil (ncoils) end feature_page

listAttributes(ensembl) %>% filter(grepl("go", description, ignore.case = T)) # identify relevant attributes

##                                     name
## 1                                  go_id
## 2                              name_1006
## 3                        definition_1006
## 4                        go_linkage_type
## 5                         namespace_1003
## 6                   goslim_goa_accession
## 7                 goslim_goa_description
## 8          ggorilla_homolog_ensembl_gene
## 9  ggorilla_homolog_associated_gene_name
## 10      ggorilla_homolog_ensembl_peptide
## 11           ggorilla_homolog_chromosome
## 12          ggorilla_homolog_chrom_start
## 13            ggorilla_homolog_chrom_end
## 14       ggorilla_homolog_orthology_type
## 15              ggorilla_homolog_subtype
## 16 ggorilla_homolog_orthology_confidence
## 17            ggorilla_homolog_goc_score
## 18         ggorilla_homolog_wga_coverage
## 19              ggorilla_homolog_perc_id
## 20           ggorilla_homolog_perc_id_r1
## 21                   ggorilla_homolog_dn
## 22                   ggorilla_homolog_ds
##                                         description         page
## 1                                 GO Term Accession feature_page
## 2                                      GO Term Name feature_page
## 3                                GO Term Definition feature_page
## 4                             GO Term Evidence Code feature_page
## 5                                         GO domain feature_page
## 6                           GOSlim GOA Accession(s) feature_page
## 7                            GOSlim GOA Description feature_page
## 8                           Gorilla Ensembl Gene ID     homologs
## 9                      Gorilla associated gene name     homologs
## 10         Gorilla Ensembl Protein or Transcript ID     homologs
## 11                          Gorilla Chromosome Name     homologs
## 12                    Gorilla Chromosome start (bp)     homologs
## 13                      Gorilla Chromosome end (bp)     homologs
## 14                            Gorilla homology type     homologs
## 15                Last common ancestor with Gorilla     homologs
## 16     Gorilla orthology confidence [0 low, 1 high]     homologs
## 17            Gorilla Gene-order conservation score     homologs
## 18          Gorilla Whole-genome alignment coverage     homologs
## 19 %id. target Gorilla gene identical to query gene     homologs
## 20 %id. query gene identical to target Gorilla gene     homologs
## 21                                  dN with Gorilla     homologs
## 22                                  dS with Gorilla     homologs

attr   <-  c("affy_hugene_2_0_st_v1", "hgnc_symbol") 

# Inflamatory CTRA Affy id annotated
(inflame <- getBM(attributes = attr[1:2],
                  filters = attr[2],
                  values = inflamatory,
                  mart = ensembl) %>% tbl_df)

## # A tibble: 38 × 2
##    affy_hugene_2_0_st_v1 hgnc_symbol
##                    <int>       <chr>
## 1               16967771       CXCL8
## 2               16786587         FOS
## 3                     NA         FOS
## 4               16863287        FOSB
## 5               16740630       FOSL1
## 6                     NA       FOSL2
## 7               16878443       FOSL2
## 8               16901974        IL1A
## 9               16901986        IL1B
## 10                    NA        IL1B
## # ... with 28 more rows

(inflame <- inflame %>% 
 filter(!is.na(affy_hugene_2_0_st_v1)) %>% 
 mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1)))

## # A tibble: 27 × 2
##    affy_hugene_2_0_st_v1 hgnc_symbol
##                    <chr>       <chr>
## 1               16967771       CXCL8
## 2               16786587         FOS
## 3               16863287        FOSB
## 4               16740630       FOSL1
## 5               16878443       FOSL2
## 6               16901974        IL1A
## 7               16901986        IL1B
## 8               17044177         IL6
## 9               16687875         JUN
## 10              16858710        JUNB
## # ... with 17 more rows

(inflame %>% filter(!is.na(affy_hugene_2_0_st_v1)) %>% summarise(n_distinct(hgnc_symbol))) # describe: all inflamatory molecules present

## # A tibble: 1 × 1
##   `n_distinct(hgnc_symbol)`
##                       <int>
## 1                        19

# Interferon CTRA Affy id annotated
(interfere <- getBM(attributes = attr[1:2],
                    filters = attr[2],
                    values = interferonTypeI,
                    mart = ensembl) %>% tbl_df)

## # A tibble: 54 × 2
##    affy_hugene_2_0_st_v1 hgnc_symbol
##                    <int>       <chr>
## 1                     NA        GBP1
## 2                     NA       IFI16
## 3               16672390       IFI16
## 4                     NA       IFI27
## 5               16787814       IFI27
## 6                     NA     IFI27L1
## 7               16787799     IFI27L1
## 8                     NA     IFI27L2
## 9               16796229     IFI27L2
## 10                    NA       IFI30
## # ... with 44 more rows

(interfere <- interfere %>% 
 filter(!is.na(affy_hugene_2_0_st_v1)) %>% 
 mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1)))

## # A tibble: 34 × 2
##    affy_hugene_2_0_st_v1 hgnc_symbol
##                    <chr>       <chr>
## 1               16672390       IFI16
## 2               16787814       IFI27
## 3               16787799     IFI27L1
## 4               16796229     IFI27L2
## 5               16859763       IFI30
## 6               16834545       IFI35
## 7               16666509       IFI44
## 8               16666485      IFI44L
## 9               16684080        IFI6
## 10              16904365       IFIH1
## # ... with 24 more rows

(interfere %>% filter(!is.na(affy_hugene_2_0_st_v1)) %>% summarise(n_distinct(hgnc_symbol))) # describe: all inflamatory molecules present

## # A tibble: 1 × 1
##   `n_distinct(hgnc_symbol)`
##                       <int>
## 1                        29

# found CTRA Affy id annotated
found  <- getBM(attributes = attr,
                filters = attr[2],
                values = found_ctra,
                mart = ensembl) %>% tbl_df
found <- found %>% 
filter(!is.na(affy_hugene_2_0_st_v1)) %>% 
mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1))
(found %>% filter(!is.na(affy_hugene_2_0_st_v1)) %>% count(hgnc_symbol) %>% print(n = Inf)) # duplicates for IFITM4P, REL and TNF

## # A tibble: 50 × 2
##    hgnc_symbol     n
##          <chr> <int>
## 1        CXCL8     1
## 2          FOS     1
## 3         FOSB     1
## 4        FOSL1     1
## 5        FOSL2     1
## 6        IFI16     1
## 7        IFI27     1
## 8      IFI27L1     1
## 9      IFI27L2     1
## 10       IFI30     1
## 11       IFI35     1
## 12       IFI44     1
## 13      IFI44L     1
## 14        IFI6     1
## 15       IFIH1     1
## 16       IFIT1     1
## 17      IFIT1B     1
## 18       IFIT2     1
## 19       IFIT3     1
## 20       IFIT5     1
## 21      IFITM1     1
## 22      IFITM2     1
## 23      IFITM3     1
## 24     IFITM4P     6
## 25      IFITM5     1
## 26       IFNB1     1
## 27       IGLL1     1
## 28        IL1A     1
## 29        IL1B     1
## 30         IL6     1
## 31        IRF2     1
## 32        IRF7     1
## 33        IRF8     1
## 34      JCHAIN     1
## 35         JUN     1
## 36        JUNB     1
## 37        JUND     1
## 38         MX1     1
## 39       NFKB1     1
## 40       NFKB2     1
## 41        OAS1     1
## 42        OAS2     1
## 43        OAS3     1
## 44        OASL     1
## 45       PTGS1     1
## 46       PTGS2     1
## 47         REL     2
## 48        RELA     1
## 49        RELB     1
## 50         TNF     8

## # A tibble: 50 × 2
##    hgnc_symbol     n
##          <chr> <int>
## 1        CXCL8     1
## 2          FOS     1
## 3         FOSB     1
## 4        FOSL1     1
## 5        FOSL2     1
## 6        IFI16     1
## 7        IFI27     1
## 8      IFI27L1     1
## 9      IFI27L2     1
## 10       IFI30     1
## # ... with 40 more rows

(feat %>% filter(hgnc_symbol %in% c("TNF", "IFITM4P", "REL")))   # confirm duplicates for IFITM4P, REL and TNF

## # A tibble: 15,024 × 9
##    affy_hugene_2_0_st_v1 hgnc_symbol entrezgene ensembl_gene_id
##                    <chr>       <chr>      <int>           <chr>
## 1               17030620         TNF       7124 ENSG00000232810
## 2               17030620         TNF       7124 ENSG00000232810
## 3               17030620         TNF       7124 ENSG00000232810
## 4               17030620         TNF       7124 ENSG00000232810
## 5               17030620         TNF       7124 ENSG00000232810
## 6               17030620         TNF       7124 ENSG00000232810
## 7               17030620         TNF       7124 ENSG00000232810
## 8               17030620         TNF       7124 ENSG00000232810
## 9               17030620         TNF       7124 ENSG00000232810
## 10              17030620         TNF       7124 ENSG00000232810
## # ... with 15,014 more rows, and 5 more variables: description <chr>,
## #   phenotype_description <chr>, definition_1006 <chr>, name_1006 <chr>,
## #   go_id <chr>

# subset features/samples
i <- inflame$affy_hugene_2_0_st_v1
i <- found$affy_hugene_2_0_st_v1
j <- !is.na(dat$Chen_Avg_ShCa)
dat0  = dat[i, j]

# Tidy edat
library("tibble") # for rownames_to_column()
# average across replicated features (Affy ids with the same hgnc_symbol),  then create matrix
edat  <- dat0 %>% 
eData %>% 
tbl_df %>% 
rownames_to_column() %>% 
mutate(hgnc_symbol = found$hgnc_symbol) %>% 
group_by(hgnc_symbol) %>% 
summarise_at(vars(matches("[0-9]")), funs(mean)) 

# Transpose to attain statistics/relational-database/tidy convention (transposed microarray convention)
edatt <- edat %>% dplyr::select(matches("[0-9]")) %>% t %>% as.data.frame() %>% tibble::rownames_to_column() %>% tbl_df
colnames(edatt) = c("IDNum", edat$hgnc_symbol)

# just relevant phenotype, then long format
pdat  <- pData(dat0) %>% dplyr::select(IDNum, Chen_Avg_ShCa, bmi) 
jdat  <- left_join(pdat, edatt, by = "IDNum") %>% tbl_df %>% gather(hgnc_symbol, express, -(1:3)) 

# clumsy way to add molecule group labels
jdat <- jdat %>% 
mutate(isInflame =         1*(hgnc_symbol %in% inflamatory)) %>%  
mutate(isinterferonTypeI = 2*(hgnc_symbol %in% interferonTypeI)) %>%  
mutate(isAntibody =        3*(hgnc_symbol %in% antibody)) %>%
mutate(molecule = as.factor(isInflame + isinterferonTypeI + isAntibody)) 
table(jdat$molecule)

## 
##   1   2   3 
## 475 725  50

levels(jdat$molecule) = list(inflamatory="1", interferon="2", antibody="3") 
table(jdat$molecule)

## 
## inflamatory  interferon    antibody 
##         475         725          50

jdat <- jdat %>% dplyr::select(-isInflame, -isinterferonTypeI, -isAntibody)

# standardize features
jdat <- jdat %>% group_by(hgnc_symbol) %>% mutate(expressStandardized = (express - mean(express)))

# jdat %>% dplyr::select(Chen_Avg_ShCa, bmi, express) %>% plot()

# some plots
library.cran("ggvis")

## 
## Attaching package: 'ggvis'

## The following object is masked from 'package:ggplot2':
## 
##     resolution

jdat %>% group_by(hgnc_symbol) %>% ggvis(~express, fill = ~hgnc_symbol) %>% layer_densities()

jdat %>% group_by(hgnc_symbol) %>% ggvis(~expressStandardized, fill = ~hgnc_symbol) %>% layer_densities()

jdat %>% filter(molecule == "interferon") %>% group_by(hgnc_symbol) %>% ggvis(~express, fill = ~hgnc_symbol) %>% layer_densities()

jdat %>% filter(molecule == "inflamatory") %>% group_by(hgnc_symbol) %>% ggvis(~express, fill = ~hgnc_symbol) %>% layer_densities()

jdat %>% filter(molecule == "antibody") %>% group_by(hgnc_symbol) %>% ggvis(~express, fill = ~hgnc_symbol) %>% layer_densities()

jdat %>% group_by(molecule) %>% ggvis(~express, fill = ~molecule) %>% layer_densities()

jdat %>% group_by(molecule) %>% ggvis(~express, fill = ~molecule) %>% layer_histograms()

## Guessing width = 0.2 # range / 42

jdat %>%
  filter(molecule == "interferon") %>%
  group_by(hgnc_symbol) %>%
  ggvis(~Chen_Avg_ShCa, ~expressStandardized) %>%
  layer_points() %>%
  layer_model_predictions(formula = expressStandardized ~ Chen_Avg_ShCa, model = "lm", stroke = ~molecule)

edat %>% dplyr::select(matches("[0-9]")) %>% as.matrix %>% heatmap()

jdat %>%
  filter(molecule == "interferon", IDNum != "1092") %>%
  group_by(hgnc_symbol) %>%
  ggvis(~Chen_Avg_ShCa, ~expressStandardized) %>%
  layer_points() %>%
  layer_model_predictions(formula = expressStandardized ~ Chen_Avg_ShCa, model = "lm", stroke = ~molecule)

jdat %>%
  filter(molecule == "inflamatory") %>%
  group_by(hgnc_symbol) %>%
  ggvis(~Chen_Avg_ShCa, ~expressStandardized) %>%
  layer_points() %>%
  layer_model_predictions(formula = expressStandardized ~ Chen_Avg_ShCa, model = "lm", stroke = ~molecule)

jdat %>%
  filter(molecule == "antibody") %>%
  group_by(hgnc_symbol) %>%
  ggvis(~Chen_Avg_ShCa, ~expressStandardized) %>%
  layer_points() %>%
  layer_model_predictions(formula = expressStandardized ~ Chen_Avg_ShCa, model = "lm", stroke = ~molecule)

library.cran("lme4")

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:ggvis':
## 
##     band

## The following object is masked from 'package:tidyr':
## 
##     expand

lmer(expressStandardized ~ Chen_Avg_ShCa + (1|IDNum) + (1|hgnc_symbol), data = (jdat %>% filter(molecule == "interferon"))) %>% summary

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## expressStandardized ~ Chen_Avg_ShCa + (1 | IDNum) + (1 | hgnc_symbol)
##    Data: (jdat %>% filter(molecule == "interferon"))
## 
## REML criterion at convergence: 961.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8486 -0.4684  0.0226  0.4678  6.1390 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  hgnc_symbol (Intercept) 0.0000   0.0000  
##  IDNum       (Intercept) 0.1118   0.3344  
##  Residual                0.1987   0.4458  
## Number of obs: 725, groups:  hgnc_symbol, 29; IDNum, 25
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)   -0.07645    0.07892  -0.969
## Chen_Avg_ShCa -0.17779    0.08951  -1.986
## 
## Correlation of Fixed Effects:
##             (Intr)
## Chn_Avg_ShC 0.488

lmer(expressStandardized ~ Chen_Avg_ShCa + (1|IDNum) + (1|hgnc_symbol), data = (jdat %>% filter(molecule == "interferon", IDNum != "1092"))) %>% summary

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## expressStandardized ~ Chen_Avg_ShCa + (1 | IDNum) + (1 | hgnc_symbol)
##    Data: (jdat %>% filter(molecule == "interferon", IDNum != "1092"))
## 
## REML criterion at convergence: 617.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.4819 -0.5404  0.0308  0.5628  4.5436 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  hgnc_symbol (Intercept) 0.00000  0.0000  
##  IDNum       (Intercept) 0.02512  0.1585  
##  Residual                0.13187  0.3631  
## Number of obs: 696, groups:  hgnc_symbol, 29; IDNum, 24
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)   -0.10086    0.03957  -2.549
## Chen_Avg_ShCa -0.09437    0.04585  -2.058
## 
## Correlation of Fixed Effects:
##             (Intr)
## Chn_Avg_ShC 0.459

lmer(expressStandardized ~ Chen_Avg_ShCa + (1|IDNum) + (1|hgnc_symbol), data = (jdat %>% filter(molecule == "inflamatory"))) %>% summary

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## expressStandardized ~ Chen_Avg_ShCa + (1 | IDNum) + (1 | hgnc_symbol)
##    Data: (jdat %>% filter(molecule == "inflamatory"))
## 
## REML criterion at convergence: 261.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1652 -0.5755  0.0123  0.5861  3.0870 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  IDNum       (Intercept) 0.006626 0.0814  
##  hgnc_symbol (Intercept) 0.000000 0.0000  
##  Residual                0.095302 0.3087  
## Number of obs: 475, groups:  IDNum, 25; hgnc_symbol, 19
## 
## Fixed effects:
##                 Estimate Std. Error t value
## (Intercept)   -0.0007244  0.0247184  -0.029
## Chen_Avg_ShCa -0.0016846  0.0280330  -0.060
## 
## Correlation of Fixed Effects:
##             (Intr)
## Chn_Avg_ShC 0.488

lmer(expressStandardized ~ Chen_Avg_ShCa + (1|IDNum) + (1|hgnc_symbol), data = (jdat %>% filter(molecule == "antibody"))) %>% summary

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## expressStandardized ~ Chen_Avg_ShCa + (1 | IDNum) + (1 | hgnc_symbol)
##    Data: (jdat %>% filter(molecule == "antibody"))
## 
## REML criterion at convergence: 99.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.58070 -0.45096  0.04447  0.57130  2.28322 
## 
## Random effects:
##  Groups      Name        Variance  Std.Dev. 
##  IDNum       (Intercept) 4.698e-20 2.167e-10
##  hgnc_symbol (Intercept) 0.000e+00 0.000e+00
##  Residual                3.980e-01 6.309e-01
## Number of obs: 50, groups:  IDNum, 25; hgnc_symbol, 2
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)    0.01103    0.10219   0.108
## Chen_Avg_ShCa  0.02566    0.11590   0.221
## 
## Correlation of Fixed Effects:
##             (Intr)
## Chn_Avg_ShC 0.488

lmer(expressStandardized ~ Chen_Avg_ShCa + bmi + (1|IDNum) + (1|hgnc_symbol), data = (jdat %>% filter(molecule == "interferon"))) %>% summary

## Linear mixed model fit by REML ['lmerMod']
## Formula: expressStandardized ~ Chen_Avg_ShCa + bmi + (1 | IDNum) + (1 |  
##     hgnc_symbol)
##    Data: (jdat %>% filter(molecule == "interferon"))
## 
## REML criterion at convergence: 965.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8539 -0.4674  0.0199  0.4650  6.1337 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  hgnc_symbol (Intercept) 0.0000   0.0000  
##  IDNum       (Intercept) 0.1111   0.3333  
##  Residual                0.1987   0.4458  
## Number of obs: 725, groups:  hgnc_symbol, 29; IDNum, 25
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)    0.62762    0.66384   0.945
## Chen_Avg_ShCa -0.11261    0.10810  -1.042
## bmi           -0.04410    0.04128  -1.068
## 
## Correlation of Fixed Effects:
##             (Intr) C_A_SC
## Chn_Avg_ShC  0.608       
## bmi         -0.993 -0.564

lmer(expressStandardized ~ Chen_Avg_ShCa + (1|IDNum) + (1|hgnc_symbol), data = jdat) %>% summary

## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## expressStandardized ~ Chen_Avg_ShCa + (1 | IDNum) + (1 | hgnc_symbol)
##    Data: jdat
## 
## REML criterion at convergence: 1612
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.8706 -0.4719 -0.0024  0.4681  7.6510 
## 
## Random effects:
##  Groups      Name        Variance Std.Dev.
##  hgnc_symbol (Intercept) 0.00000  0.0000  
##  IDNum       (Intercept) 0.03373  0.1837  
##  Residual                0.20221  0.4497  
## Number of obs: 1250, groups:  hgnc_symbol, 50; IDNum, 25
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)   -0.04417    0.04452  -0.992
## Chen_Avg_ShCa -0.10273    0.05049  -2.034
## 
## Correlation of Fixed Effects:
##             (Intr)
## Chn_Avg_ShC 0.488

jdat %>% ggvis(~bmi, ~Chen_Avg_ShCa) %>% layer_points(size = ~bmi)

jdat %>% filter(IDNum != "1092") %>% ggvis(~bmi, ~Chen_Avg_ShCa) %>% layer_points(size = ~bmi)

jdat %>% filter(IDNum != "1092") %>% ggvis(~bmi, ~expressStandardized) %>% layer_points()

jdat %>% ggvis(~bmi) %>% layer_histograms()

## Guessing width = 0.2 # range / 36

jdat %>%
  group_by(hgnc_symbol) %>%
  ggvis(~Chen_Avg_ShCa, ~express) %>%
  layer_points(fill = ~hgnc_symbol) %>%
  layer_model_predictions(formula = express ~ Chen_Avg_ShCa, model = "loess", stroke = ~hgnc_symbol)

jdat %>%
  group_by(molecule) %>%
  ggvis(~Chen_Avg_ShCa, ~express) %>%
  layer_smooths(stroke = ~factor(molecule)) %>%
  layer_points(fill = ~factor(molecule))

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

# ggplot2
jdat %>% ggplot(aes(x = Chen_Avg_ShCa, y = express, col = hgnc_symbol)) +  geom_point(alpha = 0.2) + geom_smooth(method = "loess", se = FALSE, aes(col = hgnc_symbol))

jdat %>% ggplot(aes(x = Chen_Avg_ShCa, y = express, col = hgnc_symbol)) +  geom_point(alpha = 0.2) + geom_smooth(method = "lm",    se = FALSE, aes(col = hgnc_symbol))

# messy way to get a tidy df with one new collum giving molecule class name (inflammatory, antibody, interferon)
tidE_3 <- edat %>% 
mutate(isInflame = (hgnc_symbol %in% inflamatory)) %>%  
mutate(isinterferonTypeI = (hgnc_symbol %in% interferonTypeI)) %>%  
mutate(isAntibody = (hgnc_symbol %in% antibody)) %>% 
gather(key = moleculeFamily, value, isInflame, isinterferonTypeI, isAntibody) %>% 
filter(value == TRUE) %>% dplyr::select(-value) %>% 
group_by(moleculeFamily) %>% 
summarise_at(vars(matches("[0-9]")), funs(mean)) 
# call limma here, or just lm. This is one data compression.
tidE_4 <- rbind(tidE_3, c(NA, dat0$Chen_Avg_ShCa)) 
tidE_4 %>% dplyr::select(matches("[0-9]")) %>% t %>% rcorr( type = "spearman")

##       1     2     3     4
## 1  1.00 -0.21 -0.15  0.08
## 2 -0.21  1.00  0.22 -0.02
## 3 -0.15  0.22  1.00 -0.47
## 4  0.08 -0.02 -0.47  1.00
## 
## n= 25 
## 
## 
## P
##   1      2      3      4     
## 1        0.3065 0.4881 0.6997
## 2 0.3065        0.2994 0.9427
## 3 0.4881 0.2994        0.0180
## 4 0.6997 0.9427 0.0180

# format for linear model (limma)
matE  = edat %>% 
dplyr::select(matches("[0-9]")) %>% 
as.matrix()
rownames(matE) <- unique(edat$hgnc_symbol) 

# linear model
library.bioconductor("limma")

## 
## Attaching package: 'limma'

## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA

X          <- model.matrix(~ Chen_Avg_ShCa, data = pData(dat0))
Y          <- matE
fit        <- lmFit(Y, X)
ebayes     <- eBayes(fit) # Variance parameter inference is variable/unstable, because few replicates. T-statistic penalizes this instability (low df). In conventional Type-I paradigm must increase threshold, and loose power. Consider pseudo-replicates to borrow strength across.
(top       <- topTable(ebayes, coef="Chen_Avg_ShCa", adjust="fdr", n=100) %>% tbl_df %>% tibble::rownames_to_column("hgnc_symbol"))

## # A tibble: 50 × 7
##    hgnc_symbol      logFC  AveExpr         t     P.Value adj.P.Val
##          <chr>      <dbl>    <dbl>     <dbl>       <dbl>     <dbl>
## 1        IFIH1 -0.3251154 6.624734 -3.001100 0.005806392 0.2903196
## 2          MX1 -0.3892383 6.869637 -2.545291 0.017080578 0.3058802
## 3        IFIT3 -0.4474916 5.538752 -2.442369 0.021571633 0.3058802
## 4          JUN  0.2582925 4.075052  2.321041 0.028248913 0.3058802
## 5        IFI35 -0.1764350 6.506621 -2.094531 0.045935890 0.3058802
## 6        IFI44 -0.4198591 5.436538 -1.948257 0.062070961 0.3058802
## 7        IFIT5 -0.2439950 5.976889 -1.945992 0.062355743 0.3058802
## 8         OAS1 -0.2658196 6.822599 -1.908996 0.067170972 0.3058802
## 9         OASL -0.2509461 6.024771 -1.902800 0.068008277 0.3058802
## 10        OAS2 -0.2256543 6.871324 -1.851076 0.075359000 0.3058802
## # ... with 40 more rows, and 1 more variables: B <dbl>

(top       <- topTable(ebayes, coef="Chen_Avg_ShCa", adjust="fdr", number = Inf) %>% tbl_df %>% tibble::rownames_to_column("hgnc_symbol"))

## # A tibble: 50 × 7
##    hgnc_symbol      logFC  AveExpr         t     P.Value adj.P.Val
##          <chr>      <dbl>    <dbl>     <dbl>       <dbl>     <dbl>
## 1        IFIH1 -0.3251154 6.624734 -3.001100 0.005806392 0.2903196
## 2          MX1 -0.3892383 6.869637 -2.545291 0.017080578 0.3058802
## 3        IFIT3 -0.4474916 5.538752 -2.442369 0.021571633 0.3058802
## 4          JUN  0.2582925 4.075052  2.321041 0.028248913 0.3058802
## 5        IFI35 -0.1764350 6.506621 -2.094531 0.045935890 0.3058802
## 6        IFI44 -0.4198591 5.436538 -1.948257 0.062070961 0.3058802
## 7        IFIT5 -0.2439950 5.976889 -1.945992 0.062355743 0.3058802
## 8         OAS1 -0.2658196 6.822599 -1.908996 0.067170972 0.3058802
## 9         OASL -0.2509461 6.024771 -1.902800 0.068008277 0.3058802
## 10        OAS2 -0.2256543 6.871324 -1.851076 0.075359000 0.3058802
## # ... with 40 more rows, and 1 more variables: B <dbl>

post_hoc_set_level <- function(Y, X, perm = TRUE){
    # Geoman/Buhlmann 2007 (Table 4): Tuckey' non-competitive subject sampling 2 x 2 table. This adapts the historic post hoc method to test a self-contained null hypothesis and to calculate the P-value using subject sampling.
    # The null hypothesis assumes (X1, Y1), …, (Xn, Yn) are independent and identically distributed. Then the implied null disribution of any test statistic calculated from (X1, Y1),…, (Xn, Yn) has a distribution that can be non-parametrically constructed as the empirical distribution of the same test-statistic based on (X1, Yπ (1)), …, (Xn, Yπ (n)), where the distribution is generated from all (or many randomly generated) permutations π : {1, … ,n} → {1, … ,n}. 
    # Criticism: too much researcher df to define feature-levl "discoveries"
    # Criticism: The permutation null distribution requires the assumption that there is no relationship between the gene expressions 
    # and covariates: permuting destroys these associations. This makes the permutation null distribution less useful when covariates are present. 
    if(perm){
        perms      <- sample(dim(X)[1])
        X          <- X[perms, ]
    } 

    # some feature-level statistic of effect strength, e.g. linear model 
    fit        <- lmFit(Y, X)
    ebayes     <- eBayes(fit) # Variance parameter inference is variable/unstable, because few replicates. T-statistic penalizes this instability (low df). In conventional Type-I paradigm must increase threshold, and loose power. Consider pseudo-replicates to borrow strength across.
    top        <- topTable(ebayes, coef="Chen_Avg_ShCa", adjust="fdr", n=Inf) %>% tbl_df %>% tibble::rownames_to_column("hgnc_symbol")
    discovery  <- (top$P.Value < 0.05) # this is just a labeling and so no multiple comparison correction needed
    discovery  <- abs(top$logFC) > 0.3  # this is just a labeling and so no multiple comparison correction needed
    discovery  <- abs(top$t) > abs(qt(0.05,25))  # this is just a labeling and so no multiple comparison correction needed

    return(sum(discovery))
}

k_perm  = 1000 %>% replicate(post_hoc_set_level(Y = Y, X = X)) # null assuming exchangeablility 
(k_obs  = post_hoc_set_level(Y = Y, X = X, perm =  FALSE))      # observed

## [1] 14

(p_val_set_level = sum(k_perm >= k_obs) / length(k_perm))

## [1] 0.053

hist(k_perm, breaks = 1000) # visualize null

# Geomans Global Score test
# Being a score test, the global test is most focused on alternatives close to the null hypothesis. This means that the global test is good at detecting alternatives that have many small effects (in terms of the chosen parametrization), but that it may not be the optimal test to use if the effects are very large.
library.bioconductor("globaltest")

## 
## Attaching package: 'globaltest'

## The following object is masked from 'package:tidyr':
## 
##     extract

n_perm  = 1e5

Y = dat0$Chen_Avg_ShCa # NB change notation: Y = X and X = Y!!!!
X = matE %>% t %>% tbl_df
summary(gt(Y ~ 1, Y ~ ., data = X, permutations = n_perm))

## "gt.object" object from package globaltest
## 
## Call:
## gt(response = Y ~ 1, alternative = Y ~ ., data = X, permutations = n_perm) 
## 
## Model: linear regression.
## Degrees of freedom: 25 total; 1 null; 1 + 50 alternative.
## Null distibution:  99999 random permutations.
## 
##   p-value Statistic Expected Std.dev #Cov
## 1  0.0607      8.92     4.17    2.66   50

# with addtional confound bmi
bmi = dat0$bmi
summary(gt(Y ~ 1 + bmi, Y ~ . + bmi, data = X, permutations = n_perm))

## "gt.object" object from package globaltest
## 
## Call:
## gt(response = Y ~ 1 + bmi, alternative = Y ~ . + bmi, data = X,      permutations = n_perm) 
## 
## Model: linear regression.
## Degrees of freedom: 25 total; 2 null; 2 + 51 alternative.
## Null distibution:  99999 random permutations.
## 
##   p-value Statistic Expected Std.dev #Cov
## 1   0.408      3.74     4.15    2.78   51

# with all feature interactions
summary(gt(Y ~ 1, Y ~ 1 + .*., data = X, permutations = n_perm))

## "gt.object" object from package globaltest
## 
## Call:
## gt(response = Y ~ 1, alternative = Y ~ 1 + . * ., data = X, permutations = n_perm) 
## 
## Model: linear regression.
## Degrees of freedom: 25 total; 1 null; 1 + 1275 alternative.
## Null distibution:  99999 random permutations.
## 
##   p-value Statistic Expected Std.dev #Cov
## 1  0.0573      9.86     4.16    3.07 1275

hist(gt(Y,X, permutations=1e4))

# only interferon
interfer_ind = which(names(X) %in% interferonTypeI)
inflame_ind  = which(names(X) %in% inflamatory)
antibody_ind = which(names(X) %in% antibody)
summary(gt(Y ~ 1, Y ~ 1 + ., data = X[, interfer_ind], permutations = n_perm))

## "gt.object" object from package globaltest
## 
## Call:
## gt(response = Y ~ 1, alternative = Y ~ 1 + ., data = X[, interfer_ind],      permutations = n_perm) 
## 
## Model: linear regression.
## Degrees of freedom: 25 total; 1 null; 1 + 29 alternative.
## Null distibution:  99999 random permutations.
## 
##   p-value Statistic Expected Std.dev #Cov
## 1  0.0565      10.5     4.17    3.35   29

summary(gt(Y ~ 1, Y ~ 1 + ., data = X[, inflame_ind],  permutations = n_perm))

## "gt.object" object from package globaltest
## 
## Call:
## gt(response = Y ~ 1, alternative = Y ~ 1 + ., data = X[, inflame_ind],      permutations = n_perm) 
## 
## Model: linear regression.
## Degrees of freedom: 25 total; 1 null; 1 + 19 alternative.
## Null distibution:  99999 random permutations.
## 
##   p-value Statistic Expected Std.dev #Cov
## 1   0.417      4.23     4.17    1.71   19

summary(gt(Y ~ 1, Y ~ 1 + ., data = X[, antibody_ind], permutations = n_perm))

## "gt.object" object from package globaltest
## 
## Call:
## gt(response = Y ~ 1, alternative = Y ~ 1 + ., data = X[, antibody_ind],      permutations = n_perm) 
## 
## Model: linear regression.
## Degrees of freedom: 25 total; 1 null; 1 + 2 alternative.
## Null distibution:  99999 random permutations.
## 
##   p-value Statistic Expected Std.dev #Cov
## 1   0.568      1.87     4.19    4.96    2

all features?

X = eData(dat[, j]) %>% t %>% tbl_df

heatmaps

labDfns = as.character(as.numeric(dat0$Chen_Avg_ShCa < 0) + 1) par(mfrow = c(1, 3)) X[, inflame_ind] %>% t %>% heatmap(ColSideColors = labDfns) X[, interfer_ind] %>% t %>% heatmap(ColSideColors = labDfns) X[, antibody_ind] %>% t %>% heatmap(ColSideColors = labDfns)

Unsupervised gene sets (from hard k-means classification)

kmeans1 = kmeans(X[, ],centers=3, nstart = 1000) lm(dat0$Chen_Avg_ShCa ~ as.factor(kmeans1$cluster)) %>% summary plot(kmeans1$cluster, dat0$Chen_Avg_ShCa) # subject 1092 is outlying (number 3) lm(dat0$Chen_Avg_ShCa[-3] ~ as.factor(kmeans1$cluster[-3])) %>% summary plot(dat0$Chen_Avg_ShCa[-3], kmeans1$cluster[-3])

feat %>% group_by(hgnc_symbol) %>% summarise(n_distinct(affy_hugene_2_0_st_v1)) %>% dim # how many HGNC genes measured on our Affy feat %>% filter(grepl(“microRNA”, description, ignore.case = T)) query = “” query = “behavior” query = “aggressive” query = “defense” query = “learning” query = “virus” query = “regulation of gene expression” query = “immune” query = “fear” query = “inflammatory” query = “cytokine” query = “interferon” query = “response to cortisol” query = “response to epinephrine” a = feat %>% filter(grepl(query, name_1006, ignore.case = TRUE)) %>% dplyr::select(affy_hugene_2_0_st_v1, name_1006, go_id, hgnc_symbol) %>% filter(!duplicated(affy_hugene_2_0_st_v1)) # intersection with ctra? intersect(ctra, a$hgnc_symbol) %>% length

X = dat[a$affy_hugene_2_0_st_v1, j] %>% eData() %>% t summary(gt(Y ~ 1, Y ~ ., data = X, permutations = n_perm))

XY = left_join(pdat, X %>% as.data.frame() %>% tibble::rownames_to_column(“IDNum”) %>% tbl_df, by = “IDNum”)

XY = gather(XY, affy, express, matches(“[0-9]”)) # long format

XY %>% group_by(affy) %>% ggvis(~Chen_Avg_ShCa, ~express) %>% layer_points(fill = ~affy) %>% layer_model_predictions(formula = express ~ Chen_Avg_ShCa, model = “lm”, stroke = ~affy)

descriptives

a$name_1006 %>% unique a %>% dplyr::select(name_1006, go_id) %>% unique a %>% dplyr::select(affy_hugene_2_0_st_v1) %>% n_distinct() # how many affy probes in this ontology?

plot

cases = !is.na(dat$Chen_Avg_ShCa) chen = dat$Chen_Avg_ShCa[cases] y = dat[a$affy_hugene_2_0_st_v1, cases] %>% eData() rownames(y) = a$name_1006 rownames(y) = a$hgnc_symbol heatmap(y)

marginal correlations between chen and prespecified gene subsets

d = dim(y)[1] rho = rcorr(t(rbind(chen, y)), type = “spearman”) rho$r[1,2:d] %>% hist(breaks = 10) rho$P[1,2:d] %>% hist(breaks = 10) rho$P[1,2:d] %>% min

labels

km = kmeans(t(matE),centers=2, iter.max = 1000, nstart = 1000000) labKm = as.character(km$cluster) labDfns = as.character(as.numeric(dat0$Chen_Avg_ShCa <= 0) + 1) # plot heatmap(matE, ColSideColors = labKm, labRow = dat0$hgnc_symbol) heatmap(matE, ColSideColors = labDfns, labRow = dat0$hgnc_symbol) glMDSPlot(matE, groups = labKm) # mean inflamatory activation (subject_specific_inflamation = colMeans(matE)) %>% hist(breaks = 100) a = rcorr(t(rbind(subject_specific_inflamation, dat0$Chen_Avg_ShCa))) # component-wise inflamatory activation a = rcorr(t(rbind(matE, dat0$Chen_Avg_ShCa))) tidE$hgnc_symbol[a$P[20, ] < 0.05]

Core CTRA Affy id annotated (Steve Cole suggested this in email correspondance)

(ctra_core <- getBM(attributes = attr[1:2], filters = attr[2], values = ctraCore, mart = ensembl) %>% tbl_df) (ctra_core <- ctra_core %>% filter(!is.na(affy_hugene_2_0_st_v1)) %>% mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1))) # describe ctra_core %>% summarise(n_distinct(hgnc_symbol)) # all inflamatory molecules present dat0 = dat[ctra_core$affy_hugene_2_0_st_v1, !is.na(dat$Chen_Avg_ShCa)] tidE = dat0 %>% eData %>% tbl_df %>% tibble::rownames_to_column() %>% mutate(hgnc_symbol = ctra_core$hgnc_symbol) %>% group_by(hgnc_symbol) %>% summarise_at(vars(matches("[0-9]")), funs(mean)) matE = tidE %>% dplyr::select(matches("[0-9]")) %>% as.matrix() km = kmeans(t(matE),centers=3, iter.max = 1000, nstart = 1000000) labKm = as.character(km$cluster) heatmap(matE, labRow = tidE$hgnc_symbol, ColSideColors = labKm) glMDSPlot(matE, groups = labKm) # multidimensional scaling

hierarchical clustering

edata = log2(edata + 1)

dist1 = dist(t(matE)) heatmap(as.matrix(dist1),Colv=NA,Rowv=NA) hclust1 = hclust(dist1) plot(hclust1, labels = labels) plot(hclust1, labels = labels, hang = -1) dend = as.dendrogram(hclust1) dend = color_labels(hclust1,2,col=1:2) # identify/color 2 groups plot(dend) labels_colors(dend) <- labDfns plot(dend)

k-means

kmeans1 = kmeans(edata,centers=2) names(kmeans1) matplot(t(kmeans1$centers),col=1:2,type="l", lwd=3) table(kmeans1$cluster) heatmap(as.matrix(edata)[order(kmeans1$cluster),],col=colramp,Colv=NA,Rowv=NA) # reorder the data according to cluster membership

MA plot of fold-differences (log scale differences) ignores noise as explanation, so meaningless unless variance is constant over gene- or group.

a <- rowMeans(edata) d <- rowMeans(edata[, labels == “2”]) - rowMeans(edata[, labels == “1”]) smoothScatter(a, d, main=“MA plot”, xlab=“A”, ylab=“M”) abline(h=c(-1,1), col=“red”)

The goal of GWAS is often hypothesis-generating machines.

marginal vs conditional

Power to detect causal variants may vary between methods (real disease causing locus)

Many non-causal traits more prevalent in the “case population”

A major concern in GWAS is the need to account for the complicated dependence structure of the data, both between loci as well as between individuals. Dependence structure (due to discrete subpopulations or smooth spatial variatian, families, cryptic relatedness)

correcting for dependence among loci (for example, due to population structure).

The mixed model, which handles population structure by estimating the phenotypic covariance that is due to genetic relatedness—or kinship—between individuals, has previously been shown to perform well in GWAS.

disentangle genetic correlations from environmental correlations, whenever these are uncorrelated

Also dependent phenotypes! (A mixed-model approach for genome-wide association studies of correlated traits in structured populations)

In human genetics, disentangling genetic and environmental effects is also of obvious interest, although much more challenging, as the environment usually cannot be experimentally manipulated4.

see wikipedia for “Causes of population stratification”: non-random mating (smooth or discrete spatial variation, between isolated populations)

Conditional (controlling for while controlling for all other features/markers and the family wise error rate (FWER). The former rules out spurious relations between phenotypes and expression that can arise from marginal methods because the ‘spuriously correlated’ gene merely happens to be correlated with the ‘truly causal’ feature.

The two common GWAS designs are case–control studies which look for associations between SNPs and disease (discrete phenotype), and population-based studies which focus on finding associations between SNPs and continuous traits (McCarthy et al., 2008).

The larger goal of these studies is to function as hypothesis-generating machines, resulting in sets of loci that require further analysis. Thus GWAS are an important first step in the gene identification process (Cantor et al., 2010).

Whole genome DE analysis control bias and random variance. Error, e.g. “false positives”, due to bias and random variance (confounding vs aggregate noise). Bias is relative to the generating population, want causal effect, measure marginal confounded effect. Variance due to sampling process, for ex.

Conventional GWAS underpowered (type II) due to small effect, high dimensional noise corrections (MCP). They neglect joint (interactive?) effect, of multiple causal features. They don’t identify causal or conditional parameters.

Joint modeling of all SNPs is challenging. standard multivariable approaches require $d<n$. Genome-wide Complex Trait Analysis (GCTA), which is based on linear mixed models (Yang et al., 2011, 2014) and enables some joint analysis of SNPs.

GWA studies can be confounded by population stratification — systematic ancestry differences between cases and controls — which has previously been addressed by methods that infer genetic ancestry. Those

GWAS in inbred model organisms are confronted by the problem of inflated false positive rates due to population structure and genetic relatedness among inbred strains caused by the complex genealogical history of most model organism strains.

Conventional statistical tests of independence between a genetic marker and a phenotype are prone to spurious associations because the marker and the phenotype are likely to be correlated due to population structure that violates the independence assumption under the null hypothesis.

Our simulations considered a case–control study with two subpopulations, POP1 and POP2, with 300 cases and 200 controls from POP1 and 200 cases and 300 controls from POP2.

Population stratification was initially addressed using general linear model (GLM)-based methods such as structured association2, genomic control3 and family-based tests of association4. The introduction of MLM approaches has more recently been demonstrated as an improved method to simultaneously account for population structure and unequal relatedness among individuals5.

In the MLM-based methods, population structure2, 6 is fit as a fixed effect, whereas kinship among individuals is incorporated as the variance-covariance structure of the random effect for the individuals. Regardless of the applied statistical method, GWAS require large sample sizes to achieve sufficient statistical power7, especially in order to detect the small effect polymorphisms that underlie most complex traits8. For the MLM approach, datasets with these large sample sizes create a heavy computational burden because the computing time for solving a MLM increases with the cube of the number of individuals fit as a random effect. The earliest effort to reduce the size of the random effect in an MLM can be traced back to the sire model approach used in animal breeding9, 10, 11, 12, which replaces an individual’s genetic effect with that of its sire. Consequently, the sire-model approach requires pedigrees, which are not always available, and which in particular are often not available in plant studies. Even with available pedigrees, the use of a marker-based kinship is preferred because of its higher accuracy13, 14. The computing time is further increased because iteration is needed to estimate population parameters, such as variance components15, for each tested marker. Even though a number of studies have sought to improve the speed of the iteration process, including development of the recent efficient mixed-model association (EMMA) algorithm16, solving an MLM for a large number of individuals and markers remains computationally intensive. To address this issue, a residual approach was proposed based on a two-step strategy17. The first step optimized a reduced MLM with the genetic marker effect excluded. In the second step, the residual from the reduced MLM was fit as the dependent variable to test each marker in a GLM. Because the random genetic effect was not fit in the second step, iteration was not required when testing markers. This residual approach can be performed much faster than the one-step MLM with full optimization for all unknown parameters, but it has a statistical power equivalent to that of the full optimization approach only for traits with low heritability. We propose here methods to reduce the size of the random genetic effect in the absence of pedigree information and eliminate iterations to re-estimate the population parameters for each marker without compromising statistical power. We show that the joint use of these two methods greatly reduces computing time and maintains or even increases statistical power.

A limitation of population structure methods is that they do not model family structure or cryptic relatedness. These factors may lead to inflation in test statistics if they are not explicitly modelled because samples that are correlated are assumed to be uncorrelated: Although correcting for genetic ancestry and then dividing by the residual λGC will restore an appropriate null distribution, association statistics that explicitly account for family structure or cryptic relatedness are likely to achieve higher power owing to improved weighting of the data.

Linear mixed models (LMMs) in genome-wide association studies (GWAS) is now widely accepted1 because LMMs have been shown to be capable of correcting for several forms of confounding due to

genetic relatedness, such as population structure and familial relatedness1, and because recent advances have made them computationally efficient1, 2.

LMMs tackle confounding by using a matrix of pairwise genetic similarities to model the relatedness among subjects.

There is a long history of multi-trait models in quantitative genetics5, 6, 7, 8, 9, but these methods have rarely been applied to GWAS.

In many of the approaches, the variables are assumed fixed, but in many cases where the predictor variables are random, such as gene expression data, assumptions can be made that result in the same formulation as in fixed case [26]. One such assumptions is a joint multivariate normal distribution for response and predictors, other is an analysis of response conditioned upon the random predictors.

subset data

j <- !is.na(dat$Chen_Avg_ShCa) D <- dat[i, j] # subset features samples

linear model

X <- model.matrix(~ Chen_Avg_ShCa, data = pData(D)) y <- eData(D) fit <- lmFit(y, X) ebayes <- eBayes(fit) # Variance parameter inference is variable/unstable, because few replicates. T-statistic penalizes this instability (low df). In conventional Type-I paradigm must increase threshold, and loose power. Consider pseudo-replicates to borrow strength across. top <- topTable(ebayes, coef=“Chen_Avg_ShCa”, adjust=“fdr”, n=100) %>% tbl_df %>% tibble::rownames_to_column(“affy_hugene_2_0_st_v1”) %>% mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1)) heatmap(y[top$affy_hugene_2_0_st_v1, ])

a = feat %>% filter(affy_hugene_2_0_st_v1 %in% top$affy_hugene_2_0_st_v1)

a = feat %>% filter(affy_hugene_2_0_st_v1 %in% top$affy_hugene_2_0_st_v1) %>% dplyr::select(affy_hugene_2_0_st_v1, hgnc_symbol) %>% filter(!duplicated(affy_hugene_2_0_st_v1)) %>% print(n = Inf) setequal(a$affy_hugene_2_0_st_v1, top$affy_hugene_2_0_st_v1) left_join(top, a, by = “affy_hugene_2_0_st_v1”) attr <- c(“affy_hugene_2_0_st_v1”, “hgnc_symbol”)

case: “whole genome”

attr <- c(“affy_hugene_2_0_st_v1”, “hgnc_symbol”) (hits <- getBM(attributes = attr, filters = attr[1], values = top$affy_hugene_2_0_st_v1, mart = ensembl) %>% tbl_df) (hits <- hits %>% mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1))) # type coersion left_join(top, hits, by = "affy_hugene_2_0_st_v1") %>% print(n = Inf) (enriched = hits$hgnc_symbol %>% unique) write(enriched, file = “enriched.txt”)

probability that the enrichment score obtained for a particular gene set is a chance occurrence of phenotype changes.

The procedure would be to shuffle the phenotype labels, calculate the differential expression of each gene, rank all genes and compute an enrichment score for the same gene set.

The entire process is repeated multiple times to obtain a distribution of enrichment scores, and the P-value of the actual enrichment score is simply the fraction of shuffles that produce enrichment scores at least as great as observed

Lack of gold standard. Use “controled mutual coverage”, essentially a positive predictive value where the true positives are determined by weighted overlaps between different methods.

we believe that the inclusion of additional biological features such as topology or covariates as discussed in the ‘Gene set-level statistics’ section would be more useful than changing statistics.

scalar omnibus statistics on a set of features, called “gene set-level statistics” by

weight the gene level statistics by a topological influence factor (TIF).

attr <- c(“affy_hugene_2_0_st_v1”, “hgnc_symbol”, “name_1006”) # why does adding go_id reduce the dimension of return object (hits <- getBM(attributes = attr, filters = attr[1], values = top$affy_hugene_2_0_st_v1, mart = ensembl) %>% tbl_df) hits <- hits %>% filter(grepl(query, name_1006, ignore.case = T)) %>% filter(!duplicated(affy_hugene_2_0_st_v1)) %>% mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1)) left_join(top, hits, by = “affy_hugene_2_0_st_v1”)

hits <- spread(hits, name_1006, name_1006)

The first argument of spread() takes the name of your messy dataset (hits), the second argument takes the name of the column to spread into new columns (name_1006), and the third argument takes the column that contains the value with which to fill in the newly spread out columns (name_1006).

hits$name_1006 %>% unique (relevant_hits <- hits %>% filter(grepl(query, name_1006, ignore.case = TRUE)) %>% dplyr::select(affy_hugene_2_0_st_v1, name_1006, hgnc_symbol))

listAttributes(ensembl) %>% filter(grepl(“feature_page”, page, ignore.case = T)) %>% filter(grepl(“go”, description, ignore.case = T))

get literature

library(“hugene20sttranscriptcluster.db”) sapply(annotate::lookUp(probeset, “hugene20sttranscriptcluster.db”, “PMID”), head) absts <- annotate::pm.getabst(probeset[1], “hugene20sttranscriptcluster.db”) # check refs for the top genes titl <- annotate::pm.titles(absts[1]) # get titles strwrap(titl, simplify=FALSE)

molecule = “16657652” sapply(annotate::lookUp(molecule, “hugene20sttranscriptcluster.db”, “PMID”), head) absts <- annotate::pm.getabst(molecule, “hugene20sttranscriptcluster.db”) # check refs for the top genes annotate::pm.titles(absts)

nice html or text tables

library.bioconductor(“KEGG.db”) library.bioconductor(“GO.db”) library.bioconductor(“annaffy”) atab <- aafTableAnn( probeset, “hugene20sttranscriptcluster.db”, aaf.handler(chip = “hugene20sttranscriptcluster.db”)) saveHTML(atab, file=“report2.html”) browseURL(“report2.html”)

PCA and friends: PCA/SVD identifies patterns - linear transformations, projection: rotation, reflection, scaling of data - that explain a large percentage of the variation (Alter et al. 2000)

d <- fdata %>% dplyr::filter(grepl(“transcription”,DESC)) ind <- d$affy_hugene_2_0_st_v1 %>% as.character edata <- edata[ind, ] edata <- log2(edata + 1) edatac<- edata - rowMeans(edata) # centered svd1 <- svd(edatac) names(svd1) par(pch=19, mfrow=c(1,2)) plot(svd1$d^2/sum(svd1$d^2),ylab="Percent Variance Explained",col=2) plot(svd1$v[,1],svd1$v[,2], ylab=“2nd PC”, xlab=“1st PC”,col= treat + 1) # check svd(), prcomp() X <- edata (s <- svd(X))

Sparse PCA: PCA offers a low dimensional approximation to data. We know one direction explains most variation. Can we get away with using a small number of genes to get approximation? This would assist interpretability?

Factor analysis

fa <- factanal(t(X), 3, rotation=“varimax”)

library(GEOquery) eList = getGEO(“GSE11675”) # the processed data length(eList) names(eList) eData = eList[[1]] eData names(pData(eData)) eList2 = getGEOSuppFiles(“GSE11675”) # the raw data

S4 classes/methods

?ExpressionSet # S4 classes getClass(“ExpressionSet”) validObject(dat) showMethods(“as.data.frame”) # S4 methods, generic function dispatches on x getMethod(“as.data.frame”, “GenomicRanges”) # look at the function if x = GenomicRanges method?“as.data.frame,GenomicRanges”

LARS

library(“glmnet”) CV = cv.glmnet(x = t(y), y = chen, family = “gaussian”, alpha = 1, nlambda = 1000); plot(CV) fit = glmnet(x = t(y), y = chen, family = “gaussian”, alpha = 1, lambda = CV$lambda.1se)

how many clusters?

mydata <- y wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var)) for (i in 2:15) wss[i] <- sum(kmeans(t(mydata), centers=i)$withinss) plot(1:15, wss, type=“b”, xlab=“Number of Clusters”, ylab=“Within groups sum of squares”)

Notes on Nepali PTSD

library(GEOquery) outNepali = getGEO(“GSE77164”) outNepali = outNepali[[1]]

Raw data

supNepali = getGEOSuppFiles(“GSE77164”)

supNepali = read.table(gzfile(‘/Users/chumbley/GSE77164/GSE77164_non-normalized.txt.gz’), header = TRUE)

dim(supNepali) == dim(outNepali)

rownames(outNepali) %>% head

rownames(outNepali) %in% ctra0 %>% sum()

all(fData(outNepali)$ID == rownames(outNepali)) # all(fData(outNepali)$ORF == rownames(outNepali))

compare raw and pre-processed

pdat = outNepali %>% pData() edat = outNepali %>% eData() pdat$characteristics_ch1 # soldier status tmp = edat[rownames(edat) %in% ctra0, ] heatmap(tmp, ColSideColors = as.character(as.numeric(pdat$characteristics_ch1)))

# how many clusters?

mydata <- tmp

wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var)) # for (i in 2:15) wss[i] <- sum(kmeans(mydata, centers=i)$withinss) # plot(1:15, wss, type=“b”, xlab=“Number of Clusters”, ylab=“Within groups sum of squares”)

influence of initialization?

kmeans1 = kmeans(t(tmp),centers=2, iter.max = 10000, nstart = 10000) heatmap(tmp, ColSideColors = as.character(kmeans1$cluster)) table(kmeans1$cluster, pdat$characteristics_ch1) chisq.test(kmeans1$cluster, pdat$characteristics_ch1)

joke regression

pdat %>% dplyr::select(matches(“chara.*[^16]$")) %>% head (vars = pdat %>% dplyr::select(matches("chara.*[^16]$“)) %>% names()) form = as.formula( paste(”kmeans1$cluster ~“, paste(”as.factor(“, vars,”)“, collapse =”+“))) summary(lm(formula = form, data = pdat ))

For “wide” data - like ours - infering class conditional distributions is non-identified, ill-determined, noisy. The covariance has $d^2/2$ parameter (check), but we old have n samples.

Need constraints. Sparcity. Smoothing filter to covariance, with shrinkage regularization controled by a single parameter.

library(“JGL”) ## run fused graphical lasso # Network topology # Erdos-Rengyi random # Small world: Watts, strogatz networks have high transitivity (clustering) AND low geodesics (path length), which fits real-world topology. Their model explains realworld topological features (transtivitity/geodesics) in terms of shortcuts in real-world networks which connect modules that are highly clustered. # Scale free: Barabasi, Albert. Powerlaw NOT poisson degree distributions. Rich-get-richer. Importance of hub nodes. # Duplication-divergence: # Magnetization

Casual network, subjunctive: if I would intervene on node1 (or nodeSet1) what would be the consequence

Marginal dependence graph Em (edges code marginal dependence), easy to esimate (pairwise correlation). “Marginal screening”.

Conditional dependence graph E (requires paper from last 15 years on high-d graphical models). Conditional independence iff $P_ij=0$, so use L1 penalty or population to zero out small elements.

L1 interpreted as an independent laplacian prior on each edge, i.e. all edges equally likely and independent, i.e Erdos-Renyi graph in which most nodes have approx same number of edges (unrealistic). C.f. “hub penalty function” which assumes decomposition of E into Erdos-Renyi + hub graph

In practice, is there a relationship between edge-sets by thresholding Em and by E? Yes, in high-d it is rare to have mismatch between marginal and conditional

infer conditional dependence, not marginal dependence

e = outNepali[rownames(outNepali) %in% ctra0, ] %>% eData e1 = e[, pdat$characteristics_ch1 == “childsoldier: 0”] e2 = e[, pdat$characteristics_ch1 == “childsoldier: 1”] nepali = list(t(e1), t(e2)) # expects samples on rows, features/genes across columns # lambda is a sparsity parameter (it offers another solution to the exploding multiple comparison problem, 400 million ways to go wrong). Some “statistical learning” people talk about finding a “penalty function that induces realistic structure”. # Sparcity constraints. Loss function based on some distributional assumptions (likelyhood, pseudo-likelihood, conditional likelihood) like multivariate normality. Motivate more flexible approach. fgl.results = JGL(Y=nepali,penalty=“group”,lambda1=.1,lambda2=.1, return.whole.theta = TRUE) a1 = (fgl.results$theta[[1]] != 0) # plot adjacency (inverse covariance) matrix (only returned over the connected nodes if return.whole.theta = FALSE, and only the diagonal of the matrix is returned over the unconnected nodes (diag.theta.unconnected) a2 = (fgl.results$theta[[2]] != 0) par(mfrow=c(1,2)) image(a1) image(a2)

g1 = graph.adjacency(a1, mode = “undirected”, diag = 0) g2 = graph.adjacency(a2, mode = “undirected”, diag = 0) mean(degree(g1)) - mean(degree(g2)) sd(degree(g1)) - sd(degree(g2))

E(g1)$color = "green" V(g1)$color <- “green” V(g1)$color[degree(g1)>3] <- "red" E(g1)$color[1:4] <- “red” par(mfrow = c(1,1)) plot(g1) subnetworks(fgl.results$theta) # get subnetwork membership of FGL results:

The null distribution on differential expression topology

permutationNull <- function(){ p = dim(e)[1] n = dim(e)[2] n1 = dim(e1)[2]
ord = sample(n, replace = FALSE) # random permutation e1 = e[, ord[1:n1]] e2 = e[, ord[(n1+1):n]] y = list(t(e1), t(e2)) # expects samples on rows, features/genes across columns out = JGL(Y=y,penalty=“group”,lambda1=1,lambda2=1, return.whole.theta = TRUE)

# use igraph 
a1 = (out$theta[[1]] != 0)
a2 = (out$theta[[2]] != 0)
g1 = graph.adjacency(a1, mode = "undirected", diag = 0)  # inverse covariance matrix is only returned over the connected nodes if return.whole.theta = FALSE, and only the diagonal of the matrix is returned over the unconnected nodes (diag.theta.unconnected)
g2 = graph.adjacency(a2, mode = "undirected", diag = 0) 
degr = data.frame(degree(g1), degree(g2))
par(mfrow = c(1,2))
image(a1)
image(a2)
plot(g1)
plot(g2)
# return(list(dat = degr))
return(norm(a1-a2, type = "2"))

}

out = replicate(10, permutationNull())

par(mfrow = c(1, 2)) net.degree(fgl.results$theta) %>% sapply(hist, breaks = 100) #List the degrees of the most connected nodes in each class. net.hubs(fgl.results$theta, nhubs = 10) #For each class, returns the names of the nodes connected to a given node, here IL6. ind = which(colnames(nepali[[2]]) %in% “IL6”) net.neighbors(fgl.results$theta, index = ind)

#Notes on data from Frederickson et al 2013.

outFred = getGEO(“GSE45330”)

outFred = outFred[[1]]

supFred = getGEOSuppFiles(“GSE45330”) # downloads raw data to ‘/Users/chumbley/GSE45330/GSE45330_non_normalized_ProbeData.txt.gz’

supFred = read.table(gzfile(‘/Users/chumbley/GSE45330/GSE45330_non_normalized_ProbeData.txt.gz’), header = TRUE)

dim(supFred) == dim(outFred) # evidently, their preprocessing went from 47231 features to 34591

not exactly the same genes across cole studies

dim(outNepali) == dim(outFred)

setdiff(featureNames(outNepali), featureNames(outFred))

setdiff(featureNames(outFred), featureNames(outNepali))

technique illustration

tmp = jdat %>% mutate(highBmi = 1*(bmi > median(bmi))) tmp %>% group_by(molecule, highBmi) %>% ggvis(~express, fill = ~interaction(molecule, highBmi)) %>% layer_densities()

Mapping between different identifiers for the same molecule