rm(list = ls())
setwd("~/Dropbox/professional/astress/jacobs/rna_pilot/shanahan_data/")
dataDirectory <- "~/Dropbox/professional/astress/jacobs/rna_pilot/shanahan_data/"
# Expression features are high dimensional - more parameters than observations or 'overparameterized' - and strongly correlated.
# These properties mean non-identifiability and high variance (overfitting).
# establish that the microarray really adds something to the predictive performance of known risk factors
# P-value of the test: exact, asymptotic, permutation arguments.
# Warning: Regulation of gene expression in higher eukariotes is very complex and far from being well understood. We have incomplete knowledge of cell signalling networks.
# We don't have enough data for a hold-out set, to validate our the predictions of our algorithm.
# The PAXgene Blood RNA System consists of a blood collection tube (PAXgene Blood RNA Tube) and a nucleic acid purification kit (PAXgene Blood RNA Kit).
# It is intended for the collection, transport, and storage of blood and stabilization of intracellular RNA in a closed tube and subsequent isolation and purification of intracellular RNA from whole blood for RT-PCR used in molecular diagnostic testing.
# Infer dependence within/between two multivariate systems A, B: e.g. predicting components/subsystems/structure of A from B (gene set expression increases from phenotype/exposure/treatment).
# Infer causal dependence?
# Which properties of the joint (A, B) can be safely assumed - topology of A? A|B conditional distributions? - and which are hypothetical.
# Test complex responses to multivariate traits/treatments.
# In general, there are two assumptions for causal inference from observational data: no unmeasured confounding variables and/or measured instrumental variable
# Constrain prediction with prior data raw or distilled/reasoned - e.g. into network information (PPI network smoothing over a network structure).
# Signal pathway view is limited: pathways are connected to form a large network
# regulatory motifs, such as positive and negative feedback and feedforward loops, that process information. Key regulators of plasticity were highly connected nodes required for the formation of regulatory motifs, indicating the potential importance of such motifs in determining cellular choices between homeostasis and plasticity.
# One central challenge of systems biology is to identify the subset of key components and regulatory interactions whose perturbation or tuning leads to significant functional changes (e.g., changes in a crop's fitness under environmental stress or changes in the state of malfunctioning cells, thereby combating disease).
# Mathematical modeling can assist in this process by integrating the behavior of multiple components into a comprehensive model that goes beyond human intuition, and also by addressing questions that are not yet accessible to experimental analysis.
# In recent years, theoretical and computational analysis of biochemical networks has been successfully applied to well-defined metabolic pathways, signal transduction, and gene regulatory networks [1–3].
# In parallel, high-throughput experimental methods have enabled the construction of genome-scale maps of transcription factor–DNA and protein–protein interactions [4,5].
# The former are quantitative, dynamic descriptions of experimentally well-studied cellular pathways with relatively few components, while the latter are static maps of potential interactions with no information about their timing or kinetics.
# Here we introduce a novel approach that stands in the middle ground of the above-mentioned methods by incorporating the synthesis and dynamic modeling of complex cellular networks that contain diverse, yet only qualitatively known regulatory interactions.
# Explore the extent to which the network topology determines the dynamic behavior of the system.
# Methods: dynamic simulation.
# Networks which connect molecular components within a cell
##########################################################################################
# Physical molectular interaction networks, i.e. cell-signalling, gene-regulatory, protein-protein interaction networks:
# Network reconstruction from literature: networks manually by experts, semi-automatic identifiattion, completely automatically by natural language processing
# Network reconstruction from experimental data:
# Epistasis network (not direct physical, but still somehow connects molecular components)
# Time-series and/or experimental data used to infer a directed and signed graph
# Network to connect cell networks to other things, e.g. drugs to targets
#############################################
# bipartite graphs
# proteins operate as part of highly interconnected cellular networks referred to as interactome networks
# disease-gene products
# update the single drug–single target paradigm, just as single protein–single function relations are somewhat limited
# nodes connected through abstract interactions, genes connected based on which disease is caused by thier mutation.
#############################################
# Functional association networks
#############################################
# network representation which connect molecules with more abstract quantities can be used for data integration (Tanay 2004 PNAS)
library.cran <- function(p) {
if (!p %in% installed.packages()) { install.packages(p) }
library(p, character.only=1)
}
source("https://bioconductor.org/biocLite.R") # first time install
## Bioconductor version 3.4 (BiocInstaller 1.24.0), ?biocLite for help
library.bioconductor <- function(p) {
if (!p %in% installed.packages()) { biocLite(p) }
library(p, character.only=1)
}
library.bioconductor("gplots")
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library.bioconductor("devtools")
library.bioconductor("Biobase")
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library.bioconductor("dendextend")
##
## ---------------------
## Welcome to dendextend version 1.3.0
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library.bioconductor("Glimma")
library.cran("tidyr")
library.cran("Hmisc")
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:Biobase':
##
## combine, contents
## The following object is masked from 'package:BiocGenerics':
##
## combine
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
#The 53-gene "CTRA" set listed below includes (a) 19 proinflammatory genes which are upregulated in CTRA "on average" (b) 31 genes involved in type I IFN responses down-regulated in the CTRA (c) 3 genes involved in antibody synthesis down-regulated in the CTRA. These molecules have been historically designated by their HGNC names (HUGO gene nomenclature committee).
inflamatory = c("IL1A", "IL1B", "IL6", "IL8", "TNF", "PTGS1", "PTGS2",
"FOS", "FOSB", "FOSL1", "FOSL2", "JUN", "JUNB", "JUND",
"NFKB1", "NFKB2", "REL", "RELA", "RELB")
interferonTypeI = c("GBP1", "IFI16", "IFI27", "IFI27L1", "IFI27L2", "IFI30",
"IFI35", "IFI44", "IFI44L", "IFI6", "IFIH1", "IFIT1",
"IFIT2", "IFIT3", "IFIT5", "IFIT1L", "IFITM1", "IFITM2",
"IFITM3", "IFITM4P", "IFITM5", "IFNB1", "IRF2", "IRF7",
"IRF8", "MX1", "MX2", "OAS1", "OAS2", "OAS3", "OASL")
antibody = c("IGJ", "IGLL1", "IGLL3")
ctra0 = c(inflamatory, interferonTypeI, antibody)
ctraCore0 = c("IRF7", "JUN", "IGJ", "IL8", "IL1B", "FOSB", "FOSL2", "IFIT3", "IFI35", "IFI44L", "MX1", "OAS2")
ctraCore = c("IRF7", "JUN", "JCHAIN", "CXCL8", "IL1B", "FOSB", "FOSL2", "IFIT3", "IFI35", "IFI44L", "MX1", "OAS2")
#To connect our data to previous CTRA publications we must relate three sets of gene identifiers: names according to the HGNC (as above), names according to the Illumina Human HT-12 v4 BeadArray (used for original CTRA publications) and names according to our own HuGene 2.0 ST array V1. To do this we queried the Ensembl Genes 85 database (homo sapiens, archived in July 2016) via the biomaRt interface.
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## combine, src, summarize
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
options(dplyr.print_max = 15)
library.bioconductor("Biobase")
eData <- exprs # convenience
library.bioconductor("biomaRt")
ensembl <- useMart(biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl",
host = "jul2016.archive.ensembl.org")
#First note that 4 of 53 HGNC names have been replaced since the original CTRA publications.
a = getBM(attributes = c("hgnc_symbol"),
filters = "hgnc_symbol",
values = ctra0,
mart = ensembl) %>% tbl_df
(missing = setdiff(ctra0, a$hgnc_symbol))
## [1] "IL8" "IFIT1L" "IGJ" "IGLL3"
# We will therefore use the newer names (confirming they are all present).
inflamatory = replace(inflamatory, inflamatory == "IL8", "CXCL8")
interferonTypeI = replace(interferonTypeI , interferonTypeI == "IFIT1L", "IFIT1B")
antibody = replace(antibody, antibody == "IGJ", "JCHAIN")
antibody = replace(antibody, antibody == "IGLL3", "IGLL3P")
ctra = c(inflamatory, interferonTypeI, antibody)
a = getBM(attributes = c("hgnc_symbol"),
filters = "hgnc_symbol",
values = ctra,
mart = ensembl) %>% tbl_df
length(setdiff(ctra, a$hgnc_symbol)) == 0 # check all present now
## [1] TRUE
#Oddly, Illumina Human HT-12 v4 BeadArrays do not assay 2 of 53 CTRA genes, according to the Ensembl Genes 85 database (NAs below).
(a <- getBM(attributes = c("hgnc_symbol", "illumina_humanht_12_v4"),
filters = "hgnc_symbol",
values = c("IFI30", "IFIH1"),
mart = ensembl))
## hgnc_symbol illumina_humanht_12_v4
## 1 IFI30 NA
## 2 IFIH1 NA
#Confirm that IFI30 and IFIH1 are the only two genes not assayed by Illumina Human HT-12 v4 BeadArrays ("illumina_humanht_12_v4").
a <- getBM(attributes = c("hgnc_symbol", "illumina_humanht_12_v4"),
filters = "hgnc_symbol",
values = ctra,
mart = ensembl) %>% tbl_df
a <- a %>% mutate(illumina_humanht_12_v4 = replace(illumina_humanht_12_v4, illumina_humanht_12_v4 =="", NA))
b <- a %>% filter(!is.na(illumina_humanht_12_v4)) # remove NA
(missing_ctra = setdiff(ctra, b$hgnc_symbol))
## [1] "IFI30" "IFIH1"
#Similarly, our HuGene 2.0 ST array V1 ("affy_hugene_2_0_st_v1") did not assay 3 of 53 CTRA genes.
a <- getBM(attributes = c("hgnc_symbol", "affy_hugene_2_0_st_v1"),
filters = "hgnc_symbol",
values = ctra,
mart = ensembl) %>% tbl_df
b <- a %>% filter(!is.na(affy_hugene_2_0_st_v1)) # filter out missing values
(missing_ctra = setdiff(ctra, b$hgnc_symbol))
## [1] "GBP1" "MX2" "IGLL3P"
(found_ctra = setdiff(ctra, missing_ctra))
## [1] "IL1A" "IL1B" "IL6" "CXCL8" "TNF" "PTGS1" "PTGS2"
## [8] "FOS" "FOSB" "FOSL1" "FOSL2" "JUN" "JUNB" "JUND"
## [15] "NFKB1" "NFKB2" "REL" "RELA" "RELB" "IFI16" "IFI27"
## [22] "IFI27L1" "IFI27L2" "IFI30" "IFI35" "IFI44" "IFI44L" "IFI6"
## [29] "IFIH1" "IFIT1" "IFIT2" "IFIT3" "IFIT5" "IFIT1B" "IFITM1"
## [36] "IFITM2" "IFITM3" "IFITM4P" "IFITM5" "IFNB1" "IRF2" "IRF7"
## [43] "IRF8" "MX1" "OAS1" "OAS2" "OAS3" "OASL" "JCHAIN"
## [50] "IGLL1"
# (found_inflamatory = setdiff(inflamatory, missing_ctra))
# (found_interferonTypeI = setdiff(interferonTypeI, missing_ctra))
# (found_antibody = setdiff(antibody, missing_ctra))
# setequal(found_ctra, union_all(found_antibody, found_inflamatory, found_interferonTypeI))
# above is also on see rnaAnnotationr.RMd
#########################################
# "Tidy data": Hadley wickam and relational databases
# row = observational unit (one type of observational unit per table)
# col = variable/attribute
# # Each value belongs to both variable and observation
# common messiness: column headers are values not variable names (i.e. there is logical structure across columns, i.e.mutually exclusive)
# wide represents features horrizontally.
# Multiple variables stored in one column is messy, but there should be multiple values in each column.
# The most important function in tidyr is gather(). It should be used when you have columns that are not variables and you want to collapse them into key-value pairs.
#
#
# "ExpressionSet" data is not tidy
# col = sample unit
# row = variable
load(file = "~/Dropbox/professional/astress/jacobs/rna_pilot/shanahan_data/dat.RData")
load(file = "~/Dropbox/professional/astress/jacobs/rna_pilot/shanahan_data/feat.RData")
library.cran("ggplot2")
# check extraneous bmi.dta file is redundant with what we already have
library.cran("readstata13") # load from stata
datBmi <- read.dta13("bmi.dta") # from michael shanahan
left_join(pData(dat), datBmi, by = "IDNum") %>% dplyr::select(bmi.y, bmi.x) %>% mutate(difff = bmi.y-bmi.x) # the bmi in dat and datBmi are equal (up to rounding error)
## bmi.y bmi.x difff
## 1 12.85531 12.8553 1.139374e-05
## 2 14.18973 14.1897 3.445892e-05
## 3 12.83677 12.8368 -2.994232e-05
## 4 19.71369 19.7137 -1.401062e-05
## 5 13.00550 13.0055 -1.602173e-07
## 6 18.91965 18.9196 4.721680e-05
## 7 15.91152 15.9115 2.286530e-05
## 8 14.20525 14.2052 4.787903e-05
## 9 13.78431 13.7843 1.415558e-05
## 10 12.63848 12.6385 -1.695251e-05
## 11 12.69575 12.6957 4.546814e-05
## 12 18.75143 18.7514 3.241882e-05
## 13 14.63973 14.6397 3.331451e-05
## 14 16.39249 16.3925 -1.152039e-05
## 15 15.93015 15.9301 4.621735e-05
## 16 14.32487 14.3249 -2.512207e-05
## 17 15.59034 15.5903 3.966064e-05
## 18 12.73161 12.7316 9.344482e-06
## 19 15.36717 15.3672 -3.443451e-05
## 20 14.55585 14.5558 4.526062e-05
## 21 14.60934 14.6093 3.589935e-05
## 22 17.40762 17.4076 1.947632e-05
## 23 19.14368 19.1437 -2.324219e-05
## 24 13.87395 13.8740 -4.904175e-05
## 25 15.07893 15.0789 2.513275e-05
## 26 13.88633 13.8863 3.441925e-05
## 27 14.82876 14.8288 -3.794556e-05
## 28 16.94716 16.9472 -4.309387e-05
## 29 17.43950 17.4395 8.087158e-07
## 30 13.35859 13.3586 -6.059265e-06
## 31 16.46633 16.4663 2.957458e-05
dat$IDNum <- as.character(dat$IDNum)
# if(0) {
#
# ################################################################################
# ## Checks
# ################################################################################
#
# # unpack
# edata <- dat %>% eData
# fdata <- dat %>% fData
# pdata <- dat %>% pData
#
# # Sample data
# pdata %>% is.na %>% colSums # which columns have NA
# sum(pdata$chen_R_A_Avg == "?") # but NA is
#
# # Gene data
# edata %>% is.na %>% rowSums %>% table # which rows have missing values
#
# # Empirical distributions
# boxplot(edata) # marginal
# lg2edata = log2(edata +1)
# boxplot(edata) # marginal
# boxplot(lg2edata) # marginal
# qqplot(edata[, 1], edata[, 2]) # bivariate (confirms quantile normalization)
# abline(c(0, 1))
# heatmap(edata, Rowv = NA, Colv = NA) # multivariate (no odd samples)
#
# search = c("Nsun3", "Polrmt")
# a <- getBM(attributes = c("hgnc_symbol", 'affy_hugene_2_0_st_v1'),
# filters = "hgnc_symbol",
# values = search,
# mart = ensembl) %>% tbl_df
#
# #External check: Correlate Y-chromosome genes with phenotype gender (Select all Affymetrix identifiers on the "affy_hugene_2_0_st_v1" chip and for genes located on Y chromosome).
# a <- getBM(attributes = c("chromosome_name", 'affy_hugene_2_0_st_v1'),
# filters = c('chromosome_name'),
# values = list("Y"),
# mart = ensembl) %>% tbl_df
# a <- a %>%
# filter(!is.na(affy_hugene_2_0_st_v1)) %>%
# mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1))
# yChromeExpression <- dat[a$affy_hugene_2_0_st_v1, ] %>%
# eData %>%
# tbl_df %>%
# summarize_each(funs(mean)) %>%
# unlist
# title("should split sample by gender")
#
# }
listAttributes(ensembl, page = "feature_page")
## name
## 1 ensembl_gene_id
## 2 ensembl_transcript_id
## 3 ensembl_peptide_id
## 4 ensembl_exon_id
## 5 description
## 6 chromosome_name
## 7 start_position
## 8 end_position
## 9 strand
## 10 band
## 11 transcript_start
## 12 transcript_end
## 13 transcription_start_site
## 14 transcript_length
## 15 transcript_tsl
## 16 transcript_gencode_basic
## 17 transcript_appris
## 18 external_gene_name
## 19 external_gene_source
## 20 external_transcript_name
## 21 external_transcript_source_name
## 22 transcript_count
## 23 percentage_gc_content
## 24 gene_biotype
## 25 transcript_biotype
## 26 source
## 27 transcript_source
## 28 status
## 29 transcript_status
## 30 version
## 31 transcript_version
## 32 phenotype_description
## 33 source_name
## 34 study_external_id
## 35 go_id
## 36 name_1006
## 37 definition_1006
## 38 go_linkage_type
## 39 namespace_1003
## 40 goslim_goa_accession
## 41 goslim_goa_description
## 42 arrayexpress
## 43 chembl
## 44 clone_based_ensembl_gene_name
## 45 clone_based_ensembl_transcript_name
## 46 clone_based_vega_gene_name
## 47 clone_based_vega_transcript_name
## 48 ccds
## 49 dbass3_id
## 50 dbass3_name
## 51 dbass5_id
## 52 dbass5_name
## 53 embl
## 54 ens_hs_transcript
## 55 ens_hs_translation
## 56 ens_lrg_gene
## 57 ens_lrg_transcript
## 58 entrezgene
## 59 entrezgene_transcript_name
## 60 epd
## 61 genedb
## 62 hpa
## 63 ottg
## 64 ottt
## 65 ottp
## 66 hgnc_id
## 67 hgnc_symbol
## 68 hgnc_transcript_name
## 69 merops
## 70 mim_gene_accession
## 71 mim_gene_description
## 72 mirbase_accession
## 73 mirbase_id
## 74 mirbase_transcript_name
## 75 mim_morbid
## 76 pdb
## 77 protein_id
## 78 reactome
## 79 reactome_gene
## 80 reactome_transcript
## 81 refseq_mrna
## 82 refseq_mrna_predicted
## 83 refseq_ncrna
## 84 refseq_ncrna_predicted
## 85 refseq_peptide
## 86 refseq_peptide_predicted
## 87 rfam
## 88 rfam_transcript_name
## 89 rnacentral
## 90 ucsc
## 91 unigene
## 92 uniparc
## 93 uniprot_sptrembl
## 94 uniprot_swissprot
## 95 uniprot_genename
## 96 wikigene_name
## 97 wikigene_id
## 98 wikigene_description
## 99 agilent_cgh_44b
## 100 efg_agilent_sureprint_g3_ge_8x60k
## 101 efg_agilent_sureprint_g3_ge_8x60k_v2
## 102 efg_agilent_wholegenome_4x44k_v1
## 103 efg_agilent_wholegenome_4x44k_v2
## 104 affy_hc_g110
## 105 affy_hg_focus
## 106 affy_hg_u133_plus_2
## 107 affy_hg_u133a_2
## 108 affy_hg_u133a
## 109 affy_hg_u133b
## 110 affy_hg_u95av2
## 111 affy_hg_u95b
## 112 affy_hg_u95c
## 113 affy_hg_u95d
## 114 affy_hg_u95e
## 115 affy_hg_u95a
## 116 affy_hugenefl
## 117 affy_hta_2_0
## 118 affy_huex_1_0_st_v2
## 119 affy_hugene_1_0_st_v1
## 120 affy_hugene_2_0_st_v1
## 121 affy_primeview
## 122 affy_u133_x3p
## 123 codelink
## 124 illumina_humanwg_6_v1
## 125 illumina_humanwg_6_v2
## 126 illumina_humanwg_6_v3
## 127 illumina_humanht_12_v3
## 128 illumina_humanht_12_v4
## 129 illumina_humanref_8_v3
## 130 illumina_human_methylation_27
## 131 illumina_human_methylation_450
## 132 phalanx_onearray
## 133 family
## 134 family_description
## 135 pirsf
## 136 pirsf_start
## 137 pirsf_end
## 138 superfamily
## 139 superfamily_start
## 140 superfamily_end
## 141 smart
## 142 smart_start
## 143 smart_end
## 144 hamap
## 145 hamap_start
## 146 hamap_end
## 147 profile
## 148 profile_start
## 149 profile_end
## 150 prosite
## 151 prosite_start
## 152 prosite_end
## 153 prints
## 154 prints_start
## 155 prints_end
## 156 pfam
## 157 pfam_start
## 158 pfam_end
## 159 tigrfam
## 160 tigrfam_start
## 161 tigrfam_end
## 162 gene3d
## 163 gene3d_start
## 164 gene3d_end
## 165 hmmpanther
## 166 hmmpanther_start
## 167 hmmpanther_end
## 168 blastprodom
## 169 blastprodom_start
## 170 blastprodom_end
## 171 interpro
## 172 interpro_short_description
## 173 interpro_description
## 174 interpro_start
## 175 interpro_end
## 176 low_complexity
## 177 low_complexity_start
## 178 low_complexity_end
## 179 transmembrane_domain
## 180 transmembrane_domain_start
## 181 transmembrane_domain_end
## 182 signal_domain
## 183 signal_domain_start
## 184 signal_domain_end
## 185 ncoils
## 186 ncoils_start
## 187 ncoils_end
## description page
## 1 Ensembl Gene ID feature_page
## 2 Ensembl Transcript ID feature_page
## 3 Ensembl Protein ID feature_page
## 4 Ensembl Exon ID feature_page
## 5 Description feature_page
## 6 Chromosome Name feature_page
## 7 Gene Start (bp) feature_page
## 8 Gene End (bp) feature_page
## 9 Strand feature_page
## 10 Band feature_page
## 11 Transcript Start (bp) feature_page
## 12 Transcript End (bp) feature_page
## 13 Transcription Start Site (TSS) feature_page
## 14 Transcript length (including UTRs and CDS) feature_page
## 15 Transcript Support Level (TSL) feature_page
## 16 GENCODE basic annotation feature_page
## 17 APPRIS annotation feature_page
## 18 Associated Gene Name feature_page
## 19 Associated Gene Source feature_page
## 20 Associated Transcript Name feature_page
## 21 Associated Transcript Source feature_page
## 22 Transcript count feature_page
## 23 % GC content feature_page
## 24 Gene type feature_page
## 25 Transcript type feature_page
## 26 Source (gene) feature_page
## 27 Source (transcript) feature_page
## 28 Status (gene) feature_page
## 29 Status (transcript) feature_page
## 30 Version (gene) feature_page
## 31 Version (transcript) feature_page
## 32 Phenotype description feature_page
## 33 Source name feature_page
## 34 Study External Reference feature_page
## 35 GO Term Accession feature_page
## 36 GO Term Name feature_page
## 37 GO Term Definition feature_page
## 38 GO Term Evidence Code feature_page
## 39 GO domain feature_page
## 40 GOSlim GOA Accession(s) feature_page
## 41 GOSlim GOA Description feature_page
## 42 ArrayExpress feature_page
## 43 ChEMBL ID(s) feature_page
## 44 Clone based Ensembl gene name feature_page
## 45 Clone based Ensembl transcript name feature_page
## 46 Clone based VEGA gene name feature_page
## 47 Clone based VEGA transcript name feature_page
## 48 CCDS ID feature_page
## 49 Database of Aberrant 3' Splice Sites (DBASS3) IDs feature_page
## 50 DBASS3 Gene Name feature_page
## 51 Database of Aberrant 5' Splice Sites (DBASS5) IDs feature_page
## 52 DBASS5 Gene Name feature_page
## 53 EMBL (Genbank) ID feature_page
## 54 Ensembl Human Transcript IDs feature_page
## 55 Ensembl Human Translation IDs feature_page
## 56 LRG to Ensembl link gene feature_page
## 57 LRG to Ensembl link transcript feature_page
## 58 EntrezGene ID feature_page
## 59 EntrezGene transcript name ID feature_page
## 60 Eukaryotic Promoter Database (EPD) ID(s) feature_page
## 61 GeneDB feature_page
## 62 Human Protein Atlas Antibody ID feature_page
## 63 VEGA gene ID(s) (OTTG) feature_page
## 64 VEGA transcript ID(s) (OTTT) feature_page
## 65 VEGA protein ID(s) (OTTP) feature_page
## 66 HGNC ID(s) feature_page
## 67 HGNC symbol feature_page
## 68 HGNC transcript name feature_page
## 69 MEROPS ID feature_page
## 70 MIM Gene Accession feature_page
## 71 MIM Gene Description feature_page
## 72 miRBase Accession(s) feature_page
## 73 miRBase ID(s) feature_page
## 74 miRBase transcript name feature_page
## 75 MIM MORBID feature_page
## 76 PDB ID feature_page
## 77 Protein (Genbank) ID [e.g. AAA02487] feature_page
## 78 Reactome ID feature_page
## 79 Reactome gene ID feature_page
## 80 Reactome transcript ID feature_page
## 81 RefSeq mRNA [e.g. NM_001195597] feature_page
## 82 RefSeq mRNA predicted [e.g. XM_001125684] feature_page
## 83 RefSeq ncRNA [e.g. NR_002834] feature_page
## 84 RefSeq ncRNA predicted [e.g. XR_108264] feature_page
## 85 RefSeq Protein ID [e.g. NP_001005353] feature_page
## 86 RefSeq Predicted Protein ID [e.g. XP_001720922] feature_page
## 87 Rfam ID feature_page
## 88 Rfam transcript name feature_page
## 89 RNACentral ID feature_page
## 90 UCSC ID feature_page
## 91 Unigene ID feature_page
## 92 UniParc feature_page
## 93 UniProt/TrEMBL Accession feature_page
## 94 UniProt/SwissProt Accession feature_page
## 95 UniProt Accession ID feature_page
## 96 WikiGene Name feature_page
## 97 WikiGene ID feature_page
## 98 WikiGene Description feature_page
## 99 Agilent CGH 44b probe feature_page
## 100 Agilent SurePrint G3 GE 8x60k probe feature_page
## 101 Agilent SurePrint G3 GE 8x60k v2 probe feature_page
## 102 Agilent WholeGenome 4x44k v1 probe feature_page
## 103 Agilent WholeGenome 4x44k v2 probe feature_page
## 104 Affy HC G110 probeset feature_page
## 105 Affy HG FOCUS probeset feature_page
## 106 Affy HG U133-PLUS-2 probeset feature_page
## 107 Affy HG U133A_2 probeset feature_page
## 108 Affy HG U133A probeset feature_page
## 109 Affy HG U133B probeset feature_page
## 110 Affy HG U95AV2 probeset feature_page
## 111 Affy HG U95B probeset feature_page
## 112 Affy HG U95C probeset feature_page
## 113 Affy HG U95D probeset feature_page
## 114 Affy HG U95E probeset feature_page
## 115 Affy HG U95A probeset feature_page
## 116 Affy HuGene FL probeset feature_page
## 117 Affy HTA-2_0 probeset feature_page
## 118 Affy HuEx 1_0 st v2 probeset feature_page
## 119 Affy HuGene 1_0 st v1 probeset feature_page
## 120 Affy HuGene 2_0 st v1 probeset feature_page
## 121 Affy primeview feature_page
## 122 Affy U133 X3P probeset feature_page
## 123 Codelink probe feature_page
## 124 Illumina HumanWG 6 v1 probe feature_page
## 125 Illumina HumanWG 6 v2 probe feature_page
## 126 Illumina HumanWG 6 v3 probe feature_page
## 127 Illumina Human HT 12 V3 probe feature_page
## 128 Illumina Human HT 12 V4 probe feature_page
## 129 Illumina Human Ref 8 V3 probe feature_page
## 130 Illumina human methylation 27 probe feature_page
## 131 Illumina human methylation 450 probe feature_page
## 132 Phalanx OneArray probe feature_page
## 133 Ensembl Protein Family ID(s) feature_page
## 134 Ensembl Family Description feature_page
## 135 PIRSF ID feature_page
## 136 PIRSF start feature_page
## 137 PIRSF end feature_page
## 138 SUPERFAMILY ID feature_page
## 139 SUPERFAMILY start feature_page
## 140 SUPERFAMILY end feature_page
## 141 SMART ID feature_page
## 142 SMART start feature_page
## 143 SMART end feature_page
## 144 HAMAP Accession ID feature_page
## 145 HAMAP start feature_page
## 146 HAMAP end feature_page
## 147 Pfscan ID feature_page
## 148 Pfscan start feature_page
## 149 Pfscan end feature_page
## 150 ScanProsite ID feature_page
## 151 ScanProsite start feature_page
## 152 ScanProsite end feature_page
## 153 PRINTS ID feature_page
## 154 PRINTS start feature_page
## 155 PRINTS end feature_page
## 156 Pfam ID feature_page
## 157 Pfam start feature_page
## 158 Pfam end feature_page
## 159 TIGRFAM ID feature_page
## 160 TIGRFAM start feature_page
## 161 TIGRFAM end feature_page
## 162 Gene3D ID feature_page
## 163 Gene3D start feature_page
## 164 Gene3D end feature_page
## 165 HMMPanther ID feature_page
## 166 HMMPanther start feature_page
## 167 HMMPanther end feature_page
## 168 BlastProdom ID feature_page
## 169 BlastProdom start feature_page
## 170 BlastProdom end feature_page
## 171 Interpro ID feature_page
## 172 Interpro Short Description feature_page
## 173 Interpro Description feature_page
## 174 Interpro start feature_page
## 175 Interpro end feature_page
## 176 low complexity (SEG) feature_page
## 177 low complexity (SEG) start feature_page
## 178 low complexity (SEG) end feature_page
## 179 Transmembrane domain (tmhmm) feature_page
## 180 Transmembrane domain (tmhmm) start feature_page
## 181 Transmembrane domain (tmhmm) end feature_page
## 182 signal peptide feature_page
## 183 signal peptide start feature_page
## 184 signal peptide end feature_page
## 185 coiled coil (ncoils) feature_page
## 186 coiled coil (ncoils) start feature_page
## 187 coiled coil (ncoils) end feature_page
listAttributes(ensembl) %>% filter(grepl("go", description, ignore.case = T)) # identify relevant attributes
## name
## 1 go_id
## 2 name_1006
## 3 definition_1006
## 4 go_linkage_type
## 5 namespace_1003
## 6 goslim_goa_accession
## 7 goslim_goa_description
## 8 ggorilla_homolog_ensembl_gene
## 9 ggorilla_homolog_associated_gene_name
## 10 ggorilla_homolog_ensembl_peptide
## 11 ggorilla_homolog_chromosome
## 12 ggorilla_homolog_chrom_start
## 13 ggorilla_homolog_chrom_end
## 14 ggorilla_homolog_orthology_type
## 15 ggorilla_homolog_subtype
## 16 ggorilla_homolog_orthology_confidence
## 17 ggorilla_homolog_goc_score
## 18 ggorilla_homolog_wga_coverage
## 19 ggorilla_homolog_perc_id
## 20 ggorilla_homolog_perc_id_r1
## 21 ggorilla_homolog_dn
## 22 ggorilla_homolog_ds
## description page
## 1 GO Term Accession feature_page
## 2 GO Term Name feature_page
## 3 GO Term Definition feature_page
## 4 GO Term Evidence Code feature_page
## 5 GO domain feature_page
## 6 GOSlim GOA Accession(s) feature_page
## 7 GOSlim GOA Description feature_page
## 8 Gorilla Ensembl Gene ID homologs
## 9 Gorilla associated gene name homologs
## 10 Gorilla Ensembl Protein or Transcript ID homologs
## 11 Gorilla Chromosome Name homologs
## 12 Gorilla Chromosome start (bp) homologs
## 13 Gorilla Chromosome end (bp) homologs
## 14 Gorilla homology type homologs
## 15 Last common ancestor with Gorilla homologs
## 16 Gorilla orthology confidence [0 low, 1 high] homologs
## 17 Gorilla Gene-order conservation score homologs
## 18 Gorilla Whole-genome alignment coverage homologs
## 19 %id. target Gorilla gene identical to query gene homologs
## 20 %id. query gene identical to target Gorilla gene homologs
## 21 dN with Gorilla homologs
## 22 dS with Gorilla homologs
attr <- c("affy_hugene_2_0_st_v1", "hgnc_symbol")
# Inflamatory CTRA Affy id annotated
(inflame <- getBM(attributes = attr[1:2],
filters = attr[2],
values = inflamatory,
mart = ensembl) %>% tbl_df)
## # A tibble: 38 × 2
## affy_hugene_2_0_st_v1 hgnc_symbol
## <int> <chr>
## 1 16967771 CXCL8
## 2 16786587 FOS
## 3 NA FOS
## 4 16863287 FOSB
## 5 16740630 FOSL1
## 6 NA FOSL2
## 7 16878443 FOSL2
## 8 16901974 IL1A
## 9 16901986 IL1B
## 10 NA IL1B
## # ... with 28 more rows
(inflame <- inflame %>%
filter(!is.na(affy_hugene_2_0_st_v1)) %>%
mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1)))
## # A tibble: 27 × 2
## affy_hugene_2_0_st_v1 hgnc_symbol
## <chr> <chr>
## 1 16967771 CXCL8
## 2 16786587 FOS
## 3 16863287 FOSB
## 4 16740630 FOSL1
## 5 16878443 FOSL2
## 6 16901974 IL1A
## 7 16901986 IL1B
## 8 17044177 IL6
## 9 16687875 JUN
## 10 16858710 JUNB
## # ... with 17 more rows
(inflame %>% filter(!is.na(affy_hugene_2_0_st_v1)) %>% summarise(n_distinct(hgnc_symbol))) # describe: all inflamatory molecules present
## # A tibble: 1 × 1
## `n_distinct(hgnc_symbol)`
## <int>
## 1 19
# Interferon CTRA Affy id annotated
(interfere <- getBM(attributes = attr[1:2],
filters = attr[2],
values = interferonTypeI,
mart = ensembl) %>% tbl_df)
## # A tibble: 54 × 2
## affy_hugene_2_0_st_v1 hgnc_symbol
## <int> <chr>
## 1 NA GBP1
## 2 NA IFI16
## 3 16672390 IFI16
## 4 NA IFI27
## 5 16787814 IFI27
## 6 NA IFI27L1
## 7 16787799 IFI27L1
## 8 NA IFI27L2
## 9 16796229 IFI27L2
## 10 NA IFI30
## # ... with 44 more rows
(interfere <- interfere %>%
filter(!is.na(affy_hugene_2_0_st_v1)) %>%
mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1)))
## # A tibble: 34 × 2
## affy_hugene_2_0_st_v1 hgnc_symbol
## <chr> <chr>
## 1 16672390 IFI16
## 2 16787814 IFI27
## 3 16787799 IFI27L1
## 4 16796229 IFI27L2
## 5 16859763 IFI30
## 6 16834545 IFI35
## 7 16666509 IFI44
## 8 16666485 IFI44L
## 9 16684080 IFI6
## 10 16904365 IFIH1
## # ... with 24 more rows
(interfere %>% filter(!is.na(affy_hugene_2_0_st_v1)) %>% summarise(n_distinct(hgnc_symbol))) # describe: all inflamatory molecules present
## # A tibble: 1 × 1
## `n_distinct(hgnc_symbol)`
## <int>
## 1 29
# found CTRA Affy id annotated
found <- getBM(attributes = attr,
filters = attr[2],
values = found_ctra,
mart = ensembl) %>% tbl_df
found <- found %>%
filter(!is.na(affy_hugene_2_0_st_v1)) %>%
mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1))
(found %>% filter(!is.na(affy_hugene_2_0_st_v1)) %>% count(hgnc_symbol) %>% print(n = Inf)) # duplicates for IFITM4P, REL and TNF
## # A tibble: 50 × 2
## hgnc_symbol n
## <chr> <int>
## 1 CXCL8 1
## 2 FOS 1
## 3 FOSB 1
## 4 FOSL1 1
## 5 FOSL2 1
## 6 IFI16 1
## 7 IFI27 1
## 8 IFI27L1 1
## 9 IFI27L2 1
## 10 IFI30 1
## 11 IFI35 1
## 12 IFI44 1
## 13 IFI44L 1
## 14 IFI6 1
## 15 IFIH1 1
## 16 IFIT1 1
## 17 IFIT1B 1
## 18 IFIT2 1
## 19 IFIT3 1
## 20 IFIT5 1
## 21 IFITM1 1
## 22 IFITM2 1
## 23 IFITM3 1
## 24 IFITM4P 6
## 25 IFITM5 1
## 26 IFNB1 1
## 27 IGLL1 1
## 28 IL1A 1
## 29 IL1B 1
## 30 IL6 1
## 31 IRF2 1
## 32 IRF7 1
## 33 IRF8 1
## 34 JCHAIN 1
## 35 JUN 1
## 36 JUNB 1
## 37 JUND 1
## 38 MX1 1
## 39 NFKB1 1
## 40 NFKB2 1
## 41 OAS1 1
## 42 OAS2 1
## 43 OAS3 1
## 44 OASL 1
## 45 PTGS1 1
## 46 PTGS2 1
## 47 REL 2
## 48 RELA 1
## 49 RELB 1
## 50 TNF 8
## # A tibble: 50 × 2
## hgnc_symbol n
## <chr> <int>
## 1 CXCL8 1
## 2 FOS 1
## 3 FOSB 1
## 4 FOSL1 1
## 5 FOSL2 1
## 6 IFI16 1
## 7 IFI27 1
## 8 IFI27L1 1
## 9 IFI27L2 1
## 10 IFI30 1
## # ... with 40 more rows
(feat %>% filter(hgnc_symbol %in% c("TNF", "IFITM4P", "REL"))) # confirm duplicates for IFITM4P, REL and TNF
## # A tibble: 15,024 × 9
## affy_hugene_2_0_st_v1 hgnc_symbol entrezgene ensembl_gene_id
## <chr> <chr> <int> <chr>
## 1 17030620 TNF 7124 ENSG00000232810
## 2 17030620 TNF 7124 ENSG00000232810
## 3 17030620 TNF 7124 ENSG00000232810
## 4 17030620 TNF 7124 ENSG00000232810
## 5 17030620 TNF 7124 ENSG00000232810
## 6 17030620 TNF 7124 ENSG00000232810
## 7 17030620 TNF 7124 ENSG00000232810
## 8 17030620 TNF 7124 ENSG00000232810
## 9 17030620 TNF 7124 ENSG00000232810
## 10 17030620 TNF 7124 ENSG00000232810
## # ... with 15,014 more rows, and 5 more variables: description <chr>,
## # phenotype_description <chr>, definition_1006 <chr>, name_1006 <chr>,
## # go_id <chr>
# subset features/samples
i <- inflame$affy_hugene_2_0_st_v1
i <- found$affy_hugene_2_0_st_v1
j <- !is.na(dat$Chen_Avg_ShCa)
dat0 = dat[i, j]
# Tidy edat
library("tibble") # for rownames_to_column()
# average across replicated features (Affy ids with the same hgnc_symbol), then create matrix
edat <- dat0 %>%
eData %>%
tbl_df %>%
rownames_to_column() %>%
mutate(hgnc_symbol = found$hgnc_symbol) %>%
group_by(hgnc_symbol) %>%
summarise_at(vars(matches("[0-9]")), funs(mean))
# Transpose to attain statistics/relational-database/tidy convention (transposed microarray convention)
edatt <- edat %>% dplyr::select(matches("[0-9]")) %>% t %>% as.data.frame() %>% tibble::rownames_to_column() %>% tbl_df
colnames(edatt) = c("IDNum", edat$hgnc_symbol)
# just relevant phenotype, then long format
pdat <- pData(dat0) %>% dplyr::select(IDNum, Chen_Avg_ShCa, bmi)
jdat <- left_join(pdat, edatt, by = "IDNum") %>% tbl_df %>% gather(hgnc_symbol, express, -(1:3))
# clumsy way to add molecule group labels
jdat <- jdat %>%
mutate(isInflame = 1*(hgnc_symbol %in% inflamatory)) %>%
mutate(isinterferonTypeI = 2*(hgnc_symbol %in% interferonTypeI)) %>%
mutate(isAntibody = 3*(hgnc_symbol %in% antibody)) %>%
mutate(molecule = as.factor(isInflame + isinterferonTypeI + isAntibody))
table(jdat$molecule)
##
## 1 2 3
## 475 725 50
levels(jdat$molecule) = list(inflamatory="1", interferon="2", antibody="3")
table(jdat$molecule)
##
## inflamatory interferon antibody
## 475 725 50
jdat <- jdat %>% dplyr::select(-isInflame, -isinterferonTypeI, -isAntibody)
# standardize features
jdat <- jdat %>% group_by(hgnc_symbol) %>% mutate(expressStandardized = (express - mean(express)))
# jdat %>% dplyr::select(Chen_Avg_ShCa, bmi, express) %>% plot()
# some plots
library.cran("ggvis")
##
## Attaching package: 'ggvis'
## The following object is masked from 'package:ggplot2':
##
## resolution
jdat %>% group_by(hgnc_symbol) %>% ggvis(~express, fill = ~hgnc_symbol) %>% layer_densities()
jdat %>% group_by(hgnc_symbol) %>% ggvis(~expressStandardized, fill = ~hgnc_symbol) %>% layer_densities()
jdat %>% filter(molecule == "interferon") %>% group_by(hgnc_symbol) %>% ggvis(~express, fill = ~hgnc_symbol) %>% layer_densities()
jdat %>% filter(molecule == "inflamatory") %>% group_by(hgnc_symbol) %>% ggvis(~express, fill = ~hgnc_symbol) %>% layer_densities()
jdat %>% filter(molecule == "antibody") %>% group_by(hgnc_symbol) %>% ggvis(~express, fill = ~hgnc_symbol) %>% layer_densities()
jdat %>% group_by(molecule) %>% ggvis(~express, fill = ~molecule) %>% layer_densities()
jdat %>% group_by(molecule) %>% ggvis(~express, fill = ~molecule) %>% layer_histograms()
## Guessing width = 0.2 # range / 42
jdat %>%
filter(molecule == "interferon") %>%
group_by(hgnc_symbol) %>%
ggvis(~Chen_Avg_ShCa, ~expressStandardized) %>%
layer_points() %>%
layer_model_predictions(formula = expressStandardized ~ Chen_Avg_ShCa, model = "lm", stroke = ~molecule)
edat %>% dplyr::select(matches("[0-9]")) %>% as.matrix %>% heatmap()
jdat %>%
filter(molecule == "interferon", IDNum != "1092") %>%
group_by(hgnc_symbol) %>%
ggvis(~Chen_Avg_ShCa, ~expressStandardized) %>%
layer_points() %>%
layer_model_predictions(formula = expressStandardized ~ Chen_Avg_ShCa, model = "lm", stroke = ~molecule)
jdat %>%
filter(molecule == "inflamatory") %>%
group_by(hgnc_symbol) %>%
ggvis(~Chen_Avg_ShCa, ~expressStandardized) %>%
layer_points() %>%
layer_model_predictions(formula = expressStandardized ~ Chen_Avg_ShCa, model = "lm", stroke = ~molecule)
jdat %>%
filter(molecule == "antibody") %>%
group_by(hgnc_symbol) %>%
ggvis(~Chen_Avg_ShCa, ~expressStandardized) %>%
layer_points() %>%
layer_model_predictions(formula = expressStandardized ~ Chen_Avg_ShCa, model = "lm", stroke = ~molecule)
library.cran("lme4")
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:ggvis':
##
## band
## The following object is masked from 'package:tidyr':
##
## expand
lmer(expressStandardized ~ Chen_Avg_ShCa + (1|IDNum) + (1|hgnc_symbol), data = (jdat %>% filter(molecule == "interferon"))) %>% summary
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## expressStandardized ~ Chen_Avg_ShCa + (1 | IDNum) + (1 | hgnc_symbol)
## Data: (jdat %>% filter(molecule == "interferon"))
##
## REML criterion at convergence: 961.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8486 -0.4684 0.0226 0.4678 6.1390
##
## Random effects:
## Groups Name Variance Std.Dev.
## hgnc_symbol (Intercept) 0.0000 0.0000
## IDNum (Intercept) 0.1118 0.3344
## Residual 0.1987 0.4458
## Number of obs: 725, groups: hgnc_symbol, 29; IDNum, 25
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.07645 0.07892 -0.969
## Chen_Avg_ShCa -0.17779 0.08951 -1.986
##
## Correlation of Fixed Effects:
## (Intr)
## Chn_Avg_ShC 0.488
lmer(expressStandardized ~ Chen_Avg_ShCa + (1|IDNum) + (1|hgnc_symbol), data = (jdat %>% filter(molecule == "interferon", IDNum != "1092"))) %>% summary
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## expressStandardized ~ Chen_Avg_ShCa + (1 | IDNum) + (1 | hgnc_symbol)
## Data: (jdat %>% filter(molecule == "interferon", IDNum != "1092"))
##
## REML criterion at convergence: 617.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.4819 -0.5404 0.0308 0.5628 4.5436
##
## Random effects:
## Groups Name Variance Std.Dev.
## hgnc_symbol (Intercept) 0.00000 0.0000
## IDNum (Intercept) 0.02512 0.1585
## Residual 0.13187 0.3631
## Number of obs: 696, groups: hgnc_symbol, 29; IDNum, 24
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.10086 0.03957 -2.549
## Chen_Avg_ShCa -0.09437 0.04585 -2.058
##
## Correlation of Fixed Effects:
## (Intr)
## Chn_Avg_ShC 0.459
lmer(expressStandardized ~ Chen_Avg_ShCa + (1|IDNum) + (1|hgnc_symbol), data = (jdat %>% filter(molecule == "inflamatory"))) %>% summary
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## expressStandardized ~ Chen_Avg_ShCa + (1 | IDNum) + (1 | hgnc_symbol)
## Data: (jdat %>% filter(molecule == "inflamatory"))
##
## REML criterion at convergence: 261.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1652 -0.5755 0.0123 0.5861 3.0870
##
## Random effects:
## Groups Name Variance Std.Dev.
## IDNum (Intercept) 0.006626 0.0814
## hgnc_symbol (Intercept) 0.000000 0.0000
## Residual 0.095302 0.3087
## Number of obs: 475, groups: IDNum, 25; hgnc_symbol, 19
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.0007244 0.0247184 -0.029
## Chen_Avg_ShCa -0.0016846 0.0280330 -0.060
##
## Correlation of Fixed Effects:
## (Intr)
## Chn_Avg_ShC 0.488
lmer(expressStandardized ~ Chen_Avg_ShCa + (1|IDNum) + (1|hgnc_symbol), data = (jdat %>% filter(molecule == "antibody"))) %>% summary
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## expressStandardized ~ Chen_Avg_ShCa + (1 | IDNum) + (1 | hgnc_symbol)
## Data: (jdat %>% filter(molecule == "antibody"))
##
## REML criterion at convergence: 99.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.58070 -0.45096 0.04447 0.57130 2.28322
##
## Random effects:
## Groups Name Variance Std.Dev.
## IDNum (Intercept) 4.698e-20 2.167e-10
## hgnc_symbol (Intercept) 0.000e+00 0.000e+00
## Residual 3.980e-01 6.309e-01
## Number of obs: 50, groups: IDNum, 25; hgnc_symbol, 2
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.01103 0.10219 0.108
## Chen_Avg_ShCa 0.02566 0.11590 0.221
##
## Correlation of Fixed Effects:
## (Intr)
## Chn_Avg_ShC 0.488
lmer(expressStandardized ~ Chen_Avg_ShCa + bmi + (1|IDNum) + (1|hgnc_symbol), data = (jdat %>% filter(molecule == "interferon"))) %>% summary
## Linear mixed model fit by REML ['lmerMod']
## Formula: expressStandardized ~ Chen_Avg_ShCa + bmi + (1 | IDNum) + (1 |
## hgnc_symbol)
## Data: (jdat %>% filter(molecule == "interferon"))
##
## REML criterion at convergence: 965.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8539 -0.4674 0.0199 0.4650 6.1337
##
## Random effects:
## Groups Name Variance Std.Dev.
## hgnc_symbol (Intercept) 0.0000 0.0000
## IDNum (Intercept) 0.1111 0.3333
## Residual 0.1987 0.4458
## Number of obs: 725, groups: hgnc_symbol, 29; IDNum, 25
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.62762 0.66384 0.945
## Chen_Avg_ShCa -0.11261 0.10810 -1.042
## bmi -0.04410 0.04128 -1.068
##
## Correlation of Fixed Effects:
## (Intr) C_A_SC
## Chn_Avg_ShC 0.608
## bmi -0.993 -0.564
lmer(expressStandardized ~ Chen_Avg_ShCa + (1|IDNum) + (1|hgnc_symbol), data = jdat) %>% summary
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## expressStandardized ~ Chen_Avg_ShCa + (1 | IDNum) + (1 | hgnc_symbol)
## Data: jdat
##
## REML criterion at convergence: 1612
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.8706 -0.4719 -0.0024 0.4681 7.6510
##
## Random effects:
## Groups Name Variance Std.Dev.
## hgnc_symbol (Intercept) 0.00000 0.0000
## IDNum (Intercept) 0.03373 0.1837
## Residual 0.20221 0.4497
## Number of obs: 1250, groups: hgnc_symbol, 50; IDNum, 25
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.04417 0.04452 -0.992
## Chen_Avg_ShCa -0.10273 0.05049 -2.034
##
## Correlation of Fixed Effects:
## (Intr)
## Chn_Avg_ShC 0.488
jdat %>% ggvis(~bmi, ~Chen_Avg_ShCa) %>% layer_points(size = ~bmi)
jdat %>% filter(IDNum != "1092") %>% ggvis(~bmi, ~Chen_Avg_ShCa) %>% layer_points(size = ~bmi)
jdat %>% filter(IDNum != "1092") %>% ggvis(~bmi, ~expressStandardized) %>% layer_points()
jdat %>% ggvis(~bmi) %>% layer_histograms()
## Guessing width = 0.2 # range / 36
jdat %>%
group_by(hgnc_symbol) %>%
ggvis(~Chen_Avg_ShCa, ~express) %>%
layer_points(fill = ~hgnc_symbol) %>%
layer_model_predictions(formula = express ~ Chen_Avg_ShCa, model = "loess", stroke = ~hgnc_symbol)
jdat %>%
group_by(molecule) %>%
ggvis(~Chen_Avg_ShCa, ~express) %>%
layer_smooths(stroke = ~factor(molecule)) %>%
layer_points(fill = ~factor(molecule))
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
# ggplot2
jdat %>% ggplot(aes(x = Chen_Avg_ShCa, y = express, col = hgnc_symbol)) + geom_point(alpha = 0.2) + geom_smooth(method = "loess", se = FALSE, aes(col = hgnc_symbol))
jdat %>% ggplot(aes(x = Chen_Avg_ShCa, y = express, col = hgnc_symbol)) + geom_point(alpha = 0.2) + geom_smooth(method = "lm", se = FALSE, aes(col = hgnc_symbol))
# messy way to get a tidy df with one new collum giving molecule class name (inflammatory, antibody, interferon)
tidE_3 <- edat %>%
mutate(isInflame = (hgnc_symbol %in% inflamatory)) %>%
mutate(isinterferonTypeI = (hgnc_symbol %in% interferonTypeI)) %>%
mutate(isAntibody = (hgnc_symbol %in% antibody)) %>%
gather(key = moleculeFamily, value, isInflame, isinterferonTypeI, isAntibody) %>%
filter(value == TRUE) %>% dplyr::select(-value) %>%
group_by(moleculeFamily) %>%
summarise_at(vars(matches("[0-9]")), funs(mean))
# call limma here, or just lm. This is one data compression.
tidE_4 <- rbind(tidE_3, c(NA, dat0$Chen_Avg_ShCa))
tidE_4 %>% dplyr::select(matches("[0-9]")) %>% t %>% rcorr( type = "spearman")
## 1 2 3 4
## 1 1.00 -0.21 -0.15 0.08
## 2 -0.21 1.00 0.22 -0.02
## 3 -0.15 0.22 1.00 -0.47
## 4 0.08 -0.02 -0.47 1.00
##
## n= 25
##
##
## P
## 1 2 3 4
## 1 0.3065 0.4881 0.6997
## 2 0.3065 0.2994 0.9427
## 3 0.4881 0.2994 0.0180
## 4 0.6997 0.9427 0.0180
# format for linear model (limma)
matE = edat %>%
dplyr::select(matches("[0-9]")) %>%
as.matrix()
rownames(matE) <- unique(edat$hgnc_symbol)
# linear model
library.bioconductor("limma")
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
X <- model.matrix(~ Chen_Avg_ShCa, data = pData(dat0))
Y <- matE
fit <- lmFit(Y, X)
ebayes <- eBayes(fit) # Variance parameter inference is variable/unstable, because few replicates. T-statistic penalizes this instability (low df). In conventional Type-I paradigm must increase threshold, and loose power. Consider pseudo-replicates to borrow strength across.
(top <- topTable(ebayes, coef="Chen_Avg_ShCa", adjust="fdr", n=100) %>% tbl_df %>% tibble::rownames_to_column("hgnc_symbol"))
## # A tibble: 50 × 7
## hgnc_symbol logFC AveExpr t P.Value adj.P.Val
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 IFIH1 -0.3251154 6.624734 -3.001100 0.005806392 0.2903196
## 2 MX1 -0.3892383 6.869637 -2.545291 0.017080578 0.3058802
## 3 IFIT3 -0.4474916 5.538752 -2.442369 0.021571633 0.3058802
## 4 JUN 0.2582925 4.075052 2.321041 0.028248913 0.3058802
## 5 IFI35 -0.1764350 6.506621 -2.094531 0.045935890 0.3058802
## 6 IFI44 -0.4198591 5.436538 -1.948257 0.062070961 0.3058802
## 7 IFIT5 -0.2439950 5.976889 -1.945992 0.062355743 0.3058802
## 8 OAS1 -0.2658196 6.822599 -1.908996 0.067170972 0.3058802
## 9 OASL -0.2509461 6.024771 -1.902800 0.068008277 0.3058802
## 10 OAS2 -0.2256543 6.871324 -1.851076 0.075359000 0.3058802
## # ... with 40 more rows, and 1 more variables: B <dbl>
(top <- topTable(ebayes, coef="Chen_Avg_ShCa", adjust="fdr", number = Inf) %>% tbl_df %>% tibble::rownames_to_column("hgnc_symbol"))
## # A tibble: 50 × 7
## hgnc_symbol logFC AveExpr t P.Value adj.P.Val
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 IFIH1 -0.3251154 6.624734 -3.001100 0.005806392 0.2903196
## 2 MX1 -0.3892383 6.869637 -2.545291 0.017080578 0.3058802
## 3 IFIT3 -0.4474916 5.538752 -2.442369 0.021571633 0.3058802
## 4 JUN 0.2582925 4.075052 2.321041 0.028248913 0.3058802
## 5 IFI35 -0.1764350 6.506621 -2.094531 0.045935890 0.3058802
## 6 IFI44 -0.4198591 5.436538 -1.948257 0.062070961 0.3058802
## 7 IFIT5 -0.2439950 5.976889 -1.945992 0.062355743 0.3058802
## 8 OAS1 -0.2658196 6.822599 -1.908996 0.067170972 0.3058802
## 9 OASL -0.2509461 6.024771 -1.902800 0.068008277 0.3058802
## 10 OAS2 -0.2256543 6.871324 -1.851076 0.075359000 0.3058802
## # ... with 40 more rows, and 1 more variables: B <dbl>
post_hoc_set_level <- function(Y, X, perm = TRUE){
# Geoman/Buhlmann 2007 (Table 4): Tuckey' non-competitive subject sampling 2 x 2 table. This adapts the historic post hoc method to test a self-contained null hypothesis and to calculate the P-value using subject sampling.
# The null hypothesis assumes (X1, Y1), …, (Xn, Yn) are independent and identically distributed. Then the implied null disribution of any test statistic calculated from (X1, Y1),…, (Xn, Yn) has a distribution that can be non-parametrically constructed as the empirical distribution of the same test-statistic based on (X1, Yπ (1)), …, (Xn, Yπ (n)), where the distribution is generated from all (or many randomly generated) permutations π : {1, … ,n} → {1, … ,n}.
# Criticism: too much researcher df to define feature-levl "discoveries"
# Criticism: The permutation null distribution requires the assumption that there is no relationship between the gene expressions
# and covariates: permuting destroys these associations. This makes the permutation null distribution less useful when covariates are present.
if(perm){
perms <- sample(dim(X)[1])
X <- X[perms, ]
}
# some feature-level statistic of effect strength, e.g. linear model
fit <- lmFit(Y, X)
ebayes <- eBayes(fit) # Variance parameter inference is variable/unstable, because few replicates. T-statistic penalizes this instability (low df). In conventional Type-I paradigm must increase threshold, and loose power. Consider pseudo-replicates to borrow strength across.
top <- topTable(ebayes, coef="Chen_Avg_ShCa", adjust="fdr", n=Inf) %>% tbl_df %>% tibble::rownames_to_column("hgnc_symbol")
discovery <- (top$P.Value < 0.05) # this is just a labeling and so no multiple comparison correction needed
discovery <- abs(top$logFC) > 0.3 # this is just a labeling and so no multiple comparison correction needed
discovery <- abs(top$t) > abs(qt(0.05,25)) # this is just a labeling and so no multiple comparison correction needed
return(sum(discovery))
}
k_perm = 1000 %>% replicate(post_hoc_set_level(Y = Y, X = X)) # null assuming exchangeablility
(k_obs = post_hoc_set_level(Y = Y, X = X, perm = FALSE)) # observed
## [1] 14
(p_val_set_level = sum(k_perm >= k_obs) / length(k_perm))
## [1] 0.053
hist(k_perm, breaks = 1000) # visualize null
# Geomans Global Score test
# Being a score test, the global test is most focused on alternatives close to the null hypothesis. This means that the global test is good at detecting alternatives that have many small effects (in terms of the chosen parametrization), but that it may not be the optimal test to use if the effects are very large.
library.bioconductor("globaltest")
##
## Attaching package: 'globaltest'
## The following object is masked from 'package:tidyr':
##
## extract
n_perm = 1e5
Y = dat0$Chen_Avg_ShCa # NB change notation: Y = X and X = Y!!!!
X = matE %>% t %>% tbl_df
summary(gt(Y ~ 1, Y ~ ., data = X, permutations = n_perm))
## "gt.object" object from package globaltest
##
## Call:
## gt(response = Y ~ 1, alternative = Y ~ ., data = X, permutations = n_perm)
##
## Model: linear regression.
## Degrees of freedom: 25 total; 1 null; 1 + 50 alternative.
## Null distibution: 99999 random permutations.
##
## p-value Statistic Expected Std.dev #Cov
## 1 0.0607 8.92 4.17 2.66 50
# with addtional confound bmi
bmi = dat0$bmi
summary(gt(Y ~ 1 + bmi, Y ~ . + bmi, data = X, permutations = n_perm))
## "gt.object" object from package globaltest
##
## Call:
## gt(response = Y ~ 1 + bmi, alternative = Y ~ . + bmi, data = X, permutations = n_perm)
##
## Model: linear regression.
## Degrees of freedom: 25 total; 2 null; 2 + 51 alternative.
## Null distibution: 99999 random permutations.
##
## p-value Statistic Expected Std.dev #Cov
## 1 0.408 3.74 4.15 2.78 51
# with all feature interactions
summary(gt(Y ~ 1, Y ~ 1 + .*., data = X, permutations = n_perm))
## "gt.object" object from package globaltest
##
## Call:
## gt(response = Y ~ 1, alternative = Y ~ 1 + . * ., data = X, permutations = n_perm)
##
## Model: linear regression.
## Degrees of freedom: 25 total; 1 null; 1 + 1275 alternative.
## Null distibution: 99999 random permutations.
##
## p-value Statistic Expected Std.dev #Cov
## 1 0.0573 9.86 4.16 3.07 1275
hist(gt(Y,X, permutations=1e4))
# only interferon
interfer_ind = which(names(X) %in% interferonTypeI)
inflame_ind = which(names(X) %in% inflamatory)
antibody_ind = which(names(X) %in% antibody)
summary(gt(Y ~ 1, Y ~ 1 + ., data = X[, interfer_ind], permutations = n_perm))
## "gt.object" object from package globaltest
##
## Call:
## gt(response = Y ~ 1, alternative = Y ~ 1 + ., data = X[, interfer_ind], permutations = n_perm)
##
## Model: linear regression.
## Degrees of freedom: 25 total; 1 null; 1 + 29 alternative.
## Null distibution: 99999 random permutations.
##
## p-value Statistic Expected Std.dev #Cov
## 1 0.0565 10.5 4.17 3.35 29
summary(gt(Y ~ 1, Y ~ 1 + ., data = X[, inflame_ind], permutations = n_perm))
## "gt.object" object from package globaltest
##
## Call:
## gt(response = Y ~ 1, alternative = Y ~ 1 + ., data = X[, inflame_ind], permutations = n_perm)
##
## Model: linear regression.
## Degrees of freedom: 25 total; 1 null; 1 + 19 alternative.
## Null distibution: 99999 random permutations.
##
## p-value Statistic Expected Std.dev #Cov
## 1 0.417 4.23 4.17 1.71 19
summary(gt(Y ~ 1, Y ~ 1 + ., data = X[, antibody_ind], permutations = n_perm))
## "gt.object" object from package globaltest
##
## Call:
## gt(response = Y ~ 1, alternative = Y ~ 1 + ., data = X[, antibody_ind], permutations = n_perm)
##
## Model: linear regression.
## Degrees of freedom: 25 total; 1 null; 1 + 2 alternative.
## Null distibution: 99999 random permutations.
##
## p-value Statistic Expected Std.dev #Cov
## 1 0.568 1.87 4.19 4.96 2
X = eData(dat[, j]) %>% t %>% tbl_df
labDfns = as.character(as.numeric(dat0$Chen_Avg_ShCa < 0) + 1) par(mfrow = c(1, 3)) X[, inflame_ind] %>% t %>% heatmap(ColSideColors = labDfns) X[, interfer_ind] %>% t %>% heatmap(ColSideColors = labDfns) X[, antibody_ind] %>% t %>% heatmap(ColSideColors = labDfns)
kmeans1 = kmeans(X[, ],centers=3, nstart = 1000) lm(dat0\(Chen_Avg_ShCa ~ as.factor(kmeans1\)cluster)) %>% summary plot(kmeans1\(cluster, dat0\)Chen_Avg_ShCa) # subject 1092 is outlying (number 3) lm(dat0\(Chen_Avg_ShCa[-3] ~ as.factor(kmeans1\)cluster[-3])) %>% summary plot(dat0\(Chen_Avg_ShCa[-3], kmeans1\)cluster[-3])
feat %>% group_by(hgnc_symbol) %>% summarise(n_distinct(affy_hugene_2_0_st_v1)) %>% dim # how many HGNC genes measured on our Affy feat %>% filter(grepl(“microRNA”, description, ignore.case = T)) query = “” query = “behavior” query = “aggressive” query = “defense” query = “learning” query = “virus” query = “regulation of gene expression” query = “immune” query = “fear” query = “inflammatory” query = “cytokine” query = “interferon” query = “response to cortisol” query = “response to epinephrine” a = feat %>% filter(grepl(query, name_1006, ignore.case = TRUE)) %>% dplyr::select(affy_hugene_2_0_st_v1, name_1006, go_id, hgnc_symbol) %>% filter(!duplicated(affy_hugene_2_0_st_v1)) # intersection with ctra? intersect(ctra, a$hgnc_symbol) %>% length
X = dat[a$affy_hugene_2_0_st_v1, j] %>% eData() %>% t summary(gt(Y ~ 1, Y ~ ., data = X, permutations = n_perm))
XY = left_join(pdat, X %>% as.data.frame() %>% tibble::rownames_to_column(“IDNum”) %>% tbl_df, by = “IDNum”)
XY = gather(XY, affy, express, matches(“[0-9]”)) # long format
XY %>% group_by(affy) %>% ggvis(~Chen_Avg_ShCa, ~express) %>% layer_points(fill = ~affy) %>% layer_model_predictions(formula = express ~ Chen_Avg_ShCa, model = “lm”, stroke = ~affy)
a$name_1006 %>% unique a %>% dplyr::select(name_1006, go_id) %>% unique a %>% dplyr::select(affy_hugene_2_0_st_v1) %>% n_distinct() # how many affy probes in this ontology?
cases = !is.na(dat\(Chen_Avg_ShCa) chen = dat\)Chen_Avg_ShCa[cases] y = dat[a$affy_hugene_2_0_st_v1, cases] %>% eData() rownames(y) = a\(name_1006 rownames(y) = a\)hgnc_symbol heatmap(y)
d = dim(y)[1] rho = rcorr(t(rbind(chen, y)), type = “spearman”) rho\(r[1,2:d] %>% hist(breaks = 10) rho\)P[1,2:d] %>% hist(breaks = 10) rho$P[1,2:d] %>% min
km = kmeans(t(matE),centers=2, iter.max = 1000, nstart = 1000000) labKm = as.character(km\(cluster) labDfns = as.character(as.numeric(dat0\)Chen_Avg_ShCa <= 0) + 1) # plot heatmap(matE, ColSideColors = labKm, labRow = dat0\(hgnc_symbol) heatmap(matE, ColSideColors = labDfns, labRow = dat0\)hgnc_symbol) glMDSPlot(matE, groups = labKm) # mean inflamatory activation (subject_specific_inflamation = colMeans(matE)) %>% hist(breaks = 100) a = rcorr(t(rbind(subject_specific_inflamation, dat0\(Chen_Avg_ShCa))) # component-wise inflamatory activation a = rcorr(t(rbind(matE, dat0\)Chen_Avg_ShCa))) tidE\(hgnc_symbol[a\)P[20, ] < 0.05]
(ctra_core <- getBM(attributes = attr[1:2], filters = attr[2], values = ctraCore, mart = ensembl) %>% tbl_df) (ctra_core <- ctra_core %>% filter(!is.na(affy_hugene_2_0_st_v1)) %>% mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1))) # describe ctra_core %>% summarise(n_distinct(hgnc_symbol)) # all inflamatory molecules present dat0 = dat[ctra_core\(affy_hugene_2_0_st_v1, !is.na(dat\)Chen_Avg_ShCa)] tidE = dat0 %>% eData %>% tbl_df %>% tibble::rownames_to_column() %>% mutate(hgnc_symbol = ctra_core\(hgnc_symbol) %>% group_by(hgnc_symbol) %>% summarise_at(vars(matches("[0-9]")), funs(mean)) matE = tidE %>% dplyr::select(matches("[0-9]")) %>% as.matrix() km = kmeans(t(matE),centers=3, iter.max = 1000, nstart = 1000000) labKm = as.character(km\)cluster) heatmap(matE, labRow = tidE$hgnc_symbol, ColSideColors = labKm) glMDSPlot(matE, groups = labKm) # multidimensional scaling
dist1 = dist(t(matE)) heatmap(as.matrix(dist1),Colv=NA,Rowv=NA) hclust1 = hclust(dist1) plot(hclust1, labels = labels) plot(hclust1, labels = labels, hang = -1) dend = as.dendrogram(hclust1) dend = color_labels(hclust1,2,col=1:2) # identify/color 2 groups plot(dend) labels_colors(dend) <- labDfns plot(dend)
kmeans1 = kmeans(edata,centers=2) names(kmeans1) matplot(t(kmeans1\(centers),col=1:2,type="l", lwd=3) table(kmeans1\)cluster) heatmap(as.matrix(edata)[order(kmeans1$cluster),],col=colramp,Colv=NA,Rowv=NA) # reorder the data according to cluster membership
a <- rowMeans(edata) d <- rowMeans(edata[, labels == “2”]) - rowMeans(edata[, labels == “1”]) smoothScatter(a, d, main=“MA plot”, xlab=“A”, ylab=“M”) abline(h=c(-1,1), col=“red”)
j <- !is.na(dat$Chen_Avg_ShCa) D <- dat[i, j] # subset features samples
X <- model.matrix(~ Chen_Avg_ShCa, data = pData(D)) y <- eData(D) fit <- lmFit(y, X) ebayes <- eBayes(fit) # Variance parameter inference is variable/unstable, because few replicates. T-statistic penalizes this instability (low df). In conventional Type-I paradigm must increase threshold, and loose power. Consider pseudo-replicates to borrow strength across. top <- topTable(ebayes, coef=“Chen_Avg_ShCa”, adjust=“fdr”, n=100) %>% tbl_df %>% tibble::rownames_to_column(“affy_hugene_2_0_st_v1”) %>% mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1)) heatmap(y[top$affy_hugene_2_0_st_v1, ])
a = feat %>% filter(affy_hugene_2_0_st_v1 %in% top$affy_hugene_2_0_st_v1)
a = feat %>% filter(affy_hugene_2_0_st_v1 %in% top\(affy_hugene_2_0_st_v1) %>% dplyr::select(affy_hugene_2_0_st_v1, hgnc_symbol) %>% filter(!duplicated(affy_hugene_2_0_st_v1)) %>% print(n = Inf) setequal(a\)affy_hugene_2_0_st_v1, top$affy_hugene_2_0_st_v1) left_join(top, a, by = “affy_hugene_2_0_st_v1”) attr <- c(“affy_hugene_2_0_st_v1”, “hgnc_symbol”)
attr <- c(“affy_hugene_2_0_st_v1”, “hgnc_symbol”) (hits <- getBM(attributes = attr, filters = attr[1], values = top\(affy_hugene_2_0_st_v1, mart = ensembl) %>% tbl_df) (hits <- hits %>% mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1))) # type coersion left_join(top, hits, by = "affy_hugene_2_0_st_v1") %>% print(n = Inf) (enriched = hits\)hgnc_symbol %>% unique) write(enriched, file = “enriched.txt”)
attr <- c(“affy_hugene_2_0_st_v1”, “hgnc_symbol”, “name_1006”) # why does adding go_id reduce the dimension of return object (hits <- getBM(attributes = attr, filters = attr[1], values = top$affy_hugene_2_0_st_v1, mart = ensembl) %>% tbl_df) hits <- hits %>% filter(grepl(query, name_1006, ignore.case = T)) %>% filter(!duplicated(affy_hugene_2_0_st_v1)) %>% mutate(affy_hugene_2_0_st_v1 = as.character(affy_hugene_2_0_st_v1)) left_join(top, hits, by = “affy_hugene_2_0_st_v1”)
hits$name_1006 %>% unique (relevant_hits <- hits %>% filter(grepl(query, name_1006, ignore.case = TRUE)) %>% dplyr::select(affy_hugene_2_0_st_v1, name_1006, hgnc_symbol))
listAttributes(ensembl) %>% filter(grepl(“feature_page”, page, ignore.case = T)) %>% filter(grepl(“go”, description, ignore.case = T))
library(“hugene20sttranscriptcluster.db”) sapply(annotate::lookUp(probeset, “hugene20sttranscriptcluster.db”, “PMID”), head) absts <- annotate::pm.getabst(probeset[1], “hugene20sttranscriptcluster.db”) # check refs for the top genes titl <- annotate::pm.titles(absts[1]) # get titles strwrap(titl, simplify=FALSE)
molecule = “16657652” sapply(annotate::lookUp(molecule, “hugene20sttranscriptcluster.db”, “PMID”), head) absts <- annotate::pm.getabst(molecule, “hugene20sttranscriptcluster.db”) # check refs for the top genes annotate::pm.titles(absts)
library.bioconductor(“KEGG.db”) library.bioconductor(“GO.db”) library.bioconductor(“annaffy”) atab <- aafTableAnn( probeset, “hugene20sttranscriptcluster.db”, aaf.handler(chip = “hugene20sttranscriptcluster.db”)) saveHTML(atab, file=“report2.html”) browseURL(“report2.html”)
d <- fdata %>% dplyr::filter(grepl(“transcription”,DESC)) ind <- d\(affy_hugene_2_0_st_v1 %>% as.character edata <- edata[ind, ] edata <- log2(edata + 1) edatac<- edata - rowMeans(edata) # centered svd1 <- svd(edatac) names(svd1) par(pch=19, mfrow=c(1,2)) plot(svd1\)d^2/sum(svd1\(d^2),ylab="Percent Variance Explained",col=2) plot(svd1\)v[,1],svd1$v[,2], ylab=“2nd PC”, xlab=“1st PC”,col= treat + 1) # check svd(), prcomp() X <- edata (s <- svd(X))
fa <- factanal(t(X), 3, rotation=“varimax”)
library(GEOquery) eList = getGEO(“GSE11675”) # the processed data length(eList) names(eList) eData = eList[[1]] eData names(pData(eData)) eList2 = getGEOSuppFiles(“GSE11675”) # the raw data
?ExpressionSet # S4 classes getClass(“ExpressionSet”) validObject(dat) showMethods(“as.data.frame”) # S4 methods, generic function dispatches on x getMethod(“as.data.frame”, “GenomicRanges”) # look at the function if x = GenomicRanges method?“as.data.frame,GenomicRanges”
library(“glmnet”) CV = cv.glmnet(x = t(y), y = chen, family = “gaussian”, alpha = 1, nlambda = 1000); plot(CV) fit = glmnet(x = t(y), y = chen, family = “gaussian”, alpha = 1, lambda = CV$lambda.1se)
mydata <- y wss <- (nrow(mydata)-1)*sum(apply(mydata,2,var)) for (i in 2:15) wss[i] <- sum(kmeans(t(mydata), centers=i)$withinss) plot(1:15, wss, type=“b”, xlab=“Number of Clusters”, ylab=“Within groups sum of squares”)
library(GEOquery) outNepali = getGEO(“GSE77164”) outNepali = outNepali[[1]]
pdat = outNepali %>% pData() edat = outNepali %>% eData() pdat\(characteristics_ch1 # soldier status tmp = edat[rownames(edat) %in% ctra0, ] heatmap(tmp, ColSideColors = as.character(as.numeric(pdat\)characteristics_ch1)))
kmeans1 = kmeans(t(tmp),centers=2, iter.max = 10000, nstart = 10000) heatmap(tmp, ColSideColors = as.character(kmeans1\(cluster)) table(kmeans1\)cluster, pdat\(characteristics_ch1) chisq.test(kmeans1\)cluster, pdat$characteristics_ch1)
pdat %>% dplyr::select(matches(“chara.*[^16]\(")) %>% head (vars = pdat %>% dplyr::select(matches("chara.*[^16]\)“)) %>% names()) form = as.formula( paste(”kmeans1$cluster ~“, paste(”as.factor(“, vars,”)“, collapse =”+“))) summary(lm(formula = form, data = pdat ))
library(“JGL”) ## run fused graphical lasso # Network topology # Erdos-Rengyi random # Small world: Watts, strogatz networks have high transitivity (clustering) AND low geodesics (path length), which fits real-world topology. Their model explains realworld topological features (transtivitity/geodesics) in terms of shortcuts in real-world networks which connect modules that are highly clustered. # Scale free: Barabasi, Albert. Powerlaw NOT poisson degree distributions. Rich-get-richer. Importance of hub nodes. # Duplication-divergence: # Magnetization
e = outNepali[rownames(outNepali) %in% ctra0, ] %>% eData e1 = e[, pdat$characteristics_ch1 == “childsoldier: 0”] e2 = e[, pdat$characteristics_ch1 == “childsoldier: 1”] nepali = list(t(e1), t(e2)) # expects samples on rows, features/genes across columns # lambda is a sparsity parameter (it offers another solution to the exploding multiple comparison problem, 400 million ways to go wrong). Some “statistical learning” people talk about finding a “penalty function that induces realistic structure”. # Sparcity constraints. Loss function based on some distributional assumptions (likelyhood, pseudo-likelihood, conditional likelihood) like multivariate normality. Motivate more flexible approach. fgl.results = JGL(Y=nepali,penalty=“group”,lambda1=.1,lambda2=.1, return.whole.theta = TRUE) a1 = (fgl.results\(theta[[1]] != 0) # plot adjacency (inverse covariance) matrix (only returned over the connected nodes if return.whole.theta = FALSE, and only the diagonal of the matrix is returned over the unconnected nodes (diag.theta.unconnected) a2 = (fgl.results\)theta[[2]] != 0) par(mfrow=c(1,2)) image(a1) image(a2)
g1 = graph.adjacency(a1, mode = “undirected”, diag = 0) g2 = graph.adjacency(a2, mode = “undirected”, diag = 0) mean(degree(g1)) - mean(degree(g2)) sd(degree(g1)) - sd(degree(g2))
E(g1)\(color = "green" V(g1)\)color <- “green” V(g1)\(color[degree(g1)>3] <- "red" E(g1)\)color[1:4] <- “red” par(mfrow = c(1,1)) plot(g1) subnetworks(fgl.results$theta) # get subnetwork membership of FGL results:
permutationNull <- function(){ p = dim(e)[1] n = dim(e)[2] n1 = dim(e1)[2]
ord = sample(n, replace = FALSE) # random permutation e1 = e[, ord[1:n1]] e2 = e[, ord[(n1+1):n]] y = list(t(e1), t(e2)) # expects samples on rows, features/genes across columns out = JGL(Y=y,penalty=“group”,lambda1=1,lambda2=1, return.whole.theta = TRUE)
# use igraph
a1 = (out$theta[[1]] != 0)
a2 = (out$theta[[2]] != 0)
g1 = graph.adjacency(a1, mode = "undirected", diag = 0) # inverse covariance matrix is only returned over the connected nodes if return.whole.theta = FALSE, and only the diagonal of the matrix is returned over the unconnected nodes (diag.theta.unconnected)
g2 = graph.adjacency(a2, mode = "undirected", diag = 0)
degr = data.frame(degree(g1), degree(g2))
par(mfrow = c(1,2))
image(a1)
image(a2)
plot(g1)
plot(g2)
# return(list(dat = degr))
return(norm(a1-a2, type = "2"))
}
out = replicate(10, permutationNull())
par(mfrow = c(1, 2)) net.degree(fgl.results\(theta) %>% sapply(hist, breaks = 100) #List the degrees of the most connected nodes in each class. net.hubs(fgl.results\)theta, nhubs = 10) #For each class, returns the names of the nodes connected to a given node, here IL6. ind = which(colnames(nepali[[2]]) %in% “IL6”) net.neighbors(fgl.results$theta, index = ind)
tmp = jdat %>% mutate(highBmi = 1*(bmi > median(bmi))) tmp %>% group_by(molecule, highBmi) %>% ggvis(~express, fill = ~interaction(molecule, highBmi)) %>% layer_densities()