Timothy Tickle and Brian Haas
October 1, 2015
Let's take a moment.
# Load libraries
library(fpc) # Density based clustering dbscan
library(gplots) # Colorpanel
library(scatterplot3d) # 3D plotting
library(monocle)
library(tsne) # Non-linear ordination
library(pheatmap)
library(MASS)
library(cluster)
library(mclust)
library(flexmix)
library(lattice)
library(amap)
library(RColorBrewer)
library(locfit)
library(Seurat)
library(vioplot)
# Source code
source(file.path("src", "Modules.R")) # Helper functions
Islam S et al. Characterization of the single-cell transcriptional landscape by highly multiplex RNA-seq . Genome Research 2011
# Load tab delimited file
data.set = read.delim(file.path("data", "GSE29087_L139_updated.txt"))
rownames(data.set)
[1] "Tor1aip2" "Pnkd" "Smyd3" "4921521F21Rik"
[5] "Gpbar1" "1700016C15Rik"
colnames(data.set)
[1] "ES_A01" "ES_B01" "ES_C01" "ES_D01" "ES_E01" "ES_F01"
# Plot genes per cell ( how many genes express )
genes.per.cell <- apply(data.set, 2, function(x) sum(x > 0))
cell.outlier = plot.cell.complexity(genes.per.cell)
cell.outlier
ES_F06 MEF_D12
46 92
# Remove outlier cells
data.set = data.set[, -1 * cell.outlier]
ncol(data.set)
[1] 90
plot.quantiles(data.set)
# Remove low expressing genes
data.cleaned <- func_filter_by_occurence(data.set, 10, 10)
data.cleaned.norm <- func_cpx(data.cleaned)
func_plot_saturation_curve(data.cleaned[, 1], 1000)
#TODO Izzary paper
nbt = read.into.seurat(file.path("data", "HiSeq301_RSEM_linear_values.txt"),
sep = "\t", header = TRUE, row.names = 1)
nbt = setup(nbt, project = "NBT", min.cells = 3, names.field = 2, names.delim = "_",
min.genes = 1000, is.expr = 1)
vlnPlot(nbt, c("DPPA4"))
cellPlot(nbt, nbt@cell.names[1], nbt@cell.names[2], do.ident = FALSE)
cellPlot(nbt, nbt@cell.names[3], nbt@cell.names[4], do.ident = FALSE)
Things to be aware of.
nbt = prep.pca.seurat(y.cutoff = 2, x.low.cutoff = 2)
pca.plot(nbt, 1, 2, pt.size = 3)
print.pca(nbt, 1)
[1] "PC1"
[1] "LGALS1" "TIMP1" "KRT18" "IFI30" "ARHGDIB"
[6] "IFI27" "UCA1" "HIST1H2BK" "KRT15" "LCN2"
[11] "S100A9" "KRT81" "ALDH1A3" "KLK5" "CEACAM6"
[1] ""
[1] "SOX11" "TUBB2B" "DCX" "GPM6A" "CRMP1"
[6] "RTN1" "NNAT" "C1orf61" "STMN2" "FABP7"
[11] "LOC150568" "41520" "TMSB15A" "PPP2R2B" "GAP43"
[16] "NREP"
[1] ""
[1] ""
viz.pca(nbt, 1:2)
nbt = run_tsne(nbt, dims.use = 1:11, max_iter = 2000)
tsne.plot(nbt, pt.size = 3)
pca.plot(nbt, 1, 2, pt.size = 3, group.by = "nGene")
tsne.plot(nbt, pt.size = 3, group.by = "nGene")
nbt = DBclust_dimension(nbt, 1, 2, reduction.use = "tsne", G.use = 8, set.ident = TRUE)
nbt = buildClusterTree(nbt, do.reorder = TRUE, reorder.numeric = TRUE, pcs.use = 1:11,
do.plot = FALSE)
tsne.plot(nbt, do.label = TRUE, label.pt.size = 0.5)
For each group (ES or MEF).
Differential Expression.
## Setting up cells groups Get groupings data.groups <- rep( NA, ncol(
## data.cleaned ) ) data.groups[ grep( 'MEF', names( data.cleaned )) ] <-
## 'MEF' data.groups[ grep( 'ES', names( data.cleaned )) ] <- 'ES'
## data.groups <- factor( data.groups, levels = c('ES','MEF') )
# library(scde)
## Calculate error models o.ifm <- scde.error.models( as.matrix( data.cleaned
## ), groups = data.groups, n.cores=3, threshold.segmentation=TRUE,
## save.crossfit.plot=FALSE, save.model.plots=FALSE, verbose=1 )
## Filter out cell (QC) o.ifm <- o.ifm[ o.ifm$corr.a > 0, ]
## Set up the Prior (starting value) o.prior <-
## scde.expression.prior(models=o.ifm,counts=as.matrix( data.cleaned ),
## length.out=400,show.plot=FALSE)
## Perform T-test like analysis ediff <-
## scde.expression.difference(o.ifm,as.matrix(data.cleaned),
## o.prior,groups=data.groups,n.randomizations=100, n.cores=1,verbose=1)
## write.table(ediff[order(abs(ediff$Z),decreasing=T),],
## file='scde_results.txt',row.names=T,col.names=T, sep='\t',quote=F)
source(file.path("src", "RaceID_class.R"))
# Load tutorial data
race.in <- read.csv(file.path("data", "transcript_counts_intestine.xls"), sep = "\t",
header = TRUE)
rownames(race.in) <- race.in$GENEID
race.in <- race.in[grep("ERCC", rownames(race.in), invert = TRUE), -1]
race.data <- SCseq(race.in)
race.data <- filterdata(race.data, mintotal = 3000, minexpr = 5, minnumber = 1,
maxexpr = 500, downsample = FALSE, dsn = 1, rseed = 17000)
race.data <- clustexp(race.data, metric = "pearson", cln = 0, do.gap = TRUE,
clustnr = 20, B.gap = 50, SE.method = "Tibs2001SEmax", SE.factor = 0.25,
bootnr = 50, rseed = 17000)
boot 1
boot 2
boot 3
boot 4
boot 5
boot 6
boot 7
boot 8
boot 9
boot 10
boot 11
boot 12
boot 13
boot 14
boot 15
boot 16
boot 17
boot 18
boot 19
boot 20
boot 21
boot 22
boot 23
boot 24
boot 25
boot 26
boot 27
boot 28
boot 29
boot 30
boot 31
boot 32
boot 33
boot 34
boot 35
boot 36
boot 37
boot 38
boot 39
boot 40
boot 41
boot 42
boot 43
boot 44
boot 45
boot 46
boot 47
boot 48
boot 49
boot 50
plotgap(race.data)
race.data <- findoutliers(race.data, outminc = 5, outlg = 2, probthr = 0.001,
thr = 2^-(1:40), outdistquant = 0.75)
race.data <- comptsne(race.data, rseed = 15555)
plottsne(race.data, final = FALSE)
plottsne(race.data, final = TRUE)
target.genes <- c("Apoa1__chr9", "Apoa1bp__chr3", "Apoa2__chr1", "Apoa4__chr9",
"Apoa5__chr9")
plotexptsne(race.data, target.genes)
plotsymbolstsne(race.data, type = sub("\\_\\d+$", "", names(race.data@ndata)))
TODO: the methodology briefly
TODO:Can we tie this into SCDE?
Needed files -
# Do not run (For later) monocle.data <- make_cell_data_set(
# expression_file='monocle_exprs.txt',
# cell_phenotype_file='monocle_cell_meta.txt',
# gene_metadata_file='monocle_gene_meta.txt' )
# Get data for today
monocle.data <- get_monocle_presentation_data()
# Require a minimun of 0.1 expression
monocle.data <- detectGenes(monocle.data, min_expr = 0.1)
# Require atleast 50 cells to have the minimum 0.1 expression Get name of
# genes pass these filters
monocle.expr.genes <- row.names(subset(fData(monocle.data), num_cells_expressed >=
50))
Other QC can be performed.
plot_log_normal_monocle(monocle.data)
# Marker genes of biological interest
marker.genes <- get_monocle_presentation_marker_genes()
# Select from those marker genes those important to the study
ordering.genes <- select_ordering_genes(monocle.data, monocle.expr.genes, marker.genes,
"expression~Media", 0.01)
# Order the cells by expression
monocle.data <- order_cells_wrapper(monocle.data, ordering.genes, use_irlba = FALSE,
num_paths = 2, reverse = TRUE)
# Plot all cells in study with ordering
plot_spanning_tree(monocle.data)
# Write data to file TODO
<simpleError in lm.fit(X.vlm, y = z.vlm, ...): NA/NaN/Inf in 'y'>
monocle.data.diff.states <- monocle.data[monocle.expr.genes, pData(monocle.data)$State !=
3]
# TODO subset is not generic, needs gene_short_name attr
subset.for.plot <- subset_to_genes(monocle.data.diff.states, c("CDK1", "MEF2C",
"MYH3"))
plot_genes_in_pseudotime(subset.for.plot, color_by = "Hours")
# Get genes of interest
subset.pseudo <- subset_to_genes(monocle.data, c("MYH3", "MEF2C", "CCNB2", "TNNT1"))
subset.pseudo <- subset.pseudo[, pData(subset.pseudo)$State != 3]
subset.pseudo.diff <- differentialGeneTest(subset.pseudo, fullModelFormulaStr = "expression~sm.ns(Pseudotime)")
plot_genes_in_pseudotime(subset.pseudo, color_by = "Hours")
#TODO Add theory about diffusion maps.
Please note this is a collection of many peoples ideas. Included in the download is a references.txt to document sources, tutorials, software, and links to cute corgi pictures :-)
# pdf( 'data/my_file.pdf', useDingbats = FALSE ) # Start pdf plot( 1:10,
# log(1:10 ) ) # plot in to the pdf file plot( seq(0,.9,.1), sin(0:9) ) #
# another plot for the pdf file dev.off() # Close pdf file ( very important
# )