This study is GSE85599 using CEL files in total RNA of acute infectious mononucleosis and chronic EBV, few samples, uses a platform. So we will be combining the CEL files with the platform and using our previous project on Autism to work the similar way.
library(rmarkdown)
series31 <- read.table("GSE85599_series_matrix.txt", nrow=31)
series <- read.table("GSE85599_series_matrix.txt", skip=31, nrow=35)
paged_table(series)
library(oligo)
## Loading required package: BiocGenerics
## Loading required package: generics
##
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
##
## as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
## setequal, union
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
## unsplit, which.max, which.min
## Loading required package: oligoClasses
## Welcome to oligoClasses version 1.74.0
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: Biostrings
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: XVector
## Loading required package: Seqinfo
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## ================================================================================
## Welcome to oligo version 1.76.0
## ================================================================================
library(GEOquery)
## Setting options('download.file.method.GEOquery'='auto')
## Setting options('GEOquery.inmemory.gpl'=FALSE)
getGEOSuppFiles("GSE85599")
## Using locally cached version of supplementary file(s) GSE85599 found here:
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar
## size
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar 380016640
## isdir
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar FALSE
## mode
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar 666
## mtime
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar 2026-05-19 17:49:05
## ctime
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar 2026-05-19 17:42:56
## atime
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar 2026-05-19 19:48:26
## exe
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar no
## uname
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar jlcor
## udomain
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar DATAMASSAGER1
## fname
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar GSE85599_RAW.tar
## destdir
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599
## filepath
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar
## GEO
## C:/Users/jlcor/Desktop/GSE85599 2017 study mononucleosis and EBV/GSE85599/GSE85599_RAW.tar GSE85599
untar("GSE85599/GSE85599_RAW.tar", exdir="data/")
cel_files <- list.celfiles("files CEL/", full.names = TRUE)
cel_files
## [1] "files CEL/GSM2279022_15VC1799_HTA-2_0.CEL"
## [2] "files CEL/GSM2279023_15VC1803_HTA-2_0.CEL"
## [3] "files CEL/GSM2279024_15VC2293_HTA-2_0.CEL"
## [4] "files CEL/GSM2279025_15VC2295_HTA-2_0.CEL"
## [5] "files CEL/GSM2279026_15VC2386_HTA-2_0.CEL"
## [6] "files CEL/GSM2279027_15VC2407_HTA-2_0.CEL"
## [7] "files CEL/GSM2279028_15VC2414_HTA-2_0.CEL"
## [8] "files CEL/GSM2279029_15VC2656_HTA-2_0.CEL"
## [9] "files CEL/GSM2279030_15VC3454_HTA-2_0.CEL"
## [10] "files CEL/GSM2279031_C13_HTA-2_0.CEL"
## [11] "files CEL/GSM2279032_C15_HTA-2_0.CEL"
## [12] "files CEL/GSM2279033_C21_HTA-2_0.CEL"
## [13] "files CEL/GSM2279034_C22_HTA-2_0.CEL"
## [14] "files CEL/GSM2279035_C23_HTA-2_0.CEL"
## [15] "files CEL/GSM2279036_C24_HTA-2_0.CEL"
## [16] "files CEL/GSM2279037_ED483_HTA-2_0.CEL"
## [17] "files CEL/GSM2279038_ED490_HTA-2_0.CEL"
BiocManager::version()
## [1] '3.23'
I need the pd.hta.2.0 package for my version of bioconductor 3.23.
BiocManager::install("pd.hta.2.0")
data <- read.celfiles(cel_files)
## Loading required package: pd.hta.2.0
## Loading required package: RSQLite
## Loading required package: DBI
## Platform design info loaded.
## Reading in : files CEL/GSM2279022_15VC1799_HTA-2_0.CEL
## Reading in : files CEL/GSM2279023_15VC1803_HTA-2_0.CEL
## Reading in : files CEL/GSM2279024_15VC2293_HTA-2_0.CEL
## Reading in : files CEL/GSM2279025_15VC2295_HTA-2_0.CEL
## Reading in : files CEL/GSM2279026_15VC2386_HTA-2_0.CEL
## Reading in : files CEL/GSM2279027_15VC2407_HTA-2_0.CEL
## Reading in : files CEL/GSM2279028_15VC2414_HTA-2_0.CEL
## Reading in : files CEL/GSM2279029_15VC2656_HTA-2_0.CEL
## Reading in : files CEL/GSM2279030_15VC3454_HTA-2_0.CEL
## Reading in : files CEL/GSM2279031_C13_HTA-2_0.CEL
## Reading in : files CEL/GSM2279032_C15_HTA-2_0.CEL
## Reading in : files CEL/GSM2279033_C21_HTA-2_0.CEL
## Reading in : files CEL/GSM2279034_C22_HTA-2_0.CEL
## Reading in : files CEL/GSM2279035_C23_HTA-2_0.CEL
## Reading in : files CEL/GSM2279036_C24_HTA-2_0.CEL
## Reading in : files CEL/GSM2279037_ED483_HTA-2_0.CEL
## Reading in : files CEL/GSM2279038_ED490_HTA-2_0.CEL
data
## HTAFeatureSet (storageMode: lockedEnvironment)
## assayData: 6892960 features, 17 samples
## element names: exprs
## protocolData
## rowNames: GSM2279022_15VC1799_HTA-2_0.CEL
## GSM2279023_15VC1803_HTA-2_0.CEL ... GSM2279038_ED490_HTA-2_0.CEL
## (17 total)
## varLabels: exprs dates
## varMetadata: labelDescription channel
## phenoData
## rowNames: GSM2279022_15VC1799_HTA-2_0.CEL
## GSM2279023_15VC1803_HTA-2_0.CEL ... GSM2279038_ED490_HTA-2_0.CEL
## (17 total)
## varLabels: index
## varMetadata: labelDescription channel
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: pd.hta.2.0
normalized.data <- rma(data)
## Background correcting
## Normalizing
## Calculating Expression
normalized.data
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 70523 features, 17 samples
## element names: exprs
## protocolData
## rowNames: GSM2279022_15VC1799_HTA-2_0.CEL
## GSM2279023_15VC1803_HTA-2_0.CEL ... GSM2279038_ED490_HTA-2_0.CEL
## (17 total)
## varLabels: exprs dates
## varMetadata: labelDescription channel
## phenoData
## rowNames: GSM2279022_15VC1799_HTA-2_0.CEL
## GSM2279023_15VC1803_HTA-2_0.CEL ... GSM2279038_ED490_HTA-2_0.CEL
## (17 total)
## varLabels: index
## varMetadata: labelDescription channel
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: pd.hta.2.0
normalized.expr <- exprs(normalized.data)
normalized.expr <- as.data.frame(exprs(normalized.data))
write.csv(normalized.expr,'normalized.expr1.csv', row.names=T)
paged_table(normalized.expr[1:10,])
Map probe IDs into gene symbols.
gse <- getGEO("GSE85599", GSEMatrix = T)
gse
Next get the feature IDs and store it.
feature.data <- gse$GSE85599_series_matrix.txt.gz@featureData@data
The above produces additional information with alternate gene ID symbols to the Probe IDs as well as other information.
paged_table(feature.data[1:10,])
feature.data <- read.csv('feature.data.csv', header=T, row.names=1)
paged_table(feature.data[1:10,])
subset to only get the gene symbols and probe IDs. We will get the gene IDs from two different comparable messy features.
colnames(feature.data)
## [1] "ID" "probeset_id" "seqname" "strand"
## [5] "start" "stop" "total_probes" "gene_assignment"
## [9] "mrna_assignment" "swissprot" "unigene" "category"
## [13] "locus.type" "notes" "SPOT_ID"
We will use columns 1, 8, and 9. The gene symbols are embedded in parentheses in column 9 but is the 2nd item between //…// in the 8th column.
feature.data1 <- feature.data[,c(1,8,9)]
paged_table(feature.data1[1:20,])
Add an ID column to the normalized.expr data frame of samples by probe ID.
normalized.expr$ID <- row.names(normalized.expr)
paged_table(normalized.expr[1:10,])
str(normalized.expr)
## 'data.frame': 70523 obs. of 21 variables:
## $ ID : chr "1" "2" "3" "4" ...
## $ GSM2279022_AIM : num 11.9 11.4 11.1 11.1 10.1 ...
## $ GSM2279023_AIM : num 11.74 11.25 10.92 11.14 9.86 ...
## $ GSM2279024_AIM : num 11.6 11.02 10.91 11.01 9.89 ...
## $ GSM2279025_CAEBV : num 12.1 11.4 11.3 11.5 10.4 ...
## $ GSM2279026_AIM : num 11.66 10.91 10.98 11.05 9.96 ...
## $ GSM2279027_CAEBV : num 11.63 10.89 10.68 11.1 9.47 ...
## $ GSM2279028_CAEBV : num 11.58 10.57 10.92 10.9 9.26 ...
## $ GSM2279029_CAEBV : num 11.8 11.1 11.2 11.2 10.3 ...
## $ GSM2279030_CAEBV : num 11.66 11.04 11.05 11.05 9.87 ...
## $ GSM2279031_healthy: num 11.52 10.95 10.98 10.84 9.78 ...
## $ GSM2279032_healthy: num 11.9 11.5 11.3 11.2 10.3 ...
## $ GSM2279033_healthy: num 11.67 11.28 10.99 11.2 9.89 ...
## $ GSM2279034_healthy: num 11.6 10.8 10.9 10.9 10.1 ...
## $ GSM2279035_healthy: num 11.9 11.1 11 11.1 10.2 ...
## $ GSM2279036_healthy: num 11.5 10.8 10.8 11 9.8 ...
## $ GSM2279037_AIM : num 11.65 10.88 10.83 11.13 9.92 ...
## $ GSM2279038_AIM : num 11.79 10.99 10.7 11.05 9.84 ...
## $ gene_assignment : chr "--- // --- // Positive Housekeeping Controls - Probeset Type: normgene->exon // --- // ---" "--- // --- // Positive Housekeeping Controls - Probeset Type: normgene->exon // --- // ---" "--- // --- // Positive Housekeeping Controls - Probeset Type: normgene->exon // --- // ---" "--- // --- // Positive Housekeeping Controls - Probeset Type: normgene->exon // --- // ---" ...
## $ mrna_assignment : chr "---" "---" "---" "---" ...
## $ gene : chr " --- " " --- " " --- " " --- " ...
normalized.expr <- merge(normalized.expr,feature.data1, by="ID")
str(normalized.expr)
## 'data.frame': 70523 obs. of 21 variables:
## $ ID : chr "1" "2" "3" "4" ...
## $ GSM2279022_AIM : num 11.9 11.4 11.1 11.1 10.1 ...
## $ GSM2279023_AIM : num 11.74 11.25 10.92 11.14 9.86 ...
## $ GSM2279024_AIM : num 11.6 11.02 10.91 11.01 9.89 ...
## $ GSM2279025_CAEBV : num 12.1 11.4 11.3 11.5 10.4 ...
## $ GSM2279026_AIM : num 11.66 10.91 10.98 11.05 9.96 ...
## $ GSM2279027_CAEBV : num 11.63 10.89 10.68 11.1 9.47 ...
## $ GSM2279028_CAEBV : num 11.58 10.57 10.92 10.9 9.26 ...
## $ GSM2279029_CAEBV : num 11.8 11.1 11.2 11.2 10.3 ...
## $ GSM2279030_CAEBV : num 11.66 11.04 11.05 11.05 9.87 ...
## $ GSM2279031_healthy: num 11.52 10.95 10.98 10.84 9.78 ...
## $ GSM2279032_healthy: num 11.9 11.5 11.3 11.2 10.3 ...
## $ GSM2279033_healthy: num 11.67 11.28 10.99 11.2 9.89 ...
## $ GSM2279034_healthy: num 11.6 10.8 10.9 10.9 10.1 ...
## $ GSM2279035_healthy: num 11.9 11.1 11 11.1 10.2 ...
## $ GSM2279036_healthy: num 11.5 10.8 10.8 11 9.8 ...
## $ GSM2279037_AIM : num 11.65 10.88 10.83 11.13 9.92 ...
## $ GSM2279038_AIM : num 11.79 10.99 10.7 11.05 9.84 ...
## $ gene_assignment : chr "--- // --- // Positive Housekeeping Controls - Probeset Type: normgene->exon // --- // ---" "--- // --- // Positive Housekeeping Controls - Probeset Type: normgene->exon // --- // ---" "--- // --- // Positive Housekeeping Controls - Probeset Type: normgene->exon // --- // ---" "--- // --- // Positive Housekeeping Controls - Probeset Type: normgene->exon // --- // ---" ...
## $ mrna_assignment : chr "---" "---" "---" "---" ...
## $ gene : chr " --- " " --- " " --- " " --- " ...
Lets get the series information for the sample IDs and the sample type as AIM or CAEBV.
paged_table(series)
We want the 1st and 12th rows.
sampleInformation <- series[c(1,12),(2:18)]
paged_table(sampleInformation)
colnames(normalized.expr)
## [1] "ID" "GSM2279022_AIM" "GSM2279023_AIM"
## [4] "GSM2279024_AIM" "GSM2279025_CAEBV" "GSM2279026_AIM"
## [7] "GSM2279027_CAEBV" "GSM2279028_CAEBV" "GSM2279029_CAEBV"
## [10] "GSM2279030_CAEBV" "GSM2279031_healthy" "GSM2279032_healthy"
## [13] "GSM2279033_healthy" "GSM2279034_healthy" "GSM2279035_healthy"
## [16] "GSM2279036_healthy" "GSM2279037_AIM" "GSM2279038_AIM"
## [19] "gene_assignment" "mrna_assignment" "gene"
Columns 2-18 we should identify as the type of sample and remove the excess information.
colnames(normalized.expr)[2:18] <- gsub("_.*","",colnames(normalized.expr)[2:18])
colnames(normalized.expr)
## [1] "ID" "GSM2279022" "GSM2279023" "GSM2279024"
## [5] "GSM2279025" "GSM2279026" "GSM2279027" "GSM2279028"
## [9] "GSM2279029" "GSM2279030" "GSM2279031" "GSM2279032"
## [13] "GSM2279033" "GSM2279034" "GSM2279035" "GSM2279036"
## [17] "GSM2279037" "GSM2279038" "gene_assignment" "mrna_assignment"
## [21] "gene"
sampleInfo <- data.frame(t(sampleInformation))
sampleInfo
## X1 X12
## V2 GSM2279022 diagnosis: acute infectious mononucleosis (AIM)
## V3 GSM2279023 diagnosis: acute infectious mononucleosis (AIM)
## V4 GSM2279024 diagnosis: acute infectious mononucleosis (AIM)
## V5 GSM2279025 diagnosis: chronic active EBV infection (CAEBV)
## V6 GSM2279026 diagnosis: acute infectious mononucleosis (AIM)
## V7 GSM2279027 diagnosis: chronic active EBV infection (CAEBV)
## V8 GSM2279028 diagnosis: chronic active EBV infection (CAEBV)
## V9 GSM2279029 diagnosis: chronic active EBV infection (CAEBV)
## V10 GSM2279030 diagnosis: chronic active EBV infection (CAEBV)
## V11 GSM2279031 diagnosis: healthy
## V12 GSM2279032 diagnosis: healthy
## V13 GSM2279033 diagnosis: healthy
## V14 GSM2279034 diagnosis: healthy
## V15 GSM2279035 diagnosis: healthy
## V16 GSM2279036 diagnosis: healthy
## V17 GSM2279037 diagnosis: acute infectious mononucleosis (AIM)
## V18 GSM2279038 diagnosis: acute infectious mononucleosis (AIM)
AIM <- grep("AIM",sampleInfo$X12)
CAEBV <- grep("CAEBV", sampleInfo$X12)
healthy <- grep("healthy", sampleInfo$X12)
sampleInfo$X12[AIM] <- "AIM"
sampleInfo$X12[CAEBV] <- "CAEBV"
sampleInfo$X12[healthy] <- "healthy"
paged_table(sampleInfo)
sampleInfo$samples <- paste(sampleInfo$X1,sampleInfo$X12,sep='_')
paged_table(sampleInfo)
colnames(normalized.expr)[2:18] <- sampleInfo$samples
colnames(normalized.expr)
## [1] "ID" "GSM2279022_AIM" "GSM2279023_AIM"
## [4] "GSM2279024_AIM" "GSM2279025_CAEBV" "GSM2279026_AIM"
## [7] "GSM2279027_CAEBV" "GSM2279028_CAEBV" "GSM2279029_CAEBV"
## [10] "GSM2279030_CAEBV" "GSM2279031_healthy" "GSM2279032_healthy"
## [13] "GSM2279033_healthy" "GSM2279034_healthy" "GSM2279035_healthy"
## [16] "GSM2279036_healthy" "GSM2279037_AIM" "GSM2279038_AIM"
## [19] "gene_assignment" "mrna_assignment" "gene"
class <- sampleInfo$X12
Got it — you want a regular expression in R that extracts the second item between // delimiters from a string.
For example, if you have:
Copy code “//first//second//third” you want “second”.
R Solution using stringr R
Copy code
library(stringr)
text <- “//first//second//third”
second_item <- str_match(text, “//[/]//([^/])”)[, 2]
print(second_item) # “second” How the regex works ^ → start of string // → match the first // [^/]* → match any characters except / (first item) // → match the second // ([^/]*) → capture group for the second item (until next /) If you have multiple strings R
Copy code texts <- c(“//a//b//c”, “//x//y//z”, “//one//two//three”)
second_items <- str_match(texts, “//[/]//([^/])”)[, 2] print(second_items) # [1] “b” “y” “two”
That was from internet search on how to extract 2nd listed item between // in our data.
normalized.expr$gene <- str_match(normalized.expr$gene_assignment, "^//[^/]*//([^/]*)")
That one didn’t work. text <- “abc //first// second //second// third //third//”
first_item <- str_match(text, “//(.*?)//“)[, 2]
cat(“First item between // is:”, first_item, “”)
This is another one to try extracting gene.
normalized.expr$gene <- str_match(normalized.expr$gene_assignment, "//(.*?)//")[, 2]
That one worked for regular expression extraction of gene. Lets get the columns we want and write this out to csv.
#colnames(normalized.expr)
dataNeeded <- normalized.expr[,c(1,21,2:18)]
Write this file out to read in later.
write.csv(normalized.expr,"InfectiousAcuteMononuculeosisAndChronicActiveEBV_genes.csv", row.names=F)
write.csv(dataNeeded,"dataNeeded_ML_AIM_CAEBV.csv", row.names=F)
=====================
Machine learning to predict the class of the sample as AIM, CAEBV, or healthy.
table(class)
## class
## AIM CAEBV healthy
## 6 5 6
We have 6 acute infectious mononucleosis, 5 chronic active EBV, and 6 healthy samples.
colnames(dataNeeded)
## [1] "ID" "gene" "GSM2279022_AIM"
## [4] "GSM2279023_AIM" "GSM2279024_AIM" "GSM2279025_CAEBV"
## [7] "GSM2279026_AIM" "GSM2279027_CAEBV" "GSM2279028_CAEBV"
## [10] "GSM2279029_CAEBV" "GSM2279030_CAEBV" "GSM2279031_healthy"
## [13] "GSM2279032_healthy" "GSM2279033_healthy" "GSM2279034_healthy"
## [16] "GSM2279035_healthy" "GSM2279036_healthy" "GSM2279037_AIM"
## [19] "GSM2279038_AIM"
dataNeeded$AIM_mean <- rowMeans(dataNeeded[,c(3,4,5,7,18,19)],dims=1)
dataNeeded$CAEBV_mean <- rowMeans(dataNeeded[,c(6,8,9,10,11)], dims=1)
dataNeeded$healthy_mean <- rowMeans(dataNeeded[,c(12:17)], dims=1)
dataNeeded$FC_AIM_healthy <- dataNeeded$AIM_mean/dataNeeded$healthy_mean
dataNeeded$FC_CAEBV_healthy <- dataNeeded$CAEBV_mean/dataNeeded$healthy_mean
AIM_orderedFC <- dataNeeded[order(dataNeeded$FC_AIM_healthy, decreasing=T),]
CAEBV_orderedFC <- dataNeeded[order(dataNeeded$FC_CAEBV_healthy, decreasing=T),]
triples <- grep('---', AIM_orderedFC$gene)
triples2 <- grep('---', CAEBV_orderedFC$gene)
AIM_genes <- AIM_orderedFC[-triples,] #32670 rows
CAEBV_genes <- CAEBV_orderedFC[-triples2,] #32670 rows
topAIM20 <- AIM_genes[c(1:10,32661:32670),]
topCAEBV20 <- CAEBV_genes[c(1:10,32661:32670),]
Make our matrices for both.
colnames(AIM_genes)
## [1] "ID" "gene" "GSM2279022_AIM"
## [4] "GSM2279023_AIM" "GSM2279024_AIM" "GSM2279025_CAEBV"
## [7] "GSM2279026_AIM" "GSM2279027_CAEBV" "GSM2279028_CAEBV"
## [10] "GSM2279029_CAEBV" "GSM2279030_CAEBV" "GSM2279031_healthy"
## [13] "GSM2279032_healthy" "GSM2279033_healthy" "GSM2279034_healthy"
## [16] "GSM2279035_healthy" "GSM2279036_healthy" "GSM2279037_AIM"
## [19] "GSM2279038_AIM" "AIM_mean" "CAEBV_mean"
## [22] "healthy_mean" "FC_AIM_healthy" "FC_CAEBV_healthy"
SAme for both top genes datasets.
AIM_mx <- data.frame(t(topAIM20[,3:19]))
colnames(AIM_mx) <- topAIM20$gene
AIM_mx$class <- as.factor(class)
CAEBV_mx <- data.frame(t(topCAEBV20[,3:19]))
colnames(CAEBV_mx) <- topCAEBV20$gene
CAEBV_mx$class <- as.factor(class)
paged_table(AIM_mx)
paged_table(CAEBV_mx)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:BiocGenerics':
##
## combine
set.seed(123)
inTrain <- sample(1:17,.8*17)
training <- AIM_mx[inTrain,]
testing <- AIM_mx[-inTrain,]
table(training$class)
##
## AIM CAEBV healthy
## 5 3 5
table(testing$class)
##
## AIM CAEBV healthy
## 1 2 1
rf_AIM <- randomForest(training[1:20], training$class, mtry=7, ntree=5000, confusion=T, importance=T)
rf_AIM$confusion
## AIM CAEBV healthy class.error
## AIM 5 0 0 0.0000000
## CAEBV 1 2 0 0.3333333
## healthy 0 0 5 0.0000000
So, 100% accuracy on the AIM and healthy class, but 67% accuracy on the CAEBV class.
Lets see how well these genes predict on validation testing set.
predictionAIM <- predict(rf_AIM,testing)
results <- data.frame(predicted=predictionAIM, actual=testing$class)
results
## predicted actual
## GSM2279028_CAEBV healthy CAEBV
## GSM2279030_CAEBV CAEBV CAEBV
## GSM2279034_healthy healthy healthy
## GSM2279038_AIM AIM AIM
Overall, on this 3 class model, it scored 75% accuracy in prediction, but one of the CAEBV samples was predicted healthy.
Now lets see how well the genes predict for the CAEBV class.
set.seed(345)
inTrain <- sample(1:17, .80*17)
training <- CAEBV_mx[inTrain,]
testing <- CAEBV_mx[-inTrain,]
table(training$class)
##
## AIM CAEBV healthy
## 5 4 4
table(testing$class)
##
## AIM CAEBV healthy
## 1 1 2
rf_CAEBV <- randomForest(training[1:20], training$class, mtry=7, ntree=5000, confusion=T, importance=T)
rf_CAEBV$confusion
## AIM CAEBV healthy class.error
## AIM 5 0 0 0
## CAEBV 0 4 0 0
## healthy 0 0 4 0
This was really great in prediction of the class with 100% accuracy in each class of AIM, CAEBV, or healthy. Lets see how well it predicts on hold out validation 20% of samples data.
predictionCAEBV <- predict(rf_CAEBV,testing)
results2 <- data.frame(predicted=predictionCAEBV, actual=testing$class)
results2
## predicted actual
## GSM2279025_CAEBV AIM CAEBV
## GSM2279033_healthy healthy healthy
## GSM2279035_healthy healthy healthy
## GSM2279037_AIM AIM AIM
The model scored 75% accuracy for this set of genes incorrectly predicting the CAEBV sample as acute infectious mononucleosis or AIM.
These seem to be great genes to use in our top genes of AIM and CAEBV. That was a 3 class model and that score was great for being more than 2 classes, generally speaking.