This is the fold change by median values instead of by mean values to see if the accuracy in classification improves. Some samples might just need to be removed as there are about 4-5 in different classes where some genes are greatly over expresses in relation to their neighbors in each class. This is the median fold change version of LymeDiseaseTicks.Rmd.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
descriptors2 <- read.csv('descriptors2.csv')
LymeDisease4 <- read.csv('LymeDisease4normalized-easynames.csv', sep=',',
header=T, na.strings = c('',' ','NA'))
lymeMx2 <- read.csv('lymeMx2-denormalized-easynames.csv', sep=',',header=T,
na.strings=c('',' ','NA'))
Lets remove those samples from our data: Samples: 7 of the acute class samples, sample 12 of the 1 month class samples, sample 10 of the 6 month class samples, and samples 1, 11, and 12 of the healthy class samples.
colnames(lymeMx2)
## [1] "gene" "healthyControl_1" "healthyControl_2"
## [4] "healthyControl_3" "healthyControl_4" "healthyControl_5"
## [7] "healthyControl_6" "healthyControl_7" "healthyControl_8"
## [10] "healthyControl_9" "healthyControl_10" "healthyControl_11"
## [13] "healthyControl_12" "healthyControl_13" "healthyControl_14"
## [16] "healthyControl_15" "healthyControl_16" "healthyControl_17"
## [19] "healthyControl_18" "healthyControl_19" "healthyControl_20"
## [22] "healthyControl_21" "acuteLymeDisease_1" "acuteLymeDisease_2"
## [25] "acuteLymeDisease_3" "acuteLymeDisease_4" "acuteLymeDisease_5"
## [28] "acuteLymeDisease_6" "acuteLymeDisease_7" "acuteLymeDisease_8"
## [31] "acuteLymeDisease_9" "acuteLymeDisease_10" "acuteLymeDisease_11"
## [34] "acuteLymeDisease_12" "acuteLymeDisease_13" "acuteLymeDisease_14"
## [37] "acuteLymeDisease_15" "acuteLymeDisease_16" "acuteLymeDisease_17"
## [40] "acuteLymeDisease_18" "acuteLymeDisease_19" "acuteLymeDisease_20"
## [43] "acuteLymeDisease_21" "acuteLymeDisease_22" "acuteLymeDisease_23"
## [46] "acuteLymeDisease_24" "acuteLymeDisease_25" "acuteLymeDisease_26"
## [49] "acuteLymeDisease_27" "acuteLymeDisease_28" "Antibodies_1month_1"
## [52] "Antibodies_1month_2" "Antibodies_1month_3" "Antibodies_1month_4"
## [55] "Antibodies_1month_5" "Antibodies_1month_6" "Antibodies_1month_7"
## [58] "Antibodies_1month_8" "Antibodies_1month_9" "Antibodies_1month_10"
## [61] "Antibodies_1month_11" "Antibodies_1month_12" "Antibodies_1month_13"
## [64] "Antibodies_1month_14" "Antibodies_1month_15" "Antibodies_1month_16"
## [67] "Antibodies_1month_17" "Antibodies_1month_18" "Antibodies_1month_19"
## [70] "Antibodies_1month_20" "Antibodies_1month_21" "Antibodies_1month_22"
## [73] "Antibodies_1month_23" "Antibodies_1month_24" "Antibodies_1month_25"
## [76] "Antibodies_1month_26" "Antibodies_1month_27" "Antibodies_6months_1"
## [79] "Antibodies_6months_2" "Antibodies_6months_3" "Antibodies_6months_4"
## [82] "Antibodies_6months_5" "Antibodies_6months_6" "Antibodies_6months_7"
## [85] "Antibodies_6months_8" "Antibodies_6months_9" "Antibodies_6months_10"
lymeMx2b <- lymeMx2[,-c(2,12,13,87,62,29)]
LymeDisease4b <- LymeDisease4[,-c(2,12,13,87,62,29)]
Now, we can use this data to find the median values across samples and get the fold change values, then plot the data in Tableau.
LymeDisease5 <- LymeDisease4b %>% group_by(Gene) %>% summarise_at(vars('healthyControl_2':'Antibodies_6months_9'),median)
## Warning: Factor `Gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
lymeMx3 <- lymeMx2b %>% group_by(gene) %>% summarise_at(vars('healthyControl_2':'Antibodies_6months_9'),median)
## Warning: Factor `gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
Lyme6 <- LymeDisease5 %>% group_by(Gene) %>%
mutate(
healthy_Median = median(healthyControl_2:healthyControl_21,na.rm=T),
acuteLymeDisease_Median = median(acuteLymeDisease_1:acuteLymeDisease_28,na.rm=T),
antibodies_1month_Median = median(Antibodies_1month_1:Antibodies_1month_27,na.rm=T),
antibodies_6month_Median = median(Antibodies_6months_1:Antibodies_6months_9,na.rm=T)
)
## Warning: Factor `Gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `Gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
tail(colnames(Lyme6),5)
## [1] "Antibodies_6months_9" "healthy_Median"
## [3] "acuteLymeDisease_Median" "antibodies_1month_Median"
## [5] "antibodies_6month_Median"
lymeMx4 <- lymeMx3 %>% group_by(gene) %>%
mutate(
healthy_median = median(healthyControl_2:healthyControl_21,na.rm=T),
acuteLymeDisease_median = median(acuteLymeDisease_1:acuteLymeDisease_28,na.rm=T),
antibodies_1month_median = median(Antibodies_1month_1:Antibodies_1month_27,na.rm=T),
antibodies_6month_median = median(Antibodies_6months_1:Antibodies_6months_9,na.rm=T)
)
## Warning: Factor `gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
tail(colnames(lymeMx4),5)
## [1] "Antibodies_6months_9" "healthy_median"
## [3] "acuteLymeDisease_median" "antibodies_1month_median"
## [5] "antibodies_6month_median"
lymeMx5 <- lymeMx4 %>% group_by(gene) %>%
mutate(acuteHealthy_foldChange=acuteLymeDisease_median/healthy_median,
antibodies_1month_healthy_foldChange=antibodies_1month_median/healthy_median,
antibodies_6month_healthy_foldchange=antibodies_6month_median/healthy_median)
## Warning: Factor `gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
tail(colnames(lymeMx5),10)
## [1] "Antibodies_6months_7"
## [2] "Antibodies_6months_8"
## [3] "Antibodies_6months_9"
## [4] "healthy_median"
## [5] "acuteLymeDisease_median"
## [6] "antibodies_1month_median"
## [7] "antibodies_6month_median"
## [8] "acuteHealthy_foldChange"
## [9] "antibodies_1month_healthy_foldChange"
## [10] "antibodies_6month_healthy_foldchange"
Lyme7 <- Lyme6 %>% group_by(Gene) %>%
mutate(acuteHealthy_foldChange=acuteLymeDisease_Median/healthy_Median,
antibodies_1month_healthy_foldChange=antibodies_1month_Median/healthy_Median,
antibodies_6month_healthy_foldchange=antibodies_6month_Median/healthy_Median)
## Warning: Factor `Gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning: Factor `Gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
tail(colnames(Lyme7),10)
## [1] "Antibodies_6months_7"
## [2] "Antibodies_6months_8"
## [3] "Antibodies_6months_9"
## [4] "healthy_Median"
## [5] "acuteLymeDisease_Median"
## [6] "antibodies_1month_Median"
## [7] "antibodies_6month_Median"
## [8] "acuteHealthy_foldChange"
## [9] "antibodies_1month_healthy_foldChange"
## [10] "antibodies_6month_healthy_foldchange"
Our tables of unique genes grouped by genes to get their medians of each gene within each sample for the duplicate genes, the added features of each class’s median gene expression per gene, and the fold change ratio of the diseased or treated to the healthy gene expression values have been created. The normalized data or the original data is the Lyme7 data frame and the denormalized data is the lymeMx5 data frame. Now each shrunk from 48851 genes to 19526 genes when grouping by unique genes, but now that is still a lot of genes, so lets take the gene that have the top 10 most expressed and least expressed values in both data frames by acute/healthy fold change, and the top 10 and bottom 10 of the 1month of antibodies/healthy fold change values, and finally the top 10 and bottom 10 of the 6 month of antibodies/healthy fold change values. *** The denormalized group first:
Acute/healthy top 10 and bottom 10 genes by fold change data frame:
acuteHealthy20 <- lymeMx5[order(lymeMx5$acuteHealthy_foldChange,
decreasing=T)[c(1:10,19517:19526)],]
One month/healthy top 10 and bottom 10 genes by fold change data frame:
month1healthy20 <- lymeMx5[order(lymeMx5$antibodies_1month_healthy_foldChange,
decreasing=T)[c(1:10,19517:19526)],]
Six month/healthy top 10 and bottom 10 genes by fold change data frame:
month6healthy20 <- lymeMx5[order(lymeMx5$antibodies_6month_healthy_foldchange,
decreasing=T)[c(1:10,19517:19526)],]
lymeMx6 <- rbind(acuteHealthy20,month1healthy20,month6healthy20)
lymeMx7 <- lymeMx6[!duplicated(lymeMx6),]
There were 45 unique genes between all three fold change groups in the denormalized data out of 60 genes that were either the top 10 or bottom 10 of genes being expressed.
Now, for the normalized data:
Acute/healthy top 10 and bottom 10 genes by fold change data frame:
acuteHealthy20b <- Lyme7[order(Lyme7$acuteHealthy_foldChange,
decreasing=T)[c(1:10,19517:19526)],]
One month/healthy top 10 and bottom 10 genes by fold change data frame:
month1healthy20b <- Lyme7[order(Lyme7$antibodies_1month_healthy_foldChange,
decreasing=T)[c(1:10,19517:19526)],]
Six month/healthy top 10 and bottom 10 genes by fold change data frame:
month6healthy20b <- Lyme7[order(Lyme7$antibodies_6month_healthy_foldchange,
decreasing=T)[c(1:10,19517:19526)],]
Lyme8 <- rbind(acuteHealthy20b,month1healthy20b,month6healthy20b)
Lyme9 <- Lyme8[!duplicated(Lyme8),]
There are 34 genes unique to the normalized data, probably because this data had negative values. The scaling done to denormalize this data is probably not exactly what the true raw values are. But they should have the same number of genes, but this one has 10 less than the normalized data. We will see later which one can be split into training and testing sets with better prediction accuracy within each class and overall.
Lets also add the gene summaries to these data frames and create a field that will give the class of each sample. This file,genecards2.R, is an R file sourced for the functions made in previous scripts. We lose one of the genes in the original data frame because it isn’t in genecards.org and end up with 32 instead of 33 genes for that data frame.
source('geneCards2.R')
## Warning: package 'rvest' was built under R version 3.6.3
## Loading required package: xml2
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
LOC400657 (#13 in list) is a gene that genecards.org doesn’t recognize and it will throw an error, so we should skip it.
for (i in Lyme9$Gene[1:12]){
getSummaries2(i,'protein')
}
for (i in Lyme9$Gene[14:34]){
getSummaries2(i,'protein')
}
getGeneSummaries('protein')
summsLyme9 <- read.csv("proteinGeneSummaries_protein.csv")
The LOC genes don’t seem to have gene summaries, so they had to be removed.
for (i in lymeMx7$gene[1:34]){
getSummaries2(i,'immune')
}
for (i in lymeMx7$gene[37:39]){
getSummaries2(i,'immune')
}
for (i in lymeMx7$gene[41:45]){
getSummaries2(i,'immune')
}
getGeneSummaries('immune')
summsLymeMx7 <- read.csv("proteinGeneSummaries_immune.csv")
Lyme10 <- merge(summsLyme9,Lyme9,by.x='gene', by.y='Gene')
lymeMx8 <- merge(summsLymeMx7,lymeMx7, by.x='gene',by.y='gene')
Lets create those classes for each data frame. But first we have to tidy the data.
Lyme11 <- gather(Lyme10, key='classSample',value='classValue',8:87)
lymeMx9 <- gather(lymeMx8,key='classSample',value='classValue',8:87)
Lyme11$class <- Lyme11$classSample
Lyme11$class <- gsub('^hea.*$','healthy',Lyme11$class, perl=T)
Lyme11$class <- gsub('^acute.*$','acute Lyme Disease',Lyme11$class,perl=T)
Lyme11$class <- gsub('^.*1month.*$','1 month treatment', Lyme11$class, perl=T)
Lyme11$class <- gsub('^.*6month.*$','6 months treatment',Lyme11$class,perl=T)
lymeMx9$class <- lymeMx9$classSample
lymeMx9$class <- gsub('^hea.*$','healthy',lymeMx9$class, perl=T)
lymeMx9$class <- gsub('^acute.*$','acute Lyme Disease',lymeMx9$class,perl=T)
lymeMx9$class <- gsub('^.*1month.*$','1 month treatment', lymeMx9$class, perl=T)
lymeMx9$class <- gsub('^.*6month.*$','6 months treatment',lymeMx9$class,perl=T)
unique(Lyme11$gene)
## [1] ACSBG2 ANKRD54 ATP5G3 CBX3 CCDC15 COX7A2L CSK FEV
## [9] GEN1 GPR50 HEPN1 HS1BP3 KRT83 KRTAP9-8 LYVE1 MAP1B
## [17] MFSD3 MTNR1B NSUN3 OAS2 ODF3B PRCP PRL SNORA60
## [25] SNRPD2 TMEM92 TTLL3 VCAN ZNF239 ZWILCH
## 33 Levels: ACSBG2 ANKRD54 ATP5G3 C1ORF141 C4ORF14 C9ORF129 CBX3 ... ZWILCH
unique(lymeMx9$gene)
## [1] BPI CA1 CAMP CCDC144A CEACAM8 CTSG
## [7] CXCL2 DDX3Y DEFA4 EIF1AY EPB42 ERAP2
## [13] EREG FSIP1 GAL3ST4 HBA1 HBD HEMGN
## [19] HIST1H1C HIST2H2AA3 HLA-DRB5 HSPA1B IL1B IL8
## [25] KDM5D KIR2DS1 KRT1 LCN2 LGALS2 LILRA3
## [31] LTF MYOM2 OLR1 POTEB SIRPB1 SLC25A37
## [37] SNCA TSIX XIST ZNF815
## 41 Levels: BPI CA1 CAMP CCDC144A CEACAM8 CTSG CXCL2 CXORF21 DDX3Y ... ZNF815
It looks like the genes aren’t even the same genes.
unique(Lyme11$Gene) %in% unique(lymeMx9)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE
Apparently, they are not the same genes. Its ok, maybe they still offer some information. The techniques and methods are the same to inverse what was assumed to be the normalization method, but for typical studies. In bioinformatics, with gene expression data, there is usually more to it, like trimming the bottom and top outliers, and taking the quantile normalization, then scaling. We used the standardization method of normalizing values between 0 and 1 as log2 normalized is to f(x)=log2[(x-min(x))/(max(x)-min(x))]=y and the inverse would be: f(y)=21=x So, there is some logic to this, and at some point rounded values could lose information in the numer of scientific placeholders of precision is used in calculating the inverse of the base 2 log, or the exact values for max and min of X need to be used. Reminder, when I demonstrated this earlier, the method worked using this procedure for 10 values that included a 0 where a small value was added to take the log2 of x=0 without an error, but the exact values were still decimals at the final step. To fix this they were turned to fractions, where the denominator was the max(x), and so each value multiplied by the denominator at that point returned the original x values in our list of 10. When addidng that step to the last step we used on this data to denormalize the data, the values were extremely large, approximately 103-104 larger. So we stopped before taking the fractional values. We will continue with these genes in our machine learning to see if either set makes good gene targets for pathogenesis of lyme disease by how accurately the classes of: healthy, acute disease, 1 month convalescing or developing antibodies after being given a regimen of antibiotics, and 6 months convalescing after being given antibiotics. This is temporal or time specific data, and there were some discrepencies in the study when being done, because it spanned 2 years, some patients didn’t know how long they had it but if they had symptoms they were assumed to be suffering from lyme disease, like the facial paralysis or the skin lesion type marks. Also, some patients dropped out and if the study spanned two years, and only the last 6 months recorded the convalescing at 6 months then the first batches of patients in the acute phase weren’t being recorded or they were actually being monitored after six months and up to two years after being given antibiotics. So we can imagine the data might be skewed for these differences or discrepencies.
Lets write these two tables out to csv files to analyze visually in Tableau.
write.csv(Lyme11,'LymeDisease_originalValues_foldchages33_medianAndRemoved.csv',row.names=F)
write.csv(Lyme11[,-c(5,6)],'LymeDisease_originalValues_foldchages33_medianAndRemoved_rm2Summs.csv',row.names=F)
write.csv(lymeMx9,"LymeDisease_denormalizedValues_foldchanges40_medianAndRemoved.csv",row.names=F)
write.csv(lymeMx9[-c(5,6)],"LymeDisease_denormalizedValues_foldchanges40_medianAndRemoved_rm2Summs.csv",row.names=F)
Lets also write the Lyme10 and lymeMx8 data frames that we will use in our ML models.
write.csv(Lyme10,'ML_medDrop_GSE145974_original.csv', row.names = F)
write.csv(lymeMx8,'ML_medDrop_GSE145974_destandardized.csv',row.names=F)
Lets start the machine learning by first making the data frames with the class as the output or target feature and the samples as observations and the genes as predictors from both sets separately.
The 40 de-standardized genes will be created first then the 30 original genes that are completely different. Both are the filtered top or bottom 10 genes out of their respective 19526 unique gene sets of each class by fold change of acute/healthy, 1 month/healthy, or 6 months/healthy by medians of their respective class samples.
The destandardized set. Lets just name our data sets something silly to keep track of them. Dance is the de-standardized set and Stand is the original log2 normalized set.
The Dance Machine Learning set, made from the lymeMx7, not-tidied, de-standardized data frame:
colnames(lymeMx7)
## [1] "gene"
## [2] "healthyControl_2"
## [3] "healthyControl_3"
## [4] "healthyControl_4"
## [5] "healthyControl_5"
## [6] "healthyControl_6"
## [7] "healthyControl_7"
## [8] "healthyControl_8"
## [9] "healthyControl_9"
## [10] "healthyControl_10"
## [11] "healthyControl_13"
## [12] "healthyControl_14"
## [13] "healthyControl_15"
## [14] "healthyControl_16"
## [15] "healthyControl_17"
## [16] "healthyControl_18"
## [17] "healthyControl_19"
## [18] "healthyControl_20"
## [19] "healthyControl_21"
## [20] "acuteLymeDisease_1"
## [21] "acuteLymeDisease_2"
## [22] "acuteLymeDisease_3"
## [23] "acuteLymeDisease_4"
## [24] "acuteLymeDisease_5"
## [25] "acuteLymeDisease_6"
## [26] "acuteLymeDisease_8"
## [27] "acuteLymeDisease_9"
## [28] "acuteLymeDisease_10"
## [29] "acuteLymeDisease_11"
## [30] "acuteLymeDisease_12"
## [31] "acuteLymeDisease_13"
## [32] "acuteLymeDisease_14"
## [33] "acuteLymeDisease_15"
## [34] "acuteLymeDisease_16"
## [35] "acuteLymeDisease_17"
## [36] "acuteLymeDisease_18"
## [37] "acuteLymeDisease_19"
## [38] "acuteLymeDisease_20"
## [39] "acuteLymeDisease_21"
## [40] "acuteLymeDisease_22"
## [41] "acuteLymeDisease_23"
## [42] "acuteLymeDisease_24"
## [43] "acuteLymeDisease_25"
## [44] "acuteLymeDisease_26"
## [45] "acuteLymeDisease_27"
## [46] "acuteLymeDisease_28"
## [47] "Antibodies_1month_1"
## [48] "Antibodies_1month_2"
## [49] "Antibodies_1month_3"
## [50] "Antibodies_1month_4"
## [51] "Antibodies_1month_5"
## [52] "Antibodies_1month_6"
## [53] "Antibodies_1month_7"
## [54] "Antibodies_1month_8"
## [55] "Antibodies_1month_9"
## [56] "Antibodies_1month_10"
## [57] "Antibodies_1month_11"
## [58] "Antibodies_1month_13"
## [59] "Antibodies_1month_14"
## [60] "Antibodies_1month_15"
## [61] "Antibodies_1month_16"
## [62] "Antibodies_1month_17"
## [63] "Antibodies_1month_18"
## [64] "Antibodies_1month_19"
## [65] "Antibodies_1month_20"
## [66] "Antibodies_1month_21"
## [67] "Antibodies_1month_22"
## [68] "Antibodies_1month_23"
## [69] "Antibodies_1month_24"
## [70] "Antibodies_1month_25"
## [71] "Antibodies_1month_26"
## [72] "Antibodies_1month_27"
## [73] "Antibodies_6months_1"
## [74] "Antibodies_6months_2"
## [75] "Antibodies_6months_3"
## [76] "Antibodies_6months_4"
## [77] "Antibodies_6months_5"
## [78] "Antibodies_6months_6"
## [79] "Antibodies_6months_7"
## [80] "Antibodies_6months_8"
## [81] "Antibodies_6months_9"
## [82] "healthy_median"
## [83] "acuteLymeDisease_median"
## [84] "antibodies_1month_median"
## [85] "antibodies_6month_median"
## [86] "acuteHealthy_foldChange"
## [87] "antibodies_1month_healthy_foldChange"
## [88] "antibodies_6month_healthy_foldchange"
Lets remove the fold change and median value features from our lymeMx8 data frame and save it as ‘Dance’ after we transpose it to get the unique genes as predictors and the samples as observations.
dance <- lymeMx8[,-c(2:7,88:94)]
danceSampleNames <- colnames(dance)[2:81]
month1 <- grep('1month',danceSampleNames)
month6 <- grep('6month',danceSampleNames)
healthy <- grep('healthy',danceSampleNames)
acute <- grep('acute',danceSampleNames)
class <- danceSampleNames
class[month1] <- '1 month'
class[month6] <- '6 months'
class[healthy] <- 'healthy'
class[acute] <- 'acute'
danceGeneNames <- dance$gene
Dance <- as.data.frame(t(dance[,-1]))
colnames(Dance) <- danceGeneNames
Dance$class <- class
Dance2 <- Dance[,c(41,1:40)]
head(Dance2)
## class BPI CA1 CAMP CCDC144A CEACAM8
## healthyControl_2 healthy 44.71151 187.58066 61.39548 27.852233 59.99411
## healthyControl_3 healthy 29.02589 17.32854 15.62534 10.299108 29.53441
## healthyControl_4 healthy 18.36342 15.48674 16.94088 15.867351 18.74716
## healthyControl_5 healthy 33.89164 26.00250 27.18118 32.890159 26.42882
## healthyControl_6 healthy 33.41352 18.64946 20.98911 8.884042 18.28476
## healthyControl_7 healthy 32.39563 39.00369 30.04639 33.431603 34.07572
## CTSG CXCL2 DDX3Y DEFA4 EIF1AY EPB42
## healthyControl_2 53.10345 90.08176 15.127511 68.67612 5.882532 188.58533
## healthyControl_3 23.49343 26.80778 6.992033 28.76057 3.349618 18.82050
## healthyControl_4 11.58055 12.26752 88.358654 11.98227 417.075821 16.32208
## healthyControl_5 12.99521 36.01149 159.894013 21.02267 533.947205 31.19096
## healthyControl_6 74.44276 29.94835 5.178288 19.84215 3.160839 20.54773
## healthyControl_7 29.58619 21.40277 200.257935 58.04673 652.397858 44.49253
## ERAP2 EREG FSIP1 GAL3ST4 HBA1 HBD
## healthyControl_2 48.643327 93.65032 26.85640 780.43516 133.136907 274.93081
## healthyControl_3 11.343900 15.74919 27.90551 20.56412 3.504471 13.83797
## healthyControl_4 8.138399 22.54677 16.33424 11.73537 20.246208 13.87592
## healthyControl_5 83.403401 47.02939 31.07355 32.62728 15.197226 18.59400
## healthyControl_6 37.838495 22.14446 16.89240 17.38865 16.619494 12.64241
## healthyControl_7 63.497294 72.50205 39.34989 31.61232 59.326363 39.88204
## HEMGN HIST1H1C HIST2H2AA3 HLA-DRB5 HSPA1B IL1B
## healthyControl_2 240.88830 14.18824 24.780563 26.986300 20.41038 46.72271
## healthyControl_3 11.01357 21.44584 5.685453 228.898114 22.40959 14.95010
## healthyControl_4 13.76587 22.84205 27.891293 252.183721 12.06996 25.60839
## healthyControl_5 28.19839 21.41184 50.434659 21.402235 21.23190 16.24523
## healthyControl_6 19.01051 17.99494 14.038684 5.877128 13.44129 26.42206
## healthyControl_7 27.97675 104.92962 70.533000 25.547715 169.99956 87.67157
## IL8 KDM5D KIR2DS1 KRT1 LCN2 LGALS2
## healthyControl_2 92.69553 21.147566 61.10841 228.70386 33.503142 9.971672
## healthyControl_3 29.11711 9.683688 21.24897 26.71635 10.400323 6.352478
## healthyControl_4 20.83394 86.118100 65.46716 15.79359 12.799352 25.957496
## healthyControl_5 30.94746 153.958130 56.76246 25.13165 20.690155 14.623216
## healthyControl_6 29.23871 8.506017 14.84648 10.12461 6.900668 20.165600
## healthyControl_7 128.20648 74.040103 29.33814 41.14450 51.096213 57.850952
## LILRA3 LTF MYOM2 OLR1 POTEB SIRPB1
## healthyControl_2 11.27490 75.86353 323.065372 82.22582 28.97101 30.57768
## healthyControl_3 13.96374 21.55983 65.086732 19.37997 35.29903 30.11269
## healthyControl_4 31.99443 13.72309 9.536578 23.11463 17.02823 110.35373
## healthyControl_5 30.68084 21.22504 46.234651 51.28529 28.21187 173.67341
## healthyControl_6 22.93716 18.82061 14.813989 19.34019 19.32730 16.39025
## healthyControl_7 22.10753 29.26608 465.398919 58.15919 70.60498 47.27990
## SLC25A37 SNCA TSIX XIST ZNF815
## healthyControl_2 167.517066 232.27893 433.24591 395.790441 85.75114
## healthyControl_3 6.954475 15.06107 134.56488 182.761123 35.97833
## healthyControl_4 12.892965 12.54416 10.88805 3.343866 14.12286
## healthyControl_5 23.670906 24.60520 12.66765 4.832158 52.97482
## healthyControl_6 12.442719 16.16613 53.40405 171.117221 15.28891
## healthyControl_7 48.265747 54.19420 15.94847 7.291165 89.00234
I created a dashboard of the fold change median values across class samples compared to the healthy class, and with the individual samples’ gene expression values. This data set has 6 samples removed that skewed the data when looking at mean gene expression values in LymeDiseaseTicks.Rmd, but when looking at the median gene expression values and those other samples removed there are some other samples that also skew the data by median values. clicking a gene in the gene filter and selecting ctrl+clicking each additional gene will display only those gene or genes interested in. Clicking the genes again will display all genes. Lets look through the genes to see what genes show unique or odd behaviors and group them into lists of those genes that have fold change values the same from acute -> 1 month of treatment -> 6 months of treatment compared to the healthy samples. dashboard of median fold change values and 6 samples removed
dashboard of median fold change values and individual sample values
Figure 10: Dashboard of median fold change values with six samples removed. The individual samples are shown in each class as well as each top or bottom gene in fold change expression of diseased or treated to healthy ratio of medians per class values. There is a gene filter to select each gene interested in. To select more than one gene at a time use ctrl + click on each gene, to remove the selection and return all genes select the selected genes again. Even when removing the six samples that had some different gene behaviors compared to the neighboring samples within each class when using the mean values for fold change values, there are other samples in the median selected sets with odd behaviors as well that could throw off the classification algorithms. This dashboard needs to be looked through to get a list of those samples with odd behaviors and also the genes’ fold change values behavior. Such as, monotastically increasing or decreasing from acute ->1 month -> 6 months, or start high, drops low, then shoots really high in gene expression after 6 months of treatment, and other behaviors. They could allow us to zoom in on target genes for lyme disease pathogenesis in these separate classes.
There are some categories of behaviors. out of three classes of fold change, if you see a low and lowest and high, the low and lowest are closest in gene expression levels lower than the high description. And if you see a high, a highest, and a low then the high and highest are closer in gene expression level higher than the low class.
up acute:
monotastically decreasing from acute -> 1 month -> 6 months:
BPI CA1 CAMP CEACAM8 CTSG DDX3Y DEFA4 EIF1AY EPB42 HBA1 HBD HEMGN HIST1H1C HIST2H2AA3 HSPA1B KDM5D KIR2DS1 LCN2 LILRA3 LTF SLC25A37 ZNF815
high, lowest,low: SNCA CCDC144A
highest,low,high: CXCL2 GAL3ST4 HLA-DRB5 KRT1 LGALS2
monotastically increasing from acute -> 1 month -> 6 months: ERAP2
high, low, highest: IL8 EREG IL1B OLR1 POTEB
low, lowest, high: TSIX XIST MYOM2 FSIP1
low,high,lowest: SIRPB1
There are 7 groups:
Acute_md <- c('BPI','CA1','CAMP','CEACAM8','CTSG','DDX3Y','DEFA4','EIF1AY','EPB42',
'HBA1','HBD','HEMGN','HIST1H1C','HIST2H2AA3','HSPA1B','KDM5D','KIR2DS1','LCN2',
'LILRA3','LTF','SLC25A37','ZNF815')
HlL <- c('SNCA','CCDC144A')
Hlh <- c('CXCL2', 'GAL3ST4', 'HLA-DRB5', 'KRT1', 'LGALS2')
Acute_mi <- 'ERAP2'
hlH <- c('IL8','EREG','IL1B','OLR1','POTEB')
lLh <- c('TSIX','XIST','MYOM2','FSIP1')
lhL <- 'SIRPB1'
Now that we have our lists, lets see about those data frames for the seven different groups of gene anomolies or similarities. The following are our ML ready dataframes for our seven groups in our de-standardized Lyme disease data.
Acute_md_DF <- Dance2[,colnames(Dance2) %in% Acute_md]
Acute_md_DF$class <- Dance2$class
HlL_DF <- Dance2[,colnames(Dance2) %in% HlL]
HlL_DF$class <- Dance2$class
Acute_mi_DF <- data.frame(ERAP2=Dance2[,colnames(Dance2) %in% Acute_mi], row.names=row.names(Dance2))
Acute_mi_DF$class <- Dance2$class
lhL_DF <- data.frame(SIRPB1=Dance2[,colnames(Dance2) %in% lhL],row.names=row.names(Dance2))
lhL_DF$class <- Dance2$class
Hlh_DF <- Dance2[,colnames(Dance2) %in% Hlh]
Hlh_DF$class <- Dance2$class
hlH_DF <- Dance2[,colnames(Dance2) %in% hlH]
hlH_DF$class <- Dance2$class
lLh_DF <- Dance2[,colnames(Dance2) %in% lLh]
lLh_DF$class <- Dance2$class
Great, now we need to run through each of these 7 data frames and split into separate training and testing sets, and test a machine learning algorithm on. I tend to always use random forest to start with, or caret’s rpart.
Lets make sure we keep the same samples in our testing set and training set for each group to test machine learning algorithm(s) on. Lets keep the standard 70% training set and 30% testing set using a random sampling of our classes.
set.seed(34567)
train <- sample(1:80,.7*80)
training <- class[train]
testing <- class[-train]
t <- data.frame(train = training)
ts <- data.frame(test= testing)
t %>% group_by(train) %>% count(train)
## # A tibble: 4 x 2
## # Groups: train [4]
## train n
## <fct> <int>
## 1 1 month 21
## 2 6 months 6
## 3 acute 16
## 4 healthy 13
ts %>% group_by(test) %>% count(test)
## # A tibble: 4 x 2
## # Groups: test [4]
## test n
## <fct> <int>
## 1 1 month 5
## 2 6 months 3
## 3 acute 11
## 4 healthy 5
We can see we have a fair share of samples in our training set and at least one of each class in our testing set to make predictions based on the model we train. Lets keep these same samples in each of our 8 groups to classify with. Lets make our 8 training and testing sets with our indices labeled ‘train’ and note the numeric labeling of each correspongs to their data frame:
Training/Testing split 1: Acute_md_DF Training/Testing split 2: HlL_DF Training/Testing split 3: Acute_mi_DF Training/Testing split 4: lhL_DF Training/Testing split 5: Hlh_DF Training/Testing split 6: hlH_DF Training/Testing split 7: lLh_DF Training/Testing split 8: Dance2
training1 <- Acute_md_DF[train,]
testing1 <- Acute_md_DF[-train,]
training2 <- HlL_DF[train,]
testing2 <- HlL_DF[-train,]
training3 <- Acute_mi_DF[train,]
testing3 <- Acute_mi_DF[-train,]
training4 <- lhL_DF[train,]
testing4 <- lhL_DF[-train,]
training5 <- Hlh_DF[train,]
testing5 <- Hlh_DF[-train,]
training6 <- hlH_DF[train,]
testing6 <- hlH_DF[-train,]
training7 <- lLh_DF[train,]
testing7 <- lLh_DF[-train,]
training8 <- Dance2[train,]
testing8 <- Dance2[-train,]
Lets make a function specific to our data frames to return the precision, recall, and accuracy of these four classes. I actually made this in a previous script,monotonicGenes.Rmd, when testing the COVID-19 samples with GSE152418 that also had four classes to classify. But those classes were healthy, moderate, severe, or ICU grades of severity of Covid19. Actually, I found out later, that the convalescent class was its own class even though it was only one sample. So there should have been five classes. But no need to alter that function now. There is also some other packages or in the caret package, that I never use that can return the precision and recall, but i don’t think as a confusion matrix. I thought the convalescent class was mislabeled, so had it relabeled as healthy, since the models pedicted it as such. I didn’t find out until this study, when the summary of this study, GSE145974, used ‘convalesced’ blood after 1 and 6 months of antibiotics, that the sample in GSE152418 was likely its own class. I assumed it was identifying the source of its patient sample,because another previous study on Rheumatoid Arthritis (RA), GSE151161, did use convalescent patients, and it preceded the analysis on GSE152418. Typically in research, you need a client consent and informed consent from people who aren’t incarcerated or in the care of another person or facility,because it violates the human research subjects guidelines for ethical research and not victimizing vulnerable populations or culpabe and incoherant populations. This stems from research that was criminal in the Tuskegee hospital on injecting black populations with syphilis or polio vaccines on inmates in other studies for some small reward or break from their punishment or lowered/free cost clinic for medical treatment. Any researcher knows this, especially if they are funded by government agencies. Also, due to the Nazi research done on Jewish victims during World War 2, the Nuremberg Code, was created, as well as later the Belmont report. “The Nuremberg Code states that”the voluntary consent of the human subject is absolutely essential" and it further explains the details implied by this requirement: capacity to consent, freedom from coercion, no penalty for withdrawal, and comprehension of the risks and benefits involved."-The Nuremberg Code, taken from a resource for getting certified in understanding compliance with human research experiments as part of my graduate research project this had to be completed. The agency who provided this, similar to HIPPA compliance for healthcare providers, is CITI.
precisionRecallAccuracy <- function(df){
colnames(df) <- c('pred','type')
df$pred <- as.character(paste(df$pred))
df$type <- as.character(paste(df$type))
classes <- unique(df$type)
class1a <- as.character(paste(classes[1]))
class2a <- as.character(paste(classes[2]))
class3a <- as.character(paste(classes[3]))
class4a <- as.character(paste(classes[4]))
#correct classes
class1 <- subset(df, df$type==class1a)
class2 <- subset(df, df$type==class2a)
class3 <- subset(df, df$type==class3a)
class4 <- subset(df, df$type==class4a)
#incorrect classes
notClass1 <- subset(df,df$type != class1a)
notClass2 <- subset(df,df$type != class2a)
notClass3 <- subset(df,df$type != class3a)
notClass4 <- subset(df, df$type != class4a)
#true positives (real positives predicted positive)
tp_1 <- sum(class1$pred==class1$type)
tp_2 <- sum(class2$pred==class2$type)
tp_3 <- sum(class3$pred==class3$type)
tp_4 <- sum(class4$pred==class4$type)
#false positives (real negatives predicted positive)
fp_1 <- sum(notClass1$pred==class1a)
fp_2 <- sum(notClass2$pred==class2a)
fp_3 <- sum(notClass3$pred==class3a)
fp_4 <- sum(notClass4$pred==class4a)
#false negatives (real positive predicted negative)
fn_1 <- sum(class1$pred!=class1$type)
fn_2 <- sum(class2$pred!=class2$type)
fn_3 <- sum(class3$pred!=class3$type)
fn_4 <- sum(class4$pred!=class4$type)
#true negatives (real negatives predicted negative)
tn_1 <- sum(notClass1$pred!=class1a)
tn_2 <- sum(notClass2$pred!=class2a)
tn_3 <- sum(notClass3$pred!=class3a)
tn_4 <- sum(notClass4$pred!=class4a)
#precision
p1 <- tp_1/(tp_1+fp_1)
p2 <- tp_2/(tp_2+fp_2)
p3 <- tp_3/(tp_3+fp_3)
p4 <- tp_4/(tp_4+fp_4)
p1 <- ifelse(p1=='NaN',0,p1)
p2 <- ifelse(p2=='NaN',0,p2)
p3 <- ifelse(p3=='NaN',0,p3)
p4 <- ifelse(p4=='NaN',0,p4)
#recall
r1 <- tp_1/(tp_1+fn_1)
r2 <- tp_2/(tp_2+fn_2)
r3 <- tp_3/(tp_3+fn_3)
r4 <- tp_4/(tp_4+fn_4)
r1 <- ifelse(r1=='NaN',0,r1)
r2 <- ifelse(r2=='NaN',0,r2)
r3 <- ifelse(r3=='NaN',0,r3)
r4 <- ifelse(r4=='NaN',0,r4)
#accuracy
ac1 <- (tp_1+tn_1)/(tp_1+tn_1+fp_1+fn_1)
ac2 <- (tp_2+tn_2)/(tp_2+tn_2+fp_2+fn_2)
ac3 <- (tp_3+tn_3)/(tp_3+tn_3+fp_3+fn_3)
ac4 <- (tp_4+tn_4)/(tp_4+tn_4+fp_4+fn_4)
table <- as.data.frame(rbind(c(class1a,p1,r1,ac1),
c(class2a,p2,r2,ac2),
c(class3a,p3,r3,ac3),
c(class4a,p4,r4,ac4)))
colnames(table) <- c('class','precision','recall','accuracy')
acc <- (sum(df$pred==df$type)/length(df$type))*100
cat('accuracy is: ',as.character(paste(acc)),'%')
return(table)
}
Lets start with the first group of genes using
library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(gbm)
## Loaded gbm 2.1.5
library(RANN) #used in the tuning parameter of rf method of caret for 'oob' one out bag
## Warning: package 'RANN' was built under R version 3.6.3
Training/Testing 1:
set.seed(589647)
rfMod1 <- train(class~., method='rf',
na.action=na.pass,
data=(training1), preProc = c("center", "scale","medianImpute"),
trControl=trainControl(method='oob'), number=5)
predRF1 <- predict(rfMod1, testing1)
predDF1 <- data.frame(predRF1, type=testing1$class)
predDF1
## predRF1 type
## 1 healthy healthy
## 2 1 month healthy
## 3 1 month healthy
## 4 healthy healthy
## 5 healthy healthy
## 6 1 month acute
## 7 acute acute
## 8 acute acute
## 9 healthy acute
## 10 acute acute
## 11 1 month acute
## 12 1 month acute
## 13 1 month acute
## 14 1 month acute
## 15 acute acute
## 16 healthy acute
## 17 acute 1 month
## 18 1 month 1 month
## 19 healthy 1 month
## 20 6 months 1 month
## 21 6 months 1 month
## 22 1 month 6 months
## 23 1 month 6 months
## 24 1 month 6 months
pra1 <- precisionRecallAccuracy(predDF1)
## accuracy is: 33.3333333333333 %
pra1
## class precision recall accuracy
## 1 healthy 0.5 0.6 0.791666666666667
## 2 acute 0.8 0.363636363636364 0.666666666666667
## 3 1 month 0.0909090909090909 0.2 0.416666666666667
## 4 6 months 0 0 0.791666666666667
That set wasn’t so great. Lets run through the other 7 sets using the same format and compare the results at the end.
Training/Testing 2:
rfMod2 <- train(class~., method='rf',
na.action=na.pass,
data=(training2), preProc = c("center", "scale","medianImpute"),
trControl=trainControl(method='oob'), number=5)
## note: only 1 unique complexity parameters in default grid. Truncating the grid to 1 .
predRF2 <- predict(rfMod2, testing2)
predDF2 <- data.frame(predRF2, type=testing2$class)
predDF2
## predRF2 type
## 1 acute healthy
## 2 6 months healthy
## 3 acute healthy
## 4 1 month healthy
## 5 healthy healthy
## 6 acute acute
## 7 1 month acute
## 8 1 month acute
## 9 healthy acute
## 10 1 month acute
## 11 1 month acute
## 12 1 month acute
## 13 1 month acute
## 14 acute acute
## 15 acute acute
## 16 acute acute
## 17 healthy 1 month
## 18 healthy 1 month
## 19 1 month 1 month
## 20 1 month 1 month
## 21 1 month 1 month
## 22 1 month 6 months
## 23 acute 6 months
## 24 acute 6 months
pra2 <- precisionRecallAccuracy(predDF2)
## accuracy is: 33.3333333333333 %
pra2
## class precision recall accuracy
## 1 healthy 0.25 0.2 0.708333333333333
## 2 acute 0.5 0.363636363636364 0.541666666666667
## 3 1 month 0.272727272727273 0.6 0.583333333333333
## 4 6 months 0 0 0.833333333333333
Training/Testing 3:
rfMod3 <- train(class~., method='rf',
na.action=na.pass,
data=(training3), preProc = c("center", "scale","medianImpute"),
trControl=trainControl(method='oob'), number=5)
predRF3 <- predict(rfMod3, testing3)
predDF3 <- data.frame(predRF3, type=testing3$class)
predDF3
## predRF3 type
## 1 acute healthy
## 2 acute healthy
## 3 acute healthy
## 4 acute healthy
## 5 acute healthy
## 6 1 month acute
## 7 1 month acute
## 8 acute acute
## 9 healthy acute
## 10 acute acute
## 11 1 month acute
## 12 1 month acute
## 13 acute acute
## 14 1 month acute
## 15 healthy acute
## 16 1 month acute
## 17 6 months 1 month
## 18 1 month 1 month
## 19 acute 1 month
## 20 healthy 1 month
## 21 1 month 1 month
## 22 acute 6 months
## 23 1 month 6 months
## 24 acute 6 months
pra3 <- precisionRecallAccuracy(predDF3)
## accuracy is: 20.8333333333333 %
pra3
## class precision recall accuracy
## 1 healthy 0 0 0.666666666666667
## 2 acute 0.272727272727273 0.272727272727273 0.333333333333333
## 3 1 month 0.222222222222222 0.4 0.583333333333333
## 4 6 months 0 0 0.833333333333333
Training/Testing 4:
rfMod4 <- train(class~., method='rf',
na.action=na.pass,
data=(training4), preProc = c("center", "scale","medianImpute"),
trControl=trainControl(method='oob'), number=5)
predRF4 <- predict(rfMod4, testing4)
predDF4 <- data.frame(predRF4, type=testing4$class)
predDF4
## predRF4 type
## 1 1 month healthy
## 2 1 month healthy
## 3 acute healthy
## 4 acute healthy
## 5 healthy healthy
## 6 acute acute
## 7 1 month acute
## 8 healthy acute
## 9 acute acute
## 10 1 month acute
## 11 healthy acute
## 12 1 month acute
## 13 acute acute
## 14 1 month acute
## 15 healthy acute
## 16 6 months acute
## 17 1 month 1 month
## 18 1 month 1 month
## 19 1 month 1 month
## 20 acute 1 month
## 21 acute 1 month
## 22 acute 6 months
## 23 1 month 6 months
## 24 healthy 6 months
pra4 <- precisionRecallAccuracy(predDF4)
## accuracy is: 29.1666666666667 %
pra4
## class precision recall accuracy
## 1 healthy 0.2 0.2 0.666666666666667
## 2 acute 0.375 0.272727272727273 0.458333333333333
## 3 1 month 0.3 0.6 0.625
## 4 6 months 0 0 0.833333333333333
Training/Testing 5:
rfMod5 <- train(class~., method='rf',
na.action=na.pass,
data=(training5), preProc = c("center", "scale","medianImpute"),
trControl=trainControl(method='oob'), number=5)
predRF5 <- predict(rfMod5, testing5)
predDF5 <- data.frame(predRF5, type=testing5$class)
predDF5
## predRF5 type
## 1 1 month healthy
## 2 healthy healthy
## 3 acute healthy
## 4 acute healthy
## 5 6 months healthy
## 6 1 month acute
## 7 acute acute
## 8 acute acute
## 9 1 month acute
## 10 1 month acute
## 11 1 month acute
## 12 1 month acute
## 13 1 month acute
## 14 1 month acute
## 15 acute acute
## 16 6 months acute
## 17 1 month 1 month
## 18 1 month 1 month
## 19 healthy 1 month
## 20 healthy 1 month
## 21 healthy 1 month
## 22 acute 6 months
## 23 acute 6 months
## 24 acute 6 months
pra5 <- precisionRecallAccuracy(predDF5)
## accuracy is: 25 %
pra5
## class precision recall accuracy
## 1 healthy 0.25 0.2 0.708333333333333
## 2 acute 0.375 0.272727272727273 0.458333333333333
## 3 1 month 0.2 0.4 0.541666666666667
## 4 6 months 0 0 0.791666666666667
Training/Testing 6:
rfMod6 <- train(class~., method='rf',
na.action=na.pass,
data=(training6), preProc = c("center", "scale","medianImpute"),
trControl=trainControl(method='oob'), number=5)
predRF6 <- predict(rfMod6, testing6)
predDF6 <- data.frame(predRF6, type=testing6$class)
predDF6
## predRF6 type
## 1 acute healthy
## 2 1 month healthy
## 3 healthy healthy
## 4 acute healthy
## 5 acute healthy
## 6 1 month acute
## 7 acute acute
## 8 1 month acute
## 9 6 months acute
## 10 1 month acute
## 11 1 month acute
## 12 1 month acute
## 13 1 month acute
## 14 1 month acute
## 15 1 month acute
## 16 acute acute
## 17 acute 1 month
## 18 1 month 1 month
## 19 healthy 1 month
## 20 1 month 1 month
## 21 healthy 1 month
## 22 healthy 6 months
## 23 healthy 6 months
## 24 healthy 6 months
pra6 <- precisionRecallAccuracy(predDF6)
## accuracy is: 20.8333333333333 %
pra6
## class precision recall accuracy
## 1 healthy 0.166666666666667 0.2 0.625
## 2 acute 0.333333333333333 0.181818181818182 0.458333333333333
## 3 1 month 0.181818181818182 0.4 0.5
## 4 6 months 0 0 0.833333333333333
Training/Testing 7:
rfMod7 <- train(class~., method='rf',
na.action=na.pass,
data=(training7), preProc = c("center", "scale","medianImpute"),
trControl=trainControl(method='oob'), number=5)
predRF7 <- predict(rfMod7, testing7)
predDF7 <- data.frame(predRF7, type=testing7$class)
predDF7
## predRF7 type
## 1 1 month healthy
## 2 1 month healthy
## 3 healthy healthy
## 4 acute healthy
## 5 1 month healthy
## 6 1 month acute
## 7 acute acute
## 8 6 months acute
## 9 6 months acute
## 10 1 month acute
## 11 1 month acute
## 12 1 month acute
## 13 1 month acute
## 14 acute acute
## 15 healthy acute
## 16 healthy acute
## 17 1 month 1 month
## 18 acute 1 month
## 19 acute 1 month
## 20 healthy 1 month
## 21 1 month 1 month
## 22 1 month 6 months
## 23 1 month 6 months
## 24 acute 6 months
pra7 <- precisionRecallAccuracy(predDF1)
## accuracy is: 33.3333333333333 %
pra7
## class precision recall accuracy
## 1 healthy 0.5 0.6 0.791666666666667
## 2 acute 0.8 0.363636363636364 0.666666666666667
## 3 1 month 0.0909090909090909 0.2 0.416666666666667
## 4 6 months 0 0 0.791666666666667
Training/Testing 8:
rfMod8 <- train(class~., method='rf',
na.action=na.pass,
data=(training8), preProc = c("center", "scale","medianImpute"),
trControl=trainControl(method='oob'), number=5)
predRF8 <- predict(rfMod8, testing8)
predDF8 <- data.frame(predRF8, type=testing8$class)
predDF8
## predRF8 type
## 1 1 month healthy
## 2 healthy healthy
## 3 acute healthy
## 4 1 month healthy
## 5 healthy healthy
## 6 1 month acute
## 7 acute acute
## 8 acute acute
## 9 healthy acute
## 10 1 month acute
## 11 1 month acute
## 12 1 month acute
## 13 1 month acute
## 14 1 month acute
## 15 acute acute
## 16 healthy acute
## 17 acute 1 month
## 18 1 month 1 month
## 19 healthy 1 month
## 20 1 month 1 month
## 21 1 month 1 month
## 22 healthy 6 months
## 23 acute 6 months
## 24 1 month 6 months
pra8 <- precisionRecallAccuracy(predDF1)
## accuracy is: 33.3333333333333 %
pra8
## class precision recall accuracy
## 1 healthy 0.5 0.6 0.791666666666667
## 2 acute 0.8 0.363636363636364 0.666666666666667
## 3 1 month 0.0909090909090909 0.2 0.416666666666667
## 4 6 months 0 0 0.791666666666667
These are the groups by gene behaviors in fold change of diseased or treated median values compared to healthy median values:
Training/Testing split 1: Acute_md_DF Training/Testing split 2: HlL_DF Training/Testing split 3: Acute_mi_DF Training/Testing split 4: lhL_DF Training/Testing split 5: Hlh_DF Training/Testing split 6: hlH_DF Training/Testing split 7: lLh_DF Training/Testing split 8: Dance2
pra_all <- rbind(pra1,pra2,pra3,pra4,pra5,pra6,pra7,pra8)
pra_all$GroupMembership <- c(rep(1,4),
rep(2,4),
rep(3,4),
rep(4,4),
rep(5,4),
rep(6,4),
rep(7,4),
rep(8,4))
pra_all2 <- pra_all %>% group_by(class) %>% mutate(max=
ifelse(accuracy==max(as.numeric(paste(accuracy))),'max','not max'))
max <- subset(pra_all2, pra_all2$max=='max')
max
## # A tibble: 11 x 6
## # Groups: class [4]
## class precision recall accuracy GroupMembership max
## <fct> <fct> <fct> <fct> <dbl> <chr>
## 1 healthy 0.5 0.6 0.791666666666667 1 max
## 2 acute 0.8 0.363636363636364 0.666666666666667 1 max
## 3 6 months 0 0 0.833333333333333 2 max
## 4 6 months 0 0 0.833333333333333 3 max
## 5 1 month 0.3 0.6 0.625 4 max
## 6 6 months 0 0 0.833333333333333 4 max
## 7 6 months 0 0 0.833333333333333 6 max
## 8 healthy 0.5 0.6 0.791666666666667 7 max
## 9 acute 0.8 0.363636363636364 0.666666666666667 7 max
## 10 healthy 0.5 0.6 0.791666666666667 8 max
## 11 acute 0.8 0.363636363636364 0.666666666666667 8 max
All groups, except group 5 (the highest, low, high genes for acute, 1 month, and 6 months as classes), had a score in the best accuracy of a class. None of the groups predicted a 6 month class, but they all scored 83% accuracy and 0% accuracy for precision and recall because of not selecting it as a class for groups 2,3,4, and 6. Groups 1 and 7 scored the same in accuracy, precision, and recall in predicting the healthy and acute classes. Group 8 scored the same as group 1 and 7 in the healthy class , but that was the only class group 8 predicted with the best in accuracy.Only Group 4 predicted the 1 month class the best with 30% precision and 60% recall for an accuracy of 63%. The overall accuracy was worse with this set of median samples as the best was 34% accuracy (Groups 1,2,7, and 8) and worst was 21% accuracy (Group 5).
We can’t remove any samples, not because we already removed 6 from the 86 total before getting the class median sample values for the fold change values, but because looking at the 40 genes and the samples within each class, there are many samples with different values differing from their neighbors dramatically or not. We can see if scaling the training set in the model training will help. It will center and scale the data for us with that step I commented out in the last model prediction runs.
Lets use the randomForest package and its randomForest() to tune our model and test our same 8 groups.
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training1) <- gsub('-','',colnames(training1))
colnames(testing1) <- gsub('-','',colnames(testing1))
testing1$class <- as.factor(paste(testing1$class))
training1$class <- as.factor(paste(training1$class))
RF1 <- randomForest(class ~ ., data=training1,
importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict1 <- predict(RF1,testing1)
predict1df <- data.frame(predict1, type=testing1$class)
predict1df
## predict1 type
## healthyControl_4 healthy healthy
## healthyControl_14 1 month healthy
## healthyControl_15 1 month healthy
## healthyControl_16 healthy healthy
## healthyControl_21 healthy healthy
## acuteLymeDisease_1 1 month acute
## acuteLymeDisease_2 acute acute
## acuteLymeDisease_3 acute acute
## acuteLymeDisease_5 healthy acute
## acuteLymeDisease_8 acute acute
## acuteLymeDisease_9 1 month acute
## acuteLymeDisease_11 1 month acute
## acuteLymeDisease_13 healthy acute
## acuteLymeDisease_17 1 month acute
## acuteLymeDisease_24 acute acute
## acuteLymeDisease_26 healthy acute
## Antibodies_1month_3 acute 1 month
## Antibodies_1month_8 1 month 1 month
## Antibodies_1month_10 healthy 1 month
## Antibodies_1month_23 healthy 1 month
## Antibodies_1month_27 6 months 1 month
## Antibodies_6months_2 1 month 6 months
## Antibodies_6months_4 1 month 6 months
## Antibodies_6months_5 1 month 6 months
PRA1 <- precisionRecallAccuracy(predict1df)
## accuracy is: 33.3333333333333 %
PRA1
## class precision recall accuracy
## 1 healthy 0.375 0.6 0.708333333333333
## 2 acute 0.8 0.363636363636364 0.666666666666667
## 3 1 month 0.1 0.2 0.458333333333333
## 4 6 months 0 0 0.833333333333333
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training2) <- gsub('-','',colnames(training2))
colnames(testing2) <- gsub('-','',colnames(testing2))
testing2$class <- as.factor(paste(testing2$class))
training2$class <- as.factor(paste(training2$class))
RF2 <- randomForest(class ~ ., data=training2,
importance=TRUE, nodesize=2, ntree=400,mtry=3)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
predict2 <- predict(RF2,testing2)
predict2df <- data.frame(predict2, type=testing2$class)
predict2df
## predict2 type
## healthyControl_4 healthy healthy
## healthyControl_14 healthy healthy
## healthyControl_15 acute healthy
## healthyControl_16 1 month healthy
## healthyControl_21 healthy healthy
## acuteLymeDisease_1 acute acute
## acuteLymeDisease_2 1 month acute
## acuteLymeDisease_3 1 month acute
## acuteLymeDisease_5 healthy acute
## acuteLymeDisease_8 1 month acute
## acuteLymeDisease_9 healthy acute
## acuteLymeDisease_11 1 month acute
## acuteLymeDisease_13 1 month acute
## acuteLymeDisease_17 acute acute
## acuteLymeDisease_24 acute acute
## acuteLymeDisease_26 acute acute
## Antibodies_1month_3 healthy 1 month
## Antibodies_1month_8 healthy 1 month
## Antibodies_1month_10 1 month 1 month
## Antibodies_1month_23 1 month 1 month
## Antibodies_1month_27 1 month 1 month
## Antibodies_6months_2 1 month 6 months
## Antibodies_6months_4 acute 6 months
## Antibodies_6months_5 acute 6 months
PRA2 <- precisionRecallAccuracy(predict2df)
## accuracy is: 41.6666666666667 %
PRA2
## class precision recall accuracy
## 1 healthy 0.428571428571429 0.6 0.75
## 2 acute 0.571428571428571 0.363636363636364 0.583333333333333
## 3 1 month 0.3 0.6 0.625
## 4 6 months 0 0 0.875
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training3) <- gsub('-','',colnames(training3))
colnames(testing3) <- gsub('-','',colnames(testing3))
testing3$class <- as.factor(paste(testing3$class))
training3$class <- as.factor(paste(training3$class))
RF3 <- randomForest(class ~ ., data=training3,
importance=TRUE, nodesize=2, ntree=400,mtry=3)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
predict3 <- predict(RF3,testing3)
predict3df <- data.frame(predict3, type=testing3$class)
predict3df
## predict3 type
## healthyControl_4 1 month healthy
## healthyControl_14 acute healthy
## healthyControl_15 acute healthy
## healthyControl_16 acute healthy
## healthyControl_21 acute healthy
## acuteLymeDisease_1 1 month acute
## acuteLymeDisease_2 1 month acute
## acuteLymeDisease_3 acute acute
## acuteLymeDisease_5 healthy acute
## acuteLymeDisease_8 acute acute
## acuteLymeDisease_9 1 month acute
## acuteLymeDisease_11 1 month acute
## acuteLymeDisease_13 acute acute
## acuteLymeDisease_17 1 month acute
## acuteLymeDisease_24 healthy acute
## acuteLymeDisease_26 1 month acute
## Antibodies_1month_3 6 months 1 month
## Antibodies_1month_8 1 month 1 month
## Antibodies_1month_10 acute 1 month
## Antibodies_1month_23 healthy 1 month
## Antibodies_1month_27 1 month 1 month
## Antibodies_6months_2 acute 6 months
## Antibodies_6months_4 1 month 6 months
## Antibodies_6months_5 acute 6 months
PRA3 <- precisionRecallAccuracy(predict3df)
## accuracy is: 20.8333333333333 %
PRA3
## class precision recall accuracy
## 1 healthy 0 0 0.666666666666667
## 2 acute 0.3 0.272727272727273 0.375
## 3 1 month 0.2 0.4 0.541666666666667
## 4 6 months 0 0 0.833333333333333
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training4) <- gsub('-','',colnames(training4))
colnames(testing4) <- gsub('-','',colnames(testing4))
testing4$class <- as.factor(paste(testing4$class))
training4$class <- as.factor(paste(training4$class))
RF4 <- randomForest(class ~ ., data=training4,
importance=TRUE, nodesize=2, ntree=400,mtry=3)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
predict4 <- predict(RF4,testing4)
predict4df <- data.frame(predict4, type=testing4$class)
predict4df
## predict4 type
## healthyControl_4 1 month healthy
## healthyControl_14 1 month healthy
## healthyControl_15 acute healthy
## healthyControl_16 acute healthy
## healthyControl_21 healthy healthy
## acuteLymeDisease_1 acute acute
## acuteLymeDisease_2 1 month acute
## acuteLymeDisease_3 1 month acute
## acuteLymeDisease_5 acute acute
## acuteLymeDisease_8 1 month acute
## acuteLymeDisease_9 healthy acute
## acuteLymeDisease_11 1 month acute
## acuteLymeDisease_13 acute acute
## acuteLymeDisease_17 1 month acute
## acuteLymeDisease_24 1 month acute
## acuteLymeDisease_26 6 months acute
## Antibodies_1month_3 1 month 1 month
## Antibodies_1month_8 1 month 1 month
## Antibodies_1month_10 1 month 1 month
## Antibodies_1month_23 acute 1 month
## Antibodies_1month_27 acute 1 month
## Antibodies_6months_2 acute 6 months
## Antibodies_6months_4 1 month 6 months
## Antibodies_6months_5 healthy 6 months
PRA4 <- precisionRecallAccuracy(predict4df)
## accuracy is: 29.1666666666667 %
PRA4
## class precision recall accuracy
## 1 healthy 0.333333333333333 0.2 0.75
## 2 acute 0.375 0.272727272727273 0.458333333333333
## 3 1 month 0.25 0.6 0.541666666666667
## 4 6 months 0 0 0.833333333333333
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training5) <- gsub('-','',colnames(training5))
colnames(testing5) <- gsub('-','',colnames(testing5))
testing5$class <- as.factor(paste(testing5$class))
training5$class <- as.factor(paste(training5$class))
RF5 <- randomForest(class ~ ., data=training5,
importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict5 <- predict(RF5,testing5)
predict5df <- data.frame(predict5, type=testing5$class)
predict5df
## predict5 type
## healthyControl_4 1 month healthy
## healthyControl_14 healthy healthy
## healthyControl_15 acute healthy
## healthyControl_16 acute healthy
## healthyControl_21 6 months healthy
## acuteLymeDisease_1 1 month acute
## acuteLymeDisease_2 acute acute
## acuteLymeDisease_3 acute acute
## acuteLymeDisease_5 1 month acute
## acuteLymeDisease_8 1 month acute
## acuteLymeDisease_9 1 month acute
## acuteLymeDisease_11 1 month acute
## acuteLymeDisease_13 1 month acute
## acuteLymeDisease_17 1 month acute
## acuteLymeDisease_24 acute acute
## acuteLymeDisease_26 6 months acute
## Antibodies_1month_3 1 month 1 month
## Antibodies_1month_8 1 month 1 month
## Antibodies_1month_10 healthy 1 month
## Antibodies_1month_23 healthy 1 month
## Antibodies_1month_27 1 month 1 month
## Antibodies_6months_2 acute 6 months
## Antibodies_6months_4 acute 6 months
## Antibodies_6months_5 acute 6 months
PRA5 <- precisionRecallAccuracy(predict5df)
## accuracy is: 29.1666666666667 %
PRA5
## class precision recall accuracy
## 1 healthy 0.333333333333333 0.2 0.75
## 2 acute 0.375 0.272727272727273 0.458333333333333
## 3 1 month 0.272727272727273 0.6 0.583333333333333
## 4 6 months 0 0 0.791666666666667
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training6) <- gsub('-','',colnames(training6))
colnames(testing6) <- gsub('-','',colnames(testing6))
testing6$class <- as.factor(paste(testing6$class))
training6$class <- as.factor(paste(training6$class))
RF6 <- randomForest(class ~ ., data=training6,
importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict6 <- predict(RF6,testing6)
predict6df <- data.frame(predict6, type=testing6$class)
predict6df
## predict6 type
## healthyControl_4 acute healthy
## healthyControl_14 1 month healthy
## healthyControl_15 healthy healthy
## healthyControl_16 acute healthy
## healthyControl_21 acute healthy
## acuteLymeDisease_1 1 month acute
## acuteLymeDisease_2 acute acute
## acuteLymeDisease_3 1 month acute
## acuteLymeDisease_5 6 months acute
## acuteLymeDisease_8 acute acute
## acuteLymeDisease_9 1 month acute
## acuteLymeDisease_11 1 month acute
## acuteLymeDisease_13 1 month acute
## acuteLymeDisease_17 1 month acute
## acuteLymeDisease_24 1 month acute
## acuteLymeDisease_26 acute acute
## Antibodies_1month_3 acute 1 month
## Antibodies_1month_8 1 month 1 month
## Antibodies_1month_10 healthy 1 month
## Antibodies_1month_23 1 month 1 month
## Antibodies_1month_27 healthy 1 month
## Antibodies_6months_2 healthy 6 months
## Antibodies_6months_4 acute 6 months
## Antibodies_6months_5 healthy 6 months
PRA6 <- precisionRecallAccuracy(predict6df)
## accuracy is: 25 %
PRA6
## class precision recall accuracy
## 1 healthy 0.2 0.2 0.666666666666667
## 2 acute 0.375 0.272727272727273 0.458333333333333
## 3 1 month 0.2 0.4 0.541666666666667
## 4 6 months 0 0 0.833333333333333
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training7) <- gsub('-','',colnames(training7))
colnames(testing7) <- gsub('-','',colnames(testing7))
testing7$class <- as.factor(paste(testing7$class))
training7$class <- as.factor(paste(training7$class))
RF7 <- randomForest(class ~ ., data=training7,
importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict7 <- predict(RF7,testing7)
predict7df <- data.frame(predict7, type=testing7$class)
predict7df
## predict7 type
## healthyControl_4 1 month healthy
## healthyControl_14 1 month healthy
## healthyControl_15 healthy healthy
## healthyControl_16 healthy healthy
## healthyControl_21 1 month healthy
## acuteLymeDisease_1 1 month acute
## acuteLymeDisease_2 acute acute
## acuteLymeDisease_3 acute acute
## acuteLymeDisease_5 6 months acute
## acuteLymeDisease_8 1 month acute
## acuteLymeDisease_9 1 month acute
## acuteLymeDisease_11 1 month acute
## acuteLymeDisease_13 1 month acute
## acuteLymeDisease_17 acute acute
## acuteLymeDisease_24 healthy acute
## acuteLymeDisease_26 healthy acute
## Antibodies_1month_3 1 month 1 month
## Antibodies_1month_8 acute 1 month
## Antibodies_1month_10 acute 1 month
## Antibodies_1month_23 healthy 1 month
## Antibodies_1month_27 6 months 1 month
## Antibodies_6months_2 1 month 6 months
## Antibodies_6months_4 1 month 6 months
## Antibodies_6months_5 acute 6 months
PRA7 <- precisionRecallAccuracy(predict7df)
## accuracy is: 25 %
PRA7
## class precision recall accuracy
## 1 healthy 0.4 0.4 0.75
## 2 acute 0.5 0.272727272727273 0.541666666666667
## 3 1 month 0.0909090909090909 0.2 0.416666666666667
## 4 6 months 0 0 0.791666666666667
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training8) <- gsub('-','',colnames(training8))
colnames(testing8) <- gsub('-','',colnames(testing8))
testing8$class <- as.factor(paste(testing8$class))
training8$class <- as.factor(paste(training8$class))
RF8 <- randomForest(class ~ ., data=training8,
importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict8 <- predict(RF8,testing8)
predict8df <- data.frame(predict8, type=testing8$class)
predict8df
## predict8 type
## healthyControl_4 1 month healthy
## healthyControl_14 healthy healthy
## healthyControl_15 acute healthy
## healthyControl_16 1 month healthy
## healthyControl_21 healthy healthy
## acuteLymeDisease_1 acute acute
## acuteLymeDisease_2 acute acute
## acuteLymeDisease_3 acute acute
## acuteLymeDisease_5 healthy acute
## acuteLymeDisease_8 1 month acute
## acuteLymeDisease_9 1 month acute
## acuteLymeDisease_11 1 month acute
## acuteLymeDisease_13 1 month acute
## acuteLymeDisease_17 1 month acute
## acuteLymeDisease_24 acute acute
## acuteLymeDisease_26 healthy acute
## Antibodies_1month_3 1 month 1 month
## Antibodies_1month_8 1 month 1 month
## Antibodies_1month_10 healthy 1 month
## Antibodies_1month_23 healthy 1 month
## Antibodies_1month_27 1 month 1 month
## Antibodies_6months_2 healthy 6 months
## Antibodies_6months_4 acute 6 months
## Antibodies_6months_5 1 month 6 months
PRA8 <- precisionRecallAccuracy(predict8df)
## accuracy is: 37.5 %
PRA8
## class precision recall accuracy
## 1 healthy 0.285714285714286 0.4 0.666666666666667
## 2 acute 0.666666666666667 0.363636363636364 0.625
## 3 1 month 0.272727272727273 0.6 0.583333333333333
## 4 6 months 0 0 0.875
PRA_all <- rbind(PRA1,PRA2,PRA3,PRA4,PRA5,PRA6,PRA7,PRA8)
PRA_all$groupMembership <- c(rep(1,4),
rep(2,4),
rep(3,4),
rep(4,4),
rep(5,4),
rep(6,4),
rep(7,4),
rep(8,4))
PRA_all2 <- PRA_all %>% group_by(class) %>% mutate(max=
ifelse(accuracy==max(as.numeric(paste(accuracy))),'max','not max'))
max2 <- subset(PRA_all2, PRA_all2$max=='max')
max2
## # A tibble: 8 x 6
## # Groups: class [4]
## class precision recall accuracy groupMembership max
## <fct> <fct> <fct> <fct> <dbl> <chr>
## 1 acute 0.8 0.36363636363~ 0.666666666666~ 1 max
## 2 healthy 0.4285714285714~ 0.6 0.75 2 max
## 3 1 month 0.3 0.6 0.625 2 max
## 4 6 months 0 0 0.875 2 max
## 5 healthy 0.3333333333333~ 0.2 0.75 4 max
## 6 healthy 0.3333333333333~ 0.2 0.75 5 max
## 7 healthy 0.4 0.4 0.75 7 max
## 8 6 months 0 0 0.875 8 max
We see that there are less groups that got the best class prediction accuracy than when we didn’t scale or center the data, and that group 2 predicted the healthy class with the same accuracy as groups 4, 5, and 7, but with both a higher precision and recall. Group 2 also predicted the 1 month and 6 month classes with the best accuracy. The 6 month class was still not used by any group to classify any testing samples in our random forest model. but not selecting the 6 month group gave an 88% accuracy, but both group 8 and 2 scored 0% for recall and precision for that. Only Group 1 predicted the acute class with the max accuracy of 67% having a precision of 80% (classified about 4/5 acute samples) and a recall of 37% (some irrelevant records classified as acute). The range of accuracy overall was 21%-42%. Group 2 scored the highest accuracy in prediction, with Group 3 scoring the lowest with this random forest model.
At this point lets take those genes that are in the best predicting classes of gene groups and use it as one data frame. Groups 1,2,4, and 7 are the better gene groups. Lets combine those into one group.
Training1247 <- training8[,colnames(training8) %in% c(Acute_md,HlL_DF,lhL_DF,lLh_DF)]
Training1247$class <- training8$class
Testing1247 <- testing8[,colnames(training8) %in% c(Acute_md,HlL_DF,lhL_DF,lLh_DF)]
Testing1247$class <- testing8$class
colnames(Training1247)
## [1] "BPI" "CA1" "CAMP" "CEACAM8" "CTSG"
## [6] "DDX3Y" "DEFA4" "EIF1AY" "EPB42" "HBA1"
## [11] "HBD" "HEMGN" "HIST1H1C" "HIST2H2AA3" "HSPA1B"
## [16] "KDM5D" "KIR2DS1" "LCN2" "LILRA3" "LTF"
## [21] "SLC25A37" "ZNF815" "class"
colnames(Testing1247)
## [1] "BPI" "CA1" "CAMP" "CEACAM8" "CTSG"
## [6] "DDX3Y" "DEFA4" "EIF1AY" "EPB42" "HBA1"
## [11] "HBD" "HEMGN" "HIST1H1C" "HIST2H2AA3" "HSPA1B"
## [16] "KDM5D" "KIR2DS1" "LCN2" "LILRA3" "LTF"
## [21] "SLC25A37" "ZNF815" "class"
colnames(Testing1247)==colnames(Training1247)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
The same genes are in each of our better genes data frame for the testing and training sets. Now lets see how the last model does on prediction with these genes.
RF1247 <- randomForest(class ~ ., data=Training1247,
importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict1247 <- predict(RF1247,Testing1247)
predict1247df <- data.frame(predict1247, type=Testing1247$class)
predict1247df
## predict1247 type
## healthyControl_4 healthy healthy
## healthyControl_14 1 month healthy
## healthyControl_15 1 month healthy
## healthyControl_16 healthy healthy
## healthyControl_21 healthy healthy
## acuteLymeDisease_1 acute acute
## acuteLymeDisease_2 acute acute
## acuteLymeDisease_3 acute acute
## acuteLymeDisease_5 healthy acute
## acuteLymeDisease_8 acute acute
## acuteLymeDisease_9 1 month acute
## acuteLymeDisease_11 1 month acute
## acuteLymeDisease_13 1 month acute
## acuteLymeDisease_17 1 month acute
## acuteLymeDisease_24 acute acute
## acuteLymeDisease_26 healthy acute
## Antibodies_1month_3 acute 1 month
## Antibodies_1month_8 1 month 1 month
## Antibodies_1month_10 healthy 1 month
## Antibodies_1month_23 6 months 1 month
## Antibodies_1month_27 6 months 1 month
## Antibodies_6months_2 1 month 6 months
## Antibodies_6months_4 1 month 6 months
## Antibodies_6months_5 1 month 6 months
PRA1247 <- precisionRecallAccuracy(predict1247df)
## accuracy is: 37.5 %
PRA1247
## class precision recall accuracy
## 1 healthy 0.5 0.6 0.791666666666667
## 2 acute 0.833333333333333 0.454545454545455 0.708333333333333
## 3 1 month 0.1 0.2 0.458333333333333
## 4 6 months 0 0 0.791666666666667
The overall accuracy was 37.5% or about 9/24 correct for four classes. So that’s better than 25% or 1/4 chances of predicting accurately. The accuracy for each of the classes was 79% for the healthy class, 71% for the acute class, 46% for the 1 month class, and 79% for the 6 month class. With the highest precision in predicting the acute class, and highest recall in predicting the healthy class. It predicted more healthy classes correctly and only misclassified 40% for a recall of 60%, and predicted 83% of the acute classes only missing 17% of the other acute classes in the testing set.
At this point I don’t think tuning the model will allow it to perform better. We can use other algorithms on the entire data or this set.
knnMod0 <- train(class ~ .,
method='knn', preProcess=c('center','scale'),
tuneLength=10, trControl=trainControl(method='adaptive_cv'),
data=Training1247)
predKNN0 <- predict(knnMod0, Testing1247)
dfKNN0 <- data.frame(KNN=predKNN0, class=Testing1247$class)
dfKNN0
## KNN class
## 1 1 month healthy
## 2 healthy healthy
## 3 healthy healthy
## 4 healthy healthy
## 5 healthy healthy
## 6 1 month acute
## 7 1 month acute
## 8 acute acute
## 9 acute acute
## 10 6 months acute
## 11 1 month acute
## 12 acute acute
## 13 6 months acute
## 14 1 month acute
## 15 acute acute
## 16 healthy acute
## 17 1 month 1 month
## 18 acute 1 month
## 19 healthy 1 month
## 20 6 months 1 month
## 21 6 months 1 month
## 22 1 month 6 months
## 23 1 month 6 months
## 24 1 month 6 months
precisionRecallAccuracy(dfKNN0)
## accuracy is: 37.5 %
## class precision recall accuracy
## 1 healthy 0.666666666666667 0.8 0.875
## 2 acute 0.8 0.363636363636364 0.666666666666667
## 3 1 month 0.111111111111111 0.2 0.5
## 4 6 months 0 0 0.708333333333333
The K-Nearest Neighbor algorithm also scored 37.5%, with the best accuracy in prediction for the healthy class, with 87.5% accuracy, 67% precision, and 80% recall. The acute class scored 67% accuracy but had an 80% precision and 37% recall.
rpartMod0 <- train(class ~ ., method='rpart', tuneLength=7, data=Training1247)
predRPART0 <- predict(rpartMod0, Testing1247)
dfRPART0 <- data.frame(Rpart=predRPART0, class=Testing1247$class)
dfRPART0
precisionRecallAccuracy(dfRPART0)
The rpart algorithm only scored 21% accuracy overall, and didn’t predict any of the healthy,acute, or 6 month classes correctly as the precision and recall are both 0% for those clases, but the accuracy was 79% for the healthy class, 54% for the acute class, and 88% for the 6 months class. This model scored 21% precision on the 1 month class and 100% recall for the same class. that is less than the probability of 1/4 correct or 25% in accuracy overall. This model would not be a good fit for this data or could be tuned. Doubtful it will improve much or any other model. The underlying data seems to be bad. These genes aren’t able to predict out of all these classes in overall accuracy better than 42% accuracy using the median fold change values to gather our genes. We could try separating the classes and seeing in pairs if the models will predict better accuracy. If we exclude the antibiotic treatment classes, can we get good prediction accuracy?
y*(max(y)-min(y))+min(y)↩