This is the fold change by median values instead of by mean values to see if the accuracy in classification improves. Some samples might just need to be removed as there are about 4-5 in different classes where some genes are greatly over expresses in relation to their neighbors in each class. This is the median fold change version of LymeDiseaseTicks.Rmd.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
descriptors2 <- read.csv('descriptors2.csv')
LymeDisease4 <- read.csv('LymeDisease4normalized-easynames.csv', sep=',',
                         header=T, na.strings = c('',' ','NA'))
lymeMx2 <- read.csv('lymeMx2-denormalized-easynames.csv', sep=',',header=T,
                    na.strings=c('',' ','NA'))

Lets remove those samples from our data: Samples: 7 of the acute class samples, sample 12 of the 1 month class samples, sample 10 of the 6 month class samples, and samples 1, 11, and 12 of the healthy class samples.

colnames(lymeMx2)
##  [1] "gene"                  "healthyControl_1"      "healthyControl_2"     
##  [4] "healthyControl_3"      "healthyControl_4"      "healthyControl_5"     
##  [7] "healthyControl_6"      "healthyControl_7"      "healthyControl_8"     
## [10] "healthyControl_9"      "healthyControl_10"     "healthyControl_11"    
## [13] "healthyControl_12"     "healthyControl_13"     "healthyControl_14"    
## [16] "healthyControl_15"     "healthyControl_16"     "healthyControl_17"    
## [19] "healthyControl_18"     "healthyControl_19"     "healthyControl_20"    
## [22] "healthyControl_21"     "acuteLymeDisease_1"    "acuteLymeDisease_2"   
## [25] "acuteLymeDisease_3"    "acuteLymeDisease_4"    "acuteLymeDisease_5"   
## [28] "acuteLymeDisease_6"    "acuteLymeDisease_7"    "acuteLymeDisease_8"   
## [31] "acuteLymeDisease_9"    "acuteLymeDisease_10"   "acuteLymeDisease_11"  
## [34] "acuteLymeDisease_12"   "acuteLymeDisease_13"   "acuteLymeDisease_14"  
## [37] "acuteLymeDisease_15"   "acuteLymeDisease_16"   "acuteLymeDisease_17"  
## [40] "acuteLymeDisease_18"   "acuteLymeDisease_19"   "acuteLymeDisease_20"  
## [43] "acuteLymeDisease_21"   "acuteLymeDisease_22"   "acuteLymeDisease_23"  
## [46] "acuteLymeDisease_24"   "acuteLymeDisease_25"   "acuteLymeDisease_26"  
## [49] "acuteLymeDisease_27"   "acuteLymeDisease_28"   "Antibodies_1month_1"  
## [52] "Antibodies_1month_2"   "Antibodies_1month_3"   "Antibodies_1month_4"  
## [55] "Antibodies_1month_5"   "Antibodies_1month_6"   "Antibodies_1month_7"  
## [58] "Antibodies_1month_8"   "Antibodies_1month_9"   "Antibodies_1month_10" 
## [61] "Antibodies_1month_11"  "Antibodies_1month_12"  "Antibodies_1month_13" 
## [64] "Antibodies_1month_14"  "Antibodies_1month_15"  "Antibodies_1month_16" 
## [67] "Antibodies_1month_17"  "Antibodies_1month_18"  "Antibodies_1month_19" 
## [70] "Antibodies_1month_20"  "Antibodies_1month_21"  "Antibodies_1month_22" 
## [73] "Antibodies_1month_23"  "Antibodies_1month_24"  "Antibodies_1month_25" 
## [76] "Antibodies_1month_26"  "Antibodies_1month_27"  "Antibodies_6months_1" 
## [79] "Antibodies_6months_2"  "Antibodies_6months_3"  "Antibodies_6months_4" 
## [82] "Antibodies_6months_5"  "Antibodies_6months_6"  "Antibodies_6months_7" 
## [85] "Antibodies_6months_8"  "Antibodies_6months_9"  "Antibodies_6months_10"
lymeMx2b <- lymeMx2[,-c(2,12,13,87,62,29)]
LymeDisease4b <- LymeDisease4[,-c(2,12,13,87,62,29)]

Now, we can use this data to find the median values across samples and get the fold change values, then plot the data in Tableau.

LymeDisease5 <- LymeDisease4b %>% group_by(Gene) %>% summarise_at(vars('healthyControl_2':'Antibodies_6months_9'),median)
## Warning: Factor `Gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
lymeMx3 <- lymeMx2b %>% group_by(gene) %>% summarise_at(vars('healthyControl_2':'Antibodies_6months_9'),median)
## Warning: Factor `gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
Lyme6 <- LymeDisease5 %>% group_by(Gene) %>% 
  mutate(
    healthy_Median = median(healthyControl_2:healthyControl_21,na.rm=T),
    acuteLymeDisease_Median = median(acuteLymeDisease_1:acuteLymeDisease_28,na.rm=T),
    antibodies_1month_Median = median(Antibodies_1month_1:Antibodies_1month_27,na.rm=T),
    antibodies_6month_Median = median(Antibodies_6months_1:Antibodies_6months_9,na.rm=T)
  )
## Warning: Factor `Gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `Gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
tail(colnames(Lyme6),5)
## [1] "Antibodies_6months_9"     "healthy_Median"          
## [3] "acuteLymeDisease_Median"  "antibodies_1month_Median"
## [5] "antibodies_6month_Median"
lymeMx4 <- lymeMx3 %>% group_by(gene) %>% 
  mutate(
    healthy_median = median(healthyControl_2:healthyControl_21,na.rm=T),
    acuteLymeDisease_median = median(acuteLymeDisease_1:acuteLymeDisease_28,na.rm=T),
    antibodies_1month_median = median(Antibodies_1month_1:Antibodies_1month_27,na.rm=T),
    antibodies_6month_median = median(Antibodies_6months_1:Antibodies_6months_9,na.rm=T)
  )
## Warning: Factor `gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
tail(colnames(lymeMx4),5)
## [1] "Antibodies_6months_9"     "healthy_median"          
## [3] "acuteLymeDisease_median"  "antibodies_1month_median"
## [5] "antibodies_6month_median"
lymeMx5 <- lymeMx4 %>% group_by(gene) %>% 
  mutate(acuteHealthy_foldChange=acuteLymeDisease_median/healthy_median,
    antibodies_1month_healthy_foldChange=antibodies_1month_median/healthy_median,
    antibodies_6month_healthy_foldchange=antibodies_6month_median/healthy_median)
## Warning: Factor `gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
tail(colnames(lymeMx5),10)
##  [1] "Antibodies_6months_7"                
##  [2] "Antibodies_6months_8"                
##  [3] "Antibodies_6months_9"                
##  [4] "healthy_median"                      
##  [5] "acuteLymeDisease_median"             
##  [6] "antibodies_1month_median"            
##  [7] "antibodies_6month_median"            
##  [8] "acuteHealthy_foldChange"             
##  [9] "antibodies_1month_healthy_foldChange"
## [10] "antibodies_6month_healthy_foldchange"
Lyme7 <- Lyme6 %>% group_by(Gene) %>% 
  mutate(acuteHealthy_foldChange=acuteLymeDisease_Median/healthy_Median,
    antibodies_1month_healthy_foldChange=antibodies_1month_Median/healthy_Median,
    antibodies_6month_healthy_foldchange=antibodies_6month_Median/healthy_Median)
## Warning: Factor `Gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `Gene` contains implicit NA, consider using
## `forcats::fct_explicit_na`
tail(colnames(Lyme7),10)
##  [1] "Antibodies_6months_7"                
##  [2] "Antibodies_6months_8"                
##  [3] "Antibodies_6months_9"                
##  [4] "healthy_Median"                      
##  [5] "acuteLymeDisease_Median"             
##  [6] "antibodies_1month_Median"            
##  [7] "antibodies_6month_Median"            
##  [8] "acuteHealthy_foldChange"             
##  [9] "antibodies_1month_healthy_foldChange"
## [10] "antibodies_6month_healthy_foldchange"

Our tables of unique genes grouped by genes to get their medians of each gene within each sample for the duplicate genes, the added features of each class’s median gene expression per gene, and the fold change ratio of the diseased or treated to the healthy gene expression values have been created. The normalized data or the original data is the Lyme7 data frame and the denormalized data is the lymeMx5 data frame. Now each shrunk from 48851 genes to 19526 genes when grouping by unique genes, but now that is still a lot of genes, so lets take the gene that have the top 10 most expressed and least expressed values in both data frames by acute/healthy fold change, and the top 10 and bottom 10 of the 1month of antibodies/healthy fold change values, and finally the top 10 and bottom 10 of the 6 month of antibodies/healthy fold change values. *** The denormalized group first:

Acute/healthy top 10 and bottom 10 genes by fold change data frame:

acuteHealthy20 <- lymeMx5[order(lymeMx5$acuteHealthy_foldChange,
                                decreasing=T)[c(1:10,19517:19526)],]

One month/healthy top 10 and bottom 10 genes by fold change data frame:

month1healthy20 <- lymeMx5[order(lymeMx5$antibodies_1month_healthy_foldChange,
                                 decreasing=T)[c(1:10,19517:19526)],]

Six month/healthy top 10 and bottom 10 genes by fold change data frame:

month6healthy20 <- lymeMx5[order(lymeMx5$antibodies_6month_healthy_foldchange,
                                 decreasing=T)[c(1:10,19517:19526)],]
lymeMx6 <- rbind(acuteHealthy20,month1healthy20,month6healthy20)
lymeMx7 <- lymeMx6[!duplicated(lymeMx6),]

There were 45 unique genes between all three fold change groups in the denormalized data out of 60 genes that were either the top 10 or bottom 10 of genes being expressed.

Now, for the normalized data:

Acute/healthy top 10 and bottom 10 genes by fold change data frame:

acuteHealthy20b <- Lyme7[order(Lyme7$acuteHealthy_foldChange,
                                decreasing=T)[c(1:10,19517:19526)],]

One month/healthy top 10 and bottom 10 genes by fold change data frame:

month1healthy20b <- Lyme7[order(Lyme7$antibodies_1month_healthy_foldChange,
                                 decreasing=T)[c(1:10,19517:19526)],]

Six month/healthy top 10 and bottom 10 genes by fold change data frame:

month6healthy20b <- Lyme7[order(Lyme7$antibodies_6month_healthy_foldchange,
                                 decreasing=T)[c(1:10,19517:19526)],]
Lyme8 <- rbind(acuteHealthy20b,month1healthy20b,month6healthy20b)
Lyme9 <- Lyme8[!duplicated(Lyme8),]

There are 34 genes unique to the normalized data, probably because this data had negative values. The scaling done to denormalize this data is probably not exactly what the true raw values are. But they should have the same number of genes, but this one has 10 less than the normalized data. We will see later which one can be split into training and testing sets with better prediction accuracy within each class and overall.

Lets also add the gene summaries to these data frames and create a field that will give the class of each sample. This file,genecards2.R, is an R file sourced for the functions made in previous scripts. We lose one of the genes in the original data frame because it isn’t in genecards.org and end up with 32 instead of 33 genes for that data frame.

source('geneCards2.R')
## Warning: package 'rvest' was built under R version 3.6.3
## Loading required package: xml2
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

LOC400657 (#13 in list) is a gene that genecards.org doesn’t recognize and it will throw an error, so we should skip it.

for (i in Lyme9$Gene[1:12]){
  getSummaries2(i,'protein')
}

for (i in Lyme9$Gene[14:34]){
  getSummaries2(i,'protein')
}
getGeneSummaries('protein')
summsLyme9 <- read.csv("proteinGeneSummaries_protein.csv")

The LOC genes don’t seem to have gene summaries, so they had to be removed.

for (i in lymeMx7$gene[1:34]){
  getSummaries2(i,'immune')
}
for (i in lymeMx7$gene[37:39]){
  getSummaries2(i,'immune')
}
for (i in lymeMx7$gene[41:45]){
  getSummaries2(i,'immune')
}
getGeneSummaries('immune')
summsLymeMx7 <- read.csv("proteinGeneSummaries_immune.csv")
Lyme10 <- merge(summsLyme9,Lyme9,by.x='gene', by.y='Gene')
lymeMx8 <- merge(summsLymeMx7,lymeMx7, by.x='gene',by.y='gene')

Lets create those classes for each data frame. But first we have to tidy the data.

Lyme11 <- gather(Lyme10, key='classSample',value='classValue',8:87)
lymeMx9 <- gather(lymeMx8,key='classSample',value='classValue',8:87)
Lyme11$class <- Lyme11$classSample
Lyme11$class <- gsub('^hea.*$','healthy',Lyme11$class, perl=T)
Lyme11$class <- gsub('^acute.*$','acute Lyme Disease',Lyme11$class,perl=T)
Lyme11$class <- gsub('^.*1month.*$','1 month treatment', Lyme11$class, perl=T)
Lyme11$class <- gsub('^.*6month.*$','6 months treatment',Lyme11$class,perl=T)
lymeMx9$class <- lymeMx9$classSample
lymeMx9$class <- gsub('^hea.*$','healthy',lymeMx9$class, perl=T)
lymeMx9$class <- gsub('^acute.*$','acute Lyme Disease',lymeMx9$class,perl=T)
lymeMx9$class <- gsub('^.*1month.*$','1 month treatment', lymeMx9$class, perl=T)
lymeMx9$class <- gsub('^.*6month.*$','6 months treatment',lymeMx9$class,perl=T)
unique(Lyme11$gene)
##  [1] ACSBG2   ANKRD54  ATP5G3   CBX3     CCDC15   COX7A2L  CSK      FEV     
##  [9] GEN1     GPR50    HEPN1    HS1BP3   KRT83    KRTAP9-8 LYVE1    MAP1B   
## [17] MFSD3    MTNR1B   NSUN3    OAS2     ODF3B    PRCP     PRL      SNORA60 
## [25] SNRPD2   TMEM92   TTLL3    VCAN     ZNF239   ZWILCH  
## 33 Levels: ACSBG2 ANKRD54 ATP5G3 C1ORF141 C4ORF14 C9ORF129 CBX3 ... ZWILCH
unique(lymeMx9$gene)
##  [1] BPI        CA1        CAMP       CCDC144A   CEACAM8    CTSG      
##  [7] CXCL2      DDX3Y      DEFA4      EIF1AY     EPB42      ERAP2     
## [13] EREG       FSIP1      GAL3ST4    HBA1       HBD        HEMGN     
## [19] HIST1H1C   HIST2H2AA3 HLA-DRB5   HSPA1B     IL1B       IL8       
## [25] KDM5D      KIR2DS1    KRT1       LCN2       LGALS2     LILRA3    
## [31] LTF        MYOM2      OLR1       POTEB      SIRPB1     SLC25A37  
## [37] SNCA       TSIX       XIST       ZNF815    
## 41 Levels: BPI CA1 CAMP CCDC144A CEACAM8 CTSG CXCL2 CXORF21 DDX3Y ... ZNF815

It looks like the genes aren’t even the same genes.

unique(Lyme11$Gene) %in% unique(lymeMx9)
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE

Apparently, they are not the same genes. Its ok, maybe they still offer some information. The techniques and methods are the same to inverse what was assumed to be the normalization method, but for typical studies. In bioinformatics, with gene expression data, there is usually more to it, like trimming the bottom and top outliers, and taking the quantile normalization, then scaling. We used the standardization method of normalizing values between 0 and 1 as log2 normalized is to f(x)=log2[(x-min(x))/(max(x)-min(x))]=y and the inverse would be: f(y)=21=x So, there is some logic to this, and at some point rounded values could lose information in the numer of scientific placeholders of precision is used in calculating the inverse of the base 2 log, or the exact values for max and min of X need to be used. Reminder, when I demonstrated this earlier, the method worked using this procedure for 10 values that included a 0 where a small value was added to take the log2 of x=0 without an error, but the exact values were still decimals at the final step. To fix this they were turned to fractions, where the denominator was the max(x), and so each value multiplied by the denominator at that point returned the original x values in our list of 10. When addidng that step to the last step we used on this data to denormalize the data, the values were extremely large, approximately 103-104 larger. So we stopped before taking the fractional values. We will continue with these genes in our machine learning to see if either set makes good gene targets for pathogenesis of lyme disease by how accurately the classes of: healthy, acute disease, 1 month convalescing or developing antibodies after being given a regimen of antibiotics, and 6 months convalescing after being given antibiotics. This is temporal or time specific data, and there were some discrepencies in the study when being done, because it spanned 2 years, some patients didn’t know how long they had it but if they had symptoms they were assumed to be suffering from lyme disease, like the facial paralysis or the skin lesion type marks. Also, some patients dropped out and if the study spanned two years, and only the last 6 months recorded the convalescing at 6 months then the first batches of patients in the acute phase weren’t being recorded or they were actually being monitored after six months and up to two years after being given antibiotics. So we can imagine the data might be skewed for these differences or discrepencies.

Lets write these two tables out to csv files to analyze visually in Tableau.

write.csv(Lyme11,'LymeDisease_originalValues_foldchages33_medianAndRemoved.csv',row.names=F)
write.csv(Lyme11[,-c(5,6)],'LymeDisease_originalValues_foldchages33_medianAndRemoved_rm2Summs.csv',row.names=F)
write.csv(lymeMx9,"LymeDisease_denormalizedValues_foldchanges40_medianAndRemoved.csv",row.names=F)
write.csv(lymeMx9[-c(5,6)],"LymeDisease_denormalizedValues_foldchanges40_medianAndRemoved_rm2Summs.csv",row.names=F)

Lets also write the Lyme10 and lymeMx8 data frames that we will use in our ML models.

write.csv(Lyme10,'ML_medDrop_GSE145974_original.csv', row.names = F)
write.csv(lymeMx8,'ML_medDrop_GSE145974_destandardized.csv',row.names=F)

Lets start the machine learning by first making the data frames with the class as the output or target feature and the samples as observations and the genes as predictors from both sets separately.

The 40 de-standardized genes will be created first then the 30 original genes that are completely different. Both are the filtered top or bottom 10 genes out of their respective 19526 unique gene sets of each class by fold change of acute/healthy, 1 month/healthy, or 6 months/healthy by medians of their respective class samples.

The destandardized set. Lets just name our data sets something silly to keep track of them. Dance is the de-standardized set and Stand is the original log2 normalized set.

The Dance Machine Learning set, made from the lymeMx7, not-tidied, de-standardized data frame:

colnames(lymeMx7)
##  [1] "gene"                                
##  [2] "healthyControl_2"                    
##  [3] "healthyControl_3"                    
##  [4] "healthyControl_4"                    
##  [5] "healthyControl_5"                    
##  [6] "healthyControl_6"                    
##  [7] "healthyControl_7"                    
##  [8] "healthyControl_8"                    
##  [9] "healthyControl_9"                    
## [10] "healthyControl_10"                   
## [11] "healthyControl_13"                   
## [12] "healthyControl_14"                   
## [13] "healthyControl_15"                   
## [14] "healthyControl_16"                   
## [15] "healthyControl_17"                   
## [16] "healthyControl_18"                   
## [17] "healthyControl_19"                   
## [18] "healthyControl_20"                   
## [19] "healthyControl_21"                   
## [20] "acuteLymeDisease_1"                  
## [21] "acuteLymeDisease_2"                  
## [22] "acuteLymeDisease_3"                  
## [23] "acuteLymeDisease_4"                  
## [24] "acuteLymeDisease_5"                  
## [25] "acuteLymeDisease_6"                  
## [26] "acuteLymeDisease_8"                  
## [27] "acuteLymeDisease_9"                  
## [28] "acuteLymeDisease_10"                 
## [29] "acuteLymeDisease_11"                 
## [30] "acuteLymeDisease_12"                 
## [31] "acuteLymeDisease_13"                 
## [32] "acuteLymeDisease_14"                 
## [33] "acuteLymeDisease_15"                 
## [34] "acuteLymeDisease_16"                 
## [35] "acuteLymeDisease_17"                 
## [36] "acuteLymeDisease_18"                 
## [37] "acuteLymeDisease_19"                 
## [38] "acuteLymeDisease_20"                 
## [39] "acuteLymeDisease_21"                 
## [40] "acuteLymeDisease_22"                 
## [41] "acuteLymeDisease_23"                 
## [42] "acuteLymeDisease_24"                 
## [43] "acuteLymeDisease_25"                 
## [44] "acuteLymeDisease_26"                 
## [45] "acuteLymeDisease_27"                 
## [46] "acuteLymeDisease_28"                 
## [47] "Antibodies_1month_1"                 
## [48] "Antibodies_1month_2"                 
## [49] "Antibodies_1month_3"                 
## [50] "Antibodies_1month_4"                 
## [51] "Antibodies_1month_5"                 
## [52] "Antibodies_1month_6"                 
## [53] "Antibodies_1month_7"                 
## [54] "Antibodies_1month_8"                 
## [55] "Antibodies_1month_9"                 
## [56] "Antibodies_1month_10"                
## [57] "Antibodies_1month_11"                
## [58] "Antibodies_1month_13"                
## [59] "Antibodies_1month_14"                
## [60] "Antibodies_1month_15"                
## [61] "Antibodies_1month_16"                
## [62] "Antibodies_1month_17"                
## [63] "Antibodies_1month_18"                
## [64] "Antibodies_1month_19"                
## [65] "Antibodies_1month_20"                
## [66] "Antibodies_1month_21"                
## [67] "Antibodies_1month_22"                
## [68] "Antibodies_1month_23"                
## [69] "Antibodies_1month_24"                
## [70] "Antibodies_1month_25"                
## [71] "Antibodies_1month_26"                
## [72] "Antibodies_1month_27"                
## [73] "Antibodies_6months_1"                
## [74] "Antibodies_6months_2"                
## [75] "Antibodies_6months_3"                
## [76] "Antibodies_6months_4"                
## [77] "Antibodies_6months_5"                
## [78] "Antibodies_6months_6"                
## [79] "Antibodies_6months_7"                
## [80] "Antibodies_6months_8"                
## [81] "Antibodies_6months_9"                
## [82] "healthy_median"                      
## [83] "acuteLymeDisease_median"             
## [84] "antibodies_1month_median"            
## [85] "antibodies_6month_median"            
## [86] "acuteHealthy_foldChange"             
## [87] "antibodies_1month_healthy_foldChange"
## [88] "antibodies_6month_healthy_foldchange"

Lets remove the fold change and median value features from our lymeMx8 data frame and save it as ‘Dance’ after we transpose it to get the unique genes as predictors and the samples as observations.

dance <- lymeMx8[,-c(2:7,88:94)]
danceSampleNames <- colnames(dance)[2:81]

month1 <- grep('1month',danceSampleNames)
month6 <- grep('6month',danceSampleNames)
healthy <- grep('healthy',danceSampleNames)
acute <- grep('acute',danceSampleNames)

class <- danceSampleNames
class[month1] <- '1 month'
class[month6] <- '6 months'
class[healthy] <- 'healthy'
class[acute] <- 'acute'


danceGeneNames <- dance$gene
Dance <- as.data.frame(t(dance[,-1]))
colnames(Dance) <- danceGeneNames
Dance$class <- class
Dance2 <- Dance[,c(41,1:40)]
head(Dance2)
##                    class      BPI       CA1     CAMP  CCDC144A  CEACAM8
## healthyControl_2 healthy 44.71151 187.58066 61.39548 27.852233 59.99411
## healthyControl_3 healthy 29.02589  17.32854 15.62534 10.299108 29.53441
## healthyControl_4 healthy 18.36342  15.48674 16.94088 15.867351 18.74716
## healthyControl_5 healthy 33.89164  26.00250 27.18118 32.890159 26.42882
## healthyControl_6 healthy 33.41352  18.64946 20.98911  8.884042 18.28476
## healthyControl_7 healthy 32.39563  39.00369 30.04639 33.431603 34.07572
##                      CTSG    CXCL2      DDX3Y    DEFA4     EIF1AY     EPB42
## healthyControl_2 53.10345 90.08176  15.127511 68.67612   5.882532 188.58533
## healthyControl_3 23.49343 26.80778   6.992033 28.76057   3.349618  18.82050
## healthyControl_4 11.58055 12.26752  88.358654 11.98227 417.075821  16.32208
## healthyControl_5 12.99521 36.01149 159.894013 21.02267 533.947205  31.19096
## healthyControl_6 74.44276 29.94835   5.178288 19.84215   3.160839  20.54773
## healthyControl_7 29.58619 21.40277 200.257935 58.04673 652.397858  44.49253
##                      ERAP2     EREG    FSIP1   GAL3ST4       HBA1       HBD
## healthyControl_2 48.643327 93.65032 26.85640 780.43516 133.136907 274.93081
## healthyControl_3 11.343900 15.74919 27.90551  20.56412   3.504471  13.83797
## healthyControl_4  8.138399 22.54677 16.33424  11.73537  20.246208  13.87592
## healthyControl_5 83.403401 47.02939 31.07355  32.62728  15.197226  18.59400
## healthyControl_6 37.838495 22.14446 16.89240  17.38865  16.619494  12.64241
## healthyControl_7 63.497294 72.50205 39.34989  31.61232  59.326363  39.88204
##                      HEMGN  HIST1H1C HIST2H2AA3   HLA-DRB5    HSPA1B     IL1B
## healthyControl_2 240.88830  14.18824  24.780563  26.986300  20.41038 46.72271
## healthyControl_3  11.01357  21.44584   5.685453 228.898114  22.40959 14.95010
## healthyControl_4  13.76587  22.84205  27.891293 252.183721  12.06996 25.60839
## healthyControl_5  28.19839  21.41184  50.434659  21.402235  21.23190 16.24523
## healthyControl_6  19.01051  17.99494  14.038684   5.877128  13.44129 26.42206
## healthyControl_7  27.97675 104.92962  70.533000  25.547715 169.99956 87.67157
##                        IL8      KDM5D  KIR2DS1      KRT1      LCN2    LGALS2
## healthyControl_2  92.69553  21.147566 61.10841 228.70386 33.503142  9.971672
## healthyControl_3  29.11711   9.683688 21.24897  26.71635 10.400323  6.352478
## healthyControl_4  20.83394  86.118100 65.46716  15.79359 12.799352 25.957496
## healthyControl_5  30.94746 153.958130 56.76246  25.13165 20.690155 14.623216
## healthyControl_6  29.23871   8.506017 14.84648  10.12461  6.900668 20.165600
## healthyControl_7 128.20648  74.040103 29.33814  41.14450 51.096213 57.850952
##                    LILRA3      LTF      MYOM2     OLR1    POTEB    SIRPB1
## healthyControl_2 11.27490 75.86353 323.065372 82.22582 28.97101  30.57768
## healthyControl_3 13.96374 21.55983  65.086732 19.37997 35.29903  30.11269
## healthyControl_4 31.99443 13.72309   9.536578 23.11463 17.02823 110.35373
## healthyControl_5 30.68084 21.22504  46.234651 51.28529 28.21187 173.67341
## healthyControl_6 22.93716 18.82061  14.813989 19.34019 19.32730  16.39025
## healthyControl_7 22.10753 29.26608 465.398919 58.15919 70.60498  47.27990
##                    SLC25A37      SNCA      TSIX       XIST   ZNF815
## healthyControl_2 167.517066 232.27893 433.24591 395.790441 85.75114
## healthyControl_3   6.954475  15.06107 134.56488 182.761123 35.97833
## healthyControl_4  12.892965  12.54416  10.88805   3.343866 14.12286
## healthyControl_5  23.670906  24.60520  12.66765   4.832158 52.97482
## healthyControl_6  12.442719  16.16613  53.40405 171.117221 15.28891
## healthyControl_7  48.265747  54.19420  15.94847   7.291165 89.00234

I created a dashboard of the fold change median values across class samples compared to the healthy class, and with the individual samples’ gene expression values. This data set has 6 samples removed that skewed the data when looking at mean gene expression values in LymeDiseaseTicks.Rmd, but when looking at the median gene expression values and those other samples removed there are some other samples that also skew the data by median values. clicking a gene in the gene filter and selecting ctrl+clicking each additional gene will display only those gene or genes interested in. Clicking the genes again will display all genes. Lets look through the genes to see what genes show unique or odd behaviors and group them into lists of those genes that have fold change values the same from acute -> 1 month of treatment -> 6 months of treatment compared to the healthy samples. dashboard of median fold change values and 6 samples removed

dashboard of median fold change values and individual sample values

dashboard of median fold change values and individual sample values

Figure 10: Dashboard of median fold change values with six samples removed. The individual samples are shown in each class as well as each top or bottom gene in fold change expression of diseased or treated to healthy ratio of medians per class values. There is a gene filter to select each gene interested in. To select more than one gene at a time use ctrl + click on each gene, to remove the selection and return all genes select the selected genes again. Even when removing the six samples that had some different gene behaviors compared to the neighboring samples within each class when using the mean values for fold change values, there are other samples in the median selected sets with odd behaviors as well that could throw off the classification algorithms. This dashboard needs to be looked through to get a list of those samples with odd behaviors and also the genes’ fold change values behavior. Such as, monotastically increasing or decreasing from acute ->1 month -> 6 months, or start high, drops low, then shoots really high in gene expression after 6 months of treatment, and other behaviors. They could allow us to zoom in on target genes for lyme disease pathogenesis in these separate classes.

There are some categories of behaviors. out of three classes of fold change, if you see a low and lowest and high, the low and lowest are closest in gene expression levels lower than the high description. And if you see a high, a highest, and a low then the high and highest are closer in gene expression level higher than the low class.

up acute:

monotastically decreasing from acute -> 1 month -> 6 months:

BPI CA1 CAMP CEACAM8 CTSG DDX3Y DEFA4 EIF1AY EPB42 HBA1 HBD HEMGN HIST1H1C HIST2H2AA3 HSPA1B KDM5D KIR2DS1 LCN2 LILRA3 LTF SLC25A37 ZNF815

high, lowest,low: SNCA CCDC144A

highest,low,high: CXCL2 GAL3ST4 HLA-DRB5 KRT1 LGALS2

monotastically increasing from acute -> 1 month -> 6 months: ERAP2

high, low, highest: IL8 EREG IL1B OLR1 POTEB

low, lowest, high: TSIX XIST MYOM2 FSIP1

low,high,lowest: SIRPB1

There are 7 groups:

Acute_md <- c('BPI','CA1','CAMP','CEACAM8','CTSG','DDX3Y','DEFA4','EIF1AY','EPB42',
'HBA1','HBD','HEMGN','HIST1H1C','HIST2H2AA3','HSPA1B','KDM5D','KIR2DS1','LCN2',
'LILRA3','LTF','SLC25A37','ZNF815')
HlL <- c('SNCA','CCDC144A')
Hlh <- c('CXCL2', 'GAL3ST4', 'HLA-DRB5', 'KRT1', 'LGALS2')
Acute_mi <- 'ERAP2'
hlH <- c('IL8','EREG','IL1B','OLR1','POTEB')
lLh <- c('TSIX','XIST','MYOM2','FSIP1')
lhL <- 'SIRPB1'

Now that we have our lists, lets see about those data frames for the seven different groups of gene anomolies or similarities. The following are our ML ready dataframes for our seven groups in our de-standardized Lyme disease data.

Acute_md_DF <- Dance2[,colnames(Dance2) %in% Acute_md]
Acute_md_DF$class <- Dance2$class

HlL_DF <- Dance2[,colnames(Dance2) %in% HlL]
HlL_DF$class <- Dance2$class

Acute_mi_DF <- data.frame(ERAP2=Dance2[,colnames(Dance2) %in% Acute_mi], row.names=row.names(Dance2))
Acute_mi_DF$class <- Dance2$class

lhL_DF <- data.frame(SIRPB1=Dance2[,colnames(Dance2) %in%  lhL],row.names=row.names(Dance2))
lhL_DF$class <- Dance2$class

Hlh_DF <- Dance2[,colnames(Dance2) %in% Hlh]
Hlh_DF$class <- Dance2$class

hlH_DF <- Dance2[,colnames(Dance2) %in% hlH]
hlH_DF$class <- Dance2$class

lLh_DF <- Dance2[,colnames(Dance2) %in% lLh]
lLh_DF$class <- Dance2$class

Great, now we need to run through each of these 7 data frames and split into separate training and testing sets, and test a machine learning algorithm on. I tend to always use random forest to start with, or caret’s rpart.

Lets make sure we keep the same samples in our testing set and training set for each group to test machine learning algorithm(s) on. Lets keep the standard 70% training set and 30% testing set using a random sampling of our classes.

set.seed(34567)
train <- sample(1:80,.7*80)
training <- class[train]
testing <- class[-train]
t <- data.frame(train = training)
ts <- data.frame(test= testing)
t %>% group_by(train) %>% count(train)
## # A tibble: 4 x 2
## # Groups:   train [4]
##   train        n
##   <fct>    <int>
## 1 1 month     21
## 2 6 months     6
## 3 acute       16
## 4 healthy     13
ts %>% group_by(test) %>% count(test)
## # A tibble: 4 x 2
## # Groups:   test [4]
##   test         n
##   <fct>    <int>
## 1 1 month      5
## 2 6 months     3
## 3 acute       11
## 4 healthy      5

We can see we have a fair share of samples in our training set and at least one of each class in our testing set to make predictions based on the model we train. Lets keep these same samples in each of our 8 groups to classify with. Lets make our 8 training and testing sets with our indices labeled ‘train’ and note the numeric labeling of each correspongs to their data frame:

Training/Testing split 1: Acute_md_DF Training/Testing split 2: HlL_DF Training/Testing split 3: Acute_mi_DF Training/Testing split 4: lhL_DF Training/Testing split 5: Hlh_DF Training/Testing split 6: hlH_DF Training/Testing split 7: lLh_DF Training/Testing split 8: Dance2

training1 <- Acute_md_DF[train,]
testing1 <- Acute_md_DF[-train,]
training2 <- HlL_DF[train,]
testing2 <- HlL_DF[-train,]
training3 <- Acute_mi_DF[train,]
testing3 <- Acute_mi_DF[-train,]
training4 <- lhL_DF[train,]
testing4 <- lhL_DF[-train,]
training5 <- Hlh_DF[train,]
testing5 <- Hlh_DF[-train,]
training6 <- hlH_DF[train,]
testing6 <- hlH_DF[-train,]
training7 <- lLh_DF[train,]
testing7 <- lLh_DF[-train,]
training8 <- Dance2[train,]
testing8 <- Dance2[-train,]

Lets make a function specific to our data frames to return the precision, recall, and accuracy of these four classes. I actually made this in a previous script,monotonicGenes.Rmd, when testing the COVID-19 samples with GSE152418 that also had four classes to classify. But those classes were healthy, moderate, severe, or ICU grades of severity of Covid19. Actually, I found out later, that the convalescent class was its own class even though it was only one sample. So there should have been five classes. But no need to alter that function now. There is also some other packages or in the caret package, that I never use that can return the precision and recall, but i don’t think as a confusion matrix. I thought the convalescent class was mislabeled, so had it relabeled as healthy, since the models pedicted it as such. I didn’t find out until this study, when the summary of this study, GSE145974, used ‘convalesced’ blood after 1 and 6 months of antibiotics, that the sample in GSE152418 was likely its own class. I assumed it was identifying the source of its patient sample,because another previous study on Rheumatoid Arthritis (RA), GSE151161, did use convalescent patients, and it preceded the analysis on GSE152418. Typically in research, you need a client consent and informed consent from people who aren’t incarcerated or in the care of another person or facility,because it violates the human research subjects guidelines for ethical research and not victimizing vulnerable populations or culpabe and incoherant populations. This stems from research that was criminal in the Tuskegee hospital on injecting black populations with syphilis or polio vaccines on inmates in other studies for some small reward or break from their punishment or lowered/free cost clinic for medical treatment. Any researcher knows this, especially if they are funded by government agencies. Also, due to the Nazi research done on Jewish victims during World War 2, the Nuremberg Code, was created, as well as later the Belmont report. “The Nuremberg Code states that”the voluntary consent of the human subject is absolutely essential" and it further explains the details implied by this requirement: capacity to consent, freedom from coercion, no penalty for withdrawal, and comprehension of the risks and benefits involved."-The Nuremberg Code, taken from a resource for getting certified in understanding compliance with human research experiments as part of my graduate research project this had to be completed. The agency who provided this, similar to HIPPA compliance for healthcare providers, is CITI.

precisionRecallAccuracy <- function(df){
  
 colnames(df) <- c('pred','type')
  df$pred <- as.character(paste(df$pred))
  df$type <- as.character(paste(df$type))
  
 classes <- unique(df$type)
 
 class1a <- as.character(paste(classes[1]))
 class2a <- as.character(paste(classes[2]))
 class3a <- as.character(paste(classes[3]))
 class4a <- as.character(paste(classes[4]))
 
  #correct classes
  class1 <- subset(df, df$type==class1a)
  class2 <- subset(df, df$type==class2a)
  class3 <- subset(df, df$type==class3a)
  class4 <- subset(df, df$type==class4a)
  
  #incorrect classes
  notClass1 <- subset(df,df$type != class1a)
  notClass2 <- subset(df,df$type != class2a)
  notClass3 <- subset(df,df$type != class3a)
  notClass4 <- subset(df, df$type != class4a)
  
  #true positives (real positives predicted positive)
  tp_1 <- sum(class1$pred==class1$type)
  tp_2 <- sum(class2$pred==class2$type)
  tp_3 <- sum(class3$pred==class3$type)
  tp_4 <- sum(class4$pred==class4$type)
  
  #false positives (real negatives predicted positive)
  fp_1 <- sum(notClass1$pred==class1a)
  fp_2 <- sum(notClass2$pred==class2a)
  fp_3 <- sum(notClass3$pred==class3a)
  fp_4 <- sum(notClass4$pred==class4a)
  
  #false negatives (real positive predicted negative)
  fn_1 <- sum(class1$pred!=class1$type)
  fn_2 <- sum(class2$pred!=class2$type)
  fn_3 <- sum(class3$pred!=class3$type)
  fn_4 <- sum(class4$pred!=class4$type)
  
  #true negatives (real negatives predicted negative)
  tn_1 <- sum(notClass1$pred!=class1a)
  tn_2 <- sum(notClass2$pred!=class2a)
  tn_3 <- sum(notClass3$pred!=class3a)
  tn_4 <- sum(notClass4$pred!=class4a)
  
  
  #precision
  p1 <- tp_1/(tp_1+fp_1)
  p2 <- tp_2/(tp_2+fp_2)
  p3 <- tp_3/(tp_3+fp_3)
  p4 <- tp_4/(tp_4+fp_4)
  
  p1 <- ifelse(p1=='NaN',0,p1)
  p2 <- ifelse(p2=='NaN',0,p2)
  p3 <- ifelse(p3=='NaN',0,p3)
  p4 <- ifelse(p4=='NaN',0,p4)
  
  #recall
  r1 <- tp_1/(tp_1+fn_1)
  r2 <- tp_2/(tp_2+fn_2)
  r3 <- tp_3/(tp_3+fn_3)
  r4 <- tp_4/(tp_4+fn_4)
  
  r1 <- ifelse(r1=='NaN',0,r1)
  r2 <- ifelse(r2=='NaN',0,r2)
  r3 <- ifelse(r3=='NaN',0,r3)
  r4 <- ifelse(r4=='NaN',0,r4)
  
  #accuracy
  ac1 <- (tp_1+tn_1)/(tp_1+tn_1+fp_1+fn_1)
  ac2 <- (tp_2+tn_2)/(tp_2+tn_2+fp_2+fn_2)
  ac3 <- (tp_3+tn_3)/(tp_3+tn_3+fp_3+fn_3)
  ac4 <- (tp_4+tn_4)/(tp_4+tn_4+fp_4+fn_4)
  
  table <- as.data.frame(rbind(c(class1a,p1,r1,ac1),
                         c(class2a,p2,r2,ac2),
                         c(class3a,p3,r3,ac3),
                         c(class4a,p4,r4,ac4)))
  
  colnames(table) <- c('class','precision','recall','accuracy')
  acc <- (sum(df$pred==df$type)/length(df$type))*100
  cat('accuracy is: ',as.character(paste(acc)),'%')
  return(table)
  
  
}

Lets start with the first group of genes using

library(e1071)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(gbm)
## Loaded gbm 2.1.5
library(RANN) #used in the tuning parameter of rf method of caret for 'oob' one out bag
## Warning: package 'RANN' was built under R version 3.6.3

Training/Testing 1:

set.seed(589647)
rfMod1 <- train(class~., method='rf', 
               na.action=na.pass,
               data=(training1),  preProc = c("center", "scale","medianImpute"),
               trControl=trainControl(method='oob'), number=5)
predRF1 <- predict(rfMod1, testing1)

predDF1 <- data.frame(predRF1, type=testing1$class)
predDF1
##     predRF1     type
## 1   healthy  healthy
## 2   1 month  healthy
## 3   1 month  healthy
## 4   healthy  healthy
## 5   healthy  healthy
## 6   1 month    acute
## 7     acute    acute
## 8     acute    acute
## 9   healthy    acute
## 10    acute    acute
## 11  1 month    acute
## 12  1 month    acute
## 13  1 month    acute
## 14  1 month    acute
## 15    acute    acute
## 16  healthy    acute
## 17    acute  1 month
## 18  1 month  1 month
## 19  healthy  1 month
## 20 6 months  1 month
## 21 6 months  1 month
## 22  1 month 6 months
## 23  1 month 6 months
## 24  1 month 6 months
pra1 <- precisionRecallAccuracy(predDF1)
## accuracy is:  33.3333333333333 %
pra1
##      class          precision            recall          accuracy
## 1  healthy                0.5               0.6 0.791666666666667
## 2    acute                0.8 0.363636363636364 0.666666666666667
## 3  1 month 0.0909090909090909               0.2 0.416666666666667
## 4 6 months                  0                 0 0.791666666666667

That set wasn’t so great. Lets run through the other 7 sets using the same format and compare the results at the end.

Training/Testing 2:

rfMod2 <- train(class~., method='rf', 
               na.action=na.pass,
               data=(training2),  preProc = c("center", "scale","medianImpute"),
               trControl=trainControl(method='oob'), number=5)
## note: only 1 unique complexity parameters in default grid. Truncating the grid to 1 .
predRF2 <- predict(rfMod2, testing2)

predDF2 <- data.frame(predRF2, type=testing2$class)
predDF2
##     predRF2     type
## 1     acute  healthy
## 2  6 months  healthy
## 3     acute  healthy
## 4   1 month  healthy
## 5   healthy  healthy
## 6     acute    acute
## 7   1 month    acute
## 8   1 month    acute
## 9   healthy    acute
## 10  1 month    acute
## 11  1 month    acute
## 12  1 month    acute
## 13  1 month    acute
## 14    acute    acute
## 15    acute    acute
## 16    acute    acute
## 17  healthy  1 month
## 18  healthy  1 month
## 19  1 month  1 month
## 20  1 month  1 month
## 21  1 month  1 month
## 22  1 month 6 months
## 23    acute 6 months
## 24    acute 6 months
pra2 <- precisionRecallAccuracy(predDF2)
## accuracy is:  33.3333333333333 %
pra2
##      class         precision            recall          accuracy
## 1  healthy              0.25               0.2 0.708333333333333
## 2    acute               0.5 0.363636363636364 0.541666666666667
## 3  1 month 0.272727272727273               0.6 0.583333333333333
## 4 6 months                 0                 0 0.833333333333333

Training/Testing 3:

rfMod3 <- train(class~., method='rf', 
               na.action=na.pass,
               data=(training3),  preProc = c("center", "scale","medianImpute"),
               trControl=trainControl(method='oob'), number=5)
predRF3 <- predict(rfMod3, testing3)

predDF3 <- data.frame(predRF3, type=testing3$class)
predDF3
##     predRF3     type
## 1     acute  healthy
## 2     acute  healthy
## 3     acute  healthy
## 4     acute  healthy
## 5     acute  healthy
## 6   1 month    acute
## 7   1 month    acute
## 8     acute    acute
## 9   healthy    acute
## 10    acute    acute
## 11  1 month    acute
## 12  1 month    acute
## 13    acute    acute
## 14  1 month    acute
## 15  healthy    acute
## 16  1 month    acute
## 17 6 months  1 month
## 18  1 month  1 month
## 19    acute  1 month
## 20  healthy  1 month
## 21  1 month  1 month
## 22    acute 6 months
## 23  1 month 6 months
## 24    acute 6 months
pra3 <- precisionRecallAccuracy(predDF3)
## accuracy is:  20.8333333333333 %
pra3
##      class         precision            recall          accuracy
## 1  healthy                 0                 0 0.666666666666667
## 2    acute 0.272727272727273 0.272727272727273 0.333333333333333
## 3  1 month 0.222222222222222               0.4 0.583333333333333
## 4 6 months                 0                 0 0.833333333333333

Training/Testing 4:

rfMod4 <- train(class~., method='rf', 
               na.action=na.pass,
               data=(training4),  preProc = c("center", "scale","medianImpute"),
               trControl=trainControl(method='oob'), number=5)
predRF4 <- predict(rfMod4, testing4)

predDF4 <- data.frame(predRF4, type=testing4$class)
predDF4
##     predRF4     type
## 1   1 month  healthy
## 2   1 month  healthy
## 3     acute  healthy
## 4     acute  healthy
## 5   healthy  healthy
## 6     acute    acute
## 7   1 month    acute
## 8   healthy    acute
## 9     acute    acute
## 10  1 month    acute
## 11  healthy    acute
## 12  1 month    acute
## 13    acute    acute
## 14  1 month    acute
## 15  healthy    acute
## 16 6 months    acute
## 17  1 month  1 month
## 18  1 month  1 month
## 19  1 month  1 month
## 20    acute  1 month
## 21    acute  1 month
## 22    acute 6 months
## 23  1 month 6 months
## 24  healthy 6 months
pra4 <- precisionRecallAccuracy(predDF4)
## accuracy is:  29.1666666666667 %
pra4
##      class precision            recall          accuracy
## 1  healthy       0.2               0.2 0.666666666666667
## 2    acute     0.375 0.272727272727273 0.458333333333333
## 3  1 month       0.3               0.6             0.625
## 4 6 months         0                 0 0.833333333333333

Training/Testing 5:

rfMod5 <- train(class~., method='rf', 
               na.action=na.pass,
               data=(training5),  preProc = c("center", "scale","medianImpute"),
               trControl=trainControl(method='oob'), number=5)
predRF5 <- predict(rfMod5, testing5)

predDF5 <- data.frame(predRF5, type=testing5$class)
predDF5
##     predRF5     type
## 1   1 month  healthy
## 2   healthy  healthy
## 3     acute  healthy
## 4     acute  healthy
## 5  6 months  healthy
## 6   1 month    acute
## 7     acute    acute
## 8     acute    acute
## 9   1 month    acute
## 10  1 month    acute
## 11  1 month    acute
## 12  1 month    acute
## 13  1 month    acute
## 14  1 month    acute
## 15    acute    acute
## 16 6 months    acute
## 17  1 month  1 month
## 18  1 month  1 month
## 19  healthy  1 month
## 20  healthy  1 month
## 21  healthy  1 month
## 22    acute 6 months
## 23    acute 6 months
## 24    acute 6 months
pra5 <- precisionRecallAccuracy(predDF5)
## accuracy is:  25 %
pra5
##      class precision            recall          accuracy
## 1  healthy      0.25               0.2 0.708333333333333
## 2    acute     0.375 0.272727272727273 0.458333333333333
## 3  1 month       0.2               0.4 0.541666666666667
## 4 6 months         0                 0 0.791666666666667

Training/Testing 6:

rfMod6 <- train(class~., method='rf', 
               na.action=na.pass,
               data=(training6),  preProc = c("center", "scale","medianImpute"),
               trControl=trainControl(method='oob'), number=5)
predRF6 <- predict(rfMod6, testing6)

predDF6 <- data.frame(predRF6, type=testing6$class)
predDF6
##     predRF6     type
## 1     acute  healthy
## 2   1 month  healthy
## 3   healthy  healthy
## 4     acute  healthy
## 5     acute  healthy
## 6   1 month    acute
## 7     acute    acute
## 8   1 month    acute
## 9  6 months    acute
## 10  1 month    acute
## 11  1 month    acute
## 12  1 month    acute
## 13  1 month    acute
## 14  1 month    acute
## 15  1 month    acute
## 16    acute    acute
## 17    acute  1 month
## 18  1 month  1 month
## 19  healthy  1 month
## 20  1 month  1 month
## 21  healthy  1 month
## 22  healthy 6 months
## 23  healthy 6 months
## 24  healthy 6 months
pra6 <- precisionRecallAccuracy(predDF6)
## accuracy is:  20.8333333333333 %
pra6
##      class         precision            recall          accuracy
## 1  healthy 0.166666666666667               0.2             0.625
## 2    acute 0.333333333333333 0.181818181818182 0.458333333333333
## 3  1 month 0.181818181818182               0.4               0.5
## 4 6 months                 0                 0 0.833333333333333

Training/Testing 7:

rfMod7 <- train(class~., method='rf', 
               na.action=na.pass,
               data=(training7),  preProc = c("center", "scale","medianImpute"),
               trControl=trainControl(method='oob'), number=5)
predRF7 <- predict(rfMod7, testing7)

predDF7 <- data.frame(predRF7, type=testing7$class)
predDF7
##     predRF7     type
## 1   1 month  healthy
## 2   1 month  healthy
## 3   healthy  healthy
## 4     acute  healthy
## 5   1 month  healthy
## 6   1 month    acute
## 7     acute    acute
## 8  6 months    acute
## 9  6 months    acute
## 10  1 month    acute
## 11  1 month    acute
## 12  1 month    acute
## 13  1 month    acute
## 14    acute    acute
## 15  healthy    acute
## 16  healthy    acute
## 17  1 month  1 month
## 18    acute  1 month
## 19    acute  1 month
## 20  healthy  1 month
## 21  1 month  1 month
## 22  1 month 6 months
## 23  1 month 6 months
## 24    acute 6 months
pra7 <- precisionRecallAccuracy(predDF1)
## accuracy is:  33.3333333333333 %
pra7
##      class          precision            recall          accuracy
## 1  healthy                0.5               0.6 0.791666666666667
## 2    acute                0.8 0.363636363636364 0.666666666666667
## 3  1 month 0.0909090909090909               0.2 0.416666666666667
## 4 6 months                  0                 0 0.791666666666667

Training/Testing 8:

rfMod8 <- train(class~., method='rf', 
               na.action=na.pass,
               data=(training8),  preProc = c("center", "scale","medianImpute"),
               trControl=trainControl(method='oob'), number=5)
predRF8 <- predict(rfMod8, testing8)

predDF8 <- data.frame(predRF8, type=testing8$class)
predDF8
##    predRF8     type
## 1  1 month  healthy
## 2  healthy  healthy
## 3    acute  healthy
## 4  1 month  healthy
## 5  healthy  healthy
## 6  1 month    acute
## 7    acute    acute
## 8    acute    acute
## 9  healthy    acute
## 10 1 month    acute
## 11 1 month    acute
## 12 1 month    acute
## 13 1 month    acute
## 14 1 month    acute
## 15   acute    acute
## 16 healthy    acute
## 17   acute  1 month
## 18 1 month  1 month
## 19 healthy  1 month
## 20 1 month  1 month
## 21 1 month  1 month
## 22 healthy 6 months
## 23   acute 6 months
## 24 1 month 6 months
pra8 <- precisionRecallAccuracy(predDF1)
## accuracy is:  33.3333333333333 %
pra8
##      class          precision            recall          accuracy
## 1  healthy                0.5               0.6 0.791666666666667
## 2    acute                0.8 0.363636363636364 0.666666666666667
## 3  1 month 0.0909090909090909               0.2 0.416666666666667
## 4 6 months                  0                 0 0.791666666666667

These are the groups by gene behaviors in fold change of diseased or treated median values compared to healthy median values:

Training/Testing split 1: Acute_md_DF Training/Testing split 2: HlL_DF Training/Testing split 3: Acute_mi_DF Training/Testing split 4: lhL_DF Training/Testing split 5: Hlh_DF Training/Testing split 6: hlH_DF Training/Testing split 7: lLh_DF Training/Testing split 8: Dance2

pra_all <- rbind(pra1,pra2,pra3,pra4,pra5,pra6,pra7,pra8)
pra_all$GroupMembership <- c(rep(1,4),
                             rep(2,4),
                             rep(3,4),
                             rep(4,4),
                             rep(5,4),
                             rep(6,4),
                             rep(7,4),
                             rep(8,4))
pra_all2 <- pra_all %>% group_by(class) %>% mutate(max=
                  ifelse(accuracy==max(as.numeric(paste(accuracy))),'max','not max'))
max <- subset(pra_all2, pra_all2$max=='max')
max
## # A tibble: 11 x 6
## # Groups:   class [4]
##    class    precision recall            accuracy          GroupMembership max  
##    <fct>    <fct>     <fct>             <fct>                       <dbl> <chr>
##  1 healthy  0.5       0.6               0.791666666666667               1 max  
##  2 acute    0.8       0.363636363636364 0.666666666666667               1 max  
##  3 6 months 0         0                 0.833333333333333               2 max  
##  4 6 months 0         0                 0.833333333333333               3 max  
##  5 1 month  0.3       0.6               0.625                           4 max  
##  6 6 months 0         0                 0.833333333333333               4 max  
##  7 6 months 0         0                 0.833333333333333               6 max  
##  8 healthy  0.5       0.6               0.791666666666667               7 max  
##  9 acute    0.8       0.363636363636364 0.666666666666667               7 max  
## 10 healthy  0.5       0.6               0.791666666666667               8 max  
## 11 acute    0.8       0.363636363636364 0.666666666666667               8 max

All groups, except group 5 (the highest, low, high genes for acute, 1 month, and 6 months as classes), had a score in the best accuracy of a class. None of the groups predicted a 6 month class, but they all scored 83% accuracy and 0% accuracy for precision and recall because of not selecting it as a class for groups 2,3,4, and 6. Groups 1 and 7 scored the same in accuracy, precision, and recall in predicting the healthy and acute classes. Group 8 scored the same as group 1 and 7 in the healthy class , but that was the only class group 8 predicted with the best in accuracy.Only Group 4 predicted the 1 month class the best with 30% precision and 60% recall for an accuracy of 63%. The overall accuracy was worse with this set of median samples as the best was 34% accuracy (Groups 1,2,7, and 8) and worst was 21% accuracy (Group 5).

We can’t remove any samples, not because we already removed 6 from the 86 total before getting the class median sample values for the fold change values, but because looking at the 40 genes and the samples within each class, there are many samples with different values differing from their neighbors dramatically or not. We can see if scaling the training set in the model training will help. It will center and scale the data for us with that step I commented out in the last model prediction runs.

Lets use the randomForest package and its randomForest() to tune our model and test our same 8 groups.

#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training1) <- gsub('-','',colnames(training1))
colnames(testing1) <- gsub('-','',colnames(testing1))
testing1$class <- as.factor(paste(testing1$class))
training1$class <- as.factor(paste(training1$class))
RF1 <- randomForest(class ~ ., data=training1, 
                    importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict1 <- predict(RF1,testing1)
predict1df <- data.frame(predict1, type=testing1$class)
predict1df
##                      predict1     type
## healthyControl_4      healthy  healthy
## healthyControl_14     1 month  healthy
## healthyControl_15     1 month  healthy
## healthyControl_16     healthy  healthy
## healthyControl_21     healthy  healthy
## acuteLymeDisease_1    1 month    acute
## acuteLymeDisease_2      acute    acute
## acuteLymeDisease_3      acute    acute
## acuteLymeDisease_5    healthy    acute
## acuteLymeDisease_8      acute    acute
## acuteLymeDisease_9    1 month    acute
## acuteLymeDisease_11   1 month    acute
## acuteLymeDisease_13   healthy    acute
## acuteLymeDisease_17   1 month    acute
## acuteLymeDisease_24     acute    acute
## acuteLymeDisease_26   healthy    acute
## Antibodies_1month_3     acute  1 month
## Antibodies_1month_8   1 month  1 month
## Antibodies_1month_10  healthy  1 month
## Antibodies_1month_23  healthy  1 month
## Antibodies_1month_27 6 months  1 month
## Antibodies_6months_2  1 month 6 months
## Antibodies_6months_4  1 month 6 months
## Antibodies_6months_5  1 month 6 months
PRA1 <- precisionRecallAccuracy(predict1df)
## accuracy is:  33.3333333333333 %
PRA1
##      class precision            recall          accuracy
## 1  healthy     0.375               0.6 0.708333333333333
## 2    acute       0.8 0.363636363636364 0.666666666666667
## 3  1 month       0.1               0.2 0.458333333333333
## 4 6 months         0                 0 0.833333333333333
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training2) <- gsub('-','',colnames(training2))
colnames(testing2) <- gsub('-','',colnames(testing2))
testing2$class <- as.factor(paste(testing2$class))
training2$class <- as.factor(paste(training2$class))
RF2 <- randomForest(class ~ ., data=training2, 
                    importance=TRUE, nodesize=2, ntree=400,mtry=3)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
predict2 <- predict(RF2,testing2)
predict2df <- data.frame(predict2, type=testing2$class)
predict2df
##                      predict2     type
## healthyControl_4      healthy  healthy
## healthyControl_14     healthy  healthy
## healthyControl_15       acute  healthy
## healthyControl_16     1 month  healthy
## healthyControl_21     healthy  healthy
## acuteLymeDisease_1      acute    acute
## acuteLymeDisease_2    1 month    acute
## acuteLymeDisease_3    1 month    acute
## acuteLymeDisease_5    healthy    acute
## acuteLymeDisease_8    1 month    acute
## acuteLymeDisease_9    healthy    acute
## acuteLymeDisease_11   1 month    acute
## acuteLymeDisease_13   1 month    acute
## acuteLymeDisease_17     acute    acute
## acuteLymeDisease_24     acute    acute
## acuteLymeDisease_26     acute    acute
## Antibodies_1month_3   healthy  1 month
## Antibodies_1month_8   healthy  1 month
## Antibodies_1month_10  1 month  1 month
## Antibodies_1month_23  1 month  1 month
## Antibodies_1month_27  1 month  1 month
## Antibodies_6months_2  1 month 6 months
## Antibodies_6months_4    acute 6 months
## Antibodies_6months_5    acute 6 months
PRA2 <- precisionRecallAccuracy(predict2df)
## accuracy is:  41.6666666666667 %
PRA2
##      class         precision            recall          accuracy
## 1  healthy 0.428571428571429               0.6              0.75
## 2    acute 0.571428571428571 0.363636363636364 0.583333333333333
## 3  1 month               0.3               0.6             0.625
## 4 6 months                 0                 0             0.875
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training3) <- gsub('-','',colnames(training3))
colnames(testing3) <- gsub('-','',colnames(testing3))
testing3$class <- as.factor(paste(testing3$class))
training3$class <- as.factor(paste(training3$class))
RF3 <- randomForest(class ~ ., data=training3, 
                    importance=TRUE, nodesize=2, ntree=400,mtry=3)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
predict3 <- predict(RF3,testing3)
predict3df <- data.frame(predict3, type=testing3$class)
predict3df
##                      predict3     type
## healthyControl_4      1 month  healthy
## healthyControl_14       acute  healthy
## healthyControl_15       acute  healthy
## healthyControl_16       acute  healthy
## healthyControl_21       acute  healthy
## acuteLymeDisease_1    1 month    acute
## acuteLymeDisease_2    1 month    acute
## acuteLymeDisease_3      acute    acute
## acuteLymeDisease_5    healthy    acute
## acuteLymeDisease_8      acute    acute
## acuteLymeDisease_9    1 month    acute
## acuteLymeDisease_11   1 month    acute
## acuteLymeDisease_13     acute    acute
## acuteLymeDisease_17   1 month    acute
## acuteLymeDisease_24   healthy    acute
## acuteLymeDisease_26   1 month    acute
## Antibodies_1month_3  6 months  1 month
## Antibodies_1month_8   1 month  1 month
## Antibodies_1month_10    acute  1 month
## Antibodies_1month_23  healthy  1 month
## Antibodies_1month_27  1 month  1 month
## Antibodies_6months_2    acute 6 months
## Antibodies_6months_4  1 month 6 months
## Antibodies_6months_5    acute 6 months
PRA3 <- precisionRecallAccuracy(predict3df)
## accuracy is:  20.8333333333333 %
PRA3
##      class precision            recall          accuracy
## 1  healthy         0                 0 0.666666666666667
## 2    acute       0.3 0.272727272727273             0.375
## 3  1 month       0.2               0.4 0.541666666666667
## 4 6 months         0                 0 0.833333333333333
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training4) <- gsub('-','',colnames(training4))
colnames(testing4) <- gsub('-','',colnames(testing4))
testing4$class <- as.factor(paste(testing4$class))
training4$class <- as.factor(paste(training4$class))
RF4 <- randomForest(class ~ ., data=training4, 
                    importance=TRUE, nodesize=2, ntree=400,mtry=3)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
predict4 <- predict(RF4,testing4)
predict4df <- data.frame(predict4, type=testing4$class)
predict4df
##                      predict4     type
## healthyControl_4      1 month  healthy
## healthyControl_14     1 month  healthy
## healthyControl_15       acute  healthy
## healthyControl_16       acute  healthy
## healthyControl_21     healthy  healthy
## acuteLymeDisease_1      acute    acute
## acuteLymeDisease_2    1 month    acute
## acuteLymeDisease_3    1 month    acute
## acuteLymeDisease_5      acute    acute
## acuteLymeDisease_8    1 month    acute
## acuteLymeDisease_9    healthy    acute
## acuteLymeDisease_11   1 month    acute
## acuteLymeDisease_13     acute    acute
## acuteLymeDisease_17   1 month    acute
## acuteLymeDisease_24   1 month    acute
## acuteLymeDisease_26  6 months    acute
## Antibodies_1month_3   1 month  1 month
## Antibodies_1month_8   1 month  1 month
## Antibodies_1month_10  1 month  1 month
## Antibodies_1month_23    acute  1 month
## Antibodies_1month_27    acute  1 month
## Antibodies_6months_2    acute 6 months
## Antibodies_6months_4  1 month 6 months
## Antibodies_6months_5  healthy 6 months
PRA4 <- precisionRecallAccuracy(predict4df)
## accuracy is:  29.1666666666667 %
PRA4
##      class         precision            recall          accuracy
## 1  healthy 0.333333333333333               0.2              0.75
## 2    acute             0.375 0.272727272727273 0.458333333333333
## 3  1 month              0.25               0.6 0.541666666666667
## 4 6 months                 0                 0 0.833333333333333
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training5) <- gsub('-','',colnames(training5))
colnames(testing5) <- gsub('-','',colnames(testing5))
testing5$class <- as.factor(paste(testing5$class))
training5$class <- as.factor(paste(training5$class))
RF5 <- randomForest(class ~ ., data=training5, 
                    importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict5 <- predict(RF5,testing5)
predict5df <- data.frame(predict5, type=testing5$class)
predict5df
##                      predict5     type
## healthyControl_4      1 month  healthy
## healthyControl_14     healthy  healthy
## healthyControl_15       acute  healthy
## healthyControl_16       acute  healthy
## healthyControl_21    6 months  healthy
## acuteLymeDisease_1    1 month    acute
## acuteLymeDisease_2      acute    acute
## acuteLymeDisease_3      acute    acute
## acuteLymeDisease_5    1 month    acute
## acuteLymeDisease_8    1 month    acute
## acuteLymeDisease_9    1 month    acute
## acuteLymeDisease_11   1 month    acute
## acuteLymeDisease_13   1 month    acute
## acuteLymeDisease_17   1 month    acute
## acuteLymeDisease_24     acute    acute
## acuteLymeDisease_26  6 months    acute
## Antibodies_1month_3   1 month  1 month
## Antibodies_1month_8   1 month  1 month
## Antibodies_1month_10  healthy  1 month
## Antibodies_1month_23  healthy  1 month
## Antibodies_1month_27  1 month  1 month
## Antibodies_6months_2    acute 6 months
## Antibodies_6months_4    acute 6 months
## Antibodies_6months_5    acute 6 months
PRA5 <- precisionRecallAccuracy(predict5df)
## accuracy is:  29.1666666666667 %
PRA5
##      class         precision            recall          accuracy
## 1  healthy 0.333333333333333               0.2              0.75
## 2    acute             0.375 0.272727272727273 0.458333333333333
## 3  1 month 0.272727272727273               0.6 0.583333333333333
## 4 6 months                 0                 0 0.791666666666667
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training6) <- gsub('-','',colnames(training6))
colnames(testing6) <- gsub('-','',colnames(testing6))
testing6$class <- as.factor(paste(testing6$class))
training6$class <- as.factor(paste(training6$class))
RF6 <- randomForest(class ~ ., data=training6, 
                    importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict6 <- predict(RF6,testing6)
predict6df <- data.frame(predict6, type=testing6$class)
predict6df
##                      predict6     type
## healthyControl_4        acute  healthy
## healthyControl_14     1 month  healthy
## healthyControl_15     healthy  healthy
## healthyControl_16       acute  healthy
## healthyControl_21       acute  healthy
## acuteLymeDisease_1    1 month    acute
## acuteLymeDisease_2      acute    acute
## acuteLymeDisease_3    1 month    acute
## acuteLymeDisease_5   6 months    acute
## acuteLymeDisease_8      acute    acute
## acuteLymeDisease_9    1 month    acute
## acuteLymeDisease_11   1 month    acute
## acuteLymeDisease_13   1 month    acute
## acuteLymeDisease_17   1 month    acute
## acuteLymeDisease_24   1 month    acute
## acuteLymeDisease_26     acute    acute
## Antibodies_1month_3     acute  1 month
## Antibodies_1month_8   1 month  1 month
## Antibodies_1month_10  healthy  1 month
## Antibodies_1month_23  1 month  1 month
## Antibodies_1month_27  healthy  1 month
## Antibodies_6months_2  healthy 6 months
## Antibodies_6months_4    acute 6 months
## Antibodies_6months_5  healthy 6 months
PRA6 <- precisionRecallAccuracy(predict6df)
## accuracy is:  25 %
PRA6
##      class precision            recall          accuracy
## 1  healthy       0.2               0.2 0.666666666666667
## 2    acute     0.375 0.272727272727273 0.458333333333333
## 3  1 month       0.2               0.4 0.541666666666667
## 4 6 months         0                 0 0.833333333333333
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training7) <- gsub('-','',colnames(training7))
colnames(testing7) <- gsub('-','',colnames(testing7))
testing7$class <- as.factor(paste(testing7$class))
training7$class <- as.factor(paste(training7$class))
RF7 <- randomForest(class ~ ., data=training7, 
                    importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict7 <- predict(RF7,testing7)
predict7df <- data.frame(predict7, type=testing7$class)
predict7df
##                      predict7     type
## healthyControl_4      1 month  healthy
## healthyControl_14     1 month  healthy
## healthyControl_15     healthy  healthy
## healthyControl_16     healthy  healthy
## healthyControl_21     1 month  healthy
## acuteLymeDisease_1    1 month    acute
## acuteLymeDisease_2      acute    acute
## acuteLymeDisease_3      acute    acute
## acuteLymeDisease_5   6 months    acute
## acuteLymeDisease_8    1 month    acute
## acuteLymeDisease_9    1 month    acute
## acuteLymeDisease_11   1 month    acute
## acuteLymeDisease_13   1 month    acute
## acuteLymeDisease_17     acute    acute
## acuteLymeDisease_24   healthy    acute
## acuteLymeDisease_26   healthy    acute
## Antibodies_1month_3   1 month  1 month
## Antibodies_1month_8     acute  1 month
## Antibodies_1month_10    acute  1 month
## Antibodies_1month_23  healthy  1 month
## Antibodies_1month_27 6 months  1 month
## Antibodies_6months_2  1 month 6 months
## Antibodies_6months_4  1 month 6 months
## Antibodies_6months_5    acute 6 months
PRA7 <- precisionRecallAccuracy(predict7df)
## accuracy is:  25 %
PRA7
##      class          precision            recall          accuracy
## 1  healthy                0.4               0.4              0.75
## 2    acute                0.5 0.272727272727273 0.541666666666667
## 3  1 month 0.0909090909090909               0.2 0.416666666666667
## 4 6 months                  0                 0 0.791666666666667
#an error with hyphen in HLA-DRB4, so we will omit it in the testing and training set
set.seed(4567)
colnames(training8) <- gsub('-','',colnames(training8))
colnames(testing8) <- gsub('-','',colnames(testing8))
testing8$class <- as.factor(paste(testing8$class))
training8$class <- as.factor(paste(training8$class))
RF8 <- randomForest(class ~ ., data=training8, 
                    importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict8 <- predict(RF8,testing8)
predict8df <- data.frame(predict8, type=testing8$class)
predict8df
##                      predict8     type
## healthyControl_4      1 month  healthy
## healthyControl_14     healthy  healthy
## healthyControl_15       acute  healthy
## healthyControl_16     1 month  healthy
## healthyControl_21     healthy  healthy
## acuteLymeDisease_1      acute    acute
## acuteLymeDisease_2      acute    acute
## acuteLymeDisease_3      acute    acute
## acuteLymeDisease_5    healthy    acute
## acuteLymeDisease_8    1 month    acute
## acuteLymeDisease_9    1 month    acute
## acuteLymeDisease_11   1 month    acute
## acuteLymeDisease_13   1 month    acute
## acuteLymeDisease_17   1 month    acute
## acuteLymeDisease_24     acute    acute
## acuteLymeDisease_26   healthy    acute
## Antibodies_1month_3   1 month  1 month
## Antibodies_1month_8   1 month  1 month
## Antibodies_1month_10  healthy  1 month
## Antibodies_1month_23  healthy  1 month
## Antibodies_1month_27  1 month  1 month
## Antibodies_6months_2  healthy 6 months
## Antibodies_6months_4    acute 6 months
## Antibodies_6months_5  1 month 6 months
PRA8 <- precisionRecallAccuracy(predict8df)
## accuracy is:  37.5 %
PRA8
##      class         precision            recall          accuracy
## 1  healthy 0.285714285714286               0.4 0.666666666666667
## 2    acute 0.666666666666667 0.363636363636364             0.625
## 3  1 month 0.272727272727273               0.6 0.583333333333333
## 4 6 months                 0                 0             0.875
PRA_all <- rbind(PRA1,PRA2,PRA3,PRA4,PRA5,PRA6,PRA7,PRA8)
PRA_all$groupMembership <- c(rep(1,4),
                             rep(2,4),
                             rep(3,4),
                             rep(4,4),
                             rep(5,4),
                             rep(6,4),
                             rep(7,4),
                             rep(8,4))
PRA_all2 <- PRA_all %>% group_by(class) %>% mutate(max=
    ifelse(accuracy==max(as.numeric(paste(accuracy))),'max','not max'))
max2 <- subset(PRA_all2, PRA_all2$max=='max')
max2
## # A tibble: 8 x 6
## # Groups:   class [4]
##   class    precision        recall         accuracy        groupMembership max  
##   <fct>    <fct>            <fct>          <fct>                     <dbl> <chr>
## 1 acute    0.8              0.36363636363~ 0.666666666666~               1 max  
## 2 healthy  0.4285714285714~ 0.6            0.75                          2 max  
## 3 1 month  0.3              0.6            0.625                         2 max  
## 4 6 months 0                0              0.875                         2 max  
## 5 healthy  0.3333333333333~ 0.2            0.75                          4 max  
## 6 healthy  0.3333333333333~ 0.2            0.75                          5 max  
## 7 healthy  0.4              0.4            0.75                          7 max  
## 8 6 months 0                0              0.875                         8 max

We see that there are less groups that got the best class prediction accuracy than when we didn’t scale or center the data, and that group 2 predicted the healthy class with the same accuracy as groups 4, 5, and 7, but with both a higher precision and recall. Group 2 also predicted the 1 month and 6 month classes with the best accuracy. The 6 month class was still not used by any group to classify any testing samples in our random forest model. but not selecting the 6 month group gave an 88% accuracy, but both group 8 and 2 scored 0% for recall and precision for that. Only Group 1 predicted the acute class with the max accuracy of 67% having a precision of 80% (classified about 4/5 acute samples) and a recall of 37% (some irrelevant records classified as acute). The range of accuracy overall was 21%-42%. Group 2 scored the highest accuracy in prediction, with Group 3 scoring the lowest with this random forest model.

At this point lets take those genes that are in the best predicting classes of gene groups and use it as one data frame. Groups 1,2,4, and 7 are the better gene groups. Lets combine those into one group.

Training1247 <- training8[,colnames(training8) %in% c(Acute_md,HlL_DF,lhL_DF,lLh_DF)]
Training1247$class <- training8$class
Testing1247 <-   testing8[,colnames(training8) %in% c(Acute_md,HlL_DF,lhL_DF,lLh_DF)]
Testing1247$class <- testing8$class
colnames(Training1247)
##  [1] "BPI"        "CA1"        "CAMP"       "CEACAM8"    "CTSG"      
##  [6] "DDX3Y"      "DEFA4"      "EIF1AY"     "EPB42"      "HBA1"      
## [11] "HBD"        "HEMGN"      "HIST1H1C"   "HIST2H2AA3" "HSPA1B"    
## [16] "KDM5D"      "KIR2DS1"    "LCN2"       "LILRA3"     "LTF"       
## [21] "SLC25A37"   "ZNF815"     "class"
colnames(Testing1247)
##  [1] "BPI"        "CA1"        "CAMP"       "CEACAM8"    "CTSG"      
##  [6] "DDX3Y"      "DEFA4"      "EIF1AY"     "EPB42"      "HBA1"      
## [11] "HBD"        "HEMGN"      "HIST1H1C"   "HIST2H2AA3" "HSPA1B"    
## [16] "KDM5D"      "KIR2DS1"    "LCN2"       "LILRA3"     "LTF"       
## [21] "SLC25A37"   "ZNF815"     "class"
colnames(Testing1247)==colnames(Training1247)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

The same genes are in each of our better genes data frame for the testing and training sets. Now lets see how the last model does on prediction with these genes.

RF1247 <- randomForest(class ~ ., data=Training1247, 
                    importance=TRUE, nodesize=2, ntree=400,mtry=3)
predict1247 <- predict(RF1247,Testing1247)
predict1247df <- data.frame(predict1247, type=Testing1247$class)
predict1247df
##                      predict1247     type
## healthyControl_4         healthy  healthy
## healthyControl_14        1 month  healthy
## healthyControl_15        1 month  healthy
## healthyControl_16        healthy  healthy
## healthyControl_21        healthy  healthy
## acuteLymeDisease_1         acute    acute
## acuteLymeDisease_2         acute    acute
## acuteLymeDisease_3         acute    acute
## acuteLymeDisease_5       healthy    acute
## acuteLymeDisease_8         acute    acute
## acuteLymeDisease_9       1 month    acute
## acuteLymeDisease_11      1 month    acute
## acuteLymeDisease_13      1 month    acute
## acuteLymeDisease_17      1 month    acute
## acuteLymeDisease_24        acute    acute
## acuteLymeDisease_26      healthy    acute
## Antibodies_1month_3        acute  1 month
## Antibodies_1month_8      1 month  1 month
## Antibodies_1month_10     healthy  1 month
## Antibodies_1month_23    6 months  1 month
## Antibodies_1month_27    6 months  1 month
## Antibodies_6months_2     1 month 6 months
## Antibodies_6months_4     1 month 6 months
## Antibodies_6months_5     1 month 6 months
PRA1247 <- precisionRecallAccuracy(predict1247df)
## accuracy is:  37.5 %
PRA1247
##      class         precision            recall          accuracy
## 1  healthy               0.5               0.6 0.791666666666667
## 2    acute 0.833333333333333 0.454545454545455 0.708333333333333
## 3  1 month               0.1               0.2 0.458333333333333
## 4 6 months                 0                 0 0.791666666666667

The overall accuracy was 37.5% or about 9/24 correct for four classes. So that’s better than 25% or 1/4 chances of predicting accurately. The accuracy for each of the classes was 79% for the healthy class, 71% for the acute class, 46% for the 1 month class, and 79% for the 6 month class. With the highest precision in predicting the acute class, and highest recall in predicting the healthy class. It predicted more healthy classes correctly and only misclassified 40% for a recall of 60%, and predicted 83% of the acute classes only missing 17% of the other acute classes in the testing set.

At this point I don’t think tuning the model will allow it to perform better. We can use other algorithms on the entire data or this set.

knnMod0 <- train(class ~ .,
                method='knn', preProcess=c('center','scale'),
                tuneLength=10, trControl=trainControl(method='adaptive_cv'),
                data=Training1247)
predKNN0 <- predict(knnMod0, Testing1247)
dfKNN0 <- data.frame(KNN=predKNN0, class=Testing1247$class)
dfKNN0
##         KNN    class
## 1   1 month  healthy
## 2   healthy  healthy
## 3   healthy  healthy
## 4   healthy  healthy
## 5   healthy  healthy
## 6   1 month    acute
## 7   1 month    acute
## 8     acute    acute
## 9     acute    acute
## 10 6 months    acute
## 11  1 month    acute
## 12    acute    acute
## 13 6 months    acute
## 14  1 month    acute
## 15    acute    acute
## 16  healthy    acute
## 17  1 month  1 month
## 18    acute  1 month
## 19  healthy  1 month
## 20 6 months  1 month
## 21 6 months  1 month
## 22  1 month 6 months
## 23  1 month 6 months
## 24  1 month 6 months
precisionRecallAccuracy(dfKNN0)
## accuracy is:  37.5 %
##      class         precision            recall          accuracy
## 1  healthy 0.666666666666667               0.8             0.875
## 2    acute               0.8 0.363636363636364 0.666666666666667
## 3  1 month 0.111111111111111               0.2               0.5
## 4 6 months                 0                 0 0.708333333333333

The K-Nearest Neighbor algorithm also scored 37.5%, with the best accuracy in prediction for the healthy class, with 87.5% accuracy, 67% precision, and 80% recall. The acute class scored 67% accuracy but had an 80% precision and 37% recall.

rpartMod0 <- train(class ~ ., method='rpart', tuneLength=7, data=Training1247) 
predRPART0 <- predict(rpartMod0, Testing1247)
dfRPART0 <- data.frame(Rpart=predRPART0, class=Testing1247$class)
dfRPART0

precisionRecallAccuracy(dfRPART0)

The rpart algorithm only scored 21% accuracy overall, and didn’t predict any of the healthy,acute, or 6 month classes correctly as the precision and recall are both 0% for those clases, but the accuracy was 79% for the healthy class, 54% for the acute class, and 88% for the 6 months class. This model scored 21% precision on the 1 month class and 100% recall for the same class. that is less than the probability of 1/4 correct or 25% in accuracy overall. This model would not be a good fit for this data or could be tuned. Doubtful it will improve much or any other model. The underlying data seems to be bad. These genes aren’t able to predict out of all these classes in overall accuracy better than 42% accuracy using the median fold change values to gather our genes. We could try separating the classes and seeing in pairs if the models will predict better accuracy. If we exclude the antibiotic treatment classes, can we get good prediction accuracy?


  1. y*(max(y)-min(y))+min(y)