adding Lyme Disease Top Genes Revisited Normalized no zeros

Revisiting the Lyme disease data: last time we normalized it to remove the negative values, and the top genes in that data weren’t even in the database of Lyme disease, because the merge removed those top genes. So, we are just explaining that the normalization flipped the expression dynamics: on the resulting range of 0 to 1, the lowest values are actually the highest values and vice versa.

So, we are going to pull those genes up now from the last run of the Lyme disease normalization.

library(rmarkdown)
library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

Lyme disease link

# Load the normalized Lyme disease expression table from the working directory.
data <- read.csv("LymeDiseaseNormalizedFCsMeansAdded.csv", header = TRUE)

# Preview the first ten rows; the full table is 19,526 x 95.
paged_table(data[seq_len(10), ])

# List every column name of the table.
names(data)

##  [1] "Gene"                      "healthyControl_1"         
##  [3] "healthyControl_2"          "healthyControl_3"         
##  [5] "healthyControl_4"          "healthyControl_5"         
##  [7] "healthyControl_6"          "healthyControl_7"         
##  [9] "healthyControl_8"          "healthyControl_9"         
## [11] "healthyControl_10"         "healthyControl_11"        
## [13] "healthyControl_12"         "healthyControl_13"        
## [15] "healthyControl_14"         "healthyControl_15"        
## [17] "healthyControl_16"         "healthyControl_17"        
## [19] "healthyControl_18"         "healthyControl_19"        
## [21] "healthyControl_20"         "healthyControl_21"        
## [23] "acuteLymeDisease_1"        "acuteLymeDisease_2"       
## [25] "acuteLymeDisease_3"        "acuteLymeDisease_4"       
## [27] "acuteLymeDisease_5"        "acuteLymeDisease_6"       
## [29] "acuteLymeDisease_7"        "acuteLymeDisease_8"       
## [31] "acuteLymeDisease_9"        "acuteLymeDisease_10"      
## [33] "acuteLymeDisease_11"       "acuteLymeDisease_12"      
## [35] "acuteLymeDisease_13"       "acuteLymeDisease_14"      
## [37] "acuteLymeDisease_15"       "acuteLymeDisease_16"      
## [39] "acuteLymeDisease_17"       "acuteLymeDisease_18"      
## [41] "acuteLymeDisease_19"       "acuteLymeDisease_20"      
## [43] "acuteLymeDisease_21"       "acuteLymeDisease_22"      
## [45] "acuteLymeDisease_23"       "acuteLymeDisease_24"      
## [47] "acuteLymeDisease_25"       "acuteLymeDisease_26"      
## [49] "acuteLymeDisease_27"       "acuteLymeDisease_28"      
## [51] "Antibodies_1month_1"       "Antibodies_1month_2"      
## [53] "Antibodies_1month_3"       "Antibodies_1month_4"      
## [55] "Antibodies_1month_5"       "Antibodies_1month_6"      
## [57] "Antibodies_1month_7"       "Antibodies_1month_8"      
## [59] "Antibodies_1month_9"       "Antibodies_1month_10"     
## [61] "Antibodies_1month_11"      "Antibodies_1month_12"     
## [63] "Antibodies_1month_13"      "Antibodies_1month_14"     
## [65] "Antibodies_1month_15"      "Antibodies_1month_16"     
## [67] "Antibodies_1month_17"      "Antibodies_1month_18"     
## [69] "Antibodies_1month_19"      "Antibodies_1month_20"     
## [71] "Antibodies_1month_21"      "Antibodies_1month_22"     
## [73] "Antibodies_1month_23"      "Antibodies_1month_24"     
## [75] "Antibodies_1month_25"      "Antibodies_1month_26"     
## [77] "Antibodies_1month_27"      "Antibodies_6months_1"     
## [79] "Antibodies_6months_2"      "Antibodies_6months_3"     
## [81] "Antibodies_6months_4"      "Antibodies_6months_5"     
## [83] "Antibodies_6months_6"      "Antibodies_6months_7"     
## [85] "Antibodies_6months_8"      "Antibodies_6months_9"     
## [87] "Antibodies_6months_10"     "healthy_mean"             
## [89] "acute_mean"                "month1_mean"              
## [91] "month6_mean"               "foldchange_acute_healthy" 
## [93] "foldchange_1month_healthy" "foldchange_6month_healthy"
## [95] "foldchange_6month_acute"

Now we are going to order by the foldchange value for each of acute, 1 month of antibodies, and 6 months of antibodies, and take the top ten stimulated and top ten inhibited genes.

# Column positions of each sample group. The search is limited to the first
# 87 columns so the summary columns after them (for example "healthy_mean"
# and "acute_mean") are not picked up.
acute <- which(grepl("acute", names(data)[1:87]))
healthy <- which(grepl("healthy", names(data)[1:87]))
month1 <- which(grepl("1month", names(data)[1:87]))
month6 <- which(grepl("6month", names(data)[1:87]))

# Print the column positions matched for the acute samples.
acute

##  [1] 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
## [26] 48 49 50

There are 28 acute samples.

# Print the column positions matched for the healthy samples.
healthy

##  [1]  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22

There are 21 healthy samples.

# Print the column positions matched for the 1-month samples.
month1

##  [1] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## [26] 76 77

There are 27 one month antibodies developed samples.

# Print the column positions matched for the 6-month samples.
month6

##  [1] 78 79 80 81 82 83 84 85 86 87

There are only 10 six months antibodies developed samples.

# Build one class label per sample from the column positions found by grep()
# instead of hand-typed index ranges, so the labels cannot drift out of step
# with the columns. Sample columns 2:87 of `data` become rows 1:86 after
# transposing, hence the "- 1" offset.
# NOTE: the name `class` masks base::class() for variable lookup only; it is
# kept because later chunks call as.factor(class).
class <- character(86)
class[healthy - 1] <- 'Healthy'
class[acute - 1] <- 'acute'
class[month1 - 1] <- '1 month infected'
class[month6 - 1] <- '6 months infected'

# Fail loudly if any sample column matched none of the four patterns.
stopifnot(all(nzchar(class)))

table(class)

## class
##  1 month infected 6 months infected             acute           Healthy 
##                27                10                28                21

There are 86 samples total.

Now, we are going to order by foldchange values and get the top genes in each class.

# Rank genes by each fold change (largest first) and keep the ten highest and
# ten lowest rows. The bottom rows are located from nrow(data) rather than the
# hard-coded 19517:19526, so the selection stays correct if the gene count
# changes.
# NOTE(review): order() places NA fold changes last, so any NA rows would land
# in the "bottom ten" -- confirm the fold change columns contain no NAs.
topAndBottom10 <- c(1:10, (nrow(data) - 9):nrow(data))

acuteordered <- data[order(data$foldchange_acute_healthy, decreasing = TRUE), ]
acute20 <- acuteordered[topAndBottom10, ]

month1Ordered <- data[order(data$foldchange_1month_healthy, decreasing = TRUE), ]
month1_20 <- month1Ordered[topAndBottom10, ]

month6Ordered <- data[order(data$foldchange_6month_healthy, decreasing = TRUE), ]
month6_20 <- month6Ordered[topAndBottom10, ]

# For each comparison keep the gene name, the 86 sample columns and the one
# fold change column used for ranking (renamed to a common "foldchange"), then
# tag every row with a description of the comparison it came from.

# Acute vs healthy (fold change is column 92).
acuteDB <- acute20[, c(1:87, 92)]
names(acuteDB)[88] <- "foldchange"
acuteDB[["topGeneSource"]] <- "acute over healthy mean values for foldchange top genes Lyme Disease with 28 acute and 21 healthy samples"

# One month vs healthy (fold change is column 93).
month1DB <- month1_20[, c(1:87, 93)]
names(month1DB)[88] <- "foldchange"
month1DB[["topGeneSource"]] <- "One month of antibodies over healthy  mean values for comparison foldchange values for top genes in Lyme Disease with 27 1 month antibodies samples and 21 healthy samples"

# Six months vs healthy (fold change is column 94).
month6DB <- month6_20[, c(1:87, 94)]
names(month6DB)[88] <- "foldchange"
month6DB[["topGeneSource"]] <- "Six months of antibodies over healthy mean values for comparison foldchange values of top genes in Lyme disease with 10 samples of 6 months antibodies and 21 healthy samples. "

# Stack the three 20-row tables into one 60-row table.
topGenes60 <- rbind(acuteDB, month1DB, month6DB)

paged_table(topGenes60)

Let’s see if there are any duplicate genes.

# Keep only the first row seen for each gene name.
topGenes60_b <- topGenes60[match(unique(topGenes60$Gene), topGenes60$Gene), ]

# Rows x columns after removing the repeated genes.
dim(topGenes60_b)

## [1] 52 89

# Save the full 60-row table (repeated genes included) to the working directory.
write.csv(topGenes60, file = "topGenesLyme60duplicates.csv", row.names = FALSE)

Now, for the matrix to test these genes, we will use the topGenes60_b dataset so that no gene appears more than once as a feature in our matrix, because we need to avoid multicollinearity in machine learning.

# Gene name plus the 86 sample columns for every unique top gene.
data52 <- data[data$Gene %in% topGenes60_b$Gene, 1:87]

# Transpose so that rows are samples and columns are genes, name the columns
# after the genes, and attach the class label of each sample.
data52_mx <- data.frame(t(data52[, -1]))
names(data52_mx) <- data52[["Gene"]]
data52_mx[["class"]] <- factor(class)

paged_table(data52_mx)

# Fixed seed so the random split below is repeatable.
set.seed(456)

# Draw 80% of the 86 samples for training (the fractional size 68.8 gives 68
# training rows in the recorded output); the rest form the hold-out set.
inTrain <- sample(seq_len(86), size = 0.8 * 86)

training <- data52_mx[inTrain, ]
testing <- data52_mx[-inTrain, ]

# Class counts in the training rows.
table(training[["class"]])

## 
##  1 month infected 6 months infected             acute           Healthy 
##                23                 9                20                16

# Class counts in the held-out rows.
table(testing$class)

## 
##  1 month infected 6 months infected             acute           Healthy 
##                 4                 1                 8                 5

# Fit a random forest on the gene columns to predict the sample class.
# `confusion=T` was dropped: it is not a randomForest() argument, so it was
# silently absorbed by `...`; the out-of-bag confusion matrix is stored in
# rf$confusion for every classification forest.
# Predictors are picked by name (everything except `class`) instead of the
# positional 1:52, so the call stays right if the gene count changes.
rf <- randomForest(training[setdiff(names(training), "class")],
                   training$class, mtry = 16, ntree = 5000)

# Out-of-bag confusion matrix with per-class error.
rf$confusion

##                   1 month infected 6 months infected acute Healthy class.error
## 1 month infected                12                 0     8       3   0.4782609
## 6 months infected                0                 6     1       2   0.3333333
## acute                            7                 2    10       1   0.5000000
## Healthy                          1                 1     2      12   0.2500000

Let’s see how well it predicts on the 20% hold-out validation dataset.

# Predict the class of every held-out sample with the 52-gene forest.
predicted <- predict(rf, newdata = testing)

# Side-by-side table of predicted and true labels, one row per sample.
results <- data.frame(predicted = predicted, actual = testing[["class"]])
results

##                              predicted            actual
## healthyControl_3               Healthy           Healthy
## healthyControl_7               Healthy           Healthy
## healthyControl_9               Healthy           Healthy
## healthyControl_18              Healthy           Healthy
## healthyControl_21              Healthy           Healthy
## acuteLymeDisease_1               acute             acute
## acuteLymeDisease_5             Healthy             acute
## acuteLymeDisease_8               acute             acute
## acuteLymeDisease_11              acute             acute
## acuteLymeDisease_15   1 month infected             acute
## acuteLymeDisease_16   1 month infected             acute
## acuteLymeDisease_21              acute             acute
## acuteLymeDisease_24            Healthy             acute
## Antibodies_1month_4            Healthy  1 month infected
## Antibodies_1month_6              acute  1 month infected
## Antibodies_1month_14  1 month infected  1 month infected
## Antibodies_1month_26           Healthy  1 month infected
## Antibodies_6months_1 6 months infected 6 months infected

All healthy and 6 months infected samples were predicted correctly while the acute class only had 4/8 correctly predicted as acute, and the 1 month infected class had only 1/4 predicted correctly.

So, if we go back and look at those genes that were duplicated, we could possibly find a better set of target genes in these samples for predicting Lyme disease. Let’s see.

# Rows whose gene name already appeared earlier in the 60-row table.
duplicates <- subset(topGenes60, duplicated(Gene))
duplicates[["Gene"]]

## [1] "UHMK1"    "BCL2"     "GPR183"   "CSNK1G3"  "CDK5RAP3" "C7orf60"  "RBMX"    
## [8] "CAMK4"

Let’s test these genes. In the previous work on elderly lymphomas (polymorphic and monomorphic diffuse large B-cell lymphomas and classic Hodgkin’s lymphoma), we saw that the genes that were duplicated were the better set of genes. Now, we will see if that is true here.

# Names of the genes that showed up in more than one comparison.
duplicatedGenes <- duplicates[["Gene"]]

# Gene name plus the 86 sample columns for those genes.
doublesDB <- data[is.element(data$Gene, duplicatedGenes), 1:87]

# Samples as rows, genes as columns, plus the class label of each sample.
doublesMX <- data.frame(t(doublesDB[-1]))
names(doublesMX) <- doublesDB[["Gene"]]
doublesMX[["class"]] <- factor(class)

paged_table(doublesMX)

# Fixed seed so the random split below is repeatable.
set.seed(567)

# Draw 80% of the 86 samples for training; the rest form the hold-out set.
inTrain <- sample(seq_len(86), size = 0.8 * 86)

training <- doublesMX[inTrain, ]
testing <- doublesMX[-inTrain, ]

# Class counts in the training rows.
table(training[["class"]])

## 
##  1 month infected 6 months infected             acute           Healthy 
##                21                 7                20                20

# Class counts in the held-out rows.
table(testing$class)

## 
##  1 month infected 6 months infected             acute           Healthy 
##                 6                 3                 8                 1

# Fit a random forest on the duplicated-gene columns to predict sample class.
# `confusion=T` was dropped: it is not a randomForest() argument, so it was
# silently absorbed by `...`; the out-of-bag confusion matrix is stored in
# rf8$confusion for every classification forest.
# Predictors are picked by name (everything except `class`) instead of the
# positional 1:8.
rf8 <- randomForest(training[setdiff(names(training), "class")],
                    training$class, mtry = 3, ntree = 5000)
rf8$confusion

##                   1 month infected 6 months infected acute Healthy class.error
## 1 month infected                10                 0     5       6   0.5238095
## 6 months infected                2                 2     1       2   0.7142857
## acute                            4                 2    11       3   0.4500000
## Healthy                          2                 1     0      17   0.1500000

These genes are actually worse, but let’s see how well they predict.

# Predict the class of every held-out sample with the 8-gene forest.
predict8 <- predict(rf8, newdata = testing)

# Side-by-side table of predicted and true labels, one row per sample.
results8 <- data.frame(predicted = predict8, actual = testing[["class"]])
results8

##                              predicted            actual
## healthyControl_11              Healthy           Healthy
## acuteLymeDisease_9               acute             acute
## acuteLymeDisease_12   1 month infected             acute
## acuteLymeDisease_14   1 month infected             acute
## acuteLymeDisease_18   1 month infected             acute
## acuteLymeDisease_23   1 month infected             acute
## acuteLymeDisease_24   1 month infected             acute
## acuteLymeDisease_27            Healthy             acute
## acuteLymeDisease_28            Healthy             acute
## Antibodies_1month_1   1 month infected  1 month infected
## Antibodies_1month_3              acute  1 month infected
## Antibodies_1month_12             acute  1 month infected
## Antibodies_1month_14  1 month infected  1 month infected
## Antibodies_1month_17  1 month infected  1 month infected
## Antibodies_1month_19             acute  1 month infected
## Antibodies_6months_1 6 months infected 6 months infected
## Antibodies_6months_5           Healthy 6 months infected
## Antibodies_6months_9 6 months infected 6 months infected

The healthy sample was correctly identified, but the 6 months infected class had only 2/3 correct, the acute class had only 1/8 correctly predicted, and the 1 month infected class had only 3/6 correctly predicted.

These genes may not be great target genes, but we used them in a 4 class model instead of a 2 class model. So, they could be useful. But the best set of genes was the 52 top genes and not the 8 duplicated genes.

link to pathologies current database is here

# Folder holding the pathologies database (placeholder -- set before running).
path <- "your path to pathologies data"

# Read via file.path() rather than setwd(path): the file is located the same
# way, but the session's working directory is left untouched, so relative
# paths elsewhere keep resolving against the original folder.
pathologies <- read.csv(file.path(path, "pathologyDB_CFS_added_May25_2026.csv"),
                        header = TRUE)

paged_table(pathologies) #581X7

# Number of top-gene rows stored per pathology.
table(pathologies$topGenePathology)

## 
##                                         Acute Infectious Mononucleosis and Chronic Active Epstein-Barr Virus 
##                                                                                                           40 
## Chronic Fatigue Syndrome (CFS) or Systemic Exhertion Intolerance Disease (SEID) or Myalgic Encephalomyelitis 
##                                                                                                           20 
##                                                           classic Hodgkin's Lymphoma, and/or EBV, and/or HIV 
##                                                                                                           52 
##                                                                        colon cancer cells AKA adenocarcinoma 
##                                                                                                           57 
##                                                                                EBV & CHL elderly 50-94 years 
##                                                                                                           13 
##                                                                             EBV & mDLBCL elderly 50-94 years 
##                                                                                                           13 
##                                                                             EBV & pDLBCL elderly 50-94 years 
##                                                                                                           13 
##                                                          EBVaNPC nasopharyngeal carcinoma with EBV infection 
##                                                                                                           34 
##                                                                                           Epstein Barr Virus 
##                                                                                                           80 
##                                                                                                 fibromyalgia 
##                                                                                                           15 
##                                                 Gastric Carcinoma and Peritoneal Metastatic Gastric Carcioma 
##                                                                                                           30 
##                                                                          intrahepatic cholangiocarcinoma ICC 
##                                                                                                           37 
##                                                                                              language Autism 
##                                                                                                           16 
##                                                                                        Lyme Disease 6 months 
##                                                                                                           33 
##                                                                                                  mild Autism 
##                                                                                                           12 
##                                                                                                mononucleosis 
##                                                                                                           15 
##                                                                                           Multiple Sclerosis 
##                                                                                                           41 
##                                                                   NKTCL Natural Killer T-Cell Lymphoma & EBV 
##                                                                                                           19 
##                                                                                                savant Autism 
##                                                                                                           14 
##                                                                            uterine fibroid myometrial tissue 
##                                                                                                           27

# Rows of the pathology database currently filed under Lyme disease.
lyme33 <- pathologies[pathologies[["topGenePathology"]] == "Lyme Disease 6 months", ]

# Gene symbols of those rows.
lyme33[["Genecards_ID"]]

##  [1] "ISG20"     "CLEC2L"    "PSMF1"     "RNF168"    "PEX26"     "F2"       
##  [7] "KCNJ16"    "MAP2K7"    "ESYT1"     "GATC"      "ENO1"      "CYP7B1"   
## [13] "IGFALS"    "OR52A4"    "INAFM1"    "DLG3"      "TMEM194A"  "RGPD3"    
## [19] "HPGD"      "SLC1A1"    "NUDT18"    "LOC400657" "OTOS"      "HECW1"    
## [25] "POU4F2"    "FRS3"      "PDZRN3"    "KHDRBS3"   "CENPF"     "FAM162A"  
## [31] "CABP1"     "POU3F2"    "CTXN3"

Lets test these genes to see if they are still good predictors of class of Lyme disease.

# Gene name plus the 86 sample columns for the old Lyme genes that are present
# in the normalized table.
lyme33DB <- data[is.element(data$Gene, lyme33$Genecards_ID), 1:87]

# Genes that were actually found.
lyme33DB[["Gene"]]

##  [1] "RNF168"    "ISG20"     "NUDT18"    "ESYT1"     "OTOS"      "CYP7B1"   
##  [7] "HECW1"     "HPGD"      "GATC"      "FAM162A"   "PSMF1"     "POU3F2"   
## [13] "PDZRN3"    "MAP2K7"    "OR52A4"    "CENPF"     "PEX26"     "SLC1A1"   
## [19] "DLG3"      "POU4F2"    "ENO1"      "LOC400657" "CLEC2L"    "CTXN3"    
## [25] "RGPD3"     "TMEM194A"  "IGFALS"    "KCNJ16"    "F2"        "CABP1"    
## [31] "KHDRBS3"   "FRS3"

# Samples as rows, genes as columns, plus the class label of each sample.
lyme33MX <- data.frame(t(lyme33DB[, -1]))
names(lyme33MX) <- lyme33DB[["Gene"]]
lyme33MX[["class"]] <- factor(class)

paged_table(lyme33MX)

Now, lets test these genes and see if they are the best set to predict a 4 class model of healthy, acute, 1 month infected, or 6 months infected.

# Fixed seed so the random split below is repeatable.
set.seed(235)

# Draw 80% of the 86 samples for training; the rest form the hold-out set.
inTrain <- sample(seq_len(86), size = 0.8 * 86)

training <- lyme33MX[inTrain, ]
testing <- lyme33MX[-inTrain, ]

# Class counts in the training rows.
table(training[["class"]])

## 
##  1 month infected 6 months infected             acute           Healthy 
##                22                 8                24                14

# Class counts in the held-out rows.
table(testing$class)

## 
##  1 month infected 6 months infected             acute           Healthy 
##                 5                 2                 4                 7

# Fit a random forest on the old Lyme gene columns to predict sample class.
# `confusion=T` was dropped: it is not a randomForest() argument, so it was
# silently absorbed by `...`; the out-of-bag confusion matrix is stored in
# rf33$confusion for every classification forest.
# Predictors are picked by name (everything except `class`) instead of the
# positional 1:32.
rf33 <- randomForest(training[setdiff(names(training), "class")],
                     training$class, mtry = 10, ntree = 5000)

rf33$confusion

##                   1 month infected 6 months infected acute Healthy class.error
## 1 month infected                14                 1     5       2   0.3636364
## 6 months infected                2                 5     1       0   0.3750000
## acute                            5                 0    17       2   0.2916667
## Healthy                          6                 0     7       1   0.9285714

The overall results are better for the acute, 1 month infected, and 6 months infected, but the healthy class was much worse and only 1/14 predicted correctly. The healthy samples were misidentified as either acute infection or 1 month infected.

Lets see how well it predicts on the hold out validation 20% data.

# Predict the class of every held-out sample with the old-Lyme-gene forest.
predict33 <- predict(rf33, newdata = testing)

# Side-by-side table of predicted and true labels (replaces the earlier
# `results` object from the 52-gene run).
results <- data.frame(predicted = predict33, actual = testing[["class"]])
results

##                              predicted            actual
## healthyControl_1      1 month infected           Healthy
## healthyControl_2     6 months infected           Healthy
## healthyControl_3               Healthy           Healthy
## healthyControl_7                 acute           Healthy
## healthyControl_16                acute           Healthy
## healthyControl_17                acute           Healthy
## healthyControl_18              Healthy           Healthy
## acuteLymeDisease_5               acute             acute
## acuteLymeDisease_24              acute             acute
## acuteLymeDisease_25              acute             acute
## acuteLymeDisease_26              acute             acute
## Antibodies_1month_4              acute  1 month infected
## Antibodies_1month_7              acute  1 month infected
## Antibodies_1month_14             acute  1 month infected
## Antibodies_1month_23             acute  1 month infected
## Antibodies_1month_24  1 month infected  1 month infected
## Antibodies_6months_2             acute 6 months infected
## Antibodies_6months_4 6 months infected 6 months infected

These genes are also not that great, and they are even worse for the healthy class, even though the training model seemed to be OK for the other classes. For healthy, 2/7 correct. For acute, 4/4 correct. For 1 month infected, 1/5 correct. And for 6 months infected, 1/2 correct.

Since the 52 genes are better overall, we will use those genes.

# Show the first old Lyme disease record to see the metadata fields it carries.
lyme33[1,]

##         Ensembl_ID Genecards_ID FC_pathology_control      topGenePathology
## 96 ENSG00000172183        ISG20             154077.3 Lyme Disease 6 months
##                     mediaType
## 96 RBCs of PBMCs array format
##                                                                                                                                                                                                                                                                                                                                                                                             studySummarized
## 96 Lyme disease samples had more acute, healthy, and 1 month infection than the chronic infection samples. The blood of peripheral blood mononuclear cells was examined by array and not high throughput analysis.There were 10 chronic cases of 6 months infection after antibiotics, 21 cases of healthy and uninfected, 28 cases of acute infection, and 27 cases of infected 1 month with antibiotics. 
##    GSE_study_ID
## 96    GSE145974

# Column names of the pathology-database rows.
colnames(lyme33)

## [1] "Ensembl_ID"           "Genecards_ID"         "FC_pathology_control"
## [4] "topGenePathology"     "mediaType"            "studySummarized"     
## [7] "GSE_study_ID"

# Column names of the de-duplicated top-gene table, for comparison.
colnames(topGenes60_b)

##  [1] "Gene"                  "healthyControl_1"      "healthyControl_2"     
##  [4] "healthyControl_3"      "healthyControl_4"      "healthyControl_5"     
##  [7] "healthyControl_6"      "healthyControl_7"      "healthyControl_8"     
## [10] "healthyControl_9"      "healthyControl_10"     "healthyControl_11"    
## [13] "healthyControl_12"     "healthyControl_13"     "healthyControl_14"    
## [16] "healthyControl_15"     "healthyControl_16"     "healthyControl_17"    
## [19] "healthyControl_18"     "healthyControl_19"     "healthyControl_20"    
## [22] "healthyControl_21"     "acuteLymeDisease_1"    "acuteLymeDisease_2"   
## [25] "acuteLymeDisease_3"    "acuteLymeDisease_4"    "acuteLymeDisease_5"   
## [28] "acuteLymeDisease_6"    "acuteLymeDisease_7"    "acuteLymeDisease_8"   
## [31] "acuteLymeDisease_9"    "acuteLymeDisease_10"   "acuteLymeDisease_11"  
## [34] "acuteLymeDisease_12"   "acuteLymeDisease_13"   "acuteLymeDisease_14"  
## [37] "acuteLymeDisease_15"   "acuteLymeDisease_16"   "acuteLymeDisease_17"  
## [40] "acuteLymeDisease_18"   "acuteLymeDisease_19"   "acuteLymeDisease_20"  
## [43] "acuteLymeDisease_21"   "acuteLymeDisease_22"   "acuteLymeDisease_23"  
## [46] "acuteLymeDisease_24"   "acuteLymeDisease_25"   "acuteLymeDisease_26"  
## [49] "acuteLymeDisease_27"   "acuteLymeDisease_28"   "Antibodies_1month_1"  
## [52] "Antibodies_1month_2"   "Antibodies_1month_3"   "Antibodies_1month_4"  
## [55] "Antibodies_1month_5"   "Antibodies_1month_6"   "Antibodies_1month_7"  
## [58] "Antibodies_1month_8"   "Antibodies_1month_9"   "Antibodies_1month_10" 
## [61] "Antibodies_1month_11"  "Antibodies_1month_12"  "Antibodies_1month_13" 
## [64] "Antibodies_1month_14"  "Antibodies_1month_15"  "Antibodies_1month_16" 
## [67] "Antibodies_1month_17"  "Antibodies_1month_18"  "Antibodies_1month_19" 
## [70] "Antibodies_1month_20"  "Antibodies_1month_21"  "Antibodies_1month_22" 
## [73] "Antibodies_1month_23"  "Antibodies_1month_24"  "Antibodies_1month_25" 
## [76] "Antibodies_1month_26"  "Antibodies_1month_27"  "Antibodies_6months_1" 
## [79] "Antibodies_6months_2"  "Antibodies_6months_3"  "Antibodies_6months_4" 
## [82] "Antibodies_6months_5"  "Antibodies_6months_6"  "Antibodies_6months_7" 
## [85] "Antibodies_6months_8"  "Antibodies_6months_9"  "Antibodies_6months_10"
## [88] "foldchange"            "topGeneSource"

# Keep only the gene name, its fold change and the source description
# (columns 1, 88 and 89 of the de-duplicated top-gene table).
lyme52 <- topGenes60_b[, c("Gene", "foldchange", "topGeneSource")]

paged_table(lyme52)

The pathology database uses these column names: [1] “Ensembl_ID” “Genecards_ID” “FC_pathology_control” [4] “topGenePathology” “mediaType” “studySummarized” [7] “GSE_study_ID”

# Rename the three columns to the pathology-database names; the per-gene
# source description temporarily sits in "studySummarized".
names(lyme52) <- c("Genecards_ID", "FC_pathology_control", "studySummarized")

# Copy the shared study metadata from the first old Lyme disease record.
lyme52[["topGenePathology"]] <- lyme33[["topGenePathology"]][1]
lyme52[["mediaType"]] <- lyme33[["mediaType"]][1]
lyme52[["GSE_study_ID"]] <- lyme33[["GSE_study_ID"]][1]

# Final studySummarized text = old study summary, "_", per-gene source
# description, " - Note - ", and the normalization caveat.
lyme52[["studySummarized"]] <- paste(
  paste(lyme33[["studySummarized"]][1], lyme52[["studySummarized"]], sep = "_"),
  "these values were normalized from original data in Excel for max minus min over the range of max-min to remove the negative values, and made the larger values the lower ones and smaller values the higher ones, but the foldchange values are still valid but could be different than up or down as in the opposite.",
  sep = " - Note - "
)

Lets add the Ensembl IDs from another dataset.

# Folder holding the Ensembl ID lookup file (placeholder -- set before running).
path1 <- 'path to your ensembl IDs'

# Read via file.path() rather than setwd(path1), so the session's working
# directory is not changed as a side effect of loading this file.
ensembl <- read.csv(file.path(path1, "GSE271486_ensembleIDs_NPC_LBMP_study.csv"),
                    header = TRUE)

colnames(ensembl)

##  [1] "gene_id"                "gene_name"              "description"           
##  [4] "locus"                  "HNE_1_MUT_LMP1_1_count" "HNE_1_MUT_LMP1_2_count"
##  [7] "HNE_1_MUT_LMP1_3_count" "HNE_1_WT_LMP1_1_count"  "HNE_1_WT_LMP1_2_count" 
## [10] "HNE_1_WT_LMP1_3_count"  "HNE_1_MUT_LMP1_1_FPKM"  "HNE_1_MUT_LMP1_2_FPKM" 
## [13] "HNE_1_MUT_LMP1_3_FPKM"  "HNE_1_WT_LMP1_1_FPKM"   "HNE_1_WT_LMP1_2_FPKM"  
## [16] "HNE_1_WT_LMP1_3_FPKM"

We only want columns 1 through 2.

# Keep just the Ensembl ID and gene symbol (the first two columns).
ensembl2 <- ensembl[, c("gene_id", "gene_name")]

paged_table(ensembl2[seq_len(10), ])

Lets merge this to lyme52.

# Attach Ensembl IDs by gene symbol. all.x = TRUE keeps every top gene even
# when the lookup table has no matching gene_name (its Ensembl_ID is then NA).
# NOTE(review): the recorded result (591 rows after swapping out the 33 old
# Lyme rows of a 581-row table) implies the default inner join returned only
# 43 rows for the 52 genes -- confirm that NA Ensembl IDs are acceptable in
# the pathology database.
lyme52_b <- merge(lyme52, ensembl2, by.x = "Genecards_ID", by.y = "gene_name",
                  all.x = TRUE)

# Rename the ID column by name rather than by position 7.
names(lyme52_b)[names(lyme52_b) == "gene_id"] <- "Ensembl_ID"

colnames(lyme52_b)

## [1] "Genecards_ID"         "FC_pathology_control" "studySummarized"     
## [4] "topGenePathology"     "mediaType"            "GSE_study_ID"        
## [7] "Ensembl_ID"

# Column order of the pathology database, which the new rows must match.
colnames(pathologies)

## [1] "Ensembl_ID"           "Genecards_ID"         "FC_pathology_control"
## [4] "topGenePathology"     "mediaType"            "studySummarized"     
## [7] "GSE_study_ID"

# Reorder the columns to the pathology database's own column order. Selecting
# by the target's names (instead of the positional c(7,1,2,4,5,3,6)) keeps the
# two tables aligned for rbind() and errors if an expected column is missing.
lyme52_c <- lyme52_b[, colnames(pathologies)]

colnames(lyme52_c)

## [1] "Ensembl_ID"           "Genecards_ID"         "FC_pathology_control"
## [4] "topGenePathology"     "mediaType"            "studySummarized"     
## [7] "GSE_study_ID"

The feature names match to combine, but I want to remove the old top genes for lyme disease.

# Drop the old Lyme disease rows; every other pathology is kept.
Pathology <- subset(pathologies, topGenePathology != "Lyme Disease 6 months")

The new pathology database:

# Stack the remaining pathologies with the refreshed Lyme disease genes.
newP <- rbind(Pathology, lyme52_c)

paged_table(newP) # 591 x 7 in the recorded run

Lets write this new pathology database out to the folder of current pathologies.

# Write the new database into the pathologies folder. file.path(path, ...)
# targets the same location as setwd(path) followed by a bare file name, but
# leaves the session's working directory unchanged.
write.csv(newP,
          file.path(path, 'pathologyDB_LymeFCsChangesAdded_5_31_2026.csv'),
          row.names = FALSE)

adding Lyme Disease Top Genes Revisited Normalized no zeros

Janis Harris

2026-06-01