This is the tuned dimension reduction for the gut microbiome data. This will be integrated into the modelling workflow and used for neural network, KNN, and SVM classifers. Stay tuned!

install.packages("nloptr")
install.packages(c("rpart", "xgboost", "rstanarm", "lme4"))
install.packages("embed")
install.packages("Rtsne")
install.packages("plotly")
install.packages("tidymodels", force = TRUE)
install.packages("Rtsne")
link_data <- "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6342642/bin/NIHMS1510763-supplement-Dataset_2.xlsx"

df <- read_excel("NIHMS1510763-supplement-Dataset_2.xlsx", skip = 1)
df_long <- df %>% pivot_longer(-`# Feature / Sample`, names_to="PatientID") %>%
  pivot_wider(names_from=`# Feature / Sample`, values_from="value") %>% select(-Fecal.Calprotectin)
PRISM <- df %>% select(starts_with('PRISM')) %>% names()
data_prism <- df_long %>% filter(PatientID %in% PRISM )
data_validate <- df_long %>% filter(!PatientID %in% PRISM) %>% replace_na(list(immunosuppressant="No", mesalamine="No", steroids="No"))

tidy_data <- function(untidy) {
  untidy %>% 
  mutate(PatientID = as.factor(PatientID),
        Age = as.numeric(Age),
        Diagnosis = as.factor(Diagnosis),
        antibiotic = as.factor(antibiotic),
        mesalamine = as.factor(mesalamine),
        steroids = as.factor(steroids),
        immunosuppressant = as.factor(immunosuppressant)) -> formatted
  
w <- which(sapply(formatted, class ) == 'character' )
formatted[w] <- lapply(formatted[w], function(x) as.numeric(x))
return(formatted)
}
data_prism_formatted <- tidy_data(data_prism)  %>% na.omit()
data_validate_formatted <- tidy_data(data_validate) %>% na.omit()

colnames(data_prism_formatted) <- make.names(colnames(data_prism_formatted))
colnames(data_validate_formatted) <- make.names(colnames(data_validate_formatted))
train_df <- data_prism_formatted %>% select(-PatientID) %>%
  mutate(Diagnosis = factor(Diagnosis, levels =c("Control","CD", "UC"), labels =c(1,2,3)))
test_df <- data_validate_formatted %>% select(-PatientID,-Diagnosis) %>% na.omit()
test_df_truth <- data_validate_formatted %>% select(PatientID, Diagnosis) %>%
  mutate(Diagnosis = factor(Diagnosis, levels =c("Control","CD", "UC"), labels =c(1,2,3))) %>%
  mutate(Cohort = case_when(str_detect(PatientID,"LLDeep") == TRUE ~ "Deep",
                                                             TRUE ~ "UMCG")) 
folds <- train_df %>% vfold_cv(5, strata = Diagnosis)
                       
data_combined <- bind_rows(data_prism_formatted, data_validate_formatted) %>%
  mutate(Cohort = case_when(
      str_detect(PatientID, "LLDeep") == TRUE ~ "Deep",
      str_detect(PatientID, "PRISM") == TRUE ~ "PRISM", 
      TRUE ~ "UMCG")) %>% na.omit
data_combined_train <- data_combined %>% select(-c(PatientID, Diagnosis, Cohort))
head(data_combined)
## # A tibble: 6 × 8,856
##   PatientID    Age Diagnosis antibiotic immunosuppressant mesalamine steroids
##   <fct>      <dbl> <fct>     <fct>      <fct>             <fct>      <fct>   
## 1 PRISM|7122    38 CD        No         Yes               No         No      
## 2 PRISM|7147    50 CD        No         No                Yes        No      
## 3 PRISM|7150    41 CD        No         Yes               No         No      
## 4 PRISM|7153    51 CD        No         No                Yes        No      
## 5 PRISM|7184    68 CD        No         No                No         No      
## 6 PRISM|7238    67 CD        Yes        Yes               No         Yes     
## # … with 8,849 more variables: C18.neg_Cluster_0001 <dbl>,
## #   C18.neg_Cluster_0002 <dbl>, C18.neg_Cluster_0003 <dbl>,
## #   C18.neg_Cluster_0004 <dbl>, C18.neg_Cluster_0005 <dbl>,
## #   C18.neg_Cluster_0006 <dbl>, C18.neg_Cluster_0007 <dbl>,
## #   C18.neg_Cluster_0008 <dbl>, C18.neg_Cluster_0009 <dbl>,
## #   C18.neg_Cluster_0010 <dbl>, C18.neg_Cluster_0011 <dbl>,
## #   C18.neg_Cluster_0012 <dbl>, C18.neg_Cluster_0013 <dbl>, …
data_tsne <- tibble(perplexity = c(5, 10, 15, 30, 45))
data_tsne <- data_tsne %>% mutate(TSNE = map(perplexity, ~Rtsne(data_combined_train, dims=2, PCA=F, perplexity=.x, max_iter=2000)))

for (i in 1:nrow(data_tsne)){
  x = data_tsne$perplexity[[i]]
assign(paste0("tsne_df_tune_", x),
       tibble(
              X = data_tsne$TSNE[[i]]$Y[,1],
              Y = data_tsne$TSNE[[i]]$Y[,2],
              #Z = data_tsne$TSNE[[i]]$Y[,3],
              Perplexity = data_tsne$perplexity[[i]],
              Diagnosis = data_combined$Diagnosis,
              Cohort = data_combined$Cohort,
              ID = data_combined$PatientID
             )
      )
}

TNSE allows not the direct embedding of test data since no mapping function is learnt. Therefore, the discovery and validation data are combined and projected together into a two-dimensional space for visualization. The main tunable parameter (hyperparameter) is perplexity, which can be described as the number of neighbors, and is selected based on visual inspection. Different paramters were tried and are selectable in interactive plot below.

list_tsne_tune <- ls(pattern = "tsne_df_tune_")
tsne_combined <- bind_rows(mget(ls(pattern = "tsne_df_tune_")))

tsne_combined %>% 
plot_ly(x = ~X, y = ~Y, frame = ~Perplexity, color = ~Diagnosis, symbol = ~Cohort,
        colors = "Set1", type="scatter", mode="markers", ids=~ID,  text = ~paste0("Sample: ", ID, "<br>",
                                                                                  "Diagnosis: ", Diagnosis),
        hoverinfo = "text",
        marker = list(sizemode = "diameter")) %>%

layout(xaxis = list(showgrid = F,
                    zeroline = F
                   ),
       yaxis = list(showgrid = F,
                    zeroline = F
                   )
       ) %>%
animation_opts(
    redraw=FALSE,
    frame = 2000,
    transition = 500) %>%

animation_slider(
     currentvalue = list(
       prefix = "Perplexity: ", 
       font = list(color = "red")))
set.seed(42)
recipe_umap <- recipe(Diagnosis~., data = train_df) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_umap(all_predictors(), outcome = vars(Diagnosis), num_comp = 6, min_dist = 0.94396076, learn_rate = 6.193164e-03)

set.seed(42)
recipe_kpca <- recipe(Diagnosis~., data = train_df) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_kpca_rbf(all_predictors(), num_comp = 61, sigma = 1.782724e-06)

set.seed(42)
recipe_pca <- recipe(Diagnosis~., data = train_df) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_pca(all_predictors(), num_comp = 60)

train_umap <- bake(recipe_umap %>% prep(), train_df)
train_kpca <- bake(recipe_kpca %>% prep(), train_df)
train_pca <- bake(recipe_pca %>% prep(), train_df)

test_umap <- bake(recipe_umap %>% prep(), test_df)
test_kpca <- bake(recipe_kpca %>% prep(), test_df)
test_pca <- bake(recipe_pca %>% prep(), test_df)
train_1 <- train_umap %>% select(UMAP = UMAP1, Diagnosis) %>%
add_column(PC = train_pca$PC01) %>%
add_column(kPC = train_kpca$kPC01)

train_2 <- train_umap %>% select(UMAP2) %>%
add_column(PatientID = data_prism_formatted$PatientID) %>%
add_column(PC = train_pca$PC02) %>%
add_column(kPC = train_kpca$kPC02)

test_1 <- test_umap %>% select(UMAP = UMAP1) %>%
add_column(Diagnosis = test_df_truth$Diagnosis) %>%
add_column(kPC = test_kpca$kPC01) %>%
add_column(PC = test_pca$PC01)

test_2 <- test_umap %>% select(UMAP = UMAP2) %>%
add_column(PatientID = data_validate_formatted$PatientID) %>%
add_column(kPC = test_kpca$kPC02) %>%
add_column(PC = test_pca$PC02)
train_combined <- train_1 %>% pivot_longer(!Diagnosis, names_to = "Preprocessing", values_to = "Dim1") %>%
add_column(train_2 %>% pivot_longer(!PatientID,names_to = "Preprocessing", values_to = "Dim2") %>%
           select(Dim2, PatientID)) %>%
mutate(Diagnosis = factor(Diagnosis, levels = c(1,2,3), labels = c("Control","CD", "UC")))

test_combined <- test_1 %>% pivot_longer(!Diagnosis, names_to = "Preprocessing", values_to = "Dim1") %>%
add_column(test_2 %>% pivot_longer(!PatientID,names_to = "Preprocessing", values_to = "Dim2") %>%
           select(Dim2, PatientID)) %>%
mutate(Diagnosis = factor(Diagnosis, levels = c(1,2,3), labels = c("Control","CD", "UC")))

combined <- train_combined %>% bind_rows(test_combined, .id = c("Group")) %>%
mutate(Group = case_when(Group==1 ~ "Discovery",
                         TRUE ~ "Validate"))
head(combined)
## # A tibble: 6 × 6
##   Group     Diagnosis Preprocessing    Dim1    Dim2 PatientID 
##   <chr>     <fct>     <chr>           <dbl>   <dbl> <fct>     
## 1 Discovery CD        UMAP           -1.40   -1.39  PRISM|7122
## 2 Discovery CD        PC            -18.2   -16.7   PRISM|7122
## 3 Discovery CD        kPC             0.414   0.386 PRISM|7122
## 4 Discovery CD        UMAP           -0.528  -0.130 PRISM|7147
## 5 Discovery CD        PC            -15.0    26.0   PRISM|7147
## 6 Discovery CD        kPC             0.344  -0.583 PRISM|7147

Here, linear (PCA) and non-linear (radial basis function kernel PCA (kPCA), UMAP) dimension reduction techniques were used. The chosen hyperparamters were selected based on the best area under the curve of the receiver opererater characteristic (AUROC) score of an tuned classifier. UMAP and kPCA hyperparamters were tuned with a single-layer neure network with AUROC scores of 0.80 and 0.87, respectively. The number of principal components in PCA were selected based on the Radial basis function SVM (RBF-SVM) classifier with a AUROC score of 0.88.

combined %>% 
plot_ly(x = ~Dim1, y = ~Dim2, frame = ~Preprocessing, color = ~Diagnosis, symbol = ~Group,
        colors = "Set1", type="scatter", mode="markers") %>%

layout(xaxis = list(showgrid = F,
                    zeroline = F,
                    autorange = T
                   ),
       yaxis = list(showgrid = F,
                    zeroline = F,
                    autorange = T
                   )
       ) %>%
animation_opts(
    redraw=FALSE,
    frame = 2000,
    transition = 500) %>%

animation_slider(
     currentvalue = list(
       prefix = "Preprocessing: ", 
       font = list(color = "red")))