This is the tuned dimension reduction for the gut microbiome data. This will be integrated into the modelling workflow and used for neural network, KNN, and SVM classifers. Stay tuned!
install.packages("nloptr")
install.packages(c("rpart", "xgboost", "rstanarm", "lme4"))
install.packages("embed")
install.packages("Rtsne")
install.packages("plotly")
install.packages("tidymodels", force = TRUE)
install.packages("Rtsne")
link_data <- "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6342642/bin/NIHMS1510763-supplement-Dataset_2.xlsx"
df <- read_excel("NIHMS1510763-supplement-Dataset_2.xlsx", skip = 1)
df_long <- df %>% pivot_longer(-`# Feature / Sample`, names_to="PatientID") %>%
pivot_wider(names_from=`# Feature / Sample`, values_from="value") %>% select(-Fecal.Calprotectin)
PRISM <- df %>% select(starts_with('PRISM')) %>% names()
data_prism <- df_long %>% filter(PatientID %in% PRISM )
data_validate <- df_long %>% filter(!PatientID %in% PRISM) %>% replace_na(list(immunosuppressant="No", mesalamine="No", steroids="No"))
tidy_data <- function(untidy) {
untidy %>%
mutate(PatientID = as.factor(PatientID),
Age = as.numeric(Age),
Diagnosis = as.factor(Diagnosis),
antibiotic = as.factor(antibiotic),
mesalamine = as.factor(mesalamine),
steroids = as.factor(steroids),
immunosuppressant = as.factor(immunosuppressant)) -> formatted
w <- which(sapply(formatted, class ) == 'character' )
formatted[w] <- lapply(formatted[w], function(x) as.numeric(x))
return(formatted)
}
data_prism_formatted <- tidy_data(data_prism) %>% na.omit()
data_validate_formatted <- tidy_data(data_validate) %>% na.omit()
colnames(data_prism_formatted) <- make.names(colnames(data_prism_formatted))
colnames(data_validate_formatted) <- make.names(colnames(data_validate_formatted))
train_df <- data_prism_formatted %>% select(-PatientID) %>%
mutate(Diagnosis = factor(Diagnosis, levels =c("Control","CD", "UC"), labels =c(1,2,3)))
test_df <- data_validate_formatted %>% select(-PatientID,-Diagnosis) %>% na.omit()
test_df_truth <- data_validate_formatted %>% select(PatientID, Diagnosis) %>%
mutate(Diagnosis = factor(Diagnosis, levels =c("Control","CD", "UC"), labels =c(1,2,3))) %>%
mutate(Cohort = case_when(str_detect(PatientID,"LLDeep") == TRUE ~ "Deep",
TRUE ~ "UMCG"))
folds <- train_df %>% vfold_cv(5, strata = Diagnosis)
data_combined <- bind_rows(data_prism_formatted, data_validate_formatted) %>%
mutate(Cohort = case_when(
str_detect(PatientID, "LLDeep") == TRUE ~ "Deep",
str_detect(PatientID, "PRISM") == TRUE ~ "PRISM",
TRUE ~ "UMCG")) %>% na.omit
data_combined_train <- data_combined %>% select(-c(PatientID, Diagnosis, Cohort))
head(data_combined)
## # A tibble: 6 × 8,856
## PatientID Age Diagnosis antibiotic immunosuppressant mesalamine steroids
## <fct> <dbl> <fct> <fct> <fct> <fct> <fct>
## 1 PRISM|7122 38 CD No Yes No No
## 2 PRISM|7147 50 CD No No Yes No
## 3 PRISM|7150 41 CD No Yes No No
## 4 PRISM|7153 51 CD No No Yes No
## 5 PRISM|7184 68 CD No No No No
## 6 PRISM|7238 67 CD Yes Yes No Yes
## # … with 8,849 more variables: C18.neg_Cluster_0001 <dbl>,
## # C18.neg_Cluster_0002 <dbl>, C18.neg_Cluster_0003 <dbl>,
## # C18.neg_Cluster_0004 <dbl>, C18.neg_Cluster_0005 <dbl>,
## # C18.neg_Cluster_0006 <dbl>, C18.neg_Cluster_0007 <dbl>,
## # C18.neg_Cluster_0008 <dbl>, C18.neg_Cluster_0009 <dbl>,
## # C18.neg_Cluster_0010 <dbl>, C18.neg_Cluster_0011 <dbl>,
## # C18.neg_Cluster_0012 <dbl>, C18.neg_Cluster_0013 <dbl>, …
data_tsne <- tibble(perplexity = c(5, 10, 15, 30, 45))
data_tsne <- data_tsne %>% mutate(TSNE = map(perplexity, ~Rtsne(data_combined_train, dims=2, PCA=F, perplexity=.x, max_iter=2000)))
for (i in 1:nrow(data_tsne)){
x = data_tsne$perplexity[[i]]
assign(paste0("tsne_df_tune_", x),
tibble(
X = data_tsne$TSNE[[i]]$Y[,1],
Y = data_tsne$TSNE[[i]]$Y[,2],
#Z = data_tsne$TSNE[[i]]$Y[,3],
Perplexity = data_tsne$perplexity[[i]],
Diagnosis = data_combined$Diagnosis,
Cohort = data_combined$Cohort,
ID = data_combined$PatientID
)
)
}
TNSE allows not the direct embedding of test data since no mapping function is learnt. Therefore, the discovery and validation data are combined and projected together into a two-dimensional space for visualization. The main tunable parameter (hyperparameter) is perplexity, which can be described as the number of neighbors, and is selected based on visual inspection. Different paramters were tried and are selectable in interactive plot below.
list_tsne_tune <- ls(pattern = "tsne_df_tune_")
tsne_combined <- bind_rows(mget(ls(pattern = "tsne_df_tune_")))
tsne_combined %>%
plot_ly(x = ~X, y = ~Y, frame = ~Perplexity, color = ~Diagnosis, symbol = ~Cohort,
colors = "Set1", type="scatter", mode="markers", ids=~ID, text = ~paste0("Sample: ", ID, "<br>",
"Diagnosis: ", Diagnosis),
hoverinfo = "text",
marker = list(sizemode = "diameter")) %>%
layout(xaxis = list(showgrid = F,
zeroline = F
),
yaxis = list(showgrid = F,
zeroline = F
)
) %>%
animation_opts(
redraw=FALSE,
frame = 2000,
transition = 500) %>%
animation_slider(
currentvalue = list(
prefix = "Perplexity: ",
font = list(color = "red")))
set.seed(42)
recipe_umap <- recipe(Diagnosis~., data = train_df) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_umap(all_predictors(), outcome = vars(Diagnosis), num_comp = 6, min_dist = 0.94396076, learn_rate = 6.193164e-03)
set.seed(42)
recipe_kpca <- recipe(Diagnosis~., data = train_df) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_kpca_rbf(all_predictors(), num_comp = 61, sigma = 1.782724e-06)
set.seed(42)
recipe_pca <- recipe(Diagnosis~., data = train_df) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_pca(all_predictors(), num_comp = 60)
train_umap <- bake(recipe_umap %>% prep(), train_df)
train_kpca <- bake(recipe_kpca %>% prep(), train_df)
train_pca <- bake(recipe_pca %>% prep(), train_df)
test_umap <- bake(recipe_umap %>% prep(), test_df)
test_kpca <- bake(recipe_kpca %>% prep(), test_df)
test_pca <- bake(recipe_pca %>% prep(), test_df)
train_1 <- train_umap %>% select(UMAP = UMAP1, Diagnosis) %>%
add_column(PC = train_pca$PC01) %>%
add_column(kPC = train_kpca$kPC01)
train_2 <- train_umap %>% select(UMAP2) %>%
add_column(PatientID = data_prism_formatted$PatientID) %>%
add_column(PC = train_pca$PC02) %>%
add_column(kPC = train_kpca$kPC02)
test_1 <- test_umap %>% select(UMAP = UMAP1) %>%
add_column(Diagnosis = test_df_truth$Diagnosis) %>%
add_column(kPC = test_kpca$kPC01) %>%
add_column(PC = test_pca$PC01)
test_2 <- test_umap %>% select(UMAP = UMAP2) %>%
add_column(PatientID = data_validate_formatted$PatientID) %>%
add_column(kPC = test_kpca$kPC02) %>%
add_column(PC = test_pca$PC02)
train_combined <- train_1 %>% pivot_longer(!Diagnosis, names_to = "Preprocessing", values_to = "Dim1") %>%
add_column(train_2 %>% pivot_longer(!PatientID,names_to = "Preprocessing", values_to = "Dim2") %>%
select(Dim2, PatientID)) %>%
mutate(Diagnosis = factor(Diagnosis, levels = c(1,2,3), labels = c("Control","CD", "UC")))
test_combined <- test_1 %>% pivot_longer(!Diagnosis, names_to = "Preprocessing", values_to = "Dim1") %>%
add_column(test_2 %>% pivot_longer(!PatientID,names_to = "Preprocessing", values_to = "Dim2") %>%
select(Dim2, PatientID)) %>%
mutate(Diagnosis = factor(Diagnosis, levels = c(1,2,3), labels = c("Control","CD", "UC")))
combined <- train_combined %>% bind_rows(test_combined, .id = c("Group")) %>%
mutate(Group = case_when(Group==1 ~ "Discovery",
TRUE ~ "Validate"))
head(combined)
## # A tibble: 6 × 6
## Group Diagnosis Preprocessing Dim1 Dim2 PatientID
## <chr> <fct> <chr> <dbl> <dbl> <fct>
## 1 Discovery CD UMAP -1.40 -1.39 PRISM|7122
## 2 Discovery CD PC -18.2 -16.7 PRISM|7122
## 3 Discovery CD kPC 0.414 0.386 PRISM|7122
## 4 Discovery CD UMAP -0.528 -0.130 PRISM|7147
## 5 Discovery CD PC -15.0 26.0 PRISM|7147
## 6 Discovery CD kPC 0.344 -0.583 PRISM|7147
Here, linear (PCA) and non-linear (radial basis function kernel PCA (kPCA), UMAP) dimension reduction techniques were used. The chosen hyperparamters were selected based on the best area under the curve of the receiver opererater characteristic (AUROC) score of an tuned classifier. UMAP and kPCA hyperparamters were tuned with a single-layer neure network with AUROC scores of 0.80 and 0.87, respectively. The number of principal components in PCA were selected based on the Radial basis function SVM (RBF-SVM) classifier with a AUROC score of 0.88.
combined %>%
plot_ly(x = ~Dim1, y = ~Dim2, frame = ~Preprocessing, color = ~Diagnosis, symbol = ~Group,
colors = "Set1", type="scatter", mode="markers") %>%
layout(xaxis = list(showgrid = F,
zeroline = F,
autorange = T
),
yaxis = list(showgrid = F,
zeroline = F,
autorange = T
)
) %>%
animation_opts(
redraw=FALSE,
frame = 2000,
transition = 500) %>%
animation_slider(
currentvalue = list(
prefix = "Preprocessing: ",
font = list(color = "red")))