CIBMTR dataset - Clustering algorithms

##3. Kohonen Maps

###3.1 Complete dataset

library(aweSOM) # (version 1.3) 
Warning: package 'aweSOM' was built under R version 4.3.3
## Import Data
# NOTE(review): hard-coded, machine-specific working directory; the relative
# CSV path below is resolved against it.
setwd("C:\\Users\\vinic\\OneDrive\\Área de Trabalho\\doutorado_peb\\peb_redes_neurais")
# Read the CSV as a plain data.frame (not a data.table), turning strings into
# factors and checking the column names.
import.data <- data.table::fread(
  "dataset_10in10_som.csv",
  stringsAsFactors = TRUE,
  data.table = FALSE,
  check.names = TRUE
)

# Keep only the rows that have no missing value in any column.
complete.rows <- complete.cases(import.data)
import.data <- import.data[complete.rows, ]

### Transform selected numeric to factors
# Columns that must be treated as categorical variables.
factor.cols <- c(
  "ethnicit", "racegp", "ragecat", "dagecat", "disease", "kps", "dissta",
  "hctci", "drcmvpr", "drsex", "graftype", "atgcampathgp", "condint",
  "gvhprhrx", "anc", "plt", "rel", "agvhd24", "agvhd34", "cgvhd"
)
# Convert every listed column to a factor, in place.
import.data[factor.cols] <- lapply(import.data[factor.cols], as.factor)

## Build training data (categorical to dummies)
### Categorical variables used to train the map
cat.data <- import.data[c(
  "ethnicit", "racegp", "ragecat", "dagecat", "disease", "kps", "dissta",
  "hctci", "drcmvpr", "drsex", "graftype", "atgcampathgp", "condint",
  "gvhprhrx", "anc", "plt", "rel", "agvhd24", "agvhd34", "cgvhd"
)]
### Dummy-coded version of the categorical variables (still unscaled here)
train.data <- cdt(cat.data)
catLevels <- colnames(train.data)
### Weight of each dummy column, keyed by the dummy column name
varWeights <- c(
  "ethnicit_1" = 0.5, "ethnicit_2" = 0.5, "ethnicit_3" = 0.5,
  "racegp_1" = 0.333, "racegp_2" = 0.333, "racegp_3" = 0.333,
  "racegp_4" = 0.333,
  "ragecat_0" = 0.167, "ragecat_1" = 0.167, "ragecat_2" = 0.167,
  "ragecat_3" = 0.167, "ragecat_4" = 0.167, "ragecat_5" = 0.167,
  "ragecat_6" = 0.167,
  "dagecat_1" = 0.25, "dagecat_2" = 0.25, "dagecat_3" = 0.25,
  "dagecat_4" = 0.25, "dagecat_5" = 0.25,
  "disease_10" = 0.5, "disease_20" = 0.5, "disease_50" = 0.5,
  "kps_0" = 1, "kps_1" = 1,
  "dissta_1" = 0.5, "dissta_2" = 0.5, "dissta_3" = 0.5,
  "hctci_1" = 1, "hctci_2" = 1,
  "drcmvpr_0" = 0.333, "drcmvpr_1" = 0.333, "drcmvpr_2" = 0.333,
  "drcmvpr_3" = 0.333,
  "drsex_1" = 0.333, "drsex_2" = 0.333, "drsex_3" = 0.333,
  "drsex_4" = 0.333,
  "graftype_1" = 1, "graftype_22" = 1,
  "atgcampathgp_2" = 1, "atgcampathgp_4" = 1,
  "condint_1" = 1, "condint_2" = 1,
  "gvhprhrx_1" = 1, "gvhprhrx_2" = 1,
  "anc_0" = 1, "anc_1" = 1,
  "plt_0" = 1, "plt_1" = 1,
  "rel_0" = 1, "rel_1" = 1,
  "agvhd24_0" = 1, "agvhd24_1" = 1,
  "agvhd34_0" = 1, "agvhd34_1" = 1,
  "cgvhd_0" = 1, "cgvhd_1" = 1
)
### Align the weights with the dummy columns by name. The multiplication
### below is positional, so a difference in order or count would otherwise
### silently mis-weight (or recycle over) the columns.
missing.weights <- setdiff(catLevels, names(varWeights))
if (length(missing.weights) > 0) {
  stop(
    "No weight defined for dummy column(s): ",
    paste(missing.weights, collapse = ", "),
    call. = FALSE
  )
}
varWeights <- varWeights[catLevels]

### Prepare plotting data (original variables + unscaled dummies); built
### before scaling so the dummy table does not have to be computed twice
plot.data <- cbind(import.data, train.data)

### Scale training variables (MCA-type scaling)
train.data <- t(t(train.data) / sqrt(colMeans(train.data, na.rm = TRUE)))
### Apply variables weights
train.data <- t(t(train.data) * sqrt(varWeights))

10 x 10 grid + hexagonal topology

## Train SOM
### Fix the RNG seed (for reproducibility)
set.seed(1)
### Initial prototypes for a 10 x 10 map, "random" method
init <- somInit(train.data, nrows = 10, ncols = 10, method = "random")
### Fit the map on a 10 x 10 hexagonal grid
the.som <- kohonen::som(
  train.data,
  grid = kohonen::somgrid(10, 10, "hexagonal"),
  maxNA.fraction = 0.25,
  rlen = 500,
  alpha = c(0.05, 0.01),
  radius = c(6.08, -6.08),
  init = init,
  dist.fcts = "sumofsquares"
)

## Quality measures
somQuality(the.som, train.data)

## Quality measures:
 * Quantization error     :  8.961173 
 * (% explained variance) :  55.19 
 * Topographic error      :  0.4318615 
 * Kaski-Lagus error      :  8.614998 
 
## Number of obs. per map cell:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
136  87  82 123 127  98 103 122 112 140 178  58  60 124 105 153 101 116 182 156 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
203 116 250 220 137  96  76 165 132 169 109 149  79 119 151 158 101 108  71  55 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
105 155  69 213  72 122 163  95 143 149 118  69 147 151 113 119  91 111 102 160 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
 91 119 131 139 120 133 185 173  72  97 305 186  92 144 107 159 186  83 106   0 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
234 186 145  48 133  72  17  18   0 192 103 169 260 215 202  80 114 105 177   0 

Number of Hits/Cell

# Hit map: number of observations per cell of the map.
aweSOMplot(
  som = the.som,
  type = "Hitmap"
)

Distance Matrix

# U-matrix (distance matrix) view of the trained map.
aweSOMplot(
  som = the.som,
  type = "UMatrix"
)

Smooth Distance matrix

# Smoothed view of the distance matrix of the trained map.
aweSOMsmoothdist(
  the.som
)

Hierarchical Clustering of kohonen prototypes

# Hierarchical clustering (complete linkage) of the SOM prototypes, cut into
# 4 superclasses.
superclust <- hclust(d = dist(the.som$codes[[1]]), method = "complete")
superclasses <- cutree(tree = superclust, k = 4)

# Hit map with the superclass of each cell overlaid.
aweSOMplot(
  som = the.som,
  type = "Hitmap",
  superclass = superclasses
)

# Cloud plot of the "dead" variable, using the "proximity" cloud layout.
aweSOMplot(
  som = the.som,
  type = "Cloud",
  data = plot.data,
  variables = "dead",
  # superclass = superclasses,
  legendFontsize = 22,
  cloudType = "proximity"
)

Stratifying by Neutrophil Recovery status (ANC)

# Per-cell bar plot of the "anc" levels, with superclasses overlaid.
aweSOMplot(
  som = the.som,
  type = "CatBarplot",
  data = plot.data,
  variables = "anc",
  superclass = superclasses
)

Platelet Recovery status (PLT)

## Interactive plot
# Hierarchical clustering (complete linkage) of the SOM prototypes, cut into
# 4 superclasses.
superclust <- hclust(dist(the.som$codes[[1]]), "complete")
superclasses <- cutree(superclust, 4)

# Cloud plot of the "plt" variable with superclasses overlaid.
# The stray trailing comma of the original call (which passed an extra empty
# argument) has been removed.
aweSOMplot(
  som = the.som,
  type = "Cloud",
  data = plot.data,
  variables = c("plt"),
  superclass = superclasses,
  legendFontsize = 22
)

Acute GvHD 3-4 status - (changing from Complete to Ward method)

# Cluster the SOM prototypes with Ward-type linkage and cut into 4 superclasses.
# "ward.D" is the current name of the method formerly spelled "ward": it runs
# the same algorithm without emitting the rename message.
# NOTE(review): per the hclust documentation, only "ward.D2" implements Ward's
# criterion when given unsquared distances such as dist() output - confirm
# which variant is intended before switching (results would differ).
superclust <- hclust(dist(the.som$codes[[1]]), method = "ward.D")
superclasses <- cutree(superclust, k = 4)
# Cloud plot of the "agvhd34" variable with superclasses overlaid, using the
# "proximity" cloud layout.
aweSOMplot(
  som = the.som,
  type = "Cloud",
  data = plot.data,
  variables = c("agvhd34"),
  superclass = superclasses,
  legendFontsize = 22,
  cloudType = "proximity"
)

# Complete-linkage hierarchical clustering of the SOM prototypes.
superclust_hclust <- hclust(
  d = dist(the.som$codes[[1]]),
  method = "complete"
)

# Dendrogram of that clustering, drawn for 4 classes.
aweSOMdendrogram(
  clust = superclust_hclust,
  nclass = 4
)

# Ward-type hierarchical clustering of the SOM prototypes.
# "ward.D" is the current name of the method formerly spelled "ward": it runs
# the same algorithm without emitting the rename message.
# NOTE(review): per the hclust documentation, only "ward.D2" implements Ward's
# criterion when given unsquared distances such as dist() output - confirm
# which variant is intended before switching (results would differ).
superclust_hclust <- hclust(dist(the.som$codes[[1]]), method = "ward.D")
# Dendrogram of that clustering, drawn for 4 classes.
aweSOMdendrogram(clust = superclust_hclust, nclass = 4)

###3.2 Removing ANC and PLT variables

## Build training data (categorical to dummies)
### Categorical variables used to train the map ("anc" and "plt" left out)
cat.data <- import.data[c(
  "ethnicit", "racegp", "ragecat", "dagecat", "disease", "kps", "dissta",
  "hctci", "drcmvpr", "drsex", "graftype", "atgcampathgp", "condint",
  "gvhprhrx", "rel", "agvhd24", "agvhd34", "cgvhd"
)]
### Dummy-coded version of the categorical variables (still unscaled here)
train.data <- cdt(cat.data)
catLevels <- colnames(train.data)
### Weight of each dummy column, keyed by the dummy column name
varWeights <- c(
  "ethnicit_1" = 0.5, "ethnicit_2" = 0.5, "ethnicit_3" = 0.5,
  "racegp_1" = 0.333, "racegp_2" = 0.333, "racegp_3" = 0.333,
  "racegp_4" = 0.333,
  "ragecat_0" = 0.167, "ragecat_1" = 0.167, "ragecat_2" = 0.167,
  "ragecat_3" = 0.167, "ragecat_4" = 0.167, "ragecat_5" = 0.167,
  "ragecat_6" = 0.167,
  "dagecat_1" = 0.25, "dagecat_2" = 0.25, "dagecat_3" = 0.25,
  "dagecat_4" = 0.25, "dagecat_5" = 0.25,
  "disease_10" = 0.5, "disease_20" = 0.5, "disease_50" = 0.5,
  "kps_0" = 1, "kps_1" = 1,
  "dissta_1" = 0.5, "dissta_2" = 0.5, "dissta_3" = 0.5,
  "hctci_1" = 1, "hctci_2" = 1,
  "drcmvpr_0" = 0.333, "drcmvpr_1" = 0.333, "drcmvpr_2" = 0.333,
  "drcmvpr_3" = 0.333,
  "drsex_1" = 0.333, "drsex_2" = 0.333, "drsex_3" = 0.333,
  "drsex_4" = 0.333,
  "graftype_1" = 1, "graftype_22" = 1,
  "atgcampathgp_2" = 1, "atgcampathgp_4" = 1,
  "condint_1" = 1, "condint_2" = 1,
  "gvhprhrx_1" = 1, "gvhprhrx_2" = 1,
  "rel_0" = 1, "rel_1" = 1,
  "agvhd24_0" = 1, "agvhd24_1" = 1,
  "agvhd34_0" = 1, "agvhd34_1" = 1,
  "cgvhd_0" = 1, "cgvhd_1" = 1
)
### Align the weights with the dummy columns by name. The multiplication
### below is positional, so a difference in order or count would otherwise
### silently mis-weight (or recycle over) the columns.
missing.weights <- setdiff(catLevels, names(varWeights))
if (length(missing.weights) > 0) {
  stop(
    "No weight defined for dummy column(s): ",
    paste(missing.weights, collapse = ", "),
    call. = FALSE
  )
}
varWeights <- varWeights[catLevels]

### Prepare plotting data (original variables + unscaled dummies); built
### before scaling so the dummy table does not have to be computed twice
plot.data <- cbind(import.data, train.data)

### Scale training variables (MCA-type scaling)
train.data <- t(t(train.data) / sqrt(colMeans(train.data, na.rm = TRUE)))
### Apply variables weights
train.data <- t(t(train.data) * sqrt(varWeights))
## Train SOM
### Fix the RNG seed (for reproducibility)
set.seed(1)
### Initial prototypes for a 10 x 10 map, "random" method
init <- somInit(train.data, nrows = 10, ncols = 10, method = "random")
### Fit the map on a 10 x 10 hexagonal grid
the.som <- kohonen::som(
  train.data,
  grid = kohonen::somgrid(10, 10, "hexagonal"),
  maxNA.fraction = 0.25,
  rlen = 500,
  alpha = c(0.05, 0.01),
  radius = c(6.08, -6.08),
  init = init,
  dist.fcts = "sumofsquares"
)

## Quality measures
somQuality(the.som, train.data)

## Quality measures:
 * Quantization error     :  8.46429 
 * (% explained variance) :  52.97 
 * Topographic error      :  0.4615629 
 * Kaski-Lagus error      :  8.879444 
 
## Number of obs. per map cell:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
154 163 133 125 174 122 119  84 148 320 121 110  96 105  89 128 149 103 109  92 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
127 230  66 136 203 125 101  68 203 181 111  80  98  63  83 265  86 117 166 120 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
102  70  92  94  74 114 128 165 112 193 166  49  81  83  88  84 118 144  90 157 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
 71  61  55  77 114  99 117 137 237 119 116  93 153 141 105 129 165 109 225 179 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
156  93 108 144  72  98 151 127 136 146 113 191  94 142  76 107 111 209 216 123 
# Hit map of the trained map.
aweSOMplot(
  som = the.som,
  type = "Hitmap"
)

# U-matrix view of the trained map.
aweSOMplot(
  som = the.som,
  type = "UMatrix"
)

# Smoothed distance view of the trained map.
aweSOMsmoothdist(
  the.som
)

# Cloud plot of the "dead" variable.
# The stray trailing comma of the original call (which passed an extra empty
# argument) has been removed; the disabled option is kept as a comment.
aweSOMplot(
  som = the.som,
  type = "Cloud",
  data = plot.data,
  variables = "dead"
  # , cloudType = "proximity"
)

Relapse

# Cloud plot of the "rel" variable.
# The stray trailing comma of the original call (which passed an extra empty
# argument) has been removed; the disabled option is kept as a comment.
aweSOMplot(
  som = the.som,
  type = "Cloud",
  data = plot.data,
  variables = "rel"
  # , superclass = superclasses
)

Ethnicit

# Hierarchical clustering (complete linkage) of the SOM prototypes, cut into
# 2 superclasses.
superclust <- hclust(d = dist(the.som$codes[[1]]), method = "complete")
superclasses <- cutree(tree = superclust, k = 2)

# Per-cell bar plot of the "ethnicit" levels, with superclasses overlaid.
aweSOMplot(
  som = the.som,
  type = "CatBarplot",
  data = plot.data,
  variables = "ethnicit",
  superclass = superclasses
)

Ethnicit: 1 - Hispanic | Latino 3 - Non US resident

Acute GvHD 3-4

# Cluster the SOM prototypes with Ward-type linkage and cut into 2 superclasses.
# "ward.D" is the current name of the method formerly spelled "ward": it runs
# the same algorithm without emitting the rename message.
# NOTE(review): per the hclust documentation, only "ward.D2" implements Ward's
# criterion when given unsquared distances such as dist() output - confirm
# which variant is intended before switching (results would differ).
superclust <- hclust(dist(the.som$codes[[1]]), method = "ward.D")
superclasses <- cutree(superclust, k = 2)
# Per-cell bar plot of the "agvhd34" levels, with superclasses overlaid.
aweSOMplot(
  som = the.som,
  type = "CatBarplot",
  data = plot.data,
  variables = "agvhd34",
  superclass = superclasses
)

# Per-cell bar plot of the "ragecat" levels.
# The stray trailing comma of the original call (which passed an extra empty
# argument) has been removed; the disabled option is kept as a comment.
aweSOMplot(
  som = the.som,
  type = "CatBarplot",
  data = plot.data,
  variables = "ragecat"
  # , superclass = superclasses
)

Recipient Age Categories: 0 - 0-9 years; 1 - 10-17 years; 2 - 18-29 years; 3 - 30-39 years; 4 - 40-49 years; 5 - 50-59 years; 6 - 60 years and older

# Per-cell bar plot of the "graftype" levels, with superclasses overlaid.
aweSOMplot(
  som = the.som,
  type = "CatBarplot",
  data = plot.data,
  variables = "graftype",
  superclass = superclasses
)

Graft Type: 1 - Marrow; 22 - PBSC

###3.3 Removing all intermediate outcomes (agvhd, cgvhd, anc, plt, rel)

## Build training data (categorical to dummies)
### Categorical variables used to train the map (intermediate outcomes
### "anc", "plt", "rel", "agvhd24", "agvhd34" and "cgvhd" left out)
cat.data <- import.data[c(
  "ethnicit", "racegp", "ragecat", "dagecat", "disease", "kps", "dissta",
  "hctci", "drcmvpr", "drsex", "graftype", "atgcampathgp", "condint",
  "gvhprhrx"
)]
### Dummy-coded version of the categorical variables (still unscaled here)
train.data <- cdt(cat.data)
catLevels <- colnames(train.data)
### Weight of each dummy column, keyed by the dummy column name
varWeights <- c(
  "ethnicit_1" = 0.5, "ethnicit_2" = 0.5, "ethnicit_3" = 0.5,
  "racegp_1" = 0.333, "racegp_2" = 0.333, "racegp_3" = 0.333,
  "racegp_4" = 0.333,
  "ragecat_0" = 0.167, "ragecat_1" = 0.167, "ragecat_2" = 0.167,
  "ragecat_3" = 0.167, "ragecat_4" = 0.167, "ragecat_5" = 0.167,
  "ragecat_6" = 0.167,
  "dagecat_1" = 0.25, "dagecat_2" = 0.25, "dagecat_3" = 0.25,
  "dagecat_4" = 0.25, "dagecat_5" = 0.25,
  "disease_10" = 0.5, "disease_20" = 0.5, "disease_50" = 0.5,
  "kps_0" = 1, "kps_1" = 1,
  "dissta_1" = 0.5, "dissta_2" = 0.5, "dissta_3" = 0.5,
  "hctci_1" = 1, "hctci_2" = 1,
  "drcmvpr_0" = 0.333, "drcmvpr_1" = 0.333, "drcmvpr_2" = 0.333,
  "drcmvpr_3" = 0.333,
  "drsex_1" = 0.333, "drsex_2" = 0.333, "drsex_3" = 0.333,
  "drsex_4" = 0.333,
  "graftype_1" = 1, "graftype_22" = 1,
  "atgcampathgp_2" = 1, "atgcampathgp_4" = 1,
  "condint_1" = 1, "condint_2" = 1,
  "gvhprhrx_1" = 1, "gvhprhrx_2" = 1
)
### Align the weights with the dummy columns by name. The multiplication
### below is positional, so a difference in order or count would otherwise
### silently mis-weight (or recycle over) the columns.
missing.weights <- setdiff(catLevels, names(varWeights))
if (length(missing.weights) > 0) {
  stop(
    "No weight defined for dummy column(s): ",
    paste(missing.weights, collapse = ", "),
    call. = FALSE
  )
}
varWeights <- varWeights[catLevels]

### Prepare plotting data (original variables + unscaled dummies); built
### before scaling so the dummy table does not have to be computed twice
plot.data <- cbind(import.data, train.data)

### Scale training variables (MCA-type scaling)
train.data <- t(t(train.data) / sqrt(colMeans(train.data, na.rm = TRUE)))
### Apply variables weights
train.data <- t(t(train.data) * sqrt(varWeights))
## Train SOM
### Fix the RNG seed (for reproducibility)
set.seed(1)
### Initial prototypes for a 10 x 10 map, "random" method
init <- somInit(train.data, nrows = 10, ncols = 10, method = "random")
### Fit the map on a 10 x 10 hexagonal grid
the.som <- kohonen::som(
  train.data,
  grid = kohonen::somgrid(10, 10, "hexagonal"),
  maxNA.fraction = 0.25,
  rlen = 500,
  alpha = c(0.05, 0.01),
  radius = c(6.08, -6.08),
  init = init,
  dist.fcts = "sumofsquares"
)

## Quality measures
somQuality(the.som, train.data)

## Quality measures:
 * Quantization error     :  5.603983 
 * (% explained variance) :  59.97 
 * Topographic error      :  0.402716 
 * Kaski-Lagus error      :  7.784943 
 
## Number of obs. per map cell:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
 78  69  58  61 262 117 207 143 155 133 100  82  80 112  65   0  56 119 136 215 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
 89 112 111 119 141  72  84 113 106 333 125  20  61 192 112  43 286 262 148  90 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
105 235  74   0 296 140  49  34  91 281 201  86  78 185 147  81 150 137 293   0 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
 72 117   0   0 185  64 216  89   0 120 118 165  62 113  63 113  96   0 273   0 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
138  94   0   0 103  97 208  52 153 320 185 351 319 150 161 204  78 170 138 275 
# Hit map of the trained map.
aweSOMplot(
  som = the.som,
  type = "Hitmap"
)

# U-matrix view of the trained map.
aweSOMplot(
  som = the.som,
  type = "UMatrix"
)

# Smoothed distance view of the trained map.
aweSOMsmoothdist(
  the.som
)

# Cloud plot of the "dead" variable.
aweSOMplot(
  som = the.som,
  type = "Cloud",
  data = plot.data,
  variables = "dead"
)

Race Groups

# Hierarchical clustering (complete linkage) of the SOM prototypes, cut into
# 4 superclasses.
superclust <- hclust(d = dist(the.som$codes[[1]]), method = "complete")
superclasses <- cutree(tree = superclust, k = 4)

# Cloud plot of the "racegp" variable, with superclasses overlaid.
aweSOMplot(
  som = the.som,
  type = "Cloud",
  data = plot.data,
  variables = "racegp",
  superclass = superclasses
)

Race group: 1 - White; 2 - Black or African American; 3 - Asian; 4 - Other