Introduction
Loading necessary libraries and dataset
library(tidyverse)
library(magrittr)
library(broom)
library(e1071)
library(clValid)
library(fpc)
library(gplots)
load("OAI_JSW_Data.RData")
Prepairing data to be analysed
feats <-
OAI_JSW_Data %>%
filter(VISITID == 0, complete.cases(.)) %>%
select(-c(VISITID, IDSIDE, ID, SITE, BASEVISITDATE_C, FUVISITDATE, DaysFomStart, DAYSFROMBL,
DaystoTKR, TimeToEvent, DeathVisit, BLKL2, WOMACTOTAL,
WOMAC2YTotalChange, `2YIncreaseSymptoms`, YearTOKL2),
-starts_with("Y2")) %>%
mutate_at(vars(BLKL, mJSN,lJSN, PainMedication, AnySurgery), as.integer) %>%
mutate_if(is.double, funs(scale))
Bootstrapping
Calculate optimal k s by:
- Sample of 25% of data with replacement
- Calculate Dunn index for said sample per clustering algorithm for k s between and 2 and a given value
- Find k that maximizes the Dunn index per clustering algorithm
- Repeat steps 1 to 3 x times
- Get mean best k per clustering algorithm
res <- clustest::find_k(dataset = feats, train_size = 0.25, iters = 28, max_k = 5)
train <- res$train
test <- res$test
k_stats <- res$k_stats
best_k <- res$best_k
all_best_ks <- res$all_best_ks
c_samples <- res$c_samples
remove(res)
Plotting optimal k s
k_stats %>%
ggplot(aes(x = method, y = mean_best_k)) +
geom_col(aes(fill = method)) +
guides(fill = F) +
geom_errorbar(aes(ymin = ci_min, ymax = ci_max))

best_k
Jaccard Stability Test
Fuzzy c-Means Clustering Bootstrap Interface
cmeansCBI <- function(data, k) {
result <- cmeans(x = data, center = k)
list(result = result,
partition = result$cluster,
clusterlist = map(1:k, ~. == result$cluster),
clustermethod = "fcmeans",
nc = k)
}
Jaccard evaluation of all algorithms
jac <- list(
KMeans = clusterboot(train, B = 30, clustermethod = kmeansCBI, k = best_k$KMeans)$bootresult,
FCMeans = clusterboot(train, B = 30, clustermethod = cmeansCBI, k = best_k$FCMeans)$bootresult,
HCWard = clusterboot(train, B = 30, clustermethod = hclustCBI, k = best_k$HCWard,
method = "ward.D")$bootresult,
HCSing = clusterboot(train, B = 30, clustermethod = hclustCBI, k = best_k$HCSing,
method = "single")$bootresult,
HCComp = clusterboot(train, B = 30, clustermethod = hclustCBI, k = best_k$HCComp,
method = "complete")$bootresult)
Plotting stability tests summary
get_stats <- function(jac_mat){
jac_mat %>%
t() %>%
as_tibble() %>%
summarise_all(funs(ci_min = t.test(.)$conf.int[1],
ci_max = t.test(.)$conf.int[2],
mean_jaccard = mean(.))) %>%
gather() %>%
separate(key, into = c("cluster", "stat"), extra = "merge") %>%
spread(stat, value)
}
jac %>%
map_df(get_stats, .id = "method") %>%
ggplot(aes(x = cluster, y = mean_jaccard)) +
facet_wrap(~method) +
geom_col(aes(fill = cluster)) +
guides(fill = F) +
geom_errorbar(aes(ymin = ci_min, ymax = ci_max))

ANOVA
Plot training sets ANOVA
a_o_v <-
c_samples %>%
group_by(sample) %>%
do(clustest::var_clust_aov(., methods = c("KMeans", "FCMeans", "HCWard", "HCSing", "HCComp"),
dep_vars = c("WOMACPAIN", "BLKL", "TKREvent")))
|===========================================================================================|100% ~0 s remaining
a_o_v %>%
group_by(dep_var, term) %>%
summarise(signif_percent = sum(p.value <= 0.05)/n(),
ci_min = t.test(p.value)$conf.int[1],
ci_max = t.test(p.value)$conf.int[2],
mean_p_value = mean(p.value)) %T>%
print() %>%
ggplot(aes(x = term, y = mean_p_value, fill = term)) +
facet_wrap(~dep_var, ncol = 2) +
geom_col() +
guides(fill = F) +
geom_errorbar(aes(ymin = ci_min, ymax = ci_max))

Print testing set ANOVA
Hierarchical Clustering Heatmap

---
title: "OAI_Clust"
author: "David Duarte"
date: "2/6/2018"
output: html_notebook
---

## Introduction

Loading necessary libraries and dataset

```{r}
library(tidyverse)
library(magrittr)
library(e1071)
library(fpc)
library(gplots)
load("OAI_JSW_Data.RData")
```

</br>

Prepairing data to be analysed

```{r}
feats <-
  OAI_JSW_Data %>% 
  filter(VISITID == 0, complete.cases(.)) %>% 
  select(-c(VISITID, IDSIDE, ID, SITE, BASEVISITDATE_C, FUVISITDATE, DaysFomStart, DAYSFROMBL,
            DaystoTKR, TimeToEvent, DeathVisit, BLKL2, WOMACTOTAL, 
            WOMAC2YTotalChange, `2YIncreaseSymptoms`, YearTOKL2), 
         -starts_with("Y2")) %>% 
  mutate_at(vars(BLKL, mJSN,lJSN, PainMedication, AnySurgery), as.integer) %>% 
  mutate_if(is.double, funs(scale))
```

</br>

## Bootstrapping

Calculate optimal _k_ s by:

1. Sample of 25% of data with replacement
2. Calculate Dunn index for said sample per clustering algorithm for _k_ s between and 2 and a given value
3. Find _k_ that maximizes the Dunn index per clustering algorithm
4. Repeat steps 1 to 3 _x_ times
5. Get mean best _k_ per clustering algorithm

```{r}

res         <- clustest::find_k(dataset = feats, train_size = 0.25, iters = 28, max_k = 5)
train       <- res$train
test        <- res$test
k_stats     <- res$k_stats
best_k      <- res$best_k 
all_best_ks <- res$all_best_ks
c_samples   <- res$c_samples

remove(res)

```

</br>

Plotting optimal _k_ s

```{r}

k_stats %>% 
  ggplot(aes(x = method, y = mean_best_k)) +
    geom_col(aes(fill = method)) +
    guides(fill = F) +
    geom_errorbar(aes(ymin = ci_min, ymax = ci_max))

best_k
```

</br>

## Jaccard Stability Test

Fuzzy _c_-Means Clustering Bootstrap Interface

```{r}
cmeansCBI <- function(data, k) {
  result <- cmeans(x = data, center = k)
  
  list(result = result, 
       partition = result$cluster,
       clusterlist = map(1:k, ~. == result$cluster),
       clustermethod = "fcmeans",
       nc = k)
}
```

</br>

Jaccard evaluation of all algorithms

```{r}
jac <- list(
  KMeans  = clusterboot(train, B = 30, clustermethod = kmeansCBI, k = best_k$KMeans)$bootresult,
  FCMeans = clusterboot(train, B = 30, clustermethod = cmeansCBI, k = best_k$FCMeans)$bootresult,
  HCWard  = clusterboot(train, B = 30, clustermethod = hclustCBI, k = best_k$HCWard, 
                        method = "ward.D")$bootresult,
  HCSing  = clusterboot(train, B = 30, clustermethod = hclustCBI, k = best_k$HCSing, 
                        method = "single")$bootresult,
  HCComp  = clusterboot(train, B = 30, clustermethod = hclustCBI, k = best_k$HCComp, 
                        method = "complete")$bootresult)
```

</br>

Plotting stability tests summary

```{r}
get_stats <- function(jac_mat){
  jac_mat %>% 
    t() %>% 
    as_tibble() %>% 
    summarise_all(funs(ci_min = t.test(.)$conf.int[1],
                       ci_max = t.test(.)$conf.int[2],
                       mean_jaccard = mean(.))) %>% 
    gather() %>% 
    separate(key, into = c("cluster", "stat"), extra = "merge") %>% 
    spread(stat, value)
}

jac %>% 
  map_df(get_stats, .id = "method") %>%  
  ggplot(aes(x = cluster, y = mean_jaccard)) +
    facet_wrap(~method) +
    geom_col(aes(fill = cluster)) +
    guides(fill = F) +
    geom_errorbar(aes(ymin = ci_min, ymax = ci_max))
```

</br>

## ANOVA

</br>

Plot training sets ANOVA

```{r}
a_o_v <- 
  c_samples %>% 
  group_by(sample) %>% 
  do(clustest::var_clust_aov(., methods =  c("KMeans", "FCMeans", "HCWard", "HCSing", "HCComp"), 
                                dep_vars = c("WOMACPAIN", "BLKL", "TKREvent")))

a_o_v %>% 
  group_by(dep_var, term) %>% 
  summarise(signif_percent = sum(p.value <= 0.05)/n(),
            ci_min = t.test(p.value)$conf.int[1],
            ci_max = t.test(p.value)$conf.int[2],
            mean_p_value = mean(p.value)) %T>%
  print() %>% 
  ggplot(aes(x = term, y = mean_p_value, fill = term)) +
    facet_wrap(~dep_var, ncol = 2) +
    geom_col() +
    guides(fill = F) +
    geom_errorbar(aes(ymin = ci_min, ymax = ci_max))
```

</br>

Print testing set ANOVA

</br>

```{r}
d_test <- dist(test)

cls <- map(c("ward.D", "single", "complete"), ~hclust(d_test, .))
names(cls) <- c("HCWard", "HCSing", "HCComp")

clustered_test <- 
  test %>% 
  bind_cols(
    KMeans  = kmeans(., best_k$KMeans)$cluster,
    FCMeans = cmeans(., best_k$FCMeans)$cluster,
    HCWard  = 
      cls$HCWard %>% 
      cutree(best_k$HCWard),
    HCSing  = 
      cls$HCSing %>% 
      cutree(best_k$HCSing),
    HCComp  = 
      cls$HCComp %>% 
      cutree(best_k$HCComp)
  )

clustered_test %>%  
  clustest::var_clust_aov(c("KMeans", "FCMeans", "HCWard", "HCSing", "HCComp"),
                          c("WOMACPAIN", "BLKL", "TKREvent")) %>% 
  filter(p.value <= 0.05)
```

</br>

## Hierarchical Clustering Heatmap

```{r}
colormat <- sapply(names(best_k), function(.) rainbow(best_k[[.]])[clustered_test[[.]]])

plotfun <- function(){
  rindx <- get('rowInd',parent.frame())
  cindx <- get('colInd', parent.frame())
  marg  <- get('margins', parent.frame())
  
  for (i in 1:ncol(colormat)){
    par(mar = c(0.25, 0, 0, marg[2]), cex = 0.66)
    image(cbind(1:nrow(colormat)), col = colormat[,i][cindx], axes = F)
    mtext(colnames(colormat)[i], side = 4, line = 1, cex=0.4,las=1, col="black")
  }
  
  par(mar = c(0.25, 0, 0, marg[2]), mgp=c(0,0,0), pty="m",xaxs="i")
  plot(1:nrow(colormat), clustered_test$WOMACPAIN, type = "s", axes=FALSE,ylab="",xlab="")
  mtext("WOMACPAIN", side= 4, line=1, cex=0.4,las=1, col="black")
  
  par(mar = c(0.25, 0, 0, marg[2]),mgp=c(0,0,0),pty="m",xaxs="i")
  image(cbind(1:nrow(colormat)), 
        col = grey.colors(length(unique(clustered_test$TKREvent)))[clustered_test$TKREvent+1], axes = F)
  mtext("TKREvent", side= 4, line=1, cex=0.4,las=1, col="black")
  
  par(mar = c(0.25, 0, 0, marg[2]),mgp=c(0,0,0),pty="m",xaxs="i")
  image(cbind(1:nrow(colormat)), 
        col = rainbow(length(unique(clustered_test$BLKL)))[clustered_test$BLKL], axes = F)
  mtext("BLKL", side= 4, line=1, cex=0.4,las=1, col="black")
  
  par(mar = c(0.25, 0, 1.5, marg[2]), mgp=c(0,0.5,0),pty="m",xaxs="i")
  
  blkl_values <- 
  clustered_test$BLKL %>% 
  unique() %>% 
  sort()

  image(y = blkl_values, z = rbind(1:length(blkl_values)), col = rainbow(length(blkl_values)), xaxt = 'n', 
        ylab = "", cex.axis = 0.7)
  title("BLKL", cex.main = 0.7)
}

test %>% 
  t() %>% 
  heatmap.2(dendrogram = "col", Colv = as.dendrogram(cls$HCWard), trace = "none", 
            scale = "none", breaks = seq(-2, 2, length.out = 101), 
            col = colorRampPalette(colors = c("cyan","blue","black","red","yellow")),
            density.info = "density", cexRow = 0.4, extrafun = plotfun, 
            lmat = rbind(c(4, 3, 13), c(0, 8, 0), c(0, 6, 0), c(0, 7, 0), c(0, 5, 0),
                         c(0, 9, 0), c(0, 10, 0), c(2, 11, 0), c(2, 12, 0), c(2, 1, 0)), 
            lwid = c(1.2, 3.38, 0.62), 
            lhei = c(1.5,0.1,0.1,0.1,0.1,0.1, 0.1, 0.1, 0.1, 3.5))

```
