# Clustering
# Petter Storm 2016-04-06


# Function for calculating distance between each patient and cluster-center and returning cluster with minimum distance
clusters <- function(x, centers) {
  # compute squared euclidean distance from each sample to each cluster center
  tmp <- sapply(seq_len(nrow(x)),
                function(i) apply(centers, 1,
                                  function(v) sum((x[i, ]-v)^2)))
  max.col(-t(tmp))  # find index of min distance
}





#/////////////////////////////////////////////////////////////////////////////
#  Read data
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

# Read clinical variables from file
clusterData<-read.csv("~/Dropbox/DIREVA.clusterdata.20160406.csv")
head(clusterData,n=30)
##     X patid Immigrant diagresult_id sex      BMI HbA1c    gadab
## 1   1     1         0        T2DRIN   2 17.55587  13.8 2.198335
## 2   2     2         0        T2DRIN   1 39.45209   7.9 2.198335
## 3   3     3         0                 2 24.56747  12.3 7.601407
## 4   4     4         0        T1DABS   1 16.25050  12.4 3.806885
## 5   5     5         0         T1DEL   1 29.53704   8.8 7.601407
## 6   6     6         0           SEK   1 24.42659  14.0 7.601407
## 7   7     7         0        T1DABS   1 23.39693   9.4 4.990501
## 8   8     8         0        T1DRIN   2 21.67969   9.0 4.634826
## 9   9     9         0        T1DABS   2 21.87242  14.0 7.601407
## 10 10    10         0                 1 17.53469  14.0 4.219655
## 11 11    11         0        T2DRIN   1 26.83016  11.1 2.198335
## 12 13    13         0        T2DRIN   1 25.35693   5.4 2.198335
## 13 14    14         0        T2DCLC   1 37.94832   7.2 2.198335
## 14 15    15         0        T2DCLC   1 30.45776   8.0 2.198335
## 15 16    16         0        T2DCLC   1 33.39506   8.5 2.198335
## 16 18    18         0        T2DCLC   2 30.71740   5.3 2.198335
## 17 19    19         0        T2DCLC   1 26.85121   6.6 2.198335
## 18 20    20         0        T2DCLC   1 30.42185   6.8 1.388791
## 19 21    21         0        T2DCLC   1 22.87640  10.2 2.198335
## 20 22    22         0        T2DCLC   2 31.24524   6.4 2.198335
## 21 23    23         0        T2DCLC   1 32.44712  10.1 2.198335
## 22 24    25         0        T2DRIN   2 20.14568   5.6 2.198335
## 23 25    26         0        T2DCLC   1 34.77551   5.8 2.198335
## 24 26    27         0        T2DCLC   1 32.07161   6.5 2.198335
## 25 27    28         0        T2DCLC   2 22.98169  13.7 2.198335
## 26 28    29         0           LDA   2 29.07331   9.2 7.601407
## 27 29    30         0           LDA   2 35.94415   6.4 3.091497
## 28 30    31         0        T2DRIN   2 33.57377   5.7 2.198335
## 29 31    32         0        T2DCLC   1 29.11999   6.8 2.198335
## 30 32    33         0         T2YNG   2 35.32124   6.5 2.198335
##    AgeAtDiagnosis     HOMAB     HOMAIR
## 1              67 1.9047619 0.04977778
## 2              57 3.4545455 0.09626667
## 3              41 0.5714286 0.05226667
## 4              25 0.2941176 0.09133333
## 5              40 1.8620690 0.07680000
## 6              39 0.6238532 0.21760000
## 7              33 0.3750000 0.10480000
## 8              24 1.2083333 0.10697778
## 9              25 0.7307692 0.07346667
## 10             23 1.5952381 0.35435556
## 11             64 0.6792453 0.46560000
## 12             45 4.5217391 0.13404444
## 13             48 3.1320755 0.32462222
## 14             54 3.0400000 0.28711111
## 15             56 7.5789474 0.17280000
## 16             58 6.2222222 0.23146667
## 17             65 3.5652174 0.29520000
## 18             58 6.1739130 0.18302222
## 19             55 3.0952381 0.22244444
## 20             71 4.4864865 0.26560000
## 21             74 6.0000000 0.36986667
## 22             69 3.0000000 0.13000000
## 23             69 7.6153846 0.26840000
## 24             78 6.8888889 0.25626667
## 25             62 4.7333333 0.20511111
## 26             56 3.3000000 0.22000000
## 27             60 7.1250000 0.33946667
## 28             79 5.4117647 0.10631111
## 29             77 3.8245614 0.44568889
## 30             35 5.2857143 0.20720000
# read clustercenters
clusterCenters <- read.csv("~/ANDIS/ShinyPanel/klustercenters.csv",row.names = 1)



#/////////////////////////////////////////////////////////////////////////////
# Prepare clinical data
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

# Scale variables
clusterData.scale <- data.frame(scale(clusterData[,colnames(clusterData) %nin% c("patid","Immigrant", "diagresult_id","sex")]))
clusterData.scale$patid <- clusterData$patid

#rearrange columnorder
clusterData.scale<-clusterData.scale[,c("HbA1c","BMI","gadab","AgeAtDiagnosis",      "HOMAB",     "HOMAIR")]

# take a look
head(clusterData.scale,n=30)
##           HbA1c           BMI       gadab AgeAtDiagnosis        HOMAB
## 1   3.106358222 -2.265765e+00  0.07810134     0.55838383 -0.886970814
## 2   0.333550465  1.579389e+00  0.07810134    -0.15389468 -0.403740909
## 3   2.401407097 -1.034470e+00  4.63858433    -1.29354030 -1.302710508
## 4   2.448403839 -2.494999e+00  1.43580355    -2.43318593 -1.389177377
## 5   0.756521140 -1.617746e-01  4.63858433    -1.36476816 -0.900282676
## 6   3.200351705 -1.059211e+00  4.63858433    -1.43599601 -1.286364256
## 7   1.038501590 -1.240027e+00  2.43483892    -1.86336312 -1.363957874
## 8   0.850514623 -1.541589e+00  2.13463052    -2.50441378 -1.104120565
## 9   3.200351705 -1.507743e+00  4.63858433    -2.43318593 -1.253027331
## 10  3.200351705 -2.269483e+00  1.78420364    -2.57564163 -0.983481814
## 11  1.837446198 -6.371234e-01  0.07810134     0.34470028 -1.269092743
## 12 -0.841368075 -8.958353e-01  0.07810134    -1.00862890 -0.070984830
## 13  0.004573274  1.315314e+00  0.07810134    -0.79494535 -0.504288588
## 14  0.380547207 -8.782917e-05  0.07810134    -0.36757824 -0.532998160
## 15  0.615530915  5.157258e-01  0.07810134    -0.22512253  0.882267284
## 16 -0.888364817  4.550665e-02  0.07810134    -0.08266683  0.459233911
## 17 -0.277407176 -6.334275e-01  0.07810134     0.41592813 -0.369232871
## 18 -0.183413693 -6.394373e-03 -0.60519710    -0.08266683  0.444170878
## 19  1.414475523 -1.331436e+00  0.07810134    -0.29635039 -0.515774658
## 20 -0.371400659  1.381991e-01  0.07810134     0.84329524 -0.081976772
## 21  1.367478781  3.492590e-01  0.07810134     1.05697879  0.389943962
## 22 -0.747374592 -1.810973e+00  0.07810134     0.70083953 -0.545470351
## 23 -0.653381109  7.581437e-01  0.07810134     0.70083953  0.893628592
## 24 -0.324403917  2.833165e-01  0.07810134     1.34189020  0.667103758
## 25  3.059361480 -1.312947e+00  0.07810134     0.20224457 -0.005008748
## 26  0.944508107 -2.432085e-01  4.63858433    -0.22512253 -0.451928919
## 27 -0.371400659  9.633655e-01  0.83197786     0.05978887  0.740724329
## 28 -0.700377851  5.471076e-01  0.07810134     1.41311805  0.206529391
## 29 -0.183413693 -2.350114e-01  0.07810134     1.27066234 -0.288368171
## 30 -0.324403917  8.539778e-01  0.07810134    -1.72090741  0.167226268
##         HOMAIR
## 1  -0.95592541
## 2  -0.77806744
## 3  -0.94640338
## 4  -0.79694147
## 5  -0.85254334
## 6  -0.31386834
## 7  -0.74542047
## 8  -0.73708869
## 9  -0.86529606
## 10  0.20933337
## 11  0.63493423
## 12 -0.63353658
## 13  0.09557908
## 14 -0.04793156
## 15 -0.48526493
## 16 -0.26081701
## 17 -0.01698495
## 18 -0.44615658
## 19 -0.29533438
## 20 -0.13022913
## 21  0.26867604
## 22 -0.64900989
## 23 -0.11951684
## 24 -0.16593675
## 25 -0.36164854
## 26 -0.30468638
## 27  0.15237121
## 28 -0.73963924
## 29  0.55875797
## 30 -0.35365683
# give same column names as for cluster centers
colnames(clusterData.scale)<-colnames(clusterCenters)



#/////////////////////////////////////////////////////////////////////////////
# Assign clusters to new patients
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

# Assign clusters to new patients
clusterData.scale$Cluster<-rownames(clusterCenters)[clusters(clusterData.scale,clusterCenters)]

# take a look
head(clusterData.scale,n=30)
##           HBA1C           BMI        GADA AGEATDIAGNOSIS        HOMAB
## 1   3.106358222 -2.265765e+00  0.07810134     0.55838383 -0.886970814
## 2   0.333550465  1.579389e+00  0.07810134    -0.15389468 -0.403740909
## 3   2.401407097 -1.034470e+00  4.63858433    -1.29354030 -1.302710508
## 4   2.448403839 -2.494999e+00  1.43580355    -2.43318593 -1.389177377
## 5   0.756521140 -1.617746e-01  4.63858433    -1.36476816 -0.900282676
## 6   3.200351705 -1.059211e+00  4.63858433    -1.43599601 -1.286364256
## 7   1.038501590 -1.240027e+00  2.43483892    -1.86336312 -1.363957874
## 8   0.850514623 -1.541589e+00  2.13463052    -2.50441378 -1.104120565
## 9   3.200351705 -1.507743e+00  4.63858433    -2.43318593 -1.253027331
## 10  3.200351705 -2.269483e+00  1.78420364    -2.57564163 -0.983481814
## 11  1.837446198 -6.371234e-01  0.07810134     0.34470028 -1.269092743
## 12 -0.841368075 -8.958353e-01  0.07810134    -1.00862890 -0.070984830
## 13  0.004573274  1.315314e+00  0.07810134    -0.79494535 -0.504288588
## 14  0.380547207 -8.782917e-05  0.07810134    -0.36757824 -0.532998160
## 15  0.615530915  5.157258e-01  0.07810134    -0.22512253  0.882267284
## 16 -0.888364817  4.550665e-02  0.07810134    -0.08266683  0.459233911
## 17 -0.277407176 -6.334275e-01  0.07810134     0.41592813 -0.369232871
## 18 -0.183413693 -6.394373e-03 -0.60519710    -0.08266683  0.444170878
## 19  1.414475523 -1.331436e+00  0.07810134    -0.29635039 -0.515774658
## 20 -0.371400659  1.381991e-01  0.07810134     0.84329524 -0.081976772
## 21  1.367478781  3.492590e-01  0.07810134     1.05697879  0.389943962
## 22 -0.747374592 -1.810973e+00  0.07810134     0.70083953 -0.545470351
## 23 -0.653381109  7.581437e-01  0.07810134     0.70083953  0.893628592
## 24 -0.324403917  2.833165e-01  0.07810134     1.34189020  0.667103758
## 25  3.059361480 -1.312947e+00  0.07810134     0.20224457 -0.005008748
## 26  0.944508107 -2.432085e-01  4.63858433    -0.22512253 -0.451928919
## 27 -0.371400659  9.633655e-01  0.83197786     0.05978887  0.740724329
## 28 -0.700377851  5.471076e-01  0.07810134     1.41311805  0.206529391
## 29 -0.183413693 -2.350114e-01  0.07810134     1.27066234 -0.288368171
## 30 -0.324403917  8.539778e-01  0.07810134    -1.72090741  0.167226268
##         HOMAIR Cluster
## 1  -0.95592541  2_SIDD
## 2  -0.77806744   4_MOD
## 3  -0.94640338  1_SAID
## 4  -0.79694147  1_SAID
## 5  -0.85254334  1_SAID
## 6  -0.31386834  1_SAID
## 7  -0.74542047  1_SAID
## 8  -0.73708869  1_SAID
## 9  -0.86529606  1_SAID
## 10  0.20933337  1_SAID
## 11  0.63493423  2_SIDD
## 12 -0.63353658  5_MARD
## 13  0.09557908  2_SIDD
## 14 -0.04793156  2_SIDD
## 15 -0.48526493   4_MOD
## 16 -0.26081701  5_MARD
## 17 -0.01698495  5_MARD
## 18 -0.44615658  5_MARD
## 19 -0.29533438  2_SIDD
## 20 -0.13022913  5_MARD
## 21  0.26867604  2_SIDD
## 22 -0.64900989  5_MARD
## 23 -0.11951684   4_MOD
## 24 -0.16593675  5_MARD
## 25 -0.36164854  2_SIDD
## 26 -0.30468638  1_SAID
## 27  0.15237121   4_MOD
## 28 -0.73963924  5_MARD
## 29  0.55875797  5_MARD
## 30 -0.35365683   4_MOD
#/////////////////////////////////////////////////////////////////////////////
# Compare with old diagnosis 
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

clusterData.scale$patid <-clusterData$patid

clusterData.newold <- merge(clusterData.scale,clusterData,by="patid")

table(Old=clusterData.newold$diagresult_id,New=clusterData.newold$Cluster)
##         New
## Old      1_SAID 2_SIDD 3_SIRD 4_MOD 5_MARD
##              21      8      1     0      2
##   LDA        19      1      3    16     20
##   LDAL        0      1      1     7     13
##   SEK         6     11      1     2     13
##   T1DABS     52      3      0     0      0
##   T1DEL      28      3      0     0     10
##   T1DRIN      6      0      0     0      0
##   T2DCLC      0    132     60   314    495
##   T2DRIN      0     88      0    14    309
##   T2YNG       0      5      7    16      3