# Clustering
# Petter Storm 2016-04-06


# Function for calculating distance between each patient and cluster-center and returning cluster with minimum distance
clusters <- function(x, centers) {
  # compute squared euclidean distance from each sample to each cluster center
  tmp <- sapply(seq_len(nrow(x)),
                function(i) apply(centers, 1,
                                  function(v) sum((x[i, ]-v)^2)))
  max.col(-t(tmp))  # find index of min distance
}


#/////////////////////////////////////////////////////////////////////////////
#  Read data from file
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

# Read clinical variables from file
clusterData<-read.csv("~/Dropbox/DIREVA.clusterdata.20160406.csv")



# read clustercenters
clusterCenters <- read.csv("https://dl.dropboxusercontent.com/u/178837/clusterCenters.csv",row.names = 1)
clusterCenters
##             HBA1C        BMI       GADA AGEATDIAGNOSIS        HOMAB
## 2_SIDD  1.0952958 -0.1192128 -0.1938179    -0.11298639 -0.772066752
## 1_SAID  0.9090041 -1.5881351  2.0272612    -2.25376103 -1.051204710
## 3_SIRD  0.8468572  0.7584651 -0.1549492     0.01730112 -0.381055879
## 4_MOD  -0.5186486  0.9900636 -0.1806745     0.07170192  1.218962566
## 5_MARD -0.5413320 -0.2456463 -0.1635800     0.42326741  0.004496165
##            HOMAIR
## 2_SIDD  0.2384151
## 1_SAID -0.6339935
## 3_SIRD  2.7455749
## 4_MOD   0.1275066
## 5_MARD -0.4196750
#/////////////////////////////////////////////////////////////////////////////
# Prepare clinical data
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

# Scale variables
clusterData.scale <- data.frame(scale(clusterData[,colnames(clusterData) %nin% c("patid","Immigrant", "diagresult_id","sex")]))
clusterData.scale$patid <- clusterData$patid

#rearrange columnorder
clusterData.scale<-clusterData.scale[,c("HbA1c","BMI","gadab","AgeAtDiagnosis",      "HOMAB",     "HOMAIR")]

# take a look
head(clusterData.scale,n=30)
##           HbA1c           BMI       gadab AgeAtDiagnosis        HOMAB
## 1   3.106358222 -2.265765e+00  0.07810134     0.55838383 -0.886970814
## 2   0.333550465  1.579389e+00  0.07810134    -0.15389468 -0.403740909
## 3   2.401407097 -1.034470e+00  4.63858433    -1.29354030 -1.302710508
## 4   2.448403839 -2.494999e+00  1.43580355    -2.43318593 -1.389177377
## 5   0.756521140 -1.617746e-01  4.63858433    -1.36476816 -0.900282676
## 6   3.200351705 -1.059211e+00  4.63858433    -1.43599601 -1.286364256
## 7   1.038501590 -1.240027e+00  2.43483892    -1.86336312 -1.363957874
## 8   0.850514623 -1.541589e+00  2.13463052    -2.50441378 -1.104120565
## 9   3.200351705 -1.507743e+00  4.63858433    -2.43318593 -1.253027331
## 10  3.200351705 -2.269483e+00  1.78420364    -2.57564163 -0.983481814
## 11  1.837446198 -6.371234e-01  0.07810134     0.34470028 -1.269092743
## 12 -0.841368075 -8.958353e-01  0.07810134    -1.00862890 -0.070984830
## 13  0.004573274  1.315314e+00  0.07810134    -0.79494535 -0.504288588
## 14  0.380547207 -8.782917e-05  0.07810134    -0.36757824 -0.532998160
## 15  0.615530915  5.157258e-01  0.07810134    -0.22512253  0.882267284
## 16 -0.888364817  4.550665e-02  0.07810134    -0.08266683  0.459233911
## 17 -0.277407176 -6.334275e-01  0.07810134     0.41592813 -0.369232871
## 18 -0.183413693 -6.394373e-03 -0.60519710    -0.08266683  0.444170878
## 19  1.414475523 -1.331436e+00  0.07810134    -0.29635039 -0.515774658
## 20 -0.371400659  1.381991e-01  0.07810134     0.84329524 -0.081976772
## 21  1.367478781  3.492590e-01  0.07810134     1.05697879  0.389943962
## 22 -0.747374592 -1.810973e+00  0.07810134     0.70083953 -0.545470351
## 23 -0.653381109  7.581437e-01  0.07810134     0.70083953  0.893628592
## 24 -0.324403917  2.833165e-01  0.07810134     1.34189020  0.667103758
## 25  3.059361480 -1.312947e+00  0.07810134     0.20224457 -0.005008748
## 26  0.944508107 -2.432085e-01  4.63858433    -0.22512253 -0.451928919
## 27 -0.371400659  9.633655e-01  0.83197786     0.05978887  0.740724329
## 28 -0.700377851  5.471076e-01  0.07810134     1.41311805  0.206529391
## 29 -0.183413693 -2.350114e-01  0.07810134     1.27066234 -0.288368171
## 30 -0.324403917  8.539778e-01  0.07810134    -1.72090741  0.167226268
##         HOMAIR
## 1  -0.95592541
## 2  -0.77806744
## 3  -0.94640338
## 4  -0.79694147
## 5  -0.85254334
## 6  -0.31386834
## 7  -0.74542047
## 8  -0.73708869
## 9  -0.86529606
## 10  0.20933337
## 11  0.63493423
## 12 -0.63353658
## 13  0.09557908
## 14 -0.04793156
## 15 -0.48526493
## 16 -0.26081701
## 17 -0.01698495
## 18 -0.44615658
## 19 -0.29533438
## 20 -0.13022913
## 21  0.26867604
## 22 -0.64900989
## 23 -0.11951684
## 24 -0.16593675
## 25 -0.36164854
## 26 -0.30468638
## 27  0.15237121
## 28 -0.73963924
## 29  0.55875797
## 30 -0.35365683
# give same column names as for cluster centers
colnames(clusterData.scale)<-colnames(clusterCenters)



#/////////////////////////////////////////////////////////////////////////////
# Assign clusters to new patients
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

# Assign clusters to new patients
clusterData.scale$Cluster<-rownames(clusterCenters)[clusters(clusterData.scale,clusterCenters)]

# take a look
head(clusterData.scale,n=30)
##           HBA1C           BMI        GADA AGEATDIAGNOSIS        HOMAB
## 1   3.106358222 -2.265765e+00  0.07810134     0.55838383 -0.886970814
## 2   0.333550465  1.579389e+00  0.07810134    -0.15389468 -0.403740909
## 3   2.401407097 -1.034470e+00  4.63858433    -1.29354030 -1.302710508
## 4   2.448403839 -2.494999e+00  1.43580355    -2.43318593 -1.389177377
## 5   0.756521140 -1.617746e-01  4.63858433    -1.36476816 -0.900282676
## 6   3.200351705 -1.059211e+00  4.63858433    -1.43599601 -1.286364256
## 7   1.038501590 -1.240027e+00  2.43483892    -1.86336312 -1.363957874
## 8   0.850514623 -1.541589e+00  2.13463052    -2.50441378 -1.104120565
## 9   3.200351705 -1.507743e+00  4.63858433    -2.43318593 -1.253027331
## 10  3.200351705 -2.269483e+00  1.78420364    -2.57564163 -0.983481814
## 11  1.837446198 -6.371234e-01  0.07810134     0.34470028 -1.269092743
## 12 -0.841368075 -8.958353e-01  0.07810134    -1.00862890 -0.070984830
## 13  0.004573274  1.315314e+00  0.07810134    -0.79494535 -0.504288588
## 14  0.380547207 -8.782917e-05  0.07810134    -0.36757824 -0.532998160
## 15  0.615530915  5.157258e-01  0.07810134    -0.22512253  0.882267284
## 16 -0.888364817  4.550665e-02  0.07810134    -0.08266683  0.459233911
## 17 -0.277407176 -6.334275e-01  0.07810134     0.41592813 -0.369232871
## 18 -0.183413693 -6.394373e-03 -0.60519710    -0.08266683  0.444170878
## 19  1.414475523 -1.331436e+00  0.07810134    -0.29635039 -0.515774658
## 20 -0.371400659  1.381991e-01  0.07810134     0.84329524 -0.081976772
## 21  1.367478781  3.492590e-01  0.07810134     1.05697879  0.389943962
## 22 -0.747374592 -1.810973e+00  0.07810134     0.70083953 -0.545470351
## 23 -0.653381109  7.581437e-01  0.07810134     0.70083953  0.893628592
## 24 -0.324403917  2.833165e-01  0.07810134     1.34189020  0.667103758
## 25  3.059361480 -1.312947e+00  0.07810134     0.20224457 -0.005008748
## 26  0.944508107 -2.432085e-01  4.63858433    -0.22512253 -0.451928919
## 27 -0.371400659  9.633655e-01  0.83197786     0.05978887  0.740724329
## 28 -0.700377851  5.471076e-01  0.07810134     1.41311805  0.206529391
## 29 -0.183413693 -2.350114e-01  0.07810134     1.27066234 -0.288368171
## 30 -0.324403917  8.539778e-01  0.07810134    -1.72090741  0.167226268
##         HOMAIR Cluster
## 1  -0.95592541  2_SIDD
## 2  -0.77806744   4_MOD
## 3  -0.94640338  1_SAID
## 4  -0.79694147  1_SAID
## 5  -0.85254334  1_SAID
## 6  -0.31386834  1_SAID
## 7  -0.74542047  1_SAID
## 8  -0.73708869  1_SAID
## 9  -0.86529606  1_SAID
## 10  0.20933337  1_SAID
## 11  0.63493423  2_SIDD
## 12 -0.63353658  5_MARD
## 13  0.09557908  2_SIDD
## 14 -0.04793156  2_SIDD
## 15 -0.48526493   4_MOD
## 16 -0.26081701  5_MARD
## 17 -0.01698495  5_MARD
## 18 -0.44615658  5_MARD
## 19 -0.29533438  2_SIDD
## 20 -0.13022913  5_MARD
## 21  0.26867604  2_SIDD
## 22 -0.64900989  5_MARD
## 23 -0.11951684   4_MOD
## 24 -0.16593675  5_MARD
## 25 -0.36164854  2_SIDD
## 26 -0.30468638  1_SAID
## 27  0.15237121   4_MOD
## 28 -0.73963924  5_MARD
## 29  0.55875797  5_MARD
## 30 -0.35365683   4_MOD
#/////////////////////////////////////////////////////////////////////////////
# Compare with old diagnosis 
#\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

clusterData.scale$patid <-clusterData$patid

clusterData.newold <- merge(clusterData.scale,clusterData,by="patid")

table(Old=clusterData.newold$diagresult_id,New=clusterData.newold$Cluster)
##         New
## Old      1_SAID 2_SIDD 3_SIRD 4_MOD 5_MARD
##              21      8      1     0      2
##   LDA        19      1      3    16     20
##   LDAL        0      1      1     7     13
##   SEK         6     11      1     2     13
##   T1DABS     52      3      0     0      0
##   T1DEL      28      3      0     0     10
##   T1DRIN      6      0      0     0      0
##   T2DCLC      0    132     60   314    495
##   T2DRIN      0     88      0    14    309
##   T2YNG       0      5      7    16      3