Clustering

Research question:

Classification of patients based on 4 standardized variables.

Standardizing the clustering variables:

library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:psych':
## 
##     describe

## The following objects are masked from 'package:base':
## 
##     format.pval, units

mydata$Age_z <- scale(mydata$Age)
mydata$BloodPressure_z <- scale(mydata$BloodPressure)
mydata$Cholesterol_z <- scale(mydata$Cholesterol)
mydata$MaxHeartRate_z <- scale(mydata$MaxHeartRate)

rcorr(as.matrix(mydata[,c("Age_z", "BloodPressure_z", "Cholesterol_z", "MaxHeartRate_z")]), type = "pearson")

##                 Age_z BloodPressure_z Cholesterol_z MaxHeartRate_z
## Age_z            1.00            0.25         -0.10          -0.38
## BloodPressure_z  0.25            1.00          0.10          -0.11
## Cholesterol_z   -0.10            0.10          1.00           0.24
## MaxHeartRate_z  -0.38           -0.11          0.24           1.00
## 
## n= 918 
## 
## 
## P
##                 Age_z  BloodPressure_z Cholesterol_z MaxHeartRate_z
## Age_z                  0.0000          0.0039        0.0000        
## BloodPressure_z 0.0000                 0.0022        0.0007        
## Cholesterol_z   0.0039 0.0022                        0.0000        
## MaxHeartRate_z  0.0000 0.0007          0.0000

We need to find outliers:

mydata$Dissimilarity <- sqrt(mydata$Age_z^2 + mydata$BloodPressure_z^2 + mydata$Cholesterol_z^2 + mydata$MaxHeartRate_z^2)

head(mydata[order(-mydata$Dissimilarity),], 30)

##     Age Gender BloodPressure Cholesterol MaxHeartRate GenderFactor       Age_z BloodPressure_z Cholesterol_z
## 450  55      M             0           0          155         Male  0.15786784     -7.15109712  -1.817444065
## 400  61      M           200           0           70         Male  0.79395857      3.65144880  -1.817444065
## 366  64      F           200           0          140       Female  1.11200394      3.65144880  -1.817444065
## 760  54      M           192         283          195         Male  0.05185271      3.21934696   0.769768197
## 77   32      M           118         529          130         Male -2.28048000     -0.77759503   3.018723025
## 617  67      F           115         564          160       Female  1.43004931     -0.93963322   3.338696273
## 373  63      M           185           0           98         Male  1.00598882      2.84125786  -1.817444065
## 733  56      F           200         288          133       Female  0.26388296      3.65144880   0.815478661
## 150  54      M           130         603          125         Male  0.05185271     -0.12944227   3.695237892
## 110  39      M           190         241          106         Male -1.53837413      3.11132150   0.385800300
## 242  54      M           200         198          142         Male  0.05185271      3.65144880  -0.007309691
## 830  29      M           130         204          202         Male -2.59852536     -0.12944227   0.047542866
## 497  58      M           132         458           69         Male  0.47591321     -0.02141681   2.369634436
## 361  62      M           160           0           72         Male  0.89997370      1.49093962  -1.817444065
## 403  65      M           145           0           67         Male  1.21801907      0.68074867  -1.817444065
## 295  32      M            95           0          127         Male -2.28048000     -2.01988781  -1.817444065
## 391  51      M           140           0           60         Male -0.26619266      0.41068502  -1.817444065
## 371  60      M           135           0           63         Male  0.68794345      0.14062138  -1.817444065
## 216  30      F           170         237          170       Female -2.49251024      2.03106691   0.349231928
## 329  52      M            95           0           82         Male -0.16017753     -2.01988781  -1.817444065
## 333  38      M           100           0          179         Male -1.64438926     -1.74982416  -1.817444065
## 408  62      M           115           0           72         Male  0.89997370     -0.93963322  -1.817444065
## 315  53      M            80           0          141         Male -0.05416241     -2.83007875  -1.817444065
## 209  28      M           130         132          185         Male -2.70454049     -0.12944227  -0.610687815
## 593  61      M           190         287          150         Male  0.79395857      3.11132150   0.806336568
## 551  55      M           172         260           73         Male  0.15786784      2.13909237   0.559500063
## 124  58      F           180         393          110       Female  0.47591321      2.57119421   1.775398405
## 275  45      F           180         295          180       Female -0.90228340      2.57119421   0.879473311
## 310  57      M            95           0          182         Male  0.36989808     -2.01988781  -1.817444065
## 492  75      M           170         203          108         Male  2.27817030      2.03106691   0.038400773
##     MaxHeartRate_z Dissimilarity
## 450      0.7144695      7.414626
## 400     -2.6240570      4.914491
## 366      0.1253178      4.229473
## 760      2.2855408      4.022825
## 77      -0.2674501      3.871623
## 617      0.9108534      3.860634
## 373     -1.5243071      3.835541
## 733     -0.1496197      3.753679
## 150     -0.4638340      3.726844
## 110     -1.2100928      3.695955
## 242      0.2038713      3.657511
## 830      2.5604783      3.650671
## 497     -2.6633338      3.596591
## 361     -2.5455034      3.579880
## 403     -2.7418874      3.573239
## 295     -0.3852804      3.568204
## 391     -3.0168248      3.555820
## 371     -2.8989945      3.492895
## 216      1.3036212      3.487009
## 329     -2.1527356      3.470299
## 333      1.6571123      3.437299
## 408     -2.5455034      3.387559
## 315      0.1645945      3.367859
## 209      1.8927730      3.359587
## 593      0.5180856      3.351012
## 551     -2.5062267      3.345871
## 124     -1.0529857      3.331419
## 275      1.6963890      3.328117
## 310      1.7749426      3.266542
## 492     -1.1315393      3.255326

hist(mydata$Dissimilarity, 
     breaks = 20, 
     xlab = "Dissimilarity", 
     main = "Histogram of dissimilarities")

Removing the outlier and scaling the data again:

mydata <- mydata [-450,]
mydata$Age_z <- scale(mydata$Age)
mydata$BloodPressure_z <- scale(mydata$BloodPressure)
mydata$Cholesterol_z <- scale(mydata$Cholesterol)
mydata$MaxHeartRate_z <- scale(mydata$MaxHeartRate)

Calculating Euclidian distances and making the dissimilarity matrix:

distance <- get_dist(mydata[c("Age_z", "BloodPressure_z", "Cholesterol_z", "MaxHeartRate_z")], 
                     method = "euclidian")

fviz_dist(distance^2)

We need to do Hopkins statistics now.

H₀: There are no natural groups of objects

H₁: There are natural groups of objects

get_clust_tendency(mydata[c("Age_z", "BloodPressure_z", "Cholesterol_z", "MaxHeartRate_z")],
                   n = nrow(mydata) - 1, 
                   graph = FALSE)

## $hopkins_stat
## [1] 0.7566933
## 
## $plot
## NULL

According to the estimated Hopkins statistics, the data can be clustered because its value is larger than 0.5.

WARD <- mydata[,c("Age_z", "BloodPressure_z", "Cholesterol_z", "MaxHeartRate_z")] %>% 
  get_dist(method = "euclidian") %>% 
  hclust(method = "ward.D2")

WARD

## 
## Call:
## hclust(d = ., method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 917

Finding best number of clusters:

fviz_dend(WARD)

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.

fviz_dend(WARD, 
          k = 2,
          cex = 0.5, 
          palette = "jama",
          color_labels_by_k = TRUE, 
          rect = TRUE)

## Warning in data.frame(xmin = unlist(xleft), ymin = unlist(ybottom), xmax = unlist(xright), : row names were found from a
## short variable and have been discarded

fviz_dend(WARD, 
          k = 3,
          cex = 0.5, 
          palette = "jama",
          color_labels_by_k = TRUE, 
          rect = TRUE)

library(NbClust)

OptNumber <- mydata[c("Age_z", "BloodPressure_z", "Cholesterol_z", "MaxHeartRate_z")] %>%
  NbClust(distance = "euclidean",
          min.nc = 2, max.nc = 10, 
          method = "ward.D2", 
          index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 2 proposed 2 as the best number of clusters 
## * 14 proposed 3 as the best number of clusters 
## * 4 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

According to the results we chose to use 3 clusters. Now we needed to assign each unit to a cluster.

mydata$ClusterWard <- cutree(WARD, 
                             k = 3)
head(mydata)

##   Age Gender BloodPressure Cholesterol MaxHeartRate GenderFactor      Age_z BloodPressure_z Cholesterol_z
## 1  40      M           140         289          172         Male -1.4314252       0.4144005    0.82367657
## 2  49      F           160         180          156       Female -0.4777965       1.5255272   -0.17406862
## 3  37      M           130         283           98         Male -1.7493014      -0.1411628    0.76875481
## 4  48      F           138         214          108       Female -0.5837553       0.3032879    0.13715465
## 5  54      M           150         195          122         Male  0.0519972       0.9699639   -0.03676423
## 6  39      M           120         339          170         Male -1.5373839      -0.6967261    1.28135785
##   MaxHeartRate_z Dissimilarity ClusterWard
## 1      1.3825849      2.193336           1
## 2      0.7543241      1.746216           2
## 3     -1.5231215      2.448821           1
## 4     -1.1304585      1.316294           2
## 5     -0.5807302      1.116366           2
## 6      1.3040523      2.481364           1

Next, calculating positions of initial leaders:

Initial_leaders <- aggregate(mydata[, c("Age_z", "BloodPressure_z", "Cholesterol_z", "MaxHeartRate_z")], 
                            by = list(mydata$ClusterWard), 
                            FUN = mean)

Initial_leaders

##   Group.1      Age_z BloodPressure_z Cholesterol_z MaxHeartRate_z
## 1       1 -0.6326993      -0.3972406     0.4429802      0.5811472
## 2       2  0.6221562       0.5514726     0.3985778     -0.4023958
## 3       3  0.2785297      -0.1357348    -1.8058339     -0.5823099

K means clustering:

K_MEANS <- hkmeans(mydata[, c("Age_z", "BloodPressure_z", "Cholesterol_z", "MaxHeartRate_z")], 
                   k = 3,
                   hc.metric = "euclidean",
                   hc.method = "ward.D2")

K_MEANS

## Hierarchical K-means clustering with 3 clusters of sizes 351, 398, 168
## 
## Cluster means:
##        Age_z BloodPressure_z Cholesterol_z MaxHeartRate_z
## 1 -0.8599725     -0.43698835     0.3097435      0.7372080
## 2  0.5844532      0.41412137     0.4745508     -0.3575634
## 3  0.4121308     -0.06807973    -1.7713763     -0.6931534
## 
## Clustering vector:
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27  28  29  30 
##   1   2   1   2   2   1   1   1   1   1   1   2   1   1   1   1   1   1   2   1   1   1   1   1   1   1   2   2   2   1 
##  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   2   2   2   1   1   1   2   1   1   2   2   2   1   1   1   2   1   1   1   1   2   2   2   1   1   1   1   2   2   2 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90 
##   1   1   1   1   1   1   1   1   2   1   2   1   1   1   2   2   1   1   3   1   1   1   2   2   2   2   2   2   1   2 
##  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   1   1   2   1   2   1   1   3   1   2   2   1   1   1   2   2   1   2   2   2   2   2   1   1   1   1   2   1   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 
##   1   1   1   2   2   2   1   1   1   1   1   1   2   2   2   1   1   1   2   1   2   2   1   1   1   1   1   1   1   2 
## 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 
##   1   3   1   1   1   2   1   2   2   2   2   2   1   1   2   1   2   2   2   2   1   1   1   1   2   1   2   2   1   2 
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 
##   2   1   2   1   2   2   2   1   2   2   2   2   1   2   1   2   1   1   2   2   1   1   1   1   2   2   1   1   1   1 
## 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 
##   1   2   2   1   2   1   1   1   2   1   2   2   1   1   2   1   1   1   1   1   1   1   1   1   2   1   1   2   2   2 
## 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 
##   2   2   2   1   2   1   1   1   1   2   1   1   2   2   2   2   2   1   2   1   1   1   1   3   1   2   2   1   2   1 
## 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 
##   1   1   2   2   1   2   1   2   2   1   2   2   1   1   1   1   2   1   1   1   1   1   1   3   1   3   3   3   3   3 
## 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 
##   3   3   3   3   3   3   3   3   3   1   3   3   1   1   3   3   3   3   3   3   3   3   1   3   3   3   3   3   3   3 
## 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 
##   3   3   1   1   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
## 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   1   3   3   3   3   3   3 
## 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 
##   3   3   3   3   3   1   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   2   1   2   2 
## 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 451 
##   2   3   2   3   3   2   2   3   3   3   3   2   2   1   3   3   3   3   3   3   1   3   3   2   3   2   3   2   2   3 
## 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 
##   3   2   3   2   3   3   1   3   3   2   3   2   3   3   1   3   3   2   1   3   3   3   2   3   3   1   3   2   3   3 
## 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 
##   3   2   3   3   2   1   2   2   2   2   2   3   2   2   2   2   2   2   2   2   2   2   1   2   2   2   1   3   2   2 
## 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 
##   2   1   2   3   3   2   2   3   2   2   2   2   2   2   1   2   2   1   2   2   2   2   2   2   3   3   2   2   2   2 
## 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 
##   3   2   2   2   1   1   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   1   2   1   2   2 
## 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 
##   3   2   2   2   2   2   2   2   2   2   1   2   2   2   2   2   1   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 
##   2   2   2   2   2   2   2   2   1   1   2   2   2   2   2   2   2   2   2   2   2   1   2   2   2   2   1   2   1   2 
## 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 
##   2   1   2   1   2   1   1   1   1   1   1   1   1   2   2   2   1   2   2   1   2   2   1   1   1   2   1   1   2   2 
## 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 
##   1   1   2   2   1   1   2   2   1   1   2   2   2   2   2   1   1   2   2   2   1   2   1   1   2   1   2   2   2   1 
## 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 
##   1   1   1   1   2   1   2   1   2   1   2   2   1   2   2   2   2   2   1   1   2   1   2   1   1   2   1   2   2   1 
## 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 
##   1   2   2   1   2   1   2   1   1   1   1   2   2   2   1   2   2   2   1   2   2   1   1   1   1   2   1   2   2   1 
## 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 
##   2   2   1   1   2   1   1   2   2   2   1   1   1   1   1   1   1   2   1   1   2   1   1   2   1   2   1   2   1   2 
## 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 
##   1   1   1   2   2   2   2   2   1   2   2   1   2   1   1   2   1   1   1   1   2   1   2   2   1   2   1   1   1   1 
## 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 
##   2   1   2   2   2   1   2   1   2   1   1   1   2   1   2   1   1   2   1   1   2   1   1   1   1   2   2   2   1   1 
## 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 
##   2   1   2   1   2   1   1   1   2   2   1   1   1   1   2   1   1   2   2   2   2   2   2   2   2   1   1   1   2   2 
## 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 
##   2   2   2   1   1   2   1   1   1   2   1   2   2   2   2   1   1   1   2   2   2   1   2   2   1   1   2   1   2   2 
## 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 
##   2   2   1   1   2   1   1   2   2   1   2   2   1   2   3   1   1 
## 
## Within cluster sum of squares by cluster:
## [1] 717.1737 959.5077 454.3758
##  (between_SS / total_SS =  41.8 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"        
##  [8] "iter"         "ifault"       "data"         "hclust"

fviz_cluster(K_MEANS, 
             palette = "jama", 
             repel = FALSE,
             ggtheme = theme_classic())

mydata$ClusterK_Means <- K_MEANS$cluster
table(mydata$ClusterWard, mydata$ClusterK_Means)

##    
##       1   2   3
##   1 325  80   2
##   2  14 318   4
##   3  12   0 162

From this table, we can see that following the application of K-MEANS clustering, 80 units were reclassified from cluster 1 to cluster 2, 2 from cluster 1 to cluster 2, 14 from cluster 2 to cluster one etc.

Centroids <- K_MEANS$centers
Centroids

##        Age_z BloodPressure_z Cholesterol_z MaxHeartRate_z
## 1 -0.8599725     -0.43698835     0.3097435      0.7372080
## 2  0.5844532      0.41412137     0.4745508     -0.3575634
## 3  0.4121308     -0.06807973    -1.7713763     -0.6931534

Figure <- as.data.frame(Centroids)
Figure$id <- 1:nrow(Figure)
Figure <- pivot_longer(Figure, cols = c(Age_z, BloodPressure_z, Cholesterol_z, MaxHeartRate_z))

Figure$Groups <- factor(Figure$id, 
                        levels = c(1, 2, 3), 
                        labels = c("1", "2", "3"))

Figure$nameFactor <- factor(Figure$name, 
                            levels = c("Age_z", "BloodPressure_z", "Cholesterol_z", "MaxHeartRate_z"), 
                            labels = c("Age_z", "BloodPressure_z", "Cholesterol_z", "MaxHeartRate_z"))

ggplot(Figure, aes(x = nameFactor, y = value)) +
  geom_hline(yintercept = 0) +
  theme_bw() +
  geom_point(aes(shape = Groups, col = Groups), size = 3) +
  geom_line(aes(group = id), linewidth = 1) +
  ylab("Averages") +
  xlab("Cluster variables")+
  ylim(-2, 2)

From the graph we can conclude that cluster 1 is below average while 2 and 3 are above average with age. It indicates that the results for clusters 2 and 3 lower on points of Blood pressure, cholesterol and maximum heart rate, while in cluster 1 the points are rising in comparison to age.

H₀: All arithmetic means are equal

H₁: At least one arithmetic mean is different

fit <- aov(cbind(Age_z, BloodPressure_z, Cholesterol_z, MaxHeartRate_z) ~ as.factor(ClusterK_Means), data = mydata)

summary(fit)

##  Response 1 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   2 424.07 212.035  393.96 < 2.2e-16 ***
## Residuals                 914 491.93   0.538                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 2 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   2 136.06  68.030  79.724 < 2.2e-16 ***
## Residuals                 914 779.94   0.853                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 3 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   2 650.45  325.23  1119.4 < 2.2e-16 ***
## Residuals                 914 265.55    0.29                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response 4 :
##                            Df Sum Sq Mean Sq F value    Pr(>F)    
## as.factor(ClusterK_Means)   2 322.36 161.181  248.16 < 2.2e-16 ***
## Residuals                 914 593.64   0.649                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From the test we can conduct that with all 4 variables we can reject H0 at p < 0.001. All of the variables employed in the clustering are statistically significant.

Criterion validity:

H₀: All arithmetic means are equal. H₁: At least one arithmetic mean is different.

aggregate(mydata$Age, 
          by = list(mydata$ClusterK_Means), 
          FUN = "mean")

##   Group.1        x
## 1       1 45.39316
## 2       2 59.02513
## 3       3 57.39881

fit <- aov(Age ~ as.factor(ClusterK_Means), 
           data = mydata)

summary(fit)

##                            Df Sum Sq Mean Sq F value Pr(>F)    
## as.factor(ClusterK_Means)   2  37771   18886     394 <2e-16 ***
## Residuals                 914  43816      48                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can reject H₀, at p < 0.001, which means that at least one arithmetic mean is different.

Pearson Chi2 test - checking validity:

H₀: There is no association between categorical variables. H₁: There is some association between categorical variables.

chisq_results <- chisq.test(mydata$GenderFactor, as.factor(mydata$ClusterK_Means))
chisq_results

## 
##  Pearson's Chi-squared test
## 
## data:  mydata$GenderFactor and as.factor(mydata$ClusterK_Means)
## X-squared = 38.497, df = 2, p-value = 4.37e-09

addmargins(chisq_results$observed)

##                    
## mydata$GenderFactor   1   2   3 Sum
##              Male   249 316 159 724
##              Female 102  82   9 193
##              Sum    351 398 168 917

round(chisq_results$expected, 2)

##                    
## mydata$GenderFactor      1      2      3
##              Male   277.13 314.23 132.64
##              Female  73.87  83.77  35.36

round(chisq_results$res, 2)

##                    
## mydata$GenderFactor     1     2     3
##              Male   -1.69  0.10  2.29
##              Female  3.27 -0.19 -4.43

We can reject H0 at p < 0.001. By this we see that there is some correlation between variables. In cluster 2 and 3 the majority of the patients was male.

Homework4_Alisa_Kim_Jagodic

2023-02-16

Clustering

Research question:

Conclusion