# Example
# ---
# Question: Implement the hierarchical clustering algorithm using the USArrests dataset
# ---
# OUR CODE GOES BELOW
#
# Load the built-in USArrests data set
# ---
#
data(USArrests)
# Drop every row that contains a missing (NA) value, should any be
# present in the data
# ---
#
df <- na.omit(object = USArrests)
# Show the first six rows of the cleaned data
# ---
#
head(x = df, n = 6L)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
# Before hierarchical clustering, compute per-variable descriptive statistics.
# vapply() visits the data frame column by column and guarantees exactly one
# numeric value per column, instead of coercing the whole data frame to a
# matrix the way apply() does.
# ---
#
desc_stats <- data.frame(
  Min = vapply(df, min, numeric(1)), # minimum
  Med = vapply(df, median, numeric(1)), # median
  Mean = vapply(df, mean, numeric(1)), # mean
  SD = vapply(df, sd, numeric(1)), # standard deviation
  Max = vapply(df, max, numeric(1)) # maximum
)
# Round to one decimal place for display
desc_stats <- round(desc_stats, 1)
head(desc_stats)
## Min Med Mean SD Max
## Murder 0.8 7.2 7.8 4.4 17.4
## Assault 45.0 159.0 170.8 83.3 337.0
## UrbanPop 32.0 66.0 65.5 14.5 91.0
## Rape 7.3 20.1 21.2 9.4 46.0
# The same kind of summary (plus skew, kurtosis and standard error) can be
# obtained in one call with describe() from the psych package
library(psych)
## Warning: package 'psych' was built under R version 4.1.3
describe(x = df)
## vars n mean sd median trimmed mad min max range skew
## Murder 1 50 7.79 4.36 7.25 7.53 5.41 0.8 17.4 16.6 0.37
## Assault 2 50 170.76 83.34 159.00 168.48 110.45 45.0 337.0 292.0 0.22
## UrbanPop 3 50 65.54 14.47 66.00 65.88 17.79 32.0 91.0 59.0 -0.21
## Rape 4 50 21.23 9.37 20.10 20.36 8.60 7.3 46.0 38.7 0.75
## kurtosis se
## Murder -0.95 0.62
## Assault -1.15 11.79
## UrbanPop -0.87 2.05
## Rape 0.08 1.32
# We note that the variables have very different means and variances.
# This is explained by the fact that the variables are measured in different
# units: Murder, Rape, and Assault are measured as the number of occurrences per 100 000 people,
# and UrbanPop is the percentage of the state's population that lives in an urban area.
# They must be standardized (i.e., scaled) to make them comparable. Recall that
# standardization consists of transforming the variables such that
# they have mean zero and standard deviation one.
# As we don't want the hierarchical clustering result to depend on an arbitrary variable unit,
# we start by scaling the data using the R function scale() as follows
# ---
#
# Center each column on its mean and divide it by its standard deviation;
# scale() returns a numeric matrix, so df is a matrix from here on
df <- scale(x = df, center = TRUE, scale = TRUE)
head(x = df, n = 6L)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
# Hierarchical clustering is carried out with the R function hclust()
# ---
#
# hclust() takes a dissimilarity structure as its first argument, so we first
# build the pairwise Euclidean distances between observations with dist()
# ---
#
d <- dist(x = df, method = "euclidean")
d
## Alabama Alaska Arizona Arkansas California Colorado
## Alaska 2.7037541
## Arizona 2.2935197 2.7006429
## Arkansas 1.2898102 2.8260386 2.7177583
## California 3.2631104 3.0125415 1.3104842 3.7636409
## Colorado 2.6510673 2.3265187 1.3650307 2.8310512 1.2876185
## Connecticut 3.2152975 4.7399125 3.2628575 2.6076395 4.0663898 3.3279920
## Delaware 2.0192927 3.6213633 1.9093696 1.8003239 3.0737852 2.5547456
## Florida 2.2981353 2.9967642 1.7493928 3.3721968 2.0250039 2.4458600
## Georgia 1.1314351 2.8194388 2.7871963 2.2117614 3.3780585 2.8649105
## Hawaii 3.3885300 4.5301340 3.2621208 2.9723097 3.6589083 2.8233524
## Idaho 2.9146623 4.0580555 3.5210071 1.7687255 4.4879436 3.4767685
## Illinois 1.8734993 3.2670626 1.0825512 2.4626424 1.9117469 1.7898322
## Indiana 2.0761411 3.3655952 2.6407486 1.4450503 3.4061273 2.3655622
## Iowa 3.4878952 4.7251910 4.1157513 2.4252661 4.9708591 3.9406898
## Kansas 2.2941096 3.6808173 2.7762838 1.5718411 3.6071725 2.6272281
## Kentucky 1.8475879 3.5440903 3.3567681 1.0598104 4.2463809 3.2274013
## Louisiana 0.7722224 2.9631431 2.2178519 2.0254276 3.0176625 2.6546743
## Maine 3.4851115 4.8322605 4.2961903 2.3621893 5.2699843 4.2713441
## Maryland 1.2896460 2.2777590 1.2117356 2.0582244 2.2312581 1.9667562
## Massachusetts 2.9874810 4.3729925 2.5162281 2.6881270 3.2156499 2.6522793
## Michigan 1.8814771 2.1154937 1.1940906 2.5895050 1.5146739 1.2363108
## Minnesota 3.2314338 4.4266606 3.5388450 2.3300992 4.3123134 3.3283853
## Mississippi 1.2831907 3.2554326 3.4551406 1.9318631 4.4200736 3.8491042
## Missouri 1.6309686 2.5360573 1.5958731 1.6717500 2.2891751 1.3127406
## Montana 2.3317271 3.6575988 3.3270869 1.2290066 4.2494176 3.1845338
## Nebraska 2.6625170 3.9136902 3.1641791 1.7240495 4.0197242 3.0034613
## Nevada 3.1024305 2.3443182 1.9260292 3.7086787 1.1968261 1.3988595
## New Hampshire 3.5619825 4.8650686 4.2430411 2.4949861 5.1270892 4.1126287
## New Jersey 2.6980230 4.1791832 2.1755787 2.7398478 2.7463023 2.3229870
## New Mexico 1.5993970 2.0580889 1.0376848 2.3183196 1.8010201 1.5467439
## New York 2.0723680 3.2903769 1.0725219 2.7478626 1.6787069 1.7363385
## North Carolina 1.6043662 3.2403071 3.1478947 2.0717938 4.2802569 3.8649275
## North Dakota 4.0614988 5.2110254 4.9319844 2.8756492 5.8660699 4.8014019
## Ohio 2.2698519 3.5903348 2.3585705 1.9617104 3.0133425 2.1188236
## Oklahoma 1.9570874 3.3416664 2.2648377 1.4224574 3.1488712 2.2263966
## Oregon 2.3705678 2.6990696 2.0008664 1.8477626 2.6574019 1.5331980
## Pennsylvania 2.5161340 4.1239537 2.9188907 1.9739986 3.7144562 2.8541709
## Rhode Island 3.3951297 5.0629572 3.0570151 3.0883430 3.8883995 3.4810739
## South Carolina 0.9157968 2.5640542 2.7992041 1.7074195 3.7546959 3.2131137
## South Dakota 3.0835587 4.2467198 4.1020099 1.8724822 5.0529153 3.9667318
## Tennessee 0.8407489 2.3362541 2.2989846 1.4254486 3.0119267 2.1972111
## Texas 1.6463225 3.1527905 1.6448574 2.3505545 2.1698156 1.7947199
## Utah 3.0906007 3.9480881 2.5244431 2.6049855 3.0701663 2.2461228
## Vermont 3.9791527 4.8707876 5.1003665 2.7442984 6.0323504 4.8924735
## Virginia 1.4859733 3.0492081 2.3106550 0.9971035 3.2159723 2.2622539
## Washington 2.6481824 3.2715253 2.1399117 2.1313402 2.7746720 1.7897920
## West Virginia 3.1243471 4.5004558 4.4974190 1.9951691 5.4883565 4.4210375
## Wisconsin 3.5047330 4.8711543 3.9425867 2.6102451 4.7354960 3.7846917
## Wyoming 1.8291027 3.4993456 2.6923028 0.9912639 3.7242766 2.8211492
## Connecticut Delaware Florida Georgia Hawaii Idaho
## Alaska
## Arizona
## Arkansas
## California
## Colorado
## Connecticut
## Delaware 1.7568475
## Florida 4.4700701 3.0614170
## Georgia 3.9738227 2.9838715 2.1812958
## Hawaii 1.3843291 2.4748807 4.3596338 3.8105218
## Idaho 1.6354214 2.0382540 4.6999827 3.8005715 2.3658101
## Illinois 2.7400560 1.5584719 1.7711863 2.3135778 2.7329756 3.2728945
## Indiana 1.6147898 1.6973340 3.6150778 2.6924143 1.5460727 1.4923351
## Iowa 1.5470089 2.6068606 5.2682765 4.2517889 2.1564575 0.8584962
## Kansas 1.2280424 1.5510864 3.8424558 3.0071474 1.4648766 1.2103118
## Kentucky 2.3346386 2.2514939 3.9474983 2.4408198 2.5203345 1.6565236
## Louisiana 3.5329409 2.3266996 1.7529677 0.8592544 3.5687157 3.5283772
## Maine 1.8792141 2.6560808 5.3946798 4.3334217 2.7160558 0.8486112
## Maryland 3.4968269 1.9624834 1.4355204 1.8388691 3.6148670 3.4014584
## Massachusetts 0.9468199 1.4382527 3.7753087 3.6706708 1.3276676 2.2201020
## Michigan 3.7037870 2.5165292 1.3357020 1.9185489 3.4123472 3.7775301
## Minnesota 0.9843793 2.1652930 4.7635252 3.9621842 1.4673850 1.0124936
## Mississippi 4.1762631 3.0510628 3.0886673 1.5828594 4.4777223 3.6002946
## Missouri 2.4383227 1.6723281 2.5182466 2.1021909 2.1832480 2.4697182
## Montana 1.8584328 2.0306850 4.2696476 3.0967288 2.2488801 0.8286936
## Nebraska 1.2116949 1.8113430 4.3082894 3.4295510 1.6628657 0.7515014
## Nevada 4.5868149 3.5920897 1.9500388 2.9023041 4.0281974 4.7300228
## New Hampshire 1.6169000 2.6744233 5.3778074 4.3427351 2.3112009 0.9249563
## New Jersey 1.6108823 1.5808719 3.1900596 3.1989350 1.5050500 2.7425260
## New Mexico 3.6233659 2.2271650 1.2965798 1.9015384 3.5506088 3.5883476
## New York 3.0239174 1.8992106 1.5730970 2.3634498 2.9055803 3.5910319
## North Carolina 4.1894604 2.7475286 2.9994188 2.3351307 4.7330517 3.5929592
## North Dakota 2.5099838 3.3615239 6.0356613 4.8596758 3.1974906 1.4144557
## Ohio 1.4443671 1.5838515 3.3897305 2.8043208 1.1494313 1.9647327
## Oklahoma 1.4510623 1.1802929 3.3553471 2.7121515 1.6585736 1.5168111
## Oregon 2.1756954 1.7742778 3.3399718 2.9998878 2.0031861 1.9757247
## Pennsylvania 0.8721491 1.5894850 3.9389869 3.1817981 1.2119256 1.5171866
## Rhode Island 1.0756115 1.6230495 4.2314871 4.1832075 2.0590981 2.4592705
## South Carolina 4.0127954 2.7039667 2.5295912 1.3970074 4.2531214 3.4549959
## South Dakota 2.2397424 2.6722813 5.1015141 3.8729745 2.8044891 0.8070290
## Tennessee 3.2302375 2.3195070 2.3992285 1.0122252 3.0747375 2.9234395
## Texas 2.8734475 2.0031365 1.8537984 1.7575559 2.5901696 3.3172180
## Utah 1.2825907 1.8080931 3.9274528 3.7183994 1.0709720 2.0268663
## Vermont 3.2066152 3.7144653 6.0766416 4.7091538 3.7208347 1.7797462
## Virginia 1.9277004 1.4088230 3.1515587 2.2249559 2.0479238 1.6999289
## Washington 1.6963486 1.6350170 3.5570666 3.3016469 1.5452901 1.8861921
## West Virginia 2.7117590 3.0381601 5.3004067 3.8545331 3.2831874 1.4398440
## Wisconsin 1.0354597 2.4410507 5.1085370 4.2281611 1.6666970 1.2105401
## Wyoming 1.6218573 1.2586225 3.6325811 2.7329062 2.1883414 1.1687896
## Illinois Indiana Iowa Kansas Kentucky Louisiana
## Alaska
## Arizona
## Arkansas
## California
## Colorado
## Connecticut
## Delaware
## Florida
## Georgia
## Hawaii
## Idaho
## Illinois
## Indiana 2.2027081
## Iowa 3.7380070 1.7786548
## Kansas 2.3228505 0.4287712 1.4699265
## Kentucky 2.8478883 1.1790552 1.9426473 1.3020180
## Louisiana 1.6535178 2.4957547 4.0359614 2.7284126 2.4221964
## Maine 3.9342034 2.1029158 0.6457158 1.7913753 1.9925855 4.0901924
## Maryland 1.3429997 2.5430878 4.0642448 2.7400943 2.8229479 1.2739137
## Massachusetts 2.0080982 1.6615695 2.3510287 1.4343401 2.6284451 3.1524549
## Michigan 1.3959090 2.6118471 4.3248636 2.9020920 3.1163494 1.6677999
## Minnesota 3.1558788 1.3184866 0.7644384 0.9745872 1.9333640 3.6905974
## Mississippi 3.0869477 3.0859068 4.1603272 3.2683740 2.3898884 1.6268879
## Missouri 1.3552973 1.2203931 2.9398546 1.5192717 1.9677184 1.8362172
## Montana 2.9659043 1.0033431 1.2403561 0.9170466 0.8523702 2.9444756
## Nebraska 2.7962196 0.8570429 0.9821819 0.5279092 1.4219429 3.1706333
## Nevada 2.3891753 3.5278633 5.2227312 3.8391728 4.1644286 2.8410670
## New Hampshire 3.8490624 1.9278736 0.2058539 1.6084091 2.0093558 4.1168122
## New Jersey 1.4562775 1.7638332 2.9122979 1.7071034 2.6914828 2.6826380
## New Mexico 1.3393276 2.5909993 4.2131394 2.8356373 3.0007332 1.4911656
## New York 0.3502188 2.4628527 4.0411586 2.6096016 3.1213366 1.7495096
## North Carolina 3.0124311 3.3437548 4.2973973 3.4387635 2.8798080 1.9868618
## North Dakota 4.6139615 2.6587932 1.0534375 2.3970805 2.4482563 4.6977846
## Ohio 1.8124981 0.6976320 2.1610242 0.7817000 1.7726720 2.4996969
## Oklahoma 1.8439860 0.5303259 1.9391446 0.5198728 1.4623483 2.3535566
## Oregon 2.0743434 1.1780815 2.4662295 1.3426890 2.1388677 2.7490592
## Pennsylvania 2.3134187 0.8412900 1.5708895 0.5456840 1.5944097 2.8440845
## Rhode Island 2.5057761 2.3335609 2.5453686 2.0087021 3.0457816 3.5648047
## South Carolina 2.6163680 2.8469842 4.1015324 3.0609333 2.4166385 1.3151908
## South Dakota 3.8004708 1.8411735 0.9886706 1.6701106 1.5114990 3.7457555
## Tennessee 1.9478353 1.8100316 3.4176329 2.1533060 1.7489942 1.1298534
## Texas 0.8241352 2.0035762 3.6962443 2.2378289 2.5297839 1.3325285
## Utah 2.2771632 1.4019666 2.1682069 1.2751603 2.5461745 3.3440990
## Vermont 4.8624402 2.8667983 1.7298425 2.7298377 2.3888326 4.6795933
## Virginia 1.8624960 0.6127246 2.1704984 0.8351949 1.0918624 1.9554079
## Washington 2.0612962 1.1405746 2.2502832 1.1579118 2.2630242 2.9705622
## West Virginia 4.1148082 2.2478563 1.5256890 2.1244674 1.5236299 3.7947215
## Wisconsin 3.4790637 1.6806129 0.6318069 1.3242947 2.0950212 3.9559184
## Wyoming 2.2643574 0.8898783 1.7194683 0.7588728 1.0694408 2.3837077
## Maine Maryland Massachusetts Michigan Minnesota
## Alaska
## Arizona
## Arkansas
## California
## Colorado
## Connecticut
## Delaware
## Florida
## Georgia
## Hawaii
## Idaho
## Illinois
## Indiana
## Iowa
## Kansas
## Kentucky
## Louisiana
## Maine
## Maryland 4.1259083
## Massachusetts 2.6920282 2.9743193
## Michigan 4.5333420 1.0800988 3.0576915
## Minnesota 1.2980362 3.6448929 1.6587245 3.7995101
## Mississippi 4.0014591 2.2992240 4.1217248 2.9722824 4.1067600
## Missouri 3.2055955 1.5705755 1.9810531 1.4068840 2.4088795
## Montana 1.3271199 3.0249456 2.2919046 3.3348908 1.2662635
## Nebraska 1.3218907 3.1309065 1.6863806 3.3478988 0.6083415
## Nevada 5.5153139 2.2551337 3.8556049 1.2609417 4.6391114
## New Hampshire 0.4995971 4.1663744 2.4573524 4.4646172 0.9279247
## New Jersey 3.2532459 2.6263456 0.7977642 2.5678440 2.2254151
## New Mexico 4.3460538 0.5353893 3.0274701 0.5782474 3.7377675
## New York 4.2595904 1.4362170 2.2479437 1.2897453 3.4391596
## North Carolina 4.0631653 2.0542355 4.0773401 3.0232021 4.2219622
## North Dakota 0.7305609 4.7423030 3.3446903 5.1171939 1.8065731
## Ohio 2.5455752 2.5061694 1.1567960 2.4459855 1.5216293
## Oklahoma 2.1929825 2.2492942 1.3383233 2.4336743 1.4198434
## Oregon 2.7813372 2.2466329 1.8709252 2.1626274 1.9270100
## Pennsylvania 1.9197571 2.9585539 1.1337883 3.1048542 1.0106613
## Rhode Island 2.7331079 3.4379146 0.9440940 3.7320501 2.0310592
## South Carolina 4.0015575 1.6165582 3.8310425 2.3233363 3.9484630
## South Dakota 0.7812991 3.7991896 2.8925136 4.1744724 1.4990317
## Tennessee 3.5420469 1.5202431 2.9678843 1.5970196 3.1023238
## Texas 3.9386296 1.5431868 2.2593978 1.2888621 3.1438264
## Utah 2.6218087 3.0338001 0.9015809 2.9441421 1.4177147
## Vermont 1.4253680 4.7430576 3.9277625 5.1250778 2.4019924
## Virginia 2.3474650 2.0124420 1.8503795 2.2439957 1.7932233
## Washington 2.6292546 2.5434911 1.3472994 2.4715215 1.5955418
## West Virginia 1.1818120 4.0251562 3.3782752 4.4668346 2.0791705
## Wisconsin 1.1485830 4.0091486 1.8882704 4.2034334 0.4940832
## Wyoming 1.7665064 2.4041294 1.8201580 2.8324573 1.4845967
## Mississippi Missouri Montana Nebraska Nevada
## Alaska
## Arizona
## Arkansas
## California
## Colorado
## Connecticut
## Delaware
## Florida
## Georgia
## Hawaii
## Idaho
## Illinois
## Indiana
## Iowa
## Kansas
## Kentucky
## Louisiana
## Maine
## Maryland
## Massachusetts
## Michigan
## Minnesota
## Mississippi
## Missouri 2.8692946
## Montana 3.0015255 2.0313649
## Nebraska 3.5269565 1.9651798 0.7389936
## Nevada 4.1064793 2.3489003 4.3243112 4.2628916
## New Hampshire 4.1895936 3.0885710 1.3329504 1.1300720 5.3871427
## New Jersey 3.8894324 1.7079555 2.5912431 2.1246377 3.3464214
## New Mexico 2.6557350 1.4579057 3.1915871 3.2494088 1.7234839
## New York 3.2655822 1.5284764 3.2662661 3.0925340 2.1674148
## North Carolina 1.1826891 3.0224849 3.2209267 3.6500186 4.1773437
## North Dakota 4.4753078 3.7811273 1.8291157 1.9038740 6.0519445
## Ohio 3.4148987 1.1327425 1.6436336 1.2654510 3.2930712
## Oklahoma 3.0466140 1.0927654 1.2225315 0.9674809 3.4108696
## Oregon 3.5033774 0.9974171 1.8044622 1.5727910 2.8581280
## Pennsylvania 3.4971746 1.7793568 1.3246445 0.8483058 4.0392694
## Rhode Island 4.3875001 2.7475003 2.6888576 2.1303973 4.6185330
## South Carolina 0.7865674 2.3846001 2.9024302 3.3517226 3.4427701
## South Dakota 3.5355186 2.8862448 0.8857149 1.2591419 5.1416772
## Tennessee 1.8269569 1.2413874 2.2494023 2.5526834 2.6666268
## Texas 2.8431727 1.1654171 2.8298991 2.7568751 2.2765693
## Utah 4.2571173 1.7478909 2.0956369 1.4573012 3.5868975
## Vermont 4.2046660 3.8803394 1.9261350 2.2952287 6.0437845
## Virginia 2.5383053 0.9787310 1.1556682 1.2472262 3.3001850
## Washington 3.8140404 1.2502752 1.8442691 1.3859985 3.1570805
## West Virginia 3.3281129 3.2538044 1.2758193 1.8117833 5.4963193
## Wisconsin 4.2987974 2.8171535 1.4916365 0.9719877 5.0751736
## Wyoming 2.6813279 1.6073860 0.8150071 0.9268202 3.9202716
## New Hampshire New Jersey New Mexico New York North Carolina
## Alaska
## Arizona
## Arkansas
## California
## Colorado
## Connecticut
## Delaware
## Florida
## Georgia
## Hawaii
## Idaho
## Illinois
## Indiana
## Iowa
## Kansas
## Kentucky
## Louisiana
## Maine
## Maryland
## Massachusetts
## Michigan
## Minnesota
## Mississippi
## Missouri
## Montana
## Nebraska
## Nevada
## New Hampshire
## New Jersey 3.0269198
## New Mexico 4.3360809 2.6208087
## New York 4.1586415 1.6344744 1.3324096
## North Carolina 4.3157112 3.9418824 2.5348334 3.2163998
## North Dakota 0.9231894 3.9166205 4.9450519 4.9325292 4.5836787
## Ohio 2.3095495 1.1099823 2.4960904 2.0434995 3.6205693
## Oklahoma 2.0697098 1.4711183 2.3426252 2.1367108 3.1366639
## Oregon 2.6377191 1.9738854 2.1553130 2.2727718 3.5095191
## Pennsylvania 1.6822035 1.4216058 3.0619915 2.5949374 3.6803956
## Rhode Island 2.5813199 1.4668378 3.6032966 2.7682543 4.2185789
## South Carolina 4.1596914 3.5826726 1.9596343 2.7755634 1.0476313
## South Dakota 0.9874611 3.3318222 3.9969513 4.1124693 3.6955387
## Tennessee 3.5298430 2.6339707 1.5528304 2.0847931 2.3374653
## Texas 3.8178258 1.6226525 1.4418241 0.8457697 3.0857436
## Utah 2.3304873 1.3141843 2.9843796 2.4826984 4.2680823
## Vermont 1.6716127 4.4005416 4.9416825 5.1704762 4.3880034
## Virginia 2.2878085 1.8255601 2.1341562 2.1439207 2.7517523
## Washington 2.4214987 1.5759539 2.4796057 2.2747965 3.8055684
## West Virginia 1.4648924 3.7402121 4.2681325 4.4279608 3.5978058
## Wisconsin 0.7155628 2.4671212 4.1327758 3.7687073 4.4429456
## Wyoming 1.7950754 2.0372127 2.6286722 2.5890441 2.7501141
## North Dakota Ohio Oklahoma Oregon Pennsylvania
## Alaska
## Arizona
## Arkansas
## California
## Colorado
## Connecticut
## Delaware
## Florida
## Georgia
## Hawaii
## Idaho
## Illinois
## Indiana
## Iowa
## Kansas
## Kentucky
## Louisiana
## Maine
## Maryland
## Massachusetts
## Michigan
## Minnesota
## Mississippi
## Missouri
## Montana
## Nebraska
## Nevada
## New Hampshire
## New Jersey
## New Mexico
## New York
## North Carolina
## North Dakota
## Ohio 3.1448279
## Oklahoma 2.8246690 0.6483903
## Oregon 3.2862071 1.2407607 1.0734082
## Pennsylvania 2.5555137 0.7781298 0.8180221 1.7293732
## Rhode Island 3.4042300 1.9659747 1.9746699 2.6621371 1.6369255
## South Carolina 4.5104172 3.1289884 2.7470931 3.0134453 3.3429642
## South Dakota 1.0324944 2.4394250 2.0340486 2.4988870 1.9790714
## Tennessee 4.0623149 2.0167804 1.8500296 2.0306758 2.4343114
## Texas 4.5749422 1.6711510 1.8312655 2.1053000 2.2460705
## Utah 3.1738212 1.0154223 1.2372916 1.2825152 1.2529078
## Vermont 0.9824857 3.4825859 3.1010306 3.4262789 3.0270572
## Virginia 2.9443461 0.9774388 0.5646254 1.2664430 1.1769236
## Washington 3.1725909 0.9725013 0.9586525 0.5935343 1.3993323
## West Virginia 1.2716808 2.8650371 2.4631736 3.0349855 2.3799278
## Wisconsin 1.6216339 1.8649801 1.7916829 2.4088700 1.2204658
## Wyoming 2.4170757 1.3086480 0.7366465 1.6013015 1.0684605
## Rhode Island South Carolina South Dakota Tennessee Texas
## Alaska
## Arizona
## Arkansas
## California
## Colorado
## Connecticut
## Delaware
## Florida
## Georgia
## Hawaii
## Idaho
## Illinois
## Indiana
## Iowa
## Kansas
## Kentucky
## Louisiana
## Maine
## Maryland
## Massachusetts
## Michigan
## Minnesota
## Mississippi
## Missouri
## Montana
## Nebraska
## Nevada
## New Hampshire
## New Jersey
## New Mexico
## New York
## North Carolina
## North Dakota
## Ohio
## Oklahoma
## Oregon
## Pennsylvania
## Rhode Island
## South Carolina 4.1861320
## South Dakota 3.1262712 3.5215978
## Tennessee 3.5743861 1.4375120 3.0589938
## Texas 2.8757996 2.4532276 3.7101039 1.4712840
## Utah 1.7565845 3.8912317 2.6823382 2.8678113 2.4039834
## Vermont 4.1104165 4.2668977 1.0856574 3.9356721 4.7444455
## Virginia 2.4330133 2.2636538 2.0316897 1.3514491 1.6921625
## Washington 2.1743525 3.3802314 2.5083824 2.3809584 2.1635337
## West Virginia 3.5400858 3.4651680 0.7108812 3.1707450 3.9586581
## Wisconsin 2.0779526 4.2190973 1.5437375 3.4257189 3.4539515
## Wyoming 2.1726807 2.5059056 1.5644785 1.9298669 2.2564704
## Utah Vermont Virginia Washington West Virginia Wisconsin
## Alaska
## Arizona
## Arkansas
## California
## Colorado
## Connecticut
## Delaware
## Florida
## Georgia
## Hawaii
## Idaho
## Illinois
## Indiana
## Iowa
## Kansas
## Kentucky
## Louisiana
## Maine
## Maryland
## Massachusetts
## Michigan
## Minnesota
## Mississippi
## Missouri
## Montana
## Nebraska
## Nevada
## New Hampshire
## New Jersey
## New Mexico
## New York
## North Carolina
## North Dakota
## Ohio
## Oklahoma
## Oregon
## Pennsylvania
## Rhode Island
## South Carolina
## South Dakota
## Tennessee
## Texas
## Utah
## Vermont 3.6546040
## Virginia 1.7612066 3.0638337
## Washington 0.6940667 3.4804319 1.3809295
## West Virginia 3.2680139 1.0380554 2.3353210 3.0846553
## Wisconsin 1.8082282 2.3518637 2.1266497 2.0637823 2.0308890
## Wyoming 1.8552036 2.6299335 0.7038309 1.5929546 1.8821600 1.7446366
# Run hierarchical clustering on the Euclidean distances using Ward's
# method ("ward.D2")
# ---
#
res.hc <- hclust(d = d, method = "ward.D2")
res.hc
##
## Call:
## hclust(d = d, method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 50
# Finally, draw the resulting dendrogram (small labels, negative hang)
# ---
#
plot(x = res.hc, hang = -1, cex = 0.6)
# For comparison, also compute the pairwise Manhattan distances between
# observations with dist() and keep them in d.dist
# ---
#
d.dist <- dist(x = df, method = "manhattan")
# Preview only the distances among the first six states
# ---
#
as.matrix(d.dist)[seq_len(6), seq_len(6)]
## Alabama Alaska Arizona Arkansas California Colorado
## Alabama 0.000000 4.237162 4.433076 2.296372 5.795338 4.850928
## Alaska 4.237162 0.000000 4.460279 3.958759 3.772656 3.881920
## Arizona 4.433076 4.460279 0.000000 4.709019 2.207509 2.086123
## Arkansas 2.296372 3.958759 4.709019 0.000000 6.163118 4.358911
## California 5.795338 3.772656 2.207509 6.163118 0.000000 2.217477
## Colorado 4.850928 3.881920 2.086123 4.358911 2.217477 0.000000
# Exploring the colored visualization of the Ward's-method clustering created above
# Fit the Ward's-method ("ward.D2") tree on the Euclidean distances d
# ---
#
res.hc <- hclust(d = d, method = "ward.D2")
res.hc
##
## Call:
## hclust(d = d, method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 50
# Draw the dendrogram with base graphics (small labels, negative hang)
# ---
#
plot(x = res.hc, hang = -1, cex = 0.6)
# Cut the tree
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.1.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.1.3
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
#visualization
library("ggplot2")
# Cut the dendrogram into 4 groups and color the labels by group
fviz_dend(x = res.hc, k = 4, cex = 0.5, color_labels_by_k = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Same cut, but leave the labels uncolored and draw a rectangle around each group
fviz_dend(
  x = res.hc, k = 4, cex = 0.5,
  rect = TRUE, color_labels_by_k = FALSE
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Draw the branches of every group in black and change the rectangle
# border colors.
# k = 4 must be given: rectangles are only drawn around groups when the tree
# is actually cut, and rect_border = 2:5 supplies one border color per group.
# k_colors is spelled out with one entry per group so nothing is recycled.
fviz_dend(res.hc, k = 4, rect = TRUE, k_colors = rep("black", 4),
          rect_border = 2:5, rect_lty = 1)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Give each of the 4 groups its own hand-picked color
fviz_dend(
  x = res.hc,
  k = 4,
  k_colors = c("#1B9E77", "#D95F02", "#7570B3", "#E7298A")
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Color labels using k-means clusters.
# k-means starts from randomly chosen centers, so fix the seed and use several
# random starts; otherwise the label coloring can change from run to run.
set.seed(123)
km.clust <- kmeans(df, centers = 4, nstart = 25)$cluster
# km.clust is indexed by res.hc$order so that the colors follow the order in
# which the leaves are drawn
fviz_dend(res.hc, k = 4,
          k_colors = c("blue", "green3", "red", "black"),
          label_cols = km.clust[res.hc$order], cex = 0.6)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# My extra work
# Verify the cluster tree
# After linking the objects in a data set into a hierarchical cluster tree, you might want to assess whether the distances (i.e., heights) in the tree reflect the original distances accurately.
# One way to measure how well the cluster tree generated by the hclust() function reflects your data is to compute the correlation between the cophenetic distances and the original distance data generated by the dist() function. If the clustering is valid, the linking of objects in the cluster tree should have a strong correlation with the distances between objects in the original distance matrix.
# The closer the value of the correlation coefficient is to 1, the more accurately the clustering solution reflects your data. Values above 0.75 are felt to be good. The “average” linkage method appears to produce high values of this statistic. This may be one reason that it is so popular.
# The R base function cophenetic() can be used to compute the cophenetic distances for hierarchical clustering.
# Compute the cophenetic distances implied by the Ward tree
res.coph <- cophenetic(res.hc)
# Correlate them with the distances the tree was actually built from:
# res.hc was fitted on the Euclidean matrix d, so d (not the Manhattan
# matrix d.dist) is the reference to compare against
cor(d, res.coph)
# NOTE: the value recorded earlier (0.694799) was computed against d.dist;
# re-run to refresh it
# Execute the hclust() function again using the average linkage method, on the
# same Euclidean distances so that only the linkage differs. Next, call
# cophenetic() to evaluate the clustering solution.
res.hc2 <- hclust(d, method = "average")
cor(d, cophenetic(res.hc2))
# NOTE: the value recorded earlier (0.7118403) was obtained from the Manhattan
# matrix d.dist; re-run to refresh it
# Observation: the correlation coefficient shows that using a different linkage method creates a tree that represents the original distances slightly better.
#
# Cut the dendrogram into different groups
# One of the problems with hierarchical clustering is that it does not tell us how many clusters there are, or where to cut the dendrogram to form clusters.
# You can cut the hierarchical tree at a given height in order to partition your data into clusters. The R base function cutree() can be used to cut a tree, generated by the hclust() function, into several groups either by specifying the desired number of groups or the cut height. It returns a vector containing the cluster number of each observation.
# Split the tree into 4 groups and look up the group assigned to each state
grp <- cutree(tree = res.hc, k = 4)
head(x = grp, n = 30)
## Alabama Alaska Arizona Arkansas California
## 1 2 2 3 2
## Colorado Connecticut Delaware Florida Georgia
## 2 3 3 2 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 4 2 3 4
## Kansas Kentucky Louisiana Maine Maryland
## 3 3 1 4 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 4 1 3
## Montana Nebraska Nevada New Hampshire New Jersey
## 4 4 2 4 3
# How many states fall into each cluster
table(grp)
## grp
## 1 2 3 4
## 7 12 19 12
# List the states that were assigned to cluster 1
rownames(df)[which(grp == 1)]
## [1] "Alabama" "Georgia" "Louisiana" "Mississippi"
## [5] "North Carolina" "South Carolina" "Tennessee"
# Circular layout of the dendrogram cut into 4 groups
fviz_dend(x = res.hc, k = 4, cex = 0.5, type = "circular")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Phylogenic-tree layout of the same cut
fviz_dend(x = res.hc, k = 4, cex = 0.5, type = "phylogenic")
# More practice
# Reload USArrests and standardize every column
data("USArrests")
df <- scale(x = USArrests, center = TRUE, scale = TRUE)
# Cluster on dist()'s default metric using hclust()'s default linkage
res.hc <- hclust(d = dist(df))
# Plain dendrogram with fviz_dend()'s default settings
fviz_dend(x = res.hc)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Cut into 4 groups and color the labels by group
fviz_dend(x = res.hc, k = 4, cex = 0.5, color_labels_by_k = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Same cut with uncolored labels and a rectangle around each group
fviz_dend(
  x = res.hc, k = 4, cex = 0.5,
  rect = TRUE, color_labels_by_k = FALSE
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Phylogenic-tree layout of the 4-group cut
fviz_dend(x = res.hc, k = 4, cex = 0.5, type = "phylogenic")
# Draw the branches of every group in black and change the rectangle
# border colors.
# k = 4 must be given: rectangles are only drawn around groups when the tree
# is actually cut, and rect_border = 2:5 supplies one border color per group.
# k_colors is spelled out with one entry per group so nothing is recycled.
fviz_dend(res.hc, k = 4, rect = TRUE, k_colors = rep("black", 4),
          rect_border = 2:5, rect_lty = 1)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Give each of the 4 groups its own hand-picked color
fviz_dend(
  x = res.hc,
  k = 4,
  k_colors = c("#1B9E77", "#D95F02", "#7570B3", "#E7298A")
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Color labels using k-means clusters.
# k-means starts from randomly chosen centers, so fix the seed and use several
# random starts; otherwise the label coloring can change from run to run.
set.seed(123)
km.clust <- kmeans(df, centers = 4, nstart = 25)$cluster
# km.clust is indexed by res.hc$order so that the colors follow the order in
# which the leaves are drawn
fviz_dend(res.hc, k = 4,
          k_colors = c("blue", "green3", "red", "black"),
          label_cols = km.clust[res.hc$order], cex = 0.6)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
# Example: hierarchical clustering, both agglomerative and divisive
# Install the cluster package first if it is not already installed
# Load the library
library("cluster")
## Warning: package 'cluster' was built under R version 4.1.3
# 1. Agglomerative Nesting (hierarchical clustering) on the raw USArrests
#    data: standardized inside agnes(), Euclidean metric, Ward linkage
res.agnes <- agnes(
  x = USArrests,
  stand = TRUE,
  metric = "euclidean",
  method = "ward"
)
# 2. DIvisive ANAlysis clustering on the same data: standardized inside
#    diana(), Euclidean metric
res.diana <- diana(
  x = USArrests,
  stand = TRUE,
  metric = "euclidean"
)
# Draw both trees cut into 4 groups so the agglomerative and divisive
# results can be compared
fviz_dend(x = res.agnes, k = 4, cex = 0.6)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
fviz_dend(x = res.diana, k = 4, cex = 0.6)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.