Informe #4 - Análisis estadístico.

Capítulo 8 HCPC: Hierarchical Clustering on Principal Components

Instalar paquetes y cargar librerías

# Install FactoMineR and factoextra only when they are not already available,
# so that re-running the report does not reinstall them every time.
# The previous call pointed at the plain-HTTP mirror
# http://cran.us.r-project.org, whose package index could not be opened (see
# the warnings recorded below); the HTTPS CRAN cloud mirror is used instead.
required_pkgs <- c("FactoMineR", "factoextra")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, repos = "https://cloud.r-project.org")
}
## Installing packages into 'C:/Users/Lejac/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## Warning: unable to access index for repository http://cran.us.r-project.org/src/contrib:
##   no fue posible abrir la URL 'http://cran.us.r-project.org/src/contrib/PACKAGES'
## Warning: packages 'FactoMineR', 'factoextra' are not available for this version of R
## 
## Versions of these packages for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Warning: unable to access index for repository http://cran.us.r-project.org/bin/windows/contrib/4.3:
##   no fue posible abrir la URL 'http://cran.us.r-project.org/bin/windows/contrib/4.3/PACKAGES'
# Attach the packages used in the rest of the report.
# factoextra: fviz_dend() and fviz_cluster() plotting helpers.
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.3
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# FactoMineR: PCA(), MCA() and HCPC().
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 4.3.3

Case of continuous variables

# ---- HCPC on continuous variables (USArrests) ----
library(FactoMineR)

# Step 1: principal component analysis, keeping 3 components (ncp = 3);
# graph = FALSE suppresses the automatic plots.
res.pca <- PCA(USArrests, ncp = 3, graph = FALSE)

# Step 2: hierarchical clustering computed on the PCA result.
res.hcpc <- HCPC(res.pca, graph = FALSE)

# Step 3: draw the dendrogram of the clustering.
fviz_dend(
  res.hcpc,
  cex = 0.7,                 # label size
  palette = "jco",           # colour palette, see ?ggpubr::ggpar
  rect = TRUE,               # add a rectangle around the groups ...
  rect_fill = TRUE,          # ... and fill it
  rect_border = "jco",       # rectangle colour
  labels_track_height = 0.8  # extra room for the labels
)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Plot the clustering result using the "3D.map" display.
plot(res.hcpc, choice = "3D.map")

# First 10 rows of the original data together with the assigned cluster
# (column `clust`).
head(res.hcpc[["data.clust"]], n = 10)
##             Murder Assault UrbanPop Rape clust
## Alabama       13.2     236       58 21.2     3
## Alaska        10.0     263       48 44.5     4
## Arizona        8.1     294       80 31.0     4
## Arkansas       8.8     190       50 19.5     3
## California     9.0     276       91 40.6     4
## Colorado       7.9     204       78 38.7     4
## Connecticut    3.3     110       77 11.1     2
## Delaware       5.9     238       72 15.8     2
## Florida       15.4     335       80 31.9     4
## Georgia       17.4     211       60 25.8     3
# Description of each cluster by the quantitative variables
# (v.test, mean in cluster vs. overall mean, standard deviations, p-value).
res.hcpc[["desc.var"]][["quanti"]]
## $`1`
##             v.test Mean in category Overall mean sd in category Overall sd
## UrbanPop -3.898420         52.07692       65.540       9.691087  14.329285
## Murder   -4.030171          3.60000        7.788       2.269870   4.311735
## Rape     -4.052061         12.17692       21.232       3.130779   9.272248
## Assault  -4.638172         78.53846      170.760      24.700095  82.500075
##               p.value
## UrbanPop 9.682222e-05
## Murder   5.573624e-05
## Rape     5.076842e-05
## Assault  3.515038e-06
## 
## $`2`
##             v.test Mean in category Overall mean sd in category Overall sd
## UrbanPop  2.793185         73.87500       65.540       8.652131  14.329285
## Murder   -2.374121          5.65625        7.788       1.594902   4.311735
##              p.value
## UrbanPop 0.005219187
## Murder   0.017590794
## 
## $`3`
##             v.test Mean in category Overall mean sd in category Overall sd
## Murder    4.357187          13.9375        7.788       2.433587   4.311735
## Assault   2.698255         243.6250      170.760      46.540137  82.500075
## UrbanPop -2.513667          53.7500       65.540       7.529110  14.329285
##               p.value
## Murder   1.317449e-05
## Assault  6.970399e-03
## UrbanPop 1.194833e-02
## 
## $`4`
##            v.test Mean in category Overall mean sd in category Overall sd
## Rape     5.352124         33.19231       21.232       6.996643   9.272248
## Assault  4.356682        257.38462      170.760      41.850537  82.500075
## UrbanPop 3.028838         76.00000       65.540      10.347798  14.329285
## Murder   2.913295         10.81538        7.788       2.001863   4.311735
##               p.value
## Rape     8.692769e-08
## Assault  1.320491e-05
## UrbanPop 2.454964e-03
## Murder   3.576369e-03
# Description of each cluster by the principal dimensions (Dim.1, Dim.2, ...).
res.hcpc[["desc.axes"]][["quanti"]]
## $`1`
##          v.test Mean in category  Overall mean sd in category Overall sd
## Dim.1 -5.175764        -1.964502 -5.828671e-16      0.6192556   1.574878
##            p.value
## Dim.1 2.269806e-07
## 
## $`2`
##         v.test Mean in category  Overall mean sd in category Overall sd
## Dim.2 3.585635        0.7428712 -4.951595e-16      0.6137936  0.9948694
##            p.value
## Dim.2 0.0003362596
## 
## $`3`
##          v.test Mean in category  Overall mean sd in category Overall sd
## Dim.1  2.058338        1.0610731 -5.828671e-16      0.5146613  1.5748783
## Dim.3  2.028887        0.3965588 -4.163336e-18      0.3714503  0.5971291
## Dim.2 -4.536594       -1.4773302 -4.951595e-16      0.5750284  0.9948694
##            p.value
## Dim.1 3.955769e-02
## Dim.3 4.246985e-02
## Dim.2 5.717010e-06
## 
## $`4`
##         v.test Mean in category  Overall mean sd in category Overall sd
## Dim.1 4.986474         1.892656 -5.828671e-16      0.6126035   1.574878
##            p.value
## Dim.1 6.149115e-07
# `para` component of the description by individuals: five individuals listed
# per cluster with an associated numeric value.
# NOTE(review): presumably the paragons (individuals closest to the cluster
# centre) and their distances - confirm against ?HCPC.
res.hcpc[["desc.ind"]][["para"]]
## Cluster: 1
##         Idaho  South Dakota         Maine          Iowa New Hampshire 
##     0.3674381     0.4993032     0.5012072     0.5533105     0.5891145 
## ------------------------------------------------------------ 
## Cluster: 2
##         Ohio     Oklahoma Pennsylvania       Kansas      Indiana 
##    0.2796100    0.5047549    0.5088363    0.6039091    0.7100820 
## ------------------------------------------------------------ 
## Cluster: 3
##        Alabama South Carolina        Georgia      Tennessee      Louisiana 
##      0.3553460      0.5335189      0.6136865      0.8522640      0.8780872 
## ------------------------------------------------------------ 
## Cluster: 4
##   Michigan    Arizona New Mexico   Maryland      Texas 
##  0.3246254  0.4532480  0.5176322  0.9013514  0.9239792

Case of categorical variables

# ---- HCPC on categorical variables (tea data set) ----
library(FactoMineR)
data("tea")

# Multiple correspondence analysis on the tea data.
res.mca <- MCA(
  tea,
  ncp = 20,           # number of components kept
  quanti.sup = 19,    # quantitative supplementary variable (column index)
  quali.sup = 20:36,  # qualitative supplementary variables (column indices)
  graph = FALSE
)

# Hierarchical clustering on the MCA result. This overwrites the `res.hcpc`
# object of the continuous case.
# NOTE(review): `max = 3` presumably caps the automatically chosen number of
# clusters - confirm against ?HCPC.
res.hcpc <- HCPC(res.mca, graph = FALSE, max = 3)

# Dendrogram without individual labels.
fviz_dend(res.hcpc, show_labels = FALSE)

# Cluster plot of the individuals, drawn as points, titled "Factor map".
fviz_cluster(res.hcpc, geom = "point", main = "Factor map")

# Chi-square test results (p-value, degrees of freedom) for each categorical
# variable.
res.hcpc[["desc.var"]][["test.chi2"]]
##                    p.value df
## where         8.465616e-79  4
## how           3.144675e-47  4
## price         1.862462e-28 10
## tearoom       9.624188e-19  2
## pub           8.539893e-10  2
## friends       6.137618e-08  2
## resto         3.537876e-07  2
## How           3.616532e-06  6
## Tea           1.778330e-03  4
## sex           1.789593e-03  2
## frequency     1.973274e-03  6
## work          3.052988e-03  2
## tea.time      3.679599e-03  2
## lunch         1.052478e-02  2
## dinner        2.234313e-02  2
## always        3.600913e-02  2
## sugar         3.685785e-02  2
## sophisticated 4.077297e-02  2
# Description of each cluster by the categories of the variables
# (Cla/Mod, Mod/Cla, Global, p-value, v.test).
res.hcpc[["desc.var"]][["category"]]
## $`1`
##                               Cla/Mod   Mod/Cla    Global      p.value
## where=chain store           85.937500 93.750000 64.000000 2.094419e-40
## how=tea bag                 84.117647 81.250000 56.666667 1.478564e-25
## tearoom=Not.tearoom         70.661157 97.159091 80.666667 1.082077e-18
## price=p_branded             83.157895 44.886364 31.666667 1.631861e-09
## pub=Not.pub                 67.088608 90.340909 79.000000 1.249296e-08
## friends=Not.friends         76.923077 45.454545 34.666667 2.177180e-06
## resto=Not.resto             64.705882 81.250000 73.666667 4.546462e-04
## price=p_private label       90.476190 10.795455  7.000000 1.343844e-03
## tea.time=Not.tea time       67.938931 50.568182 43.666667 4.174032e-03
## How=alone                   64.102564 71.022727 65.000000 9.868387e-03
## work=Not.work               63.380282 76.704545 71.000000 1.036429e-02
## sugar=sugar                 66.206897 54.545455 48.333333 1.066744e-02
## always=Not.always           63.959391 71.590909 65.666667 1.079912e-02
## price=p_unknown             91.666667  6.250000  4.000000 1.559798e-02
## frequency=1 to 2/week       75.000000 18.750000 14.666667 1.649092e-02
## frequency=1/day             68.421053 36.931818 31.666667 1.958790e-02
## age_Q=15-24                 68.478261 35.795455 30.666667 2.179803e-02
## price=p_cheap              100.000000  3.977273  2.333333 2.274539e-02
## lunch=Not.lunch             61.328125 89.204545 85.333333 2.681490e-02
## SPC=senior                  42.857143  8.522727 11.666667 4.813710e-02
## lunch=lunch                 43.181818 10.795455 14.666667 2.681490e-02
## always=always               48.543689 28.409091 34.333333 1.079912e-02
## sugar=No.sugar              51.612903 45.454545 51.666667 1.066744e-02
## work=work                   47.126437 23.295455 29.000000 1.036429e-02
## tea.time=tea time           51.479290 49.431818 56.333333 4.174032e-03
## How=lemon                   30.303030  5.681818 11.000000 5.943089e-04
## resto=resto                 41.772152 18.750000 26.333333 4.546462e-04
## How=other                    0.000000  0.000000  3.000000 2.952904e-04
## price=p_variable            44.642857 28.409091 37.333333 1.595638e-04
## frequency=+2/day            45.669291 32.954545 42.333333 9.872288e-05
## friends=friends             48.979592 54.545455 65.333333 2.177180e-06
## how=unpackaged              19.444444  3.977273 12.000000 4.328211e-07
## pub=pub                     26.984127  9.659091 21.000000 1.249296e-08
## where=tea shop               6.666667  1.136364 10.000000 4.770573e-10
## price=p_upscale             18.867925  5.681818 17.666667 9.472539e-11
## how=tea bag+unpackaged      27.659574 14.772727 31.333333 1.927326e-13
## tearoom=tearoom              8.620690  2.840909 19.333333 1.082077e-18
## where=chain store+tea shop  11.538462  5.113636 26.000000 1.133459e-23
##                                v.test
## where=chain store           13.307475
## how=tea bag                 10.449142
## tearoom=Not.tearoom          8.826287
## price=p_branded              6.030764
## pub=Not.pub                  5.692859
## friends=Not.friends          4.736242
## resto=Not.resto              3.506146
## price=p_private label        3.206448
## tea.time=Not.tea time        2.864701
## How=alone                    2.580407
## work=Not.work                2.563432
## sugar=sugar                  2.553408
## always=Not.always            2.549133
## price=p_unknown              2.418189
## frequency=1 to 2/week        2.397866
## frequency=1/day              2.334149
## age_Q=15-24                  2.293869
## price=p_cheap                2.277684
## lunch=Not.lunch              2.214202
## SPC=senior                  -1.976156
## lunch=lunch                 -2.214202
## always=always               -2.549133
## sugar=No.sugar              -2.553408
## work=work                   -2.563432
## tea.time=tea time           -2.864701
## How=lemon                   -3.434198
## resto=resto                 -3.506146
## How=other                   -3.619397
## price=p_variable            -3.775692
## frequency=+2/day            -3.893709
## friends=friends             -4.736242
## how=unpackaged              -5.053925
## pub=pub                     -5.692859
## where=tea shop              -6.226471
## price=p_upscale             -6.475138
## how=tea bag+unpackaged      -7.353743
## tearoom=tearoom             -8.826287
## where=chain store+tea shop -10.029275
## 
## $`2`
##                                         Cla/Mod Mod/Cla   Global      p.value
## where=tea shop                        90.000000  84.375 10.00000 3.703402e-30
## how=unpackaged                        66.666667  75.000 12.00000 5.346850e-20
## price=p_upscale                       49.056604  81.250 17.66667 2.392655e-17
## Tea=green                             27.272727  28.125 11.00000 4.436713e-03
## sophisticated=sophisticated           13.488372  90.625 71.66667 8.080918e-03
## sex=M                                 16.393443  62.500 40.66667 9.511848e-03
## resto=Not.resto                       13.122172  90.625 73.66667 1.587879e-02
## dinner=dinner                         28.571429  18.750  7.00000 1.874042e-02
## escape.exoticism=Not.escape-exoticism 14.556962  71.875 52.66667 2.177458e-02
## how=tea bag+unpackaged                 5.319149  15.625 31.33333 3.876799e-02
## escape.exoticism=escape-exoticism      6.338028  28.125 47.33333 2.177458e-02
## dinner=Not.dinner                      9.318996  81.250 93.00000 1.874042e-02
## resto=resto                            3.797468   9.375 26.33333 1.587879e-02
## Tea=Earl Grey                          7.253886  43.750 64.33333 1.314753e-02
## sex=F                                  6.741573  37.500 59.33333 9.511848e-03
## sophisticated=Not.sophisticated        3.529412   9.375 28.33333 8.080918e-03
## where=chain store+tea shop             2.564103   6.250 26.00000 3.794134e-03
## price=p_variable                       3.571429  12.500 37.33333 1.349384e-03
## age_Q=15-24                            2.173913   6.250 30.66667 6.100227e-04
## price=p_branded                        2.105263   6.250 31.66667 4.024289e-04
## how=tea bag                            1.764706   9.375 56.66667 5.537403e-09
## where=chain store                      1.562500   9.375 64.00000 1.664577e-11
##                                          v.test
## where=tea shop                        11.410559
## how=unpackaged                         9.156781
## price=p_upscale                        8.472945
## Tea=green                              2.845318
## sophisticated=sophisticated            2.648670
## sex=M                                  2.593088
## resto=Not.resto                        2.411690
## dinner=dinner                          2.350655
## escape.exoticism=Not.escape-exoticism  2.294277
## how=tea bag+unpackaged                -2.066641
## escape.exoticism=escape-exoticism     -2.294277
## dinner=Not.dinner                     -2.350655
## resto=resto                           -2.411690
## Tea=Earl Grey                         -2.479748
## sex=F                                 -2.593088
## sophisticated=Not.sophisticated       -2.648670
## where=chain store+tea shop            -2.894789
## price=p_variable                      -3.205264
## age_Q=15-24                           -3.427119
## price=p_branded                       -3.538486
## how=tea bag                           -5.830161
## where=chain store                     -6.732775
## 
## $`3`
##                               Cla/Mod    Mod/Cla   Global      p.value
## where=chain store+tea shop  85.897436  72.826087 26.00000 5.730651e-34
## how=tea bag+unpackaged      67.021277  68.478261 31.33333 1.382641e-19
## tearoom=tearoom             77.586207  48.913043 19.33333 1.252051e-16
## pub=pub                     63.492063  43.478261 21.00000 1.126679e-09
## friends=friends             41.836735  89.130435 65.33333 1.429181e-09
## price=p_variable            51.785714  63.043478 37.33333 1.572243e-09
## resto=resto                 54.430380  46.739130 26.33333 2.406386e-07
## How=other                  100.000000   9.782609  3.00000 1.807938e-05
## frequency=+2/day            41.732283  57.608696 42.33333 4.237330e-04
## tea.time=tea time           38.461538  70.652174 56.33333 8.453564e-04
## work=work                   44.827586  42.391304 29.00000 9.079377e-04
## sex=F                       37.078652  71.739130 59.33333 3.494245e-03
## lunch=lunch                 50.000000  23.913043 14.66667 3.917102e-03
## How=lemon                   51.515152  18.478261 11.00000 8.747530e-03
## sugar=No.sugar              36.129032  60.869565 51.66667 3.484061e-02
## home=home                   31.615120 100.000000 97.00000 3.506563e-02
## home=Not.home                0.000000   0.000000  3.00000 3.506563e-02
## sugar=sugar                 24.827586  39.130435 48.33333 3.484061e-02
## price=p_private label        9.523810   2.173913  7.00000 2.370629e-02
## how=unpackaged              13.888889   5.434783 12.00000 1.645107e-02
## How=alone                   25.128205  53.260870 65.00000 5.300881e-03
## lunch=Not.lunch             27.343750  76.086957 85.33333 3.917102e-03
## sex=M                       21.311475  28.260870 40.66667 3.494245e-03
## Tea=green                    9.090909   3.260870 11.00000 2.545816e-03
## frequency=1 to 2/week       11.363636   5.434783 14.66667 1.604219e-03
## work=Not.work               24.882629  57.608696 71.00000 9.079377e-04
## tea.time=Not.tea time       20.610687  29.347826 43.66667 8.453564e-04
## where=tea shop               3.333333   1.086957 10.00000 1.466234e-04
## price=p_branded             14.736842  15.217391 31.66667 2.746948e-05
## resto=Not.resto             22.171946  53.260870 73.66667 2.406386e-07
## friends=Not.friends          9.615385  10.869565 34.66667 1.429181e-09
## pub=Not.pub                 21.940928  56.521739 79.00000 1.126679e-09
## how=tea bag                 14.117647  26.086957 56.66667 1.082059e-12
## tearoom=Not.tearoom         19.421488  51.086957 80.66667 1.252051e-16
## where=chain store           12.500000  26.086957 64.00000 1.711522e-19
##                               v.test
## where=chain store+tea shop 12.150084
## how=tea bag+unpackaged      9.053653
## tearoom=tearoom             8.278053
## pub=pub                     6.090345
## friends=friends             6.052158
## price=p_variable            6.036775
## resto=resto                 5.164845
## How=other                   4.287379
## frequency=+2/day            3.524844
## tea.time=tea time           3.337500
## work=work                   3.317602
## sex=F                       2.920541
## lunch=lunch                 2.884762
## How=lemon                   2.621767
## sugar=No.sugar              2.110206
## home=home                   2.107600
## home=Not.home              -2.107600
## sugar=sugar                -2.110206
## price=p_private label      -2.261856
## how=unpackaged             -2.398752
## How=alone                  -2.788157
## lunch=Not.lunch            -2.884762
## sex=M                      -2.920541
## Tea=green                  -3.017842
## frequency=1 to 2/week      -3.155139
## work=Not.work              -3.317602
## tea.time=Not.tea time      -3.337500
## where=tea shop             -3.796720
## price=p_branded            -4.193490
## resto=Not.resto            -5.164845
## friends=Not.friends        -6.052158
## pub=Not.pub                -6.090345
## how=tea bag                -7.119644
## tearoom=Not.tearoom        -8.278053
## where=chain store          -9.030332
# Full description by the principal dimensions: Eta2 / p-value table followed
# by the per-cluster description.
res.hcpc[["desc.axes"]]
## 
## Link between the cluster variable and the quantitative variables
## ================================================================
##              Eta2      P-value
## Dim.2  0.66509105 2.828937e-71
## Dim.1  0.63497903 1.009707e-65
## Dim.4  0.11231020 2.073924e-08
## Dim.14 0.03141943 8.732913e-03
## Dim.6  0.02358138 2.890373e-02
## 
## Description of each cluster by quantitative variables
## =====================================================
## $`1`
##           v.test Mean in category Overall mean sd in category Overall sd
## Dim.6   2.647552       0.03433626 1.608233e-17      0.2655618  0.2671712
## Dim.2  -7.796641      -0.13194656 5.612264e-17      0.1813156  0.3486355
## Dim.1 -12.409741      -0.23196088 1.299886e-17      0.2143767  0.3850642
##            p.value
## Dim.6 8.107689e-03
## Dim.2 6.357699e-15
## Dim.1 2.314001e-35
## 
## $`2`
##           v.test Mean in category  Overall mean sd in category Overall sd
## Dim.2  13.918285       0.81210870  5.612264e-17      0.2340345  0.3486355
## Dim.4   4.350620       0.20342610 -2.717589e-17      0.3700048  0.2793822
## Dim.14  2.909073       0.10749165  2.015604e-17      0.2161509  0.2207818
## Dim.13  2.341566       0.08930402 -4.333917e-18      0.1606616  0.2278809
## Dim.3   2.208179       0.11087544  1.000213e-17      0.2449710  0.3000159
## Dim.11 -2.234447      -0.08934293 -1.678345e-17      0.2066708  0.2389094
##             p.value
## Dim.2  4.905356e-44
## Dim.4  1.357531e-05
## Dim.14 3.625025e-03
## Dim.13 1.920305e-02
## Dim.3  2.723180e-02
## Dim.11 2.545367e-02
## 
## $`3`
##          v.test Mean in category  Overall mean sd in category Overall sd
## Dim.1 13.485906       0.45155993  1.299886e-17      0.2516544  0.3850642
## Dim.6 -2.221728      -0.05161581  1.608233e-17      0.2488566  0.2671712
## Dim.4 -4.725270      -0.11479621 -2.717589e-17      0.2924881  0.2793822
##            p.value
## Dim.1 1.893256e-41
## Dim.6 2.630166e-02
## Dim.4 2.298093e-06
# `para` component of the description by individuals: five individuals listed
# per cluster (identified by row number) with an associated numeric value.
# NOTE(review): presumably the paragons (individuals closest to the cluster
# centre) and their distances - confirm against ?HCPC.
res.hcpc[["desc.ind"]][["para"]]
## Cluster: 1
##       285       152       166       143        71 
## 0.5884476 0.6242123 0.6242123 0.6244176 0.6478185 
## ------------------------------------------------------------ 
## Cluster: 2
##        31        95        53       182       202 
## 0.6620553 0.7442013 0.7610437 0.7948663 0.8154826 
## ------------------------------------------------------------ 
## Cluster: 3
##       172        33       233        18        67 
## 0.7380497 0.7407711 0.7503006 0.7572188 0.7701598