TEMA 1 CLUSTER

Se agregan las librerias necesarias para realizar el proceso clustering

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(ggpubr)
## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:cowplot':
## 
##     get_legend
library(cluster)
library(purrr)
library(dplyr)

La base de datos a utilizar es el registro de las ventas de automóviles

db <- read_csv("Auto Sales data.csv", 
    col_types = cols(ORDERDATE = col_skip(), 
        STATUS = col_skip(), PRODUCTLINE = col_skip(), 
        PRODUCTCODE = col_skip(), CUSTOMERNAME = col_skip(), 
        PHONE = col_skip(), ADDRESSLINE1 = col_skip(), 
        CITY = col_skip(), POSTALCODE = col_skip(), 
        CONTACTLASTNAME = col_skip(), CONTACTFIRSTNAME = col_skip(), 
        DEALSIZE = col_skip(),COUNTRY = col_skip()))
View(db)
summary(db)
##   ORDERNUMBER    QUANTITYORDERED   PRICEEACH      ORDERLINENUMBER 
##  Min.   :10100   Min.   : 6.0    Min.   : 26.88   Min.   : 1.000  
##  1st Qu.:10181   1st Qu.:27.0    1st Qu.: 68.75   1st Qu.: 3.000  
##  Median :10264   Median :35.0    Median : 95.55   Median : 6.000  
##  Mean   :10260   Mean   :35.1    Mean   :101.10   Mean   : 6.491  
##  3rd Qu.:10334   3rd Qu.:43.0    3rd Qu.:127.10   3rd Qu.: 9.000  
##  Max.   :10425   Max.   :97.0    Max.   :252.87   Max.   :18.000  
##      SALES         DAYS_SINCE_LASTORDER      MSRP      
##  Min.   :  482.1   Min.   :  42         Min.   : 33.0  
##  1st Qu.: 2204.3   1st Qu.:1077         1st Qu.: 68.0  
##  Median : 3184.8   Median :1761         Median : 99.0  
##  Mean   : 3553.0   Mean   :1757         Mean   :100.7  
##  3rd Qu.: 4503.1   3rd Qu.:2436         3rd Qu.:124.0  
##  Max.   :14082.8   Max.   :3562         Max.   :214.0

N

DB_PRICE = scale(db, center = TRUE, scale = TRUE)
summary(DB_PRICE)
##   ORDERNUMBER       QUANTITYORDERED      PRICEEACH       ORDERLINENUMBER  
##  Min.   :-1.73885   Min.   :-2.98122   Min.   :-1.7653   Min.   :-1.2980  
##  1st Qu.:-0.85725   1st Qu.:-0.83005   1st Qu.:-0.7696   1st Qu.:-0.8252  
##  Median : 0.04613   Median :-0.01055   Median :-0.1320   Median :-0.1161  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.81346   3rd Qu.: 0.80894   3rd Qu.: 0.6184   3rd Qu.: 0.5930  
##  Max.   : 1.79846   Max.   : 6.34052   Max.   : 3.6099   Max.   : 2.7204  
##      SALES         DAYS_SINCE_LASTORDER      MSRP         
##  Min.   :-1.6699   Min.   :-2.093405    Min.   :-1.68745  
##  1st Qu.:-0.7334   1st Qu.:-0.830101    1st Qu.:-0.81495  
##  Median :-0.2002   Median : 0.004777    Median :-0.04217  
##  Mean   : 0.0000   Mean   : 0.000000    Mean   : 0.00000  
##  3rd Qu.: 0.5166   3rd Qu.: 0.829281    3rd Qu.: 0.58104  
##  Max.   : 5.7259   Max.   : 2.203048    Max.   : 2.82460
DB_PRICE = as.data.frame(DB_PRICE)
AUTOS=rownames(DB_PRICE)

creamos 4 cluster en funcion a su precio

?kmeans
## starting httpd help server ... done
kmcluster = kmeans(DB_PRICE,centers=4,nstart = 50)
kmcluster
## K-means clustering with 4 clusters of sizes 839, 549, 840, 519
## 
## Cluster means:
##   ORDERNUMBER QUANTITYORDERED  PRICEEACH ORDERLINENUMBER       SALES
## 1 -0.86999726     -0.08747219 -0.5958556      0.10085656 -0.52789961
## 2  0.11082118      0.97033795  1.0539473     -0.16171905  1.54028532
## 3  0.85349510     -0.09201466 -0.6012092      0.01730381 -0.53710556
## 4 -0.09219463     -0.73609644  0.8214288     -0.01998091  0.09337149
##   DAYS_SINCE_LASTORDER       MSRP
## 1           0.84410654 -0.5997182
## 2          -0.63661781  0.9013419
## 3           0.07118736 -0.5765595
## 4          -0.80635760  0.9492038
## 
## Clustering vector:
##    [1] 4 1 4 1 4 4 2 4 4 4 4 4 2 4 4 2 2 2 2 3 3 3 3 2 4 2 2 2 2 4 2 4 2 2 2 2 2
##   [38] 2 4 2 2 2 2 2 4 4 2 4 4 2 4 4 4 4 4 2 4 4 2 4 2 4 4 2 4 4 2 4 3 3 2 3 2 2
##   [75] 4 2 2 4 4 2 2 2 2 2 2 2 2 4 2 4 2 4 4 4 4 4 2 2 2 2 4 2 4 2 4 4 2 2 4 4 4
##  [112] 4 4 4 4 3 3 4 3 4 4 2 4 2 4 4 4 4 4 4 4 4 4 2 2 4 4 4 2 4 2 2 3 4 4 2 4 3
##  [149] 4 2 4 2 2 2 4 4 4 4 2 4 4 4 2 2 4 4 2 2 4 2 4 4 4 4 2 4 2 2 4 2 2 4 2 2 2
##  [186] 2 2 2 2 2 2 2 4 4 4 2 4 4 4 2 4 4 4 4 4 2 2 2 4 4 2 4 2 2 4 4 2 2 4 4 4 2
##  [223] 2 3 2 2 4 2 4 2 4 2 4 4 2 4 2 2 2 2 4 4 4 4 2 4 2 2 2 4 2 2 2 4 2 4 2 4 4
##  [260] 4 4 2 4 4 4 4 4 2 4 4 2 3 2 4 2 2 4 2 4 2 4 2 4 4 4 4 4 4 2 2 4 4 4 3 2 4
##  [297] 4 3 4 4 2 2 2 2 4 2 2 2 4 4 2 4 2 4 2 4 2 4 2 4 2 2 4 2 4 1 1 1 1 1 1 1 3
##  [334] 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 2 4 4 4 2 4 4 2 4 2 4 2 2 4 2 4 2 2 3 2
##  [371] 2 3 3 3 2 4 4 4 4 4 2 4 4 2 4 4 4 2 4 4 4 4 4 4 3 2 2 2 2 2 4 4 2 4 4 4 4
##  [408] 2 4 4 4 2 4 4 2 2 4 2 2 4 4 3 4 4 3 4 2 4 4 4 4 4 4 2 4 2 4 4 2 4 4 4 2 2
##  [445] 2 2 2 2 2 4 2 4 2 4 2 4 4 2 2 4 4 4 4 4 2 4 2 4 3 3 3 3 3 3 2 3 2 2 1 1 1
##  [482] 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 2 2 2 2 2 4 4 4 4 4 4 2 4 4 4
##  [519] 4 2 2 2 3 3 2 2 4 2 4 4 2 2 2 2 2 4 2 2 2 2 2 2 2 4 4 2 4 3 3 4 2 4 4 2 4
##  [556] 4 4 2 4 2 4 4 2 4 2 2 2 4 2 4 2 2 2 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
##  [593] 3 4 3 3 3 3 4 4 2 4 2 4 4 2 2 2 4 2 4 2 4 4 2 4 4 3 4 3 4 4 3 2 4 4 2 2 2
##  [630] 2 4 4 2 4 2 2 2 2 4 2 4 4 4 2 2 2 2 4 4 4 2 4 1 1 1 1 1 1 1 1 3 3 3 3 3 3
##  [667] 3 3 3 2 3 3 3 4 4 2 2 2 2 2 2 4 2 4 4 4 2 2 2 4 4 3 2 2 3 4 4 2 3 4 4 2 4
##  [704] 4 4 2 4 4 2 2 4 4 4 4 2 3 4 4 4 2 4 4 2 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3
##  [741] 3 3 3 3 2 3 3 3 3 3 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 2 3 4 3 3 3 3 3 1 1 1 1
##  [778] 1 1 1 1 3 3 3 3 3 3 3 3 3 2 3 3 3 2 3 3 4 4 2 2 4 4 4 4 2 2 4 2 4 4 2 4 4
##  [815] 2 4 4 4 4 3 4 2 4 2 4 4 4 2 4 2 4 2 4 2 2 4 4 2 4 2 4 3 3 2 4 2 2 1 1 1 2
##  [852] 1 4 2 1 4 1 2 4 4 2 3 2 2 2 3 3 2 3 2 3 3 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3
##  [889] 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1
##  [926] 1 2 1 4 4 4 2 4 4 3 4 4 4 3 4 3 3 2 2 3 3 3 4 4 2 2 4 2 4 4 2 4 4 4 4 2 2
##  [963] 4 4 4 3 2 2 4 3 2 2 4 2 2 4 4 2 4 4 4 2 2 4 2 4 4 2 2 4 2 2 4 2 2 2 4 4 4
## [1000] 2 4 2 4 4 2 2 4 2 4 4 4 4 2 4 2 3 2 2 4 2 2 2 4 1 1 1 1 1 2 1 1 1 2 1 2 2
## [1037] 2 4 4 3 4 4 3 3 3 4 3 2 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 2 1 1
## [1074] 1 1 2 1 1 1 1 2 4 3 3 3 3 3 3 3 3 3 3 2 3 3 4 4 2 4 4 4 4 4 4 4 2 2 2 4 2
## [1111] 4 2 4 4 2 4 4 2 2 2 4 4 4 2 2 2 2 2 2 2 2 4 4 4 4 2 4 4 3 3 2 3 2 2 1 1 1
## [1148] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 4 3 3 1 1 1 1 4 1 2 2 4 1 2 4 2 2 3 3
## [1185] 3 3 2 4 3 3 3 2 2 2 4 4 4 4 4 2 2 4 2 2 2 2 4 4 2 4 4 2 2 4 4 2 2 2 2 1 1
## [1222] 1 1 1 1 1 1 1 1 3 3 3 3 3 2 3 3 3 3 2 1 1 1 1 1 1 1 1 2 1 1 1 3 3 3 3 2 3
## [1259] 3 3 4 3 3 3 3 3 1 1 2 1 2 2 2 4 4 2 4 2 4 2 2 4 2 4 4 2 2 2 4 3 2 2 3 1 1
## [1296] 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 2 2 4 3 2 3 3 2 4 4 4 4 2 4 2 4 2 4 4 2
## [1333] 4 2 2 2 2 2 4 3 3 2 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 4 3 3 4 3 3 1 1 1 1 1
## [1370] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3
## [1407] 3 2 3 3 3 3 3 3 3 1 1 1 1 2 2 1 1 1 4 4 2 4 4 2 4 2 3 3 2 3 2 2 3 1 1 1 1
## [1444] 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 2 1 1 1 3 3 3 4 4 3 4 3 3
## [1481] 3 3 3 2 3 3 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 2 3 3 2 3 3 1 1 1 1 1 1 1
## [1518] 1 1 1 1 3 3 3 3 3 3 3 3 3 4 3 3 2 3 2 2 2 1 2 1 2 1 1 4 2 2 2 4 4 2 4 4 4
## [1555] 2 3 3 3 2 4 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1 4 4 2 2 4 1
## [1592] 4 2 2 4 2 4 2 2 4 2 4 2 4 2 2 2 4 4 2 2 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3
## [1629] 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 3 3 3 3 3 3 2 3 1 1 1 1 1 1 1
## [1666] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3
## [1703] 3 3 2 3 3 3 3 3 3 2 1 1 1 1 1 4 1 1 4 1 2 2 2 2 2 3 3 4 3 2 2 1 1 1 1 1 1
## [1740] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3
## [1777] 3 3 3 2 3 3 3 2 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1
## [1814] 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3
## [1851] 3 3 3 3 3 3 3 3 1 1 1 1 1 2 2 1 1 1 1 2 2 2 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1
## [1888] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 4 1 4 2 2 2 2 4 2 4 4 4 2 2 4
## [1925] 4 2 4 3 3 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1
## [1962] 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 1 2 1 1 1 1 1 2 1 1 2 2 2 4 2 4 2 3
## [1999] 2 4 4 3 3 3 2 4 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 2 3 3 2 3 2 3 3 3 1 1 1 1 1
## [2036] 1 1 1 1 1 3 3 3 3 3 3 3 3 3 2 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 2
## [2073] 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 2 3 4 3 3 3 1 1 1 1 1
## [2110] 1 1 1 1 1 1 3 3 3 3 2 3 2 3 2 2 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3
## [2147] 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 2 3 3 2 3 3 3 3 3 1 1 1
## [2184] 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 4 3 3 4 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2221] 1 1 3 3 3 2 3 2 3 2 3 3 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1
## [2258] 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 4 3 3 4 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3
## [2295] 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 4 1 1 1 2 3 3 2 4 3 2 3 3
## [2332] 2 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 3 3 2 3 1 1 1 1 1 1 1 1
## [2369] 1 1 1 1 1 3 3 3 3 4 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3
## [2406] 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 2 3 3 3 1 1 1 1 1 1
## [2443] 1 1 1 1 1 1 1 1 1 3 3 3 4 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3
## [2480] 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 2 3 1 1 1
## [2517] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2554] 1 1 1 3 3 3 2 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 2 2 2 2 3 3 2 3 4 3 3 3 2 3 3
## [2591] 2 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 2 3 1 1 1 1 1 1 1 1 1 1 1
## [2628] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3
## [2665] 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1
## [2702] 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2739] 3 3 3 3 3 3 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 2992.672 2809.578 3283.607 2041.726
##  (between_SS / total_SS =  42.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Graficamos los cluster en funcion del precio

DB_PRICE = DB_PRICE %>% mutate(cluster = kmcluster$cluster)

(g1=ggplot(DB_PRICE, aes(x = PRICEEACH, y = QUANTITYORDERED)) +
    geom_point(aes(color=as.factor(cluster)), size=10)+
    geom_text(aes(label = cluster), size = 5) +
    theme_bw() +
    theme(legend.position = "none")+
    labs(title = "Kmenas con k=4") 
)

graficamos sus 2 primeras componentes

fviz_cluster(kmcluster, DB_PRICE)+
  theme_minimal()

Creamos 2 cluster k=2

kmcluster2 = kmeans(DB_PRICE, centers=2, nstart = 50)
DB_PRICE = DB_PRICE %>% mutate(cluster2 = kmcluster2$cluster)

(g2=ggplot(DB_PRICE, aes(x = PRICEEACH, y = QUANTITYORDERED)) +
    geom_point(aes(color=as.factor(cluster2)), size=10)+
    geom_text(aes(label = cluster2), size = 5) +
    theme_bw() +
    theme(legend.position = "none")+
    labs(title = "Kmenas con k=2") 
)

Podemos graficar ambas al mismo tiempo (notese el cambio en las etiquetas)

plot_grid(g1,g2)

##Buscar el numero optimo de Clusters

# creamos una funcion que nos retorne la var.within para cada k
total_within = function(n_clusters, data, iter.max=1000, nstart=50){
  
  cluster_means = kmeans(data,centers = n_clusters,
                         iter.max = iter.max,
                         nstart = nstart)
  return(cluster_means$tot.withinss)
}


# Se aplica esta funci?n con para diferentes valores de k
total_withinss <- map_dbl(.x = 1:15,
                          .f = total_within,
                          data = DB_PRICE)
total_withinss
##  [1] 23259.428 16594.254 13580.049 11246.821 10566.704  9917.628  9329.845
##  [8]  8896.108  8477.753  8080.707  7811.835  7493.261  7264.373  7062.069
## [15]  6870.473
#graficamos la varianza total

data.frame(n_clusters = 1:15, suma_cuadrados_internos = total_withinss) %>%
  ggplot(aes(x = n_clusters, y = suma_cuadrados_internos)) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 1:15) +
  labs(title = "Suma total de cuadrados intra-cluster") +
  theme_bw()

En este punto nos damos cuenta que 4 clusters es el punto donde la varianza deja de incrementarse mucho y por eso escogemos ese punto como el número de clusters ideal

#otro metodo, usando el paquete “factoextra”

matriz_dist=get_dist(DB_PRICE, method = "euclidean")

fviz_nbclust(DB_PRICE, FUNcluster = kmeans, 
             method = "wss", k.max = 15, 
             diss = matriz_dist, nstart = 50)
## Warning: did not converge in 10 iterations