Se agregan las librerias necesarias para realizar el proceso clustering
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.1
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(ggpubr)
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:cowplot':
##
## get_legend
library(cluster)
library(purrr)
library(dplyr)
La base de datos a utilizar es el registro de las ventas de automóviles
db <- read_csv("Auto Sales data.csv",
col_types = cols(ORDERDATE = col_skip(),
STATUS = col_skip(), PRODUCTLINE = col_skip(),
PRODUCTCODE = col_skip(), CUSTOMERNAME = col_skip(),
PHONE = col_skip(), ADDRESSLINE1 = col_skip(),
CITY = col_skip(), POSTALCODE = col_skip(),
CONTACTLASTNAME = col_skip(), CONTACTFIRSTNAME = col_skip(),
DEALSIZE = col_skip(),COUNTRY = col_skip()))
View(db)
summary(db)
## ORDERNUMBER QUANTITYORDERED PRICEEACH ORDERLINENUMBER
## Min. :10100 Min. : 6.0 Min. : 26.88 Min. : 1.000
## 1st Qu.:10181 1st Qu.:27.0 1st Qu.: 68.75 1st Qu.: 3.000
## Median :10264 Median :35.0 Median : 95.55 Median : 6.000
## Mean :10260 Mean :35.1 Mean :101.10 Mean : 6.491
## 3rd Qu.:10334 3rd Qu.:43.0 3rd Qu.:127.10 3rd Qu.: 9.000
## Max. :10425 Max. :97.0 Max. :252.87 Max. :18.000
## SALES DAYS_SINCE_LASTORDER MSRP
## Min. : 482.1 Min. : 42 Min. : 33.0
## 1st Qu.: 2204.3 1st Qu.:1077 1st Qu.: 68.0
## Median : 3184.8 Median :1761 Median : 99.0
## Mean : 3553.0 Mean :1757 Mean :100.7
## 3rd Qu.: 4503.1 3rd Qu.:2436 3rd Qu.:124.0
## Max. :14082.8 Max. :3562 Max. :214.0
N
DB_PRICE = scale(db, center = TRUE, scale = TRUE)
summary(DB_PRICE)
## ORDERNUMBER QUANTITYORDERED PRICEEACH ORDERLINENUMBER
## Min. :-1.73885 Min. :-2.98122 Min. :-1.7653 Min. :-1.2980
## 1st Qu.:-0.85725 1st Qu.:-0.83005 1st Qu.:-0.7696 1st Qu.:-0.8252
## Median : 0.04613 Median :-0.01055 Median :-0.1320 Median :-0.1161
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.81346 3rd Qu.: 0.80894 3rd Qu.: 0.6184 3rd Qu.: 0.5930
## Max. : 1.79846 Max. : 6.34052 Max. : 3.6099 Max. : 2.7204
## SALES DAYS_SINCE_LASTORDER MSRP
## Min. :-1.6699 Min. :-2.093405 Min. :-1.68745
## 1st Qu.:-0.7334 1st Qu.:-0.830101 1st Qu.:-0.81495
## Median :-0.2002 Median : 0.004777 Median :-0.04217
## Mean : 0.0000 Mean : 0.000000 Mean : 0.00000
## 3rd Qu.: 0.5166 3rd Qu.: 0.829281 3rd Qu.: 0.58104
## Max. : 5.7259 Max. : 2.203048 Max. : 2.82460
DB_PRICE = as.data.frame(DB_PRICE)
AUTOS=rownames(DB_PRICE)
creamos 4 cluster en funcion a su precio
?kmeans
## starting httpd help server ... done
kmcluster = kmeans(DB_PRICE,centers=4,nstart = 50)
kmcluster
## K-means clustering with 4 clusters of sizes 839, 549, 840, 519
##
## Cluster means:
## ORDERNUMBER QUANTITYORDERED PRICEEACH ORDERLINENUMBER SALES
## 1 -0.86999726 -0.08747219 -0.5958556 0.10085656 -0.52789961
## 2 0.11082118 0.97033795 1.0539473 -0.16171905 1.54028532
## 3 0.85349510 -0.09201466 -0.6012092 0.01730381 -0.53710556
## 4 -0.09219463 -0.73609644 0.8214288 -0.01998091 0.09337149
## DAYS_SINCE_LASTORDER MSRP
## 1 0.84410654 -0.5997182
## 2 -0.63661781 0.9013419
## 3 0.07118736 -0.5765595
## 4 -0.80635760 0.9492038
##
## Clustering vector:
## [1] 4 1 4 1 4 4 2 4 4 4 4 4 2 4 4 2 2 2 2 3 3 3 3 2 4 2 2 2 2 4 2 4 2 2 2 2 2
## [38] 2 4 2 2 2 2 2 4 4 2 4 4 2 4 4 4 4 4 2 4 4 2 4 2 4 4 2 4 4 2 4 3 3 2 3 2 2
## [75] 4 2 2 4 4 2 2 2 2 2 2 2 2 4 2 4 2 4 4 4 4 4 2 2 2 2 4 2 4 2 4 4 2 2 4 4 4
## [112] 4 4 4 4 3 3 4 3 4 4 2 4 2 4 4 4 4 4 4 4 4 4 2 2 4 4 4 2 4 2 2 3 4 4 2 4 3
## [149] 4 2 4 2 2 2 4 4 4 4 2 4 4 4 2 2 4 4 2 2 4 2 4 4 4 4 2 4 2 2 4 2 2 4 2 2 2
## [186] 2 2 2 2 2 2 2 4 4 4 2 4 4 4 2 4 4 4 4 4 2 2 2 4 4 2 4 2 2 4 4 2 2 4 4 4 2
## [223] 2 3 2 2 4 2 4 2 4 2 4 4 2 4 2 2 2 2 4 4 4 4 2 4 2 2 2 4 2 2 2 4 2 4 2 4 4
## [260] 4 4 2 4 4 4 4 4 2 4 4 2 3 2 4 2 2 4 2 4 2 4 2 4 4 4 4 4 4 2 2 4 4 4 3 2 4
## [297] 4 3 4 4 2 2 2 2 4 2 2 2 4 4 2 4 2 4 2 4 2 4 2 4 2 2 4 2 4 1 1 1 1 1 1 1 3
## [334] 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 2 4 4 4 2 4 4 2 4 2 4 2 2 4 2 4 2 2 3 2
## [371] 2 3 3 3 2 4 4 4 4 4 2 4 4 2 4 4 4 2 4 4 4 4 4 4 3 2 2 2 2 2 4 4 2 4 4 4 4
## [408] 2 4 4 4 2 4 4 2 2 4 2 2 4 4 3 4 4 3 4 2 4 4 4 4 4 4 2 4 2 4 4 2 4 4 4 2 2
## [445] 2 2 2 2 2 4 2 4 2 4 2 4 4 2 2 4 4 4 4 4 2 4 2 4 3 3 3 3 3 3 2 3 2 2 1 1 1
## [482] 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 4 2 2 2 2 2 4 4 4 4 4 4 2 4 4 4
## [519] 4 2 2 2 3 3 2 2 4 2 4 4 2 2 2 2 2 4 2 2 2 2 2 2 2 4 4 2 4 3 3 4 2 4 4 2 4
## [556] 4 4 2 4 2 4 4 2 4 2 2 2 4 2 4 2 2 2 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3
## [593] 3 4 3 3 3 3 4 4 2 4 2 4 4 2 2 2 4 2 4 2 4 4 2 4 4 3 4 3 4 4 3 2 4 4 2 2 2
## [630] 2 4 4 2 4 2 2 2 2 4 2 4 4 4 2 2 2 2 4 4 4 2 4 1 1 1 1 1 1 1 1 3 3 3 3 3 3
## [667] 3 3 3 2 3 3 3 4 4 2 2 2 2 2 2 4 2 4 4 4 2 2 2 4 4 3 2 2 3 4 4 2 3 4 4 2 4
## [704] 4 4 2 4 4 2 2 4 4 4 4 2 3 4 4 4 2 4 4 2 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3
## [741] 3 3 3 3 2 3 3 3 3 3 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 2 3 4 3 3 3 3 3 1 1 1 1
## [778] 1 1 1 1 3 3 3 3 3 3 3 3 3 2 3 3 3 2 3 3 4 4 2 2 4 4 4 4 2 2 4 2 4 4 2 4 4
## [815] 2 4 4 4 4 3 4 2 4 2 4 4 4 2 4 2 4 2 4 2 2 4 4 2 4 2 4 3 3 2 4 2 2 1 1 1 2
## [852] 1 4 2 1 4 1 2 4 4 2 3 2 2 2 3 3 2 3 2 3 3 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3
## [889] 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1
## [926] 1 2 1 4 4 4 2 4 4 3 4 4 4 3 4 3 3 2 2 3 3 3 4 4 2 2 4 2 4 4 2 4 4 4 4 2 2
## [963] 4 4 4 3 2 2 4 3 2 2 4 2 2 4 4 2 4 4 4 2 2 4 2 4 4 2 2 4 2 2 4 2 2 2 4 4 4
## [1000] 2 4 2 4 4 2 2 4 2 4 4 4 4 2 4 2 3 2 2 4 2 2 2 4 1 1 1 1 1 2 1 1 1 2 1 2 2
## [1037] 2 4 4 3 4 4 3 3 3 4 3 2 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 2 1 1
## [1074] 1 1 2 1 1 1 1 2 4 3 3 3 3 3 3 3 3 3 3 2 3 3 4 4 2 4 4 4 4 4 4 4 2 2 2 4 2
## [1111] 4 2 4 4 2 4 4 2 2 2 4 4 4 2 2 2 2 2 2 2 2 4 4 4 4 2 4 4 3 3 2 3 2 2 1 1 1
## [1148] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 4 3 3 1 1 1 1 4 1 2 2 4 1 2 4 2 2 3 3
## [1185] 3 3 2 4 3 3 3 2 2 2 4 4 4 4 4 2 2 4 2 2 2 2 4 4 2 4 4 2 2 4 4 2 2 2 2 1 1
## [1222] 1 1 1 1 1 1 1 1 3 3 3 3 3 2 3 3 3 3 2 1 1 1 1 1 1 1 1 2 1 1 1 3 3 3 3 2 3
## [1259] 3 3 4 3 3 3 3 3 1 1 2 1 2 2 2 4 4 2 4 2 4 2 2 4 2 4 4 2 2 2 4 3 2 2 3 1 1
## [1296] 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 2 2 4 3 2 3 3 2 4 4 4 4 2 4 2 4 2 4 4 2
## [1333] 4 2 2 2 2 2 4 3 3 2 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 4 3 3 4 3 3 1 1 1 1 1
## [1370] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3
## [1407] 3 2 3 3 3 3 3 3 3 1 1 1 1 2 2 1 1 1 4 4 2 4 4 2 4 2 3 3 2 3 2 2 3 1 1 1 1
## [1444] 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 2 1 1 1 3 3 3 4 4 3 4 3 3
## [1481] 3 3 3 2 3 3 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 2 3 3 2 3 3 1 1 1 1 1 1 1
## [1518] 1 1 1 1 3 3 3 3 3 3 3 3 3 4 3 3 2 3 2 2 2 1 2 1 2 1 1 4 2 2 2 4 4 2 4 4 4
## [1555] 2 3 3 3 2 4 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1 4 4 2 2 4 1
## [1592] 4 2 2 4 2 4 2 2 4 2 4 2 4 2 2 2 4 4 2 2 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3
## [1629] 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 3 3 3 3 3 3 2 3 1 1 1 1 1 1 1
## [1666] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3
## [1703] 3 3 2 3 3 3 3 3 3 2 1 1 1 1 1 4 1 1 4 1 2 2 2 2 2 3 3 4 3 2 2 1 1 1 1 1 1
## [1740] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3
## [1777] 3 3 3 2 3 3 3 2 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1
## [1814] 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3
## [1851] 3 3 3 3 3 3 3 3 1 1 1 1 1 2 2 1 1 1 1 2 2 2 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1
## [1888] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 4 1 4 2 2 2 2 4 2 4 4 4 2 2 4
## [1925] 4 2 4 3 3 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1
## [1962] 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 1 2 1 1 1 1 1 2 1 1 2 2 2 4 2 4 2 3
## [1999] 2 4 4 3 3 3 2 4 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 2 3 3 2 3 2 3 3 3 1 1 1 1 1
## [2036] 1 1 1 1 1 3 3 3 3 3 3 3 3 3 2 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 2
## [2073] 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 2 3 4 3 3 3 1 1 1 1 1
## [2110] 1 1 1 1 1 1 3 3 3 3 2 3 2 3 2 2 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3
## [2147] 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 2 3 3 2 3 3 3 3 3 1 1 1
## [2184] 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 4 3 3 4 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2221] 1 1 3 3 3 2 3 2 3 2 3 3 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1
## [2258] 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 4 3 3 4 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3
## [2295] 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 4 1 1 1 2 3 3 2 4 3 2 3 3
## [2332] 2 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 3 3 2 3 1 1 1 1 1 1 1 1
## [2369] 1 1 1 1 1 3 3 3 3 4 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3
## [2406] 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 2 3 3 3 1 1 1 1 1 1
## [2443] 1 1 1 1 1 1 1 1 1 3 3 3 4 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3
## [2480] 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 2 3 1 1 1
## [2517] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2554] 1 1 1 3 3 3 2 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 2 2 2 2 3 3 2 3 4 3 3 3 2 3 3
## [2591] 2 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 2 3 1 1 1 1 1 1 1 1 1 1 1
## [2628] 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3
## [2665] 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1
## [2702] 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [2739] 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 2992.672 2809.578 3283.607 2041.726
## (between_SS / total_SS = 42.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Graficamos los cluster en funcion del precio
DB_PRICE = DB_PRICE %>% mutate(cluster = kmcluster$cluster)
(g1=ggplot(DB_PRICE, aes(x = PRICEEACH, y = QUANTITYORDERED)) +
geom_point(aes(color=as.factor(cluster)), size=10)+
geom_text(aes(label = cluster), size = 5) +
theme_bw() +
theme(legend.position = "none")+
labs(title = "Kmenas con k=4")
)
graficamos sus 2 primeras componentes
fviz_cluster(kmcluster, DB_PRICE)+
theme_minimal()
Creamos 2 cluster k=2
kmcluster2 = kmeans(DB_PRICE, centers=2, nstart = 50)
DB_PRICE = DB_PRICE %>% mutate(cluster2 = kmcluster2$cluster)
(g2=ggplot(DB_PRICE, aes(x = PRICEEACH, y = QUANTITYORDERED)) +
geom_point(aes(color=as.factor(cluster2)), size=10)+
geom_text(aes(label = cluster2), size = 5) +
theme_bw() +
theme(legend.position = "none")+
labs(title = "Kmenas con k=2")
)
Podemos graficar ambas al mismo tiempo (notese el cambio en las
etiquetas)
plot_grid(g1,g2)
##Buscar el numero optimo de Clusters
# creamos una funcion que nos retorne la var.within para cada k
total_within = function(n_clusters, data, iter.max=1000, nstart=50){
cluster_means = kmeans(data,centers = n_clusters,
iter.max = iter.max,
nstart = nstart)
return(cluster_means$tot.withinss)
}
# Se aplica esta funci?n con para diferentes valores de k
total_withinss <- map_dbl(.x = 1:15,
.f = total_within,
data = DB_PRICE)
total_withinss
## [1] 23259.428 16594.254 13580.049 11246.821 10566.704 9917.628 9329.845
## [8] 8896.108 8477.753 8080.707 7811.835 7493.261 7264.373 7062.069
## [15] 6870.473
#graficamos la varianza total
data.frame(n_clusters = 1:15, suma_cuadrados_internos = total_withinss) %>%
ggplot(aes(x = n_clusters, y = suma_cuadrados_internos)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = 1:15) +
labs(title = "Suma total de cuadrados intra-cluster") +
theme_bw()
En este punto nos damos cuenta que 4 clusters es el punto donde la varianza deja de incrementarse mucho y por eso escogemos ese punto como el número de clusters ideal
#otro metodo, usando el paquete “factoextra”
matriz_dist=get_dist(DB_PRICE, method = "euclidean")
fviz_nbclust(DB_PRICE, FUNcluster = kmeans,
method = "wss", k.max = 15,
diss = matriz_dist, nstart = 50)
## Warning: did not converge in 10 iterations