library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(ggpubr)
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:cowplot':
##
## get_legend
library(cluster)
library(purrr)
library(dplyr)
# Import the data: read the clustering CSV into `data` and preview its
# first six rows.
data <- read.csv(
  file = "C:/Users/rusoc/OneDrive/Escritorio/TEC/Mineria de datos/Clustering.csv"
)
head(data, n = 6L)
## X x y
## 1 1 3.3675960 3.536694
## 2 2 2.6678698 4.479919
## 3 3 1.3441712 3.282591
## 4 4 1.3894138 4.683227
## 5 5 1.6446438 4.320822
## 6 6 0.7760274 2.653667
# Show per-column summary statistics for the imported data frame.
summary(object = data)
## X x y
## Min. : 1.00 Min. :-1.428 Min. :-0.4767
## 1st Qu.: 48.25 1st Qu.: 1.713 1st Qu.: 3.1270
## Median : 95.50 Median : 3.462 Median : 4.6977
## Mean : 95.50 Mean : 4.721 Mean : 6.3453
## 3rd Qu.:142.75 3rd Qu.: 7.859 3rd Qu.: 9.8023
## Max. :190.00 Max. :10.675 Max. :12.8944
Quitamos la primera columna, que indica el número de fila
# Drop the first column of `data` and keep the remaining columns.
# drop = FALSE guarantees the result stays a data frame even if only one
# column is left (otherwise `[` would silently collapse it to a vector).
data1 <- data[, -1, drop = FALSE]
Estandarizamos los datos
# Standardize every column of `data1` (scale() centers and scales by
# default) and store the result as a data frame rather than a matrix.
dataestand <- as.data.frame(scale(data1))
Buscamos el número óptimo de clusters
# Ask fviz_nbclust() for the "wss" (within-cluster sum of squares) plot of
# k-means on the standardized data, keep it in `optimo`, then display it.
optimo <- fviz_nbclust(
  x = dataestand,
  FUNcluster = kmeans,
  method = "wss"
)
optimo
Resultados de K-means con k = 2
# Fit k-means with 2 centers on the standardized data and print the fit.
# kmeans() chooses its starting centers at random, so the RNG seed is fixed
# to make the fit reproducible, and several random starts (nstart) are used
# so the result does not depend on a single initialization.
# NOTE(review): cluster labels 1/2 are arbitrary; output recorded from an
# earlier unseeded run may show the two labels swapped.
set.seed(123)
kmdata <- kmeans(dataestand, centers = 2, nstart = 25)
kmdata
## K-means clustering with 2 clusters of sizes 100, 90
##
## Cluster means:
## x y
## 1 -0.8964941 -0.9024881
## 2 0.9961046 1.0027646
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [186] 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 18.70339 17.67945
## (between_SS / total_SS = 90.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
Graficamos
# Scatter plot of the standardized points, colored and labeled by the
# cluster that k-means assigned to each row.
clusterdata <- kmdata$cluster

# Large colored points with the cluster number drawn on top of each one;
# the legend is hidden because the labels already identify the clusters.
dataplot <- ggplot(dataestand, aes(x, y)) +
  geom_point(aes(color = as.factor(clusterdata)), size = 10) +
  geom_text(aes(label = clusterdata), size = 5) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(title = "K-means con k=2")

# Display the plot (the original wrapped the assignment in parentheses to
# print it; an explicit top-level reference does the same).
dataplot
# Draw the k-means result with factoextra: cluster centers shown, "euclid"
# ellipses, star segments enabled, and repelled labels.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
fviz_cluster(
  kmdata,
  data = dataestand,
  show.clust.cent = TRUE,
  ellipse.type = "euclid",
  star.plot = TRUE,
  repel = TRUE
) +
  labs(title = "Resultados clustering K-means") +
  theme_bw()