CLUSTERING

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(factoextra)

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

library(cowplot)

## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp

library(ggpubr)

## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:cowplot':
## 
##     get_legend

library(cluster)
library(purrr)
library(dplyr)

# Import the data.
# NOTE(review): absolute, machine-specific path; the script only runs as-is on
# the author's computer.
data <- read.csv(
  file = "C:/Users/rusoc/OneDrive/Escritorio/TEC/Mineria de datos/Clustering.csv"
)

# Preview the first rows to check how the columns were parsed.
head(x = data)

##   X         x        y
## 1 1 3.3675960 3.536694
## 2 2 2.6678698 4.479919
## 3 3 1.3441712 3.282591
## 4 4 1.3894138 4.683227
## 5 5 1.6446438 4.320822
## 6 6 0.7760274 2.653667

# Per-column descriptive statistics of the raw data.
summary(object = data)

##        X                x                y          
##  Min.   :  1.00   Min.   :-1.428   Min.   :-0.4767  
##  1st Qu.: 48.25   1st Qu.: 1.713   1st Qu.: 3.1270  
##  Median : 95.50   Median : 3.462   Median : 4.6977  
##  Mean   : 95.50   Mean   : 4.721   Mean   : 6.3453  
##  3rd Qu.:142.75   3rd Qu.: 7.859   3rd Qu.: 9.8023  
##  Max.   :190.00   Max.   :10.675   Max.   :12.8944

Quitamos la primera columna, que solo indica el número de fila

# Drop the first column (X, the row index written by the CSV export) so that
# only the x and y coordinates take part in the clustering.
# drop = FALSE guarantees the result stays a data frame even if only one
# column were left after removing the index.
data1 <- data[, -1, drop = FALSE]

Estandarizamos los datos

# Standardize every column (subtract its mean, divide by its standard
# deviation) and convert the matrix returned by scale() back to a data frame.
dataestand <- as.data.frame(
  scale(data1, center = TRUE, scale = TRUE)
)

Buscamos el número óptimo de clusters

# Elbow method: plot the total within-cluster sum of squares ("wss") against
# the number of clusters, using k-means as the partitioning function.
optimo <- fviz_nbclust(
  x = dataestand,
  FUNcluster = kmeans,
  method = "wss"
)
optimo

Resultados de K-means con k = 2

# Fit k-means with k = 2 on the standardized data.
# k-means starts from randomly chosen centres, so the seed is fixed to make
# the partition (and the cluster numbering) reproducible between runs, and
# nstart = 25 keeps the best of 25 random initialisations instead of relying
# on a single, possibly poor, starting point.
set.seed(123)
kmdata <- kmeans(dataestand, centers = 2, nstart = 25)
kmdata

## K-means clustering with 2 clusters of sizes 100, 90
## 
## Cluster means:
##            x          y
## 1 -0.8964941 -0.9024881
## 2  0.9961046  1.0027646
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [186] 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 18.70339 17.67945
##  (between_SS / total_SS =  90.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

Graficamos

# Cluster assignment of every observation, in row order.
clusterdata <- kmdata$cluster

# Scatter plot of the standardized data: each point is coloured by its cluster
# and labelled with the cluster number. The outer parentheses make R print the
# plot in addition to storing it in `dataplot`.
(dataplot <- ggplot(dataestand, aes(x, y)) +
    geom_point(aes(color = as.factor(clusterdata)), size = 10) +
    geom_text(aes(label = clusterdata), size = 5) +
    theme_bw() +
    theme(legend.position = "none") +
    labs(title = "K-means con k=2")
)

# factoextra cluster plot of the k-means result: shows the cluster centres,
# draws a Euclidean ellipse around each cluster, joins every point to its
# centre (star plot) and repels labels so they do not overlap.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
fviz_cluster(kmdata, data = dataestand, show.clust.cent = TRUE,
             ellipse.type = "euclid", star.plot = TRUE, repel = TRUE) +
  labs(title = "Resultados clustering K-means") +
  theme_bw()

CLUSTERING

Adrian Israel Castillo Lara

2023-10-13