library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(ggpubr)
## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:cowplot':
## 
##     get_legend
library(cluster)
library(purrr)
library(dplyr)
library(tree)
## Warning: package 'tree' was built under R version 4.3.2
#Importar datos
pingui<-read.csv('C:/Users/rusoc/OneDrive/Escritorio/TEC/Mineria de datos/penguins.csv')
head(pingui)
##   culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g    sex
## 1             39.1            18.7               181        3750   MALE
## 2             39.5            17.4               186        3800 FEMALE
## 3             40.3            18.0               195        3250 FEMALE
## 4             36.7            19.3               193        3450 FEMALE
## 5             39.3            20.6               190        3650   MALE
## 6             38.9            17.8               181        3625 FEMALE
summary(pingui)
##  culmen_length_mm culmen_depth_mm flipper_length_mm  body_mass_g  
##  Min.   :32.10    Min.   :13.10   Min.   :-132.0    Min.   :2700  
##  1st Qu.:39.23    1st Qu.:15.60   1st Qu.: 190.0    1st Qu.:3550  
##  Median :44.45    Median :17.30   Median : 197.0    Median :4050  
##  Mean   :43.92    Mean   :17.15   Mean   : 214.0    Mean   :4202  
##  3rd Qu.:48.50    3rd Qu.:18.70   3rd Qu.: 213.8    3rd Qu.:4750  
##  Max.   :59.60    Max.   :21.50   Max.   :5000.0    Max.   :6300  
##      sex           
##  Length:342        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Quitamos el la columna del sexo

pingui2<- pingui[,-5]

Estandarizamos los datos

pinguiestand = scale(pingui2, center = TRUE, scale = TRUE)
pinguiestand = as.data.frame(pinguiestand)

Escogemos 6 clusters ya que es donde hay mas cambio en la varianza.

optimo <- fviz_nbclust(pinguiestand, kmeans, method = "wss")
optimo

Resultados del k-means=6

kmdata<-kmeans(pinguiestand, centers = 6)
kmdata
## K-means clustering with 6 clusters of sizes 65, 58, 82, 73, 63, 1
## 
## Cluster means:
##   culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## 1        0.2910184      -1.4391071      -0.003244533  0.60258480
## 2        1.0655987      -0.7165118       0.029456143  1.63658873
## 3       -1.1416448       0.1630556      -0.102182125 -1.05525506
## 4       -0.6827277       1.0333291      -0.099473913 -0.19947959
## 5        1.0013509       0.7103522      -0.067067346 -0.52472053
## 6       -0.3520286       1.5438733      18.368211019  0.06016004
## 
## Clustering vector:
##   [1] 4 3 3 3 4 3 4 3 6 3 3 3 4 4 3 4 4 3 4 3 3 4 4 3 4 4 3 3 4 3 3 3 4 3 4 4 4
##  [38] 3 4 3 4 3 4 3 4 4 3 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 5 3
##  [75] 4 3 4 3 4 3 4 4 4 3 4 4 3 4 4 3 4 3 4 3 4 4 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4
## [112] 3 4 4 4 3 4 3 3 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 3 3 3 3 4 4 3 3
## [149] 3 3 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 3 5 3 5 5 5 5 5 5 5 3 5 3 5
## [186] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 3 5 5 5 5 5 5 5 5 5 3 5 5 5 1 2 1
## [223] 2 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 1 1 2 1 2 2 1 1 2 2 2 1 2 1 2
## [260] 1 2 1 1 2 1 1 2 1 2 1 2 1 2 1 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 1 2 1
## [297] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 2 1 2 1 2 1 2 1 2 2 2 1 2
## [334] 1 2 1 2 1 1 2 1 2
## 
## Within cluster sum of squares by cluster:
## [1] 21.29938 28.78220 37.22916 44.01601 48.59824  0.00000
##  (between_SS / total_SS =  86.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
penguins_clustered <- as.data.frame(pinguiestand)
penguins_clustered$cluster <- as.factor(kmdata$cluster)
clusterdata <- kmdata$cluster

(dataplot = ggplot(pinguiestand, aes(x = culmen_length_mm, y = body_mass_g)) +
    geom_point(aes(color = as.factor(clusterdata)), size = 6) +
    geom_text(aes(label = clusterdata), vjust = -1, size = 2) +  
    theme_bw() +
    theme(legend.position = "none") +
    labs(title = "K-means con k=6")
)

(dataplot = ggplot(pinguiestand, aes(x = culmen_depth_mm, y = body_mass_g)) +
    geom_point(aes(color = as.factor(clusterdata)), size = 6) +
    geom_text(aes(label = clusterdata), vjust = -1, size = 2) +  
    theme_bw() +
    theme(legend.position = "none") +
    labs(title = "K-means con k=6")
)

(dataplot = ggplot(pinguiestand, aes(x = flipper_length_mm, y = body_mass_g)) +
    geom_point(aes(color = as.factor(clusterdata)), size = 6) +
    geom_text(aes(label = clusterdata), vjust = -1, size = 2) +  
    theme_bw() +
    theme(legend.position = "none") +
    labs(title = "K-means con k=6")
)