library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(ggpubr)
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:cowplot':
##
## get_legend
library(cluster)
library(purrr)
library(dplyr)
library(tree)
## Warning: package 'tree' was built under R version 4.3.2
#Importar datos
pingui<-read.csv('C:/Users/rusoc/OneDrive/Escritorio/TEC/Mineria de datos/penguins.csv')
head(pingui)
## culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex
## 1 39.1 18.7 181 3750 MALE
## 2 39.5 17.4 186 3800 FEMALE
## 3 40.3 18.0 195 3250 FEMALE
## 4 36.7 19.3 193 3450 FEMALE
## 5 39.3 20.6 190 3650 MALE
## 6 38.9 17.8 181 3625 FEMALE
summary(pingui)
## culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## Min. :32.10 Min. :13.10 Min. :-132.0 Min. :2700
## 1st Qu.:39.23 1st Qu.:15.60 1st Qu.: 190.0 1st Qu.:3550
## Median :44.45 Median :17.30 Median : 197.0 Median :4050
## Mean :43.92 Mean :17.15 Mean : 214.0 Mean :4202
## 3rd Qu.:48.50 3rd Qu.:18.70 3rd Qu.: 213.8 3rd Qu.:4750
## Max. :59.60 Max. :21.50 Max. :5000.0 Max. :6300
## sex
## Length:342
## Class :character
## Mode :character
##
##
##
Quitamos el la columna del sexo
pingui2<- pingui[,-5]
Estandarizamos los datos
pinguiestand = scale(pingui2, center = TRUE, scale = TRUE)
pinguiestand = as.data.frame(pinguiestand)
Escogemos 6 clusters ya que es donde hay mas cambio en la varianza.
optimo <- fviz_nbclust(pinguiestand, kmeans, method = "wss")
optimo
Resultados del k-means=6
kmdata<-kmeans(pinguiestand, centers = 6)
kmdata
## K-means clustering with 6 clusters of sizes 65, 58, 82, 73, 63, 1
##
## Cluster means:
## culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g
## 1 0.2910184 -1.4391071 -0.003244533 0.60258480
## 2 1.0655987 -0.7165118 0.029456143 1.63658873
## 3 -1.1416448 0.1630556 -0.102182125 -1.05525506
## 4 -0.6827277 1.0333291 -0.099473913 -0.19947959
## 5 1.0013509 0.7103522 -0.067067346 -0.52472053
## 6 -0.3520286 1.5438733 18.368211019 0.06016004
##
## Clustering vector:
## [1] 4 3 3 3 4 3 4 3 6 3 3 3 4 4 3 4 4 3 4 3 3 4 4 3 4 4 3 3 4 3 3 3 4 3 4 4 4
## [38] 3 4 3 4 3 4 3 4 4 3 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 5 3
## [75] 4 3 4 3 4 3 4 4 4 3 4 4 3 4 4 3 4 3 4 3 4 4 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4
## [112] 3 4 4 4 3 4 3 3 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 3 3 3 3 4 4 3 3
## [149] 3 3 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 3 5 3 5 5 5 5 5 5 5 3 5 3 5
## [186] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 3 5 5 5 5 5 5 5 5 5 3 5 5 5 1 2 1
## [223] 2 2 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 1 1 2 1 2 2 1 1 2 2 2 1 2 1 2
## [260] 1 2 1 1 2 1 1 2 1 2 1 2 1 2 1 1 1 1 1 2 1 2 1 2 1 2 1 2 1 2 1 1 2 1 1 2 1
## [297] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 2 1 1 2 1 2 1 2 1 2 1 2 1 2 2 2 1 2
## [334] 1 2 1 2 1 1 2 1 2
##
## Within cluster sum of squares by cluster:
## [1] 21.29938 28.78220 37.22916 44.01601 48.59824 0.00000
## (between_SS / total_SS = 86.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
penguins_clustered <- as.data.frame(pinguiestand)
penguins_clustered$cluster <- as.factor(kmdata$cluster)
clusterdata <- kmdata$cluster
(dataplot = ggplot(pinguiestand, aes(x = culmen_length_mm, y = body_mass_g)) +
geom_point(aes(color = as.factor(clusterdata)), size = 6) +
geom_text(aes(label = clusterdata), vjust = -1, size = 2) +
theme_bw() +
theme(legend.position = "none") +
labs(title = "K-means con k=6")
)
(dataplot = ggplot(pinguiestand, aes(x = culmen_depth_mm, y = body_mass_g)) +
geom_point(aes(color = as.factor(clusterdata)), size = 6) +
geom_text(aes(label = clusterdata), vjust = -1, size = 2) +
theme_bw() +
theme(legend.position = "none") +
labs(title = "K-means con k=6")
)
(dataplot = ggplot(pinguiestand, aes(x = flipper_length_mm, y = body_mass_g)) +
geom_point(aes(color = as.factor(clusterdata)), size = 6) +
geom_text(aes(label = clusterdata), vjust = -1, size = 2) +
theme_bw() +
theme(legend.position = "none") +
labs(title = "K-means con k=6")
)