Después del paso 5 se regresa al paso 3 y se repite el proceso.
Se usa principalmente cuando no conocemos las etiquetas que se pueden asignar; por eso se llama aprendizaje no supervisado. Con cada iteración, el centro de cada cluster se irá desplazando después de promediar los puntos que tiene asignados. Esto ocasiona que los clusters se separen, de modo que podamos identificar cada uno con una etiqueta.
library(dplyr)
# Candidate coordinates for the starting centroids: a 0.01-step grid that
# spans the observed range of each petal measurement.
petal_length <- with(
  iris,
  seq(from = min(Petal.Length), to = max(Petal.Length), by = 0.01)
)
petal_width <- with(
  iris,
  seq(from = min(Petal.Width), to = max(Petal.Width), by = 0.01)
)

# Fix the RNG state so the randomly drawn starting centroids are reproducible.
set.seed(12345)

# Draw one x and one y coordinate per centroid; size = 2 is the number of
# clusters. The x draw must happen before the y draw to keep the RNG stream.
cluster_x <- sample(petal_length, size = 2)
cluster_y <- sample(petal_width, size = 2)

# Scatter plot of the raw observations; pch = 16 draws filled points.
with(
  iris,
  plot(
    x = Petal.Length,
    y = Petal.Width,
    pch = 16,
    xlab = "Petal length",
    ylab = "Petal width",
    main = "Iris Dataset"
  )
)

# Overlay the two starting centroids as larger symbols (pch = 17, cex = 2),
# the first in blue and the second in red.
points(
  x = cluster_x,
  y = cluster_y,
  col = c("blue", "red"),
  pch = 17,
  cex = 2
)
Se construye un dataset con el largo del pétalo (length) en la primera columna y el ancho (width) en la segunda. Coordenada \(x\): length. Coordenada \(y\): width.
# Working table with one row per flower: petal length is used as the x
# coordinate and petal width as the y coordinate.
df <- with(
  iris,
  tibble(petal_length = Petal.Length, petal_width = Petal.Width)
)
Al dataframe le agregamos 3 columnas. La primera, distance_1, es la distancia del punto al centro del cluster 1, calculada con la fórmula de la distancia; en cluster_x[1] viene la coordenada en x (petal_length) de ese centro y en cluster_y[1] su coordenada en y (petal_width).
distance_2 es lo mismo, pero para el otro centro. Luego se asigna el cluster (cuando son dos es fácil: basta un ifelse): si distance_1 es menor que distance_2, el punto se asigna como blue; si no, como red.
# Assignment step: compute each point's straight-line distance to both
# centroids and label it with the colour of the nearer one. Points that are
# not strictly closer to centroid 1 are labelled "red".
df <- mutate(
  df,
  distance_1 = sqrt(
    (petal_length - cluster_x[1])^2 + (petal_width - cluster_y[1])^2
  ),
  distance_2 = sqrt(
    (petal_length - cluster_x[2])^2 + (petal_width - cluster_y[2])^2
  ),
  cluster = ifelse(test = distance_1 < distance_2, yes = "blue", no = "red")
)

# Redraw the observations, using the cluster label itself as the point colour.
with(
  df,
  plot(
    x = petal_length,
    y = petal_width,
    col = cluster,
    pch = 16,
    xlab = "Petal length",
    ylab = "Petal width",
    main = "Iris Dataset"
  )
)

# Overlay the centroids that produced this assignment.
points(
  x = cluster_x,
  y = cluster_y,
  col = c("blue", "red"),
  pch = 17,
  cex = 2
)
El dataframe se agrupa por cluster. Se calcula el promedio de petal_length y de petal_width (summarise) y luego se desagrupa. Esto devuelve un dataframe con “cluster” en la columna 1, avg_x en la columna 2 y avg_y en la columna 3. Como está agrupado, solo se tienen 2 filas: Cluster 1 (blue) y Cluster 2 (red).
# Update step: the new centroid of each cluster is the mean petal length
# (avg_x) and mean petal width (avg_y) of the points currently assigned to it.
new_clusters <- ungroup(
  summarise(
    group_by(df, cluster),
    avg_x = mean(petal_length),
    avg_y = mean(petal_width)
  )
)

# Display the recomputed centroids (one row per cluster label).
new_clusters
pull es una función que extrae una columna del dataframe y la devuelve como vector; como avg_x y avg_y son columnas numéricas, el resultado es un vector numérico con un valor por cluster.
# Replace the centroid coordinates with the freshly computed cluster means,
# extracted as plain numeric vectors (one element per row of new_clusters).
# NOTE(review): the assignment step treats element 1 as "blue" and element 2
# as "red"; this relies on new_clusters being ordered by cluster label.
cluster_x <- new_clusters[["avg_x"]]
cluster_y <- new_clusters[["avg_y"]]
# Repeated k-means iterations ----
# The assign / plot / update cycle used to be written out by hand five times.
# It is expressed once here as three helpers and driven by a loop, producing
# the same five plots and leaving df, new_clusters, cluster_x and cluster_y in
# the same final state.

# Build the petal table from iris and label every observation with the colour
# of its nearest centroid.
#   centers_x, centers_y: numeric vectors of length 2 holding the x (petal
#   length) and y (petal width) coordinates of the two centroids.
# Returns a tibble with petal_length, petal_width, distance_1, distance_2 and
# cluster ("blue" when strictly closer to centroid 1, otherwise "red").
assign_clusters <- function(centers_x, centers_y) {
  tibble(
    petal_length = iris$Petal.Length,
    petal_width = iris$Petal.Width
  ) %>%
    mutate(
      distance_1 = sqrt(
        (petal_length - centers_x[1])^2 + (petal_width - centers_y[1])^2
      ),
      distance_2 = sqrt(
        (petal_length - centers_x[2])^2 + (petal_width - centers_y[2])^2
      ),
      cluster = ifelse(distance_1 < distance_2, "blue", "red")
    )
}

# Draw the observations coloured by their cluster label and overlay the
# centroids that produced the assignment. Returns `data` invisibly.
plot_clusters <- function(data, centers_x, centers_y) {
  plot(
    data$petal_length,
    data$petal_width,
    col = data$cluster,
    pch = 16,
    xlab = "Petal length",
    ylab = "Petal width",
    main = "Iris Dataset"
  )
  points(centers_x, centers_y, col = c("blue", "red"), pch = 17, cex = 2)
  invisible(data)
}

# Recompute each centroid as the mean position of the points assigned to it.
# Returns a tibble with columns cluster, avg_x and avg_y.
update_centroids <- function(data) {
  data %>%
    group_by(cluster) %>%
    summarise(
      avg_x = mean(petal_length),
      avg_y = mean(petal_width)
    ) %>%
    ungroup()
}

# Number of assign-and-plot rounds to run.
n_iterations <- 5L

for (iteration in seq_len(n_iterations)) {
  df <- assign_clusters(cluster_x, cluster_y)
  plot_clusters(df, cluster_x, cluster_y)

  # The centroids are not moved after the last round, so the final plot shows
  # the same centroids that produced the final assignment.
  if (iteration < n_iterations) {
    new_clusters <- update_centroids(df)
    cluster_x <- new_clusters %>% pull(avg_x)
    cluster_y <- new_clusters %>% pull(avg_y)
  }
}