summary
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
Sepal width & length 分佈
ggplot(iris,aes(x = Sepal.Length, y = Sepal.Width, col= Species)) + geom_point()

Petal width & length 分佈
ggplot(iris,aes(x = Petal.Length, y = Petal.Width, col= Species)) + geom_point()

my_kmeans 實作kmeans
#clust存k means cluster的中心點位址
#clust[25, 3,4]
#data存iris[,1:4] + 分類編號
#clust_assign存所有資料在所有組下分配到的cluster
#cluster_assign[25,150]
my_kmeans <- function(dataset, n_cluster, n_ver) {
set.seed(100)
n_factor = ncol(dataset)
data <- dataset
data$clust <- NA
# 初始化各組的每個cluster中心點
clust <- array(NA, dim = c(n_ver, n_cluster, n_factor))
for(i in 1:n_factor) {
for(j in 1:n_ver) {
#均勻分布
clust[j,,i] <- runif(n = n_cluster, min(dataset[,i]), max = dataset[,i])
}
}
clust_assign <- array(NA, dim = c(n_ver, nrow(dataset)))
clust_dist <- array(NA, dim = c(n_ver, nrow(dataset)))
finish <- FALSE
# 每一組cluster
for(i in 1:n_ver) {
# 當分組後取平均跟原本cluster中心點一模一樣時跳出while迴圈
while(finish == FALSE) {
# 每一筆data
for(j in 1:nrow(dataset)) {
sum <- rep(0, times = n_cluster)
#計算每筆資料到各個cluster中心的距離
for(k in 1:n_factor) {
sum = sum + (data[j,k] - clust[i,,k])^2
}
dist <- sqrt(sum)
clust_assign[i,j] <- which.min(dist)
clust_dist[i,j] <- dist[which.min(dist)]
}
old_clust <- clust
# 分組後取平均作為中心點
for(k in 1:n_cluster) {
for(l in 1:n_factor) {
clust[i,k,l] <- mean(subset(data[,l], clust_assign[i,] == k))
}
}
# 當分組後取平均跟原本cluster中心點一模一樣時跳出while迴圈
if(identical(old_clust[i,,], clust[i,,]))
finish <- TRUE
}
finish = FALSE
}
# 取出最佳解
best_solution <- which.min(sum(clust_dist[,1:nrow(dataset)]))
smallest_dist <- min(sum(clust_dist[best_solution,]))
data$clust <- clust_assign[i,]
# 將分類結果跟最小距離回傳
obj <- list("clust" = data$clust, "distance" = smallest_dist)
return(obj)
}
分三群
predict <- iris[,1:4]
result <- my_kmeans(iris[,1:3], 3, 12)
predict$clust <- result$clust
clust_centers <- result$center
dist <- result$distance
分群結果統計
table(predict$clust, iris$Species)
##
## setosa versicolor virginica
## 1 0 45 13
## 2 0 5 37
## 3 50 0 0
Sepal lenth & Width
ggplot(predict[,1:4], aes(Sepal.Length, Sepal.Width, color = as.factor(predict$clust))) +
geom_point()

# geom_point(data=clust_centers, aes(Sepal.Length, Sepal.Width, color = clust), pch=8, size=5)
Petal lenth & Width
ggplot(predict[,1:4], aes(Petal.Length, Petal.Width, color = as.factor(predict$clust))) +
geom_point()

# geom_point(data=clust_centers, aes(Sepal.Length, Sepal.Width, color = as.factor(row.names(data))), pch=8, size=5)
分五群
result <- my_kmeans(iris[,1:4], 5, 5)
predict$clust <- result$clust
clust_centers <- result$center
dist <- result$distance
Sepal lenth & Width
ggplot(predict[,1:4], aes(Sepal.Length, Sepal.Width, color = as.factor(predict$clust))) +
geom_point()

# geom_point(data=clust_centers, aes(Sepal.Length, Sepal.Width, color = clust), pch=8, size=5)
Petal lenth & Width
ggplot(predict[,1:4], aes(Petal.Length, Petal.Width, color = as.factor(predict$clust))) +
geom_point()

# geom_point(data=clust_centers, aes(Sepal.Length, Sepal.Width, color = as.factor(row.names(data))), pch=8, size=5)
1組cluster
wss_1 <- sapply(1:10, function(k){my_kmeans(iris[,1:4], k, 1)$distance})
plot(1:10, wss_1, type= "b", xlab = "Number of clusters(k)", ylab = "Within cluster sum of squares")

20組cluster
wss_10 <- sapply(1:10, function(k){my_kmeans(iris[,1:4], k, 20)$distance})
plot(1:10, wss_10, type= "b", xlab = "Number of clusters(k)", ylab = "Within cluster sum of squares")

套用kmeans
set.seed(200)
k.max <- 10
wss<- sapply(1:k.max,function(k){kmeans(iris[,1:4],k,nstart = 20,iter.max = 20)$tot.withinss})
wss
## [1] 681.37060 152.34795 78.85144 57.22847 46.46117 39.03999 34.29823
## [8] 29.98894 27.78609 25.84946
plot(1:k.max,wss, type= "b", xlab = "Number of clusters(k)", ylab = "Within cluster sum of squares")

icluster <- kmeans(iris[,1:4],3,nstart = 20)
table(icluster$cluster,iris$Species)
##
## setosa versicolor virginica
## 1 0 48 14
## 2 0 2 36
## 3 50 0 0