summary

summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Sepal width & length 分佈

ggplot(iris,aes(x = Sepal.Length, y = Sepal.Width, col= Species)) + geom_point()

Petal width & length 分佈

ggplot(iris,aes(x = Petal.Length, y = Petal.Width, col= Species)) + geom_point()

my_kmeans 實作kmeans

#clust存k means cluster的中心點位址
#clust[25, 3,4]

#data存iris[,1:4] + 分類編號

#clust_assign存所有資料在所有組下分配到的cluster
#cluster_assign[25,150]

my_kmeans <- function(dataset, n_cluster, n_ver) {
  set.seed(100)
  n_factor = ncol(dataset)
  
  data <- dataset
  data$clust <- NA
  # 初始化各組的每個cluster中心點
  clust <- array(NA, dim = c(n_ver, n_cluster, n_factor))
  for(i in 1:n_factor) {
    for(j in 1:n_ver) {
      #均勻分布
      clust[j,,i] <- runif(n = n_cluster, min(dataset[,i]), max = dataset[,i])
    }
  }
  clust_assign <- array(NA, dim = c(n_ver, nrow(dataset)))
  clust_dist <- array(NA, dim = c(n_ver, nrow(dataset)))
  
  finish <- FALSE
  
  # 每一組cluster
  for(i in 1:n_ver) {
    # 當分組後取平均跟原本cluster中心點一模一樣時跳出while迴圈
    while(finish == FALSE) {
      # 每一筆data
      for(j in 1:nrow(dataset)) {
        sum <- rep(0, times = n_cluster)
        #計算每筆資料到各個cluster中心的距離
        for(k in 1:n_factor) {
          sum = sum + (data[j,k] - clust[i,,k])^2
        }
        dist <- sqrt(sum)
        clust_assign[i,j] <- which.min(dist)
        clust_dist[i,j] <- dist[which.min(dist)]
      }
      old_clust <- clust
      
      # 分組後取平均作為中心點
      for(k in 1:n_cluster) {
        for(l in 1:n_factor) {
          clust[i,k,l] <- mean(subset(data[,l], clust_assign[i,] == k))
        }
      }
      
      # 當分組後取平均跟原本cluster中心點一模一樣時跳出while迴圈
      if(identical(old_clust[i,,], clust[i,,]))
        finish <- TRUE
    }
    finish = FALSE
  }
  # 取出最佳解
  best_solution <- which.min(sum(clust_dist[,1:nrow(dataset)]))
  smallest_dist <- min(sum(clust_dist[best_solution,]))
  data$clust <- clust_assign[i,]
  # 將分類結果跟最小距離回傳
  obj <- list("clust" = data$clust, "distance" = smallest_dist)
  return(obj)
}

分三群

predict <- iris[,1:4]
result <- my_kmeans(iris[,1:3], 3, 12)
predict$clust <- result$clust
clust_centers <- result$center
dist <- result$distance
分群結果統計
table(predict$clust, iris$Species)
##    
##     setosa versicolor virginica
##   1      0         45        13
##   2      0          5        37
##   3     50          0         0
Sepal lenth & Width
ggplot(predict[,1:4], aes(Sepal.Length, Sepal.Width, color = as.factor(predict$clust))) + 
  geom_point() 

#  geom_point(data=clust_centers, aes(Sepal.Length, Sepal.Width, color = clust), pch=8, size=5)
Petal lenth & Width
ggplot(predict[,1:4], aes(Petal.Length, Petal.Width, color = as.factor(predict$clust))) + 
  geom_point()

#  geom_point(data=clust_centers, aes(Sepal.Length, Sepal.Width, color = as.factor(row.names(data))), pch=8, size=5)

分五群

result <- my_kmeans(iris[,1:4], 5, 5)
predict$clust <- result$clust
clust_centers <- result$center
dist <- result$distance
Sepal lenth & Width
ggplot(predict[,1:4], aes(Sepal.Length, Sepal.Width, color = as.factor(predict$clust))) + 
  geom_point() 

#  geom_point(data=clust_centers, aes(Sepal.Length, Sepal.Width, color = clust), pch=8, size=5)
Petal lenth & Width
ggplot(predict[,1:4], aes(Petal.Length, Petal.Width, color = as.factor(predict$clust))) + 
  geom_point()

#  geom_point(data=clust_centers, aes(Sepal.Length, Sepal.Width, color = as.factor(row.names(data))), pch=8, size=5)

1組cluster

wss_1 <- sapply(1:10, function(k){my_kmeans(iris[,1:4], k, 1)$distance})
plot(1:10, wss_1, type= "b", xlab = "Number of clusters(k)", ylab = "Within cluster sum of squares")

20組cluster

wss_10 <- sapply(1:10, function(k){my_kmeans(iris[,1:4], k, 20)$distance})
plot(1:10, wss_10, type= "b", xlab = "Number of clusters(k)", ylab = "Within cluster sum of squares")

套用kmeans

set.seed(200)
k.max <- 10
wss<- sapply(1:k.max,function(k){kmeans(iris[,1:4],k,nstart = 20,iter.max = 20)$tot.withinss})
wss
##  [1] 681.37060 152.34795  78.85144  57.22847  46.46117  39.03999  34.29823
##  [8]  29.98894  27.78609  25.84946
plot(1:k.max,wss, type= "b", xlab = "Number of clusters(k)", ylab = "Within cluster sum of squares")

icluster <- kmeans(iris[,1:4],3,nstart = 20)
table(icluster$cluster,iris$Species)
##    
##     setosa versicolor virginica
##   1      0         48        14
##   2      0          2        36
##   3     50          0         0