library

library(factoextra)

import dataset

Dataset yang digunakan adalah nyc.csv. Data tersebut memiliki 22 atribut (termasuk kolom indeks X, sesuai keluaran `head()` di bawah) dan xx observasi (isi sesuai hasil `nrow(data.nyc)`).

# Read nyc.csv from the data_input folder (path is relative to the working
# directory) into a data frame.
data.nyc <- read.csv(file = "data_input/nyc.csv")
# Preview the first six rows to inspect the columns and their raw values.
head(x = data.nyc, n = 6L)
##   X BOROUGH  NEIGHBORHOOD                     BUILDING.CLASS.CATEGORY
## 1 4       1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS             
## 2 5       1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS             
## 3 6       1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS             
## 4 7       1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS             
## 5 8       1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS             
## 6 9       1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS             
##   TAX.CLASS.AT.PRESENT BLOCK LOT EASE.MENT BUILDING.CLASS.AT.PRESENT
## 1                   2A   392   6        NA                        C2
## 2                    2   399  26        NA                        C7
## 3                    2   399  39        NA                        C7
## 4                   2B   402  21        NA                        C4
## 5                   2A   404  55        NA                        C2
## 6                    2   405  16        NA                        C4
##                  ADDRESS APARTMENT.NUMBER ZIP.CODE RESIDENTIAL.UNITS
## 1           153 AVENUE B                     10009                 5
## 2  234 EAST 4TH   STREET                     10009                28
## 3  197 EAST 3RD   STREET                     10009                16
## 4    154 EAST 7TH STREET                     10009                10
## 5 301 EAST 10TH   STREET                     10009                 6
## 6 516 EAST 12TH   STREET                     10009                20
##   COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET
## 1                0           5             1633              6440
## 2                3          31             4616             18690
## 3                1          17             2212              7803
## 4                0          10             2272              6794
## 5                0           6             2369              4615
## 6                0          20             2581              9730
##   YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE
## 1       1900                         2                             C2
## 2       1900                         2                             C7
## 3       1900                         2                             C7
## 4       1913                         2                             C4
## 5       1900                         2                             C2
## 6       1900                         2                             C4
##   SALE.PRICE           SALE.DATE
## 1    6625000 2017-07-19 00:00:00
## 2        -   2016-12-14 00:00:00
## 3        -   2016-12-09 00:00:00
## 4    3936272 2016-09-23 00:00:00
## 5    8000000 2016-11-17 00:00:00
## 6        -   2017-07-20 00:00:00
# Principal component analysis on the four numeric iris measurements
# (column 5, Species, is dropped). Each variable is centred and scaled to
# unit variance before the components are computed.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
iris_prcomp <- prcomp(iris[, -5], center = TRUE, scale. = TRUE)
# Standard deviation and (cumulative) proportion of variance per component.
summary(iris_prcomp)
## Importance of components:
##                           PC1    PC2     PC3     PC4
## Standard deviation     1.7084 0.9560 0.38309 0.14393
## Proportion of Variance 0.7296 0.2285 0.03669 0.00518
## Cumulative Proportion  0.7296 0.9581 0.99482 1.00000
# Biplot of the PCA fit stored in iris_prcomp.
biplot(x = iris_prcomp)

# Keep only the PC1 and PC2 scores as the clustering input.
iris_new <- as.data.frame(iris_prcomp$x[, 1:2])
# Partition the reduced data into three k-means clusters.
# NOTE(review): no seed is set, so the cluster numbering may differ between runs.
model_iris_prcomp <- kmeans(x = iris_new, centers = 3)
# Store each row's cluster label on iris as a factor column named clus.
iris[["clus"]] <- as.factor(model_iris_prcomp[["cluster"]])
head(x = iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species clus
## 1          5.1         3.5          1.4         0.2  setosa    1
## 2          4.9         3.0          1.4         0.2  setosa    1
## 3          4.7         3.2          1.3         0.2  setosa    1
## 4          4.6         3.1          1.5         0.2  setosa    1
## 5          5.0         3.6          1.4         0.2  setosa    1
## 6          5.4         3.9          1.7         0.4  setosa    1
# Reload the built-in iris data so that any column added to it earlier
# (e.g. clus) is discarded and iris[, -5] holds only the four measurements.
data("iris")
# k-means with three clusters on the raw (unscaled) measurements.
# NOTE(review): no seed is set, so the cluster numbering and the figures
# recorded below may differ between runs.
model3 <- kmeans(x = iris[, -5], centers = 3)

# Within-cluster sum of squares, one value per cluster.
model3[["withinss"]]
## [1] 39.82097 15.15100 23.87947
# Total within-cluster sum of squares.
model3[["tot.withinss"]]
## [1] 78.85144
# Share of the total sum of squares that lies between clusters.
with(model3, betweenss / totss)
## [1] 0.8842753
# Plot the cluster assignments with factoextra.
fviz_cluster(object = model3, data = iris[, -5])

# Reload the built-in iris data (redundant if iris is unchanged, but harmless).
data("iris")
# k-means with four clusters on the raw (unscaled) measurements.
# NOTE(review): no seed is set, so the cluster numbering and the figures
# recorded below may differ between runs.
model4 <- kmeans(x = iris[, -5], centers = 4)

# Within-cluster sum of squares, one value per cluster.
model4[["withinss"]]
## [1] 15.151000  9.749286 13.624750 18.703437
# Total within-cluster sum of squares.
model4[["tot.withinss"]]
## [1] 57.22847
# Share of the total sum of squares that lies between clusters.
with(model4, betweenss / totss)
## [1] 0.9160098
# Plot the cluster assignments with factoextra.
fviz_cluster(object = model4, data = iris[, -5])

Pada kode di bawah ini akan dilakukan pengecekan nilai k yang optimum pada kasus data iris.

# Refresh iris, then fit k-means for k = 3 to 7 on the four numeric columns.
data("iris")
for (i in seq(from = 3, to = 7)) {
  # temp is overwritten on every pass; after the loop it holds the k = 7 fit.
  temp <- kmeans(x = iris[, -5], centers = i)
  # Between-cluster share of the total sum of squares for this k.
  print(with(temp, betweenss / totss))
  # Results are not auto-printed inside a loop, hence the explicit print().
  print(fviz_cluster(object = temp, data = iris[, -5]))
}
## [1] 0.8842753

## [1] 0.9159553

## [1] 0.9318342

## [1] 0.9330869

## [1] 0.9410219

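Nilai k optimum adalah k = 4, karena mendapatkan nilai betweenss/totss sebesar 0.916 (sekitar 91%). Rasio ini memang terus naik seiring bertambahnya k (0.884, 0.916, 0.932, 0.933, 0.941 untuk k = 3 sampai 7), tetapi kenaikan setelah k = 4 relatif kecil dibandingkan kenaikan dari k = 3 ke k = 4.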

data(iris)
# Usage: fviz_cluster(<k-means model>, <data used to fit it>)
# Plot the k = 4 fit (model4), the k selected above. `temp` was left holding
# the last loop iteration (k = 7), so plotting it would show the wrong model.
fviz_cluster(model4, iris[, -5])

Semoga dengan dikirimnya contoh LBB ini, pengerjaan LBB Bapak/Ibu sekalian menjadi lebih mudah.