library(factoextra)
Dataset yang digunakan adalah nyc.csv. Data tersebut memiliki 22 atribut (termasuk kolom indeks X) dan xx observasi (jumlah observasi dapat diperiksa dengan nrow(data.nyc)).
# Load the NYC dataset from the project's input folder, then preview the first
# six rows to check how the columns were parsed.
data.nyc <- read.csv(file = "data_input/nyc.csv")
head(x = data.nyc, n = 6L)
## X BOROUGH NEIGHBORHOOD BUILDING.CLASS.CATEGORY
## 1 4 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS
## 2 5 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS
## 3 6 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS
## 4 7 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS
## 5 8 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS
## 6 9 1 ALPHABET CITY 07 RENTALS - WALKUP APARTMENTS
## TAX.CLASS.AT.PRESENT BLOCK LOT EASE.MENT BUILDING.CLASS.AT.PRESENT
## 1 2A 392 6 NA C2
## 2 2 399 26 NA C7
## 3 2 399 39 NA C7
## 4 2B 402 21 NA C4
## 5 2A 404 55 NA C2
## 6 2 405 16 NA C4
## ADDRESS APARTMENT.NUMBER ZIP.CODE RESIDENTIAL.UNITS
## 1 153 AVENUE B 10009 5
## 2 234 EAST 4TH STREET 10009 28
## 3 197 EAST 3RD STREET 10009 16
## 4 154 EAST 7TH STREET 10009 10
## 5 301 EAST 10TH STREET 10009 6
## 6 516 EAST 12TH STREET 10009 20
## COMMERCIAL.UNITS TOTAL.UNITS LAND.SQUARE.FEET GROSS.SQUARE.FEET
## 1 0 5 1633 6440
## 2 3 31 4616 18690
## 3 1 17 2212 7803
## 4 0 10 2272 6794
## 5 0 6 2369 4615
## 6 0 20 2581 9730
## YEAR.BUILT TAX.CLASS.AT.TIME.OF.SALE BUILDING.CLASS.AT.TIME.OF.SALE
## 1 1900 2 C2
## 2 1900 2 C7
## 3 1900 2 C7
## 4 1913 2 C4
## 5 1900 2 C2
## 6 1900 2 C4
## SALE.PRICE SALE.DATE
## 1 6625000 2017-07-19 00:00:00
## 2 - 2016-12-14 00:00:00
## 3 - 2016-12-09 00:00:00
## 4 3936272 2016-09-23 00:00:00
## 5 8000000 2016-11-17 00:00:00
## 6 - 2017-07-20 00:00:00
# Principal component analysis on the four numeric iris measurements
# (column 5, Species, is dropped). The variables are centred and scaled to
# unit variance so that each one contributes on the same scale.
# TRUE is spelled out because the shorthand `T` is an ordinary variable that
# can be reassigned.
iris_prcomp <- prcomp(iris[, -5], center = TRUE, scale. = TRUE)
# Standard deviation, proportion and cumulative proportion of variance for
# each principal component.
summary(iris_prcomp)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.7084 0.9560 0.38309 0.14393
## Proportion of Variance 0.7296 0.2285 0.03669 0.00518
## Cumulative Proportion 0.7296 0.9581 0.99482 1.00000
# Biplot of the PCA result (observations together with the variable loadings).
biplot(iris_prcomp)
# Keep only the PC1 and PC2 scores as the clustering input; the summary output
# reports a cumulative proportion of variance of about 0.96 for these two.
iris_new <- as.data.frame(iris_prcomp$x)[, 1:2]
# k-means starts from randomly chosen centres. Fixing the seed makes the result
# reproducible, and several restarts (nstart) keep the best of many attempts
# instead of depending on a single random start.
set.seed(123)
model_iris_prcomp <- kmeans(iris_new, centers = 3, nstart = 25)
# Attach the cluster label to every observation. The label numbers are
# arbitrary: they only identify group membership.
iris$clus <- as.factor(model_iris_prcomp$cluster)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species clus
## 1 5.1 3.5 1.4 0.2 setosa 1
## 2 4.9 3.0 1.4 0.2 setosa 1
## 3 4.7 3.2 1.3 0.2 setosa 1
## 4 4.6 3.1 1.5 0.2 setosa 1
## 5 5.0 3.6 1.4 0.2 setosa 1
## 6 5.4 3.9 1.7 0.4 setosa 1
# Reload the original iris data so that any column added earlier is discarded.
data(iris)
# k-means with 3 clusters on the four numeric measurements (Species dropped).
# The seed makes the fit reproducible; nstart = 25 keeps the best of 25 random
# starts rather than relying on a single one.
set.seed(123)
model3 <- kmeans(iris[, -5], centers = 3, nstart = 25)
# Within-cluster sum of squares per cluster. Cluster numbering is arbitrary,
# so the order of these values can differ between runs or seeds.
model3$withinss
## [1] 39.82097 15.15100 23.87947
# Total within-cluster sum of squares (smaller means tighter clusters).
model3$tot.withinss
## [1] 78.85144
# Share of the total sum of squares accounted for by between-cluster separation.
model3$betweenss / model3$totss
## [1] 0.8842753
# Draw the cluster plot for the 3-cluster model using the numeric iris columns.
fviz_cluster(object = model3, data = iris[, -5])
# Start again from the original iris data.
data(iris)
# k-means with 4 clusters on the four numeric measurements (Species dropped).
# The seed makes the fit reproducible; nstart = 25 keeps the best of 25 random
# starts, because a single random start can stop at a slightly worse solution.
set.seed(123)
model4 <- kmeans(iris[, -5], centers = 4, nstart = 25)
# Within-cluster sum of squares per cluster. Cluster numbering is arbitrary,
# so the order of these values can differ between runs or seeds.
model4$withinss
## [1] 15.151000 9.749286 13.624750 18.703437
# Total within-cluster sum of squares (smaller means tighter clusters).
model4$tot.withinss
## [1] 57.22847
# Share of the total sum of squares accounted for by between-cluster separation.
model4$betweenss / model4$totss
## [1] 0.9160098
# Draw the cluster plot for the 4-cluster model using the numeric iris columns.
fviz_cluster(object = model4, data = iris[, -5])
Pada kode di bawah ini akan dilakukan pengecekan nilai k yang optimum pada kasus data iris.
# Start again from the original iris data.
data(iris)
# Compare k = 3..7. For every k, print the between/total sum-of-squares ratio
# and draw the cluster plot. The seed plus several restarts per k make the
# comparison reproducible; with a single random start the ratio for the same k
# can change from one run to the next.
set.seed(123)
for (i in 3:7) {
  temp <- kmeans(iris[, -5], centers = i, nstart = 25)
  print(temp$betweenss / temp$totss)
  # ggplot objects are not auto-printed inside a loop, hence the explicit print.
  print(fviz_cluster(temp, iris[, -5]))
}
# After the loop, `temp` holds the last model that was fitted (k = 7).
## [1] 0.8842753
## [1] 0.9159553
## [1] 0.9318342
## [1] 0.9330869
## [1] 0.9410219
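Nilai k optimum adalah k = 4, karena pada k = 4 nilai betweenss/totss sudah mencapai sekitar 0.916 (91,6%), sedangkan penambahan jumlah klaster setelahnya hanya memberikan kenaikan yang relatif kecil. Perlu diingat bahwa rasio ini cenderung selalu naik seiring bertambahnya k, sehingga yang dicari adalah titik siku (elbow), bukan nilai rasio tertinggi.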
# Start again from the original iris data.
data(iris)
# Usage: fviz_cluster(<k-means model>, <data>)
# Plot the k = 4 model, which is the value of k chosen as optimum. `temp` is
# not used here because after the loop it holds the last fit (k = 7), not k = 4.
fviz_cluster(model4, iris[, -5])
Semoga dengan dikirimnya contoh LBB ini dapat mempermudah pengerjaan LBB Bapak/Ibu sekalian.