Unsupervised Machine Learning in R: K-Means
- K-Means clustering is an unsupervised machine learning method because it does not use a target variable.
- Clustering can be used to create a target variable, or simply group data by certain characteristics.
# message=FALSE
library(factoextra)
# eval=TRUE, echo=TRUE
# Load the bank customer data and keep the four numeric variables used for
# clustering. The columns are selected by name rather than by position, so
# the subset stays correct if the column order of the CSV changes, and a
# missing column raises an error instead of silently picking another one.
ABC <- read.table("AbcBank.csv", header = TRUE, sep = ",")
ABC_num <- ABC[, c("Age", "Income", "CCAvg", "Mortgage")]
str(ABC_num)
## 'data.frame': 5000 obs. of 4 variables:
## $ Age : int 45 35 43 61 52 63 34 44 45 45 ...
## $ Income : int 202 203 201 188 180 178 180 154 200 194 ...
## $ CCAvg : num 10 10 10 9.3 9 9 8.9 8.8 8.8 8.8 ...
## $ Mortgage: int 0 0 0 0 297 0 0 0 0 428 ...
# Preview the first six rows of the numeric subset.
head(ABC_num, n = 6L)
## Age Income CCAvg Mortgage
## 1 45 202 10.0 0
## 2 35 203 10.0 0
## 3 43 201 10.0 0
## 4 61 188 9.3 0
## 5 52 180 9.0 297
## 6 63 178 9.0 0
# Standardize every column (scale() centers and divides by the standard
# deviation by default) so that variables measured on large scales do not
# dominate the clustering. scale() returns a matrix, so convert it back to
# a data frame.
ABC_scaled <- as.data.frame(scale(ABC_num))
head(ABC_scaled)
## Age Income CCAvg Mortgage
## 1 -0.02952064 2.785475 4.613064 -0.5554684
## 2 -0.90188002 2.807198 4.613064 -0.5554684
## 3 -0.20399252 2.763752 4.613064 -0.5554684
## 4 1.36625436 2.481350 4.212528 -0.5554684
## 5 0.58113092 2.307565 4.040870 2.3644893
## 6 1.54072623 2.264118 4.040870 -0.5554684
# eval=TRUE, echo=TRUE
# Fit k-means with 3 clusters on the standardized data, keeping the best
# of 25 random initializations. The starting centers are drawn at random,
# so the seed is fixed to make the cluster numbering, centers, and sizes
# reproducible across runs. Cluster labels themselves are arbitrary; a
# different seed may permute them.
set.seed(123)
k3 <- kmeans(ABC_scaled, centers = 3, nstart = 25)
# eval=TRUE, echo=TRUE
# Plot the cluster assignments. The data has more than two variables, so
# fviz_cluster() draws the observations on principal components; axes
# picks the first two. Each cluster is outlined by its convex hull, and
# observations are drawn as unlabeled points.
fviz_cluster(
  k3,
  data = ABC_scaled,
  ellipse.type = "convex",
  axes = c(1, 2),
  geom = "point",
  label = "none",
  ggtheme = theme_classic()
)

# eval=TRUE, echo=TRUE
# Cluster centers, expressed in standardized units because the model was
# fitted on ABC_scaled.
k3[["centers"]]
## Age Income CCAvg Mortgage
## 1 -0.89279937 -0.3047082 -0.3238836 -0.1233689
## 2 0.87269134 -0.3861332 -0.3707163 -0.1321248
## 3 -0.07556447 1.5435569 1.5495668 0.5693976
# Attach each observation's cluster label to the original data frame and
# count how many observations fall into each cluster.
ABC[["Cluster"]] <- as.numeric(k3[["cluster"]])
table(ABC[["Cluster"]])
##
## 1 2 3
## 1979 2104 917