# Switch to the course working directory, then load the wine data set from it.
# NOTE(review): the directory is an absolute, machine-specific path.
setwd("C:/Users/Administrator/Desktop/R Analysis/Fast Campus")
df <- read.csv("wine.csv")
#-----------------------------------
# Check for missing values: number of NA entries in each column of df
#-----------------------------------
colSums(is.na(df))
##                Class              Alcohol                 Acid
##                    0                    0                    0
##                  Ash           Alcalinity            Magnesium
##                    0                    0                    0
##        Total_phenols           Flavanoids Nonflavanoid_phenols
##                    0                    0                    0
##      Proanthocyanins      color_intensity                  Hue
##                    0                    0                    0
##               X0D280              proline
##                    0                    0
#-----------------------------------
# Basic descriptive statistics for every column except the first (Class)
#-----------------------------------
summary(df[,-1])
##  Alcohol               Acid                  Ash                   Alcalinity
##  Min.   :11.03         Min.   :0.74          Min.   :1.360         Min.   :10.60
##  1st Qu.:12.36         1st Qu.:1.60          1st Qu.:2.210         1st Qu.:17.20
##  Median :13.05         Median :1.87          Median :2.360         Median :19.50
##  Mean   :12.99         Mean   :2.34          Mean   :2.366         Mean   :19.52
##  3rd Qu.:13.67         3rd Qu.:3.10          3rd Qu.:2.560         3rd Qu.:21.50
##  Max.   :14.83         Max.   :5.80          Max.   :3.230         Max.   :30.00
##  Magnesium             Total_phenols         Flavanoids            Nonflavanoid_phenols
##  Min.   : 70.00        Min.   :0.980         Min.   :0.340         Min.   :0.1300
##  1st Qu.: 88.00        1st Qu.:1.740         1st Qu.:1.200         1st Qu.:0.2700
##  Median : 98.00        Median :2.350         Median :2.130         Median :0.3400
##  Mean   : 99.59        Mean   :2.292         Mean   :2.023         Mean   :0.3623
##  3rd Qu.:107.00        3rd Qu.:2.800         3rd Qu.:2.860         3rd Qu.:0.4400
##  Max.   :162.00        Max.   :3.880         Max.   :5.080         Max.   :0.6600
##  Proanthocyanins       color_intensity       Hue                   X0D280
##  Min.   :0.410         Min.   : 1.280        Min.   :0.480         Min.   :1.270
##  1st Qu.:1.250         1st Qu.: 3.210        1st Qu.:0.780         1st Qu.:1.930
##  Median :1.550         Median : 4.680        Median :0.960         Median :2.780
##  Mean   :1.587         Mean   : 5.055        Mean   :0.957         Mean   :2.604
##  3rd Qu.:1.950         3rd Qu.: 6.200        3rd Qu.:1.120         3rd Qu.:3.170
##  Max.   :3.580         Max.   :13.000        Max.   :1.710         Max.   :4.000
##  proline
##  Min.   : 278.0
##  1st Qu.: 500.0
##  Median : 672.0
##  Mean   : 745.1
##  3rd Qu.: 985.0
##  Max.   :1680.0
#---------------------------------------------------------------
# BOX PLOT - every variable except Class is continuous, so inspect
# the distribution of each one
#---------------------------------------------------------------
boxplot(df[, -1])

# Data Preparation
df.train <- df[, -1]  # drop the first column (Class)

#---------------------------------------------------------------
# Standardization (scaling)
#---------------------------------------------------------------
# Centre and scale every column so that variables on large scales do not
# dominate the distance computations used by the clustering steps.
df.train.scale <- scale(df.train)

# Premise: on repeat purchases, customers will buy similar kinds of wine.
가끔은 새로운 와인을 구매할 것이다.
-> 유사도 거리 -> 군집분석을 통한 유사 종류/다른 종류 와인 분류 -> 와인을 각 상품 칸에 적절하게 배치하기 -> 반응 확인하기
Euclidean Distance, K-means, K-medoids, Hierarchical Clustering
Euclidean Distance
유사도 행렬을 구해서 같은/다른 와인으로 분류해 주기
오름차순 정렬로 가까운 거리의 와인들을 살펴본다
K-MEANS
#---------------------------------------------------------------
# Choose the number of clusters
#---------------------------------------------------------------
# Plot the total within-cluster sum of squares ("wss") of k-means for
# k = 1..15 on the scaled data; the seed is fixed because k-means starts
# from random centres.
# NOTE(review): fviz_nbclust() is provided by factoextra and theme_classic()
# by ggplot2; no library() call is visible in this file - confirm both
# packages are attached before this point.
set.seed(1234)
fviz_nbclust(
  df.train.scale,
  kmeans,
  method = "wss",
  k.max = 15
) +
  theme_classic()

# K-means Modelling
#---------------------------------------------------------------
# Cluster centers - bar graph
#---------------------------------------------------------------
# NOTE(review): df.kmeans is not created anywhere in the visible file. The
# three rows printed below suggest it came from something like
# kmeans(df.train.scale, centers = 3) - confirm and restore that step.
df.kmeans$centers  # mean of each (scaled) variable within each cluster
##      Alcohol       Acid        Ash Alcalinity   Magnesium Total_phenols
## 1  0.8333649 -0.3013131  0.3661731 -0.6065538  0.56922228    0.88768039
## 2  0.1736447  0.8642504  0.1871775  0.5168437 -0.06497127   -0.97106500
## 3 -0.9183253 -0.3953334 -0.4905017  0.1637039 -0.48321576   -0.07114136
##    Flavanoids Nonflavanoid_phenols Proanthocyanins color_intensity        Hue
## 1  0.98016451          -0.56173008      0.57583669       0.1702296  0.4753467
## 2 -1.20624204           0.71915195     -0.77171004       0.9378162 -1.1566204
## 3  0.02658937          -0.03709561      0.06509498      -0.8955790  0.4614076
##       X0D280    proline
## 1  0.7753334  1.1296451
## 2 -1.2872265 -0.4002655
## 3  0.2823571 -0.7460740

# One group of bars per cluster, one bar per variable (colours 2..14).
barplot(t(df.kmeans$centers), beside = TRUE, col = 2:14)
legend(
  "topleft",
  colnames(df.train.scale),
  fill = 2:14,
  cex = 0.3,
  bty = "n"
)

# NOTE(review): the output below shows df with an added kmean_cluster column,
# but the statements that produced it are not in this file - presumably
# df$kmean_cluster <- df.kmeans$cluster followed by head(df, 3); confirm.
##   Class Alcohol Acid  Ash Alcalinity Magnesium Total_phenols Flavanoids
## 1     1   13.20 1.78 2.14       11.2       100          2.65       2.76
## 2     1   13.16 2.36 2.67       18.6       101          2.80       3.24
## 3     1   14.37 1.95 2.50       16.8       113          3.85       3.49
##   Nonflavanoid_phenols Proanthocyanins color_intensity  Hue X0D280 proline
## 1                 0.26            1.28            4.38 1.05   3.40    1050
## 2                 0.30            2.81            5.68 1.03   3.17    1185
## 3                 0.24            2.18            7.80 0.86   3.45    1480
##   kmean_cluster
## 1             1
## 2             1
## 3             1
K-Medoids
cluster package - pam()
# Elbow plot for k-medoids: total within-cluster sum of squares ("wss") of
# pam() for k = 1..15 on the scaled data.
# NOTE(review): fviz_nbclust() is provided by factoextra, pam() by cluster and
# ggtitle() by ggplot2; no library() call is visible in this file - confirm
# the packages are attached before this point.
set.seed(1234)
fviz_nbclust(
  df.train.scale,
  pam,
  method = "wss",
  k.max = 15
) +
  ggtitle("Elbow Method")

# NOTE(review): df.kmedoids is not created anywhere in the visible file -
# presumably the result of pam() on df.train.scale with 3 clusters; confirm
# and restore that step.
# One group of bars per medoid, one bar per variable (colours 2..14).
barplot(t(df.kmedoids$medoids), beside = TRUE, col = 2:14)
legend("bottomleft", colnames(df.train.scale), fill = 2:14, cex = 0.3)

# NOTE(review): the output below shows df with an added kmedois_cluster
# column, but the statements that produced it are not in this file; confirm.
##   Class Alcohol Acid  Ash Alcalinity Magnesium Total_phenols Flavanoids
## 1     1   13.20 1.78 2.14       11.2       100          2.65       2.76
## 2     1   13.16 2.36 2.67       18.6       101          2.80       3.24
## 3     1   14.37 1.95 2.50       16.8       113          3.85       3.49
##   Nonflavanoid_phenols Proanthocyanins color_intensity  Hue X0D280 proline
## 1                 0.26            1.28            4.38 1.05   3.40    1050
## 2                 0.30            2.81            5.68 1.03   3.17    1185
## 3                 0.24            2.18            7.80 0.86   3.45    1480
##   kmean_cluster kmedois_cluster
## 1             1               1
## 2             1               1
## 3             1               1
Hierarchical Clustering
# Fit one agglomerative clustering per linkage method on the pairwise
# distances of the scaled data.
# dist() is kept inline in each call on purpose: plot.hclust() labels the
# x axis with the deparsed first argument of the hclust() call, so passing a
# precomputed variable would change the plot annotation.
df.hclust.single <- hclust(dist(df.train.scale), method = "single")
df.hclust.cplt <- hclust(dist(df.train.scale), method = "complete")
df.hclust.avg <- hclust(dist(df.train.scale), method = "average")
# NOTE(review): per the hclust() documentation, "ward.D" applied to unsquared
# distances does not implement Ward's criterion ("ward.D2" does). Left as
# written because switching would change the recorded cluster assignments.
df.hclust.ward <- hclust(dist(df.train.scale), method = "ward.D")

# Draw the four dendrograms in a 2 x 2 grid and box the 3-cluster cut on each.
par(mfrow = c(2, 2))

plot(df.hclust.single, hang = -1, cex = 0.4)
rect.hclust(df.hclust.single, k = 3, border = "skyblue")

plot(df.hclust.cplt, hang = -1, cex = 0.4)
rect.hclust(df.hclust.cplt, k = 3, border = "skyblue")

plot(df.hclust.avg, hang = -1, cex = 0.4)
rect.hclust(df.hclust.avg, k = 3, border = "skyblue")

plot(df.hclust.ward, hang = -1, cex = 0.4)
rect.hclust(df.hclust.ward, k = 3, border = "skyblue")

# Assign clusters (Ward)
#--------------------------------------------------------------------
# Cut the Ward dendrogram into 3 clusters and attach the labels to df
#--------------------------------------------------------------------
hclust_cluster <- cutree(df.hclust.ward, k = 3)
df[["cluster_cluster"]] <- hclust_cluster

# Preview the first three rows with all cluster-label columns.
head(df, 3)
##   Class Alcohol Acid  Ash Alcalinity Magnesium Total_phenols Flavanoids
## 1     1   13.20 1.78 2.14       11.2       100          2.65       2.76
## 2     1   13.16 2.36 2.67       18.6       101          2.80       3.24
## 3     1   14.37 1.95 2.50       16.8       113          3.85       3.49
##   Nonflavanoid_phenols Proanthocyanins color_intensity  Hue X0D280 proline
## 1                 0.26            1.28            4.38 1.05   3.40    1050
## 2                 0.30            2.81            5.68 1.03   3.17    1185
## 3                 0.24            2.18            7.80 0.86   3.45    1480
##   kmean_cluster kmedois_cluster cluster_cluster
## 1             1               1               1
## 2             1               1               1
## 3             1               1               1
```