K-means analysis

資料整理

讀資料，並選取欲分析變項

# Load the raw data and keep only the four variables used in the cluster
# analysis. Columns are selected by name rather than by position (originally
# columns 12, 13, 20 and 21) so the selection keeps working if the column
# order of the CSV changes.
CAKM_raw <- read.csv("D:/104/ML_R/WOW_data.csv",
                     header = TRUE, sep = ",")
analysis_vars <- c("reading_3", "math_3", "WA_S_2", "CO_S_2")
CAKM <- CAKM_raw[analysis_vars]
head(CAKM)

##   reading_3 math_3 WA_S_2   CO_S_2
## 1       492    498    3.2 1.000000
## 2       494    508    4.9 1.000000
## 3       523    513    3.0 1.333333
## 4       476    493    2.2 1.833333
## 5       515    508    4.1 1.166667
## 6       450    498    4.7 2.166667

讀取套件

library(factoextra)
library(gridExtra)
library(cluster)
library(plotly)
library(plyr)

先利用PCA看看可以分為多少群

PCA分析(Principal Component Analysis)

# Standardise every variable (mean 0, sd 1), run a principal component
# analysis on the standardised data, and draw the scree plot as a line chart.
CAKM_Z <- data.frame(scale(CAKM, center = TRUE, scale = TRUE))
pca <- princomp(x = CAKM_Z)
screeplot(pca, type = "lines", main = "PCA screeplot")

前三個主成份的累積解釋變異量達到 90% 以上

# Print the standard deviation, proportion of variance and cumulative
# proportion of variance for each principal component.
summary(pca)

## Importance of components:
##                           Comp.1    Comp.2    Comp.3     Comp.4
## Standard deviation     1.2793645 1.0406093 0.9285078 0.63048139
## Proportion of Variance 0.4113246 0.2721269 0.2166542 0.09989428
## Cumulative Proportion  0.4113246 0.6834515 0.9001057 1.00000000

將三個主成份存成comp

# Keep the scores on the first three principal components for later plotting.
comp <- data.frame(pca$scores[, seq_len(3), drop = FALSE])

進行K-means分析

# Fit k-means solutions with 2, 3 and 4 clusters (at most 100 iterations
# each) from a fixed seed so the assignments are reproducible, then print the
# three-cluster solution.
# NOTE(review): the clustering is run on the unscaled CAKM, whereas the PCA
# uses the standardised CAKM_Z. The large-valued score columns will likely
# dominate the Euclidean distances - confirm this is intended.
set.seed(1234)
cluster_counts <- 2:4
fits <- lapply(
  cluster_counts,
  function(k) kmeans(CAKM, centers = k, iter.max = 100)
)
fit_2 <- fits[[1]]
fit_3 <- fits[[2]]
fit_4 <- fits[[3]]
fit_3

## K-means clustering with 3 clusters of sizes 25, 77, 91
## 
## Cluster means:
##   reading_3   math_3   WA_S_2   CO_S_2
## 1  464.5200 492.0800 3.484000 1.846667
## 2  492.7273 502.5584 3.561472 1.766234
## 3  514.4725 510.4286 3.444414 1.656044
## 
## Clustering vector:
##   [1] 2 2 3 1 3 1 3 3 3 3 3 3 2 3 2 3 1 3 2 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 2
##  [36] 3 2 3 3 3 3 2 3 3 2 2 3 2 3 2 3 3 2 1 3 3 3 2 3 2 3 2 2 2 3 1 3 2 3 2
##  [71] 3 2 2 3 3 3 3 2 3 2 2 3 3 3 3 3 2 2 2 2 2 1 1 1 2 1 2 1 2 2 1 1 3 3 2
## [106] 2 2 2 2 2 1 3 2 2 3 2 2 1 3 2 3 1 2 3 1 2 2 1 3 3 1 3 1 2 3 2 3 3 2 2
## [141] 2 2 2 2 2 3 3 2 1 2 1 3 3 3 3 2 2 2 3 3 2 3 3 2 3 1 3 2 2 2 2 2 2 3 2
## [176] 3 2 2 3 3 2 3 3 3 2 2 3 3 3 1 1 2 3
## 
## Within cluster sum of squares by cluster:
## [1]  7354.307  8297.201 12960.197
##  (between_SS / total_SS =  68.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

繪製集群=2、3、4圖形

# Draw the 2-, 3- and 4-cluster solutions with a convex hull around each
# cluster and arrange the three plots on one page.
# `ellipse.type` replaces the deprecated `frame.type` argument of
# fviz_cluster(), which triggers a deprecation warning in current factoextra.
p2 <- fviz_cluster(fit_2, data = CAKM, ellipse.type = "convex") +
  theme_minimal()
p3 <- fviz_cluster(fit_3, data = CAKM, ellipse.type = "convex") +
  theme_minimal()
p4 <- fviz_cluster(fit_4, data = CAKM, ellipse.type = "convex") +
  theme_minimal()
grid.arrange(p2, p3, p4, as.table = TRUE)

進行ANOVA均值比較(以三群的分類結果為例)

將剛剛分群結果存入新變項中

# Append the three-cluster assignment to the analysis variables as an extra
# column, then preview the first rows.
CAKM_group <- data.frame(CAKM, fit_3$cluster, check.names = FALSE)
head(CAKM_group, n = 6L)

##   reading_3 math_3 WA_S_2   CO_S_2 fit_3$cluster
## 1       492    498    3.2 1.000000             2
## 2       494    508    4.9 1.000000             2
## 3       523    513    3.0 1.333333             3
## 4       476    493    2.2 1.833333             1
## 5       515    508    4.1 1.166667             3
## 6       450    498    4.7 2.166667             1

# Give the five columns short, readable names.
# NOTE(review): "Warth" is presumably a typo for "Warmth"; it is kept as-is
# because later code refers to the column by this exact name.
colnames(CAKM_group) <- c("Reading", "Math", "Warth", "Conflict", "Cluster")

進行均值比較

# One-way ANOVAs comparing the mean of each variable across the three
# k-means clusters.
# Cluster holds integer codes (1, 2, 3). It must enter the model as a factor;
# as a plain number aov() fits a linear trend on the code (1 Df) rather than
# a comparison of group means (number of clusters - 1 = 2 Df).
# Variables are referenced by name and resolved through `data`, so the
# `CAKM_group$` prefix is not needed inside the formulas.
fit_3R <- aov(Reading  ~ factor(Cluster), data = CAKM_group)
fit_3M <- aov(Math     ~ factor(Cluster), data = CAKM_group)
fit_3W <- aov(Warth    ~ factor(Cluster), data = CAKM_group)
fit_3C <- aov(Conflict ~ factor(Cluster), data = CAKM_group)

# ANOVA table for Reading (source column reading_3) by cluster
summary(fit_3R)

##                     Df Sum Sq Mean Sq F value Pr(>F)    
## CAKM_group$Cluster   1  54111   54111   650.8 <2e-16 ***
## Residuals          191  15880      83                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# ANOVA table for Math (source column math_3) by cluster
summary(fit_3M)

##                     Df Sum Sq Mean Sq F value Pr(>F)    
## CAKM_group$Cluster   1   7246    7246   106.6 <2e-16 ***
## Residuals          191  12979      68                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# ANOVA table for Warth (source column WA_S_2) by cluster
summary(fit_3W)

##                     Df Sum Sq Mean Sq F value Pr(>F)
## CAKM_group$Cluster   1   0.21  0.2082   0.265  0.607
## Residuals          191 149.80  0.7843

# ANOVA table for Conflict (source column CO_S_2) by cluster
summary(fit_3C)

##                     Df Sum Sq Mean Sq F value Pr(>F)
## CAKM_group$Cluster   1   0.93  0.9251   2.357  0.126
## Residuals          191  74.98  0.3925

作圖

繪製reading_3、math_3的分群結果

# Replace the numeric cluster codes with readable labels, then draw an
# interactive Reading-vs-Math scatter plot coloured by cluster.
cluster_codes <- c(1, 2, 3)
cluster_labels <- c("Cluster1", "Cluster2", "Cluster3")
CAKM_group$Cluster <- mapvalues(
  CAKM_group$Cluster,
  from = cluster_codes,
  to = cluster_labels
)
plot_ly(data = CAKM_group, x = ~Reading, y = ~Math, color = ~Cluster)

各變項之間的分群結果

# Scatter-plot matrix of all analysis variables, drawn once per k-means
# solution (2, 3 and 4 clusters) with points coloured by cluster membership.
for (fit in list(fit_2, fit_3, fit_4)) {
  pairs(CAKM, col = fit$cluster)
}

利用PCA和K-means結果繪製3D圖

# Combine the first three principal-component scores with the cluster labels
# and preview the first rows.
kmean3 <- data.frame(comp, CAKM_group$Cluster, check.names = FALSE)
head(kmean3, n = 6L)

##       Comp.1     Comp.2      Comp.3 CAKM_group$Cluster
## 1  0.3918984 -0.5404339  1.19858957           Cluster2
## 2 -0.1852734 -1.9504337 -0.09646834           Cluster2
## 3 -1.5843584  0.1895629  0.38261591           Cluster3
## 4  1.5369261  1.0680770  1.22572882           Cluster1
## 5 -0.9292431 -1.0013681 -0.04619167           Cluster3
## 6  2.4931532 -0.8518379 -0.55330209           Cluster1

# Interactive 3D scatter of the first three principal-component scores,
# coloured by cluster label, with each axis titled after its component.
scene_axes <- list(
  xaxis = list(title = 'Comp.1'),
  yaxis = list(title = 'Comp.2'),
  zaxis = list(title = 'Comp.3')
)
p5 <- plot_ly(kmean3, x = ~Comp.1, y = ~Comp.2, z = ~Comp.3,
              color = ~CAKM_group$Cluster)
p5 <- add_markers(p5)
p5 <- layout(p5, scene = scene_axes)

p5

K-means analysis

邱浩恩

2016年12月25日

資料整理

先利用PCA看看可以分為多少群

進行K-means分析

進行ANOVA均值比較(以三群的分類結果為例)

作圖