讀資料,並選取欲分析變項
# Load the raw data set from CSV; the first row holds the column names.
CAKM_raw <- read.csv(
  file = "D:/104/ML_R/WOW_data.csv",
  header = TRUE,
  sep = ","
)
# Keep only the four analysis variables, picked by column position
# (reading_3, math_3, WA_S_2 and CO_S_2 according to the printed head).
analysis_cols <- c(12, 13, 20, 21)
CAKM <- CAKM_raw[analysis_cols]
head(CAKM)
## reading_3 math_3 WA_S_2 CO_S_2
## 1 492 498 3.2 1.000000
## 2 494 508 4.9 1.000000
## 3 523 513 3.0 1.333333
## 4 476 493 2.2 1.833333
## 5 515 508 4.1 1.166667
## 6 450 498 4.7 2.166667
載入套件
library(factoextra)
library(gridExtra)
library(cluster)
library(plotly)
library(plyr)
PCA分析(Principal Component Analysis)
# Standardise every variable so the PCA is not dominated by the variables
# measured on the largest raw scale.
CAKM_Z <- data.frame(scale(CAKM))
# Principal component analysis on the standardised data.
pca <- princomp(x = CAKM_Z)
# Scree plot of the component variances, drawn as a line chart.
screeplot(pca, type = "lines", main = "PCA screeplot")
前三個主成份的累積解釋量達 90% 以上(約 90.01%)
# Print the standard deviation, proportion of variance and cumulative
# proportion of variance for each principal component.
summary(pca)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.2793645 1.0406093 0.9285078 0.63048139
## Proportion of Variance 0.4113246 0.2721269 0.2166542 0.09989428
## Cumulative Proportion 0.4113246 0.6834515 0.9001057 1.00000000
將三個主成份存成comp
# Keep the scores on the first three principal components.
comp <- data.frame(pca$scores[, 1:3])
# k-means with k = 2, 3 and 4. The seed is fixed first so that the random
# starting centres, and therefore the partitions, are reproducible.
# NOTE(review): the clustering runs on the raw CAKM, whereas the PCA above
# used the standardised CAKM_Z -- confirm the unscaled variables are intended,
# since reading/math are on a much larger scale than the other two variables.
set.seed(1234)
fit_2 <- kmeans(x = CAKM, centers = 2, iter.max = 100)
fit_3 <- kmeans(x = CAKM, centers = 3, iter.max = 100)
fit_4 <- kmeans(x = CAKM, centers = 4, iter.max = 100)
# Show the three-cluster solution.
fit_3
## K-means clustering with 3 clusters of sizes 25, 77, 91
##
## Cluster means:
## reading_3 math_3 WA_S_2 CO_S_2
## 1 464.5200 492.0800 3.484000 1.846667
## 2 492.7273 502.5584 3.561472 1.766234
## 3 514.4725 510.4286 3.444414 1.656044
##
## Clustering vector:
## [1] 2 2 3 1 3 1 3 3 3 3 3 3 2 3 2 3 1 3 2 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 2
## [36] 3 2 3 3 3 3 2 3 3 2 2 3 2 3 2 3 3 2 1 3 3 3 2 3 2 3 2 2 2 3 1 3 2 3 2
## [71] 3 2 2 3 3 3 3 2 3 2 2 3 3 3 3 3 2 2 2 2 2 1 1 1 2 1 2 1 2 2 1 1 3 3 2
## [106] 2 2 2 2 2 1 3 2 2 3 2 2 1 3 2 3 1 2 3 1 2 2 1 3 3 1 3 1 2 3 2 3 3 2 2
## [141] 2 2 2 2 2 3 3 2 1 2 1 3 3 3 3 2 2 2 3 3 2 3 3 2 3 1 3 2 2 2 2 2 2 3 2
## [176] 3 2 2 3 3 2 3 3 3 2 2 3 3 3 1 1 2 3
##
## Within cluster sum of squares by cluster:
## [1] 7354.307 8297.201 12960.197
## (between_SS / total_SS = 68.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
繪製集群=2、3、4圖形
# Cluster plots for k = 2, 3 and 4, each cluster outlined by its convex hull.
# `ellipse.type` is used because `frame.type` is the deprecated name of this
# fviz_cluster() argument, and TRUE is spelled out because `T` is an ordinary
# variable that can be reassigned.
p2 <- fviz_cluster(fit_2, data = CAKM, ellipse.type = "convex") +
  theme_minimal()
p3 <- fviz_cluster(fit_3, data = CAKM, ellipse.type = "convex") +
  theme_minimal()
p4 <- fviz_cluster(fit_4, data = CAKM, ellipse.type = "convex") +
  theme_minimal()
# Arrange the three panels in a single figure.
grid.arrange(p2, p3, p4, as.table = TRUE)
將剛剛分群結果存入新變項中
# Append the three-cluster assignment as an extra column next to the
# analysis variables, then preview the first rows.
CAKM_group <- cbind(CAKM, fit_3$cluster)
head(CAKM_group)
## reading_3 math_3 WA_S_2 CO_S_2 fit_3$cluster
## 1 492 498 3.2 1.000000 2
## 2 494 508 4.9 1.000000 2
## 3 523 513 3.0 1.333333 3
## 4 476 493 2.2 1.833333 1
## 5 515 508 4.1 1.166667 3
## 6 450 498 4.7 2.166667 1
# Give the columns readable names; the last column is the cluster label.
# NOTE(review): "Warth" looks like a typo for "Warmth", but later code
# refers to CAKM_group$Warth, so the spelling is left unchanged here.
colnames(CAKM_group) <- c("Reading", "Math", "Warth", "Conflict", "Cluster")
進行均值比較
# One-way ANOVA of each variable across the three k-means clusters.
# Cluster holds integer codes (1, 2, 3); left numeric, aov() fits a single
# linear slope (1 degree of freedom) instead of comparing the three group
# means. Wrapping it in factor() gives the intended 2-df between-cluster test.
# Columns are referenced by bare name because `data = CAKM_group` already
# supplies them.
fit_3R <- aov(Reading ~ factor(Cluster), data = CAKM_group)
fit_3M <- aov(Math ~ factor(Cluster), data = CAKM_group)
fit_3W <- aov(Warth ~ factor(Cluster), data = CAKM_group)
fit_3C <- aov(Conflict ~ factor(Cluster), data = CAKM_group)
## reading_3
# Print the ANOVA table for the Reading model.
summary(fit_3R)
## Df Sum Sq Mean Sq F value Pr(>F)
## CAKM_group$Cluster 1 54111 54111 650.8 <2e-16 ***
## Residuals 191 15880 83
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## math_3
# Print the ANOVA table for the Math model.
summary(fit_3M)
## Df Sum Sq Mean Sq F value Pr(>F)
## CAKM_group$Cluster 1 7246 7246 106.6 <2e-16 ***
## Residuals 191 12979 68
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## WA_S_2
# Print the ANOVA table for the Warth (WA_S_2) model.
summary(fit_3W)
## Df Sum Sq Mean Sq F value Pr(>F)
## CAKM_group$Cluster 1 0.21 0.2082 0.265 0.607
## Residuals 191 149.80 0.7843
## CO_S_2
# Print the ANOVA table for the Conflict (CO_S_2) model.
summary(fit_3C)
## Df Sum Sq Mean Sq F value Pr(>F)
## CAKM_group$Cluster 1 0.93 0.9251 2.357 0.126
## Residuals 191 74.98 0.3925
繪製reading_3、math_3的分群結果
# Recode the numeric cluster ids into descriptive labels for the legend.
cluster_ids <- c(1, 2, 3)
cluster_labels <- c("Cluster1", "Cluster2", "Cluster3")
CAKM_group$Cluster <- mapvalues(
  CAKM_group$Cluster,
  from = cluster_ids,
  to = cluster_labels
)
# Interactive scatter plot of Reading against Math, coloured by cluster.
plot_ly(data = CAKM_group, x = ~Reading, y = ~Math, color = ~Cluster)
各變項之間的分群結果
# Scatter-plot matrix of all variable pairs, drawn once per k-means solution
# (k = 2, 3, 4) with the points coloured by cluster membership.
for (fit in list(fit_2, fit_3, fit_4)) {
  pairs(CAKM, col = fit$cluster)
}
利用 PCA 和 k-means 結果繪製 3D 圖
# Combine the first three PCA scores with the labelled k-means clusters,
# then preview the first rows.
kmean3 <- cbind(comp, CAKM_group$Cluster)
head(kmean3)
## Comp.1 Comp.2 Comp.3 CAKM_group$Cluster
## 1 0.3918984 -0.5404339 1.19858957 Cluster2
## 2 -0.1852734 -1.9504337 -0.09646834 Cluster2
## 3 -1.5843584 0.1895629 0.38261591 Cluster3
## 4 1.5369261 1.0680770 1.22572882 Cluster1
## 5 -0.9292431 -1.0013681 -0.04619167 Cluster3
## 6 2.4931532 -0.8518379 -0.55330209 Cluster1
# 3D scatter of the first three principal components, coloured by cluster.
# Axis titles for the 3D scene are collected up front.
scene_axes <- list(
  xaxis = list(title = 'Comp.1'),
  yaxis = list(title = 'Comp.2'),
  zaxis = list(title = 'Comp.3')
)
p5 <- plot_ly(
  kmean3,
  x = ~Comp.1,
  y = ~Comp.2,
  z = ~Comp.3,
  color = ~CAKM_group$Cluster
)
# Draw the observations as markers, then apply the axis titles.
p5 <- add_markers(p5)
p5 <- layout(p5, scene = scene_axes)
p5