# Load the homework data and keep the two clustering variables.
# The columns are selected by name instead of by position (6, 7) so the
# script fails loudly ("undefined columns selected") rather than silently
# clustering on the wrong variables if the CSV layout ever changes.
data <- read.csv("HW2_data.csv")
CA <- data[c("ComputerAbility", "PChour")]
# Standardise (z-score) both variables before computing distances.
CAz <- as.data.frame(scale(CA))
str(CA)
## 'data.frame': 15 obs. of 2 variables:
## $ ComputerAbility: int 2 1 2 1 3 2 3 2 3 1 ...
## $ PChour : int 12 22 15 30 10 12 16 8 11 26 ...
str(CAz)
## 'data.frame': 15 obs. of 2 variables:
## $ ComputerAbility: num 0.0835 -1.1684 0.0835 -1.1684 1.3353 ...
## $ PChour : num -0.644 0.479 -0.307 1.378 -0.869 ...
# Pairwise Euclidean distances between the standardised observations, and
# their squares; each is also kept as a full square matrix.
eucd <- dist(CAz, method = "euclidean")
eucd_matrix <- as.matrix(eucd)
eucd2 <- eucd^2
eucd2_matrix <- as.matrix(eucd2)
## Nearest neighbour (single linkage)
# Agglomerative hierarchical clustering on the squared distances.
CA_N <- hclust(eucd2, method = "single")
CA_N
##
## Call:
## hclust(d = eucd2, method = "single")
##
## Cluster method : single
## Distance : euclidean
## Number of objects: 15
# Cut the single-linkage tree into 2, 3 and 4 clusters and attach the
# memberships to the standardised data.
group2n <- cutree(CA_N, k = 2)
group3n <- cutree(CA_N, k = 3)
group4n <- cutree(CA_N, k = 4)
CAN <- cbind(CAz, group2n, group3n, group4n)
CAN
## ComputerAbility PChour group2n group3n group4n
## 1 0.08345762 -0.6441967 1 1 1
## 2 -1.16840666 0.4794022 2 2 2
## 3 0.08345762 -0.3071170 1 1 1
## 4 -1.16840666 1.3782813 2 2 2
## 5 1.33532190 -0.8689165 1 3 3
## 6 0.08345762 -0.6441967 1 1 1
## 7 1.33532190 -0.1947571 1 3 3
## 8 0.08345762 -1.0936362 1 1 1
## 9 1.33532190 -0.7565566 1 3 3
## 10 -1.16840666 0.9288417 2 2 2
## 11 0.08345762 -0.4194769 1 1 1
## 12 1.33532190 -1.3183560 1 3 3
## 13 -1.16840666 2.0524406 2 2 4
## 14 0.08345762 0.2546824 1 1 1
## 15 -1.16840666 1.1535615 2 2 2
# Dendrogram of the single-linkage tree with the 3-cluster cut outlined in red.
plot(CA_N)
rect.hclust(CA_N, k = 3, border = "red")
# Data types and descriptive statistics
str(CAN)
## 'data.frame': 15 obs. of 5 variables:
## $ ComputerAbility: num 0.0835 -1.1684 0.0835 -1.1684 1.3353 ...
## $ PChour : num -0.644 0.479 -0.307 1.378 -0.869 ...
## $ group2n : int 1 2 1 2 1 1 1 1 1 2 ...
## $ group3n : int 1 2 1 2 3 1 3 1 3 2 ...
## $ group4n : int 1 2 1 2 3 1 3 1 3 2 ...
# Use the 3-cluster membership as a categorical grouping variable.
CAN$group3n <- as.factor(CAN$group3n)
library(psych)

# Descriptives of the two standardised variables within each cluster.
describeBy(CAN[1:2], group = CAN$group3n)
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 6 0.08 0.00 0.08 0.08 0.00 0.08 0.08 0.00 NaN
## PChour 2 6 -0.48 0.45 -0.53 -0.48 0.25 -1.09 0.25 1.35 0.28
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.21 0.18
## ------------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 5 -1.17 0.00 -1.17 -1.17 0.00 -1.17 -1.17 0.00 NaN
## PChour 2 5 1.20 0.58 1.15 1.20 0.33 0.48 2.05 1.57 0.24
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.59 0.26
## ------------------------------------------------------------
## group: 3
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 4 1.34 0.00 1.34 1.34 0.00 1.34 1.34 0.00 NaN
## PChour 2 4 -0.78 0.46 -0.81 -0.78 0.42 -1.32 -0.19 1.12 0.13
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.89 0.23
#Leven test
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
leveneTest(ComputerAbility~group3n,data=CAN)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 NaN NaN
## 12
leveneTest(PChour~group3n,data=CAN)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.1333 0.8765
## 12
#ANOVA
fit_cn<-aov(ComputerAbility~group3n,data=CAN)
summary(fit_cn)
## Df Sum Sq Mean Sq F value Pr(>F)
## group3n 2 14 7 1.57e+32 <2e-16 ***
## Residuals 12 0 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_pn<-aov(PChour~group3n,data=CAN)
summary(fit_pn)
## Df Sum Sq Mean Sq F value Pr(>F)
## group3n 2 11.002 5.501 22.02 9.64e-05 ***
## Residuals 12 2.998 0.250
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post hoc: Scheffe pairwise comparisons between the three single-linkage
# clusters, for each variable. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
library(agricolae)
sch_cn <- scheffe.test(fit_cn, "group3n", group = FALSE)
## Warning in anova.lm(y): ANOVA F-tests on an essentially perfect fit are
## unreliable
sch_cn$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 1.251864 0 *** 1.251864 1.251864
## 1 - 3 -1.251864 0 *** -1.251864 -1.251864
## 2 - 3 -2.503729 0 *** -2.503729 -2.503729
sch_pn <- scheffe.test(fit_pn, "group3n", group = FALSE)
sch_pn$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 -1.6741623 0.0005 *** -2.5178261 -0.8304985
## 1 - 3 0.3089897 0.6428 -0.5903589 1.2083383
## 2 - 3 1.9831520 0.0003 *** 1.0485215 2.9177825
## Furthest neighbour (complete linkage)
# Agglomerative hierarchical clustering on the squared distances.
CA_F <- hclust(eucd2, method = "complete")
CA_F
##
## Call:
## hclust(d = eucd2, method = "complete")
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 15
#plot cluster dendrogram
group2f<-cutree(CA_F,k=2)
group3f<-cutree(CA_F,k=3)
group4f<-cutree(CA_F,k=4)
CAF<-cbind(CAz,group2f,group3f,group4f)
CAF
## ComputerAbility PChour group2f group3f group4f
## 1 0.08345762 -0.6441967 1 1 1
## 2 -1.16840666 0.4794022 2 2 2
## 3 0.08345762 -0.3071170 1 1 1
## 4 -1.16840666 1.3782813 2 2 3
## 5 1.33532190 -0.8689165 1 3 4
## 6 0.08345762 -0.6441967 1 1 1
## 7 1.33532190 -0.1947571 1 3 4
## 8 0.08345762 -1.0936362 1 1 1
## 9 1.33532190 -0.7565566 1 3 4
## 10 -1.16840666 0.9288417 2 2 2
## 11 0.08345762 -0.4194769 1 1 1
## 12 1.33532190 -1.3183560 1 3 4
## 13 -1.16840666 2.0524406 2 2 3
## 14 0.08345762 0.2546824 1 1 1
## 15 -1.16840666 1.1535615 2 2 3
plot(CA_F)
rect.hclust(CA_F,k=3,border="red")

#data type & descriptives statistics
str(CAF)
## 'data.frame': 15 obs. of 5 variables:
## $ ComputerAbility: num 0.0835 -1.1684 0.0835 -1.1684 1.3353 ...
## $ PChour : num -0.644 0.479 -0.307 1.378 -0.869 ...
## $ group2f : int 1 2 1 2 1 1 1 1 1 2 ...
## $ group3f : int 1 2 1 2 3 1 3 1 3 2 ...
## $ group4f : int 1 2 1 3 4 1 4 1 4 2 ...
CAF$group3f<-as.factor(CAF$group3f)
describeBy(CAF[1:2],CAF$group3f)
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 6 0.08 0.00 0.08 0.08 0.00 0.08 0.08 0.00 NaN
## PChour 2 6 -0.48 0.45 -0.53 -0.48 0.25 -1.09 0.25 1.35 0.28
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.21 0.18
## ------------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 5 -1.17 0.00 -1.17 -1.17 0.00 -1.17 -1.17 0.00 NaN
## PChour 2 5 1.20 0.58 1.15 1.20 0.33 0.48 2.05 1.57 0.24
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.59 0.26
## ------------------------------------------------------------
## group: 3
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 4 1.34 0.00 1.34 1.34 0.00 1.34 1.34 0.00 NaN
## PChour 2 4 -0.78 0.46 -0.81 -0.78 0.42 -1.32 -0.19 1.12 0.13
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.89 0.23
#Leven test
leveneTest(ComputerAbility~group3f,data=CAF)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 NaN NaN
## 12
leveneTest(PChour~group3f,data=CAF)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.1333 0.8765
## 12
#ANOVA
fit_cf<-aov(ComputerAbility~group3f,data=CAF)
summary(fit_cf)
## Df Sum Sq Mean Sq F value Pr(>F)
## group3f 2 14 7 1.57e+32 <2e-16 ***
## Residuals 12 0 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_pf<-aov(PChour~group3f,data=CAF)
summary(fit_pf)
## Df Sum Sq Mean Sq F value Pr(>F)
## group3f 2 11.002 5.501 22.02 9.64e-05 ***
## Residuals 12 2.998 0.250
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post hoc: Scheffe pairwise comparisons between the three complete-linkage
# clusters, for each variable. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
sch_cf <- scheffe.test(fit_cf, "group3f", group = FALSE)
## Warning in anova.lm(y): ANOVA F-tests on an essentially perfect fit are
## unreliable
sch_cf$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 1.251864 0 *** 1.251864 1.251864
## 1 - 3 -1.251864 0 *** -1.251864 -1.251864
## 2 - 3 -2.503729 0 *** -2.503729 -2.503729
sch_pf <- scheffe.test(fit_pf, "group3f", group = FALSE)
sch_pf$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 -1.6741623 0.0005 *** -2.5178261 -0.8304985
## 1 - 3 0.3089897 0.6428 -0.5903589 1.2083383
## 2 - 3 1.9831520 0.0003 *** 1.0485215 2.9177825
## Centroid
# Hierarchical cluster analysis with centroid linkage.
# This section (and every object named *_C / *c) is the centroid solution,
# but the call previously requested method = "average" (average linkage).
# Centroid linkage is meant to be run on squared Euclidean distances, which
# is what eucd2 holds.
# NOTE(review): the membership table, descriptives and tests recorded further
# down in this section were produced with the old "average" call; re-run the
# section to refresh them.
CA_C <- hclust(eucd2, method = "centroid")
CA_C
##
## Call:
## hclust(d = eucd2, method = "centroid")
##
## Cluster method : centroid
## Distance : euclidean
## Number of objects: 15
#plot cluster dendrogram
group2c<-cutree(CA_C,k=2)
group3c<-cutree(CA_C,k=3)
group4c<-cutree(CA_C,k=4)
CAC<-cbind(CAz,group2c,group3c,group4c)
CAC
## ComputerAbility PChour group2c group3c group4c
## 1 0.08345762 -0.6441967 1 1 1
## 2 -1.16840666 0.4794022 2 2 2
## 3 0.08345762 -0.3071170 1 1 1
## 4 -1.16840666 1.3782813 2 2 2
## 5 1.33532190 -0.8689165 1 3 3
## 6 0.08345762 -0.6441967 1 1 1
## 7 1.33532190 -0.1947571 1 3 3
## 8 0.08345762 -1.0936362 1 1 1
## 9 1.33532190 -0.7565566 1 3 3
## 10 -1.16840666 0.9288417 2 2 2
## 11 0.08345762 -0.4194769 1 1 1
## 12 1.33532190 -1.3183560 1 3 3
## 13 -1.16840666 2.0524406 2 2 4
## 14 0.08345762 0.2546824 1 1 1
## 15 -1.16840666 1.1535615 2 2 2
plot(CA_C)
rect.hclust(CA_C,k=3,border="red")

#data type & descriptives statistics
str(CAC)
## 'data.frame': 15 obs. of 5 variables:
## $ ComputerAbility: num 0.0835 -1.1684 0.0835 -1.1684 1.3353 ...
## $ PChour : num -0.644 0.479 -0.307 1.378 -0.869 ...
## $ group2c : int 1 2 1 2 1 1 1 1 1 2 ...
## $ group3c : int 1 2 1 2 3 1 3 1 3 2 ...
## $ group4c : int 1 2 1 2 3 1 3 1 3 2 ...
CAC$group3c<-as.factor(CAC$group3c)
describeBy(CAC[1:2],CAC$group3c)
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 6 0.08 0.00 0.08 0.08 0.00 0.08 0.08 0.00 NaN
## PChour 2 6 -0.48 0.45 -0.53 -0.48 0.25 -1.09 0.25 1.35 0.28
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.21 0.18
## ------------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 5 -1.17 0.00 -1.17 -1.17 0.00 -1.17 -1.17 0.00 NaN
## PChour 2 5 1.20 0.58 1.15 1.20 0.33 0.48 2.05 1.57 0.24
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.59 0.26
## ------------------------------------------------------------
## group: 3
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 4 1.34 0.00 1.34 1.34 0.00 1.34 1.34 0.00 NaN
## PChour 2 4 -0.78 0.46 -0.81 -0.78 0.42 -1.32 -0.19 1.12 0.13
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.89 0.23
#Leven test
leveneTest(ComputerAbility~group3c,data=CAC)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 NaN NaN
## 12
leveneTest(PChour~group3c,data=CAC)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.1333 0.8765
## 12
#ANOVA
fit_cc<-aov(ComputerAbility~group3c,data=CAC)
summary(fit_cc)
## Df Sum Sq Mean Sq F value Pr(>F)
## group3c 2 14 7 1.57e+32 <2e-16 ***
## Residuals 12 0 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_pc<-aov(PChour~group3c,data=CAC)
summary(fit_pc)
## Df Sum Sq Mean Sq F value Pr(>F)
## group3c 2 11.002 5.501 22.02 9.64e-05 ***
## Residuals 12 2.998 0.250
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post hoc: Scheffe pairwise comparisons between the three clusters of the
# CA_C solution, for each variable. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
sch_cc <- scheffe.test(fit_cc, "group3c", group = FALSE)
## Warning in anova.lm(y): ANOVA F-tests on an essentially perfect fit are
## unreliable
sch_cc$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 1.251864 0 *** 1.251864 1.251864
## 1 - 3 -1.251864 0 *** -1.251864 -1.251864
## 2 - 3 -2.503729 0 *** -2.503729 -2.503729
sch_pc <- scheffe.test(fit_pc, "group3c", group = FALSE)
sch_pc$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 -1.6741623 0.0005 *** -2.5178261 -0.8304985
## 1 - 3 0.3089897 0.6428 -0.5903589 1.2083383
## 2 - 3 1.9831520 0.0003 *** 1.0485215 2.9177825
## Ward's Method
# Hierarchical cluster analysis with Ward's minimum-variance criterion.
# method = "ward.D2" squares the dissimilarities internally, so it must be
# given the plain Euclidean distances (eucd). Passing the already squared
# eucd2, as before, squares them twice and no longer implements Ward's
# criterion.
# NOTE(review): the merge heights in the dendrogram change with this fix and
# the memberships recorded further down may too; re-run the section to
# refresh the recorded output.
CA_W <- hclust(eucd, method = "ward.D2")
CA_W
##
## Call:
## hclust(d = eucd, method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 15
#plot cluster dendrogram
group2w<-cutree(CA_W,k=2)
group3w<-cutree(CA_W,k=3)
group4w<-cutree(CA_W,k=4)
CAW<-cbind(CAz,group2w,group3w,group4w)
CAW
## ComputerAbility PChour group2w group3w group4w
## 1 0.08345762 -0.6441967 1 1 1
## 2 -1.16840666 0.4794022 2 2 2
## 3 0.08345762 -0.3071170 1 1 1
## 4 -1.16840666 1.3782813 2 2 2
## 5 1.33532190 -0.8689165 1 3 3
## 6 0.08345762 -0.6441967 1 1 1
## 7 1.33532190 -0.1947571 1 3 3
## 8 0.08345762 -1.0936362 1 1 1
## 9 1.33532190 -0.7565566 1 3 3
## 10 -1.16840666 0.9288417 2 2 2
## 11 0.08345762 -0.4194769 1 1 1
## 12 1.33532190 -1.3183560 1 3 3
## 13 -1.16840666 2.0524406 2 2 4
## 14 0.08345762 0.2546824 1 1 1
## 15 -1.16840666 1.1535615 2 2 2
plot(CA_W)
rect.hclust(CA_W,k=3,border="red")

#data type & descriptives statistics
str(CAW)
## 'data.frame': 15 obs. of 5 variables:
## $ ComputerAbility: num 0.0835 -1.1684 0.0835 -1.1684 1.3353 ...
## $ PChour : num -0.644 0.479 -0.307 1.378 -0.869 ...
## $ group2w : int 1 2 1 2 1 1 1 1 1 2 ...
## $ group3w : int 1 2 1 2 3 1 3 1 3 2 ...
## $ group4w : int 1 2 1 2 3 1 3 1 3 2 ...
CAW$group3w<-as.factor(CAW$group3w)
describeBy(CAW[1:2],CAW$group3w)
##
## Descriptive statistics by group
## group: 1
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 6 0.08 0.00 0.08 0.08 0.00 0.08 0.08 0.00 NaN
## PChour 2 6 -0.48 0.45 -0.53 -0.48 0.25 -1.09 0.25 1.35 0.28
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.21 0.18
## ------------------------------------------------------------
## group: 2
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 5 -1.17 0.00 -1.17 -1.17 0.00 -1.17 -1.17 0.00 NaN
## PChour 2 5 1.20 0.58 1.15 1.20 0.33 0.48 2.05 1.57 0.24
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.59 0.26
## ------------------------------------------------------------
## group: 3
## vars n mean sd median trimmed mad min max range skew
## ComputerAbility 1 4 1.34 0.00 1.34 1.34 0.00 1.34 1.34 0.00 NaN
## PChour 2 4 -0.78 0.46 -0.81 -0.78 0.42 -1.32 -0.19 1.12 0.13
## kurtosis se
## ComputerAbility NaN 0.00
## PChour -1.89 0.23
#Leven test
leveneTest(ComputerAbility~group3w,data=CAW)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 NaN NaN
## 12
leveneTest(PChour~group3w,data=CAW)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.1333 0.8765
## 12
#ANOVA
fit_cw<-aov(ComputerAbility~group3w,data=CAW)
summary(fit_cw)
## Df Sum Sq Mean Sq F value Pr(>F)
## group3w 2 14 7 1.57e+32 <2e-16 ***
## Residuals 12 0 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_pw<-aov(PChour~group3w,data=CAW)
summary(fit_pw)
## Df Sum Sq Mean Sq F value Pr(>F)
## group3w 2 11.002 5.501 22.02 9.64e-05 ***
## Residuals 12 2.998 0.250
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post hoc: Scheffe pairwise comparisons between the three Ward clusters,
# for each variable. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
sch_cw <- scheffe.test(fit_cw, "group3w", group = FALSE)
## Warning in anova.lm(y): ANOVA F-tests on an essentially perfect fit are
## unreliable
sch_cw$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 1.251864 0 *** 1.251864 1.251864
## 1 - 3 -1.251864 0 *** -1.251864 -1.251864
## 2 - 3 -2.503729 0 *** -2.503729 -2.503729
sch_pw <- scheffe.test(fit_pw, "group3w", group = FALSE)
sch_pw$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 -1.6741623 0.0005 *** -2.5178261 -0.8304985
## 1 - 3 0.3089897 0.6428 -0.5903589 1.2083383
## 2 - 3 1.9831520 0.0003 *** 1.0485215 2.9177825
## K-means
# Take the two clustering variables by name rather than by position (6, 7),
# so a change in the CSV column order raises an error instead of silently
# clustering on other variables; then standardise them.
KM <- data[c("ComputerAbility", "PChour")]
KMz <- as.data.frame(scale(KM))
# 2 clusters
# The seed is fixed just before kmeans() so the fit is reproducible.
set.seed(1234)
KM2 <- kmeans(KMz, centers = 2, iter.max = 10, nstart = 1)
KM2
## K-means clustering with 2 clusters of sizes 10, 5
##
## Cluster means:
## ComputerAbility PChour
## 1 0.5842033 -0.5992527
## 2 -1.1684067 1.1985055
##
## Clustering vector:
## [1] 1 2 1 2 1 1 1 1 1 2 1 1 2 1 2
##
## Within cluster sum of squares by cluster:
## [1] 5.634706 1.353373
## (between_SS / total_SS = 75.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
KM2$size
## [1] 10 5
KM2$centers
## ComputerAbility PChour
## 1 0.5842033 -0.5992527
## 2 -1.1684067 1.1985055
KM2_c<-cbind(KMz,group2=KM2$cluster)
str(KM2_c)
## 'data.frame': 15 obs. of 3 variables:
## $ ComputerAbility: num 0.0835 -1.1684 0.0835 -1.1684 1.3353 ...
## $ PChour : num -0.644 0.479 -0.307 1.378 -0.869 ...
## $ group2 : int 1 2 1 2 1 1 1 1 1 2 ...
KM2_c$group2<-as.factor(KM2_c$group2)
str(KM2_c)
## 'data.frame': 15 obs. of 3 variables:
## $ ComputerAbility: num 0.0835 -1.1684 0.0835 -1.1684 1.3353 ...
## $ PChour : num -0.644 0.479 -0.307 1.378 -0.869 ...
## $ group2 : Factor w/ 2 levels "1","2": 1 2 1 2 1 1 1 1 1 2 ...
#leven test
leveneTest(ComputerAbility~group2,data=KM2_c)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 2.8889 0.113
## 13
leveneTest(PChour~group2,data=KM2_c)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.1512 0.7037
## 13
#ANOVA
fit_c2<-aov(ComputerAbility~group2,data=KM2_c)
summary(fit_c2)
## Df Sum Sq Mean Sq F value Pr(>F)
## group2 1 10.239 10.239 35.39 4.83e-05 ***
## Residuals 13 3.761 0.289
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_p2<-aov(PChour~group2,data=KM2_c)
summary(fit_p2)
## Df Sum Sq Mean Sq F value Pr(>F)
## group2 1 10.773 10.773 43.4 1.75e-05 ***
## Residuals 13 3.227 0.248
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post hoc: Scheffe comparison between the two k-means clusters, for each
# variable. `FALSE` is spelled out because `F` is an ordinary variable that
# can be reassigned.
sch_c2 <- scheffe.test(fit_c2, "group2", group = FALSE)
sch_c2$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 1.75261 0 *** 1.116137 2.389083
sch_p2 <- scheffe.test(fit_p2, "group2", group = FALSE)
sch_p2$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 -1.797758 0 *** -2.387292 -1.208225
#scatter plot
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
##
## Attaching package: 'factoextra'
## The following object is masked from 'package:agricolae':
##
## hcut
# Scatter plot of the 2-cluster solution.
# ellipse.type = "norm" failed here ("Computation failed in stat_ellipse()",
# chol() error): all members of one cluster share the same ComputerAbility
# value, so that cluster's covariance matrix is singular. A convex hull
# outlines the clusters without needing a covariance estimate.
fviz_cluster(KM2, data = KMz[1:2], ellipse.type = "convex")

# 3 clusters
# Same seed as the other k-means fits, set immediately before the call.
set.seed(1234)
KM3 <- kmeans(KMz, centers = 3, iter.max = 10, nstart = 1)
KM3
## K-means clustering with 3 clusters of sizes 4, 5, 6
##
## Cluster means:
## ComputerAbility PChour
## 1 1.33532190 -0.7846466
## 2 -1.16840666 1.1985055
## 3 0.08345762 -0.4756569
##
## Clustering vector:
## [1] 3 2 3 2 1 3 1 3 1 2 3 1 2 3 2
##
## Within cluster sum of squares by cluster:
## [1] 0.6407058 1.3533726 1.0036672
## (between_SS / total_SS = 89.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
KM3$size
## [1] 4 5 6
KM3$centers
## ComputerAbility PChour
## 1 1.33532190 -0.7846466
## 2 -1.16840666 1.1985055
## 3 0.08345762 -0.4756569
KM3_c<-cbind(KMz,group3=KM3$cluster)
str(KM3_c)
## 'data.frame': 15 obs. of 3 variables:
## $ ComputerAbility: num 0.0835 -1.1684 0.0835 -1.1684 1.3353 ...
## $ PChour : num -0.644 0.479 -0.307 1.378 -0.869 ...
## $ group3 : int 3 2 3 2 1 3 1 3 1 2 ...
KM3_c$group3<-as.factor(KM3_c$group3)
str(KM3_c)
## 'data.frame': 15 obs. of 3 variables:
## $ ComputerAbility: num 0.0835 -1.1684 0.0835 -1.1684 1.3353 ...
## $ PChour : num -0.644 0.479 -0.307 1.378 -0.869 ...
## $ group3 : Factor w/ 3 levels "1","2","3": 3 2 3 2 1 3 1 3 1 2 ...
#leven test
leveneTest(ComputerAbility~group3,data=KM3_c)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 NaN NaN
## 12
leveneTest(PChour~group3,data=KM3_c)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.1333 0.8765
## 12
#ANOVA
fit_c3<-aov(ComputerAbility~group3,data=KM3_c)
summary(fit_c3)
## Df Sum Sq Mean Sq F value Pr(>F)
## group3 2 14 7 1.115e+32 <2e-16 ***
## Residuals 12 0 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_p3<-aov(PChour~group3,data=KM3_c)
summary(fit_p3)
## Df Sum Sq Mean Sq F value Pr(>F)
## group3 2 11.002 5.501 22.02 9.64e-05 ***
## Residuals 12 2.998 0.250
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post hoc: Scheffe pairwise comparisons between the three k-means clusters,
# for each variable. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
sch_c3 <- scheffe.test(fit_c3, "group3", group = FALSE)
## Warning in anova.lm(y): ANOVA F-tests on an essentially perfect fit are
## unreliable
sch_c3$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 2.503729 0 *** 2.503729 2.503729
## 1 - 3 1.251864 0 *** 1.251864 1.251864
## 2 - 3 -1.251864 0 *** -1.251864 -1.251864
sch_p3 <- scheffe.test(fit_p3, "group3", group = FALSE)
sch_p3$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 -1.9831520 0.0003 *** -2.9177825 -1.0485215
## 1 - 3 -0.3089897 0.6428 -1.2083383 0.5903589
## 2 - 3 1.6741623 0.0005 *** 0.8304985 2.5178261
# Scatter plot of the 3-cluster solution.
# ellipse.type = "norm" failed here ("Computation failed in stat_ellipse()",
# chol() error): within each cluster every member has the same
# ComputerAbility value, so the covariance matrix is singular. A convex hull
# outlines the clusters without needing a covariance estimate.
fviz_cluster(KM3, data = KMz[1:2], ellipse.type = "convex")

# 4 clusters
# Same seed as the other k-means fits, set immediately before the call.
set.seed(1234)
KM4 <- kmeans(KMz, centers = 4, iter.max = 10, nstart = 1)
KM4
## K-means clustering with 4 clusters of sizes 3, 5, 6, 1
##
## Cluster means:
## ComputerAbility PChour
## 1 1.33532190 -0.9812764
## 2 -1.16840666 1.1985055
## 3 0.08345762 -0.4756569
## 4 1.33532190 -0.1947571
##
## Clustering vector:
## [1] 3 2 3 2 1 3 4 3 1 2 3 1 2 3 2
##
## Within cluster sum of squares by cluster:
## [1] 0.1767464 1.3533726 1.0036672 0.0000000
## (between_SS / total_SS = 91.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
KM4$size
## [1] 3 5 6 1
KM4$centers
## ComputerAbility PChour
## 1 1.33532190 -0.9812764
## 2 -1.16840666 1.1985055
## 3 0.08345762 -0.4756569
## 4 1.33532190 -0.1947571
KM4_c<-cbind(KMz,group4=KM4$cluster)
str(KM4_c)
## 'data.frame': 15 obs. of 3 variables:
## $ ComputerAbility: num 0.0835 -1.1684 0.0835 -1.1684 1.3353 ...
## $ PChour : num -0.644 0.479 -0.307 1.378 -0.869 ...
## $ group4 : int 3 2 3 2 1 3 4 3 1 2 ...
KM4_c$group4<-as.factor(KM4_c$group4)
str(KM4_c)
## 'data.frame': 15 obs. of 3 variables:
## $ ComputerAbility: num 0.0835 -1.1684 0.0835 -1.1684 1.3353 ...
## $ PChour : num -0.644 0.479 -0.307 1.378 -0.869 ...
## $ group4 : Factor w/ 4 levels "1","2","3","4": 3 2 3 2 1 3 4 3 1 2 ...
#leven test
leveneTest(ComputerAbility~group4,data=KM4_c)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 3 NaN NaN
## 11
leveneTest(PChour~group4,data=KM4_c)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 3 0.6335 0.6087
## 11
#ANOVA
fit_c4<-aov(ComputerAbility~group4,data=KM4_c)
summary(fit_c4)
## Df Sum Sq Mean Sq F value Pr(>F)
## group4 3 14 4.667 6.813e+31 <2e-16 ***
## Residuals 11 0 0.000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
fit_p4<-aov(PChour~group4,data=KM4_c)
summary(fit_p4)
## Df Sum Sq Mean Sq F value Pr(>F)
## group4 3 11.466 3.822 16.59 0.000215 ***
## Residuals 11 2.534 0.230
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post hoc: Scheffe pairwise comparisons between the four k-means clusters,
# for each variable. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
sch_c4 <- scheffe.test(fit_c4, "group4", group = FALSE)
## Warning in anova.lm(y): ANOVA F-tests on an essentially perfect fit are
## unreliable
sch_c4$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 2.503729 0 *** 2.503729e+00 2.503729e+00
## 1 - 3 1.251864 0 *** 1.251864e+00 1.251864e+00
## 1 - 4 0.000000 1 -9.913831e-16 9.913831e-16
## 2 - 3 -1.251864 0 *** -1.251864e+00 -1.251864e+00
## 2 - 4 -2.503729 0 *** -2.503729e+00 -2.503729e+00
## 3 - 4 -1.251864 0 *** -1.251864e+00 -1.251864e+00
sch_p4 <- scheffe.test(fit_p4, "group4", group = FALSE)
sch_p4$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 -2.1797818 0.0006 *** -3.3296302 -1.0299335
## 1 - 3 -0.5056195 0.5501 -1.6189554 0.6077164
## 1 - 4 -0.7865192 0.5872 -2.6045891 1.0315507
## 2 - 3 1.6741623 0.0012 ** 0.7207584 2.6275662
## 2 - 4 1.3932626 0.1294 -0.3315099 3.1180352
## 3 - 4 -0.2808997 0.9596 -1.9815484 1.4197490
# Scatter plot of the 4-cluster solution.
# ellipse.type = "norm" failed here ("Too few points to calculate an
# ellipse", then a chol() error in stat_ellipse()): one cluster has a single
# member and the others have no spread in ComputerAbility. A convex hull
# outlines the clusters without needing a covariance estimate.
fviz_cluster(KM4, data = KMz[1:2], ellipse.type = "convex")

# Number of clusters: evaluate k = 2..5 for k-means with every NbClust index.
library(NbClust)
all <- NbClust(
  data = KMz, diss = NULL, distance = "euclidean",
  min.nc = 2, max.nc = 5, method = "kmeans",
  index = "all", alphaBeale = 0.05
)
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 7 proposed 2 as the best number of clusters
## * 12 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
all$Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## Number_clusters 5.0000 3.0000 3.0000 3.0000 3.0000 3.0000 3.0000 3.0000
## Value_Index 4.7851 50.0421 15.1071 0.8324 529.3342 45.1004 15.1357 3.5264
## Friedman Rubin Cindex DB Silhouette Duda PseudoT2
## Number_clusters 3.000000e+00 3.0000 4.0000 5.0000 2.0000 2.0000 NA
## Value_Index 7.881299e+15 -3.6232 0.3699 0.3962 0.6345 0.2918 NA
## Beale Ratkowsky Ball PtBiserial Frey McClain Dunn Hubert
## Number_clusters 2.000 2.0000 3.0000 2.0000 3.0000 2.0000 3.000 0
## Value_Index 2.157 0.6125 2.4948 0.7951 2.7667 0.3354 0.799 0
## SDindex Dindex SDbw
## Number_clusters 2.0000 0 5.000
## Value_Index 1.5384 0 0.051
# Question (f): profile the 3-cluster k-means solution on the other variables.
# The membership column of KM3_c is named `group3`; `KM3_c$group` only worked
# through partial matching of `$`, so the full name is used here.
datap <- cbind(data, group = KM3_c$group3)
# Gender: cross-tabulate gender against cluster membership.
datap$Male <- as.factor(datap$Male)
datap$group <- as.factor(datap$group)
library(plyr)
indtable <- table(datap$Male, datap$group)
indtable
##
## 1 2 3
## 0 3 0 4
## 1 1 5 2
chisq.test(indtable)
## Warning in chisq.test(indtable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: indtable
## X-squared = 6.6295, df = 2, p-value = 0.03634
#residence
str(datap)
## 'data.frame': 15 obs. of 11 variables:
## $ Name : chr "Adam" "Bart" "Chris" "Dennis" ...
## $ Male : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 1 2 1 2 ...
## $ Residency : chr "Taichung" "Taipei" "Taichung" "Tainan" ...
## $ Height : int 152 164 173 168 175 168 165 169 156 166 ...
## $ Weight : int 43 60 83 69 66 55 47 72 58 56 ...
## $ ComputerAbility: int 2 1 2 1 3 2 3 2 3 1 ...
## $ PChour : int 12 22 15 30 10 12 16 8 11 26 ...
## $ InitialScore : int 64 80 70 90 50 70 56 72 42 78 ...
## $ MidtermScore : int 70 86 66 86 62 78 70 78 62 82 ...
## $ FinalScore : int 76 92 82 96 66 62 74 64 60 68 ...
## $ group : Factor w/ 3 levels "1","2","3": 3 2 3 2 1 3 1 3 1 2 ...
datap$Residency<-as.factor(datap$Residency)
indtabler<-table(datap$Residency,datap$group)
indtabler
##
## 1 2 3
## Taichung 2 0 3
## Tainan 2 1 2
## Taipei 0 4 1
chisq.test(indtabler)
## Warning in chisq.test(indtabler): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: indtabler
## X-squared = 8.2, df = 4, p-value = 0.08452
# Weight
library(psych)
# Overall descriptives. describeBy() was being called without a grouping
# variable, which only raised "no grouping variable requested" before
# printing the ungrouped table; describe() gives that table directly.
# (Pass `group = datap$group` to describeBy() instead if per-cluster
# descriptives are wanted.)
describe(datap$Weight)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15 59.47 12.48 58 58.92 16.31 43 83 40 0.22 -1.29 3.22
library(car)
leveneTest(Weight~group,data=datap)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 0.8119 0.467
## 12
aovw<-aov(Weight~group,data=datap)
summary(aovw)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 353.1 176.6 1.16 0.346
## Residuals 12 1826.6 152.2
# Height
# Overall descriptives. describeBy() was being called without a grouping
# variable, which only raised "no grouping variable requested" before
# printing the ungrouped table; describe() gives that table directly.
describe(datap$Height)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15 164.93 7.13 166 165.15 8.9 152 175 23 -0.28 -1.29 1.84
leveneTest(Height~group,data=datap)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 2.2983 0.1429
## 12
aovw<-aov(Height~group,data=datap)
summary(aovw)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 90.7 45.37 0.878 0.441
## Residuals 12 620.2 51.68
# Midterm exam score
# Overall descriptives. describeBy() was being called without a grouping
# variable, which only raised "no grouping variable requested" before
# printing the ungrouped table; describe() gives that table directly.
describe(datap$MidtermScore)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 15 76.8 9.25 78 76.92 11.86 62 90 28 -0.24 -1.42 2.39
leveneTest(MidtermScore~group,data=datap)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 2.3411 0.1385
## 12
aovm<-aov(MidtermScore~group,data=datap)
summary(aovm)
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 782.1 391.0 11.27 0.00176 **
## Residuals 12 416.3 34.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post hoc: Scheffe pairwise comparisons of midterm score between the three
# k-means clusters. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
library(agricolae)
sch_m <- scheffe.test(aovm, "group", group = FALSE)
sch_m$comparison
## Difference pvalue sig LCL UCL
## 1 - 2 -18.500000 0.0020 ** -29.514459 -7.485541
## 1 - 3 -7.833333 0.1625 -18.432001 2.765334
## 2 - 3 10.666667 0.0354 * 0.724235 20.609098
library(rstatix)
##
## Attaching package: 'rstatix'
## The following objects are masked from 'package:plyr':
##
## desc, mutate
## The following object is masked from 'package:stats':
##
## filter
eta_squared(aovm)
## group
## 0.6525923