Display the first few rows of the dataset:
library(rpart)
head(kyphosis)
## Kyphosis Age Number Start
## 1 absent 71 3 5
## 2 absent 158 3 14
## 3 present 128 4 5
## 4 absent 2 5 1
## 5 absent 1 4 15
## 6 absent 1 2 16
Use help(kyphosis) to view the documentation for this dataset; the gist is as follows:
The data come from children who underwent corrective spinal surgery: 4 columns, 81 rows (81 cases). The four variables are:
Kyphosis: a factor (absent/present) indicating whether a kyphosis (a type of spinal deformation) was present after the operation;
Age: the child's age in months;
Number: the number of vertebrae involved in the operation;
Start: the number of the first (topmost) vertebra operated on.
Build the decision-tree model and plot it:
# Fit the classification tree
kyphosis.rp = rpart(Kyphosis ~ ., data = kyphosis, method = "class")
# Inspect the fitted model
summary(kyphosis.rp)
## Call:
## rpart(formula = Kyphosis ~ ., data = kyphosis, method = "class")
## n= 81
##
## CP nsplit rel error xerror xstd
## 1 0.17647 0 1.0000 1.0000 0.2156
## 2 0.01961 1 0.8235 0.9412 0.2108
## 3 0.01000 4 0.7647 1.0000 0.2156
##
## Variable importance
## Start Age Number
## 64 24 12
##
## Node number 1: 81 observations, complexity param=0.1765
## predicted class=absent expected loss=0.2099 P(node) =1
## class counts: 64 17
## probabilities: 0.790 0.210
## left son=2 (62 obs) right son=3 (19 obs)
## Primary splits:
## Start < 8.5 to the right, improve=6.762, (0 missing)
## Number < 5.5 to the left, improve=2.867, (0 missing)
## Age < 39.5 to the left, improve=2.250, (0 missing)
## Surrogate splits:
## Number < 6.5 to the left, agree=0.802, adj=0.158, (0 split)
##
## Node number 2: 62 observations, complexity param=0.01961
## predicted class=absent expected loss=0.09677 P(node) =0.7654
## class counts: 56 6
## probabilities: 0.903 0.097
## left son=4 (29 obs) right son=5 (33 obs)
## Primary splits:
## Start < 14.5 to the right, improve=1.0210, (0 missing)
## Age < 55 to the left, improve=0.6849, (0 missing)
## Number < 4.5 to the left, improve=0.2975, (0 missing)
## Surrogate splits:
## Number < 3.5 to the left, agree=0.645, adj=0.241, (0 split)
## Age < 16 to the left, agree=0.597, adj=0.138, (0 split)
##
## Node number 3: 19 observations
## predicted class=present expected loss=0.4211 P(node) =0.2346
## class counts: 8 11
## probabilities: 0.421 0.579
##
## Node number 4: 29 observations
## predicted class=absent expected loss=0 P(node) =0.358
## class counts: 29 0
## probabilities: 1.000 0.000
##
## Node number 5: 33 observations, complexity param=0.01961
## predicted class=absent expected loss=0.1818 P(node) =0.4074
## class counts: 27 6
## probabilities: 0.818 0.182
## left son=10 (12 obs) right son=11 (21 obs)
## Primary splits:
## Age < 55 to the left, improve=1.2470, (0 missing)
## Start < 12.5 to the right, improve=0.2888, (0 missing)
## Number < 3.5 to the right, improve=0.1753, (0 missing)
## Surrogate splits:
## Start < 9.5 to the left, agree=0.758, adj=0.333, (0 split)
## Number < 5.5 to the right, agree=0.697, adj=0.167, (0 split)
##
## Node number 10: 12 observations
## predicted class=absent expected loss=0 P(node) =0.1481
## class counts: 12 0
## probabilities: 1.000 0.000
##
## Node number 11: 21 observations, complexity param=0.01961
## predicted class=absent expected loss=0.2857 P(node) =0.2593
## class counts: 15 6
## probabilities: 0.714 0.286
## left son=22 (14 obs) right son=23 (7 obs)
## Primary splits:
## Age < 111 to the right, improve=1.71400, (0 missing)
## Start < 12.5 to the right, improve=0.79370, (0 missing)
## Number < 4.5 to the left, improve=0.07143, (0 missing)
##
## Node number 22: 14 observations
## predicted class=absent expected loss=0.1429 P(node) =0.1728
## class counts: 12 2
## probabilities: 0.857 0.143
##
## Node number 23: 7 observations
## predicted class=present expected loss=0.4286 P(node) =0.08642
## class counts: 3 4
## probabilities: 0.429 0.571
# Plot the decision tree
plot(kyphosis.rp, uniform = TRUE, branch = 0, margin = 0.1, main = "Classification Tree\nKyphosis by Age, Number and Start")
text(kyphosis.rp, fancy = TRUE, col = "blue")
The model above was built with the "class" method. Fitting instead with the "anova" or "poisson" method yields exactly the same tree branches; the only difference is that the latter two display the classification result as the numeric codes of the factor.
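This claim can be checked by refitting the tree on the numerically coded response and comparing the split structures. A minimal sketch (kyph.num and kyphosis.an are names introduced here for illustration, not from the original analysis):
# Recode the factor as 1/2 and refit with method = "anova"
kyph.num <- transform(kyphosis, Kyphosis = as.numeric(Kyphosis))
kyphosis.an <- rpart(Kyphosis ~ ., data = kyph.num, method = "anova")
# The var/n columns of $frame describe each tree's split structure
kyphosis.rp$frame[, c("var", "n")]
kyphosis.an$frame[, c("var", "n")]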
First, copy and paste the data on page 498 of Xue Yi's book (excluding the header row) into a Notepad file, "8-3raw.txt". The Chinese province names show up as mojibake and the line breaks are irregular, but regular expressions can deal with that.
# Read the lines with readLines(), then strsplit() each line, using one or more whitespace characters as the separator
(lst <- strsplit(readLines(paste(getwd(), "/8-3raw.txt", sep = "")), split = "\\s+"))
## [[1]]
## [1] "ke" "9.30" "30.55" "8.70" "ba" "0.85" "26.55" "16.15"
## [9] "Mc" "4.67" "29.38" "8.92" "dk" "1.57" "23.16" "15.79"
## [17] "bk" "0.96" "24.69" "15.21" "da" "1.14" "22.57" "12.10"
## [25] "\021d" "1.38" "29.24" "11.30" "?g" "1.34" "23.04" "10.45"
## [33] "8fe" "1.48" "25.47" "15.39" "?d" "0.79" "19.14" "10.61"
##
## [[2]]
## [1] "gl" "2.60" "32.32" "8.81" "\025a" "1.24" "22.53" "13.97"
## [9] "fK" "2.15" "26.31" "10.49" "h" "0.96" "21.65" "16.24"
## [17] "ghh" "2.14" "28.46" "10.87" "if" "0.78" "14.65" "24.27"
## [25] "9\025" "6.53" "31.59" "11.04" "ga" "0.81" "13.85" "25.44"
## [33] "h"
##
## [[3]]
## [1] "" "1.47" "26.43" "17.23" "dl" "0.57" "3.85" "44.43"
##
## [[4]]
## [1] "gh" "1.17" "23.74" "17.46" "md" "1.67" "24.36" "17.62"
##
## [[5]]
## [1] "9j" "0.88" "19.97" "24.43" "ie" "1.10" "16.85" "27.93"
##
## [[6]]
## [1] "j" "1.23" "16.87" "15.63" "`\025" "1.49" "17.76" "27.70"
## [9] "hd" "0.99" "18.84" "16.22" "lf" "1.61" "20.27" "22.06"
## [17] "\021g" "0.98" "25.18" "16.87" "Ik" "1.85" "20.66" "12.75"
# The result is a list; flatten it into a vector
# The plain c() function achieves the same: c(lst, recursive = TRUE)
(vec <- unlist(lst))
## [1] "ke" "9.30" "30.55" "8.70" "ba" "0.85" "26.55" "16.15"
## [9] "Mc" "4.67" "29.38" "8.92" "dk" "1.57" "23.16" "15.79"
## [17] "bk" "0.96" "24.69" "15.21" "da" "1.14" "22.57" "12.10"
## [25] "\021d" "1.38" "29.24" "11.30" "?g" "1.34" "23.04" "10.45"
## [33] "8fe" "1.48" "25.47" "15.39" "?d" "0.79" "19.14" "10.61"
## [41] "gl" "2.60" "32.32" "8.81" "\025a" "1.24" "22.53" "13.97"
## [49] "fK" "2.15" "26.31" "10.49" "h" "0.96" "21.65" "16.24"
## [57] "ghh" "2.14" "28.46" "10.87" "if" "0.78" "14.65" "24.27"
## [65] "9\025" "6.53" "31.59" "11.04" "ga" "0.81" "13.85" "25.44"
## [73] "h" "" "1.47" "26.43" "17.23" "dl" "0.57" "3.85"
## [81] "44.43" "gh" "1.17" "23.74" "17.46" "md" "1.67" "24.36"
## [89] "17.62" "9j" "0.88" "19.97" "24.43" "ie" "1.10" "16.85"
## [97] "27.93" "j" "1.23" "16.87" "15.63" "`\025" "1.49" "17.76"
## [105] "27.70" "hd" "0.99" "18.84" "16.22" "lf" "1.61" "20.27"
## [113] "22.06" "\021g" "0.98" "25.18" "16.87" "Ik" "1.85" "20.66"
## [121] "12.75"
# On inspection, the string at position 74 is not one of the fields we need
# The following function shows a string's ASCII codes -- but here the result is empty!
asc <- function(x) {
    strtoi(charToRaw(x), 16L)  # the bytes of x as integer codes
}
asc(vec[74])
## integer(0)
is.na(vec[74])
## [1] FALSE
length(vec[74])
## [1] 1
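# In fact vec[74] is simply the empty string "" (visible at position 74
# in the printed vector above), which is why charToRaw() returns an empty
# raw vector here -- a quick confirmation:
identical(vec[74], "")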
# Either way, it is not needed -- drop it
vec <- vec[-74]
# Reshape the vector into the matrix we need. The book's table prints two
# provinces per line (name + 3 numbers, twice = 8 fields), so build a
# 15 x 8 character matrix and stack its left and right halves
mat <- matrix(vec, ncol = 8, byrow = T)
mat <- rbind(mat[, 2:4], mat[, 6:8])
# Convert the character matrix to a numeric matrix
mat <- matrix(as.numeric(mat), ncol = ncol(mat))
# Read the names of the 30 provinces/municipalities/autonomous regions from the
# pre-typed text file "provinces.txt" and attach them, together with the
# variable names, as the dimnames of the matrix mat
rnames <- unlist(strsplit(readLines(paste(getwd(), "/provinces.txt", sep = "")),
split = " "))
cnames <- c("DXBZ", "CZBZ", "WMBZ")
dimnames(mat) <- list(rnames, cnames)
mat
## DXBZ CZBZ WMBZ
## 北京 9.30 30.55 8.70
## 天津 4.67 29.38 8.92
## 河北 0.96 24.69 15.21
## 山西 1.38 29.24 11.30
## 内蒙古 1.48 25.47 15.39
## 辽宁 2.60 32.32 8.81
## 吉林 2.15 26.31 10.49
## 黑龙江 2.14 28.46 10.87
## 上海 6.53 31.59 11.04
## 江苏 1.47 26.43 17.23
## 浙江 1.17 23.74 17.46
## 安徽 0.88 19.97 24.43
## 福建 1.23 16.87 15.63
## 江西 0.99 18.84 16.22
## 山东 0.98 25.18 16.87
## 河南 0.85 26.55 16.15
## 湖北 1.57 23.16 15.79
## 湖南 1.14 22.57 12.10
## 广东 1.34 23.04 10.45
## 广西 0.79 19.14 10.61
## 海南 1.24 22.53 13.97
## 四川 0.96 21.65 16.24
## 贵州 0.78 14.65 24.27
## 云南 0.81 13.85 25.44
## 西藏 0.57 3.85 44.43
## 陕西 1.67 24.36 17.62
## 甘肃 1.10 16.85 27.93
## 青海 1.49 17.76 27.70
## 宁夏 1.61 20.27 22.06
## 新疆 1.85 20.66 12.75
# Compute Euclidean distances on the standardized data
d <- dist(scale(mat))
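# Sanity check (a sketch, not from the book): scale() standardizes each
# column to zero mean and unit variance, so any entry of d can be
# reproduced by hand, e.g. for the first two provinces:
all.equal(as.matrix(d)[1, 2], sqrt(sum((scale(mat)[1, ] - scale(mat)[2, ])^2)))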
# Hierarchical clustering with four linkage methods
hc1 <- hclust(d, "complete")
hc2 <- hclust(d, "average")
hc3 <- hclust(d, "centroid")
hc4 <- hclust(d, "ward")
# Plot the dendrograms
opar <- par(mfrow = c(2, 2))
plot(hc1, hang = -1, main = "complete")
plot(hc2, hang = -1, main = "average")
plot(hc3, hang = -1, main = "centroid")
plot(hc4, hang = -1, main = "ward")
par(opar)
# An alternative approach from the book
dend1 <- as.dendrogram(hc1)
dend2 <- as.dendrogram(hc2)
dend3 <- as.dendrogram(hc3)
dend4 <- as.dendrogram(hc4)
# Using the custom code from page 501 of the electronic edition (page 484 in print)
# Node parameters: length-2 values give inner-node and leaf settings respectively
nP <- list(col = 3:2, cex = c(2, 0.75), pch = 21:22, bg = c("light blue", "pink"),
    lab.cex = 1, lab.col = "tomato")
addE <- function(n) {
    # For each internal node, box the edge label in plum and show the
    # number of members (leaves) below that node as the edge text
    if (!is.leaf(n)) {
        attr(n, "edgePar") <- list(p.col = "plum")
        attr(n, "edgetext") <- attr(n, "members")
    }
    n
}
opar <- par(mfrow = c(2, 2), mar = c(4, 3, 1, 2))
de1 <- dendrapply(dend1, addE)
plot(de1, nodePar = nP, main = "complete")
de2 <- dendrapply(dend2, addE)
plot(de2, nodePar = nP, main = "average")
de3 <- dendrapply(dend3, addE)
plot(de3, nodePar = nP, main = "centroid")
de4 <- dendrapply(dend4, addE)
plot(de4, nodePar = nP, main = "ward")
par(opar)
Divide all the samples into 4 clusters:
opar <- par(mfrow = c(2, 2), mar = c(4, 3, 1, 2))
plot(hc1, hang = -1, main = "complete")  # plot() replaces the old plclust(), which recent R no longer provides
re1 <- rect.hclust(hc1, k = 4, border = "red")
plot(hc2, hang = -1, main = "average")
re2 <- rect.hclust(hc2, k = 4, border = "red")
plot(hc3, hang = -1, main = "centroid")
re3 <- rect.hclust(hc3, k = 4, border = "red")
plot(hc4, hang = -1, main = "ward")
re4 <- rect.hclust(hc4, k = 4, border = "red")
par(opar)
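Note that rect.hclust() invisibly returns the members of each box (captured above in re1 to re4). The same grouping can be obtained without plotting via cutree(); a minimal sketch for the Ward tree:
grp <- cutree(hc4, k = 4)   # cluster label for each province
split(names(grp), grp)      # provinces in each of the 4 clusters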
The groupings produced by the four clustering methods are shown in the figures above and are not listed one by one. The overall trend is that complete linkage and Ward's method classify best for this problem, because under the centroid and average-linkage methods the cluster with medium education levels takes in too many provinces.
Compute a dynamic (partitioning) clustering with the K-means method; nstart = 20 runs the algorithm from 20 random starts and keeps the best solution:
km <- kmeans(scale(mat), 4, nstart = 20)
km
## K-means clustering with 4 clusters of sizes 3, 18, 1, 8
##
## Cluster means:
## DXBZ CZBZ WMBZ
## 1 2.6874 1.3004 -0.9593
## 2 -0.2304 0.3458 -0.3958
## 3 -0.6948 -3.1197 3.6139
## 4 -0.4025 -0.8758 0.7986
##
## Clustering vector:
## 北京 天津 河北 山西 内蒙古 辽宁 吉林 黑龙江 上海 江苏
## 1 1 2 2 2 2 2 2 1 2
## 浙江 安徽 福建 江西 山东 河南 湖北 湖南 广东 广西
## 2 4 4 4 2 2 2 2 2 2
## 海南 四川 贵州 云南 西藏 陕西 甘肃 青海 宁夏 新疆
## 2 2 4 4 3 2 4 4 4 2
##
## Within cluster sum of squares by cluster:
## [1] 3.290 8.531 0.000 3.938
## (between_SS / total_SS = 81.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size"
# Cluster membership
st <- sort(km$cluster)
split(st, st)
## $`1`
## 北京 天津 上海
## 1 1 1
##
## $`2`
## 河北 山西 内蒙古 辽宁 吉林 黑龙江 江苏 浙江 山东 河南
## 2 2 2 2 2 2 2 2 2 2
## 湖北 湖南 广东 广西 海南 四川 陕西 新疆
## 2 2 2 2 2 2 2 2
##
## $`3`
## 西藏
## 3
##
## $`4`
## 安徽 福建 江西 贵州 云南 甘肃 青海 宁夏
## 4 4 4 4 4 4 4 4
This result is close to that of Ward's method above.
Compute a dynamic clustering with the K-medoids (PAM, partitioning around medoids) method:
library(cluster)
kc <- pam(scale(mat), 4)
# Cluster membership
st <- sort(kc$clustering)
split(st, st)
## $`1`
## 北京 天津 上海
## 1 1 1
##
## $`2`
## 河北 内蒙古 江苏 浙江 福建 江西 山东 河南 湖北 湖南
## 2 2 2 2 2 2 2 2 2 2
## 广东 广西 海南 四川 陕西 新疆
## 2 2 2 2 2 2
##
## $`3`
## 山西 辽宁 吉林 黑龙江
## 3 3 3 3
##
## $`4`
## 安徽 贵州 云南 西藏 甘肃 青海 宁夏
## 4 4 4 4 4 4 4
This result combines the strengths of complete linkage and Ward's method: it separates out both the upper-middle and the lower-middle groups. For this problem, therefore, the K-medoids method is the most satisfactory one.
Perform a cluster analysis of the data on the 48 applicants (see Table 3.5 in Example 3.17 of Chapter 3), taking the correlation coefficients between variables as the similarity coefficients \( c_{ij} \) and defining the distance as \( d_{ij} = 1 - c_{ij} \).
Carry out the clustering with complete linkage, average linkage, the centroid method, and Ward's method, and draw the corresponding dendrograms.
Then divide all the variables into 5 classes and report the grouping produced by each method.
# Read the data and compute the distance matrix d = 1 - r
app <- read.table(paste(getwd(), "/applicant.data", sep = ""))
r <- cor(app)
d <- as.dist(1 - r)
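# Since correlations lie in [-1, 1], these distances lie in [0, 2];
# a quick check (a sketch, not from the book):
range(d)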
# 进行聚类分析
hc1 <- hclust(d, "complete")
hc2 <- hclust(d, "average")
hc3 <- hclust(d, "centroid")
hc4 <- hclust(d, "ward")
# Plot the dendrograms
opar <- par(mfrow = c(2, 2))
plot(hc1, hang = -1, main = "complete")
plot(hc2, hang = -1, main = "average")
plot(hc3, hang = -1, main = "centroid")
plot(hc4, hang = -1, main = "ward")
par(opar)
# The alternative approach from the book, reusing nP and addE defined above
dend1 <- as.dendrogram(hc1)
dend2 <- as.dendrogram(hc2)
dend3 <- as.dendrogram(hc3)
dend4 <- as.dendrogram(hc4)
opar <- par(mfrow = c(2, 2), mar = c(4, 3, 1, 2))
de1 <- dendrapply(dend1, addE)
plot(de1, nodePar = nP, main = "complete")
de2 <- dendrapply(dend2, addE)
plot(de2, nodePar = nP, main = "average")
de3 <- dendrapply(dend3, addE)
plot(de3, nodePar = nP, main = "centroid")
de4 <- dendrapply(dend4, addE)
plot(de4, nodePar = nP, main = "ward")
par(opar)
# Divide all the variables into 5 clusters
opar <- par(mfrow = c(2, 2), mar = c(4, 3, 1, 2))
plot(hc1, hang = -1, main = "complete")
re1 <- rect.hclust(hc1, k = 5, border = "red")
plot(hc2, hang = -1, main = "average")
re2 <- rect.hclust(hc2, k = 5, border = "red")
plot(hc3, hang = -1, main = "centroid")
re3 <- rect.hclust(hc3, k = 5, border = "red")
plot(hc4, hang = -1, main = "ward")
re4 <- rect.hclust(hc4, k = 5, border = "red")
par(opar)
The groupings produced by the four clustering methods are shown in the figures above and are not listed one by one. The overall trend is that, for this problem, every method except the centroid method performs reasonably well.
The class package implements the basic KNN algorithm, and the FNN package implements more KNN variants. For concrete usage, see the example code in their help pages.
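As a minimal sketch of class::knn (the iris data and the train/test split below are illustrative, not from the original text):
library(class)
set.seed(1)
idx <- sample(nrow(iris), 100)   # illustrative training indices
pred <- knn(train = iris[idx, 1:4], test = iris[-idx, 1:4],
    cl = iris$Species[idx], k = 3)
table(pred, iris$Species[-idx])  # confusion matrix on the held-out rows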
The fpc package implements the DBSCAN algorithm; again, see the example code in its help page for concrete usage.
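A minimal sketch of fpc::dbscan, run here on the standardized province data from above (the eps and MinPts values are illustrative guesses, not tuned):
library(fpc)
ds <- dbscan(scale(mat), eps = 1.2, MinPts = 3)  # illustrative parameters
ds$cluster  # cluster labels; 0 marks noise points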