####加载程序包
library(rpart)
#rpart包实现了分类和回归决策树,我们将调用其中的rpart()和predict()
#函数。
library(rpart.plot)
#rpart.plot包含各种决策树的可视化函数,我们将调用其中的prp()函数。
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## XXXX 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## 键入'rattle()'去轻摇、晃动、翻滚你的数据。
#rattle可实现数据挖掘和图形交互式可视化界面,我们将调用其中的
#fancyRpartPlot()函数实现决策树可视化。
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#我们将调用其中的管道函数。
library(ggplot2)
#我们将调用其中的ggplot()等函数。
library(sampling)
#sampling包含有各种抽样函数,这里我们将调用其中的strata()函数。
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
library(pdp)
# Fit three classification trees for Kyphosis on the kyphosis data set.
#
# View() opens a spreadsheet-style data viewer and needs an interactive
# session, so it is skipped when the script runs in batch mode (Rscript,
# R CMD BATCH, knitting).
if (interactive()) {
  View(kyphosis)
}

# rpart() cross-validates internally using randomly assigned folds, so the
# xerror / xstd columns of each CP table change from run to run unless the
# RNG is seeded. The console output recorded further down in this file came
# from an unseeded run, so those two columns may not match it exactly.
set.seed(2020)

# fit: default settings.
fit <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis)

# fit2: class priors of 0.65 / 0.35 (given in factor-level order) and
# information (entropy) splitting.
fit2 <- rpart(
  Kyphosis ~ Age + Number + Start,
  data = kyphosis,
  parms = list(prior = c(0.65, 0.35), split = "information")
)

# fit3: complexity parameter set to 0.05 instead of the default.
fit3 <- rpart(
  Kyphosis ~ Age + Number + Start,
  data = kyphosis,
  control = rpart.control(cp = 0.05)
)
# Draw the default tree and the prior-adjusted tree with base graphics, one
# plot per tree, and label the nodes (use.n = TRUE adds the observation
# counts to each label).
# Tip: on some devices the labels are clipped; calling
#   par(mfrow = c(1, 2), xpd = NA)
# beforehand places both trees side by side and lets text extend into the
# margins.
for (tree in list(fit, fit2)) {
  plot(tree)
  text(tree, use.n = TRUE)
}

# Print the node-by-node text listing of the default tree. print() is called
# explicitly because top-level auto-printing is suppressed when the script
# is run through source() with its default echo = FALSE.
print(fit)
## n= 81
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 81 17 absent (0.79012346 0.20987654)
## 2) Start>=8.5 62 6 absent (0.90322581 0.09677419)
## 4) Start>=14.5 29 0 absent (1.00000000 0.00000000) *
## 5) Start< 14.5 33 6 absent (0.81818182 0.18181818)
## 10) Age< 55 12 0 absent (1.00000000 0.00000000) *
## 11) Age>=55 21 6 absent (0.71428571 0.28571429)
## 22) Age>=111 14 2 absent (0.85714286 0.14285714) *
## 23) Age< 111 7 3 present (0.42857143 0.57142857) *
## 3) Start< 8.5 19 8 present (0.42105263 0.57894737) *
# Styled rendering of the default tree via rattle: layout type 2, three
# digits, and empty title / sub-title strings.
fancyRpartPlot(fit, type = 2, digits = 3, main = "", sub = "")

# Variable-importance chart limited to three features. `geom = "col"` (a bar
# chart) replaces the older `bar = TRUE` switch, which newer vip releases no
# longer accept as a plotting option.
# NOTE(review): confirm against the installed vip version.
# vip() returns a ggplot object, so it is printed explicitly to make sure the
# chart is drawn even when the script is run through source().
print(vip(fit, num_features = 3L, geom = "col"))

# Detailed report: CP table, variable importance, and for each node the
# primary and surrogate splits (summary() does its own printing).
summary(fit)
## Call:
## rpart(formula = Kyphosis ~ Age + Number + Start, data = kyphosis)
## n= 81
##
## CP nsplit rel error xerror xstd
## 1 0.17647059 0 1.0000000 1.000000 0.2155872
## 2 0.01960784 1 0.8235294 1.117647 0.2243268
## 3 0.01000000 4 0.7647059 1.117647 0.2243268
##
## Variable importance
## Start Age Number
## 64 24 12
##
## Node number 1: 81 observations, complexity param=0.1764706
## predicted class=absent expected loss=0.2098765 P(node) =1
## class counts: 64 17
## probabilities: 0.790 0.210
## left son=2 (62 obs) right son=3 (19 obs)
## Primary splits:
## Start < 8.5 to the right, improve=6.762330, (0 missing)
## Number < 5.5 to the left, improve=2.866795, (0 missing)
## Age < 39.5 to the left, improve=2.250212, (0 missing)
## Surrogate splits:
## Number < 6.5 to the left, agree=0.802, adj=0.158, (0 split)
##
## Node number 2: 62 observations, complexity param=0.01960784
## predicted class=absent expected loss=0.09677419 P(node) =0.7654321
## class counts: 56 6
## probabilities: 0.903 0.097
## left son=4 (29 obs) right son=5 (33 obs)
## Primary splits:
## Start < 14.5 to the right, improve=1.0205280, (0 missing)
## Age < 55 to the left, improve=0.6848635, (0 missing)
## Number < 4.5 to the left, improve=0.2975332, (0 missing)
## Surrogate splits:
## Number < 3.5 to the left, agree=0.645, adj=0.241, (0 split)
## Age < 16 to the left, agree=0.597, adj=0.138, (0 split)
##
## Node number 3: 19 observations
## predicted class=present expected loss=0.4210526 P(node) =0.2345679
## class counts: 8 11
## probabilities: 0.421 0.579
##
## Node number 4: 29 observations
## predicted class=absent expected loss=0 P(node) =0.3580247
## class counts: 29 0
## probabilities: 1.000 0.000
##
## Node number 5: 33 observations, complexity param=0.01960784
## predicted class=absent expected loss=0.1818182 P(node) =0.4074074
## class counts: 27 6
## probabilities: 0.818 0.182
## left son=10 (12 obs) right son=11 (21 obs)
## Primary splits:
## Age < 55 to the left, improve=1.2467530, (0 missing)
## Start < 12.5 to the right, improve=0.2887701, (0 missing)
## Number < 3.5 to the right, improve=0.1753247, (0 missing)
## Surrogate splits:
## Start < 9.5 to the left, agree=0.758, adj=0.333, (0 split)
## Number < 5.5 to the right, agree=0.697, adj=0.167, (0 split)
##
## Node number 10: 12 observations
## predicted class=absent expected loss=0 P(node) =0.1481481
## class counts: 12 0
## probabilities: 1.000 0.000
##
## Node number 11: 21 observations, complexity param=0.01960784
## predicted class=absent expected loss=0.2857143 P(node) =0.2592593
## class counts: 15 6
## probabilities: 0.714 0.286
## left son=22 (14 obs) right son=23 (7 obs)
## Primary splits:
## Age < 111 to the right, improve=1.71428600, (0 missing)
## Start < 12.5 to the right, improve=0.79365080, (0 missing)
## Number < 3.5 to the right, improve=0.07142857, (0 missing)
##
## Node number 22: 14 observations
## predicted class=absent expected loss=0.1428571 P(node) =0.1728395
## class counts: 12 2
## probabilities: 0.857 0.143
##
## Node number 23: 7 observations
## predicted class=present expected loss=0.4285714 P(node) =0.08641975
## class counts: 3 4
## probabilities: 0.429 0.571
# Class-membership probabilities from the default tree. No newdata is given,
# so predictions are for the rows the tree was fitted on: one row per
# observation, one column per class (absent, present).
# Printed explicitly so the table still appears under source().
print(predict(fit, type = "prob"))
## absent present
## 1 0.4210526 0.5789474
## 2 0.8571429 0.1428571
## 3 0.4210526 0.5789474
## 4 0.4210526 0.5789474
## 5 1.0000000 0.0000000
## 6 1.0000000 0.0000000
## 7 1.0000000 0.0000000
## 8 1.0000000 0.0000000
## 9 1.0000000 0.0000000
## 10 0.4285714 0.5714286
## 11 0.4285714 0.5714286
## 12 1.0000000 0.0000000
## 13 0.4210526 0.5789474
## 14 1.0000000 0.0000000
## 15 1.0000000 0.0000000
## 16 1.0000000 0.0000000
## 17 1.0000000 0.0000000
## 18 0.8571429 0.1428571
## 19 1.0000000 0.0000000
## 20 1.0000000 0.0000000
## 21 1.0000000 0.0000000
## 22 0.4210526 0.5789474
## 23 0.4285714 0.5714286
## 24 0.4210526 0.5789474
## 25 0.4210526 0.5789474
## 26 1.0000000 0.0000000
## 27 0.4210526 0.5789474
## 28 0.4285714 0.5714286
## 29 1.0000000 0.0000000
## 30 1.0000000 0.0000000
## 31 1.0000000 0.0000000
## 32 0.8571429 0.1428571
## 33 0.8571429 0.1428571
## 34 1.0000000 0.0000000
## 35 0.8571429 0.1428571
## 36 1.0000000 0.0000000
## 37 1.0000000 0.0000000
## 38 0.4210526 0.5789474
## 39 1.0000000 0.0000000
## 40 0.4285714 0.5714286
## 41 0.4210526 0.5789474
## 42 1.0000000 0.0000000
## 43 0.4210526 0.5789474
## 44 0.4210526 0.5789474
## 45 1.0000000 0.0000000
## 46 0.8571429 0.1428571
## 47 1.0000000 0.0000000
## 48 0.8571429 0.1428571
## 49 0.4210526 0.5789474
## 50 0.8571429 0.1428571
## 51 0.4285714 0.5714286
## 52 1.0000000 0.0000000
## 53 0.4210526 0.5789474
## 54 1.0000000 0.0000000
## 55 1.0000000 0.0000000
## 56 1.0000000 0.0000000
## 57 1.0000000 0.0000000
## 58 0.4210526 0.5789474
## 59 1.0000000 0.0000000
## 60 0.4285714 0.5714286
## 61 0.4210526 0.5789474
## 62 0.4210526 0.5789474
## 63 0.4210526 0.5789474
## 64 1.0000000 0.0000000
## 65 1.0000000 0.0000000
## 66 1.0000000 0.0000000
## 67 1.0000000 0.0000000
## 68 0.8571429 0.1428571
## 69 1.0000000 0.0000000
## 70 1.0000000 0.0000000
## 71 0.8571429 0.1428571
## 72 0.8571429 0.1428571
## 73 1.0000000 0.0000000
## 74 0.8571429 0.1428571
## 75 1.0000000 0.0000000
## 76 1.0000000 0.0000000
## 77 0.8571429 0.1428571
## 78 1.0000000 0.0000000
## 79 0.8571429 0.1428571
## 80 0.4210526 0.5789474
## 81 1.0000000 0.0000000
# Predicted class label for every training row (no newdata is supplied, so
# the predictions are for the data the tree was fitted on).
t_pred <- predict(fit, type = "class")

# Cross-validated predictions, one column per complexity value (the column
# headers are the cp values used). The folds are drawn at random, so the RNG
# is seeded to make the table reproducible between runs.
# NOTE(review): in the console output recorded below, the last column holds
# values such as 7.9e-67 rather than class codes 1 / 2, which looks like
# uninitialised results -- re-check with a current rpart release.
set.seed(2020)
print(xpred.rpart(fit))
## 0.58823529 0.05882353 0.01400280
## 1 1 2 7.862057e-67
## 2 1 1 2.052268e-289
## 3 1 1 4.277148e-314
## 4 1 2 2.803072e-309
## 5 1 1 1.379807e-309
## 6 1 1 1.249431e-310
## 7 1 1 1.195108e-310
## 8 1 1 3.259386e-311
## 9 1 1 1.792662e-310
## 10 1 1 1.575370e-310
## 11 1 1 6.518771e-311
## 12 1 1 7.605233e-311
## 13 1 2 5.975540e-311
## 14 1 1 2.172924e-311
## 15 1 1 1.379807e-309
## 16 1 1 6.835427e-304
## 17 1 1 1.390671e-309
## 18 1 1 1.254927e-321
## 19 1 1 2.781342e-309
## 20 1 1 8.224288e-317
## 21 1 1 4.778312e-299
## 22 1 2 3.030718e+180
## 23 1 1 8.224288e-317
## 24 1 2 8.344027e-309
## 25 1 2 2.173140e-312
## 26 1 1 5.389869e-312
## 27 1 2 5.389869e-312
## 28 1 1 5.562706e-309
## 29 1 1 2.781342e-309
## 30 1 1 8.804498e+199
## 31 1 1 1.379807e-309
## 32 1 1 1.385345e-309
## 33 1 1 4.674283e+180
## 34 1 1 8.814426e-280
## 35 1 2 -5.727786e+250
## 36 1 1 2.483463e-265
## 37 1 1 5.389869e-312
## 38 1 1 5.389869e-312
## 39 1 1 5.389869e-312
## 40 1 1 5.389869e-312
## 41 1 1 1.485397e-313
## 42 1 1 3.370629e+160
## 43 1 2 3.225804e-319
## 44 1 2 3.259386e-311
## 45 1 1 3.212612e-319
## 46 1 1 3.259386e-311
## 47 1 1 3.259386e-311
## 48 1 1 3.212612e-319
## 49 1 2 3.212612e-319
## 50 1 1 3.212612e-319
## 51 1 1 3.212612e-319
## 52 1 1 1.320051e-309
## 53 1 2 1.086462e-310
## 54 1 1 1.320051e-309
## 55 1 1 5.432309e-312
## 56 1 1 1.280580e+214
## 57 1 1 4.782145e+180
## 58 1 2 7.068615e-304
## 59 1 1 7.072175e-304
## 60 1 1 6.953356e-309
## 61 1 2 1.390671e-309
## 62 1 1 5.562685e-309
## 63 1 2 1.807873e-308
## 64 1 1 4.172013e-309
## 65 1 1 1.809502e-308
## 66 1 1 2.803072e-309
## 67 1 1 4.782145e+180
## 68 1 1 1.809502e-308
## 69 1 1 0.000000e+00
## 70 1 1 2.225074e-308
## 71 1 1 1.839967e+223
## 72 1 1 5.562685e-309
## 73 1 1 1.251604e-308
## 74 1 1 1.062714e-314
## 75 1 1 2.781342e-309
## 76 1 1 8.344027e-309
## 77 1 1 1.112537e-308
## 78 1 1 1.390671e-308
## 79 1 1 1.807873e-308
## 80 1 1 1.946940e-308
## 81 1 1 2.225074e-308
# Partial dependence of the tree's prediction on Start. It is computed once
# and reused for both the plot object and the printed table (the original
# evaluated the same partial() call twice).
# Note: for a classification model pdp reports yhat on a centred-logit scale
# for the first class unless `prob = TRUE` / `which.class` are given, which
# is why the values are not probabilities.
pd_start <- partial(fit, pred.var = "Start")

# ggplot version of the curve; stored in p1 but not drawn here.
p1 <- autoplot(pd_start)

# Table of grid values of Start and the corresponding yhat.
print(pd_start)
## Start yhat
## 1 1 -0.1592269
## 2 2 -0.1592269
## 3 3 -0.1592269
## 4 4 -0.1592269
## 5 5 -0.1592269
## 6 6 -0.1592269
## 7 7 -0.1592269
## 8 8 -0.1592269
## 9 10 6.7835065
## 10 11 6.7835065
## 11 12 6.7835065
## 12 13 6.7835065
## 13 14 6.7835065
## 14 15 18.0218267
## 15 16 18.0218267
## 16 18 18.0218267
# Accuracy: share of rows whose predicted class equals the observed one.
# t_pred was produced on the same rows the tree was trained on, so this is
# the training-set (resubstitution) accuracy, not an out-of-sample estimate.
# Printed explicitly so the value still appears under source().
print(mean(kyphosis$Kyphosis == t_pred))
## [1] 0.8395062