# 使用 rpart 做出分類結果
library(rpart)
# Load the built-in iris data and fit a tree that predicts Species from the
# four sepal/petal measurements.
data("iris")
fit <- rpart(
  formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = iris
)
# Print the CP table, variable importance, and per-node split details.
summary(object = fit)
## Call:
## rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length +
## Petal.Width, data = iris)
## n= 150
##
## CP nsplit rel error xerror xstd
## 1 0.50 0 1.00 1.17 0.05073460
## 2 0.44 1 0.50 0.70 0.06110101
## 3 0.01 2 0.06 0.08 0.02751969
##
## Variable importance
## Petal.Width Petal.Length Sepal.Length Sepal.Width
## 34 31 21 14
##
## Node number 1: 150 observations, complexity param=0.5
## predicted class=setosa expected loss=0.6666667 P(node) =1
## class counts: 50 50 50
## probabilities: 0.333 0.333 0.333
## left son=2 (50 obs) right son=3 (100 obs)
## Primary splits:
## Petal.Length < 2.45 to the left, improve=50.00000, (0 missing)
## Petal.Width < 0.8 to the left, improve=50.00000, (0 missing)
## Sepal.Length < 5.45 to the left, improve=34.16405, (0 missing)
## Sepal.Width < 3.35 to the right, improve=19.03851, (0 missing)
## Surrogate splits:
## Petal.Width < 0.8 to the left, agree=1.000, adj=1.00, (0 split)
## Sepal.Length < 5.45 to the left, agree=0.920, adj=0.76, (0 split)
## Sepal.Width < 3.35 to the right, agree=0.833, adj=0.50, (0 split)
##
## Node number 2: 50 observations
## predicted class=setosa expected loss=0 P(node) =0.3333333
## class counts: 50 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 3: 100 observations, complexity param=0.44
## predicted class=versicolor expected loss=0.5 P(node) =0.6666667
## class counts: 0 50 50
## probabilities: 0.000 0.500 0.500
## left son=6 (54 obs) right son=7 (46 obs)
## Primary splits:
## Petal.Width < 1.75 to the left, improve=38.969400, (0 missing)
## Petal.Length < 4.75 to the left, improve=37.353540, (0 missing)
## Sepal.Length < 6.15 to the left, improve=10.686870, (0 missing)
## Sepal.Width < 2.45 to the left, improve= 3.555556, (0 missing)
## Surrogate splits:
## Petal.Length < 4.75 to the left, agree=0.91, adj=0.804, (0 split)
## Sepal.Length < 6.15 to the left, agree=0.73, adj=0.413, (0 split)
## Sepal.Width < 2.95 to the left, agree=0.67, adj=0.283, (0 split)
##
## Node number 6: 54 observations
## predicted class=versicolor expected loss=0.09259259 P(node) =0.36
## class counts: 0 49 5
## probabilities: 0.000 0.907 0.093
##
## Node number 7: 46 observations
## predicted class=virginica expected loss=0.02173913 P(node) =0.3066667
## class counts: 0 1 45
## probabilities: 0.000 0.022 0.978
# Draw the fitted tree (the margin presumably leaves room for the labels),
# then add the text labels on top of the drawn tree.
plot(x = fit, margin = 0.1)
text(x = fit)

# 將分類結果顯示在圖上
# Scatter petal length against petal width, coloured by species.
plot(
  x = iris$Petal.Length,
  y = iris$Petal.Width,
  col = iris$Species
)
# Overlay the two thresholds reported by the tree summary:
# Petal.Width < 1.75 (horizontal) and Petal.Length < 2.45 (vertical).
abline(h = 1.75, col = "blue")
abline(v = 2.45, col = "red")

# 觀看分類結果
# Cross-tabulate predicted class (rows) against actual species (columns)
# for the same data the tree was fitted on.
table(
  predict(fit, newdata = iris[, -5], type = "class"),
  iris$Species
)
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
# 使用 caret 套件找出準確率
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Build the predicted-vs-actual table on the full iris data and let caret
# report accuracy, kappa, and the per-class statistics.
cm <- table(
  predict(fit, newdata = iris[, -5], type = "class"),
  iris$Species
)
confusionMatrix(data = cm)
## Confusion Matrix and Statistics
##
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.9852)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9800 0.9000
## Specificity 1.0000 0.9500 0.9900
## Pos Pred Value 1.0000 0.9074 0.9783
## Neg Pred Value 1.0000 0.9896 0.9519
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3267 0.3000
## Detection Prevalence 0.3333 0.3600 0.3067
## Balanced Accuracy 1.0000 0.9650 0.9450
# 將資料分為訓練與測試資料集
# Fix the RNG seed so the random partition is reproducible.
set.seed(123)
# Draw a label per row: 1 (probability 0.7) marks a training row,
# 2 (probability 0.3) marks a test row. Group sizes are therefore random,
# not an exact 70/30 cut.
idx <- sample.int(
  n = 2L,
  size = nrow(iris),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
trainset <- iris[idx == 1L, ]
testset <- iris[idx == 2L, ]
dim(trainset)
## [1] 106 5
dim(testset)
## [1] 44 5
# 使用訓練資料集建立模型
# Fit a second tree on the training rows only, using every other column
# as a predictor of Species, then draw and label it.
fit2 <- rpart(
  formula = Species ~ .,
  data = trainset
)
plot(x = fit2, margin = 0.1)
text(x = fit2)

# 套用在測試資料集測試模型
# Predict classes for the test rows (fifth column, Species, dropped from
# the predictors) and compare them with the actual species.
pred <- predict(
  fit2,
  newdata = testset[, -5],
  type = "class"
)
cm <- table(pred, testset$Species)
confusionMatrix(data = cm)
## Confusion Matrix and Statistics
##
##
## pred setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 10 1
## virginica 0 4 14
##
## Overall Statistics
##
## Accuracy : 0.8864
## 95% CI : (0.7544, 0.9621)
## No Information Rate : 0.3409
## P-Value [Acc > NIR] : 8.552e-14
##
## Kappa : 0.8291
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.7143 0.9333
## Specificity 1.0000 0.9667 0.8621
## Pos Pred Value 1.0000 0.9091 0.7778
## Neg Pred Value 1.0000 0.8788 0.9615
## Prevalence 0.3409 0.3182 0.3409
## Detection Rate 0.3409 0.2273 0.3182
## Detection Prevalence 0.3409 0.2500 0.4091
## Balanced Accuracy 1.0000 0.8405 0.8977