20160528Tutorial

使用 rpart 做出分類結果

library(rpart)
# Load the built-in iris data set into the workspace.
data(iris)

# Grow a classification tree that predicts the species from the four
# flower measurements, leaving every rpart setting at its default.
fit <- rpart(
  formula = Species ~ Sepal.Length + Sepal.Width +
    Petal.Length + Petal.Width,
  data = iris
)

# Print the complexity table, variable importance and per-node details.
summary(fit)

## Call:
## rpart(formula = Species ~ Sepal.Length + Sepal.Width + Petal.Length + 
##     Petal.Width, data = iris)
##   n= 150 
## 
##     CP nsplit rel error xerror       xstd
## 1 0.50      0      1.00   1.17 0.05073460
## 2 0.44      1      0.50   0.70 0.06110101
## 3 0.01      2      0.06   0.08 0.02751969
## 
## Variable importance
##  Petal.Width Petal.Length Sepal.Length  Sepal.Width 
##           34           31           21           14 
## 
## Node number 1: 150 observations,    complexity param=0.5
##   predicted class=setosa      expected loss=0.6666667  P(node) =1
##     class counts:    50    50    50
##    probabilities: 0.333 0.333 0.333 
##   left son=2 (50 obs) right son=3 (100 obs)
##   Primary splits:
##       Petal.Length < 2.45 to the left,  improve=50.00000, (0 missing)
##       Petal.Width  < 0.8  to the left,  improve=50.00000, (0 missing)
##       Sepal.Length < 5.45 to the left,  improve=34.16405, (0 missing)
##       Sepal.Width  < 3.35 to the right, improve=19.03851, (0 missing)
##   Surrogate splits:
##       Petal.Width  < 0.8  to the left,  agree=1.000, adj=1.00, (0 split)
##       Sepal.Length < 5.45 to the left,  agree=0.920, adj=0.76, (0 split)
##       Sepal.Width  < 3.35 to the right, agree=0.833, adj=0.50, (0 split)
## 
## Node number 2: 50 observations
##   predicted class=setosa      expected loss=0  P(node) =0.3333333
##     class counts:    50     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 3: 100 observations,    complexity param=0.44
##   predicted class=versicolor  expected loss=0.5  P(node) =0.6666667
##     class counts:     0    50    50
##    probabilities: 0.000 0.500 0.500 
##   left son=6 (54 obs) right son=7 (46 obs)
##   Primary splits:
##       Petal.Width  < 1.75 to the left,  improve=38.969400, (0 missing)
##       Petal.Length < 4.75 to the left,  improve=37.353540, (0 missing)
##       Sepal.Length < 6.15 to the left,  improve=10.686870, (0 missing)
##       Sepal.Width  < 2.45 to the left,  improve= 3.555556, (0 missing)
##   Surrogate splits:
##       Petal.Length < 4.75 to the left,  agree=0.91, adj=0.804, (0 split)
##       Sepal.Length < 6.15 to the left,  agree=0.73, adj=0.413, (0 split)
##       Sepal.Width  < 2.95 to the left,  agree=0.67, adj=0.283, (0 split)
## 
## Node number 6: 54 observations
##   predicted class=versicolor  expected loss=0.09259259  P(node) =0.36
##     class counts:     0    49     5
##    probabilities: 0.000 0.907 0.093 
## 
## Node number 7: 46 observations
##   predicted class=virginica   expected loss=0.02173913  P(node) =0.3066667
##     class counts:     0     1    45
##    probabilities: 0.000 0.022 0.978

# Draw the tree skeleton, leaving extra white space around the border so
# the labels added next have room.
plot(x = fit, margin = 0.1)
# Write the split rules and leaf labels onto the tree drawn above.
text(x = fit)

將分類結果顯示在圖上（紅線為 Petal.Length = 2.45、藍線為 Petal.Width = 1.75，即決策樹的兩個分割點）

# Scatter plot of the two petal measurements, coloured by species.
plot(
  x = iris$Petal.Length,
  y = iris$Petal.Width,
  col = iris$Species
)
# Overlay the two thresholds the fitted tree splits on:
# Petal.Width at 1.75 (horizontal, blue) ...
abline(h = 1.75, col = "blue")
# ... and Petal.Length at 2.45 (vertical, red).
abline(v = 2.45, col = "red")

觀看分類結果

# Cross-tabulate the tree's predicted class (rows) against the true
# species (columns) for the iris rows.
table(
  predict(fit, newdata = iris[, 1:4], type = "class"),
  iris$Species
)

##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45

使用 caret 套件找出準確率

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

# Build the predicted-versus-actual table for the iris rows and keep it.
cm <- table(
  predict(fit, newdata = iris[, 1:4], type = "class"),
  iris$Species
)
# Report accuracy, kappa and the per-class statistics for that table.
confusionMatrix(data = cm)

## Confusion Matrix and Statistics
## 
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.3333         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9800           0.9000
## Specificity                 1.0000            0.9500           0.9900
## Pos Pred Value              1.0000            0.9074           0.9783
## Neg Pred Value              1.0000            0.9896           0.9519
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3600           0.3067
## Balanced Accuracy           1.0000            0.9650           0.9450

將資料分為訓練與測試資料集（隨機抽樣，約 70% 為訓練資料、30% 為測試資料）

# Fix the RNG seed so the random split below is repeatable.
set.seed(123)
# Draw one label per iris row: 1 with probability 0.7, 2 with probability 0.3.
idx <- sample.int(
  n = 2,
  size = nrow(iris),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
# Rows labelled 1 become the training set, rows labelled 2 the test set.
trainset <- iris[idx == 1, ]
testset <- iris[idx == 2, ]
# Number of rows and columns in the training set.
dim(trainset)

## [1] 106   5

# Number of rows and columns in the test set.
dim(testset)

## [1] 44  5

使用訓練資料集建立模型

# Fit a second tree on the training rows only; `.` uses every column
# other than Species as a predictor.
fit2 <- rpart(formula = Species ~ ., data = trainset)
# Draw the tree with extra border space, then add its labels.
plot(x = fit2, margin = 0.1)
text(x = fit2)

套用在測試資料集測試模型

# Predict a class for each test row; column 5 (the species label) is
# dropped from the data handed to the model.
pred <- predict(fit2, newdata = testset[, -5], type = "class")
# Tabulate predictions (rows) against the true species (columns).
cm <- table(pred, testset$Species)
# Report accuracy, kappa and the per-class statistics on the test set.
confusionMatrix(data = cm)

## Confusion Matrix and Statistics
## 
##             
## pred         setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         10         1
##   virginica       0          4        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8864          
##                  95% CI : (0.7544, 0.9621)
##     No Information Rate : 0.3409          
##     P-Value [Acc > NIR] : 8.552e-14       
##                                           
##                   Kappa : 0.8291          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.7143           0.9333
## Specificity                 1.0000            0.9667           0.8621
## Pos Pred Value              1.0000            0.9091           0.7778
## Neg Pred Value              1.0000            0.8788           0.9615
## Prevalence                  0.3409            0.3182           0.3409
## Detection Rate              0.3409            0.2273           0.3182
## Detection Prevalence        0.3409            0.2500           0.4091
## Balanced Accuracy           1.0000            0.8405           0.8977

20160528Tutorial

David Chiu

2016年5月28日

使用rpart 做出分類結果

將分類結果顯示在圖上

觀看分類結果

使用caret 套件找出準確率

將資料分為訓練與測試資料集

使用訓練資料集建立模型

套用在測試資料集測試模型