Classification

이 예제에서는 트리 모형을 중심으로 코드를 소개한다. 따라서 아래의 패키지를 먼저 설치해야 한다.

# party, rpart and randomForest provide the tree models used in this document;
# TH.data provides the bodyfat data set loaded later via
# data("bodyfat", package = "TH.data"), so it is listed explicitly instead of
# relying on it being pulled in as an indirect dependency.
install.packages(c("party", "rpart", "randomForest", "TH.data"))

설명도 역시 같은 순서로 진행된다.

party

Conditional Inference Tree를 이용한 분류

library(party)

str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Fix the RNG seed so the random train/test assignment is reproducible
# (the bodyfat example in this document seeds its split the same way).
# NOTE(review): the outputs printed in this section were presumably produced
# by an unseeded run, so the exact counts may differ from a fresh run.
set.seed(1234)

# Label every iris row 1 (train, probability 0.7) or 2 (test, probability 0.3).
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))

# Split into training and test sets according to the labels.
trainData <- iris[ind == 1, ]
testData <- iris[ind == 2, ]

# Model Species as a function of the four sepal/petal measurements.
myFormula <- Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width

# Fit a conditional inference tree on the training rows.
iris_ctree <- ctree(myFormula, data = trainData)

# Cross-tabulate the tree's predictions for the training data (rows)
# against the true training labels (columns).
table(predict(iris_ctree), trainData[["Species"]])
##             
##              setosa versicolor virginica
##   setosa         33          0         0
##   versicolor      0         38         4
##   virginica       0          1        30
print(iris_ctree)
## 
##   Conditional inference tree with 4 terminal nodes
## 
## Response:  Species 
## Inputs:  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width 
## Number of observations:  106 
## 
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 99.082
##   2)*  weights = 33 
## 1) Petal.Length > 1.9
##   3) Petal.Width <= 1.7; criterion = 1, statistic = 46.778
##     4) Petal.Length <= 4.6; criterion = 1, statistic = 15.626
##       5)*  weights = 32 
##     4) Petal.Length > 4.6
##       6)*  weights = 10 
##   3) Petal.Width > 1.7
##     7)*  weights = 31

Conditional Inference Tree

plot(iris_ctree)

plot of chunk unnamed-chunk-1

plot(iris_ctree,type="simple")

plot of chunk unnamed-chunk-1

테스트 데이터 예측

# Predict the species of the held-out test rows with the fitted tree.
testPred <- predict(iris_ctree, newdata = testData)

# Confusion table: predicted class (rows) vs. actual class (columns).
table(testPred, testData$Species)
##             
## testPred     setosa versicolor virginica
##   setosa         17          0         0
##   versicolor      0         11         1
##   virginica       0          0        15

rpart를 활용한 예측

전처리, EDA

data("bodyfat",package="TH.data")

dim(bodyfat)
## [1] 71 10
attributes(bodyfat)
## $names
##  [1] "age"          "DEXfat"       "waistcirc"    "hipcirc"     
##  [5] "elbowbreadth" "kneebreadth"  "anthro3a"     "anthro3b"    
##  [9] "anthro3c"     "anthro4"     
## 
## $row.names
##  [1] "47"  "48"  "49"  "50"  "51"  "52"  "53"  "54"  "55"  "56"  "57" 
## [12] "58"  "59"  "60"  "61"  "62"  "63"  "64"  "65"  "66"  "67"  "68" 
## [23] "69"  "70"  "71"  "72"  "73"  "74"  "75"  "76"  "77"  "78"  "79" 
## [34] "80"  "81"  "82"  "83"  "84"  "85"  "86"  "87"  "88"  "89"  "90" 
## [45] "91"  "92"  "93"  "94"  "95"  "96"  "97"  "98"  "99"  "100" "101"
## [56] "102" "103" "104" "105" "106" "107" "108" "109" "110" "111" "112"
## [67] "113" "114" "115" "116" "117"
## 
## $class
## [1] "data.frame"
bodyfat[1:5,]
##    age DEXfat waistcirc hipcirc elbowbreadth kneebreadth anthro3a anthro3b
## 47  57  41.68     100.0   112.0          7.1         9.4     4.42     4.95
## 48  65  43.29      99.5   116.5          6.5         8.9     4.63     5.01
## 49  59  35.41      96.0   108.5          6.2         8.9     4.12     4.74
## 50  58  22.79      72.0    96.5          6.1         9.2     4.03     4.48
## 51  60  36.42      89.5   100.5          7.1        10.0     4.24     4.68
##    anthro3c anthro4
## 47     4.50    6.13
## 48     4.48    6.37
## 49     4.60    5.82
## 50     3.91    5.66
## 51     4.15    5.91
# Seed the RNG so the random split below is reproducible.
set.seed(123)

# Label every bodyfat row 1 (train, probability 0.7) or 2 (test,
# probability 0.3), then split the data by label.
ind <- sample(2, nrow(bodyfat), replace = TRUE, prob = c(0.7, 0.3))
bodyfat.train <- bodyfat[ind == 1, ]
bodyfat.test <- bodyfat[ind == 2, ]

library(rpart)

# Model DEXfat as a function of age and four body measurements.
myFormula <- DEXfat ~ age + waistcirc + hipcirc + elbowbreadth + kneebreadth

# Fit the tree on the training rows; minsplit = 10 is passed through
# rpart.control().
bodyfat_rpart <- rpart(
  myFormula,
  data = bodyfat.train,
  control = rpart.control(minsplit = 10)
)

트리 플로팅

# Draw the tree structure, then add the text labels on top of it
# (use.n = TRUE asks text() to include observation counts in the labels).
plot(bodyfat_rpart)
text(bodyfat_rpart, use.n = TRUE)

plot of chunk unnamed-chunk-2

# Plot the cross-validation results stored in the rpart object against the
# complexity parameter (cp), as a visual aid for choosing a pruning value.
plotcp(bodyfat_rpart)

plot of chunk unnamed-chunk-3

교차검증 오차(xerror)를 최소화하는 최적 CP 값을 구한 뒤, 그 값으로 트리를 가지치기(prune)한다.

# Index of the cp-table row with the smallest cross-validated error ("xerror").
opt <- which.min(bodyfat_rpart[["cptable"]][, "xerror"])

# Complexity parameter recorded in that row.
cp <- bodyfat_rpart[["cptable"]][opt, "CP"]

# Prune the fitted tree back using the selected cp.
bodyfat_prune <- prune(bodyfat_rpart, cp = cp)

print(bodyfat_prune)
## n= 48 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 48 6297.93600 31.20146  
##    2) waistcirc< 88.4 28  686.87710 23.09857  
##      4) waistcirc< 71.5 6   77.11608 16.49833 *
##      5) waistcirc>=71.5 22  277.09710 24.89864  
##       10) hipcirc< 99.65 12   68.28282 22.87250 *
##       11) hipcirc>=99.65 10  100.43620 27.33000 *
##    3) waistcirc>=88.4 20 1198.92300 42.54550  
##      6) kneebreadth< 11.1 17  370.61110 40.09235  
##       12) hipcirc< 109.9 7   75.36960 35.72000 *
##       13) hipcirc>=109.9 10   67.74361 43.15300 *
##      7) kneebreadth>=11.1 3  146.28030 56.44667 *
# Predict DEXfat for the test rows with the pruned tree.
DEXfat_pred <- predict(bodyfat_prune, newdata = bodyfat.test)

# Shared axis limits: the range of DEXfat over the full bodyfat data.
xlim <- range(bodyfat[["DEXfat"]])

테스트 데이터의 실제 관측값(x축)과 예측값(y축)을 플로팅한다.

# Scatter predicted against observed DEXfat for the test rows, using the same
# limits on both axes so the two scales are directly comparable.
plot(
  DEXfat_pred ~ DEXfat,
  data = bodyfat.test,
  xlab = "Observed", ylab = "Predicted",
  xlim = xlim, ylim = xlim
)

# Reference line y = x (intercept 0, slope 1).
abline(a = 0, b = 1)

plot of chunk unnamed-chunk-4

randomForest 를 활용한 예측

전처리 - 학습셋과 테스트셋 분할

모델 생성

# Label every iris row 1 (train, probability 0.7) or 2 (test, probability 0.3),
# then split the data by label.
ind <- sample(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
trainData <- iris[ind == 1, ]
testData <- iris[ind == 2, ]

library(randomForest)

# Fit a 100-tree forest predicting Species from all other columns and keep
# the proximity measure (proximity = TRUE).
rf <- randomForest(
  Species ~ .,
  data = trainData,
  ntree = 100,
  proximity = TRUE
)

# Without newdata, predict() returns the out-of-bag predictions; tabulate
# them (rows) against the true training labels (columns).
table(predict(rf), trainData$Species)
##             
##              setosa versicolor virginica
##   setosa         40          0         0
##   versicolor      0         35         3
##   virginica       0          2        32
print(rf)
## 
## Call:
##  randomForest(formula = Species ~ ., data = trainData, ntree = 100,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.46%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         40          0         0  0.00000000
## versicolor      0         35         2  0.05405405
## virginica       0          3        32  0.08571429
attributes(rf)
## $names
##  [1] "call"            "type"            "predicted"      
##  [4] "err.rate"        "confusion"       "votes"          
##  [7] "oob.times"       "classes"         "importance"     
## [10] "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"         
## [16] "y"               "test"            "inbag"          
## [19] "terms"          
## 
## $class
## [1] "randomForest.formula" "randomForest"
(rf$err.rate)
##               OOB     setosa versicolor  virginica
##   [1,] 0.06666667 0.00000000 0.07142857 0.13333333
##   [2,] 0.10769231 0.11538462 0.09523810 0.11111111
##   [3,] 0.08536585 0.06250000 0.15384615 0.04166667
##   [4,] 0.09890110 0.02857143 0.20000000 0.07692308
##   [5,] 0.10000000 0.02631579 0.21212121 0.06896552
##   [6,] 0.08490566 0.00000000 0.17142857 0.09375000
##   [7,] 0.07407407 0.00000000 0.14285714 0.08823529
##   [8,] 0.06422018 0.00000000 0.14285714 0.05882353
##   [9,] 0.07272727 0.00000000 0.13888889 0.08823529
##  [10,] 0.05454545 0.00000000 0.08333333 0.08823529
##  [11,] 0.05405405 0.00000000 0.08108108 0.08823529
##  [12,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [13,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [14,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [15,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [16,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [17,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [18,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [19,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [20,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [21,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [22,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [23,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [24,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [25,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [26,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [27,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [28,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [29,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [30,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [31,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [32,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [33,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [34,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [35,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [36,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [37,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [38,] 0.05357143 0.00000000 0.08108108 0.08571429
##  [39,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [40,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [41,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [42,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [43,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [44,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [45,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [46,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [47,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [48,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [49,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [50,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [51,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [52,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [53,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [54,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [55,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [56,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [57,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [58,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [59,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [60,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [61,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [62,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [63,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [64,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [65,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [66,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [67,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [68,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [69,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [70,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [71,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [72,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [73,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [74,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [75,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [76,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [77,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [78,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [79,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [80,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [81,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [82,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [83,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [84,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [85,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [86,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [87,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [88,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [89,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [90,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [91,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [92,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [93,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [94,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [95,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [96,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [97,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [98,] 0.04464286 0.00000000 0.05405405 0.08571429
##  [99,] 0.04464286 0.00000000 0.05405405 0.08571429
## [100,] 0.04464286 0.00000000 0.05405405 0.08571429

트리 개수에 따른 오차율 플로팅 및 변수 중요도 확인 - 변수 선택을 위함

plot(rf)

plot of chunk unnamed-chunk-6

importance(rf)
##              MeanDecreaseGini
## Sepal.Length         8.096503
## Sepal.Width          1.262377
## Petal.Length        28.761431
## Petal.Width         35.755582
varImpPlot(rf)

plot of chunk unnamed-chunk-6

  1. 예측값 출력
  2. Margin 플로팅

Margin = 정답 클래스에 투표한 트리의 비율 - 나머지 클래스 중 가장 많은 표를 받은 클래스에 투표한 트리의 비율 (양수이면 올바르게 분류된 것이다.)

# Classify the held-out test rows with the fitted forest.
irisPred <- predict(rf, newdata = testData)

# Confusion table: predicted species (rows) vs. actual species (columns).
table(irisPred, testData$Species)
##             
## irisPred     setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0         12         0
##   virginica       0          1        15
plot(margin(rf, trainData$Species))

plot of chunk unnamed-chunk-7

Reference