# Terms
#
# root node
# decision node
# branch
# leaf node ( = terminal node )
#
# entropy , information gain
# cost matrix / rule learner
# Cases where performance may suffer :
# - when the data has a large number of nominal features with many levels, or a large number of numeric features,
# - the result is a very large number of decisions and an overly complex tree => higher risk of overfitting
# Recursive Partitioning ( Split )
# - Decision trees are built using a heuristic called recursive partitioning
# - the data is split into subsets recursively until the subsets are sufficiently homogeneous, or another stopping criterion has been met
#
# Stopping Criterion
# • All (or nearly all) of the examples at the node have the same class
# • There are no remaining features to distinguish among the examples
# • The tree has grown to a predefined size limit ( avoid overfitting )
# ref : http://www.rulequest.com/
#
# The C5.0 algorithm has become the industry standard for producing decision trees.
# Although it can easily overfit or underfit and is vulnerable to bias in the training data,
# it tends to give a good return on effort, and ensembles (e.g. boosting) mitigate these weaknesses.
# Ultimately, the goal of a decision tree is to find the best split.
#
# Purity
# - one criterion for judging the best split : a measure of how homogeneous the resulting subsets are
# - measure used by C5.0
# - entropy
# - quantifies the randomness ( or disorder )
# - is measured in bits ( two classes : 0 to 1 , n classes : 0 to log2(n) )
# - at each candidate split, the algorithm measures the change in homogeneity as the change in entropy
# - InformationGain(F) = Entropy(S1) - Entropy(S2)
# - Entropy(S2) after the split = sum of ( weight * partition's entropy ) over all partitions; the best split minimizes this total entropy ( maximizes information gain )
# - other purity measures : Gini index, Chi-squared statistic, Gain Ratio
#
# ex> two classes , 60% & 40%
-0.60 * log2(0.60) - 0.40 * log2(0.40)
[1] 0.9709506
curve(-x * log2(x) - (1-x)*log2(1-x), col = "red", ylab = "entropy", lwd = 4)
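# The information gain formula above can be checked numerically; a minimal sketch with made-up
# split proportions (two partitions holding 50% of the rows each), plus the Gini index of the same node for comparison:
entropy <- function(p) -sum(p[p > 0] * log2(p[p > 0]))                     # entropy of a vector of class proportions
e_before <- entropy(c(0.60, 0.40))                                         # Entropy(S1), the 0.97 bits computed above
e_after  <- 0.5 * entropy(c(0.90, 0.10)) + 0.5 * entropy(c(0.30, 0.70))    # weighted total entropy after the split
e_before - e_after                                                         # InformationGain(F) = Entropy(S1) - Entropy(S2)
1 - sum(c(0.60, 0.40)^2)                                                   # Gini index of the 60/40 node = 0.48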
# Pruning ( cutting back branches / merging nodes )
# - full tree : a tree whose leaf nodes are 100% pure ( entropy = 0 ) => increased risk of overfitting
# - pruning : merge terminal nodes back to an appropriate level
#
# Approaches
# - early stopping ( pre-pruning ) vs. post-pruning => post-pruning is more common ( grow the full tree first, then merge ); see the sketch below
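# C5.0 exposes both styles through C5.0Control(); a minimal sketch with illustrative (not recommended)
# values, to be passed as control = ctrl in a C5.0() call like the one below:
library(C50)
ctrl <- C5.0Control(minCases = 10,           # pre-pruning : require at least 10 samples before keeping a split
                    CF = 0.10,               # post-pruning : a lower confidence factor prunes more aggressively
                    noGlobalPruning = FALSE) # keep the final global pruning pass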
library(C50)
credit <- read.csv("/Users/CA/Machine-Learning-with-R-datasets/credit.csv")
credit$default <- factor(credit$default)
fit.c50 <- C5.0(x = subset(credit, select = -c(default)),
                y = credit$default,
                # trials : an integer specifying the number of boosting iterations.
                # A value of one indicates that a single model is used. C5.0 uses adaptive boosting.
                trials = 10,
                control = C5.0Control())
print(fit.c50)
Call:
C5.0.default(x = subset(credit, select = -c(default)), y = credit$default, trials = 10, control = C5.0Control())
Classification Tree
Number of samples: 1000
Number of predictors: 20
Number of boosting iterations: 10
Average tree size: 63.9
Non-standard options: attempt to group attributes
# Setting trials = 10 produced 10 boosted trees,
# and the boost section of summary() shows the boosting results across the 10 models, where the error rate drops substantially.
fit.c50$boostResults
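# To check that the boosting gain is not just memorization, a simple hold-out split can be used;
# a sketch (90/10 split, seed and sizes chosen arbitrarily):
set.seed(123)
idx <- sample(nrow(credit), 900)
fit.holdout <- C5.0(x = subset(credit[idx, ], select = -c(default)),
                    y = credit$default[idx], trials = 10)
pred.holdout <- predict(fit.holdout, newdata = subset(credit[-idx, ], select = -c(default)))
table(predicted = pred.holdout, actual = credit$default[-idx])   # hold-out confusion matrix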
# Cost Matrix : used when different kinds of misclassification should be penalized differently
# ex> the cost of flagging a legitimate mail as spam (costA) is handled separately from the cost of passing a spam mail as normal (costB) ( costA >> costB )
library(C50)
credit <- read.csv("/Users/CA/Machine-Learning-with-R-datasets/credit.csv")
credit <- transform(credit, default = ifelse(default == 1, "no", "yes"))
credit$default <- factor(credit$default)
# create cost matrix
matrix_dimensions <- list(c("no", "yes"), c("no", "yes"))
names(matrix_dimensions) <- c("predicted", "actual")
error_cost <- matrix(c(0, 1, 4, 0), nrow = 2, dimnames = matrix_dimensions)
print(error_cost)
actual
predicted no yes
no 0 4
yes 1 0
fit.c50 <- C5.0(x = subset(credit, select = -c(default)),
                y = credit$default,
                # trials : an integer specifying the number of boosting iterations.
                # A value of one indicates that a single model is used. C5.0 uses adaptive boosting.
                trials = 10,
                costs = error_cost,
                control = C5.0Control())
# In summary(), costs are now reported alongside errors, and boosting lowers both the error rate and the cost (a quick confusion-matrix check follows below).
# summary(fit.c50)
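# The cost matrix shifts the errors rather than removing them : predicting "no" for an actual "yes"
# (cost 4) should become rarer, at the price of more false alarms. A quick resubstitution check
# (training data only, so optimistic):
pred.cost <- predict(fit.c50, newdata = subset(credit, select = -c(default)))
table(predicted = pred.cost, actual = credit$default)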
# antecedent ( premise / condition )
# consequent ( conclusion )
# [antecedent] height > 185, hometown : Seoul, fitness grade > 2 => [consequent] Blue House military police
#
# Covering Algorithms
# - start from the full set of training examples and an empty rule set,
# - build a rule that covers (satisfies) as many training examples as possible, add it to the rule set, remove the covered examples,
# - and repeat on the remaining examples to build up the rule set
#
# - The 1R / RIPPER Algorithm
# -- 1R : evaluate single-feature rules such as "if F1 == A then B" and "if F2 == C then B" separately, and keep the one with the lowest error rate ( only one feature is used in the rule set )
# -- RIPPER : rules may combine several features
#
# 1R / RIPPER ( both available via the RWeka package )
mr <- read.csv("/Users/CA/Machine-Learning-with-R-datasets/mushrooms.csv")
library(RWeka)
mr.1R <- OneR(type ~ ., data = mr)
print(mr.1R) # selected feature : odor ( number of rules : 1 )
odor:
a -> e
c -> p
f -> p
l -> e
m -> p
n -> e
p -> p
s -> p
y -> p
(8004/8124 instances correct)
summary(mr.1R) # Accuracy is high at about 98.5%, but 120 poisonous mushrooms are classified as edible
=== Summary ===
Correctly Classified Instances 8004 98.5229 %
Incorrectly Classified Instances 120 1.4771 %
Kappa statistic 0.9704
Mean absolute error 0.0148
Root mean squared error 0.1215
Relative absolute error 2.958 %
Root relative squared error 24.323 %
Total Number of Instances 8124
=== Confusion Matrix ===
a b <-- classified as
4208 0 | a = e
120 3796 | b = p
mr.JRip <- JRip(type ~ ., data = mr)
print(mr.JRip) # selected features : odor, gill_size, .... ( number of rules : 9 )
JRIP rules:
===========
(odor = f) => type=p (2160.0/0.0)
(gill_size = n) and (gill_color = b) => type=p (1152.0/0.0)
(gill_size = n) and (odor = p) => type=p (256.0/0.0)
(odor = c) => type=p (192.0/0.0)
(spore_print_color = r) => type=p (72.0/0.0)
(stalk_surface_below_ring = y) and (stalk_surface_above_ring = k) => type=p (68.0/0.0)
(habitat = l) and (cap_color = w) => type=p (8.0/0.0)
(stalk_color_above_ring = y) => type=p (8.0/0.0)
=> type=e (4208.0/0.0)
Number of Rules : 9
summary(mr.JRip) # 100% accuracy on the training data
=== Summary ===
Correctly Classified Instances 8124 100 %
Incorrectly Classified Instances 0 0 %
Kappa statistic 1
Mean absolute error 0
Root mean squared error 0
Relative absolute error 0 %
Root relative squared error 0 %
Total Number of Instances 8124
=== Confusion Matrix ===
a b <-- classified as
4208 0 | a = e
0 3916 | b = p
# useful packages
#
# rpart
# party
# + randomForest
# rpart package
# modelLookup() shows that cp is the parameter available for tuning.
library(rpart)
modelLookup("rpart")
# grow
fit.rpart <- rpart(Kyphosis ~ Age + Number + Start, method = "class", data = kyphosis)
printcp(fit.rpart)
Classification tree:
rpart(formula = Kyphosis ~ Age + Number + Start, data = kyphosis,
method = "class")
Variables actually used in tree construction:
[1] Age Start
Root node error: 17/81 = 0.20988
n= 81
CP nsplit rel error xerror xstd
1 0.176471 0 1.00000 1.00000 0.21559
2 0.019608 1 0.82353 0.94118 0.21078
3 0.010000 4 0.76471 0.94118 0.21078
#plotcp(fit.rpart)
#summary(fit.rpart)
plot(fit.rpart, uniform = T); text(fit.rpart, use.n = T, all = T, cex = .5)  # base-graphics plot of the tree
post(fit.rpart)  # writes a PostScript presentation plot of the tree
# prune
head(fit.rpart$cptable)
CP nsplit rel error xerror xstd
1 0.17647059 0 1.0000000 1.0000000 0.2155872
2 0.01960784 1 0.8235294 0.9411765 0.2107780
3 0.01000000 4 0.7647059 0.9411765 0.2107780
fit.rpart.prune <- prune(fit.rpart, cp = fit.rpart$cptable[which.min(fit.rpart$cptable[, "xerror"]), "CP"])
plot(fit.rpart.prune, uniform = T); text(fit.rpart.prune, use.n = T, all = T, cex = .5)
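# For a more readable tree than base plot()/text(), the rpart.plot package can be used
# (assuming it is installed; extra = 104 shows per-class probabilities plus the share of observations):
library(rpart.plot)
rpart.plot(fit.rpart.prune, type = 2, extra = 104)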
library(party)
fit <- ctree(Kyphosis ~ Age + Number + Start, data=kyphosis)
fit2 <- ctree(Mileage~Price + Country + Reliability + Type, data=na.omit(cu.summary))
par(mfrow=c(1,2))
plot(fit, main="Conditional Inference Tree for Kyphosis")
plot(fit2)
library(randomForest)
fit <- randomForest(Kyphosis ~ Age + Number + Start, data=kyphosis)
print(fit)
Call:
randomForest(formula = Kyphosis ~ Age + Number + Start, data = kyphosis)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 1
OOB estimate of error rate: 18.52%
Confusion matrix:
absent present class.error
absent 61 3 0.0468750
present 12 5 0.7058824
importance(fit)
MeanDecreaseGini
Age 8.773154
Number 5.516590
Start 10.340227
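# The same importance scores can be drawn directly with varImpPlot() from randomForest:
varImpPlot(fit, main = "Kyphosis ~ Age + Number + Start")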
# Repeat what was done with the rpart package, this time via caret's train().
library(caret)
modelLookup("rpart")
fit.caret <- train(Kyphosis ~ Age + Number + Start, data=kyphosis, method = "rpart")
fit.caret
CART
81 samples
3 predictor
2 classes: 'absent', 'present'
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 81, 81, 81, 81, 81, 81, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000000 0.7726512 0.3023266
0.01960784 0.7726512 0.3023266
0.17647059 0.7686655 0.2646701
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.01960784.
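# The default resampling (25 bootstrap reps) and the small cp grid can both be changed;
# a sketch using 10-fold cross-validation and a longer cp grid (values chosen only for illustration):
fit.caret.cv <- train(Kyphosis ~ Age + Number + Start, data = kyphosis,
                      method = "rpart",
                      trControl = trainControl(method = "cv", number = 10),
                      tuneLength = 10)   # evaluate 10 candidate cp values
fit.caret.cv$bestTune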
# Run ctree via caret's train() as well.
library(caret)
modelLookup("ctree")
fit.ctree <- train(Kyphosis ~ Age + Number + Start, data=kyphosis, method = "ctree", tuneGrid = expand.grid(mincriterion = 0.95))
fit.ctree
Conditional Inference Tree
81 samples
3 predictor
2 classes: 'absent', 'present'
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 81, 81, 81, 81, 81, 81, ...
Resampling results:
Accuracy Kappa
0.7594273 0.2196265
Tuning parameter 'mincriterion' was held constant at a value of 0.95
varImp(fit.ctree)
ROC curve variable importance
Importance
Start 100.00
Number 61.89
Age 0.00
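# Predictions from the tuned caret model go through predict() on the train object
# (resubstitution on kyphosis here, purely to show the call):
pred.ctree <- predict(fit.ctree, newdata = kyphosis)
table(predicted = pred.ctree, actual = kyphosis$Kyphosis)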
# References
#
# Machine Learning with R, Chapter 5
# https://rpubs.com/chengjiun/52658