작성 중

# 데이터 분석은 아래와 같은 순서대로 중요하지 않을까 하는 2017년 현재 생각.
# 30% 도메인 이해
# 30% EDA + FE 
# 15% 데이터 정제 
# 10% 평가        
# 10% 모델링      
# 5%  앙상블       
#
# 이중 개별 모델링을 제외한 나머지를 알아야, 와꾸가 서는 것이고
# 개별 모델링 기법만 몇 개 아는 것은 ... 생각보다 덜 중요할 수도 있다는 생각.
# 특히 도메인 이해는 시간과 애정을 많이 쏟아야 하는 부분 같고, 올바른 평가에도 많은 시간을 써야 하는 것 같다. ( 이 부분은 보통 간과되기 쉽다. )

terms

# Bagging
# Boosting
# Stacked Generalization
# 
# XGBOOST
# LASAGNE NN
# ADABOOST ET
#
# Stacking
# Generalization Error
# meta features / out-of-fold predictions
# meta learning
#
# Average
# Less Variance
# Less Generalization Error
# Less chance of overfitting

meta learning

# 1. meta learning?
#  - automatic learning algorithms are applied on metadata about machine learning experiments 
#
# 2. variations of meta learning
#  * selection ( algorithm learning )
#  * hyper-parameter optimization
#  * ensemble ( bagging . boosting . stacked generalization )

ensemble

ensemble method > bagging ( = Bootstrap Aggregating )

# Bagging generates a number of training datasets by bootstrap sampling the original training data. 
# These datasets are then used to generate a set of models using a single learning algorithm. 
# The models' predictions are combined using voting (for classification) or averaging (for numeric prediction).
#
# 1. BootStrap : training dataset으로부터 크기가 같은 표본을 반복추출 (bootstrap sampling)
# 2. Generating Model : 각각에 대해 하나의 ML 알고리즘에 따라 모델 생성, 
# 3. Ensemble : 각 모델들의 결과를 종합하여 의사결정을 내리는 방법( voting / average )
#
# it can perform quite well as long as it is used with relatively unstable learners,
# that is, those generating models that tend to change substantially when the input data changes only slightly
#
# bagging is often used with decision trees, which have the tendency to vary dramatically given minor changes in the input data.
#
# variance 를 줄이는 효과.
library(ipred)
credit <- read.csv("/Users/CA/Machine-Learning-with-R-datasets/credit.csv")
set.seed(1234)
rf.bag <- bagging(default ~ ., data = credit, nbagg = 25)
credit$pred <- predict(rf.bag, credit)
library(InformationValue)
threshold <- optimalCutoff(credit$default, credit$pred)
confusionMatrix(credit$default, credit$pred, threshold = threshold)
library(caret)
Loading required package: lattice
Loading required package: ggplot2

Attaching package: 'ggplot2'

The following object is masked from 'package:randomForest':

    margin


Attaching package: 'caret'

The following objects are masked from 'package:InformationValue':

    confusionMatrix, precision, sensitivity, specificity
credit <- read.csv("/Users/CA/Machine-Learning-with-R-datasets/credit.csv")
credit <- transform(credit, default = ifelse(default == 1, "no", "yes"))
ctrl <- trainControl(method = "cv", number = 10)
train(default ~ ., data = credit, method = "treebag", trControl = ctrl)
Loading required package: plyr
Loading required package: e1071
Bagged CART 

1000 samples
  20 predictor
   2 classes: 'no', 'yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 900, 900, 900, 900, 900, 900, ... 
Resampling results:

  Accuracy  Kappa    
  0.752     0.3742893
# The caret package also includes example objects for bags of 
# naive Bayes models (nbBag), decision trees (ctreeBag), and neural networks (nnetBag).
str(svmBag)
List of 3
 $ fit      :function (x, y, ...)  
 $ pred     :function (object, x)  
 $ aggregate:function (x, type = "class")  
library(caret)
library(kernlab)
library(e1071)

credit <- read.csv("/Users/CA/Machine-Learning-with-R-datasets/credit.csv")
credit <- transform(credit, default = ifelse(default == 1, "no", "yes"))

svm.predict <- function (object, x)
{
 if (is.character(lev(object))) {
    out <- predict(object, as.matrix(x), type = "probabilities")
    colnames(out) <- lev(object)
    rownames(out) <- NULL
  }
  else out <- predict(object, as.matrix(x))[, 1]
  out
}

bagCtrl <- bagControl(fit = svmBag$fit, predict = svm.predict, aggregate = svmBag$aggregate)
trCtrl    <- trainControl(method = "cv", number = 10)

svmbag.result <- train(default ~., data = credit, method = "bag", trControl = trCtrl, bagControl = bagCtrl, verbose = F)
svmbag.result
Bagged Model 

1000 samples
  20 predictor
   2 classes: 'no', 'yes' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 900, 900, 900, 900, 900, 900, ... 
Resampling results:

  Accuracy  Kappa    
  0.764     0.4015154

Tuning parameter 'vars' was held constant at a value of 48

ensemble method > boosting

# it boosts the performance of weak learners to attain the performance of stronger learners
# boosting uses ensembles of models trained on resampled data and a vote to determine the final prediction
#
# First, the resampled datasets in boosting are constructed specifically to generate complementary learners.
# Second, rather than giving each learner an equal vote, boosting gives each learner's vote a weight based on its past performance.
# 
# variance 와 함께, bias 도 줄여가는 효과 (weak learner / strong learner 간 weight 조정)
library(randomForest)
library(data.table)
library(gbm)
library(ggplot2)
library(plyr)
library(dplyr)
library(rpart)
x <- seq(-2,2,by=0.01)
lenx<- length(x)
y <- 2 + 3*x^2 + rnorm(lenx, 0, 0.5)
y_r <- 2 + 3*x^2
x.y <- data.frame(x=x,y=y, y_r=y_r)
x.y.samp <- x.y %>% sample_frac(0.5)
x.y.samp.test <- x.y %>% sample_frac(0.1)
mdl_cart <- rpart(y ~ x, data=x.y.samp)
x.y.samp.test$cart_fit <- predict(mdl_cart, newdata=x.y.samp.test)
mdl_rf <- randomForest(y ~ x,data=x.y.samp)
x.y.samp.test$rf_fit <- predict(mdl_rf,newdata=x.y.samp.test)
ggplot(x.y.samp, aes(x,y_r)) + geom_line(size=1.5, colour='black') + geom_point(aes(y=y), size=1) + geom_line(data =  x.y.samp.test, aes(x=x, y=cart_fit), colour = "blue") + geom_line(data =  x.y.samp.test, aes(x=x, y=rf_fit), colour = "red")

shrink <- 0.1
#regression based boosting
y_n <- x.y.samp$y
x   <- x.y.samp$x
v_y_l <- list()
for(i in 1:100){
  lm_fit <- lm(y_n ~ x*I(0 < x))
  v_y <- shrink * predict(lm_fit)
  v_y_l[[i]] <- shrink * predict(lm_fit, newdata=x.y.samp.test)
  resid_n <-  y_n - v_y
  y_n <- resid_n
}
x.y.samp.test$lm_fit   <- apply(as.data.table(v_y_l),1,sum)
x.y.samp.test$lm_fit_3 <- apply(as.data.table(v_y_l)[,1:10,with=F],1,sum)
x.y.samp.test$lm_fit_2 <- apply(as.data.table(v_y_l)[,1:5,with=F],1,sum)
x.y.samp.test$lm_fit_1 <- apply(as.data.table(v_y_l)[,1:2,with=F],1,sum)
ggplot(x.y.samp, aes(x=x,y=y_r)) + geom_line(size=1.5, colour='red') + geom_point(aes(y=y), size=1) + geom_line(data=x.y.samp.test, aes(x=x,y=lm_fit), colour='purple', linetype=2, size=1) + geom_line(data=x.y.samp.test, aes(x=x,y=lm_fit_2),colour='purple',linetype=4) + geom_line(data=x.y.samp.test, aes(x=x,y=lm_fit_3),colour='purple',linetype=4) + geom_line(data=x.y.samp.test,aes(x=x,y=lm_fit_1),colour='purple',linetype=4)

#cart based boosting
y_n <- x.y.samp$y
x <- x.y.samp$x
v_y_l <- list()
for(i in 1:100){
  rpart_fit <- rpart(y_n ~ x)
  v_y <- shrink * predict(rpart_fit)
  v_y_l[[i]] <- shrink * predict(rpart_fit, newdata=x.y.samp.test)
  resid_n <-  y_n - v_y
  y_n <- resid_n
}
x.y.samp.test$rpart_fit <- apply(as.data.table(v_y_l),1,sum)
x.y.samp.test$rpart_fit_3 <- apply(as.data.table(v_y_l)[,1:10,with=F],1,sum)
x.y.samp.test$rpart_fit_2 <- apply(as.data.table(v_y_l)[,1:5,with=F],1,sum)
x.y.samp.test$rpart_fit_1 <- apply(as.data.table(v_y_l)[,1:2,with=F],1,sum)
ggplot(x.y.samp, aes(x=x,y=y_r)) + geom_line(size=1.5, colour='red') + geom_point(aes(y=y), size=1) + geom_line(data=x.y.samp.test, aes(x=x,y=rpart_fit), colour='purple', linetype=2, size=1) + geom_line(data=x.y.samp.test, aes(x=x,y=rpart_fit_2),colour='purple',linetype=4) + geom_line(data=x.y.samp.test, aes(x=x,y=rpart_fit_3),colour='purple',linetype=4) + geom_line(data=x.y.samp.test,aes(x=x,y=rpart_fit_1),colour='purple',linetype=4) 

ensemble method > boosting > adaptive boosting > AdaBoost
# AdaBoost.M1 algorithm : adabag package
library(adabag)
Loading required package: mlbench

Attaching package: 'adabag'

The following object is masked from 'package:ipred':

    bagging
data(iris)
iris.adaboost <- boosting(Species~., data = iris, boos = TRUE, mfinal = 10, coeflearn = "Breiman")
importanceplot(iris.adaboost)

iris.adaboost$weights
 [1] 1.4381928 1.6711499 1.2659370 0.8562498 0.8457676 1.4800669 0.7712694 0.9158454 1.1614991 1.3841249
head(iris.adaboost$votes)
        [,1] [,2] [,3]
[1,] 11.7901    0    0
[2,] 11.7901    0    0
[3,] 11.7901    0    0
[4,] 11.7901    0    0
[5,] 11.7901    0    0
[6,] 11.7901    0    0
data(iris)
iris.boostcv <- boosting.cv(Species ~ ., v=2, data=iris, mfinal=10, control=rpart.control(cp=0.01))
i:  1 Wed Aug  9 15:44:24 2017 
i:  2 Wed Aug  9 15:44:25 2017 
iris.boostcv[-1]
$confusion
               Observed Class
Predicted Class setosa versicolor virginica
     setosa         50          0         0
     versicolor      0         45         1
     virginica       0          5        49

$error
[1] 0.04
seeds <- vector(mode = "list", length = nrow(iris) + 1)
seeds <- lapply(seeds, function(x) 1:20)
grid <- expand.grid(mfinal = (1:3)*3, 
                    maxdepth = c(1, 3),
                    coeflearn = c("Breiman", "Freund", "Zhu"))
cctrl1 <- trainControl(method = "cv", number = 3, returnResamp = "all",
                       classProbs = TRUE, 
                       summaryFunction = multiClassSummary)
                       #, seeds = seeds)
test_class_cv_form <- train(Species ~ ., data = iris, 
                            method = "AdaBoost.M1",
                            tuneGrid = grid, 
                            trControl = cctrl1,
                            metric = "Accuracy", 
                            preProc = c("center", "scale"))
importanceplot(test_class_cv_form$finalModel)

test_class_cv_form$bestTune
iris$pred <- predict(test_class_cv_form, iris)
caret::confusionMatrix(iris$pred, iris$Species)
Confusion Matrix and Statistics

            Reference
Prediction   setosa versicolor virginica
  setosa         50          0         0
  versicolor      0         49         5
  virginica       0          1        45

Overall Statistics
                                         
               Accuracy : 0.96           
                 95% CI : (0.915, 0.9852)
    No Information Rate : 0.3333         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.94           
 Mcnemar's Test P-Value : NA             

Statistics by Class:

                     Class: setosa Class: versicolor Class: virginica
Sensitivity                 1.0000            0.9800           0.9000
Specificity                 1.0000            0.9500           0.9900
Pos Pred Value              1.0000            0.9074           0.9783
Neg Pred Value              1.0000            0.9896           0.9519
Prevalence                  0.3333            0.3333           0.3333
Detection Rate              0.3333            0.3267           0.3000
Detection Prevalence        0.3333            0.3600           0.3067
Balanced Accuracy           1.0000            0.9650           0.9450
ensemble method > bagging > random forest ( decision tree forest )
library(randomForest)
credit <- read.csv("/Users/CA/Machine-Learning-with-R-datasets/credit.csv")
credit <- transform(credit, default = ifelse(default == 1, 0, 1))
rf <- randomForest(default ~ ., data = credit, importance = T, proximity = T)
The response has five or fewer unique values.  Are you sure you want to do regression?
rf

Call:
 randomForest(formula = default ~ ., data = credit, importance = T,      proximity = T) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 6

          Mean of squared residuals: 0.160908
                    % Var explained: 23.38
round(importance(rf), 2)
                     %IncMSE IncNodePurity
checking_balance       40.51         23.75
months_loan_duration   22.28         17.54
credit_history         15.41         12.50
purpose                10.10         18.41
amount                 13.97         23.34
savings_balance        10.20         10.37
employment_length       6.62         11.91
installment_rate        4.47          5.95
personal_status         2.05          6.77
other_debtors          10.16          3.74
residence_history       4.99          5.69
property                6.42          9.13
age                    10.39         16.57
installment_plan        8.89          5.40
housing                 3.21          4.14
existing_credits        3.99          2.93
dependents              0.03          1.85
telephone               2.99          2.40
foreign_worker         -0.76          0.54
job                     5.24          5.62
credit$pred <- predict(rf, credit)
library(InformationValue)
threshold <- optimalCutoff(credit$default, credit$pred)
ModelMetrics::confusionMatrix(credit$default, credit$pred, threshold)
     [,1] [,2]
[1,]  700    0
[2,]    0  300
ensemble method > boosting > xgboost
# ref : http://xgboost.readthedocs.io/en/latest/model.html
# ref : https://medium.com/@peteryun/ml-kaggle%EC%97%90-%EC%A0%81%EC%9A%A9%ED%95%B4%EB%B3%B4%EB%8A%94-xgboost-f1650342ba93
library(xgboost)
data("agaricus.test"); data("agaricus.train")
train <- agaricus.train
test  <- agaricus.test
# fit 
model.xgboost <- xgboost(data = train$data, label = train$label,
                         max.depth = 2, eta = 1,
                         nround = 10, nthread = 2, 
                         objective = "binary:logistic")
[1] train-error:0.046522 
[2] train-error:0.022263 
[3] train-error:0.007063 
[4] train-error:0.015200 
[5] train-error:0.007063 
[6] train-error:0.001228 
[7] train-error:0.001228 
[8] train-error:0.001228 
[9] train-error:0.001228 
[10]    train-error:0.000000 
test$pred <- predict(model.xgboost, test$data)
library(InformationValue)
threshold <- optimalCutoff(test$data, test$pred)
misClassError(test$data, test$pred, threshold = threshold)
[1] 0.1762

GBM

# Decision Tree Based
# Boosted : Multiple weak models combined algorithmically
# Gradient Boosted : Iteratively solves residuals
# Stochastic

stacked generalization

references

---
title: "Ensemble"
output: html_notebook
---

작성 중 

```{r}
# 데이터 분석은 아래와 같은 순서대로 중요하지 않을까 하는 2017년 현재 생각.
# 30% 도메인 이해
# 30% EDA + FE 
# 15% 데이터 정제 
# 10% 평가        
# 10% 모델링      
# 5%  앙상블       
#
# 이중 개별 모델링을 제외한 나머지를 알아야, 와꾸가 서는 것이고
# 개별 모델링 기법만 몇 개 아는 것은 ... 생각보다 덜 중요할 수도 있다는 생각.
# 특히 도메인 이해는 시간과 애정을 많이 쏟아야 하는 부분 같고, 올바른 평가에도 많은 시간을 써야 하는 것 같다. ( 이 부분은 보통 간과되기 쉽다. )
```

#### terms 
```{r}
# Bagging
# Boosting
# Stacked Generalization
# 
# XGBOOST
# LASAGNE NN
# ADABOOST ET
#
# Stacking
# Generalization Error
# meta features / out-of-fold predictions
# meta learning
#
# Average
# Less Variance
# Less Generalization Error
# Less chance of overfitting
```

#### meta learning
```{r}
# 1. meta learning?
#  - automatic learning algorithms are applied on metadata about machine learning experiments 
#
# 2. variations of meta learning
#  * selection ( algorithm learning )
#  * hyper-parameter optimization
#  * ensemble ( bagging . boosting . stacked generalization )
```

#### ensemble
```{r}
```
![](/Users/CA/Downloads/ens1.png)

#### ensemble method > bagging ( = Bootstrap Aggregating )
```{r}
# Bagging generates a number of training datasets by bootstrap sampling the original training data. 
# These datasets are then used to generate a set of models using a single learning algorithm. 
# The models' predictions are combined using voting (for classification) or averaging (for numeric prediction).
#
# 1. BootStrap : training dataset으로부터 크기가 같은 표본을 반복추출 (bootstrap sampling)
# 2. Generating Model : 각각에 대해 하나의 ML 알고리즘에 따라 모델 생성, 
# 3. Ensemble : 각 모델들의 결과를 종합하여 의사결정을 내리는 방법( voting / average )
#
# it can perform quite well as long as it is used with relatively unstable learners,
# that is, those generating models that tend to change substantially when the input data changes only slightly
#
# bagging is often used with decision trees, which have the tendency to vary dramatically given minor changes in the input data.
#
# variance 를 줄이는 효과.
```

```{r}
library(ipred)

# Bagged trees (ipred) on the credit data, scored on the training rows.
credit <- read.csv("/Users/CA/Machine-Learning-with-R-datasets/credit.csv")

# InformationValue's helpers take the actuals as 0/1 flags, so recode the
# response the same way the random-forest chunk of this notebook does.
# NOTE(review): assumes `default` is coded 1/2 in credit.csv -- confirm.
credit <- transform(credit, default = ifelse(default == 1, 0, 1))

set.seed(1234)
# Namespace-qualified because adabag, attached later in this notebook, masks
# ipred's bagging().
rf.bag <- ipred::bagging(default ~ ., data = credit, nbagg = 25)

# Numeric scores for the same rows the model was trained on (in-sample).
credit$pred <- predict(rf.bag, credit)

library(InformationValue)
# Namespace-qualified because caret masks InformationValue's confusionMatrix()
# once it is attached.
threshold <- InformationValue::optimalCutoff(credit$default, credit$pred)
InformationValue::confusionMatrix(
  credit$default,
  credit$pred,
  threshold = threshold
)
```

```{r}
library(caret)
# Reload the credit data and label the outcome "no" / "yes".
credit <- read.csv("/Users/CA/Machine-Learning-with-R-datasets/credit.csv")
credit$default <- ifelse(credit$default == 1, "no", "yes")

# Bagged CART ("treebag") assessed with 10-fold cross-validation; the fitted
# object is printed because the call is left at top level.
ctrl <- trainControl(method = "cv", number = 10)
train(
  default ~ .,
  data = credit,
  method = "treebag",
  trControl = ctrl
)
```

```{r}
# Besides svmBag, caret ships example bagging specifications for naive Bayes
# (nbBag), decision trees (ctreeBag) and neural networks (nnetBag).
# Show the components that make up the SVM specification.
str(object = svmBag)
```

```{r}
library(caret)
library(kernlab)
library(e1071)

# Fresh copy of the credit data with the outcome labelled "no" / "yes".
credit <- read.csv("/Users/CA/Machine-Learning-with-R-datasets/credit.csv")
credit[["default"]] <- ifelse(credit[["default"]] == 1, "no", "yes")

# Prediction function handed to bagControl() for SVM base learners.
#
# object: a fitted model; lev(object) is consulted for its class levels
#         (presumably kernlab's lev() accessor -- kernlab is attached above).
# x:      predictor data, coerced with as.matrix() before predicting.
#
# Returns a matrix of class probabilities (columns named after the levels,
# row names removed) when the levels are character; otherwise the first
# column of the plain predict() result.
svm.predict <- function(object, x) {
  newdata <- as.matrix(x)
  class_levels <- lev(object)

  # No character levels: fall back to the plain prediction.
  if (!is.character(class_levels)) {
    return(predict(object, newdata)[, 1])
  }

  out <- predict(object, newdata, type = "probabilities")
  colnames(out) <- class_levels
  rownames(out) <- NULL
  out
}

# Bagging specification: caret's bundled SVM fit and aggregate functions,
# combined with the svm.predict wrapper defined in this chunk.
bagCtrl <- bagControl(
  fit = svmBag$fit,
  predict = svm.predict,
  aggregate = svmBag$aggregate
)
trCtrl <- trainControl(method = "cv", number = 10)

# Bagged SVM evaluated with 10-fold cross-validation.
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
svmbag.result <- train(
  default ~ .,
  data = credit,
  method = "bag",
  trControl = trCtrl,
  bagControl = bagCtrl,
  verbose = FALSE
)
```

```{r}
# Show the cross-validated summary of the bagged SVM.
print(svmbag.result)
```

#### ensemble method > boosting
```{r}
# it boosts the performance of weak learners to attain the performance of stronger learners
# boosting uses ensembles of models trained on resampled data and a vote to determine the final prediction
#
# First, the resampled datasets in boosting are constructed specifically to generate complementary learners.
# Second, rather than giving each learner an equal vote, boosting gives each learner's vote a weight based on its past performance.
# 
# variance 와 함께, bias 도 줄여가는 효과 (weak learner / strong learner 간 weight 조정)
```


```{r}
library(randomForest)
library(data.table)
library(gbm)
library(ggplot2)
library(plyr)
library(dplyr)
library(rpart)

# Seeded so the noise draw and both row samples are the same on every run,
# as the bagging chunk of this notebook already does for its model.
set.seed(1234)

# Noisy observations of the curve y = 2 + 3 * x^2 on [-2, 2].
x <- seq(-2, 2, by = 0.01)
lenx <- length(x)
y_r <- 2 + 3 * x^2
y <- y_r + rnorm(lenx, 0, 0.5)
x.y <- data.frame(x = x, y = y, y_r = y_r)

# NOTE(review): both samples are drawn from the full data, so the "test"
# rows can overlap the training rows; here they are only used to draw the
# fitted curves.
x.y.samp <- x.y %>% sample_frac(0.5)
x.y.samp.test <- x.y %>% sample_frac(0.1)

# Single regression tree.
mdl_cart <- rpart(y ~ x, data = x.y.samp)
x.y.samp.test$cart_fit <- predict(mdl_cart, newdata = x.y.samp.test)

# Random forest on the same single predictor.
mdl_rf <- randomForest(y ~ x, data = x.y.samp)
x.y.samp.test$rf_fit <- predict(mdl_rf, newdata = x.y.samp.test)

# Black: true curve; points: noisy sample; blue: CART fit; red: forest fit.
ggplot(x.y.samp, aes(x, y_r)) +
  geom_line(size = 1.5, colour = "black") +
  geom_point(aes(y = y), size = 1) +
  geom_line(data = x.y.samp.test, aes(x = x, y = cart_fit), colour = "blue") +
  geom_line(data = x.y.samp.test, aes(x = x, y = rf_fit), colour = "red")
```

```{r}
# Hand-rolled boosting with a linear base learner: every stage fits the
# current residuals and contributes a shrunken share of its prediction.
shrink <- 0.1    # fraction of each stage's prediction that is kept
n_rounds <- 100  # number of boosting stages

y_n <- x.y.samp$y
x <- x.y.samp$x
v_y_l <- vector("list", n_rounds)  # per-stage predictions on the test rows

for (i in seq_len(n_rounds)) {
  # Piecewise-linear fit: separate intercept and slope on each side of x = 0.
  lm_fit <- lm(y_n ~ x * I(0 < x))
  v_y <- shrink * predict(lm_fit)
  v_y_l[[i]] <- shrink * predict(lm_fit, newdata = x.y.samp.test)
  # The next stage is trained on what this stage left unexplained.
  y_n <- y_n - v_y
}

# One column per stage; the boosted prediction after k stages is the row sum
# of the first k columns.
stage_preds <- do.call(cbind, v_y_l)
partial_fit <- function(k) {
  unname(rowSums(stage_preds[, seq_len(k), drop = FALSE]))
}

x.y.samp.test$lm_fit   <- partial_fit(n_rounds)
x.y.samp.test$lm_fit_3 <- partial_fit(10)
x.y.samp.test$lm_fit_2 <- partial_fit(5)
x.y.samp.test$lm_fit_1 <- partial_fit(2)

# Red: true curve; points: noisy sample; purple: boosted fit after all stages
# (linetype 2) and after 5, 10 and 2 stages (linetype 4).
ggplot(x.y.samp, aes(x = x, y = y_r)) +
  geom_line(size = 1.5, colour = "red") +
  geom_point(aes(y = y), size = 1) +
  geom_line(
    data = x.y.samp.test, aes(x = x, y = lm_fit),
    colour = "purple", linetype = 2, size = 1
  ) +
  geom_line(
    data = x.y.samp.test, aes(x = x, y = lm_fit_2),
    colour = "purple", linetype = 4
  ) +
  geom_line(
    data = x.y.samp.test, aes(x = x, y = lm_fit_3),
    colour = "purple", linetype = 4
  ) +
  geom_line(
    data = x.y.samp.test, aes(x = x, y = lm_fit_1),
    colour = "purple", linetype = 4
  )
```

```{r}
# Hand-rolled boosting with an rpart regression tree as base learner: every
# stage fits the current residuals and contributes `shrink` times its
# prediction.
n_rounds <- 100  # number of boosting stages

y_n <- x.y.samp$y
x <- x.y.samp$x
v_y_l <- vector("list", n_rounds)  # per-stage predictions on the test rows

for (i in seq_len(n_rounds)) {
  rpart_fit <- rpart(y_n ~ x)
  v_y <- shrink * predict(rpart_fit)
  v_y_l[[i]] <- shrink * predict(rpart_fit, newdata = x.y.samp.test)
  # The next stage is trained on what this stage left unexplained.
  y_n <- y_n - v_y
}

# One column per stage; the boosted prediction after k stages is the row sum
# of the first k columns.
stage_preds <- do.call(cbind, v_y_l)
partial_fit <- function(k) {
  unname(rowSums(stage_preds[, seq_len(k), drop = FALSE]))
}

x.y.samp.test$rpart_fit   <- partial_fit(n_rounds)
x.y.samp.test$rpart_fit_3 <- partial_fit(10)
x.y.samp.test$rpart_fit_2 <- partial_fit(5)
x.y.samp.test$rpart_fit_1 <- partial_fit(2)

# Red: true curve; points: noisy sample; purple: boosted fit after all stages
# (linetype 2) and after 5, 10 and 2 stages (linetype 4).
ggplot(x.y.samp, aes(x = x, y = y_r)) +
  geom_line(size = 1.5, colour = "red") +
  geom_point(aes(y = y), size = 1) +
  geom_line(
    data = x.y.samp.test, aes(x = x, y = rpart_fit),
    colour = "purple", linetype = 2, size = 1
  ) +
  geom_line(
    data = x.y.samp.test, aes(x = x, y = rpart_fit_2),
    colour = "purple", linetype = 4
  ) +
  geom_line(
    data = x.y.samp.test, aes(x = x, y = rpart_fit_3),
    colour = "purple", linetype = 4
  ) +
  geom_line(
    data = x.y.samp.test, aes(x = x, y = rpart_fit_1),
    colour = "purple", linetype = 4
  )


```

##### ensemble method > boosting > adaptive boosting > AdaBoost
```{r}
# AdaBoost.M1 algorithm, as implemented in the adabag package.
library(adabag)
data(iris)

# Ten boosting iterations with Breiman's coefficient rule.
iris.adaboost <- boosting(
  Species ~ .,
  data = iris,
  boos = TRUE,
  mfinal = 10,
  coeflearn = "Breiman"
)

# Variable-importance plot, then the `weights` component and the first rows
# of the `votes` matrix.
importanceplot(iris.adaboost)
iris.adaboost[["weights"]]
head(iris.adaboost[["votes"]])
```


```{r}
# Two-fold cross-validated boosting on iris: 10 iterations, trees grown with
# complexity parameter 0.01.
data(iris)
iris.boostcv <- boosting.cv(
  Species ~ .,
  data = iris,
  v = 2,
  mfinal = 10,
  control = rpart.control(cp = 0.01)
)

# Print every component except the first one.
iris.boostcv[-1]
```

```{r}
# Start from a pristine copy of iris: this chunk appends a `pred` column at
# the end, which would otherwise leak into `Species ~ .` if the chunk were
# run a second time.
data(iris)

# Seed list in the shape trainControl(seeds = ) takes; it is built but not
# passed (see the commented-out argument below).
seeds <- vector(mode = "list", length = nrow(iris) + 1)
seeds <- lapply(seeds, function(x) 1:20)

# Tuning grid: 3 ensemble sizes x 2 tree depths x 3 coefficient rules.
grid <- expand.grid(
  mfinal = (1:3) * 3,
  maxdepth = c(1, 3),
  coeflearn = c("Breiman", "Freund", "Zhu")
)

# 3-fold CV, keeping all resample results and computing multi-class metrics.
cctrl1 <- trainControl(
  method = "cv", number = 3, returnResamp = "all",
  classProbs = TRUE,
  summaryFunction = multiClassSummary
  # , seeds = seeds
)

# `preProcess` is spelled out in full rather than relying on partial
# matching of the abbreviated `preProc`.
test_class_cv_form <- train(
  Species ~ .,
  data = iris,
  method = "AdaBoost.M1",
  tuneGrid = grid,
  trControl = cctrl1,
  metric = "Accuracy",
  preProcess = c("center", "scale")
)
importanceplot(test_class_cv_form$finalModel)
test_class_cv_form$bestTune

# Predictions on the rows the model was trained on, compared with the truth.
iris$pred <- predict(test_class_cv_form, iris)
caret::confusionMatrix(iris$pred, iris$Species)
```

##### ensemble method > bagging > random forest ( decision tree forest )
```{r}
# a.k.a 묻지마 모델 
#
# ada boost 는 여러 모델에 적용이 가능한 모델인 반면, random forest 는 decision tree 들의 ensemble 만 가능 
#
# After the ensemble of trees (the forest) is generated, 
# the model uses a vote to combine the trees' predictions.
```
![](/Users/CA/Downloads/rf.png)

```{r}
library(randomForest)
# Random forest on the credit data with a 0/1 response.
credit <- read.csv("/Users/CA/Machine-Learning-with-R-datasets/credit.csv")
credit <- transform(credit, default = ifelse(default == 1, 0, 1))

# `default` is numeric, so randomForest fits a regression forest; its
# predictions are used as scores that are thresholded further down.
rf <- randomForest(
  default ~ .,
  data = credit,
  importance = TRUE,
  proximity = TRUE
)
rf

round(importance(rf), 2)

# Out-of-bag predictions (predict() without newdata). Scoring the training
# rows with predict(rf, credit) reproduces them almost perfectly and makes
# the confusion matrix below look error-free.
credit$pred <- predict(rf)

library(InformationValue)
threshold <- optimalCutoff(credit$default, credit$pred)
ModelMetrics::confusionMatrix(credit$default, credit$pred, threshold)
```

##### ensemble method > boosting > xgboost

```{r}
# ref : http://xgboost.readthedocs.io/en/latest/model.html
# ref : https://medium.com/@peteryun/ml-kaggle%EC%97%90-%EC%A0%81%EC%9A%A9%ED%95%B4%EB%B3%B4%EB%8A%94-xgboost-f1650342ba93

library(xgboost)

# Mushroom example data; each object is a list with `data` (feature matrix)
# and `label` (response), as used below.
data("agaricus.train")
data("agaricus.test")

train <- agaricus.train
test <- agaricus.test

# Fit: 10 rounds of depth-2 trees with logistic loss.
# `nrounds` is written out in full instead of relying on partial matching of
# the abbreviated `nround`.
model.xgboost <- xgboost(
  data = train$data, label = train$label,
  max_depth = 2, eta = 1,
  nrounds = 10, nthread = 2,
  objective = "binary:logistic"
)

test$pred <- predict(model.xgboost, test$data)

library(InformationValue)
# The actuals are the labels. Passing the feature matrix `test$data` here
# scores the predictions against the predictors instead of the response.
threshold <- optimalCutoff(test$label, test$pred)
misClassError(test$label, test$pred, threshold = threshold)
```

#### GBM
```{r}
# Decision Tree Based
# Boosted : Multiple weak models combined algorithmically
# Gradient Boosted : Iteratively solves residuals
# Stochastic
```

#### stacked generalization
```{r}
```



#### references
```{r}
# [textbook] machine learning with R, CH 11
# [ubc lecture] http://www.cs.ubc.ca/labs/beta/Courses/CPSC532H-13/Slides/content-session-4-slides.pdf
# [boosting vs bagging] http://freesearch.pe.kr/archives/4349
# [A boosting a new idea of building models] https://pdfs.semanticscholar.org/8d81/b7e930284becae9c3382ba33c4d48c7e598a.pdf
# [단순참고: SAS] https://www.sas.com/content/dam/SAS/ko_kr/doc/productbrief/Machine_learning_with_SAS_Eminer_16P.pdf 
# [gbm _ kaggle titanic] https://www.r-bloggers.com/predicting-titanic-deaths-on-kaggle-ii-gbm/
```