Common machine learning algorithms:

1. Linear Regression

2. Logistic Regression

3. Decision Trees

4. Support Vector Machines (SVM)

5. Naive Bayes

6. K-Nearest Neighbors (KNN)

7. K-Means

8. Random Forests

9. Dimensionality Reduction

10. Gradient Boosting and AdaBoost

1. Linear Regression

data(women)  # the built-in dataset women gives heights and weights for 15 women aged 30-39
head(women)
##   height weight
## 1     58    115
## 2     59    117
## 3     60    120
## 4     61    123
## 5     62    126
## 6     63    129
fit <- lm(formula = weight ~ height, data = women)  # fit the linear regression model
summary(fit)  # inspect the model
## 
## Call:
## lm(formula = weight ~ height, data = women)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7333 -1.1333 -0.3833  0.7417  3.1167 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -87.51667    5.93694  -14.74 1.71e-09 ***
## height        3.45000    0.09114   37.85 1.09e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared:  0.991,  Adjusted R-squared:  0.9903 
## F-statistic:  1433 on 1 and 13 DF,  p-value: 1.091e-14
fitted(fit)  # fitted values
##        1        2        3        4        5        6        7        8 
## 112.5833 116.0333 119.4833 122.9333 126.3833 129.8333 133.2833 136.7333 
##        9       10       11       12       13       14       15 
## 140.1833 143.6333 147.0833 150.5333 153.9833 157.4333 160.8833
residuals(fit)  # residuals
##           1           2           3           4           5           6 
##  2.41666667  0.96666667  0.51666667  0.06666667 -0.38333333 -0.83333333 
##           7           8           9          10          11          12 
## -1.28333333 -1.73333333 -1.18333333 -1.63333333 -1.08333333 -0.53333333 
##          13          14          15 
##  0.01666667  1.56666667  3.11666667
predict(fit, data.frame(height = c(55, 60, 61)))  # use the model to predict new data
##        1        2        3 
## 102.2333 119.4833 122.9333
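
A quick visual check of the fit: a minimal base-graphics sketch that draws the fitted line over the raw data, reusing the women data and the fit object from above.

plot(women$height, women$weight, xlab = "height (in)", ylab = "weight (lbs)")  # scatter plot of the raw data
abline(fit, col = "red")  # overlay the fitted regression line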

2. Logistic Regression

Logistic regression is suited to binary classification problems.

# install.packages('pbkrtest')
# install.packages('caret')
# install.packages('mlbench')
# install.packages('kernlab')
library(caret)  # load the caret package
## Loading required package: lattice
## Loading required package: ggplot2
library(mlbench)  # load mlbench to get the PimaIndiansDiabetes dataset
data(PimaIndiansDiabetes)  # load the dataset
str(PimaIndiansDiabetes)  # diabetes is a two-level factor; the other variables are numeric
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
head(PimaIndiansDiabetes)  # first 6 rows of the dataset
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg
fit <- glm(diabetes ~ ., data = PimaIndiansDiabetes, family = binomial(link = "logit"))  # train the model
summary(fit)  # inspect the model
## 
## Call:
## glm(formula = diabetes ~ ., family = binomial(link = "logit"), 
##     data = PimaIndiansDiabetes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5566  -0.7274  -0.4159   0.7267   2.9297  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.4046964  0.7166359 -11.728  < 2e-16 ***
## pregnant     0.1231823  0.0320776   3.840 0.000123 ***
## glucose      0.0351637  0.0037087   9.481  < 2e-16 ***
## pressure    -0.0132955  0.0052336  -2.540 0.011072 *  
## triceps      0.0006190  0.0068994   0.090 0.928515    
## insulin     -0.0011917  0.0009012  -1.322 0.186065    
## mass         0.0897010  0.0150876   5.945 2.76e-09 ***
## pedigree     0.9451797  0.2991475   3.160 0.001580 ** 
## age          0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5
probabilities <- predict(fit, PimaIndiansDiabetes[, 1:8], type = "response")  # predicted probabilities
head(probabilities)  # inspect the predictions
##          1          2          3          4          5          6 
## 0.72172655 0.04864161 0.79670208 0.04162486 0.90218390 0.14663156
predictions <- ifelse(probabilities > 0.5, "pos", "neg")  # label probabilities above 0.5 as pos, otherwise neg
head(predictions)
##     1     2     3     4     5     6 
## "pos" "neg" "pos" "neg" "pos" "neg"
# summarize accuracy
table(predictions, PimaIndiansDiabetes$diabetes)  # confusion matrix
##            
## predictions neg pos
##         neg 445 112
##         pos  55 156
confusionMatrix(predictions, PimaIndiansDiabetes$diabetes)  # caret function: confusion matrix plus accuracy, sensitivity and other statistics
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 445 112
##        pos  55 156
##                                           
##                Accuracy : 0.7826          
##                  95% CI : (0.7517, 0.8112)
##     No Information Rate : 0.651           
##     P-Value [Acc > NIR] : 1.373e-15       
##                                           
##                   Kappa : 0.4966          
##  Mcnemar's Test P-Value : 1.468e-05       
##                                           
##             Sensitivity : 0.8900          
##             Specificity : 0.5821          
##          Pos Pred Value : 0.7989          
##          Neg Pred Value : 0.7393          
##              Prevalence : 0.6510          
##          Detection Rate : 0.5794          
##    Detection Prevalence : 0.7253          
##       Balanced Accuracy : 0.7360          
##                                           
##        'Positive' Class : neg             
## 
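
For reference, the overall accuracy can be computed directly from the predicted labels; this one-line sketch reuses the predictions object above and should reproduce the Accuracy reported by confusionMatrix().

mean(predictions == PimaIndiansDiabetes$diabetes)  # proportion of correct predictions, about 0.78 here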

Use k-fold cross-validation to evaluate the model.

library(caret)
library(mlbench)  # for the dataset
data(PimaIndiansDiabetes)
set.seed(7)  # set the random seed so results are reproducible
control <- trainControl(method = "cv", number = 5)  # training-control parameters
fit.glm <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "glm", metric = "Accuracy", 
    preProc = c("center", "scale"), trControl = control)
# summarize fit
print(fit.glm)
## Generalized Linear Model 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## Pre-processing: centered (8), scaled (8) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 614, 614, 615, 615, 614 
## Resampling results
## 
##   Accuracy   Kappa      Accuracy SD  Kappa SD 
##   0.7695442  0.4656824  0.02692468   0.0616666
## 
## 

3. Decision Trees

The rpart() function in the rpart package fits CART classification and regression tree models.

3.1 Classification tree model

The response is a factor; the predictors may be factors or continuous variables.

library(rpart)  # load the package
library(mlbench)  # for the dataset
data(PimaIndiansDiabetes)
str(PimaIndiansDiabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
fit <- rpart(diabetes ~ ., data = PimaIndiansDiabetes)  # train the model
print(fit)  # inspect the model
## n= 768 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 768 268 neg (0.65104167 0.34895833)  
##     2) glucose< 127.5 485  94 neg (0.80618557 0.19381443)  
##       4) age< 28.5 271  23 neg (0.91512915 0.08487085) *
##       5) age>=28.5 214  71 neg (0.66822430 0.33177570)  
##        10) mass< 26.35 41   2 neg (0.95121951 0.04878049) *
##        11) mass>=26.35 173  69 neg (0.60115607 0.39884393)  
##          22) glucose< 99.5 55  10 neg (0.81818182 0.18181818) *
##          23) glucose>=99.5 118  59 neg (0.50000000 0.50000000)  
##            46) pedigree< 0.561 84  34 neg (0.59523810 0.40476190)  
##              92) pedigree< 0.2 21   4 neg (0.80952381 0.19047619) *
##              93) pedigree>=0.2 63  30 neg (0.52380952 0.47619048)  
##               186) pregnant>=1.5 52  21 neg (0.59615385 0.40384615)  
##                 372) pressure>=67 40  12 neg (0.70000000 0.30000000) *
##                 373) pressure< 67 12   3 pos (0.25000000 0.75000000) *
##               187) pregnant< 1.5 11   2 pos (0.18181818 0.81818182) *
##            47) pedigree>=0.561 34   9 pos (0.26470588 0.73529412) *
##     3) glucose>=127.5 283 109 pos (0.38515901 0.61484099)  
##       6) mass< 29.95 76  24 neg (0.68421053 0.31578947)  
##        12) glucose< 145.5 41   6 neg (0.85365854 0.14634146) *
##        13) glucose>=145.5 35  17 pos (0.48571429 0.51428571)  
##          26) insulin< 14.5 21   8 neg (0.61904762 0.38095238) *
##          27) insulin>=14.5 14   4 pos (0.28571429 0.71428571) *
##       7) mass>=29.95 207  57 pos (0.27536232 0.72463768)  
##        14) glucose< 157.5 115  45 pos (0.39130435 0.60869565)  
##          28) age< 30.5 50  23 neg (0.54000000 0.46000000)  
##            56) pressure>=61 40  13 neg (0.67500000 0.32500000)  
##             112) mass< 41.8 31   7 neg (0.77419355 0.22580645) *
##             113) mass>=41.8 9   3 pos (0.33333333 0.66666667) *
##            57) pressure< 61 10   0 pos (0.00000000 1.00000000) *
##          29) age>=30.5 65  18 pos (0.27692308 0.72307692) *
##        15) glucose>=157.5 92  12 pos (0.13043478 0.86956522) *
predictions <- predict(fit, PimaIndiansDiabetes[, 1:8], type = "class")  # predict classes with the trained model
head(predictions)  # inspect the predicted classes
##   1   2   3   4   5   6 
## pos neg neg neg pos neg 
## Levels: neg pos
table(predictions, PimaIndiansDiabetes$diabetes)  # confusion matrix
##            
## predictions neg pos
##         neg 449  72
##         pos  51 196
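
The fitted tree can also be drawn. A minimal sketch with base graphics, reusing the fit object above (packages such as rpart.plot give prettier output):

plot(fit, uniform = TRUE, margin = 0.1)  # draw the tree structure
text(fit, use.n = TRUE, cex = 0.8)  # add split labels and node counts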

K-fold cross-validation

library(caret)
library(mlbench)
data(PimaIndiansDiabetes)
set.seed(7)
control <- trainControl(method = "cv", number = 10)
fit.rpart <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "rpart", 
    metric = "Accuracy", trControl = control)
print(fit.rpart)
## CART 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 691, 691, 692, 691, 691, 692, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa      Accuracy SD  Kappa SD  
##   0.01741294  0.7343131  0.3810595  0.05308756   0.11239089
##   0.10447761  0.7133459  0.3586950  0.04901214   0.09103908
##   0.24253731  0.6744190  0.1617716  0.03471308   0.17427370
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.01741294.

3.2 Regression tree model

The response is a continuous numeric variable.

library(rpart)
library(mlbench)
data(BostonHousing)
head(BostonHousing)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio      b
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7
str(BostonHousing)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : num  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
fit <- rpart(medv ~ ., data = BostonHousing, control = rpart.control(minsplit = 5))
print(fit)
## n= 506 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 506 42716.3000 22.53281  
##    2) rm< 6.941 430 17317.3200 19.93372  
##      4) lstat>=14.4 175  3373.2510 14.95600  
##        8) crim>=6.99237 74  1085.9050 11.97838 *
##        9) crim< 6.99237 101  1150.5370 17.13762 *
##      5) lstat< 14.4 255  6632.2170 23.34980  
##       10) dis>=1.38485 250  3721.1630 22.90520  
##         20) rm< 6.543 195  1636.0670 21.62974 *
##         21) rm>=6.543 55   643.1691 27.42727 *
##       11) dis< 1.38485 5   390.7280 45.58000 *
##    3) rm>=6.941 76  6059.4190 37.23816  
##      6) rm< 7.437 46  1899.6120 32.11304  
##       12) crim>=7.393425 3    27.9200 14.40000 *
##       13) crim< 7.393425 43   864.7674 33.34884 *
##      7) rm>=7.437 30  1098.8500 45.09667  
##       14) ptratio>=18.3 3   223.8200 33.30000 *
##       15) ptratio< 18.3 27   411.1585 46.40741 *
predictions <- predict(fit, BostonHousing[, 1:13])
mse <- mean((BostonHousing$medv - predictions)^2)
print(mse)
## [1] 12.71556
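
For comparison with the cross-validated RMSE reported below, take the square root of this in-sample MSE; the in-sample value is typically more optimistic than the resampled one.

sqrt(mse)  # in-sample RMSE of the regression tree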

K-fold cross-validation:

library(caret)
library(mlbench)
# Load the dataset
data(BostonHousing)
# train
set.seed(7)
control <- trainControl(method = "cv", number = 2)
fit.rpart <- train(medv ~ ., data = BostonHousing, method = "rpart", metric = "RMSE", 
    trControl = control)
# summarize fit
print(fit.rpart)
## CART 
## 
## 506 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (2 fold) 
## Summary of sample sizes: 253, 253 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared   RMSE SD    Rsquared SD
##   0.07165784  5.983605  0.5774758  0.1385397  0.01654185 
##   0.17117244  7.173727  0.3939811  0.2713431  0.04677649 
##   0.45274420  7.173727  0.3939811  0.2713431  0.04677649 
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was cp = 0.07165784.

4. Support Vector Machines (SVM)

The ksvm() function in the kernlab package fits support vector machine models for both classification and regression problems.

4.1.1 Classification model

# install.packages('kernlab')  # install the package first if needed
library(kernlab)
library(mlbench)
data(PimaIndiansDiabetes)  # load the data
fit <- ksvm(diabetes ~ ., data = PimaIndiansDiabetes, kernel = "rbfdot")  # train the model
print(fit)  # inspect the model
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.111287269529721 
## 
## Number of Support Vectors : 431 
## 
## Objective Function Value : -356.4694 
## Training error : 0.175781
predictions <- predict(fit, PimaIndiansDiabetes[, 1:8], type = "response")  # predict with the model
table(predictions, PimaIndiansDiabetes$diabetes)  # confusion matrix
##            
## predictions neg pos
##         neg 462  97
##         pos  38 171
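
The confusion matrix above is computed on the training data, which is optimistic. A sketch of a simple hold-out evaluation, assuming an illustrative 70/30 random split (the cross-validation in the next subsection is the more systematic check):

set.seed(1)  # for reproducibility
idx <- sample(nrow(PimaIndiansDiabetes), floor(0.7 * nrow(PimaIndiansDiabetes)))  # 70% training indices
fit.train <- ksvm(diabetes ~ ., data = PimaIndiansDiabetes[idx, ], kernel = "rbfdot")  # fit on the training part
pred.test <- predict(fit.train, PimaIndiansDiabetes[-idx, 1:8])  # predict the held-out 30%
table(pred.test, PimaIndiansDiabetes$diabetes[-idx])  # confusion matrix on unseen data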

4.1.2 K-fold cross-validation

library(caret)
library(mlbench)
# Load the dataset
data(PimaIndiansDiabetes)
# train
set.seed(7)
control <- trainControl(method = "cv", number = 5)
fit.svmRadial <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "svmRadial", 
    metric = "Accuracy", trControl = control)
# summarize fit
print(fit.svmRadial)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 614, 614, 615, 615, 614 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa      Accuracy SD  Kappa SD  
##   0.25  0.7603684  0.4415912  0.02943632   0.06952557
##   0.50  0.7551906  0.4322310  0.01961619   0.05163166
##   1.00  0.7604278  0.4458565  0.01037865   0.02321585
## 
## Tuning parameter 'sigma' was held constant at a value of 0.1178216
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were sigma = 0.1178216 and C = 1.

4.1.3 Multi-class classification

# classify the iris dataset (Species has three levels, so this is a multi-class problem):
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
irismodel <- ksvm(Species ~ ., data = iris, type = "C-bsvc", kernel = "rbfdot", 
    kpar = list(sigma = 0.1), C = 10, prob.model = TRUE)
irismodel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-bsvc  (classification) 
##  parameter : cost C = 10 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.1 
## 
## Number of Support Vectors : 32 
## 
## Objective Function Value : -5.8442 -3.0652 -136.9786 
## Training error : 0.02 
## Probability model included.
# make predictions:
predict(irismodel, iris[c(3, 10, 56, 68, 107, 120), -5], type = "probabilities")
##           setosa  versicolor   virginica
## [1,] 0.984456890 0.008914961 0.006628150
## [2,] 0.980856710 0.012152543 0.006990747
## [3,] 0.005483066 0.953760778 0.040756155
## [4,] 0.009979338 0.987273933 0.002746729
## [5,] 0.013356347 0.081680756 0.904962897
## [6,] 0.012076805 0.162322667 0.825600528
predict(irismodel, iris[c(3, 10, 56, 68, 107, 120), -5])
## [1] setosa     setosa     versicolor versicolor virginica  virginica 
## Levels: setosa versicolor virginica
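
Applying irismodel to all 150 observations gives the full confusion matrix (a short sketch; note this is again an in-sample evaluation):

table(predict(irismodel, iris[, -5]), iris$Species)  # rows: predicted class, columns: observed class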

4.2 Regression model

library(kernlab)
library(mlbench)
data(BostonHousing)
fit <- ksvm(medv ~ ., BostonHousing, kernel = "rbfdot")
print(fit)
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 1 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.106113556914206 
## 
## Number of Support Vectors : 334 
## 
## Objective Function Value : -75.2136 
## Training error : 0.090716
predictions <- predict(fit, BostonHousing)
mse <- mean((BostonHousing$medv - predictions)^2)
print(mse)
## [1] 7.673373

K-fold cross-validation

library(caret)
library(mlbench)
data(BostonHousing)
set.seed(7)
control <- trainControl(method = "cv", number = 5)
fit.svmRadial <- train(medv ~ ., data = BostonHousing, method = "svmRadial", 
    metric = "RMSE", trControl = control)
# summarize fit
print(fit.svmRadial)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 506 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 405, 405, 403, 405, 406 
## Resampling results across tuning parameters:
## 
##   C     RMSE      Rsquared   RMSE SD   Rsquared SD
##   0.25  4.818004  0.7564430  1.171043  0.09975117 
##   0.50  4.279601  0.7985729  1.201454  0.09310021 
##   1.00  3.820843  0.8337146  1.162899  0.08297657 
## 
## Tuning parameter 'sigma' was held constant at a value of 0.114971
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were sigma = 0.114971 and C = 1.

5. Naive Bayes

5.1 Discrete (categorical) predictors

library(e1071)
data(HouseVotes84, package = "mlbench")
head(HouseVotes84)
##        Class   V1 V2 V3   V4   V5 V6 V7 V8 V9 V10  V11  V12 V13 V14 V15
## 1 republican    n  y  n    y    y  y  n  n  n   y <NA>    y   y   y   n
## 2 republican    n  y  n    y    y  y  n  n  n   n    n    y   y   y   n
## 3   democrat <NA>  y  y <NA>    y  y  n  n  n   n    y    n   y   y   n
## 4   democrat    n  y  y    n <NA>  y  n  n  n   n    y    n   y   n   n
## 5   democrat    y  y  y    n    y  y  n  n  n   n    y <NA>   y   y   y
## 6   democrat    n  y  y    n    y  y  n  n  n   n    n    n   y   y   y
##    V16
## 1    y
## 2 <NA>
## 3    n
## 4    y
## 5    y
## 6    y
str(HouseVotes84)
## 'data.frame':    435 obs. of  17 variables:
##  $ Class: Factor w/ 2 levels "democrat","republican": 2 2 1 1 1 1 1 2 2 1 ...
##  $ V1   : Factor w/ 2 levels "n","y": 1 1 NA 1 2 1 1 1 1 2 ...
##  $ V2   : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ V3   : Factor w/ 2 levels "n","y": 1 1 2 2 2 2 1 1 1 2 ...
##  $ V4   : Factor w/ 2 levels "n","y": 2 2 NA 1 1 1 2 2 2 1 ...
##  $ V5   : Factor w/ 2 levels "n","y": 2 2 2 NA 2 2 2 2 2 1 ...
##  $ V6   : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 1 ...
##  $ V7   : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 2 ...
##  $ V8   : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 2 ...
##  $ V9   : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 2 ...
##  $ V10  : Factor w/ 2 levels "n","y": 2 1 1 1 1 1 1 1 1 1 ...
##  $ V11  : Factor w/ 2 levels "n","y": NA 1 2 2 2 1 1 1 1 1 ...
##  $ V12  : Factor w/ 2 levels "n","y": 2 2 1 1 NA 1 1 1 2 1 ...
##  $ V13  : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 NA 2 2 1 ...
##  $ V14  : Factor w/ 2 levels "n","y": 2 2 2 1 2 2 2 2 2 1 ...
##  $ V15  : Factor w/ 2 levels "n","y": 1 1 1 1 2 2 2 NA 1 NA ...
##  $ V16  : Factor w/ 2 levels "n","y": 2 NA 1 2 2 2 2 2 2 NA ...
model <- naiveBayes(Class ~ ., data = HouseVotes84)  # train the model
predict(model, HouseVotes84[1:10, ])  # predicted classes for the first 10 observations
##  [1] republican republican republican democrat   democrat   democrat  
##  [7] republican republican republican democrat  
## Levels: democrat republican
predict(model, HouseVotes84[1:10, ], type = "raw")  # class probabilities for the first 10 observations
##           democrat   republican
##  [1,] 1.029209e-07 9.999999e-01
##  [2,] 5.820415e-08 9.999999e-01
##  [3,] 5.684937e-03 9.943151e-01
##  [4,] 9.985798e-01 1.420152e-03
##  [5,] 9.666720e-01 3.332802e-02
##  [6,] 8.121430e-01 1.878570e-01
##  [7,] 1.751512e-04 9.998248e-01
##  [8,] 8.300100e-06 9.999917e-01
##  [9,] 8.277705e-08 9.999999e-01
## [10,] 1.000000e+00 5.029425e-11
pred <- predict(model, HouseVotes84)  # predicted classes on the training set
table(pred, HouseVotes84$Class)  # confusion matrix
##             
## pred         democrat republican
##   democrat        238         13
##   republican       29        155
## using Laplace smoothing:
model <- naiveBayes(Class ~ ., data = HouseVotes84, laplace = 3)
pred <- predict(model, HouseVotes84[, -1])
table(pred, HouseVotes84$Class)
##             
## pred         democrat republican
##   democrat        237         12
##   republican       30        156

5.2 Continuous (numeric) predictors

## Example with metric predictors:
data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
m <- naiveBayes(Species ~ ., data = iris)
## alternatively:
m <- naiveBayes(iris[, -5], iris[, 5])
m
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = iris[, -5], y = iris[, 5])
## 
## A-priori probabilities:
## iris[, 5]
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             Sepal.Length
## iris[, 5]     [,1]      [,2]
##   setosa     5.006 0.3524897
##   versicolor 5.936 0.5161711
##   virginica  6.588 0.6358796
## 
##             Sepal.Width
## iris[, 5]     [,1]      [,2]
##   setosa     3.428 0.3790644
##   versicolor 2.770 0.3137983
##   virginica  2.974 0.3224966
## 
##             Petal.Length
## iris[, 5]     [,1]      [,2]
##   setosa     1.462 0.1736640
##   versicolor 4.260 0.4699110
##   virginica  5.552 0.5518947
## 
##             Petal.Width
## iris[, 5]     [,1]      [,2]
##   setosa     0.246 0.1053856
##   versicolor 1.326 0.1977527
##   virginica  2.026 0.2746501
table(predict(m, iris), iris[, 5])
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47         3
##   virginica       0          3        47

5.3 Example using a contingency table

data(Titanic)
str(Titanic)
##  table [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
##  - attr(*, "dimnames")=List of 4
##   ..$ Class   : chr [1:4] "1st" "2nd" "3rd" "Crew"
##   ..$ Sex     : chr [1:2] "Male" "Female"
##   ..$ Age     : chr [1:2] "Child" "Adult"
##   ..$ Survived: chr [1:2] "No" "Yes"
Titanic
## , , Age = Child, Survived = No
## 
##       Sex
## Class  Male Female
##   1st     0      0
##   2nd     0      0
##   3rd    35     17
##   Crew    0      0
## 
## , , Age = Adult, Survived = No
## 
##       Sex
## Class  Male Female
##   1st   118      4
##   2nd   154     13
##   3rd   387     89
##   Crew  670      3
## 
## , , Age = Child, Survived = Yes
## 
##       Sex
## Class  Male Female
##   1st     5      1
##   2nd    11     13
##   3rd    13     14
##   Crew    0      0
## 
## , , Age = Adult, Survived = Yes
## 
##       Sex
## Class  Male Female
##   1st    57    140
##   2nd    14     80
##   3rd    75     76
##   Crew  192     20
m <- naiveBayes(Survived ~ ., data = Titanic)
m
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.formula(formula = Survived ~ ., data = Titanic)
## 
## A-priori probabilities:
## Survived
##       No      Yes 
## 0.676965 0.323035 
## 
## Conditional probabilities:
##         Class
## Survived        1st        2nd        3rd       Crew
##      No  0.08187919 0.11208054 0.35436242 0.45167785
##      Yes 0.28551336 0.16596343 0.25035162 0.29817159
## 
##         Sex
## Survived       Male     Female
##      No  0.91543624 0.08456376
##      Yes 0.51617440 0.48382560
## 
##         Age
## Survived      Child      Adult
##      No  0.03489933 0.96510067
##      Yes 0.08016878 0.91983122
predict(m, as.data.frame(Titanic))
##  [1] Yes No  No  No  Yes Yes Yes Yes No  No  No  No  Yes Yes Yes Yes Yes
## [18] No  No  No  Yes Yes Yes Yes No  No  No  No  Yes Yes Yes Yes
## Levels: No Yes

6. KNN (K-Nearest Neighbors)

Algorithm notes:

Advantages:

(i) Instance-based learning: no model has to be built or maintained from the data;

(ii) It can produce decision boundaries of arbitrary shape, whereas decision trees and rule-based classifiers are limited to piecewise-linear boundaries (hyperplanes in high dimensions).

Disadvantages:

(i) Classifying a test example is expensive, O(n) where n is the size of the training set;

(ii) Predictions rely only on local information, so with a small k the method is sensitive to noisy data, unlike model-based classifiers that use the whole dataset (a short sketch after the example below compares a few values of k).

# install.packages('class')  # install the package first if needed
library(class)  # load the package
train <- rbind(iris3[1:25, , 1], iris3[1:25, , 2], iris3[1:25, , 3])
test <- rbind(iris3[26:50, , 1], iris3[26:50, , 2], iris3[26:50, , 3])
cl <- factor(c(rep("s", 25), rep("c", 25), rep("v", 25)))
iris.predict <- knn(train, test, cl, k = 3, prob = TRUE)
iris.predict
##  [1] s s s s s s s s s s s s s s s s s s s s s s s s s c c v c c c c c v c
## [36] c c c c c c c c c c c c c c c v c c v v v v v v v v v v c v v v v v v
## [71] v v v v v
## attr(,"prob")
##  [1] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##  [8] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [22] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.6666667
## [29] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.6666667 1.0000000
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [43] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [50] 1.0000000 1.0000000 0.6666667 0.7500000 1.0000000 1.0000000 1.0000000
## [57] 1.0000000 1.0000000 0.5000000 1.0000000 1.0000000 1.0000000 1.0000000
## [64] 0.6666667 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [71] 1.0000000 0.6666667 1.0000000 1.0000000 0.6666667
## Levels: c s v
table(cl, iris.predict)  # confusion matrix
##    iris.predict
## cl   c  s  v
##   c 23  0  2
##   s  0 25  0
##   v  3  0 22
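
As noted in the disadvantages above, the result depends on k. A minimal sketch that reuses train, test and cl to compare a few values (by construction the true labels of this test set are the same as cl):

for (k in c(1, 3, 5, 7)) {
    acc <- mean(knn(train, test, cl, k = k) == cl)  # accuracy on the test set
    cat("k =", k, "accuracy =", round(acc, 3), "\n")
}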

7. K-Means (k-means)

x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
           matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(x) <- c("x", "y")
head(x)
##               x            y
## [1,]  0.2417803 -0.362296912
## [2,] -0.2492418 -0.007919125
## [3,] -0.3955914  0.002895658
## [4,]  0.1265264 -0.045113657
## [5,]  0.1152272  0.469655065
## [6,]  0.1493029  0.017974243
(cl <- kmeans(x, 2))
## K-means clustering with 2 clusters of sizes 50, 50
## 
## Cluster means:
##            x          y
## 1 -0.0291091 0.05759471
## 2  1.0035628 0.98531780
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 8.069328 9.368503
##  (between_SS / total_SS =  73.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
cl$centers
##            x          y
## 1 -0.0291091 0.05759471
## 2  1.0035628 0.98531780
plot(x, col = cl$cluster)
points(cl$centers, col = 1:2, pch = 8, cex = 2)

# sum of squares
ss <- function(x) sum(scale(x, scale = FALSE)^2)

## cluster centers "fitted" to each obs.:
fitted.x <- fitted(cl);  head(fitted.x)
##            x          y
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
resid.x <- x - fitted(cl)

## Equalities : ----------------------------------
cbind(cl[c("betweenss", "tot.withinss", "totss")], # the same two columns
         c(ss(fitted.x), ss(resid.x),    ss(x)))
##              [,1]     [,2]    
## betweenss    48.17703 48.17703
## tot.withinss 17.43783 17.43783
## totss        65.61486 65.61486
stopifnot(all.equal(cl$ totss,        ss(x)),
      all.equal(cl$ tot.withinss, ss(resid.x)),
      ## these three are the same:
      all.equal(cl$ betweenss,    ss(fitted.x)),
      all.equal(cl$ betweenss, cl$totss - cl$tot.withinss),
      ## and hence also
      all.equal(ss(x), ss(fitted.x) + ss(resid.x))
      )

kmeans(x,1)$withinss # trivial one-cluster, (its W.SS == ss(x))
## [1] 65.61486
## random starts do help here with too many clusters
## (and are often recommended anyway!):
(cl <- kmeans(x, 5, nstart = 25))
## K-means clustering with 5 clusters of sizes 13, 18, 24, 19, 26
## 
## Cluster means:
##             x          y
## 1 -0.36210808  0.1267287
## 2  0.14055856  0.3248003
## 3  1.05190114  1.2975444
## 4  0.03799452 -0.2428496
## 5  0.95894273  0.6971086
## 
## Clustering vector:
##   [1] 4 1 1 4 2 4 1 2 2 1 1 1 4 4 4 4 4 4 1 1 2 1 4 2 2 2 2 4 1 2 2 2 4 4 4
##  [36] 4 2 1 2 4 4 4 4 2 1 2 1 2 5 2 3 3 3 3 5 5 5 5 5 5 3 5 5 5 5 5 5 5 3 5
##  [71] 3 3 5 2 5 3 3 3 3 3 5 3 3 3 3 5 5 3 3 5 3 3 5 3 3 5 5 3 5 5
## 
## Within cluster sum of squares by cluster:
## [1] 0.7133834 1.1859169 2.4012601 1.0623752 2.3600708
##  (between_SS / total_SS =  88.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
plot(x, col = cl$cluster)
points(cl$centers, col = 1:5, pch = 8)

8. Random Forests

library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# classification:
data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
set.seed(71)
iris.rf <- randomForest(Species ~ ., data = iris, importance = TRUE, proximity = TRUE)

print(iris.rf)
## 
## Call:
##  randomForest(formula = Species ~ ., data = iris, importance = TRUE,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 5.33%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         50          0         0        0.00
## versicolor      0         46         4        0.08
## virginica       0          4        46        0.08
round(importance(iris.rf), 2)
##              setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length   6.04       7.85      7.93                11.51
## Sepal.Width    4.40       1.03      5.44                 5.40
## Petal.Length  21.76      31.33     29.64                32.94
## Petal.Width   22.84      32.67     31.68                34.50
##              MeanDecreaseGini
## Sepal.Length             8.77
## Sepal.Width              2.19
## Petal.Length            42.54
## Petal.Width             45.77
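# the same importance scores can be visualized with the package's helper (a quick sketch):
varImpPlot(iris.rf)  # dot charts of MeanDecreaseAccuracy and MeanDecreaseGini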
# regression:
data(airquality)
head(airquality)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
set.seed(131)
ozone.rf <- randomForest(Ozone ~ ., data = airquality, mtry = 3, importance = TRUE, 
    na.action = na.omit)
print(ozone.rf)
## 
## Call:
##  randomForest(formula = Ozone ~ ., data = airquality, mtry = 3,      importance = TRUE, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 303.8304
##                     % Var explained: 72.31
## Show 'importance' of variables: higher values mean more important:
round(importance(ozone.rf), 2)
##         %IncMSE IncNodePurity
## Solar.R   11.09      10534.24
## Wind      23.50      43833.13
## Temp      42.03      55218.05
## Month      4.07       2032.65
## Day        2.63       7173.19

9. Dimensionality Reduction

head(USArrests)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
str(USArrests)
## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
pca <- princomp(USArrests, cor = TRUE)
print(pca)
## Call:
## princomp(x = USArrests, cor = TRUE)
## 
## Standard deviations:
##    Comp.1    Comp.2    Comp.3    Comp.4 
## 1.5748783 0.9948694 0.5971291 0.4164494 
## 
##  4  variables and  50 observations.
train_reduced <- predict(pca, USArrests)
head(train_reduced)
##                Comp.1     Comp.2      Comp.3       Comp.4
## Alabama    -0.9855659  1.1333924 -0.44426879  0.156267145
## Alaska     -1.9501378  1.0732133  2.04000333 -0.438583440
## Arizona    -1.7631635 -0.7459568  0.05478082 -0.834652924
## Arkansas    0.1414203  1.1197968  0.11457369 -0.182810896
## California -2.5239801 -1.5429340  0.59855680 -0.341996478
## Colorado   -1.5145629 -0.9875551  1.09500699  0.001464887
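
To decide how many components to keep, look at the proportion of variance each component explains; a short sketch on the pca object above:

summary(pca)  # standard deviations and proportion of variance for each component
screeplot(pca, type = "lines")  # scree plot of the component variances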

10. Gradient Boosting and AdaBoost

There are many boosting algorithms, such as Gradient Boosting, XGBoost, AdaBoost and Gentle Boost. Bagging draws random resamples of the data, fits a learner to each, and combines the results by simple averaging (or voting) to reach the final prediction. Boosting is similar, but is smarter about how it selects and weights the samples: as the algorithm proceeds, observations that are hard to classify receive ever larger weights.

10.1 The GBM algorithm

In R, the gbm package implements boosting. It uses decision trees as the base learners; the key parameters are:

  • The form of the loss function (distribution)
  • The number of iterations (n.trees)
  • The learning rate (shrinkage)
  • The resampling fraction (bag.fraction)
  • The depth of each tree (interaction.depth)

The loss function is easy to choose: classification problems generally use the Bernoulli distribution, while regression problems can use the Gaussian distribution. For the learning rate, smaller is generally better, but smaller steps mean more steps: the number of iterations must grow for the model to reach its optimum, which increases training time and computation. A common rule of thumb is to set shrinkage between 0.01 and 0.001 and n.trees between 3000 and 10000.

library(gbm)
## Loading required package: survival
## Loading required package: splines
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: parallel
## Loaded gbm 2.1.1
# construct a simulated dataset
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- ordered(sample(letters[1:4],N,replace=TRUE),levels=letters[4:1])
X4 <- factor(sample(letters[1:6],N,replace=TRUE))
X5 <- factor(sample(letters[1:3],N,replace=TRUE))
X6 <- 3*runif(N) 
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# introduce some missing values
X1[sample(1:N,size=500)] <- NA
X4[sample(1:N,size=300)] <- NA
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
head(data)
##           Y       X1        X2 X3   X4 X5        X6
## 1 2.6177200 0.811623 0.0756076  b    e  a 0.4734696
## 2 0.3321644       NA 0.3012093  d <NA>  b 2.4982320
## 3 1.9411223       NA 0.9489865  c <NA>  b 1.9155245
## 4 0.6659258       NA 0.5874069  d <NA>  b 1.5870628
## 5 5.3654999       NA 1.4601941  a <NA>  c 2.2784020
## 6 3.2528407       NA 1.7593645  d    a  c 2.3502413
# train the model with gbm()
gbm1 <-
gbm(Y~X1+X2+X3+X4+X5+X6,         # formula
    data=data,                   # dataset
    var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease,
                                 # +1: monotone increase,
                                 #  0: no monotone restrictions
    distribution="gaussian",     # see the help for other choices
    n.trees=1000,                # number of trees
    shrinkage=0.05,              # shrinkage or learning rate,
                                 # 0.001 to 0.1 usually work
    interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc.
    bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
    train.fraction = 0.5,        # fraction of data for training,
                                 # first train.fraction*N used for training
    n.minobsinnode = 10,         # minimum total weight needed in each node
    cv.folds = 3,                # do 3-fold cross-validation
    keep.data=TRUE,              # keep a copy of the dataset with the object
    verbose=FALSE,               # don't print out progress
    n.cores=1)                   # use only a single core (detecting #cores is
                                 # error-prone, so avoided here)
# check performance using an out-of-bag estimator
# OOB underestimates the optimal number of iterations
best.iter <- gbm.perf(gbm1,method="OOB")

print(best.iter)
## [1] 92
# check performance using a 50% heldout test set
best.iter <- gbm.perf(gbm1,method="test")

print(best.iter)
## [1] 115
# check performance using 3-fold cross-validation
best.iter <- gbm.perf(gbm1,method="cv")

print(best.iter)
## [1] 122
# plot the performance # plot variable influence
summary(gbm1,n.trees=1)         # based on the first tree

##    var  rel.inf
## X3  X3 89.79752
## X2  X2 10.20248
## X1  X1  0.00000
## X4  X4  0.00000
## X5  X5  0.00000
## X6  X6  0.00000
summary(gbm1,n.trees=best.iter) # based on the estimated best number of trees

##    var     rel.inf
## X3  X3 65.98348516
## X2  X2 27.96046528
## X1  X1  4.68887674
## X4  X4  0.75854559
## X6  X6  0.56625428
## X5  X5  0.04237296
# compactly print the first and last trees for curiosity
print(pretty.gbm.tree(gbm1,1))
##   SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction
## 0        2    1.50000000        1         5           9      228.91593
## 1        2    0.50000000        2         3           4       41.36040
## 2       -1   -0.07306434       -1        -1          -1        0.00000
## 3       -1   -0.01706217       -1        -1          -1        0.00000
## 4       -1   -0.04421474       -1        -1          -1        0.00000
## 5        1    0.99442576        6         7           8       30.70785
## 6       -1    0.02654803       -1        -1          -1        0.00000
## 7       -1    0.07756869       -1        -1          -1        0.00000
## 8       -1    0.05162598       -1        -1          -1        0.00000
## 9       -1    0.00102208       -1        -1          -1        0.00000
##   Weight  Prediction
## 0    250  0.00102208
## 1    132 -0.04421474
## 2     64 -0.07306434
## 3     68 -0.01706217
## 4    132 -0.04421474
## 5    118  0.05162598
## 6     60  0.02654803
## 7     58  0.07756869
## 8    118  0.05162598
## 9    250  0.00102208
print(pretty.gbm.tree(gbm1,gbm1$n.trees))
##   SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction
## 0        1  1.772046e+00        1         8           9      0.2062632
## 1        2  1.500000e+00        2         3           7      0.1578085
## 2       -1  4.121006e-05       -1        -1          -1      0.0000000
## 3        1  1.075647e+00        4         5           6      0.3111820
## 4       -1  4.721652e-03       -1        -1          -1      0.0000000
## 5       -1 -7.636156e-04       -1        -1          -1      0.0000000
## 6       -1  2.695562e-03       -1        -1          -1      0.0000000
## 7       -1  1.356536e-03       -1        -1          -1      0.0000000
## 8       -1 -3.348254e-03       -1        -1          -1      0.0000000
## 9       -1  8.672380e-04       -1        -1          -1      0.0000000
##   Weight    Prediction
## 0    250  8.672380e-04
## 1    224  1.356536e-03
## 2    113  4.121006e-05
## 3    111  2.695562e-03
## 4     70  4.721652e-03
## 5     41 -7.636156e-04
## 6    111  2.695562e-03
## 7    224  1.356536e-03
## 8     26 -3.348254e-03
## 9    250  8.672380e-04
# make some new data
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- ordered(sample(letters[1:4],N,replace=TRUE))
X4 <- factor(sample(letters[1:6],N,replace=TRUE))
X5 <- factor(sample(letters[1:3],N,replace=TRUE))
X6 <- 3*runif(N) 
mu <- c(-1,0,1,2)[as.numeric(X3)]
 
Y <- X1**1.5 + 2 * (X2**.5) + mu + rnorm(N,0,sigma)
 
data2 <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
 
# predict on the new data using "best" number of trees
# f.predict generally will be on the canonical scale (logit,log,etc.)
f.predict <- predict(gbm1,data2,best.iter)
 
# least squares error
print(sum((data2$Y-f.predict)^2))
## [1] 5023.52
# create marginal plots
# plot variable X1,X2,X3 after "best" iterations
par(mfrow=c(1,3))
plot(gbm1,1,best.iter)
plot(gbm1,2,best.iter)
plot(gbm1,3,best.iter)

par(mfrow=c(1,1))
# contour plot of variables 1 and 2 after "best" iterations
plot(gbm1,1:2,best.iter)

# lattice plot of variables 2 and 3
plot(gbm1,2:3,best.iter)

# lattice plot of variables 3 and 4
plot(gbm1,3:4,best.iter)

# 3-way plots
plot(gbm1,c(1,2,6),best.iter,cont=20)

plot(gbm1,1:3,best.iter)

plot(gbm1,2:4,best.iter)

plot(gbm1,3:5,best.iter)

# do another 100 iterations
gbm2 <- gbm.more(gbm1,100,
                 verbose=FALSE) # stop printing detailed progress

10.2 The AdaBoost algorithm

AdaBoost's adaptivity comes from using the samples misclassified by the previous classifier to train the next one. AdaBoost is sensitive to noisy data and outliers, but on many problems it is less prone to overfitting than most other learning algorithms.

The classifiers used inside AdaBoost may be weak (i.e. have a substantial error rate); as long as they perform slightly better than random guessing (e.g. an error rate just below 0.5 on a two-class problem), they improve the final model. Even weak classifiers whose error rate is worse than random are useful, because they can be given negative coefficients in the final linear combination of classifiers and thereby still improve its performance.

Usage of the boosting() function: boosting(formula, data, boos = TRUE, mfinal = 100, coeflearn = 'Breiman', control)

  • formula: a model formula
  • data: a data frame containing the variables named in the formula
  • boos: logical; if TRUE (the default), a bootstrap sample of the training set is drawn on each iteration using the current observation weights. If FALSE, every observation is used with its weight.
  • mfinal: an integer, the number of boosting iterations (i.e. the number of trees). Defaults to mfinal = 100.
  • coeflearn: if 'Breiman' (the default), alpha = 1/2 * ln((1 - err)/err) is used; if 'Freund', alpha = ln((1 - err)/err). In both cases the AdaBoost.M1 algorithm is used and alpha is the weight-updating coefficient. If coeflearn is 'Zhu', the SAMME algorithm is used with alpha = ln((1 - err)/err) + ln(nclasses - 1). (A short numeric sketch follows this list.)
  • control: options that control details of the rpart algorithm; see rpart.control for more details.
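
To see how coeflearn changes the weight given to each weak learner, the three formulas above can be evaluated for an example error rate; an illustrative sketch with assumed values err = 0.3 and nclasses = 3 (not taken from the Vehicle example below):

err <- 0.3  # assumed weak-learner error rate
nclasses <- 3  # assumed number of classes (used only by 'Zhu')
c(Breiman = 0.5 * log((1 - err)/err),  # AdaBoost.M1, Breiman variant
  Freund = log((1 - err)/err),  # AdaBoost.M1, Freund variant
  Zhu = log((1 - err)/err) + log(nclasses - 1))  # SAMME
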
library(adabag)
data(iris)
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
iris.adaboost <- boosting(Species ~ ., data = iris, boos = TRUE, mfinal = 5)
# print(iris.adaboost)

## the multi-class Vehicle dataset
data(Vehicle)
str(Vehicle)
## 'data.frame':    846 obs. of  19 variables:
##  $ Comp        : num  95 91 104 93 85 107 97 90 86 93 ...
##  $ Circ        : num  48 41 50 41 44 57 43 43 34 44 ...
##  $ D.Circ      : num  83 84 106 82 70 106 73 66 62 98 ...
##  $ Rad.Ra      : num  178 141 209 159 205 172 173 157 140 197 ...
##  $ Pr.Axis.Ra  : num  72 57 66 63 103 50 65 65 61 62 ...
##  $ Max.L.Ra    : num  10 9 10 9 52 6 6 9 7 11 ...
##  $ Scat.Ra     : num  162 149 207 144 149 255 153 137 122 183 ...
##  $ Elong       : num  42 45 32 46 45 26 42 48 54 36 ...
##  $ Pr.Axis.Rect: num  20 19 23 19 19 28 19 18 17 22 ...
##  $ Max.L.Rect  : num  159 143 158 143 144 169 143 146 127 146 ...
##  $ Sc.Var.Maxis: num  176 170 223 160 241 280 176 162 141 202 ...
##  $ Sc.Var.maxis: num  379 330 635 309 325 957 361 281 223 505 ...
##  $ Ra.Gyr      : num  184 158 220 127 188 264 172 164 112 152 ...
##  $ Skew.Maxis  : num  70 72 73 63 127 85 66 67 64 64 ...
##  $ Skew.maxis  : num  6 9 14 6 9 5 13 3 2 4 ...
##  $ Kurt.maxis  : num  16 14 9 10 11 9 1 3 14 14 ...
##  $ Kurt.Maxis  : num  187 189 188 199 180 181 200 193 200 195 ...
##  $ Holl.Ra     : num  197 199 196 207 183 183 204 202 208 204 ...
##  $ Class       : Factor w/ 4 levels "bus","opel","saab",..: 4 4 3 4 1 1 1 4 4 3 ...
l <- length(Vehicle[, 1])  # number of observations in the dataset
sub <- sample(1:l, 2 * l/3)  # random indices for the training set
train <- Vehicle[sub, ]  # 2/3 of the data as the training set
test <- Vehicle[-sub, ]  # 1/3 of the data as the test set
mfinal <- 15  # number of boosting iterations
maxdepth <- 5  # maximum tree depth

# using a single classification tree:
Vehicle.rpart <- rpart(Class ~ ., data = train, maxdepth = maxdepth)  # train the classification tree


Vehicle.rpart.pred <- predict(Vehicle.rpart, newdata = test, type = "class")  # predict classes on the test set
tb <- table(Vehicle.rpart.pred, test$Class)  # confusion matrix
tb
##                   
## Vehicle.rpart.pred bus opel saab van
##               bus   62    5    9   3
##               opel   0    8    2   0
##               saab   4   61   49   8
##               van    0    9    3  59
error.rpart <- 1 - (sum(diag(tb))/sum(tb))  # test-set error rate
error.rpart
## [1] 0.3687943
# using AdaBoost:
Vehicle.adaboost <- boosting(Class ~ ., data = train, mfinal = mfinal, coeflearn = "Zhu", 
    control = rpart.control(maxdepth = maxdepth))  # train the model
Vehicle.adaboost.pred <- predict.boosting(Vehicle.adaboost, newdata = test)  # predict on the test set
Vehicle.adaboost.pred$confusion  # confusion matrix
##                Observed Class
## Predicted Class bus opel saab van
##            bus   64    1    1   1
##            opel   0   40   20   1
##            saab   1   42   41   6
##            van    1    0    1  62
Vehicle.adaboost.pred$error  # error rate
## [1] 0.2659574
# comparing error evolution in training and test set
evol.train <- errorevol(Vehicle.adaboost, newdata = train)
evol.test <- errorevol(Vehicle.adaboost, newdata = test)

plot.errorevol(evol.test, evol.train)