Common machine learning algorithms:

1. Linear Regression

2. Logistic Regression

3. Decision Trees

4. Support Vector Machines (SVM)

5. Naive Bayes

6. K-Nearest Neighbors (KNN)

7. K-Means

8. Random Forests

9. Dimensionality Reduction

10. Gradient Boosting and AdaBoost

1. Linear Regression

data(women)  # the built-in dataset women gives heights and weights for 15 women aged 30-39
head(women)
##   height weight
## 1     58    115
## 2     59    117
## 3     60    120
## 4     61    123
## 5     62    126
## 6     63    129
fit <- lm(formula = weight ~ height, data = women)  # fit the linear regression model
summary(fit)  # inspect the model
## 
## Call:
## lm(formula = weight ~ height, data = women)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7333 -1.1333 -0.3833  0.7417  3.1167 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -87.51667    5.93694  -14.74 1.71e-09 ***
## height        3.45000    0.09114   37.85 1.09e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared:  0.991,  Adjusted R-squared:  0.9903 
## F-statistic:  1433 on 1 and 13 DF,  p-value: 1.091e-14
fitted(fit)  # fitted values
##        1        2        3        4        5        6        7        8 
## 112.5833 116.0333 119.4833 122.9333 126.3833 129.8333 133.2833 136.7333 
##        9       10       11       12       13       14       15 
## 140.1833 143.6333 147.0833 150.5333 153.9833 157.4333 160.8833
residuals(fit)  # residuals
##           1           2           3           4           5           6 
##  2.41666667  0.96666667  0.51666667  0.06666667 -0.38333333 -0.83333333 
##           7           8           9          10          11          12 
## -1.28333333 -1.73333333 -1.18333333 -1.63333333 -1.08333333 -0.53333333 
##          13          14          15 
##  0.01666667  1.56666667  3.11666667
predict(fit, data.frame(height = c(55, 60, 61)))  # use the model to predict new data
##        1        2        3 
## 102.2333 119.4833 122.9333
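
A quick visual check of the fit: a minimal base-graphics sketch that draws the fitted line over the raw data, reusing the women data and the fit object from above.

plot(women$height, women$weight, xlab = "height (in)", ylab = "weight (lbs)")  # scatter plot of the raw data
abline(fit, col = "red")  # overlay the fitted regression line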

2. Logistic Regression

Logistic regression is suited to binary classification problems.

# install.packages('pbkrtest')
# install.packages('caret')
# install.packages('mlbench')
# install.packages('kernlab')
library(caret)  # load the caret package
## Loading required package: lattice
## Loading required package: ggplot2
library(mlbench)  # load mlbench to get the PimaIndiansDiabetes dataset
data(PimaIndiansDiabetes)  # load the dataset
str(PimaIndiansDiabetes)  # diabetes is a two-level factor; the other variables are numeric
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
head(PimaIndiansDiabetes)  # first 6 rows of the dataset
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg
fit <- glm(diabetes ~ ., data = PimaIndiansDiabetes, family = binomial(link = "logit"))  # train the model
summary(fit)  # inspect the model
## 
## Call:
## glm(formula = diabetes ~ ., family = binomial(link = "logit"), 
##     data = PimaIndiansDiabetes)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5566  -0.7274  -0.4159   0.7267   2.9297  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.4046964  0.7166359 -11.728  < 2e-16 ***
## pregnant     0.1231823  0.0320776   3.840 0.000123 ***
## glucose      0.0351637  0.0037087   9.481  < 2e-16 ***
## pressure    -0.0132955  0.0052336  -2.540 0.011072 *  
## triceps      0.0006190  0.0068994   0.090 0.928515    
## insulin     -0.0011917  0.0009012  -1.322 0.186065    
## mass         0.0897010  0.0150876   5.945 2.76e-09 ***
## pedigree     0.9451797  0.2991475   3.160 0.001580 ** 
## age          0.0148690  0.0093348   1.593 0.111192    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 723.45  on 759  degrees of freedom
## AIC: 741.45
## 
## Number of Fisher Scoring iterations: 5
probabilities <- predict(fit, PimaIndiansDiabetes[, 1:8], type = "response")  # predicted probabilities
head(probabilities)  # inspect the predictions
##          1          2          3          4          5          6 
## 0.72172655 0.04864161 0.79670208 0.04162486 0.90218390 0.14663156
predictions <- ifelse(probabilities > 0.5, "pos", "neg")  # label probabilities above 0.5 as pos, otherwise neg
head(predictions)
##     1     2     3     4     5     6 
## "pos" "neg" "pos" "neg" "pos" "neg"
# summarize accuracy
table(predictions, PimaIndiansDiabetes$diabetes)  # confusion matrix
##            
## predictions neg pos
##         neg 445 112
##         pos  55 156
confusionMatrix(predictions, PimaIndiansDiabetes$diabetes)  # caret function: confusion matrix plus accuracy, sensitivity and other statistics
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 445 112
##        pos  55 156
##                                           
##                Accuracy : 0.7826          
##                  95% CI : (0.7517, 0.8112)
##     No Information Rate : 0.651           
##     P-Value [Acc > NIR] : 1.373e-15       
##                                           
##                   Kappa : 0.4966          
##  Mcnemar's Test P-Value : 1.468e-05       
##                                           
##             Sensitivity : 0.8900          
##             Specificity : 0.5821          
##          Pos Pred Value : 0.7989          
##          Neg Pred Value : 0.7393          
##              Prevalence : 0.6510          
##          Detection Rate : 0.5794          
##    Detection Prevalence : 0.7253          
##       Balanced Accuracy : 0.7360          
##                                           
##        'Positive' Class : neg             
## 
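
For reference, the overall accuracy can be computed directly from the predicted labels; this one-line sketch reuses the predictions object above and should reproduce the Accuracy reported by confusionMatrix().

mean(predictions == PimaIndiansDiabetes$diabetes)  # proportion of correct predictions, about 0.78 here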

Use k-fold cross-validation to evaluate the model.

library(caret)
library(mlbench)  # for the dataset
data(PimaIndiansDiabetes)
set.seed(7)  # set the random seed so results are reproducible
control <- trainControl(method = "cv", number = 5)  # training-control parameters
fit.glm <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "glm", metric = "Accuracy", 
    preProc = c("center", "scale"), trControl = control)
# summarize fit
print(fit.glm)
## Generalized Linear Model 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## Pre-processing: centered (8), scaled (8) 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 614, 614, 615, 615, 614 
## Resampling results
## 
##   Accuracy   Kappa      Accuracy SD  Kappa SD 
##   0.7695442  0.4656824  0.02692468   0.0616666
## 
## 

3. Decision Trees

The rpart() function in the rpart package fits CART classification and regression tree models.

3.1 Classification tree model

The response is a factor; the predictors may be factors or continuous variables.

library(rpart)  # load the package
library(mlbench)  # for the dataset
data(PimaIndiansDiabetes)
str(PimaIndiansDiabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ pregnant: num  6 1 8 1 0 5 3 10 2 8 ...
##  $ glucose : num  148 85 183 89 137 116 78 115 197 125 ...
##  $ pressure: num  72 66 64 66 40 74 50 0 70 96 ...
##  $ triceps : num  35 29 0 23 35 0 32 0 45 0 ...
##  $ insulin : num  0 0 0 94 168 0 88 0 543 0 ...
##  $ mass    : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ pedigree: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ age     : num  50 31 32 21 33 30 26 29 53 54 ...
##  $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
fit <- rpart(diabetes ~ ., data = PimaIndiansDiabetes)  # train the model
print(fit)  # inspect the model
## n= 768 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 768 268 neg (0.65104167 0.34895833)  
##     2) glucose< 127.5 485  94 neg (0.80618557 0.19381443)  
##       4) age< 28.5 271  23 neg (0.91512915 0.08487085) *
##       5) age>=28.5 214  71 neg (0.66822430 0.33177570)  
##        10) mass< 26.35 41   2 neg (0.95121951 0.04878049) *
##        11) mass>=26.35 173  69 neg (0.60115607 0.39884393)  
##          22) glucose< 99.5 55  10 neg (0.81818182 0.18181818) *
##          23) glucose>=99.5 118  59 neg (0.50000000 0.50000000)  
##            46) pedigree< 0.561 84  34 neg (0.59523810 0.40476190)  
##              92) pedigree< 0.2 21   4 neg (0.80952381 0.19047619) *
##              93) pedigree>=0.2 63  30 neg (0.52380952 0.47619048)  
##               186) pregnant>=1.5 52  21 neg (0.59615385 0.40384615)  
##                 372) pressure>=67 40  12 neg (0.70000000 0.30000000) *
##                 373) pressure< 67 12   3 pos (0.25000000 0.75000000) *
##               187) pregnant< 1.5 11   2 pos (0.18181818 0.81818182) *
##            47) pedigree>=0.561 34   9 pos (0.26470588 0.73529412) *
##     3) glucose>=127.5 283 109 pos (0.38515901 0.61484099)  
##       6) mass< 29.95 76  24 neg (0.68421053 0.31578947)  
##        12) glucose< 145.5 41   6 neg (0.85365854 0.14634146) *
##        13) glucose>=145.5 35  17 pos (0.48571429 0.51428571)  
##          26) insulin< 14.5 21   8 neg (0.61904762 0.38095238) *
##          27) insulin>=14.5 14   4 pos (0.28571429 0.71428571) *
##       7) mass>=29.95 207  57 pos (0.27536232 0.72463768)  
##        14) glucose< 157.5 115  45 pos (0.39130435 0.60869565)  
##          28) age< 30.5 50  23 neg (0.54000000 0.46000000)  
##            56) pressure>=61 40  13 neg (0.67500000 0.32500000)  
##             112) mass< 41.8 31   7 neg (0.77419355 0.22580645) *
##             113) mass>=41.8 9   3 pos (0.33333333 0.66666667) *
##            57) pressure< 61 10   0 pos (0.00000000 1.00000000) *
##          29) age>=30.5 65  18 pos (0.27692308 0.72307692) *
##        15) glucose>=157.5 92  12 pos (0.13043478 0.86956522) *
predictions <- predict(fit, PimaIndiansDiabetes[, 1:8], type = "class")  # predict classes with the trained model
head(predictions)  # inspect the predicted classes
##   1   2   3   4   5   6 
## pos neg neg neg pos neg 
## Levels: neg pos
table(predictions, PimaIndiansDiabetes$diabetes)  # confusion matrix
##            
## predictions neg pos
##         neg 449  72
##         pos  51 196
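
The fitted tree can also be drawn. A minimal sketch with base graphics, reusing the fit object above (packages such as rpart.plot give prettier output):

plot(fit, uniform = TRUE, margin = 0.1)  # draw the tree structure
text(fit, use.n = TRUE, cex = 0.8)  # add split labels and node counts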

K-fold cross-validation

library(caret)
library(mlbench)
data(PimaIndiansDiabetes)
set.seed(7)
control <- trainControl(method = "cv", number = 10)
fit.rpart <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "rpart", 
    metric = "Accuracy", trControl = control)
print(fit.rpart)
## CART 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 691, 691, 692, 691, 691, 692, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa      Accuracy SD  Kappa SD  
##   0.01741294  0.7343131  0.3810595  0.05308756   0.11239089
##   0.10447761  0.7133459  0.3586950  0.04901214   0.09103908
##   0.24253731  0.6744190  0.1617716  0.03471308   0.17427370
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.01741294.

3.2 Regression tree model

The response is a continuous numeric variable.

library(rpart)
library(mlbench)
data(BostonHousing)
head(BostonHousing)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio      b
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12
##   lstat medv
## 1  4.98 24.0
## 2  9.14 21.6
## 3  4.03 34.7
## 4  2.94 33.4
## 5  5.33 36.2
## 6  5.21 28.7
str(BostonHousing)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : num  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ b      : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
fit <- rpart(medv ~ ., data = BostonHousing, control = rpart.control(minsplit = 5))
print(fit)
## n= 506 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 506 42716.3000 22.53281  
##    2) rm< 6.941 430 17317.3200 19.93372  
##      4) lstat>=14.4 175  3373.2510 14.95600  
##        8) crim>=6.99237 74  1085.9050 11.97838 *
##        9) crim< 6.99237 101  1150.5370 17.13762 *
##      5) lstat< 14.4 255  6632.2170 23.34980  
##       10) dis>=1.38485 250  3721.1630 22.90520  
##         20) rm< 6.543 195  1636.0670 21.62974 *
##         21) rm>=6.543 55   643.1691 27.42727 *
##       11) dis< 1.38485 5   390.7280 45.58000 *
##    3) rm>=6.941 76  6059.4190 37.23816  
##      6) rm< 7.437 46  1899.6120 32.11304  
##       12) crim>=7.393425 3    27.9200 14.40000 *
##       13) crim< 7.393425 43   864.7674 33.34884 *
##      7) rm>=7.437 30  1098.8500 45.09667  
##       14) ptratio>=18.3 3   223.8200 33.30000 *
##       15) ptratio< 18.3 27   411.1585 46.40741 *
predictions <- predict(fit, BostonHousing[, 1:13])
mse <- mean((BostonHousing$medv - predictions)^2)
print(mse)
## [1] 12.71556
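
For comparison with the cross-validated RMSE reported below, take the square root of this in-sample MSE; the in-sample value is typically more optimistic than the resampled one.

sqrt(mse)  # in-sample RMSE of the regression tree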

K-fold cross-validation:

library(caret)
library(mlbench)
# Load the dataset
data(BostonHousing)
# train
set.seed(7)
control <- trainControl(method = "cv", number = 2)
fit.rpart <- train(medv ~ ., data = BostonHousing, method = "rpart", metric = "RMSE", 
    trControl = control)
# summarize fit
print(fit.rpart)
## CART 
## 
## 506 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (2 fold) 
## Summary of sample sizes: 253, 253 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared   RMSE SD    Rsquared SD
##   0.07165784  5.983605  0.5774758  0.1385397  0.01654185 
##   0.17117244  7.173727  0.3939811  0.2713431  0.04677649 
##   0.45274420  7.173727  0.3939811  0.2713431  0.04677649 
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final value used for the model was cp = 0.07165784.

4. Support Vector Machines (SVM)

The ksvm() function in the kernlab package fits support vector machine models for both classification and regression problems.

4.1.1 Classification model

# install.packages('kernlab')  # install the package first if needed
library(kernlab)
library(mlbench)
data(PimaIndiansDiabetes)  # load the data
fit <- ksvm(diabetes ~ ., data = PimaIndiansDiabetes, kernel = "rbfdot")  # train the model
print(fit)  # inspect the model
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.111287269529721 
## 
## Number of Support Vectors : 431 
## 
## Objective Function Value : -356.4694 
## Training error : 0.175781
predictions <- predict(fit, PimaIndiansDiabetes[, 1:8], type = "response")  # predict with the model
table(predictions, PimaIndiansDiabetes$diabetes)  # confusion matrix
##            
## predictions neg pos
##         neg 462  97
##         pos  38 171
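
The confusion matrix above is computed on the training data, which is optimistic. A sketch of a simple hold-out evaluation, assuming an illustrative 70/30 random split (the cross-validation in the next subsection is the more systematic check):

set.seed(1)  # for reproducibility
idx <- sample(nrow(PimaIndiansDiabetes), floor(0.7 * nrow(PimaIndiansDiabetes)))  # 70% training indices
fit.train <- ksvm(diabetes ~ ., data = PimaIndiansDiabetes[idx, ], kernel = "rbfdot")  # fit on the training part
pred.test <- predict(fit.train, PimaIndiansDiabetes[-idx, 1:8])  # predict the held-out 30%
table(pred.test, PimaIndiansDiabetes$diabetes[-idx])  # confusion matrix on unseen data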

4.1.2 K-fold cross-validation

library(caret)
library(mlbench)
# Load the dataset
data(PimaIndiansDiabetes)
# train
set.seed(7)
control <- trainControl(method = "cv", number = 5)
fit.svmRadial <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "svmRadial", 
    metric = "Accuracy", trControl = control)
# summarize fit
print(fit.svmRadial)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 768 samples
##   8 predictor
##   2 classes: 'neg', 'pos' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 614, 614, 615, 615, 614 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa      Accuracy SD  Kappa SD  
##   0.25  0.7603684  0.4415912  0.02943632   0.06952557
##   0.50  0.7551906  0.4322310  0.01961619   0.05163166
##   1.00  0.7604278  0.4458565  0.01037865   0.02321585
## 
## Tuning parameter 'sigma' was held constant at a value of 0.1178216
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were sigma = 0.1178216 and C = 1.

4.1.3 Multi-class classification

# classify the iris dataset (Species has three levels, so this is a multi-class problem):
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
irismodel <- ksvm(Species ~ ., data = iris, type = "C-bsvc", kernel = "rbfdot", 
    kpar = list(sigma = 0.1), C = 10, prob.model = TRUE)
irismodel
## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-bsvc  (classification) 
##  parameter : cost C = 10 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.1 
## 
## Number of Support Vectors : 32 
## 
## Objective Function Value : -5.8442 -3.0652 -136.9786 
## Training error : 0.02 
## Probability model included.
# make predictions:
predict(irismodel, iris[c(3, 10, 56, 68, 107, 120), -5], type = "probabilities")
##           setosa  versicolor   virginica
## [1,] 0.984456890 0.008914961 0.006628150
## [2,] 0.980856710 0.012152543 0.006990747
## [3,] 0.005483066 0.953760778 0.040756155
## [4,] 0.009979338 0.987273933 0.002746729
## [5,] 0.013356347 0.081680756 0.904962897
## [6,] 0.012076805 0.162322667 0.825600528
predict(irismodel, iris[c(3, 10, 56, 68, 107, 120), -5])
## [1] setosa     setosa     versicolor versicolor virginica  virginica 
## Levels: setosa versicolor virginica
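
Applying irismodel to all 150 observations gives the full confusion matrix (a short sketch; note this is again an in-sample evaluation):

table(predict(irismodel, iris[, -5]), iris$Species)  # rows: predicted class, columns: observed class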

4.2 Regression model

library(kernlab)
library(mlbench)
data(BostonHousing)
fit <- ksvm(medv ~ ., BostonHousing, kernel = "rbfdot")
print(fit)
## Support Vector Machine object of class "ksvm" 
## 
## SV type: eps-svr  (regression) 
##  parameter : epsilon = 0.1  cost C = 1 
## 
## Gaussian Radial Basis kernel function. 
##  Hyperparameter : sigma =  0.106113556914206 
## 
## Number of Support Vectors : 334 
## 
## Objective Function Value : -75.2136 
## Training error : 0.090716
predictions <- predict(fit, BostonHousing)
mse <- mean((BostonHousing$medv - predictions)^2)
print(mse)
## [1] 7.673373

K-fold cross-validation

library(caret)
library(mlbench)
data(BostonHousing)
set.seed(7)
control <- trainControl(method = "cv", number = 5)
fit.svmRadial <- train(medv ~ ., data = BostonHousing, method = "svmRadial", 
    metric = "RMSE", trControl = control)
# summarize fit
print(fit.svmRadial)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 506 samples
##  13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 405, 405, 403, 405, 406 
## Resampling results across tuning parameters:
## 
##   C     RMSE      Rsquared   RMSE SD   Rsquared SD
##   0.25  4.818004  0.7564430  1.171043  0.09975117 
##   0.50  4.279601  0.7985729  1.201454  0.09310021 
##   1.00  3.820843  0.8337146  1.162899  0.08297657 
## 
## Tuning parameter 'sigma' was held constant at a value of 0.114971
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were sigma = 0.114971 and C = 1.

5. Naive Bayes

5.1 Discrete (categorical) predictors

library(e1071)
data(HouseVotes84, package = "mlbench")
head(HouseVotes84)
##        Class   V1 V2 V3   V4   V5 V6 V7 V8 V9 V10  V11  V12 V13 V14 V15
## 1 republican    n  y  n    y    y  y  n  n  n   y <NA>    y   y   y   n
## 2 republican    n  y  n    y    y  y  n  n  n   n    n    y   y   y   n
## 3   democrat <NA>  y  y <NA>    y  y  n  n  n   n    y    n   y   y   n
## 4   democrat    n  y  y    n <NA>  y  n  n  n   n    y    n   y   n   n
## 5   democrat    y  y  y    n    y  y  n  n  n   n    y <NA>   y   y   y
## 6   democrat    n  y  y    n    y  y  n  n  n   n    n    n   y   y   y
##    V16
## 1    y
## 2 <NA>
## 3    n
## 4    y
## 5    y
## 6    y
str(HouseVotes84)
## 'data.frame':    435 obs. of  17 variables:
##  $ Class: Factor w/ 2 levels "democrat","republican": 2 2 1 1 1 1 1 2 2 1 ...
##  $ V1   : Factor w/ 2 levels "n","y": 1 1 NA 1 2 1 1 1 1 2 ...
##  $ V2   : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ V3   : Factor w/ 2 levels "n","y": 1 1 2 2 2 2 1 1 1 2 ...
##  $ V4   : Factor w/ 2 levels "n","y": 2 2 NA 1 1 1 2 2 2 1 ...
##  $ V5   : Factor w/ 2 levels "n","y": 2 2 2 NA 2 2 2 2 2 1 ...
##  $ V6   : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 1 ...
##  $ V7   : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 2 ...
##  $ V8   : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 2 ...
##  $ V9   : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 2 ...
##  $ V10  : Factor w/ 2 levels "n","y": 2 1 1 1 1 1 1 1 1 1 ...
##  $ V11  : Factor w/ 2 levels "n","y": NA 1 2 2 2 1 1 1 1 1 ...
##  $ V12  : Factor w/ 2 levels "n","y": 2 2 1 1 NA 1 1 1 2 1 ...
##  $ V13  : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 NA 2 2 1 ...
##  $ V14  : Factor w/ 2 levels "n","y": 2 2 2 1 2 2 2 2 2 1 ...
##  $ V15  : Factor w/ 2 levels "n","y": 1 1 1 1 2 2 2 NA 1 NA ...
##  $ V16  : Factor w/ 2 levels "n","y": 2 NA 1 2 2 2 2 2 2 NA ...
model <- naiveBayes(Class ~ ., data = HouseVotes84)  # train the model
predict(model, HouseVotes84[1:10, ])  # predicted classes for the first 10 observations
##  [1] republican republican republican democrat   democrat   democrat  
##  [7] republican republican republican democrat  
## Levels: democrat republican
predict(model, HouseVotes84[1:10, ], type = "raw")  # class probabilities for the first 10 observations
##           democrat   republican
##  [1,] 1.029209e-07 9.999999e-01
##  [2,] 5.820415e-08 9.999999e-01
##  [3,] 5.684937e-03 9.943151e-01
##  [4,] 9.985798e-01 1.420152e-03
##  [5,] 9.666720e-01 3.332802e-02
##  [6,] 8.121430e-01 1.878570e-01
##  [7,] 1.751512e-04 9.998248e-01
##  [8,] 8.300100e-06 9.999917e-01
##  [9,] 8.277705e-08 9.999999e-01
## [10,] 1.000000e+00 5.029425e-11
pred <- predict(model, HouseVotes84)  # predicted classes on the training set
table(pred, HouseVotes84$Class)  # confusion matrix
##             
## pred         democrat republican
##   democrat        238         13
##   republican       29        155
## using Laplace smoothing:
model <- naiveBayes(Class ~ ., data = HouseVotes84, laplace = 3)
pred <- predict(model, HouseVotes84[, -1])
table(pred, HouseVotes84$Class)
##             
## pred         democrat republican
##   democrat        237         12
##   republican       30        156

5.2 Continuous (numeric) predictors

## Example with metric predictors:
data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
m <- naiveBayes(Species ~ ., data = iris)
## alternatively:
m <- naiveBayes(iris[, -5], iris[, 5])
m
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = iris[, -5], y = iris[, 5])
## 
## A-priori probabilities:
## iris[, 5]
##     setosa versicolor  virginica 
##  0.3333333  0.3333333  0.3333333 
## 
## Conditional probabilities:
##             Sepal.Length
## iris[, 5]     [,1]      [,2]
##   setosa     5.006 0.3524897
##   versicolor 5.936 0.5161711
##   virginica  6.588 0.6358796
## 
##             Sepal.Width
## iris[, 5]     [,1]      [,2]
##   setosa     3.428 0.3790644
##   versicolor 2.770 0.3137983
##   virginica  2.974 0.3224966
## 
##             Petal.Length
## iris[, 5]     [,1]      [,2]
##   setosa     1.462 0.1736640
##   versicolor 4.260 0.4699110
##   virginica  5.552 0.5518947
## 
##             Petal.Width
## iris[, 5]     [,1]      [,2]
##   setosa     0.246 0.1053856
##   versicolor 1.326 0.1977527
##   virginica  2.026 0.2746501
table(predict(m, iris), iris[, 5])
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47         3
##   virginica       0          3        47

5.3 Example using a contingency table

data(Titanic)
str(Titanic)
##  table [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
##  - attr(*, "dimnames")=List of 4
##   ..$ Class   : chr [1:4] "1st" "2nd" "3rd" "Crew"
##   ..$ Sex     : chr [1:2] "Male" "Female"
##   ..$ Age     : chr [1:2] "Child" "Adult"
##   ..$ Survived: chr [1:2] "No" "Yes"
Titanic
## , , Age = Child, Survived = No
## 
##       Sex
## Class  Male Female
##   1st     0      0
##   2nd     0      0
##   3rd    35     17
##   Crew    0      0
## 
## , , Age = Adult, Survived = No
## 
##       Sex
## Class  Male Female
##   1st   118      4
##   2nd   154     13
##   3rd   387     89
##   Crew  670      3
## 
## , , Age = Child, Survived = Yes
## 
##       Sex
## Class  Male Female
##   1st     5      1
##   2nd    11     13
##   3rd    13     14
##   Crew    0      0
## 
## , , Age = Adult, Survived = Yes
## 
##       Sex
## Class  Male Female
##   1st    57    140
##   2nd    14     80
##   3rd    75     76
##   Crew  192     20
m <- naiveBayes(Survived ~ ., data = Titanic)
m
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.formula(formula = Survived ~ ., data = Titanic)
## 
## A-priori probabilities:
## Survived
##       No      Yes 
## 0.676965 0.323035 
## 
## Conditional probabilities:
##         Class
## Survived        1st        2nd        3rd       Crew
##      No  0.08187919 0.11208054 0.35436242 0.45167785
##      Yes 0.28551336 0.16596343 0.25035162 0.29817159
## 
##         Sex
## Survived       Male     Female
##      No  0.91543624 0.08456376
##      Yes 0.51617440 0.48382560
## 
##         Age
## Survived      Child      Adult
##      No  0.03489933 0.96510067
##      Yes 0.08016878 0.91983122
predict(m, as.data.frame(Titanic))
##  [1] Yes No  No  No  Yes Yes Yes Yes No  No  No  No  Yes Yes Yes Yes Yes
## [18] No  No  No  Yes Yes Yes Yes No  No  No  No  Yes Yes Yes Yes
## Levels: No Yes

6. KNN (K-Nearest Neighbors)

Algorithm notes:

Advantages:

(i) Instance-based learning: no model has to be built or maintained from the data;

(ii) It can produce decision boundaries of arbitrary shape, whereas decision trees and rule-based classifiers are limited to piecewise-linear boundaries (hyperplanes in high dimensions).

Disadvantages:

(i) Classifying a test example is expensive, O(n) where n is the size of the training set;

(ii) Predictions rely only on local information, so with a small k the method is sensitive to noisy data, unlike model-based classifiers that use the whole dataset (a short sketch after the example below compares a few values of k).

# install.packages('class')  # install the package first if needed
library(class)  # load the package
train <- rbind(iris3[1:25, , 1], iris3[1:25, , 2], iris3[1:25, , 3])
test <- rbind(iris3[26:50, , 1], iris3[26:50, , 2], iris3[26:50, , 3])
cl <- factor(c(rep("s", 25), rep("c", 25), rep("v", 25)))
iris.predict <- knn(train, test, cl, k = 3, prob = TRUE)
iris.predict
##  [1] s s s s s s s s s s s s s s s s s s s s s s s s s c c v c c c c c v c
## [36] c c c c c c c c c c c c c c c v c c v v v v v v v v v v c v v v v v v
## [71] v v v v v
## attr(,"prob")
##  [1] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##  [8] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [22] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.6666667
## [29] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.6666667 1.0000000
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [43] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [50] 1.0000000 1.0000000 0.6666667 0.7500000 1.0000000 1.0000000 1.0000000
## [57] 1.0000000 1.0000000 0.5000000 1.0000000 1.0000000 1.0000000 1.0000000
## [64] 0.6666667 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [71] 1.0000000 0.6666667 1.0000000 1.0000000 0.6666667
## Levels: c s v
table(cl, iris.predict)  # confusion matrix
##    iris.predict
## cl   c  s  v
##   c 23  0  2
##   s  0 25  0
##   v  3  0 22
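
As noted in the disadvantages above, the result depends on k. A minimal sketch that reuses train, test and cl to compare a few values (by construction the true labels of this test set are the same as cl):

for (k in c(1, 3, 5, 7)) {
    acc <- mean(knn(train, test, cl, k = k) == cl)  # accuracy on the test set
    cat("k =", k, "accuracy =", round(acc, 3), "\n")
}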

7. K-Means (k-means)

x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
           matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(x) <- c("x", "y")
head(x)
##               x            y
## [1,]  0.2417803 -0.362296912
## [2,] -0.2492418 -0.007919125
## [3,] -0.3955914  0.002895658
## [4,]  0.1265264 -0.045113657
## [5,]  0.1152272  0.469655065
## [6,]  0.1493029  0.017974243
(cl <- kmeans(x, 2))
## K-means clustering with 2 clusters of sizes 50, 50
## 
## Cluster means:
##            x          y
## 1 -0.0291091 0.05759471
## 2  1.0035628 0.98531780
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 8.069328 9.368503
##  (between_SS / total_SS =  73.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
cl$centers
##            x          y
## 1 -0.0291091 0.05759471
## 2  1.0035628 0.98531780
plot(x, col = cl$cluster)
points(cl$centers, col = 1:2, pch = 8, cex = 2)

# sum of squares
ss <- function(x) sum(scale(x, scale = FALSE)^2)

## cluster centers "fitted" to each obs.:
fitted.x <- fitted(cl);  head(fitted.x)
##            x          y
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
resid.x <- x - fitted(cl)

## Equalities : ----------------------------------
cbind(cl[c("betweenss", "tot.withinss", "totss")], # the same two columns
         c(ss(fitted.x), ss(resid.x),    ss(x)))
##              [,1]     [,2]    
## betweenss    48.17703 48.17703
## tot.withinss 17.43783 17.43783
## totss        65.61486 65.61486
stopifnot(all.equal(cl$ totss,        ss(x)),
      all.equal(cl$ tot.withinss, ss(resid.x)),
      ## these three are the same:
      all.equal(cl$ betweenss,    ss(fitted.x)),
      all.equal(cl$ betweenss, cl$totss - cl$tot.withinss),
      ## and hence also
      all.equal(ss(x), ss(fitted.x) + ss(resid.x))
      )

kmeans(x,1)$withinss # trivial one-cluster, (its W.SS == ss(x))
## [1] 65.61486
## random starts do help here with too many clusters
## (and are often recommended anyway!):
(cl <- kmeans(x, 5, nstart = 25))
## K-means clustering with 5 clusters of sizes 13, 18, 24, 19, 26
## 
## Cluster means:
##             x          y
## 1 -0.36210808  0.1267287
## 2  0.14055856  0.3248003
## 3  1.05190114  1.2975444
## 4  0.03799452 -0.2428496
## 5  0.95894273  0.6971086
## 
## Clustering vector:
##   [1] 4 1 1 4 2 4 1 2 2 1 1 1 4 4 4 4 4 4 1 1 2 1 4 2 2 2 2 4 1 2 2 2 4 4 4
##  [36] 4 2 1 2 4 4 4 4 2 1 2 1 2 5 2 3 3 3 3 5 5 5 5 5 5 3 5 5 5 5 5 5 5 3 5
##  [71] 3 3 5 2 5 3 3 3 3 3 5 3 3 3 3 5 5 3 3 5 3 3 5 3 3 5 5 3 5 5
## 
## Within cluster sum of squares by cluster:
## [1] 0.7133834 1.1859169 2.4012601 1.0623752 2.3600708
##  (between_SS / total_SS =  88.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
plot(x, col = cl$cluster)
points(cl$centers, col = 1:5, pch = 8)

8. Random Forests

library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# classification:
data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
set.seed(71)
iris.rf <- randomForest(Species ~ ., data = iris, importance = TRUE, proximity = TRUE)

print(iris.rf)
## 
## Call:
##  randomForest(formula = Species ~ ., data = iris, importance = TRUE,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 5.33%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         50          0         0        0.00
## versicolor      0         46         4        0.08
## virginica       0          4        46        0.08
round(importance(iris.rf), 2)
##              setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length   6.04       7.85      7.93                11.51
## Sepal.Width    4.40       1.03      5.44                 5.40
## Petal.Length  21.76      31.33     29.64                32.94
## Petal.Width   22.84      32.67     31.68                34.50
##              MeanDecreaseGini
## Sepal.Length             8.77
## Sepal.Width              2.19
## Petal.Length            42.54
## Petal.Width             45.77
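# the same importance scores can be visualized with the package's helper (a quick sketch):
varImpPlot(iris.rf)  # dot charts of MeanDecreaseAccuracy and MeanDecreaseGini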
# regression:
data(airquality)
head(airquality)
##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6
set.seed(131)
ozone.rf <- randomForest(Ozone ~ ., data = airquality, mtry = 3, importance = TRUE, 
    na.action = na.omit)
print(ozone.rf)
## 
## Call:
##  randomForest(formula = Ozone ~ ., data = airquality, mtry = 3,      importance = TRUE, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 303.8304
##                     % Var explained: 72.31
## Show 'importance' of variables: higher values mean more important:
round(importance(ozone.rf), 2)
##         %IncMSE IncNodePurity
## Solar.R   11.09      10534.24
## Wind      23.50      43833.13
## Temp      42.03      55218.05
## Month      4.07       2032.65
## Day        2.63       7173.19

9. Dimensionality Reduction

head(USArrests)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
str(USArrests)
## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
pca <- princomp(USArrests, cor = TRUE)
print(pca)
## Call:
## princomp(x = USArrests, cor = TRUE)
## 
## Standard deviations:
##    Comp.1    Comp.2    Comp.3    Comp.4 
## 1.5748783 0.9948694 0.5971291 0.4164494 
## 
##  4  variables and  50 observations.
train_reduced <- predict(pca, USArrests)
head(train_reduced)
##                Comp.1     Comp.2      Comp.3       Comp.4
## Alabama    -0.9855659  1.1333924 -0.44426879  0.156267145
## Alaska     -1.9501378  1.0732133  2.04000333 -0.438583440
## Arizona    -1.7631635 -0.7459568  0.05478082 -0.834652924
## Arkansas    0.1414203  1.1197968  0.11457369 -0.182810896
## California -2.5239801 -1.5429340  0.59855680 -0.341996478
## Colorado   -1.5145629 -0.9875551  1.09500699  0.001464887
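
To decide how many components to keep, look at the proportion of variance each component explains; a short sketch on the pca object above:

summary(pca)  # standard deviations and proportion of variance for each component
screeplot(pca, type = "lines")  # scree plot of the component variances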

10. Gradient Boosting and AdaBoost

There are many boosting algorithms, such as Gradient Boosting, XGBoost, AdaBoost and Gentle Boost. Bagging draws random resamples of the data, fits a learner to each, and combines the results by simple averaging (or voting) to reach the final prediction. Boosting is similar, but is smarter about how it selects and weights the samples: as the algorithm proceeds, observations that are hard to classify receive ever larger weights.

10.1 The GBM algorithm

In R, the gbm package implements boosting. It uses decision trees as the base learners; the key parameters are:

  • The form of the loss function (distribution)
  • The number of iterations (n.trees)
  • The learning rate (shrinkage)
  • The resampling fraction (bag.fraction)
  • The depth of each tree (interaction.depth)

The loss function is easy to choose: classification problems generally use the Bernoulli distribution, while regression problems can use the Gaussian distribution. For the learning rate, smaller is generally better, but smaller steps mean more steps: the number of iterations must grow for the model to reach its optimum, which increases training time and computation. A common rule of thumb is to set shrinkage between 0.01 and 0.001 and n.trees between 3000 and 10000.

library(gbm)
## Loading required package: survival
## Loading required package: splines
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: parallel
## Loaded gbm 2.1.1
# construct a simulated dataset
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- ordered(sample(letters[1:4],N,replace=TRUE),levels=letters[4:1])
X4 <- factor(sample(letters[1:6],N,replace=TRUE))
X5 <- factor(sample(letters[1:3],N,replace=TRUE))
X6 <- 3*runif(N) 
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# introduce some missing values
X1[sample(1:N,size=500)] <- NA
X4[sample(1:N,size=300)] <- NA
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
head(data)
##           Y       X1        X2 X3   X4 X5        X6
## 1 2.6177200 0.811623 0.0756076  b    e  a 0.4734696
## 2 0.3321644       NA 0.3012093  d <NA>  b 2.4982320
## 3 1.9411223       NA 0.9489865  c <NA>  b 1.9155245
## 4 0.6659258       NA 0.5874069  d <NA>  b 1.5870628
## 5 5.3654999       NA 1.4601941  a <NA>  c 2.2784020
## 6 3.2528407       NA 1.7593645  d    a  c 2.3502413
# train the model with gbm()
gbm1 <-
gbm(Y~X1+X2+X3+X4+X5+X6,         # formula
    data=data,                   # dataset
    var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease,
                                 # +1: monotone increase,
                                 #  0: no monotone restrictions
    distribution="gaussian",     # see the help for other choices
    n.trees=1000,                # number of trees
    shrinkage=0.05,              # shrinkage or learning rate,
                                 # 0.001 to 0.1 usually work
    interaction.depth=3,         # 1: additive model, 2: two-way interactions, etc.
    bag.fraction = 0.5,          # subsampling fraction, 0.5 is probably best
    train.fraction = 0.5,        # fraction of data for training,
                                 # first train.fraction*N used for training
    n.minobsinnode = 10,         # minimum total weight needed in each node
    cv.folds = 3,                # do 3-fold cross-validation
    keep.data=TRUE,              # keep a copy of the dataset with the object
    verbose=FALSE,               # don't print out progress
    n.cores=1)                   # use only a single core (detecting #cores is
                                 # error-prone, so avoided here)
# check performance using an out-of-bag estimator
# OOB underestimates the optimal number of iterations
best.iter <- gbm.perf(gbm1,method="OOB")

print(best.iter)
## [1] 92
# check performance using a 50% heldout test set
best.iter <- gbm.perf(gbm1,method="test")

print(best.iter)
## [1] 115
# check performance using 3-fold cross-validation
best.iter <- gbm.perf(gbm1,method="cv")

print(best.iter)
## [1] 122
# plot the performance # plot variable influence
summary(gbm1,n.trees=1)         # based on the first tree

##    var  rel.inf
## X3  X3 89.79752
## X2  X2 10.20248
## X1  X1  0.00000
## X4  X4  0.00000
## X5  X5  0.00000
## X6  X6  0.00000
summary(gbm1,n.trees=best.iter) # based on the estimated best number of trees

##    var     rel.inf
## X3  X3 65.98348516
## X2  X2 27.96046528
## X1  X1  4.68887674
## X4  X4  0.75854559
## X6  X6  0.56625428
## X5  X5  0.04237296
# compactly print the first and last trees for curiosity
print(pretty.gbm.tree(gbm1,1))
##   SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction
## 0        2    1.50000000        1         5           9      228.91593
## 1        2    0.50000000        2         3           4       41.36040
## 2       -1   -0.07306434       -1        -1          -1        0.00000
## 3       -1   -0.01706217       -1        -1          -1        0.00000
## 4       -1   -0.04421474       -1        -1          -1        0.00000
## 5        1    0.99442576        6         7           8       30.70785
## 6       -1    0.02654803       -1        -1          -1        0.00000
## 7       -1    0.07756869       -1        -1          -1        0.00000
## 8       -1    0.05162598       -1        -1          -1        0.00000
## 9       -1    0.00102208       -1        -1          -1        0.00000
##   Weight  Prediction
## 0    250  0.00102208
## 1    132 -0.04421474
## 2     64 -0.07306434
## 3     68 -0.01706217
## 4    132 -0.04421474
## 5    118  0.05162598
## 6     60  0.02654803
## 7     58  0.07756869
## 8    118  0.05162598
## 9    250  0.00102208
print(pretty.gbm.tree(gbm1,gbm1$n.trees))
##   SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction
## 0        1  1.772046e+00        1         8           9      0.2062632
## 1        2  1.500000e+00        2         3           7      0.1578085
## 2       -1  4.121006e-05       -1        -1          -1      0.0000000
## 3        1  1.075647e+00        4         5           6      0.3111820
## 4       -1  4.721652e-03       -1        -1          -1      0.0000000
## 5       -1 -7.636156e-04       -1        -1          -1      0.0000000
## 6       -1  2.695562e-03       -1        -1          -1      0.0000000
## 7       -1  1.356536e-03       -1        -1          -1      0.0000000
## 8       -1 -3.348254e-03       -1        -1          -1      0.0000000
## 9       -1  8.672380e-04       -1        -1          -1      0.0000000
##   Weight    Prediction
## 0    250  8.672380e-04
## 1    224  1.356536e-03
## 2    113  4.121006e-05
## 3    111  2.695562e-03
## 4     70  4.721652e-03
## 5     41 -7.636156e-04
## 6    111  2.695562e-03
## 7    224  1.356536e-03
## 8     26 -3.348254e-03
## 9    250  8.672380e-04
# make some new data
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- ordered(sample(letters[1:4],N,replace=TRUE))
X4 <- factor(sample(letters[1:6],N,replace=TRUE))
X5 <- factor(sample(letters[1:3],N,replace=TRUE))
X6 <- 3*runif(N) 
mu <- c(-1,0,1,2)[as.numeric(X3)]
 
Y <- X1**1.5 + 2 * (X2**.5) + mu + rnorm(N,0,sigma)
 
data2 <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
 
# predict on the new data using "best" number of trees
# f.predict generally will be on the canonical scale (logit,log,etc.)
f.predict <- predict(gbm1,data2,best.iter)
 
# least squares error
print(sum((data2$Y-f.predict)^2))
## [1] 5023.52
# create marginal plots
# plot variable X1,X2,X3 after "best" iterations
par(mfrow=c(1,3))
plot(gbm1,1,best.iter)
plot(gbm1,2,best.iter)
plot(gbm1,3,best.iter)

par(mfrow=c(1,1))
# contour plot of variables 1 and 2 after "best" iterations
plot(gbm1,1:2,best.iter)

# lattice plot of variables 2 and 3
plot(gbm1,2:3,best.iter)

# lattice plot of variables 3 and 4
plot(gbm1,3:4,best.iter)

# 3-way plots
plot(gbm1,c(1,2,6),best.iter,cont=20)

plot(gbm1,1:3,best.iter)

plot(gbm1,2:4,best.iter)

plot(gbm1,3:5,best.iter)

# do another 100 iterations
gbm2 <- gbm.more(gbm1,100,
                 verbose=FALSE) # stop printing detailed progress

10.2 The AdaBoost algorithm

AdaBoost's adaptivity comes from using the samples misclassified by the previous classifier to train the next one. AdaBoost is sensitive to noisy data and outliers, but on many problems it is less prone to overfitting than most other learning algorithms.

The classifiers used inside AdaBoost may be weak (i.e. have a substantial error rate); as long as they perform slightly better than random guessing (e.g. an error rate just below 0.5 on a two-class problem), they improve the final model. Even weak classifiers whose error rate is worse than random are useful, because they can be given negative coefficients in the final linear combination of classifiers and thereby still improve its performance.

Usage of the boosting() function: boosting(formula, data, boos = TRUE, mfinal = 100, coeflearn = 'Breiman', control)

  • formula: a model formula
  • data: a data frame containing the variables named in the formula
  • boos: logical; if TRUE (the default), a bootstrap sample of the training set is drawn on each iteration using the current observation weights. If FALSE, every observation is used with its weight.
  • mfinal: an integer, the number of boosting iterations (i.e. the number of trees). Defaults to mfinal = 100.
  • coeflearn: if 'Breiman' (the default), alpha = 1/2 * ln((1 - err)/err) is used; if 'Freund', alpha = ln((1 - err)/err). In both cases the AdaBoost.M1 algorithm is used and alpha is the weight-updating coefficient. If coeflearn is 'Zhu', the SAMME algorithm is used with alpha = ln((1 - err)/err) + ln(nclasses - 1). (A short numeric sketch follows this list.)
  • control: options that control details of the rpart algorithm; see rpart.control for more details.
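
To see how coeflearn changes the weight given to each weak learner, the three formulas above can be evaluated for an example error rate; an illustrative sketch with assumed values err = 0.3 and nclasses = 3 (not taken from the Vehicle example below):

err <- 0.3  # assumed weak-learner error rate
nclasses <- 3  # assumed number of classes (used only by 'Zhu')
c(Breiman = 0.5 * log((1 - err)/err),  # AdaBoost.M1, Breiman variant
  Freund = log((1 - err)/err),  # AdaBoost.M1, Freund variant
  Zhu = log((1 - err)/err) + log(nclasses - 1))  # SAMME
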
library(adabag)
data(iris)
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
iris.adaboost <- boosting(Species ~ ., data = iris, boos = TRUE, mfinal = 5)
# print(iris.adaboost)

## the multi-class Vehicle dataset
data(Vehicle)
str(Vehicle)
## 'data.frame':    846 obs. of  19 variables:
##  $ Comp        : num  95 91 104 93 85 107 97 90 86 93 ...
##  $ Circ        : num  48 41 50 41 44 57 43 43 34 44 ...
##  $ D.Circ      : num  83 84 106 82 70 106 73 66 62 98 ...
##  $ Rad.Ra      : num  178 141 209 159 205 172 173 157 140 197 ...
##  $ Pr.Axis.Ra  : num  72 57 66 63 103 50 65 65 61 62 ...
##  $ Max.L.Ra    : num  10 9 10 9 52 6 6 9 7 11 ...
##  $ Scat.Ra     : num  162 149 207 144 149 255 153 137 122 183 ...
##  $ Elong       : num  42 45 32 46 45 26 42 48 54 36 ...
##  $ Pr.Axis.Rect: num  20 19 23 19 19 28 19 18 17 22 ...
##  $ Max.L.Rect  : num  159 143 158 143 144 169 143 146 127 146 ...
##  $ Sc.Var.Maxis: num  176 170 223 160 241 280 176 162 141 202 ...
##  $ Sc.Var.maxis: num  379 330 635 309 325 957 361 281 223 505 ...
##  $ Ra.Gyr      : num  184 158 220 127 188 264 172 164 112 152 ...
##  $ Skew.Maxis  : num  70 72 73 63 127 85 66 67 64 64 ...
##  $ Skew.maxis  : num  6 9 14 6 9 5 13 3 2 4 ...
##  $ Kurt.maxis  : num  16 14 9 10 11 9 1 3 14 14 ...
##  $ Kurt.Maxis  : num  187 189 188 199 180 181 200 193 200 195 ...
##  $ Holl.Ra     : num  197 199 196 207 183 183 204 202 208 204 ...
##  $ Class       : Factor w/ 4 levels "bus","opel","saab",..: 4 4 3 4 1 1 1 4 4 3 ...
l <- length(Vehicle[, 1])  # number of observations in the dataset
sub <- sample(1:l, 2 * l/3)  # random indices for the training set
train <- Vehicle[sub, ]  # 2/3 of the data as the training set
test <- Vehicle[-sub, ]  # 1/3 of the data as the test set
mfinal <- 15  # number of boosting iterations
maxdepth <- 5  # maximum tree depth

# using a single classification tree:
Vehicle.rpart <- rpart(Class ~ ., data = train, maxdepth = maxdepth)  # train the classification tree


Vehicle.rpart.pred <- predict(Vehicle.rpart, newdata = test, type = "class")  # predict classes on the test set
tb <- table(Vehicle.rpart.pred, test$Class)  # confusion matrix
tb
##                   
## Vehicle.rpart.pred bus opel saab van
##               bus   62    5    9   3
##               opel   0    8    2   0
##               saab   4   61   49   8
##               van    0    9    3  59
error.rpart <- 1 - (sum(diag(tb))/sum(tb))  # test-set error rate
error.rpart
## [1] 0.3687943
# using AdaBoost:
Vehicle.adaboost <- boosting(Class ~ ., data = train, mfinal = mfinal, coeflearn = "Zhu", 
    control = rpart.control(maxdepth = maxdepth))  # train the model
Vehicle.adaboost.pred <- predict.boosting(Vehicle.adaboost, newdata = test)  # predict on the test set
Vehicle.adaboost.pred$confusion  # confusion matrix
##                Observed Class
## Predicted Class bus opel saab van
##            bus   64    1    1   1
##            opel   0   40   20   1
##            saab   1   42   41   6
##            van    1    0    1  62
Vehicle.adaboost.pred$error  # error rate
## [1] 0.2659574
# comparing error evolution in training and test set
evol.train <- errorevol(Vehicle.adaboost, newdata = train)
evol.test <- errorevol(Vehicle.adaboost, newdata = test)

plot.errorevol(evol.test, evol.train)