Common machine learning algorithms:
1. Linear regression
2. Logistic regression
3. Decision trees
4. Support vector machines (SVM)
5. Naive Bayes
6. K-nearest neighbours (KNN)
7. K-means clustering
8. Random forests
9. Dimensionality reduction
10. Gradient Boosting and AdaBoost
data(women) # the women dataset in base R gives the heights and weights of 15 women aged 30-39
head(women)
## height weight
## 1 58 115
## 2 59 117
## 3 60 120
## 4 61 123
## 5 62 126
## 6 63 129
fit <- lm(formula = weight ~ height, data = women) # fit a linear regression model
summary(fit) # inspect the fitted model
##
## Call:
## lm(formula = weight ~ height, data = women)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7333 -1.1333 -0.3833 0.7417 3.1167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.51667 5.93694 -14.74 1.71e-09 ***
## height 3.45000 0.09114 37.85 1.09e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared: 0.991, Adjusted R-squared: 0.9903
## F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14
fitted(fit) # fitted values
## 1 2 3 4 5 6 7 8
## 112.5833 116.0333 119.4833 122.9333 126.3833 129.8333 133.2833 136.7333
## 9 10 11 12 13 14 15
## 140.1833 143.6333 147.0833 150.5333 153.9833 157.4333 160.8833
residuals(fit) # residuals
## 1 2 3 4 5 6
## 2.41666667 0.96666667 0.51666667 0.06666667 -0.38333333 -0.83333333
## 7 8 9 10 11 12
## -1.28333333 -1.73333333 -1.18333333 -1.63333333 -1.08333333 -0.53333333
## 13 14 15
## 0.01666667 1.56666667 3.11666667
predict(fit, data.frame(height = c(55, 60, 61))) # predict weight for new heights
## 1 2 3
## 102.2333 119.4833 122.9333
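The fit can also be checked visually. A minimal sketch using base graphics (nothing beyond what is already loaded), plotting the data and overlaying the fitted line:
plot(women$height, women$weight, xlab = "Height (in)", ylab = "Weight (lbs)") # scatterplot of the raw data
abline(fit, col = "red") # overlay the fitted regression line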
Logistic regression is suited to binary classification problems.
# install.packages('pbkrtest') install.packages('caret')
# install.packages('mlbench') install.packages('kernlab')
library(caret) # load the caret package
## Loading required package: lattice
## Loading required package: ggplot2
library(mlbench) # load mlbench for the PimaIndiansDiabetes dataset
data(PimaIndiansDiabetes) # load the dataset
str(PimaIndiansDiabetes) # inspect the structure: diabetes is a two-level factor, the other variables are numeric
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
head(PimaIndiansDiabetes) # first 6 rows of the dataset
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
fit <- glm(diabetes ~ ., data = PimaIndiansDiabetes, family = binomial(link = "logit")) # fit the model
summary(fit) # inspect the fitted model
##
## Call:
## glm(formula = diabetes ~ ., family = binomial(link = "logit"),
## data = PimaIndiansDiabetes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5566 -0.7274 -0.4159 0.7267 2.9297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 ***
## pregnant 0.1231823 0.0320776 3.840 0.000123 ***
## glucose 0.0351637 0.0037087 9.481 < 2e-16 ***
## pressure -0.0132955 0.0052336 -2.540 0.011072 *
## triceps 0.0006190 0.0068994 0.090 0.928515
## insulin -0.0011917 0.0009012 -1.322 0.186065
## mass 0.0897010 0.0150876 5.945 2.76e-09 ***
## pedigree 0.9451797 0.2991475 3.160 0.001580 **
## age 0.0148690 0.0093348 1.593 0.111192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 723.45 on 759 degrees of freedom
## AIC: 741.45
##
## Number of Fisher Scoring iterations: 5
probabilities <- predict(fit, PimaIndiansDiabetes[, 1:8], type = "response") # predicted probabilities
head(probabilities) # view the predictions
## 1 2 3 4 5 6
## 0.72172655 0.04864161 0.79670208 0.04162486 0.90218390 0.14663156
predictions <- ifelse(probabilities > 0.5, "pos", "neg") # threshold at 0.5: above 0.5 is pos, otherwise neg
head(predictions)
## 1 2 3 4 5 6
## "pos" "neg" "pos" "neg" "pos" "neg"
# summarize accuracy
table(predictions, PimaIndiansDiabetes$diabetes) # confusion matrix
##
## predictions neg pos
## neg 445 112
## pos 55 156
confusionMatrix(predictions, PimaIndiansDiabetes$diabetes) # caret function: confusion matrix plus accuracy, sensitivity and other statistics
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 445 112
## pos 55 156
##
## Accuracy : 0.7826
## 95% CI : (0.7517, 0.8112)
## No Information Rate : 0.651
## P-Value [Acc > NIR] : 1.373e-15
##
## Kappa : 0.4966
## Mcnemar's Test P-Value : 1.468e-05
##
## Sensitivity : 0.8900
## Specificity : 0.5821
## Pos Pred Value : 0.7989
## Neg Pred Value : 0.7393
## Prevalence : 0.6510
## Detection Rate : 0.5794
## Detection Prevalence : 0.7253
## Balanced Accuracy : 0.7360
##
## 'Positive' Class : neg
##
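Beyond a single 0.5 cutoff, a threshold-free summary of the fit is the ROC curve and its AUC. A minimal sketch, assuming the pROC package is installed (it is not used elsewhere in this post):
# install.packages('pROC') # assumed available
library(pROC)
roc_obj <- roc(PimaIndiansDiabetes$diabetes, probabilities) # true classes and predicted probabilities
auc(roc_obj) # area under the ROC curve
plot(roc_obj) # draw the ROC curve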
Use K-fold cross-validation to evaluate the model:
library(caret)
library(mlbench) # for the PimaIndiansDiabetes dataset
data(PimaIndiansDiabetes)
set.seed(7) # set the random seed so results are reproducible
control <- trainControl(method = "cv", number = 5) # training control: 5-fold cross-validation
fit.glm <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "glm", metric = "Accuracy",
preProc = c("center", "scale"), trControl = control)
# summarize fit
print(fit.glm)
## Generalized Linear Model
##
## 768 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## Pre-processing: centered (8), scaled (8)
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 614, 614, 615, 615, 614
## Resampling results
##
## Accuracy Kappa Accuracy SD Kappa SD
## 0.7695442 0.4656824 0.02692468 0.0616666
##
##
The rpart() function in the rpart package fits CART classification and regression trees.
For a classification tree the response is a factor variable; the predictors may be factors or continuous variables.
library(rpart) # load the package
library(mlbench) # for the PimaIndiansDiabetes dataset
data(PimaIndiansDiabetes)
str(PimaIndiansDiabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ pregnant: num 6 1 8 1 0 5 3 10 2 8 ...
## $ glucose : num 148 85 183 89 137 116 78 115 197 125 ...
## $ pressure: num 72 66 64 66 40 74 50 0 70 96 ...
## $ triceps : num 35 29 0 23 35 0 32 0 45 0 ...
## $ insulin : num 0 0 0 94 168 0 88 0 543 0 ...
## $ mass : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ pedigree: num 0.627 0.351 0.672 0.167 2.288 ...
## $ age : num 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetes: Factor w/ 2 levels "neg","pos": 2 1 2 1 2 1 2 1 2 2 ...
fit <- rpart(diabetes ~ ., data = PimaIndiansDiabetes) # fit the model
print(fit) # inspect the fitted tree
## n= 768
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 768 268 neg (0.65104167 0.34895833)
## 2) glucose< 127.5 485 94 neg (0.80618557 0.19381443)
## 4) age< 28.5 271 23 neg (0.91512915 0.08487085) *
## 5) age>=28.5 214 71 neg (0.66822430 0.33177570)
## 10) mass< 26.35 41 2 neg (0.95121951 0.04878049) *
## 11) mass>=26.35 173 69 neg (0.60115607 0.39884393)
## 22) glucose< 99.5 55 10 neg (0.81818182 0.18181818) *
## 23) glucose>=99.5 118 59 neg (0.50000000 0.50000000)
## 46) pedigree< 0.561 84 34 neg (0.59523810 0.40476190)
## 92) pedigree< 0.2 21 4 neg (0.80952381 0.19047619) *
## 93) pedigree>=0.2 63 30 neg (0.52380952 0.47619048)
## 186) pregnant>=1.5 52 21 neg (0.59615385 0.40384615)
## 372) pressure>=67 40 12 neg (0.70000000 0.30000000) *
## 373) pressure< 67 12 3 pos (0.25000000 0.75000000) *
## 187) pregnant< 1.5 11 2 pos (0.18181818 0.81818182) *
## 47) pedigree>=0.561 34 9 pos (0.26470588 0.73529412) *
## 3) glucose>=127.5 283 109 pos (0.38515901 0.61484099)
## 6) mass< 29.95 76 24 neg (0.68421053 0.31578947)
## 12) glucose< 145.5 41 6 neg (0.85365854 0.14634146) *
## 13) glucose>=145.5 35 17 pos (0.48571429 0.51428571)
## 26) insulin< 14.5 21 8 neg (0.61904762 0.38095238) *
## 27) insulin>=14.5 14 4 pos (0.28571429 0.71428571) *
## 7) mass>=29.95 207 57 pos (0.27536232 0.72463768)
## 14) glucose< 157.5 115 45 pos (0.39130435 0.60869565)
## 28) age< 30.5 50 23 neg (0.54000000 0.46000000)
## 56) pressure>=61 40 13 neg (0.67500000 0.32500000)
## 112) mass< 41.8 31 7 neg (0.77419355 0.22580645) *
## 113) mass>=41.8 9 3 pos (0.33333333 0.66666667) *
## 57) pressure< 61 10 0 pos (0.00000000 1.00000000) *
## 29) age>=30.5 65 18 pos (0.27692308 0.72307692) *
## 15) glucose>=157.5 92 12 pos (0.13043478 0.86956522) *
predictions <- predict(fit, PimaIndiansDiabetes[, 1:8], type = "class") # classify with the fitted model
head(predictions) # view the predicted classes
## 1 2 3 4 5 6
## pos neg neg neg pos neg
## Levels: neg pos
table(predictions, PimaIndiansDiabetes$diabetes) # confusion matrix
##
## predictions neg pos
## neg 449 72
## pos 51 196
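The fitted tree is easier to read as a picture. A minimal sketch using base rpart plotting (packages such as rpart.plot give prettier output but are not assumed here):
plot(fit, uniform = TRUE, margin = 0.1) # draw the tree skeleton
text(fit, use.n = TRUE, cex = 0.8) # label the splits and leaf counts
printcp(fit) # complexity-parameter table, useful for pruning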
K-fold cross-validation:
library(caret)
library(mlbench)
data(PimaIndiansDiabetes)
set.seed(7)
control <- trainControl(method = "cv", number = 10)
fit.rpart <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "rpart",
metric = "Accuracy", trControl = control)
print(fit.rpart)
## CART
##
## 768 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 691, 691, 692, 691, 691, 692, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa Accuracy SD Kappa SD
## 0.01741294 0.7343131 0.3810595 0.05308756 0.11239089
## 0.10447761 0.7133459 0.3586950 0.04901214 0.09103908
## 0.24253731 0.6744190 0.1617716 0.03471308 0.17427370
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01741294.
When the response is a continuous numeric variable, rpart() fits a regression tree:
library(rpart)
library(mlbench)
data(BostonHousing)
head(BostonHousing)
## crim zn indus chas nox rm age dis rad tax ptratio b
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## lstat medv
## 1 4.98 24.0
## 2 9.14 21.6
## 3 4.03 34.7
## 4 2.94 33.4
## 5 5.33 36.2
## 6 5.21 28.7
str(BostonHousing)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : num 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ b : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
fit <- rpart(medv ~ ., data = BostonHousing, control = rpart.control(minsplit = 5))
print(fit)
## n= 506
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 506 42716.3000 22.53281
## 2) rm< 6.941 430 17317.3200 19.93372
## 4) lstat>=14.4 175 3373.2510 14.95600
## 8) crim>=6.99237 74 1085.9050 11.97838 *
## 9) crim< 6.99237 101 1150.5370 17.13762 *
## 5) lstat< 14.4 255 6632.2170 23.34980
## 10) dis>=1.38485 250 3721.1630 22.90520
## 20) rm< 6.543 195 1636.0670 21.62974 *
## 21) rm>=6.543 55 643.1691 27.42727 *
## 11) dis< 1.38485 5 390.7280 45.58000 *
## 3) rm>=6.941 76 6059.4190 37.23816
## 6) rm< 7.437 46 1899.6120 32.11304
## 12) crim>=7.393425 3 27.9200 14.40000 *
## 13) crim< 7.393425 43 864.7674 33.34884 *
## 7) rm>=7.437 30 1098.8500 45.09667
## 14) ptratio>=18.3 3 223.8200 33.30000 *
## 15) ptratio< 18.3 27 411.1585 46.40741 *
predictions <- predict(fit, BostonHousing[, 1:13])
mse <- mean((BostonHousing$medv - predictions)^2)
print(mse)
## [1] 12.71556
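A tree grown with minsplit = 5 can overfit; it can be pruned back using the complexity-parameter table. A minimal sketch (the cp value is illustrative, not tuned):
printcp(fit) # cross-validated error for each candidate cp
fit.pruned <- prune(fit, cp = 0.02) # prune at an illustrative cp value
pred.pruned <- predict(fit.pruned, BostonHousing[, 1:13])
mean((BostonHousing$medv - pred.pruned)^2) # MSE of the pruned tree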
library(caret)
library(mlbench)
# Load the dataset
data(BostonHousing)
# train
set.seed(7)
control <- trainControl(method = "cv", number = 2)
fit.rpart <- train(medv ~ ., data = BostonHousing, method = "rpart", metric = "RMSE",
trControl = control)
# summarize fit
print(fit.rpart)
## CART
##
## 506 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (2 fold)
## Summary of sample sizes: 253, 253
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared RMSE SD Rsquared SD
## 0.07165784 5.983605 0.5774758 0.1385397 0.01654185
## 0.17117244 7.173727 0.3939811 0.2713431 0.04677649
## 0.45274420 7.173727 0.3939811 0.2713431 0.04677649
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.07165784.
The ksvm() function in the kernlab package fits support vector machine models for both classification and regression.
# install.packages('kernlab') # install the package if needed
library(kernlab)
library(mlbench)
data(PimaIndiansDiabetes) # load the data
fit <- ksvm(diabetes ~ ., data = PimaIndiansDiabetes, kernel = "rbfdot") # fit the model
print(fit) # inspect the fitted model
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.111287269529721
##
## Number of Support Vectors : 431
##
## Objective Function Value : -356.4694
## Training error : 0.175781
predictions <- predict(fit, PimaIndiansDiabetes[, 1:8], type = "response") # predict with the model
table(predictions, PimaIndiansDiabetes$diabetes) # confusion matrix
##
## predictions neg pos
## neg 462 97
## pos 38 171
library(caret)
library(mlbench)
# Load the dataset
data(PimaIndiansDiabetes)
# train
set.seed(7)
control <- trainControl(method = "cv", number = 5)
fit.svmRadial <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "svmRadial",
metric = "Accuracy", trControl = control)
# summarize fit
print(fit.svmRadial)
## Support Vector Machines with Radial Basis Function Kernel
##
## 768 samples
## 8 predictor
## 2 classes: 'neg', 'pos'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 614, 614, 615, 615, 614
## Resampling results across tuning parameters:
##
## C Accuracy Kappa Accuracy SD Kappa SD
## 0.25 0.7603684 0.4415912 0.02943632 0.06952557
## 0.50 0.7551906 0.4322310 0.01961619 0.05163166
## 1.00 0.7604278 0.4458565 0.01037865 0.02321585
##
## Tuning parameter 'sigma' was held constant at a value of 0.1178216
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.1178216 and C = 1.
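By default caret only tries C in {0.25, 0.5, 1}. A wider search can be supplied through tuneGrid; a minimal sketch (the sigma value simply reuses the one reported above, and the C values are illustrative):
grid <- expand.grid(sigma = 0.1178216, C = c(0.5, 1, 2, 4, 8))
fit.svmGrid <- train(diabetes ~ ., data = PimaIndiansDiabetes, method = "svmRadial",
    metric = "Accuracy", trControl = control, tuneGrid = grid)
print(fit.svmGrid)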
# Classify the iris dataset (Species has three levels, so this is a multi-class problem):
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
irismodel <- ksvm(Species ~ ., data = iris, type = "C-bsvc", kernel = "rbfdot",
kpar = list(sigma = 0.1), C = 10, prob.model = TRUE)
irismodel
## Support Vector Machine object of class "ksvm"
##
## SV type: C-bsvc (classification)
## parameter : cost C = 10
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.1
##
## Number of Support Vectors : 32
##
## Objective Function Value : -5.8442 -3.0652 -136.9786
## Training error : 0.02
## Probability model included.
# make predictions:
predict(irismodel, iris[c(3, 10, 56, 68, 107, 120), -5], type = "probabilities")
## setosa versicolor virginica
## [1,] 0.984456890 0.008914961 0.006628150
## [2,] 0.980856710 0.012152543 0.006990747
## [3,] 0.005483066 0.953760778 0.040756155
## [4,] 0.009979338 0.987273933 0.002746729
## [5,] 0.013356347 0.081680756 0.904962897
## [6,] 0.012076805 0.162322667 0.825600528
predict(irismodel, iris[c(3, 10, 56, 68, 107, 120), -5])
## [1] setosa setosa versicolor versicolor virginica virginica
## Levels: setosa versicolor virginica
library(kernlab)
library(mlbench)
data(BostonHousing)
fit <- ksvm(medv ~ ., BostonHousing, kernel = "rbfdot")
print(fit)
## Support Vector Machine object of class "ksvm"
##
## SV type: eps-svr (regression)
## parameter : epsilon = 0.1 cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.106113556914206
##
## Number of Support Vectors : 334
##
## Objective Function Value : -75.2136
## Training error : 0.090716
predictions <- predict(fit, BostonHousing)
mse <- mean((BostonHousing$medv - predictions)^2)
print(mse)
## [1] 7.673373
K-fold cross-validation:
library(caret)
library(mlbench)
data(BostonHousing)
set.seed(7)
control <- trainControl(method = "cv", number = 5)
fit.svmRadial <- train(medv ~ ., data = BostonHousing, method = "svmRadial",
metric = "RMSE", trControl = control)
# summarize fit
print(fit.svmRadial)
## Support Vector Machines with Radial Basis Function Kernel
##
## 506 samples
## 13 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 405, 405, 403, 405, 406
## Resampling results across tuning parameters:
##
## C RMSE Rsquared RMSE SD Rsquared SD
## 0.25 4.818004 0.7564430 1.171043 0.09975117
## 0.50 4.279601 0.7985729 1.201454 0.09310021
## 1.00 3.820843 0.8337146 1.162899 0.08297657
##
## Tuning parameter 'sigma' was held constant at a value of 0.114971
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.114971 and C = 1.
The naiveBayes() function in the e1071 package fits a naive Bayes classifier; it handles both categorical and numeric (Gaussian) predictors.
library(e1071)
data(HouseVotes84, package = "mlbench")
head(HouseVotes84)
## Class V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
## 1 republican n y n y y y n n n y <NA> y y y n
## 2 republican n y n y y y n n n n n y y y n
## 3 democrat <NA> y y <NA> y y n n n n y n y y n
## 4 democrat n y y n <NA> y n n n n y n y n n
## 5 democrat y y y n y y n n n n y <NA> y y y
## 6 democrat n y y n y y n n n n n n y y y
## V16
## 1 y
## 2 <NA>
## 3 n
## 4 y
## 5 y
## 6 y
str(HouseVotes84)
## 'data.frame': 435 obs. of 17 variables:
## $ Class: Factor w/ 2 levels "democrat","republican": 2 2 1 1 1 1 1 2 2 1 ...
## $ V1 : Factor w/ 2 levels "n","y": 1 1 NA 1 2 1 1 1 1 2 ...
## $ V2 : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 2 ...
## $ V3 : Factor w/ 2 levels "n","y": 1 1 2 2 2 2 1 1 1 2 ...
## $ V4 : Factor w/ 2 levels "n","y": 2 2 NA 1 1 1 2 2 2 1 ...
## $ V5 : Factor w/ 2 levels "n","y": 2 2 2 NA 2 2 2 2 2 1 ...
## $ V6 : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 2 2 2 1 ...
## $ V7 : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 2 ...
## $ V8 : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 2 ...
## $ V9 : Factor w/ 2 levels "n","y": 1 1 1 1 1 1 1 1 1 2 ...
## $ V10 : Factor w/ 2 levels "n","y": 2 1 1 1 1 1 1 1 1 1 ...
## $ V11 : Factor w/ 2 levels "n","y": NA 1 2 2 2 1 1 1 1 1 ...
## $ V12 : Factor w/ 2 levels "n","y": 2 2 1 1 NA 1 1 1 2 1 ...
## $ V13 : Factor w/ 2 levels "n","y": 2 2 2 2 2 2 NA 2 2 1 ...
## $ V14 : Factor w/ 2 levels "n","y": 2 2 2 1 2 2 2 2 2 1 ...
## $ V15 : Factor w/ 2 levels "n","y": 1 1 1 1 2 2 2 NA 1 NA ...
## $ V16 : Factor w/ 2 levels "n","y": 2 NA 1 2 2 2 2 2 2 NA ...
model <- naiveBayes(Class ~ ., data = HouseVotes84) # fit the model
predict(model, HouseVotes84[1:10, ]) # predicted classes for the first 10 samples
## [1] republican republican republican democrat democrat democrat
## [7] republican republican republican democrat
## Levels: democrat republican
predict(model, HouseVotes84[1:10, ], type = "raw") # class probabilities for the first 10 samples
## democrat republican
## [1,] 1.029209e-07 9.999999e-01
## [2,] 5.820415e-08 9.999999e-01
## [3,] 5.684937e-03 9.943151e-01
## [4,] 9.985798e-01 1.420152e-03
## [5,] 9.666720e-01 3.332802e-02
## [6,] 8.121430e-01 1.878570e-01
## [7,] 1.751512e-04 9.998248e-01
## [8,] 8.300100e-06 9.999917e-01
## [9,] 8.277705e-08 9.999999e-01
## [10,] 1.000000e+00 5.029425e-11
pred <- predict(model, HouseVotes84) # predicted classes on the training set
table(pred, HouseVotes84$Class) # confusion matrix
##
## pred democrat republican
## democrat 238 13
## republican 29 155
## using Laplace smoothing:
model <- naiveBayes(Class ~ ., data = HouseVotes84, laplace = 3)
pred <- predict(model, HouseVotes84[, -1])
table(pred, HouseVotes84$Class)
##
## pred democrat republican
## democrat 237 12
## republican 30 156
## Example with metric predictors:
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
m <- naiveBayes(Species ~ ., data = iris)
## alternatively:
m <- naiveBayes(iris[, -5], iris[, 5])
m
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = iris[, -5], y = iris[, 5])
##
## A-priori probabilities:
## iris[, 5]
## setosa versicolor virginica
## 0.3333333 0.3333333 0.3333333
##
## Conditional probabilities:
## Sepal.Length
## iris[, 5] [,1] [,2]
## setosa 5.006 0.3524897
## versicolor 5.936 0.5161711
## virginica 6.588 0.6358796
##
## Sepal.Width
## iris[, 5] [,1] [,2]
## setosa 3.428 0.3790644
## versicolor 2.770 0.3137983
## virginica 2.974 0.3224966
##
## Petal.Length
## iris[, 5] [,1] [,2]
## setosa 1.462 0.1736640
## versicolor 4.260 0.4699110
## virginica 5.552 0.5518947
##
## Petal.Width
## iris[, 5] [,1] [,2]
## setosa 0.246 0.1053856
## versicolor 1.326 0.1977527
## virginica 2.026 0.2746501
table(predict(m, iris), iris[, 5])
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 47 3
## virginica 0 3 47
data(Titanic)
str(Titanic)
## table [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
## - attr(*, "dimnames")=List of 4
## ..$ Class : chr [1:4] "1st" "2nd" "3rd" "Crew"
## ..$ Sex : chr [1:2] "Male" "Female"
## ..$ Age : chr [1:2] "Child" "Adult"
## ..$ Survived: chr [1:2] "No" "Yes"
Titanic
## , , Age = Child, Survived = No
##
## Sex
## Class Male Female
## 1st 0 0
## 2nd 0 0
## 3rd 35 17
## Crew 0 0
##
## , , Age = Adult, Survived = No
##
## Sex
## Class Male Female
## 1st 118 4
## 2nd 154 13
## 3rd 387 89
## Crew 670 3
##
## , , Age = Child, Survived = Yes
##
## Sex
## Class Male Female
## 1st 5 1
## 2nd 11 13
## 3rd 13 14
## Crew 0 0
##
## , , Age = Adult, Survived = Yes
##
## Sex
## Class Male Female
## 1st 57 140
## 2nd 14 80
## 3rd 75 76
## Crew 192 20
m <- naiveBayes(Survived ~ ., data = Titanic)
m
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.formula(formula = Survived ~ ., data = Titanic)
##
## A-priori probabilities:
## Survived
## No Yes
## 0.676965 0.323035
##
## Conditional probabilities:
## Class
## Survived 1st 2nd 3rd Crew
## No 0.08187919 0.11208054 0.35436242 0.45167785
## Yes 0.28551336 0.16596343 0.25035162 0.29817159
##
## Sex
## Survived Male Female
## No 0.91543624 0.08456376
## Yes 0.51617440 0.48382560
##
## Age
## Survived Child Adult
## No 0.03489933 0.96510067
## Yes 0.08016878 0.91983122
predict(m, as.data.frame(Titanic))
## [1] Yes No No No Yes Yes Yes Yes No No No No Yes Yes Yes Yes Yes
## [18] No No No Yes Yes Yes Yes No No No No Yes Yes Yes Yes
## Levels: No Yes
Notes on the K-nearest-neighbours (KNN) algorithm:
Advantages:
(i) It is instance-based learning: no model has to be built or maintained from the data;
(ii) It can produce decision boundaries of arbitrary shape, whereas decision trees and rule-based classifiers are limited to straight-line boundaries (hyperplanes in high dimensions).
Disadvantages:
(i) Classifying a test sample is expensive, O(n), where n is the size of the training set;
(ii) Predictions are based only on local information, so with small k the method is sensitive to noisy data, unlike model-based classifiers that learn from the whole dataset.
# install.packages('class') # install the package if needed
library(class) # load the package
train <- rbind(iris3[1:25, , 1], iris3[1:25, , 2], iris3[1:25, , 3]) # first 25 samples of each species as the training set
test <- rbind(iris3[26:50, , 1], iris3[26:50, , 2], iris3[26:50, , 3]) # remaining 25 of each species as the test set
cl <- factor(c(rep("s", 25), rep("c", 25), rep("v", 25))) # class labels: s = setosa, c = versicolor, v = virginica
iris.predict <- knn(train, test, cl, k = 3, prob = TRUE)
iris.predict
## [1] s s s s s s s s s s s s s s s s s s s s s s s s s c c v c c c c c v c
## [36] c c c c c c c c c c c c c c c v c c v v v v v v v v v v c v v v v v v
## [71] v v v v v
## attr(,"prob")
## [1] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [8] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [22] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.6666667
## [29] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.6666667 1.0000000
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [43] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [50] 1.0000000 1.0000000 0.6666667 0.7500000 1.0000000 1.0000000 1.0000000
## [57] 1.0000000 1.0000000 0.5000000 1.0000000 1.0000000 1.0000000 1.0000000
## [64] 0.6666667 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [71] 1.0000000 0.6666667 1.0000000 1.0000000 0.6666667
## Levels: c s v
table(cl, iris.predict) # confusion matrix (the test set has the same class layout as cl)
## iris.predict
## cl c s v
## c 23 0 2
## s 0 25 0
## v 3 0 22
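The example above fixes k = 3. As noted in the algorithm analysis, small k is sensitive to noise, so it is worth comparing several values. A minimal sketch using leave-one-out cross-validation on the training set via knn.cv() from the class package:
for (k in c(1, 3, 5, 7, 9)) {
    cv.pred <- knn.cv(train, cl, k = k) # leave-one-out CV predictions on the training set
    cat("k =", k, " accuracy =", mean(cv.pred == cl), "\n")
}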
K-means clustering on simulated data with two groups:
x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),           # 50 points around (0, 0)
           matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2)) # 50 points around (1, 1)
colnames(x) <- c("x", "y")
head(x)
## x y
## [1,] 0.2417803 -0.362296912
## [2,] -0.2492418 -0.007919125
## [3,] -0.3955914 0.002895658
## [4,] 0.1265264 -0.045113657
## [5,] 0.1152272 0.469655065
## [6,] 0.1493029 0.017974243
(cl <- kmeans(x, 2))
## K-means clustering with 2 clusters of sizes 50, 50
##
## Cluster means:
## x y
## 1 -0.0291091 0.05759471
## 2 1.0035628 0.98531780
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 8.069328 9.368503
## (between_SS / total_SS = 73.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
cl$centers
## x y
## 1 -0.0291091 0.05759471
## 2 1.0035628 0.98531780
plot(x, col = cl$cluster)
points(cl$centers, col = 1:2, pch = 8, cex = 2)
# sum of squares
ss <- function(x) sum(scale(x, scale = FALSE)^2)
## cluster centers "fitted" to each obs.:
fitted.x <- fitted(cl); head(fitted.x)
## x y
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
## 1 -0.0291091 0.05759471
resid.x <- x - fitted(cl)
## Equalities : ----------------------------------
cbind(cl[c("betweenss", "tot.withinss", "totss")], # the same two columns
c(ss(fitted.x), ss(resid.x), ss(x)))
## [,1] [,2]
## betweenss 48.17703 48.17703
## tot.withinss 17.43783 17.43783
## totss 65.61486 65.61486
stopifnot(all.equal(cl$ totss, ss(x)),
all.equal(cl$ tot.withinss, ss(resid.x)),
## these three are the same:
all.equal(cl$ betweenss, ss(fitted.x)),
all.equal(cl$ betweenss, cl$totss - cl$tot.withinss),
## and hence also
all.equal(ss(x), ss(fitted.x) + ss(resid.x))
)
kmeans(x,1)$withinss # trivial one-cluster, (its W.SS == ss(x))
## [1] 65.61486
## random starts do help here with too many clusters
## (and are often recommended anyway!):
(cl <- kmeans(x, 5, nstart = 25))
## K-means clustering with 5 clusters of sizes 13, 18, 24, 19, 26
##
## Cluster means:
## x y
## 1 -0.36210808 0.1267287
## 2 0.14055856 0.3248003
## 3 1.05190114 1.2975444
## 4 0.03799452 -0.2428496
## 5 0.95894273 0.6971086
##
## Clustering vector:
## [1] 4 1 1 4 2 4 1 2 2 1 1 1 4 4 4 4 4 4 1 1 2 1 4 2 2 2 2 4 1 2 2 2 4 4 4
## [36] 4 2 1 2 4 4 4 4 2 1 2 1 2 5 2 3 3 3 3 5 5 5 5 5 5 3 5 5 5 5 5 5 5 3 5
## [71] 3 3 5 2 5 3 3 3 3 3 5 3 3 3 3 5 5 3 3 5 3 3 5 3 3 5 5 3 5 5
##
## Within cluster sum of squares by cluster:
## [1] 0.7133834 1.1859169 2.4012601 1.0623752 2.3600708
## (between_SS / total_SS = 88.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(x, col = cl$cluster)
points(cl$centers, col = 1:5, pch = 8)
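Here the true number of clusters is 2 by construction. When it is not known, a common heuristic is the 'elbow' plot of the total within-cluster sum of squares against k. A minimal sketch:
wss <- sapply(1:8, function(k) kmeans(x, k, nstart = 25)$tot.withinss) # total within-cluster SS for k = 1..8
plot(1:8, wss, type = "b", xlab = "number of clusters k", ylab = "total within-cluster SS") # look for the elbow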
The randomForest package implements Breiman's random forest algorithm for classification and regression.
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# Classification:
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
set.seed(71)
iris.rf <- randomForest(Species ~ ., data = iris, importance = TRUE, proximity = TRUE)
print(iris.rf)
##
## Call:
## randomForest(formula = Species ~ ., data = iris, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 5.33%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 50 0 0 0.00
## versicolor 0 46 4 0.08
## virginica 0 4 46 0.08
round(importance(iris.rf), 2)
## setosa versicolor virginica MeanDecreaseAccuracy
## Sepal.Length 6.04 7.85 7.93 11.51
## Sepal.Width 4.40 1.03 5.44 5.40
## Petal.Length 21.76 31.33 29.64 32.94
## Petal.Width 22.84 32.67 31.68 34.50
## MeanDecreaseGini
## Sepal.Length 8.77
## Sepal.Width 2.19
## Petal.Length 42.54
## Petal.Width 45.77
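randomForest also provides plotting helpers for these results. A minimal sketch:
varImpPlot(iris.rf) # dot charts of MeanDecreaseAccuracy and MeanDecreaseGini
plot(iris.rf) # OOB error rate as a function of the number of trees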
# Regression:
data(airquality)
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
set.seed(131)
ozone.rf <- randomForest(Ozone ~ ., data = airquality, mtry = 3, importance = TRUE,
na.action = na.omit)
print(ozone.rf)
##
## Call:
## randomForest(formula = Ozone ~ ., data = airquality, mtry = 3, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 303.8304
## % Var explained: 72.31
## Show variable 'importance': higher values mean more important:
round(importance(ozone.rf), 2)
## %IncMSE IncNodePurity
## Solar.R 11.09 10534.24
## Wind 23.50 43833.13
## Temp 42.03 55218.05
## Month 4.07 2032.65
## Day 2.63 7173.19
The princomp() function in base R performs principal component analysis (PCA), a common dimensionality-reduction technique; here it is applied to the USArrests dataset.
head(USArrests)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
pca <- princomp(USArrests, cor = TRUE) # PCA on the correlation matrix, since the variables are on different scales
print(pca)
## Call:
## princomp(x = USArrests, cor = TRUE)
##
## Standard deviations:
## Comp.1 Comp.2 Comp.3 Comp.4
## 1.5748783 0.9948694 0.5971291 0.4164494
##
## 4 variables and 50 observations.
train_reduced <- predict(pca, USArrests) # project the data onto the principal components
head(train_reduced)
## Comp.1 Comp.2 Comp.3 Comp.4
## Alabama -0.9855659 1.1333924 -0.44426879 0.156267145
## Alaska -1.9501378 1.0732133 2.04000333 -0.438583440
## Arizona -1.7631635 -0.7459568 0.05478082 -0.834652924
## Arkansas 0.1414203 1.1197968 0.11457369 -0.182810896
## California -2.5239801 -1.5429340 0.59855680 -0.341996478
## Colorado -1.5145629 -0.9875551 1.09500699 0.001464887
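How many components to keep is usually judged from the proportion of variance explained. A minimal sketch:
summary(pca) # standard deviation and proportion of variance for each component
screeplot(pca, type = "lines") # scree plot of the component variances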
There are many boosting algorithms, for example Gradient Boosting, XGBoost, AdaBoost and Gentle Boost. Bagging builds learners on random (bootstrap) samples of the data and combines them by simple averaging to reach a final prediction. Boosting is similar, but is smarter about sample selection: as the algorithm proceeds, observations that are hard to classify receive ever larger weights.
In R, the gbm package implements boosting with decision trees as the base learners. The key parameters are set as follows:
The loss function is easy to choose: the bernoulli distribution is the usual choice for classification problems, while gaussian suits regression. As for the learning rate, smaller is generally better, but smaller steps mean more of them, i.e. more boosting iterations are needed before the model reaches its optimum, which increases training time and computing cost accordingly. A common rule of thumb is to set shrinkage between 0.001 and 0.01 and n.trees between 3000 and 10000.
library(gbm)
## Loading required package: survival
## Loading required package: splines
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: parallel
## Loaded gbm 2.1.1
# construct a simulated dataset
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- ordered(sample(letters[1:4],N,replace=TRUE),levels=letters[4:1])
X4 <- factor(sample(letters[1:6],N,replace=TRUE))
X5 <- factor(sample(letters[1:3],N,replace=TRUE))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1**1.5 + 2 * (X2**.5) + mu
sigma <- sqrt(var(Y)/SNR)
Y <- Y + rnorm(N,0,sigma)
# introduce some missing values
X1[sample(1:N,size=500)] <- NA
X4[sample(1:N,size=300)] <- NA
data <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
head(data)
## Y X1 X2 X3 X4 X5 X6
## 1 2.6177200 0.811623 0.0756076 b e a 0.4734696
## 2 0.3321644 NA 0.3012093 d <NA> b 2.4982320
## 3 1.9411223 NA 0.9489865 c <NA> b 1.9155245
## 4 0.6659258 NA 0.5874069 d <NA> b 1.5870628
## 5 5.3654999 NA 1.4601941 a <NA> c 2.2784020
## 6 3.2528407 NA 1.7593645 d a c 2.3502413
# fit the model with gbm()
gbm1 <-
gbm(Y~X1+X2+X3+X4+X5+X6, # formula
data=data, # dataset
var.monotone=c(0,0,0,0,0,0), # -1: monotone decrease,
# +1: monotone increase,
# 0: no monotone restrictions
distribution="gaussian", # see the help for other choices
n.trees=1000, # number of trees
shrinkage=0.05, # shrinkage or learning rate,
# 0.001 to 0.1 usually work
interaction.depth=3, # 1: additive model, 2: two-way interactions, etc.
bag.fraction = 0.5, # subsampling fraction, 0.5 is probably best
train.fraction = 0.5, # fraction of data for training,
# first train.fraction*N used for training
n.minobsinnode = 10, # minimum total weight needed in each node
cv.folds = 3, # do 3-fold cross-validation
keep.data=TRUE, # keep a copy of the dataset with the object
verbose=FALSE, # don't print out progress
n.cores=1) # use only a single core (detecting #cores is
# error-prone, so avoided here)
# check performance using an out-of-bag estimator
# OOB underestimates the optimal number of iterations
best.iter <- gbm.perf(gbm1,method="OOB")
print(best.iter)
## [1] 92
# check performance using a 50% heldout test set
best.iter <- gbm.perf(gbm1,method="test")
print(best.iter)
## [1] 115
# check performance using 3-fold cross-validation
best.iter <- gbm.perf(gbm1,method="cv")
print(best.iter)
## [1] 122
# plot the performance # plot variable influence
summary(gbm1,n.trees=1) # based on the first tree
## var rel.inf
## X3 X3 89.79752
## X2 X2 10.20248
## X1 X1 0.00000
## X4 X4 0.00000
## X5 X5 0.00000
## X6 X6 0.00000
summary(gbm1,n.trees=best.iter) # based on the estimated best number of trees
## var rel.inf
## X3 X3 65.98348516
## X2 X2 27.96046528
## X1 X1 4.68887674
## X4 X4 0.75854559
## X6 X6 0.56625428
## X5 X5 0.04237296
# compactly print the first and last trees for curiosity
print(pretty.gbm.tree(gbm1,1))
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction
## 0 2 1.50000000 1 5 9 228.91593
## 1 2 0.50000000 2 3 4 41.36040
## 2 -1 -0.07306434 -1 -1 -1 0.00000
## 3 -1 -0.01706217 -1 -1 -1 0.00000
## 4 -1 -0.04421474 -1 -1 -1 0.00000
## 5 1 0.99442576 6 7 8 30.70785
## 6 -1 0.02654803 -1 -1 -1 0.00000
## 7 -1 0.07756869 -1 -1 -1 0.00000
## 8 -1 0.05162598 -1 -1 -1 0.00000
## 9 -1 0.00102208 -1 -1 -1 0.00000
## Weight Prediction
## 0 250 0.00102208
## 1 132 -0.04421474
## 2 64 -0.07306434
## 3 68 -0.01706217
## 4 132 -0.04421474
## 5 118 0.05162598
## 6 60 0.02654803
## 7 58 0.07756869
## 8 118 0.05162598
## 9 250 0.00102208
print(pretty.gbm.tree(gbm1,gbm1$n.trees))
## SplitVar SplitCodePred LeftNode RightNode MissingNode ErrorReduction
## 0 1 1.772046e+00 1 8 9 0.2062632
## 1 2 1.500000e+00 2 3 7 0.1578085
## 2 -1 4.121006e-05 -1 -1 -1 0.0000000
## 3 1 1.075647e+00 4 5 6 0.3111820
## 4 -1 4.721652e-03 -1 -1 -1 0.0000000
## 5 -1 -7.636156e-04 -1 -1 -1 0.0000000
## 6 -1 2.695562e-03 -1 -1 -1 0.0000000
## 7 -1 1.356536e-03 -1 -1 -1 0.0000000
## 8 -1 -3.348254e-03 -1 -1 -1 0.0000000
## 9 -1 8.672380e-04 -1 -1 -1 0.0000000
## Weight Prediction
## 0 250 8.672380e-04
## 1 224 1.356536e-03
## 2 113 4.121006e-05
## 3 111 2.695562e-03
## 4 70 4.721652e-03
## 5 41 -7.636156e-04
## 6 111 2.695562e-03
## 7 224 1.356536e-03
## 8 26 -3.348254e-03
## 9 250 8.672380e-04
# make some new data
N <- 1000
X1 <- runif(N)
X2 <- 2*runif(N)
X3 <- ordered(sample(letters[1:4],N,replace=TRUE))
X4 <- factor(sample(letters[1:6],N,replace=TRUE))
X5 <- factor(sample(letters[1:3],N,replace=TRUE))
X6 <- 3*runif(N)
mu <- c(-1,0,1,2)[as.numeric(X3)]
Y <- X1**1.5 + 2 * (X2**.5) + mu + rnorm(N,0,sigma)
data2 <- data.frame(Y=Y,X1=X1,X2=X2,X3=X3,X4=X4,X5=X5,X6=X6)
# predict on the new data using "best" number of trees
# f.predict generally will be on the canonical scale (logit,log,etc.)
f.predict <- predict(gbm1,data2,best.iter)
# least squares error
print(sum((data2$Y-f.predict)^2))
## [1] 5023.52
# create marginal plots
# plot variable X1,X2,X3 after "best" iterations
par(mfrow=c(1,3))
plot(gbm1,1,best.iter)
plot(gbm1,2,best.iter)
plot(gbm1,3,best.iter)
par(mfrow=c(1,1))
# contour plot of variables 1 and 2 after "best" iterations
plot(gbm1,1:2,best.iter)
# lattice plot of variables 2 and 3
plot(gbm1,2:3,best.iter)
# lattice plot of variables 3 and 4
plot(gbm1,3:4,best.iter)
# 3-way plots
plot(gbm1,c(1,2,6),best.iter,cont=20)
plot(gbm1,1:3,best.iter)
plot(gbm1,2:4,best.iter)
plot(gbm1,3:5,best.iter)
# do another 100 iterations
gbm2 <- gbm.more(gbm1,100,
verbose=FALSE) # stop printing detailed progress
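The walkthrough above uses the gaussian loss for a regression problem. For binary classification the same workflow applies with distribution = "bernoulli" and a 0/1 response; a minimal sketch reusing PimaIndiansDiabetes from earlier, with illustrative rather than tuned settings:
library(mlbench)
data(PimaIndiansDiabetes)
d <- PimaIndiansDiabetes
d$diabetes <- ifelse(d$diabetes == "pos", 1, 0) # the bernoulli loss expects a 0/1 response
gbm.cls <- gbm(diabetes ~ ., data = d, distribution = "bernoulli",
    n.trees = 3000, # many trees paired with ...
    shrinkage = 0.01, # ... a small learning rate, per the rule of thumb above
    interaction.depth = 2, cv.folds = 3, verbose = FALSE)
best.cls <- gbm.perf(gbm.cls, method = "cv") # iteration count chosen by cross-validation
p <- predict(gbm.cls, d, n.trees = best.cls, type = "response") # predicted probabilities
table(ifelse(p > 0.5, "pos", "neg"), PimaIndiansDiabetes$diabetes) # confusion matrix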
The 'adaptive' part of AdaBoost is that samples misclassified by the previous classifier are used to train the next one. AdaBoost is sensitive to noisy data and outliers, but on many problems it is less prone to overfitting than most other learning algorithms.
The classifiers used in AdaBoost can be very weak (i.e. have a high error rate); as long as they classify slightly better than random guessing (for a two-class problem, an error rate just below 0.5), they improve the final model. Weak classifiers with an error rate above that of a random classifier are also useful, because they can be given negative coefficients in the final linear combination of classifiers and thus still improve its performance.
Usage of the boosting() function in the adabag package: boosting(formula, data, boos = TRUE, mfinal = 100, coeflearn = 'Breiman', control)
library(adabag)
data(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
iris.adaboost <- boosting(Species ~ ., data = iris, boos = TRUE, mfinal = 5)
# print(iris.adaboost)
## the multi-class Vehicle dataset
data(Vehicle)
str(Vehicle)
## 'data.frame': 846 obs. of 19 variables:
## $ Comp : num 95 91 104 93 85 107 97 90 86 93 ...
## $ Circ : num 48 41 50 41 44 57 43 43 34 44 ...
## $ D.Circ : num 83 84 106 82 70 106 73 66 62 98 ...
## $ Rad.Ra : num 178 141 209 159 205 172 173 157 140 197 ...
## $ Pr.Axis.Ra : num 72 57 66 63 103 50 65 65 61 62 ...
## $ Max.L.Ra : num 10 9 10 9 52 6 6 9 7 11 ...
## $ Scat.Ra : num 162 149 207 144 149 255 153 137 122 183 ...
## $ Elong : num 42 45 32 46 45 26 42 48 54 36 ...
## $ Pr.Axis.Rect: num 20 19 23 19 19 28 19 18 17 22 ...
## $ Max.L.Rect : num 159 143 158 143 144 169 143 146 127 146 ...
## $ Sc.Var.Maxis: num 176 170 223 160 241 280 176 162 141 202 ...
## $ Sc.Var.maxis: num 379 330 635 309 325 957 361 281 223 505 ...
## $ Ra.Gyr : num 184 158 220 127 188 264 172 164 112 152 ...
## $ Skew.Maxis : num 70 72 73 63 127 85 66 67 64 64 ...
## $ Skew.maxis : num 6 9 14 6 9 5 13 3 2 4 ...
## $ Kurt.maxis : num 16 14 9 10 11 9 1 3 14 14 ...
## $ Kurt.Maxis : num 187 189 188 199 180 181 200 193 200 195 ...
## $ Holl.Ra : num 197 199 196 207 183 183 204 202 208 204 ...
## $ Class : Factor w/ 4 levels "bus","opel","saab",..: 4 4 3 4 1 1 1 4 4 3 ...
l <- length(Vehicle[, 1]) # number of samples in the dataset
sub <- sample(1:l, 2 * l/3) # random indices for the training set
train <- Vehicle[sub, ] # 2/3 of the samples as the training set
test <- Vehicle[-sub, ] # remaining 1/3 as the test set
mfinal <- 15 # number of boosting iterations
maxdepth <- 5 # maximum tree depth
# Using a single classification tree:
Vehicle.rpart <- rpart(Class ~ ., data = train, maxdepth = maxdepth) # fit the classification tree
Vehicle.rpart.pred <- predict(Vehicle.rpart, newdata = test, type = "class") # predict on the test set
tb <- table(Vehicle.rpart.pred, test$Class) # confusion matrix
tb
##
## Vehicle.rpart.pred bus opel saab van
## bus 62 5 9 3
## opel 0 8 2 0
## saab 4 61 49 8
## van 0 9 3 59
error.rpart <- 1 - (sum(diag(tb))/sum(tb)) # misclassification error rate
error.rpart
## [1] 0.3687943
# Using AdaBoost:
Vehicle.adaboost <- boosting(Class ~ ., data = train, mfinal = mfinal, coeflearn = "Zhu",
    control = rpart.control(maxdepth = maxdepth)) # fit the model
Vehicle.adaboost.pred <- predict.boosting(Vehicle.adaboost, newdata = test) # predict on the test set
Vehicle.adaboost.pred$confusion # confusion matrix
## Observed Class
## Predicted Class bus opel saab van
## bus 64 1 1 1
## opel 0 40 20 1
## saab 1 42 41 6
## van 1 0 1 62
Vehicle.adaboost.pred$error # test error rate
## [1] 0.2659574
# comparing error evolution in training and test set
evol.train <- errorevol(Vehicle.adaboost, newdata = train)
evol.test <- errorevol(Vehicle.adaboost, newdata = test)
plot.errorevol(evol.test, evol.train)
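Since the introduction above contrasts Bagging with Boosting, adabag's bagging() allows a direct comparison with the same interface. A minimal sketch on the same train/test split (mfinal kept small for speed):
Vehicle.bagging <- bagging(Class ~ ., data = train, mfinal = mfinal,
    control = rpart.control(maxdepth = maxdepth)) # bootstrap-aggregated trees
Vehicle.bagging.pred <- predict.bagging(Vehicle.bagging, newdata = test)
Vehicle.bagging.pred$confusion # confusion matrix on the test set
Vehicle.bagging.pred$error # test error rate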