========================================================
主要使用的包有:
1、e1071: tuning函数(tune.nnet, tune.randomForest, tune.rpart, tune.svm, tune.knn)
2、caret(分类和回归训练):通过trianControl函数设置训练控制参数;通过varImp函数对变量重要性进行排序
3、caret包中的findCorrelation函数可找到高度关联的特征
4、caret包中的递归特征排除函数rfe,可自动挑选出符合要求的特征
5、计算预测值和实际值之间的差值大小:均方根误差(root mean square error, RMSE),相对平方差(Reative Square Error, RSE),可决系数(R-Square)
6、rminer:使用fit函数训练一个svm模型,用Importance函数 基于敏感度检测对变量重要性进行分级,使用mgtaph函数绘制变量重要性分级图
7、rocr:受试者工作特征曲线(Receiver Operating Characteristic, ROC)是一种常见的二元分类系统性能展示图形(标注了不同切点的真正率和假正率),通过曲线下方的面积AUC(Area Under Curve)衡量模型的性能
8、caret包比较ROC曲线
9、比较模型性能差异
10、caret包中混淆矩阵confusionMatrix获取模型的精确度、召回率、特异性、准确率等性能指标
********************************************************************************* 一、基于K折交叉验证方法测评模型性能
1. 使用循环函数,数据集为telecom churn(C50包中的churn数据集,其中的churnTrain数据集划分为训练集和测试集),应用SVM训练模型/也可以使用naiveBayes训练模型
library(C50)
library(e1071)
data(churn)
ind <- cut(1:nrow(churnTrain), breaks = 10, labels = F)
accuracies <- c()
for (i in 1:10) {
fit <- svm(churn ~ ., data = churnTrain[ind != i,])
predictions <- predict(fit, churnTrain[ind == i, !names(churnTrain) %in% c('churn')])
correct_count <- sum(predictions == churnTrain[ind == i, c('churn')])
accuracies <- append(correct_count / nrow(churnTrain[ind == i, ]),accuracies)
}
accuracies
## [1] 0.8952096 0.8468468 0.8558559 0.9039039 0.8588589 0.9041916 0.8978979
## [8] 0.9219219 0.8888889 0.9071856
mean(accuracies)
## [1] 0.8880761
# 构建数据集
set.seed(2)
ind <- sample(2, nrow(churnTrain),replace = T, prob = c(0.7,0.3))
churnTrain <- churnTrain[,!names(churnTrain) %in% c('state', 'area_code', 'account_length')]
trainset <- churnTrain[ind == 1,]
testset <- churnTrain[ind == 2,]
# 调用tune.svm函数处理训练集,优化控制参数设置为10折交叉验证
tuned <- tune.svm(churn ~ ., data = trainset, gamma = 10^-2, cost = 10^2, tunecontrol = tune.control(cross = 10))
summary(tuned)
##
## Error estimation of 'svm' using 10-fold cross validation: 0.0808031
# tuned模型的性能信息
tuned$performances
## gamma cost error dispersion
## 1 0.01 100 0.0808031 0.02367426
# 使用优化后的模型产生分类表
svmfit <- tuned$best.model
table(trainset[,c('churn')],predict(svmfit))
##
## yes no
## yes 234 108
## no 13 1960
# 设置训练控制参数,进行重复3次的10折交叉验证
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
control <- trainControl(method = 'repeatedcv', number = 10, repeats = 3)
# 调用rpart 产生使用上节中的数据trainset 产生分类模型
model <- train(churn ~ ., data = trainset, method = 'rpart', preProcess = 'scale', trControl = control)
model
## CART
##
## 2315 samples
## 16 predictor
## 2 classes: 'yes', 'no'
##
## Pre-processing: scaled (16)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2084, 2083, 2083, 2083, 2083, 2084, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.05555556 0.9036559 0.5315844
## 0.07456140 0.8671038 0.2796749
## 0.07602339 0.8613423 0.2142758
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05555556.
# 使用上一节中model,调用VarImp()函数
importance <- varImp(model, scale = F)
importance
## rpart variable importance
##
## Overall
## number_customer_service_calls 116.015
## total_day_minutes 106.988
## total_day_charge 100.648
## international_planyes 86.789
## voice_mail_planyes 25.974
## total_eve_minutes 23.097
## total_eve_charge 23.097
## number_vmail_messages 19.885
## total_intl_minutes 6.347
## total_night_charge 0.000
## total_intl_charge 0.000
## total_night_calls 0.000
## total_eve_calls 0.000
## total_night_minutes 0.000
## total_intl_calls 0.000
## total_day_calls 0.000
# 调用plot()函数绘制变量重要性图
plot(importance)
扩展:rpart包中训练的模型产生的对象本身包含了变量重要性,可以直接输出查看变量的重要性
library(rpart)
model.rp <- rpart(churn ~ ., data = trainset)
model.rp$variable.importance
## total_day_minutes total_day_charge
## 111.645286 110.881583
## number_customer_service_calls total_intl_minutes
## 58.486651 48.283228
## total_intl_charge total_eve_charge
## 47.698379 47.166646
## total_eve_minutes international_plan
## 47.166646 42.194508
## total_intl_calls number_vmail_messages
## 36.730344 19.884863
## voice_mail_plan total_night_calls
## 19.884863 7.195828
## total_eve_calls total_night_charge
## 3.553423 1.754547
## total_night_minutes total_day_calls
## 1.754547 1.494986
library(rminer)
# 建立SVM模型
model <- fit(churn ~ ., data = trainset, model = 'svm')
# 使用Importance()函数获取变量重要性
VariableImportance <- Importance(model, data = trainset, method = 'sensv')
# 绘制排序后的变量重要性图
L <- list(runs = 1, sen = t(VariableImportance$imp), sresponses = VariableImportance$sresponses)
mgraph(L, graph = 'IMP', leg = names(trainset), col = 'gray', Grid = 10)
# 去掉非数值类型的数据
new_train <- trainset[,!names(churnTrain) %in% c('churn', 'international_plan','voice_mail_plan')]
# 计算属性之间的关联度
cor_mat <- cor(new_train)
# 调用findCorrelation()函数,找到关联度 >0.75 的属性:
highCorrelated <- findCorrelation(cor_mat, cutoff = 0.75)
# 输出这些高度关联的属性名称
names(new_train)[highCorrelated]
## [1] "total_intl_minutes" "total_day_charge" "total_eve_minutes"
## [4] "total_night_minutes"
扩展:subselect包中的leaps, genetic 和 anneal函数可达到同样效果。
数据准备:
# 将训练集trainset中名为international_plan 的特征转换为intl_yes和intl_no:
intl_plan <- model.matrix(~ trainset.international_plan -1, data = data.frame(trainset$international_plan))
colnames(intl_plan) <- c('trainset.international_planno' = 'intl_no','trainset.international_planyes' = 'intl_yes')
# 将训练集中名为voice_mail_plan 的特征转换为voice_yes 和 voice_no :
voice_plan <- model.matrix(~trainset.voice_mail_plan - 1, data = data.frame(trainset$voice_mail_plan))
colnames(voice_plan) <- c('trainset.voice_mail_planno' = 'voice_no', 'trainset.voice_mail_planyes' = 'voice_yes')
# 去掉international_plan 和voice_mail_plan 两个属性, 将训练集trainset和数据框 intl_plan、voice_plan合并
trainset$international_plan = NULL
trainset$voice_mail_plan = NULL
trainset = cbind(intl_plan,voice_plan, trainset)
# 对测试集testset执行类似操作
intl_plan <- model.matrix(~ testset.international_plan - 1, data = data.frame(testset$international_plan))
colnames(intl_plan) = c('testset.international_planno' = 'intl_no', 'testset.international_planyes' = 'intl_yes')
voice_plan <- model.matrix(~ testset.voice_mail_plan - 1, data = data.frame(testset$voice_mail_plan))
colnames(voice_plan) <- c('testset.voice_mail_planno' = 'voice_no', 'testset.voice_mail_planyes' = 'voice_yes')
testset$international_plan = NULL
testset$voice_mail_plan = NULL
testset <- cbind(intl_plan,voice_plan, testset)
使用线性判别分析方法创建一个特征筛选算法:
训练方法为cv(交叉验证),控制函数为线性判别函数ldaFuncs
ldaControl <- rfeControl(functions = ldaFuncs, method = 'cv')
利用编号1-18的数据子集对训练数据trainset进行反向特征筛选:
ldaProfile <- rfe(trainset[,!names(trainset) %in% c('churn')], trainset[,c('churn')], sizes = c(1:18), rfeControl = ldaControl)
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
## Warning in lda.default(x, grouping, ...): variables are collinear
ldaProfile
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 1 0.8527 0.004918 0.002407 0.01555
## 2 0.8527 0.004918 0.002407 0.01555
## 3 0.8385 0.186362 0.009054 0.08610
## 4 0.8393 0.204067 0.010022 0.06399
## 5 0.8462 0.234419 0.012815 0.05604
## 6 0.8467 0.239708 0.014787 0.06056
## 7 0.8475 0.241635 0.015138 0.06166
## 8 0.8467 0.234810 0.015030 0.06798
## 9 0.8462 0.229552 0.015417 0.06525
## 10 0.8471 0.231536 0.015301 0.06732
## 11 0.8480 0.238535 0.014856 0.06184
## 12 0.8514 0.258129 0.016840 0.07183
## 13 0.8523 0.264539 0.016560 0.07053
## 14 0.8523 0.260536 0.015660 0.06444
## 15 0.8553 0.279156 0.014952 0.06070 *
## 16 0.8536 0.272941 0.015095 0.05518
## 17 0.8527 0.270848 0.016154 0.06085
## 18 0.8540 0.276611 0.016941 0.06151
##
## The top 5 variables (out of 15):
## total_day_charge, total_day_minutes, intl_yes, intl_no, number_customer_service_calls
绘制选择结果示意图
plot(ldaProfile , type = c('o','g'))
检测合适的模型
ldaProfile$fit
## Call:
## lda(x, y)
##
## Prior probabilities of groups:
## yes no
## 0.1477322 0.8522678
##
## Group means:
## total_day_charge total_day_minutes intl_yes intl_no
## yes 35.00143 205.8877 0.29532164 0.7046784
## no 29.62402 174.2555 0.06487582 0.9351242
## number_customer_service_calls total_eve_minutes total_eve_charge
## yes 2.204678 213.7269 18.16702
## no 1.441460 199.6197 16.96789
## total_intl_calls voice_yes voice_no number_vmail_messages
## yes 4.134503 0.1666667 0.8333333 5.099415
## no 4.514445 0.2954891 0.7045109 8.674607
## total_intl_charge total_intl_minutes total_night_minutes
## yes 2.899386 10.73684 205.4640
## no 2.741343 10.15119 201.4184
## total_night_charge
## yes 9.245994
## no 9.063882
##
## Coefficients of linear discriminants:
## LD1
## total_day_charge 0.671376020
## total_day_minutes -0.123085518
## intl_yes -1.130269257
## intl_no 1.130269257
## number_customer_service_calls -0.421346323
## total_eve_minutes 0.183124313
## total_eve_charge -2.210668112
## total_intl_calls 0.066574547
## voice_yes 0.330197066
## voice_no -0.330197066
## number_vmail_messages -0.003558459
## total_intl_charge 2.316437631
## total_intl_minutes -0.694000839
## total_night_minutes 0.651028494
## total_night_charge -14.513932481
通过重新采样评估模型性能
postResample(predict(ldaProfile, testset[, !names(testset) %in% c('churn')]), testset[,c('churn')])
## Accuracy Kappa
## 0.8585462 0.2568816