一、支持向量机SVM
使用telecom customer chum数据集
e1071中提供了libsvm, klab包提供了SVMLite函数
library(e1071)
1、准备数据集:训练集和测试集
require(C50)
## Loading required package: C50
data(churn)
churnTrain <- churnTrain[,!names(churnTrain) %in% c('state', 'area_code', 'account_length')]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = T, prob = c(0.7,0.3))
trainset <- churnTrain[ind == 1,]
testset <- churnTrain[ind == 2,]
使用svm函数训练支持向量机,trainset数据集作为输入数据集,churn 是分类类别
model <- svm(churn ~ ., data = trainset, kernel = 'radial', cost = 1, gamma = 1/ncol(trainset))
2、选择支持向量机的惩罚因子
# 选择数据集iris
iris.subset <- subset(iris, select = c('Sepal.Length', 'Sepal.Width', 'Species'), Species %in% c('setosa', 'virginica'))
plot(x = iris.subset$Sepal.Length, y = iris.subset$Sepal.Width, col = iris.subset$Species, pch = 19)
#cost = 1
svm.model <- svm(Species ~ ., data = iris.subset, kernel = 'linear', cost = 1, scale = F)
# 将支持向量用蓝色的圈标注出来
points(iris.subset[svm.model$index, c(1,2)],col = 'blue', cex = 2)
# 加分隔线
w <- t(svm.model$coefs) %*% svm.model$SV
b <- -svm.model$rho
abline(a = -b/w[1,2], b = -w[1,1]/w[1,2], col = 'red', lty = 5)
# cost = 10000
svm.model <- svm(Species ~ ., data = iris.subset,type = 'C-classification', kernel = 'linear', cost = 10000, scale = F)
plot(iris.subset$Sepal.Length, iris.subset$Sepal.Width, col = iris.subset$Species, pch = 19)
w <- t(svm.model$coefs) %*% svm.model$SV
b <- -svm.model$rho
abline(a = -b/w[1,2], b = -w[1,1]/w[1,2], col = 'red', lty = 5)
3、SVM模型的可视化
3.1 使用的是iris数据集和telecom churn数据集
data(iris)
model.iris <- svm(Species ~ ., data = iris)
plot(model.iris, iris, Petal.Width ~ Petal.Length, slice = list(Sepal.Width = 3, Sepal.Length = 4))
3.2 telscom churn 数据集上训练的svm模型的可视化
plot(model, trainset, total_day_minutes ~ total_intl_charge)
4、基于SVM训练模型实现类预测
svm.pred <- predict(model,testset[,!names(testset) %in% c('churn')])
# 分类表
svm.table <- table(svm.pred, testset$churn)
svm.table
##
## svm.pred yes no
## yes 70 12
## no 71 865
# 调用classAgreement计算分类一致性系数
classAgreement(svm.table)
## $diag
## [1] 0.9184676
##
## $kappa
## [1] 0.5855903
##
## $rand
## [1] 0.850083
##
## $crand
## [1] 0.5260472
# 调用confusionMatrix (caret包)基于分类表评估预测性能
require(caret)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(svm.table)
## Confusion Matrix and Statistics
##
##
## svm.pred yes no
## yes 70 12
## no 71 865
##
## Accuracy : 0.9185
## 95% CI : (0.8999, 0.9345)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : 1.251e-08
##
## Kappa : 0.5856
## Mcnemar's Test P-Value : 1.936e-10
##
## Sensitivity : 0.49645
## Specificity : 0.98632
## Pos Pred Value : 0.85366
## Neg Pred Value : 0.92415
## Prevalence : 0.13851
## Detection Rate : 0.06876
## Detection Prevalence : 0.08055
## Balanced Accuracy : 0.74139
##
## 'Positive' Class : yes
##
5、扩展:使用svm实现回归分析
使用Quartet数据集,利用eps-regression模型,使用SVM执行回归分析
library(car)
data("Quartet")
model.regression <- svm(Quartet$y1 ~ Quartet$x, type = 'eps-regression')
# 调用predict得到预测结果
predict.y <- predict(model.regression, Quartet$x)
predict.y
## 1 2 3 4 5 6 7 8
## 8.196894 7.152946 8.807471 7.713099 8.533578 8.774046 6.186349 5.763689
## 9 10 11
## 8.726925 6.621373 5.882946
# 使用plot函数绘制预测值和训练数据的散点图(预测值用正方形,训练数据用圆形)
plot(Quartet$x, predict.y, pch = 15, col = 'red')
points(Quartet$x, Quartet$y1, pch = 19)
6、调整SVM
借助参数gamma 和惩罚因子调整支持向量机
本节探讨tune.svm函数调整SVM的方法
tuned <- tune.svm(churn ~ ., data = trainset, gamma = 10^(-6:-1), cost = 10^(1:2))
summary(tuned)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.01 100
##
## - best performance: 0.0808031
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 1e-06 10 0.14774780 0.02237867
## 2 1e-05 10 0.14774780 0.02237867
## 3 1e-04 10 0.14774780 0.02237867
## 4 1e-03 10 0.14774780 0.02237867
## 5 1e-02 10 0.09202493 0.01981595
## 6 1e-01 10 0.08900582 0.02392487
## 7 1e-06 100 0.14774780 0.02237867
## 8 1e-05 100 0.14774780 0.02237867
## 9 1e-04 100 0.14774780 0.02237867
## 10 1e-03 100 0.11621697 0.02317553
## 11 1e-02 100 0.08080310 0.02367426
## 12 1e-01 100 0.13048776 0.02155659
# 使用tuned得到的best parameters重新训练模型
model.tuned <- svm(churn ~ ., data = trainset, gamma = tuned$best.parameters$gamma, cost = tuned$best.parameters$cost)
summary(model.tuned)
##
## Call:
## svm(formula = churn ~ ., data = trainset, gamma = tuned$best.parameters$gamma,
## cost = tuned$best.parameters$cost)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 100
## gamma: 0.01
##
## Number of Support Vectors: 547
##
## ( 304 243 )
##
##
## Number of Classes: 2
##
## Levels:
## yes no
# 用调整后的模型进行类别预测
svm.tuned.pred <- predict(model.tuned, testset[,!names(testset) %in% c('churn')])
# 分类表
svm.tuned.table <- table(svm.tuned.pred,testset$churn)
svm.tuned.table
##
## svm.tuned.pred yes no
## yes 95 24
## no 46 853
# 调用classAgreement函数得到相关系数完成算法性能评测
classAgreement(svm.tuned.table)
## $diag
## [1] 0.9312377
##
## $kappa
## [1] 0.691678
##
## $rand
## [1] 0.871806
##
## $crand
## [1] 0.6303615
# 应用confusionMatrix评估模型性能
confusionMatrix(svm.tuned.table)
## Confusion Matrix and Statistics
##
##
## svm.tuned.pred yes no
## yes 95 24
## no 46 853
##
## Accuracy : 0.9312
## 95% CI : (0.9139, 0.946)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : 1.56e-12
##
## Kappa : 0.6917
## Mcnemar's Test P-Value : 0.01207
##
## Sensitivity : 0.67376
## Specificity : 0.97263
## Pos Pred Value : 0.79832
## Neg Pred Value : 0.94883
## Prevalence : 0.13851
## Detection Rate : 0.09332
## Detection Prevalence : 0.11690
## Balanced Accuracy : 0.82320
##
## 'Positive' Class : yes
##
二、神经网络模型
1、neuralnet包 1.1 使用iris数据集
# 数据准备,划分训练数据和测试数据
data("iris")
ind <- sample(2,nrow(iris),replace = T, prob = c(0.7, 0.3))
trainset <- iris[ind == 1,]
testset <- iris[ind == 2,]
library(neuralnet)
# 根据Species列的取值,增加Versicolor, setosa, virginica数据列
trainset$versicolor <- trainset$Species == 'Versicolor'
trainset$setosa <- trainset$Species == 'setosa'
trainset$virginica <- trainset$Species == 'virginica'
1.2 使用neuralnet函数构建包含3个隐藏层的神经网络
如果提前设置seed值,可使每次训练返回相同的值
network <- neuralnet(versicolor + virginica + setosa ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = trainset, hidden = 3)
network$result.matrix
## 1
## error 0.856772664053340
## reached.threshold 0.009847762487250
## steps 4921.000000000000000
## Intercept.to.1layhid1 -16.385507747099865
## Sepal.Length.to.1layhid1 -2.202652268640603
## Sepal.Width.to.1layhid1 -2.820021731848652
## Petal.Length.to.1layhid1 5.263972229287643
## Petal.Width.to.1layhid1 7.158939785143700
## Intercept.to.1layhid2 0.711445587046781
## Sepal.Length.to.1layhid2 0.111449514143766
## Sepal.Width.to.1layhid2 3.602734234124951
## Petal.Length.to.1layhid2 -1.596937101232157
## Petal.Width.to.1layhid2 -9.693646291517846
## Intercept.to.1layhid3 -2.555369162236858
## Sepal.Length.to.1layhid3 -0.192848640605812
## Sepal.Width.to.1layhid3 2.996297852746109
## Petal.Length.to.1layhid3 1.754493922934692
## Petal.Width.to.1layhid3 -0.310658963114995
## Intercept.to.versicolor 1.350335425134183
## 1layhid.1.to.versicolor 0.000009963020239
## 1layhid.2.to.versicolor -0.000304455866022
## 1layhid.3.to.versicolor -1.350351760620393
## Intercept.to.virginica 0.575801002951326
## 1layhid.1.to.virginica 1.038883374275100
## 1layhid.2.to.virginica 0.016804461790844
## 1layhid.3.to.virginica -0.592661443221412
## Intercept.to.setosa 0.913877796978970
## 1layhid.1.to.setosa 0.001564731810324
## 1layhid.2.to.setosa 1.001726329197295
## 1layhid.3.to.setosa -0.915136021708912
1.3 神经网络模型可视化
# 直接使用plot函数
plot(network)
# 使用gwplot函数可视化泛化权值
par(mfrow = c(2,2))
gwplot(network, selected.covariate = 'Petal.Width')
gwplot(network, selected.covariate = 'Sepal.Width')
gwplot(network, selected.covariate = 'Petal.Length')
gwplot(network, selected.covariate = 'Sepal.Length')
1.4 利用得到的network模型实现类标号预测
net.pred <- compute(network, testset[-5])$net.result
# 通过找到概率最大的那一列,得到其他可能的类别
net.prediction <- c('versicolor','virginica','setosa')[apply(net.pred, 1, which.max)]
# 分类表
predict.table <- table(testset$Species, net.prediction)
predict.table
## net.prediction
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 5 5 4
## virginica 0 0 12
# 使用classAgreement函数计算分类表
classAgreement(predict.table)
## $diag
## [1] 0.7804878049
##
## $kappa
## [1] 0.6702412869
##
## $rand
## [1] 0.7707317073
##
## $crand
## [1] 0.5020028427
1.5 调用混淆矩阵评估模型性能
confusionMatrix(predict.table)
## Confusion Matrix and Statistics
##
## net.prediction
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 5 5 4
## virginica 0 0 12
##
## Overall Statistics
##
## Accuracy : 0.7804878
## 95% CI : (0.6238625, 0.894392)
## No Information Rate : 0.4878049
## P-Value [Acc > NIR] : 0.0001195313
##
## Kappa : 0.6702413
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 0.7500000 1.0000000 0.7500000
## Specificity 1.0000000 0.7500000 1.0000000
## Pos Pred Value 1.0000000 0.3571429 1.0000000
## Neg Pred Value 0.8076923 1.0000000 0.8620690
## Prevalence 0.4878049 0.1219512 0.3902439
## Detection Rate 0.3658537 0.1219512 0.2926829
## Detection Prevalence 0.3658537 0.3414634 0.2926829
## Balanced Accuracy 0.8750000 0.8750000 0.8750000
2、nnet包
使用上例中iris数据集划分出来的trainset 和 testset
library(nnet)
2.1 重新装载数据
data(iris)
set.seed(2)
ind <- sample(2, nrow(iris),replace = T, prob = c(0.7,0.3))
trainset <- iris[ind == 1,]
testset <- iris[ind == 2,]
2.2 使用nnet包中的nnet函数训练神经网络
在调用函数时,可以设置分类规则、数据源、隐藏单元个数(size参数),初始随机数权值(rang参数),权值衰减参数(decay参数),最大迭代次数(maxit参数)
iris.nn <- nnet(Species ~ .,data = trainset, size = 2, rang = 0.1, decay = 5e-4, maxit = 200)
## # weights: 19
## initial value 114.539765
## iter 10 value 52.100312
## iter 20 value 50.231442
## iter 30 value 49.526599
## iter 40 value 49.402229
## iter 50 value 44.680338
## iter 60 value 5.254389
## iter 70 value 2.836695
## iter 80 value 2.744315
## iter 90 value 2.687069
## iter 100 value 2.621556
## iter 110 value 2.589096
## iter 120 value 2.410539
## iter 130 value 2.096462
## iter 140 value 1.938715
## iter 150 value 1.857105
## iter 160 value 1.825393
## iter 170 value 1.817409
## iter 180 value 1.815591
## iter 190 value 1.815030
## iter 200 value 1.814746
## final value 1.814746
## stopped after 200 iterations
summary(iris.nn)
## a 4-2-3 network with 19 weights
## options were - softmax modelling decay=0.0005
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1
## -20.60 0.31 -3.84 3.36 7.72
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2
## -7.15 1.50 2.49 -4.14 5.59
## b->o1 h1->o1 h2->o1
## -7.28 -3.67 13.16
## b->o2 h1->o2 h2->o2
## 15.90 -16.64 -19.40
## b->o3 h1->o3 h2->o3
## -8.62 20.31 6.24
通过summary得到神经网络的信息,可知该模型为一个拥有19个权值的4-2-3网络结构
2.3 基于nnt包得到的模型实现类标号的预测
如果没有指定type为class, 系统将默认输出为概率矩阵
iris.predict <- predict(iris.nn, testset, type = 'class')
nn.table <- table(testset$Species,iris.predict)
nn.table
## iris.predict
## setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 13 1
## virginica 0 2 13
2.4 模型评估:混淆矩阵
confusionMatrix(nn.table)
## Confusion Matrix and Statistics
##
## iris.predict
## setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 13 1
## virginica 0 2 13
##
## Overall Statistics
##
## Accuracy : 0.9347826
## 95% CI : (0.8210356, 0.9863432)
## No Information Rate : 0.3695652
## P-Value [Acc > NIR] : 0.000000000000001019459
##
## Kappa : 0.901919
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000000 0.8666667 0.9285714
## Specificity 1.0000000 0.9677419 0.9375000
## Pos Pred Value 1.0000000 0.9285714 0.8666667
## Neg Pred Value 1.0000000 0.9375000 0.9677419
## Prevalence 0.3695652 0.3260870 0.3043478
## Detection Rate 0.3695652 0.2826087 0.2826087
## Detection Prevalence 0.3695652 0.3043478 0.3260870
## Balanced Accuracy 1.0000000 0.9172043 0.9330357