一、支持向量机SVM

使用telecom customer chum数据集

e1071中提供了libsvm, klab包提供了SVMLite函数

library(e1071)

1、准备数据集:训练集和测试集

require(C50)
## Loading required package: C50
data(churn)
churnTrain <- churnTrain[,!names(churnTrain) %in% c('state', 'area_code', 'account_length')]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = T, prob = c(0.7,0.3))
trainset <- churnTrain[ind == 1,]
testset <- churnTrain[ind == 2,]

使用svm函数训练支持向量机,trainset数据集作为输入数据集,churn 是分类类别

model <- svm(churn ~ ., data = trainset, kernel = 'radial', cost = 1, gamma = 1/ncol(trainset))

2、选择支持向量机的惩罚因子

# 选择数据集iris
iris.subset <- subset(iris, select = c('Sepal.Length', 'Sepal.Width', 'Species'), Species %in% c('setosa', 'virginica'))
plot(x = iris.subset$Sepal.Length, y = iris.subset$Sepal.Width, col = iris.subset$Species, pch = 19)
#cost = 1
svm.model <- svm(Species ~ ., data = iris.subset, kernel = 'linear', cost = 1, scale = F)
# 将支持向量用蓝色的圈标注出来
points(iris.subset[svm.model$index, c(1,2)],col = 'blue', cex = 2)
# 加分隔线
w <- t(svm.model$coefs) %*% svm.model$SV
b <- -svm.model$rho
abline(a = -b/w[1,2], b = -w[1,1]/w[1,2], col = 'red', lty = 5)

# cost = 10000
svm.model <- svm(Species ~ ., data = iris.subset,type = 'C-classification', kernel = 'linear', cost = 10000, scale = F)
plot(iris.subset$Sepal.Length, iris.subset$Sepal.Width, col = iris.subset$Species, pch = 19)
w <- t(svm.model$coefs) %*% svm.model$SV
b <- -svm.model$rho
abline(a = -b/w[1,2], b = -w[1,1]/w[1,2], col = 'red', lty = 5)

3、SVM模型的可视化

3.1 使用的是iris数据集和telecom churn数据集

data(iris)
model.iris <- svm(Species ~ ., data = iris)
plot(model.iris, iris, Petal.Width ~ Petal.Length, slice = list(Sepal.Width = 3, Sepal.Length = 4))

3.2 telscom churn 数据集上训练的svm模型的可视化

plot(model, trainset, total_day_minutes ~ total_intl_charge)

4、基于SVM训练模型实现类预测

svm.pred <- predict(model,testset[,!names(testset) %in% c('churn')])
# 分类表
svm.table <- table(svm.pred, testset$churn)
svm.table
##         
## svm.pred yes  no
##      yes  70  12
##      no   71 865
# 调用classAgreement计算分类一致性系数
classAgreement(svm.table)
## $diag
## [1] 0.9184676
## 
## $kappa
## [1] 0.5855903
## 
## $rand
## [1] 0.850083
## 
## $crand
## [1] 0.5260472
# 调用confusionMatrix (caret包)基于分类表评估预测性能
require(caret)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(svm.table)
## Confusion Matrix and Statistics
## 
##         
## svm.pred yes  no
##      yes  70  12
##      no   71 865
##                                           
##                Accuracy : 0.9185          
##                  95% CI : (0.8999, 0.9345)
##     No Information Rate : 0.8615          
##     P-Value [Acc > NIR] : 1.251e-08       
##                                           
##                   Kappa : 0.5856          
##  Mcnemar's Test P-Value : 1.936e-10       
##                                           
##             Sensitivity : 0.49645         
##             Specificity : 0.98632         
##          Pos Pred Value : 0.85366         
##          Neg Pred Value : 0.92415         
##              Prevalence : 0.13851         
##          Detection Rate : 0.06876         
##    Detection Prevalence : 0.08055         
##       Balanced Accuracy : 0.74139         
##                                           
##        'Positive' Class : yes             
## 

5、扩展:使用svm实现回归分析
使用Quartet数据集,利用eps-regression模型,使用SVM执行回归分析

library(car)
data("Quartet")
model.regression <- svm(Quartet$y1 ~ Quartet$x, type = 'eps-regression')
# 调用predict得到预测结果
predict.y <- predict(model.regression, Quartet$x)
predict.y
##        1        2        3        4        5        6        7        8 
## 8.196894 7.152946 8.807471 7.713099 8.533578 8.774046 6.186349 5.763689 
##        9       10       11 
## 8.726925 6.621373 5.882946
# 使用plot函数绘制预测值和训练数据的散点图(预测值用正方形,训练数据用圆形)
plot(Quartet$x, predict.y, pch = 15, col = 'red')
points(Quartet$x, Quartet$y1, pch = 19)

6、调整SVM
借助参数gamma 和惩罚因子调整支持向量机
本节探讨tune.svm函数调整SVM的方法

tuned <- tune.svm(churn ~ ., data = trainset, gamma = 10^(-6:-1), cost = 10^(1:2))
summary(tuned)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma cost
##   0.01  100
## 
## - best performance: 0.0808031 
## 
## - Detailed performance results:
##    gamma cost      error dispersion
## 1  1e-06   10 0.14774780 0.02237867
## 2  1e-05   10 0.14774780 0.02237867
## 3  1e-04   10 0.14774780 0.02237867
## 4  1e-03   10 0.14774780 0.02237867
## 5  1e-02   10 0.09202493 0.01981595
## 6  1e-01   10 0.08900582 0.02392487
## 7  1e-06  100 0.14774780 0.02237867
## 8  1e-05  100 0.14774780 0.02237867
## 9  1e-04  100 0.14774780 0.02237867
## 10 1e-03  100 0.11621697 0.02317553
## 11 1e-02  100 0.08080310 0.02367426
## 12 1e-01  100 0.13048776 0.02155659
# 使用tuned得到的best parameters重新训练模型
model.tuned <- svm(churn ~ ., data = trainset, gamma = tuned$best.parameters$gamma, cost = tuned$best.parameters$cost)
summary(model.tuned)
## 
## Call:
## svm(formula = churn ~ ., data = trainset, gamma = tuned$best.parameters$gamma, 
##     cost = tuned$best.parameters$cost)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  100 
##       gamma:  0.01 
## 
## Number of Support Vectors:  547
## 
##  ( 304 243 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  yes no
# 用调整后的模型进行类别预测
svm.tuned.pred <- predict(model.tuned, testset[,!names(testset) %in% c('churn')])
# 分类表
svm.tuned.table <- table(svm.tuned.pred,testset$churn)
svm.tuned.table
##               
## svm.tuned.pred yes  no
##            yes  95  24
##            no   46 853
# 调用classAgreement函数得到相关系数完成算法性能评测
classAgreement(svm.tuned.table)
## $diag
## [1] 0.9312377
## 
## $kappa
## [1] 0.691678
## 
## $rand
## [1] 0.871806
## 
## $crand
## [1] 0.6303615
# 应用confusionMatrix评估模型性能
confusionMatrix(svm.tuned.table)
## Confusion Matrix and Statistics
## 
##               
## svm.tuned.pred yes  no
##            yes  95  24
##            no   46 853
##                                          
##                Accuracy : 0.9312         
##                  95% CI : (0.9139, 0.946)
##     No Information Rate : 0.8615         
##     P-Value [Acc > NIR] : 1.56e-12       
##                                          
##                   Kappa : 0.6917         
##  Mcnemar's Test P-Value : 0.01207        
##                                          
##             Sensitivity : 0.67376        
##             Specificity : 0.97263        
##          Pos Pred Value : 0.79832        
##          Neg Pred Value : 0.94883        
##              Prevalence : 0.13851        
##          Detection Rate : 0.09332        
##    Detection Prevalence : 0.11690        
##       Balanced Accuracy : 0.82320        
##                                          
##        'Positive' Class : yes            
## 

二、神经网络模型
1、neuralnet包 1.1 使用iris数据集

# 数据准备,划分训练数据和测试数据
data("iris")
ind <- sample(2,nrow(iris),replace = T, prob = c(0.7, 0.3))
trainset <- iris[ind == 1,]
testset <- iris[ind == 2,]
library(neuralnet)
# 根据Species列的取值,增加Versicolor, setosa, virginica数据列
trainset$versicolor <- trainset$Species == 'Versicolor'
trainset$setosa <- trainset$Species == 'setosa'
trainset$virginica <- trainset$Species == 'virginica'

1.2 使用neuralnet函数构建包含3个隐藏层的神经网络
如果提前设置seed值,可使每次训练返回相同的值

network <- neuralnet(versicolor + virginica + setosa ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = trainset, hidden = 3)
network$result.matrix
##                                             1
## error                       0.856772664053340
## reached.threshold           0.009847762487250
## steps                    4921.000000000000000
## Intercept.to.1layhid1     -16.385507747099865
## Sepal.Length.to.1layhid1   -2.202652268640603
## Sepal.Width.to.1layhid1    -2.820021731848652
## Petal.Length.to.1layhid1    5.263972229287643
## Petal.Width.to.1layhid1     7.158939785143700
## Intercept.to.1layhid2       0.711445587046781
## Sepal.Length.to.1layhid2    0.111449514143766
## Sepal.Width.to.1layhid2     3.602734234124951
## Petal.Length.to.1layhid2   -1.596937101232157
## Petal.Width.to.1layhid2    -9.693646291517846
## Intercept.to.1layhid3      -2.555369162236858
## Sepal.Length.to.1layhid3   -0.192848640605812
## Sepal.Width.to.1layhid3     2.996297852746109
## Petal.Length.to.1layhid3    1.754493922934692
## Petal.Width.to.1layhid3    -0.310658963114995
## Intercept.to.versicolor     1.350335425134183
## 1layhid.1.to.versicolor     0.000009963020239
## 1layhid.2.to.versicolor    -0.000304455866022
## 1layhid.3.to.versicolor    -1.350351760620393
## Intercept.to.virginica      0.575801002951326
## 1layhid.1.to.virginica      1.038883374275100
## 1layhid.2.to.virginica      0.016804461790844
## 1layhid.3.to.virginica     -0.592661443221412
## Intercept.to.setosa         0.913877796978970
## 1layhid.1.to.setosa         0.001564731810324
## 1layhid.2.to.setosa         1.001726329197295
## 1layhid.3.to.setosa        -0.915136021708912

1.3 神经网络模型可视化

# 直接使用plot函数
plot(network)
# 使用gwplot函数可视化泛化权值
par(mfrow = c(2,2))
gwplot(network, selected.covariate = 'Petal.Width')
gwplot(network, selected.covariate = 'Sepal.Width')
gwplot(network, selected.covariate = 'Petal.Length')
gwplot(network, selected.covariate = 'Sepal.Length')

1.4 利用得到的network模型实现类标号预测

net.pred <- compute(network, testset[-5])$net.result
# 通过找到概率最大的那一列,得到其他可能的类别
net.prediction <- c('versicolor','virginica','setosa')[apply(net.pred, 1, which.max)]
# 分类表
predict.table <- table(testset$Species, net.prediction)
predict.table
##             net.prediction
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      5          5         4
##   virginica       0          0        12
# 使用classAgreement函数计算分类表
classAgreement(predict.table)
## $diag
## [1] 0.7804878049
## 
## $kappa
## [1] 0.6702412869
## 
## $rand
## [1] 0.7707317073
## 
## $crand
## [1] 0.5020028427

1.5 调用混淆矩阵评估模型性能

confusionMatrix(predict.table)
## Confusion Matrix and Statistics
## 
##             net.prediction
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      5          5         4
##   virginica       0          0        12
## 
## Overall Statistics
##                                                
##                Accuracy : 0.7804878            
##                  95% CI : (0.6238625, 0.894392)
##     No Information Rate : 0.4878049            
##     P-Value [Acc > NIR] : 0.0001195313         
##                                                
##                   Kappa : 0.6702413            
##  Mcnemar's Test P-Value : NA                   
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity              0.7500000         1.0000000        0.7500000
## Specificity              1.0000000         0.7500000        1.0000000
## Pos Pred Value           1.0000000         0.3571429        1.0000000
## Neg Pred Value           0.8076923         1.0000000        0.8620690
## Prevalence               0.4878049         0.1219512        0.3902439
## Detection Rate           0.3658537         0.1219512        0.2926829
## Detection Prevalence     0.3658537         0.3414634        0.2926829
## Balanced Accuracy        0.8750000         0.8750000        0.8750000

2、nnet包
使用上例中iris数据集划分出来的trainset 和 testset

library(nnet)

2.1 重新装载数据

data(iris)
set.seed(2)
ind <- sample(2, nrow(iris),replace = T, prob = c(0.7,0.3))
trainset <- iris[ind == 1,]
testset <- iris[ind == 2,]

2.2 使用nnet包中的nnet函数训练神经网络
在调用函数时,可以设置分类规则、数据源、隐藏单元个数(size参数),初始随机数权值(rang参数),权值衰减参数(decay参数),最大迭代次数(maxit参数)

iris.nn <- nnet(Species ~ .,data = trainset, size = 2, rang = 0.1, decay = 5e-4, maxit = 200)
## # weights:  19
## initial  value 114.539765 
## iter  10 value 52.100312
## iter  20 value 50.231442
## iter  30 value 49.526599
## iter  40 value 49.402229
## iter  50 value 44.680338
## iter  60 value 5.254389
## iter  70 value 2.836695
## iter  80 value 2.744315
## iter  90 value 2.687069
## iter 100 value 2.621556
## iter 110 value 2.589096
## iter 120 value 2.410539
## iter 130 value 2.096462
## iter 140 value 1.938715
## iter 150 value 1.857105
## iter 160 value 1.825393
## iter 170 value 1.817409
## iter 180 value 1.815591
## iter 190 value 1.815030
## iter 200 value 1.814746
## final  value 1.814746 
## stopped after 200 iterations
summary(iris.nn)
## a 4-2-3 network with 19 weights
## options were - softmax modelling  decay=0.0005
##  b->h1 i1->h1 i2->h1 i3->h1 i4->h1 
## -20.60   0.31  -3.84   3.36   7.72 
##  b->h2 i1->h2 i2->h2 i3->h2 i4->h2 
##  -7.15   1.50   2.49  -4.14   5.59 
##  b->o1 h1->o1 h2->o1 
##  -7.28  -3.67  13.16 
##  b->o2 h1->o2 h2->o2 
##  15.90 -16.64 -19.40 
##  b->o3 h1->o3 h2->o3 
##  -8.62  20.31   6.24

通过summary得到神经网络的信息,可知该模型为一个拥有19个权值的4-2-3网络结构

2.3 基于nnt包得到的模型实现类标号的预测
如果没有指定type为class, 系统将默认输出为概率矩阵

iris.predict <- predict(iris.nn, testset, type = 'class')
nn.table <- table(testset$Species,iris.predict)
nn.table
##             iris.predict
##              setosa versicolor virginica
##   setosa         17          0         0
##   versicolor      0         13         1
##   virginica       0          2        13

2.4 模型评估:混淆矩阵

confusionMatrix(nn.table)
## Confusion Matrix and Statistics
## 
##             iris.predict
##              setosa versicolor virginica
##   setosa         17          0         0
##   versicolor      0         13         1
##   virginica       0          2        13
## 
## Overall Statistics
##                                                  
##                Accuracy : 0.9347826              
##                  95% CI : (0.8210356, 0.9863432) 
##     No Information Rate : 0.3695652              
##     P-Value [Acc > NIR] : 0.000000000000001019459
##                                                  
##                   Kappa : 0.901919               
##  Mcnemar's Test P-Value : NA                     
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity              1.0000000         0.8666667        0.9285714
## Specificity              1.0000000         0.9677419        0.9375000
## Pos Pred Value           1.0000000         0.9285714        0.8666667
## Neg Pred Value           1.0000000         0.9375000        0.9677419
## Prevalence               0.3695652         0.3260870        0.3043478
## Detection Rate           0.3695652         0.2826087        0.2826087
## Detection Prevalence     0.3695652         0.3043478        0.3260870
## Balanced Accuracy        1.0000000         0.9172043        0.9330357