practice9

Загрузим данные согласно заданию и отобразим их на графике. Составим обучающую выборку.

my.seed <- 1
data(Auto)
str(Auto)

## 'data.frame':    392 obs. of  9 variables:
##  $ mpg         : num  18 15 18 16 17 15 14 14 14 15 ...
##  $ cylinders   : num  8 8 8 8 8 8 8 8 8 8 ...
##  $ displacement: num  307 350 318 304 302 429 454 440 455 390 ...
##  $ horsepower  : num  130 165 150 150 140 198 220 215 225 190 ...
##  $ weight      : num  3504 3693 3436 3433 3449 ...
##  $ acceleration: num  12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
##  $ year        : num  70 70 70 70 70 70 70 70 70 70 ...
##  $ origin      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ name        : Factor w/ 304 levels "amc ambassador brougham",..: 49 36 231 14 161 141 54 223 241 2 ...

high.mpg <- ifelse(Auto$mpg <= 23, 'blue', 'red')
# создаём наблюдения

x <- matrix(c(Auto$displacement, Auto$horsepower), ncol=2)
y <- c(high.mpg)

# таблица с данными, отклик -- фактор 
dat <- data.frame(x = x, y = as.factor(y))
plot(x, col = y, pch = 19)

train <- sample(1:nrow(dat), nrow(dat)/2) # обучающая выборка -- 50%

Построим SVM с полиномиальным ядром второй степени и подберем настроечный параметр.

# SVM с радиальным ядром и маленьким cost
svmfit <- svm(y ~ ., data = dat[train, ], kernel = "radial", 
              coef=2, cost = 1)
plot(svmfit, dat[train, ])

summary(svmfit)

## 
## Call:
## svm(formula = y ~ ., data = dat[train, ], kernel = "radial", 
##     coef = 2, cost = 1)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.5 
## 
## Number of Support Vectors:  60
## 
##  ( 31 29 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  blue red

# SVM с радиальным ядром и большим cost
svmfit <- svm(y ~ ., data = dat[train, ], kernel = "radial", 
              coef=2, cost = 1e5)
plot(svmfit, dat[train, ])

# перекрёстная проверка
set.seed(1)
tune.out <- tune(svm, y ~ ., data = dat[train, ], kernel = "radial", 
                 ranges = list(cost = c(0.1, 1, 10, 100, 1000),
                               coef0 = c(0.5, 1, 2, 3, 4)))
summary(tune.out)

## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost coef0
##   100   0.5
## 
## - best performance: 0.1023684 
## 
## - Detailed performance results:
##     cost coef0     error dispersion
## 1  1e-01   0.5 0.1073684 0.06288685
## 2  1e+00   0.5 0.1073684 0.06288685
## 3  1e+01   0.5 0.1123684 0.05972712
## 4  1e+02   0.5 0.1023684 0.08215183
## 5  1e+03   0.5 0.1071053 0.08695323
## 6  1e-01   1.0 0.1073684 0.06288685
## 7  1e+00   1.0 0.1073684 0.06288685
## 8  1e+01   1.0 0.1123684 0.05972712
## 9  1e+02   1.0 0.1023684 0.08215183
## 10 1e+03   1.0 0.1071053 0.08695323
## 11 1e-01   2.0 0.1073684 0.06288685
## 12 1e+00   2.0 0.1073684 0.06288685
## 13 1e+01   2.0 0.1123684 0.05972712
## 14 1e+02   2.0 0.1023684 0.08215183
## 15 1e+03   2.0 0.1071053 0.08695323
## 16 1e-01   3.0 0.1073684 0.06288685
## 17 1e+00   3.0 0.1073684 0.06288685
## 18 1e+01   3.0 0.1123684 0.05972712
## 19 1e+02   3.0 0.1023684 0.08215183
## 20 1e+03   3.0 0.1071053 0.08695323
## 21 1e-01   4.0 0.1073684 0.06288685
## 22 1e+00   4.0 0.1073684 0.06288685
## 23 1e+01   4.0 0.1123684 0.05972712
## 24 1e+02   4.0 0.1023684 0.08215183
## 25 1e+03   4.0 0.1071053 0.08695323

# матрица неточностей для прогноза по лучшей модели
tbl <- table(true = dat[-train, "y"], 
      pred = predict(tune.out$best.model, newdata = dat[-train, ]))

# оценка точности
acc.test <- sum(diag(tbl))/sum(tbl)
acc.test

## [1] 0.8928571

Полученная модель обладает высокой точностью. Лучшими показателями стали cost=0,1, coef0=0,5. построим ROC-кривые по нашей лучшей модели.

# функция построения ROC-кривой: pred -- прогноз, truth -- факт
rocplot <- function(pred, truth, ...){
  predob = prediction(pred, truth)
  perf = performance(predob, "tpr", "fpr")
  plot(perf,...)}

# последняя оптимальная модель
svmfit.opt <- svm(y ~ ., data = dat[train, ], 
                  kernel = "radial", coef0=0.5, cost = 0.1, decision.values = T)

# количественные модельные значения, на основе которых присваивается класс
fitted <- attributes(predict(svmfit.opt, dat[train, ],
                             decision.values = TRUE))$decision.values

# график для обучающей выборки
par(mfrow = c(1, 2))
rocplot(fitted, dat[train, "y"], main = "Training Data")


svmfit.flex = svm(y ~ ., data = dat[train, ], kernel = "radial", 
                  coef0=0.5, cost = 0.1, decision.values = T)
fitted <- attributes(predict(svmfit.flex, dat[train, ], 
                             decision.values = T))$decision.values
rocplot(fitted, dat[train,"y"], add = T, col = "red")

# график для тестовой выборки
fitted <- attributes(predict(svmfit.opt, dat[-train, ], 
                             decision.values = T))$decision.values
rocplot(fitted, dat[-train, "y"], main = "Test Data")
fitted <- attributes(predict(svmfit.flex, dat[-train, ], 
                             decision.values = T))$decision.values
rocplot(fitted, dat[-train, "y"], add = T, col = "red")

На тестовой и обучающей выборке модели ведут себя практически одинаково. Можно говорить о высоком качестве моделей.

practice9

A.Lukyanova

10 мая 2018 г