Данные: Online_Shopping_for_models.csv
Модели: SVM
Расчистаем основные параметры описательной статистики.
library('GGally') # графики совместного разброса переменных
## Warning: package 'GGally' was built under R version 3.5.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.5.3
library('lmtest') # тесты остатков регрессионных моделей
## Warning: package 'lmtest' was built under R version 3.5.3
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library('FNN') # алгоритм kNN
## Warning: package 'FNN' was built under R version 3.5.3
library('mlbench')
## Warning: package 'mlbench' was built under R version 3.5.3
library('ISLR')
## Warning: package 'ISLR' was built under R version 3.5.3
library('e1071') # SVM
## Warning: package 'e1071' was built under R version 3.5.3
library('ROCR') # ROC-кривые
## Warning: package 'ROCR' was built under R version 3.5.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.5.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library('randomForest') # случайный лес randomForest()
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library('gbm') # бустинг gbm()
## Warning: package 'gbm' was built under R version 3.5.3
## Loaded gbm 2.1.5
library('tree')
## Warning: package 'tree' was built under R version 3.5.3
setwd("D:/Desktop")
DF <- read.table('Online_Shopping_for_models.csv', header = T, # заголовок в первой строке
dec = ',', # разделитель целой и дробной части
sep = ';') # символы пропущенных значений
df <- na.omit(DF)
dim(df)
## [1] 616 18
head(df)
str(df)
## 'data.frame': 616 obs. of 18 variables:
## $ Administrative : int 1 1 2 11 10 0 2 8 0 0 ...
## $ Administrative_Duration: num 28.2 6 51 471.6 237.8 ...
## $ Informational : int 0 0 0 4 2 0 0 1 0 0 ...
## $ Informational_Duration : num 0 0 0 236 23 ...
## $ ProductRelated : int 1 15 25 22 82 4 16 227 3 1 ...
## $ ProductRelated_Duration: num 0 762 699 883 1815 ...
## $ BounceRates : num 0 0 0 0.00645 0.00233 ...
## $ ExitRates : num 0.05 0.01429 0.00873 0.0182 0.01039 ...
## $ PageValues : num 0 0 0 19.4 8.5 ...
## $ SpecialDay : num 0 0 0 0 0 0.8 0 0 0 0 ...
## $ Month : Factor w/ 10 levels "Aug","Dec","Feb",..: 10 2 7 10 7 7 3 9 7 6 ...
## $ OperatingSystems : int 1 8 2 3 3 2 2 2 2 1 ...
## $ Browser : int 1 13 10 2 2 2 2 2 2 1 ...
## $ Region : int 1 9 1 3 3 1 1 1 3 1 ...
## $ TrafficType : int 5 20 4 4 2 13 6 13 1 9 ...
## $ VisitorType : Factor w/ 3 levels "New_Visitor",..: 1 2 3 3 3 3 3 3 3 3 ...
## $ Weekend : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Revenue : logi TRUE FALSE FALSE FALSE FALSE FALSE ...
my.seed <- 12
summary(df)
## Administrative Administrative_Duration Informational
## Min. : 0.000 Min. : 0.00 Min. :0.0000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.:0.0000
## Median : 1.000 Median : 4.00 Median :0.0000
## Mean : 2.239 Mean : 75.32 Mean :0.4627
## 3rd Qu.: 3.000 3rd Qu.: 85.20 3rd Qu.:0.0000
## Max. :24.000 Max. :2047.23 Max. :9.0000
##
## Informational_Duration ProductRelated ProductRelated_Duration
## Min. : 0.0 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 7.75 1st Qu.: 166.2
## Median : 0.0 Median : 16.00 Median : 531.8
## Mean : 26.4 Mean : 31.08 Mean : 1158.1
## 3rd Qu.: 0.0 3rd Qu.: 36.25 3rd Qu.: 1378.3
## Max. :1146.7 Max. :318.00 Max. :13717.4
##
## BounceRates ExitRates PageValues SpecialDay
## Min. :0.000000 Min. :0.00000 Min. : 0.00 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.01518 1st Qu.: 0.00 1st Qu.:0.00000
## Median :0.003689 Median :0.02671 Median : 0.00 Median :0.00000
## Mean :0.024784 Mean :0.04488 Mean : 5.66 Mean :0.05227
## 3rd Qu.:0.020033 3rd Qu.:0.05000 3rd Qu.: 0.00 3rd Qu.:0.00000
## Max. :0.200000 Max. :0.20000 Max. :261.49 Max. :1.00000
##
## Month OperatingSystems Browser Region
## May :171 Min. :1.000 Min. : 1.000 Min. :1.000
## Nov :155 1st Qu.:2.000 1st Qu.: 2.000 1st Qu.:1.000
## Dec : 81 Median :2.000 Median : 2.000 Median :3.000
## Mar : 78 Mean :2.083 Mean : 2.404 Mean :3.094
## Oct : 35 3rd Qu.:2.000 3rd Qu.: 2.000 3rd Qu.:4.000
## Sep : 27 Max. :8.000 Max. :13.000 Max. :9.000
## (Other): 69
## TrafficType VisitorType Weekend Revenue
## Min. : 1.000 New_Visitor : 85 Mode :logical Mode :logical
## 1st Qu.: 2.000 Other : 3 FALSE:487 FALSE:526
## Median : 3.000 Returning_Visitor:528 TRUE :129 TRUE :90
## Mean : 4.244
## 3rd Qu.: 4.000
## Max. :20.000
##
# общее число наблюдений
n <- nrow(df)
# доля обучающей выборки
train.percent <- 0.5
# выбрать наблюдения в обучающую выборку
set.seed(my.seed)
inTrain <- sample(n, n * train.percent)
train <- sample(n, n * train.percent)
Построим дерево для категориального отклика Revenue.
Revenue <- df$Revenue
tree.shopping <- tree(Revenue ~ ., df, subset = train)
summary(tree.shopping)
##
## Regression tree:
## tree(formula = Revenue ~ ., data = df, subset = train)
## Variables actually used in tree construction:
## [1] "PageValues" "Administrative_Duration"
## [3] "Month" "Informational_Duration"
## [5] "VisitorType" "Administrative"
## [7] "TrafficType" "Informational"
## [9] "ProductRelated_Duration"
## Number of terminal nodes: 14
## Residual mean deviance: 0.05338 = 15.69 / 294
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.87500 -0.03226 0.00000 0.00000 0.00000 0.96770
# визуализация
plot(tree.shopping)
text(tree.shopping, pretty = 0)
yhat <- predict(tree.shopping, newdata = df[-train, ])
df.test <- df[-train, "Revenue"]
Рассмотрим более сложные методы улучшения качества дерева. Бэггинг – частный случай случайного леса с \(m = p\), поэтому и то, и другое можно построить функцией randomForest().
Для начала используем бэггинг, причём возьмём все 17 предикторов на каждом шаге (аргумент mtry).
# бэггинг с 14 предикторами
df.test <- df[-train,]
set.seed(my.seed)
bag.df <- randomForest(Revenue ~ ., data = df, subset = train,
mtry = 17, importance = TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
bag.df
##
## Call:
## randomForest(formula = Revenue ~ ., data = df, mtry = 17, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 17
##
## Mean of squared residuals: 0.1015887
## % Var explained: 20.04
# прогноз
yhat.bag = predict(bag.df, newdata = df[-train, ])
yhat.bag
## 2 5 6 10 11
## 3.913333e-02 3.538333e-01 -8.365530e-17 2.000000e-03 6.266667e-03
## 12 16 17 18 20
## 6.500000e-03 5.181333e-01 8.876667e-02 -7.072121e-17 7.810667e-01
## 25 26 27 29 30
## 1.853000e-01 -9.936496e-17 4.182333e-01 6.356000e-01 4.765667e-01
## 31 32 33 34 38
## -8.082424e-17 2.903000e-01 1.669333e-01 1.883333e-02 -1.009193e-16
## 39 41 45 47 54
## 2.374333e-01 4.976333e-01 5.060000e-02 6.600000e-03 -8.226753e-17
## 60 63 64 68 71
## 1.420000e-02 5.166667e-03 8.600000e-03 8.286667e-02 2.000000e-03
## 72 73 75 77 79
## 1.274333e-01 1.900000e-02 1.559667e-01 1.306667e-02 7.333333e-03
## 80 82 83 84 85
## 2.465667e-01 6.666667e-03 3.432667e-01 1.014667e-01 2.000000e-03
## 87 90 91 92 93
## 2.000000e-03 5.402000e-01 1.680000e-02 8.220333e-01 3.736667e-02
## 95 96 104 106 107
## 3.800000e-02 2.233333e-03 -9.903189e-17 1.083333e-02 2.861333e-01
## 109 111 115 117 119
## 4.000000e-04 -6.966649e-17 4.000000e-03 4.400000e-03 1.717000e-01
## 120 121 125 132 134
## 3.210000e-02 7.436333e-01 1.600000e-02 1.076667e-01 3.463333e-02
## 135 136 137 139 142
## 1.307333e-01 6.000000e-03 8.846667e-02 1.844667e-01 3.538333e-01
## 143 144 146 150 154
## 4.000000e-03 3.800000e-03 -8.065770e-17 -1.003087e-16 2.666667e-02
## 159 160 164 167 173
## 2.590667e-01 1.061333e-01 2.000000e-03 3.213333e-02 2.000000e-03
## 174 177 179 180 181
## -9.364731e-17 1.800000e-02 5.660000e-01 -1.031952e-16 4.990000e-02
## 189 190 193 194 196
## 8.436667e-02 2.843333e-02 6.147667e-01 -1.046385e-16 3.473333e-02
## 198 199 201 202 206
## 1.433667e-01 4.713333e-02 1.466667e-02 -6.522560e-17 -1.025846e-16
## 214 215 216 218 219
## 3.316333e-01 6.066667e-02 4.734667e-01 8.140000e-02 2.000000e-03
## 222 225 228 229 230
## -1.033618e-16 4.682333e-01 3.863333e-02 6.000000e-03 4.456333e-01
## 231 233 234 235 237
## 5.218667e-01 -1.087463e-16 2.272667e-01 4.666667e-03 5.246667e-02
## 238 239 240 242 243
## 2.386667e-02 7.015667e-01 8.170000e-02 6.136667e-02 1.347000e-01
## 244 246 248 252 253
## 8.506667e-02 8.132000e-01 9.790000e-02 -8.321122e-17 2.000000e-03
## 256 258 260 261 264
## 2.320000e-02 3.850000e-02 9.355000e-01 8.666667e-03 -1.000866e-16
## 265 267 268 270 273
## 4.514667e-01 1.083333e-01 4.000000e-04 -8.287815e-17 2.800000e-03
## 274 276 277 278 280
## 4.733333e-03 1.000000e-03 2.236667e-01 -6.783463e-17 -7.838175e-17
## 281 283 284 288 290
## 2.106667e-01 8.533333e-03 8.587000e-01 5.753333e-02 2.000000e-03
## 291 292 293 294 295
## 2.366667e-02 2.400000e-03 -6.017409e-17 6.003333e-02 1.943333e-02
## 297 299 300 303 304
## 1.680000e-02 1.346667e-02 2.054667e-01 -9.309220e-17 3.066667e-02
## 305 310 312 313 314
## 6.400000e-03 6.946667e-01 7.229667e-01 2.596667e-02 2.666667e-03
## 315 318 319 320 321
## 1.123333e-01 -8.404388e-17 2.000000e-03 6.066667e-03 1.240000e-02
## 326 328 329 332 333
## -8.326673e-17 -1.019740e-16 5.410000e-02 9.200000e-03 7.300000e-02
## 334 338 339 340 341
## 1.484333e-01 1.276667e-01 5.200000e-03 6.900000e-02 9.346667e-02
## 343 347 348 350 353
## 1.237333e-01 2.017333e-01 3.283000e-01 6.266667e-03 6.213333e-02
## 354 356 357 361 362
## 1.100000e-01 3.963667e-01 9.066667e-03 -8.154588e-17 -8.465451e-17
## 363 364 365 366 368
## 4.884000e-01 -1.063594e-16 1.433667e-01 2.849000e-01 3.866333e-01
## 371 372 374 375 377
## 1.326667e-02 9.507667e-01 1.000000e-02 7.333333e-03 2.530000e-02
## 378 379 381 384 385
## 3.048000e-01 1.280333e-01 6.018333e-01 2.896667e-01 -8.237855e-17
## 390 393 397 399 400
## 4.306667e-02 1.328333e-01 7.833333e-03 6.996667e-02 6.273333e-02
## 401 402 405 407 410
## 6.602333e-01 6.698667e-01 1.934667e-01 -7.954748e-17 -9.753309e-17
## 414 416 418 419 420
## 2.464000e-01 -8.232304e-17 -8.532064e-17 4.448000e-01 1.945667e-01
## 421 424 426 427 428
## -6.694645e-17 1.108333e-01 -9.497958e-17 6.710667e-01 1.114667e-01
## 429 430 433 437 438
## -6.172840e-17 7.246000e-01 3.800000e-03 -8.382184e-17 7.756667e-02
## 439 440 442 444 445
## 4.406667e-02 1.577667e-01 1.741000e-01 2.666667e-03 1.333333e-03
## 447 448 450 454 455
## 4.960000e-02 5.000000e-04 2.833333e-03 2.307667e-01 1.439333e-01
## 457 459 460 461 462
## 5.823333e-01 1.786667e-02 1.406667e-02 2.800000e-03 -8.287815e-17
## 463 465 466 467 468
## 2.840333e-01 5.776667e-02 6.360000e-02 3.950000e-02 8.056667e-02
## 475 476 477 478 484
## -1.045830e-16 9.266667e-03 2.900000e-03 8.000000e-04 1.066667e-01
## 487 488 492 493 495
## 1.155000e-01 -8.371082e-17 2.408333e-01 3.643333e-02 1.072333e-01
## 496 497 498 499 500
## 7.371000e-01 -8.931744e-17 3.986667e-02 6.046000e-01 4.000000e-04
## 502 503 516 517 518
## 6.696667e-01 1.966333e-01 5.345667e-01 -6.894485e-17 3.936667e-02
## 522 523 525 527 528
## -8.493206e-17 -7.044365e-17 1.418667e-01 2.505667e-01 -7.949197e-17
## 529 531 532 533 536
## 3.312333e-01 1.756667e-02 -7.921441e-17 8.250000e-02 -8.204548e-17
## 537 538 540 542 544
## 6.900000e-03 2.000000e-02 1.409667e-01 1.465667e-01 4.000000e-03
## 545 546 547 548 550
## 2.000000e-03 1.116000e-01 -1.011968e-16 4.400000e-03 5.123333e-02
## 551 553 555 556 558
## 1.570667e-01 4.530000e-02 2.000000e-03 4.430667e-01 4.666667e-03
## 559 562 563 564 565
## 1.000000e-03 5.290000e-02 5.213333e-02 7.176667e-02 1.200000e-03
## 569 571 572 573 577
## 5.191667e-01 6.979667e-01 4.100000e-03 5.940000e-02 9.333333e-03
## 578 580 582 586 588
## 5.196667e-02 4.513333e-02 1.745333e-01 4.689000e-01 4.000000e-04
## 590 591 593 595 596
## 9.000000e-03 2.000000e-03 8.000000e-03 -9.864332e-17 3.613333e-02
## 598 601 604 606 610
## 3.780000e-02 2.400000e-03 1.331667e-01 4.401000e-01 8.671333e-01
## 613 615 616
## 5.730333e-01 3.170000e-02 2.508000e-01
Можно изменить число деревьев с помощью аргумента ntree.
# бэггинг с 13 предикторами и 25 деревьями
bag.df <- randomForest(Revenue ~ ., data = df, subset = train,
mtry = 16, ntree = 25)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
# прогноз
yhat.bag <- predict(bag.df, newdata = df[-train, ])
yhat.bag
## 2 5 6 10 11
## 8.000000e-02 2.780000e-01 -7.549517e-17 4.000000e-02 -9.769963e-17
## 12 16 17 18 20
## 2.400000e-02 4.366667e-01 1.400000e-01 8.000000e-03 6.033333e-01
## 25 26 27 29 30
## 2.220000e-01 -9.436896e-17 3.726667e-01 5.660000e-01 2.960000e-01
## 31 32 33 34 38
## -7.771561e-17 3.933333e-01 1.800000e-01 6.400000e-02 -9.769963e-17
## 39 41 45 47 54
## 3.280000e-01 4.900000e-01 1.180000e-01 -9.769963e-17 -7.771561e-17
## 60 63 64 68 71
## -4.551914e-17 -4.884981e-17 -5.884182e-17 2.000000e-02 -8.992806e-17
## 72 73 75 77 79
## 1.466667e-01 9.600000e-02 9.000000e-02 4.000000e-02 -4.662937e-17
## 80 82 83 84 85
## 1.520000e-01 -6.994405e-17 4.933333e-01 8.800000e-02 -6.550316e-17
## 87 90 91 92 93
## -7.438494e-17 6.186667e-01 3.200000e-02 7.000000e-01 1.440000e-01
## 95 96 104 106 107
## 1.100000e-01 -1.065814e-16 -1.088019e-16 1.000000e-02 3.213333e-01
## 109 111 115 117 119
## 3.200000e-02 -6.772360e-17 -3.885781e-17 3.200000e-02 1.346667e-01
## 120 121 125 132 134
## 8.000000e-03 7.220000e-01 -3.441691e-17 1.253333e-01 1.000000e-01
## 135 136 137 139 142
## 2.100000e-01 -9.325873e-17 9.666667e-02 2.080000e-01 4.000000e-01
## 143 144 146 150 154
## -4.662937e-17 4.000000e-02 -7.327472e-17 -8.548717e-17 3.200000e-02
## 159 160 164 167 173
## 1.800000e-01 1.000000e-01 -7.549517e-17 4.000000e-02 -3.108624e-17
## 174 177 179 180 181
## -8.437695e-17 4.000000e-02 5.966667e-01 -8.215650e-17 2.400000e-02
## 189 190 193 194 196
## 1.706667e-01 3.466667e-02 7.413333e-01 -8.659740e-17 -6.439294e-17
## 198 199 201 202 206
## 1.800000e-01 5.000000e-02 4.000000e-02 -5.440093e-17 -1.121325e-16
## 214 215 216 218 219
## 3.473333e-01 6.666667e-02 4.126667e-01 8.000000e-03 -7.771561e-17
## 222 225 228 229 230
## -6.772360e-17 5.200000e-01 8.800000e-02 -3.663736e-17 4.640000e-01
## 231 233 234 235 237
## 4.000000e-01 -9.103829e-17 3.000000e-01 -6.883383e-17 1.200000e-01
## 238 239 240 242 243
## 6.666667e-02 8.120000e-01 1.000000e-01 1.100000e-01 8.600000e-02
## 244 246 248 252 253
## 6.400000e-02 6.813333e-01 9.000000e-02 -7.549517e-17 -6.661338e-17
## 256 258 260 261 264
## 8.800000e-02 2.000000e-02 9.400000e-01 -5.551115e-17 -9.325873e-17
## 265 267 268 270 273
## 1.926667e-01 5.000000e-02 -8.881784e-17 -9.436896e-17 -6.994405e-17
## 274 276 277 278 280
## -3.774758e-17 -6.772360e-17 3.400000e-01 4.000000e-02 -7.438494e-17
## 281 283 284 288 290
## 1.680000e-01 -6.439294e-17 7.400000e-01 4.000000e-02 -6.661338e-17
## 291 292 293 294 295
## -5.884182e-17 -2.886580e-17 -6.772360e-17 6.333333e-02 -6.328271e-17
## 297 299 300 303 304
## -5.329071e-17 4.000000e-02 3.166667e-01 -8.659740e-17 4.000000e-02
## 305 310 312 313 314
## -7.882583e-17 6.766667e-01 6.400000e-01 1.000000e-02 -6.661338e-17
## 315 318 319 320 321
## 4.000000e-02 -8.881784e-17 -3.885781e-17 -6.217249e-17 -8.770762e-17
## 326 328 329 332 333
## -4.662937e-17 -1.032507e-16 9.000000e-02 -7.993606e-17 2.666667e-02
## 334 338 339 340 341
## 1.240000e-01 1.800000e-01 -3.885781e-17 6.400000e-02 1.240000e-01
## 343 347 348 350 353
## 2.426667e-01 2.040000e-01 4.760000e-01 -5.662137e-17 1.000000e-02
## 354 356 357 361 362
## 4.800000e-02 3.713333e-01 -6.217249e-17 -7.549517e-17 -7.771561e-17
## 363 364 365 366 368
## 5.173333e-01 -9.658940e-17 1.200000e-01 2.160000e-01 4.180000e-01
## 371 372 374 375 377
## 4.000000e-02 9.866667e-01 -5.884182e-17 -6.661338e-17 6.400000e-02
## 378 379 381 384 385
## 4.000000e-01 2.166667e-01 5.580000e-01 3.200000e-01 4.000000e-02
## 390 393 397 399 400
## 8.000000e-02 6.000000e-02 -5.773160e-17 4.000000e-02 -4.662937e-17
## 401 402 405 407 410
## 7.280000e-01 5.826667e-01 3.380000e-01 -7.438494e-17 -7.327472e-17
## 414 416 418 419 420
## 7.800000e-02 -7.771561e-17 -6.772360e-17 3.400000e-01 2.440000e-01
## 421 424 426 427 428
## 4.000000e-02 1.346667e-01 -6.661338e-17 4.900000e-01 7.200000e-02
## 429 430 433 437 438
## -5.218048e-17 7.646667e-01 4.000000e-02 -8.881784e-17 6.000000e-02
## 439 440 442 444 445
## 7.800000e-02 7.333333e-02 1.466667e-01 -3.663736e-17 -9.992007e-17
## 447 448 450 454 455
## 9.333333e-02 -4.551914e-17 -6.994405e-17 1.500000e-01 1.866667e-01
## 457 459 460 461 462
## 5.520000e-01 7.000000e-02 -6.439294e-17 2.000000e-02 -5.440093e-17
## 463 465 466 467 468
## 2.866667e-01 8.000000e-02 -3.441691e-17 4.000000e-02 9.600000e-02
## 475 476 477 478 484
## -7.438494e-17 -8.104628e-17 -8.992806e-17 -1.110223e-16 -4.551914e-17
## 487 488 492 493 495
## 1.240000e-01 -7.771561e-17 1.360000e-01 7.000000e-02 5.000000e-02
## 496 497 498 499 500
## 6.533333e-01 -9.658940e-17 6.400000e-02 5.706667e-01 -7.438494e-17
## 502 503 516 517 518
## 8.200000e-01 3.620000e-01 5.160000e-01 -5.329071e-17 5.000000e-02
## 522 523 525 527 528
## -6.661338e-17 -6.217249e-17 1.020000e-01 1.980000e-01 -8.881784e-17
## 529 531 532 533 536
## 2.800000e-01 -5.440093e-17 -5.995204e-17 5.000000e-02 -7.771561e-17
## 537 538 540 542 544
## 4.000000e-02 -6.106227e-17 7.000000e-02 1.340000e-01 -7.882583e-17
## 545 546 547 548 550
## -7.327472e-17 1.540000e-01 -9.547918e-17 4.000000e-02 5.000000e-02
## 551 553 555 556 558
## 1.153333e-01 -4.551914e-17 -7.660539e-17 5.640000e-01 -3.663736e-17
## 559 562 563 564 565
## -7.882583e-17 8.400000e-02 9.333333e-02 1.013333e-01 8.000000e-03
## 569 571 572 573 577
## 3.786667e-01 7.146667e-01 -6.550316e-17 1.200000e-01 -7.438494e-17
## 578 580 582 586 588
## 1.066667e-01 6.000000e-02 1.653333e-01 5.000000e-01 -5.551115e-17
## 590 591 593 595 596
## -8.548717e-17 -1.032507e-16 -4.440892e-17 -9.325873e-17 7.200000e-02
## 598 601 604 606 610
## 2.666667e-02 -1.032507e-16 4.000000e-02 5.113333e-01 7.800000e-01
## 613 615 616
## 5.306667e-01 7.200000e-02 2.900000e-01
Теперь попробуем вырастить случайный лес. Берём 6 предикторов на каждом шаге.
# обучаем модель
set.seed(my.seed)
rf.df <- randomForest(Revenue ~ ., data = df, subset = train,
mtry = 6, importance = TRUE)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
# важность предикторов
importance(rf.df) # оценки
## %IncMSE IncNodePurity
## Administrative 7.01243560 2.2514137
## Administrative_Duration 15.20309178 3.8685014
## Informational 4.21484408 0.8031463
## Informational_Duration 2.63838479 1.2950842
## ProductRelated 3.01028245 2.5711900
## ProductRelated_Duration 3.16214100 3.0978554
## BounceRates 0.84008803 1.4787549
## ExitRates 3.63764484 3.0654662
## PageValues 31.15494004 8.3153534
## SpecialDay -0.18903096 0.1761869
## Month 1.28034160 3.5383491
## OperatingSystems -0.02029111 0.5559958
## Browser -1.40030795 0.4402597
## Region -2.29699316 1.0030740
## TrafficType 0.93838479 1.5915002
## VisitorType 6.88040137 0.9903911
## Weekend 0.80777991 0.4295586
varImpPlot(rf.df) # графики
По полученным данным можно сделать вывод о том, что наибольшее влияние в модели оказывают такие показатели, как PageValues и Administrative_Duration.
Построим 5000 регрессионных деревьев с глубиной 4.
library(gbm)
df$Weekend <- as.factor(df$Weekend)
set.seed(my.seed)
boost.df <- gbm(Revenue ~ ., data = df[train, ], distribution = "gaussian",
n.trees = 5000, interaction.depth = 4)
# график и таблица относительной важности переменных
summary(boost.df)
Теперь построим графики частной зависимости для двух наиболее важных предикторов: PageValues и Month.
par(mfrow = c(1, 2))
plot(boost.df, i = "PageValues")
plot(boost.df, i = "Month")
Построим полученные модели, по лучшей сделаем прогноз на прогнозных данных, обучим модель SVM с различными формами ядерной функции и построим ROC-кривые.
Модель 1: \(\hat{Revenue} = \hat{\beta}_0 + \hat{\beta}_1 \cdot PageValues+\hat{\beta}_2 \cdot Administrative_Duration\).
# присоединить таблицу с данными: названия стоблцов будут доступны напрямую
attach(df)
## The following object is masked _by_ .GlobalEnv:
##
## Revenue
# подгонка линейной модели на обучающей выборке
fit.lm <- lm(Revenue ~ PageValues + Administrative_Duration, subset = inTrain)
# считаем MSE на тестовой выборке
mean((df$Revenue[-inTrain] - predict(fit.lm,
df[-inTrain, ]))^2)
## [1] 0.09654843
# отсоединить таблицу с данными
detach(df)
Модель 2: \(\hat{Revenue} = \hat{\beta}_0 + \hat{\beta}_1 \cdot PageValues+\hat{\beta}_2 \cdot Month\).
# присоединить таблицу с данными: названия стоблцов будут доступны напрямую
attach(df)
## The following object is masked _by_ .GlobalEnv:
##
## Revenue
# подгонка линейной модели на обучающей выборке
fit.lm <- lm(Revenue ~ PageValues + Month, subset = inTrain)
# считаем MSE на тестовой выборке
mean((df$Revenue[-inTrain] - predict(fit.lm,
df[-inTrain, ]))^2)
## [1] 0.09768727
# отсоединить таблицу с данными
detach(df)
Оба значения MSE оказалась минимальными, с незначительной разницей значение первой модели оказалось меньше, будем использовать эту модель для дальнейшей работы.
# таблица с данными, отклик — фактор
PageValues <- df$PageValues
Administrative_Duration <- df$Administrative_Duration
Revenue <- df$Revenue
Revenue <- as.factor(Revenue)
dat <- data.frame(PageValues, Administrative_Duration, Revenue)
# обучающая выборка
train <- sample(1:nrow(dat), nrow(dat)/2)
# SVM с радиальным ядром и маленьким cost
svmfit <- svm(Revenue ~ ., data = dat[train, ], kernel = "radial",
gamma = 1, cost = 1)
plot(svmfit, dat[train, ])
summary(svmfit)
##
## Call:
## svm(formula = Revenue ~ ., data = dat[train, ], kernel = "radial",
## gamma = 1, cost = 1)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 1
##
## Number of Support Vectors: 77
##
## ( 40 37 )
##
##
## Number of Classes: 2
##
## Levels:
## FALSE TRUE
# SVM с радиальным ядром и большим cost
svmfit <- svm(Revenue ~ ., data = dat[train, ], kernel = "radial",
gamma = 1, cost = 1e5)
plot(svmfit, dat[train, ])
summary(svmfit)
##
## Call:
## svm(formula = Revenue ~ ., data = dat[train, ], kernel = "radial",
## gamma = 1, cost = 1e+05)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1e+05
## gamma: 1
##
## Number of Support Vectors: 64
##
## ( 39 25 )
##
##
## Number of Classes: 2
##
## Levels:
## FALSE TRUE
Перекрестная проверка.
# перекрёстная проверка
set.seed(my.seed)
tune.out <- tune(svm, Revenue ~ ., data = dat[train, ], kernel = "radial",
ranges = list(cost = c(0.1, 1, 10, 5),
gamma = c(0.5, 0.1,0.05,1, 2, 3)))
summary(tune.out)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 5 0.1
##
## - best performance: 0.09387097
##
## - Detailed performance results:
## cost gamma error dispersion
## 1 0.1 0.50 0.12333333 0.05640363
## 2 1.0 0.50 0.10354839 0.06910047
## 3 10.0 0.50 0.11354839 0.05933648
## 4 5.0 0.50 0.10688172 0.06059829
## 5 0.1 0.10 0.12333333 0.05640363
## 6 1.0 0.10 0.10043011 0.05547155
## 7 10.0 0.10 0.09720430 0.06057920
## 8 5.0 0.10 0.09387097 0.06330210
## 9 0.1 0.05 0.12333333 0.05640363
## 10 1.0 0.05 0.10376344 0.05420429
## 11 10.0 0.05 0.09720430 0.06057920
## 12 5.0 0.05 0.10043011 0.05547155
## 13 0.1 1.00 0.12333333 0.05640363
## 14 1.0 1.00 0.11010753 0.06085848
## 15 10.0 1.00 0.11666667 0.06277059
## 16 5.0 1.00 0.12000000 0.06267218
## 17 0.1 2.00 0.12333333 0.05640363
## 18 1.0 2.00 0.11666667 0.06635231
## 19 10.0 2.00 0.11989247 0.06246285
## 20 5.0 2.00 0.11666667 0.05897174
## 21 0.1 3.00 0.12333333 0.05640363
## 22 1.0 3.00 0.11989247 0.06246285
## 23 10.0 3.00 0.11021505 0.04826149
## 24 5.0 3.00 0.11666667 0.06090079
Построим матрицу неточностей для прогноза по лучшей модели на прогнозных данных и рассчитаем MSE.
setwd("D:/Desktop")
DF1 <- read.table('Online_Shopping_for_forecast.csv', header = T, # заголовок в первой строке
dec = ',', # разделитель целой и дробной части
sep = ';') # символы пропущенных значений
df1 <- na.omit(DF1)
Revenue <- c("TRUE", "FALSE")
df1 <- data.frame(df1, Revenue)
Revenue <- as.factor(Revenue)
Revenue <- df1$Revenue
Revenue <- Revenue[1:30]
PageValues <- df1$PageValues
PageValues <- PageValues[1:30]
Administrative_Duration <- df1$Administrative_Duration
Administrative_Duration <- Administrative_Duration[1:30]
n <- nrow(df1)
# доля обучающей выборки
train.percent <- 0.5
# выбрать наблюдения в обучающую выборку
set.seed(my.seed)
train <- sample(n, n * train.percent)
dat1 <- data.frame(PageValues, Administrative_Duration, Revenue)
#матрица неточностей для прогноза по лучшей модели
matrix <- table(true = dat1[-train, "Revenue"],
pred = predict(tune.out$best.model, newdata = dat1[-train, ]))
bestmod <- tune.out$best.model
summary(bestmod)
##
## Call:
## best.tune(method = svm, train.x = Revenue ~ ., data = dat[train,
## ], ranges = list(cost = c(0.1, 1, 10, 5), gamma = c(0.5,
## 0.1, 0.05, 1, 2, 3)), kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 5
## gamma: 0.1
##
## Number of Support Vectors: 71
##
## ( 36 35 )
##
##
## Number of Classes: 2
##
## Levels:
## FALSE TRUE
#MSE
sum(diag(matrix))/sum(matrix)
## [1] 0.4210526
MSE по лучшей модели на прогнозных данных составило- 0.4210526, значение оказалось очень маленьким, а значит предикторы в модели выбраны верно.