print(Sys.time())
## [1] "2020-03-16 12:34:03 CST"
K-nearest neighbour algorithm for classification and regression (wiki); an introductory paper is Altman, Am Stat, 1992, 46:175-85. Not to be confused with k-means clustering.
library(kknn)
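Since kknn is loaded but not otherwise used here, a minimal sketch (the train/test split and k = 5 are my choices, not from the original): fit a 5-nearest-neighbour classifier on 100 random iris rows and tabulate predictions on the remaining 50.
set.seed(42)
idx <- sample(nrow(iris), 100)   # 100 training rows, 50 held out
fit <- kknn(Species ~ ., train = iris[idx, ], test = iris[-idx, ], k = 5)
table(fitted(fit), iris$Species[-idx])   # confusion matrix on held-out rows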
Random forest algorithm (wiki) combines two ideas: Step 1, decision tree learning (wiki); Step 2, bagging (wiki), whose out-of-bag (OOB) samples are used for evaluating error.
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(71)
iris.rf <- randomForest(Species ~ ., data=iris, importance=TRUE,
proximity=TRUE)
print(iris.rf)
##
## Call:
## randomForest(formula = Species ~ ., data = iris, importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.67%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 50 0 0 0.00
## versicolor 0 47 3 0.06
## virginica 0 4 46 0.08
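# The OOB error above is averaged over all 500 trees; a common follow-up
# (a sketch, not part of the original run) is to plot how the error evolves
# as trees are added, which plot() on a randomForest object does directly.
plot(iris.rf, main = "OOB and per-class error vs. number of trees")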
## Look at variable importance:
round(importance(iris.rf), 2)
## setosa versicolor virginica MeanDecreaseAccuracy MeanDecreaseGini
## Sepal.Length 5.88 5.87 9.21 10.62 9.37
## Sepal.Width 5.23 0.31 4.71 4.94 2.45
## Petal.Length 21.60 31.41 27.71 32.39 42.13
## Petal.Width 22.96 33.74 32.07 33.85 45.28
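# The same numbers can be shown graphically with varImpPlot() (standard in
# randomForest; a sketch):
varImpPlot(iris.rf, main = "Variable importance for iris.rf")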
## Do MDS on 1 - proximity:
iris.mds <- cmdscale(1 - iris.rf$proximity, eig=TRUE)
op <- par(pty="s")
pairs(cbind(iris[,1:4], iris.mds$points), cex=0.6, gap=0,
col=c("red", "green", "blue")[as.numeric(iris$Species)],
main="Iris Data: Predictors and MDS of Proximity Based on RandomForest")
par(op)
print(iris.mds$GOF)
## [1] 0.7373483 0.7989356
## The 'unsupervised' case:
set.seed(17)
iris.urf <- randomForest(iris[, -5])
MDSplot(iris.urf, iris$Species)
## Stratified sampling: draw 20, 30, and 20 observations from the three species to grow each tree.
(iris.rf2 <- randomForest(iris[1:4], iris$Species,
sampsize=c(20, 30, 20)))
##
## Call:
## randomForest(x = iris[1:4], y = iris$Species, sampsize = c(20, 30, 20))
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.67%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 50 0 0 0.00
## versicolor 0 47 3 0.06
## virginica 0 4 46 0.08
## Regression:
## data(airquality)
set.seed(131)
ozone.rf <- randomForest(Ozone ~ ., data=airquality, mtry=3,
importance=TRUE, na.action=na.omit)
print(ozone.rf)
##
## Call:
## randomForest(formula = Ozone ~ ., data = airquality, mtry = 3, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 302.2117
## % Var explained: 72.46
## Show "importance" of variables: higher value mean more important:
round(importance(ozone.rf), 2)
## %IncMSE IncNodePurity
## Solar.R 9.76 10741.33
## Wind 22.15 44234.96
## Temp 43.85 53787.56
## Month 2.59 1692.48
## Day 0.93 6606.39
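# Importance says Temp matters most; to see how it acts, partialPlot() (part
# of randomForest) draws the partial dependence of the prediction on one
# variable. A sketch; rows with NAs are dropped to match the na.omit fit:
aq <- na.omit(airquality)
partialPlot(ozone.rf, aq, Temp, main = "Partial dependence of Ozone on Temp")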
## "x" can be a matrix instead of a data frame:
set.seed(17)
x <- matrix(runif(5e2), 100)
y <- gl(2, 50)
(myrf <- randomForest(x, y))
##
## Call:
## randomForest(x = x, y = y)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 52%
## Confusion matrix:
## 1 2 class.error
## 1 25 25 0.50
## 2 27 23 0.54
(predict(myrf, x))
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## Levels: 1 2
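# Note: these are resubstitution predictions, which match y almost perfectly
# even though x and y are unrelated noise; the honest figure is the 52% OOB
# error above. Calling predict(myrf) without newdata returns the OOB
# predictions instead.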
## "complicated" formula:
(swiss.rf <- randomForest(sqrt(Fertility) ~ . - Catholic + I(Catholic < 50),
data=swiss))
##
## Call:
## randomForest(formula = sqrt(Fertility) ~ . - Catholic + I(Catholic < 50), data = swiss)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.3277816
## % Var explained: 44.34
(predict(swiss.rf, swiss))
## Courtelary Delemont Franches-Mnt Moutier Neuveville Porrentruy
## 8.541742 8.992064 9.124838 8.760460 8.541086 8.949770
## Broye Glane Gruyere Sarine Veveyse Aigle
## 8.933005 9.149158 8.908574 8.887482 9.117942 7.922911
## Aubonne Avenches Cossonay Echallens Grandson Lausanne
## 8.333468 8.221398 8.031662 8.306702 8.439164 7.720665
## La Vallee Lavaux Morges Moudon Nyone Orbe
## 7.571783 8.193442 7.984520 8.340828 7.854661 7.872754
## Oron Payerne Paysd'enhaut Rolle Vevey Yverdon
## 8.497338 8.503535 8.411120 8.035106 7.839470 8.330708
## Conthey Entremont Herens Martigwy Monthey St Maurice
## 8.834288 8.654145 8.757652 8.560958 8.882983 8.469376
## Sierre Sion Boudry La Chauxdfnd Le Locle Neuchatel
## 8.928218 8.635219 8.258310 8.028697 8.177889 7.779149
## Val de Ruz ValdeTravers V. De Geneve Rive Droite Rive Gauche
## 8.573018 8.155001 6.776675 7.527191 7.289894
## Test use of a 53-level factor as a predictor:
set.seed(1)
x <- data.frame(x1=gl(53, 10), x2=runif(530), y=rnorm(530))
(rf1 <- randomForest(x[-3], x[[3]], ntree=10))
##
## Call:
## randomForest(x = x[-3], y = x[[3]], ntree = 10)
## Type of random forest: regression
## Number of trees: 10
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 1.599315
## % Var explained: -44.33
## Grow no more than 4 nodes per tree:
(treesize(randomForest(Species ~ ., data=iris, maxnodes=4, ntree=30)))
## [1] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## Test proximity in regression:
iris.rrf <- randomForest(iris[-1], iris[[1]], ntree=101, proximity=TRUE, oob.prox=FALSE)
str(iris.rrf$proximity)
## num [1:150, 1:150] 1 0.386 0.347 0.307 0.881 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:150] "1" "2" "3" "4" ...
## ..$ : chr [1:150] "1" "2" "3" "4" ...
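# One use of the proximity matrix (a sketch, not in the original):
# randomForest::outlier() converts proximities into a per-observation
# outlyingness score; large values flag cases far from their peers.
plot(outlier(iris.rrf$proximity), type = "h", ylab = "outlyingness")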
library(e1071)
data(iris)
attach(iris)
## classification mode
# default with factor response:
model <- svm(Species ~ ., data = iris)
# alternatively the traditional interface:
x <- subset(iris, select = -Species)
y <- Species
model <- svm(x, y)
print(model)
##
## Call:
## svm.default(x = x, y = y)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 51
summary(model)
##
## Call:
## svm.default(x = x, y = y)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 51
##
## ( 8 22 21 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
# test with training data
pred <- predict(model, x)
# (same as:)
pred <- fitted(model)
# Check accuracy:
table(pred, y)
## y
## pred setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
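# The default cost and gamma are rarely optimal. e1071 ships tune.svm() for a
# cross-validated grid search; a sketch with an arbitrary small grid:
set.seed(1)
tuned <- tune.svm(Species ~ ., data = iris, gamma = 2^(-2:1), cost = 2^(0:3))
summary(tuned)   # best gamma/cost and their cross-validation error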
# compute decision values and probabilities:
pred <- predict(model, x, decision.values = TRUE)
attr(pred, "decision.values")[1:4,]
## setosa/versicolor setosa/virginica versicolor/virginica
## 1 1.196152 1.091757 0.6708810
## 2 1.064621 1.056185 0.8483518
## 3 1.180842 1.074542 0.6439798
## 4 1.110699 1.053012 0.6782041
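# The comment above also mentions probabilities, but only decision values are
# shown; class probabilities require probability = TRUE at both fit and
# predict time (a sketch):
pmodel <- svm(x, y, probability = TRUE)
ppred <- predict(pmodel, x, probability = TRUE)
attr(ppred, "probabilities")[1:4, ]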
# visualize (classes by color, SV by crosses):
plot(cmdscale(dist(iris[,-5])),
col = as.integer(iris[,5]),
pch = c("o","+")[1:150 %in% model$index + 1])
## try regression mode on two dimensions
# create data
x <- seq(0.1, 5, by = 0.05)
y <- log(x) + rnorm(x, sd = 0.2)
# estimate model and predict input values
m <- svm(x, y)
new <- predict(m, x)
# visualize
plot(x, y)
points(x, log(x), col = 2)
points(x, new, col = 4)
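# A quick numeric check of the fit (a sketch): root-mean-square error of the
# SVM predictions against the noiseless log curve and against the noisy data.
sqrt(mean((new - log(x))^2))
sqrt(mean((new - y)^2))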
## density-estimation: with no response variable, svm() fits a
## one-classification (novelty-detection) model; predict() returns TRUE for
## points inside the estimated support.
# create 2-dim. normal with rho=0:
X <- data.frame(a = rnorm(1000), b = rnorm(1000))
attach(X)
# traditional way:
m <- svm(X, gamma = 0.1)
# formula interface:
m <- svm(~., data = X, gamma = 0.1)
# or:
m <- svm(~ a + b, gamma = 0.1)
# test:
newdata <- data.frame(a = c(0, 4), b = c(0, 4))
predict(m, newdata)
## 1 2
## TRUE FALSE
# visualize:
plot(X, col = 1:1000 %in% m$index + 1, xlim = c(-5,5), ylim=c(-5,5))
points(newdata, pch = "+", col = 2, cex = 5)
## weights: (example not particularly sensible)
i2 <- iris
levels(i2$Species)[3] <- "versicolor"
summary(i2$Species)
## setosa versicolor
## 50 100
wts <- 100 / table(i2$Species)
wts
##
## setosa versicolor
## 2 1
m <- svm(Species ~ ., data = i2, class.weights = wts)
## extract coefficients for linear kernel
# a. regression
x <- 1:100
y <- x + rnorm(100)
m <- svm(y ~ x, scale = FALSE, kernel = "linear")
coef(m)
## (Intercept) x
## 0.0887966 1.0001559
plot(y ~ x)
abline(m, col = "red")
# b. classification
# transform iris data to binary problem, and scale data
setosa <- as.factor(iris$Species == "setosa")
iris2 <- scale(iris[,-5])
# fit binary C-classification model
m <- svm(setosa ~ Petal.Width + Petal.Length,
data = iris2, kernel = "linear")
# plot data and separating hyperplane
plot(Petal.Length ~ Petal.Width, data = iris2, col = setosa)
(cf <- coef(m))
## (Intercept) Petal.Width Petal.Length
## -1.658953 -1.172523 -1.358030
abline(-cf[1]/cf[3], -cf[2]/cf[3], col = "red")
# plot margin (where the decision function equals +/-1) and mark support vectors
abline(-(cf[1] + 1)/cf[3], -cf[2]/cf[3], col = "blue")
abline(-(cf[1] - 1)/cf[3], -cf[2]/cf[3], col = "blue")
points(m$SV, pch = 5, cex = 2)
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] e1071_1.7-3 randomForest_4.6-14 kknn_1.3.1
## [4] Rcpp_1.0.3
##
## loaded via a namespace (and not attached):
## [1] lattice_0.20-40 class_7.3-15 digest_0.6.25 grid_3.6.2
## [5] magrittr_1.5 evaluate_0.14 rlang_0.4.5 stringi_1.4.6
## [9] Matrix_1.2-18 rmarkdown_2.1 RColorBrewer_1.1-2 tools_3.6.2
## [13] stringr_1.4.0 igraph_1.2.4.2 xfun_0.12 yaml_2.2.1
## [17] compiler_3.6.2 pkgconfig_2.0.3 htmltools_0.4.0 knitr_1.28