Statistical Learning

print(Sys.time())
## [1] "2020-03-16 12:34:03 CST"

Linear regression
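
A minimal sketch with base R's lm(); the mtcars data and the model below are chosen purely for illustration.

# fit miles per gallon on weight and horsepower (illustrative model)
fit <- lm(mpg ~ wt + hp, data = mtcars)
summary(fit)                      # coefficients, R-squared, residual SE
confint(fit)                      # 95% confidence intervals
par(mfrow = c(2, 2)); plot(fit)   # standard diagnostic plots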

General linear regression
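
Assuming this refers to generalized linear models fitted with glm(); the families and datasets below are illustrative.

# gaussian family with identity link reproduces ordinary least squares
fit.gauss <- glm(mpg ~ wt + hp, family = gaussian(), data = mtcars)
# Poisson regression for a count response, using the built-in warpbreaks data
fit.pois <- glm(breaks ~ wool + tension, family = poisson(), data = warpbreaks)
summary(fit.pois)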

Logistic regression
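
A minimal sketch with glm(family = binomial); the binary response vs from mtcars is used only as an illustration.

# model the probability of a straight engine (vs = 1) from weight and displacement
fit.logit <- glm(vs ~ wt + disp, family = binomial(), data = mtcars)
summary(fit.logit)
exp(coef(fit.logit))                          # odds ratios
head(predict(fit.logit, type = "response"))   # fitted probabilities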

KNN

The k-nearest neighbours algorithm for classification and regression (wiki); an introductory paper is Altman, Am Stat, 1992, 46:175-85. Not to be confused with k-means clustering.

library(kknn)
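
The chunk above only loads the package; a minimal sketch with kknn() on a random train/validation split of iris (split size, k, and kernel are illustrative choices) could be:

data(iris)
set.seed(1)
val <- sample(nrow(iris), size = round(nrow(iris) / 3))   # hold out one third
iris.learn <- iris[-val, ]
iris.valid <- iris[val, ]
# weighted k-nearest-neighbour classification with a triangular kernel
iris.kknn <- kknn(Species ~ ., iris.learn, iris.valid,
                  k = 7, distance = 1, kernel = "triangular")
table(iris.valid$Species, fitted(iris.kknn))              # hold-out confusion matrix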

Random Forest

Algorithm (wiki). Step 1: decision tree learning (wiki), growing many trees on bootstrap samples of the data.

Step 2: bagging (wiki) to aggregate the trees, with the out-of-bag samples used to estimate the prediction error.

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(71)
iris.rf <- randomForest(Species ~ ., data=iris, importance=TRUE,
                        proximity=TRUE)
print(iris.rf)
## 
## Call:
##  randomForest(formula = Species ~ ., data = iris, importance = TRUE,      proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.67%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         50          0         0        0.00
## versicolor      0         47         3        0.06
## virginica       0          4        46        0.08
## Look at variable importance:
round(importance(iris.rf), 2)
##              setosa versicolor virginica MeanDecreaseAccuracy MeanDecreaseGini
## Sepal.Length   5.88       5.87      9.21                10.62             9.37
## Sepal.Width    5.23       0.31      4.71                 4.94             2.45
## Petal.Length  21.60      31.41     27.71                32.39            42.13
## Petal.Width   22.96      33.74     32.07                33.85            45.28
## Do MDS on 1 - proximity:
iris.mds <- cmdscale(1 - iris.rf$proximity, eig=TRUE)
op <- par(pty="s")
pairs(cbind(iris[,1:4], iris.mds$points), cex=0.6, gap=0,
      col=c("red", "green", "blue")[as.numeric(iris$Species)],
      main="Iris Data: Predictors and MDS of Proximity Based on RandomForest")

par(op)
print(iris.mds$GOF)
## [1] 0.7373483 0.7989356
## The `unsupervised' case:
set.seed(17)
iris.urf <- randomForest(iris[, -5])
MDSplot(iris.urf, iris$Species)

## stratified sampling: draw 20, 30, and 20 of the species to grow each tree.
(iris.rf2 <- randomForest(iris[1:4], iris$Species, 
                          sampsize=c(20, 30, 20)))
## 
## Call:
##  randomForest(x = iris[1:4], y = iris$Species, sampsize = c(20,      30, 20)) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 4.67%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         50          0         0        0.00
## versicolor      0         47         3        0.06
## virginica       0          4        46        0.08
## Regression:
## data(airquality)
set.seed(131)
ozone.rf <- randomForest(Ozone ~ ., data=airquality, mtry=3,
                         importance=TRUE, na.action=na.omit)
print(ozone.rf)
## 
## Call:
##  randomForest(formula = Ozone ~ ., data = airquality, mtry = 3,      importance = TRUE, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 302.2117
##                     % Var explained: 72.46
## Show "importance" of variables: higher value mean more important:
round(importance(ozone.rf), 2)
##         %IncMSE IncNodePurity
## Solar.R    9.76      10741.33
## Wind      22.15      44234.96
## Temp      43.85      53787.56
## Month      2.59       1692.48
## Day        0.93       6606.39
## "x" can be a matrix instead of a data frame:
set.seed(17)
x <- matrix(runif(5e2), 100)
y <- gl(2, 50)
(myrf <- randomForest(x, y))
## 
## Call:
##  randomForest(x = x, y = y) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 52%
## Confusion matrix:
##    1  2 class.error
## 1 25 25        0.50
## 2 27 23        0.54
(predict(myrf, x))
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   1   1   1   1   1   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## Levels: 1 2
## "complicated" formula:
(swiss.rf <- randomForest(sqrt(Fertility) ~ . - Catholic + I(Catholic < 50),
                          data=swiss))
## 
## Call:
##  randomForest(formula = sqrt(Fertility) ~ . - Catholic + I(Catholic <      50), data = swiss) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.3277816
##                     % Var explained: 44.34
(predict(swiss.rf, swiss))
##   Courtelary     Delemont Franches-Mnt      Moutier   Neuveville   Porrentruy 
##     8.541742     8.992064     9.124838     8.760460     8.541086     8.949770 
##        Broye        Glane      Gruyere       Sarine      Veveyse        Aigle 
##     8.933005     9.149158     8.908574     8.887482     9.117942     7.922911 
##      Aubonne     Avenches     Cossonay    Echallens     Grandson     Lausanne 
##     8.333468     8.221398     8.031662     8.306702     8.439164     7.720665 
##    La Vallee       Lavaux       Morges       Moudon        Nyone         Orbe 
##     7.571783     8.193442     7.984520     8.340828     7.854661     7.872754 
##         Oron      Payerne Paysd'enhaut        Rolle        Vevey      Yverdon 
##     8.497338     8.503535     8.411120     8.035106     7.839470     8.330708 
##      Conthey    Entremont       Herens     Martigwy      Monthey   St Maurice 
##     8.834288     8.654145     8.757652     8.560958     8.882983     8.469376 
##       Sierre         Sion       Boudry La Chauxdfnd     Le Locle    Neuchatel 
##     8.928218     8.635219     8.258310     8.028697     8.177889     7.779149 
##   Val de Ruz ValdeTravers V. De Geneve  Rive Droite  Rive Gauche 
##     8.573018     8.155001     6.776675     7.527191     7.289894
## Test use of a 53-level factor as a predictor:
set.seed(1)
x <- data.frame(x1=gl(53, 10), x2=runif(530), y=rnorm(530))
(rf1 <- randomForest(x[-3], x[[3]], ntree=10))
## 
## Call:
##  randomForest(x = x[-3], y = x[[3]], ntree = 10) 
##                Type of random forest: regression
##                      Number of trees: 10
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 1.599315
##                     % Var explained: -44.33
## Grow no more than 4 nodes per tree:
(treesize(randomForest(Species ~ ., data=iris, maxnodes=4, ntree=30)))
##  [1] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## test proximity in regression
iris.rrf <- randomForest(iris[-1], iris[[1]], ntree=101, proximity=TRUE, oob.prox=FALSE)
str(iris.rrf$proximity)
##  num [1:150, 1:150] 1 0.386 0.347 0.307 0.881 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:150] "1" "2" "3" "4" ...
##   ..$ : chr [1:150] "1" "2" "3" "4" ...

SVM

Support vector machines (wiki); see also an introduction to SVM in R (intro).

library(e1071)
data(iris)
attach(iris)

## classification mode
# default with factor response:
model <- svm(Species ~ ., data = iris)

# alternatively the traditional interface:
x <- subset(iris, select = -Species)
y <- Species
model <- svm(x, y) 

print(model)
## 
## Call:
## svm.default(x = x, y = y)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  51
summary(model)
## 
## Call:
## svm.default(x = x, y = y)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  51
## 
##  ( 8 22 21 )
## 
## 
## Number of Classes:  3 
## 
## Levels: 
##  setosa versicolor virginica
# test with train data
pred <- predict(model, x)
# (same as:)
pred <- fitted(model)

# Check accuracy:
table(pred, y)
##             y
## pred         setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48
# compute decision values and probabilities:
pred <- predict(model, x, decision.values = TRUE)
attr(pred, "decision.values")[1:4,]
##   setosa/versicolor setosa/virginica versicolor/virginica
## 1          1.196152         1.091757            0.6708810
## 2          1.064621         1.056185            0.8483518
## 3          1.180842         1.074542            0.6439798
## 4          1.110699         1.053012            0.6782041
# visualize (classes by color, SV by crosses):
plot(cmdscale(dist(iris[,-5])),
     col = as.integer(iris[,5]),
     pch = c("o","+")[1:150 %in% model$index + 1])

## try regression mode on two dimensions

# create data
x <- seq(0.1, 5, by = 0.05)
y <- log(x) + rnorm(x, sd = 0.2)

# estimate model and predict input values
m   <- svm(x, y)
new <- predict(m, x)

# visualize
plot(x, y)
points(x, log(x), col = 2)
points(x, new, col = 4)

## density-estimation

# create 2-dim. normal with rho=0:
X <- data.frame(a = rnorm(1000), b = rnorm(1000))
attach(X)

# traditional way:
m <- svm(X, gamma = 0.1)

# formula interface:
m <- svm(~., data = X, gamma = 0.1)
# or:
m <- svm(~ a + b, gamma = 0.1)

# test:
newdata <- data.frame(a = c(0, 4), b = c(0, 4))
predict (m, newdata)
##     1     2 
##  TRUE FALSE
# visualize:
plot(X, col = 1:1000 %in% m$index + 1, xlim = c(-5,5), ylim=c(-5,5))
points(newdata, pch = "+", col = 2, cex = 5)

## weights: (example not particularly sensible)
i2 <- iris
levels(i2$Species)[3] <- "versicolor"
summary(i2$Species)
##     setosa versicolor 
##         50        100
wts <- 100 / table(i2$Species)
wts
## 
##     setosa versicolor 
##          2          1
m <- svm(Species ~ ., data = i2, class.weights = wts)

## extract coefficients for linear kernel

# a. regression
x <- 1:100
y <- x + rnorm(100)
m <- svm(y ~ x, scale = FALSE, kernel = "linear")
coef(m)
## (Intercept)           x 
##   0.0887966   1.0001559
plot(y ~ x)
abline(m, col = "red")

# b. classification
# transform iris data to binary problem, and scale data
setosa <- as.factor(iris$Species == "setosa")
iris2 = scale(iris[,-5])

# fit binary C-classification model
m <- svm(setosa ~ Petal.Width + Petal.Length,
        data = iris2, kernel = "linear")

# plot data and separating hyperplane
plot(Petal.Length ~ Petal.Width, data = iris2, col = setosa)
(cf <- coef(m))
##  (Intercept)  Petal.Width Petal.Length 
##    -1.658953    -1.172523    -1.358030
abline(-cf[1]/cf[3], -cf[2]/cf[3], col = "red")

# plot margin and mark support vectors
abline(-(cf[1] + 1)/cf[3], -cf[2]/cf[3], col = "blue")
abline(-(cf[1] - 1)/cf[3], -cf[2]/cf[3], col = "blue")
points(m$SV, pch = 5, cex = 2)

R session info

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.3
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] e1071_1.7-3         randomForest_4.6-14 kknn_1.3.1         
## [4] Rcpp_1.0.3         
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.20-40    class_7.3-15       digest_0.6.25      grid_3.6.2        
##  [5] magrittr_1.5       evaluate_0.14      rlang_0.4.5        stringi_1.4.6     
##  [9] Matrix_1.2-18      rmarkdown_2.1      RColorBrewer_1.1-2 tools_3.6.2       
## [13] stringr_1.4.0      igraph_1.2.4.2     xfun_0.12          yaml_2.2.1        
## [17] compiler_3.6.2     pkgconfig_2.0.3    htmltools_0.4.0    knitr_1.28