# Loading some useful packages
library(tidyverse)
library(kableExtra)
library(magrittr)
library(data.table)

Chapter 1

# Loading some useful packages 
# Tools
library(RCurl)
library(jsonlite)
library(caret)  # Classification and Regression Training
library(e1071)
# Basic statistics packages
library(statmod)
library(MASS)
# Neural networks (NN)
library(nnet)
library(neuralnet)
library(RSNNS)  # NN using the Stuttgart Neural Network Simulator
# Deep learning
library(darch)
library(deepnet)
library(h2o)

In this chapter we use a few examples to show how to build and train basic neural networks in R. Before building them, install the packages listed above, and also install Java (it is required by h2o) so that it is available when we build models later.
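
A one-time installation sketch (hypothetical setup, assuming a standard CRAN mirror; note that some of these packages, such as darch, have at times been archived on CRAN and may need to be installed from an archive):

# Install the packages used in this chapter (run once)
install.packages(c("tidyverse", "kableExtra", "magrittr", "data.table",
                   "RCurl", "jsonlite", "caret", "e1071", "statmod",
                   "MASS", "nnet", "neuralnet", "RSNNS", "darch",
                   "deepnet", "h2o"))
# h2o requires a Java runtime; verify that one is installed, e.g.
system("java -version")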

Building a Neural Network

For our first examples of building neural networks, we will use a classic classification problem: recognizing handwritten digits from images.

The dataset (MNIST) can be downloaded from https://www.kaggle.com/c/digit-recognizer

# Load the MNIST dataset
setwd("/Users/linweixiang/R/Deep Learning/Dataset/MNIST")
digits.train <- fread("train.csv") %>% as.data.frame() # dim() = 42000 * 785
digits.test <- fread("test.csv") %>% as.data.frame()    # dim() = 28000 * 784

The loaded training data has 785 variables (28 × 28 = 784 pixels plus 1 label).

# Transform to 28 * 28 matrix and plot some figures
par(mfrow = c(2, 4))
for (i in 1:8) {
  # Transform to 28 * 28 matrix
  obs_matrix <- digits.train[i, -1] %>% unlist() %>% 
    matrix(., nrow = 28, byrow = T) %>% 
    apply(., 2, rev) %>% t()
  # Visualization
  obs_matrix %>% image(., col = grey.colors(255), 
                       main = paste("Label", 
                                    digits.train[i, 1], 
                                    sep=": "))
}

If this were a real-world problem, we would want to use all 42,000 observations, but to keep the running time down we will select just the first 5,000 for these first examples of building and training a neural network.

# Convert labels to a factor and select the first 5,000 samples for training 
digits.train$label <-  factor(digits.train$label, levels = 0:9)
i <- 1:5000
digits.X <- digits.train[i, -1]
digits.y <- digits.train[i, 1]

Before we get started building our neural network, let’s quickly check the distribution of the digits.

# Visualization 
digits.y %>% table() %>% as.data.frame() %>% 
  `colnames<-`(., c("label", "freq")) %>% 
  ggplot(data = ., aes(x = label, y = freq)) + 
  geom_bar(stat = "identity") + 
  labs(y = "frequency")

Training

Now let’s build and train our first neural network using the nnet package through the caret package wrapper.

# Build neural network models with 5 and 10 hidden neurons 
l <- 1
for (i in c(5, 10)) {
  # 1. Build names for the objects created by assign() below
  model_name <- paste("digits.m", l, sep = "") %>% print()
  time_name <- paste("Elapsed_time", l, sep = "_") 
  Predict_name <- paste("digits.yhat", l, sep = "_") 
  Confusion_name <- paste("confusionMatrix", l, sep = "_")
  
  # 2. Execute code 
  set.seed(1234)
  ## 2.1 Training a Prediction Model
  time.start <- Sys.time()
  Model <- caret::train(x = digits.X, y = digits.y,
                        method = "nnet",  # Neuro network
                        tuneGrid = expand.grid(.size = c(i), # The number of hidden neuros 
                                               .decay = 0.1), # Learning rate 
                        trControl = trainControl(method = "none"),
                        MaxNWts = 10000,
                        maxit = 100) 
  time.stop <- Sys.time()
  Time <- (time.stop - time.start) %>% round(., 2)
  ## 2.2 Model prediction 
  Model_prediction <- Model %>% predict() 
  Confusion_matrix <- caret::confusionMatrix(xtabs(formula = ~ Model_prediction + digits.y))
  # 3. Assign variable 
  assign(x = model_name, value = Model)
  assign(x = time_name, value = Time)
  assign(x = Predict_name, value = Model_prediction)
  assign(x = Confusion_name, value = Confusion_matrix)
  l = l + 1
}
# Remove some objects to release memory 
remove(i, l, model_name, time_name, Model, Time, Predict_name, Confusion_name, Model_prediction, Confusion_matrix)
# Summarize model_1 and model_2 
cbind(rbind(confusionMatrix_1$overall[1],
            confusionMatrix_2$overall[1]) %>% round(., 3), 
      rbind(5, 10), 
      rbind(paste(Elapsed_time_1, "s"), 
            paste(Elapsed_time_2, "min"))) %>% 
  `rownames<-`(., paste("Model", 1:2, sep = "_")) %>% 
  kable(., format = "html", 
        col.names = c("Accuracy", "Neuros", "Times")) %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
          Accuracy  Neurons  Time
Model_1   0.495     5        51.98 s
Model_2   0.629     10       3.42 min

The table shows that increasing the number of hidden neurons is the key to improving model accuracy, but the cost is a corresponding increase in model complexity.
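
As a design note, the assign()/get() pattern above works, but collecting the fitted models in a list is often easier to manage. A minimal alternative sketch under the same settings:

# Alternative sketch: keep the fitted models in one named list
sizes <- c(5, 10)
models <- lapply(sizes, function(s) {
  set.seed(1234)
  caret::train(x = digits.X, y = digits.y,
               method = "nnet",
               tuneGrid = expand.grid(.size = s, .decay = 0.1),
               trControl = trainControl(method = "none"),
               MaxNWts = 10000, maxit = 100)
})
names(models) <- paste0("size_", sizes)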

The accuracy is still not good enough, so we consider increasing the number of hidden neurons to 40.

set.seed(1234)
# Build a neural network with 40 hidden neurons
time.start <- Sys.time()
digits.m3 <- caret::train(x = digits.X, y = digits.y,
                   method = "nnet",  # Neuro network
                   tuneGrid = expand.grid(.size = c(40), # The number of hidden neuros 
                                          .decay = 0.1), # Learning rate 
                   trControl = trainControl(method = "none"),
                   MaxNWts = 50000,
                   maxit = 100) 
time.stop <- Sys.time()
# Calculating elapsed time 
Elapsed_time_3 <- (time.stop - time.start) %>% round(., 2)
# Confusion matrix 
digits.yhat_3 <- digits.m3 %>% predict()
confusionMatrix_3 <- xtabs(formula = ~ digits.yhat_3 + digits.y) %>% 
  caret::confusionMatrix(.)
# Summarize model_1 to model_3 
cbind(rbind(confusionMatrix_1$overall[1],
            confusionMatrix_2$overall[1], 
            confusionMatrix_3$overall[1]) %>% round(., 3), 
      rbind(5, 10, 40), 
      rbind(paste(Elapsed_time_1, "s"), 
            paste(Elapsed_time_2, "min"), 
            paste(Elapsed_time_3, "min"))) %>%
  `rownames<-`(., paste("Model", 1:3, sep = "_")) %>% 
  kable(., format = "html", 
        col.names = c("Accuracy", "Neuros", "Times")) %>% 
  kable_styling(bootstrap_options = "striped",
                full_width = F)
          Accuracy  Neurons  Time
Model_1   0.495     5        51.98 s
Model_2   0.629     10       3.42 min
Model_3   0.871     40       49.56 min

The table shows that with 40 hidden neurons the accuracy rises to about 87%, again with a corresponding increase in model complexity.

Next, we discuss how to use the RSNNS package to build a neural network.

# RSNNS: Stuttgart Neural Network Simulator
# Multi-layer perceptron (MLP) 
set.seed(1234)
time.start <- Sys.time()
digits.m4 <- mlp(x = as.matrix(digits.X),
                 y = decodeClassLabels(digits.y),
                 size = 40,
                 learnFunc = "Rprop",
                 shufflePatterns = FALSE,
                 maxit = 100)
time.stop <- Sys.time()
Elapsed_time_4 <- (time.stop - time.start) %>% round(., 2)
digits.yhat_4 <- digits.m4 %>% fitted.values() %>%
  encodeClassLabels() 
# encodeClassLabels() returns 1-based class indices, so subtract 1
# to recover the digit labels 0-9
confusionMatrix_4 <- xtabs(formula = ~ I(digits.yhat_4 - 1) + digits.y) %>% 
  caret::confusionMatrix()
# Compare 
cbind(rbind(confusionMatrix_3$overall[1], 
            confusionMatrix_4$overall[1]) %>% round(., 3), 
      rbind(40, 40), 
      rbind(paste(Elapsed_time_3, "min"), 
            paste(Elapsed_time_4, "min"))) %>%
  `rownames<-`(., c("NN", "MLP")) %>% 
  kable(., format = "html", 
        col.names = c("Accuracy", "Neuros", "Times")) %>% 
  kable_styling(bootstrap_options = "striped",
                full_width = F)
     Accuracy  Neurons  Time
NN   0.871     40       49.56 min
MLP  0.88      40       4.11 min

The table shows that, with the same number of hidden neurons, the multi-layer perceptron (MLP) algorithm from the RSNNS package performs better and trains far faster.

Prediction

Since the classes are not perfectly (linearly) separable, the simplest approach is to classify each observation according to its highest predicted probability:

digits.m3 %>% fitted.values() %>% round(., 3) %>% 
  as.data.frame() %>% head %>% 
  kable(., format = "html", caption = "Neuro network:40") %>% 
  kable_styling(bootstrap_options = "striped", full_width = F) %>% 
  footnote("dim() = 5000 * 10")
Neural network: 40 hidden neurons
        0      1      2      3      4      5      6      7      8      9
X1  0.000  0.997  0.000  0.001  0.000  0.000  0.000  0.000  0.001  0.000
X2  0.963  0.000  0.014  0.009  0.000  0.003  0.000  0.000  0.011  0.000
X3  0.000  1.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000
X4  0.000  0.000  0.085  0.000  0.353  0.002  0.050  0.278  0.021  0.210
X5  0.997  0.000  0.001  0.001  0.000  0.002  0.000  0.000  0.000  0.000
X6  0.252  0.000  0.338  0.003  0.000  0.005  0.005  0.071  0.325  0.001
Note: dim() = 5000 * 10

The table shows, for example, that the first observation (X1) has a 99.7% probability of being a 1, so the model classifies X1 as 1.
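
As a small sketch of that classification rule in code (using the fitted probability matrix shown above), each row's predicted digit is simply the column with the largest probability:

# Classify each observation by its highest predicted probability
probs <- fitted.values(digits.m3)  # 5000 * 10 probability matrix
pred_digit <- max.col(probs) - 1   # columns correspond to digits 0-9
head(pred_digit)                   # 1 0 1 4 0 2, matching X1-X6 above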

Predictions are easy to make with predict(). Next we predict on another 5,000 observations from the training set.

# Predict on new data (observations 5001-10000)
i2 <- 5001:10000
digits.yhat_3.pred <- predict(object = digits.m3, 
                              newdata = digits.train[i2, -1])
xtabs(formula = ~ digits.yhat_3.pred + digits.train[i2, 1]) %>% 
  caret::confusionMatrix(.) %>% .$overall %>% .[1]
## Accuracy 
##    0.844

Overfitting

A common problem in machine learning is overfitting. Simply put, an overfit model classifies the training set (in-sample) well but classifies the testing set (out-of-sample) poorly:

# RSNNS
digits.yhat_4.pred <- predict(object = digits.m4, 
                              newdata = digits.train[i2, -1])
confusionMatrix_4.pred <- xtabs(formula = ~ I(encodeClassLabels(digits.yhat_4.pred) - 1) + 
        digits.train[i2, 1]) %>% 
  caret::confusionMatrix() 
c(confusionMatrix_4$overall[1], 
  confusionMatrix_4.pred$overall[1]) %>%  t() %>% 
  round(., 3) %>% 
  `rownames<-`(., "RSNNS:MLP") %>% 
  kable(., format = "html", caption = "Accuracy", 
        col.names = c("In sample", "Out sample")) %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
Accuracy
           In sample  Out sample
RSNNS:MLP  0.88       0.831

The table shows that the RSNNS model's in-sample accuracy is higher than its out-of-sample accuracy, which indicates that the model is overfitting.

Example

Finally, we discuss a real-world neural network example: Human Activity Recognition Using Smartphones.

In this dataset, users carried a smartphone while walking (Walking), walking upstairs (Walking_upstairs), walking downstairs (Walking_downstairs), sitting (Sitting), standing (Standing), and lying down (Laying). From the phone's accelerometer and gyroscope, 561 time- and frequency-domain features were derived (all normalized).

The aim of the analysis is to predict a person's activity from the smartphone data.

# Loading dataset
setwd("/Users/linweixiang/R/Deep Learning/Dataset")
har_train_x <- read_table("UCI HAR Dataset/train/X_train.txt", col_names = FALSE)   
har_train_y <- read_table("UCI HAR Dataset/train/y_train.txt", col_names = FALSE)

har_test_x <- read_table("UCI HAR Dataset/test/X_test.txt", col_names = FALSE)
har_test_y <- read_table("UCI HAR Dataset/test/y_test.txt", col_names = FALSE)

har_labels <- read_table("UCI HAR Dataset/activity_labels.txt", col_names = FALSE)
har_train_x[1:5, 1:7] %>% 
  kable(.,format = "html", caption = "Train set_X") %>% 
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  footnote("dim() = 7352 * 561")
Train set_X
       X1          X2          X3          X4          X5          X6          X7
0.2885845  -0.0202942  -0.1329051  -0.9952786  -0.9831106  -0.9135264  -0.9951121
0.2784188  -0.0164106  -0.1235202  -0.9982453  -0.9753002  -0.9603220  -0.9988072
0.2796531  -0.0194672  -0.1134617  -0.9953796  -0.9671870  -0.9789440  -0.9965199
0.2791739  -0.0262006  -0.1232826  -0.9960915  -0.9834027  -0.9906751  -0.9970995
0.2766288  -0.0165697  -0.1153618  -0.9981386  -0.9808173  -0.9904816  -0.9983211
Note: dim() = 7352 * 561
har_train_y[1:7, ] %>% t %>% 
  kable(., format = "html", caption = "Train set of y") %>% 
  kable_styling(bootstrap_options = "striped", full_width = F) %>% 
  footnote("length() = 7352")
Train set of y
X1  5  5  5  5  5  5  5
Note: length() = 7352
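
The har_labels table loaded above maps the numeric codes 1-6 to activity names but is otherwise unused; a small sketch of attaching readable labels (assuming the usual two-column id/name layout of activity_labels.txt):

# Map the numeric activity codes to their names
colnames(har_labels) <- c("id", "activity")
activity_names <- factor(har_train_y$X1,
                         levels = har_labels$id,
                         labels = har_labels$activity)
table(activity_names)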

The bar chart shows the distribution of activities.

# Visualization
har_train_y %>% table() %>% data.frame %>% 
  `colnames<-`(., c("Label", "Freq")) %>% 
  ggplot(data = ., aes(x = Label, y = Freq)) + 
  geom_bar(stat = "identity")

To compare model performance under different parameter settings, we use parallel computing to save computation time.

# Loading some useful packages
library(readr)
library(parallel)
library(foreach)
library(doSNOW)
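
Before creating the cluster, it can help to check how many cores the machine actually has; a quick sketch (we will simply hard-code four workers below, but capping at one less than the core count is a common rule of thumb):

# How many workers can this machine support? Leave one core for the OS.
n_workers <- max(1, parallel::detectCores() - 1)
n_workers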

Now we set up five different combinations of tuning parameters and create a local cluster.

# Setting parallel computing 
tuning <- list(
  size = c(40, 20, 20, 50, 50),
  maxit = c(60, 100, 100, 100, 100),
  shuffle = c(FALSE, FALSE, TRUE, FALSE, FALSE),
  params = list(FALSE, FALSE, FALSE, FALSE, c(0.1, 20, 3)))
## setup cluster using 4 cores
## load packages, export required data and variables
## and register as a backend for use with the foreach / doParallel package
library(doParallel)

cl <- makeCluster(4)
# Load the required packages on every worker
clusterEvalQ(cl, {
  library(RSNNS)
  library(magrittr)
})
## [[1]]
##  [1] "RSNNS"     "Rcpp"      "stats"     "graphics"  "grDevices"
##  [6] "utils"     "datasets"  "methods"   "base"      "magrittr" 
## [11] "RSNNS"     "Rcpp"      "stats"     "graphics"  "grDevices"
## [16] "utils"     "datasets"  "methods"   "base"     
## 
## (the output for workers [[2]] to [[4]] is identical)
# Export the required data and variables to every worker
clusterExport(cl,
              c("tuning", "har_train_x", "har_train_y",
                "har_test_x", "har_test_y"))
# registerDoParallel(cl)
# Parallel computing
time.start <- Sys.time()

har_models <- parLapply(cl, 1:5, function(x) {
  set.seed(1234)
  mlp_tmp <- mlp(
    x = har_train_x %>% as.matrix(),
    y = har_train_y$X1 %>% decodeClassLabels(),
    size = tuning$size[[x]],
    learnFunc = "Rprop",
    shufflePatterns = tuning$shuffle[[x]],
    learnFuncParams = tuning$params[[x]],
    maxit = tuning$maxit[[x]]
  ) 
  return(mlp_tmp)
})

time.stop <- Sys.time()
time.stop - time.start
## Time difference of 16.03543 mins

Next we make the out-of-sample predictions. These also take some time, so we again use parallel processing.

First, we need to export the model results to each of the workers on our cluster.

registerDoParallel(cl)
# Predict model 
time.start <- Sys.time()
# Export the model results to each worker on our cluster
clusterExport(cl, "har_models")
# Parallel computing
har_yhat <- foreach(i = 1:5, .combine = 'c') %dopar% {
  list(list(
    Insample = har_models[[i]] %>% fitted.values() %>%
      encodeClassLabels() ,
    Outsample = har_models[[i]] %>% 
      predict(., newdata = as.matrix(har_test_x)) %>% 
      encodeClassLabels() 
  ))
}

time.stop <- Sys.time()
time.stop - time.start
## Time difference of 4.929616 secs

Finally, we can merge the actual and fitted or predicted values together into a dataset, calculate performance measures on each one, and store the overall results together for examination and comparison.

# Calculate in-sample performance measures 
har_insample <- cbind(Y = har_train_y,
                      do.call(cbind, 
                              lapply(har_yhat, 
                                     `[[`, "Insample"))) %>% 
  `colnames<-`(., c("Y", paste0("Yhat", 1:5))) # dim() = 7352 * 6

insample_cbind_fun <- function(i) {
  f <- substitute(~ Y + x, 
                  list(x = as.name(paste0("Yhat", i))))
  har_dat <- har_insample[ har_insample[, paste0("Yhat", i)] != 0, ]  # a 0 value indicates the model was uncertain how to classify an observation
  har_dat$Y <- factor(har_dat$Y, levels = 1:6)
  har_dat[, paste0("Yhat", i)] <- factor(har_dat[, paste0("Yhat", i)], levels = 1:6)
  res <- caret::confusionMatrix(xtabs(f, data = har_dat))
  # Cbind the data frame 
  cbind(Size = tuning$size[[i]],
        Maxit = tuning$maxit[[i]],
        Shuffle = tuning$shuffle[[i]],
        as.data.frame(t(res$overall[c("Accuracy", "AccuracyLower", "AccuracyUpper")])))
}
# Combine the in-sample performance results 
performance_insample <- do.call(rbind, lapply(1:5, insample_cbind_fun))
# Calculate out-of-sample performance measures 
har_outsample <- cbind(Y = har_test_y,
                       do.call(cbind, 
                               lapply(har_yhat, 
                                     `[[`, "Outsample"))) %>% 
  `colnames<-`(., c("Y", paste0("Yhat", 1:5))) # dim() = 2947 * 6

outsample_cbind_fun <- function(i) {
  f <- substitute(~ Y + x, 
                  list(x = as.name(paste0("Yhat", i))))
  har_dat <- har_outsample[har_outsample[, paste0("Yhat", i)] != 0, ]  # a 0 value indicates the model was uncertain how to classify an observation
  har_dat$Y <- factor(har_dat$Y, levels = 1:6)
  har_dat[, paste0("Yhat", i)] <- factor(har_dat[, paste0("Yhat", i)], levels = 1:6)
  res <- caret::confusionMatrix(xtabs(f, data = har_dat))
  # Cbind the data frame 
  cbind(Size = tuning$size[[i]],
        Maxit = tuning$maxit[[i]],
        Shuffle = tuning$shuffle[[i]],
        as.data.frame(t(res$overall[c("Accuracy", "AccuracyLower", "AccuracyUpper")])))
}

# Combine the out-of-sample performance results 
performance_outsample <- do.call(rbind, lapply(1:5, outsample_cbind_fun))
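
With the parallel work finished, it is good practice to shut the cluster down and release the worker processes (a small cleanup sketch; cl and the doParallel backend were set up above):

# Release the worker processes and return foreach to sequential mode
stopCluster(cl)
registerDoSEQ()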

The in-sample and out-of-sample performance results can be presented as tables.

# Visualization 
options(digits = 3)
performance_insample %>% 
  kable(., format = "html", 
        caption = "Insample (Training set) performance") %>% 
  kable_styling(bootstrap_options = "striped", full_width = F)
Insample (Training set) performance
Size  Maxit  Shuffle  Accuracy  AccuracyLower  AccuracyUpper
40    60     FALSE    0.996     0.994          0.997
20    100    FALSE    0.997     0.996          0.998
20    100    TRUE     0.997     0.996          0.998
50    100    FALSE    1.000     0.999          1.000
50    100    FALSE    1.000     0.999          1.000
performance_outsample %>% 
  kable(., format = "html", 
        caption = "Outsample (Testing set) performance") %>% 
  kable_styling(bootstrap_options = "striped", full_width = F) %>%
  row_spec(1, bold = T, color = "black")
Outsample (Testing set) performance
Size  Maxit  Shuffle  Accuracy  AccuracyLower  AccuracyUpper
40    60     FALSE    0.931     0.921          0.940
20    100    FALSE    0.926     0.916          0.935
20    100    TRUE     0.926     0.916          0.935
50    100    FALSE    0.947     0.938          0.955
50    100    FALSE    0.924     0.914          0.934

The out-of-sample performance table shows that we can predict human activity quite accurately from smartphone data.

The in-sample performance suggests that the more complex the model, the better. The out-of-sample performance, however, shows the opposite (that is, the more complex models have lower accuracy). This is overfitting.

Putting this together, we select the best model (the bold row) and can say with confidence that it classifies user activity well.
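
If we want to keep the chosen model for later sessions, one option is to serialize it (a sketch; the bold first row of the table corresponds to har_models[[1]], and the file name is illustrative):

# Persist the selected model; readRDS() restores it later
saveRDS(har_models[[1]], "har_mlp_best.rds")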

Summary

This chapter showed how to build and train neural network models in R to classify images and human activities.

In the next chapter, we will discuss several methods for preventing overfitting, collectively known as regularization, which lead to more accurate model estimates.