The package used here is keras. It interfaces with the Python Keras API so that deep learning models can be fit quickly from R. Because the keras package runs through Python, a Python installation is required, and for this and related reasons setting up a working keras environment can be troublesome. The steps below show how to set up such an environment; for details, see https://hastie.su.domains/ISLR2/keras-instructions.html.
Besides keras, a newer R package called torch has appeared. torch has the advantage that no Python installation is needed, but it is somewhat slower than keras, so everything below uses keras rather than torch. The keras environment is set up as follows. First, install the ISLR2 package.
### step 1
if (sum(installed.packages()[, 1] %in% "ISLR2") == 0) {
  install.packages("ISLR2")
}
Next, remove any previously installed keras, reticulate, and tensorflow packages.
### step 2
tryCatch(
  remove.packages(c("keras", "tensorflow", "reticulate")),
  error = function(e) "Some or all packages not previously installed, that's ok!"
)
Install the keras package; the reticulate and tensorflow packages are installed along with it.
### step 3
install.packages("keras", repos = 'https://cloud.r-project.org')
Configure reticulate (via ~/.Renviron) so that miniconda will be installed under ~/islr-miniconda.
### step 4
write('RETICULATE_AUTOCONFIGURE=FALSE', file = "~/.Renviron", append = TRUE)
write(sprintf('RETICULATE_MINICONDA_PATH=%s', normalizePath("~/islr-miniconda", winslash = "/", mustWork = FALSE)),
file = "~/.Renviron", append = TRUE)
### step 5 - restart!!!!
# Sys.setenv(RETICULATE_AUTOCONFIGURE='FALSE',
# RETICULATE_MINICONDA_PATH=normalizePath("~/islr-miniconda", winslash = "/", mustWork = FALSE))
We will use the helper functions that ship with the ISLR2 package, and then complete the installation with the install_miniconda() and install_tensorflow() functions.
### step 6
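# The helper functions used below (install_miniconda(), install_tensorflow(),
# print_py_config()) are assumed to come from the install.R helper script
# bundled with ISLR2, loaded as described in the instructions linked above:
source(system.file("helpers", "install.R", package = "ISLR2"))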
install_miniconda()
install_tensorflow()
### step 7
print_py_config()
######################################################
# library
######################################################
library(ISLR2)
library(dplyr)
library(ggplot2)
######################################################
# data
######################################################
Gitters <- na.omit(Hitters)
n <- nrow (Gitters)
######################################################
# test set
######################################################
set.seed (13)
ntest <- trunc(n / 3)
testid <- sample(1:n, ntest)
######################################################
# linear regression
######################################################
lr.fit <- lm(Salary ~ ., data=Gitters[-testid,])
lr.pred <- predict(lr.fit , Gitters[testid , ])
# mean absolute prediction error
mean(abs(Gitters[testid,]$Salary - lr.pred))
[1] 254.6687
######################################################
# Single layer Neural network
######################################################
library(keras)
# library(tensorflow)
# install_tensorflow()
# reshaping data
x <- scale (model.matrix(Salary ~ . -1, data=Gitters))
y <- Gitters$Salary
# construct the model
modnn <- keras_model_sequential() %>%
layer_dense(units=50, activation="relu", input_shape = ncol(x)) %>% # first hidden layer with 50 units with ReLU
layer_dropout(rate=0.4) %>% # dropout layer (you will learn this later)
layer_dense(units=1) # output layer - one unit with no activation function
# fitting algorithm
modnn %>% compile(loss = "mse", # minimize the squared error loss
                  optimizer = optimizer_rmsprop(),
                  metrics = list("mean_absolute_error"))
# fitting
history <- modnn %>% fit (
x[-testid , ], y[-testid],
epochs = 1500, # an epoch amounts to the number of SGD steps required to process n observations
batch_size = 32, # randomly select 32 training observations for each step of SGD
validation_data = list (x[testid , ], y[testid])
)
plot(history)
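# The test mean absolute prediction error reported below is presumably obtained
# by predicting on the held-out observations, as for the linear model above
# (sketch reconstructed from the ISLR2 lab; npred is an assumed name):
npred <- predict(modnn, x[testid, ])
mean(abs(y[testid] - npred))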
[1] 250.7136
######################################################
# library
######################################################
library(dplyr)
library(ggplot2)
library(keras)
######################################################
# data
######################################################
### load MNIST data
mnist <- dataset_mnist ()
### train & test data
x_train <- mnist$train$x
g_train <- mnist$train$y
x_test <- mnist$test$x
g_test <- mnist$test$y
dim(x_train)
[1] 60000 28 28
dim(x_test)
[1] 10000 28 28
### reshape X from a three dimensional array to a matrix
x_train <- array_reshape(x_train , c( nrow(x_train), 784))
x_test <- array_reshape(x_test , c( nrow(x_test), 784))
### reshape Y (one-hot-encoding)
y_train <- to_categorical(g_train , 10)
y_test <- to_categorical(g_test , 10)
### scaling input
# Neural networks are somewhat sensitive to the scale of the inputs
# For example, ridge and lasso regularization are affected by scaling
# 0~255 to 0~1
x_train <- x_train/255
x_test <- x_test/255
######################################################
# Multi layer Neural network
######################################################
### construct the model
modelnn <- keras_model_sequential()
modelnn %>% layer_dense(units = 256,
activation = "relu",
input_shape = c(784)) %>%
layer_dropout(rate = 0.4) %>%
layer_dense(units = 128, activation = "relu") %>%
layer_dropout(rate = 0.3) %>%
layer_dense(units = 10, activation = "softmax")
summary(modelnn)
Model: "sequential_1"
______________________________________________________________________
Layer (type)                     Output Shape                  Param #
======================================================================
dense_4 (Dense)                  (None, 256)                    200960
______________________________________________________________________
dropout_2 (Dropout)              (None, 256)                         0
______________________________________________________________________
dense_3 (Dense)                  (None, 128)                     32896
______________________________________________________________________
dropout_1 (Dropout)              (None, 128)                         0
______________________________________________________________________
dense_2 (Dense)                  (None, 10)                       1290
======================================================================
Total params: 235,146
Trainable params: 235,146
Non-trainable params: 0
______________________________________________________________________
### fitting algorithm
modelnn %>% compile(loss = "categorical_crossentropy",
optimizer = optimizer_rmsprop(), metrics = c("accuracy"))
### fitting
history <- modelnn %>%
fit(x_train , y_train ,
epochs = 30, batch_size = 128,
validation_split = 0.2)
plot(history)
### prediction
result <- modelnn %>% predict(x_test) %>% k_argmax()
mean((result %>% as.numeric()) == g_test)
[1] 0.9806
######################################################
# Multinomial logistic regression (for comparison)
######################################################
# much faster than glmnet
modellr <- keras_model_sequential () %>%
layer_dense(input_shape = 784, units = 10,
activation = "softmax")
summary(modellr)
Model: "sequential_2"
______________________________________________________________________
Layer (type)                     Output Shape                  Param #
======================================================================
dense_5 (Dense)                  (None, 10)                       7850
======================================================================
Total params: 7,850
Trainable params: 7,850
Non-trainable params: 0
______________________________________________________________________
modellr %>% compile(loss = "categorical_crossentropy",
optimizer = optimizer_rmsprop (),
metrics = c("accuracy"))
modellr %>% fit(x_train , y_train, epochs = 30,
batch_size = 128, validation_split = 0.2)
### prediction
result2 <- modellr %>% predict(x_test) %>% k_argmax()
mean((result2 %>% as.numeric()) == g_test)
[1] 0.9276
######################################################
# library
######################################################
library(dplyr)
library(ggplot2)
library(keras)
######################################################
# data
######################################################
### load CIFAR100 data
cifar100 <- dataset_cifar100()
### train & test data
x_train <- cifar100$train$x
g_train <- cifar100$train$y
x_test <- cifar100$test$x
g_test <- cifar100$test$y
### We keep the array structure for X
### reshape Y (one-hot-encoding)
y_train <- to_categorical(g_train , 100)
y_test <- to_categorical(g_test , 100)
### scaling input
# Neural networks are somewhat sensitive to the scale of the inputs
# For example, ridge and lasso regularization are affected by scaling
# 0~255 to 0~1
x_train <- x_train/255
x_test <- x_test/255
### plot
library(jpeg)
par(mar = c(0, 0, 0, 0), mfrow = c(4, 4))
index <- sample( seq(50000), 16)
for(i in index){plot(as.raster(x_train[i,,,]))}
# the as.raster() function converts the feature map so that
# it can be plotted as a color image
######################################################
# CNN
######################################################
### construct the model
model <- keras_model_sequential() %>%
layer_conv_2d(filters = 32, kernel_size = c(3, 3),
padding = "same", # which means that output channels have the same dimension as the input channels
activation = "relu",
input_shape = c(32, 32, 3)) %>%
layer_max_pooling_2d(pool_size = c(2, 2)) %>%
layer_conv_2d(filters = 64, kernel_size = c(3, 3),
padding = "same", activation = "relu") %>%
layer_max_pooling_2d(pool_size = c(2, 2)) %>%
layer_conv_2d(filters = 128, kernel_size = c(3, 3),
padding = "same", activation = "relu") %>%
layer_max_pooling_2d(pool_size = c(2, 2)) %>%
layer_conv_2d(filters = 256, kernel_size = c(3, 3),
padding = "same", activation = "relu") %>%
layer_max_pooling_2d(pool_size = c(2, 2)) %>%
layer_flatten() %>%
layer_dropout(rate = 0.5) %>%
layer_dense(units = 512, activation = "relu") %>%
layer_dense(units = 100, activation = "softmax")
summary(model)
Model: "sequential_3"
______________________________________________________________________
Layer (type)                     Output Shape                  Param #
======================================================================
conv2d_3 (Conv2D)                (None, 32, 32, 32)                896
______________________________________________________________________
max_pooling2d_3 (MaxPooling2D)   (None, 16, 16, 32)                  0
______________________________________________________________________
conv2d_2 (Conv2D)                (None, 16, 16, 64)              18496
______________________________________________________________________
max_pooling2d_2 (MaxPooling2D)   (None, 8, 8, 64)                    0
______________________________________________________________________
conv2d_1 (Conv2D)                (None, 8, 8, 128)               73856
______________________________________________________________________
max_pooling2d_1 (MaxPooling2D)   (None, 4, 4, 128)                   0
______________________________________________________________________
conv2d (Conv2D)                  (None, 4, 4, 256)              295168
______________________________________________________________________
max_pooling2d (MaxPooling2D)     (None, 2, 2, 256)                   0
______________________________________________________________________
flatten (Flatten)                (None, 1024)                        0
______________________________________________________________________
dropout_3 (Dropout)              (None, 1024)                        0
______________________________________________________________________
dense_7 (Dense)                  (None, 512)                    524800
______________________________________________________________________
dense_6 (Dense)                  (None, 100)                     51300
======================================================================
Total params: 964,516
Trainable params: 964,516
Non-trainable params: 0
______________________________________________________________________
### fitting algorithm
model %>% compile(loss = "categorical_crossentropy",
optimizer = optimizer_rmsprop(), metrics = c("accuracy"))
### fitting
history <- model %>% fit(x_train , y_train , epochs = 30,
batch_size = 128, validation_split = 0.2)
### prediction
result <- model %>% predict(x_test) %>% k_argmax()
mean((result %>% as.numeric()) == g_test)
[1] 0.4247
The key part of training a CNN is learning the convolution filters. When the new task is a similar kind of classification problem, a common strategy is to reuse the convolution-filter weights of an already trained classifier as they are (weight freezing) and to retrain only the last few layers: this is transfer learning.
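The following is only a minimal sketch of the weight-freezing idea, not part of the lab code below; it assumes a resnet50 convolutional base and an illustrative 100-class output head.
base_model <- application_resnet50(weights = "imagenet", include_top = FALSE,
                                   input_shape = c(224, 224, 3))
freeze_weights(base_model)   # keep the pretrained convolution filters fixed
outputs <- base_model$output %>%
  layer_global_average_pooling_2d() %>%
  layer_dense(units = 100, activation = "softmax")   # only this new head is trained
new_model <- keras_model(inputs = base_model$input, outputs = outputs)
new_model %>% compile(loss = "categorical_crossentropy",
                      optimizer = optimizer_rmsprop(), metrics = c("accuracy"))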
The R code below instead loads a resnet50 classifier trained on the imagenet dataset and uses it directly for prediction.
######################################################
# library
######################################################
library(dplyr)
library(ggplot2)
library(keras)
######################################################
# data
######################################################
img_dir <- "book_images"
image_names <- list.files(img_dir)
num_images <- length(image_names)
x <- array( dim = c(num_images , 224, 224, 3))
for(i in 1:num_images){
  img_path <- paste(img_dir, image_names[i], sep = "/")
  img <- image_load(img_path, target_size = c(224, 224))
  x[i,,, ] <- image_to_array(img)
}
x <- imagenet_preprocess_input (x)
# # python pillow module installation
# library(reticulate)
# py_install("pillow")
######################################################
# load the trained network
######################################################
model <- application_resnet50(weights = "imagenet")
# summary(model)
######################################################
# Classification
######################################################
pred6 <- model %>% predict(x) %>%
imagenet_decode_predictions (top = 3)
names(pred6) <- image_names
print(pred6)
$flamingo.jpg
class_name class_description score
1 n02007558 flamingo 0.930109262
2 n02006656 spoonbill 0.068014868
3 n02002556 white_stork 0.001172375
$hawk.jpg
class_name class_description score
1 n03388043 fountain 0.2772168
2 n03532672 hook 0.1794709
3 n03804744 nail 0.1103582
$hawk_cropped.jpeg
class_name class_description score
1 n01608432 kite 0.72434253
2 n01622779 great_grey_owl 0.08683522
3 n01532829 house_finch 0.03997646
$huey.jpg
class_name class_description score
1 n02097474 Tibetan_terrier 0.52462161
2 n02098413 Lhasa 0.40590140
3 n02094114 Norfolk_terrier 0.01699529
$kitty.jpg
class_name class_description score
1 n02105641 Old_English_sheepdog 0.83900332
2 n02086240 Shih-Tzu 0.04235809
3 n03223299 doormat 0.03226566
$weaver.jpg
class_name class_description score
1 n01843065 jacamar 0.48659834
2 n01818515 macaw 0.23028962
3 n02494079 squirrel_monkey 0.04457098
######################################################
# library
######################################################
library(dplyr)
library(ggplot2)
library(keras)
######################################################
# data
######################################################
max_features <- 10000
imdb <- dataset_imdb (num_words = max_features)
c(c(x_train , y_train), c(x_test , y_test)) %<-% imdb
### see the words in the document
word_index <- dataset_imdb_word_index()
decode_review <- function(text, word_index){
  word <- names(word_index)
  idx <- unlist(word_index, use.names = FALSE)
  word <- c("<PAD>", "<START>", "<UNK>", "<UNUSED>", word)
  idx <- c(0:3, idx + 3)
  words <- word[match(text, idx, 2)]
  paste(words, collapse = " ")
}
decode_review(x_train[[1]][1:12], word_index)
[1] "<START> this film was just brilliant casting location scenery story direction everyone's"
### for one-hot encoding
library (Matrix)
one_hot <- function(sequences, dimension){
  seqlen <- sapply(sequences, length)
  n <- length(seqlen)
  rowind <- rep(1:n, seqlen)
  colind <- unlist(sequences)
  sparseMatrix(i = rowind, j = colind,
               dims = c(n, dimension))
}
### data : one-hot econding
x_train_1h <- one_hot(x_train , 10000)
x_test_1h <- one_hot(x_test , 10000)
nnzero(x_train_1h) / (25000 * 10000)
[1] 0.01316987
### validation set : 2000 / training set : 23000
set.seed (3)
ival <- sample(seq(along = y_train), 2000)
######################################################
# Lasso logistic regression
######################################################
library(glmnet)
fitlm <- glmnet(x_train_1h[-ival,], y_train[-ival],
family = "binomial", standardize = FALSE)
classlmv <- predict(fitlm, x_train_1h[ival,]) > 0
acclmv <- apply(classlmv, 2, function(x){ mean(x==(y_train[ival] > 0), na.rm=T) })
plot(-log(fitlm$lambda), acclmv, type='l')
######################################################
# Neural network
######################################################
# fully-connected neural network with two hidden layers,
# each with 16 units and ReLU activation
######################################################
model <- keras_model_sequential() %>%
layer_dense(units = 16, activation = "relu",
input_shape = c (10000)) %>%
layer_dense(units = 16, activation = "relu") %>%
layer_dense(units = 1, activation = "sigmoid")
model %>% compile(optimizer = "rmsprop",
loss = "binary_crossentropy", metrics = c("accuracy"))
# history <- model %>% fit(x_train_1h[-ival,], y_train[-ival],
# epochs = 20, batch_size = 512,
# validation_data = list(x_train_1h[ival,], y_train[ival]))
# history <- model %>%
# fit( x_train_1h[-ival,], y_train[-ival], epochs = 20,
# batch_size = 512, validation_data = list (x_test_1h, y_test))
#
# history <- model %>%
# fit( x_train_1h[-ival, ], y_train[-ival], epochs = 20,
# batch_size = 512, validation_split = 0.2 )
# plot(history)
In practice, pretrained word embeddings such as word2vec or GloVe are widely used instead of one-hot encodings. Before fitting a sequence model we look at the document lengths (computed below): the median training document has 178 words, and about 92% of the documents have at most 500 words, so each document is restricted to its last 500 words.
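# Document word counts in the training set (sketch reconstructed from the
# ISLR2 lab to match the two values printed below; wc is an assumed name):
wc <- sapply(x_train, length)
median(wc)
sum(wc <= 500) / length(wc)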
[1] 178
[1] 0.91568
###
maxlen <- 500
x_train <- pad_sequences(x_train, maxlen = maxlen)
x_test <- pad_sequences(x_test, maxlen = maxlen)
###
model <- keras_model_sequential() %>%
layer_embedding(input_dim = 10000, output_dim = 32) %>%
layer_lstm(units = 32) %>%
layer_dense(units = 1, activation = "sigmoid")
###
model %>% compile(optimizer = "rmsprop",
loss = "binary_crossentropy", metrics = c("acc"))
history <- model %>% fit(x_train, y_train, epochs = 10,
batch_size = 128, validation_data = list(x_test, y_test))
plot(history)
predy <- predict(model, x_test) > 0.5
mean(abs(y_test == as.numeric(predy)))
[1] 0.86676
(Figure: daily NYSE time series of Log trading volume, Dow Jones return, and Log volatility, the three variables used below.)
######################################################
# library
######################################################
library(dplyr)
library(ggplot2)
library(keras)
######################################################
# data
######################################################
library(ISLR2)
xdata <- data.matrix( NYSE[, c("DJ_return", "log_volume","log_volatility")] )
istrain <- NYSE[, "train"]
xdata <- scale(xdata)
### lag
lagm <- function(x, k = 1) {
  n <- nrow(x)
  pad <- matrix(NA, k, ncol(x))
  rbind(pad, x[1:(n - k), ])
}
### data
arframe <- data.frame(log_volume = xdata[, "log_volume"],
L1 = lagm(xdata, 1), L2 = lagm(xdata, 2),
L3 = lagm(xdata, 3), L4 = lagm(xdata, 4),
L5 = lagm(xdata, 5))
arframe <- arframe[-(1:5), ]
istrain <- istrain[-(1:5)]
######################################################
# AR model
######################################################
### fitting
arfit <- lm(log_volume ~ ., data = arframe[istrain, ])
### prediction
arpred <- predict(arfit, arframe[!istrain, ])
V0 <- var(arframe[!istrain, "log_volume"])
1 - mean((arpred - arframe[!istrain, "log_volume"])^2) / V0
[1] 0.413223
### including DOW
arframed <- data.frame(day = NYSE[-(1:5), "day_of_week"], arframe)
arfitd <- lm(log_volume ~ ., data = arframed[istrain, ])
arpredd <- predict(arfitd, arframed[!istrain, ])
1 - mean((arpredd - arframe[!istrain, "log_volume"])^2) / V0
[1] 0.4598616
######################################################
# RNN
######################################################
n <- nrow(arframe)
xrnn <- data.matrix(arframe[, -1])   # n x 15 matrix of the lagged predictors
xrnn <- array(xrnn, c(n, 3, 5))      # reshape to an n x 3 x 5 array (filled column-wise)
xrnn <- xrnn[,, 5:1]                 # reverse the lags so index 1 is furthest back in time
xrnn <- aperm(xrnn, c(1, 3, 2))      # rearrange to n x lag x variable, as keras expects
dim(xrnn)
[1] 6046 5 3
### construct the model
model <- keras_model_sequential() %>%
layer_simple_rnn(units = 12,
input_shape = list(5, 3),
dropout = 0.1, recurrent_dropout = 0.1) %>%
layer_dense(units = 1)
### fitting algorithm
model %>% compile(optimizer = optimizer_rmsprop(),
loss = "mse")
### fitting
history <- model %>% fit(
xrnn[istrain,, ], arframe[istrain, "log_volume"],
batch_size = 64, epochs = 200,
validation_data =
list(xrnn[!istrain,, ], arframe[!istrain, "log_volume"])
)
kpred <- predict(model, xrnn[!istrain,, ])
1 - mean((kpred - arframe[!istrain, "log_volume"])^2) / V0
[1] 0.4130743
######################################################
# Comparison
######################################################
### linear AR model
model <- keras_model_sequential() %>%
layer_flatten(input_shape = c(5, 3)) %>%
layer_dense(units = 1)
### nonlinear AR model
x <- model.matrix(log_volume ~ . - 1, data = arframed)
colnames(x)
[1] "dayfri" "daymon" "daythur"
[4] "daytues" "daywed" "L1.DJ_return"
[7] "L1.log_volume" "L1.log_volatility" "L2.DJ_return"
[10] "L2.log_volume" "L2.log_volatility" "L3.DJ_return"
[13] "L3.log_volume" "L3.log_volatility" "L4.DJ_return"
[16] "L4.log_volume" "L4.log_volatility" "L5.DJ_return"
[19] "L5.log_volume" "L5.log_volatility"
### result
arnnd <- keras_model_sequential() %>%
layer_dense(units = 32, activation = 'relu',
input_shape = ncol(x)) %>%
layer_dropout(rate = 0.5) %>%
layer_dense(units = 1)
arnnd %>% compile(loss = "mse",
optimizer = optimizer_rmsprop())
history <- arnnd %>% fit(
x[istrain, ], arframe[istrain, "log_volume"], epochs = 100,
batch_size = 32, validation_data =
list(x[!istrain, ], arframe[!istrain, "log_volume"])
)
plot(history)
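# The test-set R^2 printed below is presumably obtained by predicting with the
# nonlinear AR model on the held-out data (sketch reconstructed from the ISLR2
# lab; npred is an assumed name):
npred <- predict(arnnd, x[!istrain, ])
1 - mean((arframe[!istrain, "log_volume"] - npred)^2) / V0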
[1] 0.4678054
###