R for SQListas, a Continuation
… what can we do now?
MNIST - the “Drosophila of Machine Learning” (attributed to Geoffrey Hinton)
Use the R tensorflow library to load the data.
Explanations later ;-)
library(tensorflow)
datasets <- tf$contrib$learn$datasets
mnist <- datasets$mnist$read_data_sets("MNIST-data", one_hot = TRUE)
train_images <- mnist$train$images
train_labels <- mnist$train$labels
# look at the first training example: its one-hot label and its 784-pixel image
label_1 <- train_labels[1,]
image_1 <- train_images[1,]
label_1
[1] 0 0 0 0 0 0 0 1 0 0
length(image_1)
[1] 784
image_1[250:300]
[1] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.54901963
[7] 0.98431379 0.99607849 0.99607849 0.99607849 0.99607849 0.99607849
[13] 0.99607849 0.99607849 0.99607849 0.99607849 0.99607849 0.99607849
[19] 0.99607849 0.99607849 0.99607849 0.74117649 0.09019608 0.00000000
[25] 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
[31] 0.00000000 0.00000000 0.00000000 0.88627458 0.99607849 0.81568635
[37] 0.78039223 0.78039223 0.78039223 0.78039223 0.54509807 0.23921570
[43] 0.23921570 0.23921570 0.23921570 0.23921570 0.50196081 0.87058830
[49] 0.99607849 0.99607849 0.74117649
grayscale <- colorRampPalette(c('white','black'))
par(mar = c(1, 1, 1, 1), mfrow = c(8, 8), pty = 's', xaxt = 'n', yaxt = 'n')
for (i in 1:40) {
  z <- array(train_images[i,], dim = c(28, 28))
  z <- z[, 28:1]  # flip so the digit is right side up
  image(1:28, 1:28, z, main = which.max(train_labels[i,]) - 1,
        col = grayscale(256), xlab = "", ylab = "")
}
Are my data linearly separable?
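X_train, y_train, X_test and y_test are not defined in the snippets below; here is a minimal sketch of how they could be built from the MNIST arrays loaded above (the subsample size, the seed and these variable names are my assumptions, not taken from the original code):
library(MASS)      # for lda()
library(kernlab)   # for ksvm()
# convert a one-hot label matrix back to a factor of digits 0-9
to_digits <- function(onehot) factor(apply(onehot, 1, which.max) - 1)
set.seed(42)
train_idx <- sample(nrow(train_images), 10000)  # subsample to keep lda()/ksvm() runtimes manageable
X_train <- train_images[train_idx, ]
y_train <- to_digits(train_labels[train_idx, ])
X_test <- mnist$test$images
y_test <- to_digits(mnist$test$labels)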
# fit the model
lda_fit <- lda(X_train, y_train)
# model predictions for the test set
lda_pred <- predict(lda_fit, X_test)
# prediction accuracy: sum of the diagonal of the normalized confusion table
ct <- table(lda_pred$class, y_test)
sum(diag(prop.table(ct)))
[1] 0.8736
# fit the model
svm_fit_linear <- ksvm(x = X_train, y = y_train, type='C-svc', kernel='vanilladot', C=1, scale=FALSE)
Setting default kernel parameters
# model predictions for the test set
svm_pred <- predict(svm_fit_linear, X_test)
# prediction accuracy
ct <- table(svm_pred, y_test)
sum(diag(prop.table(ct)))
[1] 0.9393
# fit the model
svm_fit_rbf <- ksvm(x = X_train, y = y_train, type='C-svc', kernel='rbfdot', C=1, scale=FALSE)
# model predictions for the test set
svm_pred <- predict(svm_fit_rbf, X_test)
# prediction accuracy
ct <- table(svm_pred, y_test)
sum(diag(prop.table(ct)))
[1] 0.9768
The jump from the linear models (LDA, linear-kernel SVM) to the RBF-kernel SVM suggests the classes are not linearly separable, although a linear boundary already gets most of the way there.
Let's try neural networks!
TensorFlow: an AI library open sourced by Google.
“If you can express your computation as a data flow graph, you can use TensorFlow.”
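To make the “data flow graph” idea concrete, here is a minimal sketch using the same TensorFlow 1.x API as the rest of this post (the values are arbitrary): building the graph and running it are two separate steps.
a <- tf$constant(2)
b <- tf$constant(3)
total <- tf$add(a, b)  # defines a node in the graph; nothing is computed yet
sess <- tf$Session()
sess$run(total)        # runs the graph and returns 5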
datasets <- tf$contrib$learn$datasets
mnist <- datasets$mnist$read_data_sets("MNIST-data", one_hot = TRUE)
# images are 55000 * 784
x <- tf$placeholder(tf$float32, shape(NULL, 784L))
# labels are 55000 * 10
y_ <- tf$placeholder(tf$float32, shape(NULL, 10L))
# weight matrix is 784 * 10
W <- tf$Variable(tf$zeros(shape(784L, 10L)))
# bias is 10 * 1
b <- tf$Variable(tf$zeros(shape(10L)))
# y_hat: predicted class probabilities (softmax regression)
y <- tf$nn$softmax(tf$matmul(x, W) + b)
# loss function
cross_entropy <- tf$reduce_mean(-tf$reduce_sum(y_ * tf$log(y), reduction_indices=1L))
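To see what this loss computes, here is the same calculation for a single made-up example in base R (the logits are arbitrary, purely for illustration):
logits <- c(2, 1, 0.1)
probs <- exp(logits) / sum(exp(logits))  # softmax
target <- c(1, 0, 0)                     # one-hot label
-sum(target * log(probs))                # cross-entropy for this example, about 0.42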
# specify optimization method and step size
optimizer <- tf$train$GradientDescentOptimizer(0.5)
train_step <- optimizer$minimize(cross_entropy)
sess <- tf$InteractiveSession()
sess$run(tf$initialize_all_variables())
for (i in 1:1000) {
  batches <- mnist$train$next_batch(100L)
  batch_xs <- batches[[1]]
  batch_ys <- batches[[2]]
  sess$run(train_step, feed_dict = dict(x = batch_xs, y_ = batch_ys))
}
correct_prediction <- tf$equal(tf$argmax(y, 1L), tf$argmax(y_, 1L))
accuracy <- tf$reduce_mean(tf$cast(correct_prediction, tf$float32))
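In base R terms, this accuracy node is just the fraction of rows where the predicted and the true argmax coincide; a tiny self-contained illustration (the two example rows are made up):
preds <- rbind(c(0.1, 0.8, 0.1), c(0.6, 0.3, 0.1))  # predicted class probabilities
labels <- rbind(c(0, 1, 0), c(0, 0, 1))             # one-hot ground truth
mean(apply(preds, 1, which.max) == apply(labels, 1, which.max))  # 0.5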
# actually evaluate training accuracy
sess$run(accuracy, feed_dict=dict(x = mnist$train$images, y_ = mnist$train$labels))
[1] 0.9167272
# and test accuracy
sess$run(accuracy, feed_dict=dict(x = mnist$test$images, y_ = mnist$test$labels))
[1] 0.918
A bit disappointing, right? Anything we can do?
Yes: give the model some knowledge of the images' spatial structure, using a convolutional network.
# template to initialize weights with a small amount of noise for symmetry breaking and to prevent 0 gradients
weight_variable <- function(shape) {
  initial <- tf$truncated_normal(shape, stddev=0.1)
  tf$Variable(initial)
}
# template to initialize bias to small positive value to avoid "dead neurons"
bias_variable <- function(shape) {
  initial <- tf$constant(0.1, shape=shape)
  tf$Variable(initial)
}
# compute 32 feature maps for each 5x5 patch
# we have just 1 channel
# so weights shape is: height, width, number of input channels, number of output channels
W_conv1 <- weight_variable(shape(5L, 5L, 1L, 32L))
# shape for bias: number of output channels
b_conv1 <- bias_variable(shape(32L))
# reshape x from a 2d to a 4d tensor with dimensions batch size (-1 = inferred), height, width, number of color channels
x_image <- tf$reshape(x, shape(-1L, 28L, 28L, 1L))
# template to define convolutional layer
# tf$nn$conv2d parameters: input tensor, kernel tensor, strides, padding
# input tensor has shape [batch size, in_height, in_width, in_channels] (NHWC)
# kernel tensor has shape [filter_height, filter_width, in_channels, out_channels]
conv2d <- function(x, W) {
  tf$nn$conv2d(x, W, strides=c(1L, 1L, 1L, 1L), padding='SAME')
}
# perform convolution and ReLU activation
# output shape is batch size, 28, 28, 32
h_conv1 <- tf$nn$relu(conv2d(x_image, W_conv1) + b_conv1)
# template to define max pooling over 2x2 regions
max_pool_2x2 <- function(x) {
  tf$nn$max_pool(
    x,
    ksize=c(1L, 2L, 2L, 1L),
    strides=c(1L, 2L, 2L, 1L),
    padding='SAME')
}
# output shape is batch size , 14, 14, 32
h_pool1 <- max_pool_2x2(h_conv1)
# next feature map is 5*5, takes 32 channels, produces 64 channels - size weights accordingly
W_conv2 <- weight_variable(shape = shape(5L, 5L, 32L, 64L))
b_conv2 <- bias_variable(shape = shape(64L))
# output shape is batch size, 14, 14, 64
h_conv2 <- tf$nn$relu(conv2d(h_pool1, W_conv2) + b_conv2)
# output shape is batch size, 7, 7, 64
h_pool2 <- max_pool_2x2(h_conv2)
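The shape comments above can be verified directly on the graph; a quick sanity check (the first dimension is the still-unknown batch size):
h_conv1$get_shape()  # (?, 28, 28, 32)
h_pool1$get_shape()  # (?, 14, 14, 32)
h_conv2$get_shape()  # (?, 14, 14, 64)
h_pool2$get_shape()  # (?, 7, 7, 64)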
# bring together all feature maps
# weights shape: 3136, 1024 (fully connected)
W_fc1 <- weight_variable(shape(7L * 7L * 64L, 1024L))
b_fc1 <- bias_variable(shape(1024L))
# reshape input: batch size, 3136
h_pool2_flat <- tf$reshape(h_pool2, shape(-1L, 7L * 7L * 64L))
# matrix multiply and ReLU
# new shape: batch size, 1024
h_fc1 <- tf$nn$relu(tf$matmul(h_pool2_flat, W_fc1) + b_fc1)
# dropout: keep_prob is fed at run time (0.5 during training, 1.0 for evaluation)
keep_prob <- tf$placeholder(tf$float32)
# output shape: batch size, 1024
h_fc1_drop <- tf$nn$dropout(h_fc1, keep_prob)
W_fc2 <- weight_variable(shape(1024L, 10L))
b_fc2 <- bias_variable(shape(10L))
# output shape: batch size, 10
y_conv <- tf$nn$softmax(tf$matmul(h_fc1_drop, W_fc2) + b_fc2)
cross_entropy <- tf$reduce_mean(-tf$reduce_sum(y_ * tf$log(y_conv), reduction_indices=1L))
train_step <- tf$train$AdamOptimizer(1e-4)$minimize(cross_entropy)
correct_prediction <- tf$equal(tf$argmax(y_conv, 1L), tf$argmax(y_, 1L))
accuracy <- tf$reduce_mean(tf$cast(correct_prediction, tf$float32))
# initialize all variables again, now including the convnet weights and the Adam optimizer's state
sess$run(tf$initialize_all_variables())
for (i in 1:2000) {
  batch <- mnist$train$next_batch(50L)
  if (i %% 250 == 0) {
    train_accuracy <- accuracy$eval(feed_dict = dict(
      x = batch[[1]], y_ = batch[[2]], keep_prob = 1.0))
    cat(sprintf("step %d, training accuracy %g\n", i, train_accuracy))
  }
  train_step$run(feed_dict = dict(
    x = batch[[1]], y_ = batch[[2]], keep_prob = 0.5), session=sess)
}
step 250, training accuracy 0.88
step 500, training accuracy 0.9
step 750, training accuracy 0.9
step 1000, training accuracy 0.96
step 1250, training accuracy 0.98
step 1500, training accuracy 1
step 1750, training accuracy 1
step 2000, training accuracy 0.98
test_accuracy <- accuracy$eval(feed_dict = dict(
  x = mnist$test$images, y_ = mnist$test$labels, keep_prob = 1.0))
cat(sprintf("test accuracy %g", test_accuracy))
test accuracy 0.98
… but that will have to be another time …
Thanks for your attention!!