library(tensorflow)
library(tfdatasets)
library(keras)

STAT654 Project
Make a stylized picture using a neural network.
The goal of the project is to run the style-transfer code from Chapter 12, Section 3 on a picture of interest to you, using a style of painting you like.
# Helper: convert an image tensor to an R array and plot it as a raster image.
display_image_tensor <- function(x, ..., max = 255,
                                 plot_margins = c(0, 0, 0, 0)) {
  if (!is.null(plot_margins))
    withr::local_par(mar = plot_margins)

  x %>%
    as.array() %>%
    drop() %>%
    as.raster(max = max) %>%
    plot(..., interpolate = FALSE)
}

## -------------------------------------------------------------------------
# Download the content (base) image and the style reference image.
base_image_path <- get_file(
  "east-bay-sign.jpg",
  origin = "https://www.csueastbay.edu/about/files/images/photo-gallery/east-bay-sign.jpg")

style_reference_image_path <- get_file(
  "starry_night.jpg",
  origin = "https://img-datasets.s3.amazonaws.com/starry_night.jpg")

# Read the base image once to get its original dimensions, then fix the
# working height at 400 pixels and scale the width proportionally.
c(original_height, original_width) %<-% {
  base_image_path %>%
    tf$io$read_file() %>%
    tf$io$decode_image() %>%
    dim() %>% .[1:2]
}

img_height <- 400
img_width <- round(img_height * (original_width / original_height))

plot(as.raster(jpeg::readJPEG(base_image_path)))

## -------------------------------------------------------------------------
# Load an image, resize it to the working dimensions, add a batch axis, and
# apply VGG19's ImageNet preprocessing (RGB -> BGR, channel-mean subtraction).
preprocess_image <- function(image_path) {
  image_path %>%
    tf$io$read_file() %>%
    tf$io$decode_image() %>%
    tf$image$resize(as.integer(c(img_height, img_width))) %>%
    k_expand_dims(axis = 1) %>%
    imagenet_preprocess_input()
}

# Undo the preprocessing: add back the ImageNet channel means
# (B = 103.939, G = 116.779, R = 123.68) and convert BGR back to RGB.
deprocess_image <- tf_function(function(img) {
  if (length(dim(img)) == 4)
    img <- k_squeeze(img, axis = 1)

  c(b, g, r) %<-% {
    img %>%
      k_reshape(c(img_height, img_width, 3)) %>%
      k_unstack(axis = 3)
  }

  r %<>% `+`(123.68)
  g %<>% `+`(116.779)
  b %<>% `+`(103.939)

  k_stack(c(r, g, b), axis = 3) %>%
    k_clip(0, 255) %>%
    k_cast("uint8")
})
## -------------------------------------------------------------------------
# Load VGG19 (convolutional base only) and build a feature extractor that
# returns the activations of every layer, indexed by layer name.
model <- application_vgg19(weights = "imagenet", include_top = FALSE)

outputs <- list()
for (layer in model$layers)
  outputs[[layer$name]] <- layer$output

feature_extractor <- keras_model(inputs = model$inputs,
                                 outputs = outputs)
## -------------------------------------------------------------------------
content_loss <- function(base_img, combination_img)
  sum((combination_img - base_img)^2)
## -------------------------------------------------------------------------
# Gram matrix of a feature map: inner products between feature channels,
# which capture the texture/style statistics of a layer.
gram_matrix <- function(x) {
  n_features <- tf$shape(x)[3]
  x %>%
    tf$reshape(c(-1L, n_features)) %>%
    tf$matmul(., ., transpose_a = TRUE)
}

style_loss <- function(style_img, combination_img) {
  S <- gram_matrix(style_img)
  C <- gram_matrix(combination_img)
  channels <- 3
  size <- img_height * img_width
  sum((S - C)^2) /
    (4 * (channels^2) * (size^2))
}
## -------------------------------------------------------------------------
# Total variation loss: penalizes differences between neighboring pixels so
# the generated image stays locally coherent.
total_variation_loss <- function(x) {
  a <- k_square(x[, NA:(img_height - 1), NA:(img_width - 1), ] -
                x[, 2:NA, NA:(img_width - 1), ])
  b <- k_square(x[, NA:(img_height - 1), NA:(img_width - 1), ] -
                x[, NA:(img_height - 1), 2:NA, ])
  sum((a + b)^1.25)
}
## -------------------------------------------------------------------------
style_layer_names <- c(
  "block1_conv1",
  "block2_conv1",
  "block3_conv1",
  "block4_conv1",
  "block5_conv1"
)
content_layer_name <- "block5_conv2"
total_variation_weight <- 1e-6
content_weight <- 2.5e-8
style_weight <- 1e-6
# Total loss: content loss from one VGG19 layer, style loss averaged over the
# five style layers, plus the total variation regularizer.
compute_loss <-
  function(combination_image, base_image, style_reference_image) {
    input_tensor <-
      list(base_image,
           style_reference_image,
           combination_image) %>%
      k_concatenate(axis = 1)

    features <- feature_extractor(input_tensor)

    layer_features <- features[[content_layer_name]]
    base_image_features <- layer_features[1, , , ]
    combination_features <- layer_features[3, , , ]

    loss <- 0
    loss %<>% `+`(
      content_loss(base_image_features, combination_features) *
        content_weight
    )

    for (layer_name in style_layer_names) {
      layer_features <- features[[layer_name]]
      style_reference_features <- layer_features[2, , , ]
      combination_features <- layer_features[3, , , ]
      loss %<>% `+`(
        style_loss(style_reference_features, combination_features) *
          style_weight / length(style_layer_names)
      )
    }

    loss %<>% `+`(
      total_variation_loss(combination_image) *
        total_variation_weight
    )

    loss
  }
## -------------------------------------------------------------------------
# Compute the loss and its gradient with respect to the combination image,
# compiled with tf_function() for speed.
compute_loss_and_grads <- tf_function(
  function(combination_image, base_image, style_reference_image) {
    with(tf$GradientTape() %as% tape, {
      loss <- compute_loss(combination_image,
                           base_image,
                           style_reference_image)
    })
    grads <- tape$gradient(loss, combination_image)
    list(loss, grads)
  })
optimizer <- optimizer_sgd(
  learning_rate = learning_rate_schedule_exponential_decay(
    initial_learning_rate = 100,
    decay_steps = 100,
    decay_rate = 0.96
  ))
base_image <- preprocess_image(base_image_path)
style_reference_image <- preprocess_image(style_reference_image_path)
combination_image <- tf$Variable(preprocess_image(base_image_path))

## -------------------------------------------------------------------------
output_dir <- fs::path("style-transfer-generated-images")

# Gradient-descent loop: repeatedly nudge the combination image to lower the
# combined loss, logging and saving the result every 100 iterations.
iterations <- 200
for (i in seq(iterations)) {
  c(loss, grads) %<-% compute_loss_and_grads(
    combination_image, base_image, style_reference_image)

  optimizer$apply_gradients(list(
    tuple(grads, combination_image)))

  if ((i %% 100) == 0) {
    cat(sprintf("Iteration %i: loss = %.2f\n", i, loss))
    img <- deprocess_image(combination_image)
    display_image_tensor(img)
    fname <- sprintf("combination_image_at_iteration_%04i.png", i)
    tf$io$write_file(filename = output_dir / fname,
                     contents = tf$io$encode_png(img))
  }
}

Iteration 100: loss = 6733.86
Iteration 200: loss = 5424.67
Extra Credit - About VGG19 model
VGG19 is a deep convolutional neural network architecture for image classification. It was created by the Visual Geometry Group (VGG) at the University of Oxford in 2014. The VGG team developed several variations of the VGG network architecture, including VGG16 and VGG19, which were designed for the ImageNet Large Scale Visual Recognition Challenge (ILSVRC) in 2014.
The VGG19 model achieved top results at ILSVRC 2014 (first place in the localization task and second place in classification), and it has since become a popular model for feature extraction and transfer learning in computer vision applications.
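As a quick, optional check (a minimal sketch, assuming only the keras package loaded at the top of this report), the VGG19 backbone can be loaded on its own and its layer names listed; these are the same layer names used for the content and style losses above.

vgg19 <- application_vgg19(weights = "imagenet", include_top = FALSE)
# The 19 in VGG19 counts its weight layers: 16 convolutional layers plus 3
# fully connected layers. include_top = FALSE keeps only the convolutional
# blocks, which is all the style-transfer code needs for feature extraction.
sapply(vgg19$layers, function(layer) layer$name)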
Extra Credit - Microsoft Image Creator
Here is the picture generated from the prompt “starry night at CSU East Bay Campus”.
Bing Image Creator uses DALL-E, one of OpenAI’s powerful artificial intelligence models. The images DALL-E creates are generated from its extensive training, which used millions of images to help the model learn what images are, their contextual clues and meanings, and how they translate to text.
DALL-E was first released in January 2021 and has since been upgraded significantly in its second version, DALL-E 2. It is one of a handful of AI art generators that became hugely popular for their ability to create beautiful images and art from scratch.
How Does DALL-E Work?
Let’s start with how DALL-E was trained: on millions of images from across the internet, drawn from datasets in which each picture has a text caption. As you might imagine, with enough data the AI model can learn to recognize what an object is and what it might look like in an image.
DALL-E was also built using a language model called GPT-3 (Generative Pre-trained Transformer), a technology largely popularized when OpenAI released ChatGPT, an AI chatbot you can talk to in natural human language. The language model bridges the gap between text and image and helps turn the words you type into an image on the screen made up of pixels.
Another core part of DALL-E is its use of a diffusion model. This model takes a noisy image (think of a heavily pixelated image that isn’t recognizable) and works backward to produce a clear image that matches the text description you entered.
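To make the diffusion idea a bit more concrete, here is a tiny toy sketch in R (my own illustration with a made-up noise schedule, not DALL-E’s actual model). It only shows the forward process of gradually corrupting an image with noise; the generative model is trained to run this process in reverse, guided by the text prompt.

set.seed(654)
img <- jpeg::readJPEG(base_image_path)    # pixel values in [0, 1]
noise_levels <- c(0.2, 0.5, 0.8, 1.0)     # hypothetical noise schedule
noisy <- lapply(noise_levels, function(s) {
  # Mix the clean image with Gaussian noise; at s = 1 almost nothing of the
  # original image remains.
  pmin(pmax(sqrt(1 - s^2) * img + s * rnorm(length(img)), 0), 1)
})
plot(as.raster(noisy[[2]]))  # view a partially noised version of the image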