Introduction

In this competition, Kagglers will develop models capable of classifying mixed patterns of proteins in microscope images. The Human Protein Atlas will use these models to build a tool integrated with their smart-microscopy system to identify a protein’s location(s) from a high-throughput image.

Proteins are “the doers” in the human cell, executing many functions that together enable life. Historically, classification of proteins has been limited to single patterns in one or a few cell types, but in order to fully understand the complexity of the human cell, models must classify mixed patterns across a range of different human cells.

Images visualizing proteins in cells are commonly used for biomedical research, and these cells could hold the key for the next breakthrough in medicine. However, thanks to advances in high-throughput microscopy, these images are generated at a far greater pace than what can be manually evaluated. Therefore, the need is greater than ever for automating biomedical image analysis to accelerate the understanding of human cells and disease.

Tidy up our Data

Let’s start by loading the relevant libraries and the CSV files for the train & test sets.

library(EBImage)
library(magick)
library(keras)
library(stringi)
library(stringr)
library(data.table)
library(dplyr)
# Work from the competition folder so the relative paths used later resolve.
setwd("G:/DataScienceProject/Kaggle-human-protein-atlas-image-classification")

# Training table (Id and Target columns are used below) and the sample
# submission, whose Id column is used to locate the test images.
Train <- read.csv("train.csv", header = TRUE)
Test <- read.csv("sample_submission.csv", header = TRUE)

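Here is how I’m going to handle the train set: 1. Load the images - only the red & green channels. 2. Crop the relevant protein region from the image center (a 224x224 window; the images are resized to 96x96 later, when they are loaded for training). 3. Sharpen the image (subtract the local background and threshold). 4. Color the images (the difference image is rendered in red). 5. Get the image differences. 6. Save the image.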

# Pre-process every training sample: crop the red and green channel images,
# threshold each against its local background, and save a red-coloured
# difference image as "<Id>_dif1.png" next to the inputs.

# Disc kernel normalised to sum 1, so filter2() yields a local average.
disc <- makeBrush(21, "disc")
disc <- disc / sum(disc)

# A pixel is kept only if it exceeds the local background by this margin.
offset <- 0.02

train_img_dir <- "G:/DataScienceProject/Kaggle-human-protein-atlas-image-classification/train/"

# seq_along() does nothing on an empty Id column, whereas 1:length() would
# iterate over c(1, 0).
for (i in seq_along(Train$Id)) {
  # Handle red image
  imgRed <- paste0(train_img_dir, Train$Id[i], "_red.png")
  img1 <- readImage(imgRed)
  img1 <- img1[144:367, 144:367]  # 224 x 224 crop
  img_bg1 <- filter2(img1, disc)
  img1 <- (img1 - img_bg1) > offset

  # Handle green image
  imgGreen <- paste0(train_img_dir, Train$Id[i], "_green.png")
  img2 <- readImage(imgGreen)
  img2 <- img2[144:367, 144:367]  # 224 x 224 crop
  img_bg2 <- filter2(img2, disc)
  img2 <- (img2 - img_bg2) > offset

  # The difference image must exist before it is coloured and written.
  # NOTE(review): the formula below is an assumption - pixels where exactly one
  # of the two binary masks is set (|red - green| for 0/1 masks). Confirm that
  # this is the intended "difference".
  dif1 <- img1 != img2
  dif1 <- rgbImage(red = 1.3 * dif1)

  imgdif1 <- paste0(train_img_dir, Train$Id[i], "_dif1.png")
  writeImage(dif1, imgdif1, quality = 85)

  print(i)  # progress indicator
}

Doing the same for the test dataset.

# Pre-process every test sample exactly like the training samples: crop the
# red and green channel images, threshold each against its local background,
# and save a red-coloured difference image as "<Id>_dif1.png".

# Disc kernel normalised to sum 1, so filter2() yields a local average.
disc <- makeBrush(21, "disc")
disc <- disc / sum(disc)

# A pixel is kept only if it exceeds the local background by this margin.
offset <- 0.02

test_img_dir <- "G:/DataScienceProject/Kaggle-human-protein-atlas-image-classification/test/"

# seq_along() does nothing on an empty Id column, whereas 1:length() would
# iterate over c(1, 0).
for (i in seq_along(Test$Id)) {
  # Handle red image
  imgRed <- paste0(test_img_dir, Test$Id[i], "_red.png")
  img1 <- readImage(imgRed)
  img1 <- img1[144:367, 144:367]  # 224 x 224 crop
  img_bg1 <- filter2(img1, disc)
  img1 <- (img1 - img_bg1) > offset

  # Handle green image
  imgGreen <- paste0(test_img_dir, Test$Id[i], "_green.png")
  img2 <- readImage(imgGreen)
  img2 <- img2[144:367, 144:367]  # 224 x 224 crop
  img_bg2 <- filter2(img2, disc)
  img2 <- (img2 - img_bg2) > offset

  # The difference image must exist before it is coloured and written.
  # NOTE(review): the formula below is an assumption - pixels where exactly one
  # of the two binary masks is set (|red - green| for 0/1 masks). Confirm that
  # this is the intended "difference".
  dif1 <- img1 != img2
  dif1 <- rgbImage(red = 1.3 * dif1)

  imgdif1 <- paste0(test_img_dir, Test$Id[i], "_dif1.png")
  writeImage(dif1, imgdif1, quality = 85)

  print(i)  # progress indicator
}

Now, we will split (classify) the dif images according to their protein labels.

# Copy each training difference image into ./train/dif1 once per label it
# carries, as "<Id>.<label>.jpeg". Train$Target holds the labels of a sample
# separated by single spaces.
dir.create("./train/dif1")

train_img_dir <- "G:/DataScienceProject/Kaggle-human-protein-atlas-image-classification/train/"

for (i in seq_along(Train$Id)) {
  sample_labels <- strsplit(as.character(Train$Target[i]), " ")[[1]]

  # Read the "<Id>_dif1.png" produced by the pre-processing step; the image has
  # to be loaded here before it can be converted and written.
  img <- image_read(paste0(train_img_dir, Train$Id[i], "_dif1.png"))
  # Convert once per sample; the same JPEG is written for every label.
  img_jpeg <- image_convert(img, "jpeg")

  # Iterating over the labels directly writes nothing when Target is empty.
  for (sample_label in sample_labels) {
    newName <- paste0(Train$Id[i], ".", sample_label)
    DifFile <- paste0(train_img_dir, "dif1/", newName, ".jpeg")
    image_write(img_jpeg, path = DifFile, format = "jpeg")
  }
}
# Convert each test difference image to JPEG and store it in ./test/dif1 as
# "<Id>.jpeg".
dir.create("./test/dif1")

test_img_dir <- "G:/DataScienceProject/Kaggle-human-protein-atlas-image-classification/test/"

for (i in seq_along(Test$Id)) {
  # Build the path of this sample's "<Id>_dif1.png" (written by the
  # pre-processing step) and read it.
  imgdif1 <- image_read(paste0(test_img_dir, Test$Id[i], "_dif1.png"))
  DifFile <- paste0(test_img_dir, "dif1/", Test$Id[i], ".jpeg")
  image_write(image_convert(imgdif1, "jpeg"), path = DifFile, format = "jpeg")
}

Deep Learning Part

I used VGG16 as my deep learning model, with a batch size of 100 and images at 96x96 resolution.

# Start of the modelling script: log progress, load the libraries and build a
# table with one row per training JPEG and the label parsed from its name.
write(paste0(date(), "::", "Start!!!"), file = "log.txt", append = TRUE)
library(keras)
library(stringr)
library(data.table)
library(dplyr)
write(paste0(date(), "::", "Finish libraries"), file = "log.txt", append = TRUE)
setwd("G:/DataScienceProject/Kaggle-human-protein-atlas-image-classification")

myFiles <- dir("./train/dif1")
SampleDF <- as.data.frame(myFiles)

# The files were written as "<Id>.<label>.jpeg", so the label is the second
# dot-separated field. Parsing all names in one vectorised step also works for
# an empty directory (NA is produced for a name without a second field).
name_parts <- strsplit(as.character(SampleDF$myFiles), "[.]")
SampleDF$Protein <- vapply(name_parts, function(parts) parts[2], character(1))
# Assemble the training table class by class (labels 0..27), keeping at most
# the first 150 files of each class. Classes with 150 files or fewer are taken
# in full; nothing is added to top them up.
Train1 <- data.frame(Id = character(), Target = character())
for (class_id in 0:27) {
  class_rows <- SampleDF[which(SampleDF$Protein == class_id), ]
  Train1 <- rbind(Train1, head(class_rows, 150))
}

# Folders holding the per-label training JPEGs and the test JPEGs.
train_directory <- "G:/DataScienceProject/Kaggle-human-protein-atlas-image-classification/train/dif1"
test_directory <- "G:/DataScienceProject/Kaggle-human-protein-atlas-image-classification/test/dif1"

# Build train array
# Input size fed to the network and the mini-batch size.
img_height <- 96
img_width <- 96
batch_size <- 100

# Placeholder; the real test array is allocated when the test files are read.
test.array <- 0

# Preallocate one label slot and one height x width x 3 image slot per file.
n_train <- length(Train1$myFiles)
train.label <- rep(0, times = n_train)
train.array <- array(NA, dim = c(n_train, img_height, img_width, 3))
write(paste0(date(), "::", "Finish train attributes"), file = "log.txt", append = TRUE)

# Load files
# Read every selected training JPEG at the network input size into the
# preallocated array and record its label.
for (i in seq_along(Train1$myFiles)) {
  temp <- image_load(paste0(train_directory, "/", Train1$myFiles[i]),
                     target_size = c(img_height, img_width),
                     grayscale = FALSE)
  temp.array <- image_to_array(temp, data_format = "channels_last")
  train.array[i, , , ] <- temp.array
  train.label[i] <- Train1$Protein[i]
}

# Scale pixel values from 0..255 to 0..1.
train.array <- train.array / 255

# One-hot encode by the numeric protein id so that column j stands for
# protein j - 1, which is how the predictions are decoded further down
# (column index - 1). Going through as.factor() would order the character
# labels lexicographically ("0", "1", "10", ..., "2", ...) and scramble that
# mapping. num_classes is fixed at 28 to match the 28-unit output layer even
# if a class has no training files.
label <- to_categorical(as.integer(train.label), num_classes = 28)
write(paste0(date(), "::", "Finish train array"), file = "log.txt", append = TRUE)

# Hold out a random 5% of the training images (and their labels) for
# validation.
val.size <- 0.05
n_samples <- nrow(train.array)
val.sample <- sample(n_samples, floor(val.size * n_samples))

# A logical mask stays correct when the sample is empty; negative indexing
# with an empty index vector would select no rows and wipe the training set.
is_val <- seq_len(n_samples) %in% val.sample

# drop = FALSE keeps the 4-d / 2-d shape even when only one row is selected.
val.array <- train.array[is_val, , , , drop = FALSE]
train.array <- train.array[!is_val, , , , drop = FALSE]

val.label <- label[is_val, , drop = FALSE]
label <- label[!is_val, , drop = FALSE]
write(paste0(date(), "::", "Finish valid samples"), file = "log.txt", append = TRUE)

# Read every test JPEG at the network input size into one array. The order of
# `files` defines the row order of test.array.
files <- list.files(test_directory, recursive = TRUE)
test.array <- array(NA, dim = c(length(files), img_height, img_width, 3))

# seq_along() skips the loop when the folder is empty; 1:length(files) would
# try to load a non-existent file.
for (i in seq_along(files)) {
  temp <- image_load(paste0(test_directory, "/", files[i]),
                     target_size = c(img_height, img_width),
                     grayscale = FALSE)
  test.array[i, , , ] <- image_to_array(temp, data_format = "channels_last")
}

# Scale pixel values from 0..255 to 0..1, as for the training images.
test.array <- test.array / 255
write(paste0(date(), "::", "Finish test array"), file = "log.txt", append = TRUE)

# Augmentation shared by the training and validation streams: small rotations
# and shifts plus horizontal and vertical flips.
datagen <- image_data_generator(
  rotation_range = 10,
  width_shift_range = 0.1,
  height_shift_range = 0.1,
  horizontal_flip = TRUE,
  vertical_flip = TRUE
)

# Shuffled, augmented batches for fitting.
train_generator <- flow_images_from_data(
  train.array, label,
  generator = datagen,
  batch_size = batch_size, shuffle = TRUE, seed = 123
)

# The validation stream goes through the same augmenting generator.
validation_generator <- flow_images_from_data(
  val.array, val.label,
  generator = datagen,
  batch_size = batch_size, shuffle = TRUE, seed = 123
)

# Test images are served one at a time, un-augmented and in their original
# order, so prediction rows line up with the test files.
test_generator <- flow_images_from_data(
  test.array,
  generator = image_data_generator(),
  batch_size = 1, shuffle = FALSE
)
write(paste0(date(), "::", "Finish gen"), file = "log.txt", append = TRUE)

  # VGG16 with ImageNet weights and without its top (classifier) layers.
base_model <- application_vgg16(weights = "imagenet", include_top = FALSE)
#  base_model <- application_resnet50(weights = 'imagenet')
write(paste0(date(), "::", "Finish base_model_vgg16"), file = "log.txt", append = TRUE)

# New head on top of the base output: global average pooling, a 128-unit ReLU
# layer, 20% dropout and a 28-unit layer followed by softmax.
pooled <- layer_global_average_pooling_2d(base_model$output, trainable = TRUE)
hidden <- layer_dense(pooled, 128, activation = "relu", trainable = TRUE)
hidden <- layer_dropout(hidden, 0.2, trainable = TRUE)
scores <- layer_dense(hidden, 28, trainable = TRUE)
predictions <- layer_activation(scores, "softmax", trainable = TRUE)

model <- keras_model(inputs = base_model$input, outputs = predictions)
write(paste0(date(), "::", "Finish model"), file = "log.txt", append = TRUE)

# This is important: mark every layer of the pre-trained base as not
# trainable (done here, ahead of compile()).
for (base_layer in base_model$layers) {
  base_layer$trainable <- FALSE
}

# Adam optimiser with categorical cross-entropy loss; accuracy is reported.
adam <- optimizer_adam(lr = 0.01, decay = 1e-6)
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = adam,
  metrics = "accuracy"
)
write(paste0(date(), "::", "Finish compile"), file = "log.txt", append = TRUE)

summary(model)

# Train for one epoch on the augmented stream, validating on the held-out
# images.
# ceiling(n / batch_size) is the number of batches needed to see every image
# once; round(n / batch_size) + 1 requests an extra batch whenever n is a
# multiple of the batch size or the fractional part is at least one half.
# max(1, ...) keeps at least one step for very small inputs.
train_steps <- max(1, ceiling(nrow(train.array) / batch_size))
val_steps <- max(1, ceiling(nrow(val.array) / batch_size))

model %>% fit_generator(
  train_generator,
  steps_per_epoch = train_steps,
  epochs = 1,
  validation_data = validation_generator,
  validation_steps = val_steps,
  verbose = 1)
write(paste0(date(), "::", "Finish fit_gen"), file = "log.txt", append = TRUE)

  # Score the test images: one step per image (the test generator serves
  # batches of one), giving one row of class scores per image.
DifPerdict <- as.data.frame(
  predict_generator(model, test_generator, steps = nrow(test.array), verbose = 1)
)

# Name the score columns with the training labels in order of first appearance.
names(DifPerdict) <- unique(train.label)
write(paste0(date(), "::", "Finish predict"), file = "log.txt", append = TRUE)

Let’s do some protein predictions.

# Build the submission table in one vectorised step: the image id is the file
# name without its ".jpeg" extension, and the predicted protein is the
# zero-based index of the highest-scoring column of that image's row.
DifPredict <- data.frame(
  # Escaped and anchored pattern: strips only a trailing literal ".jpeg".
  Id = sub("\\.jpeg$", "", files),
  # ties.method = "first" makes the choice reproducible; max.col() otherwise
  # breaks ties at random.
  Predicted = max.col(as.matrix(DifPerdict), ties.method = "first") - 1
)
write.csv(DifPredict, file = "DifPredict.csv", row.names = FALSE)

Finally, it was really great to try to predict proteins from images. Unfortunately, the accuracy was very low: 9.0%.