An attempt at a deep learning model. Most of the code is adapted from https://www.kaggle.com/tentotheminus9/normal-vs-pneumonia-keras-in-r and https://shirinsplayground.netlify.com/2018/06/keras_fruits/
The dataset is a chest X-ray image dataset from Kaggle: https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia/data
This code builds a neural network to detect pneumonia (bacterial or viral) in chest X-rays.
train_dir = 'datasets/chest_xray/train/'
validation_dir = 'datasets/chest_xray/val/'
test_dir = 'datasets/chest_xray/test/'
Augmentation creates many variations of the training images, such as rotations, shifts, shears, and mirror images. This effectively increases the size of the training set. None of this should be done for the validation and test sets.
All the data is rescaled by dividing by 255, so pixel values run from 0 to 1.
#Create Keras Generators
set.seed(123)

#The train generator uses augmentation,
train_datagen = image_data_generator(
  rescale = 1/255,
  rotation_range = 5,
  width_shift_range = 0.1,
  height_shift_range = 0.05,
  shear_range = 0.1,
  zoom_range = 0.15,
  horizontal_flip = TRUE,
  vertical_flip = FALSE,
  fill_mode = "reflect"
)

validation_datagen <- image_data_generator(rescale = 1/255)
test_datagen <- image_data_generator(rescale = 1/255)
training_batch_size = 32
validation_batch_size = 32
train_generator <- flow_images_from_directory(
  train_dir,                   # Target directory
  train_datagen,               # Data generator
  classes = c('NORMAL', 'PNEUMONIA'),
  target_size = c(224, 224),   # Resizes all images
  batch_size = training_batch_size,
  class_mode = "categorical",
  shuffle = TRUE,
  seed = 123
)
validation_generator <- flow_images_from_directory(
  validation_dir,
  validation_datagen,
  classes = c('NORMAL', 'PNEUMONIA'),
  target_size = c(224, 224),
  batch_size = validation_batch_size,
  class_mode = "categorical",
  shuffle = TRUE,
  seed = 123
)
test_generator <- flow_images_from_directory(
  test_dir,
  test_datagen,
  classes = c('NORMAL', 'PNEUMONIA'),
  target_size = c(224, 224),
  batch_size = 1,
  class_mode = "categorical",
  shuffle = FALSE
)
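As a quick sanity check (not part of the original analysis), one can confirm the label-to-index mapping and the shape of one augmented batch; generator_next() draws a single batch from a generator:

#Sanity check: label mapping and shape of one augmented training batch,
train_generator$class_indices          # NORMAL = 0, PNEUMONIA = 1
batch <- generator_next(train_generator)
dim(batch[[1]])                        # 32 224 224 3 (images)
dim(batch[[2]])                        # 32 2 (one-hot labels)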
Here, a predefined model from Google is used: https://ai.googleblog.com/2016/08/improving-inception-and-image.html
This is a “residual neural network”: https://en.wikipedia.org/wiki/Residual_neural_network
It also comes with pretrained weights, trained on ImageNet, a large database of annotated images. This means the network has already learned how to identify features in images. All that is left to do is to expand the model so that those features are connected to the outcomes of the problem at hand. These are the “top” layers, so include_top is set to FALSE and a new top is added:
#Transfer learning
#Load inception resnet v2 as the base CNN, making sure the weights are the 'no top' version,
conv_base <- application_inception_resnet_v2(
  weights = "imagenet",
  include_top = FALSE,
  input_shape = c(224, 224, 3)
)

#Create a new top,
model <- keras_model_sequential() %>%
  conv_base %>%
  layer_global_average_pooling_2d(trainable = TRUE) %>%
  layer_dropout(rate = 0.2, trainable = TRUE) %>%
  layer_dense(units = 224, activation = "relu", trainable = TRUE) %>%
  layer_dense(units = 2, activation = "softmax", trainable = TRUE)

#Don't train the base,
freeze_weights(conv_base)
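To verify that the freeze took effect, summary(model) reports trainable and non-trainable parameter counts; only the new top layers should contribute to the trainable count:

#Check the freeze: the non-trainable parameter count should be far larger
#than the trainable one (only the new top remains trainable),
summary(model)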
Compile the model. Use “categorical_crossentropy” as the loss function, which is most appropriate for classification problems.
According to its help page, optimizer_rmsprop “is usually a good choice for recurrent neural networks”.
set.seed(123)
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_rmsprop(lr = 1e-6),
  metrics = c("accuracy")
)
number_normal_train_samples = length(list.files(paste0(train_dir, 'NORMAL/'), recursive = TRUE))
number_pneumonia_train_samples = length(list.files(paste0(train_dir, 'PNEUMONIA/'), recursive = TRUE))
class_rebalance = number_normal_train_samples / number_pneumonia_train_samples
There are 1341 normal and 3875 pneumonia images in the training set. Attempt to balance these by giving the pneumonia samples proportionally less weight during training.
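With those counts, the weight applied to each pneumonia sample works out to roughly a third:

#Quick arithmetic check using the counts quoted above,
1341 / 3875 # = 0.3460645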
# Train the new model. Save the results, because training this network takes a lot of time.
model_file = "data/191202_Pneumonia_TrainedModel.Rdata"
if(file.exists(model_file)){load(model_file)}else{
  set.seed(123)
  history <- model %>% fit_generator(
    train_generator,
    steps_per_epoch = 100,
    class_weight = list("0" = 1, "1" = class_rebalance),
    epochs = 30,
    validation_data = validation_generator,
    validation_steps = ceiling(length(list.files(validation_dir, recursive = TRUE)) / validation_batch_size)
  )
  save(model, history, file = model_file)
}
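One caveat (not from the original code): a Keras model in R is a reference to a Python object, so save()/load() may not restore a working model in a fresh R session. The dedicated serializers are safer; a minimal sketch, with a hypothetical file name:

#Safer persistence sketch: save/restore the model itself via HDF5,
#the .h5 file name here is a placeholder,
save_model_hdf5(model, "data/191202_Pneumonia_TrainedModel.h5")
model <- load_model_hdf5("data/191202_Pneumonia_TrainedModel.h5")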
plot(history)
#Make predictions on the test set,
pred_file = "data/191202_Pneumonia_preds.Rdata"
if(file.exists(pred_file)){load(pred_file)}else{
  preds = predict_generator(model,
                            test_generator,
                            steps = length(list.files(test_dir, recursive = TRUE)))
  save(preds, file = pred_file)
}
#Do some tidying,
predictions = data.frame(Filename = test_generator$filenames,
                         Prob_Pneumonia = preds[, 2])
predictions$Class_predicted = 'Normal'
predictions$Class_predicted[predictions$Prob_Pneumonia >= 0.5] = 'Pneumonia'
predictions$Class_actual = 'Normal'
predictions$Class_actual[grep("PNEUMONIA", predictions$Filename)] = 'Pneumonia'
predictions$Class_actual = as.factor(predictions$Class_actual)
predictions$Class_predicted = as.factor(predictions$Class_predicted)
Get the optimal cutoff with Youden’s index, i.e. the threshold that maximizes sensitivity + specificity - 1:
library(pROC)
roc = roc(response = predictions$Class_actual,
          predictor = as.vector(predictions$Prob_Pneumonia),
          ci = TRUE,
          levels = c('Normal', 'Pneumonia'))
threshold = coords(roc, x = 'best', best.method = 'youden')
threshold
## threshold specificity sensitivity
## 0.3862341 0.8504274 0.6410256
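The same threshold can be recovered by hand from the roc object, since Youden’s J is simply sensitivity + specificity - 1:

#Manual check of Youden's index: pick the threshold that maximizes J,
j = roc$sensitivities + roc$specificities - 1
roc$thresholds[which.max(j)]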
boxplot(predictions$Prob_Pneumonia ~ predictions$Class_actual,
        main = 'Probabilities of Normal vs Pneumonia',
        ylab = 'Probability',
        col = 'light blue')
abline(h = threshold[1], col = 'red', lwd = 3)
plot(roc, print.auc=T)
ci_sens = ci.se(roc, specificities = seq(from=0, to=1, by=0.01), boot.n = 2000)
plot(ci_sens, type="shape", col="#00860022", no.roc=TRUE)
Confusion Matrix
predictions$Class_predicted = 'Normal'
predictions$Class_predicted[predictions$Prob_Pneumonia >= threshold[1]] = 'Pneumonia'
predictions$Class_predicted = factor(predictions$Class_predicted)
caret::confusionMatrix(predictions$Class_predicted, predictions$Class_actual, positive = 'Pneumonia')
## Confusion Matrix and Statistics
##
## Reference
## Prediction Normal Pneumonia
## Normal 199 140
## Pneumonia 35 250
##
## Accuracy : 0.7196
## 95% CI : (0.6825, 0.7545)
## No Information Rate : 0.625
## P-Value [Acc > NIR] : 4.019e-07
##
## Kappa : 0.451
##
## Mcnemar's Test P-Value : 3.791e-15
##
## Sensitivity : 0.6410
## Specificity : 0.8504
## Pos Pred Value : 0.8772
## Neg Pred Value : 0.5870
## Prevalence : 0.6250
## Detection Rate : 0.4006
## Detection Prevalence : 0.4567
## Balanced Accuracy : 0.7457
##
## 'Positive' Class : Pneumonia
##
Good but not great. One person on Kaggle achieved an accuracy of >90% with very similar code, using a different predefined network structure (“xception”). But where my current network kept the ImageNet weights frozen, the other network fine-tuned those weights from a certain layer onwards:
https://github.com/fr407041/Pneumonia-Diagnosis-using-XRays
Fine-tuning might improve accuracy with this model as well, but the current analysis already took almost 10 hours to run on my laptop.
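For reference, such a fine-tuning pass could look roughly like the sketch below (untested here; the ‘from’ layer is a placeholder, not a vetted choice): unfreeze the last part of the base, recompile with a low learning rate so the pretrained weights are only nudged, and then run fit_generator again.

#Fine-tuning sketch (not run): unfreeze the base from a late layer onwards;
#'conv_7b' is a placeholder layer name. Recompile before training further,
unfreeze_weights(conv_base, from = "conv_7b")
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_rmsprop(lr = 1e-6),
  metrics = c("accuracy")
)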