IMDB dataset

library(keras)
library(tensorflow)

# keep only the 2,300 most frequent words in the vocabulary
set_maxfreqword_allow = 2300

myimdb = dataset_imdb(num_words = set_maxfreqword_allow)

mydat_train        = myimdb$train$x
mydat_train_Labels = myimdb$train$y

mydat_test        = myimdb$test$x
mydat_test_Labels = myimdb$test$y
# word -> integer index, plus the reverse lookup used to decode reviews
word_to_index = dataset_imdb_word_index()
index_to_word = names(word_to_index)
names(index_to_word) <- word_to_index
First 10 entries of the word index in the IMDB dataset
word      the    and    a      of     to     is     br     in     it     i
position  58319  64305  37292  20341  67378  16730  57051  16741  13579  9680
index     1      2      3      4      5      6      7      8      9      10

Last 10 entries of the word index in the IMDB dataset
word      ev     chicatillo  transacting  sics   wheelers  pipe’s  copywrite  artbox  voorhees’  ‘l’
position  88557  88558       88559        88563  88569     88572   88575      88577   88580      88582
index     88575  88576       88577        88578  88579     88580   88581      88582   88583      88584
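
The decoding code itself does not appear above; a minimal sketch of how a review can be decoded, assuming the usual keras IMDB convention that stored indices are offset by 3 (0, 1, 2 are reserved for padding, start-of-sequence, and unknown):

# Sketch only: decode an encoded review back to words,
# assuming the standard keras IMDB index offset of 3
decode_review = function(encoded) {
  words = sapply(encoded, function(i) {
    word = if (i > 3) index_to_word[as.character(i - 3)] else NA
    if (is.na(word)) "?" else word
  })
  paste(words, collapse = " ")
}

cat(decode_review(mydat_train[[1]]))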

An example of a review decoded in this way, with set_maxfreqword_allow = 2300 (words outside the 2,300 most frequent appear as ?):

## ? this film was just brilliant casting location scenery story direction ? really ? the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same ? island as myself so i loved the fact there was a real connection with this film the witty ? throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly ? was amazing really ? at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little ? that played the ? of ? and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big ? for the whole film but these children are amazing and should be ? for what they have done don't you think the whole story was so lovely because it was true and was ? life after all that was ? with us all
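
vectorize_sequences() is used below but is not defined in this section; a minimal sketch of the multi-hot encoder it is assumed to be (each review becomes a 0/1 vector of length set_maxfreqword_allow, with a 1 at every word index that occurs in the review):

# Sketch only: multi-hot encode a list of integer sequences into a 0/1 matrix
vectorize_sequences = function(sequences, dimension = set_maxfreqword_allow) {
  results = matrix(0, nrow = length(sequences), ncol = dimension)
  for (i in seq_along(sequences))
    results[i, sequences[[i]]] = 1
  results
}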
vec_train  = vectorize_sequences(mydat_train )
trainLabs  = as.numeric(mydat_train_Labels)

vec_test   = vectorize_sequences(mydat_test)
testLabs   = as.numeric(mydat_test_Labels)


# one-hot encode the 0/1 labels into two columns (negative, positive)
trainLabels25000 = to_categorical(as.array(trainLabs))
testLabels25000  = to_categorical(as.array(testLabs))


str(vec_train)
##  num [1:25000, 1:2300] 1 1 1 1 1 1 1 1 1 1 ...
str(trainLabs)
##  num [1:25000] 1 0 0 1 0 0 1 0 1 0 ...
str(vec_test)
##  num [1:25000, 1:2300] 1 1 1 1 1 1 1 1 1 1 ...
str(testLabs)
##  num [1:25000] 0 1 1 0 1 1 1 0 0 1 ...

+ Construct the classification model (a stack of dense layers)

model = keras_model_sequential()

# Two hidden dense layers; the output layer has 2 sigmoid units,
# one per class, to match the one-hot labels from to_categorical()
model %>%
  layer_dense(units = 32, activation = "relu",
              input_shape = c(set_maxfreqword_allow)) %>%
  layer_dense(units = 32, activation = "relu") %>%
  layer_dense(units = 2,  activation = "sigmoid")

model %>%
  compile(optimizer = "rmsprop",
          loss      = "binary_crossentropy",
          metrics   = c("accuracy"))
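
As an aside, the more common setup for binary sentiment classification is a single sigmoid output trained directly on the 0/1 labels, with no to_categorical() step; a sketch of that variant, which is not what the runs below use:

# Alternative setup (not used in the runs below): single sigmoid output on 0/1 labels
alt_model = keras_model_sequential() %>%
  layer_dense(units = 32, activation = "relu",
              input_shape = c(set_maxfreqword_allow)) %>%
  layer_dense(units = 32, activation = "relu") %>%
  layer_dense(units = 1,  activation = "sigmoid")

alt_model %>%
  compile(optimizer = "rmsprop",
          loss      = "binary_crossentropy",
          metrics   = c("accuracy"))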

+ Train on 25,000 samples: first run

TrainText25000 = model %>% fit(vec_train,  
                                trainLabels25000 , 
                                epochs = 50,
                                batch_size = 512,
                                validation_split = 0.4)

plot(TrainText25000)

+ Train on 25,000 samples: second run

Calling fit() again continues training the same model; the weights are not re-initialized, so this run starts from where the first run ended.

TrainText25000 = model %>% fit(vec_train,  
                                trainLabels25000 , 
                                epochs = 50,
                                batch_size = 512,
                                validation_split = 0.4)

plot(TrainText25000)

+ Train on 25,000 samples: third run

TrainText25000 = model %>% fit(vec_train,  
                                trainLabels25000 , 
                                epochs = 50,
                                batch_size = 512,
                                validation_split = 0.4)

plot(TrainText25000)

+ Prediction on the training dataset
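
The code that produced the output below is not shown; a plausible reconstruction, assuming the standard evaluate()/predict() calls and a data frame built from the predicted class probabilities:

# Sketch only: reconstruct the evaluation, confusion matrix, and prediction table
train_eval = model %>% evaluate(vec_train, trainLabels25000)

train_prob = model %>% predict(vec_train)          # one probability column per class
train_pred = apply(train_prob, 1, which.max) - 1   # 0 = negative, 1 = positive

table(PredictCls = train_pred, ActualCls = trainLabs)

pred_table = data.frame(
  Prob_negative = round(train_prob[, 1] * 100, 2),
  Prob_positive = round(train_prob[, 2] * 100, 2),
  PredCls       = train_pred,
  ActuCls       = trainLabs,
  ActuObjs      = ifelse(trainLabs == 1, "positive", "negative"))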

## $loss
## [1] 0.6055875
## 
## $acc
## [1] 0.9388
##           ActualCls
## PredictCls     0     1
##          0 11753   784
##          1   747 11716
Prob_negative(%) Prob_positive(%) PredCls ActuCls ActuObjs
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
0 100 1 1 positive
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
0 100 1 1 positive
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
0 100 1 1 positive

+ Prediction on the test dataset
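
The test-set evaluation presumably follows the same pattern; a sketch under the same assumptions:

# Sketch only: evaluate on the held-out test data
test_eval = model %>% evaluate(vec_test, testLabels25000)
test_prob = model %>% predict(vec_test)
test_pred = apply(test_prob, 1, which.max) - 1
table(PredictCls = test_pred, ActualCls = testLabs)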

## $loss
## [1] 1.53563
## 
## $acc
## [1] 0.84544
##           ActualCls
## PredictCls     0     1
##          0 10641  2003
##          1  1859 10497
Prob_negative(%) Prob_positive(%) PredCls ActuCls ActuObjs
99.96 0.03 0 0 negative
0.00 100.00 1 1 positive
100.00 0.00 0 1 positive
0.00 100.00 1 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
100.00 0.00 0 0 negative
0.00 100.00 1 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
99.55 0.33 0 0 negative
100.00 0.00 0 0 negative
0.03 99.99 1 0 negative
0.00 100.00 1 1 positive
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
99.92 0.09 0 0 negative
100.00 0.00 0 0 negative
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
100.00 0.00 0 1 positive
0.00 100.00 1 0 negative
100.00 0.00 0 0 negative
100.00 0.00 0 0 negative
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
99.98 0.03 0 0 negative
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
99.09 0.89 0 1 positive
100.00 0.00 0 0 negative
100.00 0.00 0 0 negative
100.00 0.00 0 0 negative
100.00 0.00 0 1 positive
100.00 0.00 0 0 negative
99.98 0.02 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
99.58 0.53 0 1 positive
100.00 0.00 0 1 positive
99.98 0.01 0 1 positive
100.00 0.00 0 0 negative
99.98 0.02 0 0 negative