IMDB dataset

library(keras)
library(tensorflow)

# keep only the 2,300 most frequent words in the vocabulary
set_maxfreqword_allow = 2300

myimdb = dataset_imdb(num_words = set_maxfreqword_allow)

mydat_train        = myimdb$train$x
mydat_train_Labels = myimdb$train$y

mydat_test        = myimdb$test$x
mydat_test_Labels = myimdb$test$y
# word -> integer index, plus the reverse lookup used to decode reviews
word_to_index = dataset_imdb_word_index()
index_to_word = names(word_to_index)
names(index_to_word) <- word_to_index
First 10 entries of the word index in the IMDB dataset
word      the    and    a      of     to     is     br     in     it     i
position  58319  64305  37292  20341  67378  16730  57051  16741  13579  9680
index     1      2      3      4      5      6      7      8      9      10

Last 10 entries of the word index in the IMDB dataset
word      ev     chicatillo  transacting  sics   wheelers  pipe’s  copywrite  artbox  voorhees’  ‘l’
position  88557  88558       88559        88563  88569     88572   88575      88577   88580      88582
index     88575  88576       88577        88578  88579     88580   88581      88582   88583      88584
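
The decoding code itself does not appear above; a minimal sketch of how a review can be decoded, assuming the usual keras IMDB convention that stored indices are offset by 3 (0, 1, 2 are reserved for padding, start-of-sequence, and unknown):

# Sketch only: decode an encoded review back to words,
# assuming the standard keras IMDB index offset of 3
decode_review = function(encoded) {
  words = sapply(encoded, function(i) {
    word = if (i > 3) index_to_word[as.character(i - 3)] else NA
    if (is.na(word)) "?" else word
  })
  paste(words, collapse = " ")
}

cat(decode_review(mydat_train[[1]]))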

An example of a review decoded in this way, with set_maxfreqword_allow = 2300 (words outside the 2,300 most frequent appear as ?):

## ? this film was just brilliant casting location scenery story direction ? really ? the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same ? island as myself so i loved the fact there was a real connection with this film the witty ? throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly ? was amazing really ? at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little ? that played the ? of ? and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big ? for the whole film but these children are amazing and should be ? for what they have done don't you think the whole story was so lovely because it was true and was ? life after all that was ? with us all
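
vectorize_sequences() is used below but is not defined in this section; a minimal sketch of the multi-hot encoder it is assumed to be (each review becomes a 0/1 vector of length set_maxfreqword_allow, with a 1 at every word index that occurs in the review):

# Sketch only: multi-hot encode a list of integer sequences into a 0/1 matrix
vectorize_sequences = function(sequences, dimension = set_maxfreqword_allow) {
  results = matrix(0, nrow = length(sequences), ncol = dimension)
  for (i in seq_along(sequences))
    results[i, sequences[[i]]] = 1
  results
}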
vec_train  = vectorize_sequences(mydat_train )
trainLabs  = as.numeric(mydat_train_Labels)

vec_test   = vectorize_sequences(mydat_test)
testLabs   = as.numeric(mydat_test_Labels)


# one-hot encode the 0/1 labels into two columns (negative, positive)
trainLabels25000 = to_categorical(as.array(trainLabs))
testLabels25000  = to_categorical(as.array(testLabs))


str(vec_train)
##  num [1:25000, 1:2300] 1 1 1 1 1 1 1 1 1 1 ...
str(trainLabs)
##  num [1:25000] 1 0 0 1 0 0 1 0 1 0 ...
str(vec_test)
##  num [1:25000, 1:2300] 1 1 1 1 1 1 1 1 1 1 ...
str(testLabs)
##  num [1:25000] 0 1 1 0 1 1 1 0 0 1 ...

+ Construct the classification model (a stack of dense layers)

model = keras_model_sequential()

# Two hidden dense layers; the output layer has 2 sigmoid units,
# one per class, to match the one-hot labels from to_categorical()
model %>%
  layer_dense(units = 32, activation = "relu",
              input_shape = c(set_maxfreqword_allow)) %>%
  layer_dense(units = 32, activation = "relu") %>%
  layer_dense(units = 2,  activation = "sigmoid")

model %>%
  compile(optimizer = "rmsprop",
          loss      = "binary_crossentropy",
          metrics   = c("accuracy"))
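
As an aside, the more common setup for binary sentiment classification is a single sigmoid output trained directly on the 0/1 labels, with no to_categorical() step; a sketch of that variant, which is not what the runs below use:

# Alternative setup (not used in the runs below): single sigmoid output on 0/1 labels
alt_model = keras_model_sequential() %>%
  layer_dense(units = 32, activation = "relu",
              input_shape = c(set_maxfreqword_allow)) %>%
  layer_dense(units = 32, activation = "relu") %>%
  layer_dense(units = 1,  activation = "sigmoid")

alt_model %>%
  compile(optimizer = "rmsprop",
          loss      = "binary_crossentropy",
          metrics   = c("accuracy"))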

+ Train on 25,000 samples: first run

TrainText25000 = model %>% fit(vec_train,  
                                trainLabels25000 , 
                                epochs = 50,
                                batch_size = 512,
                                validation_split = 0.4)

plot(TrainText25000)

+ Train on 25,000 samples: second run

Calling fit() again continues training the same model; the weights are not re-initialized, so this run starts from where the first run ended.

TrainText25000 = model %>% fit(vec_train,  
                                trainLabels25000 , 
                                epochs = 50,
                                batch_size = 512,
                                validation_split = 0.4)

plot(TrainText25000)

+ Train on 25,000 samples: third run

TrainText25000 = model %>% fit(vec_train,  
                                trainLabels25000 , 
                                epochs = 50,
                                batch_size = 512,
                                validation_split = 0.4)

plot(TrainText25000)

+ Prediction on the training dataset
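
The code that produced the output below is not shown; a plausible reconstruction, assuming the standard evaluate()/predict() calls and a data frame built from the predicted class probabilities:

# Sketch only: reconstruct the evaluation, confusion matrix, and prediction table
train_eval = model %>% evaluate(vec_train, trainLabels25000)

train_prob = model %>% predict(vec_train)          # one probability column per class
train_pred = apply(train_prob, 1, which.max) - 1   # 0 = negative, 1 = positive

table(PredictCls = train_pred, ActualCls = trainLabs)

pred_table = data.frame(
  Prob_negative = round(train_prob[, 1] * 100, 2),
  Prob_positive = round(train_prob[, 2] * 100, 2),
  PredCls       = train_pred,
  ActuCls       = trainLabs,
  ActuObjs      = ifelse(trainLabs == 1, "positive", "negative"))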

## $loss
## [1] 0.6055875
## 
## $acc
## [1] 0.9388
##           ActualCls
## PredictCls     0     1
##          0 11753   784
##          1   747 11716
Prob_negative(%) Prob_positive(%) PredCls ActuCls ActuObjs
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
0 100 1 1 positive
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
0 100 1 1 positive
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
100 0 0 0 negative
0 100 1 1 positive
100 0 0 0 negative
0 100 1 1 positive
0 100 1 1 positive

+ Prediction on the test dataset
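
The test-set evaluation presumably follows the same pattern; a sketch under the same assumptions:

# Sketch only: evaluate on the held-out test data
test_eval = model %>% evaluate(vec_test, testLabels25000)
test_prob = model %>% predict(vec_test)
test_pred = apply(test_prob, 1, which.max) - 1
table(PredictCls = test_pred, ActualCls = testLabs)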

## $loss
## [1] 1.53563
## 
## $acc
## [1] 0.84544
##           ActualCls
## PredictCls     0     1
##          0 10641  2003
##          1  1859 10497
Prob_negative(%) Prob_positive(%) PredCls ActuCls ActuObjs
99.96 0.03 0 0 negative
0.00 100.00 1 1 positive
100.00 0.00 0 1 positive
0.00 100.00 1 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
100.00 0.00 0 0 negative
0.00 100.00 1 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
99.55 0.33 0 0 negative
100.00 0.00 0 0 negative
0.03 99.99 1 0 negative
0.00 100.00 1 1 positive
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
99.92 0.09 0 0 negative
100.00 0.00 0 0 negative
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
100.00 0.00 0 1 positive
0.00 100.00 1 0 negative
100.00 0.00 0 0 negative
100.00 0.00 0 0 negative
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
99.98 0.03 0 0 negative
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
100.00 0.00 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
99.09 0.89 0 1 positive
100.00 0.00 0 0 negative
100.00 0.00 0 0 negative
100.00 0.00 0 0 negative
100.00 0.00 0 1 positive
100.00 0.00 0 0 negative
99.98 0.02 0 0 negative
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
0.00 100.00 1 1 positive
99.58 0.53 0 1 positive
100.00 0.00 0 1 positive
99.98 0.01 0 1 positive
100.00 0.00 0 0 negative
99.98 0.02 0 0 negative