This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(keras)
library(tensorflow)
library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the Reuters newswire dataset, keeping only the 10,000 most frequent words
c(c(train_data, train_labels), c(test_data, test_labels)) %<-% dataset_reuters(num_words = 10000)
length(train_data)
## [1] 8982
length(test_data)
## [1] 2246
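The `%<-%` operator used above is the multi-assignment (destructuring) operator from the zeallot package, which keras re-exports; it unpacks the nested train/test lists returned by `dataset_reuters()` in one step. A rough base-R equivalent, as a sketch:

# Roughly equivalent destructuring without %<-% (a sketch)
reuters <- dataset_reuters(num_words = 10000)
train_data   <- reuters$train$x
train_labels <- reuters$train$y
test_data    <- reuters$test$x
test_labels  <- reuters$test$y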
train_data[[1]]
## [1] 1 2 2 8 43 10 447 5 25 207 270 5 3095 111
## [15] 16 369 186 90 67 7 89 5 19 102 6 19 124 15
## [29] 90 67 84 22 482 26 7 48 4 49 8 864 39 209
## [43] 154 6 151 6 83 11 15 22 155 11 15 7 48 9
## [57] 4579 1005 504 6 258 6 272 11 15 22 134 44 11 15
## [71] 16 8 197 1245 90 67 52 29 209 30 32 132 6 109
## [85] 15 17 12
# Build a character vector of words ordered by their index, for decoding
word_index <- dataset_reuters_word_index() %>%
  unlist() %>%
  sort() %>%
  names()
# Decode the first newswire back to words; indices are offset by 3 because
# 0, 1 and 2 are reserved for "padding", "start of sequence" and "unknown"
train_data[[1]] %>%
  map(~ ifelse(.x >= 3, word_index[.x - 3], "?")) %>%
  as_vector() %>%
  cat()
## ? ? ? said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3
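To reuse this decoding elsewhere, the pipeline above can be wrapped in a small helper. This is a sketch; `decode_review` is a name chosen here, not a keras function:

# Hypothetical helper wrapping the decoding pipeline above
decode_review <- function(sequence, word_index) {
  sequence %>%
    map(~ ifelse(.x >= 3, word_index[.x - 3], "?")) %>%
    as_vector() %>%
    paste(collapse = " ")
}
# decode_review(train_data[[1]], word_index) reproduces the text above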
# Vectorize the sequences as 10,000-dimensional multi-hot vectors:
# row i gets a 1 in every column whose word index occurs in review i
vectorize_sequences <- function(sequences, dimension = 10000) {
  results <- matrix(0, nrow = length(sequences), ncol = dimension)
  for (i in seq_along(sequences)) {
    results[i, sequences[[i]]] <- 1
  }
  results
}
train_data_vec <- vectorize_sequences(train_data)
test_data_vec <- vectorize_sequences(test_data)
# Sorted unique word indices of the first review, for comparison with its vector
train_example <- sort(unique(train_data[[1]]))
train_example
## [1] 1 2 4 5 6 7 8 9 10 11 12 15 16 17
## [15] 19 22 25 26 29 30 32 39 43 44 48 49 52 67
## [29] 83 84 89 90 102 109 111 124 132 134 151 154 155 186
## [43] 197 207 209 258 270 272 369 447 482 504 864 1005 1245 3095
## [57] 4579
train_data_vec[1,1:100]
## [1] 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 1 0 0 0
## [36] 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## [71] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
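As a quick sanity check, the positions of the 1s in the first vectorized row should be exactly the sorted unique indices shown in `train_example` above:

# Columns set to 1 should match the unique word indices of review 1
all(which(train_data_vec[1, ] == 1) == train_example)  # should be TRUE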
str(train_labels)
## int [1:8982] 3 4 3 4 4 4 4 3 3 16 ...
sort(unique(train_labels))
## [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
## [24] 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
# Distribution of the 46 topic labels in the training set
train_labels %>%
  plyr::count() %>%
  ggplot(aes(x, freq)) +
  geom_col()
# One-hot encode the labels: to_categorical() turns label k into a
# length-46 vector with a single 1 in position k + 1
train_labels_vec <- to_categorical(train_labels)
test_labels_vec <- to_categorical(test_labels)
str(train_labels_vec)
## num [1:8982, 1:46] 0 0 0 0 0 0 0 0 0 0 ...
str(test_labels_vec)
## num [1:2246, 1:46] 0 0 0 0 0 0 0 0 0 0 ...
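Since R matrices are 1-indexed, `to_categorical()` puts the 1 for label k in column k + 1. A quick round trip recovers the integer labels (a sketch):

# Recover the first few integer labels from the one-hot matrix
head(apply(train_labels_vec, 1, which.max) - 1)  # expect 3 4 3 4 4 4, as in str(train_labels)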
# Create the network: two hidden layers of 64 units and a 46-way
# softmax output layer, one probability per topic
network <- keras_model_sequential() %>%
  layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 46, activation = "softmax")
summary(network)
## Model: "sequential"
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## dense (Dense) (None, 64) 640064
## ___________________________________________________________________________
## dense_1 (Dense) (None, 64) 4160
## ___________________________________________________________________________
## dense_2 (Dense) (None, 46) 2990
## ===========================================================================
## Total params: 647,214
## Trainable params: 647,214
## Non-trainable params: 0
## ___________________________________________________________________________
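The parameter counts follow from the usual dense-layer formula: inputs × units for the weights, plus units for the biases. As a check:

# Dense layer parameters = inputs * units + units
10000 * 64 + 64  # dense:   640064
64 * 64 + 64     # dense_1:   4160
64 * 46 + 46     # dense_2:   2990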
# Compile: categorical crossentropy is the standard loss for
# single-label, multi-class classification
network %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)
# Set aside the first 1,000 training samples as a validation set
index <- 1:1000
val_data_vec <- train_data_vec[index, ]
train_data_vec <- train_data_vec[-index, ]
val_labels_vec <- train_labels_vec[index, ]
train_labels_vec <- train_labels_vec[-index, ]
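Note that this takes the first 1,000 samples as the validation set. If there were any doubt about the ordering of the training data, a random split would be the safer choice, e.g. (not run here):

# Alternative: sample the validation indices at random instead of taking the first 1,000
# set.seed(123)
# index <- sample(nrow(train_data_vec), 1000)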
# Train the model for 20 epochs, monitoring the validation set
history <- network %>% fit(
  train_data_vec,
  train_labels_vec,
  epochs = 20,
  batch_size = 512,
  validation_data = list(val_data_vec, val_labels_vec)
)
plot(history)
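Rather than eyeballing the plot, the best epoch can also be read off the history object, whose `$metrics` element holds the per-epoch vectors (a sketch):

# Epoch with the lowest validation loss, used to pick the epoch count below
which.min(history$metrics$val_loss)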
# Re-create the model from scratch and train for only 9 epochs,
# since validation accuracy starts overfitting beyond that point
network <- keras_model_sequential() %>%
  layer_dense(units = 64, activation = "relu", input_shape = c(10000)) %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 46, activation = "softmax")
summary(network)
## Model: "sequential_1"
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## dense_3 (Dense) (None, 64) 640064
## ___________________________________________________________________________
## dense_4 (Dense) (None, 64) 4160
## ___________________________________________________________________________
## dense_5 (Dense) (None, 46) 2990
## ===========================================================================
## Total params: 647,214
## Trainable params: 647,214
## Non-trainable params: 0
## ___________________________________________________________________________
# Compile with the same settings as before
network %>% compile(
  optimizer = "rmsprop",
  loss = "categorical_crossentropy",
  metrics = c("accuracy")
)
history <- network %>% fit(
  train_data_vec,
  train_labels_vec,
  epochs = 9,
  batch_size = 512,
  validation_data = list(val_data_vec, val_labels_vec)
)
plot(history)
# Evaluate the final model on the test set
metric <- network %>% evaluate(test_data_vec, test_labels_vec)
metric$accuracy
## [1] 0.7862867
metric$loss
## [1] 0.9619764
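For context, ~79% accuracy sits far above a chance-level baseline. One quick reference point is the accuracy of randomly permuted labels (a sketch; the exact value varies run to run):

# Baseline: accuracy achieved by a random permutation of the true labels
mean(unlist(test_labels) == sample(unlist(test_labels)))  # roughly 0.18-0.19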
# Predicted classes for the first ten test newswires
network %>% predict_classes(test_data_vec[1:10, ])
## [1] 3 10 1 4 13 3 3 3 3 3
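`predict_classes()` works with the keras version used here but was removed in later releases; the same result can be obtained from `predict()` alone (a sketch):

# Equivalent without predict_classes(): argmax over the 46 softmax outputs
probs <- network %>% predict(test_data_vec[1:10, ])
apply(probs, 1, which.max) - 1  # minus 1 because columns are 1-indexed, labels start at 0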
# Predictions for the whole test set, and the total number of misses
predictions <- network %>% predict_classes(test_data_vec)
actual <- unlist(test_labels)
total_misses <- sum(predictions != actual)
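As a cross-check, the mismatch count implies the same accuracy that evaluate() reported above:

# Accuracy implied by the mismatches; should agree with metric$accuracy
1 - total_misses / length(actual)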
# Plot the mismatched predictions as a confusion-style dot plot
# (tidyverse, including dplyr and ggplot2, is already attached above)
data.frame(target = actual,
           prediction = predictions) %>%
  filter(target != prediction) %>%
  group_by(target, prediction) %>%
  count() %>%
  ungroup() %>%
  mutate(perc = n / nrow(.) * 100) %>%
  filter(n > 1) %>%
  ggplot(aes(target, prediction, size = n)) +
  geom_point(shape = 15, col = "#9F92C6") +
  scale_x_continuous("actual target", breaks = 0:45) +
  scale_y_continuous("prediction", breaks = 0:45) +
  scale_size_area(breaks = c(2, 5, 10, 15), max_size = 5) +
  coord_fixed() +
  ggtitle(paste(total_misses, "mismatches")) +
  theme_classic() +
  theme(rect = element_blank(),
        axis.line = element_blank(),
        axis.text = element_text(colour = "black"))
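A plain contingency table carries the same information in numeric form, if a non-graphical view is preferred (a base-R sketch):

# Base-R confusion matrix: rows = actual class, columns = predicted class
conf_mat <- table(actual = actual, predicted = predictions)
conf_mat[1:10, 1:10]  # top-left corner; the full 46 x 46 table is wide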