R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(keras)
library(tensorflow)
library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
c(c(train_data,train_labels),c(test_data,test_labels)) %<-% dataset_reuters(num_words = 10000)

length(train_data)
## [1] 8982
length(test_data)
## [1] 2246
train_data[[1]]
##  [1]    1    2    2    8   43   10  447    5   25  207  270    5 3095  111
## [15]   16  369  186   90   67    7   89    5   19  102    6   19  124   15
## [29]   90   67   84   22  482   26    7   48    4   49    8  864   39  209
## [43]  154    6  151    6   83   11   15   22  155   11   15    7   48    9
## [57] 4579 1005  504    6  258    6  272   11   15   22  134   44   11   15
## [71]   16    8  197 1245   90   67   52   29  209   30   32  132    6  109
## [85]   15   17   12
dataset_reuters_word_index() %>%
  unlist()%>%
  sort%>%
  names()->word_index

library(purrr)

train_data[[1]]%>%
  map(~ ifelse(.x>=3,word_index[.x-3],"?"))%>%
  as_vector()%>%
  cat()
## ? ? ? said as a result of its december acquisition of space co it expects earnings per share in 1987 of 1 15 to 1 30 dlrs per share up from 70 cts in 1986 the company said pretax net should rise to nine to 10 mln dlrs from six mln dlrs in 1986 and rental operation revenues to 19 to 22 mln dlrs from 12 5 mln dlrs it said cash flow per share this year should be 2 50 to three dlrs reuter 3
#onehotencoding

Vectorize_sequences <- function(sequences,dimension=10000){
  
  results <- matrix(0,nrow = length(sequences),ncol = dimension)

  for (i in 1:length(sequences)) {
    results[i,sequences[[i]]]<- 1
  }
    results
  }



train_data_vec<-Vectorize_sequences(train_data)
test_data_vec<-Vectorize_sequences(test_data)

train_example<-sort(unique(train_data[[1]]))
train_example
##  [1]    1    2    4    5    6    7    8    9   10   11   12   15   16   17
## [15]   19   22   25   26   29   30   32   39   43   44   48   49   52   67
## [29]   83   84   89   90  102  109  111  124  132  134  151  154  155  186
## [43]  197  207  209  258  270  272  369  447  482  504  864 1005 1245 3095
## [57] 4579
train_data_vec[1,1:100]
##   [1] 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 1 0 0 0
##  [36] 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
##  [71] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
str(train_labels)
##  int [1:8982] 3 4 3 4 4 4 4 3 3 16 ...
sort(unique(train_labels))
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
## [24] 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
train_data_vec[1,1:100]
##   [1] 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 1 0 0 0
##  [36] 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
##  [71] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
library(ggplot2)
train_labels %>%
  plyr::count() %>%
  ggplot(aes(x,freq))+
  geom_col()

#the real one hot encoding 


train_labels_vec <- to_categorical(train_labels)


test_labels_vec <- to_categorical(test_labels)

str(train_labels_vec)
##  num [1:8982, 1:46] 0 0 0 0 0 0 0 0 0 0 ...
str(test_labels_vec)
##  num [1:2246, 1:46] 0 0 0 0 0 0 0 0 0 0 ...
#creation of the deep learning architecture 

network <- keras_model_sequential()%>%
  layer_dense(units = 64,activation = 'relu',input_shape = c(10000))%>%
  layer_dense(units = 64,activation = 'relu')%>%
  layer_dense(units = 46,activation = 'softmax')
  
summary(network)
## Model: "sequential"
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## dense (Dense)                    (None, 64)                    640064      
## ___________________________________________________________________________
## dense_1 (Dense)                  (None, 64)                    4160        
## ___________________________________________________________________________
## dense_2 (Dense)                  (None, 46)                    2990        
## ===========================================================================
## Total params: 647,214
## Trainable params: 647,214
## Non-trainable params: 0
## ___________________________________________________________________________
#complie
network%>%compile(
  optimizer='rmsprop',
  loss='categorical_crossentropy',
  metric=c('accuracy')
)

#validate

index<- 1:1000
val_data_vec<-train_data_vec[index,]
train_data_vec<- train_data_vec[-index,]

val_labels_vec<-train_labels_vec[index,]
train_labels_vec<- train_labels_vec[-index,]


#training our model for 20 epochs

history <- network %>% fit(
  train_data_vec,
  train_labels_vec,
  epochs=20,
  batch_size=512,
  validation_data= list(val_data_vec,val_labels_vec)
)

plot(history)

#recreating our model for 9 epochs since val_acc is going into overfitting 
network <- keras_model_sequential()%>%
  layer_dense(units = 64,activation = 'relu',input_shape = c(10000))%>%
  layer_dense(units = 64,activation = 'relu')%>%
  layer_dense(units = 46,activation = 'softmax')

summary(network)
## Model: "sequential_1"
## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## dense_3 (Dense)                  (None, 64)                    640064      
## ___________________________________________________________________________
## dense_4 (Dense)                  (None, 64)                    4160        
## ___________________________________________________________________________
## dense_5 (Dense)                  (None, 46)                    2990        
## ===========================================================================
## Total params: 647,214
## Trainable params: 647,214
## Non-trainable params: 0
## ___________________________________________________________________________
#complie
network%>%compile(
  optimizer='rmsprop',
  loss='categorical_crossentropy',
  metric=c('accuracy')
)


history <- network %>% fit(
  train_data_vec,
  train_labels_vec,
  epochs=9,
  batch_size=512,
  validation_data= list(val_data_vec,val_labels_vec)
)

plot(history)

#evaluat it on test data_set

metric<- network %>% evaluate(test_data_vec,test_labels_vec)


metric$accuracy
## [1] 0.7862867
metric$loss
## [1] 0.9619764
#predict 

network %>% predict_classes(test_data_vec[1:10,])
##  [1]  3 10  1  4 13  3  3  3  3  3
#prediction for all 
predictions <- network %>% predict_classes(test_data_vec)
actual<- unlist(test_labels)
total_misses<-sum(predictions!=actual)

#confusion matrix

suppressPackageStartupMessages(library(tidyverse))

library(dplyr)

data.frame(target=actual,
           prediction=predictions)%>%
  filter(target !=prediction) %>%
  group_by(target,prediction) %>%
  count() %>%
  ungroup() %>%
  mutate(perc=n/nrow(.)*100) %>%
  filter(n > 1)%>%
  ggplot(aes(target,prediction,size=n))+
  geom_point(shape= 15,col= "#9F92C6")+
  scale_x_continuous("acutal target", breaks = 0:45)+
  scale_y_continuous("prediction", breaks = 0:45)+
  scale_size_area(breaks=c(2,5,10,15),max_size = 5)+
  coord_fixed()+
  ggtitle(paste(total_misses,"mismatches"))+
  theme_classic()+
  theme(rect=element_blank(),
           axis.line = element_blank(),
           axis.text = element_text(colour = "black"))