setwd("D:/OneDrive - FUNIDES/Data Science/Capstone")
library(NLP)
library(tm)
library(SnowballC)
library(stringr)
library(quanteda)
nlp_cap <- VCorpus(DirSource("final/en_US/"), readerControl = list(reader = readPlain,
language = "en_US", load = TRUE))
summary(nlp_cap)
Length Class Mode
en_US.blogs.txt 2 PlainTextDocument list
en_US.news.txt 2 PlainTextDocument list
en_US.twitter.txt 2 PlainTextDocument list
bases <- c("blogs", "news", "twitter")
set.seed(123)
for (i in 1:3) {
    assign(paste("twothirds", i, sep = ""), floor(length(nlp_cap[[i]]$content) * 2/3))
    assign(paste("train", i, sep = ""),
           sample(1:length(nlp_cap[[i]]$content), get(paste("twothirds", i, sep = ""))))
}
h <- 1
for (j in bases) {
A <- paste("train", h, sep = "")
assign(paste("train", j, sep = "_"), nlp_cap[[h]]$content[get(A)])
assign(paste("test", j, sep = "_"), nlp_cap[[h]]$content[-get(A)])
h <- h + 1
}
dir.create("train")
file.create("train/train_blogs.txt", "train/train_news.txt", "train/train_twitter.txt",
overwrite = TRUE)
con1 <- file("train/train_blogs.txt", open = "wt")
write(train_blogs, con1)
close(con1)
con1 <- file("train/train_news.txt", open = "wt")
write(train_news, con1)
close(con1)
con1 <- file("train/train_twitter.txt", open = "wt")
write(train_twitter, con1)
close(con1)
dir.create("test")
file.create("test/test_blogs.txt", "test/test_news.txt", "test/test_twitter.txt",
overwrite = TRUE)
con1 <- file("test/test_blogs.txt", open = "wt")
write(test_blogs, con1)
close(con1)
con1 <- file("test/test_news.txt", open = "wt")
write(test_news, con1)
close(con1)
con1 <- file("test/test_twitter.txt", open = "wt")
write(test_twitter, con1)
close(con1)
rm(nlp_cap, con1, test_blogs, test_news, test_twitter, train1, train2, train3,
train_blogs, train_news, train_twitter, twothirds1, twothirds2, twothirds3,
A, i, j, h)
nlp_cap_train <- VCorpus(DirSource("train/"), readerControl = list(reader = readPlain,
language = "en_US", load = TRUE))
for (i in 1:3) {
nlp_cap_train[[i]]$content <- iconv(nlp_cap_train[[i]]$content, "latin1",
"ASCII", sub = "")
}
nlp_cap_train <- tm_map(nlp_cap_train, removeNumbers)
nlp_cap_train <- tm_map(nlp_cap_train, PlainTextDocument)
nlp_cap_train <- tm_map(nlp_cap_train, stripWhitespace)
rm(bases, i)
nlp_cap_train <- VCorpus(VectorSource(nlp_cap_train))
nlp_cap_train_quant <- corpus(nlp_cap_train)
summary(nlp_cap_train_quant)
Corpus consisting of 3 documents:
Text Types Tokens Sentences datetimestamp id language
text1 362952 28058189 1375616 2018-05-15 23:34:12 1 en
text2 85294 2017259 95764 2018-05-15 23:34:12 2 en
text3 416317 24167691 1724830 2018-05-15 23:34:12 3 en
Source: Converted from tm Corpus 'nlp_cap_train'
Created: Tue May 15 17:34:22 2018
Notes:
nlp_cap_train_quant_toks <- tokens(nlp_cap_train_quant)
nlp_cap_train_quant_nopunct_toks <- tokens(nlp_cap_train_quant, remove_punct = TRUE)
head(nlp_cap_train_quant_nopunct_toks[[1]], 50)
[1] "To" "sum" "this" "up" "when"
[6] "we" "are" "unjustly" "wounded" "by"
[11] "men" "let" "us" "overlook" "their"
[16] "wickedness" "which" "would" "but" "worsen"
[21] "our" "pain" "and" "sharpen" "our"
[26] "minds" "to" "revenge" "remember" "to"
[31] "mount" "up" "to" "God" "and"
[36] "learn" "to" "believe" "for" "certain"
[41] "that" "whatever" "our" "enemy" "has"
[46] "wickedly" "committed" "against" "us" "was"
nlp_cap_train_quant_nopunct_nostop_toks <- tokens_remove(nlp_cap_train_quant_nopunct_toks,
stopwords("en"))
head(nlp_cap_train_quant_nopunct_nostop_toks[[1]], 50)
[1] "sum" "unjustly" "wounded" "men"
[5] "let" "us" "overlook" "wickedness"
[9] "worsen" "pain" "sharpen" "minds"
[13] "revenge" "remember" "mount" "God"
[17] "learn" "believe" "certain" "whatever"
[21] "enemy" "wickedly" "committed" "us"
[25] "permitted" "sent" "Gods" "just"
[29] "dispensation" "Calvin" "Institutes" "Im"
[33] "sure" "Ill" "go" "pretty"
[37] "much" "crazy" "working" "achievements"
[41] "Ive" "gotten" "quite" "belt"
[45] "far" "including" "fishing" "turtle"
[49] "sewer" "rat"
nlp_cap_train_quant_nopunct_dfm <- dfm(nlp_cap_train_quant_nopunct_toks)
nlp_cap_train_quant_nopunct_nostop_dfm <- dfm_select(nlp_cap_train_quant_nopunct_dfm,
stopwords("en"), selection = "remove")
Below is a table with the 10 most frequent words after removing punctuation and stop words.
topfeatures(nlp_cap_train_quant_nopunct_nostop_dfm, 10)
just like one can get time love good now day
170688 149572 141956 127939 124537 112066 101607 101154 98103 96225
prop_nlp_cap_train_quant_nopunct_nostop_dfm <- dfm_weight(nlp_cap_train_quant_nopunct_nostop_dfm,
scheme = "prop")
# Smallest number of top features whose summed proportional frequency reaches 0.5
i <- 1
sum1 <- 0
while (sum1 < 0.501) {
    sum1 <- sum(topfeatures(prop_nlp_cap_train_quant_nopunct_nostop_dfm, i))
    i <- i + 1
}
# Same search for 0.9 coverage
j <- 1
sum2 <- 0
while (sum2 < 0.901) {
    sum2 <- sum(topfeatures(prop_nlp_cap_train_quant_nopunct_nostop_dfm, j))
    j <- j + 1
}
In the training sample, after removing punctuation and English stop words, 82 words cover 50 percent of the tokens and 282 words cover 90 percent.
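An equivalent way to obtain these coverage counts, without calling topfeatures() inside a loop, is to sort the total feature counts and take a cumulative sum. The sketch below is a possible shortcut, assuming the same nlp_cap_train_quant_nopunct_nostop_dfm object; note that it measures coverage over the pooled corpus rather than summing per-document proportions, so the counts may differ slightly from the loop above.
# Total frequency of each word across the three documents, sorted in decreasing order
freqs <- sort(colSums(nlp_cap_train_quant_nopunct_nostop_dfm), decreasing = TRUE)
# Cumulative share of all remaining tokens covered by the top-n words
coverage <- cumsum(freqs) / sum(freqs)
# Number of words needed to reach 50% and 90% coverage
words_50 <- which(coverage >= 0.5)[1]
words_90 <- which(coverage >= 0.9)[1]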
nlp_cap_train_quant_nopunct_nostop_dfm_trim <- dfm_trim(nlp_cap_train_quant_nopunct_nostop_dfm,
min_termfreq = 300)
nlp_cap_train_quant_nopunct_nostop_fcm_trim <- fcm(nlp_cap_train_quant_nopunct_nostop_dfm_trim)
feat <- names(topfeatures(nlp_cap_train_quant_nopunct_nostop_fcm_trim, 82))
nlp_cap_train_quant_nopunct_nostop_fcm_trim <- fcm_select(nlp_cap_train_quant_nopunct_nostop_fcm_trim,
feat)
size <- log(colSums(dfm_select(nlp_cap_train_quant_nopunct_nostop_dfm_trim,
feat)))
textplot_network(nlp_cap_train_quant_nopunct_nostop_fcm_trim, min_freq = 0.5,
vertex_size = size/max(size) * 3)
gram_2 <- tokens_ngrams(nlp_cap_train_quant_nopunct_nostop_toks, n = 2)
head(gram_2[[1]], 50)
[1] "sum_unjustly" "unjustly_wounded" "wounded_men"
[4] "men_let" "let_us" "us_overlook"
[7] "overlook_wickedness" "wickedness_worsen" "worsen_pain"
[10] "pain_sharpen" "sharpen_minds" "minds_revenge"
[13] "revenge_remember" "remember_mount" "mount_God"
[16] "God_learn" "learn_believe" "believe_certain"
[19] "certain_whatever" "whatever_enemy" "enemy_wickedly"
[22] "wickedly_committed" "committed_us" "us_permitted"
[25] "permitted_sent" "sent_Gods" "Gods_just"
[28] "just_dispensation" "dispensation_Calvin" "Calvin_Institutes"
[31] "Institutes_Im" "Im_sure" "sure_Ill"
[34] "Ill_go" "go_pretty" "pretty_much"
[37] "much_crazy" "crazy_working" "working_achievements"
[40] "achievements_Ive" "Ive_gotten" "gotten_quite"
[43] "quite_belt" "belt_far" "far_including"
[46] "including_fishing" "fishing_turtle" "turtle_sewer"
[49] "sewer_rat" "rat_stood"
gram_3 <- tokens_ngrams(nlp_cap_train_quant_nopunct_nostop_toks, n = 3)
head(gram_3[[1]], 50)
[1] "sum_unjustly_wounded" "unjustly_wounded_men"
[3] "wounded_men_let" "men_let_us"
[5] "let_us_overlook" "us_overlook_wickedness"
[7] "overlook_wickedness_worsen" "wickedness_worsen_pain"
[9] "worsen_pain_sharpen" "pain_sharpen_minds"
[11] "sharpen_minds_revenge" "minds_revenge_remember"
[13] "revenge_remember_mount" "remember_mount_God"
[15] "mount_God_learn" "God_learn_believe"
[17] "learn_believe_certain" "believe_certain_whatever"
[19] "certain_whatever_enemy" "whatever_enemy_wickedly"
[21] "enemy_wickedly_committed" "wickedly_committed_us"
[23] "committed_us_permitted" "us_permitted_sent"
[25] "permitted_sent_Gods" "sent_Gods_just"
[27] "Gods_just_dispensation" "just_dispensation_Calvin"
[29] "dispensation_Calvin_Institutes" "Calvin_Institutes_Im"
[31] "Institutes_Im_sure" "Im_sure_Ill"
[33] "sure_Ill_go" "Ill_go_pretty"
[35] "go_pretty_much" "pretty_much_crazy"
[37] "much_crazy_working" "crazy_working_achievements"
[39] "working_achievements_Ive" "achievements_Ive_gotten"
[41] "Ive_gotten_quite" "gotten_quite_belt"
[43] "quite_belt_far" "belt_far_including"
[45] "far_including_fishing" "including_fishing_turtle"
[47] "fishing_turtle_sewer" "turtle_sewer_rat"
[49] "sewer_rat_stood" "rat_stood_feet"
The next step is to build a prediction model using Markov chains with a backoff strategy, and possibly improve the predictions by experimenting with a recurrent neural network and the word2vec package.
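As a rough illustration of the planned backoff approach (not the final implementation), the sketch below looks up the last two observed words in a trigram frequency table and falls back to bigrams and then unigrams when no match is found. It assumes frequency tables built with textstat_frequency() (part of quanteda at the time, quanteda.textstats in newer releases) from the n-gram objects above; predict_next() is a hypothetical helper, and the inputs are assumed to be plain lowercase words with no regex metacharacters.
# Hypothetical frequency tables built from the tokens and n-grams created above
uni_freq <- textstat_frequency(dfm(nlp_cap_train_quant_nopunct_nostop_toks))
bi_freq <- textstat_frequency(dfm(gram_2))
tri_freq <- textstat_frequency(dfm(gram_3))
predict_next <- function(w1, w2, n = 3) {
    # Trigrams whose first two words match the context
    hits <- tri_freq[grepl(paste0("^", w1, "_", w2, "_"), tri_freq$feature), ]
    if (nrow(hits) == 0) {
        # Back off to bigrams starting with the last word
        hits <- bi_freq[grepl(paste0("^", w2, "_"), bi_freq$feature), ]
    }
    if (nrow(hits) == 0) {
        # Back off to the most frequent unigrams
        return(head(uni_freq$feature, n))
    }
    # Keep only the final word of each matching n-gram (tables are already sorted by frequency)
    head(unique(sub(".*_", "", hits$feature)), n)
}
predict_next("happy", "new")  # hypothetical call; might return "year" among others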
The Shiny app would have a simple user interface that lets the user enter a phrase, which the server then processes with the prediction algorithm. The algorithm would be chosen based on performance and accuracy, or the user could perhaps select the algorithm they prefer. The app would return the three words the algorithm considers most likely to be the next word.
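A minimal sketch of that Shiny interface, assuming the hypothetical predict_next() helper from the previous sketch is available on the server:
library(shiny)
ui <- fluidPage(
    titlePanel("Next-word prediction"),
    textInput("phrase", "Type a phrase:"),
    tableOutput("predictions")
)
server <- function(input, output) {
    output$predictions <- renderTable({
        words <- tolower(strsplit(trimws(input$phrase), "\\s+")[[1]])
        req(length(words) >= 2)
        # Use the last two words as context; show the three most likely next words
        data.frame(next_word = predict_next(words[length(words) - 1], words[length(words)], n = 3))
    })
}
shinyApp(ui, server)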