setwd("D:/OneDrive - FUNIDES/Data Science/Capstone")
library(NLP)
library(tm)
library(SnowballC)
library(stringr)
library(quanteda)
nlp_cap <- VCorpus(DirSource("final/en_US/"), readerControl = list(reader = readPlain,
language = "en_US", load = TRUE))
summary(nlp_cap)
Length Class Mode
en_US.blogs.txt 2 PlainTextDocument list
en_US.news.txt 2 PlainTextDocument list
en_US.twitter.txt 2 PlainTextDocument list
bases <- c("blogs", "news", "twitter")
set.seed(123)
for (i in 1:3) {
    assign(paste("twothirds", i, sep = ""), floor(length(nlp_cap[[i]]$content) * 2/3))
    assign(paste("train", i, sep = ""),
           sample(1:length(nlp_cap[[i]]$content), get(paste("twothirds", i, sep = ""))))
}
h <- 1
for (j in bases) {
A <- paste("train", h, sep = "")
assign(paste("train", j, sep = "_"), nlp_cap[[h]]$content[get(A)])
assign(paste("test", j, sep = "_"), nlp_cap[[h]]$content[-get(A)])
h <- h + 1
}
dir.create("train")
file.create("train/train_blogs.txt", "train/train_news.txt", "train/train_twitter.txt",
overwrite = TRUE)
con1 <- file("train/train_blogs.txt", open = "wt")
write(train_blogs, con1)
close(con1)
con1 <- file("train/train_news.txt", open = "wt")
write(train_news, con1)
close(con1)
con1 <- file("train/train_twitter.txt", open = "wt")
write(train_twitter, con1)
close(con1)
dir.create("test")
file.create("test/test_blogs.txt", "test/test_news.txt", "test/test_twitter.txt",
overwrite = TRUE)
con1 <- file("test/test_blogs.txt", open = "wt")
write(test_blogs, con1)
close(con1)
con1 <- file("test/test_news.txt", open = "wt")
write(test_news, con1)
close(con1)
con1 <- file("test/test_twitter.txt", open = "wt")
write(test_twitter, con1)
close(con1)
rm(nlp_cap, con1, test_blogs, test_news, test_twitter, train1, train2, train3,
train_blogs, train_news, train_twitter, twothirds1, twothirds2, twothirds3,
A, i, j, h)
nlp_cap_train <- VCorpus(DirSource("train/"), readerControl = list(reader = readPlain,
language = "en_US", load = TRUE))
for (i in 1:3) {
nlp_cap_train[[i]]$content <- iconv(nlp_cap_train[[i]]$content, "latin1",
"ASCII", sub = "")
}
nlp_cap_train <- tm_map(nlp_cap_train, removeNumbers)
nlp_cap_train <- tm_map(nlp_cap_train, PlainTextDocument)
nlp_cap_train <- tm_map(nlp_cap_train, stripWhitespace)
rm(bases, i)
nlp_cap_train <- VCorpus(VectorSource(nlp_cap_train))
nlp_cap_train_quant <- corpus(nlp_cap_train)
summary(nlp_cap_train_quant)
Corpus consisting of 3 documents:
Text Types Tokens Sentences datetimestamp id language
text1 362952 28058189 1375616 2018-05-15 23:34:12 1 en
text2 85294 2017259 95764 2018-05-15 23:34:12 2 en
text3 416317 24167691 1724830 2018-05-15 23:34:12 3 en
Source: Converted from tm Corpus 'nlp_cap_train'
Created: Tue May 15 17:34:22 2018
Notes:
nlp_cap_train_quant_toks <- tokens(nlp_cap_train_quant)
nlp_cap_train_quant_nopunct_toks <- tokens(nlp_cap_train_quant, remove_punct = TRUE)
head(nlp_cap_train_quant_nopunct_toks[[1]], 50)
[1] "To" "sum" "this" "up" "when"
[6] "we" "are" "unjustly" "wounded" "by"
[11] "men" "let" "us" "overlook" "their"
[16] "wickedness" "which" "would" "but" "worsen"
[21] "our" "pain" "and" "sharpen" "our"
[26] "minds" "to" "revenge" "remember" "to"
[31] "mount" "up" "to" "God" "and"
[36] "learn" "to" "believe" "for" "certain"
[41] "that" "whatever" "our" "enemy" "has"
[46] "wickedly" "committed" "against" "us" "was"
nlp_cap_train_quant_nopunct_nostop_toks <- tokens_remove(nlp_cap_train_quant_nopunct_toks,
stopwords("en"))
head(nlp_cap_train_quant_nopunct_nostop_toks[[1]], 50)
[1] "sum" "unjustly" "wounded" "men"
[5] "let" "us" "overlook" "wickedness"
[9] "worsen" "pain" "sharpen" "minds"
[13] "revenge" "remember" "mount" "God"
[17] "learn" "believe" "certain" "whatever"
[21] "enemy" "wickedly" "committed" "us"
[25] "permitted" "sent" "Gods" "just"
[29] "dispensation" "Calvin" "Institutes" "Im"
[33] "sure" "Ill" "go" "pretty"
[37] "much" "crazy" "working" "achievements"
[41] "Ive" "gotten" "quite" "belt"
[45] "far" "including" "fishing" "turtle"
[49] "sewer" "rat"
nlp_cap_train_quant_nopunct_dfm <- dfm(nlp_cap_train_quant_nopunct_toks)
nlp_cap_train_quant_nopunct_nostop_dfm <- dfm_select(nlp_cap_train_quant_nopunct_dfm,
stopwords("en"), selection = "remove")
Below is a table with the 10 most frequent words after removing punctuation and stop words.
topfeatures(nlp_cap_train_quant_nopunct_nostop_dfm, 10)
just like one can get time love good now day
170688 149572 141956 127939 124537 112066 101607 101154 98103 96225
prop_nlp_cap_train_quant_nopunct_nostop_dfm <- dfm_weight(nlp_cap_train_quant_nopunct_nostop_dfm,
scheme = "prop")
# Smallest number of top features whose summed proportional frequency reaches 0.5
i <- 1
sum1 <- 0
while (sum1 < 0.501) {
    sum1 <- sum(topfeatures(prop_nlp_cap_train_quant_nopunct_nostop_dfm, i))
    i <- i + 1
}
# Same search for 0.9 coverage
j <- 1
sum2 <- 0
while (sum2 < 0.901) {
    sum2 <- sum(topfeatures(prop_nlp_cap_train_quant_nopunct_nostop_dfm, j))
    j <- j + 1
}
In the training sample, after removing punctuation and English stop words, 82 words cover 50 percent of the tokens and 282 words cover 90 percent.
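An equivalent way to obtain these coverage counts, without calling topfeatures() inside a loop, is to sort the total feature counts and take a cumulative sum. The sketch below is a possible shortcut, assuming the same nlp_cap_train_quant_nopunct_nostop_dfm object; note that it measures coverage over the pooled corpus rather than summing per-document proportions, so the counts may differ slightly from the loop above.
# Total frequency of each word across the three documents, sorted in decreasing order
freqs <- sort(colSums(nlp_cap_train_quant_nopunct_nostop_dfm), decreasing = TRUE)
# Cumulative share of all remaining tokens covered by the top-n words
coverage <- cumsum(freqs) / sum(freqs)
# Number of words needed to reach 50% and 90% coverage
words_50 <- which(coverage >= 0.5)[1]
words_90 <- which(coverage >= 0.9)[1]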
nlp_cap_train_quant_nopunct_nostop_dfm_trim <- dfm_trim(nlp_cap_train_quant_nopunct_nostop_dfm,
min_termfreq = 300)
nlp_cap_train_quant_nopunct_nostop_fcm_trim <- fcm(nlp_cap_train_quant_nopunct_nostop_dfm_trim)
feat <- names(topfeatures(nlp_cap_train_quant_nopunct_nostop_fcm_trim, 82))
nlp_cap_train_quant_nopunct_nostop_fcm_trim <- fcm_select(nlp_cap_train_quant_nopunct_nostop_fcm_trim,
feat)
size <- log(colSums(dfm_select(nlp_cap_train_quant_nopunct_nostop_dfm_trim,
feat)))
textplot_network(nlp_cap_train_quant_nopunct_nostop_fcm_trim, min_freq = 0.5,
vertex_size = size/max(size) * 3)
gram_2 <- tokens_ngrams(nlp_cap_train_quant_nopunct_nostop_toks, n = 2)
head(gram_2[[1]], 50)
[1] "sum_unjustly" "unjustly_wounded" "wounded_men"
[4] "men_let" "let_us" "us_overlook"
[7] "overlook_wickedness" "wickedness_worsen" "worsen_pain"
[10] "pain_sharpen" "sharpen_minds" "minds_revenge"
[13] "revenge_remember" "remember_mount" "mount_God"
[16] "God_learn" "learn_believe" "believe_certain"
[19] "certain_whatever" "whatever_enemy" "enemy_wickedly"
[22] "wickedly_committed" "committed_us" "us_permitted"
[25] "permitted_sent" "sent_Gods" "Gods_just"
[28] "just_dispensation" "dispensation_Calvin" "Calvin_Institutes"
[31] "Institutes_Im" "Im_sure" "sure_Ill"
[34] "Ill_go" "go_pretty" "pretty_much"
[37] "much_crazy" "crazy_working" "working_achievements"
[40] "achievements_Ive" "Ive_gotten" "gotten_quite"
[43] "quite_belt" "belt_far" "far_including"
[46] "including_fishing" "fishing_turtle" "turtle_sewer"
[49] "sewer_rat" "rat_stood"
gram_3 <- tokens_ngrams(nlp_cap_train_quant_nopunct_nostop_toks, n = 3)
head(gram_3[[1]], 50)
[1] "sum_unjustly_wounded" "unjustly_wounded_men"
[3] "wounded_men_let" "men_let_us"
[5] "let_us_overlook" "us_overlook_wickedness"
[7] "overlook_wickedness_worsen" "wickedness_worsen_pain"
[9] "worsen_pain_sharpen" "pain_sharpen_minds"
[11] "sharpen_minds_revenge" "minds_revenge_remember"
[13] "revenge_remember_mount" "remember_mount_God"
[15] "mount_God_learn" "God_learn_believe"
[17] "learn_believe_certain" "believe_certain_whatever"
[19] "certain_whatever_enemy" "whatever_enemy_wickedly"
[21] "enemy_wickedly_committed" "wickedly_committed_us"
[23] "committed_us_permitted" "us_permitted_sent"
[25] "permitted_sent_Gods" "sent_Gods_just"
[27] "Gods_just_dispensation" "just_dispensation_Calvin"
[29] "dispensation_Calvin_Institutes" "Calvin_Institutes_Im"
[31] "Institutes_Im_sure" "Im_sure_Ill"
[33] "sure_Ill_go" "Ill_go_pretty"
[35] "go_pretty_much" "pretty_much_crazy"
[37] "much_crazy_working" "crazy_working_achievements"
[39] "working_achievements_Ive" "achievements_Ive_gotten"
[41] "Ive_gotten_quite" "gotten_quite_belt"
[43] "quite_belt_far" "belt_far_including"
[45] "far_including_fishing" "including_fishing_turtle"
[47] "fishing_turtle_sewer" "turtle_sewer_rat"
[49] "sewer_rat_stood" "rat_stood_feet"
The next step is to build a prediction model using Markov chains with a backoff strategy, and possibly improve the predictions by experimenting with a recurrent neural network and the word2vec package.
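As a rough illustration of the planned backoff approach (not the final implementation), the sketch below looks up the last two observed words in a trigram frequency table and falls back to bigrams and then unigrams when no match is found. It assumes frequency tables built with textstat_frequency() (part of quanteda at the time, quanteda.textstats in newer releases) from the n-gram objects above; predict_next() is a hypothetical helper, and the inputs are assumed to be plain lowercase words with no regex metacharacters.
# Hypothetical frequency tables built from the tokens and n-grams created above
uni_freq <- textstat_frequency(dfm(nlp_cap_train_quant_nopunct_nostop_toks))
bi_freq <- textstat_frequency(dfm(gram_2))
tri_freq <- textstat_frequency(dfm(gram_3))
predict_next <- function(w1, w2, n = 3) {
    # Trigrams whose first two words match the context
    hits <- tri_freq[grepl(paste0("^", w1, "_", w2, "_"), tri_freq$feature), ]
    if (nrow(hits) == 0) {
        # Back off to bigrams starting with the last word
        hits <- bi_freq[grepl(paste0("^", w2, "_"), bi_freq$feature), ]
    }
    if (nrow(hits) == 0) {
        # Back off to the most frequent unigrams
        return(head(uni_freq$feature, n))
    }
    # Keep only the final word of each matching n-gram (tables are already sorted by frequency)
    head(unique(sub(".*_", "", hits$feature)), n)
}
predict_next("happy", "new")  # hypothetical call; might return "year" among others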
The Shiny app would have a simple user interface that lets the user enter a phrase, which the server then processes with the prediction algorithm. The algorithm would be chosen based on performance and accuracy, or the user could perhaps select the algorithm they prefer. The app would return the three words the algorithm considers most likely to be the next word.
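A minimal sketch of that Shiny interface, assuming the hypothetical predict_next() helper from the previous sketch is available on the server:
library(shiny)
ui <- fluidPage(
    titlePanel("Next-word prediction"),
    textInput("phrase", "Type a phrase:"),
    tableOutput("predictions")
)
server <- function(input, output) {
    output$predictions <- renderTable({
        words <- tolower(strsplit(trimws(input$phrase), "\\s+")[[1]])
        req(length(words) >= 2)
        # Use the last two words as context; show the three most likely next words
        data.frame(next_word = predict_next(words[length(words) - 1], words[length(words)], n = 3))
    })
}
shinyApp(ui, server)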