The data exploration was done in the Capstone week 2 file.
Here we are going to explain all the steps and assumptions made for the word prediction model.
For cleaning the data we used the quanteda package, because it is the most optimized package for the task.
library(tm);library(R.utils)
library(readr);library(data.table);library(readtext)
library(quanteda);library(ggplot2);library(tidyr)
library(dplyr);library(purrr);library(spacyr)
library(doParallel);library(hunspell)
library(stringr);library(stringi);library(wordnet)
library(pluralize);library(textcat);library(tidyverse)
library(arsenal)
It's important to use parallel processing, so I set it up to make everything quicker.
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)
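A small housekeeping note (this line is assumed, not part of the original run): once all the heavy processing is finished, the cluster should be released again.
# run once the parallel work is done, to free the worker processes
stopCluster(cl)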
First of all I created a folder to hold a sample of the 3 files. I preferred to do it outside RStudio because that way I save more RAM.
if (!dir.exists("samples")) {
dir.create("samples")}
createSample <- function(path, fileName, p) {
# Calculate number of lines efficiently
readFilePath <- paste(path, fileName, sep="/")
nLines <- countLines(readFilePath)
# Prepare write connection to file
sampleFilePath <- paste("samples", fileName, sep="/")
conWrite <- file(sampleFilePath, "w")
testfileName<- paste("test", fileName, sep="_")
sampleFilePath <- paste("samples", testfileName, sep="/")
testWrite <- file(sampleFilePath, "w")
# Prepare read connection
conRead <- file(readFilePath, "r")
# The expected number of lines with value = 1 is nLines*p, so the sample size is not fixed
samples <- rbinom(nLines, size=1, prob=p)
for (i in 1:nLines) {
t <- readLines(conRead, 1, skipNul = TRUE)
if (samples[i] == 1) {
writeLines(t, conWrite)
} else {
writeLines(t, testWrite)
}
}
close(conRead)
close(conWrite)
close(testWrite)
}
NOTE: I deleted some special characters by hand (a square character in the news file and double spacing in the Twitter file) that made the file read only partially.
Then two files are created for each input file: one with the lines kept at the stated probability (the train set) and the other with its complement (the test set).
pathToFiles <- "en_US"
sampleProbability <- .3
set.seed(123)
createSample(pathToFiles, "en_US.blogs.txt", sampleProbability)
createSample(pathToFiles, "en_US.news.txt", sampleProbability)
createSample(pathToFiles, "en_US.twitter.txt", sampleProbability)
Then we only need to read the sampled data.
readdata<-function(filename){
pathfile<- paste(".","samples",filename,sep = "/")
readtext(pathfile)
}
corpus<-map_dfr(list.files("en_US1"),readdata)%>%corpus()
Combine the documents of the corpus and reshape the whole text into sentences. Also keep only the sentences that consist entirely of ASCII characters; this reduces the probability of having other languages in the corpus.
corpus<-corpus(texts(corpus, groups = rep(1, ndoc(corpus))))%>%
corpus_reshape(to="sentences")
corpus<-corpus[stri_enc_isascii(corpus)]
Here we remove all symbols, punctuation, numbers and URLs, and tokenize the text with the quanteda package.
tkn <- tokens(corpus,
remove_punct = TRUE,
remove_numbers = TRUE,
remove_symbols = TRUE,
split_hyphens = TRUE,
remove_url = TRUE,
remove_twitter = TRUE,
what = "word1")%>%
tokens_tolower()%>% tokens_remove("^[0-9]",valuetype = "regex")
rm(corpus)
For later use I took the first word of each sentence (these will be the first words suggested in the app).
first_words<-character()
for(i in 1:10000) {
first_words<-rbind(first_words,tkn[[i]][1])
}
first_words<-table(first_words)
first_words<-tibble(name=names(first_words),freq=first_words)%>%arrange(-freq)
print(first_words[1:5,])
## # A tibble: 5 x 2
## name freq
## <chr> <table>
## 1 i 1229
## 2 the 691
## 3 it 316
## 4 and 251
## 5 we 240
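These top five entries are reused later in the prediction function as the default suggestions when no input is available; instead of hard-coding them there, they could be taken straight from this table (a small optional sketch, not part of the original code):
# derive the default first-word suggestions from the table
# instead of hard-coding c("i", "the", "it", "and", "we") later on
most_first_word <- first_words$name[1:5]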
Here we select the words that are not spelled correctly and, because correcting them would require a lot of RAM, I change them to a placeholder token ("STOP") and subsequently cut the sentence where that token appears.
spell_tokens <- function(tkn, language = "en_US") {
# extract types to only work on them
types <- types1 <- types(tkn)
# spelling
correct <- hunspell_check(
words = as.character(types),
dict = hunspell::dictionary(language)
)
types[!correct]<- "STOP"
# replace original tokens
token_new <- tokens_replace(tkn, types1, types,valuetype = "fixed",case_insensitive= FALSE)
token_new <- quanteda::tokens_segment(token_new, pattern = "STOP", valuetype = "fixed",extract_pattern=TRUE)
token_new
}
tkn<-spell_tokens(tkn)
dfm<- dfm(tkn)
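Before looking at coverage, a quick glance at the most frequent features serves as a sanity check of the document-feature matrix (output omitted here):
# ten most frequent tokens in the dfm
topfeatures(dfm, 10)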
Now we want to see the cumulative coverage curve of the words.
freq<- textstat_frequency(dfm)[,1:2]
table_freq<- data.table(table(freq$frequency))%>%print()
## V1 N
## 1: 1 9143
## 2: 2 5177
## 3: 3 3583
## 4: 4 2715
## 5: 5 2077
## ---
## 2775: 484842 1
## 2776: 591841 1
## 2777: 591871 1
## 2778: 682812 1
## 2779: 1170762 1
words_coverage <- data.frame(
coverage = round(cumsum(freq$frequency) / sum(freq$frequency) * 100, 2),
words = 1:nrow(freq)
)
rm(freq,table_freq)
qplot(data=words_coverage[1:3000,],x=words,y=coverage)
I only plotted up to 3,000 words because that is the interesting part of the curve, where Zipf's law shows. "Zipf's law was originally formulated in terms of quantitative linguistics, stating that given some corpus of natural language utterances, the frequency of any word is inversely proportional to its rank in the frequency table. Thus the most frequent word will occur approximately twice as often as the second most frequent word, three times as often as the third most frequent word, etc."
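As a quick, informal check of Zipf's law on this corpus (a rough sketch that reuses textstat_frequency and assumes the dfm object is still in memory at this point), the observed frequency at rank r can be compared with the rank-1 frequency divided by r:
# informal Zipf check: frequency(rank r) should be roughly frequency(rank 1) / r
zipf_check <- textstat_frequency(dfm)[1:5, c("feature", "frequency")]
zipf_check$zipf_expected <- round(zipf_check$frequency[1] / seq_len(5))
zipf_check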
The minimum numbers of top words needed to achieve 50% and 90% coverage are:
#50% coverage
min(words_coverage[words_coverage$coverage > 50, ]$words)
## [1] 109
#90% coverage
min(words_coverage[words_coverage$coverage > 90, ]$words)%>%print()
## [1] 4256
#Total number of unique word types in the sample
tail(words_coverage, n = 1)[, 2]
## [1] 60882
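The same lookup can be wrapped in a small helper (hypothetical; the name coverage_at is not part of the original code) if other thresholds are of interest:
# minimum number of top words needed to reach p% coverage
coverage_at <- function(p) min(words_coverage[words_coverage$coverage > p, ]$words)
coverage_at(75)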
Here we transform the data into n-grams and convert them into a more usable data structure.
unigs <- dfm
bigrs<- tokens_ngrams(tkn, 2,concatenator = " ")%>%dfm()
trigs<- tokens_ngrams(tkn, 3,concatenator = " ")%>%dfm()
rm(dfm)
sums_U <- colSums(unigs)
sums_B <- colSums(bigrs)
sums_T <- colSums(trigs)
unigs <- tibble(word_1 = names(sums_U), freq_u = sums_U)
bigrs <- tibble(
word_1 = sapply(strsplit(names(sums_B), " ", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(names(sums_B), " ", fixed = TRUE), '[[', 2),
freq_b = sums_B)
trigs <- tibble(
word_1 = sapply(strsplit(names(sums_T), " ", fixed = TRUE), '[[', 1),
word_2 = sapply(strsplit(names(sums_T), " ", fixed = TRUE), '[[', 2),
word_3 = sapply(strsplit(names(sums_T), " ", fixed = TRUE), '[[', 3),
freq_t = sums_T)
rm(sums_B,sums_T,sums_U)
Obtain the most frequent words in the whole dataset. These will be useful if the word used to predict is not in the dataset. This could be changed if we collected user data, so that we could predict each user's own most frequent words.
n_pred<-5
most_freq_word <-arrange(unigs,-freq_u)$word_1[1:n_pred]
I needed a function to detect misspelled words and suggest replacements.
wordspell<-function(words_pred){
correct <- hunspell_check(
words = words_pred,
dict = hunspell::dictionary("en_US")
)
pattern <- words_pred[!correct]
replacement <- sapply(hunspell_suggest(pattern, dict = "en_US"), FUN = "[",1:n_pred)
return(replacement)
}
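A quick illustration of its behaviour (hypothetical inputs; the exact suggestions depend on the installed en_US dictionary):
# a misspelled word returns the top n_pred suggestions,
# a correctly spelled word returns an empty (length-zero) result
wordspell("helo")    # e.g. "hello", "halo", "help", ... (dictionary dependent)
wordspell("hello")   # nothing to correct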
Then comes the function that will be inserted in the app.
predict_this<- function(pred,n_pred =5,gamma2 =0.5,gamma3 = 0.5){
most_first_word <- c("i", "the", "it", "and", "we")
if(is.na(pred)| pred== "") {
return(most_first_word)}
pred<- tokens(pred,
remove_punct = TRUE,
remove_numbers = TRUE,
remove_symbols = TRUE,
split_hyphens = TRUE)
pred<-rev(pred[[1]])
correct<-wordspell(pred[1])
if(length(correct)>1) {
return(as.character(correct))
}
correct<-wordspell(pred[2])
if(length(correct)>1) {
pred<- pred[1]
}
pred<- tolower(pred)
obs_unigr<- unigs[unigs$word_1 == pred[1],]
obs_bigrams<- filter(bigrs, word_1 == pred[1])
obs_unigr2<-nrow(unigs[unigs$word_1 == pred[2],])
obsCount <- filter(bigrs, word_1 == pred[2],word_2 ==pred[1])
obs_trig <- filter(trigs, word_1 == pred[2],word_2 ==pred[1])
#function to predict
trigram_predict<- function(pred){
# discounted probabilities of the observed trigrams
qbo_obs_trigrams <- mutate(obs_trig, prob=((freq_t - gamma3) / sum(obsCount$freq_b)))%>%
select(-c(word_1,word_2,freq_t))
unobs_trig_tails <- unigs[!(unigs$word_1 %in% qbo_obs_trigrams$word_3), ]$word_1
# probability mass left over for bigrams that were not observed
alphaBi <- 1 - (sum(obs_bigrams$freq_b - gamma2) / obs_unigr$freq_u)
obs_boBigrams<-filter(bigrs, is.element(word_2,unobs_trig_tails) & word_1== pred[1])
unobs_boBigrams<-unobs_trig_tails[!is.element(unobs_trig_tails,obs_boBigrams$word_2)]
qboObsBigs <- mutate(obs_boBigrams,prob=(freq_b - gamma2) / obs_unigr$freq_u)%>%
select(-c(freq_b,word_1))
qboUnobsBigs <-filter(unigs,is.element(word_1,unobs_boBigrams)) %>%
rename(word_2= word_1) %>%
mutate(prob=(alphaBi * freq_u) / sum(freq_u))%>%
select(-freq_u)
qbo_bigrams <- bind_rows(qboObsBigs, qboUnobsBigs)%>%arrange(desc(prob))
#Checkpoint
#checking that both numbers are equal shows that everything was computed correctly
#sum(qbo_bigrams[!is.element(qbo_bigrams$word_2,filter(bigrs,word_1==pred[1])$word_2),]$prob)
#alphaBi
# probability mass left over for trigrams that were not observed
alphaTrig <- 1 - sum((obs_trig$freq_t - gamma3) / obsCount$freq_b)
qbo_unobs_trigrams <- mutate(qbo_bigrams,
prob=alphaTrig * qbo_bigrams$prob / sum(qbo_bigrams$prob))%>%
rename(word_3=word_2)
qbo_trigrams <- rbind(qbo_obs_trigrams, qbo_unobs_trigrams)%>%
arrange(desc(prob))
return(qbo_trigrams[1:n_pred,]$word_3)
}
#Checkpoint
#checking that the sum equals 1 shows that everything was computed correctly
#sum(qbo_trigrams$prob)
# What the code has to check and return in each case
if(nrow(obs_unigr) == 0) {
return(most_first_word)
} else if( nrow(obs_bigrams) == 0) {
return( arrange(unigs,-freq_u)$word_1[1:n_pred] )
} else if( obs_unigr2 == 0) {
return(arrange(obs_bigrams,-freq_b)$word_2 [1:n_pred])
} else if (nrow(obsCount)==0){
return(trigram_predict(pred))
} else if(nrow(obs_trig)==0) {
return(arrange(obs_bigrams,-freq_b)$word_2 [1:n_pred])
} else {
return(trigram_predict(pred))
}
}
Some examples of the model in use:
predict_this("Shall we go to the")
## [1] "public" "next" "point" "new" "top"
predict_this("about his")
## [1] "life" "work" "new" "own" "past"
predict_this("monkeys this")
## [1] "is" "year" "week" "morning" "weekend"
Use the part of the data saved for the test set. As this is too much data, we are only going to use 500 sentences, and we have to correct the spelling of the preceding words and of the word to predict, because we know that the data set contains many errors.
test_data<-readtext("./samples/test/test_en_US.twitter.txt")%>%corpus()
n_sample<-500
set.seed(2021)
test_data<-corpus(texts(test_data, groups = rep(1, ndoc(test_data))))%>%
corpus_reshape(to="sentences")%>%corpus_sample(n_sample)
#tokenize it as if it were the input text
test_data2<-test_data[stri_enc_isascii(test_data)]%>%
tokens( remove_punct = TRUE,
remove_numbers = TRUE,
remove_symbols = TRUE,
split_hyphens = TRUE,
remove_url = TRUE,
remove_twitter = TRUE,
what = "word1")%>%
tokens_tolower()%>% tokens_remove("^[0-9]",valuetype = "regex")
## Warning: 'remove_twitter' is defunct; see 'quanteda Tokenizers' in ?tokens
test_data3<-tibble(w_1=as.character(rep(NA,times=ndoc(test_data2))),
pred=as.character(rep(NA,times=ndoc(test_data2))))
n_pred<-1
for(i in 1:ndoc(test_data2)) {
doc<-rev(test_data2[[i]])
n_pred<-1
word<- NA
if(length(doc)>1){
# spell-check the two words that precede the word to predict
word1<-wordspell(doc[3])
word11<-if(length(word1)==0) doc[3] else word1
word2<-wordspell(doc[2])
word21<-if(length(word2)==0) doc[2] else word2
word<-paste(word11,word21,sep = " ")
}
# spell-check the word that has to be predicted
word3<-wordspell(doc[1])
word31<-if(length(word3)==0) doc[1] else word3
test_data3[i,"w_1"]<-word
test_data3[i,"pred"]<-word31
}
result_test<-map(test_data3$w_1,predict_this)
result_test<-tibble(word=result_test)%>%
unnest_wider(word,names_sep = "_")
result_test2<-apply(result_test,2,map2,test_data3$pred,identical)
result_test2<-tibble(word=result_test2)%>%
unnest_wider(word,names_sep = "_")%>%
t()
rownames(result_test2)<-NULL
sum(result_test2)
## [1] 113
So, out of the 500 test sentences, 113 were predicted correctly (the true word was among the five suggestions), which amounts to
## [1] 22.6
% correct predictions.
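The percentage is simply the number of hits divided by the number of test sentences; a one-line computation along these lines reproduces it:
# 113 correct predictions out of 500 test sentences = 22.6%
round(sum(result_test2) / n_sample * 100, 1)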