Lars Bungum
March 24th, 2017
# Return an underscore-joined n-gram without its final word, i.e. everything
# before the last "_" ("a_b_c" -> "a_b").
#
# ngram: character string whose words are separated by "_".
#
# Only the underscore positions of the first element of `ngram` are
# inspected. When there is no underscore, gregexpr() reports -1, the end
# index becomes -2, and substring() yields "".
exclude_last_word <- function(ngram) {
  underscore_pos <- gregexpr("_", ngram)[[1]]
  last_underscore <- underscore_pos[length(underscore_pos)]
  substring(ngram, 1, last_underscore - 1)
}
# Relative frequency of a bigram given its first word.
#
# bigram: underscore-joined bigram used as a name in `bi_freq`.
#
# Divides the bigram's count in the global `bi_freq` by the count that the
# global `freq` holds for the bigram minus its last word (`freq` is defined
# outside this function; presumably unigram counts). The result is returned
# without names; an n-gram missing from either table gives NA.
get_bigram_prob <- function(bigram) {
  history <- exclude_last_word(bigram)
  unname(bi_freq[bigram] / freq[history])
}
# Relative frequency of a trigram given its first two words.
#
# trigram: underscore-joined trigram used as a name in `tri_freq`.
#
# Divides the trigram's count in the global `tri_freq` by the count that the
# global `bi_freq` holds for the trigram minus its last word. The result is
# returned without names; an n-gram missing from either table gives NA.
get_trigram_prob <- function(trigram) {
  history <- exclude_last_word(trigram)
  unname(tri_freq[trigram] / bi_freq[history])
}
# Read the text found at `corpuspath` with readtext() and wrap the result in
# a corpus object; `corpuspath` must be defined before this line runs.
corpusObject<-corpus(readtext(corpuspath))
....
# Build the document-feature matrix of bigrams (ngrams = 2), asking dfm() to
# remove punctuation and numbers.
bi_dictdfm <- dfm(
  corpusObject,
  ngrams = 2,
  removePunct = TRUE,
  removeNumbers = TRUE
)
# Trim the matrix using `mincount` as the minimum feature count.
bi_dictdfm <- dfm_trim(bi_dictdfm, min_count = mincount)
# Column sums: one total per bigram, named by the bigram itself.
bi_freq <- colSums(bi_dictdfm)
# Log probability of every retained bigram, keyed by the bigram name.
# get_bigram_prob() is the function that maps a bigram name to its
# probability. vapply() guarantees a numeric vector even when the table is
# empty, where sapply() would return a list and make log() fail.
bigram_probs <- log(vapply(names(bi_freq), get_bigram_prob, numeric(1)))
....
# Find the best-scoring n-gram that continues a given history.
#
# ngram:    character string, the history; words joined by "_" and without a
#           trailing underscore.
# problist: named numeric vector of scores; the names are underscore-joined
#           n-grams.
#
# Returns the result of which.max() over the entries whose name starts with
# "<ngram>_": a length-one named integer whose name is the winning n-gram and
# whose value is its position within the matching subset (not within
# `problist`). Returns FALSE when no entry matches, when `problist` has no
# names, or when every matching score is NA.
predict_ngram <- function(ngram, problist) {
  candidates <- names(problist)
  if (is.null(candidates)) {
    return(FALSE)
  }
  # Literal prefix comparison: building a "^ngram_" regex would let
  # metacharacters in the history (".", "+", "(" ...) match unrelated
  # n-grams or raise a regex error.
  matches <- which(startsWith(candidates, paste0(ngram, "_")))
  if (length(matches) == 0) {
    return(FALSE)
  }
  best <- which.max(problist[matches])
  # which.max() yields integer(0) when all candidates are NA; report that as
  # "no prediction" so callers can still test the result with if ().
  if (length(best) == 0) {
    return(FALSE)
  }
  best
}
predict <- function(ngram) {
ngramlength<-nchar(ngram)
if (ngramlength == 0){
return(names(which.max(unigram_probs)))
}
if (substring(ngram, ngramlength) == "_") {
ngram<-substr(ngram, 1,ngramlength-1)
}
tripred<-predict_ngram(ngram,trigram_probs)
if (tripred)
{
pred<-names(tripred)
secmatch <- gregexpr("_", pred)[[1]][2]
pred <- substring(pred, secmatch+1)
}
....