Capstone Report

Yinning Zhang
May 2019

Steps

  • Create a training set from 2% of the original data.
  • Use a data-cleaning pipeline to process the data.
  • Generate unigram, 2-gram, 3-gram, 4-gram, and 5-gram frequency files (a rough sketch of these steps follows this list).
  • Write the prediction model.
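
These steps can be summarized in base R roughly as below. This is only a sketch: the file name en_US.blogs.txt, the helper name ngram_freq, and the exact cleaning rules are assumptions for illustration, not the pipeline actually used.

set.seed(123)

# sample roughly 2% of the corpus lines as the training set
lines <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
train <- lines[rbinom(length(lines), 1, 0.02) == 1]

# a simple cleaning pass: lower-case, drop numbers/punctuation, collapse spaces
clean <- tolower(train)
clean <- gsub("[^a-z' ]", " ", clean)
clean <- gsub("\\s+", " ", trimws(clean))

# count the n-grams of a given order in the cleaned text
ngram_freq <- function(text, n) {
  toks <- strsplit(text, " ", fixed = TRUE)
  grams <- unlist(lapply(toks, function(w) {
    if (length(w) < n) return(character(0))
    sapply(seq_len(length(w) - n + 1),
           function(i) paste(w[i:(i + n - 1)], collapse = " "))
  }))
  sort(table(grams), decreasing = TRUE)
}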

The App

  • It searches from the longest n-gram table down to the 2-gram table until it finds a match.
  • If the input has more than 5 words, only the last 5 words are used for prediction.
  • It calculates the probability of each match. For example, if it finds matches in the 5-gram file, it divides the frequency of each matching 5-gram by the sum of the frequencies of all the matches. The assumed layout of the n-gram lookup tables is sketched below; see the next page for the code chunk.
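
The prediction code on the following pages looks matches up in tables where each n-gram has been split into one column per word plus a frequency count (DF2grams through DF6grams, and DF1grams for unigrams). A minimal data.table sketch of building that layout is below; split_ngrams is a hypothetical helper, and the real preprocessing may differ.

library(data.table)

# turn a named n-gram frequency table (such as ngram_freq() above returns)
# into one column per word plus a freq column
split_ngrams <- function(freq_table, n) {
  dt <- data.table(gram = names(freq_table), freq = as.integer(freq_table))
  dt[, (paste0("word", 1:n)) := tstrsplit(gram, " ", fixed = TRUE)]
  dt[, gram := NULL]
  setcolorder(dt, c(paste0("word", 1:n), "freq"))
  dt[]
}

DF2grams <- split_ngrams(ngram_freq(clean, 2), 2)   # likewise for the other orders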

Code for the Prediction Model (1)

N <- length(inputwords)

if (N >= 5) {
  # use the last 5 words of the input as the prediction context
  w <- tail(inputwords, 5)
  w1 <- w[1]; w2 <- w[2]; w3 <- w[3]; w4 <- w[4]; w5 <- w[5]

  # back off from the longest context to shorter ones until a match is found
  subdf <- data.table()
  if (nrow(subdf) == 0) subdf <- subset(DF6grams, word1 == w1 & word2 == w2 & word3 == w3 & word4 == w4 & word5 == w5)
  if (nrow(subdf) == 0) subdf <- subset(DF5grams, word1 == w2 & word2 == w3 & word3 == w4 & word4 == w5)
  if (nrow(subdf) == 0) subdf <- subset(DF4grams, word1 == w3 & word2 == w4 & word3 == w5)
  if (nrow(subdf) == 0) subdf <- subset(DF3grams, word1 == w4 & word2 == w5)
  if (nrow(subdf) == 0) subdf <- subset(DF2grams, word1 == w5)
  # no match at all: fall back to the 5 most frequent unigrams
  if (nrow(subdf) == 0) subdf <- head(DF1grams, 5)
}
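
This chunk covers only the case of five or more input words; shorter inputs would presumably be handled by analogous branches that start from a shorter n-gram table. The vector inputwords is the cleaned, tokenized user input. A minimal sketch, assuming the same cleaning rules as the training data (the app's actual tokenizer may differ):

input <- "Thanks for the follow, and have a great"   # example raw input
inputwords <- strsplit(gsub("[^a-z' ]", " ", tolower(input)), "\\s+")[[1]]
inputwords <- inputwords[inputwords != ""]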

Code for the Prediction Model (2)

if (nrow(subdf) > 0) {
  # probability of a match = its frequency / sum of the frequencies of all matches
  subdf$prob <- subdf$freq / sum(subdf$freq)
  # keep the last three columns (predicted word, freq, prob), sorted by probability;
  # convert to a plain data.frame first so positional column indexing is unambiguous
  predicted <- as.data.frame(subdf)
  predicted <- predicted[order(-predicted$prob), (ncol(predicted) - 2):ncol(predicted)]
  # return at most the top 10 candidate words
  predicted <- head(predicted, min(10, nrow(predicted)))
}
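
As a concrete example of the normalization: if the matched rows have frequencies 3, 1, and 1, their probabilities become 0.6, 0.2, and 0.2, and the word completing the most probable row is offered as the top suggestion.

c(3, 1, 1) / sum(c(3, 1, 1))   # 0.6 0.2 0.2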