The first step in building a predictive model for text is understanding the distribution of, and relationships among, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships observed in the data and to prepare for building the first linguistic models.
The goal of this milestone is simply to demonstrate that you are comfortable working with the data and on track to create your prediction algorithm. Please submit a report on RPubs that explains your exploratory analysis and your goals for the eventual app and algorithm. The document should be concise, explain only the major features of the data you have identified, and briefly summarize your plans for the prediction algorithm and Shiny app in a way that a non-data-scientist manager could understand.
You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to:

1. Demonstrate that you've downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.
library(quanteda)
library(readtext)
library(data.table)
library(stringr)
if (!file.exists('./final/en_US/en_US.blogs.txt')) {
  # Download and unpack the Coursera-SwiftKey data set if not already present
  download.file('https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip',
                destfile = './Coursera-SwiftKey.zip',
                method = 'curl', quiet = TRUE)
  unzip('./Coursera-SwiftKey.zip')
}
rawBlogs <- readtext(paste0(getwd(), '/final/en_US/en_US.blogs.txt'))
rawNews <- readtext(paste0(getwd(), '/final/en_US/en_US.news.txt'))
rawTwts <- readtext(paste0(getwd(), '/final/en_US/en_US.twitter.txt'))
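Before any cleaning, a rough word count per source helps size the data set. Below is a minimal sketch using stringr's str_count(); the counts are computed at run time, not quoted here:

# Approximate word counts per source; each readtext object stores the whole
# file as a single document in its $text column
sapply(list(blogs = rawBlogs, news = rawNews, twitter = rawTwts),
       function(x) str_count(x$text, '\\S+'))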
corpBlogs <- corpus(rawBlogs)
docvars(corpBlogs, 'Source') <- 'blogs'
corpNews <- corpus(rawNews)
docvars(corpNews, 'Source') <- 'news'
corpTwts <- corpus(rawTwts)
docvars(corpTwts, 'Source') <- 'twitter'
corpAll <- corpBlogs + corpNews + corpTwts
rm(rawBlogs, rawNews, rawTwts)
rm(corpBlogs, corpNews, corpTwts)
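Per-source statistics are also worth capturing for the report; summary() on a quanteda corpus reports types, tokens, and sentences per document (a sketch; the exact columns vary by quanteda version):

# Per-document summary of the combined corpus (one document per source)
summary(corpAll)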
tokenization <- function(input, what = 'word', ngrams = 1L) {
  ## Wraps quanteda's tokens() function: takes an input (character vector,
  ## corpus, or tokens object) and returns the tokenized object with
  ## profanity removed. The remove_* and ngrams arguments follow the
  ## quanteda 1.x API.
  # Step 1: tokenize, stripping numbers, punctuation, symbols, separators,
  # Twitter characters, hyphens, and URLs
  results <- tokens(x = input, what = what, ngrams = ngrams,
                    remove_numbers = TRUE, remove_punct = TRUE,
                    remove_symbols = TRUE, remove_separators = TRUE,
                    remove_twitter = TRUE, remove_hyphens = TRUE,
                    remove_url = TRUE)
  # Step 2: fetch a profanity list if it is not already on disk
  if (!file.exists('badWords.txt')) {
    download.file('https://raw.githubusercontent.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en',
                  destfile = './badWords.txt',
                  method = 'curl', quiet = TRUE)
  }
  prof <- readLines('badWords.txt', skipNul = TRUE)
  # Step 3: remove profanity and return the cleaned tokens
  results <- tokens_remove(results, pattern = prof)
  return(results)
}
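Before tokenizing the full corpus, a quick sanity check on a toy string (illustrative only; the exact tokens returned depend on the quanteda version):

# URLs, punctuation, symbols, and numbers should all be stripped
tokenization('Check out http://example.com: 100% #awesome!!!')[[1]]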
tokWord <- tokenization(corpAll, what = 'word')
rm(corpAll)
makeNgrams <- function(inputTokens, n, outName) {
  ## inputTokens: a tokens object
  ## n: order of the n-grams to generate
  ## outName: output file name, without the .csv extension
  tokWordNg <- tokens_ngrams(inputTokens, n = n, concatenator = ' ')
  dfmWordNg <- dfm(tokWordNg, tolower = TRUE)
  nGram <- textstat_frequency(dfmWordNg)
  write.csv(nGram, file = paste0(outName, '.csv'), row.names = FALSE)
}
makeNgrams(tokWord, 1L, 'uniGram')
makeNgrams(tokWord, 2L, 'biGram')
makeNgrams(tokWord, 3L, 'triGram')
makeNgrams(tokWord, 4L, 'quadGram')
makeNgrams(tokWord, 5L, 'quinGram')
makeNgrams(tokWord, 6L, 'sixGram')
makeNgrams(tokWord, 7L, 'septGram')
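With the frequency files on disk, the headline summaries for the report are straightforward to pull; textstat_frequency() already sorts by descending frequency. A minimal sketch of a top-10 unigram plot using base graphics:

# Plot the ten most frequent unigrams
uniGram <- fread('uniGram.csv', select = c('feature', 'frequency'))
barplot(uniGram$frequency[1:10], names.arg = uniGram$feature[1:10],
        las = 2, main = 'Top 10 unigrams by frequency')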
generatePred <- function(inputFile, thresh = 1L) {
  ## Builds the prediction lookup table from an n-gram frequency file.
  ## inputFile: the n-gram csv file generated by makeNgrams()
  ## thresh: drop n-grams whose frequency is <= thresh (default 1)
  nGram <- fread(inputFile, select = c('feature', 'frequency'))
  nGram <- nGram[frequency > thresh]
  # query: everything before the last word; predict: the last word
  nGram[, query := sub(' [^ ]+$', '', feature)]
  nGram[, predict := sub('.* (.*)$', '\\1', feature)]
  fwrite(nGram, paste0(sub('\\.csv$', '', inputFile), 'Pred.csv'))
}
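To illustrate the two regular expressions on a trigram: everything up to the last space becomes the query, and the final word becomes the prediction.

sub(' [^ ]+$', '', 'thanks for the')      # query:   "thanks for"
sub('.* (.*)$', '\\1', 'thanks for the')  # predict: "the"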
generatePred('biGram.csv')
generatePred('triGram.csv')
generatePred('quadGram.csv')
generatePred('quinGram.csv')
generatePred('sixGram.csv')
generatePred('septGram.csv')
# Note: the per-order prediction files are concatenated in a bash shell due to
# the large number of lines; process substitution with sed '1d' drops the
# header row from every file after the first.
cat biGramPred.csv <(sed '1d' triGramPred.csv) <(sed '1d' quadGramPred.csv) \
    <(sed '1d' quinGramPred.csv) <(sed '1d' sixGramPred.csv) \
    <(sed '1d' septGramPred.csv) > nGramPred.csv
nGram <- fread('nGramPred.csv', select = c('query', 'predict', 'frequency'))
nGram <- nGram[order(-frequency)]
# Full table: keep every query/prediction pair seen at least 5 times
nGramFilt <- nGram[frequency >= 5]
fwrite(nGramFilt, file = 'predictionTableFull.csv')
# Compact table: the rows are sorted by descending frequency, so !duplicated()
# keeps only the single most frequent prediction for each query
nGramUni <- nGram[(!duplicated(query)) & (frequency >= 5)]
fwrite(nGramUni, file = 'predictionTableUni.csv')
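Finally, a sketch of how the eventual Shiny app might consume this table: take the last few words the user typed, look them up as a query, and back off to shorter queries on a miss. The function name and the simple backoff logic here are illustrative assumptions, not the final algorithm.

# Illustrative backoff-style lookup against the deduplicated table
predictNext <- function(input, predTable, maxOrder = 6L) {
  # Normalize the input the same way the tables were built (lowercased words)
  words <- strsplit(tolower(input), '\\s+')[[1]]
  # Try the longest available query first, then back off one word at a time
  for (n in seq(min(length(words), maxOrder), 1L)) {
    q <- paste(tail(words, n), collapse = ' ')
    hit <- predTable[query == q]
    if (nrow(hit) > 0) return(hit$predict[1])
  }
  NA_character_  # no prediction found at any order
}
predTable <- fread('predictionTableUni.csv')
predictNext('thanks for the', predTable)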