The overall goal of this project is to create a predictive model that suggests word options based on the words a user has previously typed; for example, if the user types "I went to the", the model may suggest three options such as "gym", "store" and "restaurant". This is similar to the way in which SwiftKey builds smart keyboards for mobile device users. The model will be built using the learnings from the Coursera Data Science Specialization and the principles of Natural Language Processing (NLP), and it will be distributed by means of a Shiny app.
The data on which the model is built has been supplied from the HC Corpora (the original links are no longer working). The corpora cover four locales; for this project only data from the US locale will be used.
For reproducibility, the source data can be accessed here: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
There are three files, containing content taken from Twitter, news articles and blogs. Each file has a single field holding the text, and each line represents a separate tweet, article or blog post.
The files contain some non-UTF-8 characters. The first step was to remove these using the bash tr (translate) command:
tr -c "[:print:][:space:]" " " < messy.txt > clean.txt
setwd("final/cleaned_en_US")
# Approximate word count: replace each word with a single "X" marker, then count the markers
wordcount <- function(a){
  test <- gsub("X","Y",a)                   # protect existing Xs so X can be used as a marker
  test <- gsub("( +[^ X]+ +)"," X ",test)   # replace each word surrounded by spaces with X
  test <- gsub("( +[^ X]+ +)"," X ",test)   # second pass to catch alternate words missed above
  test <- gsub("(^[^ X]+ +)","X ",test)     # word at the start of a line
  test <- gsub("([^ X]+$)","X",test)        # word at the end of a line
  test <- gsub(" ","",test)                 # drop the spaces, leaving one X per word
  s <- sum(nchar(test))                     # total characters = total words
  rm(test)
  s
}
# Read a file and return its basic statistics: lines, characters, longest line, words and averages
init_stat <- function(txtfile){
  cfile <- file(txtfile,"r")
  corp <- readLines(cfile,skipNul=T)
  close(cfile)
  len_corp <- length(corp)                  # number of lines
  chars_corp <- sum(nchar(corp))            # total characters
  longest_line <- max(nchar(corp))          # longest line
  words <- wordcount(corp)                  # approximate word count
  c(len_corp,chars_corp,longest_line,words,
    chars_corp/len_corp,chars_corp/words,words/len_corp)
}
### main body
# File sizes in KB
v <- vector()
v <- cbind(v,c(news=file.info("cld_en_US.news.txt")$size,
               tweets=file.info("cld_en_US.twitter.txt")$size,
               blogs=file.info("cld_en_US.blogs.txt")$size))
v <- v/1024
# Line, character and word statistics for each file
res <- init_stat("cld_en_US.news.txt")
res <- rbind(res,init_stat("cld_en_US.twitter.txt"))
res <- rbind(res,init_stat("cld_en_US.blogs.txt"))
v <- cbind(v,res)
colnames(v) <- c("Size(KB)","Lines","Characters","Longest Line",
                 "Words","Av. Chars/Line","Av. Chars/Word","Av. Words/Line")
print(v)
##        Size(KB)   Lines Characters Longest Line    Words Av. Chars/Line Av. Chars/Word Av. Words/Line
## news     200988 1010242  203791405        11384 34420489          201.7          5.921          34.07
## tweets   163189 2360148  162385042          213 30361328           68.8          5.348          12.86
## blogs    205234  899288  208361438        40835 37692449          231.7          5.528          41.91
Here a word is defined as any set of characters between two spaces.
In developing a single model, the source of each line (news, tweets or blogs) will be ignored and no weightings applied.
Initially, samples of 10,000 records were taken from each file to carry out some exploratory analysis and to build a tokenizer and the n-grams.
set.seed(132)
# Take a random sample of 10,000 lines from each of the three cleaned files
cfile <- file("cld_en_US.news.txt","r")
corp <- readLines(cfile,skipNul=T)
length(corp)
close(cfile)
workingset1 <- sample(corp,10000)
cfile <- file("cld_en_US.twitter.txt","r")
corp <- readLines(cfile,skipNul=T)
length(corp)
close(cfile)
workingset2 <- sample(corp,10000)
cfile <- file("cld_en_US.blogs.txt","r")
corp <- readLines(cfile,skipNul=T)
length(corp)
close(cfile)
workingset3 <- sample(corp,10000)
ws <- c(workingset1,workingset2,workingset3)
rm(corp,workingset1,workingset2,workingset3)
The amount of data used to train, test and validate the model is yet to be finalised. The intention is to use 70%, 20% and 10% respectively, but the final proportions will be determined by time and efficiency.
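For illustration, the split could be carried out along the following lines (a sketch only, shown here on the combined sample ws; the object actually split and the proportions may yet change):
n <- length(ws)
idx <- sample(seq_len(n))                              # shuffle the line indices
training <- ws[idx[1:floor(0.7*n)]]                    # 70% training
testing <- ws[idx[(floor(0.7*n)+1):floor(0.9*n)]]      # 20% testing
validation <- ws[idx[(floor(0.9*n)+1):n]]              # 10% validation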
Rather than using a ready-made tokenizer, an attempt has been made to build a bespoke one, in the hope that additional sense can be obtained from the data by, for instance, retaining some capitalization, punctuation and numbers rather than applying a blanket remove-all option. The use of a placeholder for some numeric content will also be explored.
ws1 <- ws
## punctuation
ws1 <- gsub("\\\"","\\. ",ws1) # replace quote marks with fullstops
ws1 <- gsub("[/<=>~_]"," ",ws1) # take out /<=>~_
ws1 <- gsub("[\\*\\+\\^\\|]"," ",ws1) # take out *+^|
ws1 <- gsub("`","'",ws1)
ws1 <- gsub("%"," percent",ws1)
ws1 <- gsub("&"," and ",ws1)
ws1 <- gsub("[!:;\\?]","\\.",ws1) #change :;!? to fullstops
ws1 <- gsub("[\\(\\[\\{]"," \\.",ws1) # turn contents of brackets into sentences
ws1 <- gsub("[\\)}]","\\. ",ws1)
ws1 <- gsub("]","\\. ",ws1)
ws1 <- gsub("@ "," at ",ws1) # leave @twitterusername for the moment
ws1 <- gsub("# "," at ",ws1) # leave #hashtag for the moment
ws1 <- gsub("[A-Za-z0-9]+@[A-Za-z0-9]+\\.[A-Za-z]+","hXFfqP",ws1) # email addresses
ws1 <- gsub(" -| - |- "," ",ws1)
## put back apostrophes taken out by cleaning
ws1 <- gsub(" [Ss]([ \\.])","'s\\1",ws1)
ws1 <- gsub("[Nn] [Tt([ \\.])","n't\\1",ws1)
ws1 <- gsub(" [Ll][Ll]([ \\.])","'ll\\1",ws1)
ws1 <- gsub(" [Rr][Ee]([ \\.])","'re\\1",ws1)
ws1 <- gsub(" [Vv][Ee]([ \\.])","'ve\\1",ws1)
ws1 <- gsub(" [Dd]([ \\.])","'\\1",ws1)
#contractions
ws1 <- gsub("[Nn]'[Tt]"," not",ws1) # consider can't won't shan't ain't remedy
ws1 <- gsub("[Ii]'[Mm]","I am",ws1)
ws1 <- gsub("'[Ll][Ll]"," will",ws1)
ws1 <- gsub("'[Rr][Ee]"," are",ws1)
ws1 <- gsub("'[Vv][Ee]"," have",ws1)
ws1 <- gsub("'[Dd]"," would ",ws1) #is would/should a problem?
ws1 <- gsub(" [Hh][Ee]'[Ss]"," he is",ws1)
ws1 <- gsub(" [Ss][Hh][Ee]'[Ss]"," she is",ws1) # "its" too often misused
#numbers
for (i in 1:4) {
ws1 <- gsub("([0-9]{1,3}),([0-9]+)","\\1\\2",ws1)
ws1 <- gsub("[0-9]{1,}\\.[0-9]{1,}","hXFfqP",ws1)
ws1 <- gsub("[0-9]{1,}-[0-9]{1,}","hXFfqP",ws1)
ws1 <- gsub("[0-9]{1,}/[0-9]{1,}","hXFfqP",ws1)
}
ws1 <- gsub(" 0 "," zero ",ws1)
ws1 <- gsub(" 1 "," one ",ws1)
ws1 <- gsub(" 2 "," two ",ws1)
ws1 <- gsub(" 3 "," three ",ws1)
ws1 <- gsub(" 4 "," four ",ws1)
ws1 <- gsub(" 5 "," five ",ws1)
ws1 <- gsub(" 6 "," six ",ws1)
ws1 <- gsub(" 7 "," seven ",ws1)
ws1 <- gsub(" 8 "," eight ",ws1)
ws1 <- gsub(" 9 "," nine ",ws1)
ws1 <- gsub(" 1st "," first ",ws1)
ws1 <- gsub(" 2nd "," second ",ws1)
ws1 <- gsub(" 3rd "," third ",ws1)
ws1 <- gsub(" 4th "," fourth ",ws1)
ws1 <- gsub(" 5th "," fith ",ws1)
ws1 <- gsub(" 6th "," sixth ",ws1)
ws1 <- gsub(" 7th "," seventh ",ws1)
ws1 <- gsub(" 8th "," eighth ",ws1)
ws1 <- gsub(" 9th "," ninth ",ws1)
## NEXT Abbreviations
for (i in 1:7){
ws1 <- gsub("([A-Za-z])\\.([A-Za-z])\\.","\\1\\2",ws1)
}
ws1 <- gsub("([A-Za-z])\\.([A-Za-z]) ","\\1\\2 ",ws1)
## Commas and multiple space, fullstops
ws1 <- gsub(","," ",ws1)
ws1 <- gsub("\\.{2,}","\\.",ws1)
ws1 <- gsub(" {2,}"," ",ws1)
Each record is then split into sentences, with a full stop marking the end of each sentence.
ws_sent <- strsplit(ws1,"\\.")            # split each record on the fullstops
sentences <- unlist(ws_sent)              # flatten into a single vector of sentences
rm(ws_sent)
sentences <- gsub("^ ","",sentences)      # trim leading spaces
sentences <- gsub(" $","",sentences)      # trim trailing spaces
x <- grepl(" +",sentences)                # must contain at least 2 words
sentences <- sentences[x]
words <- strsplit(sentences," ")          # tokenise each sentence on spaces
The sentences are then broken into tokens, or words, using the space as the delimiter. Only sentences with more than one word are kept, as one-word sentences will not be relevant to the predictive model.
The 30,000-record sample yielded 71,591 sentences.
Some of the subsequent analysis had to be aborted because of time pressures; however, here are the provisional findings.
46,382 sentences were processed (64.7875%). These generated 538,106 words, of which 42,303 were unique. The mean number of words per sentence is 7.5164.
The most frequent 20 words are:
##       wordlist  Freq wordlength
## 37941      the 26952          3
## 38422       to 14188          2
## 4238       and 13257          3
## 3019         a 12904          1
## 27313       of 10748          2
## 20229       in  9433          2
## 19899        i  7234          1
## 16136      for  5792          3
## 21015       is  5556          2
## 37933     that  5241          4
## 27464       on  4280          2
## 42085      you  4180          3
## 21066       it  4160          2
## 26969      not  4128          3
## 41565     with  3689          4
## 40825      was  3425          3
## 5014        at  3267          2
## 18670       he  3168          2
## 4689       are  3091          3
## 18607     have  3000          4
Together these account for 27.4468% of the words.
The mean word length is 4.5886 characters.
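For reference, figures like those above can be produced with a simple tally over the tokens. The following is a sketch only, assuming the tokenised words list from the previous step; the variable names are illustrative:
all_words <- unlist(words)                            # flatten the tokenised sentences
freq <- as.data.frame(table(tolower(all_words)))      # count each distinct word
colnames(freq) <- c("wordlist","Freq")
freq$wordlength <- nchar(as.character(freq$wordlist))
freq <- freq[order(-freq$Freq),]                      # most frequent first
head(freq,20)                                         # the 20 most frequent words
sum(head(freq$Freq,20))/sum(freq$Freq)                # proportion of all words they cover
sum(freq$Freq*freq$wordlength)/sum(freq$Freq)         # mean word length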
The next step is to slice the sentences into 5-grams, 4-grams, 3-grams and 2-grams. This will give a data frame that can then be used to predict the next word on the basis of the 4, 3, 2 or 1 words that precede it.
# Build all n-grams of order n from the tokenised sentences
make_ngrams <- function(words, n) {
  grams <- vector()
  for (i in 1:length(words)) {
    if (length(words[[i]]) >= n) {          # include sentences of exactly n words
      for (j in n:length(words[[i]])) {
        grams <- c(grams, paste(words[[i]][(j-n+1):j], collapse=" "))
      }
    }
  }
  grams
}
fivegrams <- make_ngrams(words,5)
fourgrams <- make_ngrams(words,4)
trigrams <- make_ngrams(words,3)
bigrams <- make_ngrams(words,2)
# Combine all the n-grams into a single data frame, recording the n-gram order in 'src'
df <- rbind(
  data.frame(phrase=fivegrams, src=5, stringsAsFactors=FALSE),
  data.frame(phrase=fourgrams, src=4, stringsAsFactors=FALSE),
  data.frame(phrase=trigrams, src=3, stringsAsFactors=FALSE),
  data.frame(phrase=bigrams, src=2, stringsAsFactors=FALSE)
)
library(stringr)
df$lastword <- word(df$phrase,-1)             # the word to be predicted
df$preceding <- word(df$phrase,1,df$src-1)    # the words that precede it
The column of predicted words will be cross-referenced against a list of profanities (yet to be identified) and the matching records removed, so that the model does not suggest any offensive options.
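As an illustration, the filtering might look something like the following (a sketch only; profanity_list.txt is a placeholder for the word list that is still to be chosen):
profanities <- readLines("profanity_list.txt")                  # placeholder word list
df <- df[!(tolower(df$lastword) %in% tolower(profanities)),]    # drop n-grams that predict a profanity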
With the n-gram data frame built, attention will turn to developing and refining the actual model, testing its predictive capability and deploying it as a Shiny app.
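Although the model has yet to be developed, the basic lookup could work along the following lines: match the user's last few words against the preceding column, backing off from four words of context to one, and return the most frequent corresponding lastword values. The sketch below is illustrative only; the function name, the case handling and the back-off scheme are all assumptions at this stage.
predict_next <- function(input, df, nsuggest=3){
  tokens <- strsplit(tolower(input)," +")[[1]]                    # tokenise the user's input
  for (n in 4:1) {                                                # back off from 4 words of context to 1
    if (length(tokens) < n) next
    context <- paste(tail(tokens,n),collapse=" ")
    matches <- df$lastword[df$src==n+1 & tolower(df$preceding)==context]
    if (length(matches) > 0) {
      return(head(names(sort(table(matches),decreasing=TRUE)),nsuggest))
    }
  }
  character(0)                                                    # no match found
}
predict_next("I went to the",df)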
TO BE ADDED LATER