Executive Summary SwiftKey is an input method for Android and iOS devices, such as smartphones and tablets. SwiftKey uses a blend of artificial intelligence technologies that enable it to predict the next word the user intends to type. SwiftKey learns from previous SMS messages and outputs predictions based on the currently input text and what it has learned.

This part of the project deals with cleaning the data to prepare it for exploration, and with analyzing its content to determine how best to use it for predictive analysis.

The purpose of this portion of the code is to enable faster processing, especially on a regular PC with limited memory.

Functions

Utility functions

# File Loading
Loading = function(folder, tf) {
    wd = getwd()
    zfile = "Coursera-SwiftKey.zip"
    url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
    folder = paste(wd, folder, sep = "/")
    zfile = paste(wd, zfile, sep = "/")
    if (!dir.exists(folder)) {
        download.file(url, zfile)
        unzip(zfile)
    }
    docs = VCorpus(DirSource(folder, mode = "text", recursive = tf), readerControl = list(reader = readPlain, 
        language = "en"))
    docs
}
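A minimal usage sketch, mirroring the Execution section below (it assumes the Coursera zip extracts into final/en_US under the working directory):

docs = Loading("final/en_US", FALSE)  # FALSE: do not recurse into subfolders
length(docs)  # one document per file in the folder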
# Piecing data for faster processing and to optimize use of memory
brkdata = function(data, folder = dest, fn, nr) {
    x = length(data)
    i = 0
    j = 0
    folder = paste(getwd(), folder, sep = "/")
    while (i < x) {
        j = j + 1
        pdata = piececorp(data, nr)
        p = length(pdata)
        i = i + p
        data = data[-(1:p)]
        fn1 = paste(fn, "_", j, ".txt", sep = "")
        fpath = paste(folder, fn, fn1, sep = "/")
        write.table(pdata, fpath, row.names = FALSE, quote = FALSE, col.names = FALSE)
    }
}
# Take the first nr elements (corpus) or rows (data frame) of the data
piececorp = function(data, nr) {
    l = length(data)
    if (l >= nr) 
        pdata = data[1:nr] else pdata = data[1:l]
    pdata
}
piecedf = function(data, nr) {
    l = nrow(data)
    if (l >= nr) 
        pdata = data[1:nr, ] else pdata = data[1:l, ]
    pdata
}
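The piece* helpers are meant to be consumed in a loop that drops each processed chunk, as in this hypothetical illustration:

df = data.frame(x = 1:25, y = letters[1:25])  # hypothetical 25-row data frame
while (nrow(df) > 0) {
    chunk = piecedf(df, 10)  # take up to 10 rows at a time
    df = df[-(1:nrow(chunk)), ]  # drop the rows just taken
    print(nrow(chunk))  # prints 10, 10, 5
}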
# Clean memory: drop the named object from the caller's environment, then reclaim memory
cleanmem = function(data) {
    rm(list = deparse(substitute(data)), envir = parent.frame())
    gc()
    xlcFreeMemory()  # XLConnect helper that asks the JVM to release memory
}

Cleaning functions

clean = function(data) {
    data = rm_email(data)  #remove emails
    data = rm_emoticon(data)  #remove emoticons
    data = rm_citation(data)  #remove citation
    data = rm_title_name(data)  #remove title
    data = rm_abbreviation(data)  #remove abbreviations
    data = rm_date(data)  #remove dates
    data = rm_non_ascii(data)  #remove non-ascii characters
    data = gsubfn("http[^ ]*|www[^ ]*", "", data)  #removing URLs
    data = tolower(data)  #convert to lower
    data = pclean(data)  #remove profanity
    data = removeNumbers(data)  #remove numbers
    data = rm_repeated_characters(data)  #remove repetitive characters
    data = gsubfn("[][#$%()`*:;\"\\+\\&\\/<=>@^_|~{}=\\-]", "", data)  # replacing special characters
    data = gsubfn("\\.+|\\s+\\.+", ".", data)  #replace repetitive periods with a single period
    data = gsubfn("\\?+|\\s+\\?+", "?", data)  #replace repetitive question marks with a single question mark
    data = gsubfn("\\'+|\\s+\\'?'+", "'", data)  #replace repetitive apostrophes with a single apostrphe
    data = gsubfn("!+|\\s+!+", "!", data)  #replace repetitive exclamation with a single exclamation
    data = gsubfn(",+|\\s+,+", ",", data)  #replace repetitive exclamation with a single exclamation
    data = gsubfn(" i ", " I ", data)  #capitalize standalone i
    data = gsubfn(" i've | ive ", " I've ", data)  #capitalize i've
    data = gsubfn(" i'm | im ", " I'm ", data)  #capitalize i've
    data = gsubfn(" i'd", " I'd ", data)  #capitalize i've
    data = gsubfn("(^|[.?!][[:space:]])([[:alpha:]])", "\\1\\U\\2", data, perl = TRUE)  #capitalize first letter of a sentence or row
    data = ctrim(data)
    data
}
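A small illustrative run of clean (hypothetical input; it assumes the qdapRegex, tm and gsubfn dependencies are loaded and that the profanity vector words built in the Execution section exists; the output shown is approximate):

sample = "so    happy!!!   i cant wait ,,  see http://t.co/abc :)"
clean(sample)
# roughly: "So happy! I cant wait, see"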
pclean = function(data, xwords = words) {
    data = removeWords(data, xwords)  #remove profanity
    data
}
cpunt = function(data) {
    data = gsubfn("[\\.\\,\\!\\'?]", "", data)  # strip the remaining punctuation
    data = tolower(data)  #convert to lower
    data = ctrim(data)
    data
}
ctrim = function(data) {
    ddata = as.data.frame(data)
    names(ddata) = c("content")
    ddata = subset(ddata, ddata$content != "")  #removing empty rows
    data = as.character(ddata$content)
    data = rm_white_multiple(data)
    data = str_trim(data)
    data
}
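For example, the empty string below illustrates the dropped-row behavior:

ctrim(c("  too    many   spaces  ", ""))
# returns "too many spaces"; the empty row is removed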

Exploration functions

gram = function(data, n = 1, n1 = 3) {
    data = VCorpus(VectorSource(data))
    ngram = function(x = data) NGramTokenizer(x, Weka_control(min = n, max = n1))
    ngmatrix = TermDocumentMatrix(data, control = list(tokenize = ngram, weighting = weightTf))
    Freq = sort(row_sums(ngmatrix, na.rm = TRUE), decreasing = TRUE)  #sorting ngram by frequency (desc)
    Phrase = names(Freq)  #extracting ngram as a list
    dfgram = as.data.frame(Phrase, stringsAsFactors = FALSE)
    dfgram$Freq = Freq
    dfgram
}
gramformat = function(data) {
    data$PrePh = gsubfn(" [[:alpha:]]*$", replacement = "", as.character(data$Phrase))  #extract pre-phrase of ngram
    data$Word = stri_extract_last_words(data$Phrase)  #extract last word of ngram
    row.names(data) = NULL  #eliminating row names
    data
}
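A toy run on hypothetical sentences (it assumes RWeka, tm, slam and stringi are loaded; exact row order may vary with the tokenizer):

g = gram(c("the cat sat", "the cat ran"), n = 1, n1 = 2)
gramformat(head(g, 3))
# e.g. Phrase "the cat" has Freq 2, PrePh "the" and Word "cat"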
# Queries
sqlg = function(df) {
    squery = "INSERT INTO gramtr VALUES ($Phrase, $PrePh, $Word, $Freq, $NGram)"
    dbins(squery, df)
}
sqlt = function(df, tbl) {
    squery = paste("INSERT INTO ", tbl, " VALUES ($Phrase, $Freq, $NGram)", 
        sep = "")
    dbins(squery, df)
}
dbins = function(sql, df) {
    dbBegin(gramdb)  #begin transaction
    dbGetPreparedQuery(gramdb, sql, bind.data = df)  #bind df columns to the $-placeholders
    dbCommit(gramdb)  #commit transaction
}
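For example, once gramdb is connected and the temp table exists (see the Execution section), a data frame whose column names match the $-placeholders can be bulk-inserted; the rows here are hypothetical:

df = data.frame(Phrase = c("of the", "in the"), Freq = c(12L, 9L), NGram = c(2L, 2L))
sqlt(df, "temp")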

dbclose = function() {
    dbRemoveTable(gramdb, "gramtr")
    dbRemoveTable(gramdb, "temp")
    dbRemoveTable(gramdb, "temp1")
    dbDisconnect(gramdb)
}

Analysis functions

stats = function(data) {
    Sents = length(data)  # number of lines
    Schars = nchar(data)  # characters per line
    Tchars = sum(Schars)  # total characters
    lsChar = max(Schars)  # length of the longest line
    stat = data.frame(Sents, Tchars, lsChar)
    stat
}
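For instance:

stats(c("Hello world.", "Hi"))  # hypothetical lines
#   Sents Tchars lsChar
# 1     2     14     12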
gramplot = function(db) {
    sqlsch = paste("SELECT * FROM gramtr WHERE NGram=1 ORDER BY Freq DESC LIMIT 20", 
        sep = "")
    res1 = dbGetQuery(gramdb, sqlsch)
    res1 = head(res1[order(-res1$Freq), ], 20)
    sqlsch = paste("SELECT * FROM gramtr WHERE NGram=2 ORDER BY Freq DESC LIMIT 20", 
        sep = "")
    res2 = dbGetQuery(gramdb, sqlsch)
    res2 = head(res2[order(-res2$Freq), ], 20)
    sqlsch = paste("SELECT * FROM gramtr WHERE NGram=3 ORDER BY Freq DESC LIMIT 20", 
        sep = "")
    res3 = dbGetQuery(gramdb, sqlsch)
    res3 = head(res3[order(-res3$Freq), ], 20)
    
    p1 = gplot(res1, 1)
    p2 = gplot(res2, 2)
    p3 = gplot(res3, 3)
    print(p1)
    print(p2)
    print(p3)
    # grid.arrange(p1, p2, p3,newpage = TRUE)
}
gplot = function(res, n) {
    if (n == 2) 
        fillc = "#f5f500" else if (n == 3) 
        fillc = "#ce0c6e" else if (n == 4) 
        fillc = "#11a4ff" else fillc = "#7bf402"
    mtitle = paste("30 Most Frequent Words in", n, "gram", sep = " ")
    xtitle = paste("Words for n-gram= ", n, sep = "")
    bplot = ggplot(res, aes(x = reorder(res$PrePh, res$Freq), y = reorder(res$Freq, 
        res$Freq), fill = res$Freq))
    bplot = bplot + geom_bar(stat = "identity", fill = fillc, color = "#A0A0A0", 
        width = 1)
    bplot = bplot + ggtitle(mtitle)
    bplot = bplot + labs(x = "Phrase", y = "Frequency")
    bplot = bplot + xlab(xtitle)
    bplot = bplot + ylab("Frequency\n")
    bplot = bplot + guides(fill = FALSE)
    bplot = bplot + theme(plot.title = element_text(size = 52), axis.text.y = element_text(size = 30, 
        hjust = 1), axis.text.x = element_text(size = 30, angle = 90, hjust = 1), 
        axis.title.x = element_text(size = 50), axis.title.y = element_text(size = 50), 
        panel.background = element_rect(fill = "#260033"), panel.grid.major = element_line(color = "#A0A0A0"), 
        panel.grid.minor = element_line(color = "#A0A0A0"))
    bplot = bplot + geom_text(aes(label = paste(res$Word, res$Freq, sep = "-")), 
        size = 10, hjust = 1)
    bplot = bplot + coord_flip()
    bplot
}

Execute functions

execbreak = function(sour, dest, n, fname, tf, nr) {
    docs = Loading(sour, tf)
    data = docs[n][[1]]$content
    cleanmem(docs)  # free the full corpus once the nth document has been extracted
    brkdata(data, dest, fname, nr)
}
execclean = function(data = docs, dest, fn) {
    n = length(data)
    i = 0
    while (i < n) {
        i = i + 1
        pdata = data[i][[1]]$content
        fn1 = paste(fn, "_", i, ".txt", sep = "")
        dest0 = paste(dest, fn, sep = "/")
        fpath = paste(dest0, fn1, sep = "/")
        pdata = as.character(lapply(pdata, clean))
        write.table(pdata, fpath, row.names = FALSE, quote = FALSE, col.names = FALSE)
    }
}
execstat = function(sour, fn, tf) {
    data = Loading(paste(sour, fn, sep = "/"), tf)
    n = length(data)
    cleanmem(docs)
    i = 0
    alls = data.frame()
    while (i < n) {
        i = i + 1
        data0 = data[i][[1]]$content
        s = stats(data0)
        alls = rbind(alls, s)
    }
    ID = paste("en_US.", fn, sep = "")
    Sents = sum(as.numeric(alls$Sents))
    Tchars = sum(as.numeric(alls$Tchars))
    lsChar = max(as.integer(alls$lsChar))
    stat = as.data.frame(cbind(ID, Sents, Tchars, lsChar))
    names(stat) = c("ID", "Total_Lines", "Total_Char", "Char_Longest_Line")
    stat
}
execgram = function(data = docs, n = 1, n1 = 3, nr = 10000, sp) {
    l = length(data)
    i = 0
    gdata = data.frame()
    while (i < l) {
        i = i + 1
        pdata = data[i][[1]]$content
        pdata = cpunt(pdata)
        pdata = gram(pdata, n, n1)
        nval = function(x) stri_stats_latex(x)[4]  # word count of the phrase = n-gram order
        pdata$NGram = as.integer(unname(sapply(pdata$Phrase, nval)))
        sqlt(pdata, "temp")
    }
    cleanmem(docs)
    i = 0
    while (i < (n1 + 1)) {
        i = i + 1
        sqln = paste("SELECT * FROM temp WHERE NGram=", i, sep = "")
        tdata = dbGetQuery(gramdb, sqln)
        sqlt(tdata, "temp1")
        sqld = paste("DELETE FROM temp WHERE NGram=", i, sep = "")
        dbGetQuery(gramdb, sqld)
        sqlgrp = paste("SELECT Phrase As Phrase, Sum(Freq) As Freq,NGram FROM temp1 GROUP BY Phrase", 
            sep = "")
        grdata = dbGetQuery(gramdb, sqlgrp)
        grdata = subset(grdata, (grdata$Freq > sp))
        sqld = paste("DELETE FROM temp1", sep = "")
        dbGetQuery(gramdb, sqld)
        gdata = rbind(gdata, grdata)
    }
    sqlt(gdata, "temp")
    cleanmem(grdata)
    cleanmem(tdata)
    execgramdb(gdata, nr)
}
execgramdb = function(data, nr) {
    l = nrow(data)
    i = 0
    gdata = data.frame()
    while (i < l) {
        pdata = piecedf(data, nr)
        n = nrow(pdata)
        i = i + n
        data = data[-(1:n), ]
        pdata = gramformat(pdata)
        gdata = rbind(gdata, pdata)
    }
    gram1 = subset(gdata, (gdata$NGram == 1))
    sqlg(gdata)
    write.table(gdata, paste(getwd(), "final/en_USgrams/grams.txt", sep = "/"), 
        sep = "|", row.names = FALSE)
    gram1
}
execanalyze = function(data) {
    uwords = unique(data$Phrase)
    twords = sum(data$Freq)  # total word instances = sum of unigram frequencies
    nuwords = ((length(uwords)/twords) * 100)
    uwordtbl = c(round(twords), round(length(uwords)), round(nuwords, 
        2))
    names(uwordtbl) = c("Total Words", "Unique Words", "% of unique words")
    cat("Unique words:", "\n")
    print(uwordtbl)
    lang = as.data.frame(as.character(lapply(data$Phrase, textcat)))
    names(lang) = "Lang"
    lang = lang %>% group_by(as.character(Lang)) %>% summarise(Total = length(Lang))
    names(lang) = c("Lang", "Total_Words")
    lang = arrange(lang, -Total_Words)
    cat("Top 30 Languages for words in the corpus", "\n")
    print(as.matrix(lang))
    gramplot(gramdb)
    dbclose()
}

Execution

execbreak("final/en_US", "final/en_USraws", 1, "blogs", FALSE, 10000)
execbreak("final/en_US", "final/en_USraws", 2, "news", FALSE, 10000)
execbreak("final/en_US", "final/en_USraws", 3, "twitter", FALSE, 10000)

myw = Loading("final/words", FALSE)
words = as.character(strsplit(myw[[1]]$content, "\n"))
words = ctrim(words)
cleanmem(myw)

docs = Loading("final/en_USraws/blogs", TRUE)
execclean(docs, "final/en_UScleans", "blogs")
docs = Loading("final/en_USraws/news", TRUE)
execclean(docs, "final/en_UScleans", "news")
docs = Loading("final/en_USraws/twitter", TRUE)
execclean(docs, "final/en_UScleans", "twitter")
cleanmem(docs)
  Stats before data tidying
               ID Total_Lines Total_Char Char_Longest_Line
  1   en_US.blogs      899288  208361438              1058
  2    en_US.news       77259   15683765               689
  3 en_US.twitter     2360148  162384825               146
  Stats after data tidying
               ID Total_Lines Total_Char Char_Longest_Line
  1   en_US.blogs      899288  201617596              1040
  2    en_US.news       77259   15096592               670
  3 en_US.twitter     2360148  153966682               142
gramdb = dbConnect(RSQLite::SQLite())
dbSendQuery(conn = gramdb, "CREATE TABLE gramtr
            (Phrase TEXT,
            PrePh TEXT,
            Word TEXT,
            Freq INTEGER,
            NGram INTEGER)")
  <SQLiteResult>
    SQL  CREATE TABLE gramtr
              (Phrase TEXT,
              PrePh TEXT,
              Word TEXT,
              Freq INTEGER,
              NGram INTEGER)
    ROWS Fetched: 0 [complete]
         Changed: 0
dbSendQuery(conn = gramdb, "CREATE TABLE temp
            (Phrase TEXT,
            Freq INTEGER)")
  <SQLiteResult>
    SQL  CREATE TABLE temp
              (Phrase TEXT,
              Freq INTEGER)
    ROWS Fetched: 0 [complete]
         Changed: 0
dbSendQuery(conn = gramdb, "CREATE TABLE temp1
            (Phrase TEXT,
            Freq INTEGER)")
  <SQLiteResult>
    SQL  CREATE TABLE temp1
              (Phrase TEXT,
              Freq INTEGER)
    ROWS Fetched: 0 [complete]
         Changed: 0
ng = execgram(docs, 1, 3, 10000, 50)
execanalyze(ng)
  Unique words: 
        Total Words      Unique Words % of unique words 
                  5             51708           1034160 
  Languages detected for words in the corpus 
        Lang                  Total_Words
   [1,] "english"             "7344"     
   [2,] "french"              "3038"     
   [3,] "danish"              "2943"     
   [4,] "scots"               "2534"     
   [5,] "latin"               "2520"     
   [6,] "rumantsch"           "2119"     
   [7,] "romanian"            "2049"     
   [8,] "manx"                "1974"     
   [9,] "tagalog"             "1952"     
  [10,] "catalan"             "1539"     
  [11,] "estonian"            "1352"     
  [12,] "italian"             "1260"     
  [13,] "spanish"             "1235"     
  [14,] "dutch"               "1206"     
  [15,] "afrikaans"           "1076"     
  [16,] "german"              "1058"     
  [17,] "frisian"             "1049"     
  [18,] "portuguese"          " 949"     
  [19,] "slovak-ascii"        " 923"     
  [20,] "swedish"             " 893"     
  [21,] "breton"              " 764"     
  [22,] "middle_frisian"      " 760"     
  [23,] "welsh"               " 753"     
  [24,] "norwegian"           " 670"     
  [25,] "basque"              " 655"     
  [26,] "finnish"             " 652"     
  [27,] "latvian"             " 607"     
  [28,] "indonesian"          " 548"     
  [29,] "swahili"             " 547"     
  [30,] "malay"               " 543"     
  [31,] "polish"              " 494"     
  [32,] "scots_gaelic"        " 490"     
  [33,] "irish"               " 488"     
  [34,] "esperanto"           " 472"     
  [35,] "lithuanian"          " 443"     
  [36,] "hungarian"           " 434"     
  [37,] "slovenian-ascii"     " 431"     
  [38,] "slovenian-iso8859_2" " 404"     
  [39,] "sanskrit"            " 391"     
  [40,] "bosnian"             " 388"     
  [41,] "icelandic"           " 371"     
  [42,] "czech-iso8859_2"     " 279"     
  [43,] "slovak-windows1250"  " 245"     
  [44,] "serbian-ascii"       " 197"     
  [45,] "croatian-ascii"      " 179"     
  [46,] "turkish"             " 158"     
  [47,] "nepali"              " 115"     
  [48,] "albanian"            " 109"     
  [49,] NA                    " 108"

Result: Observation. The objective of the project is to use the content of the corpus to predict the next word in a sentence. Given this objective, the corpora must be cleaned in light of the following observations, in order to be left with clean, grammatically correct sentences that are in common use:

  1. Remove profanity.
  2. Remove punctuation. While punctuation is grammatically correct, it is added to sentences to highlight the circumstances and to allow the reader to infer the situation. Since we are unaware of the user's circumstances and situation at this time, most punctuation is removed, while a few marks that are common across different circumstances are kept: the period, comma, exclamation mark, apostrophe and question mark.
  3. Remove unnecessary spaces, white space and repeated letters in order to improve readability and prediction accuracy.

Pre-processing The three files - en_US.blogs, en_US.news and en_US.twitter - were all cleaned as follows: - Remove emails, emoticons, citations, titles, abbreviations, dates, non-ASCII characters, URLs, profanity, numbers, repetitive characters, punctuation except (./,/’/!/?), and extra white space - Convert to lower case in order to have consistent casing - Replace repetitive punctuation (./,/’/!/?) with a single mark - Capitalize the first person “I”, capitalize the first letter of a sentence (letters that follow a ./!/?) or row, and trim sentences.

I noticed some words are joined without any clear delimiters, such as “#NeedACInSchool”. While I’d like to attempt splitting these phrases into individual words, such as “Need AC in School”, I am leaving these words, and the proper nouns found in the document(s), to be removed as sparse terms. The cleaned corpus is saved in a folder, and it is this data that will be used for exploratory analysis. To speed up processing on a regular PC with limited memory, the clean step broke the large corpora down into smaller files, which reduced the processing time significantly, by at least 80%.

While the data clean-up organized the data for the most part, issues such as proper nouns, spelling errors and merged words still contaminated the data. Removing sparse phrases helped eliminate most of these issues. While some remnants of such issues may remain, the probability of such occurrences is quite low and will not impede the prediction model.

Considerations

  1. Some words are more frequent than others - what are the distributions of word frequencies?
  2. What are the frequencies of 2-grams and 3-grams in the dataset?
  3. How many unique words do you need in a frequency sorted dictionary to cover 50% of all word instances in the language? 90%?
  4. How do you evaluate how many of the words come from foreign languages? Addressed by the languages list
  5. Can you think of ways to increase the coverage – identifying words that may not be in the corpora or using a smaller number of words in the dictionary to cover the same number of phrases?
  6. How can you efficiently store an n-gram model (think Markov Chains)?
  7. How can you use the knowledge about word frequencies to make your model smaller and more efficient?
  8. How many parameters do you need (i.e. how big is n in your n-gram model)?
  9. Can you think of simple ways to “smooth” the probabilities (think about giving all n-grams a non-zero probability even if they aren’t observed in the data)?
  10. How do you evaluate whether your model is any good?
  11. How can you use backoff models to estimate the probability of unobserved n-grams? (See the sketch below.)
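As a minimal sketch of the last point, a stupid-backoff-style lookup could run against the gramtr table built above. The function name predictWord, the bigram-then-unigram depth and the 0.4 discount are illustrative assumptions, not part of this project's code, and the naive string quoting is only acceptable in a sketch:

predictWord = function(context, db = gramdb) {
    toks = unlist(strsplit(tolower(context), "\\s+"))
    alpha = 1  # backoff discount accumulator
    for (k in min(2, length(toks)):1) {  # try a bigram context, then a unigram context
        preph = paste(tail(toks, k), collapse = " ")
        sql = paste("SELECT Word, Freq FROM gramtr WHERE PrePh='", preph, 
            "' ORDER BY Freq DESC LIMIT 1", sep = "")
        res = dbGetQuery(db, sql)
        if (nrow(res) > 0) 
            return(data.frame(Word = res$Word, Score = alpha * res$Freq))
        alpha = alpha * 0.4  # stupid-backoff discount for each shorter context
    }
    NA  # no context matched
}
predictWord("one of")  # hypothetical call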

Environment:
  1. OS: Windows 10; HW: Surface Pro 4 tablet
  2. Tool: R version 3.3.3; RStudio version 1.0.136
  3. Publishing tool: RPubs, HTML
  4. Data: With thanks to source: http://www.swiftkey.com, http://www.coursera.org, https://www.jhu.edu/
  5. Reference: www.stackoverflow.com
  6. Analyst: Uma Venkataramani; Date of Analysis: May 2017