Overview

This milestone report explains my exploratory analysis and my goals for the eventual app and algorithm.

Download the Data

The original raw dataset can be found at https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.
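
For completeness, downloading and unzipping could be scripted roughly as below. This is only a sketch: the destination file name is my choice, and the zip's internal folder layout may differ from the flat rawdata folder listed next, so the extracted files may need to be moved.

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# Sketch only: download the archive (file name is an assumption) and
# extract it; the zip's folders may need rearranging into rawdata/.
if (!file.exists("Coursera-SwiftKey.zip")) {
   download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
unzip("Coursera-SwiftKey.zip", exdir = "rawdata")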

list.files()
[1] "DataScienceCapstone.Rproj" "milestoneReport.html"     
[3] "milestoneReport.Rmd"       "milestoneReport_cache"    
[5] "rawdata"                   "swearWords.csv"           
[7] "trainingData"             
setwd("rawdata")
list.files()
 [1] "de_DE.blogs.txt"   "de_DE.news.txt"    "de_DE.twitter.txt"
 [4] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"
 [7] "fi_FI.blogs.txt"   "fi_FI.news.txt"    "fi_FI.twitter.txt"
[10] "ru_RU.blogs.txt"   "ru_RU.news.txt"    "ru_RU.twitter.txt"
range(file.size(list.files()) / 1024 ^ 2)
[1]  24.15766 200.42421
data.frame(file = c("max file", "min file"), 
           size = c(length(readLines(list.files()[which.max(file.size(list.files()))], encoding = "UTF-8")), 
                    length(readLines(list.files()[which.min(file.size(list.files()))], encoding = "UTF-8"))))
      file   size
1 max file 899288
2 min file 285214

The smallest file is about 24 MB and has 285,214 lines, while the largest is about 200 MB and has 899,288 lines.

Profanity Filtering

To keep the app a place of love and kindness, it is important to filter out profane words. The word list can be found at http://www.bannedwordlist.com/. Filtering is more effective if we cover three casings of each profane word: for instance, a made-up profane word abcdefg has at least three forms: abcdefg, Abcdefg and ABCDEFG.

swearWords <- readLines("swearWords.csv")
library(stringr)
swearWords <- c(swearWords, str_to_title(swearWords), str_to_upper(swearWords))

Size Reduction

The original raw files are very large, and we do not need nearly that much data to build the prediction models; loading everything would overwhelm the computer. Each dataset is therefore reduced to 1,000 entries. The English files are processed separately from the others because they also go through profanity filtering: entries containing profane words make up far less than 50% of all entries, so for English we first sample 2,000 entries, filter them, and keep 1,000.

setwd("rawdata")
exEnList <- list.files()[-(4:6)]
for(i in 1:length(exEnList)) {
   tempchar <- readLines(exEnList[i], encoding = "UTF-8")
   tempchar <- tempchar[sample(1:length(tempchar), 1000)]
   name1 <- substring(exEnList[i], 1, 2)
   name2 <- str_to_title(substring(exEnList[i], 7, nchar(exEnList[i]) - 4))
   outputname <- paste("../trainingData/", name1, name2, "Training.txt", sep = "")
   write.table(tempchar, outputname, row.names = F, quote = F, col.names = F)
}
enList <- list.files()[4:6]
for(i in 1:length(enList)) {
   tempchar <- readLines(enList[i], encoding = "UTF-8")
   # sample 2000 entries so that at least 1000 remain after profanity filtering
   tempchar <- tempchar[sample(1:length(tempchar), 2000)]
   # drop every entry that contains a word from the profanity list
   for(j in 1:length(swearWords)) {
      tempchar <- tempchar[!grepl(swearWords[j], tempchar)]
   }
   tempchar <- tempchar[1:1000]
   name1 <- substring(enList[i], 1, 2)
   name2 <- str_to_title(substring(enList[i], 7, nchar(enList[i]) - 4))
   outputname <- paste("../trainingData/", name1, name2, "Training.txt", sep = "")
   write.table(tempchar, outputname, row.names = F, quote = F, col.names = F)
}

Next, merge the three datasets for each language into one.

setwd("trainingData")
trainingDataList <- list.files()[grep(".txt", list.files())]
deTraining <- c(readLines(trainingDataList[1], encoding = "UTF-8"), readLines(trainingDataList[2], encoding = "UTF-8"), readLines(trainingDataList[3], encoding = "UTF-8"))
enTraining <- c(readLines(trainingDataList[4], encoding = "UTF-8"), readLines(trainingDataList[5], encoding = "UTF-8"), readLines(trainingDataList[6], encoding = "UTF-8"))
fiTraining <- c(readLines(trainingDataList[7], encoding = "UTF-8"), readLines(trainingDataList[8], encoding = "UTF-8"), readLines(trainingDataList[9], encoding = "UTF-8"))
ruTraining <- c(readLines(trainingDataList[10], encoding = "UTF-8"), readLines(trainingDataList[11], encoding = "UTF-8"), readLines(trainingDataList[12], encoding = "UTF-8"))
write.table(deTraining, "l3000/de3000.txt", row.names = F, quote = F, col.names = F)
write.table(enTraining, "l3000/en3000.txt", row.names = F, quote = F, col.names = F)
write.table(fiTraining, "l3000/fi3000.txt", row.names = F, quote = F, col.names = F)
write.table(ruTraining, "l3000/ru3000.txt", row.names = F, quote = F, col.names = F)
list.files()
 [1] "deBlogsTraining.txt"   "deNewsTraining.txt"    "deTwitterTraining.txt"
 [4] "enBlogsTraining.txt"   "enNewsTraining.txt"    "enTwitterTraining.txt"
 [7] "fiBlogsTraining.txt"   "fiNewsTraining.txt"    "fiTwitterTraining.txt"
[10] "l3000"                 "ruBlogsTraining.txt"   "ruNewsTraining.txt"   
[13] "ruTwitterTraining.txt"

Data Cleaning

The cleaning process requires the {tm} package.

library(tm)
list3000 <- c("trainingData/l3000/de3000.txt", "trainingData/l3000/en3000.txt", "trainingData/l3000/fi3000.txt", "trainingData/l3000/ru3000.txt")
for(i in 1:4) {
   textCorpus <- Corpus(VectorSource(readLines(list3000[i])))
   textCorpus <- tm_map(textCorpus, content_transformer(function(x) 
      iconv(x, to = "UTF-8", sub = "byte")))
   textCorpus <- tm_map(textCorpus, content_transformer(tolower))
   textCorpus <- tm_map(textCorpus, content_transformer(removePunctuation),
                        preserve_intra_word_dashes = T)
   textCorpus <- tm_map(textCorpus, content_transformer(removeNumbers))
   removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)
   textCorpus <- tm_map(textCorpus, content_transformer(removeURL))
   textCorpus <- tm_map(textCorpus, stripWhitespace)
   textCorpus <- tm_map(textCorpus, PlainTextDocument)
   filename <- paste("finalData/", substring(list3000[i], 20, 21), "Final.RData", sep = "")
   saveRDS(textCorpus, file = filename)
}
(readRDS("finalData/enFinal.RData")[[1]]$content)[1:10]
 [1] "while i like the dynamic view formats i found that one must be very careful when experimenting with the various templates for example if you click on your blogger template choice on the blogger sidebar you will see a snippet view of what your blog looks like at the top of the page below you will see the series of dynamic view templates"                                                                                                                                                                                                                                                                                                                        
 [2] "i love you ive loved you from the first moment that the doctor surprised me with the news that you were growing within me and i got to see the pure unadulterated look of joy on dads face when i shared the news with him"                                                                                                                                                                                                                                                                                                                                                                                                                                               
 [3] "oh so you think im being harsh or perhaps i just simply donut see the magic or the significance of hrc ok try this on for size lets forget for a moment that the republican in this race is john mccain lets pretend for a moment that its jeb bush"                                                                                                                                                                                                                                                                                                                                                                                                                      
 [4] "luke has been running around halifax since and selling shoes since currently luke is a partner in aerobics first on quinpool road which turned this past may luke is the csf chief shoe fitter and community out reach guy he is also the president of the independent running retailers of canada"                                                                                                                                                                                                                                                                                                                                                                       
 [5] "have photos too upload of my lovely husband modelling his scarf and hat but will have to wait until i get home to upload them"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
 [6] "scenario play in warmahordes is not just a tournament standard in fact i would advice anyone already over the game rules learning hill with a decent points collection to play scenarios from the get go hereus why"                                                                                                                                                                                                                                                                                                                                                                                                                                                      
 [7] "a topic sentence outlines your your research paper here again is the handout on how to construct an outline e-mail me if you have questions about how to do this"                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         
 [8] "then last but not least is my nd sponsored child he is special to me because on the last day i was in the village i went around and met all the families that got a tank as part of our water tank project while meeting the family across the street george spotted a little boy and called me over to the drivers side of the van he told me that he had been trying to help this little boy whose mom dad had abandoned him and who was living with his grandmother he had already been raising money to buy him clothes and would really like to get him in school george asked if i thought i could get a sponsor for him and i told him that he was looking at her "
 [9] "although right now we do have an extended house guest sleeping on the living room floor he just moved to la so were trying to help him out by letting him stay here for awhile friends say im crazy but thats my life"                                                                                                                                                                                                                                                                                                                                                                                                                                                    
[10] "slytherins are ucunningu and uuse any means to achieve their endsu this is the house that voldemort was in also most of his death eaters are from slytherin i think slytherins are a bit like ravenclaws theyure definitely clever but usually in an evil way basically they figure out a way to get whatever they want which is mostly evil but sometimes not i feel a bit like a slytherin i think it would be cool to be an evil genius slytherins are sneaky too and i think thatus cool as well"                                                                                                                                                                     

Tokenization

Tokenization requires the {RWeka} package. We start with unigrams (1-grams).

library(RWeka)
library(dplyr)
finalList <- c("finalData/deFinal.RData", "finalData/enFinal.RData",
               "finalData/fiFinal.RData", "finalData/ruFinal.RData")
for(i in 1:4) {
   unigram <- NGramTokenizer(readRDS(finalList[i]),
                             Weka_control(min = 1, max = 1))
   unigram <- data.frame(table(unigram)) %>% arrange(desc(Freq))
   names(unigram)[1] <- "w1"
   unigram$w1 <- as.character(unigram$w1)
   saveRDS(unigram, file = paste("nGramData/", substring(finalList[i], 11, 12), "Unigram.RData", sep = ""))
}
library(ggplot2)
nGramList <- c("nGramData/deUnigram.RData", "nGramData/enUnigram.RData",
               "nGramData/fiUnigram.RData", "nGramData/ruUnigram.RData")

unigram <- readRDS(nGramList[1])
gUni <- ggplot(data = unigram[1:10, ], aes(x = reorder(w1, Freq), y = Freq))
gUni + geom_bar(stat = "identity", fill = "steelblue") +
   coord_flip() + labs(y = "Frequency") + labs(x = "Word") + 
   ggtitle("Words with Top Frequencies in Unigram") + theme_bw()

unigram <- readRDS(nGramList[2])
gUni <- ggplot(data = unigram[1:10, ], aes(x = reorder(w1, Freq), y = Freq))
gUni + geom_bar(stat = "identity", fill = "steelblue") +
   coord_flip() + labs(y = "Frequency") + labs(x = "Word") + 
   ggtitle("Words with Top Frequencies in Unigram") + theme_bw()

unigram <- readRDS(nGramList[3])
gUni <- ggplot(data = unigram[1:10, ], aes(x = reorder(w1, Freq), y = Freq))
gUni + geom_bar(stat = "identity", fill = "steelblue") +
   coord_flip() + labs(y = "Frequency") + labs(x = "Word") + 
   ggtitle("Words with Top Frequencies in Unigram") + theme_bw()

unigram <- readRDS(nGramList[4])
gUni <- ggplot(data = unigram[1:10, ], aes(x = reorder(w1, Freq), y = Freq))
gUni + geom_bar(stat = "identity", fill = "steelblue") +
   coord_flip() + labs(y = "Frequency") + labs(x = "Word") + 
   ggtitle("Words with Top Frequencies in Unigram") + theme_bw()

Unfortunately, the Russian text does not appear in its own alphabet, i.e. Cyrillic. Since predicting Russian words is not compulsory in this project, we ignore Russian from here on and work on the other three languages.

library(stringr)
finalList <- c("finalData/deFinal.RData", "finalData/enFinal.RData",
               "finalData/fiFinal.RData", "finalData/ruFinal.RData")
for(i in 1:4) {
   bigram <- NGramTokenizer(readRDS(finalList[i]),
                            Weka_control(min = 2, max = 2))
   bigram <- data.frame(table(bigram)) %>% arrange(desc(Freq))
   bigram[, 1] <- as.character(bigram[, 1])
   spWords <- str_split(bigram[, 1], " ")
   bigram <- data.frame(w1 = sapply(spWords, "[[", 1),
                        w2 = sapply(spWords, "[[", 2),
                        Freq = bigram$Freq)
   saveRDS(bigram, file = paste("nGramData/", substring(finalList[i], 11, 12), "Bigram.RData", sep = ""))
}
finalList <- c("finalData/deFinal.RData", "finalData/enFinal.RData",
               "finalData/fiFinal.RData", "finalData/ruFinal.RData")
for(i in 1:4) {
   trigram <- NGramTokenizer(readRDS(finalList[i]),
                             Weka_control(min = 3, max = 3))
   trigram <- data.frame(table(trigram)) %>% arrange(desc(Freq))
   trigram[, 1] <- as.character(trigram[, 1])
   spWords <- str_split(trigram[, 1], " ")
   trigram <- data.frame(w1 = sapply(spWords, "[[", 1),
                         w2 = sapply(spWords, "[[", 2),
                         w3 = sapply(spWords, "[[", 3),
                         Freq = trigram$Freq)
   saveRDS(trigram, file = paste("nGramData/", substring(finalList[i], 11, 12), "Trigram.RData", sep = ""))
}
finalList <- c("finalData/deFinal.RData", "finalData/enFinal.RData",
               "finalData/fiFinal.RData", "finalData/ruFinal.RData")
for(i in 1:4) {
   quadgram <- NGramTokenizer(readRDS(finalList[i]),
                              Weka_control(min = 4, max = 4))
   quadgram <- data.frame(table(quadgram)) %>% arrange(desc(Freq))
   quadgram[, 1] <- as.character(quadgram[, 1])
   spWords <- str_split(quadgram[, 1], " ")
   quadgram <- data.frame(w1 = sapply(spWords, "[[", 1),
                          w2 = sapply(spWords, "[[", 2),
                          w3 = sapply(spWords, "[[", 3),
                          w3 = sapply(spWords, "[[", 4),
                          Freq = quadgram$Freq)
   saveRDS(quadgram, file = paste("nGramData/", substring(finalList[i], 11, 12), "Quadgram.RData", sep = ""))
}

Let’s see the results.

head(readRDS("nGramData/enBigram.RData"))
   w1  w2 Freq
1  of the  355
2  in the  320
3  to the  151
4 for the  145
5  on the  143
6  at the  118
head(readRDS("nGramData/enTrigram.RData"))
     w1   w2  w3 Freq
1   one   of the   31
2     a  lot  of   25
3    as well  as   16
4    it  was   a   14
5     a  bit  of   13
6 going   to  be   13
head(readRDS("nGramData/enQuadgram.RData"))
     w1  w2     w3   w4 Freq
1    in the middle   of    6
2   the end     of  the    6
3     a bit     of    a    5
4    at the   same time    5
5 going  to     be    a    5
6    is one     of  the    5
head(readRDS("nGramData/deQuadgram.RData"))
     w1         w2            w3     w4 Freq
1   auf        den        ersten  blick    5
2    am     montag            in  paris    4
3   auf        der       anderen  seite    4
4 davon        aus          dass    die    4
5   der erziehungs wissenschafts    und    4
6   der  vereinten      nationen unesco    4
head(readRDS("nGramData/fiQuadgram.RData"))
        w1     w2      w3    w4 Freq
1   tiimin  voima      on  site    5
2    voima     on    site   kun    5
3      klo    klo     klo   klo    3
4     yhte mielte   siite  ette    3
5 asteessa   noin   tunti lisee    2
6       ei    ole koskaan ollut    2

Summary and Future

Summary

  • It is encouraging that the three languages using the Latin alphabet have passed this phase. I am not yet sure what else I need to do to make the Cyrillic alphabet work.
  • Some symbols, such as "-", survived the series of {tm} filtering steps, but this should not matter much going forward (a possible extra cleaning pass is sketched below).
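
If the stray dashes ever become a problem, one more cleaning pass could remove them. This is only a sketch of a possible extra step, not part of the pipeline above; removeStrayDash is a hypothetical helper.

library(tm)
# Possible extra cleaning pass (not applied above): remove stand-alone
# dashes while keeping intra-word dashes intact.
removeStrayDash <- function(x) gsub("(^| )-+( |$)", " ", x)
textCorpus <- readRDS("finalData/enFinal.RData")
textCorpus <- tm_map(textCorpus, content_transformer(removeStrayDash))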

Future

  • I will build an algorithm that predicts the next word, and I hope it will work for all three languages (a first sketch of a possible backoff lookup follows this list).
  • I will also build a Shiny app, which should not be hard.
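
As a first illustration of the prediction idea, the saved English n-gram tables can already drive a simple frequency-based backoff lookup. This is only a sketch under my own assumptions (the helper predictNext and its scoring are not part of the pipeline above), not the final algorithm.

library(dplyr)
# Sketch of a frequency-based backoff lookup over the saved English
# n-gram tables; predictNext is a hypothetical helper.
bigram  <- readRDS("nGramData/enBigram.RData")
trigram <- readRDS("nGramData/enTrigram.RData")
predictNext <- function(prevWords) {
   prevWords <- tolower(prevWords)
   n <- length(prevWords)
   if (n >= 2) {
      # try the trigram table first: match the last two words
      hit <- trigram %>% filter(w1 == prevWords[n - 1], w2 == prevWords[n]) %>%
         arrange(desc(Freq))
      if (nrow(hit) > 0) return(as.character(hit$w3[1]))
   }
   # back off to the bigram table: match the last word only
   hit <- bigram %>% filter(w1 == prevWords[n]) %>% arrange(desc(Freq))
   if (nrow(hit) > 0) return(as.character(hit$w2[1]))
   NA_character_
}
predictNext(c("one", "of"))   # likely "the", given the trigram counts above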