This is a milestone report that explains my exploratory analysis and my goals for the eventual app and prediction algorithm.
The original raw dataset can be found at https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.
list.files()
[1] "DataScienceCapstone.Rproj" "milestoneReport.html"
[3] "milestoneReport.Rmd" "milestoneReport_cache"
[5] "rawdata" "swearWords.csv"
[7] "trainingData"
setwd("rawdata")
list.files()
[1] "de_DE.blogs.txt" "de_DE.news.txt" "de_DE.twitter.txt"
[4] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
[7] "fi_FI.blogs.txt" "fi_FI.news.txt" "fi_FI.twitter.txt"
[10] "ru_RU.blogs.txt" "ru_RU.news.txt" "ru_RU.twitter.txt"
range(file.size(list.files()) / 1024 ^ 2)
[1] 24.15766 200.42421
files <- list.files()
data.frame(file = c("max file", "min file"),
           entries = c(length(readLines(files[which.max(file.size(files))], encoding = "UTF-8")),
                       length(readLines(files[which.min(file.size(files))], encoding = "UTF-8"))))
      file entries
1 max file  899288
2 min file  285214
The smallest file is about 24 MB with 285,214 entries, while the largest is about 200 MB with 899,288 entries.
To keep this app a place of love and kindness, we need to filter out profane words. The word list can be found at http://www.bannedwordlist.com/. Filtering is more effective if we cover three case variants of each profane word. For instance, a made-up profane word abcdefg has at least three variants: abcdefg, Abcdefg and ABCDEFG.
swearWords <- readLines("swearWords.csv")
library(stringr)
swearWords <- c(swearWords, str_to_title(swearWords), str_to_upper(swearWords))
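As a quick sanity check, here is a toy illustration of the flagging logic, using the made-up word from above (which, of course, is not in the real list):
testVariants <- c("abcdefg", "Abcdefg", "ABCDEFG")   # hypothetical variants
entries <- c("he shouted Abcdefg twice", "a perfectly clean entry")
flagged <- Reduce(`|`, lapply(testVariants, function(w) grepl(w, entries, fixed = TRUE)))
entries[!flagged]   # only the clean entry survives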
The original raw files are very large. We do not need that much data to build our prediction models, and loading it all could exhaust the computer's memory. So it is natural to reduce each dataset to 1000 entries. The English files and the other files are processed separately, because only the English files go through profanity filtering. Entries that include profane words make up far less than 50% of all entries, so for English we first sample 2000 entries and then filter them.
setwd("rawdata")
exEnList <- list.files()[-(4:6)]
for(i in 1:length(exEnList)) {
tempchar <- readLines(exEnList[i], encoding = "UTF-8")
tempchar <- tempchar[sample(1:length(tempchar), 1000)]
name1 <- substring(exEnList[i], 1, 2)
name2 <- str_to_title(substring(exEnList[i], 7, nchar(exEnList[i]) - 4))
outputname <- paste("../trainingData/", name1, name2, "Training.txt", sep = "")
write.table(tempchar, outputname, row.names = F, quote = F, col.names = F)
}
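One caveat: sample() draws at random, so the training files change between runs. Setting a seed once before the sampling loops (the value 1234 here is arbitrary) would make the subsets reproducible:
set.seed(1234)   # arbitrary fixed seed, for reproducible sampling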
enList <- list.files()[4:6]   # the three English files
for(i in 1:length(enList)) {
    tempchar <- readLines(enList[i], encoding = "UTF-8")
    tempchar <- tempchar[sample(1:length(tempchar), 2000)]   # oversample before filtering
    for(j in 1:length(swearWords)) {
        # fixed = TRUE matches the word literally, not as a regular expression
        tempchar <- tempchar[!grepl(swearWords[j], tempchar, fixed = TRUE)]
    }
    tempchar <- tempchar[1:1000]   # keep 1000 of the remaining clean entries
    name1 <- substring(enList[i], 1, 2)
    name2 <- str_to_title(substring(enList[i], 7, nchar(enList[i]) - 4))
    outputname <- paste("../trainingData/", name1, name2, "Training.txt", sep = "")
    write.table(tempchar, outputname, row.names = F, quote = F, col.names = F)
}
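A side note on the inner loop: it scans the whole sample once per swear word. An equivalent single-pass variant collapses the list into one regular expression; this sketch assumes the word list contains no regex metacharacters:
pattern <- paste(swearWords, collapse = "|")      # one combined pattern
tempchar <- tempchar[!grepl(pattern, tempchar)]   # a single pass over the entries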
Next, merge the three datasets of each language into one.
setwd("trainingData")
trainingDataList <- list.files()[grep(".txt", list.files())]
deTraining <- c(readLines(trainingDataList[1], encoding = "UTF-8"), readLines(trainingDataList[2], encoding = "UTF-8"), readLines(trainingDataList[3], encoding = "UTF-8"))
enTraining <- c(readLines(trainingDataList[4], encoding = "UTF-8"), readLines(trainingDataList[5], encoding = "UTF-8"), readLines(trainingDataList[6], encoding = "UTF-8"))
fiTraining <- c(readLines(trainingDataList[7], encoding = "UTF-8"), readLines(trainingDataList[8], encoding = "UTF-8"), readLines(trainingDataList[9], encoding = "UTF-8"))
ruTraining <- c(readLines(trainingDataList[10], encoding = "UTF-8"), readLines(trainingDataList[11], encoding = "UTF-8"), readLines(trainingDataList[12], encoding = "UTF-8"))
write.table(deTraining, "l3000/de3000.txt", row.names = F, quote = F, col.names = F)
write.table(enTraining, "l3000/en3000.txt", row.names = F, quote = F, col.names = F)
write.table(fiTraining, "l3000/fi3000.txt", row.names = F, quote = F, col.names = F)
write.table(ruTraining, "l3000/ru3000.txt", row.names = F, quote = F, col.names = F)
list.files()
[1] "deBlogsTraining.txt" "deNewsTraining.txt" "deTwitterTraining.txt"
[4] "enBlogsTraining.txt" "enNewsTraining.txt" "enTwitterTraining.txt"
[7] "fiBlogsTraining.txt" "fiNewsTraining.txt" "fiTwitterTraining.txt"
[10] "l3000" "ruBlogsTraining.txt" "ruNewsTraining.txt"
[13] "ruTwitterTraining.txt"
The cleaning process requires the {tm} package.
library(tm)
list3000 <- c("trainingData/l3000/de3000.txt", "trainingData/l3000/en3000.txt",
              "trainingData/l3000/fi3000.txt", "trainingData/l3000/ru3000.txt")
dir.create("finalData", showWarnings = FALSE)   # output folder for the cleaned corpora
for(i in 1:4) {
    textCorpus <- Corpus(VectorSource(readLines(list3000[i])))
    textCorpus <- tm_map(textCorpus, content_transformer(function(x)
        iconv(x, to = "UTF-8", sub = "byte")))   # normalize the encoding
    textCorpus <- tm_map(textCorpus, content_transformer(tolower))
    textCorpus <- tm_map(textCorpus, content_transformer(removePunctuation),
                         preserve_intra_word_dashes = T)
    textCorpus <- tm_map(textCorpus, content_transformer(removeNumbers))
    removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)   # strip URLs
    textCorpus <- tm_map(textCorpus, content_transformer(removeURL))
    textCorpus <- tm_map(textCorpus, stripWhitespace)
    textCorpus <- tm_map(textCorpus, PlainTextDocument)
    filename <- paste("finalData/", substring(list3000[i], 20, 21), "Final.RData", sep = "")
    saveRDS(textCorpus, file = filename)   # written with saveRDS, so read back with readRDS()
}
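To make the effect of each transformation visible, here is the same pipeline applied to a single made-up string outside the corpus (tm's transformations also work directly on character vectors):
x <- "Visit http://example.com for 2 GREAT well-known tips!!"
x <- tolower(x)
x <- removePunctuation(x, preserve_intra_word_dashes = TRUE)
x <- removeNumbers(x)
x <- gsub("http[[:alnum:]]*", "", x)   # same regex as removeURL above
x <- stripWhitespace(x)
x   # "visit for great well-known tips"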
(readRDS("finalData/enFinal.RData")[[1]]$content)[1:10]
[1] "while i like the dynamic view formats i found that one must be very careful when experimenting with the various templates for example if you click on your blogger template choice on the blogger sidebar you will see a snippet view of what your blog looks like at the top of the page below you will see the series of dynamic view templates"
[2] "i love you ive loved you from the first moment that the doctor surprised me with the news that you were growing within me and i got to see the pure unadulterated look of joy on dads face when i shared the news with him"
[3] "oh so you think im being harsh or perhaps i just simply donut see the magic or the significance of hrc ok try this on for size lets forget for a moment that the republican in this race is john mccain lets pretend for a moment that its jeb bush"
[4] "luke has been running around halifax since and selling shoes since currently luke is a partner in aerobics first on quinpool road which turned this past may luke is the csf chief shoe fitter and community out reach guy he is also the president of the independent running retailers of canada"
[5] "have photos too upload of my lovely husband modelling his scarf and hat but will have to wait until i get home to upload them"
[6] "scenario play in warmahordes is not just a tournament standard in fact i would advice anyone already over the game rules learning hill with a decent points collection to play scenarios from the get go hereus why"
[7] "a topic sentence outlines your your research paper here again is the handout on how to construct an outline e-mail me if you have questions about how to do this"
[8] "then last but not least is my nd sponsored child he is special to me because on the last day i was in the village i went around and met all the families that got a tank as part of our water tank project while meeting the family across the street george spotted a little boy and called me over to the drivers side of the van he told me that he had been trying to help this little boy whose mom dad had abandoned him and who was living with his grandmother he had already been raising money to buy him clothes and would really like to get him in school george asked if i thought i could get a sponsor for him and i told him that he was looking at her "
[9] "although right now we do have an extended house guest sleeping on the living room floor he just moved to la so were trying to help him out by letting him stay here for awhile friends say im crazy but thats my life"
[10] "slytherins are ucunningu and uuse any means to achieve their endsu this is the house that voldemort was in also most of his death eaters are from slytherin i think slytherins are a bit like ravenclaws theyure definitely clever but usually in an evil way basically they figure out a way to get whatever they want which is mostly evil but sometimes not i feel a bit like a slytherin i think it would be cool to be an evil genius slytherins are sneaky too and i think thatus cool as well"
Tokenization requires the {RWeka} package. Unigrams come first.
library(RWeka)
library(dplyr)
finalList <- c("finalData/deFinal.RData", "finalData/enFinal.RData",
               "finalData/fiFinal.RData", "finalData/ruFinal.RData")
dir.create("nGramData", showWarnings = FALSE)   # output folder for the n-gram tables
for(i in 1:4) {
    unigram <- NGramTokenizer(readRDS(finalList[i]),
                              Weka_control(min = 1, max = 1))
    unigram <- data.frame(table(unigram)) %>% arrange(desc(Freq))   # count and sort by frequency
    names(unigram)[1] <- "w1"
    unigram$w1 <- as.character(unigram$w1)
    saveRDS(unigram, file = paste("nGramData/", substring(finalList[i], 11, 12), "Unigram.RData", sep = ""))
}
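For intuition, the tokenizer simply returns every n-gram of the requested length, in order. On a toy string:
NGramTokenizer("to be or not to be", Weka_control(min = 1, max = 1))
# [1] "to"  "be"  "or"  "not" "to"  "be"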
library(ggplot2)
nGramList <- c("nGramData/deUnigram.RData", "nGramData/enUnigram.RData",
               "nGramData/fiUnigram.RData", "nGramData/ruUnigram.RData")
langNames <- c("German", "English", "Finnish", "Russian")
for(i in 1:4) {
    unigram <- readRDS(nGramList[i])
    gUni <- ggplot(data = unigram[1:10, ], aes(x = reorder(w1, Freq), y = Freq)) +
        geom_bar(stat = "identity", fill = "steelblue") +
        coord_flip() + labs(y = "Frequency") + labs(x = "Word") +
        ggtitle(paste("Words with Top Frequencies in Unigram:", langNames[i])) +
        theme_bw()
    print(gUni)   # an explicit print() is required inside a loop
}
Unfortunately, the Russian text does not display in its own script, the Cyrillic alphabet. Since predicting Russian words is not compulsory in this project, we will not examine the Russian n-grams any further and will focus on the other three languages.
library(stringr)
finalList <- c("finalData/deFinal.RData", "finalData/enFinal.RData",
               "finalData/fiFinal.RData", "finalData/ruFinal.RData")
for(i in 1:4) {
    bigram <- NGramTokenizer(readRDS(finalList[i]),
                             Weka_control(min = 2, max = 2))
    bigram <- data.frame(table(bigram)) %>% arrange(desc(Freq))
    bigram[, 1] <- as.character(bigram[, 1])
    spWords <- str_split(bigram[, 1], " ")   # split "w1 w2" into its two words
    bigram <- data.frame(w1 = sapply(spWords, "[[", 1),
                         w2 = sapply(spWords, "[[", 2),
                         Freq = bigram$Freq,
                         stringsAsFactors = FALSE)   # keep the words as characters
    saveRDS(bigram, file = paste("nGramData/", substring(finalList[i], 11, 12), "Bigram.RData", sep = ""))
}
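For clarity, str_split returns a list with one length-2 character vector per bigram, and the sapply calls then pull out the first and second words:
str_split(c("of the", "in the"), " ")
# returns a list: [[1]] "of" "the"   [[2]] "in" "the"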
finalList <- c("finalData/deFinal.RData", "finalData/enFinal.RData",
               "finalData/fiFinal.RData", "finalData/ruFinal.RData")
for(i in 1:4) {
    trigram <- NGramTokenizer(readRDS(finalList[i]),
                              Weka_control(min = 3, max = 3))
    trigram <- data.frame(table(trigram)) %>% arrange(desc(Freq))
    trigram[, 1] <- as.character(trigram[, 1])
    spWords <- str_split(trigram[, 1], " ")
    trigram <- data.frame(w1 = sapply(spWords, "[[", 1),
                          w2 = sapply(spWords, "[[", 2),
                          w3 = sapply(spWords, "[[", 3),
                          Freq = trigram$Freq,
                          stringsAsFactors = FALSE)
    saveRDS(trigram, file = paste("nGramData/", substring(finalList[i], 11, 12), "Trigram.RData", sep = ""))
}
finalList <- c("finalData/deFinal.RData", "finalData/enFinal.RData",
               "finalData/fiFinal.RData", "finalData/ruFinal.RData")
for(i in 1:4) {
    quadgram <- NGramTokenizer(readRDS(finalList[i]),
                               Weka_control(min = 4, max = 4))
    quadgram <- data.frame(table(quadgram)) %>% arrange(desc(Freq))
    quadgram[, 1] <- as.character(quadgram[, 1])
    spWords <- str_split(quadgram[, 1], " ")
    quadgram <- data.frame(w1 = sapply(spWords, "[[", 1),
                           w2 = sapply(spWords, "[[", 2),
                           w3 = sapply(spWords, "[[", 3),
                           w4 = sapply(spWords, "[[", 4),
                           Freq = quadgram$Freq,
                           stringsAsFactors = FALSE)
    saveRDS(quadgram, file = paste("nGramData/", substring(finalList[i], 11, 12), "Quadgram.RData", sep = ""))
}
Let’s see the results.
head(readRDS("nGramData/enBigram.RData"))
w1 w2 Freq
1 of the 355
2 in the 320
3 to the 151
4 for the 145
5 on the 143
6 at the 118
head(readRDS("nGramData/enTrigram.RData"))
w1 w2 w3 Freq
1 one of the 31
2 a lot of 25
3 as well as 16
4 it was a 14
5 a bit of 13
6 going to be 13
head(readRDS("nGramData/enQuadgram.RData"))
w1 w2 w3 w4 Freq
1 in the middle of 6
2 the end of the 6
3 a bit of a 5
4 at the same time 5
5 going to be a 5
6 is one of the 5
head(readRDS("nGramData/deQuadgram.RData"))
w1 w2 w3 w4 Freq
1 auf den ersten blick 5
2 am montag in paris 4
3 auf der anderen seite 4
4 davon aus dass die 4
5 der erziehungs wissenschafts und 4
6 der vereinten nationen unesco 4
head(readRDS("nGramData/fiQuadgram.RData"))
w1 w2 w3 w4 Freq
1 tiimin voima on site 5
2 voima on site kun 5
3 klo klo klo klo 3
4 yhte mielte siite ette 3
5 asteessa noin tunti lisee 2
6 ei ole koskaan ollut 2
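These frequency tables are the building blocks of the eventual prediction algorithm. As a preview, here is a minimal back-off sketch (the function name and the pick-the-most-frequent rule are illustrative, not the final algorithm): look the last words of a phrase up in the quadgram table, and fall back to shorter n-grams when nothing matches.
predictNext <- function(phrase) {
    words <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 3)
    quad <- readRDS("nGramData/enQuadgram.RData")
    tri <- readRDS("nGramData/enTrigram.RData")
    bi <- readRDS("nGramData/enBigram.RData")
    uni <- readRDS("nGramData/enUnigram.RData")
    n <- length(words)
    if (n >= 3) {
        hit <- quad[quad$w1 == words[n - 2] & quad$w2 == words[n - 1] & quad$w3 == words[n], ]
        if (nrow(hit) > 0) return(hit$w4[1])   # tables are sorted by Freq, so row 1 is the best match
    }
    if (n >= 2) {
        hit <- tri[tri$w1 == words[n - 1] & tri$w2 == words[n], ]
        if (nrow(hit) > 0) return(hit$w3[1])
    }
    hit <- bi[bi$w1 == words[n], ]
    if (nrow(hit) > 0) return(hit$w2[1])
    uni$w1[1]   # last resort: the most frequent word overall
}
predictNext("one of")   # given the trigram table above, this returns "the"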