Predict the next word a user is likely to type.
We are provided with three source data sets: text files of roughly 170 to 210 megabytes each, containing sentences as they appear in Twitter posts, blogs, and news articles. Given the large size of the files, we randomly select only about 15% of each file for further processing. The files are then combined into one large dataset of type tibble, which allows for faster processing of the large data. By combining the data sources we get a model that can better handle different writing styles. For later use we split the data into training and test sets (80/20).
For our profanity filter we download Google's banned-words list.
Since predicting every possible sentence combination is nearly impossible, we instead build n-grams: word sequences recorded together with the number of times each occurs in the dataset. These counts will let us compute probabilities later on. We create three types of n-grams: unigrams, bigrams, and trigrams.
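As a quick illustration (a minimal sketch only; the toyText corpus is made up and is not part of the pipeline), this is what bigram counting looks like on a tiny example:
library(dplyr)
library(tidytext)
# Made-up two-sentence corpus
toyText <- tibble(text = c("the cat sat on the mat", "the cat ran"))
# Bigrams: consecutive word pairs and how often each occurs
toyText %>% unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  count(bigram, sort = TRUE)
# "the cat" occurs twice; every other pair occurs once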
We then apply the Markov assumption to our n-grams: the probability of the next word is estimated from the conditional probability of that word given only the previous word or the previous two words.
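As a minimal sketch of how the counts turn into a prediction (predictNext is an illustrative helper, not part of the final application; triGram refers to the trigram count table built below), the Markov estimate of P(word3 | word1, word2) is the trigram count divided by the corresponding bigram count, so the most likely next word is simply the third word of the most frequent trigram that starts with the observed pair:
library(dplyr)
# Illustrative: top candidate next words for a two-word history, ranked by trigram count
predictNext <- function(w1, w2, triGramCounts) {
  triGramCounts %>%
    filter(word1 == w1, word2 == w2) %>%  # trigrams that start with the observed pair
    arrange(desc(n)) %>%                  # highest count = highest estimated probability
    slice_head(n = 3) %>%                 # keep the top three candidates
    pull(word3)
}
# Example usage once the triGram table exists: predictNext("one", "of", triGram)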
setwd("~/Desktop/capstone/data/final/en_US")
twtData <- readLines("en_US.twitter.txt", skipNul = TRUE)
newsData <- readLines("en_US.news.txt", skipNul = TRUE)
blogsData <- readLines("en_US.blogs.txt", skipNul = TRUE)
# Create table of file info
tibble(file = c("twitter", "news", "blogs"),
fileSize= c(file.size("en_US.twitter.txt"),file.size("en_US.news.txt"),
file.size("en_US.blogs.txt")),
lines = c(length(twtData),length(newsData),length(blogsData))) %>%
kable("html") %>% kable_styling(bootstrap_options = "striped", full_width = F, position = "left")
| file | fileSize | lines |
|---|---|---|
| twitter | 167105338 | 2360148 |
| news | 205811889 | 1010242 |
| blogs | 210160014 | 899288 |
# The files are too large. Randomly sample only about 15% of each as our working data
twtData <- twtData[(rbinom(length(twtData), 1, .15)==1)]
newsData <- newsData[(rbinom(length(newsData), 1, .15)==1)]
blogsData <- blogsData[(rbinom(length(blogsData), 1, .15)==1)]
# Combine data -- and remove individual datasets
twtData <- tibble(text = twtData)
newsData <- tibble(text = newsData)
blogsData <- tibble(text = blogsData)
combData <- bind_rows(twtData, newsData, blogsData)
rm(twtData)
rm(newsData)
rm(blogsData)
# Creating training and test datasets
train <- sample(nrow(combData), round(nrow(combData)*.80))
test <- combData[-train,]
combData <- combData[train,]
write.csv(test, "testData.csv", row.names = FALSE)
rm(test)
rm(train)
# Google banned-words list, used to filter profanity from the twitter, news, and blog text
googleBannedWords <- readLines("googleBannedWords.txt", skipNul = TRUE)
googleBannedWords <- tibble(word = googleBannedWords)
data("stop_words")
# unnest_tokens handles punctuation and converts the text to lower case
# anti_join removes profanity (Google banned words)
uniGram <- combData %>% unnest_tokens(word, text) %>%
anti_join(googleBannedWords, by = 'word')
# Remove any word containing characters other than the letters a-z; this drops numbers and foreign characters
uniGram <- uniGram[!(str_detect(uniGram$word, regex("[^a-z]+"))),]
## Count the number of rows and the occurrences of each word, then sort
uniGramRows <- nrow(uniGram)
uniGram <- uniGram %>% count(word, sort = TRUE)
## The mean is much higher than the median, indicating a long tail of rarely used words (outliers)
print(paste("Mean uniGram occurrence", mean(uniGram$n)))
## [1] "Mean uniGram occurrence 69.8438029047492"
print(paste("Median uniGram occurrence", median(uniGram$n)))
## [1] "Median uniGram occurrence 2"
# Plot the top 10 words by number of occurrences
uniGram %>% top_n(10) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) + geom_col(fill="blue") +
labs(x ="word", title = "TOP 10") + coord_flip()
## Selecting by n
# The words at the bottom of the list are very likely either rarely used or misspelled, so we aim to cover as much of the total word occurrences as possible while reducing memory usage. From the plot below, roughly 15% of the unique words account for about 95% of all word occurrences.
x <- c(.90, .91, .92, .93, .94, .95, .96, .97, .98, .99)
y <- sapply(x, function(x) nrow(uniGram %>% mutate(wordFreq = cumsum(n / sum(n))) %>% filter(wordFreq<=x)) / nrow(uniGram))
qplot(x,y, xlab = "% of word occurrence", ylab = "% of words in dataset", main = "uniGram Pruning")
## Let's keep 98% of the total word occurrences; this maximizes coverage while minimizing the data held in memory
uniGram <- uniGram %>% mutate(wordFreq = cumsum(n/sum(n))) %>% filter(wordFreq <= .98) %>% select(-wordFreq)
# As before, unnest_tokens is used to separate and clean the data; in this case it creates word pairs (bigrams)
biGram <- combData %>% unnest_tokens(bigram, text, token = "ngrams", n=2)
biGram <- biGram %>% separate(bigram, c("word1", "word2"), sep = " ")
# Remove profanity
biGram <- biGram %>%
filter(!word1 %in% googleBannedWords$word) %>%
filter(!word2 %in% googleBannedWords$word)
# Remove numbers and foreign text
biGram <- biGram[!(str_detect(biGram$word1, regex("[^a-z]+"))),]
biGram <- biGram[!(str_detect(biGram$word2, regex("[^a-z]+"))),]
# Count each word pair
biGram <- biGram %>% count(word1, word2, sort = TRUE)
# Mean and Median
print(paste("Mean biGram occurrence", mean(biGram$n)))
## [1] "Mean biGram occurrence 3.80730320139965"
print(paste("Median biGram occurrence", median(biGram$n)))
## [1] "Median biGram occurrence 1"
# Similar analysis as above: we want to keep fewer word pairs while striking a good balance between coverage and performance
x <- c(.60, .65, .70, .75, .80, .85, .90, .95, .98, .99)
y <- sapply(x, function(x) {nrow(biGram %>% mutate(wordFreq = cumsum(n / sum(n))) %>% filter(wordFreq<=x)) / nrow(biGram)})
qplot(x,y, xlab = "% of word pair occurrence", ylab = "% of word pairs", main = "biGram Pruning")
## Let's keep 95% of the occurrences
biGram <- biGram %>% mutate(wordFreq = cumsum(n/sum(n))) %>% filter(wordFreq <= .95) %>% select(-wordFreq)
# Create trigrams and clean them as before: remove profanity and non-letter tokens, then count
triGram <- combData %>% unnest_tokens(trigram, text, token = "ngrams", n=3)
triGram <- triGram %>% separate(trigram, c("word1", "word2", "word3"), sep = " ")
triGram <- triGram %>%
filter(!word1 %in% googleBannedWords$word) %>%
filter(!word2 %in% googleBannedWords$word) %>%
filter(!word3 %in% googleBannedWords$word)
triGram <- triGram[!(str_detect(triGram$word1, regex("[^a-z]+"))),]
triGram <- triGram[!(str_detect(triGram$word2, regex("[^a-z]+"))),]
triGram <- triGram[!(str_detect(triGram$word3, regex("[^a-z]+"))),]
triGram <- triGram %>% count(word1, word2, word3, sort = TRUE)
# Mean and Median
print(paste("Mean triGram occurrence", mean(triGram$n)))
## [1] "Mean triGram occurrence 1.48905726215293"
print(paste("Median triGram occurrence", median(triGram$n)))
## [1] "Median triGram occurrence 1"
# Similar analysis, just as we did for the uni-gram and bi-gram
x <- c(.20, .30, .40, .50, .60, .70, .80, .90, .95, .99)
y <- sapply(x, function(x) {nrow(triGram %>% mutate(wordFreq = cumsum(n / sum(n))) %>% filter(wordFreq<=x)) / nrow(triGram)})
qplot(x,y, xlab = "% of word trigram occurrence", ylab = "% of trigram words", main = "triGram Pruning")
## Let's keep 80% of the occurrences
triGram <- triGram %>% mutate(wordFreq = cumsum(n/sum(n))) %>% filter(wordFreq <= .80) %>% select(-wordFreq)
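The summary output below comes from a chunk that is not shown; it was presumably produced by something along these lines (a sketch using object.size and nrow):
print(paste("Combined n-grams take storage of (bytes)",
            object.size(uniGram) + object.size(biGram) + object.size(triGram)))
print(paste("Number of unique words in the model", nrow(uniGram)))
print(paste("Number of word pairs in the model", nrow(biGram)))
print(paste("Number of triple word combinations in the model", nrow(triGram)))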
## [1] "Combined n-grams take storage of (bytes) 226801936"
## [1] "Number of unique word in the model 38269"
## [1] "Number of word pairs in the model 2395269"
## [1] "Number of triple word combinations in the model 5092833"