The data were made available by Coursera through its partner SwiftKey: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
The goal of this project is to understand the process of doing text analysis in R, and then to build a predictive model that predicts the next word a user is likely to type.
Predictive text modeling using NLP generally follows the same approach to data that we learned in the Data Science Specialization: the data is obtained, cleaned, and explored before moving to the predictive modeling stage (using training, validation, and test sets) and, finally, to text prediction itself.
Load the necessary packages and read in the data
# Read every line of a text file, marking the strings as UTF-8.
#
# The connection is opened in binary mode ("rb") for all three files. The
# original code did this only for the news file; in text mode some platforms
# stop reading at an embedded control character, silently truncating the data.
# `skipNul = TRUE` drops embedded NUL bytes instead of ending the line there.
#
# @param path Path to the text file.
# @return A character vector with one element per line.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", n = -1, skipNul = TRUE)
}

twitter <- read_corpus_lines("en_US.twitter.txt")
blogs <- read_corpus_lines("en_US.blogs.txt")
news <- read_corpus_lines("en_US.news.txt")
Set the random seed so that the sampling below is reproducible
# Seed the RNG so that the random sampling performed later is reproducible.
set.seed(2018)

# The figures in the trailing comments are the values shown in the rendered
# output of this report.

## Size of files, in megabytes (bytes / 1024^2)
x1 <- file.size("en_US.blogs.txt") / 1024^2    # ~200 MB
x2 <- file.size("en_US.news.txt") / 1024^2     # ~196 MB
x3 <- file.size("en_US.twitter.txt") / 1024^2  # ~159 MB

## Number of lines
y1 <- length(blogs)    # 899,288
y2 <- length(news)     # 1,010,242
y3 <- length(twitter)  # 2,360,148

## Number of words
z1 <- sum(stri_count_words(blogs))    # 37,546,246
z2 <- sum(stri_count_words(news))     # 34,762,395
z3 <- sum(stri_count_words(twitter))  # 30,093,410

## Length (in characters) of the longest line in each en_US data set
t1 <- max(nchar(blogs))    # 40,833
t2 <- max(nchar(news))     # 11,384
t3 <- max(nchar(twitter))  # 140

## Summary table: one row per source file
Info <- data.frame(
  files_Name = c("Blogs", "News", "Twitter"),
  files_Size = c(x1, x2, x3),
  lines_Count = c(y1, y2, y3),
  words_Count = c(z1, z2, z3),
  max_size_Line = c(t1, t2, t3)
)
Info
## files_Name files_Size lines_Count words_Count max_size_Line
## 1 Blogs 200.4242 899288 37546246 40833
## 2 News 196.2775 1010242 34762395 11384
## 3 Twitter 159.3641 2360148 30093410 140
# Remove non-ASCII characters.
# The lines were read as UTF-8, so UTF-8 (not latin1) is declared as the
# source encoding. Anything that cannot be represented in ASCII is replaced
# by the empty string (sub = ""), i.e. dropped.
twitter <- iconv(twitter, from = "UTF-8", to = "ASCII", sub = "")
blogs <- iconv(blogs, from = "UTF-8", to = "ASCII", sub = "")
news <- iconv(news, from = "UTF-8", to = "ASCII", sub = "")
Sample 10,000 lines from each data set
# Draw 10,000 lines at random from each source by sampling line indices.
# NOTE(review): the draw is made with replacement, so the same line can be
# picked more than once -- confirm that this is intended.
sub_twitter <- twitter[sample.int(length(twitter), 10000, replace = TRUE)]
sub_blogs <- blogs[sample.int(length(blogs), 10000, replace = TRUE)]
sub_news <- news[sample.int(length(news), 10000, replace = TRUE)]
Clean the corpus by removing punctuation, stopwords, and numbers, and by converting all characters to lower case
# Tokenizer that splits text into single words (n-grams with min = max = 1).
UnigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 1, max = 1))
}

# Term-document matrix of unigram counts.
# `corpus` is not defined in this part of the file; it must exist before this
# point.
corpusMatrix <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = UnigramTokenizer)
)
inspect(corpusMatrix)
## <<TermDocumentMatrix (terms: 51188, documents: 3)>>
## Non-/sparse entries: 75805/77759
## Sparsity : 51%
## Maximal term length: 78
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms blogs.txt news.txt twitter.txt
## can 1100 557 367
## get 764 456 513
## just 1108 563 664
## like 1060 490 516
## new 636 677 296
## now 673 351 381
## one 1386 775 299
## said 408 2522 58
## time 940 497 326
## will 1150 1082 403
# Total frequency of every unigram across all documents, highest first.
uniGram <- as.data.frame(as.matrix(corpusMatrix))
uniGramSorted <- sort(rowSums(uniGram), decreasing = TRUE)
uniGram_df <- data.frame(word = names(uniGramSorted), freq = uniGramSorted)

# Ten most frequent unigrams. head() is used instead of [1:10, ] so that a
# corpus with fewer than ten terms does not produce NA rows.
data <- head(uniGram_df, 10)

# Horizontal bar chart with the count printed next to each bar. The text layer
# inherits the data and the x/y mapping from ggplot().
# NOTE(review): bars follow ggplot2's default ordering of `word`, not the
# frequency -- consider ordering by `freq` if a ranked chart is wanted.
g <- ggplot(data = data, aes(x = word, y = freq))
g <- g +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("Frequency") +
  geom_text(aes(label = freq), hjust = -1, position = "identity")
g

# Word cloud of the 50 most frequent unigrams; the three lightest shades of
# the palette are dropped.
p <- brewer.pal(9, "BuPu")
p <- p[-(1:3)]
wordcloud(
  uniGram_df$word,
  uniGram_df$freq,
  max.words = 50,
  random.order = FALSE,
  colors = p
)
# Tokenizer that splits text into two-word sequences (min = max = 2).
BigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}

# Term-document matrix of bigram counts, then pruned with a sparsity
# threshold of 0.25.
# NOTE(review): with three documents this appears to keep only bigrams that
# occur in every document (the printed matrix has no sparse entries) --
# confirm this is the intended filter.
corpusMatrix2 <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = BigramTokenizer)
)
corpusMatrix2 <- removeSparseTerms(corpusMatrix2, sparse = 0.25)
inspect(corpusMatrix2)
## <<TermDocumentMatrix (terms: 2374, documents: 3)>>
## Non-/sparse entries: 7122/0
## Sparsity : 0%
## Maximal term length: 21
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms blogs.txt news.txt twitter.txt
## cant wait 22 1 84
## dont know 71 34 41
## feel like 58 17 29
## first time 39 45 19
## high school 34 85 13
## last week 49 70 12
## last year 48 119 5
## new york 49 117 9
## right now 50 34 73
## years ago 57 66 8
# Total frequency of every retained bigram across all documents, highest
# first, as a word/freq data frame.
biGram <- as.data.frame(as.matrix(corpusMatrix2))
biGramSorted <- sort(rowSums(biGram), decreasing = TRUE)
biGram_df <- data.frame(
  word = names(biGramSorted),
  freq = biGramSorted
)

# Print the ten most frequent bigrams as a table.
kable(
  head(biGram_df, n = 10),
  row.names = FALSE,
  caption = "Top Ten Bigrams"
)
| word | freq |
|---|---|
| new york | 175 |
| last year | 172 |
| right now | 157 |
| dont know | 146 |
| high school | 132 |
| last week | 131 |
| years ago | 131 |
| cant wait | 107 |
| feel like | 104 |
| first time | 103 |
# Word cloud of the 50 most frequent bigrams; the three lightest shades of
# the palette are dropped.
p <- brewer.pal(9, "Blues")
p <- p[-(1:3)]
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
wordcloud(
  biGram_df$word,
  biGram_df$freq,
  max.words = 50,
  random.order = FALSE,
  colors = p
)
# Tokenizer that splits text into three-word sequences (min = max = 3).
TrigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}

# Term-document matrix of trigram counts, then pruned with a sparsity
# threshold of 0.66.
# NOTE(review): with three documents this appears to keep trigrams that occur
# in at least two of them -- confirm this is the intended filter.
corpusMatrix3 <- TermDocumentMatrix(
  corpus,
  control = list(tokenize = TrigramTokenizer)
)
corpusMatrix3 <- removeSparseTerms(corpusMatrix3, sparse = 0.66)
inspect(corpusMatrix3)
## <<TermDocumentMatrix (terms: 929, documents: 3)>>
## Non-/sparse entries: 1909/878
## Sparsity : 32%
## Maximal term length: 28
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms blogs.txt news.txt twitter.txt
## cant wait see 5 0 19
## happy mothers day 1 0 15
## happy new year 6 0 8
## im looking forward 7 3 5
## let us know 4 1 10
## new york city 10 10 0
## new york times 8 10 0
## president barack obama 3 17 0
## two years ago 7 8 1
## world war ii 1 13 0
# Total frequency of every retained trigram across all documents, highest
# first, as a word/freq data frame.
triGram <- as.data.frame(as.matrix(corpusMatrix3))
triGramSorted <- sort(rowSums(triGram), decreasing = TRUE)
triGram_df <- data.frame(
  word = names(triGramSorted),
  freq = triGramSorted
)

# Print the ten most frequent trigrams as a table.
kable(
  head(triGram_df, n = 10),
  row.names = FALSE,
  caption = "Top Ten Trigrams"
)
| word | freq |
|---|---|
| cant wait see | 24 |
| new york city | 20 |
| president barack obama | 20 |
| new york times | 18 |
| happy mothers day | 16 |
| two years ago | 16 |
| im looking forward | 15 |
| let us know | 15 |
| happy new year | 14 |
| world war ii | 14 |
# Word cloud of the 50 most frequent trigrams; the three lightest shades of
# the palette are dropped.
p <- brewer.pal(9, "Greens")
p <- p[-(1:3)]
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
wordcloud(
  triGram_df$word,
  triGram_df$freq,
  max.words = 50,
  random.order = FALSE,
  colors = p
)
In the next step, I’ll dive deeper into building the predictive model, using the bigram and trigram analysis to find the most closely related words. After that, I’ll try to build an effective model and integrate it into a Shiny app to present the predictions and improve user interaction.