This is the Milestone Report for the Capstone Project of the Coursera Data Science Specialization, offered in collaboration with SwiftKey.
This project uses the SwiftKey dataset, which contains text collected from blogs, news sites, and Twitter, available from this site.
## Libraries
library(tm)
library(SnowballC)
library(stringi)
library(ggplot2)
library(wordcloud)
library(data.table)
library(dplyr)
library(kableExtra)
library(RColorBrewer)
library(RWeka)
# Load the three raw English corpora (blogs, news, twitter) into character
# vectors, one element per line of the file.
#
# NOTE(review): the working directory below is hard-coded and machine-specific;
# it is kept so the relative "data/..." paths keep resolving. Consider running
# the script from the project root instead.
setwd("C:/Users/dongj/Desktop/R_data_Desk/Capstone/Capstone_Project")

# The connections are opened in binary mode ("rb") rather than text mode ("r").
# In text mode on Windows an embedded Ctrl-Z (0x1A) byte can be treated as
# end-of-file, which silently truncates the read. readLines() still splits on
# LF / CRLF when the connection is binary, and skipNul = TRUE drops embedded
# NUL bytes.

# blogs
blogsFileName <- "data/en_US.blogs.txt"
con <- file(blogsFileName, open = "rb")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

# news
newsFileName <- "data/en_US.news.txt"
con <- file(newsFileName, open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

# twitter
twitterFileName <- "data/en_US.twitter.txt"
con <- file(twitterFileName, open = "rb")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

# Drop the (now closed) connection handle from the workspace.
rm(con)
This summary shows the number of lines, characters, and words in each file, along with the minimum, mean, and maximum number of words per line (WPL).
# Per-file summary statistics for the three raw corpora, in the fixed order
# blogs, news, twitter.
corpora <- list(blogs, news, twitter)

# Number of lines and total number of characters in each file.
numLines <- vapply(corpora, length, integer(1))
numChars <- vapply(corpora, function(x) sum(nchar(x)), integer(1))

# Total word count per file; the "Words" entry of stri_stats_latex() is
# selected by name rather than by position.
numWords <- vapply(
  corpora,
  function(x) stri_stats_latex(x)[["Words"]],
  integer(1)
)

# Words per line. The counts are computed once and reused for the summary
# below, because counting words over the full corpora is expensive.
wpl <- lapply(corpora, stri_count_words)
wplSummary <- vapply(
  wpl,
  function(counts) c(min(counts), mean(counts), max(counts)),
  numeric(3)
)
rownames(wplSummary) <- c("WPL.Min", "WPL.Mean", "WPL.Max")

# One row per file; the rounded words-per-line statistics are transposed so
# that they become the WPL.Min / WPL.Mean / WPL.Max columns.
summary <- data.frame(
  File = c(blogsFileName, newsFileName, twitterFileName),
  Lines = numLines,
  Characters = numChars,
  Words = numWords,
  t(round(wplSummary))
)

# Left-align the file name and right-align every numeric column; the alignment
# vector is sized from the table so it always matches the number of columns.
kable(
  summary,
  row.names = FALSE,
  align = c("l", rep("r", ncol(summary) - 1)),
  caption = ""
) %>%
  kable_styling(position = "left")
| File | Lines | Characters | Words | WPL.Min | WPL.Mean | WPL.Max |
|---|---|---|---|---|---|---|
| data/en_US.blogs.txt | 899288 | 206824505 | 37570839 | 0 | 42 | 6726 |
| data/en_US.news.txt | 77259 | 15639408 | 2651432 | 1 | 35 | 1123 |
| data/en_US.twitter.txt | 2360148 | 162096241 | 30451170 | 1 | 13 | 47 |
The dataset consists of three files (en_US.blogs, en_US.news, en_US.twitter). To make cleaning easier and faster, we draw a 1% random sample from each file and combine the three samples into a single file, en_US.combine.txt.
# Reproducible 1% sample of each corpus, reduced to ASCII and written out as a
# single combined file.
set.seed(1130)
sampleSize <- 0.01

# Small local helper: drop every character that cannot be represented in ASCII.
stripNonAscii <- function(txt) {
  iconv(txt, from = "latin1", to = "ASCII", sub = "")
}

# Draw without replacement, always in the order blogs -> news -> twitter so
# that the seed above reproduces the same sample.
sampleBlogs <- sample(blogs, size = length(blogs) * sampleSize, replace = FALSE)
sampleNews <- sample(news, size = length(news) * sampleSize, replace = FALSE)
sampleTwitter <- sample(
  twitter,
  size = length(twitter) * sampleSize,
  replace = FALSE
)

sampleBlogs <- stripNonAscii(sampleBlogs)
sampleNews <- stripNonAscii(sampleNews)
sampleTwitter <- stripNonAscii(sampleTwitter)

# Concatenate the three samples and persist them, one line per element.
comb <- c(sampleBlogs, sampleNews, sampleTwitter)
combined <- "data/en_US.combine.txt"
con <- file(combined, open = "w")
writeLines(comb, con)
close(con)
This part cleans and prepares the sampled text and builds a corpus, i.e., a collection of written text.
# Build a cleaned tm corpus from a character vector of text lines.
#
# Arguments:
#   x  Character vector with one document per element (defaults to `comb`).
#
# Returns:
#   A VCorpus whose documents have been lower-cased, had English stop words,
#   punctuation and numbers removed, been stemmed, and had runs of whitespace
#   collapsed.
build_corpus <- function(x = comb) {
  corpus <- VCorpus(VectorSource(x))

  # tolower() is a plain character function; wrapping it in
  # content_transformer() keeps every element a proper text document, so no
  # final PlainTextDocument conversion is needed.
  corpus <- tm_map(corpus, content_transformer(tolower))

  # Remove stop words BEFORE punctuation: the English stop-word list contains
  # contractions such as "don't", which no longer match once the apostrophe
  # has been stripped.
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stemDocument)

  # Collapse whitespace last, because the removal steps above leave gaps.
  corpus <- tm_map(corpus, stripWhitespace)

  corpus
}
combData <- build_corpus(comb)
Build a word cloud to represent word frequency graphically. Based on the word cloud, the word “just” has the highest frequency.
# Word cloud of the cleaned corpus: at most 100 words, each occurring at least
# 3 times, drawn from most to least frequent, with half of the words rotated.
cloudColors <- brewer.pal(8, "Dark2")
wordcloud(
  combData,
  scale = c(4, .5),
  min.freq = 3,
  max.words = 100,
  random.order = FALSE,
  rot.per = .5,
  colors = cloudColors,
  vfont = c("sans serif", "plain")
)
Now that the data has been cleaned, we convert it into N-gram form, a standard representation in Natural Language Processing (NLP). An N-gram model is the simplest model that assigns probabilities to sentences and sequences of words.
We use the RWeka package to construct functions that tokenize the dataset and build matrices of unigrams, bigrams, and trigrams.
The graphs below show the 20 most common unigrams, bigrams, and trigrams.
# Tokenizers that split text into n-grams of exactly n words (min = max = n).
ngramTokenizerFactory <- function(n) {
  force(n) # fix n now so each returned closure keeps its own n-gram size
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}

unigramTokenizer <- ngramTokenizerFactory(1)
bigramTokenizer <- ngramTokenizerFactory(2)
trigramTokenizer <- ngramTokenizerFactory(3)
# Unigram frequencies: build the term-document matrix, keep terms that occur
# at least 5 times, total their counts across documents, save the sorted
# table, and plot the 20 most frequent terms.
unigramMatrix <- TermDocumentMatrix(
  combData,
  control = list(tokenize = unigramTokenizer)
)
unigramterm <- findFreqTerms(unigramMatrix, lowfreq = 5)

# NOTE(review): as.matrix() creates a dense terms x documents matrix, which
# may need a lot of memory if the sample size is increased.
unigramfreq <- rowSums(as.matrix(unigramMatrix[unigramterm, ]))
unigramfreq <- data.frame(unigram = names(unigramfreq), frequency = unigramfreq)
unigramfreq <- unigramfreq[order(-unigramfreq$frequency), ]

# setDT() converts the data frame to a data.table by reference.
unigramMatrixlist <- setDT(unigramfreq)

# Written with saveRDS(), so it must be read back with readRDS() despite the
# .RData extension.
saveRDS(unigramMatrixlist, "data/unigram.RData")

# head() instead of [1:20, ] so that no NA rows are plotted when fewer than 20
# terms pass the frequency threshold.
ggplot(
  head(unigramMatrixlist, 20),
  aes(x = reorder(unigram, -frequency), y = frequency)
) +
  geom_bar(stat = "identity", fill = I("grey50")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(title = "Top 20 Most Common Unigram", x = "", y = "Frequency")
# Bigram frequencies: build the term-document matrix, keep bigrams that occur
# at least 3 times, total their counts across documents, save the sorted
# table, and plot the 20 most frequent bigrams.
bigramMatrix <- TermDocumentMatrix(
  combData,
  control = list(tokenize = bigramTokenizer)
)
bigramterm <- findFreqTerms(bigramMatrix, lowfreq = 3)

# NOTE(review): as.matrix() creates a dense terms x documents matrix, which
# may need a lot of memory if the sample size is increased.
bigramfreq <- rowSums(as.matrix(bigramMatrix[bigramterm, ]))
bigramfreq <- data.frame(bigram = names(bigramfreq), frequency = bigramfreq)
bigramfreq <- bigramfreq[order(-bigramfreq$frequency), ]

# setDT() converts the data frame to a data.table by reference.
bigramMatrixlist <- setDT(bigramfreq)

# Written with saveRDS(), so it must be read back with readRDS() despite the
# .RData extension.
saveRDS(bigramMatrixlist, "data/bigram.RData")

# head() instead of [1:20, ] so that no NA rows are plotted when fewer than 20
# bigrams pass the frequency threshold.
ggplot(
  head(bigramMatrixlist, 20),
  aes(x = reorder(bigram, -frequency), y = frequency)
) +
  geom_bar(stat = "identity", fill = I("grey50")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(title = "Top 20 Most Common Bigram", x = "", y = "Frequency")
# Trigram frequencies: build the term-document matrix, keep trigrams that
# occur at least 2 times, total their counts across documents, save the sorted
# table, and plot the 20 most frequent trigrams.
trigramMatrix <- TermDocumentMatrix(
  combData,
  control = list(tokenize = trigramTokenizer)
)
trigramterm <- findFreqTerms(trigramMatrix, lowfreq = 2)

# NOTE(review): as.matrix() creates a dense terms x documents matrix, which
# may need a lot of memory if the sample size is increased.
trigramfreq <- rowSums(as.matrix(trigramMatrix[trigramterm, ]))
trigramfreq <- data.frame(trigram = names(trigramfreq), frequency = trigramfreq)
trigramfreq <- trigramfreq[order(-trigramfreq$frequency), ]

# setDT() converts the data frame to a data.table by reference.
trigramMatrixlist <- setDT(trigramfreq)

# Written with saveRDS(), so it must be read back with readRDS() despite the
# .RData extension.
saveRDS(trigramMatrixlist, "data/trigram.RData")

# head() instead of [1:20, ] so that no NA rows are plotted when fewer than 20
# trigrams pass the frequency threshold.
ggplot(
  head(trigramMatrixlist, 20),
  aes(x = reorder(trigram, -frequency), y = frequency)
) +
  geom_bar(stat = "identity", fill = I("grey50")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(title = "Top 20 Most Common Trigram", x = "", y = "Frequency")
This milestone report is only an initial exploratory analysis in preparation for building a predictive model. Based on the findings from this exploratory analysis, we will build a predictive algorithm using the data we analyzed. The goal of the capstone is to build predictive models and deploy them in a Shiny application with a user interface.