The goal of this project is to demonstrate an understanding of three data sets (news, blogs, and tweets) through an exploratory analysis of the data. Statistics on N-grams are included. The original data was downloaded using the link provided in the lecture.
The following libraries are used in this report and analysis:
library(NLP)
library(tm) # Text Mining - Requires NLP and slam
library(RColorBrewer)
library(wordcloud)
Read in the raw data sets (each is a text file in English) and keep a 0.1% random sample of the lines from each.
# ---- Load and sample the raw data ----
# Directory holding the English-language text files.
basePath <- 'C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US'
# list.files() returns names in alphabetical order, so the indices used below
# are 1 = blogs, 2 = news, 3 = twitter.
flist <- list.files(path = basePath, recursive = TRUE, pattern = "en_.*\\.txt$")
fileNames <- file.path(basePath, flist)
samplePerc <- 0.001 # Sample 0.1% of the entire data sets

# Read all lines of `path` and return a random sample of them.
#
# Args:
#   path: path of the text file to read.
#   frac: fraction of the lines to keep (0.001 keeps 0.1%).
# Returns:
#   A character vector of round(frac * number_of_lines) lines, drawn without
#   replacement.
#
# The file is opened in binary mode with skipNul = TRUE. Reading these files
# in text mode produced "incomplete final line" (news) and "embedded nul"
# (twitter) warnings, and the summary table in this report shows only 77,259
# lines for the 196 MB news file, which suggests the text-mode read stopped
# early. A binary-mode read does not stop at an embedded end-of-file byte,
# and skipNul = TRUE drops nul bytes instead of truncating the line at them.
readSample <- function(path, frac) {
  con <- file(path, open = "rb")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  txt <- readLines(con, skipNul = TRUE)
  sample(txt, round(frac * length(txt)), replace = FALSE)
}

# Only the samples are kept; each full file goes out of scope when
# readSample() returns.
blogss <- readSample(fileNames[1], samplePerc)
newss <- readSample(fileNames[2], samplePerc)
twitters <- readSample(fileNames[3], samplePerc)

# One tm document per sampled line.
blogssCorpus <- VCorpus(VectorSource(blogss))
newssCorpus <- VCorpus(VectorSource(newss))
twittersCorpus <- VCorpus(VectorSource(twitters))
# Collect summary statistics for every raw data file. Each element of `l` is a
# character vector holding, in order: the file path, its size in MB, the
# number of lines, the length in characters of the longest line, and the
# number of whitespace-separated words.
l <- lapply(file.path(basePath, flist), function(f) {
  # $size yields a plain number; file.info(f)[1] is a one-column data.frame,
  # which made c() below build a list instead of a character vector.
  fsize <- file.info(f)$size / 1024 / 1024
  # Binary mode + skipNul: the text-mode read warned about an incomplete final
  # line and embedded nuls, and appears to have stopped early on the news file.
  con <- file(f, open = "rb")
  on.exit(close(con), add = TRUE) # close even if reading fails
  lines <- readLines(con, skipNul = TRUE)
  # nchar() is vectorised; allowNA = TRUE gives NA instead of an error for a
  # line that is not valid in the current encoding.
  nchars <- nchar(lines, allowNA = TRUE)
  # Length of the longest line. The previous which.max() returned the
  # position of that line in the file rather than its length.
  maxchars <- if (length(lines) > 0) max(nchars, na.rm = TRUE) else 0L
  nwords <- sum(vapply(strsplit(lines, "\\s+"), length, integer(1)))
  c(f, format(round(fsize, 2), nsmall = 2), length(lines), maxchars, nwords)
})
## Warning in readLines(con): incomplete final line found on 'C:/Users/AID-
## FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/
## en_US.news.txt'
## Warning in readLines(con): line 167155 appears to contain an embedded nul
## Warning in readLines(con): line 268547 appears to contain an embedded nul
## Warning in readLines(con): line 1274086 appears to contain an embedded nul
## Warning in readLines(con): line 1759032 appears to contain an embedded nul
# Arrange the per-file statistics as a table with one row per file. unlist()
# flattens `l` file by file, so the matrix is filled row-wise. TRUE is spelled
# out because `T` is an ordinary variable that can be reassigned.
df <- data.frame(matrix(unlist(l), nrow = length(l), byrow = TRUE))
colnames(df) <- c("file", "size(MB)", "num.of.lines", "longest.line", "num.of.words")
print(df)
## file
## 1 C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/en_US.blogs.txt
## 2 C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/en_US.news.txt
## 3 C:/Users/AID-FLOATER-2015-02/Desktop/DataScience/Capstone/Coursera-SwiftKey/final/en_US/en_US.twitter.txt
## size(MB) num.of.lines longest.line num.of.words
## 1 200.42 899288 483415 37334441
## 2 196.28 77259 14556 2643972
## 3 159.36 2360148 1484357 30373792
The following function removes punctuation, numbers, extra whitespace, and stop words, and converts the text to lower case.
# Normalise a tm corpus before n-gram counting.
#
# Args:
#   x: a tm corpus (in this report, a VCorpus with one document per line).
# Returns:
#   The corpus lower-cased, with English stop words, punctuation and numbers
#   removed and runs of whitespace collapsed.
cleanCorpus <- function(x) {
  # Lower-case first so the lower-case stop-word list matches capitalised
  # words. Plain tm_map(x, tolower) fails; content_transformer() is required:
  # http://stackoverflow.com/questions/24191728/documenttermmatrix-error-on-corpus-argument
  x <- tm_map(x, content_transformer(tolower))
  # Remove stop words while apostrophes are still present. The list contains
  # entries such as "don't" and "i'm"; stripping punctuation first would turn
  # them into "dont" / "im", which no longer match and would stay in the text.
  # Stop words for tm: https://github.com/arc12/Text-Mining-Weak-Signals/wiki/Standard-set-of-english-stopwords
  x <- tm_map(x, removeWords, stopwords("english"))
  x <- tm_map(x, removePunctuation)
  x <- tm_map(x, removeNumbers)
  # Collapse whitespace last so the gaps left by the removals are cleaned up.
  x <- tm_map(x, stripWhitespace)
  # Kept from the original pipeline.
  # NOTE(review): confirm this is still needed with the installed tm version.
  x <- tm_map(x, PlainTextDocument)
  #x <- tm_map(x, stemDocument) # This takes too long so we will skip it.
  x
}
blogssCorpus <- cleanCorpus(blogssCorpus)
newssCorpus <- cleanCorpus(newssCorpus)
twittersCorpus <- cleanCorpus(twittersCorpus)
Create some 1-gram, 2-gram, and 3-gram tokenizers
# Tokenizer for TermDocumentMatrix(): returns the 1-grams of document `x` as
# an unnamed character vector.
UnigramTokenizer <- function(x) {
  grams <- ngrams(words(x), 1)
  # Join the words of each n-gram with single spaces.
  joined <- lapply(grams, paste, collapse = " ")
  unlist(joined, use.names = FALSE)
}
# Tokenizer for TermDocumentMatrix(): returns the 2-grams of document `x` as
# an unnamed character vector, each n-gram written as "word1 word2".
BigramTokenizer <- function(x) {
  grams <- ngrams(words(x), 2)
  # Join the words of each n-gram with single spaces.
  joined <- lapply(grams, paste, collapse = " ")
  unlist(joined, use.names = FALSE)
}
# Tokenizer for TermDocumentMatrix(): returns the 3-grams of document `x` as
# an unnamed character vector, each n-gram written as "word1 word2 word3".
TrigramTokenizer <- function(x) {
  grams <- ngrams(words(x), 3)
  # Join the words of each n-gram with single spaces.
  joined <- lapply(grams, paste, collapse = " ")
  unlist(joined, use.names = FALSE)
}
Build the 1-gram, 2-gram, and 3-gram term-document matrices for the blogs corpus.
# Blogs sample: one term-document matrix per n-gram order (1, 2, 3).
btdm1 <- TermDocumentMatrix(
  blogssCorpus,
  control = list(tokenize = UnigramTokenizer)
)
btdm2 <- TermDocumentMatrix(
  blogssCorpus,
  control = list(tokenize = BigramTokenizer)
)
btdm3 <- TermDocumentMatrix(
  blogssCorpus,
  control = list(tokenize = TrigramTokenizer)
)
Build the 1-gram, 2-gram, and 3-gram term-document matrices for the news corpus.
# News sample: one term-document matrix per n-gram order (1, 2, 3).
ntdm1 <- TermDocumentMatrix(
  newssCorpus,
  control = list(tokenize = UnigramTokenizer)
)
ntdm2 <- TermDocumentMatrix(
  newssCorpus,
  control = list(tokenize = BigramTokenizer)
)
ntdm3 <- TermDocumentMatrix(
  newssCorpus,
  control = list(tokenize = TrigramTokenizer)
)
Build the 1-gram, 2-gram, and 3-gram term-document matrices for the twitter corpus.
# Twitter sample: one term-document matrix per n-gram order (1, 2, 3).
ttdm1 <- TermDocumentMatrix(
  twittersCorpus,
  control = list(tokenize = UnigramTokenizer)
)
ttdm2 <- TermDocumentMatrix(
  twittersCorpus,
  control = list(tokenize = BigramTokenizer)
)
ttdm3 <- TermDocumentMatrix(
  twittersCorpus,
  control = list(tokenize = TrigramTokenizer)
)
A function to compute the frequency of words and create a bar plot.
# Compute term frequencies and draw a bar plot of the most frequent terms.
#
# Args:
#   theCorpus: a term-document matrix (anything as.matrix() can convert),
#     with terms as row names and one column per document.
# Returns:
#   A data frame with columns `word` and `freq` (total count over all
#   documents), sorted by decreasing frequency. All terms are returned, not
#   only the plotted ones.
showCorpusInfo <- function(theCorpus) {
  m <- as.matrix(theCorpus)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)
  # head() keeps at most 10 rows; d[1:10, ] would pad with NA rows (empty
  # bars labelled NA) when fewer than 10 terms exist.
  top <- head(d, 10)
  # Nothing to draw for an empty matrix; barplot() would fail on it.
  if (nrow(top) > 0) {
    barplot(top$freq, las = 2, names.arg = top$word,
            col = "lightblue", main = "Most frequent words",
            ylab = "Word frequencies")
  }
  d
}
A function to plot a word cloud.
# Draw a word cloud from a frequency table.
#
# Args:
#   d: data frame with columns `word` and `freq`, as returned by
#     showCorpusInfo().
#   minf: minimum frequency a word needs in order to be drawn. Defaults to 40,
#     the value that was previously hard-coded; lower it for small samples or
#     for 2-/3-gram tables where few terms reach that count.
#   maxWords: maximum number of words to draw. Defaults to 200, the previous
#     fixed value.
# Returns:
#   Whatever wordcloud() returns; the function is called for its plot.
createAcloud <- function(d, minf = 40, maxWords = 200) {
  wordcloud(words = d$word, freq = d$freq, min.freq = minf,
            max.words = maxWords, random.order = FALSE, rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"))
}
Plot the top 10 1-grams, 2-grams, and 3-grams of the blogs corpus, together with a word cloud of the 1-grams.
# Blogs sample: bar plot of the ten most frequent 1-grams, then a word cloud
# built from the full 1-gram frequency table.
d1 <- showCorpusInfo(btdm1)
createAcloud(d1)
# Bar plots of the ten most frequent 2-grams and 3-grams.
d2 <- showCorpusInfo(btdm2)
d3 <- showCorpusInfo(btdm3)
# NOTE(review): this draws the 1-gram cloud (d1) a second time; confirm
# whether a cloud of d2 or d3 was intended here.
createAcloud(d1)