Executive summary

The goal of this project is to demonstrate an initial exploratory analysis of three English data sets: news articles, blog posts, and tweets.

Libraries Used

library(NLP)            # low-level NLP infrastructure (ngrams, words)
library(tm)             # text mining: corpora, cleaning, term-document matrices
library(RColorBrewer)   # color palettes for the word clouds
library(wordcloud)      # word cloud plots
library(SnowballC)      # Snowball stemmers used by tm
library(ggplot2)        # general-purpose plotting
library(fpc)            # clustering utilities

Load the data

We load the data from [this link][1] and take a small random sample of each file.

basePath <- '/home/mohamed/R_Coursr_Coursera/CastponeProject/week2_Assignment/Coursera-SwiftKey/final/en_US'
flist <- list.files(path=basePath, recursive=T, pattern=".*en_.*.txt")
fileNames <- paste(basePath, flist, sep="/")
samplePerc <- 0.001  # sample 0.1% of each data set

set.seed(1234)  # fix the RNG seed so the random samples are reproducible

blogs<-readLines(fileNames[1])
blogss<-sample(blogs, round(samplePerc*length(blogs)), replace = F)
rm(blogs)  # free the full file once the sample is taken

news<-readLines(fileNames[2])
newss<-sample(news, round(samplePerc*length(news)), replace = F)
rm(news)

twitter<-readLines(fileNames[3])
twitters<-sample(twitter, round(samplePerc*length(twitter)), replace = F)
rm(twitter) 
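The three blocks above repeat the same load/sample/free pattern. A small helper would keep the logic in one place; this is a sketch (sampleLines is a hypothetical name, not from any package), with skipNul = TRUE to suppress readLines warnings about embedded nul bytes, which these files are known to trigger:

sampleLines <- function(path, perc) {
    lines <- readLines(path, skipNul = TRUE)   # skipNul avoids warnings from embedded nul bytes
    sample(lines, round(perc * length(lines)), replace = FALSE)
}

# equivalent to the blocks above:
# blogss   <- sampleLines(fileNames[1], samplePerc)
# newss    <- sampleLines(fileNames[2], samplePerc)
# twitters <- sampleLines(fileNames[3], samplePerc)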

We now create three corpora, one for each data set.

blogsCorpus <- VCorpus(VectorSource(blogss))
newsCorpus <- VCorpus(VectorSource(newss))
twitterCorpus <- VCorpus(VectorSource(twitters))
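A quick sanity check that the corpora were built as expected; print reports the number of documents and as.character shows the raw text of one entry:

print(blogsCorpus)               # should report roughly 0.1% of the blog lines as documents
as.character(blogsCorpus[[1]])   # raw text of the first sampled blog entry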

Summary of the Data Files

l <- lapply(paste(basePath, flist, sep="/"), function(f) {
    fsize <- file.info(f)$size / 1024^2              # file size in MB
    lines <- readLines(f)
    maxchars <- which.max(nchar(lines))              # line number of the longest line
    nwords <- sum(sapply(strsplit(lines, "\\s+"), length))  # total whitespace-separated words
    return(c(f, format(round(fsize, 2), nsmall=2), length(lines), maxchars, nwords))
})

df <- data.frame(matrix(unlist(l), nrow=length(l), byrow=T))
colnames(df) <- c("file", "size(MB)", "num.of.lines", "longest.line", "num.of.words")
print(df)
##                                                                                                               file
## 1   /home/mohamed/R_Coursr_Coursera/CastponeProject/week2_Assignment/Coursera-SwiftKey/final/en_US/en_US.blogs.txt
## 2    /home/mohamed/R_Coursr_Coursera/CastponeProject/week2_Assignment/Coursera-SwiftKey/final/en_US/en_US.news.txt
## 3 /home/mohamed/R_Coursr_Coursera/CastponeProject/week2_Assignment/Coursera-SwiftKey/final/en_US/en_US.twitter.txt
##   size(MB) num.of.lines longest.line num.of.words
## 1   200.42       899288       483415     37334131
## 2   196.28      1010242       123628     34372530
## 3   159.36      2360148           26     30373543
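All three files are large (roughly 160-200 MB and 0.9-2.4 million lines each), which is why we work with a 0.1% sample. Note that the longest.line column holds the line number of the longest line in each file, not its character length.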

Clean the Corpus

CorpusClean <- function(x){
    x <- tm_map(x, removePunctuation)                    # drop punctuation
    x <- tm_map(x, removeNumbers)                        # drop digits
    x <- tm_map(x, content_transformer(tolower))         # lower-case everything
    x <- tm_map(x, removeWords, stopwords("english"))    # drop common English stop words
    x <- tm_map(x, stripWhitespace)                      # collapse the whitespace left behind
    x <- tm_map(x, PlainTextDocument)                    # ensure plain-text documents
    x
}
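As a quick illustration of what CorpusClean does, here it is applied to a one-line toy corpus (the exact spacing of the result may vary):

demoCorpus <- VCorpus(VectorSource("The 3 quick brown foxes, and the lazy dog!"))
demoCorpus <- CorpusClean(demoCorpus)
as.character(demoCorpus[[1]])
## roughly: "quick brown foxes lazy dog"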

blogsCorpus <- CorpusClean(blogsCorpus)
newsCorpus <- CorpusClean(newsCorpus)
twitterCorpus <- CorpusClean(twitterCorpus)

Tokenizer Functions

We define unigram, bigram, and trigram tokenizers on top of NLP's ngrams() and words():

UnigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 1), paste, collapse = " "), use.names = FALSE)
BigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
TrigramTokenizer <-
    function(x)
        unlist(lapply(ngrams(words(x), 3), paste, collapse = " "), use.names = FALSE)
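The tokenizers operate on one document at a time. Applied to a toy PlainTextDocument, the bigram tokenizer should produce something like this:

demoDoc <- PlainTextDocument("one two three four")
BigramTokenizer(demoDoc)
## "one two"    "two three"  "three four"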

Blogs Corpus

btdm1 <- TermDocumentMatrix(blogsCorpus, control = list(tokenize = UnigramTokenizer))
btdm2 <- TermDocumentMatrix(blogsCorpus, control = list(tokenize = BigramTokenizer))
btdm3 <- TermDocumentMatrix(blogsCorpus, control = list(tokenize = TrigramTokenizer))

News Corpus

ntdm1 <- TermDocumentMatrix(newsCorpus, control = list(tokenize = UnigramTokenizer))
ntdm2 <- TermDocumentMatrix(newsCorpus, control = list(tokenize = BigramTokenizer))
ntdm3 <- TermDocumentMatrix(newsCorpus, control = list(tokenize = TrigramTokenizer))

Twitter Corpus

ttdm1 <- TermDocumentMatrix(twitterCorpus, control = list(tokenize = UnigramTokenizer))
ttdm2 <- TermDocumentMatrix(twitterCorpus, control = list(tokenize = BigramTokenizer))
ttdm3 <- TermDocumentMatrix(twitterCorpus, control = list(tokenize = TrigramTokenizer))
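These term-document matrices are extremely sparse, and the bigram and trigram ones grow quickly with sample size. If memory becomes a concern, tm's removeSparseTerms can prune the rarest terms; the 0.9999 threshold below is purely illustrative, not a tuned value:

ttdm3Small <- removeSparseTerms(ttdm3, sparse = 0.9999)  # drop terms missing from more than 99.99% of documents
dim(ttdm3Small)   # rows = terms kept, columns = documents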

Compute word frequencies and plot the most frequent terms

showCorpusInfo <- function(theTDM)
{
    m <- as.matrix(theTDM)                      # densify the term-document matrix
    v <- sort(rowSums(m), decreasing=TRUE)      # total frequency of each term
    d <- data.frame(word = names(v), freq = v)
    barplot(d[1:10,]$freq, las = 2, names.arg = d[1:10,]$word,
        col ="lightblue", main ="Most frequent words",
        ylab = "Word frequencies")
    return(d)
}
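as.matrix densifies the whole term-document matrix, which is fine for this 0.1% sample but will not scale to larger ones. A sketch of an alternative using slam (the sparse-matrix package tm builds on) computes the same term totals without densifying; tdmFreqs is a hypothetical helper name:

library(slam)

tdmFreqs <- function(theTDM) {
    v <- sort(row_sums(theTDM), decreasing = TRUE)   # term totals computed on the sparse matrix
    data.frame(word = names(v), freq = v)
}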

Word Cloud Function

createAcloud <- function(d)
{
    minf <- 40   # only plot words occurring at least 40 times
    wordcloud(words = d$word, freq = d$freq, min.freq = minf,
              max.words=200, random.order=FALSE, rot.per=0.35,
              colors=brewer.pal(8, "Dark2"))
}
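wordcloud places words at random positions, so seeding the RNG right before each call makes the clouds repeatable across knits:

set.seed(1234)
createAcloud(d1)   # d1 as computed in the Analysis section below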

Analysis

Blogs

d1 <- showCorpusInfo(btdm1)

createAcloud(d1)

d2 <- showCorpusInfo(btdm2)

d3 <- showCorpusInfo(btdm3)

News

d1 <- showCorpusInfo(ntdm1)

createAcloud(d1)

d2 <- showCorpusInfo(ntdm2)

d3 <- showCorpusInfo(ntdm3)

Twitter

d1 <- showCorpusInfo(ttdm1)

createAcloud(d1)

d2 <- showCorpusInfo(ttdm2)

d3 <- showCorpusInfo(ttdm3)

[1]: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip