Importing Corpora

getwd()
## [1] "C:/Users/smooth computers llc/Documents"

File path of the corpora

filePath <- file.path(".", "en_US")

Contents of the corpora

dir(filePath)
## [1] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"

Import the libraries for text mining and the exploratory plots
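
The rest of the report relies on the tm, wordcloud and stringi packages; they are assumed to be installed and are loaded here:

library(tm)        # Corpus(), tm_map(), TermDocumentMatrix()
library(wordcloud) # wordcloud(), comparison.cloud()
library(stringi)   # stri_stats_general(), stri_count_words()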

Creating the corpus

corpus <- Corpus(DirSource(filePath, encoding = "UTF-8"),
                 readerControl = list(reader = readPlain, language = "en"))
show(corpus)
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3

Reading and previewing the content of each document

en_US.blogs.txt

# read line by line and display the first lines
blogs <- readLines("./en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
blogs[1:3]
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        
## [2] "We love you Mr. Brown."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."

en_US.news.txt

# read line by line and display the first lines
news <- readLines("./en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
news[1:3]
## [1] "He wasn't home alone, apparently."                                                                                                                                                
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."                        
## [3] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."

en_US.twitter.txt

# read line by line and display the first lines
twitter <- readLines("./en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
twitter[1:3]
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."  
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."

Cleaning the corpus before the exploratory analysis

The function below removes punctuation, strips extra whitespace, converts the text to lowercase, and removes English stopwords from the content of the corpus.

cleanCorpus <- function(corpus){
  # remove punctuation, keeping intra-word dashes
  corpus.tmp <- tm_map(corpus, removePunctuation, preserve_intra_word_dashes = TRUE)
  # replace quotes, slashes, @, | and any remaining non-word characters with spaces
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  corpus.tmp <- tm_map(corpus.tmp, toSpace, "\"|/|@|\\|")
  corpus.tmp <- tm_map(corpus.tmp, toSpace, "\\W")
  # collapse repeated whitespace, lowercase the text, and drop English stopwords
  corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
  corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
  corpus.tmp <- tm_map(corpus.tmp, removeWords, stopwords("english"))
  return(corpus.tmp)
}
#cleaned_corpus <- cleanCorpus(corpus)
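
The call on the full corpus is left commented out because the three files are large. As a quick sanity check, the same cleaning can be applied to a few of the blog lines read above and inspected; the choice of three lines and the helper name sample_clean are only illustrative:

# sanity check: clean a few blog lines and print the result
sample_clean <- cleanCorpus(Corpus(VectorSource(blogs[1:3])))
inspect(sample_clean)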

Exploratory analysis

Word cloud

buildWordcloud <- function(x){
  # build a corpus from the character vector x and clean it
  docs <- Corpus(VectorSource(x))
  docs <- cleanCorpus(docs)
  # term-document matrix -> overall term frequencies, sorted decreasingly
  dtm <- TermDocumentMatrix(docs)
  m <- as.matrix(dtm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)
  set.seed(1101)
  wordcloud(words = d$word, freq = d$freq, max.words = 400,
            random.order = FALSE, colors = rainbow(10))
}

Word cloud for blogs

buildWordcloud(blogs[sample(1:length(blogs), 2000)])

Word cloud for news

buildWordcloud(news[sample(1:length(news), 2000)])

Word cloud for twitter

buildWordcloud(twitter[sample(1:length(twitter), 2000)])

Plotting word frequencies (the ten most frequent words)

barFreqWords <- function(x, n){
  # same cleaning/frequency pipeline as buildWordcloud(), but plot the n most frequent terms
  docs <- Corpus(VectorSource(x))
  docs <- cleanCorpus(docs)
  dtm <- TermDocumentMatrix(docs)
  m <- as.matrix(dtm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)
  barplot(d[1:n, ]$freq, las = 2, names.arg = d[1:n, ]$word,
          col = "lightblue", main = "Most frequent words",
          ylab = "Word frequencies")
}

For blogs

barFreqWords(blogs[sample(1:length(blogs), 400)], n=10)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation,
## preserve_intra_word_dashes = TRUE): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus.tmp, toSpace, "\"|/|@|\\|"):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus.tmp, toSpace, "\\W"): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus.tmp, stripWhitespace): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus.tmp, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus.tmp, removeWords,
## stopwords("english")): transformation drops documents
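
The "transformation drops documents" warnings come from tm_map() on a SimpleCorpus and are generally harmless here. If they clutter the report, the calls can be wrapped in suppressWarnings(), for example:

# optional: silence the SimpleCorpus warnings emitted by tm_map()
suppressWarnings(barFreqWords(blogs[sample(1:length(blogs), 400)], n = 10))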

For news

barFreqWords(news[sample(1:length(news), 400)], n=10)

For twitter

barFreqWords(twitter[sample(1:length(twitter), 400)], n=10)

Analysing lines and characters

For blogs

stri_stats_general(blogs)
##       Lines LinesNEmpty       Chars CharsNWhite 
##      899288      899288   206824382   170389539

For news

stri_stats_general(news)
##       Lines LinesNEmpty       Chars CharsNWhite 
##       77259       77259    15639408    13072698

For twitter

stri_stats_general(twitter)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     2360148     2360148   162096241   134082806

Histogram of word frequencies

For blogs

blog_words <- stri_count_words(blogs)
summary(blog_words)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   28.00   41.75   60.00 6726.00
hist(blog_words)
rug(blog_words)

For news

news_words <- stri_count_words(news)
summary(news_words)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.62   46.00 1123.00
hist(news_words)
rug(news_words)

For twitter

twitter_words <- stri_count_words(twitter)
summary(twitter_words)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.75   18.00   47.00
hist(twitter_words)
rug(twitter_words)
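
For a compact overview, the per-file figures can also be gathered into one data frame, reusing the word-count vectors computed above:

# one row per source: lines, characters and words
data.frame(
  source = c("blogs", "news", "twitter"),
  lines  = c(length(blogs), length(news), length(twitter)),
  chars  = c(sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter))),
  words  = c(sum(blog_words), sum(news_words), sum(twitter_words))
)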

Term-document matrix

#corpusv <- Corpus(VectorSource(corpus))
#tdm <- TermDocumentMatrix(corpusv)
#tdm
#m <- as.matrix(tdm)
#colnames(m) <- c("blogs", "news", "twitter")
#head(m)
#comparison.cloud(m)
#findFreqTerms(tdm, 50)
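
A lighter-weight sketch of the same idea can be run on random samples instead of the full corpus; the 2,000-line sample size and the helper names samples and sample_tdm are arbitrary choices for illustration:

# comparison cloud over cleaned samples of the three sources
set.seed(1101)
samples <- c(paste(sample(blogs, 2000), collapse = " "),
             paste(sample(news, 2000), collapse = " "),
             paste(sample(twitter, 2000), collapse = " "))
sample_tdm <- TermDocumentMatrix(cleanCorpus(Corpus(VectorSource(samples))))
m <- as.matrix(sample_tdm)
colnames(m) <- c("blogs", "news", "twitter")
comparison.cloud(m, max.words = 100)
findFreqTerms(sample_tdm, lowfreq = 50)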

Conclusion

There are many other things we could do to complete this report, for example interpreting all of the graphics produced above.

For the main goal of the project, we plan to build an n-gram language model with a smoothing technique for word prediction.
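
As a rough sketch of that direction (on a small sample, with plain maximum-likelihood counts rather than the planned smoothing), bigram frequencies can be obtained with stringi and base R; the function name bigram_counts and the 1,000-line sample are illustrative only:

# count bigrams line by line in a random sample of the blogs data
bigram_counts <- function(lines) {
  word_lists <- stri_extract_all_words(tolower(lines))
  bigrams <- unlist(lapply(word_lists, function(w) {
    if (length(w) < 2) return(character(0))
    paste(head(w, -1), tail(w, -1))  # consecutive word pairs within one line
  }))
  sort(table(bigrams), decreasing = TRUE)
}
set.seed(1101)
head(bigram_counts(sample(blogs, 1000)), 10)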