Data source

Location: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

The data sets consist of text from 3 different sources:
1) News,
2) Blogs and
3) Twitter feeds.

The text data are provided in 4 different languages:

German,
English - United States,
Finnish and
Russian.
In this project, we will only focus on the English - United States data sets.

Set up libraries and working directory

library(tm)

## Loading required package: NLP

library(stringi)
library(RWeka)
library(wordcloud)

## Loading required package: RColorBrewer

library(stringi)

# Switch to the folder that holds the en_US.*.txt files, because every path
# used below is relative. The original hard-coded location stays the default;
# set the SWIFTKEY_DATA_DIR environment variable to run the report on another
# machine without editing the code.
setwd(Sys.getenv(
  "SWIFTKEY_DATA_DIR",
  unset = "C:\\Users\\mahajvi1\\Desktop\\Coursera_capstone\\final\\en_US\\"
))

Read the blogs, news and Twitter data into R

Examine the data sets and summarize our findings (file sizes, line counts, word counts, and mean words per line) below.

# Read every line of a text file, marking the strings as UTF-8.
#
# The file is opened through a binary-mode connection so that an embedded
# control character cannot end the read early. The summary recorded in this
# report shows only 77,259 lines for a ~196 MB news file, which is consistent
# with a text-mode read that stopped at such a character (e.g. Ctrl-Z on
# Windows). readLines() only acts on encoding = "latin1" or "UTF-8", so the
# previous "ASCII" value had no effect.
#
# Args:
#   path: path of the text file to read.
# Returns: character vector with one element per line.
read_text_file <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
}

blogs <- read_text_file("en_US.blogs.txt")
news <- read_text_file("en_US.news.txt")
twitter <- read_text_file("en_US.twitter.txt")

# Get file sizes (bytes converted to MB)

blogs.size <- file.info("en_US.blogs.txt")$size / 1024 ^ 2
news.size <- file.info("en_US.news.txt")$size / 1024 ^ 2
twitter.size <- file.info("en_US.twitter.txt")$size / 1024 ^ 2

# Get words in files (one count per line)

blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)

# Summary of the data sets: size on disk, number of lines, total words and
# mean words per line for each source

data.frame(source = c("blogs", "news", "twitter"),
           file.size.MB = c(blogs.size, news.size, twitter.size),
           num.lines = c(length(blogs), length(news), length(twitter)),
           num.words = c(sum(blogs.words), sum(news.words), sum(twitter.words)),
           mean.num.words = c(mean(blogs.words), mean(news.words), mean(twitter.words)))

##    source file.size.MB num.lines num.words mean.num.words
## 1   blogs     200.4242    899288  38154238       42.42716
## 2    news     196.2775     77259   2693898       34.86840
## 3 twitter     159.3641   2360148  30218166       12.80350

Use limited data for testing

# Load up to 5000 lines from every set into the corpus, one document per line.
# c() stacks the three samples; paste() would instead glue the i-th news, blog
# and tweet line into a single string, producing n-grams that span unrelated
# texts. head() avoids the NA padding that x[1:5000] gives when a source has
# fewer than 5000 lines.
sample_size <- 5000
merged <- c(head(news, sample_size),
            head(blogs, sample_size),
            head(twitter, sample_size))
corpus <- VCorpus(VectorSource(merged))

# Remove large objects to clean up memory
rm(blogs.words, news.words, twitter.words)
rm(blogs, news, twitter)

Clean The Data

Before performing exploratory analysis, we must clean the data first.

This involves removing
(1) URLs,
(2) special characters,
(3) punctuation,
(4) numbers,
(5) excess whitespace and
(6) stopwords, and
(7) converting the text to lower case.

# Clean the corpus. The order of the steps matters:
#   1. special (non-ASCII) characters are replaced by spaces first, so the
#      later string functions only see plain ASCII
#      (assumes the source files are UTF-8 -- TODO confirm);
#   2. URLs are dropped while their punctuation is still present to match on;
#   3. text is lower-cased so capitalised stopwords ("The") are matched;
#   4. stopwords are removed before punctuation so contractions ("don't")
#      still match the stopword list;
#   5. punctuation and numbers are removed;
#   6. whitespace is collapsed last, to close the gaps left by the removals.
corpus <- tm_map(corpus, content_transformer(function(x) {
  iconv(x, from = "UTF-8", to = "ASCII", sub = " ")
}))
corpus <- tm_map(corpus, content_transformer(function(x) {
  gsub("(f|ht)tps?://\\S+|www\\.\\S+", " ", x)
}))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords())
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

# Flatten the corpus into a one-column data frame: one row of text per document
corpusDf <- data.frame(
  text = unlist(lapply(corpus, `[[`, "content"), use.names = FALSE),
  stringsAsFactors = FALSE
)

# Count the most frequent n-grams in a collection of texts.
#
# Args:
#   corp:  character vector of documents, or a data frame whose columns hold
#          the text.
#   grams: n-gram size; used as both the minimum and maximum token length.
#   top_n: maximum number of rows to return (default 100).
#
# Returns: data frame with columns String (factor) and Count, sorted by
#   decreasing Count, holding at most top_n rows.
findNGrams <- function(corp, grams, top_n = 100) {
  # as.character() on a data frame yields one deparsed `c("...", ...)` string
  # per column, so unpack the columns into a plain character vector before
  # handing the text to the tokenizer.
  if (is.data.frame(corp)) {
    corp <- unlist(lapply(corp, as.character), use.names = FALSE)
  }

  ngram <- NGramTokenizer(corp, Weka_control(min = grams, max = grams,
                                             delimiters = " \\r\\n\\t.,;:\"()?!"))

  # No tokens at all: return an empty result with the usual columns.
  if (length(ngram) == 0L) {
    return(data.frame(String = factor(character(0)), Count = integer(0)))
  }

  ngram2 <- data.frame(table(ngram))
  ranked <- ngram2[order(ngram2$Freq, decreasing = TRUE), ]
  colnames(ranked) <- c("String", "Count")

  # Keep only the top_n most frequent; head() does not pad with NA rows when
  # fewer n-grams are available.
  head(ranked, top_n)
}

# Pass the text column itself rather than the whole data frame: coercing a
# data frame with as.character() deparses each column into one long
# `c("...", ...)` string instead of a vector of documents.
TwoGrams <- findNGrams(corpusDf$text, 2)
ThreeGrams <- findNGrams(corpusDf$text, 3)
FourGrams <- findNGrams(corpusDf$text, 4)

Plot word clouds and bar charts

# library() fails loudly if the package is missing; require() only returns
# FALSE and lets the script continue into a confusing later error.
library(RColorBrewer)

palette <- brewer.pal(8, "Dark2")

# The three n-gram tables with their sizes and bar colours, kept in parallel
# so each plot type is written once.
ngram_tables <- list(TwoGrams, ThreeGrams, FourGrams)
ngram_sizes <- 2:4
bar_colours <- c("red", "green", "blue")

# Word clouds: three panels side by side, most frequent n-grams in the centre.
# par() returns the previous settings, which are restored afterwards.
old_par <- par(mfrow = c(1, 3))
for (i in seq_along(ngram_tables)) {
  tbl <- ngram_tables[[i]]
  wordcloud(tbl[, 1], tbl[, 2], min.freq = 1,
            random.order = FALSE, ordered.colors = FALSE, colors = palette)
  text(x = 0.5, y = 0, paste0(ngram_sizes[i], "-gram cloud"))
}
par(old_par)

# Bar charts of the 20 most frequent n-grams of each size. head() takes at
# most 20 rows, so a short table is not padded with NA bars.
for (i in seq_along(ngram_tables)) {
  top20 <- head(ngram_tables[[i]], 20)
  barplot(top20[, 2],
          cex.names = 0.5,
          names.arg = top20[, 1],
          col = bar_colours[i],
          main = paste0(ngram_sizes[i], "-Grams"),
          las = 2)
}

exploratory_reports

Vinay Mahajan

February 19, 2017

Introduction