library(tm)
library(RWeka)
library(wordcloud)

Objective

The goal of this project is to perform an exploratory analysis of the three English (en_US) text datasets (Twitter, blogs, and news) provided on the course website on Coursera.

Data Visualization

twitter <- readLines("final/en_US/en_US.twitter.txt", 
                     encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines("final/en_US/en_US.blogs.txt", 
                   encoding = "UTF-8", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt", 
                  encoding = "UTF-8", skipNul = TRUE)

A quick look shows that all three datasets are quite large: the Twitter data has 2,360,148 lines, the blogs data 899,288 lines, and the news data 77,259 lines. Each line is a separate post, article, or entry rather than a single sentence, and lines vary considerably in length. Looking at the number of characters per line in the Twitter data, there are about 69 characters on average (mean 68.68), with a maximum of 140.

str(twitter); str(blogs); str(news)
##  chr [1:2360148] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long." ...
##  chr [1:899288] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”." ...
##  chr [1:77259] "He wasn't home alone, apparently." ...
twitter[1:5]
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."  
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."                                                                       
## [4] "So Tired D; Played Lazer Tag & Ran A LOT D; Ughh Going To Sleep Like In 5 Minutes ;)"                           
## [5] "Words from a complete stranger! Made my birthday even better :)"
nchar(twitter[5])
## [1] 63
summary(nchar(twitter))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   37.00   64.00   68.68  100.00  140.00
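
To visualize that distribution rather than only summarize it, a histogram of characters per line could be drawn (a minimal sketch; the number of bins is arbitrary):

# Sketch: distribution of characters per line in the Twitter data
hist(nchar(twitter), breaks = 50,
     main = "Characters per Line in the Twitter Data",
     xlab = "Number of characters")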

Sampling

Because the n-gram analysis below is computationally expensive, and for simplicity, I randomly sample 1% of each dataset.

set.seed(147)
twitter_sample <- sample(twitter, length(twitter)*0.01)
blogs_sample <- sample(blogs, length(blogs)*0.01)
news_sample <- sample(news, length(news)*0.01)

# Quick check that the sampled lines differ from the first lines seen earlier
twitter_sample[1:5]
## [1] "The kiddie circle pit during A Dog is a Dog Storytime today was so awesome."                
## [2] "An inconvenience is an adventure wrongly considered. - Gilbert K. Chesterton"               
## [3] "Diddy won his Oscar!"                                                                       
## [4] "I should go to the casino with the amount of bird shit that's on my car after this workday."
## [5] "Dana Spiotta's \"Eat the Document\" is great."

Text Cleaning

The Twitter data clearly contains numbers, punctuation, and other symbols that we do not want in the n-gram counts, so cleaning the data is essential. The Corpus and tm_map functions from the tm package make this straightforward.

corpus <- Corpus(VectorSource(twitter_sample))
inspect(corpus[1])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 1
## 
## [1] The kiddie circle pit during A Dog is a Dog Storytime today was so awesome.
# Lower cases
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Remove common words
corpus <- tm_map(corpus, removeWords, stopwords("english"))
# Remove punctuations
corpus <- tm_map(corpus, removePunctuation)
# Remove extra white spaces
corpus <- tm_map(corpus, stripWhitespace)
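
Since this is Twitter text, URLs and @mentions could also be removed with a custom transformer. This extra step is only a sketch and is not part of the pipeline used for the outputs below; if applied, it would belong before removePunctuation:

# Hypothetical extra step: drop URLs and @mentions so they don't survive as stray tokens
strip_twitter <- content_transformer(function(x) {
  x <- gsub("http\\S+", " ", x)  # remove URLs
  gsub("@\\w+", " ", x)          # remove @mentions
})
# corpus <- tm_map(corpus, strip_twitter)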

N-gram

Now we can begin the n-gram analysis. In particular, we are interested in unigrams, bigrams, and trigrams; the NGramTokenizer function from the RWeka package handles the tokenization for us.

Plotting the unigrams with the wordcloud function highlights the most frequently used terms, such as just, like, and thanks. A barplot of the top counts shows the same picture.

# Unigram
token_delim <- " \\t\\r\\n.!?,;\"()"

unitoken <- NGramTokenizer(corpus, Weka_control(min=1, max=1, delimiters=token_delim))

unigram_words <- data.frame(table(unitoken))
unigram_words <- unigram_words[order(unigram_words$Freq, decreasing=T), ]

wordcloud(unigram_words$unitoken, unigram_words$Freq, min.freq=150, 
          colors=rainbow(8), random.order=FALSE)

par(mar=c(10,4,4,4)) 
barplot(head(sort(table(unitoken), decreasing=T), 20), las=2, ylab="Counts", 
        main="Top 20 Used Unigram for Twitter Data")

# Bigram 
bitoken <- NGramTokenizer(corpus, Weka_control(min=2, max=2, delimiters=token_delim))

bigram_words <- data.frame(table(bitoken))
bigram_words <- bigram_words[order(bigram_words$Freq, decreasing=T), ]

wordcloud(bigram_words$bitoken, bigram_words$Freq, min.freq=25, 
          colors=rainbow(8), random.order=FALSE)

par(mar=c(10,4,4,4)) 
barplot(head(sort(table(bitoken), decreasing=T), 20), las=2, ylab="Counts", 
        main="Top 20 Used Bigram for Twitter Data")

#Trigram
tritoken <- NGramTokenizer(corpus, Weka_control(min=3, max=3, delimiters=token_delim))

par(mar=c(10,4,4,4)) 
barplot(head(sort(table(tritoken), decreasing=T), 20), las=2, ylab="Counts", 
        main="Top 20 Used Trigram for Twitter Data")

We then repeat the same cleaning and n-gram steps on the blogs and news data.
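
Since exactly the same steps are repeated for each dataset, they could also be collected into small helper functions. The sketch below is hypothetical (clean_corpus and plot_top_ngrams are not used for the outputs in this report) and relies on the token_delim defined earlier:

# Hypothetical helpers bundling the cleaning and plotting steps used in this report
clean_corpus <- function(x) {
  corp <- Corpus(VectorSource(x))
  corp <- tm_map(corp, content_transformer(tolower))
  corp <- tm_map(corp, removeNumbers)
  corp <- tm_map(corp, removeWords, stopwords("english"))
  corp <- tm_map(corp, removePunctuation)
  tm_map(corp, stripWhitespace)
}

plot_top_ngrams <- function(corp, n, title) {
  tokens <- NGramTokenizer(corp, Weka_control(min = n, max = n, delimiters = token_delim))
  par(mar = c(10, 4, 4, 4))
  barplot(head(sort(table(tokens), decreasing = TRUE), 20), las = 2,
          ylab = "Counts", main = title)
}
# e.g. plot_top_ngrams(clean_corpus(blogs_sample), 2, "Top 20 Used Bigram for Blogs Data")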

corpus_blogs <- Corpus(VectorSource(blogs_sample))
# Lower cases
corpus_blogs <- tm_map(corpus_blogs, content_transformer(tolower))
# Remove numbers
corpus_blogs <- tm_map(corpus_blogs, removeNumbers)
# Remove common words
corpus_blogs <- tm_map(corpus_blogs, removeWords, stopwords("english"))
# Remove punctuations
corpus_blogs <- tm_map(corpus_blogs, removePunctuation)
# Remove extra white spaces
corpus_blogs <- tm_map(corpus_blogs, stripWhitespace)

unitoken_blogs <- NGramTokenizer(corpus_blogs, Weka_control(min=1, max=1, delimiters=token_delim))
bitoken_blogs <- NGramTokenizer(corpus_blogs, Weka_control(min=2, max=2, delimiters=token_delim))
tritoken_blogs <- NGramTokenizer(corpus_blogs, Weka_control(min=3, max=3, delimiters=token_delim))

par(mar=c(10,4,4,4)) 
barplot(head(sort(table(unitoken_blogs), decreasing=T), 20), las=2, ylab="Counts", 
        main="Top 20 Used Unigram for Blogs Data")

par(mar=c(10,4,4,4)) 
barplot(head(sort(table(bitoken_blogs), decreasing=T), 20), las=2, ylab="Counts", 
        main="Top 20 Used Bigram for Blogs Data")

par(mar=c(10,4,4,4)) 
barplot(head(sort(table(tritoken_blogs), decreasing=T), 20), las=2, ylab="Counts", 
        main="Top 20 Used Trigram for Blogs Data")

corpus_news <- Corpus(VectorSource(news_sample))
# Lower cases
corpus_news <- tm_map(corpus_news, content_transformer(tolower))
# Remove numbers
corpus_news <- tm_map(corpus_news, removeNumbers)
# Remove common words
corpus_news <- tm_map(corpus_news, removeWords, stopwords("english"))
# Remove punctuations
corpus_news <- tm_map(corpus_news, removePunctuation)
# Remove extra white spaces
corpus_news <- tm_map(corpus_news, stripWhitespace)

unitoken_news <- NGramTokenizer(corpus_news, Weka_control(min=1, max=1, delimiters=token_delim))
bitoken_news <- NGramTokenizer(corpus_news, Weka_control(min=2, max=2, delimiters=token_delim))
tritoken_news <- NGramTokenizer(corpus_news, Weka_control(min=3, max=3, delimiters=token_delim))

par(mar=c(10,4,4,4)) 
barplot(head(sort(table(unitoken_news), decreasing=T), 20), las=2, ylab="Counts", 
        main="Top 20 Used Unigram for News Data")

par(mar=c(10,4,4,4)) 
barplot(head(sort(table(bitoken_news), decreasing=T), 20), las=2, ylab="Counts", 
        main="Top 20 Used Bigram for News Data")

Further Plans

To build a prediction algorithm, we will need to train on more data, since only 1% of each dataset was sampled here. Building a Markov (n-gram) model is a natural next step: for example, in the Twitter data, if a user types the word "right", the model would predict "now" as a likely next word.
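
As a first illustration of that idea, the Twitter bigram counts computed above can already serve as a crude next-word lookup. The sketch below uses a hypothetical helper, predict_next, and is not the final prediction algorithm; note also that the stopword removal above limits which bigrams survive:

# Sketch: return the most frequent continuation of a word in the bigram table
predict_next <- function(word, bigrams) {
  pattern <- paste0("^", word, " ")
  bg <- as.character(bigrams$bitoken)
  hits <- which(grepl(pattern, bg))
  if (length(hits) == 0) return(NA_character_)
  best <- hits[which.max(bigrams$Freq[hits])]
  sub(pattern, "", bg[best])
}
# e.g. predict_next("right", bigram_words)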