Intro

The key partners for this project are SwiftKey and Coursera; the project explores the Natural Language Processing facet of Data Science.

Load libraries and data

library(dplyr)
library(doParallel)
library(stringi)
library(tm)
library(SnowballC)  # required by tm's stemDocument()
library(slam)
library(ggplot2)
library(wordcloud)
# Open each file in binary mode, read it, and close the connection
con <- file("en_US.blogs.txt", open = "rb")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

con <- file("en_US.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

con <- file("en_US.twitter.txt", open = "rb")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

Exploratory Analysis - Part 1

Basic statistics of the three data files, including line, character, and word counts, plus Words Per Line (WPL) summaries; histograms of WPL are also plotted for each file.

# Words-per-line counts for each file
rawP <- lapply(list(blogs, news, twitter), stri_count_words)

stat <- data.frame(
            File=c("blogs","news","twitter"), 
            t(rbind(sapply(list(blogs,news,twitter),stri_stats_general),
                    TotalWords=sapply(list(blogs,news,twitter),stri_stats_latex)[4,])),
            WPL=rbind(summary(rawP[[1]]),summary(rawP[[2]]),summary(rawP[[3]])))
print(stat)
##      File   Lines LinesNEmpty     Chars CharsNWhite TotalWords WPL.Min.
## 1   blogs  899288      899288 206824382   170389539   37570839        0
## 2    news 1010242     1010242 203223154   169860866   34494539        1
## 3 twitter 2360148     2360148 162096241   134082806   30451170        1
##   WPL.1st.Qu. WPL.Median WPL.Mean WPL.3rd.Qu. WPL.Max.
## 1           9         28 41.75107          60     6726
## 2          19         32 34.40997          46     1796
## 3           7         12 12.75065          18       47
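
As a quick cross-check (a sketch, not part of the original analysis): the mean WPL should roughly equal TotalWords / Lines for each file. The two word counts come from different stringi tokenizers (stri_count_words vs. stri_stats_latex), so the values agree only approximately, e.g. about 41.78 vs. the reported WPL mean of 41.75 for blogs.

stat$TotalWords / stat$Lines  # approximate WPL means, one per file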

Plots

# Plot histogram for each data type
qplot(rawP[[1]],geom="histogram",main="US Blogs",
      xlab="WordCount",ylab="Freq",binwidth=10)

qplot(rawP[[2]],geom="histogram",main="US News",
      xlab="WordCount",ylab="Freq",binwidth=10)

qplot(rawP[[3]],geom="histogram",main="US Twitter",
      xlab="WordCount",ylab="Freq",binwidth=1)

rm(rawP, stat)  # free memory

Sampling Raw Data

Sample 20,000 lines from each data type before cleaning, so that the exploratory analysis can be performed again on a manageable subset (a sketch of re-running the basic statistics follows the code below).

sizeS <- 20000  # Sample size
set.seed(2809)  

# Raw data and sample vectors
data <- list(blogs, news, twitter)
sampleVec <- list()

# Iterate over each raw dataset
for (i in 1:length(data)) {
    # Create sample dataset
    Ffil <- sample(1:length(data[[i]]), sizeS, replace = FALSE)
    sampleVec[[i]] <- data[[i]][Ffil]
    # Remove unconventional/non-ASCII characters; iconv is vectorized,
    # so no inner loop over lines is needed
    sampleVec[[i]] <- iconv(sampleVec[[i]], "latin1", "ASCII", sub = "")
}

rm(blogs)
rm(news)
rm(twitter)
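
As promised, the basic statistics can be re-run on the 20,000-line samples. A minimal sketch reusing the stringi helpers from Part 1 (the object names sampleStats and MeanWPL are mine):

# Basic statistics of the sampled data
sampleStats <- data.frame(
    File = c("blogs", "news", "twitter"),
    t(sapply(sampleVec, stri_stats_general)),
    MeanWPL = sapply(sampleVec, function(x) mean(stri_count_words(x))))
print(sampleStats)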

Creating Corpus and Cleaning Data

# Corpus and document-term matrix vectors
corpus <- list()
matrixD <- list()

# Iterate over each sample dataset
for (i in 1:length(sampleVec)) {
    # Create corpus dataset
    corpus[[i]] <- Corpus(VectorSource(sampleVec[[i]]))
    # Cleaning/stemming the data; tolower is not a tm transformation,
    # so it must be wrapped in content_transformer()
    corpus[[i]] <- tm_map(corpus[[i]], content_transformer(tolower))
    corpus[[i]] <- tm_map(corpus[[i]], removeNumbers)
    corpus[[i]] <- tm_map(corpus[[i]], removeWords, stopwords("english"))
    corpus[[i]] <- tm_map(corpus[[i]], removePunctuation)
    corpus[[i]] <- tm_map(corpus[[i]], stemDocument)
    corpus[[i]] <- tm_map(corpus[[i]], stripWhitespace)
    # Calculate document-term frequencies for the corpus
    matrixD[[i]] <- DocumentTermMatrix(corpus[[i]],
        control = list(wordLengths = c(0, Inf)))
}
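
As a quick sanity check on the cleaned corpora (a sketch, not part of the original pipeline), tm's findFreqTerms() lists the terms that occur at least lowfreq times in a document-term matrix; the threshold of 500 used here is an arbitrary choice:

# Show a few terms appearing at least 500 times in each sample
for (i in 1:length(matrixD)) {
    print(head(findFreqTerms(matrixD[[i]], lowfreq = 500), 20))
}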

rm(data)
rm(sampleVec)

Exploratory Analysis - Part 2

Word frequencies are illustrated via a word cloud plotted for each data type.

set.seed(3340)  
par(mfrow = c(1, 3))  # Plot Panel
h <- c("Word Cloud - Blogs", "Word Cloud - News", "Word Cloud - Twitter")

# Iterate each corpus/DTM and plot word cloud for each
for (i in 1:length(corpus)) {
    wordcloud(words = colnames(matrixD[[i]]), freq = col_sums(matrixD[[i]]), 
        scale = c(3, 1), max.words = 100, random.order = FALSE, rot.per = 0.35, 
        use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
    title(h[i])
}
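
As a complement to the word clouds (a sketch, not in the original analysis), the top stems can be ranked explicitly. The example below uses slam's col_sums() on the blogs DTM and plots the 20 most frequent stems with ggplot2; the object name topTerms is mine:

# Top 20 stems in the blogs sample, by total frequency
freqs <- sort(col_sums(matrixD[[1]]), decreasing = TRUE)[1:20]
topTerms <- data.frame(term = names(freqs), freq = as.numeric(freqs))
ggplot(topTerms, aes(x = reorder(term, freq), y = freq)) +
    geom_bar(stat = "identity") + coord_flip() +
    labs(title = "Top Stems - Blogs", x = "Stem", y = "Frequency")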

Next steps

- Use N-grams to generate tokens (a minimal bigram sketch follows below)
- Summarize the frequency of tokens
- Build a predictive model
- Develop a data product (i.e. a Shiny app)
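
As an illustration of the tokenization step, a minimal bigram sketch in base R (the function makeNgrams and the toy input are mine; the real model would tokenize the cleaned corpora, and this simple version also forms bigrams across line boundaries):

# Build n-grams from a character vector of text (base R sketch)
makeNgrams <- function(lines, n = 2) {
    tokens <- unlist(strsplit(tolower(lines), "[^a-z']+"))
    tokens <- tokens[tokens != ""]  # drop empty splits
    if (length(tokens) < n) return(character(0))
    sapply(1:(length(tokens) - n + 1),
           function(i) paste(tokens[i:(i + n - 1)], collapse = " "))
}

# Example: bigram frequencies for a toy input
bigrams <- makeNgrams(c("the quick brown fox", "the quick red fox"), n = 2)
sort(table(bigrams), decreasing = TRUE)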