Introduction

The purpose of this project is to present a brief summary of three data sets from HC Corpora. Although the corpora cover four languages (German, Finnish, Russian, and English), only the American English corpus is used in this exercise. More specifically, we will analyse the en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt files.

Getting and loading the data set

First, let us clean the workspace and set the working directory:

rm(list=ls())
setwd("/home/sussa/Desktop")

Load the necessary packages:

library(RCurl)
library(rJava)
library(RWeka)
library(tm)
library(dplyr)
library(parallel)
library(ggplot2)
library(SnowballC)
library(tau)
library(slam)
library(stringr)
library(stringi)

Set a global seed so the sampling below is reproducible:

set.seed(1234)

Download and unzip the data:

url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
file <- "Coursera-SwiftKey.zip"
download.file(url, file, method = "wget")
unzip(file)
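
To avoid fetching the archive again on repeated runs, the download and unzip calls can be guarded with simple existence checks (a small sketch; it reuses the url and file objects defined above, and assumes the archive unpacks into the "final" directory used later in this report):

if (!file.exists(file)) {
  download.file(url, file, method = "wget")  # download only when the zip is missing
}
if (!dir.exists("final")) {
  unzip(file)                                # unpack only when "final" is not already present
}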

Reading the files

Let us now read the three required files:

setwd("/home/sussa/Desktop/final/en_US")
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8")
news <- readLines("en_US.news.txt",  encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8")
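
If reading en_US.news.txt stops short of the full file, that is usually caused by embedded null characters in that file on some platforms. A hedged workaround (not needed to reproduce the counts reported below) is to ask readLines to skip embedded nuls:

news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)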

Descriptive statistics

Check the file sizes (in MB):

round(file.info("/home/sussa/Desktop/final/en_US/en_US.blogs.txt")$size   / 1024^2 ,2)
## [1] 200.42
round(file.info("/home/sussa/Desktop/final/en_US/en_US.news.txt")$size    / 1024^2 ,2)
## [1] 196.28
round(file.info("/home/sussa/Desktop/final/en_US/en_US.twitter.txt")$size / 1024^2 ,2)
## [1] 159.36

Number of lines

length(blogs)
## [1] 899288
length(news)
## [1] 1010242
length(twitter)
## [1] 2360148

Characters per line

summary(nchar(blogs))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1      47     156     230     329   40830
summary(nchar(news))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0   110.0   185.0   201.2   268.0 11380.0
summary(nchar(twitter))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   37.00   64.00   68.68  100.00  140.00

Word counts

summary(a <- stri_count_words(blogs))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   28.00   41.75   60.00 6726.00
summary(b <- stri_count_words(news))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.41   46.00 1796.00
summary(c <- stri_count_words(twitter))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.75   18.00   47.00
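
For easier comparison, the same basic statistics can be gathered into a single data frame (a sketch reusing the objects created above, including the word-count vectors a, b and c):

data.frame(
  file       = c("blogs", "news", "twitter"),
  lines      = c(length(blogs), length(news), length(twitter)),
  words      = c(sum(a), sum(b), sum(c)),
  mean.words = round(c(mean(a), mean(b), mean(c)), 2)
)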

Basic plots

qplot(stri_count_words(blogs)) + xlim(0,300) + theme_bw()

qplot(stri_count_words(news)) + xlim(0,200) + theme_bw()

qplot(stri_count_words(twitter)) + xlim(0,50) + theme_bw()

Sampling

Since the original data sets are quite large, we'll sample 1,000 lines from each file. The samples are much easier to handle.

sample.blogs   <- sample(blogs, 1000)
sample.news    <- sample(news, 1000)
sample.twitter <- sample(twitter, 1000)
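
An alternative (not used here) is to sample a fixed fraction of each file rather than a fixed number of lines, so that each source contributes in proportion to its size. A sketch, assuming a 1% sample and hypothetical object names:

sample.frac     <- 0.01
sample.blogs2   <- sample(blogs,   round(length(blogs)   * sample.frac))
sample.news2    <- sample(news,    round(length(news)    * sample.frac))
sample.twitter2 <- sample(twitter, round(length(twitter) * sample.frac))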

Data Cleaning

We're now ready to create a corpus and clean the (sample) data. The goal here is to convert all words to lowercase, remove punctuation, numbers, extra whitespace, stopwords and profanity, and stem the documents. The profanity list used below is the publicly available "full list of bad words banned by Google" text file; it is read in as a character vector so it can be passed to removeWords.

data.profanity <- read.table("/home/sussa/Desktop/final/full-list-of-bad-words-banned-by-google-txt-file_2013_11_26_04_53_31_867.txt",
                             header = FALSE, sep = "\n", quote = "",
                             stringsAsFactors = FALSE)[, 1]  # keep a character vector, not a data frame

Function get.corpus()

get.corpus <- function(data, rm_words) {
  corpus <- Corpus(VectorSource(data))
  corpus <- tm_map(corpus, content_transformer(tolower))  # lowercase
  corpus <- tm_map(corpus, stripWhitespace)               # collapse extra whitespace
  corpus <- tm_map(corpus, removePunctuation)             # drop punctuation
  corpus <- tm_map(corpus, removeNumbers)                 # drop numbers
  corpus <- tm_map(corpus, stemDocument)                  # stem each document
  corpus <- tm_map(corpus, removeWords, rm_words)         # remove stopwords and profanity
  return(corpus)
}

Creating corpora:

corpus.blogs <- get.corpus(sample.blogs, c(stopwords('english'), data.profanity))
corpus.news <- get.corpus(sample.news, c(stopwords('english'), data.profanity))
corpus.twitter <- get.corpus(sample.twitter, c(stopwords('english'), data.profanity))

Tokenisation

Three tokeniser functions built with RWeka's NGramTokenizer:

onegram.tokeniser <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
twogram.tokeniser <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
threegram.tokeniser <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
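
A quick sanity check on a plain string (the exact ordering of the tokens may vary with the Weka version):

twogram.tokeniser("this is a simple test")
# expected bigrams: "this is" "is a" "a simple" "simple test"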

Data Analysis

Creating unigrams:

blogs.unigrams1 <- TermDocumentMatrix(corpus.blogs, control = list(tokenize = onegram.tokeniser))
news.unigrams1 <- TermDocumentMatrix(corpus.news, control = list(tokenize = onegram.tokeniser))
twitter.unigrams1 <- TermDocumentMatrix(corpus.twitter, control = list(tokenize = onegram.tokeniser))

Sort:

blogs.unigrams1a <- sort(rowSums(as.matrix(blogs.unigrams1)), decreasing = TRUE)
head(blogs.unigrams1a, 10)
##  one like time will just  can  get know make year 
##  136  125  120  117  109  103  101   89   87   85
news.unigrams1a <- sort(rowSums(as.matrix(news.unigrams1)), decreasing = TRUE)
head(news.unigrams1a, 10)
## said will year  can  one time  new  get also  two 
##  232  120   82   75   75   73   71   69   66   61
twitter.unigrams1a <- sort(rowSums(as.matrix(twitter.unigrams1)), decreasing = TRUE)
head(twitter.unigrams1a, 10)
##    get   just   good   like   love  thank    day   time follow   make 
##     64     62     56     54     52     48     47     43     41     41

Plots

barplot(head(blogs.unigrams1a, 10), cex.names = .7)

barplot(head(news.unigrams1a, 10), cex.names = .7)

barplot(head(twitter.unigrams1a, 10), cex.names = .7)
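
Since ggplot2 is already loaded, the same frequencies can also be drawn as a ggplot bar chart. A sketch for the blogs unigrams (the other corpora work the same way; the blogs.top10 name is illustrative):

blogs.top10 <- data.frame(term = names(head(blogs.unigrams1a, 10)),
                          freq = head(blogs.unigrams1a, 10))
ggplot(blogs.top10, aes(x = reorder(term, -freq), y = freq)) +
  geom_bar(stat = "identity") +       # bar height = observed frequency
  labs(x = NULL, y = "Frequency") +
  theme_bw()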

Creating twograms:

blogs.twograms2 <- TermDocumentMatrix(corpus.blogs, control = list(tokenize = twogram.tokeniser))
news.twograms2 <- TermDocumentMatrix(corpus.news, control = list(tokenize = twogram.tokeniser))
twitter.twograms2 <- TermDocumentMatrix(corpus.twitter, control = list(tokenize = twogram.tokeniser))
## Warning in mclapply(unname(content(x)), termFreq, control): scheduled
## cores 2 encountered errors in user code, all values of the jobs will be
## affected
## Warning in simple_triplet_matrix(i = i, j = j, v = as.numeric(v), nrow =
## length(allTerms), : NAs introduced by coercion
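
The warning above is raised while tm runs its transformations through parallel::mclapply, and RWeka tokenisers are known not to cooperate well with forked workers. If a term-document matrix comes back empty or with NAs, the common workaround (an assumption based on that known issue, not something verified in this report) is to force a single core before building it:

options(mc.cores = 1)  # run tm's internal mclapply sequentially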

Sort:

blogs.twograms2a <- sort(rowSums(as.matrix(blogs.twograms2)), decreasing = TRUE)
head(blogs.twograms2a, 10)
##  make sure   year old  dont know  last week  last year  look like 
##          9          9          8          7          7          7 
##    can see   can tell dont think  dont want 
##          6          6          6          6
news.twograms2a <- sort(rowSums(as.matrix(news.twograms2)), decreasing = TRUE)
head(news.twograms2a, 10)
##         new york        last year      high school        last week 
##               15               13                9                9 
## attorney general    san francisco          st loui         two year 
##                7                7                7                7 
##        dont want       free agent 
##                6                6
twitter.twograms2a <- sort(rowSums(as.matrix(twitter.twograms2)), decreasing = TRUE)
head(twitter.twograms2a, 10)
## look forward    cant wait    come back   last night   becaus can 
##            5            4            4            4            3 
##     im sorri      im sure      join us     let know    look like 
##            3            3            3            3            3

Plots

barplot(head(blogs.twograms2a, 10), las = 2, cex.names = .7)

barplot(head(news.twograms2a, 10), las = 2, cex.names = .7)

barplot(head(twitter.twograms2a, 10), las = 2, cex.names = .7)

Creating threegrams:

blogs.threegrams3 <- TermDocumentMatrix(corpus.blogs, control = list(tokenize = threegram.tokeniser))
news.threegrams3 <- TermDocumentMatrix(corpus.news, control = list(tokenize = threegram.tokeniser))
twitter.threegrams3 <- TermDocumentMatrix(corpus.twitter, control = list(tokenize = threegram.tokeniser))

Sort:

blogs.threegrams3a <- sort(rowSums(as.matrix(blogs.threegrams3)), decreasing = TRUE)
head(blogs.threegrams3a, 10)
##   doubl jog stroller       move upon face        ad hill everi 
##                    4                    3                    2 
##     believ one minut        bell n whistl        cant wait get 
##                    2                    2                    2 
##    command dawn rise copyright appel film       dont get wrong 
##                    2                    2                    2 
##  drama angst section 
##                    2
news.threegrams3a <- sort(rowSums(as.matrix(news.threegrams3)), decreasing = TRUE)
head(news.threegrams3a, 10)
##              new york citi               two year ago 
##                          3                          3 
##          accord fire media      attorney general eric 
##                          2                          2 
##          bank credit union             cent per share 
##                          2                          2 
## cuyahoga counti commission        famili aquat center 
##                          2                          2 
##          fire media affair            first time sinc 
##                          2                          2
twitter.threegrams3a <- sort(rowSums(as.matrix(twitter.threegrams3)), decreasing = TRUE)
head(twitter.threegrams3a, 10)
##           ass ass ass       busi onlin know        get busi onlin 
##                     2                     2                     2 
##      get grandson dog   grandson dog collar happi practic compass 
##                     2                     2                     2 
##         look get busi      make one product       one product one 
##                     2                     2                     2 
##       onlin know goal 
##                     2

Plots

barplot(head(blogs.threegrams3a, 10), las = 2, cex.names = .7)

barplot(head(news.threegrams3a, 10), las = 2, cex.names = .7)

barplot(head(twitter.threegrams3a, 10), las = 2, cex.names = .7)

Possible extensions

  1. Improve the data cleaning
  2. Separate the data set into training and testing sets (see the sketch below)
  3. Use a larger sample
  4. Build a predictive model using naiveBayes or another classifier
  5. Create a Shiny application
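
For extension 2, a minimal sketch of a training/testing split on one of the sampled files (the 80/20 proportion and the object names are illustrative):

train.idx   <- sample(seq_along(sample.blogs), size = round(0.8 * length(sample.blogs)))
train.blogs <- sample.blogs[train.idx]   # 80% of the sampled lines for training
test.blogs  <- sample.blogs[-train.idx]  # remaining 20% held out for testing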