Data Science Capstone Milestone Report

Andria Hall

Sunday, March 20, 2016

Task 1: Getting and Cleaning the Data

This milestone report is the first in a set of tasks in building a predictive model based on understanding the distribution and relationships among words, tokens and phrases in texts.

The text datasets are in four different languages, English, German, Finish and Russia. For this report the English text dataset will be explored and are taken from en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt made available by Coursera-SwiftKey.

The plan taken in this task to access, deploy and clean the text datasets and will undergo the following procesess:

Loading knitr

Since the report requires writing code chunks in the R markdown documents, set echo = TRUE and results = hold as global options so that the codes can be review for analysis.

library(knitr)
opts_chunk$set(echo = TRUE, results = 'hold')

Loading the library

# load required packages
suppressMessages(require("tm"))
suppressMessages(require("R.utils"))
suppressMessages(require("ggplot2"))
setwd("C:/Users/Andria/Data-Science-Capstone")

Summary of the datasets

1. Collecting file size information on each text dataset

file.info("final/en_US/en_US.blogs.txt")$size/1024^2
file.info("final/en_US/en_US.twitter.txt")$size/1024^2
file.info("final/en_US/en_US.news.txt")$size/1024^2

## [1] 200.4242
## [1] 159.3641
## [1] 196.2775

2. Collecting data on the number of lines in the blogs dataset

conblogs <- file("final/en_US/en_US.blogs.txt", open = "rb")
blogs <- readLines(conblogs, encoding = "UTF=8")
close(conblogs)
rm(conblogs)

# number of lines in en_US.blogs.txt dataset 
summary(blogs)

##    Length     Class      Mode 
##    899288 character character

3. Collecting data on the number of lines in the twitter dataset

contwit <- file("final/en_US/en_US.twitter.txt", open = "rb")
twitter <- readLines(contwit, encoding = "UTF=8")

## Warning in readLines(contwit, encoding = "UTF=8"): line 167155 appears to
## contain an embedded nul

## Warning in readLines(contwit, encoding = "UTF=8"): line 268547 appears to
## contain an embedded nul

## Warning in readLines(contwit, encoding = "UTF=8"): line 1274086 appears to
## contain an embedded nul

## Warning in readLines(contwit, encoding = "UTF=8"): line 1759032 appears to
## contain an embedded nul

close(contwit)
rm(contwit)

# number of lines in en_US.twitter.txt dataset 
summary(twitter)

##    Length     Class      Mode 
##   2360148 character character

4. Collecting data on the number of lines in the news dataset

# Reading the news dataset 
conews <- file("final/en_US/en_US.news.txt", open = "rb")
news <- readLines(conews, encoding = "UTF=8")
close(conews)
rm(conews)

# number of lines in en_US.news.txt dataset 
summary(news)

##    Length     Class      Mode 
##   1010242 character character

Data Aquisition

Since the dataset is very large only a sampled dataset will we used in our analysis

blogs <- readLines("final/en_US/en_US.news.txt",  20000)
news <- readLines("final/en_US/en_US.news.txt",  20000)
twitter <- readLines("final/en_US/en_US.twitter.txt", 20000)

Profanity Filtering

Available from: https://gist.github.com/jamiew/1112488

profanity <- readLines("final/banned_words.txt", skipNul = T)

## Warning in readLines("final/banned_words.txt", skipNul = T): incomplete
## final line found on 'final/banned_words.txt'

library(RWeka)
sampledData <- sample(paste(blogs, news, twitter), size = 10000, replace = TRUE)
rm(blogs, news, twitter)

Building the Corpus

corpus <- Corpus(VectorSource(sampledData))
rm(sampledData)

Data cleaning

# Cleaning the sampledData
  corpus <- tm_map(corpus, tolower)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, function(x) gsub('[^[:alnum:] ]', "", x))
  corpus <- tm_map(corpus, function(x) gsub('[])(;:#%$^*\\~{}[&+=@/"`|<>_]+', "", x))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, PlainTextDocument)
  corpus <- tm_map(corpus, removeWords, profanity)

# remove profanity
rm(profanity)

Building the Term Document Matrix

Tokenizing `cleanset` into Unigrams, Bigrams and Trigrams

uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

uniGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = uniGramTokenizer))
biGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = biGramTokenizer))
triGramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = triGramTokenizer))

Analysis

Bar plot showing frequency of words occuring in n-grams

freqTerms <- findFreqTerms(uniGramMatrix, lowfreq = 1000)
termFrequency <- rowSums(as.matrix(uniGramMatrix[freqTerms,]))
termFrequency <- data.frame(unigram=names(termFrequency), frequency=termFrequency)
termFrequency

##        unigram frequency
## also      also      1315
## can        can      1490
## first    first      1113
## get        get      1314
## good      good      1028
## just      just      1684
## last      last      1216
## like      like      1573
## new        new      1574
## now        now      1124
## one        one      2037
## people  people      1203
## said      said      5021
## state    state      1058
## time      time      1311
## two        two      1214
## will      will      2410
## year      year      1305
## years    years      1050

g <- ggplot(termFrequency, aes(x=reorder(unigram, frequency), y=frequency)) +
    geom_bar(stat = "identity", fill = "blue") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Unigram") + ylab("Frequency") +
    labs(title = "Top Unigrams by Frequency")
print(g)

freqTerms <- findFreqTerms(biGramMatrix, lowfreq = 50)
termFrequency <- rowSums(as.matrix(biGramMatrix[freqTerms,]))
termFrequency <- data.frame(bigram=names(termFrequency), frequency=termFrequency)
termFrequency

##                          bigram frequency
## can get                 can get        62
## cant wait             cant wait        68
## dont know             dont know       103
## dont think           dont think        57
## dont want             dont want        65
## even though         even though        64
## every day             every day        50
## feel like             feel like        98
## first time           first time        82
## four years           four years        57
## general manager general manager        66
## health care         health care        74
## high school         high school       169
## im going               im going        68
## last month           last month        97
## last night           last night        70
## last season         last season        60
## last week             last week       126
## last year             last year       275
## little bit           little bit        55
## looking forward looking forward        57
## looks like           looks like        57
## los angeles         los angeles        90
## make sure             make sure        96
## many people         many people        56
## new jersey           new jersey       118
## new york               new york       214
## next week             next week        58
## next year             next year        54
## officials said   officials said        80
## percent percent percent percent        50
## police said         police said        61
## right now             right now       146
## said im                 said im        51
## said will             said will        58
## san diego             san diego        69
## san francisco     san francisco        79
## st louis               st louis       178
## supreme court     supreme court        57
## three years         three years        64
## two weeks             two weeks        55
## two years             two years        93
## united states     united states        85
## white house         white house        59
## years ago             years ago       138

g <- ggplot(termFrequency, aes(x=reorder(bigram, frequency), y=frequency)) +
    geom_bar(stat = "identity", fill = "purple") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Bigram") + ylab("Frequency") +
    labs(title = "Top Bigrams by Frequency")
print(g)

freqTerms <- findFreqTerms(triGramMatrix, lowfreq = 15)
termFrequency <- rowSums(as.matrix(triGramMatrix[freqTerms,]))
termFrequency <- data.frame(trigram=names(termFrequency), frequency=termFrequency)
termFrequency

##                                       trigram frequency
## âhe hasnât really           âhe hasnât really        16
## cant wait see                   cant wait see        15
## east st louis                   east st louis        16
## four years ago                 four years ago        16
## long way go                       long way go        16
## new york city                   new york city        23
## new york times                 new york times        16
## past two years                 past two years        18
## president barack obama president barack obama        26
## st louis county               st louis county        24
## three years ago               three years ago        16
## two years ago                   two years ago        26
## us district court           us district court        16
## us supreme court             us supreme court        16

g <- ggplot(termFrequency, aes(x=reorder(trigram, frequency), y=frequency)) +
    geom_bar(stat = "identity", fill = "red") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Trigram") + ylab("Frequency") +
    labs(title = "Top Trigrams by Frequency")
print(g)