First, let's load the datasets and set up the environment.
library(ggplot2)
library(NLP)
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(tm)
library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
blog_con <- file("en_US/en_US.blogs.txt", "r")
blog_us <-readLines(blog_con) # Read the whole thing
close(blog_con) ## It's important to close the connection when you are done
twt_con <- file("en_US/en_US.twitter.txt", "r")
twt_us <-readLines(twt_con) # Read the whole thing
close(twt_con) ## It's important to close the connection when you are done
news_con <- file("en_US/en_US.news.txt", "r")
news_us <-readLines(news_con) # Read the whole thing
close(news_con) ## It's important to close the connection when you are done
curse_con <- file("cursewords.txt", "r")
cursewords <-readLines(curse_con) # Read the whole thing
close(curse_con) ## It's important to close the connection when you are done
## Check the line counts
summary(blog_us)
## Length Class Mode
## 899288 character character
summary(twt_us)
## Length Class Mode
## 2360148 character character
summary(news_us)
## Length Class Mode
## 77259 character character
## Check the word counts
sum(sapply(strsplit(blog_us, " "), length))
## [1] 37334131
sum(sapply(strsplit(twt_us, " "), length))
## [1] 30373543
sum(sapply(strsplit(news_us, " "), length))
## [1] 2643969
From what we can see, the Twitter dataset has the largest number of lines and words. The full data is massive, so let's sample a portion of each dataset for efficiency and runtime purposes.
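The sampling chunk itself is not shown here. Below is a minimal sketch that is consistent with the line counts that follow (roughly a 1% random sample of each source); the seed and the ceiling() rounding are assumptions, not the original code.
set.seed(1234) # hypothetical seed for reproducibility
new_blog <- sample(blog_us, ceiling(length(blog_us) * 0.01)) # ~1% of blog lines
new_twt <- sample(twt_us, ceiling(length(twt_us) * 0.01)) # ~1% of tweets
new_news <- sample(news_us, ceiling(length(news_us) * 0.01)) # ~1% of news lines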
## Check the line counts
summary(new_blog)
## Length Class Mode
## 8993 character character
summary(new_twt)
## Length Class Mode
## 23602 character character
summary(new_news)
## Length Class Mode
## 773 character character
## Check the word counts
sum(sapply(strsplit(new_blog, " "), length))
## [1] 372659
sum(sapply(strsplit(new_twt, " "), length))
## [1] 304839
sum(sapply(strsplit(new_news, " "), length))
## [1] 26055
## Aggregate the sampled data into a single list
list.set <- list(twitter = new_twt, blog = new_blog, news = new_news)
# Create corpus and clean the data
corpus <- VCorpus(VectorSource(list.set))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, content_transformer(tolower)) # wrap base tolower so documents stay PlainTextDocuments; no final PlainTextDocument re-wrap needed
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removeWords, cursewords)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
Let's check whether any of the top unigrams are found in all three text sources.
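The per-source frequency tables used below (most_twitter, most_blog, most_news) are not built in the code shown above. Here is a minimal sketch of how they could be derived from the cleaned corpus, assuming a tm TermDocumentMatrix whose columns follow the list.set order (twitter, blog, news); the top_words() helper and the cutoff of 30 terms are assumptions.
tdm <- TermDocumentMatrix(corpus) # terms x 3 documents
freq <- as.matrix(tdm) # manageable size for a 1% sample
top_words <- function(counts, n = 30) { # hypothetical helper: n most frequent terms
  counts <- sort(counts, decreasing = TRUE)[1:n]
  data.frame(word = names(counts), freq = unname(counts))
}
most_twitter <- top_words(freq[, 1])
most_blog <- top_words(freq[, 2])
most_news <- top_words(freq[, 3])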
## Get the intersect of the 3
most_all <- data.frame(word = Reduce(intersect, list(most_blog$word, most_news$word, most_twitter$word)))
most_all <- arrange(most_all, word)
most_all
## word
## 1 back
## 2 can
## 3 get
## 4 going
## 5 good
## 6 just
## 7 like
## 8 make
## 9 new
## 10 now
## 11 one
## 12 people
## 13 time
There are 13 words found in all three text sources.
Now let's check which bigrams and trigrams have the highest frequency.
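One way to compute these counts with quanteda (already loaded above) is tokens_ngrams() on the sampled text. This is a sketch rather than the code used in the original analysis; the cleaning options are chosen to mirror the tm steps above.
toks <- tokens(c(new_twt, new_blog, new_news),
               remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, c(stopwords("en"), cursewords)) # drop stop words and profanity
bigram_dfm <- dfm(tokens_ngrams(toks, n = 2))
trigram_dfm <- dfm(tokens_ngrams(toks, n = 3))
topfeatures(bigram_dfm, 10) # ten most frequent bigrams
topfeatures(trigram_dfm, 10) # ten most frequent trigrams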