library(tm)
## Loading required package: NLP

Getting the data:

# Fetch the Coursera-SwiftKey archive once and unpack it into ./final.
fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFile <- "Coursera-SwiftKey.zip"
if (!file.exists(zipFile)) {
  # mode = "wb" writes the zip as binary (required on Windows); R's default
  # download method is used so no external curl executable is needed.
  download.file(fileUrl, destfile = zipFile, mode = "wb")
}
# Only extract when the English files read later in this report are missing,
# so repeated runs do not unpack (and overwrite) the whole archive again.
enFiles <- file.path("final", "en_US",
                     c("en_US.news.txt", "en_US.twitter.txt", "en_US.blogs.txt"))
if (!all(file.exists(enFiles))) {
  unzip(zipFile)
}

News data:

# Record the news file size in MB, then read every line into `news`.
newsPath <- "final/en_US/en_US.news.txt"
fileSize <- format(file.info(newsPath)$size/(1024^2), digits = 0)
# "rb" reads the raw bytes without platform text-mode translation.
con <- file(newsPath, "rb")
# encoding = "UTF-8" marks the strings so non-ASCII characters do not depend
# on the native locale (assumes the corpus is UTF-8 -- TODO confirm);
# skipNul = TRUE drops embedded nul bytes instead of warning about them.
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

The news file is 196 MB and contains 1,010,242 lines. Because the file is so large, I will use a random sample of 10,000 lines.

Cleaning data: create a corpus, convert it to lowercase, and remove punctuation, numbers, stop words, and extra whitespace.

# Sample the data. The seed makes the report reproducible, and the size is
# capped at the number of available lines so sample() cannot fail on a
# shorter input.
set.seed(1234)
news <- sample(news, min(10000, length(news)))

# Create corpus
corpus <- Corpus(VectorSource(news))

# To lower case
corpus <- tm_map(corpus, content_transformer(tolower))

# Remove stop words BEFORE punctuation: the English stop word list contains
# contractions such as "don't" and "i'm", which would no longer match once
# the apostrophes have been stripped.
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# Remove punctuation marks
corpus <- tm_map(corpus, removePunctuation)

# Remove numbers
corpus <- tm_map(corpus, removeNumbers)

# Collapse the runs of whitespace left behind by the removals above
corpus <- tm_map(corpus, stripWhitespace)

Create a term matrix and count terms

# Count how often each term occurs across the sampled documents and keep the
# ten most frequent. The term-document matrix is a sparse triplet matrix
# (row index `i`, value `v`), so the per-term totals are summed straight from
# the triplets instead of expanding the matrix into a dense terms x documents
# array with as.matrix().
tdm <- TermDocumentMatrix(corpus)
terms <- Terms(tdm)
termCounts <- tapply(tdm$v, factor(tdm$i, levels = seq_along(terms)), sum)
termCounts <- setNames(as.vector(termCounts), terms)
frequentWords <- head(sort(termCounts, decreasing = TRUE), 10)

barplot(frequentWords, 
        main = "News Data: Most Frequent Words", 
        xlab="Word", 
        ylab = "Count")

Twitter data:

# Record the Twitter file size in MB, then read every line into `twitter`.
twitterPath <- "final/en_US/en_US.twitter.txt"
fileSize <- format(file.info(twitterPath)$size/(1024^2), digits = 0)
# "rb" reads the raw bytes without platform text-mode translation, matching
# how the news file is opened.
con <- file(twitterPath, "rb")
# Some lines of this file contain embedded nul bytes; skipNul = TRUE drops
# them silently instead of emitting an "embedded nul" warning per line.
# encoding = "UTF-8" marks the strings so non-ASCII characters do not depend
# on the native locale (assumes the corpus is UTF-8 -- TODO confirm).
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

The Twitter file is 159 MB. We will use a random sample of 10,000 lines for our analysis.

Cleaning data: create a corpus, convert it to lowercase, and remove punctuation, numbers, stop words, and extra whitespace.

# Sample the data. The seed makes the report reproducible, and the size is
# capped at the number of available lines so sample() cannot fail on a
# shorter input.
set.seed(1234)
twitter <- sample(twitter, min(10000, length(twitter)))

# Create corpus
corpus <- Corpus(VectorSource(twitter))

# To lower case
corpus <- tm_map(corpus, content_transformer(tolower))

# Remove stop words BEFORE punctuation: the English stop word list contains
# contractions such as "don't" and "i'm", which would no longer match once
# the apostrophes have been stripped.
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# Remove punctuation marks
corpus <- tm_map(corpus, removePunctuation)

# Remove numbers
corpus <- tm_map(corpus, removeNumbers)

# Collapse the runs of whitespace left behind by the removals above
corpus <- tm_map(corpus, stripWhitespace)

Now that the corpus is clean, we can create a term-document matrix and find the most frequent terms.

# Count how often each term occurs across the sampled tweets and keep the ten
# most frequent. No control options are passed to TermDocumentMatrix(), so
# tm's defaults apply (terms shorter than three characters are dropped).
# The term-document matrix is a sparse triplet matrix (row index `i`,
# value `v`), so the per-term totals are summed straight from the triplets
# instead of expanding the matrix into a dense array with as.matrix().
tdm <- TermDocumentMatrix(corpus)
terms <- Terms(tdm)
termCounts <- tapply(tdm$v, factor(tdm$i, levels = seq_along(terms)), sum)
termCounts <- setNames(as.vector(termCounts), terms)
frequentWords <- head(sort(termCounts, decreasing = TRUE), 10)

barplot(frequentWords, 
        main = "Twitter Data: Most Frequent Words", 
        xlab="Word", 
        ylab = "Count")

Blogs:

# Record the blogs file size in MB, then read every line into `blogs`.
blogsPath <- "final/en_US/en_US.blogs.txt"
fileSize <- format(file.info(blogsPath)$size/(1024^2), digits = 0)
# "rb" reads the raw bytes without platform text-mode translation, matching
# how the news file is opened.
con <- file(blogsPath, "rb")
# encoding = "UTF-8" marks the strings so non-ASCII characters do not depend
# on the native locale (assumes the corpus is UTF-8 -- TODO confirm);
# skipNul = TRUE drops embedded nul bytes instead of warning about them.
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

The blogs file is 200 MB. Again, we will use a random sample of 10,000 lines.

Cleaning data: create a corpus, convert it to lowercase, and remove punctuation, numbers, stop words, and extra whitespace.

# Sample the data. The seed makes the report reproducible, and the size is
# capped at the number of available lines so sample() cannot fail on a
# shorter input.
set.seed(1234)
blogs <- sample(blogs, min(10000, length(blogs)))

# Create corpus
corpus <- Corpus(VectorSource(blogs))

# To lower case
corpus <- tm_map(corpus, content_transformer(tolower))

# Remove stop words BEFORE punctuation: the English stop word list contains
# contractions such as "don't" and "i'm", which would no longer match once
# the apostrophes have been stripped.
corpus <- tm_map(corpus, removeWords, stopwords("english"))

# Remove punctuation marks
corpus <- tm_map(corpus, removePunctuation)

# Remove numbers
corpus <- tm_map(corpus, removeNumbers)

# Collapse the runs of whitespace left behind by the removals above
corpus <- tm_map(corpus, stripWhitespace)

Now that the corpus is clean, we can create a term-document matrix and find the most frequent terms.

# Count how often each term occurs across the sampled blog entries and keep
# the ten most frequent. The term-document matrix is a sparse triplet matrix
# (row index `i`, value `v`), so the per-term totals are summed straight from
# the triplets instead of expanding the matrix into a dense terms x documents
# array with as.matrix().
tdm <- TermDocumentMatrix(corpus)
terms <- Terms(tdm)
termCounts <- tapply(tdm$v, factor(tdm$i, levels = seq_along(terms)), sum)
termCounts <- setNames(as.vector(termCounts), terms)
frequentWords <- head(sort(termCounts, decreasing = TRUE), 10)

barplot(frequentWords, 
        main = "Blog Data: Most Frequent Words", 
        xlab="Word", 
        ylab = "Count")

Future Analysis/Plans:

Build additional n-gram models (bigrams and trigrams), then use them to create a prediction model.