This project presents a brief summary of three data sets from HC Corpora. Although the corpora cover four languages (German, Finnish, Russian, and English), only the American English corpus is used in this exercise. More specifically, we analyse the en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt files.
First, let us clean the workspace and set the working directory:
rm(list=ls())
setwd("/home/sussa/Desktop")
Load the necessary packages:
library(RCurl)
library(rJava)
library(RWeka)
library(tm)
library(dplyr)
library(parallel)
library(ggplot2)
library(SnowballC)
library(tau)
library(slam)
library(stringr)
library(stringi)
Set the global seed for reproducibility:
set.seed(1234)
Download and unzip the data:
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
file <- "Coursera-SwiftKey.zip"
download.file(url, file, method = "wget")
unzip(file)
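On repeated runs, a small guard avoids downloading the large archive again (optional; it reuses the url and file objects above):
# Skip the download/unzip if the archive is already present
if (!file.exists(file)) {
  download.file(url, file, method = "wget")  # method = "auto" also works if wget is unavailable
  unzip(file)
}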
Let us now read the three required files:
setwd("/home/sussa/Desktop/final/en_US")
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8")
news <- readLines("en_US.news.txt", encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8")
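If readLines() stops early with embedded-NUL warnings (this can happen with en_US.news.txt on some platforms), skipNul = TRUE is a common workaround:
# Only needed when the plain call above truncates the file
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)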
Check the file sizes (in MB):
round(file.info("/home/sussa/Desktop/final/en_US/en_US.blogs.txt")$size / 1024^2 ,2)
## [1] 200.42
round(file.info("/home/sussa/Desktop/final/en_US/en_US.news.txt")$size / 1024^2 ,2)
## [1] 196.28
round(file.info("/home/sussa/Desktop/final/en_US/en_US.twitter.txt")$size / 1024^2 ,2)
## [1] 159.36
Number of lines:
length(blogs)
## [1] 899288
length(news)
## [1] 1010242
length(twitter)
## [1] 2360148
Characters per line:
summary(nchar(blogs))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##       1      47     156     230     329   40830
summary(nchar(news))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##     1.0   110.0   185.0   201.2   268.0 11380.0
summary(nchar(twitter))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##    2.00   37.00   64.00   68.68  100.00  140.00
The maximum of 140 characters in the Twitter file matches the tweet length limit at the time, a useful sanity check on the data.
Word counts:
summary(stri_count_words(blogs))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##    0.00    9.00   28.00   41.75   60.00 6726.00
summary(stri_count_words(news))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##    1.00   19.00   32.00   34.41   46.00 1796.00
summary(stri_count_words(twitter))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##    1.00    7.00   12.00   12.75   18.00   47.00
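For reference, the same figures can be gathered into a single overview table (a sketch reusing the objects loaded above):
# One row per file: line and word totals
data.frame(file  = c("blogs", "news", "twitter"),
           lines = c(length(blogs), length(news), length(twitter)),
           words = c(sum(stri_count_words(blogs)),
                     sum(stri_count_words(news)),
                     sum(stri_count_words(twitter))))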
Basic plots of the word-count distributions:
qplot(stri_count_words(blogs)) + xlim(0,300) + theme_bw()
qplot(stri_count_words(news)) + xlim(0,200) + theme_bw()
qplot(stri_count_words(twitter)) + xlim(0,50) + theme_bw()
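A caveat on these plots: xlim() drops observations beyond the limit (ggplot2 warns about removed rows). To zoom in without discarding data, coord_cartesian() can be used instead, e.g.:
# Zoom the x-axis while keeping the long right tail in the computation
qplot(stri_count_words(blogs)) + coord_cartesian(xlim = c(0, 300)) + theme_bw()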
Since the original data sets are quite large, we’ll sample 1,000 lines from each file; the samples are much easier to handle.
sample.blogs <- sample(blogs, 1000)
sample.news <- sample(news, 1000)
sample.twitter <- sample(twitter, 1000)
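An alternative, when a proportional rather than fixed-size sample is wanted, is to keep each line with some probability (a sketch; the 1% rate is arbitrary and this variant is not used below):
# Keep each line independently with probability 0.01 (not used below)
sample.blogs.1pct <- blogs[as.logical(rbinom(length(blogs), 1, 0.01))]
The fixed 1,000-line samples above are kept for everything that follows.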
We’re now ready to create a corpus from each sample and clean the data: convert all words to lowercase, strip extra whitespace, remove punctuation, numbers, stopwords, and profanity, and stem the documents. The profanity word list can be downloaded here.
data.profanity <- read.table("/home/sussa/Desktop/final/full-list-of-bad-words-banned-by-google-txt-file_2013_11_26_04_53_31_867.txt",
                             header = FALSE, sep = "\n", quote = "",
                             stringsAsFactors = FALSE)$V1  # $V1 extracts a character vector; removeWords needs that, not a data frame
The helper function get.corpus() chains these steps. Note that removeWords runs after stemDocument, so stemmed contraction forms such as "dont" and "becaus" survive stopword removal and appear in the n-gram tables below:
get.corpus <- function(data, rm_words) {
  corpus <- Corpus(VectorSource(data))
  corpus <- tm_map(corpus, content_transformer(tolower))  # lowercase
  corpus <- tm_map(corpus, stripWhitespace)               # collapse whitespace
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stemDocument)                  # Porter stemming
  corpus <- tm_map(corpus, removeWords, rm_words)         # stopwords + profanity
  return(corpus)
}
Creating corpora:
corpus.blogs <- get.corpus(sample.blogs, c(stopwords('english'), data.profanity))
corpus.news <- get.corpus(sample.news, c(stopwords('english'), data.profanity))
corpus.twitter <- get.corpus(sample.twitter, c(stopwords('english'), data.profanity))
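A quick sanity check that the cleaning behaved as expected (optional):
# Inspect the first cleaned document in the blogs sample
as.character(corpus.blogs[[1]])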
Three tokeniser functions built with RWeka’s NGramTokenizer:
onegram.tokeniser <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
twogram.tokeniser <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
threegram.tokeniser <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
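Since the three functions differ only in n, a small factory would produce them all (an equivalent alternative, not used below):
# Returns a tokeniser that extracts n-grams for any given n
ngram.tokeniser <- function(n) {
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}
# e.g. twogram.tokeniser <- ngram.tokeniser(2)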
Creating unigrams:
blogs.unigrams1 <- TermDocumentMatrix(corpus.blogs, control = list(tokenize = onegram.tokeniser))
news.unigrams1 <- TermDocumentMatrix(corpus.news, control = list(tokenize = onegram.tokeniser))
twitter.unigrams1 <- TermDocumentMatrix(corpus.twitter, control = list(tokenize = onegram.tokeniser))
Sort:
blogs.unigrams1a <- sort(rowSums(as.matrix(blogs.unigrams1)), decreasing = TRUE)
head(blogs.unigrams1a, 10)
## one like time will just can get know make year
## 136  125  120  117  109 103 101   89   87   85
news.unigrams1a <- sort(rowSums(as.matrix(news.unigrams1)), decreasing = TRUE)
head(news.unigrams1a, 10)
## said will year can one time new get also two
##  232  120   82  75  75   73  71  69   66  61
twitter.unigrams1a <- sort(rowSums(as.matrix(twitter.unigrams1)), decreasing = TRUE)
head(twitter.unigrams1a, 10)
## get just good like love thank day time follow make
##  64   62   56   54   52    48  47   43     41   41
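A caveat: as.matrix() densifies the whole term-document matrix, which is fine for these 1,000-line samples but would be costly on the full corpora. slam::row_sums() (the slam package is loaded above) computes the same totals on the sparse representation:
# Equivalent to rowSums(as.matrix(...)), but stays sparse
blogs.unigrams1a <- sort(slam::row_sums(blogs.unigrams1), decreasing = TRUE)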
Plots:
barplot(head(blogs.unigrams1a, 10), cex.names = .7)
barplot(head(news.unigrams1a, 10), cex.names = .7)
barplot(head(twitter.unigrams1a, 10), cex.names = .7)
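Since ggplot2 is loaded anyway, the same chart can be drawn with it (an optional alternative to barplot()):
# Top-10 blog unigrams as a ggplot2 bar chart
top10 <- head(blogs.unigrams1a, 10)
df <- data.frame(term = names(top10), freq = as.numeric(top10),
                 stringsAsFactors = FALSE)
df$term <- factor(df$term, levels = df$term)  # preserve frequency order
ggplot(df, aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  labs(x = "unigram", y = "frequency")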
Creating bigrams:
blogs.twograms2 <- TermDocumentMatrix(corpus.blogs, control = list(tokenize = twogram.tokeniser))
news.twograms2 <- TermDocumentMatrix(corpus.news, control = list(tokenize = twogram.tokeniser))
twitter.twograms2 <- TermDocumentMatrix(corpus.twitter, control = list(tokenize = twogram.tokeniser))
## Warning in mclapply(unname(content(x)), termFreq, control): scheduled
## cores 2 encountered errors in user code, all values of the jobs will be
## affected
## Warning in simple_triplet_matrix(i = i, j = j, v = as.numeric(v), nrow =
## length(allTerms), : NAs introduced by coercion
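These warnings come from tm parallelising over cores with mclapply() while RWeka’s tokeniser depends on a Java VM that does not survive forking. A widely used workaround is to force single-core processing; re-running the TermDocumentMatrix calls above should then complete without these warnings:
# Run tm's tokenisation on one core so the Java-backed RWeka
# tokeniser is never called inside a forked worker
options(mc.cores = 1)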
Sort:
blogs.twograms2a <- sort(rowSums(as.matrix(blogs.twograms2)), decreasing = TRUE)
head(blogs.twograms2a, 10)
## make sure year old dont know last week last year look like
##         9        9         8         7         7         7
## can see can tell dont think dont want
##       6        6          6         6
news.twograms2a <- sort(rowSums(as.matrix(news.twograms2)), decreasing = TRUE)
head(news.twograms2a, 10)
## new york last year high school last week
##       15        13           9         9
## attorney general san francisco st loui two year
##                7             7       7        7
## dont want free agent
##         6          6
twitter.twograms2a <- sort(rowSums(as.matrix(twitter.twograms2)), decreasing = TRUE)
head(twitter.twograms2a, 10)
## look forward cant wait come back last night becaus can
##            5         4         4          4          3
## im sorri im sure join us let know look like
##        3       3       3        3         3
Plots:
barplot(head(blogs.twograms2a, 10), las = 2, cex.names = .7)
barplot(head(news.twograms2a, 10), las = 2, cex.names = .7)
barplot(head(twitter.twograms2a, 10), las = 2, cex.names = .7)
Creating trigrams:
blogs.threegrams3 <- TermDocumentMatrix(corpus.blogs, control = list(tokenize = threegram.tokeniser))
news.threegrams3 <- TermDocumentMatrix(corpus.news, control = list(tokenize = threegram.tokeniser))
twitter.threegrams3 <- TermDocumentMatrix(corpus.twitter, control = list(tokenize = threegram.tokeniser))
Sort:
blogs.threegrams3a <- sort(rowSums(as.matrix(blogs.threegrams3)), decreasing = TRUE)
head(blogs.threegrams3a, 10)
## doubl jog stroller move upon face ad hill everi
##                  4              3             2
## believ one minut bell n whistl cant wait get
##                2             2             2
## command dawn rise copyright appel film dont get wrong
##                 2                    2              2
## drama angst section
##                   2
news.threegrams3a <- sort(rowSums(as.matrix(news.threegrams3)), decreasing = TRUE)
head(news.threegrams3a, 10)
## new york citi two year ago
##             3            3
## accord fire media attorney general eric
##                 2                     2
## bank credit union cent per share
##                 2              2
## cuyahoga counti commission famili aquat center
##                          2                   2
## fire media affair first time sinc
##                 2               2
twitter.threegrams3a <- sort(rowSums(as.matrix(twitter.threegrams3)), decreasing = TRUE)
head(twitter.threegrams3a, 10)
## ass ass ass busi onlin know get busi onlin
##           2               2              2
## get grandson dog grandson dog collar happi practic compass
##                2                   2                     2
## look get busi make one product one product one
##             2                2               2
## onlin know goal
##               2
Plots:
barplot(head(blogs.threegrams3a, 10), las = 2, cex.names = .7)
barplot(head(news.threegrams3a, 10), las = 2, cex.names = .7)
barplot(head(twitter.threegrams3a, 10), las = 2, cex.names = .7)