Data set

The data consist of text from three different sources: blogs, news, and Twitter feeds, and are provided in four different languages: German, English (US), Finnish, and Russian. For the remainder of this project, we will use only the English (US) data sets.

Summary of the English (US) data

library(stringi)

file.list <- c("final/en_US/en_US.blogs.txt", "final/en_US/en_US.news.txt", "final/en_US/en_US.twitter.txt")
text <- list(blogs = "", news = "", twitter = "")

# File size (Mb), line count, and word count for each source
data.summary <- matrix(0, nrow = 3, ncol = 3, dimnames = list(c("blogs", "news", "twitter"), c("file size, Mb", "lines", "words")))
for (i in 1:3) {
  con <- file(file.list[i], "rb")
  text[[i]] <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
  close(con)
  data.summary[i, 1] <- round(file.info(file.list[i])$size / 1024^2, 2)
  data.summary[i, 2] <- length(text[[i]])
  data.summary[i, 3] <- sum(stri_count_words(text[[i]]))
}

The data are summarized in the table below.

          file size, Mb     lines      words
blogs            200.42    899288   37546239
news             196.28   1010242   34762395
twitter          159.36   2360148   30093413

These data sets are rather large, and since the goal is to provide a proof of concept for the data analysis, for the remainder of the report we will sample a smaller fraction of the data (1%) to perform the analysis. The three parts will be combined into a single data set and used to generate the corpus.

set.seed(123)
# Sample 1% of the lines from each source and combine them
blogs_sample <- sample(text$blogs, 0.01 * length(text$blogs))
news_sample <- sample(text$news, 0.01 * length(text$news))
twitter_sample <- sample(text$twitter, 0.01 * length(text$twitter))
sampled_data <- c(blogs_sample, news_sample, twitter_sample)
sample.word.count <- sum(stri_count_words(sampled_data))

The new data set consists of 1,023,563 words.

Build the corpus

library(tm)
library(magrittr)

# Remove emoticons and other non-ASCII characters
# (sub = "" avoids introducing NAs for lines that cannot be converted)
sampled_data <- iconv(sampled_data, 'UTF-8', 'ASCII', sub = "")
corpus <- Corpus(VectorSource(sampled_data))
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)
# Build a term-document matrix and derive word frequencies
term.doc.matrix <- TermDocumentMatrix(corpus)
term.doc.matrix <- as.matrix(term.doc.matrix)
word.freqs <- sort(rowSums(term.doc.matrix), decreasing = TRUE)
dm <- data.frame(word = names(word.freqs), freq = word.freqs)

Word cloud plot of the most common words in the corpus

library(wordcloud)
library(RColorBrewer)

wordcloud(dm$word, dm$freq, min.freq = 500, random.order = TRUE, rot.per = 0.25,
          colors = brewer.pal(8, "Dark2"))

Build n-gram models

library(quanteda)

# Tokenize, remove English stopwords, and build unigram, bigram, and trigram dfms
toks0 <- tokens(char_tolower(sampled_data), remove_punct = TRUE)
toks1 <- tokens_remove(toks0, stopwords("english"))
toks2 <- tokens_ngrams(toks1, 2)
toks3 <- tokens_ngrams(toks1, 3)
myDfm_ng1 <- dfm(toks1)
myDfm_ng2 <- dfm(toks2)
myDfm_ng3 <- dfm(toks3)

# To access a list of the most frequently occurring features
myDfm_features_ng1 <- topfeatures(myDfm_ng1, 20)
myDfm_features_ng2 <- topfeatures(myDfm_ng2, 20)
myDfm_features_ng3 <- topfeatures(myDfm_ng3, 20)

# Create data frames for ggplot
topDf_ng1 <- data.frame(term = names(myDfm_features_ng1), frequency = unname(myDfm_features_ng1))
topDf_ng2 <- data.frame(term = names(myDfm_features_ng2), frequency = unname(myDfm_features_ng2))
topDf_ng3 <- data.frame(term = names(myDfm_features_ng3), frequency = unname(myDfm_features_ng3))

Visualize top 20 features

# Sort by reverse frequency order
topDf_ng1$term <- with(topDf_ng1, reorder(term, -frequency))
topDf_ng2$term <- with(topDf_ng2, reorder(term, -frequency))
topDf_ng3$term <- with(topDf_ng3, reorder(term, -frequency))

ggplot(topDf_ng1) + geom_col(aes(x=term, y=frequency), color="steelblue", fill="steelblue") +
    theme(axis.text.x=element_text(angle=90, hjust=1))

ggplot(topDf_ng2) + geom_col(aes(x=term, y=frequency), color="steelblue", fill="steelblue") +
    theme(axis.text.x=element_text(angle=90, hjust=1))

ggplot(topDf_ng3) + geom_col(aes(x=term, y=frequency), color="steelblue", fill="steelblue") +
    theme(axis.text.x=element_text(angle=90, hjust=1))

Plans for creating the prediction algorithm

We plan to build a predictive model using an n-gram (Markov chain) approach, in which the next word is predicted from the word or words that precede it. Details about this approach are available at https://sookocheff.com/post/nlp/ngram-modeling-with-markov-chains/
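As an illustration only, the sketch below shows one way such a predictor could be built from the bigram document-feature matrix already computed above (myDfm_ng2), falling back to the unigram matrix (myDfm_ng1) when a word has not been seen. The helper name predict_next and the fallback strategy are assumptions for this sketch, not part of the analysis above; a real model would also keep stop words (removed here) and would use higher-order n-grams with a proper backoff or smoothing scheme.

library(quanteda)
library(stringi)

# Illustrative sketch only: split each bigram feature ("word1_word2") into a
# lookup table mapping a previous word to candidate next words with counts.
bigram_freqs <- colSums(myDfm_ng2)
bigram_parts <- stri_split_fixed(names(bigram_freqs), "_", simplify = TRUE)
bigram_table <- data.frame(
    prev = bigram_parts[, 1],
    nxt  = bigram_parts[, 2],
    freq = unname(bigram_freqs),
    stringsAsFactors = FALSE
)

# Hypothetical helper: return the n most frequent continuations of `word`,
# falling back to the overall most frequent unigrams when `word` is unseen.
predict_next <- function(word, n = 3) {
    hits <- bigram_table[bigram_table$prev == char_tolower(word), ]
    if (nrow(hits) == 0) {
        return(names(topfeatures(myDfm_ng1, n)))
    }
    hits <- hits[order(-hits$freq), ]
    head(hits$nxt, n)
}

predict_next("happy")

In practice the frequency tables would be precomputed and stored, and prediction would consult the trigram table first before backing off to bigrams and unigrams.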