INTRODUCTION

The first step in building a predictive model for text is understanding the distribution of, and the relationships between, the words, tokens, and phrases in the text. The goal of this task is to understand the basic relationships you observe in the data and to prepare to build your first linguistic models.

Tasks to accomplish

Exploratory analysis - perform a thorough exploratory analysis of the data, understanding the distribution of words and relationship between the words in the corpora.
Understand frequencies of words and word pairs - build figures and tables to understand variation in the frequencies of words and word pairs in the data.

GET AND READ THE DATA

# Directory that holds the three en_US text files. This is the only place
# that has to change when the data lives somewhere else.
data_dir <- "C:/Users/junio/Desktop/COURSERA/DATA SCIENCE/COURSE 10 - Data Science Capstone/WEEK 1/final/en_US"

path1 <- file.path(data_dir, "en_US.blogs.txt")
path2 <- file.path(data_dir, "en_US.news.txt")
path3 <- file.path(data_dir, "en_US.twitter.txt")

# Read every line of one text file.
#
# path: path of the file to read.
# Returns a character vector with one element per line, marked as UTF-8,
# with embedded NUL bytes skipped.
read_corpus_lines <- function(path) {
  # NOTE(review): binary mode is presumably used so that control characters
  # inside the files cannot end the read early on Windows -- confirm.
  con <- file(path, open = "rb")
  # Close the connection even when readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus_lines(path1)
news <- read_corpus_lines(path2)
twitter <- read_corpus_lines(path3)

CODE TO CREATE WORD CLOUDS, PLOTS AND OTHER HELPER FUNCTIONS

library(RColorBrewer)
library(wordcloud)
library(slam)
library(ggplot2)
library(gridExtra)

# Draw one word cloud per source, side by side in a single row.
#
# x: data frame with the columns `word`, `freq` and `source`. One cloud is
#    drawn for every distinct value of `source`, in sorted (factor level)
#    order.
# Returns NULL invisibly; the function is called for its plots.
wordcloud.print <- function(x) {
  # Titles are matched to the sources by position, so they line up with the
  # sorted levels "blogs", "news", "twitter".
  headings <- c("Word Cloud - US English Blogs",
                "Word Cloud - US English News",
                "Word Cloud - US English Twitter")

  # 1 row x 3 columns; put the caller's layout back when we are done.
  old_par <- par(mfrow = c(1, 3))
  on.exit(par(old_par), add = TRUE)

  sources <- levels(as.factor(x$source))
  palette <- brewer.pal(8, "Dark2")

  for (i in seq_along(sources)) {
    # which() drops rows whose source is NA instead of turning them into
    # all-NA rows.
    rows <- which(x$source == sources[i])
    # At most 20 words are drawn per cloud.
    wordcloud(words = x[rows, "word"], scale = c(1, .1),
              freq = x[rows, "freq"],
              max.words = 20, random.order = FALSE, rot.per = 0.45,
              use.r.layout = FALSE, colors = palette)
    title(headings[i])
  }

  invisible(NULL)
}

# Show three frequency bar charts (Blogs, News, Twitter) next to each other.
#
# x1, x2, x3: data frames with the columns `word` and `freq` for blogs, news
#             and twitter respectively.
# n: 1, 2 or 3; selects the x-axis label "Unigrams", "Bigrams" or "Trigrams".
# Returns whatever grid.arrange() returns.
Ngrams.plot <- function (x1, x2, x3, n) {
  axis_label <- c("Unigrams", "Bigrams", "Trigrams")[n]

  # One bar chart: bars ordered by decreasing frequency, vertical tick labels.
  bar_panel <- function(counts, bar_fill, panel_title) {
    ggplot(counts, aes(reorder(word, -freq), freq)) +
      geom_bar(stat = "identity", fill = bar_fill) +
      ggtitle(panel_title) +
      xlab(axis_label) +
      ylab("Frequency") +
      theme(axis.text.x = element_text(angle = 90, hjust = 1))
  }

  grid.arrange(
    bar_panel(x1, "blue", "Blogs"),
    bar_panel(x2, "red", "News"),
    bar_panel(x3, "green", "Twitter"),
    ncol = 3
  )
}

# Total every column of a document-by-term table and rank the terms.
#
# x: data frame or matrix with one column per term and one row per document;
#    the column names are the terms.
# Returns a data frame with the columns `word` (the column name) and `freq`
# (the column total), sorted by decreasing `freq`. Ties keep their original
# column order.
freq_frame <- function(x) {
  totals <- colSums(x)
  # Take the words from the totals so that matrices work too (names() of a
  # matrix is NULL).
  counts <- data.frame(word = names(totals), freq = totals)
  # Base R ordering: no dplyr needed for a single descending sort.
  counts[order(-counts$freq), , drop = FALSE]
}

# Total every row of a term-by-document matrix and rank the terms.
#
# tdm: term-document matrix with one row per term; the row names are the
#      terms. Sparse `simple_triplet_matrix` objects and anything that
#      as.matrix() understands are accepted.
# Returns a data frame with the columns `word` and `freq`, sorted by
# decreasing `freq`.
freq_frame_tok <- function(tdm) {
  totals <- if (inherits(tdm, "simple_triplet_matrix")) {
    # Sum on the sparse representation; as.matrix() would allocate the full
    # terms x documents matrix first.
    slam::row_sums(tdm)
  } else {
    rowSums(as.matrix(tdm))
  }
  totals <- sort(totals, decreasing = TRUE)
  data.frame(word = names(totals), freq = totals)
}

GENERATE A TABLE WITH A SUMMARY OF THE DATA

# Summary table with one row per source file: size on disk in MiB, number of
# lines, word count and total character count.
# local() keeps the temporary list of the three corpora out of the global
# environment, so it does not hold the full texts in memory afterwards.
statistics <- local({
  corpora <- list(blogs, news, twitter)
  data.frame(
    File = c("Blogs", "News", "Twitter"),
    File.Size = file.info(c(path1, path2, path3))$size / 1024^2,
    Num.Entries = lengths(corpora),
    # Pick the "Words" entry by name rather than by position.
    Words.Count = vapply(
      corpora,
      function(x) stringi::stri_stats_latex(x)[["Words"]],
      integer(1)
    ),
    Total.Characteres = vapply(corpora, function(x) sum(nchar(x)), integer(1))
  )
})

knitr::kable(statistics)

CREATE THE WORD CLOUD OF THE UNIGRAMS AND PLOT THE TEN UNIGRAMS WITH THE HIGHEST FREQUENCIES

# Word clouds of the most frequent unigrams of each source.
wordcloud.print(dfunigrams)

# Bar charts of the ten most frequent unigrams of each source. head() returns
# fewer rows when a table has fewer than ten entries, whereas [1:10, ] would
# pad it with all-NA rows.
Ngrams.plot(head(df1.blogs, 10), head(df1.news, 10), head(df1.twitter, 10), 1)

CREATE THE WORD CLOUD OF THE BIGRAMS AND PLOT THE TEN BIGRAMS WITH THE HIGHEST FREQUENCIES

# Word clouds of the most frequent bigrams of each source.
wordcloud.print(dfbigrams)

# Bar charts of the ten most frequent bigrams of each source. head() returns
# fewer rows when a table has fewer than ten entries, whereas [1:10, ] would
# pad it with all-NA rows.
Ngrams.plot(head(df2.blogs, 10), head(df2.news, 10), head(df2.twitter, 10), 2)

CREATE THE WORD CLOUD OF THE TRIGRAMS AND PLOT THE TEN TRIGRAMS WITH THE HIGHEST FREQUENCIES

# Word clouds of the most frequent trigrams of each source.
wordcloud.print(dftrigrams)

# Bar charts of the ten most frequent trigrams of each source. head() returns
# fewer rows when a table has fewer than ten entries, whereas [1:10, ] would
# pad it with all-NA rows.
Ngrams.plot(head(df3.blogs, 10), head(df3.news, 10), head(df3.twitter, 10), 3)

CODE TO SAMPLE AND CLEAN THE DATA AND CREATE THE CORPUS

# Fixed seed so that the same lines are drawn on every run.
set.seed(3456)

# Draw a random subset of lines (without replacement) and reduce it to ASCII.
#
# lines: character vector of UTF-8 text lines.
# fraction: share of the lines to keep (default 3 %).
# Returns the sampled lines converted to ASCII.
sample_ascii <- function(lines, fraction = 0.03) {
  keep <- sample.int(length(lines), size = floor(fraction * length(lines)),
                     replace = FALSE)
  # Characters that have no ASCII form are dropped. Substituting them with
  # "<xx>" byte escapes would leave hex fragments glued to the words once
  # punctuation is removed later on.
  iconv(lines[keep], from = "UTF-8", to = "ASCII", sub = "")
}

# The draw order blogs -> news -> twitter matters for reproducibility.
sblogs <- sample_ascii(blogs)
snews <- sample_ascii(news)
stwitter <- sample_ascii(twitter)

sdata <- list(sblogs, snews, stwitter)

# The full texts are no longer needed; free the memory.
rm(blogs, news, twitter)

# Build one cleaned corpus per sampled source (blogs, news, twitter, in the
# order of `sdata`). No term matrices are created here; that happens in the
# n-gram sections.
vcorpus <- lapply(sdata, function(lines) {
  # One document per sampled line.
  corpus <- tm::VCorpus(
    tm::VectorSource(lines),
    readerControl = list(reader = tm::readPlain, language = "en")
  )
  # Lower-case everything.
  # NOTE(review): tolower() is a base function rather than a tm
  # transformation, so the documents presumably become plain character
  # vectors here; the final PlainTextDocument step wraps them again. Confirm
  # before reordering these steps.
  corpus <- tm::tm_map(corpus, tolower)
  # Remove punctuation.
  corpus <- tm::tm_map(corpus, tm::removePunctuation)
  # Collapse runs of whitespace.
  corpus <- tm::tm_map(corpus, tm::stripWhitespace)
  # Convert back to plain text documents.
  tm::tm_map(corpus, tm::PlainTextDocument)
})

CODE TO CREATE UNIGRAMS

# Turn one cleaned corpus into a document-by-term data frame of unigram
# counts.
#
# corpus: one element of `vcorpus`.
# Returns a data frame with one row per document and one column per stemmed
# term; the column names are made syntactically valid.
unigram_table <- function(corpus) {
  # Remove English stop words.
  corpus <- tm::tm_map(corpus, tm::removeWords, tm::stopwords("english"))
  # Stem the remaining words.
  corpus <- tm::tm_map(corpus, tm::stemDocument)
  # Count terms per document, then drop terms that are absent from more than
  # 99 % of the documents.
  dtm <- tm::removeSparseTerms(tm::DocumentTermMatrix(corpus), 0.99)
  counts <- as.data.frame(as.matrix(dtm))
  colnames(counts) <- make.names(colnames(counts))
  counts
}

unigram_counts <- lapply(vcorpus, unigram_table)

# Ranked unigram frequencies per source.
df1.blogs <- freq_frame(unigram_counts[[1]])
df1.blogs$source <- "blogs"
df1.news <- freq_frame(unigram_counts[[2]])
df1.news$source <- "news"
df1.twitter <- freq_frame(unigram_counts[[3]])
df1.twitter$source <- "twitter"

# Top 20 per source for the word clouds. head() does not pad with NA rows
# when a table has fewer than 20 entries.
dfunigrams <- rbind(head(df1.blogs, 20), head(df1.news, 20),
                    head(df1.twitter, 20))

rm(unigram_table, unigram_counts)

CODE TO CREATE BIGRAMS

# Tokenizer that splits a document into sequences of exactly two words.
# Both RWeka functions are namespace-qualified so that the tokenizer works
# whether or not RWeka is attached.
BigramTokenizer <- function(x) {
  RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 2, max = 2))
}

# One term-document matrix of bigrams per source. The bigrams are not
# stemmed.
bigram_tdms <- lapply(vcorpus, function(corpus) {
  # Remove English stop words.
  corpus <- tm::tm_map(corpus, tm::removeWords, tm::stopwords("english"))
  # Count bigrams per document.
  tdm <- tm::TermDocumentMatrix(corpus,
                                control = list(tokenize = BigramTokenizer))
  # Drop bigrams that are absent from more than 99.9 % of the documents.
  tm::removeSparseTerms(tdm, 0.999)
})

# Ranked bigram frequencies per source.
df2.blogs <- freq_frame_tok(bigram_tdms[[1]])
df2.blogs$source <- "blogs"
df2.news <- freq_frame_tok(bigram_tdms[[2]])
df2.news$source <- "news"
df2.twitter <- freq_frame_tok(bigram_tdms[[3]])
df2.twitter$source <- "twitter"

# Top 20 per source for the word clouds. head() does not pad with NA rows
# when a table has fewer than 20 entries.
dfbigrams <- rbind(head(df2.blogs, 20), head(df2.news, 20),
                   head(df2.twitter, 20))

rm(bigram_tdms)

CODE TO CREATE TRIGRAMS

# Tokenizer that splits a document into sequences of exactly three words.
# Both RWeka functions are namespace-qualified so that the tokenizer works
# whether or not RWeka is attached.
TrigramTokenizer <- function(x) {
  RWeka::NGramTokenizer(x, RWeka::Weka_control(min = 3, max = 3))
}

# One term-document matrix of trigrams per source. The trigrams are not
# stemmed.
trigram_tdms <- lapply(vcorpus, function(corpus) {
  # Remove English stop words.
  corpus <- tm::tm_map(corpus, tm::removeWords, tm::stopwords("english"))
  # Count trigrams per document.
  tdm <- tm::TermDocumentMatrix(corpus,
                                control = list(tokenize = TrigramTokenizer))
  # Drop trigrams that are absent from more than 99.99 % of the documents.
  tm::removeSparseTerms(tdm, 0.9999)
})

# Ranked trigram frequencies per source.
df3.blogs <- freq_frame_tok(trigram_tdms[[1]])
df3.blogs$source <- "blogs"
df3.news <- freq_frame_tok(trigram_tdms[[2]])
df3.news$source <- "news"
df3.twitter <- freq_frame_tok(trigram_tdms[[3]])
df3.twitter$source <- "twitter"

# Top 20 per source for the word clouds. head() does not pad with NA rows
# when a table has fewer than 20 entries.
dftrigrams <- rbind(head(df3.blogs, 20), head(df3.news, 20),
                    head(df3.twitter, 20))

# The cleaned corpora are not needed after this point.
rm(trigram_tdms, vcorpus)

CAPSTONE WEEK 2: Milestone Report

Luis Valladares

August 6, 2018