Introduction

This report summarizes the exploratory analysis of the text data and builds a first version of a predictive next-word model.

# Load the required packages
library(stringi)       # stri_count_words
library(dplyr)         # %>% pipe
library(knitr)         # kable
library(kableExtra)    # kable_styling
library(tm)            # Corpus, tm_map, stopwords
library(RWeka)         # NGramTokenizer, Weka_control
library(wordcloud)     # wordcloud
library(RColorBrewer)  # brewer.pal
library(ggplot2)       # ggplot
library(plotly)        # ggplotly
library(stringr)       # word

setwd('C:/Users/innugantii/Desktop/Indu/Coursera/Project Final/final/en_US')

 
file.list = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")

text <- list(blogs = "", news = "", twitter = "")

 
data.summary <- matrix(0, nrow = 3, ncol = 3, dimnames = list(c("blogs", "news", "twitter"),

                                                              c("file size, Mb", "lines", "words")))

for (i in 1:3) {

  con <- file(file.list[i], "rb")

  text[[i]] <- readLines(con, encoding = "UTF-8",skipNul = TRUE)

  close(con)

  data.summary[i,1] <- round(file.info(file.list[i])$size / 1024^2, 2)

  data.summary[i,2] <- length(text[[i]])

  data.summary[i,3] <- sum(stri_count_words(text[[i]]))

}

Summary

data.summary %>%
  kable() %>%
  kable_styling()
          file size, Mb      lines      words
blogs            200.42     899288   37546246
news             196.28    1010242   34762395
twitter          159.36    2360148   30093410

Because these data sets are very large, we proceed with a small sample (0.5%) of each one, then combine the samples into a single data set that is used for the analysis.

## Create the sample

set.seed(1234)

blogs <- sample(text$blogs, 0.005*length(text$blogs))

news<- sample(text$news, 0.005*length(text$news))

twitter <- sample(text$twitter, 0.005*length(text$twitter))

sample_data <- c(blogs, news, twitter)

 

sumWords <- sum(stri_count_words(sample_data))

sumWords
## [1] 517755
writeLines(sample_data, "sample_data.txt")

Create Corpus

Cleaning the Data

sample_data <- iconv(sample_data, 'UTF-8', 'ASCII', sub = "")  # drop non-ASCII characters instead of returning NA

corpus <- Corpus(VectorSource(sample_data))  # VectorSource expects a character vector, one document per element

 

corpus <- corpus %>%

  tm_map(content_transformer(tolower)) %>%      # Convert all text to lower case first, so capitalized stopwords are caught

  tm_map(removeWords, stopwords("english")) %>% # Remove English stopwords

  tm_map(removeNumbers) %>%             # Remove numbers

  tm_map(removePunctuation) %>%         # Remove punctuation marks

  tm_map(stripWhitespace) %>%           # Collapse repeated whitespace

  tm_map(PlainTextDocument)             # Coerce back to PlainTextDocument objects

Build Tokenizers

Split the cleaned text into n-grams: contiguous sequences of one to four words (see the short illustration below).
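
As a minimal illustration of what the RWeka tokenizer returns (the print spacing may differ):

NGramTokenizer("the quick brown fox", Weka_control(min = 2, max = 2))
## [1] "the quick"   "quick brown" "brown fox"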

Unigram

# RWeka's NGramTokenizer expects a character vector, so flatten the corpus first
corpus_text <- sapply(corpus, as.character)

unigram <- NGramTokenizer(corpus_text, Weka_control(min = 1, max = 1))

unigram.df <- data.frame(table(unigram))

unigram.df <- unigram.df[order(unigram.df$Freq, decreasing = TRUE),]

 

wordcloud(unigram.df$unigram, unigram.df$Freq, max.words=125, random.order=TRUE, colors=brewer.pal(8, "Dark2"))

text(x=0.5, y=0, "Word Cloud of Unigram")

Frequency of Unigram


 

## Unigram Frequency

plotUni <- ggplot(head(unigram.df,25), aes(reorder(unigram,Freq), Freq)) +

  geom_bar(stat="identity",fill = "brown") + coord_flip() +

  xlab("Unigrams") + ylab("Frequency") +

  ggtitle("Most frequently used words - Unigrams")

 

ggplotly(plotUni)

Bigram

bigram <- NGramTokenizer(corpus_text, Weka_control(min = 2, max = 2))

bigram.df <- data.frame(table(bigram))

bigram.df <- bigram.df[order(bigram.df$Freq, decreasing = TRUE),]

 

wordcloud(bigram.df$bigram, bigram.df$Freq, max.words=125, random.order=TRUE, colors=brewer.pal(8, "Dark2"))

text(x=0.5, y=0, "Word Cloud of Bigram")

Frequency of Bigram


 

## Bigram Frequency

plotBi <- ggplot(head(bigram.df,25), aes(reorder(bigram,Freq), Freq)) +

  geom_bar(stat="identity",fill = "seagreen") + coord_flip() +

  xlab("Bigrams") + ylab("Frequency") +

  ggtitle("Most frequently used words - Bigrams")

 

ggplotly(plotBi)

Trigram

trigram <- NGramTokenizer(corpus_text, Weka_control(min = 3, max = 3))

trigram.df <- data.frame(table(trigram))

trigram.df <- trigram.df[order(trigram.df$Freq, decreasing = TRUE),]

 

wordcloud(trigram.df$trigram, trigram.df$Freq, max.words=125, random.order=TRUE, colors=brewer.pal(8, "Dark2"))

text(x=0.5, y=0, "Word Cloud of Trigram")

Frequency of Trigram


 

## Trigram Frequency

plotTri <- ggplot(head(trigram.df,25), aes(reorder(trigram,Freq), Freq)) +

  geom_bar(stat="identity",fill = "blue") + coord_flip() +

  xlab("Trigrams") + ylab("Frequency") +

  ggtitle("Most frequently used words - Trigrams")

 

ggplotly(plotTri)

Quadgram

quadgram <- NGramTokenizer(corpus_text, Weka_control(min = 4, max = 4))

quadgram.df <- data.frame(table(quadgram))

quadgram.df <- quadgram.df[order(quadgram.df$Freq, decreasing = TRUE),]

 

wordcloud(quadgram.df$quadgram, quadgram.df$Freq, max.words=125, random.order=TRUE, colors=brewer.pal(8, "Dark2"))

text(x=0.5, y=0, "Word Cloud of Quadgram")

Frequency of Quadgram


 

## Quadgram Frequency

plotQuad <- ggplot(head(quadgram.df,25), aes(reorder(quadgram,Freq), Freq)) +

  geom_bar(stat="identity",fill = "purple") + coord_flip() +

  xlab("Quadgrams") + ylab("Frequency") +

  ggtitle("Most frequently used words - Quadgrams")

 

ggplotly(plotQuad)

Predict next word algorithm
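
The prediction function implements a simple back-off strategy: it first looks up the last three words of the input in the quadgram table; if no match is found it backs off to the last two words in the trigram table, then to the last word in the bigram table, and finally falls back to the single most frequent unigram. It relies on the frequency data frames fDF1 through fDF4 (unigrams through quadgrams, each with a terms column sorted by decreasing Freq) and on a helper CleanInputString that applies the same cleaning to the user's input as was applied to the corpus; a sketch of both follows the function.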

library(tm)
library(stringr)
PredNextTerm <- function(inStr)
{
        # Record where we are for debugging/UI messages
        assign("mesg", "in PredNextTerm", envir = .GlobalEnv)

        # Clean the input the same way the training corpus was cleaned
        inStr <- CleanInputString(inStr)

        inStr <- unlist(strsplit(inStr, split = " "))
        inStrLen <- length(inStr)

        nxtTermFound <- FALSE
        predNxtTerm <- as.character(NULL)

        # 1. Try the quadgram table: match on the last three words
        if (inStrLen >= 3 & !nxtTermFound)
        {
                inStr1 <- paste(inStr[(inStrLen - 2):inStrLen], collapse = " ")
                searchStr <- paste("^", inStr1, sep = "")
                fDF4Temp <- fDF4[grep(searchStr, fDF4$terms), ]

                if (length(fDF4Temp[, 1]) > 0)   # was "> 1", which silently ignored a single match
                {
                        predNxtTerm <- fDF4Temp[1, 1]
                        nxtTermFound <- TRUE
                        mesg <<- "predicting next word using quadgram"
                }
                fDF4Temp <- NULL
        }

        # 2. Back off to the trigram table: match on the last two words
        if (inStrLen >= 2 & !nxtTermFound)
        {
                inStr1 <- paste(inStr[(inStrLen - 1):inStrLen], collapse = " ")
                searchStr <- paste("^", inStr1, sep = "")
                fDF3Temp <- fDF3[grep(searchStr, fDF3$terms), ]

                if (length(fDF3Temp[, 1]) > 0)
                {
                        predNxtTerm <- fDF3Temp[1, 1]
                        nxtTermFound <- TRUE
                        mesg <<- "predicting next word using trigram"
                }
                fDF3Temp <- NULL
        }

        # 3. Back off to the bigram table: match on the last word
        if (inStrLen >= 1 & !nxtTermFound)
        {
                inStr1 <- inStr[inStrLen]
                searchStr <- paste("^", inStr1, sep = "")
                fDF2Temp <- fDF2[grep(searchStr, fDF2$terms), ]

                if (length(fDF2Temp[, 1]) > 0)
                {
                        predNxtTerm <- fDF2Temp[1, 1]
                        nxtTermFound <- TRUE
                        mesg <<- "predicting next word using bigram"
                }
                fDF2Temp <- NULL
        }

        # 4. Last resort: the single most frequent unigram
        if (!nxtTermFound & inStrLen > 0)
        {
                predNxtTerm <- fDF1$terms[1]
                mesg <<- "No match found; the most frequently used word is selected as the next word."
        }

        # The matched n-gram ends with the predicted word; keep only that last word
        nextTerm <- word(predNxtTerm, -1)

        if (inStrLen > 0) {
                return(data.frame(nextTerm, mesg))
        } else {
                return(data.frame(nextTerm = "", mesg = ""))
        }
}
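
PredNextTerm depends on two pieces defined elsewhere in the project: the lookup tables fDF1 to fDF4 and the helper CleanInputString. As a minimal sketch only (not the project's exact code), the tables could be derived from the n-gram data frames built above (already sorted by decreasing frequency), and the helper could mirror the corpus cleaning steps; if stopwords were removed from the corpus, the real helper should remove them here as well:

# Sketch: lookup tables with a character "terms" column, sorted by decreasing frequency
fDF1 <- data.frame(terms = as.character(unigram.df$unigram),   Freq = unigram.df$Freq,  stringsAsFactors = FALSE)
fDF2 <- data.frame(terms = as.character(bigram.df$bigram),     Freq = bigram.df$Freq,   stringsAsFactors = FALSE)
fDF3 <- data.frame(terms = as.character(trigram.df$trigram),   Freq = trigram.df$Freq,  stringsAsFactors = FALSE)
fDF4 <- data.frame(terms = as.character(quadgram.df$quadgram), Freq = quadgram.df$Freq, stringsAsFactors = FALSE)

# Sketch: clean the user's input the same way the corpus was cleaned
CleanInputString <- function(inStr) {
        inStr <- iconv(inStr, "UTF-8", "ASCII", sub = "")
        inStr <- tolower(inStr)
        inStr <- gsub("[0-9]+", "", inStr)        # remove numbers
        inStr <- gsub("[[:punct:]]", "", inStr)   # remove punctuation
        inStr <- gsub("\\s+", " ", inStr)         # collapse whitespace
        trimws(inStr)
}

# Example call: returns a one-row data frame with the predicted word
# and a message naming the n-gram level that produced it
PredNextTerm("thanks for the")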