Data taken from: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

This is the process of getting the data from the web. I had already downloaded it through the Coursera website in order to answer the quiz for week 1.

Setting the working directory:

setwd("D:/capstone research")

Getting data

fileUrl1 <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# mode = "wb" (binary) is needed on Windows for zip files
download.file(url = fileUrl1, destfile = "Coursera-SwiftKey.zip", mode = "wb", method = "curl")

Load in the data set from disk.

Unzipping the zipped data:

unzip("Coursera-SwiftKey.zip")

Synopsis

This report explores 3 text corpora obtained from HC Corpora (www.corpora.Helios.org) and collected from different sources (Blogs, News, and Twitter). The corpora will be used at a later stage to build a predictive text model. The model will be implemented as an online Shiny application (i.e. available on the web) that proposes a list of next-to-be-typed words after a user enters a phrase of text. The scope of the investigation reported here was to do some basic pre-processing and analysis on the corpora in order to get acquainted with the data, and to develop a strategy for building the predictive model.

The data comprises 3 text documents, en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt, with respective lengths of 899288, 77259, and 2360148 lines. In order to avoid computational performance issues, subsequent analyses were based on a subset of the corpora. Specifically, 10000 lines were randomly sampled from each corpus.

Finding the memory used. We use the pryr package:

library(pryr)
## Warning: package 'pryr' was built under R version 3.1.2
mem_used()
## 13.3 MB

Data sampling

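As a minimal sketch, the random sampling described above could be done as follows, assuming the full files have been read from disk (blogs_all and blogs_sample are hypothetical names; the exploratory code further below simply reads the first 10000 lines of each file instead):

set.seed(1234)                                   # assumed seed, for reproducibility
blogs_all    <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
blogs_sample <- sample(blogs_all, 10000)         # repeat for the News and Twitter files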

An analysis was performed investigating the number of retained unique words as a function of the number of lines sampled from the News corpus. The results are shown in the plots.
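A hedged sketch of how such an analysis could be reproduced, assuming the News lines have been read into the character vector News (as done further below); the sample sizes are illustrative:

sizes <- seq(1000, 10000, by = 1000)
unique_counts <- sapply(sizes, function(n) {
    length(unique(unlist(strsplit(tolower(News[1:n]), "[[:space:]]+"))))
})
plot(sizes, unique_counts, type = "b",
     xlab = "Lines taken from the News corpus", ylab = "Unique words retained")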

Loading important packages

suppressWarnings(suppressMessages(library(tm)))           # Framework for text mining.
suppressWarnings(suppressMessages(library(SnowballC)))    # Provides wordStem() for stemming.
suppressWarnings(suppressMessages(library(RColorBrewer))) # Colour palettes for plots.
suppressWarnings(suppressMessages(library(ggplot2)))      # Plotting.
suppressWarnings(suppressMessages(library(magrittr)))     # Pipe operator.
suppressWarnings(suppressMessages(library(Rgraphviz)))    # Correlation plots.
suppressWarnings(suppressMessages(library(stringr)))      # String manipulation.
suppressWarnings(suppressMessages(library(stringi)))      # Fast, locale-aware string processing.
suppressWarnings(suppressMessages(library(qdap)))         # Text cleaning utilities (wc(), etc.).
suppressWarnings(suppressMessages(library(RWeka)))        # N-gram tokenizers.

To start with, only the first 10000 lines of each file are loaded, for performance reasons and because of the large file sizes.

setwd("D:/capstone research/final/en_US")
Twitter <- readLines("en_US.twitter.txt", 10000)
t_lines <- length(Twitter)
# Approximate word count: number of spaces in each line plus one word per line
t_word  <- sum(unlist(sapply(X = Twitter, FUN = str_count, pattern = " "))) + t_lines
t_nchar <- sum(nchar(Twitter))

Blogs <- readLines("en_US.blogs.txt", 10000)
b_lines <- length(Blogs)
b_word  <- sum(unlist(sapply(X = Blogs, FUN = str_count, pattern = " "))) + b_lines
b_nchar <- sum(nchar(Blogs))

News <- readLines("en_US.news.txt", 10000)
n_lines <- length(News)
n_word  <- sum(unlist(sapply(X = News, FUN = str_count, pattern = " "))) + n_lines
n_nchar <- sum(nchar(News))

the_sum <- data.frame(c("Blogs","News","Twitter"),c(b_lines,n_lines,t_lines), c(b_word,n_word,t_word), c(b_nchar,n_nchar,t_nchar))
names(the_sum)<-c("file","#lines", "#words", "#char")

the_sum
##      file #lines #words   #char
## 1   Blogs  10000 410620 2294142
## 2    News  10000 343929 2041079
## 3 Twitter  10000 127674  682791

Exploring data

par(mfrow = c(4, 1))
l1 <- c(length(Twitter), length(News), length(Blogs))
names(l1) <- c("Twitter", "News", "Blogs")
barplot(l1, main = "Texts (lines) per corpus")

Looking at the format of the first 4 lines of each file of interest

setwd("D:/capstone research/final/en_US")
readLines(file("en_US.news.txt","r"), 4) 
## [1] "He wasn't home alone, apparently."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."                                                                                                                                                                                                                                                                                                                                                         
## [3] "WSU's plans quickly became a hot topic on local online sites. Though most people applauded plans for the new biomedical center, many deplored the potential loss of the building."                                                                                                                                                                                                                                                                                                                                 
## [4] "The Alaimo Group of Mount Holly was up for a contract last fall to evaluate and suggest improvements to Trenton Water Works. But campaign finance records released this week show the two employees donated a total of $4,500 to the political action committee (PAC) Partners for Progress in early June. Partners for Progress reported it gave more than $10,000 in both direct and in-kind contributions to Mayor Tony Mack in the two weeks leading up to his victory in the mayoral runoff election June 15."
readLines(file("en_US.blogs.txt","r"), 4)
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    
## [2] "We love you Mr. Brown."                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
## [3] "Chad has been awesome with the kids and holding down the fort while I work later than usual! The kids have been busy together playing Skylander on the XBox together, after Kyan cashed in his $$$ from his piggy bank. He wanted that game so bad and used his gift card from his birthday he has been saving and the money to get it (he never taps into that thing either, that is how we know he wanted it so bad). We made him count all of his money to make sure that he had enough! It was very cute to watch his reaction when he realized he did! He also does a very good job of letting Lola feel like she is playing too, by letting her switch out the characters! She loves it almost as much as him."
## [4] "so anyways, i am going to share some home decor inspiration that i have been storing in my folder on the puter. i have all these amazing images stored away ready to come to life when we get our home."
readLines(file("en_US.twitter.txt","r"), 4)
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."  
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
## [3] "they've decided its more fun if I don't."                                                                       
## [4] "So Tired D; Played Lazer Tag & Ran A LOT D; Ughh Going To Sleep Like In 5 Minutes ;)"

Clean Corpus Contents and Calculate Word Counts

library(stringr)
clean_corpus <- function(corpus) {
    output <- tolower(corpus)                                                                   # lower-case everything
    output <- str_replace_all(output, pattern = "'", replacement = "")                          # drop apostrophes
    output <- str_replace_all(output, pattern = "[^[:print:]]", replacement = "")               # drop non-printable characters
    output <- str_replace_all(output, pattern = "[^[:alpha:] | [:space:]]", replacement = "")   # keep only alphabetic and whitespace characters
    output <- str_trim(output, side = 'both')                                                   # trim leading/trailing whitespace
    output <- str_replace_all(output, pattern = " {2,}", replacement = " ")                     # collapse repeated spaces
    output <- str_split(output, pattern = "[[:space:]]")                                        # split each line into a vector of words
    output
}
news_clean <- clean_corpus(News)
blogs_clean <- clean_corpus(Blogs)
twitter_clean <- clean_corpus(Twitter)

doc_counts <- c(length(News), length(Blogs), length(Twitter))
word_counts <- c(length(unlist(news_clean)), 
                 length(unlist(blogs_clean)), 
                 length(unlist(twitter_clean)))
eda_df <- data.frame(corpus_name = c('News','Blogs','Twitter'),
                     doc_counts = doc_counts, 
                     word_counts = word_counts)
eda_df$avg_doc_length <- word_counts / doc_counts
eda_df
##   corpus_name doc_counts word_counts avg_doc_length
## 1        News      10000      335257        33.5257
## 2       Blogs      10000      405672        40.5672
## 3     Twitter      10000      123686        12.3686

Using the package tm

We want to make sure we are using the English locale:

Sys.setenv(LANGUAGE="en")
Sys.setlocale("LC_MONETARY", "English")
## [1] "English_United States.1252"
Sys.setlocale("LC_CTYPE", "English")
## [1] "English_United States.1252"
Sys.setenv(LANGUAGE="en")
Sys.setlocale("LC_TIME", "English")
## [1] "English_United States.1252"
Sys.setlocale("LC_COLLATE", "English")
## [1] "English_United States.1252"

Inside the DirSource argument of the Corpus function, the encoding must be set to "UTF-8" to avoid encoding errors.
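A minimal sketch of building the corpus with the encoding set, assuming the sampled text files have been written to a folder (the path below is a hypothetical example):

docs <- Corpus(DirSource("D:/capstone research/sample", encoding = "UTF-8"),
               readerControl = list(language = "en"))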

Looking into unique words and dictionary size, shown as a bar chart.

The bar plot confirms our findings from the word clouds: Twitter has far fewer unique words, whilst the blogs have the largest number of unique terms, though not far from the news data.
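One way the bar plot could be produced, reusing the cleaned word lists built earlier with clean_corpus() (the colour palette is an arbitrary choice):

unique_words <- c(Blogs   = length(unique(unlist(blogs_clean))),
                  News    = length(unique(unlist(news_clean))),
                  Twitter = length(unique(unlist(twitter_clean))))
barplot(unique_words, col = brewer.pal(3, "Set2"),
        main = "Unique words (dictionary size) per corpus")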

Tokenization.

In the first step of the analysis, text lines were segmented into individual sentences and some basic cleaning of the data was done (both before and after segmentation). The reason for segmenting into sentences is to avoid pairing together the last word(s) of a sentence and the first word(s) of the following sentence when calculating the frequency of occurrence of combinations of words (also known as n-gram frequency counts). The strategy for splitting into sentences is simple: it is based on sentence-ending punctuation marks. This strategy ignores disambiguation problems. For example, a period may not always denote the end of a sentence, as it is also used as a decimal point and in abbreviations. Since the impact of this issue on the predictive model is not clear at the moment, it was not explored at length. However, we did partially address some of the most obvious disambiguation cases, as described below.
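A minimal sketch of this segmentation strategy (split_sentences is a hypothetical helper; it splits on sentence-ending punctuation followed by whitespace and therefore shares the disambiguation limitations noted above):

split_sentences <- function(lines) {
    unlist(strsplit(lines, split = "[.!?]+[[:space:]]+"))
}
head(split_sentences(News), 3)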

The n-grams will be computed by dividing the clean_data file into 3 different parts, due to RAM usage. For this, the ngram package will be used. The tryCatch function is used to avoid errors caused by phrases with fewer than the minimal number of words during the n-gram loop. The same code below will be used for all 3 files and for all the different n-grams.
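A hedged sketch of that loop (count_ngrams, chunks, and trigrams_part1 are illustrative names; the exact code used may differ):

library(ngram)
count_ngrams <- function(texts, n = 3) {
    tables <- lapply(texts, function(txt) {
        # tryCatch skips phrases with fewer than n words, which would make ngram() error out
        tryCatch(get.phrasetable(ngram(txt, n = n)), error = function(e) NULL)
    })
    do.call(rbind, tables)
}
chunks <- split(clean_data, cut(seq_along(clean_data), 3))   # divide into 3 parts to limit RAM usage
trigrams_part1 <- count_ngrams(chunks[[1]], n = 3)           # repeat for the other two chunks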

Word cloud

Looking at the diagrams, we can immediately spot the pattern that the Twitter data contains a much smaller dictionary, and its more frequent words represent emotions, for example "good", "like", "love", "happy". In contrast, the news data shows no clear pattern and its more frequent words have a more formal meaning. Words from the blogs fit in between: the vocabulary size is very similar to the news vocabulary, but the frequent words appear to be closer to Twitter's word cloud.
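A hedged sketch of how the word clouds could be generated; the wordcloud package is not loaded above, so this is an assumption about the approach:

library(wordcloud)
news_freq <- sort(table(unlist(news_clean)), decreasing = TRUE)
wordcloud(words = names(news_freq), freq = as.numeric(news_freq),
          max.words = 100, colors = brewer.pal(8, "Dark2"))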

Conclusion

Based on the data investigations done so far, the plan is to build a predictive text algorithm and accompanying shiny application based on frequency of occurrence of n-grams. The following is a rough sketch of the modeling strategy:

1. Randomly sample part of the corpora (e.g. 20000 lines from each corpus).
2. Calculate the frequency of occurrence of n-grams for n = 1, 2, 3 (combining the results from Blogs, News, and Twitter); see the sketch below.
3. Build a model that attempts to predict the next word based on the n-gram that is largest in length (i.e. n = 3).
4. Supplement the prediction with the prediction from the next largest n-gram when the largest yields no results.
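A hedged sketch of step 2, pooling the samples and counting n-gram frequencies with RWeka's NGramTokenizer (here the 10000-line samples loaded earlier are used for illustration, and pooled and ngram_freq are assumed names):

pooled <- c(Blogs, News, Twitter)
ngram_freq <- lapply(1:3, function(n) {
    tokens <- NGramTokenizer(pooled, Weka_control(min = n, max = n))
    sort(table(tokens), decreasing = TRUE)
})
head(ngram_freq[[3]])   # most frequent trigrams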

The strategy outlined above reflects our current understanding of the problem and of the provided training data. Note that we expect to build more complexity into the model as our thinking continues to evolve around natural language processing in general, and the problem at hand in particular.

Plans for Final Prediction Algorithm

There are three steps in creating the prediction engine. The first step is to preprocess the data in order to create n-gram frequency tables. The second step is to create a prediction model using the preprocessed data. In a nutshell, the model consists of three tiers of n-gram tokenized frequency tables. The model will calculate a prediction score starting with the 4-gram frequency table, then the 3-gram table, and finally the 2-gram frequency table. The word with the highest score will be chosen as the prediction. The third step is the execution through a user interface: the user will be able to enter text via a text box on a web-based Shiny app to get a prediction.
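A hedged sketch of the back-off scoring described above, assuming hypothetical frequency tables ng4, ng3, and ng2 (one per n-gram order) with columns prefix, word, and freq:

predict_word <- function(phrase, ng4, ng3, ng2) {
    tokens <- strsplit(tolower(phrase), "[[:space:]]+")[[1]]
    tables <- list(ng4, ng3, ng2)            # try the 4-gram table first, then 3-gram, then 2-gram
    prefix_len <- c(3, 2, 1)
    for (i in seq_along(tables)) {
        if (length(tokens) < prefix_len[i]) next
        prefix <- paste(tail(tokens, prefix_len[i]), collapse = " ")
        hits <- tables[[i]][tables[[i]]$prefix == prefix, ]
        if (nrow(hits) > 0) return(hits$word[which.max(hits$freq)])   # highest score wins
    }
    NA_character_                            # no match in any table
}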

Sample code for the final model

Loop for removing the real (dictionary) words from the token table

# Remove real (dictionary) words from the token table in blocks of 2500,
# leaving only the non-words in dic_data_nw
for(n in 0:y){
    t <- n*2500 + 1   # first row of this block of dictionary words
    b <- n + 1        # current round
    x <- b*2500       # last row of this block
    # drop every token that exactly matches one of the dictionary words in this block
    dic_data_nw <- dic_data_nw[!grepl(dic_data_nw[,1], pattern = paste("^",words[t:x,],"$", collapse="|",sep="")),]
    print(c("Round",b))
    print(c("Last x value",x))
    print(c("remaining not words",length(dic_data_nw[,1])))
}

Because of this procedure I was able to identify some writing mistakes that should be considered for correction before using the prediction models.

After this, there will be further processing and data cleaning, and separation of the n-grams by alphabetical order in different files.

for(i in letters){
    for(n in 1:3){
        test <- get(paste("ngram3_", n, sep=""))
        # if the With_<letter> object does not exist yet, create it
        if(!exists(paste("With_",i,sep=""))){
            #if it starts with i and I
            if(i == "i"){
                assign(paste("With_",i,sep=""), test[grepl(test, pattern=paste("^[","iI","]", sep=""))])
            }
            #all other letters are lowercased
            else{
                assign(paste("With_",i,sep=""), test[grepl(test, pattern=paste("^[",i,"]", sep=""))])
            }
        }
        # if the object already exists, join them together
        else if(exists(paste("With_",i,sep=""))){
            #if it starts with i and I
            if(i == "i"){
                assign(paste("With_",i,sep=""), c(get(paste("With_",i,sep="")), test[grepl(test, pattern=paste("^[","iI","]", sep=""))]))
            }
            #all other letters are lowercased
            else{
                assign(paste("With_",i,sep=""), c(get(paste("With_",i,sep="")), test[grepl(test, pattern=paste("^[",i,"]", sep=""))]))
            }
        }
    }
}

Remove lines that have fewer than 3 words

clean_data <- matrix(data_table[wc(data_table) >= 3])         # keep only lines with at least 3 words (wc() is from qdap)
save(clean_data, file = "good_data.RData")
token <- stri_extract_words(clean_data, locale = "en_US")     # tokenize each line into its words
dic_data <- data.frame(table(matrix(unlist(token), ncol = 1, byrow = TRUE)))  # word frequency table

An English word list is used to filter existing words out of my dic_data. In this way I was able to create a "not-word dictionary" for later use, to exclude the lines containing these junk words from my n-grams.

setwd("D:/capstone research")
words <- read.csv("words.csv", sep=";")
length(words[,1])

dic_data_nw <- dic_data

Counting how many iterations the removal loop shown earlier needs
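A hedged sketch of how the loop count y could be derived, assuming the word list is processed in blocks of 2500 rows as in the loop shown earlier:

y <- ceiling(length(words[, 1]) / 2500) - 1   # number of 2500-word blocks, minus one because the loop starts at n = 0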
