# Tasks to accomplish
# Exploratory analysis - perform a thorough exploratory analysis of the data,
# understanding the distribution of words and the relationships between words in the corpora.
# Understand frequencies of words and word pairs - build figures and tables to understand
# variation in the frequencies of words and word pairs in the data.

library(stringi)
library(tm)
## Loading required package: NLP
library(ngram)
library(NLP)
library(RWeka)

# Read in the data.
setwd("D:/Documents and Settings/tnjunge/Documents/R/Capstone Project")

Twitter <- readLines("final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")

News <- readLines("final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8")

Blogs <- readLines("final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
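# Note: on Windows, a stray control character in en_US.news.txt can end a text-mode
# read early, so the News line count below may be understated. If that appears to be
# the case, reading the file through a binary connection is a safer option, e.g.:
#   con  <- file("final/en_US/en_US.news.txt", open = "rb")
#   News <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
#   close(con)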
                   
# Find the number of lines in Twitter, News, and Blogs
blogs_length <- length(Blogs)
news_length <- length(News)
twitter_length <- length(Twitter)

Length_count <- rbind(blogs_length, news_length, twitter_length)
Length_count
##                   [,1]
## blogs_length    899288
## news_length      77259
## twitter_length 2360148
# Find the in-memory size of each object (Twitter, News, Blogs) in megabytes.
# Note: object.size() keeps its "bytes" print label even after dividing by 1e6,
# so the values printed below are actually megabytes.
SizeInMemoryInMB_Twitter <- object.size(Twitter)/1000000
SizeInMemoryInMB_News <- object.size(News)/1000000
SizeInMemoryInMB_Blogs <- object.size(Blogs)/1000000
SizeInMemoryInMB_Twitter; SizeInMemoryInMB_News; SizeInMemoryInMB_Blogs
## 316 bytes
## 20.1 bytes
## 260.6 bytes
# Find the number of words in each source
twitter_words<-sum(stri_count_words(Twitter))
blog_words<-sum(stri_count_words(Blogs))
news_words<-sum(stri_count_words(News))
twitter_words;blog_words;news_words
## [1] 30093369
## [1] 37546246
## [1] 2674536
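# Optional: gather the line counts, word counts, and in-memory sizes computed above
# into a single summary table for the report (a small sketch using those objects;
# corpus_summary is an illustrative name).
corpus_summary <- data.frame(
  source = c("Blogs", "News", "Twitter"),
  lines = c(blogs_length, news_length, twitter_length),
  words = c(blog_words, news_words, twitter_words),
  size_MB = c(as.numeric(SizeInMemoryInMB_Blogs),
              as.numeric(SizeInMemoryInMB_News),
              as.numeric(SizeInMemoryInMB_Twitter))
)
corpus_summary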
# Subset a 5% sample from each source
set.seed(10000)
sample_blogs <- sample(Blogs, blogs_length*0.05)
sample_news <- sample(News, news_length*0.05)
sample_twitter <- sample(Twitter, twitter_length*0.05)
sample_Data <- c(sample_blogs, sample_news, sample_twitter)
sample_words <- sum(stri_count_words(sample_Data))    # renamed to avoid masking base::sum
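# Quick sanity check (optional): the sample should hold roughly 5% of all words.
sample_words / (twitter_words + blog_words + news_words)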

# Create the corpus and clean the data: convert to ASCII, remove punctuation and numbers,
# strip extra whitespace, convert to lower case, and coerce to plain-text documents.
sample_Data <- iconv(sample_Data, "UTF-8", "ASCII", sub = "")   # sub = "" drops unconvertible characters instead of returning NA
corpus <- Corpus(VectorSource(sample_Data))


corpus <- tm_map(corpus, removePunctuation) 
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, content_transformer(tolower)) 
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, PlainTextDocument) 
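# Optional further cleaning (a sketch; not reflected in the results below): the top
# unigram counts are typically dominated by common function words such as "the" and
# "and". For a stopword-free view of the frequencies, tm's built-in English stopword
# list could be removed into a separate corpus (corpus_nostop is an illustrative name).
corpus_nostop <- tm_map(corpus, removeWords, stopwords("en"))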


# Create n-grams (unigrams)

UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))

Unigrams <- TermDocumentMatrix(corpus, control = list(tokenize = UnigramTokenizer))


# Helper: convert a term-document matrix into a data frame of word frequencies,
# sorted in decreasing order.
freq_frame <- function(tdm){
  freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
  freq_frame <- data.frame(word=names(freq), freq=freq)
  return(freq_frame)
}

# Drop very sparse terms, then sum the counts and sort by frequency
UnigramsDense <- removeSparseTerms(Unigrams, 0.999)
UnigramsDenseSorted <- freq_frame(UnigramsDense)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
uni_plot <- ggplot(data = UnigramsDenseSorted[1:50,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity",fill="blue")
uni_plot <- uni_plot + labs(x = "N-gram", y = "Frequency", title = "Frequencies of the 50 Most Used Words")
uni_plot <- uni_plot + theme(axis.text.x=element_text(angle=90))
uni_plot
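# How concentrated are the word frequencies? A quick coverage check (a sketch): count
# how many of the most frequent unigrams account for 50% and 90% of the word instances
# kept in the sparsity-trimmed matrix above. The 0.5 and 0.9 cut-offs are illustrative.
coverage <- cumsum(UnigramsDenseSorted$freq) / sum(UnigramsDenseSorted$freq)
words_for_50pct <- which(coverage >= 0.5)[1]
words_for_90pct <- which(coverage >= 0.9)[1]
words_for_50pct; words_for_90pct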

### SUMMARY
# Based on the above initial exploration, the next steps will be to:
# 1. Clean the data further.
# 2. Build more n-grams for combinations of at least two and three words (bigrams and trigrams).
# 3. Determine a training and test data set.
# 4. Use these to build the prediction model.
# Sketches of steps 2 and 3 follow below.
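# Sketch of step 2 (bigram and trigram counts), following the same RWeka tokenizer
# pattern used for the unigrams above; the object names are illustrative.
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

Bigrams <- TermDocumentMatrix(corpus, control = list(tokenize = BigramTokenizer))
Trigrams <- TermDocumentMatrix(corpus, control = list(tokenize = TrigramTokenizer))

BigramsSorted <- freq_frame(removeSparseTerms(Bigrams, 0.999))
TrigramsSorted <- freq_frame(removeSparseTerms(Trigrams, 0.999))
head(BigramsSorted); head(TrigramsSorted)

# Sketch of step 3: a simple train/test split of the sampled lines for the prediction
# model. The 80/20 ratio is an assumption, not something fixed by the analysis above.
train_index <- sample(seq_along(sample_Data), size = floor(0.8 * length(sample_Data)))
train_data <- sample_Data[train_index]
test_data <- sample_Data[-train_index]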