Introduction

This is the Natural Language Processing milestone report for the Coursera Data Science Capstone module. The dataset used in this report was provided by the course and can be downloaded from:

 https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

The report will meet several criteria stated in the instructions:

1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Lastly, the contents of the report are published on RPubs for review.

Downloading The Data

The data set was downloaded and unzipped. It contains three large text files (blogs, news, and twitter), separated by language. The en_US (English) text files were used for this report. The source of the dataset is the URL above, and its destination is the local working directory.

Basic Summary Statistics About The Datasets

This report demonstrates how to obtain the following summary statistics from the chosen dataset. First, check the number of lines in each file.

setwd("C:/Users/user/Desktop/Downloads/Coursera-SwiftKey")
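# The zip archive can be fetched and extracted as below if it is not already
# present (a sketch; it assumes the extracted en_US files were then copied or
# renamed to blogs.txt, news.txt and twitter.txt in this working directory).
if (!file.exists("Coursera-SwiftKey.zip")) {
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                  destfile = "Coursera-SwiftKey.zip")
    unzip("Coursera-SwiftKey.zip")
}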

blogs <- readLines(con <- file("blogs.txt"), encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
news <- readLines(con2 <- file("news.txt"), encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
twitters <- readLines(con3 <- file("twitter.txt"), encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
close(con)
close(con2)
close(con3)
length(blogs)
## [1] 899288
length(news)
## [1] 77259
length(twitters)
## [1] 2360148

Check word count.

blogsWords <- sum(sapply(gregexpr("\\S+", blogs), length))
blogsWords
## [1] 37334131
newsWords <- sum(sapply(gregexpr("\\S+", news), length))
newsWords
## [1] 2643969
twitterWords <- sum(sapply(gregexpr("\\S+", twitters), length))
twitterWords
## [1] 30373583
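The line and word counts above can be combined into a small summary table for an at-a-glance comparison (a minimal sketch using the objects created above):

summaryStats <- data.frame(
    file  = c("blogs", "news", "twitter"),
    lines = c(length(blogs), length(news), length(twitters)),
    words = c(blogsWords, newsWords, twitterWords)
)
summaryStats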

The length of the longest line in the twitter file, for example, can be determined as below.

twitcount <- nchar(twitters)
tmax <- which.max(twitcount)
nchar(twitters[tmax])
## [1] 140
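The same approach gives the longest line in the blogs and news files (a short sketch; the values are not reproduced here):

max(nchar(blogs))
max(nchar(news))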

Data Cleaning

Before continuing with the exploratory analysis, the data will be cleaned. This includes removing weird (non-ASCII) characters, punctuation, and extra whitespace, converting to lower case, and so on. The sample used below is drawn from the twitter data. First, remove the weird characters by converting the text to ASCII.

library(tm)
## Loading required package: NLP
library(NLP)
cleanedBlog<- iconv(blogs, 'UTF-8', 'ASCII', "byte")
cleanedNews<- iconv(news, 'UTF-8', 'ASCII', "byte")
cleanedTwitter<- iconv(twitters, 'UTF-8', 'ASCII', "byte")

Sample 10,000 lines from cleanedTwitter and build a corpus from it.

twitterSample<-sample(cleanedTwitter, 10000)
doc.vec <- VectorSource(twitterSample)                      
doc.corpus <- Corpus(doc.vec)

Convert to lower case

doc.corpus<- tm_map(doc.corpus, tolower)

Remove all punctuation.

doc.corpus<- tm_map(doc.corpus, removePunctuation)

Remove all numbers

doc.corpus<- tm_map(doc.corpus, removeNumbers)

Remove whitespace

doc.corpus <- tm_map(doc.corpus, stripWhitespace)

Force everything back to a plain text document.

doc.corpus <- tm_map(doc.corpus, PlainTextDocument)
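Note: with newer versions of the tm package, base R functions such as tolower should be wrapped in content_transformer() so that the documents keep their PlainTextDocument class; a minimal alternative sketch for the lower-casing step:

doc.corpus <- tm_map(doc.corpus, content_transformer(tolower))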

Exploratory Analysis

The following analysis uses the n-gram tokenizer from the RWeka library, based on some guides from my tutors. First, we tokenize the sample into unigrams, bigrams, trigrams, and quadgrams.

Tokenize samples

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(tm)
library(RWeka)
library(NLP)

uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
quadGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))

uniGramMatrix <- TermDocumentMatrix(doc.corpus, control = list(tokenize = uniGramTokenizer))
biGramMatrix <- TermDocumentMatrix(doc.corpus, control = list(tokenize = biGramTokenizer))
triGramMatrix <- TermDocumentMatrix(doc.corpus, control = list(tokenize = triGramTokenizer))
quadGramMatrix <- TermDocumentMatrix(doc.corpus, control = list(tokenize = quadGramTokenizer))

UnifreqTerms <- findFreqTerms(uniGramMatrix, lowfreq = 500)
UnitermFrequency <- rowSums(as.matrix(uniGramMatrix[UnifreqTerms,]))
UnitermFrequency <- data.frame(unigram=names(UnitermFrequency), frequency=UnitermFrequency)

g1 <- ggplot(UnitermFrequency, aes(x=reorder(unigram, frequency), y=frequency)) +
    geom_bar(stat = "identity") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Unigram") + ylab("Frequency") +
    labs(title = "Top Unigrams by Frequency")
print(g1)

freqTerms <- findFreqTerms(biGramMatrix, lowfreq = 100)
termFrequency <- rowSums(as.matrix(biGramMatrix[freqTerms,]))
termFrequency <- data.frame(bigram=names(termFrequency), frequency=termFrequency)

g2 <- ggplot(termFrequency, aes(x=reorder(bigram, frequency), y=frequency )) +
  geom_bar(stat = "identity", colour = "red") +  coord_flip() +
  theme(legend.title=element_blank()) +
  xlab("Bigram") + ylab("Frequency") +
  labs(title = "Top Bigrams by Frequency ")
print(g2)

freqTerms <- findFreqTerms(triGramMatrix, lowfreq = 15)
termFrequency <- rowSums(as.matrix(triGramMatrix[freqTerms,]))
termFrequency <- data.frame(trigram=names(termFrequency), frequency=termFrequency)

g3 <- ggplot(termFrequency, aes(x=reorder(trigram, frequency), y=frequency)) +
    geom_bar(stat = "identity", colour = "blue") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Trigram") + ylab("Frequency") +
    labs(title = "Top Trigrams by Frequency")
print(g3)

freqTerms <- findFreqTerms(quadGramMatrix, lowfreq = 15)
termFrequency <- rowSums(as.matrix(quadGramMatrix[freqTerms,]))
termFrequency <- data.frame(quadgram=names(termFrequency), frequency=termFrequency)


g4 <- ggplot(termFrequency, aes(x=reorder(quadgram, frequency), y=frequency)) +
    geom_bar(stat = "identity", colour = "blue") +  coord_flip() +
    theme(legend.title=element_blank()) +
    xlab("Quadgram") + ylab("Frequency") +
    labs(title = "Top Quadgrams by Frequency")
print(g4)

Visualisation using a word cloud, demonstrating the wordcloud library.

library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(doc.corpus, max.words = 500, random.order = FALSE,rot.per=0.35, use.r.layout=FALSE,colors=brewer.pal(6, "Dark2"))

Next Steps For The Prediction Algorithm And Shiny Application

This concludes the exploratory analysis of this dataset.

A prediction algorithm and Shiny app will be constructed separately, using an n-gram model built from an exploratory analysis similar to the one above.
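As an illustration of the planned approach, a simple back-off lookup can predict the next word from n-gram frequency tables like the ones built above: search the quadgram table for the observed history, fall back to shorter n-grams when no match is found, and return the most frequent continuation. Below is a minimal sketch; predictNextWord and the ngramCounts structure (a list of data frames with history, nextWord and frequency columns) are illustrative assumptions, not part of the analysis above.

predictNextWord <- function(phrase, ngramCounts) {
    # Split the input phrase into lower-case words
    words <- unlist(strsplit(tolower(phrase), "\\s+"))
    # Try the longest available history first, then back off to shorter n-grams
    for (n in length(ngramCounts):2) {
        if (length(words) >= n - 1) {
            hist <- paste(tail(words, n - 1), collapse = " ")
            matches <- ngramCounts[[n]][ngramCounts[[n]]$history == hist, ]
            if (nrow(matches) > 0) {
                return(matches$nextWord[which.max(matches$frequency)])
            }
        }
    }
    # No match at any order: return the most frequent unigram
    ngramCounts[[1]]$nextWord[which.max(ngramCounts[[1]]$frequency)]
}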