## Introduction

The goal of this report is to demonstrate that I have become comfortable working with the text data and that I am on track to build the prediction algorithm for the final deliverable of the Capstone project.

The goals of this milestone report are to:

  1. Show that the data has been downloaded and loaded in successfully.
  2. Display summary statistics of the text data for the project.
  3. Show word clouds and graphs of n-grams.
  4. Describe the plan for building the prediction model.

## Downloading the data

Here, I download and store the data for the project. I also download a file of bad words in English which will be used for cleaning the text corpus later on.

# create a project directory if it does not already exist
if(!file.exists("./Capstone Project")){
  dir.create("./Capstone Project")
}
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

# download and unzip the SwiftKey dataset
if(!file.exists("./Capstone Project/Coursera-SwiftKey.zip")){
  download.file(Url, destfile = "./Capstone Project/Coursera-SwiftKey.zip", mode = "wb")
}

if(!file.exists("./Capstone Project/final")){
  unzip(zipfile = "./Capstone Project/Coursera-SwiftKey.zip", exdir = "./Capstone Project")
}

setwd("./Capstone Project/final/en_US")

# download and unzip the list of bad words used later for profanity filtering
if(!file.exists("full-list-of-bad-words_text-file_2018_07_30.zip")){
  download.file("https://www.freewebheaders.com/download/files/full-list-of-bad-words_text-file_2018_07_30.zip",
                destfile = "full-list-of-bad-words_text-file_2018_07_30.zip", mode = "wb")
}

if(!file.exists("full-list-of-bad-words_text-file_2018_07_30.txt")){
  unzip(zipfile = "full-list-of-bad-words_text-file_2018_07_30.zip")
}

## Reading the text files

The three English text files are read into character vectors, one element per line.

twitterdata <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
blogsdata <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
newsdata <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
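Note that readLines can stop early on en_US.news.txt, because the file contains an embedded SUB (0x1A) control character; this is why the news line count reported below is much lower than those of the other two files. A workaround, sketched here, is to read the file through a binary connection:

# reading en_US.news.txt through a binary connection avoids truncation
# at the embedded SUB (0x1A) character
con <- file("en_US.news.txt", open = "rb")
newsdata <- readLines(con, warn = FALSE, encoding = "UTF-8")
close(con)

For consistency with the statistics reported below, the rest of this report keeps the text-mode read above.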

## Display summary statistics of the text data

Here, I write code to display summary statistics of the data: the number of lines, words, and characters in each file.

library(stringi)
length(twitterdata)
## [1] 2360148
length(blogsdata)
## [1] 899288
length(newsdata)
## [1] 77259
# word counts (element 4 of stri_stats_latex is the word count)
twitterdata_words <- stri_stats_latex(twitterdata)[4]
blogsdata_words <- stri_stats_latex(blogsdata)[4]
newsdata_words <- stri_stats_latex(newsdata)[4]

# character counts
nchar_twitter <- sum(nchar(twitterdata))
nchar_blogs <- sum(nchar(blogsdata))
nchar_news <- sum(nchar(newsdata))

data.frame("File Name" = c("twitter", "blogs", "news"),
           "num.lines" = c(length(twitterdata), length(blogsdata), length(newsdata)),
           "num.words" = c(twitterdata_words, blogsdata_words, newsdata_words),
           "Num of character" = c(nchar_twitter, nchar_blogs, nchar_news))
##   File.Name num.lines num.words Num.of.character
## 1   twitter   2360148  30451128        162096031
## 2     blogs    899288  37570839        206824505
## 3      news     77259   2651432         15639408
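As a quick sanity check on these numbers, the on-disk size of each file can also be inspected with base R (sizes in megabytes; output omitted here):

# file sizes in megabytes
round(file.size(c("en_US.twitter.txt", "en_US.blogs.txt", "en_US.news.txt")) / 1024^2, 1)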

## Exploratory Data Analysis

Here, I write code to perform exploratory analysis on the data, create n-grams, and show word clouds. For this analysis, I take an approximately 1% sample of each source, keeping each line according to a binomial (Bernoulli) draw. The text is then cleaned and preprocessed with the tm package's tm_map function, and profane words are removed from the corpus using the list downloaded from https://www.freewebheaders.com.

blogs_c <- iconv(blogsdata, "latin1", "ASCII", sub = "")
news_c <- iconv(newsdata, "latin1", "ASCII", sub = "")
twitter_c <- iconv(twitterdata, "latin1", "ASCII", sub = "")

set.seed(2000)

# keep each line with probability 0.01 (one Bernoulli draw per line)
keep_lines <- function(x, p = 0.01) x[rbinom(length(x), 1, p) == 1]
data_sample_binomial <- c(keep_lines(twitter_c),
                          keep_lines(news_c),
                          keep_lines(blogs_c))
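An equivalent and arguably simpler alternative, shown only as a sketch, is a fixed-size simple random sample of each source:

# alternative: a fixed-size 1% simple random sample per source
data_sample <- c(sample(twitter_c, round(length(twitter_c) * 0.01)),
                 sample(news_c, round(length(news_c) * 0.01)),
                 sample(blogs_c, round(length(blogs_c) * 0.01)))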

library(tm)
## Loading required package: NLP
library(NLP)

# preprocessing the data

corpus <- VCorpus(VectorSource(data_sample_binomial))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")  # strip URLs
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                      # strip @mentions
# wrapping tolower in content_transformer keeps the documents as
# PlainTextDocuments, so no separate PlainTextDocument step is needed
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

# removing bad words from the text data

profanewords <- read.table("./full-list-of-bad-words_text-file_2018_07_30.txt",
                           skip = 14)
corpus <- tm_map(corpus, removeWords, profanewords$V1)

# drop any remaining non-alphabetic, non-space characters
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeNumPunct))



corpusresult<-data.frame(text=unlist(sapply(corpus,'[',"content")),stringsAsFactors = FALSE)
head(corpusresult)
##                                                                     text
## 1            blast last night singing event  new jobs afterward yay caps
## 2                          fool know yes just called austin mahone fool 
## 3                                 today feel like  good place wrong time
## 4            arguing comcast showtime channels less paying jacking rates
## 5      idea know lianli products will look stay tuned whole new set pcs 
## 6 thank help today smckc happy hour really appreciate guys coming rescue
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(corpus, max.words = 50, random.order = FALSE, random.color = TRUE,
          colors = brewer.pal(8, "Accent"))

library(RWeka)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
unigram<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
unigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=unigram))

unigramcorpus<-findFreqTerms(unigramtab,lowfreq=1200)
unigramcorpusnum<-rowSums(as.matrix(unigramtab[unigramcorpus,]))
unigramcorpustab<-data.frame(Word=names(unigramcorpusnum),frequency=unigramcorpusnum)
unigramcorpussort<-unigramcorpustab[order(-unigramcorpustab$frequency),]

wordcloud(unigramcorpussort$Word, unigramcorpussort$frequency,  max.words = 100, random.order = 0, scale = c(5,1), colors=brewer.pal(8, "Accent"))

ggplot(head(unigramcorpussort, 20), aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_col(fill = "black") +
  labs(title = "Unigrams", x = "Words", y = "Occurrences") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

bigram<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
bigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=bigram))
bigramcorpus<-findFreqTerms(bigramtab,lowfreq=100)
bigramcorpusnum<-rowSums(as.matrix(bigramtab[bigramcorpus,]))
bigramcorpustab<-data.frame(Word=names(bigramcorpusnum),frequency=bigramcorpusnum)
bigramcorpussort<-bigramcorpustab[order(-bigramcorpustab$frequency),]

wordcloud(bigramcorpussort$Word, bigramcorpussort$frequency,  max.words = 100, random.order = 0, scale = c(2,1), colors=brewer.pal(8, "Accent"))

ggplot(head(bigramcorpussort, 10), aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_col(fill = "black") +
  labs(title = "Bigrams", x = "Words", y = "Occurrences") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

trigram<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
trigramtab<-TermDocumentMatrix(corpus,control=list(tokenize=trigram))
trigramcorpus<-findFreqTerms(trigramtab,lowfreq=12)
trigramcorpusnum<-rowSums(as.matrix(trigramtab[trigramcorpus,]))
trigramcorpustab<-data.frame(Word=names(trigramcorpusnum),frequency=trigramcorpusnum)
trigramcorpussort<-trigramcorpustab[order(-trigramcorpustab$frequency),]

wordcloud(trigramcorpussort$Word, trigramcorpussort$frequency,  max.words = 20, random.order = 0, scale = c(1.5,0.5), colors=brewer.pal(8, "Accent"))

ggplot(head(trigramcorpussort, 15), aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_col(fill = "black") +
  labs(title = "Trigrams", x = "Words", y = "Occurrences") +
  theme(axis.text.x = element_text(angle = 25, hjust = 1))

## Conclusion and Next Steps

The exploratory analysis is complete: we now know the most frequently occurring single words (unigrams), two-word combinations (bigrams), and three-word combinations (trigrams). The next step is to build the predictive algorithm for the Shiny app using an n-gram based model, in which the user inputs a set of words and the algorithm predicts the next word.
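As a first, deliberately naive illustration of that plan (a sketch over the sampled frequency tables built above, not the final model), the sorted trigram and bigram tables can already answer the question "which word most often follows this phrase", backing off from trigrams to bigrams when no match is found. The function name predict_next_word is my own placeholder:

# naive next-word lookup over the trigram and bigram frequency tables
# built above; backs off from trigrams to bigrams (a sketch, not the
# final model, which will need smoothing and a much larger sample)
predict_next_word <- function(input) {
  tokens <- tolower(unlist(strsplit(input, "\\s+")))
  n <- length(tokens)
  if (n >= 2) {
    # look for trigrams starting with the last two words
    prefix <- paste(tokens[n - 1], tokens[n])
    hits <- trigramcorpussort[startsWith(as.character(trigramcorpussort$Word),
                                         paste0(prefix, " ")), ]
    if (nrow(hits) > 0)
      return(tail(strsplit(as.character(hits$Word[1]), " ")[[1]], 1))
  }
  # back off: look for bigrams starting with the last word
  hits <- bigramcorpussort[startsWith(as.character(bigramcorpussort$Word),
                                      paste0(tokens[n], " ")), ]
  if (nrow(hits) > 0)
    return(tail(strsplit(as.character(hits$Word[1]), " ")[[1]], 1))
  NA_character_  # no match in the sampled tables
}

predict_next_word("happy new")

Because the tables are sorted by descending frequency, the first matching row is the most frequent continuation. The final model will extend this idea with a larger sample and a proper backoff or smoothing scheme.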