This is a milestone report for the capstone project. It provides a concise summary of my exploratory data analysis and the high-level goals for the project.
The motivation for this report is to:

- Demonstrate that I have downloaded the data and have successfully loaded it in.
- Create a basic report of summary statistics about the data sets.
- Report any interesting findings that I have amassed so far.
- Get feedback on my plans for creating a prediction algorithm and Shiny app.
# download and unzip the Coursera-SwiftKey data set if it is not already present
fileURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
    download.file(fileURL, destfile = "Coursera-SwiftKey.zip", method = "curl")
}
if (!dir.exists("final")) {
    unzip("Coursera-SwiftKey.zip")
}
# load the three English-language corpora
path <- file.path(getwd(), "final", "en_US")
twitter <- readLines(file.path(path, "en_US.twitter.txt"), encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines(file.path(path, "en_US.blogs.txt"), encoding = "UTF-8", skipNul = TRUE)
news <- readLines(file.path(path, "en_US.news.txt"), encoding = "UTF-8", skipNul = TRUE)
# helper: file size, word count, line count, and longest line for one corpus
summarizeFile <- function(lines, filename) {
    data.frame(
        "File Size (MB)"      = paste(round(file.size(file.path(path, filename)) / 10^6, 2), "MB"),
        "Word Count"          = format(sum(lengths(strsplit(lines, " "))), big.mark = ","),
        "Line Count"          = format(length(lines), big.mark = ","),
        "Longest Line Length" = format(max(nchar(lines)), big.mark = ","),
        check.names = FALSE
    )
}

summaryInfo <- rbind(
    summarizeFile(blogs,   "en_US.blogs.txt"),
    summarizeFile(news,    "en_US.news.txt"),
    summarizeFile(twitter, "en_US.twitter.txt")
)
row.names(summaryInfo) <- c("Blogs", "News", "Twitter")
summaryInfo
## File Size (MB) Word Count Line Count Longest Line Length
## Blogs 210.16 MB 37,334,131 899,288 40,833
## News 205.81 MB 2,643,969 77,259 5,760
## Twitter 167.11 MB 30,373,583 2,360,148 140
To reduce the computational load, we will randomly take a 20% sample of each data set and then combine the samples.
# take a reproducible 20% sample of each data set
set.seed(1234)
sampleBlogs   <- sample(blogs,   floor(length(blogs) * 0.2),   replace = FALSE)
sampleNews    <- sample(news,    floor(length(news) * 0.2),    replace = FALSE)
sampleTwitter <- sample(twitter, floor(length(twitter) * 0.2), replace = FALSE)
sampleWords <- c(sampleBlogs, sampleNews, sampleTwitter)
# replace non-ASCII characters with byte escapes so the tm transformations behave
sampleWords <- iconv(sampleWords, "UTF-8", "ASCII", sub = "byte")
We will now clean the corpus with the `tm` package: removing numbers, punctuation, and extra whitespace, lower-casing the text, and dropping English stop words.
library(tm)

# build a corpus and apply the cleaning transformations
sampleCorpus <- Corpus(VectorSource(sampleWords))
sampleCorpus <- tm_map(sampleCorpus, removeNumbers)
sampleCorpus <- tm_map(sampleCorpus, removePunctuation)
sampleCorpus <- tm_map(sampleCorpus, stripWhitespace)
sampleCorpus <- tm_map(sampleCorpus, content_transformer(tolower))
sampleCorpus <- tm_map(sampleCorpus, removeWords, stopwords("english"))
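As a quick sanity check (the exact output will vary with the random sample), we can peek at a couple of cleaned documents:

# inspect a few cleaned documents to verify the transformations took effect
inspect(sampleCorpus[1:2])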
We now compute the highest-frequency unigrams using the `tau` package.
library(tau)

# textcnt expects character data, so flatten the corpus to a character vector first
cleanText <- sapply(sampleCorpus, as.character)
unigram <- textcnt(cleanText, n = 1, method = "string")
unigramSorted <- unigram[order(unigram, decreasing = TRUE)]
barplot(head(unigramSorted, 10), main = "Top 10 Unigrams", col = "yellow", ylab = "Frequency")
We can also visualize the unigram frequencies as a word cloud.
library(wordcloud)
library(RColorBrewer)

# word cloud of the most frequent terms in the cleaned corpus
wordcloud(sampleCorpus, scale = c(3, 0.5), min.freq = 5, max.words = 200,
          random.order = TRUE, rot.per = 0.5, colors = brewer.pal(6, "Paired"),
          use.r.layout = FALSE)
Next, the highest-frequency bigrams:
bigram <- textcnt(cleanText, n = 2, method = "string")
bigramSorted <- bigram[order(bigram, decreasing = TRUE)]
barplot(head(bigramSorted, 10), main = "Top 10 Bigrams", col = "green", ylab = "Frequency")
Finally, the highest-frequency trigrams:
trigram <- textcnt(cleanText, n = 3, method = "string")
trigramSorted <- trigram[order(trigram, decreasing = TRUE)]
trigramSorted[1:10]
## cant wait see happy mothers day let us know
## 699 693 465
## happy new year im pretty sure new york city
## 375 308 271
## looking forward seeing cinco de mayo dont even know
## 222 213 202
## cant wait get
## 181
barplot(head(trigramSorted, 5), main = "Top 5 Trigrams", col = "blue", ylab = "Frequency")
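These n-gram tables feed directly into the planned prediction algorithm. As a rough preview, here is a minimal sketch of a stupid-backoff-style lookup; `predictNext` is a hypothetical helper written against the `trigramSorted` and `bigramSorted` tables above, not the final model:

# Hypothetical next-word lookup (a sketch, not the final model):
# try trigrams keyed on the last two words, then back off to bigrams.
predictNext <- function(phrase, trigrams, bigrams, k = 3) {
    words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
    # trigram candidates whose first words match the end of the input
    hits <- trigrams[grepl(paste0("^", paste(words, collapse = " "), " "),
                           names(trigrams))]
    if (length(hits) == 0) {
        # back off: bigram candidates keyed on the final word only
        hits <- bigrams[grepl(paste0("^", tail(words, 1), " "), names(bigrams))]
    }
    if (length(hits) == 0) return(character(0))
    # report the last word of the k most frequent matching n-grams
    top <- head(sort(hits, decreasing = TRUE), k)
    sapply(strsplit(names(top), " "), tail, 1)
}

Given the counts above, `predictNext("happy mothers", trigramSorted, bigramSorted)` should return "day" as its top candidate. The real implementation will need smoothing and a compact storage format, which is part of the next steps below.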
At a high level, the next steps can be categorized into three buckets.