R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Per-column summary statistics of the built-in `cars` data set.
print(summary(cars))
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.


# Download the Coursera-SwiftKey capstone archive and unpack it into the
# working directory.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
# `destfile` is spelled out instead of relying on partial matching of `dest`;
# `mode = "wb"` writes the zip as binary so it is not corrupted on Windows.
download.file(url, destfile = "capstone_dataset.zip", mode = "wb")
unzip("capstone_dataset.zip")

Load the R packages necessary for running the analysis (the code below uses functions from stringi, knitr, tm, wordcloud and RColorBrewer).

Building a table

# Paths of the three en_US source files (blogs, news, twitter), in that order.
file.list <- c(
  "//Home/final/en_US/en_US.blogs.txt",
  "//Home/final/en_US/en_US.news.txt",
  "//Home/final/en_US/en_US.twitter.txt"
)

# Placeholder list with one (initially empty) entry per source; each entry is
# later replaced by the lines read from the corresponding file.
text <- list(blogs = "", news = "", twitter = "")

# Summary table: one row per source file, with its size on disk (Mb), its
# number of lines and its total word count.
matrix.summary <- matrix(
  0,
  nrow = 3, ncol = 3,
  dimnames = list(
    c("blogs", "news", "twitter"),
    c("file size, Mb", "lines", "words")
  )
)
for (i in seq_len(nrow(matrix.summary))) {
  # The connection is opened in binary mode ("rb"); skipNul = TRUE makes
  # readLines skip embedded NUL characters.
  con <- file(file.list[i], "rb")
  text[[i]] <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
  close(con)
  matrix.summary[i, 1] <- round(file.info(file.list[i])$size / 1024^2, 2)
  matrix.summary[i, 2] <- length(text[[i]])
  matrix.summary[i, 3] <- sum(stri_count_words(text[[i]]))
}
kable(matrix.summary)
##           file size, Mb     lines      words
## blogs            200.42    899288   37546246
## news             196.28   1010242   34762395
## twitter          159.36   2360148   30093410

# Because the files are very large, we proceed with the analysis on a small
# random fraction of each file. For example, the news file is 196 MB in size
# and has 1,010,242 lines; I will use about 5k random lines of it for the
# analysis.

# Draw a reproducible random sample of 0.5% of the lines of each source.
set.seed(123)
blogs_sample <- sample(text$blogs, 0.005 * length(text$blogs))
news_sample <- sample(text$news, 0.005 * length(text$news))
twitter_sample <- sample(text$twitter, 0.005 * length(text$twitter))

# Blogs Sample ----
# Create corpus
corpus1 <- Corpus(VectorSource(blogs_sample))
# To lower case
corpus1 <- tm_map(corpus1, content_transformer(tolower))
# Remove punctuation marks
corpus1 <- tm_map(corpus1, removePunctuation)
# Remove numbers
corpus1 <- tm_map(corpus1, removeNumbers)
# Remove stop words
corpus1 <- tm_map(corpus1, removeWords, stopwords("english"))
# Remove extra whitespace
corpus1 <- tm_map(corpus1, stripWhitespace)
# Ten terms with the highest total count across the sampled documents
frequentWords <- head(
  sort(rowSums(as.matrix(TermDocumentMatrix(corpus1))), decreasing = TRUE),
  10
)

# Bar chart of the most frequent blog terms held in `frequentWords`.
barplot(
  frequentWords,
  main = "Blogs Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count"
)

# Word cloud of the blogs sample: term frequencies are the row sums of the
# term-document matrix, sorted in decreasing order.
term.doc.matrix1 <- TermDocumentMatrix(corpus1)
term.doc.matrix1 <- as.matrix(term.doc.matrix1)
word.freqs1 <- sort(rowSums(term.doc.matrix1), decreasing = TRUE)
dm1 <- data.frame(word = names(word.freqs1), freq = word.freqs1)
# Only terms occurring at least 100 times are drawn.
wordcloud(
  dm1$word, dm1$freq,
  min.freq = 100,
  random.order = TRUE,
  rot.per = .25,
  colors = brewer.pal(8, "Dark2")
)
## Warning in wordcloud(dm1$word, dm1$freq, min.freq = 100, random.order =
## TRUE, : can could not be fit on page. It will not be plotted.

# News Data ----
# Create corpus
corpus2 <- Corpus(VectorSource(news_sample))
# To lower case
corpus2 <- tm_map(corpus2, content_transformer(tolower))
# Remove punctuation marks
corpus2 <- tm_map(corpus2, removePunctuation)
# Remove numbers
corpus2 <- tm_map(corpus2, removeNumbers)
# Remove stop words
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
# Remove extra whitespace
corpus2 <- tm_map(corpus2, stripWhitespace)
# Ten terms with the highest total count across the sampled documents
frequentWords <- head(
  sort(rowSums(as.matrix(TermDocumentMatrix(corpus2))), decreasing = TRUE),
  10
)

# Bar chart of the most frequent news terms held in `frequentWords`.
barplot(
  frequentWords,
  main = "News Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count"
)

# Word cloud of the news sample: term frequencies are the row sums of the
# term-document matrix, sorted in decreasing order.
term.doc.matrix2 <- TermDocumentMatrix(corpus2)
term.doc.matrix2 <- as.matrix(term.doc.matrix2)
word.freqs2 <- sort(rowSums(term.doc.matrix2), decreasing = TRUE)
dm2 <- data.frame(word = names(word.freqs2), freq = word.freqs2)
# Most common words in the corpus (only terms occurring at least 100 times)
wordcloud(
  dm2$word, dm2$freq,
  min.freq = 100,
  random.order = TRUE,
  rot.per = .25,
  colors = brewer.pal(8, "Dark2")
)

# Twitter Data ----
# Create corpus
corpus3 <- Corpus(VectorSource(twitter_sample))

Convert Character Vector between Encodings

# Re-encode every document to UTF-8; with sub = "byte", bytes that cannot be
# converted are replaced by their hex code instead of yielding NA.
corpus3 <- tm_map(
  corpus3,
  content_transformer(function(x) iconv(x, to = "UTF-8", sub = "byte"))
)

To lower case

# To lower case
corpus3 <- tm_map(corpus3, content_transformer(tolower))
# Remove punctuation marks
corpus3 <- tm_map(corpus3, removePunctuation)
# Remove numbers
corpus3 <- tm_map(corpus3, removeNumbers)
# Remove stop words
corpus3 <- tm_map(corpus3, removeWords, stopwords("english"))
# Remove extra whitespace
corpus3 <- tm_map(corpus3, stripWhitespace)
# Ten terms with the highest total count across the sampled documents
frequentWords <- head(
  sort(rowSums(as.matrix(TermDocumentMatrix(corpus3))), decreasing = TRUE),
  10
)

# Bar chart of the most frequent Twitter terms held in `frequentWords`.
barplot(
  frequentWords,
  main = "Twitter Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count"
)

# Word cloud of the Twitter sample: term frequencies are the row sums of the
# term-document matrix, sorted in decreasing order.
term.doc.matrix3 <- TermDocumentMatrix(corpus3)
term.doc.matrix3 <- as.matrix(term.doc.matrix3)
word.freqs3 <- sort(rowSums(term.doc.matrix3), decreasing = TRUE)
dm3 <- data.frame(word = names(word.freqs3), freq = word.freqs3)
# Only terms occurring at least 100 times are drawn; random.order = FALSE
# plots words in decreasing order of frequency.
wordcloud(
  dm3$word, dm3$freq,
  min.freq = 100,
  random.order = FALSE,
  rot.per = .25,
  colors = brewer.pal(8, "Dark2")
)

Future analysis/plans: build further models based on n-grams (bigrams, trigrams), then create a prediction model.