This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Print summary statistics for every column of `cars`; the `##` lines that
# follow are the rendered output of this call.
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
# Download and unpack the Coursera/SwiftKey capstone dataset.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "capstone_dataset.zip"

# Skip the download when the archive is already on disk so that re-knitting
# the document does not fetch it again.
if (!file.exists(zip_file)) {
  # mode = "wb" transfers the archive as binary on every platform.
  download.file(url, destfile = zip_file, mode = "wb")
}

# Extract the archive into the current working directory.
unzip(zip_file)
Load the R packages necessary for running the analysis (stringi, knitr, tm, wordcloud, and RColorBrewer).
Building a table
# Paths of the three English-language text files (blogs, news, twitter).
# NOTE(review): these absolute paths presumably point at the location where
# the archive was extracted on the author's machine -- confirm before re-running.
file.list <- c(
  "//Home/final/en_US/en_US.blogs.txt",
  "//Home/final/en_US/en_US.news.txt",
  "//Home/final/en_US/en_US.twitter.txt"
)

# Holds the lines of each file; every element is overwritten in the loop below.
text <- list(blogs = "", news = "", twitter = "")

# One row per source: file size in Mb, number of lines, number of words.
matrix.summary <- matrix(
  0,
  nrow = 3,
  ncol = 3,
  dimnames = list(
    c("blogs", "news", "twitter"),
    c("file size, Mb", "lines", "words")
  )
)

for (i in seq_along(file.list)) {
  # Read through a binary-mode connection, skipping embedded NUL bytes.
  # `finally` closes the connection even when readLines() fails.
  con <- file(file.list[i], "rb")
  text[[i]] <- tryCatch(
    readLines(con, encoding = "UTF-8", skipNul = TRUE),
    finally = close(con)
  )

  matrix.summary[i, 1] <- round(file.info(file.list[i])$size / 1024^2, 2)
  matrix.summary[i, 2] <- length(text[[i]])
  matrix.summary[i, 3] <- sum(stri_count_words(text[[i]]))
}

kable(matrix.summary)
##         file size, Mb   lines    words
## blogs          200.42  899288 37546246
## news           196.28 1010242 34762395
## twitter        159.36 2360148 30093410

# Because the files are very large, the analysis proceeds on a small random
# sample of each one. For example, the news file is 196 MB and has 1,010,242
# lines; I will use about 5k random lines for analysis.
# Draw a reproducible random sample of 0.5% of the lines of each source.
set.seed(123)
blogs_sample <- sample(text$blogs, 0.005 * length(text$blogs))
news_sample <- sample(text$news, 0.005 * length(text$news))
twitter_sample <- sample(text$twitter, 0.005 * length(text$twitter))

# Blogs Sample ----

# Create corpus
corpus1 <- Corpus(VectorSource(blogs_sample))
# To lower case
corpus1 <- tm_map(corpus1, content_transformer(tolower))
# Remove punctuation marks
corpus1 <- tm_map(corpus1, removePunctuation)
# Remove numbers
corpus1 <- tm_map(corpus1, removeNumbers)
# Remove stop words
corpus1 <- tm_map(corpus1, removeWords, stopwords("english"))
# Remove whitespaces
corpus1 <- tm_map(corpus1, stripWhitespace)

# Build the dense term-document matrix once and reuse it for both plots.
term.doc.matrix1 <- TermDocumentMatrix(corpus1)
term.doc.matrix1 <- as.matrix(term.doc.matrix1)

# Total count of each term across all sampled documents, most frequent first.
word.freqs1 <- sort(rowSums(term.doc.matrix1), decreasing = TRUE)
dm1 <- data.frame(word = names(word.freqs1), freq = word.freqs1)

# Bar plot of the ten most frequent words.
frequentWords <- head(word.freqs1, 10)
barplot(
  frequentWords,
  main = "Blogs Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count"
)

# Word cloud of every term that occurs at least 100 times.
wordcloud(
  dm1$word,
  dm1$freq,
  min.freq = 100,
  random.order = TRUE,
  rot.per = .25,
  colors = brewer.pal(8, "Dark2")
)
## Warning in wordcloud(dm1$word, dm1$freq, min.freq = 100, random.order =
## TRUE, : can could not be fit on page. It will not be plotted.
# NOTE(review): a smaller `scale` argument would presumably let the dropped
# word fit -- confirm against the wordcloud documentation.
# News Data ----

# Create corpus
corpus2 <- Corpus(VectorSource(news_sample))
# To lower case
corpus2 <- tm_map(corpus2, content_transformer(tolower))
# Remove punctuation marks
corpus2 <- tm_map(corpus2, removePunctuation)
# Remove numbers
corpus2 <- tm_map(corpus2, removeNumbers)
# Remove stop words
corpus2 <- tm_map(corpus2, removeWords, stopwords("english"))
# Remove whitespaces
corpus2 <- tm_map(corpus2, stripWhitespace)

# Build the dense term-document matrix once and reuse it for both plots.
term.doc.matrix2 <- TermDocumentMatrix(corpus2)
term.doc.matrix2 <- as.matrix(term.doc.matrix2)

# Total count of each term across all sampled documents, most frequent first.
word.freqs2 <- sort(rowSums(term.doc.matrix2), decreasing = TRUE)
dm2 <- data.frame(word = names(word.freqs2), freq = word.freqs2)

# Bar plot of the ten most frequent words.
frequentWords <- head(word.freqs2, 10)
barplot(
  frequentWords,
  main = "News Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count"
)

# Most common words in the corpus
wordcloud(
  dm2$word,
  dm2$freq,
  min.freq = 100,
  random.order = TRUE,
  rot.per = .25,
  colors = brewer.pal(8, "Dark2")
)
# Twitter Data ----

# Create corpus
corpus3 <- Corpus(VectorSource(twitter_sample))

# Re-encode to UTF-8; bytes that cannot be converted are replaced by their
# hex codes (sub = "byte") instead of turning the whole string into NA.
corpus3 <- tm_map(
  corpus3,
  content_transformer(function(x) iconv(x, to = "UTF-8", sub = "byte"))
)

# To lower case
corpus3 <- tm_map(corpus3, content_transformer(tolower))
# Remove punctuation marks
corpus3 <- tm_map(corpus3, removePunctuation)
# Remove numbers
corpus3 <- tm_map(corpus3, removeNumbers)
# Remove stop words
corpus3 <- tm_map(corpus3, removeWords, stopwords("english"))
# Remove whitespaces
corpus3 <- tm_map(corpus3, stripWhitespace)

# Build the dense term-document matrix once and reuse it for both plots.
term.doc.matrix3 <- TermDocumentMatrix(corpus3)
term.doc.matrix3 <- as.matrix(term.doc.matrix3)

# Total count of each term across all sampled documents, most frequent first.
word.freqs3 <- sort(rowSums(term.doc.matrix3), decreasing = TRUE)
dm3 <- data.frame(word = names(word.freqs3), freq = word.freqs3)

# Bar plot of the ten most frequent words.
frequentWords <- head(word.freqs3, 10)
barplot(
  frequentWords,
  main = "Twitter Data: Most Frequent Words",
  xlab = "Word",
  ylab = "Count"
)

# Word cloud of every term that occurs at least 100 times. Unlike the other
# two clouds, random.order = FALSE is passed here.
wordcloud(
  dm3$word,
  dm3$freq,
  min.freq = 100,
  random.order = FALSE,
  rot.per = .25,
  colors = brewer.pal(8, "Dark2")
)
Future analysis and plans: build further models based on n-grams (bigrams and trigrams), then create a prediction model.