The goal of this project is to show how I am working with the final project data and the prediction algorithm. The report on RPubs explains my exploratory analysis and my goals for the eventual app and algorithm. This document explains the major features of the data and briefly summarizes my plans for creating the prediction algorithm and Shiny app.
The motivation for this project is to: 1. Demonstrate that you’ve downloaded the data and have successfully loaded it in. 2. Create a basic report of summary statistics about the data sets. 3. Report any interesting findings that you have amassed so far. 4. Get feedback on your plans for creating a prediction algorithm and Shiny app.
# Point the session at the folder that holds the three en_US corpus files;
# the relative file names below are resolved against this directory.
setwd("e:/datascience/r/Data Science Capstone/final/en_us")
library(readr)
## Warning: package 'readr' was built under R version 3.4.2
# Read each corpus into its own character vector (a negative n_max means
# "no limit on the number of lines read").
blogs <- read_lines(file = "en_US.blogs.txt")
news <- read_lines(file = "en_US.news.txt", n_max = -1L)
twitter <- read_lines(file = "en_US.twitter.txt", n_max = -1L)
library(stringi)
# Basic summary statistics (line count and word count) for each corpus.
# Word counts are obtained by joining all lines of a corpus with single
# spaces and counting words with stringi::stri_count_words().
#
# NOTE: the news and twitter print() calls previously reported blog_lines /
# blog_words (copy-paste error), so their pasted console output merely
# repeated the blog figures. Those stale outputs were removed; re-knit the
# report to regenerate them.

# Blogs
blog_lines <- length(blogs)
print(paste0("Number of lines in blog file: ", blog_lines))
## [1] "Number of lines in blog file: 899288"
blog_words <- stri_count_words(paste(blogs, collapse = " "))
print(paste0("Number of words in blog file: ", blog_words))
## [1] "Number of words in blog file: 37546246"

# News
news_lines <- length(news)
print(paste0("Number of lines in news file: ", news_lines))
news_words <- stri_count_words(paste(news, collapse = " "))
print(paste0("Number of words in news file: ", news_words))

# Twitter
twitter_lines <- length(twitter)
print(paste0("Number of lines in twitter file: ", twitter_lines))
twitter_words <- stri_count_words(paste(twitter, collapse = " "))
print(paste0("Number of words in twitter file: ", twitter_words))
A random sample of 1000 lines from each of the three files was used here as an example to analyze the 1-word (unigram) frequency. The three samples were combined, preprocessed with the tm package, and the resulting word counts were stored in the “frequency” vector. The top 100 words were plotted for reference.
library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
library(SnowballC)
# Fix the RNG seed so the same lines are sampled on every run.
set.seed(11111)
# File cleaning, sampling and combination:
# Draw 1000 random lines from each corpus.
blogs_sample <- sample(blogs, 1000)
news_sample <- sample(news, 1000)
twitter_sample <- sample(twitter, 1000)
# Reduce the samples to plain ASCII. sub = "" drops only the bytes that
# cannot be converted; without it iconv() returns NA for the WHOLE line and
# paste() below would insert the literal text "NA" into the corpus, losing
# the line and inflating the count of the token "na". Term counts computed
# from this text can therefore differ from output pasted from earlier runs.
blogs_sample <- iconv(blogs_sample, from = "UTF-8", to = "ASCII", sub = "")
news_sample <- iconv(news_sample, from = "UTF-8", to = "ASCII", sub = "")
twitter_sample <- iconv(twitter_sample, from = "UTF-8", to = "ASCII", sub = "")
# Join the i-th blog, news and twitter lines with spaces, then collapse all
# 1000 joined rows into one single string.
newfile <- paste(blogs_sample, news_sample, twitter_sample, collapse = " ")
# File preprocessing and cleaning:
# Wrap the combined text in a tm corpus, then apply the cleaning steps one
# at a time: lower-case, strip punctuation, squeeze whitespace, and finally
# drop English stop words.
new_source <- VectorSource(x = newfile)
new_corpus1 <- Corpus(x = new_source)
lower_case <- content_transformer(FUN = stringi::stri_trans_tolower)
new_corpus2 <- tm_map(x = new_corpus1, FUN = lower_case)
new_corpus3 <- tm_map(x = new_corpus2, FUN = removePunctuation)
new_corpus4 <- tm_map(x = new_corpus3, FUN = stripWhitespace)
english_stopwords <- stopwords("english")
new_corpus5 <- tm_map(x = new_corpus4, FUN = removeWords, english_stopwords)
# Word frequency count and plotting:
# Build the document-term matrix for the cleaned corpus and total each
# term's count across documents, most frequent first.
dtm_1gram <- DocumentTermMatrix(new_corpus5)
dtm_1 <- as.matrix(dtm_1gram)
frequency <- sort(colSums(dtm_1), decreasing = TRUE)
head(frequency)
## said will one just like get
## 228 208 165 164 141 137
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.2
## Loading required package: RColorBrewer
words <- names(frequency)
# head() caps the selection at the number of terms actually available;
# indexing with 1:100 would pad with NA when there are fewer than 100 terms.
wordcloud(head(words, 100), head(frequency, 100))
The same combined file was used here as an example to analyze the 2-gram and 3-gram frequencies. The top 10 n-grams in each group were bar-plotted and are shown here.
# 2-gram frequency checking:
# Tokenise the combined (uncleaned) sample text. Splitting on runs of
# whitespace and discarding empty strings prevents consecutive spaces from
# producing "" tokens, which would otherwise show up inside the n-grams.
newdata <- strsplit(newfile, "[[:space:]]+")[[1]]
newdata <- newdata[nzchar(newdata)]
# ngrams() returns a list of token vectors; paste each one back into a
# single space-separated string so the n-grams can be tabulated.
bigrams <- vapply(ngrams(newdata, 2L), paste, "", collapse = " ")
top10_2gram <- head(sort(table(bigrams), decreasing = TRUE), 10)
# plotting top 10 2-gram:
barplot(top10_2gram, las = 2, main = "top 10 2-gram")
# 3-gram frequency checking:
trigrams <- vapply(ngrams(newdata, 3L), paste, "", collapse = " ")
top10_3gram <- head(sort(table(trigrams), decreasing = TRUE), 10)
# plotting top 10 3-gram:
barplot(top10_3gram, las = 2, main = "top 10 3-gram")