library(RColorBrewer)
library(wordcloud)
library(NLP)
library(tm)
library(stringi)
library(ggplot2)
# Fetch the Coursera/SwiftKey capstone archive and load the three en_US text
# files into character vectors (one element per line).
data_dir <- "C:/Users/Home/Desktop/R"
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_path <- file.path(data_dir, "Coursera-SwiftKey.zip")

# The archive is large; only download it when it is not already on disk.
if (!file.exists(zip_path)) {
  download.file(url, destfile = zip_path, mode = "wb")
}
# NOTE(review): the archive may extract into a nested folder rather than
# directly into data_dir -- confirm the text files really land at the paths
# read below.
unzip(zipfile = zip_path, exdir = data_dir)

# Read every line of a UTF-8 text file.
# The connection is opened in binary mode because a text-mode read on Windows
# can stop early at an embedded Ctrl-Z (0x1A) byte, silently truncating the
# file. readLines() accepts LF, CRLF and CR line endings in either mode.
read_text_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_text_lines(file.path(data_dir, "en_US.blogs.txt"))
news <- read_text_lines(file.path(data_dir, "en_US.news.txt"))
twitter <- read_text_lines(file.path(data_dir, "en_US.twitter.txt"))
# Basic size statistics for each source. The `##` lines are output recorded
# from an earlier run; the exact values depend on how the files were read.
stri_stats_general(twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096241 134082806
stri_stats_general(news)
## Lines LinesNEmpty Chars CharsNWhite
## 77259 77259 15639408 13072698
stri_stats_general(blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539

# Histogram of words-per-line for one source.
# Replaces the deprecated qplot(); bins = 30 is the value stat_bin() previously
# fell back to (with a message), so the picture is the same without the note.
#   counts: integer vector of per-line word counts
#   label:  x-axis label
plot_word_counts <- function(counts, label) {
  ggplot(data.frame(words = counts), aes(x = words)) +
    geom_histogram(bins = 30) +
    xlab(label)
}

blog_words <- stri_count_words(blogs)
plot_word_counts(blog_words, "blog_words")

twitter_words <- stri_count_words(twitter)
plot_word_counts(twitter_words, "twitter_words")

news_words <- stri_count_words(news)
plot_word_counts(news_words, "news_words")
### Twitter
# Work on a random sample of at most 20000 tweets. The seed makes the sample
# (and the word-cloud layout, which also draws random numbers) repeatable.
set.seed(1234)
twitter <- sample(twitter, min(20000, length(twitter)))
# NOTE(review): sub = "byte" rewrites each non-ASCII byte as "<xx>" hex text;
# after number/punctuation removal the leftover hex letters can end up glued
# into neighbouring words. Consider sub = "" if that is not intended.
twitter <- iconv(twitter, from = "UTF-8", to = "ASCII", sub = "byte")
twitterVec <- VectorSource(twitter)
twitterCorpus <- Corpus(twitterVec)

# Normalise the text: lower-case, strip digits, strip punctuation.
# tolower is a base function, so it is wrapped in content_transformer() as tm
# expects for non-tm transformations.
twitterCorpus <- tm_map(twitterCorpus, content_transformer(tolower))
twitterCorpus <- tm_map(twitterCorpus, removeNumbers)
twitterCorpus <- tm_map(twitterCorpus, removePunctuation)
# twitter2 is not read anywhere in the visible script; it is kept so that any
# later code expecting it still finds it.
twitter2 <- tm_map(twitterCorpus, PlainTextDocument)

# Word cloud including stop words. brewer.pal() needs n >= 3; asking for 2
# only produced a warning and returned the same 3 colours.
wordcloud(
  twitterCorpus,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(3, "Dark2")
)

# Same cloud after dropping English stop words.
twitterCorpus <- tm_map(twitterCorpus, removeWords, stopwords("english"))
twitter2 <- tm_map(twitterCorpus, PlainTextDocument)
wordcloud(
  twitterCorpus,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(3, "Dark2")
)
### News
# Work on a random sample of at most 20000 news lines. The seed makes the
# sample (and the word-cloud layout, which also draws random numbers)
# repeatable.
set.seed(1234)
news <- sample(news, min(20000, length(news)))
# NOTE(review): sub = "byte" rewrites each non-ASCII byte as "<xx>" hex text;
# after number/punctuation removal the leftover hex letters can end up glued
# into neighbouring words. Consider sub = "" if that is not intended.
news <- iconv(news, from = "UTF-8", to = "ASCII", sub = "byte")
newsVec <- VectorSource(news)
newsCorpus <- Corpus(newsVec)

# Normalise the text: lower-case, strip digits, strip punctuation.
# tolower is a base function, so it is wrapped in content_transformer() as tm
# expects for non-tm transformations.
newsCorpus <- tm_map(newsCorpus, content_transformer(tolower))
newsCorpus <- tm_map(newsCorpus, removeNumbers)
newsCorpus <- tm_map(newsCorpus, removePunctuation)
# news2 is not read anywhere in the visible script; it is kept so that any
# later code expecting it still finds it.
news2 <- tm_map(newsCorpus, PlainTextDocument)

# Word cloud including stop words. brewer.pal() needs n >= 3; asking for 2
# only produced a warning and returned the same 3 colours.
wordcloud(
  newsCorpus,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(3, "Dark2")
)

# Same cloud after dropping English stop words. The filter must run on the
# news corpus itself; it previously started from the Twitter corpus, so the
# second "news" cloud was really showing Twitter data.
newsCorpus <- tm_map(newsCorpus, removeWords, stopwords("english"))
news2 <- tm_map(newsCorpus, PlainTextDocument)
wordcloud(
  newsCorpus,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.35,
  use.r.layout = FALSE,
  colors = brewer.pal(3, "Dark2")
)