Assignment

# Directory that holds the en_US SwiftKey text files.
data_dir <- "C:\\Users\\Dell\\Desktop\\Coursera-SwiftKey\\final\\en_US"

# Read all lines of one UTF-8 text file.
#
# file_name: name of the file inside `dir`.
# dir:       directory containing the file (defaults to `data_dir`).
# Returns a character vector with one element per line.
#
# The connection is opened in binary mode ("rb"): a text-mode connection on
# Windows treats an embedded Ctrl-Z (0x1A) byte as end-of-file and stops
# reading there, and warn = FALSE hides the resulting warning.
# skipNul = TRUE skips embedded NUL bytes instead of letting them cut the
# current line short.
# NOTE(review): the length 166833 recorded further down looks consistent with
# a truncated news read; expect a larger count after this fix - confirm by
# re-running the report.
read_text_lines <- function(file_name, dir = data_dir) {
  con <- file(file.path(dir, file_name), open = "rb")
  # Release the file handle even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
}

blog <- read_text_lines("en_US.blogs.txt")
news <- read_text_lines("en_US.news.txt")
twit <- read_text_lines("en_US.twitter.txt")

# Seed the RNG so the three draws below are reproducible.
set.seed(1927)

# Share of each source that goes into the working sample.
sample_share <- .05

# Draw without replacement from each source. The order of the three calls
# (twitter, news, blogs) is kept because each draw advances the RNG state.
twit_sample <- sample(twit, size = length(twit) * sample_share)
news_sample <- sample(news, size = length(news) * sample_share)
blog_sample <- sample(blog, size = length(blog) * sample_share)

# Stack the samples (twitter, blogs, news) and convert to ASCII; characters
# that cannot be converted are replaced with the empty string (sub = "").
combined_sample <- iconv(
  c(twit_sample, blog_sample, news_sample),
  from = "UTF-8",
  to = "ASCII",
  sub = ""
)

# Print the number of sampled lines.
length(combined_sample)

## [1] 166833

library(tm)

## Loading required package: NLP

# Build a corpus with one document per sampled line, then normalise the text.
corpus <- VCorpus(VectorSource(combined_sample))

# Lower-case first. tolower() is a base function, not a tm transformation, so
# it is wrapped in content_transformer() to keep each element a text document;
# this removes the need for a later PlainTextDocument() repair step.
corpus <- tm_map(corpus, content_transformer(tolower))

# Remove stop words while apostrophes are still present: the English list
# contains contractions (e.g. "don't", "i'm") that can no longer match once
# punctuation has been stripped.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)

# Collapse whitespace last so the gaps left by the removals above are cleaned.
corpus <- tm_map(corpus, stripWhitespace)

Assignment

datasciencegit30

11/30/2020